1
1
// ==============================================================
2
- // Copyright © 2020-2023 Intel Corporation
2
+ // Copyright © 2020 Intel Corporation
3
3
//
4
4
// SPDX-License-Identifier: MIT
5
5
// =============================================================
9
9
// This samples uses the oneAPI Math Kernel Library (oneMKL) to accelerate
10
10
// the computation.
11
11
12
+ // The test is updated based on oneAPI samples oneAPI-samples/Libraries/oneMKL/matrix_mul_mkl
12
13
#include < iostream>
14
+ #include < iomanip>
13
15
#include < limits>
14
16
15
17
#include < sycl/sycl.hpp>
16
18
#include " oneapi/mkl.hpp"
19
+ #include " dpc_common.hpp"
17
20
18
- float rand_uniform ();
19
- bool verify_result (int m, int n, int k, int ldc, const float *C, const float *C_reference);
21
+ #ifndef USE_DOUBLE
22
+ #define FLOAT float
23
+ #else
24
+ #define FLOAT double
25
+ #endif
20
26
21
- int main ()
27
+ FLOAT rand_uniform ();
28
+ bool verify_result (int m, int n, int k, int ldc, FLOAT *C, FLOAT *C_reference);
29
+
30
+ #define WARMUP 10
31
+ #define LOOPS 100
32
+ // default matrix size 8192x8192
33
+ #define MSIZE 8192
34
+ #define VERIFY_RESULT False
35
+
36
+
37
+ using namespace std ;
38
+
39
+ int main (int argc, char * argv[])
22
40
{
23
41
try {
42
+
43
+ int msize = MSIZE;
44
+ int loops = LOOPS;
45
+ int verify = 0 ;
46
+
24
47
// Initialize data for GEMM. The full GEMM operation is:
25
48
//
26
49
// C = alpha * op(A) * op(B) + beta * C
@@ -29,7 +52,7 @@ int main()
29
52
// optional matrix transposition.
30
53
//
31
54
// For this simple matrix multiplication, no transposition is needed.
32
- //
55
+ //
33
56
// By choosing alpha = 1, beta = 0, GEMM will calculate C = A * B.
34
57
//
35
58
// In this example, matrices are stored in row-major layout.
@@ -38,12 +61,19 @@ int main()
38
61
auto transB = oneapi::mkl::transpose::nontrans;
39
62
40
63
// Matrix data sizes.
41
- //
64
+ //
42
65
// A is m x k
43
66
// B is k x n --> product C is m x n
44
- int m = 600 ;
45
- int k = 1200 ;
46
- int n = 2400 ;
67
+ int m = msize;
68
+ int k = msize;
69
+ int n = msize;
70
+
71
+ cout << " Problem size: "
72
+ << " A (" << m << ' x' << k << " ) *"
73
+ << " B (" << k << ' x' << n << " ) --> "
74
+ << " C (" << m << ' x' << n << " )\n " ;
75
+
76
+ cout << " Benchmark interations: " << loops << endl;
47
77
48
78
// Leading dimensions of data. For row-major matrices, the leading
49
79
// dimension is the stride between adjacent rows.
@@ -52,8 +82,8 @@ int main()
52
82
int ldc = n;
53
83
54
84
// Scaling factors.
55
- float alpha = 1 .0f ;
56
- float beta = 0 .0f ;
85
+ FLOAT alpha = 1 .0f ;
86
+ FLOAT beta = 0 .0f ;
57
87
58
88
// Create a queue on the default device.
59
89
sycl::queue device_queue{sycl::default_selector_v};
@@ -63,72 +93,109 @@ int main()
63
93
<< std::endl;
64
94
65
95
// Allocate shared memory for matrices.
66
- auto A = sycl::malloc_shared<float >(m * k, device_queue);
67
- auto B = sycl::malloc_shared<float >(k * n, device_queue);
68
- auto C = sycl::malloc_shared<float >(m * n, device_queue);
69
- auto C_reference = (float *) calloc (m * n, sizeof (float ));
96
+ const size_t alignment = 4096 ;
97
+ auto a = sycl::aligned_alloc_host<FLOAT>(alignment, m * k, device_queue);
98
+ auto b = sycl::aligned_alloc_host<FLOAT>(alignment, k * n, device_queue);
99
+ auto c = sycl::aligned_alloc_host<FLOAT>(alignment, m * n, device_queue);
100
+
101
+ auto C_reference = (FLOAT *) calloc (m * n, sizeof (FLOAT));
70
102
71
- if (!A || !B || !C || !C_reference) {
103
+ if (!a || !b || !c || !C_reference) {
72
104
std::cerr << " Could not allocate memory for matrices." << std::endl;
73
105
exit (1 );
74
106
}
75
107
76
108
// Initialize matrix data.
77
109
for (int i = 0 ; i < m; i++)
78
110
for (int j = 0 ; j < k; j++)
79
- A [i * lda + j] = rand_uniform ();
111
+ a [i * lda + j] = rand_uniform ();
80
112
81
113
for (int i = 0 ; i < k; i++)
82
114
for (int j = 0 ; j < n; j++)
83
- B [i * ldb + j] = rand_uniform ();
115
+ b [i * ldb + j] = rand_uniform ();
84
116
85
- std::cout << " Problem size: "
86
- << " A (" << m << ' x' << k << " ) *"
87
- << " B (" << k << ' x' << n << " ) --> "
88
- << " C (" << m << ' x' << n << " )\n " ;
117
+ auto A = sycl::aligned_alloc_device<FLOAT>(alignment, m * k, device_queue);
118
+ auto B = sycl::aligned_alloc_device<FLOAT>(alignment, m * n, device_queue);
119
+ auto C = sycl::aligned_alloc_device<FLOAT>(alignment, m * n, device_queue);
120
+ device_queue.wait ();
121
+
122
+ device_queue.memcpy (A, &(a[0 ]), m * k * sizeof (FLOAT));
123
+ device_queue.memcpy (B, &(b[0 ]), k * n * sizeof (FLOAT));
124
+ device_queue.memcpy (C, &(c[0 ]), m * n * sizeof (FLOAT));
125
+ device_queue.wait ();
126
+
89
127
90
128
// Call GEMM to do matrix multiplication, asynchronously.
91
129
std::cerr << " Launching oneMKL GEMM calculation..." << std::endl;
92
- oneapi::mkl::blas::row_major::gemm (device_queue, transA, transB, m, n, k,
93
- alpha, A, lda, B, ldb, beta, C, ldc);
94
-
95
- // While calculation occurs, compute reference result to check accuracy.
96
- std::cerr << " Performing reference calculation..." << std::endl;
97
- for (int i = 0 ; i < m; i++)
98
- for (int h = 0 ; h < k; h++)
99
- for (int j = 0 ; j < n; j++)
100
- C_reference[i * ldc + j] += A[i * lda + h] * B[h * ldb + j];
101
-
102
- // Wait for oneMKL computation to complete.
130
+ dpc_common::TimeInterval timer;
131
+ double start_time = 0.0 ;
132
+
133
+ // warm up
134
+ for (int w=0 ; w < WARMUP; w++)
135
+ {
136
+ oneapi::mkl::blas::row_major::gemm (device_queue, transA, transB, m, n, k,
137
+ alpha, A, lda, B, ldb, beta, C, ldc);
138
+ }
139
+ device_queue.wait_and_throw ();
140
+
141
+ start_time = timer.Elapsed ();
142
+ for (int l=0 ; l < loops; l++)
143
+ {
144
+ oneapi::mkl::blas::row_major::gemm (device_queue, transA, transB, m, n, k,
145
+ alpha, A, lda, B, ldb, beta, C, ldc);
146
+ }
147
+ // Wait for oneMKL computation to complete.
103
148
device_queue.wait_and_throw ();
104
149
105
- // Check results for accuracy.
106
- bool ok = verify_result (m, n, k, ldc, C, C_reference);
150
+ double stop_time = timer.Elapsed ();
151
+ double avg_gemm_time = (stop_time - start_time)/loops;
152
+
153
+ double gflops = 2.0 * (double )m * (double )m * (double )m;
154
+ #ifdef USE_DOUBLE
155
+ cout << " DGEMM performance : " << gflops / avg_gemm_time * 1 .e -9 << " GFLOPS" << endl;
156
+ #else
157
+ cout << " SGEMM performance : " << gflops / avg_gemm_time * 1 .e -9 << " GFLOPS" << endl;
158
+ #endif
159
+
160
+
161
+ if (verify)
162
+ {
163
+ // While calculation occurs, compute reference result to check accuracy.
164
+ std::cerr << " Performing reference calculation..." << std::endl;
165
+ for (int i = 0 ; i < m; i++)
166
+ for (int h = 0 ; h < k; h++)
167
+ for (int j = 0 ; j < n; j++)
168
+ C_reference[i * ldc + j] += a[i * lda + h] * b[h * ldb + j];
169
+ // Check results for accuracy.
170
+ device_queue.memcpy (&(c[0 ]), C, m*n*sizeof (FLOAT)).wait ();
171
+ verify_result (m, n, k, ldc, c, C_reference);
172
+ }
173
+
107
174
108
175
// Free memory.
109
176
free (A, device_queue);
110
177
free (B, device_queue);
111
178
free (C, device_queue);
112
- free (C_reference);
179
+ free (C_reference);
180
+ free (a, device_queue);
181
+ free (b, device_queue);
182
+ free (c, device_queue);
113
183
114
- if (!ok)
115
- exit (2 );
116
184
} catch (const std::exception &e) {
117
185
std::cerr << " An exception occurred: "
118
186
<< e.what () << std::endl;
119
187
exit (1 );
120
188
}
121
189
}
122
190
123
- float rand_uniform ()
191
+ FLOAT rand_uniform ()
124
192
{
125
- return float (rand ()) / float (RAND_MAX);
193
+ return static_cast <FLOAT> (rand ()) / static_cast <FLOAT> (RAND_MAX);
126
194
}
127
195
128
- bool verify_result (int m, int n, int k, int ldc,
129
- const float *C, const float *C_reference)
196
+ bool verify_result (int m, int n, int k, int ldc, FLOAT *C, FLOAT *C_reference)
130
197
{
131
- float tolerance = 1e-3 ;
198
+ FLOAT tolerance = 1e-3 ;
132
199
bool ok = true ;
133
200
134
201
// Compare host side results with the result buffer from device side: print
@@ -150,7 +217,9 @@ bool verify_result(int m, int n, int k, int ldc,
150
217
}
151
218
152
219
if (ok)
153
- std::cout << " Results are accurate.\n " ;
220
+ std::cout << " Results are accurate with tolerance = " << tolerance << endl;
221
+ else
222
+ std::cout << " Results may not be accurate with tolerance = " << tolerance << endl;
154
223
155
224
return ok;
156
225
}
0 commit comments