Commit 31763cd

Merge pull request oneapi-src#1766 from jimmytwei/benchmkl
Updated matrix_mul_mkl sample
2 parents 7b1df6b + fb993c5 commit 31763cd

4 files changed: +144 −67 lines changed
Lines changed: 14 additions & 10 deletions

```diff
@@ -1,21 +1,25 @@
 # Makefile for GNU Make
 
-default: run
+default: all
 
-all: run
+all: sgemm.mkl dgemm.mkl
 
-run: matrix_mul_mkl
-	./matrix_mul_mkl
+run: sgemm.mkl dgemm.mkl
+	./sgemm.mkl
+	./dgemm.mkl
 
-MKL_COPTS = -DMKL_ILP64 -I"${MKLROOT}/include"
-MKL_LIBS = -L${MKLROOT}/lib/intel64 -lmkl_sycl -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lsycl -lOpenCL -lpthread -lm -ldl
+MKL_COPTS =
+MKL_LIBS = -qmkl
 
-DPCPP_OPTS = $(MKL_COPTS) -fsycl-device-code-split=per_kernel $(MKL_LIBS)
+DPCPP_OPTS = -O3 $(MKL_COPTS) $(MKL_LIBS)
 
-matrix_mul_mkl: matrix_mul_mkl.cpp
-	icpx $< -fsycl -o $@ $(DPCPP_OPTS)
+sgemm.mkl: matrix_mul_mkl.cpp
+	icpx -fsycl $< -o $@ $(DPCPP_OPTS)
+
+dgemm.mkl: matrix_mul_mkl.cpp
+	icpx -fsycl $< -o $@ $(DPCPP_OPTS) -DUSE_DOUBLE
 
 clean:
-	-rm -f matrix_mul_mkl
+	-rm -f sgemm.mkl dgemm.mkl
 
 .PHONY: clean run all
```
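Both targets compile the same matrix_mul_mkl.cpp; only the `-DUSE_DOUBLE` flag on the `dgemm.mkl` rule changes the element type. A minimal sketch of the compile-time switch this flag drives (the sample itself uses the `FLOAT` macro added later in this commit; the `real_t` alias below is only an illustration):

```cpp
// Illustration only: how -DUSE_DOUBLE selects precision at compile time.
// The sample's own code uses "#define FLOAT float" / "#define FLOAT double".
#ifdef USE_DOUBLE
using real_t = double;   // dgemm.mkl is built with -DUSE_DOUBLE
#else
using real_t = float;    // sgemm.mkl is built without it
#endif

real_t alpha = 1.0;      // the same source then works for either precision
```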

Libraries/oneMKL/matrix_mul_mkl/README.md

Lines changed: 6 additions & 6 deletions

````diff
@@ -71,12 +71,12 @@ Run `nmake` to build and run the sample. `nmake clean` removes temporary files.
 If everything is working correctly, the program will generate two input matrices and call oneMKL to multiply them. It will also compute the product matrix itself to verify the results from oneMKL.
 
 ```
-./matrix_mul_mkl
-Device: Intel(R) Gen9 HD Graphics NEO
-Problem size: A (600x1200) * B (1200x2400) --> C (600x2400)
+./dgemm.mkl
+Problem size: A (8192x8192) * B (8192x8192) --> C (8192x8192)
+Benchmark iterations: 100
+Device: Intel(R) Data Center GPU Max 1100
 Launching oneMKL GEMM calculation...
-Performing reference calculation...
-Results are accurate.
+DGEMM performance : 14979.3 GFLOPS
 ```
 
 ### Troubleshooting
@@ -87,4 +87,4 @@ If an error occurs, troubleshoot the problem using the Diagnostics Utility for I
 Code samples are licensed under the MIT license. See
 [License.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/License.txt) for details.
 
-Third party program Licenses can be found here: [third-party-programs.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/third-party-programs.txt).
+Third party program Licenses can be found here: [third-party-programs.txt](https://github.com/oneapi-src/oneAPI-samples/blob/master/third-party-programs.txt).
````
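The sample output now reports throughput rather than a correctness message. As a rough sanity check of a figure like the one above (timings naturally vary by device; this is only an illustration of the arithmetic):

```cpp
// Back-of-the-envelope check: one 8192 x 8192 x 8192 GEMM performs
// 2 * 8192^3 ≈ 1.0995e12 floating-point operations, so 14979.3 GFLOPS
// corresponds to roughly 73 ms per multiplication.
#include <cstdio>

int main() {
    double flops  = 2.0 * 8192.0 * 8192.0 * 8192.0;   // ≈ 1.0995e12 flops per GEMM
    double gflops = 14979.3;                           // value from the sample output above
    std::printf("%.1f ms per GEMM\n", flops / (gflops * 1e9) * 1e3);
}
```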
Lines changed: 11 additions & 7 deletions

```diff
@@ -1,18 +1,22 @@
 # Makefile for NMAKE
 
-default: run
+default: all
 
-all: run
+all: sgemm.exe dgemm.exe
 
-run: matrix_mul_mkl.exe
-	.\matrix_mul_mkl.exe
+run: sgemm.exe dgemm.exe
+	.\sgemm.exe
+	.\dgemm.exe
 
 DPCPP_OPTS=/I"$(MKLROOT)\include" /Qmkl /EHsc -fsycl-device-code-split=per_kernel OpenCL.lib
 
-matrix_mul_mkl.exe: matrix_mul_mkl.cpp
-	icx-cl -fsycl matrix_mul_mkl.cpp /Fematrix_mul_mkl.exe $(DPCPP_OPTS)
+sgemm.exe: matrix_mul_mkl.cpp
+	icx-cl -fsycl matrix_mul_mkl.cpp /Fesgemm.exe $(DPCPP_OPTS)
+
+dgemm.exe: matrix_mul_mkl.cpp
+	icx-cl -fsycl matrix_mul_mkl.cpp /Fedgemm.exe $(DPCPP_OPTS) -DUSE_DOUBLE
 
 clean:
-	del /q matrix_mul_mkl.exe matrix_mul_mkl.exp matrix_mul_mkl.lib
+	del /q sgemm.exe sgemm.exp sgemm.lib dgemm.exe dgemm.exp dgemm.lib
 
 pseudo: clean run all
```
Lines changed: 113 additions & 44 deletions

```diff
@@ -1,5 +1,5 @@
 //==============================================================
-// Copyright © 2020-2023 Intel Corporation
+// Copyright © 2020 Intel Corporation
 //
 // SPDX-License-Identifier: MIT
 // =============================================================
```
```diff
@@ -9,18 +9,41 @@
 // This samples uses the oneAPI Math Kernel Library (oneMKL) to accelerate
 // the computation.
 
+// The test is updated based on oneAPI samples oneAPI-samples/Libraries/oneMKL/matrix_mul_mkl
 #include <iostream>
+#include <iomanip>
 #include <limits>
 
 #include <sycl/sycl.hpp>
 #include "oneapi/mkl.hpp"
+#include "dpc_common.hpp"
 
-float rand_uniform();
-bool verify_result(int m, int n, int k, int ldc, const float *C, const float *C_reference);
+#ifndef USE_DOUBLE
+#define FLOAT float
+#else
+#define FLOAT double
+#endif
 
-int main()
+FLOAT rand_uniform();
+bool verify_result(int m, int n, int k, int ldc, FLOAT *C, FLOAT *C_reference);
+
+#define WARMUP 10
+#define LOOPS 100
+//default matrix size 8192x8192
+#define MSIZE 8192
+#define VERIFY_RESULT False
+
+
+using namespace std ;
+
+int main(int argc, char* argv[])
 {
     try {
+
+        int msize = MSIZE;
+        int loops = LOOPS;
+        int verify = 0;
+
         // Initialize data for GEMM. The full GEMM operation is:
         //
         //     C = alpha * op(A) * op(B) + beta * C
```
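Note that `main` now takes `argc`/`argv`, but the committed code never reads them, so the matrix size and loop count stay fixed at the `MSIZE` and `LOOPS` defaults. If command-line overrides were wanted, a minimal sketch (hypothetical, not part of this commit) might look like:

```cpp
// Hypothetical extension, not in this commit: allow "./sgemm.mkl 4096 50"
// to override the MSIZE (8192) and LOOPS (100) defaults.
#include <cstdlib>

static void parse_args(int argc, char *argv[], int &msize, int &loops) {
    if (argc > 1) msize = std::atoi(argv[1]);   // matrix dimension
    if (argc > 2) loops = std::atoi(argv[2]);   // timed iterations
}
```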
```diff
@@ -29,7 +52,7 @@ int main()
         // optional matrix transposition.
         //
         // For this simple matrix multiplication, no transposition is needed.
-        //
+        // 
         // By choosing alpha = 1, beta = 0, GEMM will calculate C = A * B.
         //
         // In this example, matrices are stored in row-major layout.
```
```diff
@@ -38,12 +61,19 @@ int main()
         auto transB = oneapi::mkl::transpose::nontrans;
 
         // Matrix data sizes.
-        //
+        // 
         // A is m x k
         // B is k x n --> product C is m x n
-        int m = 600;
-        int k = 1200;
-        int n = 2400;
+        int m = msize;
+        int k = msize;
+        int n = msize;
+
+        cout << "Problem size: "
+             << " A (" << m << 'x' << k << ") *"
+             << " B (" << k << 'x' << n << ") --> "
+             << " C (" << m << 'x' << n << ")\n";
+
+        cout << "Benchmark iterations: " << loops << endl;
 
         // Leading dimensions of data. For row-major matrices, the leading
         // dimension is the stride between adjacent rows.
```
```diff
@@ -52,8 +82,8 @@ int main()
         int ldc = n;
 
         // Scaling factors.
-        float alpha = 1.0f;
-        float beta = 0.0f;
+        FLOAT alpha = 1.0f;
+        FLOAT beta = 0.0f;
 
         // Create a queue on the default device.
         sycl::queue device_queue{sycl::default_selector_v};
```
```diff
@@ -63,72 +93,109 @@ int main()
                   << std::endl;
 
         // Allocate shared memory for matrices.
-        auto A = sycl::malloc_shared<float>(m * k, device_queue);
-        auto B = sycl::malloc_shared<float>(k * n, device_queue);
-        auto C = sycl::malloc_shared<float>(m * n, device_queue);
-        auto C_reference = (float *) calloc(m * n, sizeof(float));
+        const size_t alignment = 4096;
+        auto a = sycl::aligned_alloc_host<FLOAT>(alignment, m * k, device_queue);
+        auto b = sycl::aligned_alloc_host<FLOAT>(alignment, k * n, device_queue);
+        auto c = sycl::aligned_alloc_host<FLOAT>(alignment, m * n, device_queue);
+
+        auto C_reference = (FLOAT *) calloc(m * n, sizeof(FLOAT));
 
-        if (!A || !B || !C || !C_reference) {
+        if (!a || !b || !c || !C_reference) {
             std::cerr << "Could not allocate memory for matrices." << std::endl;
             exit(1);
         }
 
         // Initialize matrix data.
         for (int i = 0; i < m; i++)
             for (int j = 0; j < k; j++)
-                A[i * lda + j] = rand_uniform();
+                a[i * lda + j] = rand_uniform();
 
         for (int i = 0; i < k; i++)
             for (int j = 0; j < n; j++)
-                B[i * ldb + j] = rand_uniform();
+                b[i * ldb + j] = rand_uniform();
 
-        std::cout << "Problem size: "
-                  << " A (" << m << 'x' << k << ") *"
-                  << " B (" << k << 'x' << n << ") --> "
-                  << " C (" << m << 'x' << n << ")\n";
+        auto A = sycl::aligned_alloc_device<FLOAT>(alignment, m * k, device_queue);
+        auto B = sycl::aligned_alloc_device<FLOAT>(alignment, m * n, device_queue);
+        auto C = sycl::aligned_alloc_device<FLOAT>(alignment, m * n, device_queue);
+        device_queue.wait();
+
+        device_queue.memcpy(A, &(a[0]), m * k * sizeof(FLOAT));
+        device_queue.memcpy(B, &(b[0]), k * n * sizeof(FLOAT));
+        device_queue.memcpy(C, &(c[0]), m * n * sizeof(FLOAT));
+        device_queue.wait();
+
 
         // Call GEMM to do matrix multiplication, asynchronously.
         std::cerr << "Launching oneMKL GEMM calculation..." << std::endl;
-        oneapi::mkl::blas::row_major::gemm(device_queue, transA, transB, m, n, k,
-                                           alpha, A, lda, B, ldb, beta, C, ldc);
-
-        // While calculation occurs, compute reference result to check accuracy.
-        std::cerr << "Performing reference calculation..." << std::endl;
-        for (int i = 0; i < m; i++)
-            for (int h = 0; h < k; h++)
-                for (int j = 0; j < n; j++)
-                    C_reference[i * ldc + j] += A[i * lda + h] * B[h * ldb + j];
-
-        // Wait for oneMKL computation to complete.
+        dpc_common::TimeInterval timer;
+        double start_time = 0.0;
+
+        //warm up
+        for (int w=0; w < WARMUP; w++)
+        {
+            oneapi::mkl::blas::row_major::gemm(device_queue, transA, transB, m, n, k,
+                                               alpha, A, lda, B, ldb, beta, C, ldc);
+        }
+        device_queue.wait_and_throw();
+
+        start_time = timer.Elapsed();
+        for (int l=0; l < loops; l++)
+        {
+            oneapi::mkl::blas::row_major::gemm(device_queue, transA, transB, m, n, k,
+                                               alpha, A, lda, B, ldb, beta, C, ldc);
+        }
+        // Wait for oneMKL computation to complete.
         device_queue.wait_and_throw();
 
-        // Check results for accuracy.
-        bool ok = verify_result(m, n, k, ldc, C, C_reference);
+        double stop_time = timer.Elapsed();
+        double avg_gemm_time = (stop_time - start_time)/loops;
+
+        double gflops = 2.0 * (double)m * (double)m * (double)m;
+#ifdef USE_DOUBLE
+        cout << "DGEMM performance : " << gflops / avg_gemm_time * 1.e-9 << " GFLOPS" << endl;
+#else
+        cout << "SGEMM performance : " << gflops / avg_gemm_time * 1.e-9 << " GFLOPS" << endl;
+#endif
+
+
+        if(verify)
+        {
+            // While calculation occurs, compute reference result to check accuracy.
+            std::cerr << "Performing reference calculation..." << std::endl;
+            for (int i = 0; i < m; i++)
+                for (int h = 0; h < k; h++)
+                    for (int j = 0; j < n; j++)
+                        C_reference[i * ldc + j] += a[i * lda + h] * b[h * ldb + j];
+            // Check results for accuracy.
+            device_queue.memcpy(&(c[0]), C, m*n*sizeof(FLOAT)).wait();
+            verify_result(m, n, k, ldc, c, C_reference);
+        }
+
 
         // Free memory.
         free(A, device_queue);
         free(B, device_queue);
         free(C, device_queue);
-        free(C_reference);
+        free(C_reference);
+
+        free(a, device_queue);
+        free(b, device_queue);
+        free(c, device_queue);
 
-        if (!ok)
-            exit(2);
     } catch (const std::exception &e) {
         std::cerr << "An exception occurred: "
                   << e.what() << std::endl;
         exit(1);
     }
 }
 
-float rand_uniform()
+FLOAT rand_uniform()
 {
-    return float(rand()) / float(RAND_MAX);
+    return static_cast <FLOAT> (rand()) / static_cast <FLOAT> (RAND_MAX);
 }
 
-bool verify_result(int m, int n, int k, int ldc,
-                   const float *C, const float *C_reference)
+bool verify_result(int m, int n, int k, int ldc, FLOAT *C, FLOAT *C_reference)
 {
-    float tolerance = 1e-3;
+    FLOAT tolerance = 1e-3;
     bool ok = true;
 
     // Compare host side results with the result buffer from device side: print
```
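The new benchmarking code follows the usual warm-up / timed-loop / average pattern, and the reported throughput is based on the standard GEMM operation count of 2·m·n·k flops; the commit writes it as 2·m³, which is equivalent here only because m = n = k = msize. `dpc_common::TimeInterval` comes from the sample's dpc_common.hpp helper header; a self-contained sketch of the same pattern using only std::chrono (an illustration, not the sample's code, with `run_gemm` as a placeholder callable) looks like this. Note that for an asynchronous SYCL GEMM the queue synchronization (`wait_and_throw`) must happen inside `run_gemm` or before the clock is read, as the diff above does.

```cpp
// Sketch of the warm-up / timed-loop / GFLOPS pattern with std::chrono only.
#include <chrono>

template <typename GemmFn>
double benchmark_gflops(GemmFn run_gemm, int m, int n, int k,
                        int warmup = 10, int loops = 100) {
    for (int w = 0; w < warmup; w++) run_gemm();              // warm up: JIT, caches, clocks
    auto t0 = std::chrono::steady_clock::now();
    for (int l = 0; l < loops; l++) run_gemm();               // timed region
    auto t1 = std::chrono::steady_clock::now();
    double avg_s = std::chrono::duration<double>(t1 - t0).count() / loops;
    double flops = 2.0 * double(m) * double(n) * double(k);   // standard GEMM flop count
    return flops / avg_s * 1e-9;                              // GFLOPS
}
```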
```diff
@@ -150,7 +217,9 @@ bool verify_result(int m, int n, int k, int ldc,
     }
 
     if (ok)
-        std::cout << "Results are accurate.\n";
+        std::cout << "Results are accurate with tolerance = " << tolerance << endl;
+    else
+        std::cout << "Results may not be accurate with tolerance = " << tolerance << endl;
 
     return ok;
 }
```
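One behavioral change worth noting: the pre-change code checked the boolean returned by `verify_result` and exited with status 2 on a mismatch, while the new code (when `verify` is enabled) ignores the return value. Restoring that check would be a small fragment inside `main`, based on the removed lines above (shown only as an illustration):

```cpp
// Sketch: re-check the verification result as the pre-change code did
// (fragment meant to slot into main() above; variables come from the sample).
if (verify) {
    device_queue.memcpy(&(c[0]), C, m * n * sizeof(FLOAT)).wait();
    bool ok = verify_result(m, n, k, ldc, c, C_reference);
    if (!ok)
        exit(2);   // non-zero exit status so CI can detect a mismatch
}
```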
