36
36
#include < helper_cuda.h>
37
37
#include < vector>
38
38
#include < chrono>
39
+ #include < chrono>
39
40
#include < taskflow/sycl/syclflow.hpp>
40
41
41
42
using Time = std::chrono::steady_clock;
@@ -55,7 +56,7 @@ void reduce(float *inputVec, double *outputVec, size_t inputSize,
55
56
size_t outputSize, const sycl::nd_item<3 > &item_ct1,
56
57
double *tmp) {
57
58
58
- auto cta = item_ct1.get_group ();
59
+ sycl::group< 3 > cta = item_ct1.get_group ();
59
60
size_t globaltid = item_ct1.get_group (2 ) * item_ct1.get_local_range (2 ) +
60
61
item_ct1.get_local_id (2 );
61
62
@@ -68,29 +69,27 @@ void reduce(float *inputVec, double *outputVec, size_t inputSize,
68
69
69
70
item_ct1.barrier ();
70
71
71
- sycl::sub_group tile_sg = item_ct1.get_sub_group ();
72
+ sycl::sub_group tile32 = item_ct1.get_sub_group ();
72
73
73
74
double beta = temp_sum;
74
75
double temp;
75
76
76
- for (int i = tile_sg .get_local_linear_range () / 2 ; i > 0 ;
77
+ for (int i = tile32 .get_local_linear_range () / 2 ; i > 0 ;
77
78
i >>= 1 ) {
78
- if (tile_sg .get_local_linear_id () < i) {
79
+ if (tile32 .get_local_linear_id () < i) {
79
80
temp = tmp[item_ct1.get_local_linear_id () + i];
80
81
beta += temp;
81
82
tmp[item_ct1.get_local_linear_id ()] = beta;
82
83
}
83
- tile_sg. barrier ();
84
- }
84
+ }
85
+
85
86
item_ct1.barrier ();
86
87
87
88
if (item_ct1.get_local_linear_id () == 0 &&
88
89
item_ct1.get_group (2 ) < outputSize) {
89
90
beta = 0.0 ;
90
- int cta_size = cta.get_local_linear_range ();
91
-
92
- for (int i = 0 ; i < cta_size;
93
- i += tile_sg.get_local_linear_range ()) {
91
+ for (int i = 0 ; i < item_ct1.get_group ().get_local_linear_range ();
92
+ i += tile32.get_local_linear_range ()) {
94
93
beta += tmp[i];
95
94
}
96
95
outputVec[item_ct1.get_group (2 )] = beta;
@@ -101,6 +100,7 @@ void reduceFinal(double *inputVec, double *result,
101
100
size_t inputSize, const sycl::nd_item<3 > &item_ct1,
102
101
double *tmp) {
103
102
103
+ sycl::group<3 > cta = item_ct1.get_group ();
104
104
size_t globaltid = item_ct1.get_group (2 ) * item_ct1.get_local_range (2 ) +
105
105
item_ct1.get_local_id (2 );
106
106
@@ -113,7 +113,7 @@ void reduceFinal(double *inputVec, double *result,
113
113
114
114
item_ct1.barrier ();
115
115
116
- sycl::sub_group tile_sg = item_ct1.get_sub_group ();
116
+ sycl::sub_group tile32 = item_ct1.get_sub_group ();
117
117
118
118
// do reduction in shared mem
119
119
if ((item_ct1.get_local_range (2 ) >= 512 ) &&
@@ -145,11 +145,11 @@ void reduceFinal(double *inputVec, double *result,
145
145
if (item_ct1.get_local_range (2 ) >= 64 ) temp_sum +=
146
146
tmp[item_ct1.get_local_linear_id () + 32 ];
147
147
// Reduce final warp using shuffle
148
- for (int offset = tile_sg .get_local_linear_range () / 2 ;
148
+ for (int offset =tile32 .get_local_linear_range () / 2 ;
149
149
offset > 0 ; offset /= 2 ) {
150
150
temp_sum +=
151
- sycl::shift_group_left (tile_sg , temp_sum, offset);
152
- }
151
+ sycl::shift_group_left (tile32 , temp_sum, offset);
152
+ }
153
153
}
154
154
// write result for this block to global mem
155
155
if (item_ct1.get_local_linear_id () == 0 ) result[0 ] = temp_sum;
@@ -169,9 +169,8 @@ void myHostNodeCallback(void *data) {
169
169
*result = 0.0 ; // reset the result
170
170
}
171
171
172
- void syclTaskFlowManual (float *inputVec_h, float *inputVec_d,
173
- double *outputVec_d, double *result_d, size_t inputSize,
174
- size_t numOfBlocks, sycl::queue q_ct1) {
172
+ void syclTaskFlowManual (float *inputVec_h, float *inputVec_d, double *outputVec_d,
173
+ double *result_d, size_t inputSize, size_t numOfBlocks, sycl::queue q_ct1) {
175
174
tf::Taskflow tflow;
176
175
tf::Executor exe;
177
176
@@ -202,7 +201,9 @@ void syclTaskFlowManual(float *inputVec_h, float *inputVec_d,
202
201
[[intel::reqd_sub_group_size (SUB_GRP_SIZE)]] {
203
202
reduce (inputVec_d, outputVec_d, inputSize,
204
203
numOfBlocks, item_ct1,
205
- tmp.get_pointer ());
204
+
205
+ tmp.get_multi_ptr <sycl::access::decorated::no>()
206
+ .get ());
206
207
});
207
208
}).name (" reduce_kernel" );
208
209
@@ -222,7 +223,8 @@ void syclTaskFlowManual(float *inputVec_h, float *inputVec_d,
222
223
[[intel::reqd_sub_group_size (SUB_GRP_SIZE)]] {
223
224
reduceFinal (outputVec_d, result_d,
224
225
numOfBlocks, item_ct1,
225
- tmp.get_pointer ());
226
+ tmp.get_multi_ptr <sycl::access::decorated::no>()
227
+ .get ());
226
228
});
227
229
}).name (" reduceFinal_kernel" );
228
230
@@ -259,7 +261,7 @@ void syclTaskFlowManual(float *inputVec_h, float *inputVec_d,
259
261
" %zu\n " ,
260
262
sf_Task + tf_Task);
261
263
262
- printf (" Cloned Graph Output.. \n " );
264
+ printf (" Cloned Graph Output.. \n " );
263
265
tf::Taskflow tflow_clone (std::move (tflow));
264
266
exe.run_n (tflow_clone, GRAPH_LAUNCH_ITERATIONS).wait ();
265
267
}
@@ -293,11 +295,11 @@ int main(int argc, char **argv) {
293
295
294
296
auto startTimer1 = Time::now ();
295
297
syclTaskFlowManual (inputVec_h, inputVec_d, outputVec_d, result_d, size,
296
- maxBlocks, q_ct1);
298
+ maxBlocks, q_ct1);
297
299
auto stopTimer1 = Time::now ();
298
300
auto Timer_duration1 =
299
301
std::chrono::duration_cast<float_ms>(stopTimer1 - startTimer1).count ();
300
- printf (" Elapsed Time of SYCL TaskFlow Manual : %f (ms)\n " , Timer_duration1);
302
+ printf (" Elapsed Time of SYCL Taskflow Manual : %f (ms)\n " , Timer_duration1);
301
303
302
304
DPCT_CHECK_ERROR (sycl::free (inputVec_d, q_ct1));
303
305
DPCT_CHECK_ERROR (sycl::free (outputVec_d, q_ct1));
0 commit comments