Skip to content

Commit d1a8806

Browse files
author
Shwetha-Selma
authored
updated simpleCudaGraphs sample with 2024.2 (oneapi-src#2368)
1 parent b73377e commit d1a8806

File tree

5 files changed

+114
-127
lines changed
  • DirectProgramming/C++SYCL

5 files changed

+114
-127
lines changed

DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/06_SYCL_Migration_SimpleCUDAGraphs/sycl_migrated_option1/Samples/3_CUDA_Features/simpleCudaGraphs/simpleCudaGraphs.dp.cpp

Lines changed: 24 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@
3636
#include <helper_cuda.h>
3737
#include <vector>
3838
#include <chrono>
39+
#include <chrono>
3940
#include <taskflow/sycl/syclflow.hpp>
4041

4142
using Time = std::chrono::steady_clock;
@@ -55,7 +56,7 @@ void reduce(float *inputVec, double *outputVec, size_t inputSize,
5556
size_t outputSize, const sycl::nd_item<3> &item_ct1,
5657
double *tmp) {
5758

58-
auto cta = item_ct1.get_group();
59+
sycl::group<3> cta = item_ct1.get_group();
5960
size_t globaltid = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
6061
item_ct1.get_local_id(2);
6162

@@ -68,29 +69,27 @@ void reduce(float *inputVec, double *outputVec, size_t inputSize,
6869

6970
item_ct1.barrier();
7071

71-
sycl::sub_group tile_sg = item_ct1.get_sub_group();
72+
sycl::sub_group tile32 = item_ct1.get_sub_group();
7273

7374
double beta = temp_sum;
7475
double temp;
7576

76-
for (int i = tile_sg.get_local_linear_range() / 2; i > 0;
77+
for (int i = tile32.get_local_linear_range() / 2; i > 0;
7778
i >>= 1) {
78-
if (tile_sg.get_local_linear_id() < i) {
79+
if (tile32.get_local_linear_id() < i) {
7980
temp = tmp[item_ct1.get_local_linear_id() + i];
8081
beta += temp;
8182
tmp[item_ct1.get_local_linear_id()] = beta;
8283
}
83-
tile_sg.barrier();
84-
}
84+
}
85+
8586
item_ct1.barrier();
8687

8788
if (item_ct1.get_local_linear_id() == 0 &&
8889
item_ct1.get_group(2) < outputSize) {
8990
beta = 0.0;
90-
int cta_size = cta.get_local_linear_range();
91-
92-
for (int i = 0; i < cta_size;
93-
i += tile_sg.get_local_linear_range()) {
91+
for (int i = 0; i < item_ct1.get_group().get_local_linear_range();
92+
i += tile32.get_local_linear_range()) {
9493
beta += tmp[i];
9594
}
9695
outputVec[item_ct1.get_group(2)] = beta;
@@ -101,6 +100,7 @@ void reduceFinal(double *inputVec, double *result,
101100
size_t inputSize, const sycl::nd_item<3> &item_ct1,
102101
double *tmp) {
103102

103+
sycl::group<3> cta = item_ct1.get_group();
104104
size_t globaltid = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
105105
item_ct1.get_local_id(2);
106106

@@ -113,7 +113,7 @@ void reduceFinal(double *inputVec, double *result,
113113

114114
item_ct1.barrier();
115115

116-
sycl::sub_group tile_sg = item_ct1.get_sub_group();
116+
sycl::sub_group tile32 = item_ct1.get_sub_group();
117117

118118
// do reduction in shared mem
119119
if ((item_ct1.get_local_range(2) >= 512) &&
@@ -145,11 +145,11 @@ void reduceFinal(double *inputVec, double *result,
145145
if (item_ct1.get_local_range(2) >= 64) temp_sum +=
146146
tmp[item_ct1.get_local_linear_id() + 32];
147147
// Reduce final warp using shuffle
148-
for (int offset = tile_sg.get_local_linear_range() / 2;
148+
for (int offset =tile32.get_local_linear_range() / 2;
149149
offset > 0; offset /= 2) {
150150
temp_sum +=
151-
sycl::shift_group_left(tile_sg, temp_sum, offset);
152-
}
151+
sycl::shift_group_left(tile32, temp_sum, offset);
152+
}
153153
}
154154
// write result for this block to global mem
155155
if (item_ct1.get_local_linear_id() == 0) result[0] = temp_sum;
@@ -169,9 +169,8 @@ void myHostNodeCallback(void *data) {
169169
*result = 0.0; // reset the result
170170
}
171171

172-
void syclTaskFlowManual(float *inputVec_h, float *inputVec_d,
173-
double *outputVec_d, double *result_d, size_t inputSize,
174-
size_t numOfBlocks, sycl::queue q_ct1) {
172+
void syclTaskFlowManual(float *inputVec_h, float *inputVec_d, double *outputVec_d,
173+
double *result_d, size_t inputSize, size_t numOfBlocks, sycl::queue q_ct1) {
175174
tf::Taskflow tflow;
176175
tf::Executor exe;
177176

@@ -202,7 +201,9 @@ void syclTaskFlowManual(float *inputVec_h, float *inputVec_d,
202201
[[intel::reqd_sub_group_size(SUB_GRP_SIZE)]] {
203202
reduce(inputVec_d, outputVec_d, inputSize,
204203
numOfBlocks, item_ct1,
205-
tmp.get_pointer());
204+
205+
tmp.get_multi_ptr<sycl::access::decorated::no>()
206+
.get());
206207
});
207208
}).name("reduce_kernel");
208209

@@ -222,7 +223,8 @@ void syclTaskFlowManual(float *inputVec_h, float *inputVec_d,
222223
[[intel::reqd_sub_group_size(SUB_GRP_SIZE)]] {
223224
reduceFinal(outputVec_d, result_d,
224225
numOfBlocks, item_ct1,
225-
tmp.get_pointer());
226+
tmp.get_multi_ptr<sycl::access::decorated::no>()
227+
.get());
226228
});
227229
}).name("reduceFinal_kernel");
228230

@@ -259,7 +261,7 @@ void syclTaskFlowManual(float *inputVec_h, float *inputVec_d,
259261
"%zu\n",
260262
sf_Task + tf_Task);
261263

262-
printf("Cloned Graph Output.. \n");
264+
printf("Cloned Graph Output.. \n");
263265
tf::Taskflow tflow_clone(std::move(tflow));
264266
exe.run_n(tflow_clone, GRAPH_LAUNCH_ITERATIONS).wait();
265267
}
@@ -293,11 +295,11 @@ int main(int argc, char **argv) {
293295

294296
auto startTimer1 = Time::now();
295297
syclTaskFlowManual(inputVec_h, inputVec_d, outputVec_d, result_d, size,
296-
maxBlocks, q_ct1);
298+
maxBlocks, q_ct1);
297299
auto stopTimer1 = Time::now();
298300
auto Timer_duration1 =
299301
std::chrono::duration_cast<float_ms>(stopTimer1 - startTimer1).count();
300-
printf("Elapsed Time of SYCL TaskFlow Manual : %f (ms)\n", Timer_duration1);
302+
printf("Elapsed Time of SYCL Taskflow Manual : %f (ms)\n", Timer_duration1);
301303

302304
DPCT_CHECK_ERROR(sycl::free(inputVec_d, q_ct1));
303305
DPCT_CHECK_ERROR(sycl::free(outputVec_d, q_ct1));

0 commit comments

Comments
 (0)