#include "merge_sort.h"

// Standard headers used below (these may already be pulled in via merge_sort.h):
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <tuple>
#include <sys/time.h>

// Parenthesize macro arguments to avoid operator-precedence surprises.
#define min(a, b) ((a) < (b) ? (a) : (b))
// Based on https://github.com/kevin-albert/cuda-mergesort/blob/master/mergesort.cu

__host__ std::tuple<dim3, dim3, int> parseCommandLineArguments(int argc, char **argv)
{
    int numElements = 32;
    dim3 threadsPerBlock;
    dim3 blocksPerGrid;

    threadsPerBlock.x = 32;
    threadsPerBlock.y = 1;
    threadsPerBlock.z = 1;

    blocksPerGrid.x = 8;
    blocksPerGrid.y = 1;
    blocksPerGrid.z = 1;

    for (int i = 1; i < argc; i++)
    {
        if (argv[i][0] == '-' && argv[i][1] && !argv[i][2])
        {
            char arg = argv[i][1];
            unsigned int *toSet = 0;
            switch (arg)
            {
            case 'x':
                toSet = &threadsPerBlock.x;
                break;
            case 'y':
                toSet = &threadsPerBlock.y;
                break;
            case 'z':
                toSet = &threadsPerBlock.z;
                break;
            case 'X':
                toSet = &blocksPerGrid.x;
                break;
            case 'Y':
                toSet = &blocksPerGrid.y;
                break;
            case 'Z':
                toSet = &blocksPerGrid.z;
                break;
            case 'n':
                i++;
                numElements = atoi(argv[i]);
                break;
            }
            if (toSet)
            {
                i++;
                *toSet = (unsigned int) strtol(argv[i], 0, 10);
            }
        }
    }
    return {threadsPerBlock, blocksPerGrid, numElements};
}
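
// Usage sketch (the binary name "mergesort" is illustrative, not from the source):
//   ./mergesort -x 64 -X 16 -n 1024
// -x/-y/-z set threadsPerBlock.{x,y,z}, -X/-Y/-Z set blocksPerGrid.{x,y,z},
// and -n sets the number of elements to sort; unspecified values keep the
// defaults above.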

__host__ long *generateRandomLongArray(int numElements)
{
    // Generate a random array of long integers of size numElements.
    long *randomLongs = (long *) malloc(numElements * sizeof(long));
    for (int i = 0; i < numElements; i++) {
        randomLongs[i] = rand() % 100;
    }
    return randomLongs;
}

__host__ void printHostMemory(long *host_mem, int num_elements)
{
    // Output results; %ld matches the long element type.
    for (int i = 0; i < num_elements; i++)
    {
        printf("%ld ", host_mem[i]);
    }
    printf("\n");
}

__host__ int main(int argc, char **argv)
{
    auto [threadsPerBlock, blocksPerGrid, numElements] = parseCommandLineArguments(argc, argv);

    long *data = generateRandomLongArray(numElements);

    printf("Unsorted data: ");
    printHostMemory(data, numElements);

    mergesort(data, numElements, threadsPerBlock, blocksPerGrid);

    printf("Sorted data: ");
    printHostMemory(data, numElements);

    free(data);
    return 0;
}


__host__ void mergesort(long *data, long size, dim3 threadsPerBlock, dim3 blocksPerGrid)
{
    long *D_data;
    long *D_swp;
    dim3 *D_threads;
    dim3 *D_blocks;

    // Allocate the two device buffers used to ping-pong between passes.
    cudaMalloc(&D_data, size * sizeof(long));
    cudaMalloc(&D_swp, size * sizeof(long));

    cudaMalloc(&D_threads, sizeof(dim3));
    cudaMalloc(&D_blocks, sizeof(dim3));

    checkCudaErrors(cudaMemcpy(D_data, data, size * sizeof(long), cudaMemcpyHostToDevice));

    checkCudaErrors(cudaMemcpy(D_threads, &threadsPerBlock, sizeof(dim3), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(D_blocks, &blocksPerGrid, sizeof(dim3), cudaMemcpyHostToDevice));

    long *A = D_data;
    long *B = D_swp;

    long nThreads = threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z *
                    blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;

    tm(); // reset the timer just before the kernel passes

    for (long width = 2; width < (size << 1); width <<= 1)
    {
        long slices = size / (nThreads * width) + 1;

        gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, width, slices, D_threads, D_blocks);

        // Switch the input/output arrays instead of copying them around.
        A = A == D_data ? D_swp : D_data;
        B = B == D_data ? D_swp : D_data;
    }

    // This copy back synchronizes with all queued kernel launches.
    checkCudaErrors(cudaMemcpy(data, A, size * sizeof(long), cudaMemcpyDeviceToHost));

    // Print kernel execution time (all passes through the final copy).
    std::cout << "call mergesort kernel: " << tm() << " microseconds\n";

    // Free the GPU memory (A and B together cover D_data and D_swp).
    checkCudaErrors(cudaFree(A));
    checkCudaErrors(cudaFree(B));
    checkCudaErrors(cudaFree(D_threads));
    checkCudaErrors(cudaFree(D_blocks));
}
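
// Pass structure for size = 8 (illustrative trace):
//   width = 2: merges runs of 1 element into sorted pairs
//   width = 4: merges runs of 2 into sorted groups of 4
//   width = 8: merges runs of 4 into the fully sorted array
// Each pass ping-pongs between D_data and D_swp, so only the final
// output buffer A is copied back to the host.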

// GPU helper function: calculate the unique linear id of the current thread
// across the whole grid.
__device__ unsigned int getIdx(dim3 *threads, dim3 *blocks)
{
    int x;
    return threadIdx.x +
           threadIdx.y * (x = threads->x) +
           threadIdx.z * (x *= threads->y) +
           blockIdx.x * (x *= threads->z) +
           blockIdx.y * (x *= blocks->z) +
           blockIdx.z * (x *= blocks->y);
}
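
// Equivalent closed form, writing Tx/Ty/Tz for threads->x/y/z and
// Bz/By for blocks->z/y:
//   idx = threadIdx.x
//       + threadIdx.y * Tx
//       + threadIdx.z * Tx*Ty
//       + blockIdx.x  * Tx*Ty*Tz
//       + blockIdx.y  * Tx*Ty*Tz*Bz
//       + blockIdx.z  * Tx*Ty*Tz*Bz*By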

//
// Perform a full mergesort on our section of the data.
//
__global__ void gpu_mergesort(long *source, long *dest, long size, long width, long slices, dim3 *threads, dim3 *blocks)
{
    unsigned int idx = getIdx(threads, blocks);
    long start = width * idx * slices,
         middle,
         end;

    for (long slice = 0; slice < slices; slice++)
    {
        if (start >= size)
            break;

        // Merge the two sorted runs [start, middle) and [middle, end).
        middle = min(start + (width >> 1), size);
        end = min(start + width, size);
        gpu_bottomUpMerge(source, dest, start, middle, end);
        start += width;
    }
}

//
// Bottom-up merge of one slice; called by gpu_mergesort() for each slice.
// Merges the sorted runs [start, middle) and [middle, end) from source into dest.
//
__device__ void gpu_bottomUpMerge(long *source, long *dest, long start, long middle, long end)
{
    long i = start;
    long j = middle;
    for (long k = start; k < end; k++) {
        // Take from the left run while it has elements and its head is smaller.
        if (i < middle && (j >= end || source[i] < source[j])) {
            dest[k] = source[i];
            i++;
        } else {
            dest[k] = source[j];
            j++;
        }
    }
}
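
// Hypothetical host-side sanity check (not part of the original source):
// main() could call this after mergesort() to verify the result.
__host__ bool isSorted(const long *data, int numElements)
{
    for (int i = 1; i < numElements; i++)
        if (data[i - 1] > data[i])
            return false;
    return true;
}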

// Microsecond timer: returns the time elapsed since the previous call.
timeval tStart;
int tm() {
    timeval tEnd;
    gettimeofday(&tEnd, 0);
    int t = (tEnd.tv_sec - tStart.tv_sec) * 1000000 + tEnd.tv_usec - tStart.tv_usec;
    tStart = tEnd;
    return t;
}
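
// Timing pattern used by mergesort() above:
//   tm();          // reset the timer
//   ...work...
//   int us = tm(); // microseconds elapsed since the reset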