
Commit 385f276

added cuda merge sort example

1 parent d1a7d68

File tree

4 files changed: +236 -3 lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -1,2 +1,3 @@
 .idea
-build
+build
+output.txt

CMakeLists.txt

Lines changed: 4 additions & 2 deletions
@@ -15,15 +15,17 @@ set(CUDA_ROOT /usr/local/cuda)
 
 # Include directories
 include_directories(${CUDA_ROOT}/include)
+include_directories(/home/sajith/lib/cuda-samples/Common)
 
 # Specify the source files
-file(GLOB SOURCES "*.cu")
 
 # Create the executable
-add_executable(streams.exe ${SOURCES})
+add_executable(streams.exe stream.cu stream.h)
+add_executable(mergesort.exe merge_sort.cu merge_sort.h)
 
 # Link CUDA libraries directly
 target_link_libraries(streams.exe ${CUDA_ROOT}/lib64/libcudart.so)
+target_link_libraries(mergesort.exe ${CUDA_ROOT}/lib64/libcudart.so)
 
 # Suppress deprecated GPU target warnings
 add_compile_options(-Wno-deprecated-gpu-targets)
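
The added include path points at the CUDA samples' Common/ directory so that merge_sort.h can pull in helper_cuda.h, which supplies the checkCudaErrors macro used throughout merge_sort.cu. If the samples are not installed at that hard-coded path, a minimal stand-in macro along these lines should be enough for this example (a sketch, not the samples' actual implementation):

// Hypothetical replacement for helper_cuda.h's checkCudaErrors:
// abort with a readable message if a CUDA runtime call fails.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define checkCudaErrors(call)                                         \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)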

merge_sort.cu

Lines changed: 211 additions & 0 deletions
@@ -0,0 +1,211 @@
#include "merge_sort.h"

#define min(a, b) ((a) < (b) ? (a) : (b))
// Based on https://github.com/kevin-albert/cuda-mergesort/blob/master/mergesort.cu

__host__ std::tuple<dim3, dim3, int> parseCommandLineArguments(int argc, char **argv)
{
    int numElements = 32;
    dim3 threadsPerBlock;
    dim3 blocksPerGrid;

    threadsPerBlock.x = 32;
    threadsPerBlock.y = 1;
    threadsPerBlock.z = 1;

    blocksPerGrid.x = 8;
    blocksPerGrid.y = 1;
    blocksPerGrid.z = 1;

    for (int i = 1; i < argc; i++)
    {
        if (argv[i][0] == '-' && argv[i][1] && !argv[i][2])
        {
            char arg = argv[i][1];
            unsigned int *toSet = 0;
            switch (arg)
            {
            case 'x':
                toSet = &threadsPerBlock.x;
                break;
            case 'y':
                toSet = &threadsPerBlock.y;
                break;
            case 'z':
                toSet = &threadsPerBlock.z;
                break;
            case 'X':
                toSet = &blocksPerGrid.x;
                break;
            case 'Y':
                toSet = &blocksPerGrid.y;
                break;
            case 'Z':
                toSet = &blocksPerGrid.z;
                break;
            case 'n':
                i++;
                numElements = stoi(argv[i]);
                break;
            }
            if (toSet)
            {
                i++;
                *toSet = (unsigned int) strtol(argv[i], 0, 10);
            }
        }
    }
    return {threadsPerBlock, blocksPerGrid, numElements};
}

__host__ long *generateRandomLongArray(int numElements)
{
    // Generate a random array of long integers of size numElements
    long* randomLongs = (long*)malloc(numElements * sizeof(long));
    for (int i = 0; i < numElements; i++) {
        randomLongs[i] = rand() % 100;
    }
    return randomLongs;
}

__host__ void printHostMemory(long *host_mem, int num_elments)
{
    // Output results
    for (int i = 0; i < num_elments; i++)
    {
        printf("%ld ", host_mem[i]);
    }
    printf("\n");
}

__host__ int main(int argc, char **argv)
{
    auto [threadsPerBlock, blocksPerGrid, numElements] = parseCommandLineArguments(argc, argv);

    long *data = generateRandomLongArray(numElements);

    printf("Unsorted data: ");
    printHostMemory(data, numElements);

    mergesort(data, numElements, threadsPerBlock, blocksPerGrid);

    printf("Sorted data: ");
    printHostMemory(data, numElements);
}

__host__ void mergesort(long *data, long size, dim3 threadsPerBlock, dim3 blocksPerGrid)
{
    long *D_data;
    long *D_swp;
    dim3 *D_threads;
    dim3 *D_blocks;

    tm();
    cudaMalloc(&D_data, size * sizeof(long));
    cudaMalloc(&D_swp, size * sizeof(long));

    cudaMalloc(&D_threads, sizeof(dim3));
    cudaMalloc(&D_blocks, sizeof(dim3));

    checkCudaErrors(cudaMemcpy(D_data, data, size * sizeof(long), cudaMemcpyHostToDevice));

    checkCudaErrors(cudaMemcpy(D_threads, &threadsPerBlock, sizeof(dim3), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(D_blocks, &blocksPerGrid, sizeof(dim3), cudaMemcpyHostToDevice));

    long *A = D_data;
    long *B = D_swp;

    long nThreads = threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z *
                    blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;

    for (int width = 2; width < (size << 1); width <<= 1)
    {
        long slices = size / ((nThreads) * width) + 1;

        tm();
        gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, width, slices, D_threads, D_blocks);

        // Switch the input / output arrays instead of copying them around
        A = A == D_data ? D_swp : D_data;
        B = B == D_data ? D_swp : D_data;
    }

    checkCudaErrors(cudaMemcpy(data, A, size * sizeof(long), cudaMemcpyDeviceToHost));

    // Print the kernel execution time to stdout
    std::cout << "call mergesort kernel: " << tm() << " microseconds\n";

    // Free the GPU memory
    checkCudaErrors(cudaFree(A));
    checkCudaErrors(cudaFree(B));
}

// GPU helper function
// calculate the id of the current thread
__device__ unsigned int getIdx(dim3 *threads, dim3 *blocks)
{
    int x;
    return threadIdx.x +
           threadIdx.y * (x = threads->x) +
           threadIdx.z * (x *= threads->y) +
           blockIdx.x * (x *= threads->z) +
           blockIdx.y * (x *= blocks->z) +
           blockIdx.z * (x *= blocks->y);
}

//
// Perform a full mergesort on our section of the data.
//
__global__ void gpu_mergesort(long *source, long *dest, long size, long width, long slices, dim3 *threads, dim3 *blocks)
{
    unsigned int idx = getIdx(threads, blocks);
    long start = width * idx * slices,
         middle,
         end;

    for (long slice = 0; slice < slices; slice++)
    {
        if (start >= size)
            break;

        middle = min(start + (width >> 1), size);
        end = min(start + width, size);
        gpu_bottomUpMerge(source, dest, start, middle, end);
        start += width;
    }
}

//
// Merge the two sorted runs [start, middle) and [middle, end);
// called by gpu_mergesort() once per slice.
//
__device__ void gpu_bottomUpMerge(long *source, long *dest, long start, long middle, long end)
{
    long i = start;
    long j = middle;
    for (long k = start; k < end; k++) {
        if (i < middle && (j >= end || source[i] < source[j])) {
            dest[k] = source[i];
            i++;
        } else {
            dest[k] = source[j];
            j++;
        }
    }
}

timeval tStart;
int tm() {
    timeval tEnd;
    gettimeofday(&tEnd, 0);
    int t = (tEnd.tv_sec - tStart.tv_sec) * 1000000 + tEnd.tv_usec - tStart.tv_usec;
    tStart = tEnd;
    return t;
}
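
Since the kernel mirrors a textbook bottom-up merge sort, a quick way to sanity-check the device output is to run the same width-doubling merge on the host and compare the two arrays. A minimal CPU reference, in the same C style as the file (illustrative only, not part of the commit):

#include <cstdlib>

// Sequential bottom-up merge sort with the same structure as gpu_mergesort:
// widths double every pass, and each [start, middle, end) range is merged
// exactly the way gpu_bottomUpMerge does on the device.
static void cpu_mergesort(long *data, long size)
{
    long *buf = (long *)malloc(size * sizeof(long));
    long *src = data;
    long *dst = buf;
    for (long width = 2; width < (size << 1); width <<= 1) {
        for (long start = 0; start < size; start += width) {
            long half = start + (width >> 1);
            long middle = half < size ? half : size;
            long full = start + width;
            long end = full < size ? full : size;
            long i = start, j = middle;
            for (long k = start; k < end; k++)
                dst[k] = (i < middle && (j >= end || src[i] < src[j])) ? src[i++] : src[j++];
        }
        long *tmp = src; src = dst; dst = tmp;   // reuse both buffers instead of copying
    }
    if (src != data)                             // the sorted result may have ended up in buf
        for (long k = 0; k < size; k++)
            data[k] = src[k];
    free(buf);
}

Running cpu_mergesort on a copy of the unsorted input and comparing it element by element with the array returned by mergesort() would confirm the kernel sorted correctly.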

merge_sort.h

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
#include <iostream>
#include <helper_cuda.h>
#include <sys/time.h>
#include <tuple>
#include <cstdio>
#include <cstdlib>   // malloc, free, rand, strtol
#include <string>    // std::stoi

using namespace std;

__host__ void mergesort(long *data, long size, dim3 threadsPerBlock, dim3 blocksPerGrid);
__host__ long *generateRandomLongArray(int numElements);
__host__ void printHostMemory(long *host_mem, int num_elments);
__host__ std::tuple<dim3, dim3, int> parseCommandLineArguments(int argc, char **argv);

__global__ void gpu_mergesort(long* source, long* dest, long size, long width, long slices, dim3* threads, dim3* blocks);
__device__ void gpu_bottomUpMerge(long* source, long* dest, long start, long middle, long end);

int tm();
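
Because all of the declarations live in this header, another .cu file in the project could drive the sorter directly. A hypothetical caller, compiled with nvcc and linked against merge_sort.cu (the function name and launch dimensions are illustrative, not part of the commit):

#include <cstdlib>
#include "merge_sort.h"

// Sort 1024 random longs with a 64-thread x 4-block launch, then print them.
void sortExample()
{
    int numElements = 1024;
    long *values = generateRandomLongArray(numElements);

    dim3 threadsPerBlock(64, 1, 1);
    dim3 blocksPerGrid(4, 1, 1);

    mergesort(values, numElements, threadsPerBlock, blocksPerGrid);
    printHostMemory(values, numElements);

    free(values);
}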
