
Commit 385f276

added cuda merge sort example

1 parent d1a7d68

File tree

4 files changed: +236 -3 lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -1,2 +1,3 @@
 .idea
-build
+build
+output.txt

CMakeLists.txt

Lines changed: 4 additions & 2 deletions
@@ -15,15 +15,17 @@ set(CUDA_ROOT /usr/local/cuda)
 
 # Include directories
 include_directories(${CUDA_ROOT}/include)
+include_directories(/home/sajith/lib/cuda-samples/Common)
 
 # Specify the source files
-file(GLOB SOURCES "*.cu")
 
 # Create the executable
-add_executable(streams.exe ${SOURCES})
+add_executable(streams.exe stream.cu stream.h)
+add_executable(mergesort.exe merge_sort.cu merge_sort.h)
 
 # Link CUDA libraries directly
 target_link_libraries(streams.exe ${CUDA_ROOT}/lib64/libcudart.so)
+target_link_libraries(mergesort.exe ${CUDA_ROOT}/lib64/libcudart.so)
 
 # Suppress deprecated GPU target warnings
 add_compile_options(-Wno-deprecated-gpu-targets)
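
The added include path points at the CUDA samples' Common/ directory so that merge_sort.h can pull in helper_cuda.h, which supplies the checkCudaErrors macro used throughout merge_sort.cu. If the samples are not installed at that hard-coded path, a minimal stand-in macro along these lines should be enough for this example (a sketch, not the samples' actual implementation):

// Hypothetical replacement for helper_cuda.h's checkCudaErrors:
// abort with a readable message if a CUDA runtime call fails.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define checkCudaErrors(call)                                         \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)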

merge_sort.cu

Lines changed: 211 additions & 0 deletions
@@ -0,0 +1,211 @@
#include "merge_sort.h"

#define min(a, b) ((a) < (b) ? (a) : (b))
// Based on https://github.com/kevin-albert/cuda-mergesort/blob/master/mergesort.cu

__host__ std::tuple<dim3, dim3, int> parseCommandLineArguments(int argc, char **argv)
{
    int numElements = 32;
    dim3 threadsPerBlock;
    dim3 blocksPerGrid;

    threadsPerBlock.x = 32;
    threadsPerBlock.y = 1;
    threadsPerBlock.z = 1;

    blocksPerGrid.x = 8;
    blocksPerGrid.y = 1;
    blocksPerGrid.z = 1;

    for (int i = 1; i < argc; i++)
    {
        if (argv[i][0] == '-' && argv[i][1] && !argv[i][2])
        {
            char arg = argv[i][1];
            unsigned int *toSet = 0;
            switch (arg)
            {
            case 'x':
                toSet = &threadsPerBlock.x;
                break;
            case 'y':
                toSet = &threadsPerBlock.y;
                break;
            case 'z':
                toSet = &threadsPerBlock.z;
                break;
            case 'X':
                toSet = &blocksPerGrid.x;
                break;
            case 'Y':
                toSet = &blocksPerGrid.y;
                break;
            case 'Z':
                toSet = &blocksPerGrid.z;
                break;
            case 'n':
                i++;
                numElements = stoi(argv[i]);
                break;
            }
            if (toSet)
            {
                i++;
                *toSet = (unsigned int) strtol(argv[i], 0, 10);
            }
        }
    }
    return {threadsPerBlock, blocksPerGrid, numElements};
}

__host__ long *generateRandomLongArray(int numElements)
{
    // Generate a random array of long integers of size numElements
    long* randomLongs = (long*)malloc(numElements * sizeof(long));
    for (int i = 0; i < numElements; i++) {
        randomLongs[i] = rand() % 100;
    }
    return randomLongs;
}

__host__ void printHostMemory(long *host_mem, int num_elments)
{
    // Output results
    for (int i = 0; i < num_elments; i++)
    {
        printf("%ld ", host_mem[i]);
    }
    printf("\n");
}

__host__ int main(int argc, char **argv)
{
    auto [threadsPerBlock, blocksPerGrid, numElements] = parseCommandLineArguments(argc, argv);

    long *data = generateRandomLongArray(numElements);

    printf("Unsorted data: ");
    printHostMemory(data, numElements);

    mergesort(data, numElements, threadsPerBlock, blocksPerGrid);

    printf("Sorted data: ");
    printHostMemory(data, numElements);
}

__host__ void mergesort(long *data, long size, dim3 threadsPerBlock, dim3 blocksPerGrid)
{
    long *D_data;
    long *D_swp;
    dim3 *D_threads;
    dim3 *D_blocks;

    tm();
    cudaMalloc(&D_data, size * sizeof(long));
    cudaMalloc(&D_swp, size * sizeof(long));

    cudaMalloc(&D_threads, sizeof(dim3));
    cudaMalloc(&D_blocks, sizeof(dim3));

    checkCudaErrors(cudaMemcpy(D_data, data, size * sizeof(long), cudaMemcpyHostToDevice));

    checkCudaErrors(cudaMemcpy(D_threads, &threadsPerBlock, sizeof(dim3), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(D_blocks, &blocksPerGrid, sizeof(dim3), cudaMemcpyHostToDevice));

    long *A = D_data;
    long *B = D_swp;

    long nThreads = threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z *
                    blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;

    for (int width = 2; width < (size << 1); width <<= 1)
    {
        long slices = size / ((nThreads) * width) + 1;

        tm();
        gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, width, slices, D_threads, D_blocks);

        // Switch the input / output arrays instead of copying them around
        A = A == D_data ? D_swp : D_data;
        B = B == D_data ? D_swp : D_data;
    }

    checkCudaErrors(cudaMemcpy(data, A, size * sizeof(long), cudaMemcpyDeviceToHost));

    // Print the kernel execution time to stdout
    std::cout << "call mergesort kernel: " << tm() << " microseconds\n";

    // Free the GPU memory
    checkCudaErrors(cudaFree(A));
    checkCudaErrors(cudaFree(B));
}

// GPU helper function
// calculate the id of the current thread
__device__ unsigned int getIdx(dim3 *threads, dim3 *blocks)
{
    int x;
    return threadIdx.x +
           threadIdx.y * (x = threads->x) +
           threadIdx.z * (x *= threads->y) +
           blockIdx.x * (x *= threads->z) +
           blockIdx.y * (x *= blocks->z) +
           blockIdx.z * (x *= blocks->y);
}

//
// Perform a full mergesort on our section of the data.
//
__global__ void gpu_mergesort(long *source, long *dest, long size, long width, long slices, dim3 *threads, dim3 *blocks)
{
    unsigned int idx = getIdx(threads, blocks);
    long start = width * idx * slices,
         middle,
         end;

    for (long slice = 0; slice < slices; slice++)
    {
        if (start >= size)
            break;

        middle = min(start + (width >> 1), size);
        end = min(start + width, size);
        gpu_bottomUpMerge(source, dest, start, middle, end);
        start += width;
    }
}

//
// Merge the two sorted runs [start, middle) and [middle, end);
// called by gpu_mergesort() once per slice.
//
__device__ void gpu_bottomUpMerge(long *source, long *dest, long start, long middle, long end)
{
    long i = start;
    long j = middle;
    for (long k = start; k < end; k++) {
        if (i < middle && (j >= end || source[i] < source[j])) {
            dest[k] = source[i];
            i++;
        } else {
            dest[k] = source[j];
            j++;
        }
    }
}

timeval tStart;
int tm() {
    timeval tEnd;
    gettimeofday(&tEnd, 0);
    int t = (tEnd.tv_sec - tStart.tv_sec) * 1000000 + tEnd.tv_usec - tStart.tv_usec;
    tStart = tEnd;
    return t;
}
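
Since the kernel mirrors a textbook bottom-up merge sort, a quick way to sanity-check the device output is to run the same width-doubling merge on the host and compare the two arrays. A minimal CPU reference, in the same C style as the file (illustrative only, not part of the commit):

#include <cstdlib>

// Sequential bottom-up merge sort with the same structure as gpu_mergesort:
// widths double every pass, and each [start, middle, end) range is merged
// exactly the way gpu_bottomUpMerge does on the device.
static void cpu_mergesort(long *data, long size)
{
    long *buf = (long *)malloc(size * sizeof(long));
    long *src = data;
    long *dst = buf;
    for (long width = 2; width < (size << 1); width <<= 1) {
        for (long start = 0; start < size; start += width) {
            long half = start + (width >> 1);
            long middle = half < size ? half : size;
            long full = start + width;
            long end = full < size ? full : size;
            long i = start, j = middle;
            for (long k = start; k < end; k++)
                dst[k] = (i < middle && (j >= end || src[i] < src[j])) ? src[i++] : src[j++];
        }
        long *tmp = src; src = dst; dst = tmp;   // reuse both buffers instead of copying
    }
    if (src != data)                             // the sorted result may have ended up in buf
        for (long k = 0; k < size; k++)
            data[k] = src[k];
    free(buf);
}

Running cpu_mergesort on a copy of the unsorted input and comparing it element by element with the array returned by mergesort() would confirm the kernel sorted correctly.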

merge_sort.h

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
#include <iostream>
#include <helper_cuda.h>
#include <sys/time.h>
#include <tuple>
#include <cstdio>
#include <cstdlib>   // malloc, free, rand, strtol
#include <string>    // std::stoi

using namespace std;

__host__ void mergesort(long *data, long size, dim3 threadsPerBlock, dim3 blocksPerGrid);
__host__ long *generateRandomLongArray(int numElements);
__host__ void printHostMemory(long *host_mem, int num_elments);
__host__ std::tuple<dim3, dim3, int> parseCommandLineArguments(int argc, char **argv);

__global__ void gpu_mergesort(long* source, long* dest, long size, long width, long slices, dim3* threads, dim3* blocks);
__device__ void gpu_bottomUpMerge(long* source, long* dest, long start, long middle, long end);

int tm();
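
Because all of the declarations live in this header, another .cu file in the project could drive the sorter directly. A hypothetical caller, compiled with nvcc and linked against merge_sort.cu (the function name and launch dimensions are illustrative, not part of the commit):

#include <cstdlib>
#include "merge_sort.h"

// Sort 1024 random longs with a 64-thread x 4-block launch, then print them.
void sortExample()
{
    int numElements = 1024;
    long *values = generateRandomLongArray(numElements);

    dim3 threadsPerBlock(64, 1, 1);
    dim3 blocksPerGrid(4, 1, 1);

    mergesort(values, numElements, threadsPerBlock, blocksPerGrid);
    printHostMemory(values, numElements);

    free(values);
}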
