22#include < iostream>
33#include < memory>
44#include < string>
5+ #include < ctime>
56#include < vector>
67#include < cmath>
78#include < cuda_runtime.h>
89
9- __global__ void findNearestNeighborCosine (float *points, float *queries, float *max_cosine, int *max_index, int n, int num_queries, int dimensions, float target_similarity ) {
10+ __global__ void findNearestNeighborCosine (float *points, float *queries, float *max_cosine, int n, int num_queries, int dimensions) {
1011 extern __shared__ char shared[];
1112 float *s_cosine = (float *)shared;
1213 int *s_index = (int *)(shared + blockDim .x * sizeof (float ));
@@ -33,8 +34,6 @@ __global__ void findNearestNeighborCosine(float *points, float *queries, float *
3334
3435 s_cosine[threadIdx .x ] = cosine_similarity;
3536 s_index[threadIdx .x ] = tid;
36- if (cosine_similarity > target_similarity)
37- max_index[qid] = tid;
3837 __syncthreads ();
3938 }
4039}
@@ -55,11 +54,13 @@ std::vector<std::vector<float>> read_matrix(FILE* fin, int row, int col) {
5554
5655int main (int argc, char * argv[]) {
5756 FILE* fin = fopen (argv[1 ], " r" );
58- FILE* fout = fopen (argv[2 ], " w" );
5957
6058 int n = 0 , d = 0 , m = 0 ;
61- float target_similarity = 0 ;
62- fscanf (fin, " %d%d%d%f" , &d, &n, &m, &target_similarity);
59+ fscanf (fin, " %d%d%d" , &d, &n, &m);
60+
61+ double total_cosine_GPU_time = 0.0 ;
62+
63+ clock_t start_time, end_time;
6364
6465 std::vector<std::vector<float >> base = read_matrix (fin, n, d);
6566 std::vector<std::vector<float >> query = read_matrix (fin, m, d);
@@ -73,13 +74,12 @@ int main(int argc, char* argv[]) {
7374
7475
7576 float * d_base, * d_query, *d_max_cosine;
76- int *d_max_index;
7777
78-
78+ start_time = clock ();
79+
7980 cudaMalloc (&d_base, n * d * sizeof (float ));
8081 cudaMalloc (&d_query, m * d * sizeof (float ));
8182 cudaMalloc (&d_max_cosine, m * sizeof (float ));
82- cudaMalloc (&d_max_index, m * sizeof (int ));
8383
8484
8585 float *max_cosine_host = new float [m];
@@ -98,23 +98,20 @@ int main(int argc, char* argv[]) {
9898
9999
100100 int sharedMemSize = threadsPerBlock.x * (sizeof (float ) + sizeof (int ));
101- findNearestNeighborCosine<<<blocksPerGrid, threadsPerBlock, sharedMemSize>>> (d_base, d_query, d_max_cosine, d_max_index, n, m, d, target_similarity );
101+ findNearestNeighborCosine<<<blocksPerGrid, threadsPerBlock, sharedMemSize>>> (d_base, d_query, d_max_cosine, n, m, d);
102102
103103
104- int *max_index_host = new int [m];
105104 cudaMemcpy (max_cosine_host, d_max_cosine, m * sizeof (float ), cudaMemcpyDeviceToHost);
106- cudaMemcpy (max_index_host, d_max_index, m * sizeof (int ), cudaMemcpyDeviceToHost);
107-
108-
109- for (int i = 0 ; i < m; ++i) {
110- fprintf (fout, " %d\n " , max_index_host[i]);
111- }
112-
113105
114106 cudaFree (d_base);
115107 cudaFree (d_query);
116108 cudaFree (d_max_cosine);
117- cudaFree (d_max_index);
109+
110+ end_time = clock (); // Record the ending time
111+
112+ total_cosine_GPU_time = static_cast <double >(end_time - start_time) / CLOCKS_PER_SEC;
113+
114+ std::cout << " Total cosine similarity with GPU: " << total_cosine_GPU_time << " seconds." << std::endl;
118115
119116 return 0 ;
120117}
0 commit comments