1+ #include < cstring>
2+ #include < iostream>
3+ #include < memory>
4+ #include < string>
5+ #include < vector>
6+ #include < cmath>
7+ #include < cuda_runtime.h>
8+
9+ __global__ void findNearestNeighborCosine (float *points, float *queries, float *max_cosine, int *max_index, int n, int num_queries, int dimensions, float target_similarity) {
10+ extern __shared__ char shared[];
11+ float *s_cosine = (float *)shared;
12+ int *s_index = (int *)(shared + blockDim .x * sizeof (float ));
13+
14+ int tid = threadIdx .x + blockIdx .x * blockDim .x ;
15+ int qid = blockIdx .y ;
16+
17+ if (tid < n && qid < num_queries) {
18+ float dot_product = 0 , query_magnitude = 0 , point_magnitude = 0 ;
19+ for (int d = 0 ; d < dimensions; ++d) {
20+ int idx = tid * dimensions + d;
21+ int q_idx = qid * dimensions + d;
22+ dot_product += queries[q_idx] * points[idx];
23+ query_magnitude += queries[q_idx] * queries[q_idx];
24+ point_magnitude += points[idx] * points[idx];
25+ }
26+ query_magnitude = sqrt (query_magnitude);
27+ point_magnitude = sqrt (point_magnitude);
28+
29+ float cosine_similarity = 0 ;
30+ if (query_magnitude > 0 && point_magnitude > 0 ) {
31+ cosine_similarity = dot_product / (query_magnitude * point_magnitude);
32+ }
33+
34+ s_cosine[threadIdx .x ] = cosine_similarity;
35+ s_index[threadIdx .x ] = tid;
36+ if (cosine_similarity > target_similarity)
37+ max_index[qid] = tid;
38+ __syncthreads ();
39+ }
40+ }
41+
42+
43+ std::vector<std::vector<float >> read_matrix (FILE* fin, int row, int col) {
44+ std::vector<std::vector<float >> ret;
45+ for (int i = 0 ; i < row; ++i) {
46+ std::vector<float > curr;
47+ float tmp = 0 ;
48+ for (int j = 0 ; j < col; ++j) {
49+ fscanf (fin, " %f" , &tmp);
50+ curr.push_back (tmp);
51+ }
52+ ret.push_back (curr);
53+ }
54+ return ret;
55+ }
56+
57+ int main (int argc, char * argv[]) {
58+ FILE* fin = fopen (argv[1 ], " r" );
59+ FILE* fout = fopen (argv[2 ], " w" );
60+
61+ int n = 0 , d = 0 , m = 0 ;
62+ float target_similarity = 0.9 ;
63+ fscanf (fin, " %d%d%d" , &d, &n, &m);
64+
65+ std::vector<std::vector<float >> base = read_matrix (fin, n, d);
66+ std::vector<std::vector<float >> query = read_matrix (fin, m, d);
67+
68+ float * flat_base = new float [n * d];
69+ float * flat_query = new float [m * d];
70+ for (int i = 0 ; i < n; ++i)
71+ memcpy (flat_base + i * d, base[i].data (), d * sizeof (float ));
72+ for (int i = 0 ; i < m; ++i)
73+ memcpy (flat_query + i * d, query[i].data (), d * sizeof (float ));
74+
75+ float * d_base, * d_query, *d_max_cosine;
76+ int *d_max_index;
77+
78+ cudaMalloc (&d_base, n * d * sizeof (float ));
79+ cudaMalloc (&d_query, m * d * sizeof (float ));
80+ cudaMalloc (&d_max_cosine, m * sizeof (float ));
81+ cudaMalloc (&d_max_index, m * sizeof (int ));
82+
83+ float *max_cosine_host = new float [m];
84+ for (int i = 0 ; i < m; i++) {
85+ max_cosine_host[i] = -1 .0f ;
86+ }
87+
88+ cudaMemcpy (d_base, flat_base, n * d * sizeof (float ), cudaMemcpyHostToDevice);
89+ cudaMemcpy (d_query, flat_query, m * d * sizeof (float ), cudaMemcpyHostToDevice);
90+ cudaMemcpy (d_max_cosine, max_cosine_host, m * sizeof (float ), cudaMemcpyHostToDevice);
91+
92+ dim3 threadsPerBlock (256 );
93+ dim3 blocksPerGrid ((n + threadsPerBlock.x - 1 ) / threadsPerBlock.x , m);
94+
95+ int sharedMemSize = threadsPerBlock.x * (sizeof (float ) + sizeof (int ));
96+ findNearestNeighborCosine<<<blocksPerGrid, threadsPerBlock, sharedMemSize>>> (d_base, d_query, d_max_cosine, d_max_index, n, m, d, target_similarity);
97+
98+ int *max_index_host = new int [m];
99+ cudaMemcpy (max_cosine_host, d_max_cosine, m * sizeof (float ), cudaMemcpyDeviceToHost);
100+ cudaMemcpy (max_index_host, d_max_index, m * sizeof (int ), cudaMemcpyDeviceToHost);
101+
102+ for (int i = 0 ; i < m; ++i) {
103+ fprintf (fout, " %d\n " , max_index_host[i]);
104+ }
105+
106+ cudaFree (d_base);
107+ cudaFree (d_query);
108+ cudaFree (d_max_cosine);
109+ cudaFree (d_max_index);
110+
111+ return 0 ;
112+ }
0 commit comments