Skip to content

Commit c0923e1

Browse files
committed
Computations and Documentations
1 parent dbbb30e commit c0923e1

File tree

2 files changed

+180
-3
lines changed

2 files changed

+180
-3
lines changed

GPU/ann.cu

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,60 @@
66
#include <cmath>
77
#include <cuda_runtime.h>
88

9+
/**
 * Brute-force top-k cosine-similarity search: for each query vector, ranks
 * every base vector by cosine similarity and writes the indices of the k best
 * matches (most similar first) to output.
 *
 * Launch layout: one thread per query along x; only blockIdx.x, blockDim.x and
 * threadIdx.x are read, so a grid with gridDim.y > 1 just repeats the same
 * work in every y slice. No shared memory is used.
 *
 * @param base    n * d floats; vector i occupies base[i*d .. i*d + d - 1]
 * @param query   m * d floats, same layout as base
 * @param output  m * k ints; query q's results are output[q*k .. q*k + k - 1].
 *                Slots that could not be filled (k > n, or scratch allocation
 *                failed) are set to -1.
 * @param n       number of base vectors
 * @param m       number of query vectors
 * @param d       dimensionality of every vector
 * @param target_similarity  not used by this kernel; kept so the signature
 *                stays unchanged for existing callers
 * @param k       number of neighbours to report per query
 *
 * All three pointers are dereferenced on the device, so they must be
 * device-accessible. The per-thread scratch arrays come from the device heap
 * (in-kernel malloc) and are released before the thread exits.
 */
__global__ void cosine_similarity_kernel(float *base, float *query, int *output, int n, int m, int d, float target_similarity, int k) {
    (void)target_similarity;

    const int queryIdx = blockIdx.x * blockDim.x + threadIdx.x;
    if (queryIdx >= m || k <= 0) {
        return;
    }

    // size_t arithmetic keeps row offsets from overflowing int on large inputs.
    int *result = output + (size_t)queryIdx * k;
    const float *q = query + (size_t)queryIdx * d;

    float *topKSimilarities = (float*)malloc((size_t)k * sizeof(float));
    int *topKIndices = (int*)malloc((size_t)k * sizeof(int));
    if (topKSimilarities == NULL || topKIndices == NULL) {
        // Device heap exhausted: report "no result" instead of dereferencing NULL.
        free(topKSimilarities);
        free(topKIndices);
        for (int i = 0; i < k; ++i) {
            result[i] = -1;
        }
        return;
    }

    // Cosine similarity lies in [-1, 1]; -2 is strictly below every real value,
    // so even an exactly antiparallel vector (similarity -1) can be ranked.
    for (int i = 0; i < k; ++i) {
        topKSimilarities[i] = -2.0f;
        topKIndices[i] = -1;
    }

    // The query norm does not depend on the base vector: compute it once.
    float sumv2 = 0.0f;
    for (int dim = 0; dim < d; ++dim) {
        sumv2 += q[dim] * q[dim];
    }
    const float queryNorm = sqrtf(sumv2);

    for (int baseIdx = 0; baseIdx < n; ++baseIdx) {
        const float *b = base + (size_t)baseIdx * d;

        float ip = 0.0f;
        float sumu2 = 0.0f;
        for (int dim = 0; dim < d; ++dim) {
            const float u = b[dim];
            ip += u * q[dim];
            sumu2 += u * u;
        }

        // A zero-length vector has no direction; score it 0 rather than
        // producing NaN from 0/0 (same convention as findNearestNeighborCosine).
        const float baseNorm = sqrtf(sumu2);
        float sim = 0.0f;
        if (baseNorm > 0.0f && queryNorm > 0.0f) {
            sim = ip / (baseNorm * queryNorm);
        }

        // Insertion into the descending top-k list: find the first slot this
        // similarity beats, shift the tail down by one, and drop the last entry.
        for (int i = 0; i < k; ++i) {
            if (sim > topKSimilarities[i]) {
                for (int j = k - 1; j > i; --j) {
                    topKSimilarities[j] = topKSimilarities[j - 1];
                    topKIndices[j] = topKIndices[j - 1];
                }
                topKSimilarities[i] = sim;
                topKIndices[i] = baseIdx;
                break;
            }
        }
    }

    for (int i = 0; i < k; ++i) {
        result[i] = topKIndices[i];
    }

    // In-kernel allocations persist across launches unless freed explicitly.
    free(topKSimilarities);
    free(topKIndices);
}
61+
62+
963
__global__ void findNearestNeighborCosine(float *points, float *queries, float *max_cosine, int *max_index, int n, int num_queries, int dimensions, float target_similarity) {
1064
extern __shared__ char shared[];
1165
float *s_cosine = (float*)shared;
@@ -93,14 +147,17 @@ int main(int argc, char* argv[]) {
93147
dim3 blocksPerGrid((n + threadsPerBlock.x - 1) / threadsPerBlock.x, m);
94148

95149
int sharedMemSize = threadsPerBlock.x * (sizeof(float) + sizeof(int));
96-
findNearestNeighborCosine<<<blocksPerGrid, threadsPerBlock, sharedMemSize>>>(d_base, d_query, d_max_cosine, d_max_index, n, m, d, target_similarity);
150+
int k = 5;
151+
int output[10];
152+
// findNearestNeighborCosine<<<blocksPerGrid, threadsPerBlock, sharedMemSize>>>(d_base, d_query, d_max_cosine, d_max_index, n, m, d, target_similarity);
153+
cosine_similarity_kernel<<<blocksPerGrid, threadsPerBlock, sharedMemSize>>>(d_base, d_query, output, n, m, d, target_similarity, k);
97154

98155
int *max_index_host = new int[m];
99156
cudaMemcpy(max_cosine_host, d_max_cosine, m * sizeof(float), cudaMemcpyDeviceToHost);
100-
cudaMemcpy(max_index_host, d_max_index, m * sizeof(int), cudaMemcpyDeviceToHost);
157+
cudaMemcpy(max_index_host, output, m * sizeof(int), cudaMemcpyDeviceToHost);
101158

102159
for (int i = 0; i < m; ++i) {
103-
fprintf(fout, "%d\n", max_index_host[i]);
160+
printf("%d\n", max_index_host[i]);
104161
}
105162

106163
cudaFree(d_base);

GPU/nn.cu

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
#include <cstring>
2+
#include <iostream>
3+
#include <memory>
4+
#include <string>
5+
#include <vector>
6+
#include <cmath>
7+
#include <cuda_runtime.h>
8+
9+
/**
 * Threshold search on cosine similarity: each thread compares one base point
 * with one query and, if their cosine similarity is strictly greater than
 * target_similarity, stores the point's index in max_index[query].
 *
 * Launch layout: x covers the points (blockIdx.x * blockDim.x + threadIdx.x is
 * the point index), blockIdx.y selects the query. The kernel does not use
 * shared memory; a dynamic shared-memory size passed at launch is ignored.
 *
 * @param points       n * dimensions floats; point i starts at points[i * dimensions]
 * @param queries      num_queries * dimensions floats, same layout
 * @param max_cosine   not read or written by this kernel; kept so the
 *                     signature stays unchanged for existing callers
 * @param max_index    one int per query. Written only when some point exceeds
 *                     the threshold, so the caller must pre-initialise it to
 *                     recognise "no match". When several points qualify they
 *                     all store without synchronisation, so which index
 *                     remains is unspecified.
 * @param n            number of points
 * @param num_queries  number of queries
 * @param dimensions   dimensionality of every vector
 * @param target_similarity  similarity threshold (exclusive)
 */
__global__ void findNearestNeighborCosine(float *points, float *queries, float *max_cosine, int *max_index, int n, int num_queries, int dimensions, float target_similarity) {
    (void)max_cosine;

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    const int qid = blockIdx.y;

    // Early exit for tail threads. There is deliberately no __syncthreads() in
    // this kernel: a barrier behind this bounds check is not reached by every
    // thread of the block, and no data is exchanged through shared memory.
    if (tid >= n || qid >= num_queries) {
        return;
    }

    // size_t arithmetic keeps row offsets from overflowing int on large inputs.
    const float *point = points + (size_t)tid * dimensions;
    const float *query = queries + (size_t)qid * dimensions;

    float dot_product = 0.0f, query_magnitude = 0.0f, point_magnitude = 0.0f;
    for (int d = 0; d < dimensions; ++d) {
        const float qv = query[d];
        const float pv = point[d];
        dot_product += qv * pv;
        query_magnitude += qv * qv;
        point_magnitude += pv * pv;
    }
    query_magnitude = sqrtf(query_magnitude);
    point_magnitude = sqrtf(point_magnitude);

    // A zero-length vector has no direction: treat its similarity as 0 instead
    // of dividing by zero.
    float cosine_similarity = 0.0f;
    if (query_magnitude > 0.0f && point_magnitude > 0.0f) {
        cosine_similarity = dot_product / (query_magnitude * point_magnitude);
    }

    if (cosine_similarity > target_similarity) {
        max_index[qid] = tid;
    }
}
41+
42+
/**
 * Reads a row x col matrix of whitespace-separated floats from a text stream.
 *
 * @param fin  stream to parse with fscanf("%f"); may be null
 * @param row  number of rows to read (values <= 0 yield an empty matrix)
 * @param col  number of values per row (values <= 0 yield empty rows)
 * @return     exactly `row` vectors of `col` floats each. If the stream is
 *             null, ends early, or contains a token that is not a number,
 *             a warning is printed to stderr and every value from that
 *             position onward is 0.0f.
 */
std::vector<std::vector<float>> read_matrix(FILE* fin, int row, int col) {
    const size_t rows = row > 0 ? static_cast<size_t>(row) : 0;
    const size_t cols = col > 0 ? static_cast<size_t>(col) : 0;

    // Allocate the final shape up front, zero-filled, so an early return
    // still hands back a well-formed matrix.
    std::vector<std::vector<float>> ret(rows, std::vector<float>(cols, 0.0f));

    if (fin == nullptr) {
        fprintf(stderr, "read_matrix: no input stream; returning zeros\n");
        return ret;
    }

    for (size_t i = 0; i < rows; ++i) {
        for (size_t j = 0; j < cols; ++j) {
            float value = 0.0f;
            // fscanf returns the number of converted items; anything but 1
            // means EOF or a non-numeric token, after which no further read
            // can succeed.
            if (fscanf(fin, "%f", &value) != 1) {
                fprintf(stderr,
                        "read_matrix: input ended or was malformed at row %zu, column %zu; "
                        "remaining values set to 0\n", i, j);
                return ret;
            }
            ret[i][j] = value;
        }
    }
    return ret;
}
55+
56+
/**
 * Command-line driver: nn <input-file> <output-file>.
 *
 * The input file starts with "d n m target_similarity" (dimension, number of
 * base vectors, number of queries, similarity threshold), followed by the
 * n x d base matrix and the m x d query matrix. For every query one line is
 * written to the output file: the index stored by findNearestNeighborCosine
 * for a base vector whose cosine similarity exceeds the threshold, or -1 when
 * none does.
 *
 * Returns 0 on success and 1 on bad arguments, unreadable/unwritable files, a
 * malformed header, or any CUDA error (each reported on stderr).
 */
int main(int argc, char* argv[]) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s <input-file> <output-file>\n", argc > 0 ? argv[0] : "nn");
        return 1;
    }

    FILE* fin = fopen(argv[1], "r");
    if (fin == nullptr) {
        fprintf(stderr, "cannot open input file '%s'\n", argv[1]);
        return 1;
    }
    FILE* fout = fopen(argv[2], "w");
    if (fout == nullptr) {
        fprintf(stderr, "cannot open output file '%s'\n", argv[2]);
        fclose(fin);
        return 1;
    }

    int n = 0, d = 0, m = 0;
    float target_similarity = 0;
    if (fscanf(fin, "%d%d%d%f", &d, &n, &m, &target_similarity) != 4 || n < 0 || d < 0 || m < 0) {
        fprintf(stderr, "malformed header in '%s' (expected: d n m target_similarity)\n", argv[1]);
        fclose(fin);
        fclose(fout);
        return 1;
    }

    std::vector<std::vector<float>> base = read_matrix(fin, n, d);
    std::vector<std::vector<float>> query = read_matrix(fin, m, d);
    fclose(fin);

    // Flatten to contiguous row-major buffers for the device copies.
    const size_t rowBytes = static_cast<size_t>(d) * sizeof(float);
    std::vector<float> flat_base(static_cast<size_t>(n) * d);
    std::vector<float> flat_query(static_cast<size_t>(m) * d);
    if (d > 0) {
        for (int i = 0; i < n; ++i)
            memcpy(flat_base.data() + static_cast<size_t>(i) * d, base[i].data(), rowBytes);
        for (int i = 0; i < m; ++i)
            memcpy(flat_query.data() + static_cast<size_t>(i) * d, query[i].data(), rowBytes);
    }

    std::vector<float> max_cosine_host(m, -1.0f);
    // -1 marks "no base vector exceeded the threshold".
    std::vector<int> max_index_host(m, -1);

    bool good = true;

    // A grid with a zero dimension is an invalid launch, so skip the GPU
    // entirely when there is nothing to compare; every query then reports -1.
    if (n > 0 && m > 0) {
        float *d_base = nullptr, *d_query = nullptr, *d_max_cosine = nullptr;
        int *d_max_index = nullptr;

        // Reports the first failing CUDA call and short-circuits the rest.
        auto step = [](cudaError_t err, const char* what) {
            if (err != cudaSuccess) {
                fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
                return false;
            }
            return true;
        };

        const size_t baseBytes = flat_base.size() * sizeof(float);
        const size_t queryBytes = flat_query.size() * sizeof(float);
        const size_t cosineBytes = static_cast<size_t>(m) * sizeof(float);
        const size_t indexBytes = static_cast<size_t>(m) * sizeof(int);

        good = step(cudaMalloc(&d_base, baseBytes), "allocation of base vectors")
            && step(cudaMalloc(&d_query, queryBytes), "allocation of query vectors")
            && step(cudaMalloc(&d_max_cosine, cosineBytes), "allocation of similarity buffer")
            && step(cudaMalloc(&d_max_index, indexBytes), "allocation of index buffer")
            && step(cudaMemcpy(d_base, flat_base.data(), baseBytes, cudaMemcpyHostToDevice), "upload of base vectors")
            && step(cudaMemcpy(d_query, flat_query.data(), queryBytes, cudaMemcpyHostToDevice), "upload of query vectors")
            && step(cudaMemcpy(d_max_cosine, max_cosine_host.data(), cosineBytes, cudaMemcpyHostToDevice), "upload of similarity buffer")
            // The kernel stores an index only on a match, so pre-fill with -1
            // (all bytes 0xFF) to avoid reading uninitialised device memory.
            && step(cudaMemset(d_max_index, 0xFF, indexBytes), "initialisation of index buffer");

        if (good) {
            dim3 threadsPerBlock(256);
            dim3 blocksPerGrid((n + threadsPerBlock.x - 1) / threadsPerBlock.x, m);
            int sharedMemSize = threadsPerBlock.x * (sizeof(float) + sizeof(int));
            findNearestNeighborCosine<<<blocksPerGrid, threadsPerBlock, sharedMemSize>>>(d_base, d_query, d_max_cosine, d_max_index, n, m, d, target_similarity);

            // Launch-configuration errors surface via cudaGetLastError();
            // faults inside the kernel surface at the blocking copy below.
            good = step(cudaGetLastError(), "kernel launch")
                && step(cudaMemcpy(max_index_host.data(), d_max_index, indexBytes, cudaMemcpyDeviceToHost), "download of result indices");
        }

        // cudaFree accepts null pointers, so this is safe on every path.
        cudaFree(d_base);
        cudaFree(d_query);
        cudaFree(d_max_cosine);
        cudaFree(d_max_index);
    }

    if (good) {
        for (int i = 0; i < m; ++i) {
            fprintf(fout, "%d\n", max_index_host[i]);
        }
    }
    fclose(fout);

    return good ? 0 : 1;
}

0 commit comments

Comments
 (0)