Skip to content

Commit f8768c4

Browse files
committed
HNSW basic neighbor selection
1 parent 928f251 commit f8768c4

File tree

13 files changed

+283
-0
lines changed

13 files changed

+283
-0
lines changed

.vscode/c_cpp_properties.json

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
{
2+
"configurations": [
3+
{
4+
"name": "Win32",
5+
"includePath": [
6+
"${workspaceFolder}/**",
7+
"C:\\users\\adish\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.9_qbz5n2kfra8p0\\localcache\\local-packages\\python39\\site-packages\\pybind11\\include",
8+
"C:\\Users\\Adish\\AppData\\Local\\Programs\\Python\\Python311\\include"
9+
],
10+
"defines": [
11+
"_DEBUG",
12+
"UNICODE",
13+
"_UNICODE"
14+
],
15+
"windowsSdkVersion": "10.0.18362.0",
16+
"compilerPath": "cl.exe",
17+
"cStandard": "c17",
18+
"cppStandard": "c++17",
19+
"intelliSenseMode": "windows-msvc-x64"
20+
}
21+
],
22+
"version": 4
23+
}

.vscode/settings.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"files.associations": {
3+
"vector": "cpp",
4+
"iostream": "cpp"
5+
}
6+
}

compile.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Build my_program from main.cpp and hnsw.cpp.
# The steps are chained with && so a failed compile stops the build instead of
# linking whatever stale object files happen to be lying around.
g++ -c main.cpp -o main.o &&
g++ -c hnsw.cpp -o hnsw.o &&
g++ main.o hnsw.o -o my_program

hnsw.cpp

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#include "hnsw.h"
2+
3+
#include <algorithm>
4+
#include <iostream>
5+
#include <queue>
6+
#include <random>
7+
#include <set>
8+
#include <unordered_set>
9+
#include <vector>
10+
using namespace std;
11+
12+
vector<int> HNSWGraph::searchLayer(Item& q, int ep, int ef, int lc) {
13+
set<pair<double, int>> candidates;
14+
set<pair<double, int>> nearestNeighbors;
15+
unordered_set<int> isVisited;
16+
17+
double td = q.dist(items[ep]);
18+
candidates.insert(make_pair(td, ep));
19+
nearestNeighbors.insert(make_pair(td, ep));
20+
isVisited.insert(ep);
21+
while (!candidates.empty()) {
22+
auto ci = candidates.begin(); candidates.erase(candidates.begin());
23+
int nid = ci->second;
24+
auto fi = nearestNeighbors.end(); fi--;
25+
if (ci->first > fi->first) break;
26+
for (int ed: layerEdgeLists[lc][nid]) {
27+
if (isVisited.find(ed) != isVisited.end()) continue;
28+
fi = nearestNeighbors.end(); fi--;
29+
isVisited.insert(ed);
30+
td = q.dist(items[ed]);
31+
if ((td < fi->first) || nearestNeighbors.size() < ef) {
32+
candidates.insert(make_pair(td, ed));
33+
nearestNeighbors.insert(make_pair(td, ed));
34+
if (nearestNeighbors.size() > ef) nearestNeighbors.erase(fi);
35+
}
36+
}
37+
}
38+
vector<int> results;
39+
for(auto &p: nearestNeighbors) results.push_back(p.second);
40+
return results;
41+
}
42+
43+
vector<int> HNSWGraph::KNNSearch(Item& q, int K) {
44+
int maxLyer = layerEdgeLists.size() - 1;
45+
int ep = enterNode;
46+
for (int l = maxLyer; l >= 1; l--) ep = searchLayer(q, ep, 1, l)[0];
47+
return searchLayer(q, ep, K, 0);
48+
}
49+
50+
// Connects nodes `st` and `ed` in both directions on layer `lc`.
// A request to connect a node to itself is ignored.
void HNSWGraph::addEdge(int st, int ed, int lc) {
    if (st != ed) {
        auto& adjacency = layerEdgeLists[lc];
        adjacency[st].push_back(ed);
        adjacency[ed].push_back(st);
    }
}
55+
56+
// Adds a copy of `q` to the index and wires it into the layered graph.
//
// Steps:
//   1. Store the item; its index is its insertion order.
//   2. Sample the item's top layer `l`: starting at 0, keep going up while
//      l < ml and a uniform draw from [0, 1) is >= 1.0 / ml. New empty layers
//      are appended as needed.
//   3. The very first item only becomes the entry node.
//   4. Otherwise descend from the previous top layer to l + 1 with a result
//      bound of 1 to find an entry point, then on every layer from
//      min(l, previous top) down to 0 connect the item to up to M of the
//      closest nodes found with bound efConstruction.
//   5. Any neighbour whose edge list grew past the layer's cap keeps only its
//      closest edges. That trim is one-sided: the dropped node keeps its own
//      edge back.
//   6. If l equals the (possibly new) top layer, the item becomes the entry
//      node.
void HNSWGraph::Insert(Item& q) {
    const int nid = static_cast<int>(items.size());
    itemNum++;
    items.push_back(q);
    // Work on the stored copy from here on: if `q` referred to an element of
    // `items`, the push_back above may have reallocated and left `q` dangling.
    // `items` does not grow again in this function, so this reference is stable.
    Item& item = items.back();

    // sample layer
    const int maxLyer = static_cast<int>(layerEdgeLists.size()) - 1;
    int l = 0;
    std::uniform_real_distribution<double> distribution(0.0, 1.0);
    while (l < ml && (1.0 / ml <= distribution(generator))) {
        l++;
        if (static_cast<int>(layerEdgeLists.size()) <= l) {
            layerEdgeLists.push_back(std::unordered_map<int, std::vector<int>>());
        }
    }

    if (nid == 0) {
        enterNode = nid;
        return;
    }

    // search up layer entrance
    int ep = enterNode;
    for (int i = maxLyer; i > l; i--) ep = searchLayer(item, ep, 1, i)[0];

    for (int i = std::min(l, maxLyer); i >= 0; i--) {
        // The cap depends on the layer being wired (`i`), not on the sampled
        // top level (`l`): MMax0 is the layer-0 cap, MMax applies above it.
        const int MM = (i == 0) ? MMax0 : MMax;

        const std::vector<int> neighbors = searchLayer(item, ep, efConstruction, i);
        // Clamp at 0 so a non-positive M cannot form an invalid iterator range.
        const int keep = std::max(0, std::min(static_cast<int>(neighbors.size()), M));
        const std::vector<int> selectedNeighbors(neighbors.begin(), neighbors.begin() + keep);

        for (int n : selectedNeighbors) addEdge(n, nid, i);

        for (int n : selectedNeighbors) {
            std::vector<int>& edges = layerEdgeLists[i][n];
            // Same unsigned comparison as before: a negative cap never trims.
            if (edges.size() <= static_cast<std::vector<int>::size_type>(MM)) continue;

            // Keep only the MM edges closest to n.
            std::vector<std::pair<double, int>> distPairs;
            distPairs.reserve(edges.size());
            for (int nn : edges) distPairs.emplace_back(items[n].dist(items[nn]), nn);
            std::sort(distPairs.begin(), distPairs.end());

            edges.clear();
            const int kept = std::min(static_cast<int>(distPairs.size()), MM);
            for (int d = 0; d < kept; d++) edges.push_back(distPairs[d].second);
        }

        // Continue the descent from the closest node found on this layer.
        if (!selectedNeighbors.empty()) ep = selectedNeighbors[0];
    }

    if (l == static_cast<int>(layerEdgeLists.size()) - 1) enterNode = nid;
}

hnsw.h

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#ifndef HNSW_H
2+
#define HNSW_H
3+
4+
#include <random>
5+
#include <vector>
6+
#include <unordered_map>
7+
#include <iostream>
8+
using namespace std;
9+
10+
// A point stored in the index: a dense vector of coordinates.
struct Item {
    // Takes the coordinate vector by value and stores a copy of it.
    Item(std::vector<double> _values) : values(_values) {}

    std::vector<double> values;

    // Squared Euclidean distance to `other` (sum of squared coordinate
    // differences; no square root is taken).
    //
    // Only the coordinates both items have are compared, so items of
    // different dimension never cause an out-of-range read.
    double dist(const Item& other) const {
        typedef std::vector<double>::size_type Index;
        const Index shared =
            values.size() < other.values.size() ? values.size() : other.values.size();

        double result = 0.0;
        for (Index i = 0; i < shared; ++i) {
            result += (values[i] - other.values[i]) * (values[i] - other.values[i]);
        }
        return result;
    }
};
21+
22+
struct HNSWGraph {
23+
HNSWGraph(int _M, int _MMax, int _MMax0, int _efConstruction, int _ml):M(_M),MMax(_MMax),MMax0(_MMax0),efConstruction(_efConstruction),ml(_ml){
24+
layerEdgeLists.push_back(unordered_map<int, vector<int>>());
25+
}
26+
int M;
27+
int MMax;
28+
int MMax0;
29+
int efConstruction;
30+
int ml;
31+
int itemNum;
32+
vector<Item> items;
33+
vector<unordered_map<int, vector<int>>> layerEdgeLists;
34+
int enterNode;
35+
36+
default_random_engine generator;
37+
void addEdge(int st, int ed, int lc);
38+
vector<int> searchLayer(Item& q, int ep, int ef, int lc);
39+
void Insert(Item& q);
40+
vector<int> KNNSearch(Item& q, int K);
41+
42+
void printGraph() {
43+
for (int l = 0; l < layerEdgeLists.size(); l++) {
44+
cout << "Layer:" << l << endl;
45+
for (auto it = layerEdgeLists[l].begin(); it != layerEdgeLists[l].end(); ++it) {
46+
cout << it->first << ":";
47+
for (auto ed: it->second) cout << ed << " ";
48+
cout << endl;
49+
}
50+
}
51+
}
52+
};
53+
54+
#endif

hnsw.o

662 KB
Binary file not shown.

hnsw_py.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#include <pybind11/stl.h>
2+
#include <pybind11/pybind11.h>
3+
#include "hnsw.h"
4+
5+
namespace py = pybind11;
6+
7+
// Python module `pyhnsw`: exposes Item and HNSWGraph.
PYBIND11_MODULE(pyhnsw, m) {
    // Item is constructible from a sequence of floats.
    py::class_<Item> itemClass(m, "Item");
    itemClass.def(py::init<std::vector<double>>());

    // HNSWGraph(M, MMax, MMax0, efConstruction, ml) plus its member functions,
    // exported under their C++ names.
    py::class_<HNSWGraph> graphClass(m, "HNSWGraph");
    graphClass.def(py::init<int, int, int, int, int>());
    graphClass.def("addEdge", &HNSWGraph::addEdge);
    graphClass.def("searchLayer", &HNSWGraph::searchLayer);
    graphClass.def("Insert", &HNSWGraph::Insert);
    graphClass.def("KNNSearch", &HNSWGraph::KNNSearch);
    graphClass.def("printGraph", &HNSWGraph::printGraph);
}

main.cpp

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#include "hnsw.h"
2+
3+
#include <algorithm>
4+
#include <ctime>
5+
#include <iostream>
6+
#include <random>
7+
#include <vector>
8+
using namespace std;
9+
10+
void randomTest(int numItems, int dim, int numQueries, int K) {
11+
std::default_random_engine generator;
12+
std::uniform_real_distribution<double> distribution(0.0, 1.0);
13+
std::vector<Item> randomItems;
14+
randomItems.reserve(numItems);
15+
16+
for (int i = 0; i < numItems; i++) {
17+
std::vector<double> temp(dim);
18+
for (int d = 0; d < dim; d++) {
19+
temp[d] = distribution(generator);
20+
}
21+
randomItems.emplace_back(temp);
22+
}
23+
24+
std::shuffle(randomItems.begin(), randomItems.end(), generator);
25+
HNSWGraph myHNSWGraph(10, 30, 30, 10, 2);
26+
for (int i = 0; i < numItems; i++) {
27+
if (i % 10000 == 0) cout << i << endl;
28+
myHNSWGraph.Insert(randomItems[i]);
29+
}
30+
31+
double total_brute_force_time = 0.0;
32+
double total_hnsw_time = 0.0;
33+
34+
cout << "START QUERY" << endl;
35+
int numHits = 0;
36+
for (int i = 0; i < numQueries; i++) {
37+
vector<double> temp(dim);
38+
for (int d = 0; d < dim; d++) {
39+
temp[d] = distribution(generator);
40+
}
41+
Item query(temp);
42+
43+
// Brute force
44+
clock_t begin_time = clock();
45+
vector<pair<double, int>> distPairs;
46+
for (int j = 0; j < numItems; j++) {
47+
if (j == i) continue;
48+
distPairs.emplace_back(query.dist(randomItems[j]), j);
49+
}
50+
sort(distPairs.begin(), distPairs.end());
51+
total_brute_force_time += double( clock () - begin_time ) / CLOCKS_PER_SEC;
52+
53+
begin_time = clock();
54+
vector<int> knns = myHNSWGraph.KNNSearch(query, K);
55+
// cout << "Printing vectors";
56+
// std::cout << "Contents of knns vector:" << std::endl;
57+
// for (size_t i = 0; i < knns.size(); ++i) {
58+
// std::cout << "knns[" << i << "] = " << knns[i] << std::endl;
59+
// }
60+
// cout << "\nPrinted Vectors";
61+
total_hnsw_time += double( clock () - begin_time ) / CLOCKS_PER_SEC;
62+
63+
if (knns[0] == distPairs[0].second) numHits++;
64+
}
65+
cout << numHits << " " << total_brute_force_time / numQueries << " " << total_hnsw_time / numQueries << endl;
66+
}
67+
68+
// Runs one benchmark: 10000 four-dimensional points, 100 queries, K = 5.
int main() {
    const int numItems = 10000;
    const int dim = 4;
    const int numQueries = 100;
    const int K = 5;

    randomTest(numItems, dim, numQueries, K);
    return 0;
}

main.o

405 KB
Binary file not shown.

my_program.exe

481 KB
Binary file not shown.

0 commit comments

Comments
 (0)