Skip to content
This repository was archived by the owner on Nov 21, 2024. It is now read-only.

Product Quantizer implementation #37

Open
wants to merge 38 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
bbd4fff
Implement product quantizer
Aug 10, 2023
91726b3
Update pq test to show index size
Aug 10, 2023
560a60b
Fix GCC build
Aug 11, 2023
5672b36
Fix GCC build
Aug 11, 2023
78a4267
Fix GCC build
Aug 11, 2023
d152c55
Merge with main
Aug 11, 2023
b9da52f
Use simple hash
Aug 11, 2023
efc6897
Update expected file
Aug 12, 2023
d2bb2da
Use OpenMP
Aug 12, 2023
af5eac7
Use OpenMP
Aug 12, 2023
3dbb157
Add alternative version of expected output of pq test for PG14
Aug 12, 2023
f7eba62
Support OpenMP at OS/X
Aug 12, 2023
ac2c19e
Yet another version of pq.sql test output
Aug 12, 2023
4798183
Do not output distance in pq.sql test because it is slightly differe…
Aug 13, 2023
5d6f06f
Do not build with OpenMp at OS/X
Aug 13, 2023
fcc3e6d
Early stop of training
Aug 16, 2023
8bff9c0
Fix problem with OpenMP in compute_centroids
Aug 17, 2023
3090e93
Fix bug in pq_create_centroids
Aug 17, 2023
3c90331
Fix bug in initialization of PQ pages
Aug 17, 2023
b1cd4f6
Stop PQ iterations if improvement is smaller than threshold
Aug 18, 2023
54ba50b
Restore redo loop for PQ training
Aug 19, 2023
4c81918
Minor refactoring
Aug 19, 2023
e11e2e3
Minor refactoring
Aug 19, 2023
27ba8bd
Increase max_points_per_centroid
Aug 19, 2023
2703a06
Simplify code
Aug 20, 2023
139c4fd
Use hypercube for centroids initialization
Aug 22, 2023
a4ecdcb
Implement principal component analysis for PQ training
Aug 22, 2023
cd38761
Remove accidentally added object file
Aug 22, 2023
e89ff87
Add missed assert.h include
Aug 22, 2023
a311aec
Fix GCC warnings
Aug 22, 2023
c0fdd3c
Fix GCC warnings
Aug 22, 2023
abcf100
Perform subsampling after hypercube initialization
Aug 22, 2023
f9539e2
Define bigidx_t as replacement of FAISS idx_t
Aug 23, 2023
1664ae8
Define bigidx_t as replacement of FAISS idx_t
Aug 23, 2023
13b67e9
One more refactoring of vector transform code
Aug 24, 2023
79f9090
One more refactoring of vector transform code
Aug 24, 2023
190adad
Add overflow checks
Aug 24, 2023
4a86c88
Use kmeans as default training algorithm
Aug 25, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Use OpenMP
  • Loading branch information
Konstantin Knizhnik committed Aug 12, 2023
commit d2bb2da87e4d4a46f84019ccc2ec4b8191182c82
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ REGRESS_OPTS = --inputdir=test --load-extension=embedding
# For auto-vectorization:
# - GCC&clang needs -Ofast or -O3: https://gcc.gnu.org/projects/tree-ssa/vectorization.html
PG_CFLAGS += -Ofast
PG_CXXFLAGS += -std=c++11
PG_CXXFLAGS += -DUSE_OMP -fopenmp -std=c++11
PG_LDFLAGS += -lstdc++

all: $(EXTENSION)--$(EXTVERSION).sql
Expand Down
63 changes: 45 additions & 18 deletions clustering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
#include <random>
#include <assert.h>

#ifdef USE_OMP
#include <omp.h>
#endif

extern "C" {
#include "embedding.h"
}
Expand Down Expand Up @@ -102,27 +106,46 @@ void compute_centroids(

memset(centroids, 0, sizeof(coord_t) * d * k);

for (size_t i = 0; i < n; i++) {
idx_t ci = assign[i];
assert(ci < k + k_frozen);
ci -= k_frozen;
coord_t* c = centroids + ci * d;
const coord_t* xi = &x[i * d];
if (weights) {
dist_t w = weights[i];
hassign[ci] += w;
for (size_t j = 0; j < d; j++) {
c[j] += xi[j] * w;
}
} else {
hassign[ci] += 1.0;
for (size_t j = 0; j < d; j++) {
c[j] += xi[j];
#ifdef USE_OMP
#pragma omp parallel
{
int nt = omp_get_num_threads();
int rank = omp_get_thread_num();

// this thread is taking care of centroids c0:c1
size_t c0 = (k * rank) / nt;
size_t c1 = (k * (rank + 1)) / nt;
#else
{
#endif
for (size_t i = 0; i < n; i++) {
idx_t ci = assign[i];
assert(ci < k + k_frozen);
ci -= k_frozen;
#ifdef USE_OMP
if (ci > c0 || ci >= c1)
continue;
#endif
coord_t* c = centroids + ci * d;
const coord_t* xi = &x[i * d];
if (weights) {
dist_t w = weights[i];
hassign[ci] += w;
for (size_t j = 0; j < d; j++) {
c[j] += xi[j] * w;
}
} else {
hassign[ci] += 1.0;
for (size_t j = 0; j < d; j++) {
c[j] += xi[j];
}
}
}
}

for (size_t ci = 0; ci < k; ci++) {
#ifdef USE_OMP
#pragma omp parallel for
#endif
for (size_t ci = 0; ci < k; ci++) {
if (hassign[ci] == 0) {
continue;
}
Expand Down Expand Up @@ -203,6 +226,10 @@ calculate_distances(HnswMetadata* meta, coord_t const* centroids, size_t nx, coo
{
size_t d = meta->pqSubdim; ///< dimension of the vectors
idx_t k = 1 << meta->pqBits; ///< nb of centroids

#ifdef USE_OMP
#pragma omp parallel for
#endif
for (size_t i = 0; i < nx; i++) {
const coord_t* x_i = x + i * d;
const coord_t* y_j = centroids;
Expand Down