Add ANN search support for vec0 virtual table (#273)

Add approximate nearest neighbor infrastructure to vec0: shared distance
dispatch (vec0_distance_full), flat index type with parser, NEON-optimized
cosine/Hamming for float32/int8, amalgamation script, and benchmark suite
(benchmarks-ann/) with ground-truth generation and profiling tools. Remove
unused vec_npy_each/vec_static_blobs code and fix a missing stdint.h include.
This commit is contained in:
Alex Garcia 2026-03-31 01:03:32 -07:00 committed by GitHub
parent e9f598abfa
commit 0de765f457
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
27 changed files with 2177 additions and 2116 deletions

View file

@ -500,6 +500,83 @@ void test_vec0_parse_vector_column() {
assert(rc == SQLITE_ERROR);
}
// indexed by flat()
{
const char *input = "emb float[768] indexed by flat()";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.index_type == VEC0_INDEX_TYPE_FLAT);
assert(col.dimensions == 768);
sqlite3_free(col.name);
}
// indexed by flat() with distance_metric
{
const char *input = "emb float[768] distance_metric=cosine indexed by flat()";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.index_type == VEC0_INDEX_TYPE_FLAT);
assert(col.distance_metric == VEC0_DISTANCE_METRIC_COSINE);
sqlite3_free(col.name);
}
// indexed by flat() on int8
{
const char *input = "emb int8[256] indexed by flat()";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.index_type == VEC0_INDEX_TYPE_FLAT);
assert(col.element_type == SQLITE_VEC_ELEMENT_TYPE_INT8);
sqlite3_free(col.name);
}
// indexed by flat() on bit
{
const char *input = "emb bit[64] indexed by flat()";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.index_type == VEC0_INDEX_TYPE_FLAT);
assert(col.element_type == SQLITE_VEC_ELEMENT_TYPE_BIT);
sqlite3_free(col.name);
}
// default index_type is FLAT
{
const char *input = "emb float[768]";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.index_type == VEC0_INDEX_TYPE_FLAT);
sqlite3_free(col.name);
}
// Error: indexed by (missing type name)
{
const char *input = "emb float[768] indexed by";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_ERROR);
}
// Error: indexed by unknown()
{
const char *input = "emb float[768] indexed by unknown()";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_ERROR);
}
// Error: indexed by flat (missing parens)
{
const char *input = "emb float[768] indexed by flat";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_ERROR);
}
// Error: indexed flat() (missing "by")
{
const char *input = "emb float[768] indexed flat()";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_ERROR);
}
printf(" All vec0_parse_vector_column tests passed.\n");
}
@ -656,6 +733,30 @@ void test_distance_hamming() {
assert(d == 16.0f);
}
// Large vector (256 bits = 32 bytes) — exercises NEON path on ARM
{
unsigned char a[32];
unsigned char b[32];
memset(a, 0xFF, 32);
memset(b, 0x00, 32);
d = _test_distance_hamming(a, b, 256);
assert(d == 256.0f);
}
// Large vector (1024 bits = 128 bytes) — exercises 64-byte NEON loop
{
unsigned char a[128];
unsigned char b[128];
memset(a, 0x00, 128);
memset(b, 0x00, 128);
// a gets 0xFF in every other byte (64 of 128); b stays all 0x00 -> 64 bytes * 8 bits = 512 differing bits
for (int i = 0; i < 128; i += 2) {
a[i] = 0xFF;
}
d = _test_distance_hamming(a, b, 1024);
assert(d == 512.0f);
}
printf(" All distance_hamming tests passed.\n");
}