mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 00:36:56 +02:00
Add rescore index for ANN queries
Add rescore index type: stores full-precision float vectors in a rowid-keyed shadow table, quantizes to int8 for fast initial scan, then rescores top candidates with original vectors. Includes config parser, shadow table management, insert/delete support, KNN integration, compile flag (SQLITE_VEC_ENABLE_RESCORE), fuzz targets, and tests.
This commit is contained in:
parent
bf2455f2ba
commit
ba0db0b6d6
19 changed files with 3378 additions and 8 deletions
178
tests/fuzz/rescore-knn-deep.c
Normal file
178
tests/fuzz/rescore-knn-deep.c
Normal file
|
|
@ -0,0 +1,178 @@
|
|||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "sqlite-vec.h"
|
||||
#include "sqlite3.h"
|
||||
#include <assert.h>
|
||||
|
||||
/**
|
||||
* Fuzz target: deep exercise of rescore KNN with fuzz-controlled query vectors
|
||||
* and both quantizer types (bit + int8), multiple distance metrics.
|
||||
*
|
||||
* The existing rescore-operations.c only tests bit quantizer with a fixed
|
||||
* query vector. This target:
|
||||
* - Tests both bit and int8 quantizers
|
||||
* - Uses fuzz-controlled query vectors (hits NaN/Inf/denormal paths)
|
||||
* - Tests all distance metrics with int8 (L2, cosine, L1)
|
||||
* - Exercises large LIMIT values (oversample multiplication)
|
||||
* - Tests KNN with rowid IN constraints
|
||||
* - Exercises the insert->query->update->query->delete->query cycle
|
||||
*/
|
||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
||||
if (size < 20) return 0;
|
||||
|
||||
int rc;
|
||||
sqlite3 *db;
|
||||
|
||||
rc = sqlite3_open(":memory:", &db);
|
||||
assert(rc == SQLITE_OK);
|
||||
rc = sqlite3_vec_init(db, NULL, NULL);
|
||||
assert(rc == SQLITE_OK);
|
||||
|
||||
/* Use first 4 bytes for configuration */
|
||||
uint8_t config = data[0];
|
||||
uint8_t num_inserts = (data[1] % 20) + 3; /* 3..22 inserts */
|
||||
uint8_t limit_val = (data[2] % 50) + 1; /* 1..50 for LIMIT */
|
||||
uint8_t metric_choice = data[3] % 3;
|
||||
data += 4;
|
||||
size -= 4;
|
||||
|
||||
int use_int8 = config & 1;
|
||||
const char *metric_str;
|
||||
switch (metric_choice) {
|
||||
case 0: metric_str = ""; break; /* default L2 */
|
||||
case 1: metric_str = " distance_metric=cosine"; break;
|
||||
case 2: metric_str = " distance_metric=l1"; break;
|
||||
default: metric_str = ""; break;
|
||||
}
|
||||
|
||||
/* Build CREATE TABLE statement */
|
||||
char create_sql[256];
|
||||
if (use_int8) {
|
||||
snprintf(create_sql, sizeof(create_sql),
|
||||
"CREATE VIRTUAL TABLE v USING vec0("
|
||||
"emb float[16] indexed by rescore(quantizer=int8)%s)", metric_str);
|
||||
} else {
|
||||
/* bit quantizer ignores distance_metric for the coarse pass (always hamming),
|
||||
but the float rescore phase uses the specified metric */
|
||||
snprintf(create_sql, sizeof(create_sql),
|
||||
"CREATE VIRTUAL TABLE v USING vec0("
|
||||
"emb float[16] indexed by rescore(quantizer=bit)%s)", metric_str);
|
||||
}
|
||||
|
||||
rc = sqlite3_exec(db, create_sql, NULL, NULL, NULL);
|
||||
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
|
||||
|
||||
/* Insert vectors using fuzz data */
|
||||
{
|
||||
sqlite3_stmt *ins = NULL;
|
||||
sqlite3_prepare_v2(db,
|
||||
"INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &ins, NULL);
|
||||
if (!ins) { sqlite3_close(db); return 0; }
|
||||
|
||||
size_t cursor = 0;
|
||||
for (int i = 0; i < num_inserts && cursor + 1 < size; i++) {
|
||||
float vec[16];
|
||||
for (int j = 0; j < 16; j++) {
|
||||
if (cursor < size) {
|
||||
/* Map fuzz byte to float -- includes potential for
|
||||
interesting float values via reinterpretation */
|
||||
int8_t sb = (int8_t)data[cursor++];
|
||||
vec[j] = (float)sb / 5.0f;
|
||||
} else {
|
||||
vec[j] = 0.0f;
|
||||
}
|
||||
}
|
||||
sqlite3_reset(ins);
|
||||
sqlite3_bind_int64(ins, 1, (sqlite3_int64)(i + 1));
|
||||
sqlite3_bind_blob(ins, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(ins);
|
||||
}
|
||||
sqlite3_finalize(ins);
|
||||
}
|
||||
|
||||
/* Build a fuzz-controlled query vector from remaining data */
|
||||
float qvec[16] = {0};
|
||||
{
|
||||
size_t cursor = 0;
|
||||
for (int j = 0; j < 16 && cursor < size; j++) {
|
||||
int8_t sb = (int8_t)data[cursor++];
|
||||
qvec[j] = (float)sb / 3.0f;
|
||||
}
|
||||
}
|
||||
|
||||
/* KNN query with fuzz-controlled vector and LIMIT */
|
||||
{
|
||||
char knn_sql[256];
|
||||
snprintf(knn_sql, sizeof(knn_sql),
|
||||
"SELECT rowid, distance FROM v WHERE emb MATCH ? "
|
||||
"ORDER BY distance LIMIT %d", (int)limit_val);
|
||||
|
||||
sqlite3_stmt *knn = NULL;
|
||||
sqlite3_prepare_v2(db, knn_sql, -1, &knn, NULL);
|
||||
if (knn) {
|
||||
sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC);
|
||||
while (sqlite3_step(knn) == SQLITE_ROW) {
|
||||
/* Read results to ensure distance computation didn't produce garbage
|
||||
that crashes the cursor iteration */
|
||||
(void)sqlite3_column_int64(knn, 0);
|
||||
(void)sqlite3_column_double(knn, 1);
|
||||
}
|
||||
sqlite3_finalize(knn);
|
||||
}
|
||||
}
|
||||
|
||||
/* Update some vectors, then query again */
|
||||
{
|
||||
float uvec[16];
|
||||
for (int j = 0; j < 16; j++) uvec[j] = qvec[15 - j]; /* reverse of query */
|
||||
sqlite3_stmt *upd = NULL;
|
||||
sqlite3_prepare_v2(db,
|
||||
"UPDATE v SET emb = ? WHERE rowid = 1", -1, &upd, NULL);
|
||||
if (upd) {
|
||||
sqlite3_bind_blob(upd, 1, uvec, sizeof(uvec), SQLITE_STATIC);
|
||||
sqlite3_step(upd);
|
||||
sqlite3_finalize(upd);
|
||||
}
|
||||
}
|
||||
|
||||
/* Second KNN after update */
|
||||
{
|
||||
sqlite3_stmt *knn = NULL;
|
||||
sqlite3_prepare_v2(db,
|
||||
"SELECT rowid, distance FROM v WHERE emb MATCH ? "
|
||||
"ORDER BY distance LIMIT 10", -1, &knn, NULL);
|
||||
if (knn) {
|
||||
sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC);
|
||||
while (sqlite3_step(knn) == SQLITE_ROW) {}
|
||||
sqlite3_finalize(knn);
|
||||
}
|
||||
}
|
||||
|
||||
/* Delete half the rows, then KNN again */
|
||||
for (int i = 1; i <= num_inserts; i += 2) {
|
||||
char del_sql[64];
|
||||
snprintf(del_sql, sizeof(del_sql),
|
||||
"DELETE FROM v WHERE rowid = %d", i);
|
||||
sqlite3_exec(db, del_sql, NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
/* Third KNN after deletes -- exercises distance computation over
|
||||
zeroed-out slots in the quantized chunk */
|
||||
{
|
||||
sqlite3_stmt *knn = NULL;
|
||||
sqlite3_prepare_v2(db,
|
||||
"SELECT rowid, distance FROM v WHERE emb MATCH ? "
|
||||
"ORDER BY distance LIMIT 5", -1, &knn, NULL);
|
||||
if (knn) {
|
||||
sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC);
|
||||
while (sqlite3_step(knn) == SQLITE_ROW) {}
|
||||
sqlite3_finalize(knn);
|
||||
}
|
||||
}
|
||||
|
||||
sqlite3_close(db);
|
||||
return 0;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue