mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 08:46:49 +02:00
Add DiskANN index for vec0 virtual table
Add DiskANN graph-based index: builds a Vamana graph with configurable R (max degree) and L (search list size, separate for insert/query), supports int8 quantization with rescore, lazy reverse-edge replacement, pre-quantized query optimization, and insert buffer reuse. Includes shadow table management, delete support, KNN integration, compile flag (SQLITE_VEC_ENABLE_DISKANN), release-demo workflow, fuzz targets, and tests. Fixes rescore int8 quantization bug.
This commit is contained in:
parent
e2c38f387c
commit
575371d751
23 changed files with 6550 additions and 135 deletions
|
|
@ -26,7 +26,7 @@ FUZZ_LDFLAGS ?= $(shell \
|
|||
echo "-Wl,-ld_classic"; \
|
||||
fi)
|
||||
|
||||
FUZZ_CFLAGS = $(FUZZ_SANITIZERS) -I ../../ -I ../../vendor -DSQLITE_CORE -g $(FUZZ_LDFLAGS)
|
||||
FUZZ_CFLAGS = $(FUZZ_SANITIZERS) -I ../../ -I ../../vendor -DSQLITE_CORE -DSQLITE_VEC_ENABLE_DISKANN=1 -g $(FUZZ_LDFLAGS)
|
||||
FUZZ_SRCS = ../../vendor/sqlite3.c ../../sqlite-vec.c
|
||||
|
||||
TARGET_DIR = ./targets
|
||||
|
|
@ -115,6 +115,34 @@ $(TARGET_DIR)/ivf_cell_overflow: ivf-cell-overflow.c $(FUZZ_SRCS) | $(TARGET_DIR
|
|||
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
|
||||
|
||||
# The recipe must follow its target directly; without it this rule is
# recipe-less and the ivf_rescore binary is never linked.
$(TARGET_DIR)/ivf_rescore: ivf-rescore.c $(FUZZ_SRCS) | $(TARGET_DIR)
	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
|
||||
$(TARGET_DIR)/diskann_operations: diskann-operations.c $(FUZZ_SRCS) | $(TARGET_DIR)
|
||||
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
|
||||
|
||||
$(TARGET_DIR)/diskann_create: diskann-create.c $(FUZZ_SRCS) | $(TARGET_DIR)
|
||||
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
|
||||
|
||||
$(TARGET_DIR)/diskann_graph_corrupt: diskann-graph-corrupt.c $(FUZZ_SRCS) | $(TARGET_DIR)
|
||||
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
|
||||
|
||||
$(TARGET_DIR)/diskann_deep_search: diskann-deep-search.c $(FUZZ_SRCS) | $(TARGET_DIR)
|
||||
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
|
||||
|
||||
$(TARGET_DIR)/diskann_blob_truncate: diskann-blob-truncate.c $(FUZZ_SRCS) | $(TARGET_DIR)
|
||||
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
|
||||
|
||||
$(TARGET_DIR)/diskann_delete_stress: diskann-delete-stress.c $(FUZZ_SRCS) | $(TARGET_DIR)
|
||||
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
|
||||
|
||||
$(TARGET_DIR)/diskann_buffer_flush: diskann-buffer-flush.c $(FUZZ_SRCS) | $(TARGET_DIR)
|
||||
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
|
||||
|
||||
$(TARGET_DIR)/diskann_int8_quant: diskann-int8-quant.c $(FUZZ_SRCS) | $(TARGET_DIR)
|
||||
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
|
||||
|
||||
$(TARGET_DIR)/diskann_prune_direct: diskann-prune-direct.c $(FUZZ_SRCS) | $(TARGET_DIR)
|
||||
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
|
||||
|
||||
$(TARGET_DIR)/diskann_command_inject: diskann-command-inject.c $(FUZZ_SRCS) | $(TARGET_DIR)
|
||||
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
|
||||
|
||||
FUZZ_TARGETS = vec0_create exec json numpy \
|
||||
|
|
@ -127,6 +155,11 @@ FUZZ_TARGETS = vec0_create exec json numpy \
|
|||
ivf_create ivf_operations \
|
||||
ivf_quantize ivf_kmeans ivf_shadow_corrupt \
|
||||
ivf_knn_deep ivf_cell_overflow ivf_rescore \
|
||||
diskann_operations diskann_create diskann_graph_corrupt \
|
||||
diskann_deep_search diskann_blob_truncate \
|
||||
diskann_delete_stress diskann_buffer_flush \
|
||||
diskann_int8_quant diskann_prune_direct \
|
||||
diskann_command_inject
|
||||
|
||||
all: $(addprefix $(TARGET_DIR)/,$(FUZZ_TARGETS))
|
||||
|
||||
|
|
|
|||
250
tests/fuzz/diskann-blob-truncate.c
Normal file
250
tests/fuzz/diskann-blob-truncate.c
Normal file
|
|
@ -0,0 +1,250 @@
|
|||
/**
|
||||
* Fuzz target for DiskANN shadow table blob size mismatches.
|
||||
*
|
||||
* The critical vulnerability: diskann_node_read() copies whatever blob size
|
||||
* SQLite returns, but diskann_search/insert/delete index into those blobs
|
||||
* using cfg->n_neighbors * sizeof(i64) etc. If the blob is truncated,
|
||||
* extended, or has wrong size, this causes out-of-bounds reads/writes.
|
||||
*
|
||||
* This fuzzer:
|
||||
* 1. Creates a valid DiskANN graph with several nodes
|
||||
* 2. Uses fuzz data to directly write malformed blobs to shadow tables:
|
||||
* - Truncated neighbor_ids (fewer bytes than n_neighbors * 8)
|
||||
* - Truncated validity bitmaps
|
||||
* - Oversized blobs with garbage trailing data
|
||||
* - Zero-length blobs
|
||||
* - Blobs with valid headers but corrupted neighbor rowids
|
||||
* 3. Runs INSERT, DELETE, and KNN operations that traverse the corrupted graph
|
||||
*
|
||||
* Key code paths targeted:
|
||||
* - diskann_node_read with mismatched blob sizes
|
||||
* - diskann_validity_get / diskann_neighbor_id_get on truncated blobs
|
||||
* - diskann_add_reverse_edge reading corrupted neighbor data
|
||||
* - diskann_repair_reverse_edges traversing corrupted neighbor lists
|
||||
* - diskann_search iterating neighbors from corrupted blobs
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "sqlite-vec.h"
|
||||
#include "sqlite3.h"
|
||||
#include <assert.h>
|
||||
|
||||
/**
 * Pop the next byte from the fuzz input cursor.
 *
 * @param data  in/out cursor into the fuzz buffer; advanced by one on success
 * @param size  in/out count of bytes remaining; decremented by one on success
 * @param def   value returned, without touching the cursor, when no bytes remain
 * @return the consumed byte, or `def` if the input is exhausted
 */
static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) {
  if (*size > 0) {
    const uint8_t *cursor = *data;
    *data = cursor + 1;
    *size -= 1;
    return *cursor;
  }
  return def;
}
|
||||
|
||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
||||
if (size < 32) return 0;
|
||||
|
||||
int rc;
|
||||
sqlite3 *db;
|
||||
|
||||
rc = sqlite3_open(":memory:", &db);
|
||||
assert(rc == SQLITE_OK);
|
||||
rc = sqlite3_vec_init(db, NULL, NULL);
|
||||
assert(rc == SQLITE_OK);
|
||||
|
||||
/* Use binary quantizer, float[16], n_neighbors=8 for predictable blob sizes:
|
||||
* validity: 8/8 = 1 byte
|
||||
* neighbor_ids: 8 * 8 = 64 bytes
|
||||
* qvecs: 8 * (16/8) = 16 bytes (binary: 2 bytes per qvec)
|
||||
*/
|
||||
rc = sqlite3_exec(db,
|
||||
"CREATE VIRTUAL TABLE v USING vec0("
|
||||
"emb float[16] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8))",
|
||||
NULL, NULL, NULL);
|
||||
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
|
||||
|
||||
/* Insert 12 vectors to create a valid graph structure */
|
||||
{
|
||||
sqlite3_stmt *stmt;
|
||||
sqlite3_prepare_v2(db,
|
||||
"INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL);
|
||||
for (int i = 1; i <= 12; i++) {
|
||||
float vec[16];
|
||||
for (int j = 0; j < 16; j++) {
|
||||
vec[j] = (float)i * 0.1f + (float)j * 0.01f;
|
||||
}
|
||||
sqlite3_reset(stmt);
|
||||
sqlite3_bind_int64(stmt, 1, i);
|
||||
sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmt);
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
|
||||
/* Now corrupt shadow table blobs using fuzz data */
|
||||
const char *columns[] = {
|
||||
"neighbors_validity",
|
||||
"neighbor_ids",
|
||||
"neighbor_quantized_vectors"
|
||||
};
|
||||
|
||||
/* Expected sizes for n_neighbors=8, dims=16, binary quantizer */
|
||||
int expected_sizes[] = {1, 64, 16};
|
||||
|
||||
while (size >= 4) {
|
||||
int target_row = (fuzz_byte(&data, &size, 0) % 12) + 1;
|
||||
int col_idx = fuzz_byte(&data, &size, 0) % 3;
|
||||
uint8_t corrupt_mode = fuzz_byte(&data, &size, 0) % 6;
|
||||
uint8_t extra = fuzz_byte(&data, &size, 0);
|
||||
|
||||
char sqlbuf[256];
|
||||
snprintf(sqlbuf, sizeof(sqlbuf),
|
||||
"UPDATE v_diskann_nodes00 SET %s = ? WHERE rowid = ?",
|
||||
columns[col_idx]);
|
||||
|
||||
sqlite3_stmt *writeStmt;
|
||||
rc = sqlite3_prepare_v2(db, sqlbuf, -1, &writeStmt, NULL);
|
||||
if (rc != SQLITE_OK) continue;
|
||||
|
||||
int expected = expected_sizes[col_idx];
|
||||
unsigned char *blob = NULL;
|
||||
int blob_size = 0;
|
||||
|
||||
switch (corrupt_mode) {
|
||||
case 0: {
|
||||
/* Truncated blob: 0 to expected-1 bytes */
|
||||
blob_size = extra % expected;
|
||||
if (blob_size == 0) blob_size = 0; /* zero-length is interesting */
|
||||
blob = sqlite3_malloc(blob_size > 0 ? blob_size : 1);
|
||||
if (!blob) { sqlite3_finalize(writeStmt); continue; }
|
||||
for (int i = 0; i < blob_size; i++) {
|
||||
blob[i] = fuzz_byte(&data, &size, 0);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
/* Oversized blob: expected + extra bytes */
|
||||
blob_size = expected + (extra % 64);
|
||||
blob = sqlite3_malloc(blob_size);
|
||||
if (!blob) { sqlite3_finalize(writeStmt); continue; }
|
||||
for (int i = 0; i < blob_size; i++) {
|
||||
blob[i] = fuzz_byte(&data, &size, 0xFF);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
/* Zero-length blob */
|
||||
blob_size = 0;
|
||||
blob = NULL;
|
||||
sqlite3_bind_zeroblob(writeStmt, 1, 0);
|
||||
sqlite3_bind_int64(writeStmt, 2, target_row);
|
||||
sqlite3_step(writeStmt);
|
||||
sqlite3_finalize(writeStmt);
|
||||
continue;
|
||||
}
|
||||
case 3: {
|
||||
/* Correct size but all-ones validity (all slots "valid") with
|
||||
* garbage neighbor IDs -- forces reading non-existent nodes */
|
||||
blob_size = expected;
|
||||
blob = sqlite3_malloc(blob_size);
|
||||
if (!blob) { sqlite3_finalize(writeStmt); continue; }
|
||||
memset(blob, 0xFF, blob_size);
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
/* neighbor_ids with very large rowid values (near INT64_MAX) */
|
||||
blob_size = expected;
|
||||
blob = sqlite3_malloc(blob_size);
|
||||
if (!blob) { sqlite3_finalize(writeStmt); continue; }
|
||||
memset(blob, 0x7F, blob_size); /* fills with large positive values */
|
||||
break;
|
||||
}
|
||||
case 5: {
|
||||
/* neighbor_ids with negative rowid values (rowid=0 is sentinel) */
|
||||
blob_size = expected;
|
||||
blob = sqlite3_malloc(blob_size);
|
||||
if (!blob) { sqlite3_finalize(writeStmt); continue; }
|
||||
memset(blob, 0x80, blob_size); /* fills with large negative values */
|
||||
/* Flip some bytes from fuzz data */
|
||||
for (int i = 0; i < blob_size && size > 0; i++) {
|
||||
blob[i] ^= fuzz_byte(&data, &size, 0);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (blob) {
|
||||
sqlite3_bind_blob(writeStmt, 1, blob, blob_size, SQLITE_TRANSIENT);
|
||||
} else {
|
||||
sqlite3_bind_blob(writeStmt, 1, "", 0, SQLITE_STATIC);
|
||||
}
|
||||
sqlite3_bind_int64(writeStmt, 2, target_row);
|
||||
sqlite3_step(writeStmt);
|
||||
sqlite3_finalize(writeStmt);
|
||||
sqlite3_free(blob);
|
||||
}
|
||||
|
||||
/* Exercise the corrupted graph with various operations */
|
||||
|
||||
/* KNN query */
|
||||
{
|
||||
float qvec[16];
|
||||
for (int j = 0; j < 16; j++) qvec[j] = (float)j * 0.1f;
|
||||
sqlite3_stmt *knnStmt;
|
||||
rc = sqlite3_prepare_v2(db,
|
||||
"SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = 5",
|
||||
-1, &knnStmt, NULL);
|
||||
if (rc == SQLITE_OK) {
|
||||
sqlite3_bind_blob(knnStmt, 1, qvec, sizeof(qvec), SQLITE_STATIC);
|
||||
while (sqlite3_step(knnStmt) == SQLITE_ROW) {}
|
||||
sqlite3_finalize(knnStmt);
|
||||
}
|
||||
}
|
||||
|
||||
/* Insert into corrupted graph (triggers add_reverse_edge on corrupted nodes) */
|
||||
{
|
||||
float vec[16];
|
||||
for (int j = 0; j < 16; j++) vec[j] = 0.5f;
|
||||
sqlite3_stmt *stmt;
|
||||
sqlite3_prepare_v2(db,
|
||||
"INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL);
|
||||
if (stmt) {
|
||||
sqlite3_bind_int64(stmt, 1, 100);
|
||||
sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmt);
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
}
|
||||
|
||||
/* Delete from corrupted graph (triggers repair_reverse_edges) */
|
||||
{
|
||||
sqlite3_stmt *stmt;
|
||||
sqlite3_prepare_v2(db,
|
||||
"DELETE FROM v WHERE rowid = ?", -1, &stmt, NULL);
|
||||
if (stmt) {
|
||||
sqlite3_bind_int64(stmt, 1, 5);
|
||||
sqlite3_step(stmt);
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
}
|
||||
|
||||
/* Another KNN to traverse the post-mutation graph */
|
||||
{
|
||||
float qvec[16];
|
||||
for (int j = 0; j < 16; j++) qvec[j] = -0.5f + (float)j * 0.07f;
|
||||
sqlite3_stmt *knnStmt;
|
||||
rc = sqlite3_prepare_v2(db,
|
||||
"SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = 12",
|
||||
-1, &knnStmt, NULL);
|
||||
if (rc == SQLITE_OK) {
|
||||
sqlite3_bind_blob(knnStmt, 1, qvec, sizeof(qvec), SQLITE_STATIC);
|
||||
while (sqlite3_step(knnStmt) == SQLITE_ROW) {}
|
||||
sqlite3_finalize(knnStmt);
|
||||
}
|
||||
}
|
||||
|
||||
/* Full scan */
|
||||
sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL);
|
||||
|
||||
sqlite3_close(db);
|
||||
return 0;
|
||||
}
|
||||
164
tests/fuzz/diskann-buffer-flush.c
Normal file
164
tests/fuzz/diskann-buffer-flush.c
Normal file
|
|
@ -0,0 +1,164 @@
|
|||
/**
|
||||
* Fuzz target for DiskANN buffered insert and flush paths.
|
||||
*
|
||||
* When buffer_threshold > 0, inserts go into a flat buffer table and
|
||||
* are flushed into the graph in batch. This fuzzer exercises:
|
||||
*
|
||||
* - diskann_buffer_write / diskann_buffer_delete / diskann_buffer_exists
|
||||
* - diskann_flush_buffer (batch graph insertion)
|
||||
* - diskann_insert with buffer_threshold (batching logic)
|
||||
* - Buffer-graph merge in vec0Filter_knn_diskann (unflushed vectors
|
||||
* must be scanned during KNN and merged with graph results)
|
||||
* - Delete of a buffered (not yet flushed) vector
|
||||
* - Delete of a graph vector while buffer has pending inserts
|
||||
* - Interaction: insert to buffer, query (triggers buffer scan), flush,
|
||||
* query again (now from graph)
|
||||
*
|
||||
* The buffer merge path in vec0Filter_knn_diskann is particularly
|
||||
* interesting because it does a brute-force scan of buffer vectors and
|
||||
* merges with the top-k from graph search.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "sqlite-vec.h"
|
||||
#include "sqlite3.h"
|
||||
#include <assert.h>
|
||||
|
||||
/**
 * Pop the next byte from the fuzz input cursor.
 *
 * @param data  in/out cursor into the fuzz buffer; advanced by one on success
 * @param size  in/out count of bytes remaining; decremented by one on success
 * @param def   value returned, without touching the cursor, when no bytes remain
 * @return the consumed byte, or `def` if the input is exhausted
 */
static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) {
  if (*size > 0) {
    const uint8_t *cursor = *data;
    *data = cursor + 1;
    *size -= 1;
    return *cursor;
  }
  return def;
}
|
||||
|
||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
||||
if (size < 16) return 0;
|
||||
|
||||
int rc;
|
||||
sqlite3 *db;
|
||||
rc = sqlite3_open(":memory:", &db);
|
||||
assert(rc == SQLITE_OK);
|
||||
rc = sqlite3_vec_init(db, NULL, NULL);
|
||||
assert(rc == SQLITE_OK);
|
||||
|
||||
/* buffer_threshold: small (3-8) to trigger frequent flushes */
|
||||
int buf_threshold = 3 + (fuzz_byte(&data, &size, 0) % 6);
|
||||
int dims = 8;
|
||||
|
||||
char sql[512];
|
||||
snprintf(sql, sizeof(sql),
|
||||
"CREATE VIRTUAL TABLE v USING vec0("
|
||||
"emb float[%d] INDEXED BY diskann("
|
||||
"neighbor_quantizer=binary, n_neighbors=8, "
|
||||
"search_list_size=16, buffer_threshold=%d"
|
||||
"))", dims, buf_threshold);
|
||||
|
||||
rc = sqlite3_exec(db, sql, NULL, NULL, NULL);
|
||||
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
|
||||
|
||||
sqlite3_stmt *stmtInsert = NULL, *stmtDelete = NULL, *stmtKnn = NULL;
|
||||
sqlite3_prepare_v2(db,
|
||||
"INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL);
|
||||
sqlite3_prepare_v2(db,
|
||||
"DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL);
|
||||
sqlite3_prepare_v2(db,
|
||||
"SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = ?",
|
||||
-1, &stmtKnn, NULL);
|
||||
|
||||
if (!stmtInsert || !stmtDelete || !stmtKnn) goto cleanup;
|
||||
|
||||
float vec[8];
|
||||
int next_rowid = 1;
|
||||
|
||||
while (size >= 2) {
|
||||
uint8_t op = fuzz_byte(&data, &size, 0) % 6;
|
||||
uint8_t param = fuzz_byte(&data, &size, 0);
|
||||
|
||||
switch (op) {
|
||||
case 0: { /* Insert: accumulates in buffer until threshold */
|
||||
int64_t rowid = next_rowid++;
|
||||
if (next_rowid > 64) next_rowid = 1; /* wrap around for reuse */
|
||||
for (int j = 0; j < dims; j++) {
|
||||
vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f;
|
||||
}
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, rowid);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
break;
|
||||
}
|
||||
case 1: { /* KNN query while buffer may have unflushed vectors */
|
||||
for (int j = 0; j < dims; j++) {
|
||||
vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f;
|
||||
}
|
||||
int k = (param % 10) + 1;
|
||||
sqlite3_reset(stmtKnn);
|
||||
sqlite3_bind_blob(stmtKnn, 1, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_bind_int(stmtKnn, 2, k);
|
||||
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {}
|
||||
break;
|
||||
}
|
||||
case 2: { /* Delete a potentially-buffered vector */
|
||||
int64_t rowid = (int64_t)(param % 64) + 1;
|
||||
sqlite3_reset(stmtDelete);
|
||||
sqlite3_bind_int64(stmtDelete, 1, rowid);
|
||||
sqlite3_step(stmtDelete);
|
||||
break;
|
||||
}
|
||||
case 3: { /* Insert several at once to trigger flush mid-batch */
|
||||
for (int i = 0; i < buf_threshold + 1 && size >= 2; i++) {
|
||||
int64_t rowid = (int64_t)(fuzz_byte(&data, &size, 0) % 64) + 1;
|
||||
for (int j = 0; j < dims; j++) {
|
||||
vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f;
|
||||
}
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, rowid);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 4: { /* Insert then immediately delete (still in buffer) */
|
||||
int64_t rowid = (int64_t)(param % 64) + 1;
|
||||
for (int j = 0; j < dims; j++) vec[j] = 0.1f * param;
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, rowid);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
|
||||
sqlite3_reset(stmtDelete);
|
||||
sqlite3_bind_int64(stmtDelete, 1, rowid);
|
||||
sqlite3_step(stmtDelete);
|
||||
break;
|
||||
}
|
||||
case 5: { /* Query with k=0 and k=1 (boundary) */
|
||||
for (int j = 0; j < dims; j++) vec[j] = 0.0f;
|
||||
sqlite3_reset(stmtKnn);
|
||||
sqlite3_bind_blob(stmtKnn, 1, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_bind_int(stmtKnn, 2, param % 2); /* k=0 or k=1 */
|
||||
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Final query to exercise post-operation state */
|
||||
{
|
||||
float qvec[8] = {1.0f, -1.0f, 0.5f, -0.5f, 0.0f, 0.0f, 0.0f, 0.0f};
|
||||
sqlite3_reset(stmtKnn);
|
||||
sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_TRANSIENT);
|
||||
sqlite3_bind_int(stmtKnn, 2, 20);
|
||||
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
sqlite3_finalize(stmtInsert);
|
||||
sqlite3_finalize(stmtDelete);
|
||||
sqlite3_finalize(stmtKnn);
|
||||
sqlite3_close(db);
|
||||
return 0;
|
||||
}
|
||||
158
tests/fuzz/diskann-command-inject.c
Normal file
158
tests/fuzz/diskann-command-inject.c
Normal file
|
|
@ -0,0 +1,158 @@
|
|||
/**
|
||||
* Fuzz target for DiskANN runtime command dispatch (diskann_handle_command).
|
||||
*
|
||||
* The command handler parses strings like "search_list_size_search=42" and
|
||||
* modifies live DiskANN config. This fuzzer exercises:
|
||||
*
|
||||
* - atoi on fuzz-controlled strings (integer overflow, negative, non-numeric)
|
||||
* - strncmp boundary with fuzz data (near-matches to valid commands)
|
||||
* - Changing search_list_size mid-operation (affects subsequent queries)
|
||||
* - Setting search_list_size to 1 (minimum - single-candidate beam search)
|
||||
* - Setting search_list_size very large (memory pressure)
|
||||
* - Interleaving command changes with inserts and queries
|
||||
*
|
||||
* Commands are sent through the vtable via INSERT INTO v(rowid) VALUES (?).
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "sqlite-vec.h"
|
||||
#include "sqlite3.h"
|
||||
#include <assert.h>
|
||||
|
||||
/**
 * Pop the next byte from the fuzz input cursor.
 *
 * @param data  in/out cursor into the fuzz buffer; advanced by one on success
 * @param size  in/out count of bytes remaining; decremented by one on success
 * @param def   value returned, without touching the cursor, when no bytes remain
 * @return the consumed byte, or `def` if the input is exhausted
 */
static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) {
  if (*size > 0) {
    const uint8_t *cursor = *data;
    *data = cursor + 1;
    *size -= 1;
    return *cursor;
  }
  return def;
}
|
||||
|
||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
||||
if (size < 20) return 0;
|
||||
|
||||
int rc;
|
||||
sqlite3 *db;
|
||||
rc = sqlite3_open(":memory:", &db);
|
||||
assert(rc == SQLITE_OK);
|
||||
rc = sqlite3_vec_init(db, NULL, NULL);
|
||||
assert(rc == SQLITE_OK);
|
||||
|
||||
rc = sqlite3_exec(db,
|
||||
"CREATE VIRTUAL TABLE v USING vec0("
|
||||
"emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8))",
|
||||
NULL, NULL, NULL);
|
||||
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
|
||||
|
||||
/* Insert some vectors first */
|
||||
{
|
||||
sqlite3_stmt *stmt;
|
||||
sqlite3_prepare_v2(db,
|
||||
"INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL);
|
||||
for (int i = 1; i <= 8; i++) {
|
||||
float vec[8];
|
||||
for (int j = 0; j < 8; j++) vec[j] = (float)i * 0.1f + (float)j * 0.01f;
|
||||
sqlite3_reset(stmt);
|
||||
sqlite3_bind_int64(stmt, 1, i);
|
||||
sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmt);
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
|
||||
sqlite3_stmt *stmtCmd = NULL;
|
||||
sqlite3_stmt *stmtInsert = NULL;
|
||||
sqlite3_stmt *stmtKnn = NULL;
|
||||
|
||||
/* Commands are dispatched via INSERT INTO t(rowid) VALUES ('cmd_string') */
|
||||
sqlite3_prepare_v2(db,
|
||||
"INSERT INTO v(rowid) VALUES (?)", -1, &stmtCmd, NULL);
|
||||
sqlite3_prepare_v2(db,
|
||||
"INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL);
|
||||
sqlite3_prepare_v2(db,
|
||||
"SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = ?",
|
||||
-1, &stmtKnn, NULL);
|
||||
|
||||
if (!stmtCmd || !stmtInsert || !stmtKnn) goto cleanup;
|
||||
|
||||
/* Fuzz-driven command + operation interleaving */
|
||||
while (size >= 2) {
|
||||
uint8_t op = fuzz_byte(&data, &size, 0) % 5;
|
||||
|
||||
switch (op) {
|
||||
case 0: { /* Send fuzz command string */
|
||||
int cmd_len = fuzz_byte(&data, &size, 0) % 64;
|
||||
char cmd[65];
|
||||
for (int i = 0; i < cmd_len && size > 0; i++) {
|
||||
cmd[i] = (char)fuzz_byte(&data, &size, 0);
|
||||
}
|
||||
cmd[cmd_len] = '\0';
|
||||
sqlite3_reset(stmtCmd);
|
||||
sqlite3_bind_text(stmtCmd, 1, cmd, -1, SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtCmd); /* May fail -- that's expected */
|
||||
break;
|
||||
}
|
||||
case 1: { /* Send valid-looking command with fuzz value */
|
||||
const char *prefixes[] = {
|
||||
"search_list_size=",
|
||||
"search_list_size_search=",
|
||||
"search_list_size_insert=",
|
||||
};
|
||||
int prefix_idx = fuzz_byte(&data, &size, 0) % 3;
|
||||
int val = (int)(int8_t)fuzz_byte(&data, &size, 0);
|
||||
|
||||
char cmd[128];
|
||||
snprintf(cmd, sizeof(cmd), "%s%d", prefixes[prefix_idx], val);
|
||||
sqlite3_reset(stmtCmd);
|
||||
sqlite3_bind_text(stmtCmd, 1, cmd, -1, SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtCmd);
|
||||
break;
|
||||
}
|
||||
case 2: { /* KNN query (uses whatever search_list_size is set) */
|
||||
float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
|
||||
qvec[0] = (float)((int8_t)fuzz_byte(&data, &size, 127)) / 10.0f;
|
||||
int k = fuzz_byte(&data, &size, 3) % 10 + 1;
|
||||
sqlite3_reset(stmtKnn);
|
||||
sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_TRANSIENT);
|
||||
sqlite3_bind_int(stmtKnn, 2, k);
|
||||
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {}
|
||||
break;
|
||||
}
|
||||
case 3: { /* Insert (uses whatever search_list_size_insert is set) */
|
||||
int64_t rowid = (int64_t)(fuzz_byte(&data, &size, 0) % 32) + 1;
|
||||
float vec[8];
|
||||
for (int j = 0; j < 8; j++) {
|
||||
vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f;
|
||||
}
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, rowid);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
break;
|
||||
}
|
||||
case 4: { /* Set search_list_size to extreme values */
|
||||
const char *extreme_cmds[] = {
|
||||
"search_list_size=1",
|
||||
"search_list_size=2",
|
||||
"search_list_size=1000",
|
||||
"search_list_size_search=1",
|
||||
"search_list_size_insert=1",
|
||||
};
|
||||
int idx = fuzz_byte(&data, &size, 0) % 5;
|
||||
sqlite3_reset(stmtCmd);
|
||||
sqlite3_bind_text(stmtCmd, 1, extreme_cmds[idx], -1, SQLITE_STATIC);
|
||||
sqlite3_step(stmtCmd);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
sqlite3_finalize(stmtCmd);
|
||||
sqlite3_finalize(stmtInsert);
|
||||
sqlite3_finalize(stmtKnn);
|
||||
sqlite3_close(db);
|
||||
return 0;
|
||||
}
|
||||
44
tests/fuzz/diskann-create.c
Normal file
44
tests/fuzz/diskann-create.c
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
/**
|
||||
* Fuzz target for DiskANN CREATE TABLE config parsing.
|
||||
* Feeds fuzz data as the INDEXED BY diskann(...) option string.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "sqlite-vec.h"
|
||||
#include "sqlite3.h"
|
||||
#include <assert.h>
|
||||
|
||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
||||
if (size > 4096) return 0; /* Limit input size */
|
||||
|
||||
int rc;
|
||||
sqlite3 *db;
|
||||
sqlite3_stmt *stmt;
|
||||
|
||||
rc = sqlite3_open(":memory:", &db);
|
||||
assert(rc == SQLITE_OK);
|
||||
rc = sqlite3_vec_init(db, NULL, NULL);
|
||||
assert(rc == SQLITE_OK);
|
||||
|
||||
sqlite3_str *s = sqlite3_str_new(NULL);
|
||||
assert(s);
|
||||
sqlite3_str_appendall(s,
|
||||
"CREATE VIRTUAL TABLE v USING vec0("
|
||||
"emb float[64] INDEXED BY diskann(");
|
||||
sqlite3_str_appendf(s, "%.*s", (int)size, data);
|
||||
sqlite3_str_appendall(s, "))");
|
||||
const char *zSql = sqlite3_str_finish(s);
|
||||
assert(zSql);
|
||||
|
||||
rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, NULL);
|
||||
sqlite3_free((char *)zSql);
|
||||
if (rc == SQLITE_OK) {
|
||||
sqlite3_step(stmt);
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
sqlite3_close(db);
|
||||
return 0;
|
||||
}
|
||||
187
tests/fuzz/diskann-deep-search.c
Normal file
187
tests/fuzz/diskann-deep-search.c
Normal file
|
|
@ -0,0 +1,187 @@
|
|||
/**
|
||||
* Fuzz target for DiskANN greedy beam search deep paths.
|
||||
*
|
||||
* Builds a graph with enough nodes to force multi-hop traversal, then
|
||||
* uses fuzz data to control: query vector values, k, search_list_size
|
||||
* overrides, and interleaved insert/delete/query sequences that stress
|
||||
* the candidate list growth, visited set hash collisions, and the
|
||||
* re-ranking logic.
|
||||
*
|
||||
* Key code paths targeted:
|
||||
* - diskann_candidate_list_insert (sorted insert, dedup, eviction)
|
||||
* - diskann_visited_set (hash collisions, capacity)
|
||||
* - diskann_search (full beam search loop, re-ranking with exact dist)
|
||||
* - diskann_distance_quantized_precomputed (both binary and int8)
|
||||
* - Buffer merge in vec0Filter_knn_diskann
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include "sqlite-vec.h"
|
||||
#include "sqlite3.h"
|
||||
#include <assert.h>
|
||||
|
||||
/**
 * Pop the next byte from the fuzz input cursor.
 *
 * @param data  in/out cursor into the fuzz buffer; advanced by one on success
 * @param size  in/out count of bytes remaining; decremented by one on success
 * @param def   value returned, without touching the cursor, when no bytes remain
 * @return the consumed byte, or `def` if the input is exhausted
 */
static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) {
  if (*size > 0) {
    const uint8_t *cursor = *data;
    *data = cursor + 1;
    *size -= 1;
    return *cursor;
  }
  return def;
}
|
||||
|
||||
/**
 * Consume two bytes and combine them little-endian: the first byte read is
 * the low byte, the second the high byte. Bytes missing from an exhausted
 * input read as 0.
 */
static uint16_t fuzz_u16(const uint8_t **data, size_t *size) {
  uint16_t value = fuzz_byte(data, size, 0);
  value |= (uint16_t)((uint16_t)fuzz_byte(data, size, 0) << 8);
  return value;
}
|
||||
|
||||
/**
 * Consume one byte, reinterpret it as a signed 8-bit value and scale it by
 * 1/10, giving a float in [-12.8, 12.7]. An exhausted input yields 0.0f.
 */
static float fuzz_float(const uint8_t **data, size_t *size) {
  int8_t raw = (int8_t)fuzz_byte(data, size, 0);
  return (float)raw / 10.0f;
}
|
||||
|
||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
||||
if (size < 32) return 0;
|
||||
|
||||
/* Use first bytes to pick quantizer type and dimensions */
|
||||
uint8_t quantizer_choice = fuzz_byte(&data, &size, 0) % 2;
|
||||
const char *quantizer = quantizer_choice ? "int8" : "binary";
|
||||
|
||||
/* Dimensions must be divisible by 8. Pick from {8, 16, 32} */
|
||||
int dim_choices[] = {8, 16, 32};
|
||||
int dims = dim_choices[fuzz_byte(&data, &size, 0) % 3];
|
||||
|
||||
/* n_neighbors: 8 or 16 -- small to force full-neighbor scenarios quickly */
|
||||
int n_neighbors = (fuzz_byte(&data, &size, 0) % 2) ? 16 : 8;
|
||||
|
||||
/* search_list_size: small so beam search terminates quickly but still exercises loops */
|
||||
int search_list_size = 8 + (fuzz_byte(&data, &size, 0) % 24);
|
||||
|
||||
/* alpha: vary to test RobustPrune pruning logic */
|
||||
float alpha_choices[] = {1.0f, 1.2f, 1.5f, 2.0f};
|
||||
float alpha = alpha_choices[fuzz_byte(&data, &size, 0) % 4];
|
||||
|
||||
int rc;
|
||||
sqlite3 *db;
|
||||
rc = sqlite3_open(":memory:", &db);
|
||||
assert(rc == SQLITE_OK);
|
||||
rc = sqlite3_vec_init(db, NULL, NULL);
|
||||
assert(rc == SQLITE_OK);
|
||||
|
||||
char sql[512];
|
||||
snprintf(sql, sizeof(sql),
|
||||
"CREATE VIRTUAL TABLE v USING vec0("
|
||||
"emb float[%d] INDEXED BY diskann("
|
||||
"neighbor_quantizer=%s, n_neighbors=%d, "
|
||||
"search_list_size=%d"
|
||||
"))", dims, quantizer, n_neighbors, search_list_size);
|
||||
|
||||
rc = sqlite3_exec(db, sql, NULL, NULL, NULL);
|
||||
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
|
||||
|
||||
sqlite3_stmt *stmtInsert = NULL, *stmtDelete = NULL, *stmtKnn = NULL;
|
||||
sqlite3_prepare_v2(db,
|
||||
"INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL);
|
||||
sqlite3_prepare_v2(db,
|
||||
"DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL);
|
||||
|
||||
char knn_sql[256];
|
||||
snprintf(knn_sql, sizeof(knn_sql),
|
||||
"SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = ?");
|
||||
sqlite3_prepare_v2(db, knn_sql, -1, &stmtKnn, NULL);
|
||||
|
||||
if (!stmtInsert || !stmtDelete || !stmtKnn) goto cleanup;
|
||||
|
||||
/* Phase 1: Seed the graph with enough nodes to create multi-hop structure.
|
||||
* Insert 2*n_neighbors nodes so the graph is dense enough for search
|
||||
* to actually traverse multiple hops. */
|
||||
int seed_count = n_neighbors * 2;
|
||||
if (seed_count > 64) seed_count = 64; /* Bound for performance */
|
||||
{
|
||||
float *vec = malloc(dims * sizeof(float));
|
||||
if (!vec) goto cleanup;
|
||||
for (int i = 1; i <= seed_count; i++) {
|
||||
for (int j = 0; j < dims; j++) {
|
||||
vec[j] = fuzz_float(&data, &size);
|
||||
}
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, i);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
}
|
||||
free(vec);
|
||||
}
|
||||
|
||||
/* Phase 2: Fuzz-driven operations on the seeded graph */
|
||||
float *vec = malloc(dims * sizeof(float));
|
||||
if (!vec) goto cleanup;
|
||||
|
||||
while (size >= 2) {
|
||||
uint8_t op = fuzz_byte(&data, &size, 0) % 5;
|
||||
uint8_t param = fuzz_byte(&data, &size, 0);
|
||||
|
||||
switch (op) {
|
||||
case 0: { /* INSERT with fuzz-controlled vector and rowid */
|
||||
int64_t rowid = (int64_t)(param % 128) + 1;
|
||||
for (int j = 0; j < dims; j++) {
|
||||
vec[j] = fuzz_float(&data, &size);
|
||||
}
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, rowid);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
break;
|
||||
}
|
||||
case 1: { /* DELETE */
|
||||
int64_t rowid = (int64_t)(param % 128) + 1;
|
||||
sqlite3_reset(stmtDelete);
|
||||
sqlite3_bind_int64(stmtDelete, 1, rowid);
|
||||
sqlite3_step(stmtDelete);
|
||||
break;
|
||||
}
|
||||
case 2: { /* KNN with fuzz query vector and variable k */
|
||||
for (int j = 0; j < dims; j++) {
|
||||
vec[j] = fuzz_float(&data, &size);
|
||||
}
|
||||
int k = (param % 20) + 1;
|
||||
sqlite3_reset(stmtKnn);
|
||||
sqlite3_bind_blob(stmtKnn, 1, vec, dims * sizeof(float), SQLITE_TRANSIENT);
|
||||
sqlite3_bind_int(stmtKnn, 2, k);
|
||||
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {}
|
||||
break;
|
||||
}
|
||||
case 3: { /* KNN with k > number of nodes (boundary) */
|
||||
for (int j = 0; j < dims; j++) {
|
||||
vec[j] = fuzz_float(&data, &size);
|
||||
}
|
||||
sqlite3_reset(stmtKnn);
|
||||
sqlite3_bind_blob(stmtKnn, 1, vec, dims * sizeof(float), SQLITE_TRANSIENT);
|
||||
sqlite3_bind_int(stmtKnn, 2, 1000); /* k >> graph size */
|
||||
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {}
|
||||
break;
|
||||
}
|
||||
case 4: { /* INSERT duplicate rowid (triggers OR REPLACE path) */
|
||||
int64_t rowid = (int64_t)(param % 32) + 1;
|
||||
for (int j = 0; j < dims; j++) {
|
||||
vec[j] = (float)(param + j) / 50.0f;
|
||||
}
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, rowid);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
free(vec);
|
||||
|
||||
cleanup:
|
||||
sqlite3_finalize(stmtInsert);
|
||||
sqlite3_finalize(stmtDelete);
|
||||
sqlite3_finalize(stmtKnn);
|
||||
sqlite3_close(db);
|
||||
return 0;
|
||||
}
|
||||
175
tests/fuzz/diskann-delete-stress.c
Normal file
175
tests/fuzz/diskann-delete-stress.c
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
/**
|
||||
* Fuzz target for DiskANN delete path and graph connectivity maintenance.
|
||||
*
|
||||
* The delete path is the most complex graph mutation:
|
||||
* 1. Read deleted node's neighbor list
|
||||
* 2. For each neighbor, remove deleted node from their list
|
||||
* 3. Try to fill the gap with one of deleted node's other neighbors
|
||||
* 4. Handle medoid deletion (pick new medoid)
|
||||
*
|
||||
* Edge cases this targets:
|
||||
* - Delete the medoid (entry point) -- forces medoid reassignment
|
||||
* - Delete all nodes except one -- graph degenerates
|
||||
* - Delete nodes in a chain -- cascading dangling edges
|
||||
* - Re-insert at deleted rowids -- stale graph edges to old data
|
||||
* - Delete nonexistent rowids -- should be no-op
|
||||
* - Insert-delete-insert same rowid rapidly
|
||||
* - Delete when graph has exactly n_neighbors entries (full nodes)
|
||||
*
|
||||
* Key code paths:
|
||||
* - diskann_delete -> diskann_repair_reverse_edges
|
||||
* - diskann_medoid_handle_delete
|
||||
* - diskann_node_clear_neighbor
|
||||
* - Interaction between delete and concurrent search
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "sqlite-vec.h"
|
||||
#include "sqlite3.h"
|
||||
#include <assert.h>
|
||||
|
||||
/* Pops one byte off the front of the fuzz input.
 *
 * data/size form a cursor over the remaining input; both are advanced by one
 * when a byte is available. Once the input is exhausted the cursor is left
 * untouched and `def` is returned instead. */
static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) {
  if (*size > 0) {
    const uint8_t *cursor = *data;
    *data = cursor + 1;
    *size -= 1;
    return *cursor;
  }
  return def;
}
|
||||
|
||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
||||
if (size < 20) return 0;
|
||||
|
||||
int rc;
|
||||
sqlite3 *db;
|
||||
rc = sqlite3_open(":memory:", &db);
|
||||
assert(rc == SQLITE_OK);
|
||||
rc = sqlite3_vec_init(db, NULL, NULL);
|
||||
assert(rc == SQLITE_OK);
|
||||
|
||||
/* int8 quantizer to exercise that distance code path */
|
||||
uint8_t quant = fuzz_byte(&data, &size, 0) % 2;
|
||||
const char *qname = quant ? "int8" : "binary";
|
||||
|
||||
char sql[256];
|
||||
snprintf(sql, sizeof(sql),
|
||||
"CREATE VIRTUAL TABLE v USING vec0("
|
||||
"emb float[8] INDEXED BY diskann(neighbor_quantizer=%s, n_neighbors=8))",
|
||||
qname);
|
||||
rc = sqlite3_exec(db, sql, NULL, NULL, NULL);
|
||||
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
|
||||
|
||||
sqlite3_stmt *stmtInsert = NULL, *stmtDelete = NULL, *stmtKnn = NULL;
|
||||
sqlite3_prepare_v2(db,
|
||||
"INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL);
|
||||
sqlite3_prepare_v2(db,
|
||||
"DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL);
|
||||
sqlite3_prepare_v2(db,
|
||||
"SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = ?",
|
||||
-1, &stmtKnn, NULL);
|
||||
|
||||
if (!stmtInsert || !stmtDelete || !stmtKnn) goto cleanup;
|
||||
|
||||
/* Phase 1: Build a graph of exactly n_neighbors+2 = 10 nodes.
|
||||
* This makes every node nearly full, maximizing the chance that
|
||||
* inserts trigger the "full node" path in add_reverse_edge. */
|
||||
for (int i = 1; i <= 10; i++) {
|
||||
float vec[8];
|
||||
for (int j = 0; j < 8; j++) {
|
||||
vec[j] = (float)((int8_t)fuzz_byte(&data, &size, (uint8_t)(i*13+j*7))) / 20.0f;
|
||||
}
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, i);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
}
|
||||
|
||||
/* Phase 2: Fuzz-driven delete-heavy workload */
|
||||
while (size >= 2) {
|
||||
uint8_t op = fuzz_byte(&data, &size, 0);
|
||||
uint8_t param = fuzz_byte(&data, &size, 0);
|
||||
|
||||
switch (op % 6) {
|
||||
case 0: /* Delete existing node */
|
||||
case 1: { /* (weighted toward deletes) */
|
||||
int64_t rowid = (int64_t)(param % 16) + 1;
|
||||
sqlite3_reset(stmtDelete);
|
||||
sqlite3_bind_int64(stmtDelete, 1, rowid);
|
||||
sqlite3_step(stmtDelete);
|
||||
break;
|
||||
}
|
||||
case 2: { /* Delete then immediately re-insert same rowid */
|
||||
int64_t rowid = (int64_t)(param % 10) + 1;
|
||||
sqlite3_reset(stmtDelete);
|
||||
sqlite3_bind_int64(stmtDelete, 1, rowid);
|
||||
sqlite3_step(stmtDelete);
|
||||
|
||||
float vec[8];
|
||||
for (int j = 0; j < 8; j++) {
|
||||
vec[j] = (float)((int8_t)fuzz_byte(&data, &size, (uint8_t)(rowid+j))) / 15.0f;
|
||||
}
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, rowid);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
break;
|
||||
}
|
||||
case 3: { /* KNN query on potentially sparse/empty graph */
|
||||
float qvec[8];
|
||||
for (int j = 0; j < 8; j++) {
|
||||
qvec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f;
|
||||
}
|
||||
int k = (param % 15) + 1;
|
||||
sqlite3_reset(stmtKnn);
|
||||
sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_TRANSIENT);
|
||||
sqlite3_bind_int(stmtKnn, 2, k);
|
||||
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {}
|
||||
break;
|
||||
}
|
||||
case 4: { /* Insert new node */
|
||||
int64_t rowid = (int64_t)(param % 32) + 1;
|
||||
float vec[8];
|
||||
for (int j = 0; j < 8; j++) {
|
||||
vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f;
|
||||
}
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, rowid);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
break;
|
||||
}
|
||||
case 5: { /* Delete ALL remaining nodes, then insert fresh */
|
||||
for (int i = 1; i <= 32; i++) {
|
||||
sqlite3_reset(stmtDelete);
|
||||
sqlite3_bind_int64(stmtDelete, 1, i);
|
||||
sqlite3_step(stmtDelete);
|
||||
}
|
||||
/* Now insert one node into empty graph */
|
||||
float vec[8] = {1.0f, 0, 0, 0, 0, 0, 0, 0};
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, 1);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Final KNN on whatever state the graph is in */
|
||||
{
|
||||
float qvec[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
|
||||
sqlite3_reset(stmtKnn);
|
||||
sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_TRANSIENT);
|
||||
sqlite3_bind_int(stmtKnn, 2, 10);
|
||||
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {}
|
||||
}
|
||||
|
||||
cleanup:
|
||||
sqlite3_finalize(stmtInsert);
|
||||
sqlite3_finalize(stmtDelete);
|
||||
sqlite3_finalize(stmtKnn);
|
||||
sqlite3_close(db);
|
||||
return 0;
|
||||
}
|
||||
123
tests/fuzz/diskann-graph-corrupt.c
Normal file
123
tests/fuzz/diskann-graph-corrupt.c
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
/**
|
||||
* Fuzz target for DiskANN shadow table corruption resilience.
|
||||
* Creates and populates a DiskANN table, then corrupts shadow table blobs
|
||||
* using fuzz data and runs queries.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "sqlite-vec.h"
|
||||
#include "sqlite3.h"
|
||||
#include <assert.h>
|
||||
|
||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
||||
if (size < 16) return 0;
|
||||
|
||||
int rc;
|
||||
sqlite3 *db;
|
||||
|
||||
rc = sqlite3_open(":memory:", &db);
|
||||
assert(rc == SQLITE_OK);
|
||||
rc = sqlite3_vec_init(db, NULL, NULL);
|
||||
assert(rc == SQLITE_OK);
|
||||
|
||||
rc = sqlite3_exec(db,
|
||||
"CREATE VIRTUAL TABLE v USING vec0("
|
||||
"emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8))",
|
||||
NULL, NULL, NULL);
|
||||
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
|
||||
|
||||
/* Insert a few vectors to create graph structure */
|
||||
{
|
||||
sqlite3_stmt *stmt;
|
||||
sqlite3_prepare_v2(db,
|
||||
"INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL);
|
||||
for (int i = 1; i <= 10; i++) {
|
||||
float vec[8];
|
||||
for (int j = 0; j < 8; j++) {
|
||||
vec[j] = (float)i * 0.1f + (float)j * 0.01f;
|
||||
}
|
||||
sqlite3_reset(stmt);
|
||||
sqlite3_bind_int64(stmt, 1, i);
|
||||
sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmt);
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
|
||||
/* Corrupt shadow table data using fuzz bytes */
|
||||
size_t offset = 0;
|
||||
|
||||
/* Determine which row and column to corrupt */
|
||||
int target_row = (data[offset++] % 10) + 1;
|
||||
int corrupt_type = data[offset++] % 3; /* 0=validity, 1=neighbor_ids, 2=qvecs */
|
||||
|
||||
const char *column_name;
|
||||
switch (corrupt_type) {
|
||||
case 0: column_name = "neighbors_validity"; break;
|
||||
case 1: column_name = "neighbor_ids"; break;
|
||||
default: column_name = "neighbor_quantized_vectors"; break;
|
||||
}
|
||||
|
||||
/* Read the blob, corrupt it, write it back */
|
||||
{
|
||||
sqlite3_stmt *readStmt;
|
||||
char sqlbuf[256];
|
||||
snprintf(sqlbuf, sizeof(sqlbuf),
|
||||
"SELECT %s FROM v_diskann_nodes00 WHERE rowid = ?", column_name);
|
||||
rc = sqlite3_prepare_v2(db, sqlbuf, -1, &readStmt, NULL);
|
||||
if (rc == SQLITE_OK) {
|
||||
sqlite3_bind_int64(readStmt, 1, target_row);
|
||||
if (sqlite3_step(readStmt) == SQLITE_ROW) {
|
||||
const void *blob = sqlite3_column_blob(readStmt, 0);
|
||||
int blobSize = sqlite3_column_bytes(readStmt, 0);
|
||||
if (blob && blobSize > 0) {
|
||||
unsigned char *corrupt = sqlite3_malloc(blobSize);
|
||||
if (corrupt) {
|
||||
memcpy(corrupt, blob, blobSize);
|
||||
/* Apply fuzz bytes as XOR corruption */
|
||||
size_t remaining = size - offset;
|
||||
for (size_t i = 0; i < remaining && i < (size_t)blobSize; i++) {
|
||||
corrupt[i % blobSize] ^= data[offset + i];
|
||||
}
|
||||
/* Write back */
|
||||
sqlite3_stmt *writeStmt;
|
||||
snprintf(sqlbuf, sizeof(sqlbuf),
|
||||
"UPDATE v_diskann_nodes00 SET %s = ? WHERE rowid = ?", column_name);
|
||||
rc = sqlite3_prepare_v2(db, sqlbuf, -1, &writeStmt, NULL);
|
||||
if (rc == SQLITE_OK) {
|
||||
sqlite3_bind_blob(writeStmt, 1, corrupt, blobSize, SQLITE_TRANSIENT);
|
||||
sqlite3_bind_int64(writeStmt, 2, target_row);
|
||||
sqlite3_step(writeStmt);
|
||||
sqlite3_finalize(writeStmt);
|
||||
}
|
||||
sqlite3_free(corrupt);
|
||||
}
|
||||
}
|
||||
}
|
||||
sqlite3_finalize(readStmt);
|
||||
}
|
||||
}
|
||||
|
||||
/* Run queries on corrupted graph -- should not crash */
|
||||
{
|
||||
float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
|
||||
sqlite3_stmt *knnStmt;
|
||||
rc = sqlite3_prepare_v2(db,
|
||||
"SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = 5",
|
||||
-1, &knnStmt, NULL);
|
||||
if (rc == SQLITE_OK) {
|
||||
sqlite3_bind_blob(knnStmt, 1, qvec, sizeof(qvec), SQLITE_STATIC);
|
||||
while (sqlite3_step(knnStmt) == SQLITE_ROW) {}
|
||||
sqlite3_finalize(knnStmt);
|
||||
}
|
||||
}
|
||||
|
||||
/* Full scan */
|
||||
sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL);
|
||||
|
||||
sqlite3_close(db);
|
||||
return 0;
|
||||
}
|
||||
164
tests/fuzz/diskann-int8-quant.c
Normal file
164
tests/fuzz/diskann-int8-quant.c
Normal file
|
|
@ -0,0 +1,164 @@
|
|||
/**
|
||||
* Fuzz target for DiskANN int8 quantizer edge cases.
|
||||
*
|
||||
* The binary quantizer is simple (sign bit), but the int8 quantizer has
|
||||
* interesting arithmetic:
|
||||
* i8_val = (i8)(((src - (-1.0f)) / step) - 128.0f)
|
||||
* where step = 2.0f / 255.0f
|
||||
*
|
||||
* Edge cases in this formula:
|
||||
* - src values outside [-1, 1] cause clamping issues (no explicit clamp!)
|
||||
* - src = NaN, +Inf, -Inf (from corrupted vectors or div-by-zero)
|
||||
* - src very close to boundaries (-1.0, 1.0) -- rounding
|
||||
* - The cast to i8 can overflow for extreme src values
|
||||
*
|
||||
* Also exercises int8 distance functions:
|
||||
* - distance_l2_sqr_int8: accumulates squared differences, possible overflow
|
||||
* - distance_cosine_int8: dot product with normalization
|
||||
* - distance_l1_int8: absolute differences
|
||||
*
|
||||
* This fuzzer also tests the cosine distance metric path which the
|
||||
* other fuzzers (using L2 default) don't cover.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include "sqlite-vec.h"
|
||||
#include "sqlite3.h"
|
||||
#include <assert.h>
|
||||
|
||||
/* Reads the next fuzz byte and advances the (data, size) cursor by one.
 * When no bytes remain, the cursor is not modified and `def` is returned. */
static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) {
  uint8_t value = def;
  if (*size != 0) {
    value = (*data)[0];
    *data += 1;
    *size -= 1;
  }
  return value;
}
|
||||
|
||||
/* Produces one float from two fuzz bytes: a mode selector (mod 8) and a raw
 * payload. Both bytes are always consumed, even for modes that ignore the
 * payload.
 *
 *   mode 0: int8(raw) / 10      mode 4: 1.0
 *   mode 1: int8(raw) * 100     mode 5: 0.0
 *   mode 2: int8(raw) / 1000    mode 6:  raw / 255   (0..1)
 *   mode 3: -1.0                mode 7: -raw / 255   (-1..0)
 */
static float fuzz_extreme_float(const uint8_t **data, size_t *size) {
  const uint8_t selector = fuzz_byte(data, size, 0) % 8;
  const uint8_t payload = fuzz_byte(data, size, 0);

  if (selector == 0) return (float)((int8_t)payload) / 10.0f;
  if (selector == 1) return (float)((int8_t)payload) * 100.0f;
  if (selector == 2) return (float)((int8_t)payload) / 1000.0f;
  if (selector == 3) return -1.0f;
  if (selector == 4) return 1.0f;
  if (selector == 5) return 0.0f;
  if (selector == 6) return (float)payload / 255.0f;
  if (selector == 7) return -(float)payload / 255.0f;
  return 0.0f;
}
|
||||
|
||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
||||
if (size < 40) return 0;
|
||||
|
||||
int rc;
|
||||
sqlite3 *db;
|
||||
rc = sqlite3_open(":memory:", &db);
|
||||
assert(rc == SQLITE_OK);
|
||||
rc = sqlite3_vec_init(db, NULL, NULL);
|
||||
assert(rc == SQLITE_OK);
|
||||
|
||||
/* Test both distance metrics with int8 quantizer */
|
||||
uint8_t metric_choice = fuzz_byte(&data, &size, 0) % 2;
|
||||
const char *metric = metric_choice ? "cosine" : "L2";
|
||||
|
||||
int dims = 8 + (fuzz_byte(&data, &size, 0) % 3) * 8; /* 8, 16, or 24 */
|
||||
|
||||
char sql[512];
|
||||
snprintf(sql, sizeof(sql),
|
||||
"CREATE VIRTUAL TABLE v USING vec0("
|
||||
"emb float[%d] distance_metric=%s "
|
||||
"INDEXED BY diskann(neighbor_quantizer=int8, n_neighbors=8, search_list_size=16))",
|
||||
dims, metric);
|
||||
|
||||
rc = sqlite3_exec(db, sql, NULL, NULL, NULL);
|
||||
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
|
||||
|
||||
sqlite3_stmt *stmtInsert = NULL, *stmtKnn = NULL, *stmtDelete = NULL;
|
||||
sqlite3_prepare_v2(db,
|
||||
"INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL);
|
||||
sqlite3_prepare_v2(db,
|
||||
"SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = ?",
|
||||
-1, &stmtKnn, NULL);
|
||||
sqlite3_prepare_v2(db,
|
||||
"DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL);
|
||||
|
||||
if (!stmtInsert || !stmtKnn || !stmtDelete) goto cleanup;
|
||||
|
||||
/* Insert vectors with extreme float values to stress quantization */
|
||||
float *vec = malloc(dims * sizeof(float));
|
||||
if (!vec) goto cleanup;
|
||||
|
||||
for (int i = 1; i <= 16; i++) {
|
||||
for (int j = 0; j < dims; j++) {
|
||||
vec[j] = fuzz_extreme_float(&data, &size);
|
||||
}
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, i);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
}
|
||||
|
||||
/* Fuzz-driven operations */
|
||||
while (size >= 2) {
|
||||
uint8_t op = fuzz_byte(&data, &size, 0) % 4;
|
||||
uint8_t param = fuzz_byte(&data, &size, 0);
|
||||
|
||||
switch (op) {
|
||||
case 0: { /* KNN with extreme query values */
|
||||
for (int j = 0; j < dims; j++) {
|
||||
vec[j] = fuzz_extreme_float(&data, &size);
|
||||
}
|
||||
int k = (param % 10) + 1;
|
||||
sqlite3_reset(stmtKnn);
|
||||
sqlite3_bind_blob(stmtKnn, 1, vec, dims * sizeof(float), SQLITE_TRANSIENT);
|
||||
sqlite3_bind_int(stmtKnn, 2, k);
|
||||
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {}
|
||||
break;
|
||||
}
|
||||
case 1: { /* Insert with extreme values */
|
||||
int64_t rowid = (int64_t)(param % 32) + 1;
|
||||
for (int j = 0; j < dims; j++) {
|
||||
vec[j] = fuzz_extreme_float(&data, &size);
|
||||
}
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, rowid);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
break;
|
||||
}
|
||||
case 2: { /* Delete */
|
||||
int64_t rowid = (int64_t)(param % 32) + 1;
|
||||
sqlite3_reset(stmtDelete);
|
||||
sqlite3_bind_int64(stmtDelete, 1, rowid);
|
||||
sqlite3_step(stmtDelete);
|
||||
break;
|
||||
}
|
||||
case 3: { /* KNN with all-zero or all-same-value query */
|
||||
float val = (param % 3 == 0) ? 0.0f :
|
||||
(param % 3 == 1) ? 1.0f : -1.0f;
|
||||
for (int j = 0; j < dims; j++) vec[j] = val;
|
||||
sqlite3_reset(stmtKnn);
|
||||
sqlite3_bind_blob(stmtKnn, 1, vec, dims * sizeof(float), SQLITE_TRANSIENT);
|
||||
sqlite3_bind_int(stmtKnn, 2, 5);
|
||||
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free(vec);
|
||||
|
||||
cleanup:
|
||||
sqlite3_finalize(stmtInsert);
|
||||
sqlite3_finalize(stmtKnn);
|
||||
sqlite3_finalize(stmtDelete);
|
||||
sqlite3_close(db);
|
||||
return 0;
|
||||
}
|
||||
100
tests/fuzz/diskann-operations.c
Normal file
100
tests/fuzz/diskann-operations.c
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
/**
|
||||
* Fuzz target for DiskANN insert/delete/query operation sequences.
|
||||
* Uses fuzz bytes to drive random operations on a DiskANN-indexed table.
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "sqlite-vec.h"
|
||||
#include "sqlite3.h"
|
||||
#include <assert.h>
|
||||
|
||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
||||
if (size < 6) return 0;
|
||||
|
||||
int rc;
|
||||
sqlite3 *db;
|
||||
sqlite3_stmt *stmtInsert = NULL;
|
||||
sqlite3_stmt *stmtDelete = NULL;
|
||||
sqlite3_stmt *stmtKnn = NULL;
|
||||
sqlite3_stmt *stmtScan = NULL;
|
||||
|
||||
rc = sqlite3_open(":memory:", &db);
|
||||
assert(rc == SQLITE_OK);
|
||||
rc = sqlite3_vec_init(db, NULL, NULL);
|
||||
assert(rc == SQLITE_OK);
|
||||
|
||||
rc = sqlite3_exec(db,
|
||||
"CREATE VIRTUAL TABLE v USING vec0("
|
||||
"emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8))",
|
||||
NULL, NULL, NULL);
|
||||
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
|
||||
|
||||
sqlite3_prepare_v2(db,
|
||||
"INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL);
|
||||
sqlite3_prepare_v2(db,
|
||||
"DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL);
|
||||
sqlite3_prepare_v2(db,
|
||||
"SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = 3",
|
||||
-1, &stmtKnn, NULL);
|
||||
sqlite3_prepare_v2(db,
|
||||
"SELECT rowid FROM v", -1, &stmtScan, NULL);
|
||||
|
||||
if (!stmtInsert || !stmtDelete || !stmtKnn || !stmtScan) goto cleanup;
|
||||
|
||||
size_t i = 0;
|
||||
while (i + 2 <= size) {
|
||||
uint8_t op = data[i++] % 4;
|
||||
uint8_t rowid_byte = data[i++];
|
||||
int64_t rowid = (int64_t)(rowid_byte % 32) + 1;
|
||||
|
||||
switch (op) {
|
||||
case 0: {
|
||||
/* INSERT: consume 32 bytes for 8 floats, or use what's left */
|
||||
float vec[8] = {0};
|
||||
for (int j = 0; j < 8 && i < size; j++, i++) {
|
||||
vec[j] = (float)((int8_t)data[i]) / 10.0f;
|
||||
}
|
||||
sqlite3_reset(stmtInsert);
|
||||
sqlite3_bind_int64(stmtInsert, 1, rowid);
|
||||
sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
|
||||
sqlite3_step(stmtInsert);
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
/* DELETE */
|
||||
sqlite3_reset(stmtDelete);
|
||||
sqlite3_bind_int64(stmtDelete, 1, rowid);
|
||||
sqlite3_step(stmtDelete);
|
||||
break;
|
||||
}
|
||||
case 2: {
|
||||
/* KNN query */
|
||||
float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
|
||||
sqlite3_reset(stmtKnn);
|
||||
sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_STATIC);
|
||||
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {}
|
||||
break;
|
||||
}
|
||||
case 3: {
|
||||
/* Full scan */
|
||||
sqlite3_reset(stmtScan);
|
||||
while (sqlite3_step(stmtScan) == SQLITE_ROW) {}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Final operations -- must not crash regardless of prior state */
|
||||
sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL);
|
||||
|
||||
cleanup:
|
||||
sqlite3_finalize(stmtInsert);
|
||||
sqlite3_finalize(stmtDelete);
|
||||
sqlite3_finalize(stmtKnn);
|
||||
sqlite3_finalize(stmtScan);
|
||||
sqlite3_close(db);
|
||||
return 0;
|
||||
}
|
||||
131
tests/fuzz/diskann-prune-direct.c
Normal file
131
tests/fuzz/diskann-prune-direct.c
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
/**
|
||||
* Fuzz target for DiskANN RobustPrune algorithm (diskann_prune_select).
|
||||
*
|
||||
* diskann_prune_select is exposed for testing and takes:
|
||||
* - inter_distances: flattened NxN matrix of inter-candidate distances
|
||||
* - p_distances: N distances from node p to each candidate
|
||||
* - num_candidates, alpha, max_neighbors
|
||||
*
|
||||
* This is a pure function that doesn't need a database, so we can
|
||||
* call it directly with fuzz-controlled inputs. This gives the fuzzer
|
||||
* maximum speed (no SQLite overhead) to explore:
|
||||
*
|
||||
* - alpha boundary: alpha=0 (prunes nothing), alpha=very large (prunes all)
|
||||
* - max_neighbors = 0, 1, num_candidates, > num_candidates
|
||||
* - num_candidates = 0, 1, large
|
||||
* - Distance matrices with: all zeros, all same, negative values, NaN, Inf
|
||||
* - Non-symmetric distance matrices (should still work)
|
||||
* - Memory: large num_candidates to stress malloc
|
||||
*
|
||||
* Key code paths:
|
||||
* - diskann_prune_select alpha-pruning loop
|
||||
* - Boundary: selectedCount reaches max_neighbors exactly
|
||||
* - All candidates pruned before max_neighbors reached
|
||||
*/
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include "sqlite-vec.h"
|
||||
#include "sqlite3.h"
|
||||
#include <assert.h>
|
||||
|
||||
/* Declare the test-exposed function.
|
||||
* diskann_prune_select is not static -- it's a public symbol. */
|
||||
extern int diskann_prune_select(
|
||||
const float *inter_distances, const float *p_distances,
|
||||
int num_candidates, float alpha, int max_neighbors,
|
||||
int *outSelected, int *outCount);
|
||||
|
||||
/* Consumes and returns the next byte of fuzz input, or returns `def` without
 * touching the (data, size) cursor when the input is already exhausted. */
static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) {
  const uint8_t *head = *data;
  size_t remaining = *size;

  if (remaining == 0) {
    return def;
  }
  *data = head + 1;
  *size = remaining - 1;
  return head[0];
}
|
||||
|
||||
/* libFuzzer entry point that calls diskann_prune_select directly, without a
 * database, on fuzz-built distance arrays.
 *
 * Inputs shorter than 8 bytes are ignored. The first three bytes choose
 * num_candidates (0..32), max_neighbors (0..16) and one of eight alpha
 * values; the rest fills the distance arrays. Always returns 0; violations
 * of the checked postconditions abort via assert. */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  static const float alpha_values[] = {0.0f, 0.5f, 1.0f, 1.2f,
                                       1.5f, 2.0f, 10.0f, 100.0f};

  if (size < 8) return 0;

  /* Consume parameters from fuzz data */
  int num_candidates = fuzz_byte(&data, &size, 0) % 33;  /* 0..32 */
  int max_neighbors = fuzz_byte(&data, &size, 0) % 17;   /* 0..16 */
  uint8_t alpha_idx = fuzz_byte(&data, &size, 0) % 8;
  float alpha = alpha_values[alpha_idx];

  if (num_candidates == 0) {
    /* Empty candidate set: called with NULL arrays; the harness requires
     * success and a zero count. */
    int outCount = -1;
    int rc = diskann_prune_select(NULL, NULL, 0, alpha, max_neighbors,
                                  NULL, &outCount);
    assert(rc == 0 /* SQLITE_OK */);
    assert(outCount == 0);
    return 0;
  }

  /* Allocate arrays. Sizes are computed in size_t. outSelected is
   * zero-initialized so the flag count below never reads an indeterminate
   * value if the callee leaves some entries unwritten. */
  int n = num_candidates;
  size_t count = (size_t)n;
  float *inter_distances = malloc(count * count * sizeof *inter_distances);
  float *p_distances = malloc(count * sizeof *p_distances);
  int *outSelected = calloc(count, sizeof *outSelected);
  if (!inter_distances || !p_distances || !outSelected) {
    free(inter_distances);
    free(p_distances);
    free(outSelected);
    return 0;
  }

  /* Fill p_distances from fuzz data: each byte maps to 0.0 .. 25.5 */
  for (int i = 0; i < n; i++) {
    uint8_t raw = fuzz_byte(&data, &size, (uint8_t)(i * 10));
    p_distances[i] = (float)raw / 10.0f;
  }

  /* Insertion-sort p_distances ascending; this harness treats sorted
   * input as a precondition of diskann_prune_select. */
  for (int i = 1; i < n; i++) {
    float tmp = p_distances[i];
    int j = i - 1;
    while (j >= 0 && p_distances[j] > tmp) {
      p_distances[j + 1] = p_distances[j];
      j--;
    }
    p_distances[j + 1] = tmp;
  }

  /* Fill the flattened n x n inter-candidate matrix from fuzz data. It is
   * not symmetrized, so asymmetric matrices are exercised too. */
  for (int i = 0; i < n * n; i++) {
    uint8_t raw = fuzz_byte(&data, &size, (uint8_t)(i % 256));
    inter_distances[i] = (float)raw / 10.0f;
  }
  /* Make diagonal zero */
  for (int i = 0; i < n; i++) {
    inter_distances[i * n + i] = 0.0f;
  }

  int outCount = -1;
  int rc = diskann_prune_select(inter_distances, p_distances,
                                n, alpha, max_neighbors,
                                outSelected, &outCount);

  /* Basic sanity: should not crash, count should be valid */
  assert(rc == 0);
  assert(outCount >= 0);
  assert(outCount <= max_neighbors || max_neighbors == 0);
  assert(outCount <= n);

  /* Verify outSelected flags are consistent with outCount */
  int flagCount = 0;
  for (int i = 0; i < n; i++) {
    if (outSelected[i]) flagCount++;
  }
  assert(flagCount == outCount);

  free(inter_distances);
  free(p_distances);
  free(outSelected);
  return 0;
}
|
||||
10
tests/fuzz/diskann.dict
Normal file
10
tests/fuzz/diskann.dict
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
"neighbor_quantizer"
|
||||
"binary"
|
||||
"int8"
|
||||
"n_neighbors"
|
||||
"search_list_size"
|
||||
"search_list_size_search"
|
||||
"search_list_size_insert"
|
||||
"alpha"
|
||||
"="
|
||||
","
|
||||
Loading…
Add table
Add a link
Reference in a new issue