From b00865429b519ba6de0a2e094087052870f3d037 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 17:13:29 -0700 Subject: [PATCH] Filter deleted nodes from DiskANN search results and add delete tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DiskANN's delete repair only fixes forward edges (nodes the deleted node pointed to). Stale reverse edges can cause deleted rowids to appear in search results. Fix: track a 'confirmed' flag on each search candidate, set when the full-precision vector is successfully read during re-ranking. Only confirmed candidates are included in output. Zero additional SQL queries — piggybacks on the existing re-rank vector read. Also adds delete hardening tests: - Rescore: interleaved delete+KNN, rowid_in after deletes, full delete+reinsert cycle - DiskANN: delete+reinsert cycles with KNN verification, interleaved delete+KNN Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec-diskann.c | 27 ++++++--- sqlite-vec.c | 3 +- tests/test-diskann.py | 70 +++++++++++++++++++++++ tests/test-rescore-mutations.py | 98 +++++++++++++++++++++++++++++++++ 4 files changed, 190 insertions(+), 8 deletions(-) diff --git a/sqlite-vec-diskann.c b/sqlite-vec-diskann.c index 7d4da6e..ab9db6a 100644 --- a/sqlite-vec-diskann.c +++ b/sqlite-vec-diskann.c @@ -608,6 +608,7 @@ static int diskann_candidate_list_insert( list->items[lo].rowid = rowid; list->items[lo].distance = distance; list->items[lo].visited = 0; + list->items[lo].confirmed = 0; list->count++; return 1; } @@ -741,8 +742,9 @@ static int diskann_search( return rc; } - // Seed with medoid + // Seed with medoid (confirmed — we already read its vector above) diskann_candidate_list_insert(&candidates, medoid, medoidDist); + candidates.items[0].confirmed = 1; // Pre-quantize query vector once for all quantized distance comparisons u8 *queryQuantized = NULL; @@ -815,16 +817,27 @@ static int diskann_search( sqlite3_free(fullVec); // Update distance in candidate list and re-sort diskann_candidate_list_insert(&candidates, currentRowid, exactDist); + // Mark as confirmed (vector exists, distance is exact) + for (int ci = 0; ci < candidates.count; ci++) { + if (candidates.items[ci].rowid == currentRowid) { + candidates.items[ci].confirmed = 1; + break; + } + } } + // If vector read failed, candidate stays unconfirmed (stale edge to deleted node) } - // 5. Output results (candidates are already sorted by distance) - int resultCount = (candidates.count < k) ? candidates.count : k; - *outCount = resultCount; - for (int i = 0; i < resultCount; i++) { - outRowids[i] = candidates.items[i].rowid; - outDistances[i] = candidates.items[i].distance; + // 5. Output results — only include confirmed candidates (whose vectors exist) + int resultCount = 0; + for (int i = 0; i < candidates.count && resultCount < k; i++) { + if (candidates.items[i].confirmed) { + outRowids[resultCount] = candidates.items[i].rowid; + outDistances[resultCount] = candidates.items[i].distance; + resultCount++; + } } + *outCount = resultCount; sqlite3_free(queryQuantized); diskann_candidate_list_free(&candidates); diff --git a/sqlite-vec.c b/sqlite-vec.c index f8ab4f9..cb597dd 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -2586,7 +2586,8 @@ struct Vec0DiskannConfig { struct Vec0DiskannCandidate { i64 rowid; f32 distance; - int visited; // 1 if this candidate's neighbors have been explored + int visited; // 1 if this candidate's neighbors have been explored + int confirmed; // 1 if full-precision vector was successfully read (node exists) }; /** diff --git a/tests/test-diskann.py b/tests/test-diskann.py index d71769c..f2a56a1 100644 --- a/tests/test-diskann.py +++ b/tests/test-diskann.py @@ -1176,3 +1176,73 @@ def test_corrupt_truncated_node_blob(db): ).fetchall() except sqlite3.OperationalError: pass # Error is acceptable — crash is not + + +def test_diskann_delete_reinsert_cycle_knn(db): + """Repeatedly delete and reinsert rows, verify KNN stays correct.""" + import random + random.seed(101) + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + N = 30 + vecs = {} + for i in range(1, N + 1): + v = [random.gauss(0, 1) for _ in range(8)] + vecs[i] = v + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(v)]) + + # 3 cycles: delete half, reinsert with new vectors, verify KNN + for cycle in range(3): + to_delete = random.sample(sorted(vecs.keys()), len(vecs) // 2) + for r in to_delete: + db.execute("DELETE FROM t WHERE rowid = ?", [r]) + del vecs[r] + + # Reinsert with new rowids + new_start = 100 + cycle * 50 + for i in range(len(to_delete)): + rid = new_start + i + v = [random.gauss(0, 1) for _ in range(8)] + vecs[rid] = v + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [rid, _f32(v)]) + + # KNN should return only alive rows + query = [0.0] * 8 + rows = db.execute( + "SELECT rowid FROM t WHERE emb MATCH ? AND k=10", + [_f32(query)], + ).fetchall() + returned = {r["rowid"] for r in rows} + assert returned.issubset(set(vecs.keys())), \ + f"Cycle {cycle}: deleted rowid in KNN results" + assert len(rows) >= 1 + + +def test_diskann_delete_interleaved_with_knn(db): + """Delete one row at a time, querying KNN after each delete.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + N = 20 + for i in range(1, N + 1): + vec = [0.0] * 8 + vec[i % 8] = float(i) + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + alive = set(range(1, N + 1)) + for to_del in [1, 5, 10, 15, 20]: + db.execute("DELETE FROM t WHERE rowid = ?", [to_del]) + alive.discard(to_del) + + rows = db.execute( + "SELECT rowid FROM t WHERE emb MATCH ? AND k=5", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + returned = {r["rowid"] for r in rows} + assert returned.issubset(alive), \ + f"Deleted rowid {to_del} found in KNN results" diff --git a/tests/test-rescore-mutations.py b/tests/test-rescore-mutations.py index 28495c2..dbb802a 100644 --- a/tests/test-rescore-mutations.py +++ b/tests/test-rescore-mutations.py @@ -443,6 +443,104 @@ def test_insert_batch_recall(db): # ============================================================================ +def test_delete_interleaved_with_knn(db): + """Delete rows one at a time, running KNN after each delete to verify correctness.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + N = 30 + random.seed(42) + vecs = {i: [random.gauss(0, 1) for _ in range(8)] for i in range(1, N + 1)} + for rowid, vec in vecs.items(): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [rowid, float_vec(vec)], + ) + + alive = set(vecs.keys()) + query = [0.0] * 8 + + for to_del in [5, 10, 15, 20, 25]: + db.execute("DELETE FROM t WHERE rowid = ?", [to_del]) + alive.discard(to_del) + + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 10", + [float_vec(query)], + ).fetchall() + returned = {r["rowid"] for r in rows} + # All returned rows must be alive (not deleted) + assert returned.issubset(alive), f"Deleted rowid found in KNN after deleting {to_del}" + # Count should match alive set (up to k) + assert len(rows) == min(10, len(alive)) + + +def test_delete_with_rowid_in_constraint(db): + """Delete rows and verify KNN with rowid_in filter excludes deleted rows.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=int8)" + ")" + ) + for i in range(1, 11): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i, float_vec([float(i)] * 8)], + ) + + # Delete rows 3, 5, 7 + for r in [3, 5, 7]: + db.execute("DELETE FROM t WHERE rowid = ?", [r]) + + # KNN with rowid IN (1,2,3,4,5) — should only return 1, 2, 4 (3 and 5 deleted) + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? AND k = 5 AND rowid IN (1, 2, 3, 4, 5)", + [float_vec([1.0] * 8)], + ).fetchall() + returned = {r["rowid"] for r in rows} + assert 3 not in returned + assert 5 not in returned + assert returned.issubset({1, 2, 4}) + + +def test_delete_all_then_reinsert_batch(db): + """Delete all rows, reinsert a new batch, verify KNN only returns new rows.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + # First batch + for i in range(1, 21): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i, float_vec([float(i)] * 8)], + ) + + # Delete all + for i in range(1, 21): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + assert db.execute("SELECT count(*) FROM t").fetchone()[0] == 0 + + # Second batch with different rowids and vectors + for i in range(100, 110): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i, float_vec([float(i - 100)] * 8)], + ) + + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 5", + [float_vec([0.0] * 8)], + ).fetchall() + returned = {r["rowid"] for r in rows} + # All returned rowids should be from the second batch + assert returned.issubset(set(range(100, 110))) + + def test_knn_int8_cosine(db): """Rescore with quantizer=int8 and distance_metric=cosine.""" db.execute(