Filter deleted nodes from DiskANN search results and add delete tests

DiskANN's delete repair only fixes forward edges (nodes the deleted
node pointed to). Stale reverse edges can cause deleted rowids to
appear in search results. Fix: track a 'confirmed' flag on each
search candidate, set when the full-precision vector is successfully
read during re-ranking. Only confirmed candidates are included in
output. Zero additional SQL queries — piggybacks on the existing
re-rank vector read.

Also adds delete hardening tests:
- Rescore: interleaved delete+KNN, rowid_in after deletes, full
  delete+reinsert cycle
- DiskANN: delete+reinsert cycles with KNN verification, interleaved
  delete+KNN

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alex Garcia 2026-03-31 17:13:29 -07:00
parent 2f4c2e4bdb
commit b00865429b
4 changed files with 190 additions and 8 deletions

View file

@@ -608,6 +608,7 @@ static int diskann_candidate_list_insert(
list->items[lo].rowid = rowid;
list->items[lo].distance = distance;
list->items[lo].visited = 0;
list->items[lo].confirmed = 0;
list->count++;
return 1;
}
@@ -741,8 +742,9 @@ static int diskann_search(
return rc;
}
// Seed with medoid
// Seed with medoid (confirmed — we already read its vector above)
diskann_candidate_list_insert(&candidates, medoid, medoidDist);
candidates.items[0].confirmed = 1;
// Pre-quantize query vector once for all quantized distance comparisons
u8 *queryQuantized = NULL;
@@ -815,16 +817,27 @@ static int diskann_search(
sqlite3_free(fullVec);
// Update distance in candidate list and re-sort
diskann_candidate_list_insert(&candidates, currentRowid, exactDist);
// Mark as confirmed (vector exists, distance is exact)
for (int ci = 0; ci < candidates.count; ci++) {
if (candidates.items[ci].rowid == currentRowid) {
candidates.items[ci].confirmed = 1;
break;
}
}
}
// If vector read failed, candidate stays unconfirmed (stale edge to deleted node)
}
// 5. Output results (candidates are already sorted by distance)
int resultCount = (candidates.count < k) ? candidates.count : k;
*outCount = resultCount;
for (int i = 0; i < resultCount; i++) {
outRowids[i] = candidates.items[i].rowid;
outDistances[i] = candidates.items[i].distance;
// 5. Output results — only include confirmed candidates (whose vectors exist)
int resultCount = 0;
for (int i = 0; i < candidates.count && resultCount < k; i++) {
if (candidates.items[i].confirmed) {
outRowids[resultCount] = candidates.items[i].rowid;
outDistances[resultCount] = candidates.items[i].distance;
resultCount++;
}
}
*outCount = resultCount;
sqlite3_free(queryQuantized);
diskann_candidate_list_free(&candidates);

View file

@@ -2586,7 +2586,8 @@ struct Vec0DiskannConfig {
// One entry of a DiskANN search candidate list.
// NOTE(review): the repeated `visited` line below looks like the before/after
// pair of this diff hunk (a whitespace-only realignment), not a second member
// — confirm against the real header.
struct Vec0DiskannCandidate {
i64 rowid; // rowid of the candidate node
f32 distance; // distance recorded when the candidate was inserted; presumably relative to the query vector — TODO confirm
int visited; // 1 if this candidate's neighbors have been explored
int visited; // 1 if this candidate's neighbors have been explored
int confirmed; // 1 if full-precision vector was successfully read (node exists)
};
/**

View file

@@ -1176,3 +1176,73 @@ def test_corrupt_truncated_node_blob(db):
).fetchall()
except sqlite3.OperationalError:
pass # Error is acceptable — crash is not
def test_diskann_delete_reinsert_cycle_knn(db):
    """Run three delete/reinsert rounds on a DiskANN table, checking KNN each round.

    Each round deletes half of the live rows, inserts the same number of rows
    under fresh rowids (100+, 150+, 200+), then asserts that a k=10 query
    returns no deleted rowid and at least one row.
    """
    import random

    random.seed(101)
    db.execute("""
        CREATE VIRTUAL TABLE t USING vec0(
            emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)
        )
    """)

    insert_sql = "INSERT INTO t(rowid, emb) VALUES (?, ?)"
    initial_rows = 30

    # rowid -> vector for every row that should currently exist in the table.
    live = {}
    for rowid in range(1, initial_rows + 1):
        vector = [random.gauss(0, 1) for _ in range(8)]
        live[rowid] = vector
        db.execute(insert_sql, [rowid, _f32(vector)])

    for cycle in range(3):
        # Drop a random half of whatever is alive right now.
        victims = random.sample(sorted(live), len(live) // 2)
        for victim in victims:
            db.execute("DELETE FROM t WHERE rowid = ?", [victim])
            del live[victim]

        # Replace them one-for-one, using a rowid range no earlier cycle used.
        first_new_rowid = 100 + cycle * 50
        for offset in range(len(victims)):
            fresh_rowid = first_new_rowid + offset
            vector = [random.gauss(0, 1) for _ in range(8)]
            live[fresh_rowid] = vector
            db.execute(insert_sql, [fresh_rowid, _f32(vector)])

        origin = [0.0] * 8
        rows = db.execute(
            "SELECT rowid FROM t WHERE emb MATCH ? AND k=10",
            [_f32(origin)],
        ).fetchall()
        returned = set(r["rowid"] for r in rows)
        assert returned <= set(live), \
            f"Cycle {cycle}: deleted rowid in KNN results"
        assert len(rows) >= 1
def test_diskann_delete_interleaved_with_knn(db):
    """Delete one row at a time, querying KNN after each delete.

    Inserts 20 single-axis vectors into a DiskANN-indexed table, then deletes
    rowids 1, 5, 10, 15 and 20 in turn. After every delete a k=5 query must
    return at least one row, and none of the rowids deleted so far.
    """
    db.execute("""
        CREATE VIRTUAL TABLE t USING vec0(
            emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)
        )
    """)
    N = 20
    for i in range(1, N + 1):
        # One non-zero component per vector: magnitude i on axis i % 8.
        vec = [0.0] * 8
        vec[i % 8] = float(i)
        db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)])

    alive = set(range(1, N + 1))
    for to_del in [1, 5, 10, 15, 20]:
        db.execute("DELETE FROM t WHERE rowid = ?", [to_del])
        alive.discard(to_del)

        rows = db.execute(
            "SELECT rowid FROM t WHERE emb MATCH ? AND k=5",
            [_f32([1, 0, 0, 0, 0, 0, 0, 0])],
        ).fetchall()
        returned = {r["rowid"] for r in rows}

        # An empty result would satisfy the "no deleted rowids" check
        # vacuously, so require at least one hit while live rows remain.
        assert len(rows) >= 1, f"KNN returned no rows after deleting {to_del}"

        # Report the rowids that actually leaked: the offender may have been
        # deleted in an earlier iteration, not necessarily the latest one.
        stale = returned - alive
        assert not stale, \
            f"Deleted rowid(s) {sorted(stale)} found in KNN results after deleting {to_del}"

View file

@@ -443,6 +443,104 @@ def test_insert_batch_recall(db):
# ============================================================================
def test_delete_interleaved_with_knn(db):
    """Interleave single-row deletes with KNN queries on a rescore(quantizer=bit) table.

    After each of the five deletes, a LIMIT 10 query must return only rowids
    that are still alive, and exactly min(10, number of live rows) results.
    """
    db.execute(
        "CREATE VIRTUAL TABLE t USING vec0("
        " embedding float[8] indexed by rescore(quantizer=bit)"
        ")"
    )

    row_total = 30
    random.seed(42)
    # Generate every vector first, in rowid order, before touching the table.
    vecs = {}
    for rowid in range(1, row_total + 1):
        vecs[rowid] = [random.gauss(0, 1) for _ in range(8)]

    for rowid in vecs:
        db.execute(
            "INSERT INTO t(rowid, embedding) VALUES (?, ?)",
            [rowid, float_vec(vecs[rowid])],
        )

    alive = set(vecs)
    origin = [0.0] * 8

    for to_del in (5, 10, 15, 20, 25):
        db.execute("DELETE FROM t WHERE rowid = ?", [to_del])
        alive.discard(to_del)

        rows = db.execute(
            "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 10",
            [float_vec(origin)],
        ).fetchall()
        returned = set(r["rowid"] for r in rows)

        # Nothing that was deleted may come back from the query.
        assert returned <= alive, f"Deleted rowid found in KNN after deleting {to_del}"
        # The result size is capped by the LIMIT and by how many rows are left.
        assert len(rows) == min(10, len(alive))
def test_delete_with_rowid_in_constraint(db):
    """Check that a KNN query with a rowid IN filter never returns deleted rows.

    Uses a rescore(quantizer=int8) table holding rowids 1..10, deletes 3, 5
    and 7, then queries with k = 5 restricted to rowid IN (1, 2, 3, 4, 5).
    """
    db.execute(
        "CREATE VIRTUAL TABLE t USING vec0("
        " embedding float[8] indexed by rescore(quantizer=int8)"
        ")"
    )

    # Row i stores the constant vector [i, i, ..., i].
    for rowid in range(1, 11):
        constant_vec = [float(rowid)] * 8
        db.execute(
            "INSERT INTO t(rowid, embedding) VALUES (?, ?)",
            [rowid, float_vec(constant_vec)],
        )

    for gone in (3, 5, 7):
        db.execute("DELETE FROM t WHERE rowid = ?", [gone])

    # Of the five rowids named in the filter, 3 and 5 no longer exist,
    # so only 1, 2 and 4 are eligible.
    rows = db.execute(
        "SELECT rowid FROM t WHERE embedding MATCH ? AND k = 5 AND rowid IN (1, 2, 3, 4, 5)",
        [float_vec([1.0] * 8)],
    ).fetchall()
    returned = set(r["rowid"] for r in rows)

    for gone in (3, 5):
        assert gone not in returned
    assert returned <= {1, 2, 4}
def test_delete_all_then_reinsert_batch(db):
    """Empty a rescore(quantizer=bit) table, refill it, and check KNN sees only new rows.

    Inserts rowids 1..20, deletes all of them, confirms the table is empty,
    inserts rowids 100..109, then asserts a LIMIT 5 query returns only rowids
    from the second batch.
    """
    db.execute(
        "CREATE VIRTUAL TABLE t USING vec0("
        " embedding float[8] indexed by rescore(quantizer=bit)"
        ")"
    )
    insert_sql = "INSERT INTO t(rowid, embedding) VALUES (?, ?)"

    # First batch: row i holds the constant vector [i] * 8.
    first_batch = range(1, 21)
    for rowid in first_batch:
        db.execute(insert_sql, [rowid, float_vec([float(rowid)] * 8)])

    # Remove every row again, one statement per row.
    for rowid in first_batch:
        db.execute("DELETE FROM t WHERE rowid = ?", [rowid])
    remaining = db.execute("SELECT count(*) FROM t").fetchone()[0]
    assert remaining == 0

    # Second batch: new rowids, vectors [0] * 8 through [9] * 8.
    second_batch = range(100, 110)
    for rowid in second_batch:
        db.execute(insert_sql, [rowid, float_vec([float(rowid - 100)] * 8)])

    rows = db.execute(
        "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 5",
        [float_vec([0.0] * 8)],
    ).fetchall()
    returned = set(r["rowid"] for r in rows)

    # No rowid from the deleted first batch may appear.
    assert returned <= set(second_batch)
def test_knn_int8_cosine(db):
"""Rescore with quantizer=int8 and distance_metric=cosine."""
db.execute(