mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 00:36:56 +02:00
Filter deleted nodes from DiskANN search results and add delete tests
DiskANN's delete repair only fixes forward edges (nodes the deleted node pointed to). Stale reverse edges can cause deleted rowids to appear in search results. Fix: track a 'confirmed' flag on each search candidate, set when the full-precision vector is successfully read during re-ranking. Only confirmed candidates are included in output. Zero additional SQL queries — piggybacks on the existing re-rank vector read. Also adds delete hardening tests: - Rescore: interleaved delete+KNN, rowid_in after deletes, full delete+reinsert cycle - DiskANN: delete+reinsert cycles with KNN verification, interleaved delete+KNN Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
2f4c2e4bdb
commit
b00865429b
4 changed files with 190 additions and 8 deletions
|
|
@ -1176,3 +1176,73 @@ def test_corrupt_truncated_node_blob(db):
|
|||
).fetchall()
|
||||
except sqlite3.OperationalError:
|
||||
pass # Error is acceptable — crash is not
|
||||
|
||||
|
||||
def test_diskann_delete_reinsert_cycle_knn(db):
|
||||
"""Repeatedly delete and reinsert rows, verify KNN stays correct."""
|
||||
import random
|
||||
random.seed(101)
|
||||
db.execute("""
|
||||
CREATE VIRTUAL TABLE t USING vec0(
|
||||
emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)
|
||||
)
|
||||
""")
|
||||
N = 30
|
||||
vecs = {}
|
||||
for i in range(1, N + 1):
|
||||
v = [random.gauss(0, 1) for _ in range(8)]
|
||||
vecs[i] = v
|
||||
db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(v)])
|
||||
|
||||
# 3 cycles: delete half, reinsert with new vectors, verify KNN
|
||||
for cycle in range(3):
|
||||
to_delete = random.sample(sorted(vecs.keys()), len(vecs) // 2)
|
||||
for r in to_delete:
|
||||
db.execute("DELETE FROM t WHERE rowid = ?", [r])
|
||||
del vecs[r]
|
||||
|
||||
# Reinsert with new rowids
|
||||
new_start = 100 + cycle * 50
|
||||
for i in range(len(to_delete)):
|
||||
rid = new_start + i
|
||||
v = [random.gauss(0, 1) for _ in range(8)]
|
||||
vecs[rid] = v
|
||||
db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [rid, _f32(v)])
|
||||
|
||||
# KNN should return only alive rows
|
||||
query = [0.0] * 8
|
||||
rows = db.execute(
|
||||
"SELECT rowid FROM t WHERE emb MATCH ? AND k=10",
|
||||
[_f32(query)],
|
||||
).fetchall()
|
||||
returned = {r["rowid"] for r in rows}
|
||||
assert returned.issubset(set(vecs.keys())), \
|
||||
f"Cycle {cycle}: deleted rowid in KNN results"
|
||||
assert len(rows) >= 1
|
||||
|
||||
|
||||
def test_diskann_delete_interleaved_with_knn(db):
|
||||
"""Delete one row at a time, querying KNN after each delete."""
|
||||
db.execute("""
|
||||
CREATE VIRTUAL TABLE t USING vec0(
|
||||
emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)
|
||||
)
|
||||
""")
|
||||
N = 20
|
||||
for i in range(1, N + 1):
|
||||
vec = [0.0] * 8
|
||||
vec[i % 8] = float(i)
|
||||
db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)])
|
||||
|
||||
alive = set(range(1, N + 1))
|
||||
for to_del in [1, 5, 10, 15, 20]:
|
||||
db.execute("DELETE FROM t WHERE rowid = ?", [to_del])
|
||||
alive.discard(to_del)
|
||||
|
||||
rows = db.execute(
|
||||
"SELECT rowid FROM t WHERE emb MATCH ? AND k=5",
|
||||
[_f32([1, 0, 0, 0, 0, 0, 0, 0])],
|
||||
).fetchall()
|
||||
returned = {r["rowid"] for r in rows}
|
||||
assert returned.issubset(alive), \
|
||||
f"Deleted rowid {to_del} found in KNN results"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue