Scrub stale reverse edges on DiskANN delete (data leak fix)

After deleting a node, its rowid and quantized vector remained in
other nodes' neighbor blobs via unidirectional reverse edges. This
is a data leak — the deleted vector's compressed representation was
still readable in shadow tables.

Fix: after deleting the node and repairing forward edges, scan all
remaining nodes and clear any neighbor slot that references the
deleted rowid. This uses a lightweight two-pass approach: the first
pass reads only validity + neighbor_ids to find affected nodes, and
the second does a full read/clear/write only for those nodes.

Tradeoff: the O(N) scan adds ~1ms per deleted row at 10k vectors and
~10ms at 100k. Recall and query latency are unaffected.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alex Garcia 2026-03-31 18:10:48 -07:00
parent c36a995f1e
commit 01b4b2a965
2 changed files with 132 additions and 0 deletions

View file

@ -1289,3 +1289,40 @@ def test_diskann_text_pk_insert_knn_delete(db):
).fetchall()
ids = [r["id"] for r in rows]
assert "alpha" not in ids
def test_diskann_delete_scrubs_all_references(db):
    """Deleting a row must leave no trace of its rowid in the shadow tables.

    Builds a 20-row DiskANN-indexed vec0 table (binary neighbor quantizer,
    8 neighbors per node), deletes rowid 5, and then verifies that:

    * the deleted rowid has no row in ``t_diskann_nodes00``;
    * the deleted rowid has no row in ``t_vectors00``;
    * no remaining node lists the deleted rowid in its ``neighbor_ids`` blob.

    NOTE(review): only ``neighbor_ids`` is inspected for stale references;
    the neighbors' quantized vector data is not examined by this test.
    """
    import struct

    deleted_rowid = 5
    # The test decodes neighbor_ids as consecutive little-endian signed
    # 64-bit integers, one per neighbor slot.
    slot = struct.Struct("<q")

    create_sql = (
        "\n"
        "CREATE VIRTUAL TABLE t USING vec0(\n"
        "emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)\n"
        ")\n"
    )
    db.execute(create_sql)

    # Row i gets the vector [i, i+1, ..., i+7] so every vector is distinct.
    for i in range(20):
        components = [float(i + d) for d in range(8)]
        packed_vector = struct.pack("8f", *components)
        db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, packed_vector])

    db.execute("DELETE FROM t WHERE rowid = ?", [deleted_rowid])

    def shadow_row_count(table):
        """Count rows in shadow table *table* keyed by the deleted rowid."""
        query = f"SELECT count(*) FROM {table} WHERE rowid=?"
        return db.execute(query, [deleted_rowid]).fetchone()[0]

    # The graph node for the deleted row is removed...
    assert shadow_row_count("t_diskann_nodes00") == 0
    # ...and so is its stored full vector.
    assert shadow_row_count("t_vectors00") == 0

    # Every surviving node's neighbor list must be free of the deleted rowid.
    surviving_nodes = db.execute("SELECT rowid, neighbor_ids FROM t_diskann_nodes00")
    for row in surviving_nodes:
        owner_rowid, packed_ids = row[0], row[1]
        for offset in range(0, len(packed_ids), slot.size):
            (neighbor_rowid,) = slot.unpack_from(packed_ids, offset)
            assert neighbor_rowid != deleted_rowid, (
                f"Node {owner_rowid} slot {offset // slot.size} still references "
                f"deleted rowid {deleted_rowid}"
            )