Add FTS5-style command column and runtime oversample for rescore

Replace the old INSERT INTO t(rowid) VALUES('command') hack with a
proper hidden command column named after the table (FTS5 pattern):

  INSERT INTO t(t) VALUES ('oversample=16')

The command column is the first hidden column (before distance and k),
which keeps open the option of using it as a table-valued function
argument in the future.

Schema: CREATE TABLE x(rowid, <cols>, "<table>" hidden, distance hidden, k hidden)

For backwards compatibility, pre-v0.1.10 tables (detected via the _info
shadow table version) skip the command column to avoid name conflicts
with user columns that may share the table's name. Verified with a
legacy fixture DB generated by sqlite-vec v0.1.6.

Changes:
- Add hidden command column to sqlite3_declare_vtab for new tables
- Version-gate via _info shadow table for existing tables
- Validate at CREATE time that no column name matches table name
- Add rescore_handle_command() with oversample=N support
- rescore_knn() prefers runtime oversample_search over CREATE default
- Remove old rowid-based command dispatch
- Migrate all DiskANN/IVF/fuzz tests and benchmarks to new syntax
- Add legacy DB fixture (v0.1.6) and 9 backwards-compat tests

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alex Garcia 2026-03-31 22:39:18 -07:00
parent b7fc459be4
commit 6e2c4c6bab
21 changed files with 512 additions and 105 deletions

View file

@ -78,7 +78,7 @@ def test_batch_insert_knn_recall(db):
)
assert ivf_total_vectors(db) == 200
db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')")
db.execute("INSERT INTO t(t) VALUES ('compute-centroids')")
assert ivf_assigned_count(db) == 200
# Query near 100 -- closest should be rowid 100
@ -107,7 +107,7 @@ def test_delete_rows_gone_from_knn(db):
[i, _f32([float(i), 0, 0, 0])],
)
db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')")
db.execute("INSERT INTO t(t) VALUES ('compute-centroids')")
# Delete rowid 10
db.execute("DELETE FROM t WHERE rowid = 10")
@ -127,7 +127,7 @@ def test_delete_all_rows_empty_results(db):
[i, _f32([float(i), 0, 0, 0])],
)
db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')")
db.execute("INSERT INTO t(t) VALUES ('compute-centroids')")
for i in range(10):
db.execute("DELETE FROM t WHERE rowid = ?", [i])
@ -152,7 +152,7 @@ def test_insert_after_delete_reuse_rowid(db):
[i, _f32([float(i), 0, 0, 0])],
)
db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')")
db.execute("INSERT INTO t(t) VALUES ('compute-centroids')")
# Delete rowid 5
db.execute("DELETE FROM t WHERE rowid = 5")
@ -184,7 +184,7 @@ def test_update_vector_via_delete_insert(db):
[i, _f32([float(i), 0, 0, 0])],
)
db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')")
db.execute("INSERT INTO t(t) VALUES ('compute-centroids')")
# "Update" rowid 3: delete and re-insert with new vector
db.execute("DELETE FROM t WHERE rowid = 3")
@ -316,7 +316,7 @@ def test_single_row_compute_centroids(db):
db.execute(
"INSERT INTO t(rowid, v) VALUES (1, ?)", [_f32([1, 2, 3, 4])]
)
db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')")
db.execute("INSERT INTO t(t) VALUES ('compute-centroids')")
assert ivf_assigned_count(db) == 1
results = knn(db, [1, 2, 3, 4], 1)
@ -343,10 +343,10 @@ def test_cell_overflow_many_vectors(db):
# Set a single centroid so all vectors go there
db.execute(
"INSERT INTO t(rowid, v) VALUES ('set-centroid:0', ?)",
"INSERT INTO t(t, v) VALUES ('set-centroid:0', ?)",
[_f32([1.0, 0, 0, 0])],
)
db.execute("INSERT INTO t(rowid) VALUES ('assign-vectors')")
db.execute("INSERT INTO t(t) VALUES ('assign-vectors')")
assert ivf_assigned_count(db) == 100
@ -377,7 +377,7 @@ def test_large_batch_with_training(db):
[i, _f32([float(i), 0, 0, 0])],
)
db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')")
db.execute("INSERT INTO t(t) VALUES ('compute-centroids')")
for i in range(500, 1000):
db.execute(
@ -409,7 +409,7 @@ def test_knn_after_interleaved_insert_delete(db):
[i, _f32([float(i), 0, 0, 0])],
)
db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')")
db.execute("INSERT INTO t(t) VALUES ('compute-centroids')")
# Delete rowids 0-9 (closest to query at 5.0)
for i in range(10):
@ -434,7 +434,7 @@ def test_knn_empty_centroids_after_deletes(db):
[i, _f32([float(i % 10) * 10, 0, 0, 0])],
)
db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')")
db.execute("INSERT INTO t(t) VALUES ('compute-centroids')")
# Delete a bunch, potentially emptying some centroids
for i in range(30):
@ -458,7 +458,7 @@ def test_knn_correct_distances(db):
db.execute("INSERT INTO t(rowid, v) VALUES (2, ?)", [_f32([3, 0, 0, 0])])
db.execute("INSERT INTO t(rowid, v) VALUES (3, ?)", [_f32([0, 4, 0, 0])])
db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')")
db.execute("INSERT INTO t(t) VALUES ('compute-centroids')")
results = knn(db, [0, 0, 0, 0], 3)
result_map = {r[0]: r[1] for r in results}
@ -547,7 +547,7 @@ def test_interleaved_ops_correctness(db):
[i, _f32([float(i), 0, 0, 0])],
)
db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')")
db.execute("INSERT INTO t(t) VALUES ('compute-centroids')")
# Phase 2: Delete even-numbered rowids
for i in range(0, 50, 2):