Add FTS5-style command column and runtime oversample for rescore

Replace the old INSERT INTO t(rowid) VALUES('command') hack with a
proper hidden command column named after the table (FTS5 pattern):

  INSERT INTO t(t) VALUES ('oversample=16')

The command column is the first hidden column (before distance and k),
which reserves the option of using it as a table-valued function
argument in the future.

Schema: CREATE TABLE x(rowid, <cols>, "<table>" hidden, distance hidden, k hidden)

For backwards compatibility, pre-v0.1.10 tables (detected via the _info
shadow table version) skip the command column to avoid name conflicts
with user columns that may share the table's name. Verified with a
legacy fixture DB generated by sqlite-vec v0.1.6.

Changes:
- Add hidden command column to sqlite3_declare_vtab for new tables
- Version-gate via _info shadow table for existing tables
- Validate at CREATE time that no column name matches table name
- Add rescore_handle_command() with oversample=N support
- rescore_knn() prefers runtime oversample_search over CREATE default
- Remove old rowid-based command dispatch
- Migrate all DiskANN/IVF/fuzz tests and benchmarks to new syntax
- Add legacy DB fixture (v0.1.6) and 9 backwards-compat tests

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alex Garcia 2026-03-31 22:39:18 -07:00
parent b7fc459be4
commit 6e2c4c6bab
21 changed files with 512 additions and 105 deletions

View file

@ -456,7 +456,7 @@ def _ivf_create_table_sql(params):
def _ivf_post_insert_hook(conn, params):
print(" Training k-means centroids (built-in)...", flush=True)
t0 = time.perf_counter()
conn.execute("INSERT INTO vec_items(id) VALUES ('compute-centroids')")
conn.execute("INSERT INTO vec_items(vec_items) VALUES ('compute-centroids')")
conn.commit()
elapsed = time.perf_counter() - t0
print(f" Training done in {elapsed:.1f}s", flush=True)
@ -514,7 +514,7 @@ def _ivf_faiss_kmeans_hook(conn, params):
for cid, blob in centroids:
conn.execute(
"INSERT INTO vec_items(id, embedding) VALUES (?, ?)",
"INSERT INTO vec_items(vec_items, embedding) VALUES (?, ?)",
(f"set-centroid:{cid}", blob),
)
conn.commit()
@ -540,7 +540,7 @@ def _ivf_pre_query_hook(conn, params):
nprobe = params.get("nprobe")
if nprobe:
conn.execute(
"INSERT INTO vec_items(id) VALUES (?)",
"INSERT INTO vec_items(vec_items) VALUES (?)",
(f"nprobe={nprobe}",),
)
conn.commit()
@ -572,7 +572,7 @@ INDEX_REGISTRY["ivf"] = {
"insert_sql": None,
"post_insert_hook": _ivf_post_insert_hook,
"pre_query_hook": _ivf_pre_query_hook,
"train_sql": lambda _: "INSERT INTO vec_items(id) VALUES ('compute-centroids')",
"train_sql": lambda _: "INSERT INTO vec_items(vec_items) VALUES ('compute-centroids')",
"run_query": None,
"query_sql": None,
"describe": _ivf_describe,
@ -616,7 +616,7 @@ def _diskann_pre_query_hook(conn, params):
L_search = params.get("L_search", 0)
if L_search:
conn.execute(
"INSERT INTO vec_items(id) VALUES (?)",
"INSERT INTO vec_items(vec_items) VALUES (?)",
(f"search_list_size_search={L_search}",),
)
conn.commit()