Add IVF index for vec0 virtual table

Add inverted file (IVF) index type: partitions vectors into clusters via
k-means, quantizes to int8, and scans only the nearest nprobe partitions at
query time. Includes shadow table management, insert/delete, KNN integration,
compile flag (SQLITE_VEC_ENABLE_IVF), fuzz targets, and tests. Removes
superseded ivf-benchmarks/ directory.
This commit is contained in:
Alex Garcia 2026-03-29 19:46:23 -07:00
parent 43982c144b
commit 3358e127f6
22 changed files with 5237 additions and 28 deletions

View file

@ -173,6 +173,48 @@ INDEX_REGISTRY["rescore"] = {
}
# ============================================================================
# IVF implementation
# ============================================================================
def _ivf_create_table_sql(params):
    """Build the CREATE VIRTUAL TABLE statement for the IVF-indexed table.

    Args:
        params: Mapping that provides ``nlist`` and ``nprobe``; both values
            are interpolated verbatim into the ``ivf(...)`` index options.

    Returns:
        A single-line SQL string that creates ``vec_items`` via ``vec0``
        with an integer primary key ``id`` and a ``float[768]`` cosine
        ``embedding`` column indexed by ``ivf``.

    Raises:
        KeyError: If ``params`` has no ``nlist`` or ``nprobe`` entry.
    """
    nlist = params["nlist"]
    nprobe = params["nprobe"]
    # Adjacent literals are concatenated into one string; only the two
    # option fragments interpolate values, so only they are f-strings.
    return (
        "CREATE VIRTUAL TABLE vec_items USING vec0("
        " id integer primary key,"
        " embedding float[768] distance_metric=cosine"
        " indexed by ivf("
        f" nlist={nlist},"
        f" nprobe={nprobe}"
        " )"
        ")"
    )
def _ivf_post_insert_hook(conn, params):
    """Run the 'compute-centroids' command after inserts and time it.

    Args:
        conn: Database connection; only ``execute`` and ``commit`` are used.
        params: Index parameters. Unused here; presumably accepted so the
            hook matches the signature the registry consumer calls —
            confirm against the caller.

    Returns:
        Elapsed seconds (``time.perf_counter`` delta) covering the command
        statement and the commit that follows it.
    """
    print(" Training k-means centroids...", flush=True)
    started = time.perf_counter()
    # The command is issued as an INSERT of the literal 'compute-centroids'
    # into the id column of vec_items.
    conn.execute("INSERT INTO vec_items(id) VALUES ('compute-centroids')")
    conn.commit()
    elapsed = time.perf_counter() - started
    print(f" Training done in {elapsed:.1f}s", flush=True)
    return elapsed
def _ivf_describe(params):
    """Return a short one-line label for an IVF configuration.

    Args:
        params: Mapping that provides ``nlist`` and ``nprobe``.

    Returns:
        A string of the form ``"ivf nlist=<nlist> nprobe=<nprobe>"``, with
        ``nlist`` left-aligned and padded to at least four characters.

    Raises:
        KeyError: If ``params`` has no ``nlist`` or ``nprobe`` entry.
    """
    return f"ivf nlist={params['nlist']:<4} nprobe={params['nprobe']}"
# Registry entry for the "ivf" index type: default parameters plus the
# callbacks defined above for table creation, post-insert training, and
# the human-readable description.
# NOTE(review): "insert_sql" and "run_query" are None — presumably the
# harness falls back to its generic behaviour for these; confirm against
# the code that consumes INDEX_REGISTRY.
INDEX_REGISTRY["ivf"] = {
    "defaults": {
        "nlist": 128,
        "nprobe": 16,
    },
    "create_table_sql": _ivf_create_table_sql,
    "insert_sql": None,
    "post_insert_hook": _ivf_post_insert_hook,
    "run_query": None,
    "describe": _ivf_describe,
}
# ============================================================================
# Config parsing
# ============================================================================