mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-27 01:36:32 +02:00
Add IVF index for vec0 virtual table
Add inverted file (IVF) index type: partitions vectors into clusters via k-means, quantizes to int8, and scans only the nearest nprobe partitions at query time. Includes shadow table management, insert/delete, KNN integration, compile flag (SQLITE_VEC_ENABLE_IVF), fuzz targets, and tests. Removes superseded ivf-benchmarks/ directory.
This commit is contained in:
parent
43982c144b
commit
3358e127f6
22 changed files with 5237 additions and 28 deletions
|
|
@ -8,27 +8,20 @@ BASELINES = \
|
|||
"brute-int8:type=baseline,variant=int8" \
|
||||
"brute-bit:type=baseline,variant=bit"
|
||||
|
||||
# --- Index-specific configs ---
|
||||
# Each index branch should add its own configs here. Example:
|
||||
#
|
||||
# DISKANN_CONFIGS = \
|
||||
# "diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \
|
||||
# "diskann-R72-int8:type=diskann,R=72,L=128,quantizer=int8"
|
||||
#
|
||||
# IVF_CONFIGS = \
|
||||
# "ivf-n128-p16:type=ivf,nlist=128,nprobe=16"
|
||||
#
|
||||
# ANNOY_CONFIGS = \
|
||||
# "annoy-t50:type=annoy,n_trees=50"
|
||||
# --- IVF configs ---
# One entry per benchmark configuration, written as
# "<label>:type=ivf,nlist=<N>,nprobe=<P>".
IVF_CONFIGS  = "ivf-n32-p8:type=ivf,nlist=32,nprobe=8"
IVF_CONFIGS += "ivf-n128-p16:type=ivf,nlist=128,nprobe=16"
IVF_CONFIGS += "ivf-n512-p32:type=ivf,nlist=512,nprobe=32"
|
||||
|
||||
# Rescore configurations, written as
# "<label>:type=rescore,quantizer=<bit|int8>,oversample=<factor>".
RESCORE_CONFIGS  = "rescore-bit-os8:type=rescore,quantizer=bit,oversample=8"
RESCORE_CONFIGS += "rescore-bit-os16:type=rescore,quantizer=bit,oversample=16"
RESCORE_CONFIGS += "rescore-int8-os8:type=rescore,quantizer=int8,oversample=8"
|
||||
|
||||
# Union of the baseline, rescore and IVF configuration lists.
# (Kept recursive so later changes to the component lists are picked up.)
ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS) $(IVF_CONFIGS)
|
||||
|
||||
# All targets below are commands, not files; declare them phony so a stray
# file with the same name can never make them look up to date.
.PHONY: seed ground-truth bench-smoke bench-rescore bench-ivf bench-10k bench-50k bench-100k bench-all \
        report clean
|
||||
|
||||
# --- Data preparation ---
|
||||
|
|
@ -43,7 +36,8 @@ ground-truth: seed
|
|||
# --- Quick smoke test ---
# Runs the benchmark driver with --subset-size 5000 against one float
# brute-force baseline and one small IVF configuration; output goes to
# runs/smoke.
bench-smoke: seed
	$(BENCH) --subset-size 5000 -k 10 -n 20 -o runs/smoke \
		"brute-float:type=baseline,variant=float" \
		"ivf-quick:type=ivf,nlist=16,nprobe=4"
|
||||
|
||||
bench-rescore: seed
|
||||
$(BENCH) --subset-size 10000 -k 10 -o runs/rescore \
|
||||
|
|
@ -62,6 +56,12 @@ bench-100k: seed
|
|||
|
||||
# Aggregate target: has no recipe of its own, it only pulls in the
# bench-10k, bench-50k and bench-100k targets as prerequisites.
bench-all: bench-10k bench-50k bench-100k
|
||||
|
||||
# --- IVF across sizes ---
# Arguments shared by every bench-ivf invocation; only --subset-size varies.
# Note that all three runs use the same output directory, runs/ivf.
ivf_bench_args = -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS)

bench-ivf: seed
	$(BENCH) --subset-size 10000 $(ivf_bench_args)
	$(BENCH) --subset-size 50000 $(ivf_bench_args)
	$(BENCH) --subset-size 100000 $(ivf_bench_args)
|
||||
|
||||
# --- Report ---
|
||||
report:
|
||||
@echo "Use: sqlite3 runs/<dir>/results.db 'SELECT * FROM bench_results ORDER BY recall DESC'"
|
||||
|
|
|
|||
|
|
@ -173,6 +173,48 @@ INDEX_REGISTRY["rescore"] = {
|
|||
}
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# IVF implementation
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def _ivf_create_table_sql(params):
|
||||
return (
|
||||
f"CREATE VIRTUAL TABLE vec_items USING vec0("
|
||||
f" id integer primary key,"
|
||||
f" embedding float[768] distance_metric=cosine"
|
||||
f" indexed by ivf("
|
||||
f" nlist={params['nlist']},"
|
||||
f" nprobe={params['nprobe']}"
|
||||
f" )"
|
||||
f")"
|
||||
)
|
||||
|
||||
|
||||
def _ivf_post_insert_hook(conn, params):
|
||||
print(" Training k-means centroids...", flush=True)
|
||||
t0 = time.perf_counter()
|
||||
conn.execute("INSERT INTO vec_items(id) VALUES ('compute-centroids')")
|
||||
conn.commit()
|
||||
elapsed = time.perf_counter() - t0
|
||||
print(f" Training done in {elapsed:.1f}s", flush=True)
|
||||
return elapsed
|
||||
|
||||
|
||||
def _ivf_describe(params):
|
||||
return f"ivf nlist={params['nlist']:<4} nprobe={params['nprobe']}"
|
||||
|
||||
|
||||
# Register the IVF index type. Entries set to None carry no IVF-specific
# override (presumably the harness falls back to its generic insert/query
# path — confirm against the code that consumes INDEX_REGISTRY).
INDEX_REGISTRY["ivf"] = dict(
    defaults=dict(nlist=128, nprobe=16),
    create_table_sql=_ivf_create_table_sql,
    insert_sql=None,
    post_insert_hook=_ivf_post_insert_hook,
    run_query=None,
    describe=_ivf_describe,
)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Config parsing
|
||||
# ============================================================================
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue