Add rescore index for ANN queries

Add a rescore index type: it stores full-precision float vectors in a rowid-keyed
shadow table, scans a quantized copy (bit or int8) for a fast initial pass, then
rescores the top candidates against the original vectors. Includes the config parser,
shadow table management, insert/delete support, KNN integration, a compile flag
(SQLITE_VEC_ENABLE_RESCORE), fuzz targets, and tests.
This commit is contained in:
Alex Garcia 2026-03-29 19:45:54 -07:00
parent bf2455f2ba
commit ba0db0b6d6
19 changed files with 3378 additions and 8 deletions

View file

@ -21,9 +21,14 @@ BASELINES = \
# ANNOY_CONFIGS = \
# "annoy-t50:type=annoy,n_trees=50"
ALL_CONFIGS = $(BASELINES)
# Rescore index configurations to benchmark. Each entry is a quoted
# "<name>:type=rescore,<key>=<value>,..." string; the entries vary the
# quantizer (bit or int8) and the oversample factor (8 or 16).
RESCORE_CONFIGS = \
"rescore-bit-os8:type=rescore,quantizer=bit,oversample=8" \
"rescore-bit-os16:type=rescore,quantizer=bit,oversample=16" \
"rescore-int8-os8:type=rescore,quantizer=int8,oversample=8"
.PHONY: seed ground-truth bench-smoke bench-10k bench-50k bench-100k bench-all \
ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS)
.PHONY: seed ground-truth bench-smoke bench-rescore bench-10k bench-50k bench-100k bench-all \
report clean
# --- Data preparation ---
@ -40,6 +45,10 @@ bench-smoke: seed
$(BENCH) --subset-size 5000 -k 10 -n 20 -o runs/smoke \
$(BASELINES)
# Benchmark only the rescore configurations: runs $(BENCH) with
# --subset-size 10000 and -k 10 over $(RESCORE_CONFIGS), passing -o runs/rescore.
# Requires the `seed` target first.
bench-rescore: seed
	$(BENCH) --subset-size 10000 -k 10 -o runs/rescore $(RESCORE_CONFIGS)
# --- Standard sizes ---
# Run $(BENCH) over every configuration in ALL_CONFIGS with
# --subset-size 10000 and -k 10, passing -o runs/10k. Requires `seed` first.
bench-10k: seed
$(BENCH) --subset-size 10000 -k 10 -o runs/10k $(ALL_CONFIGS)

View file

@ -140,6 +140,39 @@ INDEX_REGISTRY["baseline"] = {
}
# ============================================================================
# Rescore implementation
# ============================================================================
def _rescore_create_table_sql(params):
quantizer = params.get("quantizer", "bit")
oversample = params.get("oversample", 8)
return (
f"CREATE VIRTUAL TABLE vec_items USING vec0("
f" chunk_size=256,"
f" id integer primary key,"
f" embedding float[768] distance_metric=cosine"
f" indexed by rescore(quantizer={quantizer}, oversample={oversample}))"
)
def _rescore_describe(params):
q = params.get("quantizer", "bit")
os = params.get("oversample", 8)
return f"rescore {q} (os={os})"
# Register the "rescore" index type. The entry supplies the default
# parameters plus the CREATE TABLE builder and the describe function defined
# above; the remaining hooks are left as None.
INDEX_REGISTRY["rescore"] = {
"defaults": {"quantizer": "bit", "oversample": 8},
"create_table_sql": _rescore_create_table_sql,
# NOTE(review): None presumably selects the harness's default insert path and
# skips any post-insert step — confirm against the code that reads the registry.
"insert_sql": None,
"post_insert_hook": None,
"run_query": None, # default MATCH query works — rescore is automatic
"describe": _rescore_describe,
}
# ============================================================================
# Config parsing
# ============================================================================