From bf2455f2bacb0aef976ee03e03e265f270d5afc9 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Sun, 29 Mar 2026 19:44:44 -0700 Subject: [PATCH 01/38] Add ANN search support for vec0 virtual table Add approximate nearest neighbor infrastructure to vec0: shared distance dispatch (vec0_distance_full), flat index type with parser, NEON-optimized cosine/Hamming for float32/int8, amalgamation script, and benchmark suite (benchmarks-ann/) with ground-truth generation and profiling tools. Remove unused vec_npy_each/vec_static_blobs code, fix missing stdint.h include. --- Makefile | 14 +- TODO.md | 73 + benchmarks-ann/.gitignore | 2 + benchmarks-ann/Makefile | 61 + benchmarks-ann/README.md | 81 + benchmarks-ann/bench.py | 488 ++++++ benchmarks-ann/ground_truth.py | 168 ++ benchmarks-ann/profile.py | 440 ++++++ benchmarks-ann/schema.sql | 35 + benchmarks-ann/seed/.gitignore | 2 + benchmarks-ann/seed/Makefile | 24 + benchmarks-ann/seed/build_base_db.py | 121 ++ benchmarks/exhaustive-memory/bench.py | 57 +- benchmarks/profiling/build-from-npy.sql | 7 - benchmarks/self-params/build.py | 14 +- bindings/go/ncruces/go-sqlite3.patch | 1 - bindings/python/extra_init.py | 31 - scripts/amalgamate.py | 119 ++ site/api-reference.md | 59 - site/compiling.md | 1 - sqlite-vec.c | 1863 +++++------------------ tests/correctness/test-correctness.py | 17 +- tests/fuzz/numpy.c | 37 - tests/sqlite-vec-internal.h | 6 + tests/test-loadable.py | 415 +---- tests/test-unit.c | 101 ++ tmp-static.py | 56 - 27 files changed, 2177 insertions(+), 2116 deletions(-) create mode 100644 TODO.md create mode 100644 benchmarks-ann/.gitignore create mode 100644 benchmarks-ann/Makefile create mode 100644 benchmarks-ann/README.md create mode 100644 benchmarks-ann/bench.py create mode 100644 benchmarks-ann/ground_truth.py create mode 100644 benchmarks-ann/profile.py create mode 100644 benchmarks-ann/schema.sql create mode 100644 benchmarks-ann/seed/.gitignore create mode 100644 benchmarks-ann/seed/Makefile create 
mode 100644 benchmarks-ann/seed/build_base_db.py create mode 100644 scripts/amalgamate.py delete mode 100644 tests/fuzz/numpy.c delete mode 100644 tmp-static.py diff --git a/Makefile b/Makefile index 1ebdbed..051590e 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,11 @@ ifndef OMIT_SIMD ifeq ($(shell uname -sm),Darwin arm64) CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON endif + ifeq ($(shell uname -s),Linux) + ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),) + CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX + endif + endif endif ifdef USE_BREW_SQLITE @@ -155,6 +160,13 @@ clean: rm -rf dist +TARGET_AMALGAMATION=$(prefix)/sqlite-vec.c + +amalgamation: $(TARGET_AMALGAMATION) + +$(TARGET_AMALGAMATION): sqlite-vec.c $(wildcard sqlite-vec-*.c) scripts/amalgamate.py $(prefix) + python3 scripts/amalgamate.py sqlite-vec.c > $@ + FORMAT_FILES=sqlite-vec.h sqlite-vec.c format: $(FORMAT_FILES) clang-format -i $(FORMAT_FILES) @@ -174,7 +186,7 @@ evidence-of: test: sqlite3 :memory: '.read test.sql' -.PHONY: version loadable static test clean gh-release evidence-of install uninstall +.PHONY: version loadable static test clean gh-release evidence-of install uninstall amalgamation publish-release: ./scripts/publish-release.sh diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..4c3cc19 --- /dev/null +++ b/TODO.md @@ -0,0 +1,73 @@ +# TODO: `ann` base branch + consolidated benchmarks + +## 1. 
Create `ann` branch with shared code + +### 1.1 Branch setup +- [x] `git checkout -B ann origin/main` +- [x] Cherry-pick `624f998` (vec0_distance_full shared distance dispatch) +- [x] Cherry-pick stdint.h fix for test header +- [ ] Pull NEON cosine optimization from ivf-yolo3 into shared code + - Currently only in ivf branch but is general-purpose (benefits all distance calcs) + - Lives in `distance_cosine_float()` — ~57 lines of ARM NEON vectorized cosine + +### 1.2 Benchmark infrastructure (`benchmarks-ann/`) +- [x] Seed data pipeline (`seed/Makefile`, `seed/build_base_db.py`) +- [x] Ground truth generator (`ground_truth.py`) +- [x] Results schema (`schema.sql`) +- [x] Benchmark runner with `INDEX_REGISTRY` extension point (`bench.py`) + - Baseline configs (float, int8-rescore, bit-rescore) implemented + - Index branches register their types via `INDEX_REGISTRY` dict +- [x] Makefile with baseline targets +- [x] README + +### 1.3 Rebase feature branches onto `ann` +- [x] Rebase `diskann-yolo2` onto `ann` (1 commit: DiskANN implementation) +- [x] Rebase `ivf-yolo3` onto `ann` (1 commit: IVF implementation) +- [x] Rebase `annoy-yolo2` onto `ann` (2 commits: Annoy implementation + schema fix) +- [x] Verify each branch has only its index-specific commits remaining +- [ ] Force-push all 4 branches to origin + +--- + +## 2. Per-branch: register index type in benchmarks + +Each index branch should add to `benchmarks-ann/` when rebased onto `ann`: + +### 2.1 Register in `bench.py` + +Add an `INDEX_REGISTRY` entry. 
Each entry provides: +- `defaults` — default param values +- `create_table_sql(params)` — CREATE VIRTUAL TABLE with INDEXED BY clause +- `insert_sql(params)` — custom insert SQL, or None for default +- `post_insert_hook(conn, params)` — training/building step, returns time +- `run_query(conn, params, query, k)` — custom query, or None for default MATCH +- `describe(params)` — one-line description for report output + +### 2.2 Add configs to `Makefile` + +Append index-specific config variables and targets. Example pattern: + +```makefile +DISKANN_CONFIGS = \ + "diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \ + ... + +ALL_CONFIGS += $(DISKANN_CONFIGS) + +bench-diskann: seed + $(BENCH) --subset-size 10000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS) + ... +``` + +### 2.3 Migrate existing benchmark results/docs + +- Move useful results docs (RESULTS.md, etc.) into `benchmarks-ann/results/` +- Delete redundant per-branch benchmark directories once consolidated infra is proven + +--- + +## 3. 
Future improvements + +- [ ] Reporting script (`report.py`) — query results.db, produce markdown comparison tables +- [ ] Profiling targets in Makefile (lift from ivf-yolo3's Instruments/perf wrappers) +- [ ] Pre-computed ground truth integration (use GT DB files instead of on-the-fly brute-force) diff --git a/benchmarks-ann/.gitignore b/benchmarks-ann/.gitignore new file mode 100644 index 0000000..c418b76 --- /dev/null +++ b/benchmarks-ann/.gitignore @@ -0,0 +1,2 @@ +*.db +runs/ diff --git a/benchmarks-ann/Makefile b/benchmarks-ann/Makefile new file mode 100644 index 0000000..59e2dcd --- /dev/null +++ b/benchmarks-ann/Makefile @@ -0,0 +1,61 @@ +BENCH = python bench.py +BASE_DB = seed/base.db +EXT = ../dist/vec0 + +# --- Baseline (brute-force) configs --- +BASELINES = \ + "brute-float:type=baseline,variant=float" \ + "brute-int8:type=baseline,variant=int8" \ + "brute-bit:type=baseline,variant=bit" + +# --- Index-specific configs --- +# Each index branch should add its own configs here. Example: +# +# DISKANN_CONFIGS = \ +# "diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \ +# "diskann-R72-int8:type=diskann,R=72,L=128,quantizer=int8" +# +# IVF_CONFIGS = \ +# "ivf-n128-p16:type=ivf,nlist=128,nprobe=16" +# +# ANNOY_CONFIGS = \ +# "annoy-t50:type=annoy,n_trees=50" + +ALL_CONFIGS = $(BASELINES) + +.PHONY: seed ground-truth bench-smoke bench-10k bench-50k bench-100k bench-all \ + report clean + +# --- Data preparation --- +seed: + $(MAKE) -C seed + +ground-truth: seed + python ground_truth.py --subset-size 10000 + python ground_truth.py --subset-size 50000 + python ground_truth.py --subset-size 100000 + +# --- Quick smoke test --- +bench-smoke: seed + $(BENCH) --subset-size 5000 -k 10 -n 20 -o runs/smoke \ + $(BASELINES) + +# --- Standard sizes --- +bench-10k: seed + $(BENCH) --subset-size 10000 -k 10 -o runs/10k $(ALL_CONFIGS) + +bench-50k: seed + $(BENCH) --subset-size 50000 -k 10 -o runs/50k $(ALL_CONFIGS) + +bench-100k: seed + $(BENCH) --subset-size 
100000 -k 10 -o runs/100k $(ALL_CONFIGS) + +bench-all: bench-10k bench-50k bench-100k + +# --- Report --- +report: + @echo "Use: sqlite3 runs//results.db 'SELECT * FROM bench_results ORDER BY recall DESC'" + +# --- Cleanup --- +clean: + rm -rf runs/ diff --git a/benchmarks-ann/README.md b/benchmarks-ann/README.md new file mode 100644 index 0000000..1f7fd5c --- /dev/null +++ b/benchmarks-ann/README.md @@ -0,0 +1,81 @@ +# KNN Benchmarks for sqlite-vec + +Benchmarking infrastructure for vec0 KNN configurations. Includes brute-force +baselines (float, int8, bit); index-specific branches add their own types +via the `INDEX_REGISTRY` in `bench.py`. + +## Prerequisites + +- Built `dist/vec0` extension (run `make` from repo root) +- Python 3.10+ +- `uv` (for seed data prep): `pip install uv` + +## Quick start + +```bash +# 1. Download dataset and build seed DB (~3 GB download, ~5 min) +make seed + +# 2. Run a quick smoke test (5k vectors, ~1 min) +make bench-smoke + +# 3. Run full benchmark at 10k +make bench-10k +``` + +## Usage + +### Direct invocation + +```bash +python bench.py --subset-size 10000 \ + "brute-float:type=baseline,variant=float" \ + "brute-int8:type=baseline,variant=int8" \ + "brute-bit:type=baseline,variant=bit" +``` + +### Config format + +`name:type=,key=val,key=val` + +| Index type | Keys | Branch | +|-----------|------|--------| +| `baseline` | `variant` (float/int8/bit), `oversample` | this branch | + +Index branches register additional types in `INDEX_REGISTRY`. See the +docstring in `bench.py` for the extension API. 
+ +### Make targets + +| Target | Description | +|--------|-------------| +| `make seed` | Download COHERE 1M dataset | +| `make ground-truth` | Pre-compute ground truth for 10k/50k/100k | +| `make bench-smoke` | Quick 5k baseline test | +| `make bench-10k` | All configs at 10k vectors | +| `make bench-50k` | All configs at 50k vectors | +| `make bench-100k` | All configs at 100k vectors | +| `make bench-all` | 10k + 50k + 100k | + +## Adding an index type + +In your index branch, add an entry to `INDEX_REGISTRY` in `bench.py` and +append your configs to `ALL_CONFIGS` in the `Makefile`. See the existing +`baseline` entry and the comments in both files for the pattern. + +## Results + +Results are stored in `runs//results.db` using the schema in `schema.sql`. + +```bash +sqlite3 runs/10k/results.db " + SELECT config_name, recall, mean_ms, qps + FROM bench_results + ORDER BY recall DESC +" +``` + +## Dataset + +[Zilliz COHERE Medium 1M](https://zilliz.com/learn/datasets-for-vector-database-benchmarks): +768 dimensions, cosine distance, 1M train vectors + 10k query vectors with precomputed neighbors. diff --git a/benchmarks-ann/bench.py b/benchmarks-ann/bench.py new file mode 100644 index 0000000..93f8f82 --- /dev/null +++ b/benchmarks-ann/bench.py @@ -0,0 +1,488 @@ +#!/usr/bin/env python3 +"""Benchmark runner for sqlite-vec KNN configurations. + +Measures insert time, build/train time, DB size, KNN latency, and recall +across different vec0 configurations. + +Config format: name:type=,key=val,key=val + + Baseline (brute-force) keys: + type=baseline, variant=float|int8|bit, oversample=8 + + Index-specific types can be registered via INDEX_REGISTRY (see below). 
+ +Usage: + python bench.py --subset-size 10000 \ + "brute-float:type=baseline,variant=float" \ + "brute-int8:type=baseline,variant=int8" \ + "brute-bit:type=baseline,variant=bit" +""" +import argparse +import os +import sqlite3 +import statistics +import time + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +EXT_PATH = os.path.join(_SCRIPT_DIR, "..", "dist", "vec0") +BASE_DB = os.path.join(_SCRIPT_DIR, "seed", "base.db") +INSERT_BATCH_SIZE = 1000 + + +# ============================================================================ +# Index registry — extension point for ANN index branches +# ============================================================================ +# +# Each index type provides a dict with: +# "defaults": dict of default params +# "create_table_sql": fn(params) -> SQL string +# "insert_sql": fn(params) -> SQL string (or None for default) +# "post_insert_hook": fn(conn, params) -> train_time_s (or None) +# "run_query": fn(conn, params, query, k) -> [(id, distance), ...] (or None for default MATCH) +# "describe": fn(params) -> str (one-line description) +# +# To add a new index type, add an entry here. 
def _baseline_create_table_sql(params):
    """Return the CREATE VIRTUAL TABLE statement for a baseline ``vec_items`` table.

    The table always has an integer primary key ``id`` and a 768-dimensional
    float ``embedding`` column using cosine distance.  Depending on
    ``params["variant"]`` one extra quantized column is appended:

    * ``"int8"`` -> ``embedding_int8 int8[768]``
    * ``"bit"``  -> ``embedding_bq bit[768]``
    * anything else -> no extra column
    """
    # Column clause appended for each quantized variant; variants that are
    # not listed here get no additional column.
    quantized_column = {
        "int8": ", embedding_int8 int8[768]",
        "bit": ", embedding_bq bit[768]",
    }
    extra = quantized_column.get(params["variant"], "")

    fragments = [
        "CREATE VIRTUAL TABLE vec_items USING vec0(",
        " chunk_size=256,",
        " id integer primary key,",
        " embedding float[768] distance_metric=cosine",
        f" {extra})",
    ]
    return "".join(fragments)
def parse_config(spec):
    """Parse ``'name:type=baseline,key=val,...'`` into ``(name, params_dict)``.

    Everything before the first ``:`` is the config name; the remainder is a
    comma-separated list of ``key=value`` options.  The ``type`` option
    (default ``"baseline"``) selects an entry of ``INDEX_REGISTRY``; that
    entry's ``defaults`` are copied and then overridden by the given options.
    Options whose key is listed in ``INT_KEYS`` are converted to ``int``; all
    other values stay strings.  The selected type is stored in the result
    under ``"index_type"``.

    Empty segments (for example a trailing comma) are ignored.

    Raises:
        ValueError: if an option is not of the form ``key=value``, if the
            index type is not registered, or if an integer option has a
            non-integer value.
    """
    name, _, opts_str = spec.partition(":")

    raw = {}
    for item in opts_str.split(","):
        if not item.strip():
            # Tolerate "a=1,,b=2" and trailing commas instead of failing
            # with an opaque unpacking error.
            continue
        key, sep, value = item.partition("=")
        if not sep:
            raise ValueError(
                f"Malformed option {item.strip()!r} in config {spec!r}: "
                f"expected key=value"
            )
        raw[key.strip()] = value.strip()

    index_type = raw.pop("type", "baseline")
    if index_type not in INDEX_REGISTRY:
        raise ValueError(
            f"Unknown index type: {index_type}. "
            f"Available: {', '.join(sorted(INDEX_REGISTRY.keys()))}"
        )

    reg = INDEX_REGISTRY[index_type]
    # Copy so that per-config overrides never mutate the registry defaults.
    params = dict(reg["defaults"])
    for key, value in raw.items():
        if key in INT_KEYS:
            try:
                params[key] = int(value)
            except ValueError as err:
                raise ValueError(
                    f"Option {key!r} in config {spec!r} must be an integer, "
                    f"got {value!r}"
                ) from err
        else:
            params[key] = value
    params["index_type"] = index_type

    return name, params
f"{name}.{subset_size}.db") + conn = open_bench_db(db_path, ext_path, base_db) + + reg = INDEX_REGISTRY[params["index_type"]] + + conn.execute(reg["create_table_sql"](params)) + + label = params["index_type"] + print(f" Inserting {subset_size} vectors...") + + sql_fn = reg.get("insert_sql") + sql = sql_fn(params) if sql_fn else None + if sql is None: + sql = DEFAULT_INSERT_SQL + + insert_time = insert_loop(conn, sql, subset_size, label) + + train_time = 0.0 + hook = reg.get("post_insert_hook") + if hook: + train_time = hook(conn, params) + + row_count = conn.execute("SELECT count(*) FROM vec_items").fetchone()[0] + conn.close() + file_size_mb = os.path.getsize(db_path) / (1024 * 1024) + + return { + "db_path": db_path, + "insert_time_s": round(insert_time, 3), + "train_time_s": round(train_time, 3), + "total_time_s": round(insert_time + train_time, 3), + "insert_per_vec_ms": round((insert_time / row_count) * 1000, 2) + if row_count + else 0, + "rows": row_count, + "file_size_mb": round(file_size_mb, 2), + } + + +# ============================================================================ +# KNN measurement +# ============================================================================ + + +def _default_match_query(conn, query, k): + return conn.execute( + "SELECT id, distance FROM vec_items " + "WHERE embedding MATCH :query AND k = :k", + {"query": query, "k": k}, + ).fetchall() + + +def measure_knn(db_path, ext_path, base_db, params, subset_size, k=10, n=50): + conn = sqlite3.connect(db_path) + conn.enable_load_extension(True) + conn.load_extension(ext_path) + conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + + query_vectors = load_query_vectors(base_db, n) + + reg = INDEX_REGISTRY[params["index_type"]] + query_fn = reg.get("run_query") + + times_ms = [] + recalls = [] + for qid, query in query_vectors: + t0 = time.perf_counter() + + results = None + if query_fn: + results = query_fn(conn, params, query, k) + if results is None: + results = 
def save_results(results_path, rows):
    """Persist benchmark results into the SQLite database at ``results_path``.

    The schema in ``schema.sql`` (next to this script) is applied first.  Then,
    for every result dict in ``rows``, one row is written to ``build_results``
    and one to ``bench_results`` using ``INSERT OR REPLACE``.  Queries per
    second are derived from ``n_queries`` and ``total_ms`` and stored as 0
    when ``total_ms`` is not positive.

    Args:
        results_path: path of the results database file.
        rows: iterable of result dicts as assembled by ``main``.
    """
    # Read the schema through a context manager so the file handle is closed
    # deterministically rather than whenever the garbage collector runs.
    schema_path = os.path.join(_SCRIPT_DIR, "schema.sql")
    with open(schema_path, encoding="utf-8") as schema_file:
        schema_sql = schema_file.read()

    db = sqlite3.connect(results_path)
    try:
        db.executescript(schema_sql)
        for r in rows:
            db.execute(
                "INSERT OR REPLACE INTO build_results "
                "(config_name, index_type, subset_size, db_path, "
                " insert_time_s, train_time_s, total_time_s, rows, file_size_mb) "
                "VALUES (?,?,?,?,?,?,?,?,?)",
                (
                    r["name"], r["index_type"], r["n_vectors"], r["db_path"],
                    r["insert_time_s"], r["train_time_s"], r["total_time_s"],
                    r["rows"], r["file_size_mb"],
                ),
            )
            # Guard against division by zero when no time was measured.
            if r["total_ms"] > 0:
                qps = round(r["n_queries"] / (r["total_ms"] / 1000), 1)
            else:
                qps = 0
            db.execute(
                "INSERT OR REPLACE INTO bench_results "
                "(config_name, index_type, subset_size, k, n, "
                " mean_ms, median_ms, p99_ms, total_ms, qps, recall, db_path) "
                "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
                (
                    r["name"], r["index_type"], r["n_vectors"], r["k"], r["n_queries"],
                    r["mean_ms"], r["median_ms"], r["p99_ms"], r["total_ms"],
                    qps,
                    r["recall"], r["db_path"],
                ),
            )
        db.commit()
    finally:
        # Always release the connection, even if a statement above failed.
        db.close()
parser.add_argument("--results-db", default=None, + help="path to results DB (default: /results.db)") + args = parser.parse_args() + + os.makedirs(args.out_dir, exist_ok=True) + results_db = args.results_db or os.path.join(args.out_dir, "results.db") + configs = [parse_config(c) for c in args.configs] + + all_results = [] + for i, (name, params) in enumerate(configs, 1): + reg = INDEX_REGISTRY[params["index_type"]] + desc = reg["describe"](params) + print(f"\n[{i}/{len(configs)}] {name} ({desc.strip()})") + + build = build_index( + args.base_db, args.ext, name, params, args.subset_size, args.out_dir + ) + train_str = f" + {build['train_time_s']}s train" if build["train_time_s"] > 0 else "" + print( + f" Build: {build['insert_time_s']}s insert{train_str} " + f"{build['file_size_mb']} MB" + ) + + print(f" Measuring KNN (k={args.k}, n={args.n})...") + knn = measure_knn( + build["db_path"], args.ext, args.base_db, + params, args.subset_size, k=args.k, n=args.n, + ) + print(f" KNN: mean={knn['mean_ms']}ms recall@{args.k}={knn['recall']}") + + all_results.append({ + "name": name, + "n_vectors": args.subset_size, + "index_type": params["index_type"], + "config_desc": desc, + "db_path": build["db_path"], + "insert_time_s": build["insert_time_s"], + "train_time_s": build["train_time_s"], + "total_time_s": build["total_time_s"], + "insert_per_vec_ms": build["insert_per_vec_ms"], + "rows": build["rows"], + "file_size_mb": build["file_size_mb"], + "k": args.k, + "n_queries": args.n, + "mean_ms": knn["mean_ms"], + "median_ms": knn["median_ms"], + "p99_ms": knn["p99_ms"], + "total_ms": knn["total_ms"], + "recall": knn["recall"], + }) + + print_report(all_results) + save_results(results_db, all_results) + print(f"\nResults saved to {results_db}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/ground_truth.py b/benchmarks-ann/ground_truth.py new file mode 100644 index 0000000..636a495 --- /dev/null +++ b/benchmarks-ann/ground_truth.py @@ -0,0 +1,168 @@ 
+#!/usr/bin/env python3 +"""Compute per-subset ground truth for ANN benchmarks. + +For subset sizes < 1M, builds a temporary vec0 float table with the first N +vectors and runs brute-force KNN to get correct ground truth per subset. + +For 1M (the full dataset), converts the existing `neighbors` table. + +Output: ground_truth.{subset_size}.db with table: + ground_truth(query_vector_id, rank, neighbor_id, distance) + +Usage: + python ground_truth.py --subset-size 50000 + python ground_truth.py --subset-size 1000000 +""" +import argparse +import os +import sqlite3 +import time + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +EXT_PATH = os.path.join(_SCRIPT_DIR, "..", "dist", "vec0") +BASE_DB = os.path.join(_SCRIPT_DIR, "seed", "base.db") +FULL_DATASET_SIZE = 1_000_000 + + +def gen_ground_truth_subset(base_db, ext_path, subset_size, n_queries, k, out_path): + """Build ground truth by brute-force KNN over the first `subset_size` vectors.""" + if os.path.exists(out_path): + os.remove(out_path) + + conn = sqlite3.connect(out_path) + conn.enable_load_extension(True) + conn.load_extension(ext_path) + + conn.execute( + "CREATE TABLE ground_truth (" + " query_vector_id INTEGER NOT NULL," + " rank INTEGER NOT NULL," + " neighbor_id INTEGER NOT NULL," + " distance REAL NOT NULL," + " PRIMARY KEY (query_vector_id, rank)" + ")" + ) + + conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + + print(f" Building temp vec0 table with {subset_size} vectors...") + conn.execute( + "CREATE VIRTUAL TABLE tmp_vec USING vec0(" + " id integer primary key," + " embedding float[768] distance_metric=cosine" + ")" + ) + + t0 = time.perf_counter() + conn.execute( + "INSERT INTO tmp_vec(id, embedding) " + "SELECT id, vector FROM base.train WHERE id < :n", + {"n": subset_size}, + ) + conn.commit() + build_time = time.perf_counter() - t0 + print(f" Temp table built in {build_time:.1f}s") + + query_vectors = conn.execute( + "SELECT id, vector FROM base.query_vectors ORDER BY id LIMIT 
def gen_ground_truth_full(base_db, n_queries, k, out_path):
    """Convert the existing neighbors table for the full 1M dataset.

    Creates a fresh database at ``out_path`` (any existing file is removed)
    with a ``ground_truth`` table, and fills it from ``neighbors`` in
    ``base_db`` for rows with ``query_vector_id < n_queries`` and
    ``rank < k``.  ``neighbors_id`` is cast to INTEGER; ``distance`` is left
    NULL because it is not copied from the source table.

    Args:
        base_db: path to the seed database containing ``neighbors``.
        n_queries: exclusive upper bound on ``query_vector_id``.
        k: exclusive upper bound on ``rank``.
        out_path: path of the ground-truth database to create.
    """
    if os.path.exists(out_path):
        os.remove(out_path)

    conn = sqlite3.connect(out_path)
    try:
        # Bind the path as a parameter instead of interpolating it into the
        # SQL text, so paths containing a single quote attach correctly.
        conn.execute("ATTACH DATABASE ? AS base", (os.fspath(base_db),))

        conn.execute(
            "CREATE TABLE ground_truth ("
            " query_vector_id INTEGER NOT NULL,"
            " rank INTEGER NOT NULL,"
            " neighbor_id INTEGER NOT NULL,"
            " distance REAL,"
            " PRIMARY KEY (query_vector_id, rank)"
            ")"
        )

        conn.execute(
            "INSERT INTO ground_truth(query_vector_id, rank, neighbor_id) "
            "SELECT query_vector_id, rank, CAST(neighbors_id AS INTEGER) "
            "FROM base.neighbors "
            "WHERE query_vector_id < :n AND rank < :k",
            {"n": n_queries, "k": k},
        )
        conn.commit()

        total_rows = conn.execute("SELECT count(*) FROM ground_truth").fetchone()[0]
        conn.execute("DETACH DATABASE base")
    finally:
        # Close the connection even when attaching or copying fails.
        conn.close()
    print(f" Ground truth (full): {total_rows} rows -> {out_path}")
+ +Usage: + cd benchmarks-ann + uv run profile.py --subset-size 50000 -n 50 \\ + "baseline-int8:type=baseline,variant=int8,oversample=8" \\ + "rescore-int8:type=rescore,quantizer=int8,oversample=8" +""" + +import argparse +import os +import re +import shutil +import subprocess +import sys +import tempfile + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.join(_SCRIPT_DIR, "..") + +sys.path.insert(0, _SCRIPT_DIR) +from bench import ( + BASE_DB, + DEFAULT_INSERT_SQL, + INDEX_REGISTRY, + INSERT_BATCH_SIZE, + parse_config, +) + +SQLITE3_PATH = os.path.join(_PROJECT_ROOT, "dist", "sqlite3") +EXT_PATH = os.path.join(_PROJECT_ROOT, "dist", "vec0") + + +# ============================================================================ +# SQL generation +# ============================================================================ + + +def _query_sql_for_config(params, query_id, k): + """Return a SQL query string for a single KNN query by query_vector id.""" + index_type = params["index_type"] + qvec = f"(SELECT vector FROM base.query_vectors WHERE id = {query_id})" + + if index_type == "baseline": + variant = params.get("variant", "float") + oversample = params.get("oversample", 8) + oversample_k = k * oversample + + if variant == "int8": + return ( + f"WITH coarse AS (" + f" SELECT id, embedding FROM vec_items" + f" WHERE embedding_int8 MATCH vec_quantize_int8({qvec}, 'unit')" + f" LIMIT {oversample_k}" + f") " + f"SELECT id, vec_distance_cosine(embedding, {qvec}) as distance " + f"FROM coarse ORDER BY 2 LIMIT {k};" + ) + elif variant == "bit": + return ( + f"WITH coarse AS (" + f" SELECT id, embedding FROM vec_items" + f" WHERE embedding_bq MATCH vec_quantize_binary({qvec})" + f" LIMIT {oversample_k}" + f") " + f"SELECT id, vec_distance_cosine(embedding, {qvec}) as distance " + f"FROM coarse ORDER BY 2 LIMIT {k};" + ) + + # Default MATCH query (baseline-float, rescore, and others) + return ( + f"SELECT id, distance FROM vec_items" + f" 
def run_profile(sqlite3_path, db_path, sql_file, sample_output, duration=120):
    """Run sqlite3 under the macOS `sample` profiler.

    Starts sqlite3 directly with stdin fed from ``sql_file``, then immediately
    attaches `sample` to its PID with -mayDie (tolerates process exit).
    The workload must be long enough for sample to attach and capture useful
    data.

    Args:
        sqlite3_path: path of the sqlite3 CLI binary to execute.
        db_path: database file passed to sqlite3 as its only argument.
        sql_file: path of the SQL workload piped to sqlite3's stdin.
        sample_output: file that `sample` writes its report to (``-file``).
        duration: maximum sampling duration, passed to `sample` in seconds.

    Returns:
        True when sqlite3 exited with status 0, False otherwise (its stderr
        is echoed to this process's stderr in that case).

    Raises:
        OSError: if sqlite3 or `sample` cannot be started.
    """
    # `with` closes the workload file even when spawning a process raises.
    with open(sql_file, "r") as sql_fd:
        proc = subprocess.Popen(
            [sqlite3_path, db_path],
            stdin=sql_fd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )

        pid = proc.pid
        print(f" sqlite3 PID: {pid}")

        # Attach sample immediately (1ms interval, -mayDie tolerates process exit)
        try:
            sample_proc = subprocess.Popen(
                [
                    "sample",
                    str(pid),
                    str(duration),
                    "1",
                    "-mayDie",
                    "-file",
                    sample_output,
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
            )
        except OSError:
            # The profiler could not be started: do not leave the sqlite3
            # workload running unattended in the background.
            proc.kill()
            proc.communicate()
            raise

        # Wait for sqlite3 to finish
        _, stderr = proc.communicate()

    rc = proc.returncode
    if rc != 0:
        print(f" sqlite3 failed (rc={rc}):", file=sys.stderr)
        # errors="replace": diagnostics must not crash on non-UTF-8 output.
        print(f" {stderr.decode(errors='replace').strip()}", file=sys.stderr)
        sample_proc.kill()
        # Reap the killed profiler and close its pipe (avoids a zombie).
        sample_proc.communicate()
        return False

    # Wait for sample to finish. communicate() drains the stderr pipe while
    # waiting; a bare wait() can deadlock once the child fills the pipe.
    sample_proc.communicate()
    return True
def _parse_call_graph_lines(text):
    """Turn the call-graph text into ``(depth, count, symbol, module)`` tuples.

    Each input line first has its tree-drawing characters replaced by spaces
    (via ``_TREE_CHARS_RE``) so that the width of the leading whitespace can
    serve as the depth. Lines that do not then look like
    ``<whitespace><count> <rest>`` (``_LEADING_RE``) are ignored. The symbol
    and module are split out of ``<rest>`` by ``_extract_symbol_and_module``.
    """
    parsed = []
    for source_line in text.split("\n"):
        hit = _LEADING_RE.match(_TREE_CHARS_RE.sub(" ", source_line))
        if hit is None:
            # Headers, blank lines and anything without "<indent><count> ..."
            continue
        indent, samples, tail = hit.groups()
        symbol, module = _extract_symbol_and_module(tail)
        parsed.append((len(indent), int(samples), symbol, module))
    return parsed
def print_profile(title, self_samples, top_n=20):
    """Print the hottest functions of one profile to stdout.

    Args:
        title: heading text placed inside the ``=== ... ===`` banner.
        self_samples: mapping of display name -> self (exclusive) sample count.
        top_n: how many entries to list, highest count first.

    When the counts sum to zero only a "(no samples)" banner is printed.
    Otherwise each row shows the share of all self-samples as a percentage,
    the raw count, and the name.
    """
    grand_total = sum(self_samples.values())
    if grand_total != 0:
        # Descending by count; a stable sort keeps insertion order for ties.
        ranked = sorted(self_samples.items(), key=lambda kv: kv[1], reverse=True)

        print(f"\n=== {title} (top {top_n}, {grand_total} total self-samples) ===")
        for name, hits in ranked[:top_n]:
            share = 100.0 * hits / grand_total
            print(f" {share:5.1f}% {hits:>6} {name}")
        return

    print(f"\n=== {title} (no samples) ===")
configurations", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "configs", nargs="+", help="config specs (name:type=X,key=val,...)" + ) + parser.add_argument("--subset-size", type=int, required=True) + parser.add_argument("-k", type=int, default=10, help="KNN k (default 10)") + parser.add_argument( + "-n", type=int, default=50, help="number of distinct queries (default 50)" + ) + parser.add_argument( + "--repeats", + type=int, + default=10, + help="repeat query set N times for more samples (default 10)", + ) + parser.add_argument( + "--top", type=int, default=20, help="show top N functions (default 20)" + ) + parser.add_argument("--base-db", default=BASE_DB) + parser.add_argument("--sqlite3", default=SQLITE3_PATH) + parser.add_argument( + "--keep-temp", + action="store_true", + help="keep temp directory with DBs, SQL, and sample output", + ) + args = parser.parse_args() + + # Check prerequisites + if not os.path.exists(args.base_db): + print(f"Error: base DB not found at {args.base_db}", file=sys.stderr) + print("Run 'make seed' in benchmarks-ann/ first.", file=sys.stderr) + sys.exit(1) + + if not shutil.which("sample"): + print("Error: macOS 'sample' tool not found.", file=sys.stderr) + sys.exit(1) + + # Build CLI + print("Building dist/sqlite3...") + result = subprocess.run( + ["make", "cli"], cwd=_PROJECT_ROOT, capture_output=True, text=True + ) + if result.returncode != 0: + print(f"Error: make cli failed:\n{result.stderr}", file=sys.stderr) + sys.exit(1) + print(" done.") + + if not os.path.exists(args.sqlite3): + print(f"Error: sqlite3 not found at {args.sqlite3}", file=sys.stderr) + sys.exit(1) + + configs = [parse_config(c) for c in args.configs] + + tmpdir = tempfile.mkdtemp(prefix="sqlite-vec-profile-") + print(f"Working directory: {tmpdir}") + + all_profiles = [] + + for i, (name, params) in enumerate(configs, 1): + reg = INDEX_REGISTRY[params["index_type"]] + desc = reg["describe"](params) + 
print(f"\n[{i}/{len(configs)}] {name} ({desc})") + + # Generate SQL workload + db_path = os.path.join(tmpdir, f"{name}.db") + sql_text = generate_sql( + db_path, params, args.subset_size, args.n, args.k, args.repeats + ) + sql_file = os.path.join(tmpdir, f"{name}.sql") + with open(sql_file, "w") as f: + f.write(sql_text) + + total_queries = args.n * args.repeats + print( + f" SQL workload: {args.subset_size} inserts + " + f"{total_queries} queries ({args.n} x {args.repeats} repeats)" + ) + + # Profile + sample_file = os.path.join(tmpdir, f"{name}.sample.txt") + print(f" Profiling...") + ok = run_profile(args.sqlite3, db_path, sql_file, sample_file) + if not ok: + print(f" FAILED — skipping {name}") + all_profiles.append((name, desc, {})) + continue + + if not os.path.exists(sample_file): + print(f" Warning: sample output not created") + all_profiles.append((name, desc, {})) + continue + + # Parse + self_samples = parse_sample_output(sample_file) + all_profiles.append((name, desc, self_samples)) + + # Show individual profile + print_profile(f"{name} ({desc})", self_samples, args.top) + + # Side-by-side comparison if multiple configs + if len(all_profiles) > 1: + print("\n" + "=" * 80) + print("COMPARISON") + print("=" * 80) + + # Collect all symbols that appear in top-N of any config + all_syms = set() + for _name, _desc, prof in all_profiles: + sorted_syms = sorted(prof.items(), key=lambda x: -x[1]) + for sym, _count in sorted_syms[: args.top]: + all_syms.add(sym) + + # Build comparison table + rows = [] + for sym in all_syms: + row = [sym] + for _name, _desc, prof in all_profiles: + total = sum(prof.values()) + count = prof.get(sym, 0) + pct = 100.0 * count / total if total > 0 else 0.0 + row.append((pct, count)) + max_pct = max(r[0] for r in row[1:]) + rows.append((max_pct, row)) + + rows.sort(key=lambda x: -x[0]) + + # Header + header = f"{'function':>40}" + for name, desc, _ in all_profiles: + header += f" {name:>14}" + print(header) + print("-" * len(header)) 
+ + for _sort_key, row in rows[: args.top * 2]: + sym = row[0] + display_sym = sym if len(sym) <= 40 else sym[:37] + "..." + line = f"{display_sym:>40}" + for pct, count in row[1:]: + if count > 0: + line += f" {pct:>13.1f}%" + else: + line += f" {'-':>14}" + print(line) + + if args.keep_temp: + print(f"\nTemp files kept at: {tmpdir}") + else: + shutil.rmtree(tmpdir) + print(f"\nTemp files cleaned up. Use --keep-temp to preserve.") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/schema.sql b/benchmarks-ann/schema.sql new file mode 100644 index 0000000..681df4e --- /dev/null +++ b/benchmarks-ann/schema.sql @@ -0,0 +1,35 @@ +-- Canonical results schema for vec0 KNN benchmark comparisons. +-- The index_type column is a free-form TEXT field. Baseline configs use +-- "baseline"; index-specific branches add their own types (registered +-- via INDEX_REGISTRY in bench.py). + +CREATE TABLE IF NOT EXISTS build_results ( + config_name TEXT NOT NULL, + index_type TEXT NOT NULL, + subset_size INTEGER NOT NULL, + db_path TEXT NOT NULL, + insert_time_s REAL NOT NULL, + train_time_s REAL, -- NULL when no training/build step is needed + total_time_s REAL NOT NULL, + rows INTEGER NOT NULL, + file_size_mb REAL NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + PRIMARY KEY (config_name, subset_size) +); + +CREATE TABLE IF NOT EXISTS bench_results ( + config_name TEXT NOT NULL, + index_type TEXT NOT NULL, + subset_size INTEGER NOT NULL, + k INTEGER NOT NULL, + n INTEGER NOT NULL, + mean_ms REAL NOT NULL, + median_ms REAL NOT NULL, + p99_ms REAL NOT NULL, + total_ms REAL NOT NULL, + qps REAL NOT NULL, + recall REAL NOT NULL, + db_path TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + PRIMARY KEY (config_name, subset_size, k) +); diff --git a/benchmarks-ann/seed/.gitignore b/benchmarks-ann/seed/.gitignore new file mode 100644 index 0000000..8efed50 --- /dev/null +++ b/benchmarks-ann/seed/.gitignore @@ -0,0 +1,2 @@ +*.parquet 
def float_list_to_blob(floats):
    """Serialize a sequence of floats as consecutive little-endian float32 values.

    The result is ``4 * len(floats)`` bytes long; an empty sequence yields
    ``b""``.
    """
    # "<" = little-endian without padding, "f" = 4-byte IEEE-754 float.
    layout = struct.Struct(f"<{len(floats)}f")
    return layout.pack(*floats)
Run 'make download' first.") + sys.exit(1) + + if os.path.exists(db_path): + os.remove(db_path) + + conn = sqlite3.connect(db_path) + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA page_size=4096") + + # --- query_vectors (from test.parquet) --- + print("Loading test.parquet (query vectors)...") + t0 = time.perf_counter() + df_test = pd.read_parquet(test_path) + conn.execute( + "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)" + ) + rows = [] + for _, row in df_test.iterrows(): + rows.append((int(row["id"]), float_list_to_blob(row["emb"]))) + conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows) + conn.commit() + print(f" {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s") + + # --- neighbors (from neighbors.parquet) --- + print("Loading neighbors.parquet...") + t0 = time.perf_counter() + df_neighbors = pd.read_parquet(neighbors_path) + conn.execute( + "CREATE TABLE neighbors (" + " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT," + " UNIQUE(query_vector_id, rank))" + ) + rows = [] + for _, row in df_neighbors.iterrows(): + qid = int(row["id"]) + # neighbors_id may be a numpy array or JSON string + nids = row["neighbors_id"] + if isinstance(nids, str): + nids = json.loads(nids) + for rank, nid in enumerate(nids): + rows.append((qid, rank, str(int(nid)))) + conn.executemany( + "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)", + rows, + ) + conn.commit() + print(f" {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s") + + # --- train (from train.parquet) --- + print("Loading train.parquet (1M vectors, this takes a few minutes)...") + t0 = time.perf_counter() + conn.execute( + "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)" + ) + + batch_size = 10000 + df_iter = pd.read_parquet(train_path) + total = len(df_iter) + + for start in range(0, total, batch_size): + chunk = df_iter.iloc[start : start + batch_size] + rows = [] + for _, row in 
chunk.iterrows(): + rows.append((int(row["id"]), float_list_to_blob(row["emb"]))) + conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows) + conn.commit() + + done = min(start + batch_size, total) + elapsed = time.perf_counter() - t0 + rate = done / elapsed if elapsed > 0 else 0 + eta = (total - done) / rate if rate > 0 else 0 + print( + f" {done:>8}/{total} {elapsed:.0f}s {rate:.0f} rows/s eta {eta:.0f}s", + flush=True, + ) + + elapsed = time.perf_counter() - t0 + print(f" {total} train vectors in {elapsed:.1f}s") + + conn.close() + size_mb = os.path.getsize(db_path) / (1024 * 1024) + print(f"\nDone: {db_path} ({size_mb:.0f} MB)") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/exhaustive-memory/bench.py b/benchmarks/exhaustive-memory/bench.py index c9da831..7c969d6 100644 --- a/benchmarks/exhaustive-memory/bench.py +++ b/benchmarks/exhaustive-memory/bench.py @@ -248,59 +248,6 @@ def bench_libsql(base, query, page_size, k) -> BenchResult: return BenchResult(f"libsql ({page_size})", build_time, times) -def register_np(db, array, name): - ptr = array.__array_interface__["data"][0] - nvectors, dimensions = array.__array_interface__["shape"] - element_type = array.__array_interface__["typestr"] - - assert element_type == " BenchResult: - print(f"sqlite-vec static...") - - db = sqlite3.connect(":memory:") - db.enable_load_extension(True) - db.load_extension("../../dist/vec0") - - - - t = time.time() - register_np(db, base, "base") - build_time = time.time() - t - - times = [] - results = [] - for ( - idx, - q, - ) in enumerate(query): - t0 = time.time() - result = db.execute( - """ - select - rowid - from base - where vector match ? - and k = ? 
- order by distance - """, - [q.tobytes(), k], - ).fetchall() - assert len(result) == k - times.append(time.time() - t0) - return BenchResult(f"sqlite-vec static", build_time, times) - def bench_faiss(base, query, k) -> BenchResult: import faiss dimensions = base.shape[1] @@ -438,8 +385,6 @@ def suite(name, base, query, k, benchmarks): for b in benchmarks: if b == "faiss": results.append(bench_faiss(base, query, k=k)) - elif b == "vec-static": - results.append(bench_sqlite_vec_static(base, query, k=k)) elif b.startswith("vec-scalar"): _, page_size = b.split('.') results.append(bench_sqlite_vec_scalar(base, query, page_size, k=k)) @@ -541,7 +486,7 @@ def parse_args(): help="Number of queries to use. Defaults all", ) parser.add_argument( - "-x", help="type of runs to make", default="faiss,vec-scalar.4096,vec-static,vec-vec0.4096.16,usearch,duckdb,hnswlib,numpy" + "-x", help="type of runs to make", default="faiss,vec-scalar.4096,vec-vec0.4096.16,usearch,duckdb,hnswlib,numpy" ) args = parser.parse_args() diff --git a/benchmarks/profiling/build-from-npy.sql b/benchmarks/profiling/build-from-npy.sql index 134df70..92ef59c 100644 --- a/benchmarks/profiling/build-from-npy.sql +++ b/benchmarks/profiling/build-from-npy.sql @@ -8,10 +8,3 @@ create virtual table vec_items using vec0( embedding float[1536] ); --- 65s (limit 1e5), ~615MB on disk -insert into vec_items - select - rowid, - vector - from vec_npy_each(vec_npy_file('examples/dbpedia-openai/data/vectors.npy')) - limit 1e5; diff --git a/benchmarks/self-params/build.py b/benchmarks/self-params/build.py index bc6e388..c5d9fc1 100644 --- a/benchmarks/self-params/build.py +++ b/benchmarks/self-params/build.py @@ -6,7 +6,6 @@ def connect(path): db = sqlite3.connect(path) db.enable_load_extension(True) db.load_extension("../dist/vec0") - db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')") db.enable_load_extension(False) return db @@ -18,8 +17,6 @@ page_sizes = [ # 4096, 8192, chunk_sizes = [128, 
256, 1024, 2048] types = ["f32", "int8", "bit"] -SRC = "../examples/dbpedia-openai/data/vectors.npy" - for page_size in page_sizes: for chunk_size in chunk_sizes: for t in types: @@ -42,15 +39,8 @@ for page_size in page_sizes: func = "vec_quantize_i8(vector, 'unit')" if t == "bit": func = "vec_quantize_binary(vector)" - db.execute( - f""" - insert into vec_items - select rowid, {func} - from vec_npy_each(vec_npy_file(?)) - limit 100000 - """, - [SRC], - ) + # TODO: replace with non-npy data loading + pass elapsed = time.time() - t0 print(elapsed) diff --git a/bindings/go/ncruces/go-sqlite3.patch b/bindings/go/ncruces/go-sqlite3.patch index f202bc3..03bead9 100644 --- a/bindings/go/ncruces/go-sqlite3.patch +++ b/bindings/go/ncruces/go-sqlite3.patch @@ -6,7 +6,6 @@ index ed2aaec..4cc0b0e 100755 -Wl,--initial-memory=327680 \ -D_HAVE_SQLITE_CONFIG_H \ -DSQLITE_CUSTOM_INCLUDE=sqlite_opt.h \ -+ -DSQLITE_VEC_OMIT_FS=1 \ $(awk '{print "-Wl,--export="$0}' exports.txt) "$BINARYEN/wasm-ctor-eval" -g -c _initialize sqlite3.wasm -o sqlite3.tmp diff --git a/bindings/python/extra_init.py b/bindings/python/extra_init.py index 267bc41..4408855 100644 --- a/bindings/python/extra_init.py +++ b/bindings/python/extra_init.py @@ -1,6 +1,5 @@ from typing import List from struct import pack -from sqlite3 import Connection def serialize_float32(vector: List[float]) -> bytes: @@ -13,33 +12,3 @@ def serialize_int8(vector: List[int]) -> bytes: return pack("%sb" % len(vector), *vector) -try: - import numpy.typing as npt - - def register_numpy(db: Connection, name: str, array: npt.NDArray): - """ayoo""" - - ptr = array.__array_interface__["data"][0] - nvectors, dimensions = array.__array_interface__["shape"] - element_type = array.__array_interface__["typestr"] - - assert element_type == " dist/sqlite-vec.c +""" + +import re +import sys +import os + + +def strip_lsp_block(content): + """Remove the LSP-support pattern: + #ifndef SQLITE_VEC_H + #include "sqlite-vec.c" // ... 
def strip_include_guard(content, guard_macro):
    """Remove the include guard that wraps ``content``.

    The guard consists of an opening pair

        #ifndef GUARD_MACRO
        #define GUARD_MACRO

    and the last ``#endif`` in the text, which is taken to be its closer.

    Args:
        content: source text of the file being inlined.
        guard_macro: exact macro name used by the guard.

    Returns:
        The text without the guard, ending in exactly one newline. If the
        opening pair is not present, ``content`` is returned unchanged, so
        an ``#endif`` belonging to some other conditional is never removed.
    """
    guard = re.escape(guard_macro)

    # Strip the #ifndef / #define pair (first occurrence only).
    opener = re.compile(
        r'^\s*#ifndef\s+' + guard + r'\s*\n'
        r'\s*#define\s+' + guard + r'\s*\n',
        re.MULTILINE,
    )
    content, removed = opener.subn('', content, count=1)
    if not removed:
        # No guard was opened, so there is no closing #endif to drop either;
        # removing one anyway would unbalance the preprocessor conditionals.
        return content

    # Strip the trailing #endif: walk backwards and drop the last one found.
    lines = content.rstrip('\n').split('\n')
    for idx in reversed(range(len(lines))):
        if re.match(r'^\s*#endif', lines[idx]):
            del lines[idx]
            break

    return '\n'.join(lines) + '\n'
def inline_include(match, base_dir):
    """Expand one ``#include "sqlite-vec-*.c"`` regex match into file contents.

    Args:
        match: regex match whose group 1 is the included file name and whose
            group 0 is the complete ``#include`` line.
        base_dir: directory the included file name is resolved against.

    Returns:
        The included file's text (LSP-support block and include guard
        removed) framed by "Begin inlined" / "End inlined" banner comments.
        If the file does not exist, a warning is written to stderr and the
        original ``#include`` text is returned untouched.
    """
    included_name = match.group(1)
    included_path = os.path.join(base_dir, included_name)

    if os.path.exists(included_path):
        with open(included_path, 'r') as handle:
            body = handle.read()

        # Strip LSP-support block, then the include guard if there is one
        body = strip_lsp_block(body)
        guard_macro = detect_include_guard(body)
        if guard_macro:
            body = strip_include_guard(body, guard_macro)

        rule = '/' * 78
        return ''.join(
            [
                f'\n{rule}\n// Begin inlined: {included_name}\n{rule}\n\n',
                body.strip('\n'),
                f'\n{rule}\n// End inlined: {included_name}\n{rule}\n',
            ]
        )

    print(f"Warning: {included_path} not found, leaving #include in place", file=sys.stderr)
    return match.group(0)
- -### `vec_npy_each(vector)` {#vec_npy_each} - -xxx - - -```sql --- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone() -select - rowid, - vector, - vec_type(vector), - vec_to_json(vector) -from vec_npy_each( - X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040' -) -/* -┌───────┬─────────────┬──────────────────┬─────────────────────┐ -│ rowid │ vector │ vec_type(vector) │ vec_to_json(vector) │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 0 │ X'0000803F' │ 'float32' │ '[1.000000]' │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 1 │ X'00000040' │ 'float32' │ '[2.000000]' │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 2 │ X'00004040' │ 'float32' │ '[3.000000]' │ -└───────┴─────────────┴──────────────────┴─────────────────────┘ - -*/ - - --- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone() -select - rowid, - vector, - vec_type(vector), - vec_to_json(vector) -from vec_npy_each( - X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040' -) -/* -┌───────┬─────────────┬──────────────────┬─────────────────────┐ -│ rowid │ vector │ vec_type(vector) │ vec_to_json(vector) │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 0 │ X'0000803F' │ 'float32' │ '[1.000000]' │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 1 │ X'00000040' │ 'float32' │ '[2.000000]' │ 
/*
 * NEON implementation of cosine distance between two float32 vectors.
 *
 *   pVect1v, pVect2v  the two vectors (read as f32 arrays)
 *   qty_ptr           points to a size_t holding the element count
 *
 * Returns 1 - dot(a, b) / (|a| * |b|).
 *
 * The main loop consumes 16 floats per iteration (four 4-lane loads per
 * vector), accumulating the dot product and both squared magnitudes in four
 * independent accumulators each. Leftover elements (count % 16) are handled
 * by a scalar loop. The final division is not guarded against a zero
 * magnitude.
 */
static f32 cosine_float_neon(const void *pVect1v, const void *pVect2v,
                             const void *qty_ptr) {
  f32 *pVect1 = (f32 *)pVect1v;
  f32 *pVect2 = (f32 *)pVect2v;
  size_t qty = *((size_t *)qty_ptr);
  size_t qty16 = qty >> 4; /* number of full 16-float blocks */
  const f32 *pEnd1 = pVect1 + (qty16 << 4); /* end of the vectorized part */

  /* Four accumulators per quantity: dot product, |a|^2 and |b|^2. */
  float32x4_t dot0 = vdupq_n_f32(0), dot1 = vdupq_n_f32(0);
  float32x4_t dot2 = vdupq_n_f32(0), dot3 = vdupq_n_f32(0);
  float32x4_t amag0 = vdupq_n_f32(0), amag1 = vdupq_n_f32(0);
  float32x4_t amag2 = vdupq_n_f32(0), amag3 = vdupq_n_f32(0);
  float32x4_t bmag0 = vdupq_n_f32(0), bmag1 = vdupq_n_f32(0);
  float32x4_t bmag2 = vdupq_n_f32(0), bmag3 = vdupq_n_f32(0);

  while (pVect1 < pEnd1) {
    float32x4_t v1, v2;
    /* lanes 0..3 of this block -> accumulator set 0 */
    v1 = vld1q_f32(pVect1); pVect1 += 4;
    v2 = vld1q_f32(pVect2); pVect2 += 4;
    dot0 = vfmaq_f32(dot0, v1, v2);
    amag0 = vfmaq_f32(amag0, v1, v1);
    bmag0 = vfmaq_f32(bmag0, v2, v2);

    /* lanes 4..7 -> accumulator set 1 */
    v1 = vld1q_f32(pVect1); pVect1 += 4;
    v2 = vld1q_f32(pVect2); pVect2 += 4;
    dot1 = vfmaq_f32(dot1, v1, v2);
    amag1 = vfmaq_f32(amag1, v1, v1);
    bmag1 = vfmaq_f32(bmag1, v2, v2);

    /* lanes 8..11 -> accumulator set 2 */
    v1 = vld1q_f32(pVect1); pVect1 += 4;
    v2 = vld1q_f32(pVect2); pVect2 += 4;
    dot2 = vfmaq_f32(dot2, v1, v2);
    amag2 = vfmaq_f32(amag2, v1, v1);
    bmag2 = vfmaq_f32(bmag2, v2, v2);

    /* lanes 12..15 -> accumulator set 3 */
    v1 = vld1q_f32(pVect1); pVect1 += 4;
    v2 = vld1q_f32(pVect2); pVect2 += 4;
    dot3 = vfmaq_f32(dot3, v1, v2);
    amag3 = vfmaq_f32(amag3, v1, v1);
    bmag3 = vfmaq_f32(bmag3, v2, v2);
  }

  /* Combine the four accumulators, then sum across lanes to a scalar. */
  f32 dot_s = vaddvq_f32(vaddq_f32(vaddq_f32(dot0, dot1), vaddq_f32(dot2, dot3)));
  f32 amag_s = vaddvq_f32(vaddq_f32(vaddq_f32(amag0, amag1), vaddq_f32(amag2, amag3)));
  f32 bmag_s = vaddvq_f32(vaddq_f32(vaddq_f32(bmag0, bmag1), vaddq_f32(bmag2, bmag3)));

  /* Scalar tail: the remaining (qty % 16) elements. */
  const f32 *pEnd2 = pVect1 + (qty - (qty16 << 4));
  while (pVect1 < pEnd2) {
    dot_s += *pVect1 * *pVect2;
    amag_s += *pVect1 * *pVect1;
    bmag_s += *pVect2 * *pVect2;
    pVect1++; pVect2++;
  }

  return 1.0f - (dot_s / (sqrtf(amag_s) * sqrtf(bmag_s)));
}
*)pB; size_t d = *((size_t *)pD); @@ -497,6 +558,125 @@ static f32 distance_cosine_int8(const void *pA, const void *pB, return 1 - (dot / (sqrt(aMag) * sqrt(bMag))); } +#ifdef SQLITE_VEC_ENABLE_NEON +static f32 cosine_int8_neon(const void *pA, const void *pB, const void *pD) { + const i8 *a = (const i8 *)pA; + const i8 *b = (const i8 *)pB; + size_t d = *((const size_t *)pD); + const i8 *aEnd = a + d; + + int32x4_t dot_acc1 = vdupq_n_s32(0); + int32x4_t dot_acc2 = vdupq_n_s32(0); + int32x4_t aMag_acc1 = vdupq_n_s32(0); + int32x4_t aMag_acc2 = vdupq_n_s32(0); + int32x4_t bMag_acc1 = vdupq_n_s32(0); + int32x4_t bMag_acc2 = vdupq_n_s32(0); + + while (a < aEnd - 31) { + int8x16_t va1 = vld1q_s8(a); + int8x16_t vb1 = vld1q_s8(b); + int16x8_t a1_lo = vmovl_s8(vget_low_s8(va1)); + int16x8_t a1_hi = vmovl_s8(vget_high_s8(va1)); + int16x8_t b1_lo = vmovl_s8(vget_low_s8(vb1)); + int16x8_t b1_hi = vmovl_s8(vget_high_s8(vb1)); + + dot_acc1 = vmlal_s16(dot_acc1, vget_low_s16(a1_lo), vget_low_s16(b1_lo)); + dot_acc1 = vmlal_s16(dot_acc1, vget_high_s16(a1_lo), vget_high_s16(b1_lo)); + dot_acc2 = vmlal_s16(dot_acc2, vget_low_s16(a1_hi), vget_low_s16(b1_hi)); + dot_acc2 = vmlal_s16(dot_acc2, vget_high_s16(a1_hi), vget_high_s16(b1_hi)); + + aMag_acc1 = vmlal_s16(aMag_acc1, vget_low_s16(a1_lo), vget_low_s16(a1_lo)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_high_s16(a1_lo), vget_high_s16(a1_lo)); + aMag_acc2 = vmlal_s16(aMag_acc2, vget_low_s16(a1_hi), vget_low_s16(a1_hi)); + aMag_acc2 = vmlal_s16(aMag_acc2, vget_high_s16(a1_hi), vget_high_s16(a1_hi)); + + bMag_acc1 = vmlal_s16(bMag_acc1, vget_low_s16(b1_lo), vget_low_s16(b1_lo)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_high_s16(b1_lo), vget_high_s16(b1_lo)); + bMag_acc2 = vmlal_s16(bMag_acc2, vget_low_s16(b1_hi), vget_low_s16(b1_hi)); + bMag_acc2 = vmlal_s16(bMag_acc2, vget_high_s16(b1_hi), vget_high_s16(b1_hi)); + + int8x16_t va2 = vld1q_s8(a + 16); + int8x16_t vb2 = vld1q_s8(b + 16); + int16x8_t a2_lo = vmovl_s8(vget_low_s8(va2)); + 
int16x8_t a2_hi = vmovl_s8(vget_high_s8(va2)); + int16x8_t b2_lo = vmovl_s8(vget_low_s8(vb2)); + int16x8_t b2_hi = vmovl_s8(vget_high_s8(vb2)); + + dot_acc1 = vmlal_s16(dot_acc1, vget_low_s16(a2_lo), vget_low_s16(b2_lo)); + dot_acc1 = vmlal_s16(dot_acc1, vget_high_s16(a2_lo), vget_high_s16(b2_lo)); + dot_acc2 = vmlal_s16(dot_acc2, vget_low_s16(a2_hi), vget_low_s16(b2_hi)); + dot_acc2 = vmlal_s16(dot_acc2, vget_high_s16(a2_hi), vget_high_s16(b2_hi)); + + aMag_acc1 = vmlal_s16(aMag_acc1, vget_low_s16(a2_lo), vget_low_s16(a2_lo)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_high_s16(a2_lo), vget_high_s16(a2_lo)); + aMag_acc2 = vmlal_s16(aMag_acc2, vget_low_s16(a2_hi), vget_low_s16(a2_hi)); + aMag_acc2 = vmlal_s16(aMag_acc2, vget_high_s16(a2_hi), vget_high_s16(a2_hi)); + + bMag_acc1 = vmlal_s16(bMag_acc1, vget_low_s16(b2_lo), vget_low_s16(b2_lo)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_high_s16(b2_lo), vget_high_s16(b2_lo)); + bMag_acc2 = vmlal_s16(bMag_acc2, vget_low_s16(b2_hi), vget_low_s16(b2_hi)); + bMag_acc2 = vmlal_s16(bMag_acc2, vget_high_s16(b2_hi), vget_high_s16(b2_hi)); + + a += 32; + b += 32; + } + + while (a < aEnd - 15) { + int8x16_t va = vld1q_s8(a); + int8x16_t vb = vld1q_s8(b); + int16x8_t a_lo = vmovl_s8(vget_low_s8(va)); + int16x8_t a_hi = vmovl_s8(vget_high_s8(va)); + int16x8_t b_lo = vmovl_s8(vget_low_s8(vb)); + int16x8_t b_hi = vmovl_s8(vget_high_s8(vb)); + + dot_acc1 = vmlal_s16(dot_acc1, vget_low_s16(a_lo), vget_low_s16(b_lo)); + dot_acc1 = vmlal_s16(dot_acc1, vget_high_s16(a_lo), vget_high_s16(b_lo)); + dot_acc1 = vmlal_s16(dot_acc1, vget_low_s16(a_hi), vget_low_s16(b_hi)); + dot_acc1 = vmlal_s16(dot_acc1, vget_high_s16(a_hi), vget_high_s16(b_hi)); + + aMag_acc1 = vmlal_s16(aMag_acc1, vget_low_s16(a_lo), vget_low_s16(a_lo)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_high_s16(a_lo), vget_high_s16(a_lo)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_low_s16(a_hi), vget_low_s16(a_hi)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_high_s16(a_hi), 
vget_high_s16(a_hi)); + + bMag_acc1 = vmlal_s16(bMag_acc1, vget_low_s16(b_lo), vget_low_s16(b_lo)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_high_s16(b_lo), vget_high_s16(b_lo)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_low_s16(b_hi), vget_low_s16(b_hi)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_high_s16(b_hi), vget_high_s16(b_hi)); + + a += 16; + b += 16; + } + + int32x4_t dot_sum = vaddq_s32(dot_acc1, dot_acc2); + int32x4_t aMag_sum = vaddq_s32(aMag_acc1, aMag_acc2); + int32x4_t bMag_sum = vaddq_s32(bMag_acc1, bMag_acc2); + + i32 dot = vaddvq_s32(dot_sum); + i32 aMag = vaddvq_s32(aMag_sum); + i32 bMag = vaddvq_s32(bMag_sum); + + while (a < aEnd) { + dot += (i32)*a * (i32)*b; + aMag += (i32)*a * (i32)*a; + bMag += (i32)*b * (i32)*b; + a++; + b++; + } + + return 1.0f - ((f32)dot / (sqrtf((f32)aMag) * sqrtf((f32)bMag))); +} +#endif + +static f32 distance_cosine_int8(const void *a, const void *b, const void *d) { +#ifdef SQLITE_VEC_ENABLE_NEON + if ((*(const size_t *)d) > 15) { + return cosine_int8_neon(a, b, d); + } +#endif + return cosine_int8(a, b, d); +} + // https://github.com/facebookresearch/faiss/blob/77e2e79cd0a680adc343b9840dd865da724c579e/faiss/utils/hamming_distance/common.h#L34 static u8 hamdist_table[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, @@ -511,6 +691,59 @@ static u8 hamdist_table[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; +#ifdef SQLITE_VEC_ENABLE_NEON +static f32 distance_hamming_neon(const u8 *a, const u8 *b, size_t n_bytes) { + const u8 *pEnd = a + n_bytes; + + uint32x4_t acc1 = vdupq_n_u32(0); + uint32x4_t acc2 = vdupq_n_u32(0); + uint32x4_t acc3 = vdupq_n_u32(0); + uint32x4_t acc4 = vdupq_n_u32(0); + + while (a <= pEnd - 64) { + uint8x16_t v1 = vld1q_u8(a); + uint8x16_t v2 = vld1q_u8(b); + acc1 = vaddq_u32(acc1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + + v1 = vld1q_u8(a + 16); + v2 = vld1q_u8(b + 16); + 
acc2 = vaddq_u32(acc2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + + v1 = vld1q_u8(a + 32); + v2 = vld1q_u8(b + 32); + acc3 = vaddq_u32(acc3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + + v1 = vld1q_u8(a + 48); + v2 = vld1q_u8(b + 48); + acc4 = vaddq_u32(acc4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + + a += 64; + b += 64; + } + + while (a <= pEnd - 16) { + uint8x16_t v1 = vld1q_u8(a); + uint8x16_t v2 = vld1q_u8(b); + acc1 = vaddq_u32(acc1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + a += 16; + b += 16; + } + + acc1 = vaddq_u32(acc1, acc2); + acc3 = vaddq_u32(acc3, acc4); + acc1 = vaddq_u32(acc1, acc3); + u32 sum = vaddvq_u32(acc1); + + while (a < pEnd) { + sum += hamdist_table[*a ^ *b]; + a++; + b++; + } + + return (f32)sum; +} +#endif + static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) { int same = 0; for (unsigned long i = 0; i < n; i++) { @@ -555,11 +788,18 @@ static f32 distance_hamming_u64(u64 *a, u64 *b, size_t n) { */ static f32 distance_hamming(const void *a, const void *b, const void *d) { size_t dimensions = *((size_t *)d); + size_t n_bytes = dimensions / CHAR_BIT; + +#ifdef SQLITE_VEC_ENABLE_NEON + if (dimensions >= 128) { + return distance_hamming_neon((const u8 *)a, (const u8 *)b, n_bytes); + } +#endif if ((dimensions % 64) == 0) { - return distance_hamming_u64((u64 *)a, (u64 *)b, dimensions / 8 / CHAR_BIT); + return distance_hamming_u64((u64 *)a, (u64 *)b, n_bytes / sizeof(u64)); } - return distance_hamming_u8((u8 *)a, (u8 *)b, dimensions / CHAR_BIT); + return distance_hamming_u8((u8 *)a, (u8 *)b, n_bytes); } #ifdef SQLITE_VEC_TEST @@ -1065,33 +1305,6 @@ int ensure_vector_match(sqlite3_value *aValue, sqlite3_value *bValue, void **a, int _cmp(const void *a, const void *b) { return (*(i64 *)a - *(i64 *)b); } -struct VecNpyFile { - char *path; - size_t pathLength; -}; -#define SQLITE_VEC_NPY_FILE_NAME "vec0-npy-file" - -#ifndef SQLITE_VEC_OMIT_FS -static void vec_npy_file(sqlite3_context *context, 
int argc, - sqlite3_value **argv) { - assert(argc == 1); - char *path = (char *)sqlite3_value_text(argv[0]); - size_t pathLength = sqlite3_value_bytes(argv[0]); - struct VecNpyFile *f; - - f = sqlite3_malloc(sizeof(*f)); - if (!f) { - sqlite3_result_error_nomem(context); - return; - } - memset(f, 0, sizeof(*f)); - - f->path = path; - f->pathLength = pathLength; - sqlite3_result_pointer(context, f, SQLITE_VEC_NPY_FILE_NAME, sqlite3_free); -} -#endif - #pragma region scalar functions static void vec_f32(sqlite3_context *context, int argc, sqlite3_value **argv) { assert(argc == 1); @@ -2281,12 +2494,53 @@ enum Vec0DistanceMetrics { VEC0_DISTANCE_METRIC_L1 = 3, }; +/** + * Compute distance between two full-precision vectors using the appropriate + * distance function for the given element type and metric. + * Shared utility used by ANN index implementations. + */ +static f32 vec0_distance_full( + const void *a, const void *b, size_t dimensions, + enum VectorElementType elementType, + enum Vec0DistanceMetrics metric) { + switch (elementType) { + case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: + switch (metric) { + case VEC0_DISTANCE_METRIC_L2: + return distance_l2_sqr_float(a, b, &dimensions); + case VEC0_DISTANCE_METRIC_COSINE: + return distance_cosine_float(a, b, &dimensions); + case VEC0_DISTANCE_METRIC_L1: + return (f32)distance_l1_f32(a, b, &dimensions); + } + break; + case SQLITE_VEC_ELEMENT_TYPE_INT8: + switch (metric) { + case VEC0_DISTANCE_METRIC_L2: + return distance_l2_sqr_int8(a, b, &dimensions); + case VEC0_DISTANCE_METRIC_COSINE: + return distance_cosine_int8(a, b, &dimensions); + case VEC0_DISTANCE_METRIC_L1: + return (f32)distance_l1_int8(a, b, &dimensions); + } + break; + case SQLITE_VEC_ELEMENT_TYPE_BIT: + return distance_hamming(a, b, &dimensions); + } + return 0.0f; +} + +enum Vec0IndexType { + VEC0_INDEX_TYPE_FLAT = 1, +}; + struct VectorColumnDefinition { char *name; int name_length; size_t dimensions; enum VectorElementType element_type; enum 
Vec0DistanceMetrics distance_metric; + enum Vec0IndexType index_type; }; struct Vec0PartitionColumnDefinition { @@ -2346,6 +2600,7 @@ int vec0_parse_vector_column(const char *source, int source_length, int nameLength; enum VectorElementType elementType; enum Vec0DistanceMetrics distanceMetric = VEC0_DISTANCE_METRIC_L2; + enum Vec0IndexType indexType = VEC0_INDEX_TYPE_FLAT; int dimensions; vec0_scanner_init(&scanner, source, source_length); @@ -2449,6 +2704,40 @@ int vec0_parse_vector_column(const char *source, int source_length, return SQLITE_ERROR; } } + else if (sqlite3_strnicmp(key, "indexed", keyLength) == 0) { + // expect "by" + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || + token.token_type != TOKEN_TYPE_IDENTIFIER || + sqlite3_strnicmp(token.start, "by", token.end - token.start) != 0) { + return SQLITE_ERROR; + } + // expect index type name + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || + token.token_type != TOKEN_TYPE_IDENTIFIER) { + return SQLITE_ERROR; + } + int indexNameLen = token.end - token.start; + if (sqlite3_strnicmp(token.start, "flat", indexNameLen) == 0) { + indexType = VEC0_INDEX_TYPE_FLAT; + // expect '(' + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || + token.token_type != TOKEN_TYPE_LPAREN) { + return SQLITE_ERROR; + } + // expect ')' + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || + token.token_type != TOKEN_TYPE_RPAREN) { + return SQLITE_ERROR; + } + } else { + // unknown index type + return SQLITE_ERROR; + } + } // unknown key else { return SQLITE_ERROR; @@ -2463,6 +2752,7 @@ int vec0_parse_vector_column(const char *source, int source_length, outColumn->distance_metric = distanceMetric; outColumn->element_type = elementType; outColumn->dimensions = dimensions; + outColumn->index_type = indexType; return SQLITE_OK; } @@ -2660,758 +2950,6 @@ static sqlite3_module vec_eachModule = { #pragma endregion 
-#pragma region vec_npy_each table function - -enum NpyTokenType { - NPY_TOKEN_TYPE_IDENTIFIER, - NPY_TOKEN_TYPE_NUMBER, - NPY_TOKEN_TYPE_LPAREN, - NPY_TOKEN_TYPE_RPAREN, - NPY_TOKEN_TYPE_LBRACE, - NPY_TOKEN_TYPE_RBRACE, - NPY_TOKEN_TYPE_COLON, - NPY_TOKEN_TYPE_COMMA, - NPY_TOKEN_TYPE_STRING, - NPY_TOKEN_TYPE_FALSE, -}; - -struct NpyToken { - enum NpyTokenType token_type; - unsigned char *start; - unsigned char *end; -}; - -int npy_token_next(unsigned char *start, unsigned char *end, - struct NpyToken *out) { - unsigned char *ptr = start; - while (ptr < end) { - unsigned char curr = *ptr; - if (is_whitespace(curr)) { - ptr++; - continue; - } else if (curr == '(') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_LPAREN; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == ')') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_RPAREN; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == '{') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_LBRACE; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == '}') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_RBRACE; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == ':') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_COLON; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == ',') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_COMMA; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == '\'') { - unsigned char *start = ptr; - ptr++; - while (ptr < end) { - if ((*ptr) == '\'') { - break; - } - ptr++; - } - if (ptr >= end || (*ptr) != '\'') { - return VEC0_TOKEN_RESULT_ERROR; - } - out->start = start; - out->end = ++ptr; - out->token_type = NPY_TOKEN_TYPE_STRING; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == 'F' && - strncmp((char *)ptr, "False", strlen("False")) == 0) { - out->start = ptr; - out->end = (ptr + (int)strlen("False")); - ptr = 
out->end; - out->token_type = NPY_TOKEN_TYPE_FALSE; - return VEC0_TOKEN_RESULT_SOME; - } else if (is_digit(curr)) { - unsigned char *start = ptr; - while (ptr < end && (is_digit(*ptr))) { - ptr++; - } - out->start = start; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_NUMBER; - return VEC0_TOKEN_RESULT_SOME; - } else { - return VEC0_TOKEN_RESULT_ERROR; - } - } - return VEC0_TOKEN_RESULT_ERROR; -} - -struct NpyScanner { - unsigned char *start; - unsigned char *end; - unsigned char *ptr; -}; - -void npy_scanner_init(struct NpyScanner *scanner, const unsigned char *source, - int source_length) { - scanner->start = (unsigned char *)source; - scanner->end = (unsigned char *)source + source_length; - scanner->ptr = (unsigned char *)source; -} - -int npy_scanner_next(struct NpyScanner *scanner, struct NpyToken *out) { - int rc = npy_token_next(scanner->start, scanner->end, out); - if (rc == VEC0_TOKEN_RESULT_SOME) { - scanner->start = out->end; - } - return rc; -} - -#define NPY_PARSE_ERROR "Error parsing numpy array: " -int parse_npy_header(sqlite3_vtab *pVTab, const unsigned char *header, - size_t headerLength, - enum VectorElementType *out_element_type, - int *fortran_order, size_t *numElements, - size_t *numDimensions) { - - struct NpyScanner scanner; - struct NpyToken token; - int rc; - npy_scanner_init(&scanner, header, headerLength); - - if (npy_scanner_next(&scanner, &token) != VEC0_TOKEN_RESULT_SOME && - token.token_type != NPY_TOKEN_TYPE_LBRACE) { - vtab_set_error(pVTab, - NPY_PARSE_ERROR "numpy header did not start with '{'"); - return SQLITE_ERROR; - } - while (1) { - rc = npy_scanner_next(&scanner, &token); - if (rc != VEC0_TOKEN_RESULT_SOME) { - vtab_set_error(pVTab, NPY_PARSE_ERROR "expected key in numpy header"); - return SQLITE_ERROR; - } - - if (token.token_type == NPY_TOKEN_TYPE_RBRACE) { - break; - } - if (token.token_type != NPY_TOKEN_TYPE_STRING) { - vtab_set_error(pVTab, NPY_PARSE_ERROR - "expected a string as key in numpy header"); - return 
SQLITE_ERROR; - } - unsigned char *key = token.start; - - rc = npy_scanner_next(&scanner, &token); - if ((rc != VEC0_TOKEN_RESULT_SOME) || - (token.token_type != NPY_TOKEN_TYPE_COLON)) { - vtab_set_error(pVTab, NPY_PARSE_ERROR - "expected a ':' after key in numpy header"); - return SQLITE_ERROR; - } - - if (strncmp((char *)key, "'descr'", strlen("'descr'")) == 0) { - rc = npy_scanner_next(&scanner, &token); - if ((rc != VEC0_TOKEN_RESULT_SOME) || - (token.token_type != NPY_TOKEN_TYPE_STRING)) { - vtab_set_error(pVTab, NPY_PARSE_ERROR - "expected a string value after 'descr' key"); - return SQLITE_ERROR; - } - if (strncmp((char *)token.start, "'maxChunks = 1024; - pCur->chunksBufferSize = - (vector_byte_size(element_type, numDimensions)) * pCur->maxChunks; - pCur->chunksBuffer = sqlite3_malloc(pCur->chunksBufferSize); - if (pCur->chunksBufferSize && !pCur->chunksBuffer) { - return SQLITE_NOMEM; - } - - pCur->currentChunkSize = - fread(pCur->chunksBuffer, vector_byte_size(element_type, numDimensions), - pCur->maxChunks, file); - - pCur->currentChunkIndex = 0; - pCur->elementType = element_type; - pCur->nElements = numElements; - pCur->nDimensions = numDimensions; - pCur->input_type = VEC_NPY_EACH_INPUT_FILE; - - pCur->eof = pCur->currentChunkSize == 0; - pCur->file = file; - return SQLITE_OK; -} -#endif - -int parse_npy_buffer(sqlite3_vtab *pVTab, const unsigned char *buffer, - int bufferLength, void **data, size_t *numElements, - size_t *numDimensions, - enum VectorElementType *element_type) { - - if (bufferLength < 10) { - // IMP: V03312_20150 - vtab_set_error(pVTab, "numpy array too short"); - return SQLITE_ERROR; - } - if (memcmp(NPY_MAGIC, buffer, sizeof(NPY_MAGIC)) != 0) { - // V11954_28792 - vtab_set_error(pVTab, "numpy array does not contain the 'magic' header"); - return SQLITE_ERROR; - } - - u8 major = buffer[6]; - u8 minor = buffer[7]; - uint16_t headerLength = 0; - memcpy(&headerLength, &buffer[8], sizeof(uint16_t)); - - i32 totalHeaderLength = 
sizeof(NPY_MAGIC) + sizeof(major) + sizeof(minor) + - sizeof(headerLength) + headerLength; - i32 dataSize = bufferLength - totalHeaderLength; - - if (dataSize < 0) { - vtab_set_error(pVTab, "numpy array header length is invalid"); - return SQLITE_ERROR; - } - - const unsigned char *header = &buffer[10]; - int fortran_order; - - int rc = parse_npy_header(pVTab, header, headerLength, element_type, - &fortran_order, numElements, numDimensions); - if (rc != SQLITE_OK) { - return rc; - } - - i32 expectedDataSize = - (*numElements * vector_byte_size(*element_type, *numDimensions)); - if (expectedDataSize != dataSize) { - vtab_set_error(pVTab, - "numpy array error: Expected a data size of %d, found %d", - expectedDataSize, dataSize); - return SQLITE_ERROR; - } - - *data = (void *)&buffer[totalHeaderLength]; - return SQLITE_OK; -} - -static int vec_npy_eachConnect(sqlite3 *db, void *pAux, int argc, - const char *const *argv, sqlite3_vtab **ppVtab, - char **pzErr) { - UNUSED_PARAMETER(pAux); - UNUSED_PARAMETER(argc); - UNUSED_PARAMETER(argv); - UNUSED_PARAMETER(pzErr); - vec_npy_each_vtab *pNew; - int rc; - - rc = sqlite3_declare_vtab(db, "CREATE TABLE x(vector, input hidden)"); -#define VEC_NPY_EACH_COLUMN_VECTOR 0 -#define VEC_NPY_EACH_COLUMN_INPUT 1 - if (rc == SQLITE_OK) { - pNew = sqlite3_malloc(sizeof(*pNew)); - *ppVtab = (sqlite3_vtab *)pNew; - if (pNew == 0) - return SQLITE_NOMEM; - memset(pNew, 0, sizeof(*pNew)); - } - return rc; -} - -static int vec_npy_eachDisconnect(sqlite3_vtab *pVtab) { - vec_npy_each_vtab *p = (vec_npy_each_vtab *)pVtab; - sqlite3_free(p); - return SQLITE_OK; -} - -static int vec_npy_eachOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor) { - UNUSED_PARAMETER(p); - vec_npy_each_cursor *pCur; - pCur = sqlite3_malloc(sizeof(*pCur)); - if (pCur == 0) - return SQLITE_NOMEM; - memset(pCur, 0, sizeof(*pCur)); - *ppCursor = &pCur->base; - return SQLITE_OK; -} - -static int vec_npy_eachClose(sqlite3_vtab_cursor *cur) { - vec_npy_each_cursor *pCur = 
(vec_npy_each_cursor *)cur; -#ifndef SQLITE_VEC_OMIT_FS - if (pCur->file) { - fclose(pCur->file); - pCur->file = NULL; - } -#endif - if (pCur->chunksBuffer) { - sqlite3_free(pCur->chunksBuffer); - pCur->chunksBuffer = NULL; - } - if (pCur->vector) { - pCur->vector = NULL; - } - sqlite3_free(pCur); - return SQLITE_OK; -} - -static int vec_npy_eachBestIndex(sqlite3_vtab *pVTab, - sqlite3_index_info *pIdxInfo) { - int hasInput; - for (int i = 0; i < pIdxInfo->nConstraint; i++) { - const struct sqlite3_index_constraint *pCons = &pIdxInfo->aConstraint[i]; - // printf("i=%d iColumn=%d, op=%d, usable=%d\n", i, pCons->iColumn, - // pCons->op, pCons->usable); - switch (pCons->iColumn) { - case VEC_NPY_EACH_COLUMN_INPUT: { - if (pCons->op == SQLITE_INDEX_CONSTRAINT_EQ && pCons->usable) { - hasInput = 1; - pIdxInfo->aConstraintUsage[i].argvIndex = 1; - pIdxInfo->aConstraintUsage[i].omit = 1; - } - break; - } - } - } - if (!hasInput) { - pVTab->zErrMsg = sqlite3_mprintf("input argument is required"); - return SQLITE_ERROR; - } - - pIdxInfo->estimatedCost = (double)100000; - pIdxInfo->estimatedRows = 100000; - - return SQLITE_OK; -} - -static int vec_npy_eachFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum, - const char *idxStr, int argc, - sqlite3_value **argv) { - UNUSED_PARAMETER(idxNum); - UNUSED_PARAMETER(idxStr); - assert(argc == 1); - int rc; - - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)pVtabCursor; - -#ifndef SQLITE_VEC_OMIT_FS - if (pCur->file) { - fclose(pCur->file); - pCur->file = NULL; - } -#endif - if (pCur->chunksBuffer) { - sqlite3_free(pCur->chunksBuffer); - pCur->chunksBuffer = NULL; - } - if (pCur->vector) { - pCur->vector = NULL; - } - -#ifndef SQLITE_VEC_OMIT_FS - struct VecNpyFile *f = NULL; - if ((f = sqlite3_value_pointer(argv[0], SQLITE_VEC_NPY_FILE_NAME))) { - FILE *file = fopen(f->path, "r"); - if (!file) { - vtab_set_error(pVtabCursor->pVtab, "Could not open numpy file"); - return SQLITE_ERROR; - } - - rc = 
parse_npy_file(pVtabCursor->pVtab, file, pCur); - if (rc != SQLITE_OK) { -#ifndef SQLITE_VEC_OMIT_FS - fclose(file); -#endif - return rc; - } - - } else -#endif - { - - const unsigned char *input = sqlite3_value_blob(argv[0]); - int inputLength = sqlite3_value_bytes(argv[0]); - void *data; - size_t numElements; - size_t numDimensions; - enum VectorElementType element_type; - - rc = parse_npy_buffer(pVtabCursor->pVtab, input, inputLength, &data, - &numElements, &numDimensions, &element_type); - if (rc != SQLITE_OK) { - return rc; - } - - pCur->vector = data; - pCur->elementType = element_type; - pCur->nElements = numElements; - pCur->nDimensions = numDimensions; - pCur->input_type = VEC_NPY_EACH_INPUT_BUFFER; - } - - pCur->iRowid = 0; - return SQLITE_OK; -} - -static int vec_npy_eachRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid) { - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; - *pRowid = pCur->iRowid; - return SQLITE_OK; -} - -static int vec_npy_eachEof(sqlite3_vtab_cursor *cur) { - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; - if (pCur->input_type == VEC_NPY_EACH_INPUT_BUFFER) { - return (!pCur->nElements) || (size_t)pCur->iRowid >= pCur->nElements; - } - return pCur->eof; -} - -static int vec_npy_eachNext(sqlite3_vtab_cursor *cur) { - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; - pCur->iRowid++; - if (pCur->input_type == VEC_NPY_EACH_INPUT_BUFFER) { - return SQLITE_OK; - } - -#ifndef SQLITE_VEC_OMIT_FS - // else: input is a file - pCur->currentChunkIndex++; - if (pCur->currentChunkIndex >= pCur->currentChunkSize) { - pCur->currentChunkSize = - fread(pCur->chunksBuffer, - vector_byte_size(pCur->elementType, pCur->nDimensions), - pCur->maxChunks, pCur->file); - if (!pCur->currentChunkSize) { - pCur->eof = 1; - } - pCur->currentChunkIndex = 0; - } -#endif - return SQLITE_OK; -} - -static int vec_npy_eachColumnBuffer(vec_npy_each_cursor *pCur, - sqlite3_context *context, int i) { - switch (i) { - case 
VEC_NPY_EACH_COLUMN_VECTOR: { - sqlite3_result_subtype(context, pCur->elementType); - switch (pCur->elementType) { - case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: { - sqlite3_result_blob( - context, - &((unsigned char *) - pCur->vector)[pCur->iRowid * pCur->nDimensions * sizeof(f32)], - pCur->nDimensions * sizeof(f32), SQLITE_TRANSIENT); - - break; - } - case SQLITE_VEC_ELEMENT_TYPE_INT8: - case SQLITE_VEC_ELEMENT_TYPE_BIT: { - // https://github.com/asg017/sqlite-vec/issues/42 - sqlite3_result_error(context, - "vec_npy_each only supports float32 vectors", -1); - break; - } - } - - break; - } - } - return SQLITE_OK; -} -static int vec_npy_eachColumnFile(vec_npy_each_cursor *pCur, - sqlite3_context *context, int i) { - switch (i) { - case VEC_NPY_EACH_COLUMN_VECTOR: { - switch (pCur->elementType) { - case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: { - sqlite3_result_blob( - context, - &((unsigned char *) - pCur->chunksBuffer)[pCur->currentChunkIndex * - pCur->nDimensions * sizeof(f32)], - pCur->nDimensions * sizeof(f32), SQLITE_TRANSIENT); - break; - } - case SQLITE_VEC_ELEMENT_TYPE_INT8: - case SQLITE_VEC_ELEMENT_TYPE_BIT: { - // https://github.com/asg017/sqlite-vec/issues/42 - sqlite3_result_error(context, - "vec_npy_each only supports float32 vectors", -1); - break; - } - } - break; - } - } - return SQLITE_OK; -} -static int vec_npy_eachColumn(sqlite3_vtab_cursor *cur, - sqlite3_context *context, int i) { - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; - switch (pCur->input_type) { - case VEC_NPY_EACH_INPUT_BUFFER: - return vec_npy_eachColumnBuffer(pCur, context, i); - case VEC_NPY_EACH_INPUT_FILE: - return vec_npy_eachColumnFile(pCur, context, i); - } - return SQLITE_ERROR; -} - -static sqlite3_module vec_npy_eachModule = { - /* iVersion */ 0, - /* xCreate */ 0, - /* xConnect */ vec_npy_eachConnect, - /* xBestIndex */ vec_npy_eachBestIndex, - /* xDisconnect */ vec_npy_eachDisconnect, - /* xDestroy */ 0, - /* xOpen */ vec_npy_eachOpen, - /* xClose */ vec_npy_eachClose, - 
/* xFilter */ vec_npy_eachFilter, - /* xNext */ vec_npy_eachNext, - /* xEof */ vec_npy_eachEof, - /* xColumn */ vec_npy_eachColumn, - /* xRowid */ vec_npy_eachRowid, - /* xUpdate */ 0, - /* xBegin */ 0, - /* xSync */ 0, - /* xCommit */ 0, - /* xRollback */ 0, - /* xFindMethod */ 0, - /* xRename */ 0, - /* xSavepoint */ 0, - /* xRelease */ 0, - /* xRollbackTo */ 0, - /* xShadowName */ 0, -#if SQLITE_VERSION_NUMBER >= 3044000 - /* xIntegrity */ 0, -#endif -}; - -#pragma endregion #pragma region vec0 virtual table @@ -5959,6 +5497,65 @@ int min_idx(const f32 *distances, i32 n, u8 *candidates, i32 *out, i32 k, assert(k > 0); assert(k <= n); +#ifdef SQLITE_VEC_EXPERIMENTAL_MIN_IDX + // Max-heap variant: O(n log k) single-pass. + // out[0..heap_size-1] stores indices; heap ordered by distances descending + // so out[0] is always the index of the LARGEST distance in the top-k. + (void)bTaken; + int heap_size = 0; + + #define HEAP_SIFT_UP(pos) do { \ + int _c = (pos); \ + while (_c > 0) { \ + int _p = (_c - 1) / 2; \ + if (distances[out[_p]] < distances[out[_c]]) { \ + i32 _tmp = out[_p]; out[_p] = out[_c]; out[_c] = _tmp; \ + _c = _p; \ + } else break; \ + } \ + } while(0) + + #define HEAP_SIFT_DOWN(pos, sz) do { \ + int _p = (pos); \ + for (;;) { \ + int _l = 2*_p + 1, _r = 2*_p + 2, _largest = _p; \ + if (_l < (sz) && distances[out[_l]] > distances[out[_largest]]) \ + _largest = _l; \ + if (_r < (sz) && distances[out[_r]] > distances[out[_largest]]) \ + _largest = _r; \ + if (_largest == _p) break; \ + i32 _tmp = out[_p]; out[_p] = out[_largest]; out[_largest] = _tmp; \ + _p = _largest; \ + } \ + } while(0) + + for (int i = 0; i < n; i++) { + if (!bitmap_get(candidates, i)) + continue; + if (heap_size < k) { + out[heap_size] = i; + heap_size++; + HEAP_SIFT_UP(heap_size - 1); + } else if (distances[i] < distances[out[0]]) { + out[0] = i; + HEAP_SIFT_DOWN(0, heap_size); + } + } + + // Heapsort to produce ascending order. 
+ for (int i = heap_size - 1; i > 0; i--) { + i32 tmp = out[0]; out[0] = out[i]; out[i] = tmp; + HEAP_SIFT_DOWN(0, i); + } + + #undef HEAP_SIFT_UP + #undef HEAP_SIFT_DOWN + + *k_used = heap_size; + return SQLITE_OK; + +#else + // Original: O(n*k) repeated linear scan with bitmap. bitmap_clear(bTaken, n); for (int ik = 0; ik < k; ik++) { @@ -5984,6 +5581,7 @@ int min_idx(const f32 *distances, i32 n, u8 *candidates, i32 *out, i32 k, } *k_used = k; return SQLITE_OK; +#endif } int vec0_get_metadata_text_long_value( @@ -9388,652 +8986,6 @@ static sqlite3_module vec0Module = { }; #pragma endregion -static char *POINTER_NAME_STATIC_BLOB_DEF = "vec0-static_blob_def"; -struct static_blob_definition { - void *p; - size_t dimensions; - size_t nvectors; - enum VectorElementType element_type; -}; -static void vec_static_blob_from_raw(sqlite3_context *context, int argc, - sqlite3_value **argv) { - - assert(argc == 4); - struct static_blob_definition *p; - p = sqlite3_malloc(sizeof(*p)); - if (!p) { - sqlite3_result_error_nomem(context); - return; - } - memset(p, 0, sizeof(*p)); - p->p = (void *)sqlite3_value_int64(argv[0]); - p->element_type = SQLITE_VEC_ELEMENT_TYPE_FLOAT32; - p->dimensions = sqlite3_value_int64(argv[2]); - p->nvectors = sqlite3_value_int64(argv[3]); - sqlite3_result_pointer(context, p, POINTER_NAME_STATIC_BLOB_DEF, - sqlite3_free); -} -#pragma region vec_static_blobs() table function - -#define MAX_STATIC_BLOBS 16 - -typedef struct static_blob static_blob; -struct static_blob { - char *name; - void *p; - size_t dimensions; - size_t nvectors; - enum VectorElementType element_type; -}; - -typedef struct vec_static_blob_data vec_static_blob_data; -struct vec_static_blob_data { - static_blob static_blobs[MAX_STATIC_BLOBS]; -}; - -typedef struct vec_static_blobs_vtab vec_static_blobs_vtab; -struct vec_static_blobs_vtab { - sqlite3_vtab base; - vec_static_blob_data *data; -}; - -typedef struct vec_static_blobs_cursor vec_static_blobs_cursor; -struct 
vec_static_blobs_cursor { - sqlite3_vtab_cursor base; - sqlite3_int64 iRowid; -}; - -static int vec_static_blobsConnect(sqlite3 *db, void *pAux, int argc, - const char *const *argv, - sqlite3_vtab **ppVtab, char **pzErr) { - UNUSED_PARAMETER(argc); - UNUSED_PARAMETER(argv); - UNUSED_PARAMETER(pzErr); - - vec_static_blobs_vtab *pNew; -#define VEC_STATIC_BLOBS_NAME 0 -#define VEC_STATIC_BLOBS_DATA 1 -#define VEC_STATIC_BLOBS_DIMENSIONS 2 -#define VEC_STATIC_BLOBS_COUNT 3 - int rc = sqlite3_declare_vtab( - db, "CREATE TABLE x(name, data, dimensions hidden, count hidden)"); - if (rc == SQLITE_OK) { - pNew = sqlite3_malloc(sizeof(*pNew)); - *ppVtab = (sqlite3_vtab *)pNew; - if (pNew == 0) - return SQLITE_NOMEM; - memset(pNew, 0, sizeof(*pNew)); - pNew->data = pAux; - } - return rc; -} - -static int vec_static_blobsDisconnect(sqlite3_vtab *pVtab) { - vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)pVtab; - sqlite3_free(p); - return SQLITE_OK; -} - -static int vec_static_blobsUpdate(sqlite3_vtab *pVTab, int argc, - sqlite3_value **argv, sqlite_int64 *pRowid) { - UNUSED_PARAMETER(pRowid); - vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)pVTab; - // DELETE operation - if (argc == 1 && sqlite3_value_type(argv[0]) != SQLITE_NULL) { - return SQLITE_ERROR; - } - // INSERT operation - else if (argc > 1 && sqlite3_value_type(argv[0]) == SQLITE_NULL) { - const char *key = - (const char *)sqlite3_value_text(argv[2 + VEC_STATIC_BLOBS_NAME]); - int idx = -1; - for (int i = 0; i < MAX_STATIC_BLOBS; i++) { - if (!p->data->static_blobs[i].name) { - p->data->static_blobs[i].name = sqlite3_mprintf("%s", key); - idx = i; - break; - } - } - if (idx < 0) - abort(); - struct static_blob_definition *def = sqlite3_value_pointer( - argv[2 + VEC_STATIC_BLOBS_DATA], POINTER_NAME_STATIC_BLOB_DEF); - p->data->static_blobs[idx].p = def->p; - p->data->static_blobs[idx].dimensions = def->dimensions; - p->data->static_blobs[idx].nvectors = def->nvectors; - 
p->data->static_blobs[idx].element_type = def->element_type; - - return SQLITE_OK; - } - // UPDATE operation - else if (argc > 1 && sqlite3_value_type(argv[0]) != SQLITE_NULL) { - return SQLITE_ERROR; - } - return SQLITE_ERROR; -} - -static int vec_static_blobsOpen(sqlite3_vtab *p, - sqlite3_vtab_cursor **ppCursor) { - UNUSED_PARAMETER(p); - vec_static_blobs_cursor *pCur; - pCur = sqlite3_malloc(sizeof(*pCur)); - if (pCur == 0) - return SQLITE_NOMEM; - memset(pCur, 0, sizeof(*pCur)); - *ppCursor = &pCur->base; - return SQLITE_OK; -} - -static int vec_static_blobsClose(sqlite3_vtab_cursor *cur) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - sqlite3_free(pCur); - return SQLITE_OK; -} - -static int vec_static_blobsBestIndex(sqlite3_vtab *pVTab, - sqlite3_index_info *pIdxInfo) { - UNUSED_PARAMETER(pVTab); - pIdxInfo->idxNum = 1; - pIdxInfo->estimatedCost = (double)10; - pIdxInfo->estimatedRows = 10; - return SQLITE_OK; -} - -static int vec_static_blobsNext(sqlite3_vtab_cursor *cur); -static int vec_static_blobsFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum, - const char *idxStr, int argc, - sqlite3_value **argv) { - UNUSED_PARAMETER(idxNum); - UNUSED_PARAMETER(idxStr); - UNUSED_PARAMETER(argc); - UNUSED_PARAMETER(argv); - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)pVtabCursor; - pCur->iRowid = -1; - vec_static_blobsNext(pVtabCursor); - return SQLITE_OK; -} - -static int vec_static_blobsRowid(sqlite3_vtab_cursor *cur, - sqlite_int64 *pRowid) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - *pRowid = pCur->iRowid; - return SQLITE_OK; -} - -static int vec_static_blobsNext(sqlite3_vtab_cursor *cur) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)pCur->base.pVtab; - pCur->iRowid++; - while (pCur->iRowid < MAX_STATIC_BLOBS) { - if (p->data->static_blobs[pCur->iRowid].name) { - return SQLITE_OK; - } - pCur->iRowid++; - } - return 
SQLITE_OK; -} - -static int vec_static_blobsEof(sqlite3_vtab_cursor *cur) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - return pCur->iRowid >= MAX_STATIC_BLOBS; -} - -static int vec_static_blobsColumn(sqlite3_vtab_cursor *cur, - sqlite3_context *context, int i) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)cur->pVtab; - switch (i) { - case VEC_STATIC_BLOBS_NAME: - sqlite3_result_text(context, p->data->static_blobs[pCur->iRowid].name, -1, - SQLITE_TRANSIENT); - break; - case VEC_STATIC_BLOBS_DATA: - sqlite3_result_null(context); - break; - case VEC_STATIC_BLOBS_DIMENSIONS: - sqlite3_result_int64(context, - p->data->static_blobs[pCur->iRowid].dimensions); - break; - case VEC_STATIC_BLOBS_COUNT: - sqlite3_result_int64(context, p->data->static_blobs[pCur->iRowid].nvectors); - break; - } - return SQLITE_OK; -} - -static sqlite3_module vec_static_blobsModule = { - /* iVersion */ 3, - /* xCreate */ 0, - /* xConnect */ vec_static_blobsConnect, - /* xBestIndex */ vec_static_blobsBestIndex, - /* xDisconnect */ vec_static_blobsDisconnect, - /* xDestroy */ 0, - /* xOpen */ vec_static_blobsOpen, - /* xClose */ vec_static_blobsClose, - /* xFilter */ vec_static_blobsFilter, - /* xNext */ vec_static_blobsNext, - /* xEof */ vec_static_blobsEof, - /* xColumn */ vec_static_blobsColumn, - /* xRowid */ vec_static_blobsRowid, - /* xUpdate */ vec_static_blobsUpdate, - /* xBegin */ 0, - /* xSync */ 0, - /* xCommit */ 0, - /* xRollback */ 0, - /* xFindMethod */ 0, - /* xRename */ 0, - /* xSavepoint */ 0, - /* xRelease */ 0, - /* xRollbackTo */ 0, - /* xShadowName */ 0, -#if SQLITE_VERSION_NUMBER >= 3044000 - /* xIntegrity */ 0 -#endif -}; -#pragma endregion - -#pragma region vec_static_blob_entries() table function - -typedef struct vec_static_blob_entries_vtab vec_static_blob_entries_vtab; -struct vec_static_blob_entries_vtab { - sqlite3_vtab base; - static_blob *blob; -}; -typedef enum { - 
VEC_SBE__QUERYPLAN_FULLSCAN = 1, - VEC_SBE__QUERYPLAN_KNN = 2 -} vec_sbe_query_plan; - -struct sbe_query_knn_data { - i64 k; - i64 k_used; - // Array of rowids of size k. Must be freed with sqlite3_free(). - i32 *rowids; - // Array of distances of size k. Must be freed with sqlite3_free(). - f32 *distances; - i64 current_idx; -}; -void sbe_query_knn_data_clear(struct sbe_query_knn_data *knn_data) { - if (!knn_data) - return; - - if (knn_data->rowids) { - sqlite3_free(knn_data->rowids); - knn_data->rowids = NULL; - } - if (knn_data->distances) { - sqlite3_free(knn_data->distances); - knn_data->distances = NULL; - } -} - -typedef struct vec_static_blob_entries_cursor vec_static_blob_entries_cursor; -struct vec_static_blob_entries_cursor { - sqlite3_vtab_cursor base; - sqlite3_int64 iRowid; - vec_sbe_query_plan query_plan; - struct sbe_query_knn_data *knn_data; -}; - -static int vec_static_blob_entriesConnect(sqlite3 *db, void *pAux, int argc, - const char *const *argv, - sqlite3_vtab **ppVtab, char **pzErr) { - UNUSED_PARAMETER(argc); - UNUSED_PARAMETER(argv); - UNUSED_PARAMETER(pzErr); - vec_static_blob_data *blob_data = pAux; - int idx = -1; - for (int i = 0; i < MAX_STATIC_BLOBS; i++) { - if (!blob_data->static_blobs[i].name) - continue; - if (strncmp(blob_data->static_blobs[i].name, argv[3], - strlen(blob_data->static_blobs[i].name)) == 0) { - idx = i; - break; - } - } - if (idx < 0) - abort(); - vec_static_blob_entries_vtab *pNew; -#define VEC_STATIC_BLOB_ENTRIES_VECTOR 0 -#define VEC_STATIC_BLOB_ENTRIES_DISTANCE 1 -#define VEC_STATIC_BLOB_ENTRIES_K 2 - int rc = sqlite3_declare_vtab( - db, "CREATE TABLE x(vector, distance hidden, k hidden)"); - if (rc == SQLITE_OK) { - pNew = sqlite3_malloc(sizeof(*pNew)); - *ppVtab = (sqlite3_vtab *)pNew; - if (pNew == 0) - return SQLITE_NOMEM; - memset(pNew, 0, sizeof(*pNew)); - pNew->blob = &blob_data->static_blobs[idx]; - } - return rc; -} - -static int vec_static_blob_entriesCreate(sqlite3 *db, void *pAux, int argc, - const 
char *const *argv, - sqlite3_vtab **ppVtab, char **pzErr) { - return vec_static_blob_entriesConnect(db, pAux, argc, argv, ppVtab, pzErr); -} - -static int vec_static_blob_entriesDisconnect(sqlite3_vtab *pVtab) { - vec_static_blob_entries_vtab *p = (vec_static_blob_entries_vtab *)pVtab; - sqlite3_free(p); - return SQLITE_OK; -} - -static int vec_static_blob_entriesOpen(sqlite3_vtab *p, - sqlite3_vtab_cursor **ppCursor) { - UNUSED_PARAMETER(p); - vec_static_blob_entries_cursor *pCur; - pCur = sqlite3_malloc(sizeof(*pCur)); - if (pCur == 0) - return SQLITE_NOMEM; - memset(pCur, 0, sizeof(*pCur)); - *ppCursor = &pCur->base; - return SQLITE_OK; -} - -static int vec_static_blob_entriesClose(sqlite3_vtab_cursor *cur) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - sqlite3_free(pCur->knn_data); - sqlite3_free(pCur); - return SQLITE_OK; -} - -static int vec_static_blob_entriesBestIndex(sqlite3_vtab *pVTab, - sqlite3_index_info *pIdxInfo) { - vec_static_blob_entries_vtab *p = (vec_static_blob_entries_vtab *)pVTab; - int iMatchTerm = -1; - int iLimitTerm = -1; - // int iRowidTerm = -1; // https://github.com/asg017/sqlite-vec/issues/47 - int iKTerm = -1; - - for (int i = 0; i < pIdxInfo->nConstraint; i++) { - if (!pIdxInfo->aConstraint[i].usable) - continue; - - int iColumn = pIdxInfo->aConstraint[i].iColumn; - int op = pIdxInfo->aConstraint[i].op; - if (op == SQLITE_INDEX_CONSTRAINT_MATCH && - iColumn == VEC_STATIC_BLOB_ENTRIES_VECTOR) { - if (iMatchTerm > -1) { - // https://github.com/asg017/sqlite-vec/issues/51 - return SQLITE_ERROR; - } - iMatchTerm = i; - } - if (op == SQLITE_INDEX_CONSTRAINT_LIMIT) { - iLimitTerm = i; - } - if (op == SQLITE_INDEX_CONSTRAINT_EQ && - iColumn == VEC_STATIC_BLOB_ENTRIES_K) { - iKTerm = i; - } - } - if (iMatchTerm >= 0) { - if (iLimitTerm < 0 && iKTerm < 0) { - // https://github.com/asg017/sqlite-vec/issues/51 - return SQLITE_ERROR; - } - if (iLimitTerm >= 0 && iKTerm >= 0) { - return SQLITE_ERROR; // limit 
or k, not both - } - if (pIdxInfo->nOrderBy < 1) { - vtab_set_error(pVTab, "ORDER BY distance required"); - return SQLITE_CONSTRAINT; - } - if (pIdxInfo->nOrderBy > 1) { - // https://github.com/asg017/sqlite-vec/issues/51 - vtab_set_error(pVTab, "more than 1 ORDER BY clause provided"); - return SQLITE_CONSTRAINT; - } - if (pIdxInfo->aOrderBy[0].iColumn != VEC_STATIC_BLOB_ENTRIES_DISTANCE) { - vtab_set_error(pVTab, "ORDER BY must be on the distance column"); - return SQLITE_CONSTRAINT; - } - if (pIdxInfo->aOrderBy[0].desc) { - vtab_set_error(pVTab, - "Only ascending in ORDER BY distance clause is supported, " - "DESC is not supported yet."); - return SQLITE_CONSTRAINT; - } - - pIdxInfo->idxNum = VEC_SBE__QUERYPLAN_KNN; - pIdxInfo->estimatedCost = (double)10; - pIdxInfo->estimatedRows = 10; - - pIdxInfo->orderByConsumed = 1; - pIdxInfo->aConstraintUsage[iMatchTerm].argvIndex = 1; - pIdxInfo->aConstraintUsage[iMatchTerm].omit = 1; - if (iLimitTerm >= 0) { - pIdxInfo->aConstraintUsage[iLimitTerm].argvIndex = 2; - pIdxInfo->aConstraintUsage[iLimitTerm].omit = 1; - } else { - pIdxInfo->aConstraintUsage[iKTerm].argvIndex = 2; - pIdxInfo->aConstraintUsage[iKTerm].omit = 1; - } - - } else { - pIdxInfo->idxNum = VEC_SBE__QUERYPLAN_FULLSCAN; - pIdxInfo->estimatedCost = (double)p->blob->nvectors; - pIdxInfo->estimatedRows = p->blob->nvectors; - } - return SQLITE_OK; -} - -static int vec_static_blob_entriesFilter(sqlite3_vtab_cursor *pVtabCursor, - int idxNum, const char *idxStr, - int argc, sqlite3_value **argv) { - UNUSED_PARAMETER(idxStr); - assert(argc >= 0 && argc <= 3); - vec_static_blob_entries_cursor *pCur = - (vec_static_blob_entries_cursor *)pVtabCursor; - vec_static_blob_entries_vtab *p = - (vec_static_blob_entries_vtab *)pCur->base.pVtab; - - if (idxNum == VEC_SBE__QUERYPLAN_KNN) { - assert(argc == 2); - pCur->query_plan = VEC_SBE__QUERYPLAN_KNN; - struct sbe_query_knn_data *knn_data; - knn_data = sqlite3_malloc(sizeof(*knn_data)); - if (!knn_data) { - return 
SQLITE_NOMEM; - } - memset(knn_data, 0, sizeof(*knn_data)); - - void *queryVector; - size_t dimensions; - enum VectorElementType elementType; - vector_cleanup cleanup; - char *err; - int rc = vector_from_value(argv[0], &queryVector, &dimensions, &elementType, - &cleanup, &err); - if (rc != SQLITE_OK) { - return SQLITE_ERROR; - } - if (elementType != p->blob->element_type) { - return SQLITE_ERROR; - } - if (dimensions != p->blob->dimensions) { - return SQLITE_ERROR; - } - - i64 k = min(sqlite3_value_int64(argv[1]), (i64)p->blob->nvectors); - if (k < 0) { - // HANDLE https://github.com/asg017/sqlite-vec/issues/55 - return SQLITE_ERROR; - } - if (k == 0) { - knn_data->k = 0; - pCur->knn_data = knn_data; - return SQLITE_OK; - } - - size_t bsize = (p->blob->nvectors + 7) & ~7; - - i32 *topk_rowids = sqlite3_malloc(k * sizeof(i32)); - if (!topk_rowids) { - // HANDLE https://github.com/asg017/sqlite-vec/issues/55 - return SQLITE_ERROR; - } - f32 *distances = sqlite3_malloc(bsize * sizeof(f32)); - if (!distances) { - // HANDLE https://github.com/asg017/sqlite-vec/issues/55 - return SQLITE_ERROR; - } - - for (size_t i = 0; i < p->blob->nvectors; i++) { - // https://github.com/asg017/sqlite-vec/issues/52 - float *v = ((float *)p->blob->p) + (i * p->blob->dimensions); - distances[i] = - distance_l2_sqr_float(v, (float *)queryVector, &p->blob->dimensions); - } - u8 *candidates = bitmap_new(bsize); - assert(candidates); - - u8 *taken = bitmap_new(bsize); - assert(taken); - - bitmap_fill(candidates, bsize); - for (size_t i = bsize; i >= p->blob->nvectors; i--) { - bitmap_set(candidates, i, 0); - } - i32 k_used = 0; - min_idx(distances, bsize, candidates, topk_rowids, k, taken, &k_used); - knn_data->current_idx = 0; - knn_data->distances = distances; - knn_data->k = k; - knn_data->rowids = topk_rowids; - - pCur->knn_data = knn_data; - } else { - pCur->query_plan = VEC_SBE__QUERYPLAN_FULLSCAN; - pCur->iRowid = 0; - } - - return SQLITE_OK; -} - -static int 
vec_static_blob_entriesRowid(sqlite3_vtab_cursor *cur, - sqlite_int64 *pRowid) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - switch (pCur->query_plan) { - case VEC_SBE__QUERYPLAN_FULLSCAN: { - *pRowid = pCur->iRowid; - return SQLITE_OK; - } - case VEC_SBE__QUERYPLAN_KNN: { - i32 rowid = ((i32 *)pCur->knn_data->rowids)[pCur->knn_data->current_idx]; - *pRowid = (sqlite3_int64)rowid; - return SQLITE_OK; - } - } - return SQLITE_ERROR; -} - -static int vec_static_blob_entriesNext(sqlite3_vtab_cursor *cur) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - switch (pCur->query_plan) { - case VEC_SBE__QUERYPLAN_FULLSCAN: { - pCur->iRowid++; - return SQLITE_OK; - } - case VEC_SBE__QUERYPLAN_KNN: { - pCur->knn_data->current_idx++; - return SQLITE_OK; - } - } - return SQLITE_ERROR; -} - -static int vec_static_blob_entriesEof(sqlite3_vtab_cursor *cur) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - vec_static_blob_entries_vtab *p = - (vec_static_blob_entries_vtab *)pCur->base.pVtab; - switch (pCur->query_plan) { - case VEC_SBE__QUERYPLAN_FULLSCAN: { - return (size_t)pCur->iRowid >= p->blob->nvectors; - } - case VEC_SBE__QUERYPLAN_KNN: { - return pCur->knn_data->current_idx >= pCur->knn_data->k; - } - } - return SQLITE_ERROR; -} - -static int vec_static_blob_entriesColumn(sqlite3_vtab_cursor *cur, - sqlite3_context *context, int i) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - vec_static_blob_entries_vtab *p = (vec_static_blob_entries_vtab *)cur->pVtab; - - switch (pCur->query_plan) { - case VEC_SBE__QUERYPLAN_FULLSCAN: { - switch (i) { - case VEC_STATIC_BLOB_ENTRIES_VECTOR: - - sqlite3_result_blob( - context, - ((unsigned char *)p->blob->p) + - (pCur->iRowid * p->blob->dimensions * sizeof(float)), - p->blob->dimensions * sizeof(float), SQLITE_TRANSIENT); - sqlite3_result_subtype(context, p->blob->element_type); - break; - } - return 
SQLITE_OK; - } - case VEC_SBE__QUERYPLAN_KNN: { - switch (i) { - case VEC_STATIC_BLOB_ENTRIES_VECTOR: { - i32 rowid = ((i32 *)pCur->knn_data->rowids)[pCur->knn_data->current_idx]; - sqlite3_result_blob(context, - ((unsigned char *)p->blob->p) + - (rowid * p->blob->dimensions * sizeof(float)), - p->blob->dimensions * sizeof(float), - SQLITE_TRANSIENT); - sqlite3_result_subtype(context, p->blob->element_type); - break; - } - } - return SQLITE_OK; - } - } - return SQLITE_ERROR; -} - -static sqlite3_module vec_static_blob_entriesModule = { - /* iVersion */ 3, - /* xCreate */ - vec_static_blob_entriesCreate, // handle rm? - // https://github.com/asg017/sqlite-vec/issues/55 - /* xConnect */ vec_static_blob_entriesConnect, - /* xBestIndex */ vec_static_blob_entriesBestIndex, - /* xDisconnect */ vec_static_blob_entriesDisconnect, - /* xDestroy */ vec_static_blob_entriesDisconnect, - /* xOpen */ vec_static_blob_entriesOpen, - /* xClose */ vec_static_blob_entriesClose, - /* xFilter */ vec_static_blob_entriesFilter, - /* xNext */ vec_static_blob_entriesNext, - /* xEof */ vec_static_blob_entriesEof, - /* xColumn */ vec_static_blob_entriesColumn, - /* xRowid */ vec_static_blob_entriesRowid, - /* xUpdate */ 0, - /* xBegin */ 0, - /* xSync */ 0, - /* xCommit */ 0, - /* xRollback */ 0, - /* xFindMethod */ 0, - /* xRename */ 0, - /* xSavepoint */ 0, - /* xRelease */ 0, - /* xRollbackTo */ 0, - /* xShadowName */ 0, -#if SQLITE_VERSION_NUMBER >= 3044000 - /* xIntegrity */ 0 -#endif -}; -#pragma endregion #ifdef SQLITE_VEC_ENABLE_AVX #define SQLITE_VEC_DEBUG_BUILD_AVX "avx" @@ -10139,55 +9091,4 @@ SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg, return SQLITE_OK; } -#ifndef SQLITE_VEC_OMIT_FS -SQLITE_VEC_API int sqlite3_vec_numpy_init(sqlite3 *db, char **pzErrMsg, - const sqlite3_api_routines *pApi) { - UNUSED_PARAMETER(pzErrMsg); -#ifndef SQLITE_CORE - SQLITE_EXTENSION_INIT2(pApi); -#endif - int rc = SQLITE_OK; - rc = sqlite3_create_function_v2(db, "vec_npy_file", 1, 
SQLITE_RESULT_SUBTYPE, - NULL, vec_npy_file, NULL, NULL, NULL); - if(rc != SQLITE_OK) { - return rc; - } - rc = sqlite3_create_module_v2(db, "vec_npy_each", &vec_npy_eachModule, NULL, NULL); - return rc; -} -#endif -SQLITE_VEC_API int -sqlite3_vec_static_blobs_init(sqlite3 *db, char **pzErrMsg, - const sqlite3_api_routines *pApi) { - UNUSED_PARAMETER(pzErrMsg); -#ifndef SQLITE_CORE - SQLITE_EXTENSION_INIT2(pApi); -#endif - - int rc = SQLITE_OK; - vec_static_blob_data *static_blob_data; - static_blob_data = sqlite3_malloc(sizeof(*static_blob_data)); - if (!static_blob_data) { - return SQLITE_NOMEM; - } - memset(static_blob_data, 0, sizeof(*static_blob_data)); - - rc = sqlite3_create_function_v2( - db, "vec_static_blob_from_raw", 4, - DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, NULL, - vec_static_blob_from_raw, NULL, NULL, NULL); - if (rc != SQLITE_OK) - return rc; - - rc = sqlite3_create_module_v2(db, "vec_static_blobs", &vec_static_blobsModule, - static_blob_data, sqlite3_free); - if (rc != SQLITE_OK) - return rc; - rc = sqlite3_create_module_v2(db, "vec_static_blob_entries", - &vec_static_blob_entriesModule, - static_blob_data, NULL); - if (rc != SQLITE_OK) - return rc; - return rc; -} diff --git a/tests/correctness/test-correctness.py b/tests/correctness/test-correctness.py index cb01f8f..9ed0319 100644 --- a/tests/correctness/test-correctness.py +++ b/tests/correctness/test-correctness.py @@ -48,7 +48,6 @@ import json db = sqlite3.connect(":memory:") db.enable_load_extension(True) db.load_extension("../../dist/vec0") -db.execute("select load_extension('../../dist/vec0', 'sqlite3_vec_fs_read_init')") db.enable_load_extension(False) results = db.execute( @@ -75,17 +74,21 @@ print(b) db.execute('PRAGMA page_size=16384') -print("Loading into sqlite-vec vec0 table...") -t0 = time.time() -db.execute("create virtual table v using vec0(a float[3072], chunk_size=16)") -db.execute('insert into v select rowid, vector from 
vec_npy_each(vec_npy_file("dbpedia_openai_3_large_00.npy"))') -print(time.time() - t0) - print("loading numpy array...") t0 = time.time() base = np.load('dbpedia_openai_3_large_00.npy') print(time.time() - t0) +print("Loading into sqlite-vec vec0 table...") +t0 = time.time() +db.execute("create virtual table v using vec0(a float[3072], chunk_size=16)") +with db: + db.executemany( + "insert into v(rowid, a) values (?, ?)", + [(i, row.tobytes()) for i, row in enumerate(base)], + ) +print(time.time() - t0) + np.random.seed(1) queries = base[np.random.choice(base.shape[0], 20, replace=False), :] diff --git a/tests/fuzz/numpy.c b/tests/fuzz/numpy.c deleted file mode 100644 index 9e2900b..0000000 --- a/tests/fuzz/numpy.c +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include - -#include -#include -#include -#include "sqlite-vec.h" -#include "sqlite3.h" -#include - -extern int sqlite3_vec_numpy_init(sqlite3 *db, char **pzErrMsg, - const sqlite3_api_routines *pApi); - -int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - int rc = SQLITE_OK; - sqlite3 *db; - sqlite3_stmt *stmt; - - rc = sqlite3_open(":memory:", &db); - assert(rc == SQLITE_OK); - rc = sqlite3_vec_init(db, NULL, NULL); - assert(rc == SQLITE_OK); - rc = sqlite3_vec_numpy_init(db, NULL, NULL); - assert(rc == SQLITE_OK); - - rc = sqlite3_prepare_v2(db, "select * from vec_npy_each(?)", -1, &stmt, NULL); - assert(rc == SQLITE_OK); - sqlite3_bind_blob(stmt, 1, data, size, SQLITE_STATIC); - rc = sqlite3_step(stmt); - while (rc == SQLITE_ROW) { - rc = sqlite3_step(stmt); - } - - sqlite3_finalize(stmt); - sqlite3_close(db); - return 0; -} diff --git a/tests/sqlite-vec-internal.h b/tests/sqlite-vec-internal.h index a540849..a02c72a 100644 --- a/tests/sqlite-vec-internal.h +++ b/tests/sqlite-vec-internal.h @@ -3,6 +3,7 @@ #include #include +#include int min_idx( const float *distances, @@ -62,12 +63,17 @@ enum Vec0DistanceMetrics { VEC0_DISTANCE_METRIC_L1 = 3, }; +enum Vec0IndexType { + VEC0_INDEX_TYPE_FLAT = 1, 
+}; + struct VectorColumnDefinition { char *name; int name_length; size_t dimensions; enum VectorElementType element_type; enum Vec0DistanceMetrics distance_metric; + enum Vec0IndexType index_type; }; int vec0_parse_vector_column(const char *source, int source_length, diff --git a/tests/test-loadable.py b/tests/test-loadable.py index bc4eed1..40c6a5e 100644 --- a/tests/test-loadable.py +++ b/tests/test-loadable.py @@ -119,151 +119,9 @@ FUNCTIONS = [ MODULES = [ "vec0", "vec_each", - # "vec_static_blob_entries", - # "vec_static_blobs", ] -def register_numpy(db, name: str, array): - ptr = array.__array_interface__["data"][0] - nvectors, dimensions = array.__array_interface__["shape"] - element_type = array.__array_interface__["typestr"] - - assert element_type == "\x9a\x99\x99>", - }, - { - "vector": b"fff?\xcd\xccL?", - }, - ] - assert execute_all(db, "select rowid, (vector) from z") == [ - { - "rowid": 0, - "vector": b"\xcd\xcc\xcc=\xcd\xcc\xcc=\xcd\xcc\xcc=\xcd\xcc\xcc=", - }, - { - "rowid": 1, - "vector": b"\xcd\xccL>\xcd\xccL>\xcd\xccL>\xcd\xccL>", - }, - { - "rowid": 2, - "vector": b"\x9a\x99\x99>\x9a\x99\x99>\x9a\x99\x99>\x9a\x99\x99>", - }, - { - "rowid": 3, - "vector": b"\xcd\xcc\xcc>\xcd\xcc\xcc>\xcd\xcc\xcc>\xcd\xcc\xcc>", - }, - { - "rowid": 4, - "vector": b"\x00\x00\x00?\x00\x00\x00?\x00\x00\x00?\x00\x00\x00?", - }, - ] - assert execute_all( - db, - "select rowid, vec_to_json(vector) as v from z where vector match ? and k = 3 order by distance;", - [np.array([0.3, 0.3, 0.3, 0.3], dtype=np.float32)], - ) == [ - { - "rowid": 2, - "v": "[0.300000,0.300000,0.300000,0.300000]", - }, - { - "rowid": 3, - "v": "[0.400000,0.400000,0.400000,0.400000]", - }, - { - "rowid": 1, - "v": "[0.200000,0.200000,0.200000,0.200000]", - }, - ] - assert execute_all( - db, - "select rowid, vec_to_json(vector) as v from z where vector match ? 
and k = 3 order by distance;", - [np.array([0.6, 0.6, 0.6, 0.6], dtype=np.float32)], - ) == [ - { - "rowid": 4, - "v": "[0.500000,0.500000,0.500000,0.500000]", - }, - { - "rowid": 3, - "v": "[0.400000,0.400000,0.400000,0.400000]", - }, - { - "rowid": 2, - "v": "[0.300000,0.300000,0.300000,0.300000]", - }, - ] - - def test_limits(): db = connect(EXT_PATH) with _raises( @@ -1618,231 +1476,6 @@ def test_vec_each(): vec_each_f32(None) -import io - - -def to_npy(arr): - buf = io.BytesIO() - np.save(buf, arr) - buf.seek(0) - return buf.read() - - -def test_vec_npy_each(): - db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init") - vec_npy_each = lambda *args: execute_all( - db, "select rowid, * from vec_npy_each(?)", args - ) - assert vec_npy_each(to_npy(np.array([1.1, 2.2, 3.3], dtype=np.float32))) == [ - { - "rowid": 0, - "vector": _f32([1.1, 2.2, 3.3]), - }, - ] - assert vec_npy_each(to_npy(np.array([[1.1, 2.2, 3.3]], dtype=np.float32))) == [ - { - "rowid": 0, - "vector": _f32([1.1, 2.2, 3.3]), - }, - ] - assert vec_npy_each( - to_npy(np.array([[1.1, 2.2, 3.3], [9.9, 8.8, 7.7]], dtype=np.float32)) - ) == [ - { - "rowid": 0, - "vector": _f32([1.1, 2.2, 3.3]), - }, - { - "rowid": 1, - "vector": _f32([9.9, 8.8, 7.7]), - }, - ] - - assert vec_npy_each(to_npy(np.array([], dtype=np.float32))) == [] - - -def test_vec_npy_each_errors(): - db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init") - vec_npy_each = lambda *args: execute_all( - db, "select rowid, * from vec_npy_each(?)", args - ) - - full = b"\x93NUMPY\x01\x00v\x00{'descr': ' 8 bits per byte * 64 bytes = 512 + for (int i = 0; i < 128; i += 2) { + a[i] = 0xFF; + } + d = _test_distance_hamming(a, b, 1024); + assert(d == 512.0f); + } + printf(" All distance_hamming tests passed.\n"); } diff --git a/tmp-static.py b/tmp-static.py deleted file mode 100644 index a3b5f37..0000000 --- a/tmp-static.py +++ /dev/null @@ -1,56 +0,0 @@ -import sqlite3 -import numpy as np - -db = sqlite3.connect(":memory:") - 
-db.enable_load_extension(True) -db.load_extension("./dist/vec0") -db.execute("select load_extension('./dist/vec0', 'sqlite3_vec_raw_init')") -db.enable_load_extension(False) - -x = np.array([[0.1, 0.2, 0.3, 0.4], [0.9, 0.8, 0.7, 0.6]], dtype=np.float32) -y = np.array([[0.2, 0.3], [0.9, 0.8], [0.6, 0.5]], dtype=np.float32) -z = np.array( - [ - [0.1, 0.1, 0.1, 0.1], - [0.2, 0.2, 0.2, 0.2], - [0.3, 0.3, 0.3, 0.3], - [0.4, 0.4, 0.4, 0.4], - [0.5, 0.5, 0.5, 0.5], - ], - dtype=np.float32, -) - - -def register_np(array, name): - ptr = array.__array_interface__["data"][0] - nvectors, dimensions = array.__array_interface__["shape"] - element_type = array.__array_interface__["typestr"] - - assert element_type == " Date: Sun, 29 Mar 2026 19:45:54 -0700 Subject: [PATCH 02/38] Add rescore index for ANN queries Add rescore index type: stores full-precision float vectors in a rowid-keyed shadow table, quantizes to int8 for fast initial scan, then rescores top candidates with original vectors. Includes config parser, shadow table management, insert/delete support, KNN integration, compile flag (SQLITE_VEC_ENABLE_RESCORE), fuzz targets, and tests. 
--- Makefile | 2 +- benchmarks-ann/Makefile | 13 +- benchmarks-ann/bench.py | 33 ++ sqlite-vec-rescore.c | 662 ++++++++++++++++++++++++++++ sqlite-vec.c | 435 +++++++++++++++++- tests/fuzz/.gitignore | 5 + tests/fuzz/Makefile | 26 +- tests/fuzz/rescore-create.c | 36 ++ tests/fuzz/rescore-create.dict | 20 + tests/fuzz/rescore-interleave.c | 151 +++++++ tests/fuzz/rescore-knn-deep.c | 178 ++++++++ tests/fuzz/rescore-operations.c | 96 ++++ tests/fuzz/rescore-quantize-edge.c | 177 ++++++++ tests/fuzz/rescore-quantize.c | 54 +++ tests/fuzz/rescore-shadow-corrupt.c | 230 ++++++++++ tests/sqlite-vec-internal.h | 25 ++ tests/test-rescore-mutations.py | 470 ++++++++++++++++++++ tests/test-rescore.py | 568 ++++++++++++++++++++++++ tests/test-unit.c | 205 +++++++++ 19 files changed, 3378 insertions(+), 8 deletions(-) create mode 100644 sqlite-vec-rescore.c create mode 100644 tests/fuzz/rescore-create.c create mode 100644 tests/fuzz/rescore-create.dict create mode 100644 tests/fuzz/rescore-interleave.c create mode 100644 tests/fuzz/rescore-knn-deep.c create mode 100644 tests/fuzz/rescore-operations.c create mode 100644 tests/fuzz/rescore-quantize-edge.c create mode 100644 tests/fuzz/rescore-quantize.c create mode 100644 tests/fuzz/rescore-shadow-corrupt.c create mode 100644 tests/test-rescore-mutations.py create mode 100644 tests/test-rescore.py diff --git a/Makefile b/Makefile index 051590e..b604171 100644 --- a/Makefile +++ b/Makefile @@ -202,7 +202,7 @@ test-loadable-watch: watchexec --exts c,py,Makefile --clear -- make test-loadable test-unit: - $(CC) -DSQLITE_CORE -DSQLITE_VEC_TEST tests/test-unit.c sqlite-vec.c vendor/sqlite3.c -I./ -Ivendor -o $(prefix)/test-unit && $(prefix)/test-unit + $(CC) -DSQLITE_CORE -DSQLITE_VEC_TEST -DSQLITE_VEC_ENABLE_RESCORE tests/test-unit.c sqlite-vec.c vendor/sqlite3.c -I./ -Ivendor -o $(prefix)/test-unit && $(prefix)/test-unit fuzz-build: $(MAKE) -C tests/fuzz all diff --git a/benchmarks-ann/Makefile b/benchmarks-ann/Makefile index 
59e2dcd..762abea 100644 --- a/benchmarks-ann/Makefile +++ b/benchmarks-ann/Makefile @@ -21,9 +21,14 @@ BASELINES = \ # ANNOY_CONFIGS = \ # "annoy-t50:type=annoy,n_trees=50" -ALL_CONFIGS = $(BASELINES) +RESCORE_CONFIGS = \ + "rescore-bit-os8:type=rescore,quantizer=bit,oversample=8" \ + "rescore-bit-os16:type=rescore,quantizer=bit,oversample=16" \ + "rescore-int8-os8:type=rescore,quantizer=int8,oversample=8" -.PHONY: seed ground-truth bench-smoke bench-10k bench-50k bench-100k bench-all \ +ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS) + +.PHONY: seed ground-truth bench-smoke bench-rescore bench-10k bench-50k bench-100k bench-all \ report clean # --- Data preparation --- @@ -40,6 +45,10 @@ bench-smoke: seed $(BENCH) --subset-size 5000 -k 10 -n 20 -o runs/smoke \ $(BASELINES) +bench-rescore: seed + $(BENCH) --subset-size 10000 -k 10 -o runs/rescore \ + $(RESCORE_CONFIGS) + # --- Standard sizes --- bench-10k: seed $(BENCH) --subset-size 10000 -k 10 -o runs/10k $(ALL_CONFIGS) diff --git a/benchmarks-ann/bench.py b/benchmarks-ann/bench.py index 93f8f82..c1179d6 100644 --- a/benchmarks-ann/bench.py +++ b/benchmarks-ann/bench.py @@ -140,6 +140,39 @@ INDEX_REGISTRY["baseline"] = { } +# ============================================================================ +# Rescore implementation +# ============================================================================ + + +def _rescore_create_table_sql(params): + quantizer = params.get("quantizer", "bit") + oversample = params.get("oversample", 8) + return ( + f"CREATE VIRTUAL TABLE vec_items USING vec0(" + f" chunk_size=256," + f" id integer primary key," + f" embedding float[768] distance_metric=cosine" + f" indexed by rescore(quantizer={quantizer}, oversample={oversample}))" + ) + + +def _rescore_describe(params): + q = params.get("quantizer", "bit") + os = params.get("oversample", 8) + return f"rescore {q} (os={os})" + + +INDEX_REGISTRY["rescore"] = { + "defaults": {"quantizer": "bit", "oversample": 8}, + 
"create_table_sql": _rescore_create_table_sql, + "insert_sql": None, + "post_insert_hook": None, + "run_query": None, # default MATCH query works — rescore is automatic + "describe": _rescore_describe, +} + + # ============================================================================ # Config parsing # ============================================================================ diff --git a/sqlite-vec-rescore.c b/sqlite-vec-rescore.c new file mode 100644 index 0000000..a45f52f --- /dev/null +++ b/sqlite-vec-rescore.c @@ -0,0 +1,662 @@ +/** + * sqlite-vec-rescore.c — Rescore index logic for sqlite-vec. + * + * This file is #included into sqlite-vec.c after the vec0_vtab definition. + * All functions receive a vec0_vtab *p and access p->vector_columns[i].rescore. + * + * Shadow tables per rescore-enabled vector column: + * _rescore_chunks{NN} — quantized vectors in chunk layout (for coarse scan) + * _rescore_vectors{NN} — float vectors keyed by rowid (for fast rescore lookup) + */ + +// ============================================================================ +// Shadow table lifecycle +// ============================================================================ + +static int rescore_create_tables(vec0_vtab *p, sqlite3 *db, char **pzErr) { + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_RESCORE) + continue; + + // Quantized chunk table (same layout as _vector_chunks) + char *zSql = sqlite3_mprintf( + "CREATE TABLE \"%w\".\"%w_rescore_chunks%02d\"" + "(rowid PRIMARY KEY, vectors BLOB NOT NULL)", + p->schemaName, p->tableName, i); + if (!zSql) + return SQLITE_NOMEM; + sqlite3_stmt *stmt; + int rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + *pzErr = sqlite3_mprintf( + "Could not create '_rescore_chunks%02d' shadow table: %s", i, + sqlite3_errmsg(db)); + sqlite3_finalize(stmt); + return SQLITE_ERROR; + } + 
sqlite3_finalize(stmt); + + // Float vector table (rowid-keyed for fast random access) + zSql = sqlite3_mprintf( + "CREATE TABLE \"%w\".\"%w_rescore_vectors%02d\"" + "(rowid INTEGER PRIMARY KEY, vector BLOB NOT NULL)", + p->schemaName, p->tableName, i); + if (!zSql) + return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + *pzErr = sqlite3_mprintf( + "Could not create '_rescore_vectors%02d' shadow table: %s", i, + sqlite3_errmsg(db)); + sqlite3_finalize(stmt); + return SQLITE_ERROR; + } + sqlite3_finalize(stmt); + } + return SQLITE_OK; +} + +static int rescore_drop_tables(vec0_vtab *p) { + for (int i = 0; i < p->numVectorColumns; i++) { + sqlite3_stmt *stmt; + int rc; + char *zSql; + + if (p->shadowRescoreChunksNames[i]) { + zSql = sqlite3_mprintf("DROP TABLE IF EXISTS \"%w\".\"%w\"", + p->schemaName, p->shadowRescoreChunksNames[i]); + if (!zSql) + return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + sqlite3_finalize(stmt); + return SQLITE_ERROR; + } + sqlite3_finalize(stmt); + } + + if (p->shadowRescoreVectorsNames[i]) { + zSql = sqlite3_mprintf("DROP TABLE IF EXISTS \"%w\".\"%w\"", + p->schemaName, p->shadowRescoreVectorsNames[i]); + if (!zSql) + return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + sqlite3_finalize(stmt); + return SQLITE_ERROR; + } + sqlite3_finalize(stmt); + } + } + return SQLITE_OK; +} + +static size_t rescore_quantized_byte_size(struct VectorColumnDefinition *col) { + switch (col->rescore.quantizer_type) { + case VEC0_RESCORE_QUANTIZER_BIT: + return col->dimensions / CHAR_BIT; + case VEC0_RESCORE_QUANTIZER_INT8: + return col->dimensions; + default: + return 0; + } +} + +/** + * Insert a new chunk row into each 
_rescore_chunks{NN} table with a zeroblob. + */ +static int rescore_new_chunk(vec0_vtab *p, i64 chunk_rowid) { + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_RESCORE) + continue; + size_t quantized_size = + rescore_quantized_byte_size(&p->vector_columns[i]); + i64 blob_size = (i64)p->chunk_size * (i64)quantized_size; + + char *zSql = sqlite3_mprintf( + "INSERT INTO \"%w\".\"%w\"(_rowid_, rowid, vectors) VALUES (?, ?, ?)", + p->schemaName, p->shadowRescoreChunksNames[i]); + if (!zSql) + return SQLITE_NOMEM; + sqlite3_stmt *stmt; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) { + sqlite3_finalize(stmt); + return rc; + } + sqlite3_bind_int64(stmt, 1, chunk_rowid); + sqlite3_bind_int64(stmt, 2, chunk_rowid); + sqlite3_bind_zeroblob64(stmt, 3, blob_size); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) + return rc; + } + return SQLITE_OK; +} + +// ============================================================================ +// Quantization +// ============================================================================ + +static void rescore_quantize_float_to_bit(const float *src, uint8_t *dst, + size_t dimensions) { + memset(dst, 0, dimensions / CHAR_BIT); + for (size_t i = 0; i < dimensions; i++) { + if (src[i] >= 0.0f) { + dst[i / CHAR_BIT] |= (1 << (i % CHAR_BIT)); + } + } +} + +static void rescore_quantize_float_to_int8(const float *src, int8_t *dst, + size_t dimensions) { + float vmin = src[0], vmax = src[0]; + for (size_t i = 1; i < dimensions; i++) { + if (src[i] < vmin) vmin = src[i]; + if (src[i] > vmax) vmax = src[i]; + } + float range = vmax - vmin; + if (range == 0.0f) { + memset(dst, 0, dimensions); + return; + } + float scale = 255.0f / range; + for (size_t i = 0; i < dimensions; i++) { + float v = (src[i] - vmin) * scale - 128.0f; + if (v < -128.0f) v = -128.0f; + if (v > 127.0f) v = 127.0f; + dst[i] = 
(int8_t)v; + } +} + +// ============================================================================ +// Insert path +// ============================================================================ + +/** + * Quantize float vector to _rescore_chunks and store in _rescore_vectors. + */ +static int rescore_on_insert(vec0_vtab *p, i64 chunk_rowid, i64 chunk_offset, + i64 rowid, void *vectorDatas[]) { + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_RESCORE) + continue; + + struct VectorColumnDefinition *col = &p->vector_columns[i]; + size_t qsize = rescore_quantized_byte_size(col); + size_t fsize = vector_column_byte_size(*col); + int rc; + + // 1. Write quantized vector to _rescore_chunks blob + { + void *qbuf = sqlite3_malloc(qsize); + if (!qbuf) + return SQLITE_NOMEM; + + switch (col->rescore.quantizer_type) { + case VEC0_RESCORE_QUANTIZER_BIT: + rescore_quantize_float_to_bit((const float *)vectorDatas[i], + (uint8_t *)qbuf, col->dimensions); + break; + case VEC0_RESCORE_QUANTIZER_INT8: + rescore_quantize_float_to_int8((const float *)vectorDatas[i], + (int8_t *)qbuf, col->dimensions); + break; + } + + sqlite3_blob *blob = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, + p->shadowRescoreChunksNames[i], "vectors", + chunk_rowid, 1, &blob); + if (rc != SQLITE_OK) { + sqlite3_free(qbuf); + return rc; + } + rc = sqlite3_blob_write(blob, qbuf, qsize, chunk_offset * qsize); + sqlite3_free(qbuf); + int brc = sqlite3_blob_close(blob); + if (rc != SQLITE_OK) + return rc; + if (brc != SQLITE_OK) + return brc; + } + + // 2. 
Insert float vector into _rescore_vectors (rowid-keyed) + { + char *zSql = sqlite3_mprintf( + "INSERT INTO \"%w\".\"%w\"(rowid, vector) VALUES (?, ?)", + p->schemaName, p->shadowRescoreVectorsNames[i]); + if (!zSql) + return SQLITE_NOMEM; + sqlite3_stmt *stmt; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) { + sqlite3_finalize(stmt); + return rc; + } + sqlite3_bind_int64(stmt, 1, rowid); + sqlite3_bind_blob(stmt, 2, vectorDatas[i], fsize, SQLITE_TRANSIENT); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) + return SQLITE_ERROR; + } + } + return SQLITE_OK; +} + +// ============================================================================ +// Delete path +// ============================================================================ + +/** + * Zero out quantized vector in _rescore_chunks and delete from _rescore_vectors. + */ +static int rescore_on_delete(vec0_vtab *p, i64 chunk_id, u64 chunk_offset, + i64 rowid) { + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_RESCORE) + continue; + int rc; + + // 1. Zero out quantized data in _rescore_chunks + { + size_t qsize = rescore_quantized_byte_size(&p->vector_columns[i]); + void *zeroBuf = sqlite3_malloc(qsize); + if (!zeroBuf) + return SQLITE_NOMEM; + memset(zeroBuf, 0, qsize); + + sqlite3_blob *blob = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, + p->shadowRescoreChunksNames[i], "vectors", + chunk_id, 1, &blob); + if (rc != SQLITE_OK) { + sqlite3_free(zeroBuf); + return rc; + } + rc = sqlite3_blob_write(blob, zeroBuf, qsize, chunk_offset * qsize); + sqlite3_free(zeroBuf); + int brc = sqlite3_blob_close(blob); + if (rc != SQLITE_OK) + return rc; + if (brc != SQLITE_OK) + return brc; + } + + // 2. 
Delete from _rescore_vectors + { + char *zSql = sqlite3_mprintf( + "DELETE FROM \"%w\".\"%w\" WHERE rowid = ?", + p->schemaName, p->shadowRescoreVectorsNames[i]); + if (!zSql) + return SQLITE_NOMEM; + sqlite3_stmt *stmt; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) + return rc; + sqlite3_bind_int64(stmt, 1, rowid); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) + return SQLITE_ERROR; + } + } + return SQLITE_OK; +} + +/** + * Delete a chunk row from _rescore_chunks{NN} tables. + * (_rescore_vectors rows were already deleted per-row in rescore_on_delete) + */ +static int rescore_delete_chunk(vec0_vtab *p, i64 chunk_id) { + for (int i = 0; i < p->numVectorColumns; i++) { + if (!p->shadowRescoreChunksNames[i]) + continue; + char *zSql = sqlite3_mprintf( + "DELETE FROM \"%w\".\"%w\" WHERE rowid = ?", + p->schemaName, p->shadowRescoreChunksNames[i]); + if (!zSql) + return SQLITE_NOMEM; + sqlite3_stmt *stmt; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) + return rc; + sqlite3_bind_int64(stmt, 1, chunk_id); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) + return SQLITE_ERROR; + } + return SQLITE_OK; +} + +// ============================================================================ +// KNN rescore query +// ============================================================================ + +/** + * Phase 1: Coarse scan of quantized chunks → top k*oversample candidates (rowids). + * Phase 2: For each candidate, blob_open _rescore_vectors by rowid, read float + * vector, compute float distance. Sort, return top k. + * + * Phase 2 is fast because _rescore_vectors has INTEGER PRIMARY KEY, so + * sqlite3_blob_open/reopen addresses rows directly by rowid — no index lookup. 
+ */ +static int rescore_knn(vec0_vtab *p, vec0_cursor *pCur, + struct VectorColumnDefinition *vector_column, + int vectorColumnIdx, struct Array *arrayRowidsIn, + struct Array *aMetadataIn, const char *idxStr, int argc, + sqlite3_value **argv, void *queryVector, i64 k, + struct vec0_query_knn_data *knn_data) { + (void)pCur; + (void)aMetadataIn; + int rc = SQLITE_OK; + int oversample = vector_column->rescore.oversample; + i64 k_oversample = k * oversample; + if (k_oversample > 4096) + k_oversample = 4096; + + size_t qdim = vector_column->dimensions; + size_t qsize = rescore_quantized_byte_size(vector_column); + size_t fsize = vector_column_byte_size(*vector_column); + + // Quantize the query vector + void *quantizedQuery = sqlite3_malloc(qsize); + if (!quantizedQuery) + return SQLITE_NOMEM; + + switch (vector_column->rescore.quantizer_type) { + case VEC0_RESCORE_QUANTIZER_BIT: + rescore_quantize_float_to_bit((const float *)queryVector, + (uint8_t *)quantizedQuery, qdim); + break; + case VEC0_RESCORE_QUANTIZER_INT8: + rescore_quantize_float_to_int8((const float *)queryVector, + (int8_t *)quantizedQuery, qdim); + break; + } + + // Phase 1: Scan quantized chunks for k*oversample candidates + sqlite3_stmt *stmtChunks = NULL; + rc = vec0_chunks_iter(p, idxStr, argc, argv, &stmtChunks); + if (rc != SQLITE_OK) { + sqlite3_free(quantizedQuery); + return rc; + } + + i64 *cand_rowids = sqlite3_malloc(k_oversample * sizeof(i64)); + f32 *cand_distances = sqlite3_malloc(k_oversample * sizeof(f32)); + i64 *tmp_rowids = sqlite3_malloc(k_oversample * sizeof(i64)); + f32 *tmp_distances = sqlite3_malloc(k_oversample * sizeof(f32)); + f32 *chunk_distances = sqlite3_malloc(p->chunk_size * sizeof(f32)); + i32 *chunk_topk_idxs = sqlite3_malloc(k_oversample * sizeof(i32)); + u8 *b = sqlite3_malloc(p->chunk_size / CHAR_BIT); + u8 *bTaken = sqlite3_malloc(p->chunk_size / CHAR_BIT); + u8 *bmRowids = NULL; + void *baseVectors = sqlite3_malloc((i64)p->chunk_size * (i64)qsize); + + if 
(!cand_rowids || !cand_distances || !tmp_rowids || !tmp_distances || + !chunk_distances || !chunk_topk_idxs || !b || !bTaken || !baseVectors) { + rc = SQLITE_NOMEM; + goto cleanup; + } + memset(cand_rowids, 0, k_oversample * sizeof(i64)); + memset(cand_distances, 0, k_oversample * sizeof(f32)); + + if (arrayRowidsIn) { + bmRowids = sqlite3_malloc(p->chunk_size / CHAR_BIT); + if (!bmRowids) { + rc = SQLITE_NOMEM; + goto cleanup; + } + } + + i64 cand_used = 0; + + while (1) { + rc = sqlite3_step(stmtChunks); + if (rc == SQLITE_DONE) + break; + if (rc != SQLITE_ROW) { + rc = SQLITE_ERROR; + goto cleanup; + } + + i64 chunk_id = sqlite3_column_int64(stmtChunks, 0); + unsigned char *chunkValidity = + (unsigned char *)sqlite3_column_blob(stmtChunks, 1); + i64 *chunkRowids = (i64 *)sqlite3_column_blob(stmtChunks, 2); + + memset(chunk_distances, 0, p->chunk_size * sizeof(f32)); + memset(chunk_topk_idxs, 0, k_oversample * sizeof(i32)); + bitmap_copy(b, chunkValidity, p->chunk_size); + + if (arrayRowidsIn) { + bitmap_clear(bmRowids, p->chunk_size); + for (int j = 0; j < p->chunk_size; j++) { + if (!bitmap_get(chunkValidity, j)) + continue; + i64 rid = chunkRowids[j]; + void *found = bsearch(&rid, arrayRowidsIn->z, arrayRowidsIn->length, + sizeof(i64), _cmp); + bitmap_set(bmRowids, j, found ? 
1 : 0); + } + bitmap_and_inplace(b, bmRowids, p->chunk_size); + } + + // Read quantized vectors + sqlite3_blob *blobQ = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, + p->shadowRescoreChunksNames[vectorColumnIdx], + "vectors", chunk_id, 0, &blobQ); + if (rc != SQLITE_OK) + goto cleanup; + rc = sqlite3_blob_read(blobQ, baseVectors, + (i64)p->chunk_size * (i64)qsize, 0); + sqlite3_blob_close(blobQ); + if (rc != SQLITE_OK) + goto cleanup; + + // Compute quantized distances + for (int j = 0; j < p->chunk_size; j++) { + if (!bitmap_get(b, j)) + continue; + f32 dist; + switch (vector_column->rescore.quantizer_type) { + case VEC0_RESCORE_QUANTIZER_BIT: { + const u8 *base_j = ((u8 *)baseVectors) + (j * (qdim / CHAR_BIT)); + dist = distance_hamming(base_j, (u8 *)quantizedQuery, &qdim); + break; + } + case VEC0_RESCORE_QUANTIZER_INT8: { + const i8 *base_j = ((i8 *)baseVectors) + (j * qdim); + switch (vector_column->distance_metric) { + case VEC0_DISTANCE_METRIC_L2: + dist = distance_l2_sqr_int8(base_j, (i8 *)quantizedQuery, &qdim); + break; + case VEC0_DISTANCE_METRIC_COSINE: + dist = distance_cosine_int8(base_j, (i8 *)quantizedQuery, &qdim); + break; + case VEC0_DISTANCE_METRIC_L1: + dist = (f32)distance_l1_int8(base_j, (i8 *)quantizedQuery, &qdim); + break; + } + break; + } + } + chunk_distances[j] = dist; + } + + int used1; + min_idx(chunk_distances, p->chunk_size, b, chunk_topk_idxs, + min(k_oversample, p->chunk_size), bTaken, &used1); + + i64 merged_used; + merge_sorted_lists(cand_distances, cand_rowids, cand_used, chunk_distances, + chunkRowids, chunk_topk_idxs, + min(min(k_oversample, p->chunk_size), used1), + tmp_distances, tmp_rowids, k_oversample, &merged_used); + + for (i64 j = 0; j < merged_used; j++) { + cand_rowids[j] = tmp_rowids[j]; + cand_distances[j] = tmp_distances[j]; + } + cand_used = merged_used; + } + rc = SQLITE_OK; + + // Phase 2: Rescore candidates using _rescore_vectors (rowid-keyed) + if (cand_used == 0) { + knn_data->current_idx = 0; + 
knn_data->k = 0; + knn_data->rowids = NULL; + knn_data->distances = NULL; + knn_data->k_used = 0; + goto cleanup; + } + { + f32 *float_distances = sqlite3_malloc(cand_used * sizeof(f32)); + void *fBuf = sqlite3_malloc(fsize); + if (!float_distances || !fBuf) { + sqlite3_free(float_distances); + sqlite3_free(fBuf); + rc = SQLITE_NOMEM; + goto cleanup; + } + + // Open blob on _rescore_vectors, then reopen for each candidate rowid. + // blob_reopen is O(1) for INTEGER PRIMARY KEY tables. + sqlite3_blob *blobFloat = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, + p->shadowRescoreVectorsNames[vectorColumnIdx], + "vector", cand_rowids[0], 0, &blobFloat); + if (rc != SQLITE_OK) { + sqlite3_free(float_distances); + sqlite3_free(fBuf); + goto cleanup; + } + + rc = sqlite3_blob_read(blobFloat, fBuf, fsize, 0); + if (rc != SQLITE_OK) { + sqlite3_blob_close(blobFloat); + sqlite3_free(float_distances); + sqlite3_free(fBuf); + goto cleanup; + } + float_distances[0] = + vec0_distance_full(fBuf, queryVector, vector_column->dimensions, + vector_column->element_type, + vector_column->distance_metric); + + for (i64 j = 1; j < cand_used; j++) { + rc = sqlite3_blob_reopen(blobFloat, cand_rowids[j]); + if (rc != SQLITE_OK) { + sqlite3_blob_close(blobFloat); + sqlite3_free(float_distances); + sqlite3_free(fBuf); + goto cleanup; + } + rc = sqlite3_blob_read(blobFloat, fBuf, fsize, 0); + if (rc != SQLITE_OK) { + sqlite3_blob_close(blobFloat); + sqlite3_free(float_distances); + sqlite3_free(fBuf); + goto cleanup; + } + float_distances[j] = + vec0_distance_full(fBuf, queryVector, vector_column->dimensions, + vector_column->element_type, + vector_column->distance_metric); + } + sqlite3_blob_close(blobFloat); + sqlite3_free(fBuf); + + // Sort by float distance + for (i64 a = 0; a + 1 < cand_used; a++) { + i64 minIdx = a; + for (i64 c = a + 1; c < cand_used; c++) { + if (float_distances[c] < float_distances[minIdx]) + minIdx = c; + } + if (minIdx != a) { + f32 td = float_distances[a]; + 
float_distances[a] = float_distances[minIdx]; + float_distances[minIdx] = td; + i64 tr = cand_rowids[a]; + cand_rowids[a] = cand_rowids[minIdx]; + cand_rowids[minIdx] = tr; + } + } + + i64 result_k = min(k, cand_used); + i64 *out_rowids = sqlite3_malloc(result_k * sizeof(i64)); + f32 *out_distances = sqlite3_malloc(result_k * sizeof(f32)); + if (!out_rowids || !out_distances) { + sqlite3_free(out_rowids); + sqlite3_free(out_distances); + sqlite3_free(float_distances); + rc = SQLITE_NOMEM; + goto cleanup; + } + for (i64 j = 0; j < result_k; j++) { + out_rowids[j] = cand_rowids[j]; + out_distances[j] = float_distances[j]; + } + + knn_data->current_idx = 0; + knn_data->k = result_k; + knn_data->rowids = out_rowids; + knn_data->distances = out_distances; + knn_data->k_used = result_k; + + sqlite3_free(float_distances); + } + +cleanup: + sqlite3_finalize(stmtChunks); + sqlite3_free(quantizedQuery); + sqlite3_free(cand_rowids); + sqlite3_free(cand_distances); + sqlite3_free(tmp_rowids); + sqlite3_free(tmp_distances); + sqlite3_free(chunk_distances); + sqlite3_free(chunk_topk_idxs); + sqlite3_free(b); + sqlite3_free(bTaken); + sqlite3_free(bmRowids); + sqlite3_free(baseVectors); + return rc; +} + +#ifdef SQLITE_VEC_TEST +void _test_rescore_quantize_float_to_bit(const float *src, uint8_t *dst, size_t dim) { + rescore_quantize_float_to_bit(src, dst, dim); +} +void _test_rescore_quantize_float_to_int8(const float *src, int8_t *dst, size_t dim) { + rescore_quantize_float_to_int8(src, dst, dim); +} +size_t _test_rescore_quantized_byte_size_bit(size_t dimensions) { + struct VectorColumnDefinition col; + memset(&col, 0, sizeof(col)); + col.dimensions = dimensions; + col.rescore.quantizer_type = VEC0_RESCORE_QUANTIZER_BIT; + return rescore_quantized_byte_size(&col); +} +size_t _test_rescore_quantized_byte_size_int8(size_t dimensions) { + struct VectorColumnDefinition col; + memset(&col, 0, sizeof(col)); + col.dimensions = dimensions; + col.rescore.quantizer_type = 
VEC0_RESCORE_QUANTIZER_INT8; + return rescore_quantized_byte_size(&col); +} +#endif diff --git a/sqlite-vec.c b/sqlite-vec.c index 390123b..ff9e0da 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -112,6 +112,10 @@ typedef size_t usize; #define countof(x) (sizeof(x) / sizeof((x)[0])) #define min(a, b) (((a) <= (b)) ? (a) : (b)) +#ifndef SQLITE_VEC_ENABLE_RESCORE +#define SQLITE_VEC_ENABLE_RESCORE 1 +#endif + enum VectorElementType { // clang-format off SQLITE_VEC_ELEMENT_TYPE_FLOAT32 = 223 + 0, @@ -2532,8 +2536,23 @@ static f32 vec0_distance_full( enum Vec0IndexType { VEC0_INDEX_TYPE_FLAT = 1, +#if SQLITE_VEC_ENABLE_RESCORE + VEC0_INDEX_TYPE_RESCORE = 2, +#endif }; +#if SQLITE_VEC_ENABLE_RESCORE +enum Vec0RescoreQuantizerType { + VEC0_RESCORE_QUANTIZER_BIT = 1, + VEC0_RESCORE_QUANTIZER_INT8 = 2, +}; + +struct Vec0RescoreConfig { + enum Vec0RescoreQuantizerType quantizer_type; + int oversample; +}; +#endif + struct VectorColumnDefinition { char *name; int name_length; @@ -2541,6 +2560,9 @@ struct VectorColumnDefinition { enum VectorElementType element_type; enum Vec0DistanceMetrics distance_metric; enum Vec0IndexType index_type; +#if SQLITE_VEC_ENABLE_RESCORE + struct Vec0RescoreConfig rescore; +#endif }; struct Vec0PartitionColumnDefinition { @@ -2577,6 +2599,111 @@ size_t vector_column_byte_size(struct VectorColumnDefinition column) { return vector_byte_size(column.element_type, column.dimensions); } +#if SQLITE_VEC_ENABLE_RESCORE +/** + * @brief Parse rescore options from an "INDEXED BY rescore(...)" clause. + * + * @param scanner Scanner positioned right after the opening '(' of rescore(...) + * @param outConfig Output rescore config + * @param pzErr Error message output + * @return int SQLITE_OK on success, SQLITE_ERROR on error. 
+ */ +static int vec0_parse_rescore_options(struct Vec0Scanner *scanner, + struct Vec0RescoreConfig *outConfig, + char **pzErr) { + struct Vec0Token token; + int rc; + int hasQuantizer = 0; + outConfig->oversample = 8; + outConfig->quantizer_type = 0; + + while (1) { + rc = vec0_scanner_next(scanner, &token); + if (rc == VEC0_TOKEN_RESULT_EOF) { + break; + } + // ')' closes rescore options + if (rc == VEC0_TOKEN_RESULT_SOME && token.token_type == TOKEN_TYPE_RPAREN) { + break; + } + if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_IDENTIFIER) { + *pzErr = sqlite3_mprintf("Expected option name in rescore(...)"); + return SQLITE_ERROR; + } + + char *key = token.start; + int keyLength = token.end - token.start; + + // expect '=' + rc = vec0_scanner_next(scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_EQ) { + *pzErr = sqlite3_mprintf("Expected '=' after option name in rescore(...)"); + return SQLITE_ERROR; + } + + // value + rc = vec0_scanner_next(scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME) { + *pzErr = sqlite3_mprintf("Expected value after '=' in rescore(...)"); + return SQLITE_ERROR; + } + + if (sqlite3_strnicmp(key, "quantizer", keyLength) == 0) { + if (token.token_type != TOKEN_TYPE_IDENTIFIER) { + *pzErr = sqlite3_mprintf("Expected identifier for quantizer value in rescore(...)"); + return SQLITE_ERROR; + } + int valLen = token.end - token.start; + if (sqlite3_strnicmp(token.start, "bit", valLen) == 0) { + outConfig->quantizer_type = VEC0_RESCORE_QUANTIZER_BIT; + } else if (sqlite3_strnicmp(token.start, "int8", valLen) == 0) { + outConfig->quantizer_type = VEC0_RESCORE_QUANTIZER_INT8; + } else { + *pzErr = sqlite3_mprintf("Unknown quantizer type '%.*s' in rescore(...). 
Expected 'bit' or 'int8'.", valLen, token.start); + return SQLITE_ERROR; + } + hasQuantizer = 1; + } else if (sqlite3_strnicmp(key, "oversample", keyLength) == 0) { + if (token.token_type != TOKEN_TYPE_DIGIT) { + *pzErr = sqlite3_mprintf("Expected integer for oversample value in rescore(...)"); + return SQLITE_ERROR; + } + outConfig->oversample = atoi(token.start); + if (outConfig->oversample <= 0 || outConfig->oversample > 128) { + *pzErr = sqlite3_mprintf("oversample in rescore(...) must be between 1 and 128, got %d", outConfig->oversample); + return SQLITE_ERROR; + } + } else { + *pzErr = sqlite3_mprintf("Unknown option '%.*s' in rescore(...)", keyLength, key); + return SQLITE_ERROR; + } + + // optional comma between options + rc = vec0_scanner_next(scanner, &token); + if (rc == VEC0_TOKEN_RESULT_EOF) { + break; + } + if (rc == VEC0_TOKEN_RESULT_SOME && token.token_type == TOKEN_TYPE_RPAREN) { + break; + } + if (rc == VEC0_TOKEN_RESULT_SOME && token.token_type == TOKEN_TYPE_COMMA) { + continue; + } + // If it's not a comma or rparen, it might be the next key — push back isn't + // possible with this scanner, so we'll treat unexpected tokens as errors + *pzErr = sqlite3_mprintf("Unexpected token in rescore(...) options"); + return SQLITE_ERROR; + } + + if (!hasQuantizer) { + *pzErr = sqlite3_mprintf("rescore(...) requires a 'quantizer' option (quantizer=bit or quantizer=int8)"); + return SQLITE_ERROR; + } + + return SQLITE_OK; +} +#endif /* SQLITE_VEC_ENABLE_RESCORE */ + /** * @brief Parse an vec0 vtab argv[i] column definition and see if * it's a vector column defintion, ex `contents_embedding float[768]`. 
@@ -2601,6 +2728,10 @@ int vec0_parse_vector_column(const char *source, int source_length, enum VectorElementType elementType; enum Vec0DistanceMetrics distanceMetric = VEC0_DISTANCE_METRIC_L2; enum Vec0IndexType indexType = VEC0_INDEX_TYPE_FLAT; +#if SQLITE_VEC_ENABLE_RESCORE + struct Vec0RescoreConfig rescoreConfig; + memset(&rescoreConfig, 0, sizeof(rescoreConfig)); +#endif int dimensions; vec0_scanner_init(&scanner, source, source_length); @@ -2704,6 +2835,7 @@ int vec0_parse_vector_column(const char *source, int source_length, return SQLITE_ERROR; } } + // INDEXED BY flat() | rescore(...) else if (sqlite3_strnicmp(key, "indexed", keyLength) == 0) { // expect "by" rc = vec0_scanner_next(&scanner, &token); @@ -2733,7 +2865,32 @@ int vec0_parse_vector_column(const char *source, int source_length, token.token_type != TOKEN_TYPE_RPAREN) { return SQLITE_ERROR; } - } else { + } +#if SQLITE_VEC_ENABLE_RESCORE + else if (sqlite3_strnicmp(token.start, "rescore", indexNameLen) == 0) { + indexType = VEC0_INDEX_TYPE_RESCORE; + if (elementType != SQLITE_VEC_ELEMENT_TYPE_FLOAT32) { + return SQLITE_ERROR; + } + // expect '(' + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_LPAREN) { + return SQLITE_ERROR; + } + char *rescoreErr = NULL; + rc = vec0_parse_rescore_options(&scanner, &rescoreConfig, &rescoreErr); + if (rc != SQLITE_OK) { + if (rescoreErr) sqlite3_free(rescoreErr); + return SQLITE_ERROR; + } + // validate dimensions for bit quantizer + if (rescoreConfig.quantizer_type == VEC0_RESCORE_QUANTIZER_BIT && + (dimensions % CHAR_BIT) != 0) { + return SQLITE_ERROR; + } + } +#endif + else { // unknown index type return SQLITE_ERROR; } @@ -2753,6 +2910,9 @@ int vec0_parse_vector_column(const char *source, int source_length, outColumn->element_type = elementType; outColumn->dimensions = dimensions; outColumn->index_type = indexType; +#if SQLITE_VEC_ENABLE_RESCORE + outColumn->rescore = rescoreConfig; +#endif 
return SQLITE_OK; } @@ -3093,6 +3253,19 @@ struct vec0_vtab { // The first numVectorColumns entries must be freed with sqlite3_free() char *shadowVectorChunksNames[VEC0_MAX_VECTOR_COLUMNS]; +#if SQLITE_VEC_ENABLE_RESCORE + // Name of all rescore chunk shadow tables, ie `_rescore_chunks00` + // Only populated for vector columns with rescore enabled. + // Must be freed with sqlite3_free() + char *shadowRescoreChunksNames[VEC0_MAX_VECTOR_COLUMNS]; + + // Name of all rescore vector shadow tables, ie `_rescore_vectors00` + // Rowid-keyed table for fast random-access float vector reads during rescore. + // Only populated for vector columns with rescore enabled. + // Must be freed with sqlite3_free() + char *shadowRescoreVectorsNames[VEC0_MAX_VECTOR_COLUMNS]; +#endif + // Name of all metadata chunk shadow tables, ie `_metadatachunks00` // Only the first numMetadataColumns entries will be available. // The first numMetadataColumns entries must be freed with sqlite3_free() @@ -3162,6 +3335,18 @@ struct vec0_vtab { sqlite3_stmt *stmtRowidsGetChunkPosition; }; +#if SQLITE_VEC_ENABLE_RESCORE +// Forward declarations for rescore functions (defined in sqlite-vec-rescore.c, +// included later after all helpers they depend on are defined). +static int rescore_create_tables(vec0_vtab *p, sqlite3 *db, char **pzErr); +static int rescore_drop_tables(vec0_vtab *p); +static int rescore_new_chunk(vec0_vtab *p, i64 chunk_rowid); +static int rescore_on_insert(vec0_vtab *p, i64 chunk_rowid, i64 chunk_offset, + i64 rowid, void *vectorDatas[]); +static int rescore_on_delete(vec0_vtab *p, i64 chunk_id, u64 chunk_offset, i64 rowid); +static int rescore_delete_chunk(vec0_vtab *p, i64 chunk_id); +#endif + /** * @brief Finalize all the sqlite3_stmt members in a vec0_vtab. 
* @@ -3201,6 +3386,14 @@ void vec0_free(vec0_vtab *p) { sqlite3_free(p->shadowVectorChunksNames[i]); p->shadowVectorChunksNames[i] = NULL; +#if SQLITE_VEC_ENABLE_RESCORE + sqlite3_free(p->shadowRescoreChunksNames[i]); + p->shadowRescoreChunksNames[i] = NULL; + + sqlite3_free(p->shadowRescoreVectorsNames[i]); + p->shadowRescoreVectorsNames[i] = NULL; +#endif + sqlite3_free(p->vector_columns[i].name); p->vector_columns[i].name = NULL; } @@ -3493,6 +3686,41 @@ int vec0_get_vector_data(vec0_vtab *pVtab, i64 rowid, int vector_column_idx, assert((vector_column_idx >= 0) && (vector_column_idx < pVtab->numVectorColumns)); +#if SQLITE_VEC_ENABLE_RESCORE + // Rescore columns store float vectors in _rescore_vectors (rowid-keyed) + if (p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_RESCORE) { + size = vector_column_byte_size(p->vector_columns[vector_column_idx]); + rc = sqlite3_blob_open(p->db, p->schemaName, + p->shadowRescoreVectorsNames[vector_column_idx], + "vector", rowid, 0, &vectorBlob); + if (rc != SQLITE_OK) { + vtab_set_error(&pVtab->base, + "Could not fetch vector data for %lld from rescore vectors", + rowid); + rc = SQLITE_ERROR; + goto cleanup; + } + buf = sqlite3_malloc(size); + if (!buf) { + rc = SQLITE_NOMEM; + goto cleanup; + } + rc = sqlite3_blob_read(vectorBlob, buf, size, 0); + if (rc != SQLITE_OK) { + sqlite3_free(buf); + buf = NULL; + rc = SQLITE_ERROR; + goto cleanup; + } + *outVector = buf; + if (outVectorSize) { + *outVectorSize = size; + } + rc = SQLITE_OK; + goto cleanup; + } +#endif /* SQLITE_VEC_ENABLE_RESCORE */ + rc = vec0_get_chunk_position(pVtab, rowid, NULL, &chunk_id, &chunk_offset); if (rc == SQLITE_EMPTY) { vtab_set_error(&pVtab->base, "Could not find a row with rowid %lld", rowid); @@ -4096,6 +4324,14 @@ int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk continue; } int vector_column_idx = p->user_column_idxs[i]; + +#if SQLITE_VEC_ENABLE_RESCORE + // Rescore columns don't use _vector_chunks 
for float storage + if (p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_RESCORE) { + continue; + } +#endif + i64 vectorsSize = p->chunk_size * vector_column_byte_size(p->vector_columns[vector_column_idx]); @@ -4126,6 +4362,14 @@ int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk } } +#if SQLITE_VEC_ENABLE_RESCORE + // Create new rescore chunks for each rescore-enabled vector column + rc = rescore_new_chunk(p, rowid); + if (rc != SQLITE_OK) { + return rc; + } +#endif + // Step 3: Create new metadata chunks for each metadata column for (int i = 0; i < vec0_num_defined_user_columns(p); i++) { if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_METADATA) { @@ -4487,6 +4731,35 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, goto error; } +#if SQLITE_VEC_ENABLE_RESCORE + { + int hasRescore = 0; + for (int i = 0; i < numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) { + hasRescore = 1; + break; + } + } + if (hasRescore) { + if (numAuxiliaryColumns > 0) { + *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR + "Auxiliary columns are not supported with rescore indexes"); + goto error; + } + if (numMetadataColumns > 0) { + *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR + "Metadata columns are not supported with rescore indexes"); + goto error; + } + if (numPartitionColumns > 0) { + *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR + "Partition key columns are not supported with rescore indexes"); + goto error; + } + } + } +#endif + sqlite3_str *createStr = sqlite3_str_new(NULL); sqlite3_str_appendall(createStr, "CREATE TABLE x("); if (pkColumnName) { @@ -4577,6 +4850,20 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, if (!pNew->shadowVectorChunksNames[i]) { goto error; } +#if SQLITE_VEC_ENABLE_RESCORE + if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) { + pNew->shadowRescoreChunksNames[i] = + 
sqlite3_mprintf("%s_rescore_chunks%02d", tableName, i); + if (!pNew->shadowRescoreChunksNames[i]) { + goto error; + } + pNew->shadowRescoreVectorsNames[i] = + sqlite3_mprintf("%s_rescore_vectors%02d", tableName, i); + if (!pNew->shadowRescoreVectorsNames[i]) { + goto error; + } + } +#endif } for (int i = 0; i < pNew->numMetadataColumns; i++) { pNew->shadowMetadataChunksNames[i] = @@ -4700,6 +4987,11 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, sqlite3_finalize(stmt); for (int i = 0; i < pNew->numVectorColumns; i++) { +#if SQLITE_VEC_ENABLE_RESCORE + // Rescore columns don't use _vector_chunks + if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) + continue; +#endif char *zSql = sqlite3_mprintf(VEC0_SHADOW_VECTOR_N_CREATE, pNew->schemaName, pNew->tableName, i); if (!zSql) { @@ -4718,6 +5010,13 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, sqlite3_finalize(stmt); } +#if SQLITE_VEC_ENABLE_RESCORE + rc = rescore_create_tables(pNew, db, pzErr); + if (rc != SQLITE_OK) { + goto error; + } +#endif + // See SHADOW_TABLE_ROWID_QUIRK in vec0_new_chunk() — same "rowid PRIMARY KEY" // without INTEGER type issue applies here. 
for (int i = 0; i < pNew->numMetadataColumns; i++) { @@ -4852,6 +5151,10 @@ static int vec0Destroy(sqlite3_vtab *pVtab) { sqlite3_finalize(stmt); for (int i = 0; i < p->numVectorColumns; i++) { +#if SQLITE_VEC_ENABLE_RESCORE + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) + continue; +#endif zSql = sqlite3_mprintf("DROP TABLE \"%w\".\"%w\"", p->schemaName, p->shadowVectorChunksNames[i]); rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); @@ -4863,6 +5166,13 @@ static int vec0Destroy(sqlite3_vtab *pVtab) { sqlite3_finalize(stmt); } +#if SQLITE_VEC_ENABLE_RESCORE + rc = rescore_drop_tables(p); + if (rc != SQLITE_OK) { + goto done; + } +#endif + if(p->numAuxiliaryColumns > 0) { zSql = sqlite3_mprintf("DROP TABLE " VEC0_SHADOW_AUXILIARY_NAME, p->schemaName, p->tableName); rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); @@ -6624,6 +6934,10 @@ cleanup: return rc; } +#if SQLITE_VEC_ENABLE_RESCORE +#include "sqlite-vec-rescore.c" +#endif + int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, const char *idxStr, int argc, sqlite3_value **argv) { assert(argc == (strlen(idxStr)-1) / 4); @@ -6856,6 +7170,21 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, } #endif +#if SQLITE_VEC_ENABLE_RESCORE + // Dispatch to rescore KNN path if this vector column has rescore enabled + if (vector_column->index_type == VEC0_INDEX_TYPE_RESCORE) { + rc = rescore_knn(p, pCur, vector_column, vectorColumnIdx, arrayRowidsIn, + aMetadataIn, idxStr, argc, argv, queryVector, k, knn_data); + if (rc != SQLITE_OK) { + goto cleanup; + } + pCur->knn_data = knn_data; + pCur->query_plan = VEC0_QUERY_PLAN_KNN; + rc = SQLITE_OK; + goto cleanup; + } +#endif + rc = vec0_chunks_iter(p, idxStr, argc, argv, &stmtChunks); if (rc != SQLITE_OK) { // IMP: V06942_23781 @@ -7680,6 +8009,12 @@ int vec0Update_InsertWriteFinalStep(vec0_vtab *p, i64 chunk_rowid, // Go insert the vector data into the vector chunk shadow tables for (int i = 0; i < p->numVectorColumns; i++) { 
+#if SQLITE_VEC_ENABLE_RESCORE + // Rescore columns store float vectors in _rescore_vectors instead + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) + continue; +#endif + sqlite3_blob *blobVectors; rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowVectorChunksNames[i], "vectors", chunk_rowid, 1, &blobVectors); @@ -8082,6 +8417,13 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, goto cleanup; } +#if SQLITE_VEC_ENABLE_RESCORE + rc = rescore_on_insert(p, chunk_rowid, chunk_offset, rowid, vectorDatas); + if (rc != SQLITE_OK) { + goto cleanup; + } +#endif + if(p->numAuxiliaryColumns > 0) { sqlite3_stmt *stmt; sqlite3_str * s = sqlite3_str_new(NULL); @@ -8272,6 +8614,11 @@ int vec0Update_Delete_ClearVectors(vec0_vtab *p, i64 chunk_id, u64 chunk_offset) { int rc, brc; for (int i = 0; i < p->numVectorColumns; i++) { +#if SQLITE_VEC_ENABLE_RESCORE + // Rescore columns don't use _vector_chunks + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) + continue; +#endif sqlite3_blob *blobVectors = NULL; size_t n = vector_column_byte_size(p->vector_columns[i]); @@ -8383,6 +8730,10 @@ int vec0Update_Delete_DeleteChunkIfEmpty(vec0_vtab *p, i64 chunk_id, // Delete from each _vector_chunksNN for (int i = 0; i < p->numVectorColumns; i++) { +#if SQLITE_VEC_ENABLE_RESCORE + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) + continue; +#endif zSql = sqlite3_mprintf( "DELETE FROM " VEC0_SHADOW_VECTOR_N_NAME " WHERE rowid = ?", p->schemaName, p->tableName, i); @@ -8399,6 +8750,12 @@ int vec0Update_Delete_DeleteChunkIfEmpty(vec0_vtab *p, i64 chunk_id, return SQLITE_ERROR; } +#if SQLITE_VEC_ENABLE_RESCORE + rc = rescore_delete_chunk(p, chunk_id); + if (rc != SQLITE_OK) + return rc; +#endif + // Delete from each _metadatachunksNN for (int i = 0; i < p->numMetadataColumns; i++) { zSql = sqlite3_mprintf( @@ -8606,6 +8963,14 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { return rc; } +#if 
SQLITE_VEC_ENABLE_RESCORE + // 4b. zero out quantized data in rescore chunk tables, delete from rescore vectors + rc = rescore_on_delete(p, chunk_id, chunk_offset, rowid); + if (rc != SQLITE_OK) { + return rc; + } +#endif + // 5. delete from _rowids table rc = vec0Update_Delete_DeleteRowids(p, rowid); if (rc != SQLITE_OK) { @@ -8663,8 +9028,11 @@ int vec0Update_UpdateAuxColumn(vec0_vtab *p, int auxiliary_column_idx, sqlite3_v } int vec0Update_UpdateVectorColumn(vec0_vtab *p, i64 chunk_id, i64 chunk_offset, - int i, sqlite3_value *valueVector) { + int i, sqlite3_value *valueVector, i64 rowid) { int rc; +#if !SQLITE_VEC_ENABLE_RESCORE + UNUSED_PARAMETER(rowid); +#endif sqlite3_blob *blobVectors = NULL; @@ -8708,6 +9076,59 @@ int vec0Update_UpdateVectorColumn(vec0_vtab *p, i64 chunk_id, i64 chunk_offset, goto cleanup; } +#if SQLITE_VEC_ENABLE_RESCORE + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) { + // For rescore columns, update _rescore_vectors and _rescore_chunks + struct VectorColumnDefinition *col = &p->vector_columns[i]; + size_t qsize = rescore_quantized_byte_size(col); + size_t fsize = vector_column_byte_size(*col); + + // 1. 
Update quantized chunk + { + void *qbuf = sqlite3_malloc(qsize); + if (!qbuf) { rc = SQLITE_NOMEM; goto cleanup; } + switch (col->rescore.quantizer_type) { + case VEC0_RESCORE_QUANTIZER_BIT: + rescore_quantize_float_to_bit((const float *)vector, (uint8_t *)qbuf, col->dimensions); + break; + case VEC0_RESCORE_QUANTIZER_INT8: + rescore_quantize_float_to_int8((const float *)vector, (int8_t *)qbuf, col->dimensions); + break; + } + sqlite3_blob *blobQ = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, + p->shadowRescoreChunksNames[i], "vectors", + chunk_id, 1, &blobQ); + if (rc != SQLITE_OK) { sqlite3_free(qbuf); goto cleanup; } + rc = sqlite3_blob_write(blobQ, qbuf, qsize, chunk_offset * qsize); + sqlite3_free(qbuf); + int brc2 = sqlite3_blob_close(blobQ); + if (rc != SQLITE_OK) goto cleanup; + if (brc2 != SQLITE_OK) { rc = brc2; goto cleanup; } + } + + // 2. Update float vector in _rescore_vectors (keyed by user rowid) + { + char *zSql = sqlite3_mprintf( + "UPDATE \"%w\".\"%w\" SET vector = ? 
WHERE rowid = ?", + p->schemaName, p->shadowRescoreVectorsNames[i]); + if (!zSql) { rc = SQLITE_NOMEM; goto cleanup; } + sqlite3_stmt *stmtUp; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtUp, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) goto cleanup; + sqlite3_bind_blob(stmtUp, 1, vector, fsize, SQLITE_TRANSIENT); + sqlite3_bind_int64(stmtUp, 2, rowid); + rc = sqlite3_step(stmtUp); + sqlite3_finalize(stmtUp); + if (rc != SQLITE_DONE) { rc = SQLITE_ERROR; goto cleanup; } + } + + rc = SQLITE_OK; + goto cleanup; + } +#endif + rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowVectorChunksNames[i], "vectors", chunk_id, 1, &blobVectors); if (rc != SQLITE_OK) { @@ -8839,7 +9260,7 @@ int vec0Update_Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv) { } rc = vec0Update_UpdateVectorColumn(p, chunk_id, chunk_offset, vector_idx, - valueVector); + valueVector, rowid); if (rc != SQLITE_OK) { return SQLITE_ERROR; } @@ -8997,9 +9418,15 @@ static sqlite3_module vec0Module = { #else #define SQLITE_VEC_DEBUG_BUILD_NEON "" #endif +#if SQLITE_VEC_ENABLE_RESCORE +#define SQLITE_VEC_DEBUG_BUILD_RESCORE "rescore" +#else +#define SQLITE_VEC_DEBUG_BUILD_RESCORE "" +#endif #define SQLITE_VEC_DEBUG_BUILD \ - SQLITE_VEC_DEBUG_BUILD_AVX " " SQLITE_VEC_DEBUG_BUILD_NEON + SQLITE_VEC_DEBUG_BUILD_AVX " " SQLITE_VEC_DEBUG_BUILD_NEON " " \ + SQLITE_VEC_DEBUG_BUILD_RESCORE #define SQLITE_VEC_DEBUG_STRING \ "Version: " SQLITE_VEC_VERSION "\n" \ diff --git a/tests/fuzz/.gitignore b/tests/fuzz/.gitignore index 757d1ac..b9c7d30 100644 --- a/tests/fuzz/.gitignore +++ b/tests/fuzz/.gitignore @@ -1,2 +1,7 @@ *.dSYM targets/ +corpus/ +crash-* +leak-* +timeout-* +*.log diff --git a/tests/fuzz/Makefile b/tests/fuzz/Makefile index 21629ef..0030c2e 100644 --- a/tests/fuzz/Makefile +++ b/tests/fuzz/Makefile @@ -72,10 +72,34 @@ $(TARGET_DIR)/vec_mismatch: vec-mismatch.c $(FUZZ_SRCS) | $(TARGET_DIR) $(TARGET_DIR)/vec0_delete_completeness: vec0-delete-completeness.c $(FUZZ_SRCS) | $(TARGET_DIR) 
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ +$(TARGET_DIR)/rescore_operations: rescore-operations.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/rescore_create: rescore-create.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/rescore_quantize: rescore-quantize.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE -DSQLITE_VEC_TEST $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/rescore_shadow_corrupt: rescore-shadow-corrupt.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/rescore_knn_deep: rescore-knn-deep.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/rescore_quantize_edge: rescore-quantize-edge.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE -DSQLITE_VEC_TEST $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/rescore_interleave: rescore-interleave.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE $(FUZZ_SRCS) $< -o $@ + FUZZ_TARGETS = vec0_create exec json numpy \ shadow_corrupt vec0_operations scalar_functions \ vec0_create_full metadata_columns vec_each vec_mismatch \ - vec0_delete_completeness + vec0_delete_completeness \ + rescore_operations rescore_create rescore_quantize \ + rescore_shadow_corrupt rescore_knn_deep \ + rescore_quantize_edge rescore_interleave all: $(addprefix $(TARGET_DIR)/,$(FUZZ_TARGETS)) diff --git a/tests/fuzz/rescore-create.c b/tests/fuzz/rescore-create.c new file mode 100644 index 0000000..3e69d6d --- /dev/null +++ b/tests/fuzz/rescore-create.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + int rc 
= SQLITE_OK; + sqlite3 *db; + sqlite3_stmt *stmt; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + sqlite3_str *s = sqlite3_str_new(NULL); + assert(s); + sqlite3_str_appendall(s, "CREATE VIRTUAL TABLE v USING vec0(emb float[128] indexed by rescore("); + sqlite3_str_appendf(s, "%.*s", (int)size, data); + sqlite3_str_appendall(s, "))"); + const char *zSql = sqlite3_str_finish(s); + assert(zSql); + + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, NULL); + sqlite3_free((void *)zSql); + if (rc == SQLITE_OK) { + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/rescore-create.dict b/tests/fuzz/rescore-create.dict new file mode 100644 index 0000000..a8adf71 --- /dev/null +++ b/tests/fuzz/rescore-create.dict @@ -0,0 +1,20 @@ +"rescore" +"quantizer" +"bit" +"int8" +"oversample" +"indexed" +"by" +"float" +"(" +")" +"," +"=" +"[" +"]" +"1" +"8" +"16" +"128" +"256" +"1024" diff --git a/tests/fuzz/rescore-interleave.c b/tests/fuzz/rescore-interleave.c new file mode 100644 index 0000000..74e8b8d --- /dev/null +++ b/tests/fuzz/rescore-interleave.c @@ -0,0 +1,151 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/** + * Fuzz target: interleaved insert/update/delete/KNN operations on rescore + * tables with BOTH quantizer types, exercising the int8 quantizer path + * and the update code path that the existing rescore-operations.c misses. 
+ * + * Key differences from rescore-operations.c: + * - Tests BOTH bit and int8 quantizers (the existing target only tests bit) + * - Fuzz-controlled query vectors (not fixed [1,0,0,...]) + * - Exercises the UPDATE path (line 9080+ in sqlite-vec.c) + * - Tests with 16 dimensions (more realistic, exercises more of the + * quantization loop) + * - Interleaves KNN between mutations to stress the blob_reopen path + * when _rescore_vectors rows have been deleted/modified + */ +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 8) return 0; + + int rc; + sqlite3 *db; + sqlite3_stmt *stmtInsert = NULL; + sqlite3_stmt *stmtUpdate = NULL; + sqlite3_stmt *stmtDelete = NULL; + sqlite3_stmt *stmtKnn = NULL; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* Use first byte to pick quantizer */ + int use_int8 = data[0] & 1; + data++; size--; + + const char *create_sql = use_int8 + ? "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] indexed by rescore(quantizer=int8))" + : "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] indexed by rescore(quantizer=bit))"; + + rc = sqlite3_exec(db, create_sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "UPDATE v SET emb = ? WHERE rowid = ?", -1, &stmtUpdate, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
" + "ORDER BY distance LIMIT 5", -1, &stmtKnn, NULL); + + if (!stmtInsert || !stmtUpdate || !stmtDelete || !stmtKnn) + goto cleanup; + + size_t i = 0; + while (i + 2 <= size) { + uint8_t op = data[i++] % 5; /* 5 operations now */ + uint8_t rowid_byte = data[i++]; + int64_t rowid = (int64_t)(rowid_byte % 24) + 1; + + switch (op) { + case 0: { + /* INSERT: consume bytes for 16 floats */ + float vec[16] = {0}; + for (int j = 0; j < 16 && i < size; j++, i++) { + vec[j] = (float)((int8_t)data[i]) / 8.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 1: { + /* DELETE */ + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 2: { + /* KNN with fuzz-controlled query vector */ + float qvec[16] = {0}; + for (int j = 0; j < 16 && i < size; j++, i++) { + qvec[j] = (float)((int8_t)data[i]) / 4.0f; + } + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) { + (void)sqlite3_column_int64(stmtKnn, 0); + (void)sqlite3_column_double(stmtKnn, 1); + } + break; + } + case 3: { + /* UPDATE: modify an existing vector (exercises rescore update path) */ + float vec[16] = {0}; + for (int j = 0; j < 16 && i < size; j++, i++) { + vec[j] = (float)((int8_t)data[i]) / 6.0f; + } + sqlite3_reset(stmtUpdate); + sqlite3_bind_blob(stmtUpdate, 1, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_bind_int64(stmtUpdate, 2, rowid); + sqlite3_step(stmtUpdate); + break; + } + case 4: { + /* INSERT then immediately UPDATE same row (stresses blob lifecycle) */ + float vec1[16] = {0}; + float vec2[16] = {0}; + for (int j = 0; j < 16 && i < size; j++, i++) { + vec1[j] = (float)((int8_t)data[i]) / 10.0f; + vec2[j] = -vec1[j]; /* opposite direction */ + } + /* Insert */ + sqlite3_reset(stmtInsert); + 
sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec1, sizeof(vec1), SQLITE_TRANSIENT); + if (sqlite3_step(stmtInsert) == SQLITE_DONE) { + /* Only update if insert succeeded (rowid might already exist) */ + sqlite3_reset(stmtUpdate); + sqlite3_bind_blob(stmtUpdate, 1, vec2, sizeof(vec2), SQLITE_TRANSIENT); + sqlite3_bind_int64(stmtUpdate, 2, rowid); + sqlite3_step(stmtUpdate); + } + break; + } + } + } + + /* Final consistency check: full scan must not crash */ + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtUpdate); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/rescore-knn-deep.c b/tests/fuzz/rescore-knn-deep.c new file mode 100644 index 0000000..8ff3c37 --- /dev/null +++ b/tests/fuzz/rescore-knn-deep.c @@ -0,0 +1,178 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/** + * Fuzz target: deep exercise of rescore KNN with fuzz-controlled query vectors + * and both quantizer types (bit + int8), multiple distance metrics. + * + * The existing rescore-operations.c only tests bit quantizer with a fixed + * query vector. 
This target: + * - Tests both bit and int8 quantizers + * - Uses fuzz-controlled query vectors (finite values: signed byte / 3.0f) + * - Picks one distance metric per run (default L2, cosine, or L1) + * - Exercises fuzz-chosen LIMIT values in 1..50 (oversample multiplication) + * - NOTE(review): no KNN with a rowid IN constraint is issued -- TODO add + * - Exercises the insert->query->update->query->delete->query cycle + */ +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 20) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* Use first 4 bytes for configuration */ + uint8_t config = data[0]; + uint8_t num_inserts = (data[1] % 20) + 3; /* 3..22 inserts */ + uint8_t limit_val = (data[2] % 50) + 1; /* 1..50 for LIMIT */ + uint8_t metric_choice = data[3] % 3; + data += 4; + size -= 4; + + int use_int8 = config & 1; + const char *metric_str; + switch (metric_choice) { + case 0: metric_str = ""; break; /* default L2 */ + case 1: metric_str = " distance_metric=cosine"; break; + case 2: metric_str = " distance_metric=l1"; break; + default: metric_str = ""; break; + } + + /* Build CREATE TABLE statement */ + char create_sql[256]; + if (use_int8) { + snprintf(create_sql, sizeof(create_sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] indexed by rescore(quantizer=int8)%s)", metric_str); + } else { + /* bit quantizer ignores distance_metric for the coarse pass (always hamming), + but the float rescore phase uses the specified metric */ + snprintf(create_sql, sizeof(create_sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] indexed by rescore(quantizer=bit)%s)", metric_str); + } + + rc = sqlite3_exec(db, create_sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + /* Insert vectors using fuzz data */ + { + sqlite3_stmt *ins = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &ins, NULL); + if (!ins) { 
sqlite3_close(db); return 0; } + + size_t cursor = 0; + for (int i = 0; i < num_inserts && cursor + 1 < size; i++) { + float vec[16]; + for (int j = 0; j < 16; j++) { + if (cursor < size) { + /* Map fuzz byte to a finite float: signed byte scaled by 1/5 + (a numeric conversion, not a bit reinterpretation) */ + int8_t sb = (int8_t)data[cursor++]; + vec[j] = (float)sb / 5.0f; + } else { + vec[j] = 0.0f; + } + } + sqlite3_reset(ins); + sqlite3_bind_int64(ins, 1, (sqlite3_int64)(i + 1)); + sqlite3_bind_blob(ins, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(ins); + } + sqlite3_finalize(ins); + } + + /* Build a fuzz-controlled query vector; this cursor restarts at 0, so it re-reads the first payload bytes the insert loop already used */ + float qvec[16] = {0}; + { + size_t cursor = 0; + for (int j = 0; j < 16 && cursor < size; j++) { + int8_t sb = (int8_t)data[cursor++]; + qvec[j] = (float)sb / 3.0f; + } + } + + /* KNN query with fuzz-controlled vector and LIMIT */ + { + char knn_sql[256]; + snprintf(knn_sql, sizeof(knn_sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? " + "ORDER BY distance LIMIT %d", (int)limit_val); + + sqlite3_stmt *knn = NULL; + sqlite3_prepare_v2(db, knn_sql, -1, &knn, NULL); + if (knn) { + sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knn) == SQLITE_ROW) { + /* Read both result columns so the computed rowid/distance values + are actually fetched during cursor iteration */ + (void)sqlite3_column_int64(knn, 0); + (void)sqlite3_column_double(knn, 1); + } + sqlite3_finalize(knn); + } + } + + /* Update one vector (rowid 1), then query again */ + { + float uvec[16]; + for (int j = 0; j < 16; j++) uvec[j] = qvec[15 - j]; /* reverse of query */ + sqlite3_stmt *upd = NULL; + sqlite3_prepare_v2(db, + "UPDATE v SET emb = ? 
WHERE rowid = 1", -1, &upd, NULL); + if (upd) { + sqlite3_bind_blob(upd, 1, uvec, sizeof(uvec), SQLITE_STATIC); + sqlite3_step(upd); + sqlite3_finalize(upd); + } + } + + /* Second KNN after update */ + { + sqlite3_stmt *knn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? " + "ORDER BY distance LIMIT 10", -1, &knn, NULL); + if (knn) { + sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knn) == SQLITE_ROW) {} + sqlite3_finalize(knn); + } + } + + /* Delete half the rows, then KNN again */ + for (int i = 1; i <= num_inserts; i += 2) { + char del_sql[64]; + snprintf(del_sql, sizeof(del_sql), + "DELETE FROM v WHERE rowid = %d", i); + sqlite3_exec(db, del_sql, NULL, NULL, NULL); + } + + /* Third KNN after deletes -- exercises distance computation over + zeroed-out slots in the quantized chunk */ + { + sqlite3_stmt *knn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? " + "ORDER BY distance LIMIT 5", -1, &knn, NULL); + if (knn) { + sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knn) == SQLITE_ROW) {} + sqlite3_finalize(knn); + } + } + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/rescore-operations.c b/tests/fuzz/rescore-operations.c new file mode 100644 index 0000000..4bb7ff1 --- /dev/null +++ b/tests/fuzz/rescore-operations.c @@ -0,0 +1,96 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 6) return 0; + + int rc; + sqlite3 *db; + sqlite3_stmt *stmtInsert = NULL; + sqlite3_stmt *stmtDelete = NULL; + sqlite3_stmt *stmtKnn = NULL; + sqlite3_stmt *stmtScan = NULL; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[8] indexed 
by rescore(quantizer=bit))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? ORDER BY distance LIMIT 3", + -1, &stmtKnn, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid FROM v", -1, &stmtScan, NULL); + + if (!stmtInsert || !stmtDelete || !stmtKnn || !stmtScan) goto cleanup; + + size_t i = 0; + while (i + 2 <= size) { + uint8_t op = data[i++] % 4; + uint8_t rowid_byte = data[i++]; + int64_t rowid = (int64_t)(rowid_byte % 32) + 1; + + switch (op) { + case 0: { + // INSERT: consume up to 8 bytes (one signed byte per float), or use what's left + float vec[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + for (int j = 0; j < 8 && i < size; j++, i++) { + vec[j] = (float)((int8_t)data[i]) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 1: { + // DELETE + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 2: { + // KNN query with a fixed query vector + float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 3: { + // Full scan + sqlite3_reset(stmtScan); + while (sqlite3_step(stmtScan) == SQLITE_ROW) {} + break; + } + } + } + + // Final operations -- must not crash regardless of prior state + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_finalize(stmtScan); + sqlite3_close(db); + return 0; +} 
diff --git a/tests/fuzz/rescore-quantize-edge.c b/tests/fuzz/rescore-quantize-edge.c new file mode 100644 index 0000000..4ab9e20 --- /dev/null +++ b/tests/fuzz/rescore-quantize-edge.c @@ -0,0 +1,177 @@ +#include +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/* Test wrappers from sqlite-vec-rescore.c (SQLITE_VEC_TEST build) */ +extern void _test_rescore_quantize_float_to_bit(const float *src, uint8_t *dst, size_t dim); +extern void _test_rescore_quantize_float_to_int8(const float *src, int8_t *dst, size_t dim); +extern size_t _test_rescore_quantized_byte_size_bit(size_t dimensions); +extern size_t _test_rescore_quantized_byte_size_int8(size_t dimensions); + +/** + * Fuzz target: edge cases in rescore quantization functions. + * + * The existing rescore-quantize.c only tests dimensions that are multiples of 8 + * and never passes special float values. This target: + * + * - Tests rescore_quantized_byte_size with arbitrary dimension values + * (including 0, 1, 7, MAX values -- looking for integer division issues) + * - Passes raw float reinterpretation of fuzz bytes (NaN, Inf, denormals, + * negative zero -- these are the values that break min/max/range logic) + * - Tests the int8 quantizer with all-identical values (range=0 branch) + * - Tests the int8 quantizer with extreme ranges (overflow in scale calc) + * - Tests bit quantizer with exact float threshold (0.0f boundary) + */ +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 8) return 0; + + uint8_t mode = data[0] % 5; + data++; size--; + + switch (mode) { + case 0: { + /* Test rescore_quantized_byte_size with fuzz-controlled dimensions. + This function does dimensions / CHAR_BIT for bit, dimensions for int8. + We're checking it doesn't do anything weird with edge values. 
*/ + if (size < sizeof(size_t)) return 0; + size_t dim; + memcpy(&dim, data, sizeof(dim)); + + /* These should never crash, just return values */ + size_t bit_size = _test_rescore_quantized_byte_size_bit(dim); + size_t int8_size = _test_rescore_quantized_byte_size_int8(dim); + + /* No invariants asserted yet: the results are only discarded */ + (void)bit_size; + (void)int8_size; + break; + } + + case 1: { + /* Bit quantize with raw reinterpreted floats (NaN, Inf, denormal). + The key check: src[i] >= 0.0f -- NaN comparison is always false, + so NaN should produce 0-bits. But denormals and -0.0f are tricky. */ + size_t num_floats = size / sizeof(float); + if (num_floats == 0) return 0; + /* Round down to a multiple of 8 for bit quantizer */ + size_t dim = (num_floats / 8) * 8; + if (dim == 0) return 0; + + const float *src = (const float *)data; /* NOTE(review): data was advanced one byte past the mode byte, so this cast may be misaligned -- confirm */ + size_t bit_bytes = dim / 8; + uint8_t *dst = (uint8_t *)malloc(bit_bytes); + if (!dst) return 0; + + _test_rescore_quantize_float_to_bit(src, dst, dim); + + /* Verify: for each bit, if src >= 0 then bit should be set */ + for (size_t i = 0; i < dim; i++) { + int bit_set = (dst[i / 8] >> (i % 8)) & 1; + if (src[i] >= 0.0f) { + assert(bit_set == 1); + } else if (src[i] < 0.0f) { + /* Definitely negative -- bit must be 0 */ + assert(bit_set == 0); + } + /* NaN: both comparisons are false, so neither assert runs (bit unchecked) */ + } + + free(dst); + break; + } + + case 2: { + /* Int8 quantize with raw reinterpreted floats. 
+ The dangerous paths: + - All values identical (range == 0) -> memset path + - vmin/vmax with NaN (NaN < anything is false, NaN > anything is false) + - Extreme range causing scale = 255/range to be Inf or 0 + - denormals near the clamping boundaries */ + size_t num_floats = size / sizeof(float); + if (num_floats == 0) return 0; + + const float *src = (const float *)data; + int8_t *dst = (int8_t *)malloc(num_floats); + if (!dst) return 0; + + _test_rescore_quantize_float_to_int8(src, dst, num_floats); + + /* Output must always be in [-128, 127] (trivially true for int8_t, + but check the actual clamping logic worked) */ + for (size_t i = 0; i < num_floats; i++) { + assert(dst[i] >= -128 && dst[i] <= 127); + } + + free(dst); + break; + } + + case 3: { + /* Int8 quantize stress: all-same values (range=0 branch) */ + size_t dim = (size < 64) ? size : 64; + if (dim == 0) return 0; + + float *src = (float *)malloc(dim * sizeof(float)); + int8_t *dst = (int8_t *)malloc(dim); + if (!src || !dst) { free(src); free(dst); return 0; } + + /* Fill with a single value derived from fuzz data */ + float val; + memcpy(&val, data, sizeof(float) < size ? sizeof(float) : size); + for (size_t i = 0; i < dim; i++) src[i] = val; + + _test_rescore_quantize_float_to_int8(src, dst, dim); + + /* All outputs should be 0 when range == 0 */ + for (size_t i = 0; i < dim; i++) { + assert(dst[i] == 0); + } + + free(src); + free(dst); + break; + } + + case 4: { + /* Int8 quantize with extreme range: one huge positive, one huge negative. + Tests scale = 255.0f / range overflow to Inf, then v * Inf = Inf, + then clamping to [-128, 127]. 
*/ + if (size < 2 * sizeof(float)) return 0; + + float extreme[2]; + memcpy(extreme, data, 2 * sizeof(float)); + + /* Only test if both are finite (NaN/Inf tested in case 2) */ + if (!isfinite(extreme[0]) || !isfinite(extreme[1])) return 0; + + /* Build a vector with these two extreme values plus some fuzz */ + size_t dim = 16; + float src[16]; + src[0] = extreme[0]; + src[1] = extreme[1]; + for (size_t i = 2; i < dim; i++) { + if (2 * sizeof(float) + (i - 2) < size) { + src[i] = (float)((int8_t)data[2 * sizeof(float) + (i - 2)]) * 1000.0f; + } else { + src[i] = 0.0f; + } + } + + int8_t dst[16]; + _test_rescore_quantize_float_to_int8(src, dst, dim); + + for (size_t i = 0; i < dim; i++) { + assert(dst[i] >= -128 && dst[i] <= 127); + } + break; + } + } + + return 0; +} diff --git a/tests/fuzz/rescore-quantize.c b/tests/fuzz/rescore-quantize.c new file mode 100644 index 0000000..6aad445 --- /dev/null +++ b/tests/fuzz/rescore-quantize.c @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/* These are SQLITE_VEC_TEST wrappers defined in sqlite-vec-rescore.c */ +extern void _test_rescore_quantize_float_to_bit(const float *src, uint8_t *dst, size_t dim); +extern void _test_rescore_quantize_float_to_int8(const float *src, int8_t *dst, size_t dim); + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + /* Need at least 4 bytes for one float */ + if (size < 4) return 0; + + /* Use the input as an array of floats. Dimensions must be a multiple of 8 + * for the bit quantizer. 
*/ + size_t num_floats = size / sizeof(float); + if (num_floats == 0) return 0; + + /* Round down to multiple of 8 for bit quantizer compatibility */ + size_t dim = (num_floats / 8) * 8; + if (dim == 0) dim = 8; + if (dim > num_floats) return 0; + + const float *src = (const float *)data; + + /* Allocate output buffers */ + size_t bit_bytes = dim / 8; + uint8_t *bit_dst = (uint8_t *)malloc(bit_bytes); + int8_t *int8_dst = (int8_t *)malloc(dim); + if (!bit_dst || !int8_dst) { + free(bit_dst); + free(int8_dst); + return 0; + } + + /* Test bit quantization */ + _test_rescore_quantize_float_to_bit(src, bit_dst, dim); + + /* Test int8 quantization */ + _test_rescore_quantize_float_to_int8(src, int8_dst, dim); + + /* Verify int8 output is in range */ + for (size_t i = 0; i < dim; i++) { + assert(int8_dst[i] >= -128 && int8_dst[i] <= 127); + } + + free(bit_dst); + free(int8_dst); + return 0; +} diff --git a/tests/fuzz/rescore-shadow-corrupt.c b/tests/fuzz/rescore-shadow-corrupt.c new file mode 100644 index 0000000..edd87ef --- /dev/null +++ b/tests/fuzz/rescore-shadow-corrupt.c @@ -0,0 +1,230 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/** + * Fuzz target: corrupt rescore shadow tables then exercise KNN/read/write. + * + * This targets the dangerous code paths in rescore_knn (Phase 1 + 2): + * - sqlite3_blob_read into baseVectors with potentially wrong-sized blobs + * - distance computation on corrupted/partial quantized data + * - blob_reopen on _rescore_vectors with missing/corrupted rows + * - insert/delete after corruption (blob_write to wrong offsets) + * + * The existing shadow-corrupt.c only tests vec0 without rescore. 
+ */ +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 4) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* Pick quantizer type from first byte */ + int use_int8 = data[0] & 1; + int target = (data[1] % 8); + const uint8_t *payload = data + 2; + int payload_size = (int)(size - 2); + + const char *create_sql = use_int8 + ? "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] indexed by rescore(quantizer=int8))" + : "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] indexed by rescore(quantizer=bit))"; + + rc = sqlite3_exec(db, create_sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + /* Insert several vectors so there's a full chunk to corrupt */ + { + sqlite3_stmt *ins = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &ins, NULL); + if (!ins) { sqlite3_close(db); return 0; } + + for (int i = 1; i <= 8; i++) { + float vec[16]; + for (int j = 0; j < 16; j++) vec[j] = (float)(i * 10 + j) / 100.0f; + sqlite3_reset(ins); + sqlite3_bind_int64(ins, 1, i); + sqlite3_bind_blob(ins, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(ins); + } + sqlite3_finalize(ins); + } + + /* Now corrupt rescore shadow tables based on fuzz input */ + sqlite3_stmt *stmt = NULL; + + switch (target) { + case 0: { + /* Corrupt _rescore_chunks00 vectors blob with fuzz data */ + rc = sqlite3_prepare_v2(db, + "UPDATE v_rescore_chunks00 SET vectors = ? WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + stmt = NULL; + } + break; + } + case 1: { + /* Corrupt _rescore_vectors00 vector blob for a specific row */ + rc = sqlite3_prepare_v2(db, + "UPDATE v_rescore_vectors00 SET vector = ? 
WHERE rowid = 3", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + stmt = NULL; + } + break; + } + case 2: { + /* Truncate the quantized chunk blob to wrong size */ + rc = sqlite3_prepare_v2(db, + "UPDATE v_rescore_chunks00 SET vectors = X'DEADBEEF' WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_step(stmt); + sqlite3_finalize(stmt); + stmt = NULL; + } + break; + } + case 3: { + /* Delete rows from _rescore_vectors (orphan the float vectors) */ + sqlite3_exec(db, + "DELETE FROM v_rescore_vectors00 WHERE rowid IN (2, 4, 6)", + NULL, NULL, NULL); + break; + } + case 4: { + /* Delete the chunk row entirely from _rescore_chunks */ + sqlite3_exec(db, + "DELETE FROM v_rescore_chunks00 WHERE rowid = 1", + NULL, NULL, NULL); + break; + } + case 5: { + /* Set vectors to NULL in _rescore_chunks */ + sqlite3_exec(db, + "UPDATE v_rescore_chunks00 SET vectors = NULL WHERE rowid = 1", + NULL, NULL, NULL); + break; + } + case 6: { + /* Set vector to NULL in _rescore_vectors */ + sqlite3_exec(db, + "UPDATE v_rescore_vectors00 SET vector = NULL WHERE rowid = 3", + NULL, NULL, NULL); + break; + } + case 7: { + /* Corrupt BOTH tables with fuzz data */ + int half = payload_size / 2; + rc = sqlite3_prepare_v2(db, + "UPDATE v_rescore_chunks00 SET vectors = ? WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, half, SQLITE_STATIC); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + stmt = NULL; + } + rc = sqlite3_prepare_v2(db, + "UPDATE v_rescore_vectors00 SET vector = ? 
WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload + half, + payload_size - half, SQLITE_STATIC); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + stmt = NULL; + } + break; + } + } + + /* Exercise ALL read/write paths -- NONE should crash */ + + /* KNN query (triggers rescore_knn Phase 1 + Phase 2) */ + { + float qvec[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1}; + sqlite3_stmt *knn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? " + "ORDER BY distance LIMIT 5", -1, &knn, NULL); + if (knn) { + sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knn) == SQLITE_ROW) {} + sqlite3_finalize(knn); + } + } + + /* Full scan (triggers reading from _rescore_vectors) */ + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + + /* Point lookups */ + sqlite3_exec(db, "SELECT * FROM v WHERE rowid = 1", NULL, NULL, NULL); + sqlite3_exec(db, "SELECT * FROM v WHERE rowid = 3", NULL, NULL, NULL); + + /* Insert after corruption */ + { + float vec[16] = {0}; + sqlite3_stmt *ins = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (99, ?)", -1, &ins, NULL); + if (ins) { + sqlite3_bind_blob(ins, 1, vec, sizeof(vec), SQLITE_STATIC); + sqlite3_step(ins); + sqlite3_finalize(ins); + } + } + + /* Delete after corruption */ + sqlite3_exec(db, "DELETE FROM v WHERE rowid = 5", NULL, NULL, NULL); + + /* Update after corruption */ + { + float vec[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1}; + sqlite3_stmt *upd = NULL; + sqlite3_prepare_v2(db, + "UPDATE v SET emb = ? WHERE rowid = 1", -1, &upd, NULL); + if (upd) { + sqlite3_bind_blob(upd, 1, vec, sizeof(vec), SQLITE_STATIC); + sqlite3_step(upd); + sqlite3_finalize(upd); + } + } + + /* KNN again after modifications to corrupted state */ + { + float qvec[16] = {0,0,0,0, 0,0,0,0, 1,1,1,1, 1,1,1,1}; + sqlite3_stmt *knn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
" + "ORDER BY distance LIMIT 3", -1, &knn, NULL); + if (knn) { + sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knn) == SQLITE_ROW) {} + sqlite3_finalize(knn); + } + } + + sqlite3_exec(db, "DROP TABLE v", NULL, NULL, NULL); + sqlite3_close(db); + return 0; +} diff --git a/tests/sqlite-vec-internal.h b/tests/sqlite-vec-internal.h index a02c72a..cbc2c08 100644 --- a/tests/sqlite-vec-internal.h +++ b/tests/sqlite-vec-internal.h @@ -65,8 +65,23 @@ enum Vec0DistanceMetrics { enum Vec0IndexType { VEC0_INDEX_TYPE_FLAT = 1, +#ifdef SQLITE_VEC_ENABLE_RESCORE + VEC0_INDEX_TYPE_RESCORE = 2, +#endif }; +#ifdef SQLITE_VEC_ENABLE_RESCORE +enum Vec0RescoreQuantizerType { + VEC0_RESCORE_QUANTIZER_BIT = 1, + VEC0_RESCORE_QUANTIZER_INT8 = 2, +}; + +struct Vec0RescoreConfig { + enum Vec0RescoreQuantizerType quantizer_type; + int oversample; +}; +#endif + struct VectorColumnDefinition { char *name; int name_length; @@ -74,6 +89,9 @@ struct VectorColumnDefinition { enum VectorElementType element_type; enum Vec0DistanceMetrics distance_metric; enum Vec0IndexType index_type; +#ifdef SQLITE_VEC_ENABLE_RESCORE + struct Vec0RescoreConfig rescore; +#endif }; int vec0_parse_vector_column(const char *source, int source_length, @@ -88,6 +106,13 @@ int vec0_parse_partition_key_definition(const char *source, int source_length, float _test_distance_l2_sqr_float(const float *a, const float *b, size_t dims); float _test_distance_cosine_float(const float *a, const float *b, size_t dims); float _test_distance_hamming(const unsigned char *a, const unsigned char *b, size_t dims); + +#ifdef SQLITE_VEC_ENABLE_RESCORE +void _test_rescore_quantize_float_to_bit(const float *src, uint8_t *dst, size_t dim); +void _test_rescore_quantize_float_to_int8(const float *src, int8_t *dst, size_t dim); +size_t _test_rescore_quantized_byte_size_bit(size_t dimensions); +size_t _test_rescore_quantized_byte_size_int8(size_t dimensions); +#endif #endif #endif /* SQLITE_VEC_INTERNAL_H */ 
diff --git a/tests/test-rescore-mutations.py b/tests/test-rescore-mutations.py new file mode 100644 index 0000000..28495c2 --- /dev/null +++ b/tests/test-rescore-mutations.py @@ -0,0 +1,470 @@ +"""Mutation and edge-case tests for the rescore index feature.""" +import struct +import sqlite3 +import pytest +import math +import random + + +@pytest.fixture() +def db(): + db = sqlite3.connect(":memory:") + db.row_factory = sqlite3.Row + db.enable_load_extension(True) + db.load_extension("dist/vec0") + db.enable_load_extension(False) + return db + + +def float_vec(values): + """Pack a list of floats into a blob for sqlite-vec.""" + return struct.pack(f"{len(values)}f", *values) + + +def unpack_float_vec(blob): + """Unpack a float vector blob.""" + n = len(blob) // 4 + return list(struct.unpack(f"{n}f", blob)) + + +# ============================================================================ +# Error cases: rescore + aux/metadata/partition +# ============================================================================ + + +def test_create_error_with_aux_column(db): + """Rescore should reject auxiliary columns.""" + with pytest.raises(sqlite3.OperationalError, match="Auxiliary columns"): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)," + " +extra text" + ")" + ) + + +def test_create_error_with_metadata_column(db): + """Rescore should reject metadata columns.""" + with pytest.raises(sqlite3.OperationalError, match="Metadata columns"): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)," + " genre text" + ")" + ) + + +def test_create_error_with_partition_key(db): + """Rescore should reject partition key columns.""" + with pytest.raises(sqlite3.OperationalError, match="Partition key"): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)," + " user_id integer partition key" + ")" + ) + + +# 
============================================================================ +# Insert / batch / delete / update mutations +# ============================================================================ + + +def test_insert_single_verify_knn(db): + """Insert a single row and verify KNN returns it.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + db.execute("INSERT INTO t(rowid, embedding) VALUES (1, ?)", [float_vec([1.0] * 8)]) + rows = db.execute( + "SELECT rowid, distance FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 1", + [float_vec([1.0] * 8)], + ).fetchall() + assert len(rows) == 1 + assert rows[0]["rowid"] == 1 + assert rows[0]["distance"] < 0.01 + + +def test_insert_large_batch(db): + """Insert 200 rows one at a time, then verify the row count and that a KNN query returns 10 results.""" + dim = 16 + n = 200 + random.seed(99) + db.execute( + f"CREATE VIRTUAL TABLE t USING vec0(" + f" embedding float[{dim}] indexed by rescore(quantizer=int8)" + f")" + ) + for i in range(n): + v = [random.gauss(0, 1) for _ in range(dim)] + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i + 1, float_vec(v)], + ) + row = db.execute("SELECT count(*) as cnt FROM t").fetchone() + assert row["cnt"] == n + + # KNN should return results + query = float_vec([random.gauss(0, 1) for _ in range(dim)]) + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ?
ORDER BY distance LIMIT 10", + [query], + ).fetchall() + assert len(rows) == 10 + + +def test_delete_all_rows(db): + """Delete every row, verify count=0, KNN returns empty.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + for i in range(20): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i + 1, float_vec([float(i)] * 8)], + ) + assert db.execute("SELECT count(*) as cnt FROM t").fetchone()["cnt"] == 20 + + for i in range(20): + db.execute("DELETE FROM t WHERE rowid = ?", [i + 1]) + + assert db.execute("SELECT count(*) as cnt FROM t").fetchone()["cnt"] == 0 + + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 5", + [float_vec([0.0] * 8)], + ).fetchall() + assert len(rows) == 0 + + +def test_delete_then_reinsert_same_rowid(db): + """Delete rowid=1, re-insert rowid=1 with different vector, verify KNN uses new vector.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=int8)" + ")" + ) + # Insert rowid=1 near origin, rowid=2 far from origin + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (1, ?)", + [float_vec([0.1] * 8)], + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (2, ?)", + [float_vec([100.0] * 8)], + ) + + # KNN to [0]*8 -> rowid 1 is closer + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 1", + [float_vec([0.0] * 8)], + ).fetchall() + assert rows[0]["rowid"] == 1 + + # Delete rowid=1, re-insert with vector far from origin + db.execute("DELETE FROM t WHERE rowid = 1") + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (1, ?)", + [float_vec([200.0] * 8)], + ) + + # Now KNN to [0]*8 -> rowid 2 should be closer + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? 
ORDER BY distance LIMIT 1", + [float_vec([0.0] * 8)], + ).fetchall() + assert rows[0]["rowid"] == 2 + + +def test_update_vector(db): + """UPDATE the vector column and verify KNN reflects new value.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=int8)" + ")" + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (1, ?)", + [float_vec([0.0] * 8)], + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (2, ?)", + [float_vec([10.0] * 8)], + ) + + # Update rowid=1 to be far away + db.execute( + "UPDATE t SET embedding = ? WHERE rowid = 1", + [float_vec([100.0] * 8)], + ) + + # Now KNN to [0]*8 -> rowid 2 should be closest + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 1", + [float_vec([0.0] * 8)], + ).fetchall() + assert rows[0]["rowid"] == 2 + + +def test_knn_after_delete_all_but_one(db): + """Insert 50 rows, delete 49, KNN should only return the survivor.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + for i in range(50): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i + 1, float_vec([float(i)] * 8)], + ) + # Delete all except rowid=25 + for i in range(50): + if i + 1 != 25: + db.execute("DELETE FROM t WHERE rowid = ?", [i + 1]) + + assert db.execute("SELECT count(*) as cnt FROM t").fetchone()["cnt"] == 1 + + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 10", + [float_vec([0.0] * 8)], + ).fetchall() + assert len(rows) == 1 + assert rows[0]["rowid"] == 25 + + +# ============================================================================ +# Edge cases +# ============================================================================ + + +def test_single_row_knn(db): + """Table with exactly 1 row. 
LIMIT 1 returns it; LIMIT 5 returns 1.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + db.execute("INSERT INTO t(rowid, embedding) VALUES (1, ?)", [float_vec([1.0] * 8)]) + + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 1", + [float_vec([1.0] * 8)], + ).fetchall() + assert len(rows) == 1 + + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 5", + [float_vec([1.0] * 8)], + ).fetchall() + assert len(rows) == 1 + + +def test_knn_with_all_identical_vectors(db): + """All vectors are the same. All distances should be equal.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=int8)" + ")" + ) + vec = [3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0] + for i in range(10): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i + 1, float_vec(vec)], + ) + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 10", + [float_vec(vec)], + ).fetchall() + assert len(rows) == 10 + # All distances should be ~0 (exact match) + for r in rows: + assert r["distance"] < 0.01 + + +def test_zero_vector_insert(db): + """Insert the zero vector [0,0,...,0]. 
Should not crash quantization.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (1, ?)", + [float_vec([0.0] * 8)], + ) + row = db.execute("SELECT count(*) as cnt FROM t").fetchone() + assert row["cnt"] == 1 + + # Also test int8 quantizer with zero vector + db.execute( + "CREATE VIRTUAL TABLE t2 USING vec0(" + " embedding float[8] indexed by rescore(quantizer=int8)" + ")" + ) + db.execute( + "INSERT INTO t2(rowid, embedding) VALUES (1, ?)", + [float_vec([0.0] * 8)], + ) + row = db.execute("SELECT count(*) as cnt FROM t2").fetchone() + assert row["cnt"] == 1 + + +def test_very_large_values(db): + """Insert vectors with very large float values. Quantization should not crash.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=int8)" + ")" + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (1, ?)", + [float_vec([1e30] * 8)], + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (2, ?)", + [float_vec([1e30, -1e30, 1e30, -1e30, 1e30, -1e30, 1e30, -1e30])], + ) + row = db.execute("SELECT count(*) as cnt FROM t").fetchone() + assert row["cnt"] == 2 + + +def test_negative_values(db): + """Insert vectors with all negative values. Bit quantization maps all to 0.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (1, ?)", + [float_vec([-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0])], + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (2, ?)", + [float_vec([-0.1, -0.2, -0.3, -0.4, -0.5, -0.6, -0.7, -0.8])], + ) + row = db.execute("SELECT count(*) as cnt FROM t").fetchone() + assert row["cnt"] == 2 + + # KNN should still work + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? 
ORDER BY distance LIMIT 2", + [float_vec([-0.1, -0.2, -0.3, -0.4, -0.5, -0.6, -0.7, -0.8])], + ).fetchall() + assert len(rows) == 2 + assert rows[0]["rowid"] == 2 + + +def test_single_dimension(db): + """KNN with the int8 quantizer on float[8] vectors (NOTE: despite the test name, the dimension here is 8, not 1).""" + # int8 quantizer (bit needs dim divisible by 8) + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=int8)" + ")" + ) + db.execute("INSERT INTO t(rowid, embedding) VALUES (1, ?)", [float_vec([1.0] * 8)]) + db.execute("INSERT INTO t(rowid, embedding) VALUES (2, ?)", [float_vec([5.0] * 8)]) + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 1", + [float_vec([1.0] * 8)], + ).fetchall() + assert rows[0]["rowid"] == 1 + + +# ============================================================================ +# vec_debug() verification +# ============================================================================ + + +def test_vec_debug_contains_rescore(db): + """vec_debug() should contain 'rescore' in build flags when compiled with SQLITE_VEC_ENABLE_RESCORE.""" + row = db.execute("SELECT vec_debug() as d").fetchone() + assert "rescore" in row["d"] + + +# ============================================================================ +# Insert batch recall test +# ============================================================================ + + +def test_insert_batch_recall(db): + """Insert 150 rows and verify KNN recall is reasonable (>= 0.6).""" + dim = 16 + n = 150 + k = 10 + random.seed(77) + + db.execute( + f"CREATE VIRTUAL TABLE t_rescore USING vec0(" + f" embedding float[{dim}] indexed by rescore(quantizer=int8, oversample=16)" + f")" + ) + db.execute( + f"CREATE VIRTUAL TABLE t_flat USING vec0(embedding float[{dim}])" + ) + + vectors = [[random.gauss(0, 1) for _ in range(dim)] for _ in range(n)] + for i, v in enumerate(vectors): + blob = float_vec(v) + db.execute( + "INSERT INTO t_rescore(rowid, embedding) VALUES (?, ?)", [i
+ 1, blob] + ) + db.execute( + "INSERT INTO t_flat(rowid, embedding) VALUES (?, ?)", [i + 1, blob] + ) + + query = float_vec([random.gauss(0, 1) for _ in range(dim)]) + + rescore_rows = db.execute( + "SELECT rowid FROM t_rescore WHERE embedding MATCH ? ORDER BY distance LIMIT ?", + [query, k], + ).fetchall() + flat_rows = db.execute( + "SELECT rowid FROM t_flat WHERE embedding MATCH ? ORDER BY distance LIMIT ?", + [query, k], + ).fetchall() + + rescore_ids = {r["rowid"] for r in rescore_rows} + flat_ids = {r["rowid"] for r in flat_rows} + recall = len(rescore_ids & flat_ids) / k + assert recall >= 0.6, f"Recall too low: {recall}" + + +# ============================================================================ +# Distance metric variants +# ============================================================================ + + +def test_knn_int8_cosine(db): + """Rescore with quantizer=int8 and distance_metric=cosine.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] distance_metric=cosine indexed by rescore(quantizer=int8)" + ")" + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (1, ?)", + [float_vec([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (2, ?)", + [float_vec([0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (3, ?)", + [float_vec([1.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + rows = db.execute( + "SELECT rowid, distance FROM t WHERE embedding MATCH ? 
ORDER BY distance LIMIT 2", + [float_vec([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ).fetchall() + assert rows[0]["rowid"] == 1 + assert rows[0]["distance"] < 0.01 diff --git a/tests/test-rescore.py b/tests/test-rescore.py new file mode 100644 index 0000000..5025857 --- /dev/null +++ b/tests/test-rescore.py @@ -0,0 +1,568 @@ +"""Tests for the rescore index feature in sqlite-vec.""" +import struct +import sqlite3 +import pytest +import math +import random + + +@pytest.fixture() +def db(): + db = sqlite3.connect(":memory:") + db.row_factory = sqlite3.Row + db.enable_load_extension(True) + db.load_extension("dist/vec0") + db.enable_load_extension(False) + return db + + +def float_vec(values): + """Pack a list of floats into a blob for sqlite-vec.""" + return struct.pack(f"{len(values)}f", *values) + + +def unpack_float_vec(blob): + """Unpack a float vector blob.""" + n = len(blob) // 4 + return list(struct.unpack(f"{n}f", blob)) + + +# ============================================================================ +# Creation tests +# ============================================================================ + + +def test_create_bit(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[128] indexed by rescore(quantizer=bit)" + ")" + ) + # Table exists and has the right structure + row = db.execute( + "SELECT count(*) as cnt FROM sqlite_master WHERE name LIKE 't_%'" + ).fetchone() + assert row["cnt"] > 0 + + +def test_create_int8(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[128] indexed by rescore(quantizer=int8)" + ")" + ) + row = db.execute( + "SELECT count(*) as cnt FROM sqlite_master WHERE name LIKE 't_%'" + ).fetchone() + assert row["cnt"] > 0 + + +def test_create_with_oversample(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[128] indexed by rescore(quantizer=bit, oversample=16)" + ")" + ) + row = db.execute( + "SELECT count(*) as cnt FROM sqlite_master WHERE name LIKE 't_%'" 
+ ).fetchone() + assert row["cnt"] > 0 + + +def test_create_with_distance_metric(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[128] distance_metric=cosine indexed by rescore(quantizer=bit)" + ")" + ) + row = db.execute( + "SELECT count(*) as cnt FROM sqlite_master WHERE name LIKE 't_%'" + ).fetchone() + assert row["cnt"] > 0 + + +def test_create_error_missing_quantizer(db): + with pytest.raises(sqlite3.OperationalError): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[128] indexed by rescore(oversample=8)" + ")" + ) + + +def test_create_error_invalid_quantizer(db): + with pytest.raises(sqlite3.OperationalError): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[128] indexed by rescore(quantizer=float)" + ")" + ) + + +def test_create_error_on_bit_column(db): + with pytest.raises(sqlite3.OperationalError): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding bit[1024] indexed by rescore(quantizer=bit)" + ")" + ) + + +def test_create_error_on_int8_column(db): + with pytest.raises(sqlite3.OperationalError): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding int8[128] indexed by rescore(quantizer=bit)" + ")" + ) + + +def test_create_error_bad_oversample_zero(db): + with pytest.raises(sqlite3.OperationalError): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[128] indexed by rescore(quantizer=bit, oversample=0)" + ")" + ) + + +def test_create_error_bad_oversample_too_large(db): + with pytest.raises(sqlite3.OperationalError): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[128] indexed by rescore(quantizer=bit, oversample=999)" + ")" + ) + + +def test_create_error_bit_dim_not_divisible_by_8(db): + with pytest.raises(sqlite3.OperationalError): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[100] indexed by rescore(quantizer=bit)" + ")" + ) + + +# 
============================================================================ +# Shadow table tests +# ============================================================================ + + +def test_shadow_tables_exist(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[128] indexed by rescore(quantizer=bit)" + ")" + ) + tables = [ + r[0] + for r in db.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 't_%' ORDER BY name" + ).fetchall() + ] + assert "t_rescore_chunks00" in tables + assert "t_rescore_vectors00" in tables + # Rescore columns don't create _vector_chunks + assert "t_vector_chunks00" not in tables + + +def test_drop_cleans_up(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[128] indexed by rescore(quantizer=bit)" + ")" + ) + db.execute("DROP TABLE t") + tables = [ + r[0] + for r in db.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 't_%'" + ).fetchall() + ] + assert len(tables) == 0 + + +# ============================================================================ +# Insert tests +# ============================================================================ + + +def test_insert_single(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + db.execute("INSERT INTO t(rowid, embedding) VALUES (1, ?)", [float_vec([1.0] * 8)]) + row = db.execute("SELECT count(*) as cnt FROM t").fetchone() + assert row["cnt"] == 1 + + +def test_insert_multiple(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=int8)" + ")" + ) + for i in range(10): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i + 1, float_vec([float(i)] * 8)], + ) + row = db.execute("SELECT count(*) as cnt FROM t").fetchone() + assert row["cnt"] == 10 + + +# ============================================================================ +# Delete tests +# 
============================================================================ + + +def test_delete_single(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + db.execute("INSERT INTO t(rowid, embedding) VALUES (1, ?)", [float_vec([1.0] * 8)]) + db.execute("DELETE FROM t WHERE rowid = 1") + row = db.execute("SELECT count(*) as cnt FROM t").fetchone() + assert row["cnt"] == 0 + + +def test_delete_and_reinsert(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + db.execute("INSERT INTO t(rowid, embedding) VALUES (1, ?)", [float_vec([1.0] * 8)]) + db.execute("DELETE FROM t WHERE rowid = 1") + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (2, ?)", [float_vec([2.0] * 8)] + ) + row = db.execute("SELECT count(*) as cnt FROM t").fetchone() + assert row["cnt"] == 1 + + +def test_point_query_returns_float(db): + """SELECT by rowid should return the original float vector, not quantized.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + vals = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] + db.execute("INSERT INTO t(rowid, embedding) VALUES (1, ?)", [float_vec(vals)]) + row = db.execute("SELECT embedding FROM t WHERE rowid = 1").fetchone() + result = unpack_float_vec(row["embedding"]) + for a, b in zip(result, vals): + assert abs(a - b) < 1e-6 + + +# ============================================================================ +# KNN tests +# ============================================================================ + + +def test_knn_basic_bit(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + # Insert vectors where [1,0,0,...] is closest to query [1,0,0,...] 
+ db.execute( + "INSERT INTO t(rowid, embedding) VALUES (1, ?)", + [float_vec([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (2, ?)", + [float_vec([0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (3, ?)", + [float_vec([0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + rows = db.execute( + "SELECT rowid, distance FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 1", + [float_vec([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ).fetchall() + assert len(rows) == 1 + assert rows[0]["rowid"] == 1 + + +def test_knn_basic_int8(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=int8)" + ")" + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (1, ?)", + [float_vec([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (2, ?)", + [float_vec([0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (3, ?)", + [float_vec([0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + rows = db.execute( + "SELECT rowid, distance FROM t WHERE embedding MATCH ? 
ORDER BY distance LIMIT 1", + [float_vec([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ).fetchall() + assert len(rows) == 1 + assert rows[0]["rowid"] == 1 + + +def test_knn_returns_float_distances(db): + """KNN should return float-precision distances, not quantized distances.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + v1 = [1.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + v2 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0] + db.execute("INSERT INTO t(rowid, embedding) VALUES (1, ?)", [float_vec(v1)]) + db.execute("INSERT INTO t(rowid, embedding) VALUES (2, ?)", [float_vec(v2)]) + + query = [1.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + rows = db.execute( + "SELECT rowid, distance FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 2", + [float_vec(query)], + ).fetchall() + + # First result should be exact match with distance ~0 + assert rows[0]["rowid"] == 1 + assert rows[0]["distance"] < 0.01 + + # Second result should have a float distance + # sqrt((1-0)^2 + (0.5-0)^2 + (0-1)^2) = sqrt(2.25) = 1.5 + assert abs(rows[1]["distance"] - 1.5) < 0.01 + + +def test_knn_recall(db): + """With enough vectors, rescore should achieve good recall (>= 0.7 against the flat-index ground truth).""" + dim = 32 + n = 1000 + k = 10 + random.seed(42) + + db.execute( + "CREATE VIRTUAL TABLE t_rescore USING vec0(" + f" embedding float[{dim}] indexed by rescore(quantizer=bit, oversample=16)" + ")" + ) + db.execute( + f"CREATE VIRTUAL TABLE t_flat USING vec0(embedding float[{dim}])" + ) + + vectors = [[random.gauss(0, 1) for _ in range(dim)] for _ in range(n)] + for i, v in enumerate(vectors): + blob = float_vec(v) + db.execute( + "INSERT INTO t_rescore(rowid, embedding) VALUES (?, ?)", [i + 1, blob] + ) + db.execute( + "INSERT INTO t_flat(rowid, embedding) VALUES (?, ?)", [i + 1, blob] + ) + + query = float_vec([random.gauss(0, 1) for _ in range(dim)]) + + rescore_rows = db.execute( + "SELECT rowid FROM t_rescore WHERE embedding MATCH ?
ORDER BY distance LIMIT ?", + [query, k], + ).fetchall() + flat_rows = db.execute( + "SELECT rowid FROM t_flat WHERE embedding MATCH ? ORDER BY distance LIMIT ?", + [query, k], + ).fetchall() + + rescore_ids = {r["rowid"] for r in rescore_rows} + flat_ids = {r["rowid"] for r in flat_rows} + recall = len(rescore_ids & flat_ids) / k + assert recall >= 0.7, f"Recall too low: {recall}" + + +def test_knn_cosine(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] distance_metric=cosine indexed by rescore(quantizer=bit)" + ")" + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (1, ?)", + [float_vec([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (2, ?)", + [float_vec([0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + rows = db.execute( + "SELECT rowid, distance FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 1", + [float_vec([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ).fetchall() + assert rows[0]["rowid"] == 1 + # cosine distance of identical vectors should be ~0 + assert rows[0]["distance"] < 0.01 + + +def test_knn_empty_table(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 5", + [float_vec([1.0] * 8)], + ).fetchall() + assert len(rows) == 0 + + +def test_knn_k_larger_than_n(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + db.execute("INSERT INTO t(rowid, embedding) VALUES (1, ?)", [float_vec([1.0] * 8)]) + db.execute("INSERT INTO t(rowid, embedding) VALUES (2, ?)", [float_vec([2.0] * 8)]) + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? 
ORDER BY distance LIMIT 10", + [float_vec([1.0] * 8)], + ).fetchall() + assert len(rows) == 2 + + +# ============================================================================ +# Integration / edge case tests +# ============================================================================ + + +def test_knn_with_rowid_in(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + for i in range(5): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i + 1, float_vec([float(i)] * 8)], + ) + # Only search within rowids 1, 3, 5 + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? AND rowid IN (1, 3, 5) ORDER BY distance LIMIT 3", + [float_vec([0.0] * 8)], + ).fetchall() + result_ids = {r["rowid"] for r in rows} + assert result_ids <= {1, 3, 5} + + +def test_knn_after_deletes(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=int8)" + ")" + ) + for i in range(10): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i + 1, float_vec([float(i)] * 8)], + ) + # Delete the closest match (rowid 1 = [0,0,...]) + db.execute("DELETE FROM t WHERE rowid = 1") + rows = db.execute( + "SELECT rowid, distance FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 5", + [float_vec([0.0] * 8)], + ).fetchall() + # Verify ordering: rowid 2 ([1]*8) should be closest, then 3 ([2]*8), etc. + assert len(rows) >= 2 + assert rows[0]["distance"] <= rows[1]["distance"] + # rowid 2 = [1,1,...] → L2 = sqrt(8) ≈ 2.83, rowid 3 = [2,2,...] 
→ L2 = sqrt(32) ≈ 5.66 + assert rows[0]["rowid"] == 2, f"Expected rowid 2, got {rows[0]['rowid']} with dist={rows[0]['distance']}" + + +def test_oversample_effect(db): + """Higher oversample should give equal or better recall.""" + dim = 32 + n = 500 + k = 10 + random.seed(123) + + vectors = [[random.gauss(0, 1) for _ in range(dim)] for _ in range(n)] + query = float_vec([random.gauss(0, 1) for _ in range(dim)]) + + recalls = [] + for oversample in [2, 16]: + tname = f"t_os{oversample}" + db.execute( + f"CREATE VIRTUAL TABLE {tname} USING vec0(" + f" embedding float[{dim}] indexed by rescore(quantizer=bit, oversample={oversample})" + ")" + ) + for i, v in enumerate(vectors): + db.execute( + f"INSERT INTO {tname}(rowid, embedding) VALUES (?, ?)", + [i + 1, float_vec(v)], + ) + rows = db.execute( + f"SELECT rowid FROM {tname} WHERE embedding MATCH ? ORDER BY distance LIMIT ?", + [query, k], + ).fetchall() + recalls.append({r["rowid"] for r in rows}) + + # Also get ground truth + db.execute(f"CREATE VIRTUAL TABLE t_flat USING vec0(embedding float[{dim}])") + for i, v in enumerate(vectors): + db.execute( + "INSERT INTO t_flat(rowid, embedding) VALUES (?, ?)", + [i + 1, float_vec(v)], + ) + gt_rows = db.execute( + "SELECT rowid FROM t_flat WHERE embedding MATCH ? 
ORDER BY distance LIMIT ?", + [query, k], + ).fetchall() + gt_ids = {r["rowid"] for r in gt_rows} + + recall_low = len(recalls[0] & gt_ids) / k + recall_high = len(recalls[1] & gt_ids) / k + assert recall_high >= recall_low + + +def test_multiple_vector_columns(db): + """One column with rescore, one without.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " v1 float[8] indexed by rescore(quantizer=bit)," + " v2 float[8]" + ")" + ) + db.execute( + "INSERT INTO t(rowid, v1, v2) VALUES (1, ?, ?)", + [float_vec([1.0] * 8), float_vec([0.0] * 8)], + ) + db.execute( + "INSERT INTO t(rowid, v1, v2) VALUES (2, ?, ?)", + [float_vec([0.0] * 8), float_vec([1.0] * 8)], + ) + + # KNN on v1 (rescore path) + rows = db.execute( + "SELECT rowid FROM t WHERE v1 MATCH ? ORDER BY distance LIMIT 1", + [float_vec([1.0] * 8)], + ).fetchall() + assert rows[0]["rowid"] == 1 + + # KNN on v2 (normal path) + rows = db.execute( + "SELECT rowid FROM t WHERE v2 MATCH ? ORDER BY distance LIMIT 1", + [float_vec([1.0] * 8)], + ).fetchall() + assert rows[0]["rowid"] == 2 diff --git a/tests/test-unit.c b/tests/test-unit.c index 9eb8704..b180625 100644 --- a/tests/test-unit.c +++ b/tests/test-unit.c @@ -760,6 +760,202 @@ void test_distance_hamming() { printf(" All distance_hamming tests passed.\n"); } +#ifdef SQLITE_VEC_ENABLE_RESCORE + +void test_rescore_quantize_float_to_bit() { + printf("Starting %s...\n", __func__); + uint8_t dst[16]; + + // All positive -> all bits 1 + { + float src[8] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + memset(dst, 0, sizeof(dst)); + _test_rescore_quantize_float_to_bit(src, dst, 8); + assert(dst[0] == 0xFF); + } + + // All negative -> all bits 0 + { + float src[8] = {-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}; + memset(dst, 0xFF, sizeof(dst)); + _test_rescore_quantize_float_to_bit(src, dst, 8); + assert(dst[0] == 0x00); + } + + // Alternating positive/negative + { + float src[8] = {1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f}; + 
_test_rescore_quantize_float_to_bit(src, dst, 8); + // bits 0,2,4,6 set => 0b01010101 = 0x55 + assert(dst[0] == 0x55); + } + + // Zero values -> bit is set (>= 0.0f) + { + float src[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + _test_rescore_quantize_float_to_bit(src, dst, 8); + assert(dst[0] == 0xFF); + } + + // 128 dimensions -> 16 bytes output + { + float src[128]; + for (int i = 0; i < 128; i++) src[i] = (i % 2 == 0) ? 1.0f : -1.0f; + memset(dst, 0, 16); + _test_rescore_quantize_float_to_bit(src, dst, 128); + // Even indices set: bits 0,2,4,6 in each byte => 0x55 + for (int i = 0; i < 16; i++) { + assert(dst[i] == 0x55); + } + } + + printf(" All rescore_quantize_float_to_bit tests passed.\n"); +} + +void test_rescore_quantize_float_to_int8() { + printf("Starting %s...\n", __func__); + int8_t dst[256]; + + // Uniform vector -> all zeros (range=0) + { + float src[8] = {5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f}; + _test_rescore_quantize_float_to_int8(src, dst, 8); + for (int i = 0; i < 8; i++) { + assert(dst[i] == 0); + } + } + + // [0.0, 1.0] -> should map to [-128, 127] + { + float src[2] = {0.0f, 1.0f}; + _test_rescore_quantize_float_to_int8(src, dst, 2); + assert(dst[0] == -128); + assert(dst[1] == 127); + } + + // [-1.0, 0.0] -> should map to [-128, 127] + { + float src[2] = {-1.0f, 0.0f}; + _test_rescore_quantize_float_to_int8(src, dst, 2); + assert(dst[0] == -128); + assert(dst[1] == 127); + } + + // Single-element: range=0 -> 0 + { + float src[1] = {42.0f}; + _test_rescore_quantize_float_to_int8(src, dst, 1); + assert(dst[0] == 0); + } + + // Verify range: all outputs in [-128, 127], min near -128, max near 127 + { + float src[4] = {-100.0f, 0.0f, 100.0f, 50.0f}; + _test_rescore_quantize_float_to_int8(src, dst, 4); + for (int i = 0; i < 4; i++) { + assert(dst[i] >= -128 && dst[i] <= 127); + } + // Min maps to -128 (exact), max maps to ~127 (may lose 1 to float rounding) + assert(dst[0] == -128); + assert(dst[2] >= 126 && dst[2] <= 127); + 
// Middle value (50) should be positive + assert(dst[3] > 0); + } + + printf(" All rescore_quantize_float_to_int8 tests passed.\n"); +} + +void test_rescore_quantized_byte_size() { + printf("Starting %s...\n", __func__); + + // Bit quantizer: dims/8 + assert(_test_rescore_quantized_byte_size_bit(128) == 16); + assert(_test_rescore_quantized_byte_size_bit(8) == 1); + assert(_test_rescore_quantized_byte_size_bit(1024) == 128); + + // Int8 quantizer: dims + assert(_test_rescore_quantized_byte_size_int8(128) == 128); + assert(_test_rescore_quantized_byte_size_int8(8) == 8); + assert(_test_rescore_quantized_byte_size_int8(1024) == 1024); + + printf(" All rescore_quantized_byte_size tests passed.\n"); +} + +void test_vec0_parse_vector_column_rescore() { + printf("Starting %s...\n", __func__); + struct VectorColumnDefinition col; + int rc; + + // Basic bit quantizer + { + const char *input = "emb float[128] indexed by rescore(quantizer=bit)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_RESCORE); + assert(col.rescore.quantizer_type == VEC0_RESCORE_QUANTIZER_BIT); + assert(col.rescore.oversample == 8); // default + assert(col.dimensions == 128); + sqlite3_free(col.name); + } + + // Int8 quantizer + { + const char *input = "emb float[128] indexed by rescore(quantizer=int8)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_RESCORE); + assert(col.rescore.quantizer_type == VEC0_RESCORE_QUANTIZER_INT8); + sqlite3_free(col.name); + } + + // Bit quantizer with oversample + { + const char *input = "emb float[128] indexed by rescore(quantizer=bit, oversample=16)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_RESCORE); + assert(col.rescore.quantizer_type == VEC0_RESCORE_QUANTIZER_BIT); + assert(col.rescore.oversample 
== 16); + sqlite3_free(col.name); + } + + // Error: non-float element type + { + const char *input = "emb int8[128] indexed by rescore(quantizer=bit)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // Error: dims not divisible by 8 for bit quantizer + { + const char *input = "emb float[100] indexed by rescore(quantizer=bit)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // Error: missing quantizer + { + const char *input = "emb float[128] indexed by rescore(oversample=8)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // With distance_metric=cosine + { + const char *input = "emb float[128] distance_metric=cosine indexed by rescore(quantizer=int8)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_RESCORE); + assert(col.distance_metric == VEC0_DISTANCE_METRIC_COSINE); + assert(col.rescore.quantizer_type == VEC0_RESCORE_QUANTIZER_INT8); + sqlite3_free(col.name); + } + + printf(" All vec0_parse_vector_column_rescore tests passed.\n"); +} + +#endif /* SQLITE_VEC_ENABLE_RESCORE */ + int main() { printf("Starting unit tests...\n"); #ifdef SQLITE_VEC_ENABLE_AVX @@ -768,6 +964,9 @@ int main() { #ifdef SQLITE_VEC_ENABLE_NEON printf("SQLITE_VEC_ENABLE_NEON=1\n"); #endif +#ifdef SQLITE_VEC_ENABLE_RESCORE + printf("SQLITE_VEC_ENABLE_RESCORE=1\n"); +#endif #if !defined(SQLITE_VEC_ENABLE_AVX) && !defined(SQLITE_VEC_ENABLE_NEON) printf("SIMD: none\n"); #endif @@ -778,5 +977,11 @@ int main() { test_distance_l2_sqr_float(); test_distance_cosine_float(); test_distance_hamming(); +#ifdef SQLITE_VEC_ENABLE_RESCORE + test_rescore_quantize_float_to_bit(); + test_rescore_quantize_float_to_int8(); + test_rescore_quantized_byte_size(); + test_vec0_parse_vector_column_rescore(); +#endif printf("All unit tests passed.\n"); } From 
69f7b658e99684876a501a62d79c397e905db3b7 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Mon, 30 Mar 2026 16:40:44 -0700 Subject: [PATCH 03/38] rm unnecessary TODO --- TODO.md | 73 --------------------------------------------------------- 1 file changed, 73 deletions(-) delete mode 100644 TODO.md diff --git a/TODO.md b/TODO.md deleted file mode 100644 index 4c3cc19..0000000 --- a/TODO.md +++ /dev/null @@ -1,73 +0,0 @@ -# TODO: `ann` base branch + consolidated benchmarks - -## 1. Create `ann` branch with shared code - -### 1.1 Branch setup -- [x] `git checkout -B ann origin/main` -- [x] Cherry-pick `624f998` (vec0_distance_full shared distance dispatch) -- [x] Cherry-pick stdint.h fix for test header -- [ ] Pull NEON cosine optimization from ivf-yolo3 into shared code - - Currently only in ivf branch but is general-purpose (benefits all distance calcs) - - Lives in `distance_cosine_float()` — ~57 lines of ARM NEON vectorized cosine - -### 1.2 Benchmark infrastructure (`benchmarks-ann/`) -- [x] Seed data pipeline (`seed/Makefile`, `seed/build_base_db.py`) -- [x] Ground truth generator (`ground_truth.py`) -- [x] Results schema (`schema.sql`) -- [x] Benchmark runner with `INDEX_REGISTRY` extension point (`bench.py`) - - Baseline configs (float, int8-rescore, bit-rescore) implemented - - Index branches register their types via `INDEX_REGISTRY` dict -- [x] Makefile with baseline targets -- [x] README - -### 1.3 Rebase feature branches onto `ann` -- [x] Rebase `diskann-yolo2` onto `ann` (1 commit: DiskANN implementation) -- [x] Rebase `ivf-yolo3` onto `ann` (1 commit: IVF implementation) -- [x] Rebase `annoy-yolo2` onto `ann` (2 commits: Annoy implementation + schema fix) -- [x] Verify each branch has only its index-specific commits remaining -- [ ] Force-push all 4 branches to origin - ---- - -## 2. 
Per-branch: register index type in benchmarks - -Each index branch should add to `benchmarks-ann/` when rebased onto `ann`: - -### 2.1 Register in `bench.py` - -Add an `INDEX_REGISTRY` entry. Each entry provides: -- `defaults` — default param values -- `create_table_sql(params)` — CREATE VIRTUAL TABLE with INDEXED BY clause -- `insert_sql(params)` — custom insert SQL, or None for default -- `post_insert_hook(conn, params)` — training/building step, returns time -- `run_query(conn, params, query, k)` — custom query, or None for default MATCH -- `describe(params)` — one-line description for report output - -### 2.2 Add configs to `Makefile` - -Append index-specific config variables and targets. Example pattern: - -```makefile -DISKANN_CONFIGS = \ - "diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \ - ... - -ALL_CONFIGS += $(DISKANN_CONFIGS) - -bench-diskann: seed - $(BENCH) --subset-size 10000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS) - ... -``` - -### 2.3 Migrate existing benchmark results/docs - -- Move useful results docs (RESULTS.md, etc.) into `benchmarks-ann/results/` -- Delete redundant per-branch benchmark directories once consolidated infra is proven - ---- - -## 3. 
Future improvements - -- [ ] Reporting script (`report.py`) — query results.db, produce markdown comparison tables -- [ ] Profiling targets in Makefile (lift from ivf-yolo3's Instruments/perf wrappers) -- [ ] Pre-computed ground truth integration (use GT DB files instead of on-the-fly brute-force) From e9f598abfa0c06b328d8fe5da9c3760cce74be10 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 00:59:06 -0700 Subject: [PATCH 04/38] v0.1.9 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 2f2c74e..1a03094 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.9-alpha.1 +0.1.9 From 0de765f4570c1d65a0699e8e38d6c5562da9e23e Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 01:03:32 -0700 Subject: [PATCH 05/38] Add ANN search support for vec0 virtual table (#273) Add approximate nearest neighbor infrastructure to vec0: shared distance dispatch (vec0_distance_full), flat index type with parser, NEON-optimized cosine/Hamming for float32/int8, amalgamation script, and benchmark suite (benchmarks-ann/) with ground-truth generation and profiling tools. Remove unused vec_npy_each/vec_static_blobs code, fix missing stdint.h include. 
--- Makefile | 14 +- TODO.md | 73 + benchmarks-ann/.gitignore | 2 + benchmarks-ann/Makefile | 61 + benchmarks-ann/README.md | 81 + benchmarks-ann/bench.py | 488 ++++++ benchmarks-ann/ground_truth.py | 168 ++ benchmarks-ann/profile.py | 440 ++++++ benchmarks-ann/schema.sql | 35 + benchmarks-ann/seed/.gitignore | 2 + benchmarks-ann/seed/Makefile | 24 + benchmarks-ann/seed/build_base_db.py | 121 ++ benchmarks/exhaustive-memory/bench.py | 57 +- benchmarks/profiling/build-from-npy.sql | 7 - benchmarks/self-params/build.py | 14 +- bindings/go/ncruces/go-sqlite3.patch | 1 - bindings/python/extra_init.py | 31 - scripts/amalgamate.py | 119 ++ site/api-reference.md | 59 - site/compiling.md | 1 - sqlite-vec.c | 1863 +++++------------------ tests/correctness/test-correctness.py | 17 +- tests/fuzz/numpy.c | 37 - tests/sqlite-vec-internal.h | 6 + tests/test-loadable.py | 415 +---- tests/test-unit.c | 101 ++ tmp-static.py | 56 - 27 files changed, 2177 insertions(+), 2116 deletions(-) create mode 100644 TODO.md create mode 100644 benchmarks-ann/.gitignore create mode 100644 benchmarks-ann/Makefile create mode 100644 benchmarks-ann/README.md create mode 100644 benchmarks-ann/bench.py create mode 100644 benchmarks-ann/ground_truth.py create mode 100644 benchmarks-ann/profile.py create mode 100644 benchmarks-ann/schema.sql create mode 100644 benchmarks-ann/seed/.gitignore create mode 100644 benchmarks-ann/seed/Makefile create mode 100644 benchmarks-ann/seed/build_base_db.py create mode 100644 scripts/amalgamate.py delete mode 100644 tests/fuzz/numpy.c delete mode 100644 tmp-static.py diff --git a/Makefile b/Makefile index 1ebdbed..051590e 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,11 @@ ifndef OMIT_SIMD ifeq ($(shell uname -sm),Darwin arm64) CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON endif + ifeq ($(shell uname -s),Linux) + ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),) + CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX + endif + endif 
endif ifdef USE_BREW_SQLITE @@ -155,6 +160,13 @@ clean: rm -rf dist +TARGET_AMALGAMATION=$(prefix)/sqlite-vec.c + +amalgamation: $(TARGET_AMALGAMATION) + +$(TARGET_AMALGAMATION): sqlite-vec.c $(wildcard sqlite-vec-*.c) scripts/amalgamate.py $(prefix) + python3 scripts/amalgamate.py sqlite-vec.c > $@ + FORMAT_FILES=sqlite-vec.h sqlite-vec.c format: $(FORMAT_FILES) clang-format -i $(FORMAT_FILES) @@ -174,7 +186,7 @@ evidence-of: test: sqlite3 :memory: '.read test.sql' -.PHONY: version loadable static test clean gh-release evidence-of install uninstall +.PHONY: version loadable static test clean gh-release evidence-of install uninstall amalgamation publish-release: ./scripts/publish-release.sh diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..4c3cc19 --- /dev/null +++ b/TODO.md @@ -0,0 +1,73 @@ +# TODO: `ann` base branch + consolidated benchmarks + +## 1. Create `ann` branch with shared code + +### 1.1 Branch setup +- [x] `git checkout -B ann origin/main` +- [x] Cherry-pick `624f998` (vec0_distance_full shared distance dispatch) +- [x] Cherry-pick stdint.h fix for test header +- [ ] Pull NEON cosine optimization from ivf-yolo3 into shared code + - Currently only in ivf branch but is general-purpose (benefits all distance calcs) + - Lives in `distance_cosine_float()` — ~57 lines of ARM NEON vectorized cosine + +### 1.2 Benchmark infrastructure (`benchmarks-ann/`) +- [x] Seed data pipeline (`seed/Makefile`, `seed/build_base_db.py`) +- [x] Ground truth generator (`ground_truth.py`) +- [x] Results schema (`schema.sql`) +- [x] Benchmark runner with `INDEX_REGISTRY` extension point (`bench.py`) + - Baseline configs (float, int8-rescore, bit-rescore) implemented + - Index branches register their types via `INDEX_REGISTRY` dict +- [x] Makefile with baseline targets +- [x] README + +### 1.3 Rebase feature branches onto `ann` +- [x] Rebase `diskann-yolo2` onto `ann` (1 commit: DiskANN implementation) +- [x] Rebase `ivf-yolo3` onto `ann` (1 commit: IVF 
implementation) +- [x] Rebase `annoy-yolo2` onto `ann` (2 commits: Annoy implementation + schema fix) +- [x] Verify each branch has only its index-specific commits remaining +- [ ] Force-push all 4 branches to origin + +--- + +## 2. Per-branch: register index type in benchmarks + +Each index branch should add to `benchmarks-ann/` when rebased onto `ann`: + +### 2.1 Register in `bench.py` + +Add an `INDEX_REGISTRY` entry. Each entry provides: +- `defaults` — default param values +- `create_table_sql(params)` — CREATE VIRTUAL TABLE with INDEXED BY clause +- `insert_sql(params)` — custom insert SQL, or None for default +- `post_insert_hook(conn, params)` — training/building step, returns time +- `run_query(conn, params, query, k)` — custom query, or None for default MATCH +- `describe(params)` — one-line description for report output + +### 2.2 Add configs to `Makefile` + +Append index-specific config variables and targets. Example pattern: + +```makefile +DISKANN_CONFIGS = \ + "diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \ + ... + +ALL_CONFIGS += $(DISKANN_CONFIGS) + +bench-diskann: seed + $(BENCH) --subset-size 10000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS) + ... +``` + +### 2.3 Migrate existing benchmark results/docs + +- Move useful results docs (RESULTS.md, etc.) into `benchmarks-ann/results/` +- Delete redundant per-branch benchmark directories once consolidated infra is proven + +--- + +## 3. 
Future improvements + +- [ ] Reporting script (`report.py`) — query results.db, produce markdown comparison tables +- [ ] Profiling targets in Makefile (lift from ivf-yolo3's Instruments/perf wrappers) +- [ ] Pre-computed ground truth integration (use GT DB files instead of on-the-fly brute-force) diff --git a/benchmarks-ann/.gitignore b/benchmarks-ann/.gitignore new file mode 100644 index 0000000..c418b76 --- /dev/null +++ b/benchmarks-ann/.gitignore @@ -0,0 +1,2 @@ +*.db +runs/ diff --git a/benchmarks-ann/Makefile b/benchmarks-ann/Makefile new file mode 100644 index 0000000..59e2dcd --- /dev/null +++ b/benchmarks-ann/Makefile @@ -0,0 +1,61 @@ +BENCH = python bench.py +BASE_DB = seed/base.db +EXT = ../dist/vec0 + +# --- Baseline (brute-force) configs --- +BASELINES = \ + "brute-float:type=baseline,variant=float" \ + "brute-int8:type=baseline,variant=int8" \ + "brute-bit:type=baseline,variant=bit" + +# --- Index-specific configs --- +# Each index branch should add its own configs here. Example: +# +# DISKANN_CONFIGS = \ +# "diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \ +# "diskann-R72-int8:type=diskann,R=72,L=128,quantizer=int8" +# +# IVF_CONFIGS = \ +# "ivf-n128-p16:type=ivf,nlist=128,nprobe=16" +# +# ANNOY_CONFIGS = \ +# "annoy-t50:type=annoy,n_trees=50" + +ALL_CONFIGS = $(BASELINES) + +.PHONY: seed ground-truth bench-smoke bench-10k bench-50k bench-100k bench-all \ + report clean + +# --- Data preparation --- +seed: + $(MAKE) -C seed + +ground-truth: seed + python ground_truth.py --subset-size 10000 + python ground_truth.py --subset-size 50000 + python ground_truth.py --subset-size 100000 + +# --- Quick smoke test --- +bench-smoke: seed + $(BENCH) --subset-size 5000 -k 10 -n 20 -o runs/smoke \ + $(BASELINES) + +# --- Standard sizes --- +bench-10k: seed + $(BENCH) --subset-size 10000 -k 10 -o runs/10k $(ALL_CONFIGS) + +bench-50k: seed + $(BENCH) --subset-size 50000 -k 10 -o runs/50k $(ALL_CONFIGS) + +bench-100k: seed + $(BENCH) --subset-size 
100000 -k 10 -o runs/100k $(ALL_CONFIGS) + +bench-all: bench-10k bench-50k bench-100k + +# --- Report --- +report: + @echo "Use: sqlite3 runs//results.db 'SELECT * FROM bench_results ORDER BY recall DESC'" + +# --- Cleanup --- +clean: + rm -rf runs/ diff --git a/benchmarks-ann/README.md b/benchmarks-ann/README.md new file mode 100644 index 0000000..1f7fd5c --- /dev/null +++ b/benchmarks-ann/README.md @@ -0,0 +1,81 @@ +# KNN Benchmarks for sqlite-vec + +Benchmarking infrastructure for vec0 KNN configurations. Includes brute-force +baselines (float, int8, bit); index-specific branches add their own types +via the `INDEX_REGISTRY` in `bench.py`. + +## Prerequisites + +- Built `dist/vec0` extension (run `make` from repo root) +- Python 3.10+ +- `uv` (for seed data prep): `pip install uv` + +## Quick start + +```bash +# 1. Download dataset and build seed DB (~3 GB download, ~5 min) +make seed + +# 2. Run a quick smoke test (5k vectors, ~1 min) +make bench-smoke + +# 3. Run full benchmark at 10k +make bench-10k +``` + +## Usage + +### Direct invocation + +```bash +python bench.py --subset-size 10000 \ + "brute-float:type=baseline,variant=float" \ + "brute-int8:type=baseline,variant=int8" \ + "brute-bit:type=baseline,variant=bit" +``` + +### Config format + +`name:type=<index_type>,key=val,key=val` + +| Index type | Keys | Branch | +|-----------|------|--------| +| `baseline` | `variant` (float/int8/bit), `oversample` | this branch | + +Index branches register additional types in `INDEX_REGISTRY`. See the +docstring in `bench.py` for the extension API. 
+ +### Make targets + +| Target | Description | +|--------|-------------| +| `make seed` | Download COHERE 1M dataset | +| `make ground-truth` | Pre-compute ground truth for 10k/50k/100k | +| `make bench-smoke` | Quick 5k baseline test | +| `make bench-10k` | All configs at 10k vectors | +| `make bench-50k` | All configs at 50k vectors | +| `make bench-100k` | All configs at 100k vectors | +| `make bench-all` | 10k + 50k + 100k | + +## Adding an index type + +In your index branch, add an entry to `INDEX_REGISTRY` in `bench.py` and +append your configs to `ALL_CONFIGS` in the `Makefile`. See the existing +`baseline` entry and the comments in both files for the pattern. + +## Results + +Results are stored in `runs/<dir>/results.db` using the schema in `schema.sql`. + +```bash +sqlite3 runs/10k/results.db " + SELECT config_name, recall, mean_ms, qps + FROM bench_results + ORDER BY recall DESC +" +``` + +## Dataset + +[Zilliz COHERE Medium 1M](https://zilliz.com/learn/datasets-for-vector-database-benchmarks): +768 dimensions, cosine distance, 1M train vectors + 10k query vectors with precomputed neighbors. diff --git a/benchmarks-ann/bench.py b/benchmarks-ann/bench.py new file mode 100644 index 0000000..93f8f82 --- /dev/null +++ b/benchmarks-ann/bench.py @@ -0,0 +1,488 @@ +#!/usr/bin/env python3 +"""Benchmark runner for sqlite-vec KNN configurations. + +Measures insert time, build/train time, DB size, KNN latency, and recall +across different vec0 configurations. + +Config format: name:type=,key=val,key=val + + Baseline (brute-force) keys: + type=baseline, variant=float|int8|bit, oversample=8 + + Index-specific types can be registered via INDEX_REGISTRY (see below). 
+ +Usage: + python bench.py --subset-size 10000 \ + "brute-float:type=baseline,variant=float" \ + "brute-int8:type=baseline,variant=int8" \ + "brute-bit:type=baseline,variant=bit" +""" +import argparse +import os +import sqlite3 +import statistics +import time + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +EXT_PATH = os.path.join(_SCRIPT_DIR, "..", "dist", "vec0") +BASE_DB = os.path.join(_SCRIPT_DIR, "seed", "base.db") +INSERT_BATCH_SIZE = 1000 + + +# ============================================================================ +# Index registry — extension point for ANN index branches +# ============================================================================ +# +# Each index type provides a dict with: +# "defaults": dict of default params +# "create_table_sql": fn(params) -> SQL string +# "insert_sql": fn(params) -> SQL string (or None for default) +# "post_insert_hook": fn(conn, params) -> train_time_s (or None) +# "run_query": fn(conn, params, query, k) -> [(id, distance), ...] (or None for default MATCH) +# "describe": fn(params) -> str (one-line description) +# +# To add a new index type, add an entry here. 
Example (in your branch): +# +# INDEX_REGISTRY["diskann"] = { +# "defaults": {"R": 72, "L": 128, "quantizer": "binary", "buffer_threshold": 0}, +# "create_table_sql": lambda p: f"CREATE VIRTUAL TABLE vec_items USING vec0(...)", +# "insert_sql": None, +# "post_insert_hook": None, +# "run_query": None, +# "describe": lambda p: f"diskann q={p['quantizer']} R={p['R']} L={p['L']}", +# } + +INDEX_REGISTRY = {} + + +# ============================================================================ +# Baseline implementation +# ============================================================================ + + +def _baseline_create_table_sql(params): + variant = params["variant"] + extra = "" + if variant == "int8": + extra = ", embedding_int8 int8[768]" + elif variant == "bit": + extra = ", embedding_bq bit[768]" + return ( + f"CREATE VIRTUAL TABLE vec_items USING vec0(" + f" chunk_size=256," + f" id integer primary key," + f" embedding float[768] distance_metric=cosine" + f" {extra})" + ) + + +def _baseline_insert_sql(params): + variant = params["variant"] + if variant == "int8": + return ( + "INSERT INTO vec_items(id, embedding, embedding_int8) " + "SELECT id, vector, vec_quantize_int8(vector, 'unit') " + "FROM base.train WHERE id >= :lo AND id < :hi" + ) + elif variant == "bit": + return ( + "INSERT INTO vec_items(id, embedding, embedding_bq) " + "SELECT id, vector, vec_quantize_binary(vector) " + "FROM base.train WHERE id >= :lo AND id < :hi" + ) + return None # use default + + +def _baseline_run_query(conn, params, query, k): + variant = params["variant"] + oversample = params.get("oversample", 8) + + if variant == "int8": + return conn.execute( + "WITH coarse AS (" + " SELECT id, embedding FROM vec_items" + " WHERE embedding_int8 MATCH vec_quantize_int8(:query, 'unit')" + " LIMIT :oversample_k" + ") " + "SELECT id, vec_distance_cosine(embedding, :query) as distance " + "FROM coarse ORDER BY 2 LIMIT :k", + {"query": query, "k": k, "oversample_k": k * oversample}, + 
).fetchall() + elif variant == "bit": + return conn.execute( + "WITH coarse AS (" + " SELECT id, embedding FROM vec_items" + " WHERE embedding_bq MATCH vec_quantize_binary(:query)" + " LIMIT :oversample_k" + ") " + "SELECT id, vec_distance_cosine(embedding, :query) as distance " + "FROM coarse ORDER BY 2 LIMIT :k", + {"query": query, "k": k, "oversample_k": k * oversample}, + ).fetchall() + + return None # use default MATCH + + +def _baseline_describe(params): + v = params["variant"] + if v in ("int8", "bit"): + return f"baseline {v} (os={params['oversample']})" + return f"baseline {v}" + + +INDEX_REGISTRY["baseline"] = { + "defaults": {"variant": "float", "oversample": 8}, + "create_table_sql": _baseline_create_table_sql, + "insert_sql": _baseline_insert_sql, + "post_insert_hook": None, + "run_query": _baseline_run_query, + "describe": _baseline_describe, +} + + +# ============================================================================ +# Config parsing +# ============================================================================ + +INT_KEYS = { + "R", "L", "buffer_threshold", "nlist", "nprobe", "oversample", + "n_trees", "search_k", +} + + +def parse_config(spec): + """Parse 'name:type=baseline,key=val,...' into (name, params_dict).""" + if ":" in spec: + name, opts_str = spec.split(":", 1) + else: + name, opts_str = spec, "" + + raw = {} + if opts_str: + for kv in opts_str.split(","): + k, v = kv.split("=", 1) + raw[k.strip()] = v.strip() + + index_type = raw.pop("type", "baseline") + if index_type not in INDEX_REGISTRY: + raise ValueError( + f"Unknown index type: {index_type}. 
" + f"Available: {', '.join(sorted(INDEX_REGISTRY.keys()))}" + ) + + reg = INDEX_REGISTRY[index_type] + params = dict(reg["defaults"]) + for k, v in raw.items(): + if k in INT_KEYS: + params[k] = int(v) + else: + params[k] = v + params["index_type"] = index_type + + return name, params + + +# ============================================================================ +# Shared helpers +# ============================================================================ + + +def load_query_vectors(base_db_path, n): + conn = sqlite3.connect(base_db_path) + rows = conn.execute( + "SELECT id, vector FROM query_vectors ORDER BY id LIMIT :n", {"n": n} + ).fetchall() + conn.close() + return [(r[0], r[1]) for r in rows] + + +def insert_loop(conn, sql, subset_size, label=""): + t0 = time.perf_counter() + for lo in range(0, subset_size, INSERT_BATCH_SIZE): + hi = min(lo + INSERT_BATCH_SIZE, subset_size) + conn.execute(sql, {"lo": lo, "hi": hi}) + conn.commit() + done = hi + if done % 5000 == 0 or done == subset_size: + elapsed = time.perf_counter() - t0 + rate = done / elapsed if elapsed > 0 else 0 + print( + f" [{label}] {done:>8}/{subset_size} " + f"{elapsed:.1f}s {rate:.0f} rows/s", + flush=True, + ) + return time.perf_counter() - t0 + + +def open_bench_db(db_path, ext_path, base_db): + if os.path.exists(db_path): + os.remove(db_path) + conn = sqlite3.connect(db_path) + conn.enable_load_extension(True) + conn.load_extension(ext_path) + conn.execute("PRAGMA page_size=8192") + conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + return conn + + +DEFAULT_INSERT_SQL = ( + "INSERT INTO vec_items(id, embedding) " + "SELECT id, vector FROM base.train WHERE id >= :lo AND id < :hi" +) + + +# ============================================================================ +# Build +# ============================================================================ + + +def build_index(base_db, ext_path, name, params, subset_size, out_dir): + db_path = os.path.join(out_dir, 
f"{name}.{subset_size}.db") + conn = open_bench_db(db_path, ext_path, base_db) + + reg = INDEX_REGISTRY[params["index_type"]] + + conn.execute(reg["create_table_sql"](params)) + + label = params["index_type"] + print(f" Inserting {subset_size} vectors...") + + sql_fn = reg.get("insert_sql") + sql = sql_fn(params) if sql_fn else None + if sql is None: + sql = DEFAULT_INSERT_SQL + + insert_time = insert_loop(conn, sql, subset_size, label) + + train_time = 0.0 + hook = reg.get("post_insert_hook") + if hook: + train_time = hook(conn, params) + + row_count = conn.execute("SELECT count(*) FROM vec_items").fetchone()[0] + conn.close() + file_size_mb = os.path.getsize(db_path) / (1024 * 1024) + + return { + "db_path": db_path, + "insert_time_s": round(insert_time, 3), + "train_time_s": round(train_time, 3), + "total_time_s": round(insert_time + train_time, 3), + "insert_per_vec_ms": round((insert_time / row_count) * 1000, 2) + if row_count + else 0, + "rows": row_count, + "file_size_mb": round(file_size_mb, 2), + } + + +# ============================================================================ +# KNN measurement +# ============================================================================ + + +def _default_match_query(conn, query, k): + return conn.execute( + "SELECT id, distance FROM vec_items " + "WHERE embedding MATCH :query AND k = :k", + {"query": query, "k": k}, + ).fetchall() + + +def measure_knn(db_path, ext_path, base_db, params, subset_size, k=10, n=50): + conn = sqlite3.connect(db_path) + conn.enable_load_extension(True) + conn.load_extension(ext_path) + conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + + query_vectors = load_query_vectors(base_db, n) + + reg = INDEX_REGISTRY[params["index_type"]] + query_fn = reg.get("run_query") + + times_ms = [] + recalls = [] + for qid, query in query_vectors: + t0 = time.perf_counter() + + results = None + if query_fn: + results = query_fn(conn, params, query, k) + if results is None: + results = 
_default_match_query(conn, query, k) + + elapsed_ms = (time.perf_counter() - t0) * 1000 + times_ms.append(elapsed_ms) + result_ids = set(r[0] for r in results) + + # Ground truth: use pre-computed neighbors table for full dataset, + # otherwise brute-force over the subset + if subset_size >= 1000000: + gt_rows = conn.execute( + "SELECT CAST(neighbors_id AS INTEGER) FROM base.neighbors " + "WHERE query_vector_id = :qid AND rank < :k", + {"qid": qid, "k": k}, + ).fetchall() + else: + gt_rows = conn.execute( + "SELECT id FROM (" + " SELECT id, vec_distance_cosine(vector, :query) as dist " + " FROM base.train WHERE id < :n ORDER BY dist LIMIT :k" + ")", + {"query": query, "k": k, "n": subset_size}, + ).fetchall() + gt_ids = set(r[0] for r in gt_rows) + + if gt_ids: + recalls.append(len(result_ids & gt_ids) / len(gt_ids)) + else: + recalls.append(0.0) + + conn.close() + + return { + "mean_ms": round(statistics.mean(times_ms), 2), + "median_ms": round(statistics.median(times_ms), 2), + "p99_ms": round(sorted(times_ms)[int(len(times_ms) * 0.99)], 2) + if len(times_ms) > 1 + else round(times_ms[0], 2), + "total_ms": round(sum(times_ms), 2), + "recall": round(statistics.mean(recalls), 4), + } + + +# ============================================================================ +# Results persistence +# ============================================================================ + + +def save_results(results_path, rows): + db = sqlite3.connect(results_path) + db.executescript(open(os.path.join(_SCRIPT_DIR, "schema.sql")).read()) + for r in rows: + db.execute( + "INSERT OR REPLACE INTO build_results " + "(config_name, index_type, subset_size, db_path, " + " insert_time_s, train_time_s, total_time_s, rows, file_size_mb) " + "VALUES (?,?,?,?,?,?,?,?,?)", + ( + r["name"], r["index_type"], r["n_vectors"], r["db_path"], + r["insert_time_s"], r["train_time_s"], r["total_time_s"], + r["rows"], r["file_size_mb"], + ), + ) + db.execute( + "INSERT OR REPLACE INTO bench_results " + 
"(config_name, index_type, subset_size, k, n, " + " mean_ms, median_ms, p99_ms, total_ms, qps, recall, db_path) " + "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)", + ( + r["name"], r["index_type"], r["n_vectors"], r["k"], r["n_queries"], + r["mean_ms"], r["median_ms"], r["p99_ms"], r["total_ms"], + round(r["n_queries"] / (r["total_ms"] / 1000), 1) + if r["total_ms"] > 0 else 0, + r["recall"], r["db_path"], + ), + ) + db.commit() + db.close() + + +# ============================================================================ +# Reporting +# ============================================================================ + + +def print_report(all_results): + print( + f"\n{'name':>20} {'N':>7} {'type':>10} {'config':>28} " + f"{'ins(s)':>7} {'train':>6} {'MB':>7} " + f"{'qry(ms)':>8} {'recall':>7}" + ) + print("-" * 115) + for r in all_results: + train = f"{r['train_time_s']:.1f}" if r["train_time_s"] > 0 else "-" + print( + f"{r['name']:>20} {r['n_vectors']:>7} {r['index_type']:>10} " + f"{r['config_desc']:>28} " + f"{r['insert_time_s']:>7.1f} {train:>6} {r['file_size_mb']:>7.1f} " + f"{r['mean_ms']:>8.2f} {r['recall']:>7.4f}" + ) + + +# ============================================================================ +# Main +# ============================================================================ + + +def main(): + parser = argparse.ArgumentParser( + description="Benchmark runner for sqlite-vec KNN configurations", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("configs", nargs="+", help="config specs (name:type=X,key=val,...)") + parser.add_argument("--subset-size", type=int, required=True) + parser.add_argument("-k", type=int, default=10, help="KNN k (default 10)") + parser.add_argument("-n", type=int, default=50, help="number of queries (default 50)") + parser.add_argument("--base-db", default=BASE_DB) + parser.add_argument("--ext", default=EXT_PATH) + parser.add_argument("-o", "--out-dir", default="runs") + 
parser.add_argument("--results-db", default=None, + help="path to results DB (default: /results.db)") + args = parser.parse_args() + + os.makedirs(args.out_dir, exist_ok=True) + results_db = args.results_db or os.path.join(args.out_dir, "results.db") + configs = [parse_config(c) for c in args.configs] + + all_results = [] + for i, (name, params) in enumerate(configs, 1): + reg = INDEX_REGISTRY[params["index_type"]] + desc = reg["describe"](params) + print(f"\n[{i}/{len(configs)}] {name} ({desc.strip()})") + + build = build_index( + args.base_db, args.ext, name, params, args.subset_size, args.out_dir + ) + train_str = f" + {build['train_time_s']}s train" if build["train_time_s"] > 0 else "" + print( + f" Build: {build['insert_time_s']}s insert{train_str} " + f"{build['file_size_mb']} MB" + ) + + print(f" Measuring KNN (k={args.k}, n={args.n})...") + knn = measure_knn( + build["db_path"], args.ext, args.base_db, + params, args.subset_size, k=args.k, n=args.n, + ) + print(f" KNN: mean={knn['mean_ms']}ms recall@{args.k}={knn['recall']}") + + all_results.append({ + "name": name, + "n_vectors": args.subset_size, + "index_type": params["index_type"], + "config_desc": desc, + "db_path": build["db_path"], + "insert_time_s": build["insert_time_s"], + "train_time_s": build["train_time_s"], + "total_time_s": build["total_time_s"], + "insert_per_vec_ms": build["insert_per_vec_ms"], + "rows": build["rows"], + "file_size_mb": build["file_size_mb"], + "k": args.k, + "n_queries": args.n, + "mean_ms": knn["mean_ms"], + "median_ms": knn["median_ms"], + "p99_ms": knn["p99_ms"], + "total_ms": knn["total_ms"], + "recall": knn["recall"], + }) + + print_report(all_results) + save_results(results_db, all_results) + print(f"\nResults saved to {results_db}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/ground_truth.py b/benchmarks-ann/ground_truth.py new file mode 100644 index 0000000..636a495 --- /dev/null +++ b/benchmarks-ann/ground_truth.py @@ -0,0 +1,168 @@ 
+#!/usr/bin/env python3 +"""Compute per-subset ground truth for ANN benchmarks. + +For subset sizes < 1M, builds a temporary vec0 float table with the first N +vectors and runs brute-force KNN to get correct ground truth per subset. + +For 1M (the full dataset), converts the existing `neighbors` table. + +Output: ground_truth.{subset_size}.db with table: + ground_truth(query_vector_id, rank, neighbor_id, distance) + +Usage: + python ground_truth.py --subset-size 50000 + python ground_truth.py --subset-size 1000000 +""" +import argparse +import os +import sqlite3 +import time + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +EXT_PATH = os.path.join(_SCRIPT_DIR, "..", "dist", "vec0") +BASE_DB = os.path.join(_SCRIPT_DIR, "seed", "base.db") +FULL_DATASET_SIZE = 1_000_000 + + +def gen_ground_truth_subset(base_db, ext_path, subset_size, n_queries, k, out_path): + """Build ground truth by brute-force KNN over the first `subset_size` vectors.""" + if os.path.exists(out_path): + os.remove(out_path) + + conn = sqlite3.connect(out_path) + conn.enable_load_extension(True) + conn.load_extension(ext_path) + + conn.execute( + "CREATE TABLE ground_truth (" + " query_vector_id INTEGER NOT NULL," + " rank INTEGER NOT NULL," + " neighbor_id INTEGER NOT NULL," + " distance REAL NOT NULL," + " PRIMARY KEY (query_vector_id, rank)" + ")" + ) + + conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + + print(f" Building temp vec0 table with {subset_size} vectors...") + conn.execute( + "CREATE VIRTUAL TABLE tmp_vec USING vec0(" + " id integer primary key," + " embedding float[768] distance_metric=cosine" + ")" + ) + + t0 = time.perf_counter() + conn.execute( + "INSERT INTO tmp_vec(id, embedding) " + "SELECT id, vector FROM base.train WHERE id < :n", + {"n": subset_size}, + ) + conn.commit() + build_time = time.perf_counter() - t0 + print(f" Temp table built in {build_time:.1f}s") + + query_vectors = conn.execute( + "SELECT id, vector FROM base.query_vectors ORDER BY id LIMIT 
def gen_ground_truth_full(base_db, n_queries, k, out_path):
    """Convert the existing ``neighbors`` table for the full 1M dataset.

    Creates a fresh SQLite database at ``out_path`` containing a
    ``ground_truth(query_vector_id, rank, neighbor_id, distance)`` table and
    fills it from ``neighbors`` in ``base_db``. Only rows with
    ``query_vector_id < n_queries`` and ``rank < k`` are copied; ``distance``
    is left NULL because the source table carries no distances.

    Args:
        base_db: path to the seed database that holds the ``neighbors`` table.
        n_queries: exclusive upper bound on ``query_vector_id``.
        k: exclusive upper bound on ``rank``.
        out_path: destination database path; an existing file is replaced.
    """
    if os.path.exists(out_path):
        os.remove(out_path)

    conn = sqlite3.connect(out_path)
    try:
        # Bind the filename instead of interpolating it into the SQL text so
        # that paths containing quotes attach correctly.
        conn.execute("ATTACH DATABASE ? AS base", (base_db,))

        conn.execute(
            "CREATE TABLE ground_truth ("
            " query_vector_id INTEGER NOT NULL,"
            " rank INTEGER NOT NULL,"
            " neighbor_id INTEGER NOT NULL,"
            " distance REAL,"
            " PRIMARY KEY (query_vector_id, rank)"
            ")"
        )

        conn.execute(
            "INSERT INTO ground_truth(query_vector_id, rank, neighbor_id) "
            "SELECT query_vector_id, rank, CAST(neighbors_id AS INTEGER) "
            "FROM base.neighbors "
            "WHERE query_vector_id < :n AND rank < :k",
            {"n": n_queries, "k": k},
        )
        conn.commit()

        total_rows = conn.execute(
            "SELECT count(*) FROM ground_truth"
        ).fetchone()[0]
        conn.execute("DETACH DATABASE base")
    finally:
        # Always release the handle, even when a statement above fails.
        conn.close()

    print(f" Ground truth (full): {total_rows} rows -> {out_path}")
+ +Usage: + cd benchmarks-ann + uv run profile.py --subset-size 50000 -n 50 \\ + "baseline-int8:type=baseline,variant=int8,oversample=8" \\ + "rescore-int8:type=rescore,quantizer=int8,oversample=8" +""" + +import argparse +import os +import re +import shutil +import subprocess +import sys +import tempfile + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.join(_SCRIPT_DIR, "..") + +sys.path.insert(0, _SCRIPT_DIR) +from bench import ( + BASE_DB, + DEFAULT_INSERT_SQL, + INDEX_REGISTRY, + INSERT_BATCH_SIZE, + parse_config, +) + +SQLITE3_PATH = os.path.join(_PROJECT_ROOT, "dist", "sqlite3") +EXT_PATH = os.path.join(_PROJECT_ROOT, "dist", "vec0") + + +# ============================================================================ +# SQL generation +# ============================================================================ + + +def _query_sql_for_config(params, query_id, k): + """Return a SQL query string for a single KNN query by query_vector id.""" + index_type = params["index_type"] + qvec = f"(SELECT vector FROM base.query_vectors WHERE id = {query_id})" + + if index_type == "baseline": + variant = params.get("variant", "float") + oversample = params.get("oversample", 8) + oversample_k = k * oversample + + if variant == "int8": + return ( + f"WITH coarse AS (" + f" SELECT id, embedding FROM vec_items" + f" WHERE embedding_int8 MATCH vec_quantize_int8({qvec}, 'unit')" + f" LIMIT {oversample_k}" + f") " + f"SELECT id, vec_distance_cosine(embedding, {qvec}) as distance " + f"FROM coarse ORDER BY 2 LIMIT {k};" + ) + elif variant == "bit": + return ( + f"WITH coarse AS (" + f" SELECT id, embedding FROM vec_items" + f" WHERE embedding_bq MATCH vec_quantize_binary({qvec})" + f" LIMIT {oversample_k}" + f") " + f"SELECT id, vec_distance_cosine(embedding, {qvec}) as distance " + f"FROM coarse ORDER BY 2 LIMIT {k};" + ) + + # Default MATCH query (baseline-float, rescore, and others) + return ( + f"SELECT id, distance FROM vec_items" + f" 
WHERE embedding MATCH {qvec} AND k = {k};" + ) + + +def generate_sql(db_path, params, subset_size, n_queries, k, repeats): + """Generate a complete SQL workload: load ext, create table, insert, query.""" + lines = [] + lines.append(".bail on") + lines.append(f".load {EXT_PATH}") + lines.append(f"ATTACH DATABASE '{os.path.abspath(BASE_DB)}' AS base;") + lines.append("PRAGMA page_size=8192;") + + # Create table + reg = INDEX_REGISTRY[params["index_type"]] + lines.append(reg["create_table_sql"](params) + ";") + + # Inserts + sql_fn = reg.get("insert_sql") + insert_sql = sql_fn(params) if sql_fn else None + if insert_sql is None: + insert_sql = DEFAULT_INSERT_SQL + for lo in range(0, subset_size, INSERT_BATCH_SIZE): + hi = min(lo + INSERT_BATCH_SIZE, subset_size) + stmt = insert_sql.replace(":lo", str(lo)).replace(":hi", str(hi)) + lines.append(stmt + ";") + if hi % 10000 == 0 or hi == subset_size: + lines.append("-- progress: inserted %d/%d" % (hi, subset_size)) + + # Queries (repeated) + lines.append("-- BEGIN QUERIES") + for _rep in range(repeats): + for qid in range(n_queries): + lines.append(_query_sql_for_config(params, qid, k)) + + return "\n".join(lines) + + +# ============================================================================ +# Profiling with macOS `sample` +# ============================================================================ + + +def run_profile(sqlite3_path, db_path, sql_file, sample_output, duration=120): + """Run sqlite3 under macOS `sample` profiler. + + Starts sqlite3 directly with stdin from the SQL file, then immediately + attaches `sample` to its PID with -mayDie (tolerates process exit). + The workload must be long enough for sample to attach and capture useful data. 
+ """ + sql_fd = open(sql_file, "r") + proc = subprocess.Popen( + [sqlite3_path, db_path], + stdin=sql_fd, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + ) + + pid = proc.pid + print(f" sqlite3 PID: {pid}") + + # Attach sample immediately (1ms interval, -mayDie tolerates process exit) + sample_proc = subprocess.Popen( + ["sample", str(pid), str(duration), "1", "-mayDie", "-file", sample_output], + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + ) + + # Wait for sqlite3 to finish + _, stderr = proc.communicate() + sql_fd.close() + rc = proc.returncode + if rc != 0: + print(f" sqlite3 failed (rc={rc}):", file=sys.stderr) + print(f" {stderr.decode().strip()}", file=sys.stderr) + sample_proc.kill() + return False + + # Wait for sample to finish + sample_proc.wait() + return True + + +# ============================================================================ +# Parse `sample` output +# ============================================================================ + +# Tree-drawing characters used by macOS `sample` to represent hierarchy. +# We replace them with spaces so indentation depth reflects tree depth. +_TREE_CHARS_RE = re.compile(r"[+!:|]") + +# After tree chars are replaced with spaces, each call-graph line looks like: +# " 800 rescore_knn (in vec0.dylib) + 3808,3640,... [0x1a,0x2b,...] file.c:123" +# We extract just (indent, count, symbol, module) — everything after "(in ...)" +# is decoration we don't need. +_LEADING_RE = re.compile(r"^(\s+)(\d+)\s+(.+)") + + +def _extract_symbol_and_module(rest): + """Given the text after 'count ', extract (symbol, module). + + Handles patterns like: + 'rescore_knn (in vec0.dylib) + 3808,3640,... [0x...]' + 'pread (in libsystem_kernel.dylib) + 8 [0x...]' + '??? (in ) [0x...]' + 'start (in dyld) + 2840 [0x198650274]' + 'Thread_26759239 DispatchQueue_1: ...' 
+ """ + # Try to find "(in ...)" to split symbol from module + m = re.match(r"^(.+?)\s+\(in\s+(.+?)\)", rest) + if m: + return m.group(1).strip(), m.group(2).strip() + # No module — return whole thing as symbol, strip trailing junk + sym = re.sub(r"\s+\[0x[0-9a-f].*", "", rest).strip() + return sym, "" + + +def _parse_call_graph_lines(text): + """Parse call-graph section into list of (depth, count, symbol, module).""" + entries = [] + for raw_line in text.split("\n"): + # Strip tree-drawing characters, replace with spaces to preserve depth + line = _TREE_CHARS_RE.sub(" ", raw_line) + m = _LEADING_RE.match(line) + if not m: + continue + depth = len(m.group(1)) + count = int(m.group(2)) + rest = m.group(3) + symbol, module = _extract_symbol_and_module(rest) + entries.append((depth, count, symbol, module)) + return entries + + +def parse_sample_output(filepath): + """Parse `sample` call-graph output, compute exclusive (self) samples per function. + + Returns dict of {display_name: self_sample_count}. 
def print_profile(title, self_samples, top_n=20):
    """Print the hottest functions of one profile, by self-sample count.

    Args:
        title: label shown in the section header.
        self_samples: mapping of display name to self (exclusive) samples.
        top_n: maximum number of rows to print.
    """
    grand_total = sum(self_samples.values())
    if grand_total == 0:
        print(f"\n=== {title} (no samples) ===")
        return

    # Highest count first; equal counts keep their original relative order.
    ranked = sorted(
        self_samples.items(), key=lambda item: item[1], reverse=True
    )

    print(f"\n=== {title} (top {top_n}, {grand_total} total self-samples) ===")
    for name, samples in ranked[:top_n]:
        share = 100.0 * samples / grand_total
        print(f" {share:5.1f}% {samples:>6} {name}")
configurations", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "configs", nargs="+", help="config specs (name:type=X,key=val,...)" + ) + parser.add_argument("--subset-size", type=int, required=True) + parser.add_argument("-k", type=int, default=10, help="KNN k (default 10)") + parser.add_argument( + "-n", type=int, default=50, help="number of distinct queries (default 50)" + ) + parser.add_argument( + "--repeats", + type=int, + default=10, + help="repeat query set N times for more samples (default 10)", + ) + parser.add_argument( + "--top", type=int, default=20, help="show top N functions (default 20)" + ) + parser.add_argument("--base-db", default=BASE_DB) + parser.add_argument("--sqlite3", default=SQLITE3_PATH) + parser.add_argument( + "--keep-temp", + action="store_true", + help="keep temp directory with DBs, SQL, and sample output", + ) + args = parser.parse_args() + + # Check prerequisites + if not os.path.exists(args.base_db): + print(f"Error: base DB not found at {args.base_db}", file=sys.stderr) + print("Run 'make seed' in benchmarks-ann/ first.", file=sys.stderr) + sys.exit(1) + + if not shutil.which("sample"): + print("Error: macOS 'sample' tool not found.", file=sys.stderr) + sys.exit(1) + + # Build CLI + print("Building dist/sqlite3...") + result = subprocess.run( + ["make", "cli"], cwd=_PROJECT_ROOT, capture_output=True, text=True + ) + if result.returncode != 0: + print(f"Error: make cli failed:\n{result.stderr}", file=sys.stderr) + sys.exit(1) + print(" done.") + + if not os.path.exists(args.sqlite3): + print(f"Error: sqlite3 not found at {args.sqlite3}", file=sys.stderr) + sys.exit(1) + + configs = [parse_config(c) for c in args.configs] + + tmpdir = tempfile.mkdtemp(prefix="sqlite-vec-profile-") + print(f"Working directory: {tmpdir}") + + all_profiles = [] + + for i, (name, params) in enumerate(configs, 1): + reg = INDEX_REGISTRY[params["index_type"]] + desc = reg["describe"](params) + 
print(f"\n[{i}/{len(configs)}] {name} ({desc})") + + # Generate SQL workload + db_path = os.path.join(tmpdir, f"{name}.db") + sql_text = generate_sql( + db_path, params, args.subset_size, args.n, args.k, args.repeats + ) + sql_file = os.path.join(tmpdir, f"{name}.sql") + with open(sql_file, "w") as f: + f.write(sql_text) + + total_queries = args.n * args.repeats + print( + f" SQL workload: {args.subset_size} inserts + " + f"{total_queries} queries ({args.n} x {args.repeats} repeats)" + ) + + # Profile + sample_file = os.path.join(tmpdir, f"{name}.sample.txt") + print(f" Profiling...") + ok = run_profile(args.sqlite3, db_path, sql_file, sample_file) + if not ok: + print(f" FAILED — skipping {name}") + all_profiles.append((name, desc, {})) + continue + + if not os.path.exists(sample_file): + print(f" Warning: sample output not created") + all_profiles.append((name, desc, {})) + continue + + # Parse + self_samples = parse_sample_output(sample_file) + all_profiles.append((name, desc, self_samples)) + + # Show individual profile + print_profile(f"{name} ({desc})", self_samples, args.top) + + # Side-by-side comparison if multiple configs + if len(all_profiles) > 1: + print("\n" + "=" * 80) + print("COMPARISON") + print("=" * 80) + + # Collect all symbols that appear in top-N of any config + all_syms = set() + for _name, _desc, prof in all_profiles: + sorted_syms = sorted(prof.items(), key=lambda x: -x[1]) + for sym, _count in sorted_syms[: args.top]: + all_syms.add(sym) + + # Build comparison table + rows = [] + for sym in all_syms: + row = [sym] + for _name, _desc, prof in all_profiles: + total = sum(prof.values()) + count = prof.get(sym, 0) + pct = 100.0 * count / total if total > 0 else 0.0 + row.append((pct, count)) + max_pct = max(r[0] for r in row[1:]) + rows.append((max_pct, row)) + + rows.sort(key=lambda x: -x[0]) + + # Header + header = f"{'function':>40}" + for name, desc, _ in all_profiles: + header += f" {name:>14}" + print(header) + print("-" * len(header)) 
+ + for _sort_key, row in rows[: args.top * 2]: + sym = row[0] + display_sym = sym if len(sym) <= 40 else sym[:37] + "..." + line = f"{display_sym:>40}" + for pct, count in row[1:]: + if count > 0: + line += f" {pct:>13.1f}%" + else: + line += f" {'-':>14}" + print(line) + + if args.keep_temp: + print(f"\nTemp files kept at: {tmpdir}") + else: + shutil.rmtree(tmpdir) + print(f"\nTemp files cleaned up. Use --keep-temp to preserve.") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/schema.sql b/benchmarks-ann/schema.sql new file mode 100644 index 0000000..681df4e --- /dev/null +++ b/benchmarks-ann/schema.sql @@ -0,0 +1,35 @@ +-- Canonical results schema for vec0 KNN benchmark comparisons. +-- The index_type column is a free-form TEXT field. Baseline configs use +-- "baseline"; index-specific branches add their own types (registered +-- via INDEX_REGISTRY in bench.py). + +CREATE TABLE IF NOT EXISTS build_results ( + config_name TEXT NOT NULL, + index_type TEXT NOT NULL, + subset_size INTEGER NOT NULL, + db_path TEXT NOT NULL, + insert_time_s REAL NOT NULL, + train_time_s REAL, -- NULL when no training/build step is needed + total_time_s REAL NOT NULL, + rows INTEGER NOT NULL, + file_size_mb REAL NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + PRIMARY KEY (config_name, subset_size) +); + +CREATE TABLE IF NOT EXISTS bench_results ( + config_name TEXT NOT NULL, + index_type TEXT NOT NULL, + subset_size INTEGER NOT NULL, + k INTEGER NOT NULL, + n INTEGER NOT NULL, + mean_ms REAL NOT NULL, + median_ms REAL NOT NULL, + p99_ms REAL NOT NULL, + total_ms REAL NOT NULL, + qps REAL NOT NULL, + recall REAL NOT NULL, + db_path TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + PRIMARY KEY (config_name, subset_size, k) +); diff --git a/benchmarks-ann/seed/.gitignore b/benchmarks-ann/seed/.gitignore new file mode 100644 index 0000000..8efed50 --- /dev/null +++ b/benchmarks-ann/seed/.gitignore @@ -0,0 +1,2 @@ +*.parquet 
def main():
    """Build ``base.db`` next to this script from the downloaded parquet files.

    Reads ``train.parquet``, ``test.parquet`` and ``neighbors.parquet`` from
    the script's directory and writes three tables:

    * ``query_vectors(id, vector)`` from ``test.parquet``
    * ``neighbors(query_vector_id, rank, neighbors_id)`` from
      ``neighbors.parquet``
    * ``train(id, vector)`` from ``train.parquet``, inserted in batches

    Vectors are stored as little-endian float32 blobs. An existing
    ``base.db`` is replaced. Exits with status 1 if any input file is
    missing.
    """
    seed_dir = os.path.dirname(os.path.abspath(__file__))
    db_path = os.path.join(seed_dir, "base.db")

    train_path = os.path.join(seed_dir, "train.parquet")
    test_path = os.path.join(seed_dir, "test.parquet")
    neighbors_path = os.path.join(seed_dir, "neighbors.parquet")

    for p in (train_path, test_path, neighbors_path):
        if not os.path.exists(p):
            print(f"ERROR: {p} not found. Run 'make download' first.")
            sys.exit(1)

    # Remove the old database together with any WAL sidecar files left by an
    # interrupted run, so they are never paired with the new database.
    for stale in (db_path, db_path + "-wal", db_path + "-shm"):
        if os.path.exists(stale):
            os.remove(stale)

    conn = sqlite3.connect(db_path)
    try:
        # page_size must be set before the database file is first written;
        # it cannot be changed once the database is in WAL mode.
        conn.execute("PRAGMA page_size=4096")
        conn.execute("PRAGMA journal_mode=WAL")

        # --- query_vectors (from test.parquet) ---
        print("Loading test.parquet (query vectors)...")
        t0 = time.perf_counter()
        df_test = pd.read_parquet(test_path)
        conn.execute(
            "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)"
        )
        rows = [
            (int(vec_id), float_list_to_blob(emb))
            for vec_id, emb in zip(df_test["id"], df_test["emb"])
        ]
        conn.executemany(
            "INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows
        )
        conn.commit()
        print(f" {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s")

        # --- neighbors (from neighbors.parquet) ---
        print("Loading neighbors.parquet...")
        t0 = time.perf_counter()
        df_neighbors = pd.read_parquet(neighbors_path)
        conn.execute(
            "CREATE TABLE neighbors ("
            " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
            " UNIQUE(query_vector_id, rank))"
        )
        rows = []
        for qid, nids in zip(df_neighbors["id"], df_neighbors["neighbors_id"]):
            qid = int(qid)
            # neighbors_id may be a numpy array or JSON string
            if isinstance(nids, str):
                nids = json.loads(nids)
            rows.extend(
                (qid, rank, str(int(nid))) for rank, nid in enumerate(nids)
            )
        conn.executemany(
            "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)",
            rows,
        )
        conn.commit()
        print(f" {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s")

        # --- train (from train.parquet) ---
        print("Loading train.parquet (1M vectors, this takes a few minutes)...")
        t0 = time.perf_counter()
        conn.execute(
            "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)"
        )

        batch_size = 10000
        df_train = pd.read_parquet(train_path)
        total = len(df_train)

        # Insert and commit in batches to bound per-transaction size and to
        # report progress.
        for start in range(0, total, batch_size):
            chunk = df_train.iloc[start : start + batch_size]
            rows = [
                (int(vec_id), float_list_to_blob(emb))
                for vec_id, emb in zip(chunk["id"], chunk["emb"])
            ]
            conn.executemany(
                "INSERT INTO train (id, vector) VALUES (?, ?)", rows
            )
            conn.commit()

            done = min(start + batch_size, total)
            elapsed = time.perf_counter() - t0
            rate = done / elapsed if elapsed > 0 else 0
            eta = (total - done) / rate if rate > 0 else 0
            print(
                f" {done:>8}/{total} {elapsed:.0f}s {rate:.0f} rows/s eta {eta:.0f}s",
                flush=True,
            )

        elapsed = time.perf_counter() - t0
        print(f" {total} train vectors in {elapsed:.1f}s")
    finally:
        # Close even on failure so the file handle is released.
        conn.close()

    size_mb = os.path.getsize(db_path) / (1024 * 1024)
    print(f"\nDone: {db_path} ({size_mb:.0f} MB)")
- order by distance - """, - [q.tobytes(), k], - ).fetchall() - assert len(result) == k - times.append(time.time() - t0) - return BenchResult(f"sqlite-vec static", build_time, times) - def bench_faiss(base, query, k) -> BenchResult: import faiss dimensions = base.shape[1] @@ -438,8 +385,6 @@ def suite(name, base, query, k, benchmarks): for b in benchmarks: if b == "faiss": results.append(bench_faiss(base, query, k=k)) - elif b == "vec-static": - results.append(bench_sqlite_vec_static(base, query, k=k)) elif b.startswith("vec-scalar"): _, page_size = b.split('.') results.append(bench_sqlite_vec_scalar(base, query, page_size, k=k)) @@ -541,7 +486,7 @@ def parse_args(): help="Number of queries to use. Defaults all", ) parser.add_argument( - "-x", help="type of runs to make", default="faiss,vec-scalar.4096,vec-static,vec-vec0.4096.16,usearch,duckdb,hnswlib,numpy" + "-x", help="type of runs to make", default="faiss,vec-scalar.4096,vec-vec0.4096.16,usearch,duckdb,hnswlib,numpy" ) args = parser.parse_args() diff --git a/benchmarks/profiling/build-from-npy.sql b/benchmarks/profiling/build-from-npy.sql index 134df70..92ef59c 100644 --- a/benchmarks/profiling/build-from-npy.sql +++ b/benchmarks/profiling/build-from-npy.sql @@ -8,10 +8,3 @@ create virtual table vec_items using vec0( embedding float[1536] ); --- 65s (limit 1e5), ~615MB on disk -insert into vec_items - select - rowid, - vector - from vec_npy_each(vec_npy_file('examples/dbpedia-openai/data/vectors.npy')) - limit 1e5; diff --git a/benchmarks/self-params/build.py b/benchmarks/self-params/build.py index bc6e388..c5d9fc1 100644 --- a/benchmarks/self-params/build.py +++ b/benchmarks/self-params/build.py @@ -6,7 +6,6 @@ def connect(path): db = sqlite3.connect(path) db.enable_load_extension(True) db.load_extension("../dist/vec0") - db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')") db.enable_load_extension(False) return db @@ -18,8 +17,6 @@ page_sizes = [ # 4096, 8192, chunk_sizes = [128, 
256, 1024, 2048] types = ["f32", "int8", "bit"] -SRC = "../examples/dbpedia-openai/data/vectors.npy" - for page_size in page_sizes: for chunk_size in chunk_sizes: for t in types: @@ -42,15 +39,8 @@ for page_size in page_sizes: func = "vec_quantize_i8(vector, 'unit')" if t == "bit": func = "vec_quantize_binary(vector)" - db.execute( - f""" - insert into vec_items - select rowid, {func} - from vec_npy_each(vec_npy_file(?)) - limit 100000 - """, - [SRC], - ) + # TODO: replace with non-npy data loading + pass elapsed = time.time() - t0 print(elapsed) diff --git a/bindings/go/ncruces/go-sqlite3.patch b/bindings/go/ncruces/go-sqlite3.patch index f202bc3..03bead9 100644 --- a/bindings/go/ncruces/go-sqlite3.patch +++ b/bindings/go/ncruces/go-sqlite3.patch @@ -6,7 +6,6 @@ index ed2aaec..4cc0b0e 100755 -Wl,--initial-memory=327680 \ -D_HAVE_SQLITE_CONFIG_H \ -DSQLITE_CUSTOM_INCLUDE=sqlite_opt.h \ -+ -DSQLITE_VEC_OMIT_FS=1 \ $(awk '{print "-Wl,--export="$0}' exports.txt) "$BINARYEN/wasm-ctor-eval" -g -c _initialize sqlite3.wasm -o sqlite3.tmp diff --git a/bindings/python/extra_init.py b/bindings/python/extra_init.py index 267bc41..4408855 100644 --- a/bindings/python/extra_init.py +++ b/bindings/python/extra_init.py @@ -1,6 +1,5 @@ from typing import List from struct import pack -from sqlite3 import Connection def serialize_float32(vector: List[float]) -> bytes: @@ -13,33 +12,3 @@ def serialize_int8(vector: List[int]) -> bytes: return pack("%sb" % len(vector), *vector) -try: - import numpy.typing as npt - - def register_numpy(db: Connection, name: str, array: npt.NDArray): - """ayoo""" - - ptr = array.__array_interface__["data"][0] - nvectors, dimensions = array.__array_interface__["shape"] - element_type = array.__array_interface__["typestr"] - - assert element_type == " dist/sqlite-vec.c +""" + +import re +import sys +import os + + +def strip_lsp_block(content): + """Remove the LSP-support pattern: + #ifndef SQLITE_VEC_H + #include "sqlite-vec.c" // ... 
def strip_include_guard(content, guard_macro):
    """Remove the include guard pair:

        #ifndef GUARD_MACRO
        #define GUARD_MACRO
        ...content...
        #endif

    The trailing ``#endif`` is removed only when the opening
    ``#ifndef``/``#define`` pair was actually found and stripped. Otherwise
    ``content`` is returned unchanged, so an unrelated ``#endif`` is never
    dropped and the preprocessor block stays balanced.

    Args:
        content: source text of the file being inlined.
        guard_macro: name of the guard macro to strip.

    Returns:
        The text without the guard, ending in a single newline, or the
        original ``content`` if no guard header matched.
    """
    # Strip the #ifndef / #define pair at the top
    header_pattern = re.compile(
        r'^\s*#ifndef\s+' + re.escape(guard_macro) + r'\s*\n'
        r'\s*#define\s+' + re.escape(guard_macro) + r'\s*\n',
        re.MULTILINE,
    )
    content, removed = header_pattern.subn('', content, count=1)
    if not removed:
        return content

    # Strip the trailing #endif (last one in file that closes the guard)
    lines = content.rstrip('\n').split('\n')
    for i in range(len(lines) - 1, -1, -1):
        if re.match(r'^\s*#endif', lines[i]):
            lines.pop(i)
            break

    return '\n'.join(lines) + '\n'
def amalgamate(input_path):
    """Return the text of ``input_path`` with local sources inlined.

    Every line of the form ``#include "sqlite-vec-*.c"`` that starts at
    column 0 is replaced by the result of ``inline_include``; included files
    are resolved relative to the directory of ``input_path``. All other text
    is returned as read.

    Args:
        input_path: path to the top-level C source file.

    Returns:
        The amalgamated source as a single string.
    """
    source_dir = os.path.dirname(os.path.abspath(input_path))

    def _expand(include_match):
        # Resolve each include relative to the top-level file's directory.
        return inline_include(include_match, source_dir)

    with open(input_path, 'r') as handle:
        text = handle.read()

    # Replace #include "sqlite-vec-*.c" with inlined contents
    return re.sub(
        r'^#include\s+"(sqlite-vec-[^"]+\.c)"\s*$',
        _expand,
        text,
        flags=re.MULTILINE,
    )
- -### `vec_npy_each(vector)` {#vec_npy_each} - -xxx - - -```sql --- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone() -select - rowid, - vector, - vec_type(vector), - vec_to_json(vector) -from vec_npy_each( - X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040' -) -/* -┌───────┬─────────────┬──────────────────┬─────────────────────┐ -│ rowid │ vector │ vec_type(vector) │ vec_to_json(vector) │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 0 │ X'0000803F' │ 'float32' │ '[1.000000]' │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 1 │ X'00000040' │ 'float32' │ '[2.000000]' │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 2 │ X'00004040' │ 'float32' │ '[3.000000]' │ -└───────┴─────────────┴──────────────────┴─────────────────────┘ - -*/ - - --- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone() -select - rowid, - vector, - vec_type(vector), - vec_to_json(vector) -from vec_npy_each( - X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040' -) -/* -┌───────┬─────────────┬──────────────────┬─────────────────────┐ -│ rowid │ vector │ vec_type(vector) │ vec_to_json(vector) │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 0 │ X'0000803F' │ 'float32' │ '[1.000000]' │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 1 │ X'00000040' │ 'float32' │ '[2.000000]' │ 
-├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 2 │ X'00004040' │ 'float32' │ '[3.000000]' │ -└───────┴─────────────┴──────────────────┴─────────────────────┘ - -*/ - - - ``` ## Meta {#meta} diff --git a/site/compiling.md b/site/compiling.md index 9ce3c83..b3b2e33 100644 --- a/site/compiling.md +++ b/site/compiling.md @@ -59,5 +59,4 @@ The current compile-time flags are: - `SQLITE_VEC_ENABLE_AVX`, enables AVX CPU instructions for some vector search operations - `SQLITE_VEC_ENABLE_NEON`, enables NEON CPU instructions for some vector search operations -- `SQLITE_VEC_OMIT_FS`, removes some obsure SQL functions and features that use the filesystem, meant for some WASM builds where there's no available filesystem - `SQLITE_VEC_STATIC`, meant for statically linking `sqlite-vec` diff --git a/sqlite-vec.c b/sqlite-vec.c index de3176f..cc4471b 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -11,7 +11,7 @@ #include #include -#ifndef SQLITE_VEC_OMIT_FS +#ifdef SQLITE_VEC_DEBUG #include #endif @@ -224,6 +224,63 @@ static f32 l2_sqr_float_neon(const void *pVect1v, const void *pVect2v, return sqrt(sum_scalar); } +static f32 cosine_float_neon(const void *pVect1v, const void *pVect2v, + const void *qty_ptr) { + f32 *pVect1 = (f32 *)pVect1v; + f32 *pVect2 = (f32 *)pVect2v; + size_t qty = *((size_t *)qty_ptr); + size_t qty16 = qty >> 4; + const f32 *pEnd1 = pVect1 + (qty16 << 4); + + float32x4_t dot0 = vdupq_n_f32(0), dot1 = vdupq_n_f32(0); + float32x4_t dot2 = vdupq_n_f32(0), dot3 = vdupq_n_f32(0); + float32x4_t amag0 = vdupq_n_f32(0), amag1 = vdupq_n_f32(0); + float32x4_t amag2 = vdupq_n_f32(0), amag3 = vdupq_n_f32(0); + float32x4_t bmag0 = vdupq_n_f32(0), bmag1 = vdupq_n_f32(0); + float32x4_t bmag2 = vdupq_n_f32(0), bmag3 = vdupq_n_f32(0); + + while (pVect1 < pEnd1) { + float32x4_t v1, v2; + v1 = vld1q_f32(pVect1); pVect1 += 4; + v2 = vld1q_f32(pVect2); pVect2 += 4; + dot0 = vfmaq_f32(dot0, v1, v2); + amag0 = vfmaq_f32(amag0, v1, v1); + bmag0 = 
vfmaq_f32(bmag0, v2, v2); + + v1 = vld1q_f32(pVect1); pVect1 += 4; + v2 = vld1q_f32(pVect2); pVect2 += 4; + dot1 = vfmaq_f32(dot1, v1, v2); + amag1 = vfmaq_f32(amag1, v1, v1); + bmag1 = vfmaq_f32(bmag1, v2, v2); + + v1 = vld1q_f32(pVect1); pVect1 += 4; + v2 = vld1q_f32(pVect2); pVect2 += 4; + dot2 = vfmaq_f32(dot2, v1, v2); + amag2 = vfmaq_f32(amag2, v1, v1); + bmag2 = vfmaq_f32(bmag2, v2, v2); + + v1 = vld1q_f32(pVect1); pVect1 += 4; + v2 = vld1q_f32(pVect2); pVect2 += 4; + dot3 = vfmaq_f32(dot3, v1, v2); + amag3 = vfmaq_f32(amag3, v1, v1); + bmag3 = vfmaq_f32(bmag3, v2, v2); + } + + f32 dot_s = vaddvq_f32(vaddq_f32(vaddq_f32(dot0, dot1), vaddq_f32(dot2, dot3))); + f32 amag_s = vaddvq_f32(vaddq_f32(vaddq_f32(amag0, amag1), vaddq_f32(amag2, amag3))); + f32 bmag_s = vaddvq_f32(vaddq_f32(vaddq_f32(bmag0, bmag1), vaddq_f32(bmag2, bmag3))); + + const f32 *pEnd2 = pVect1 + (qty - (qty16 << 4)); + while (pVect1 < pEnd2) { + dot_s += *pVect1 * *pVect2; + amag_s += *pVect1 * *pVect1; + bmag_s += *pVect2 * *pVect2; + pVect1++; pVect2++; + } + + return 1.0f - (dot_s / (sqrtf(amag_s) * sqrtf(bmag_s))); +} + static f32 l2_sqr_int8_neon(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { i8 *pVect1 = (i8 *)pVect1v; @@ -462,6 +519,11 @@ static double distance_l1_f32(const void *a, const void *b, const void *d) { static f32 distance_cosine_float(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { +#ifdef SQLITE_VEC_ENABLE_NEON + if ((*(const size_t *)qty_ptr) > 16) { + return cosine_float_neon(pVect1v, pVect2v, qty_ptr); + } +#endif f32 *pVect1 = (f32 *)pVect1v; f32 *pVect2 = (f32 *)pVect2v; size_t qty = *((size_t *)qty_ptr); @@ -478,8 +540,7 @@ static f32 distance_cosine_float(const void *pVect1v, const void *pVect2v, } return 1 - (dot / (sqrt(aMag) * sqrt(bMag))); } -static f32 distance_cosine_int8(const void *pA, const void *pB, - const void *pD) { +static f32 cosine_int8(const void *pA, const void *pB, const void *pD) { i8 *a = (i8 *)pA; i8 *b = (i8 
*)pB; size_t d = *((size_t *)pD); @@ -497,6 +558,125 @@ static f32 distance_cosine_int8(const void *pA, const void *pB, return 1 - (dot / (sqrt(aMag) * sqrt(bMag))); } +#ifdef SQLITE_VEC_ENABLE_NEON +static f32 cosine_int8_neon(const void *pA, const void *pB, const void *pD) { + const i8 *a = (const i8 *)pA; + const i8 *b = (const i8 *)pB; + size_t d = *((const size_t *)pD); + const i8 *aEnd = a + d; + + int32x4_t dot_acc1 = vdupq_n_s32(0); + int32x4_t dot_acc2 = vdupq_n_s32(0); + int32x4_t aMag_acc1 = vdupq_n_s32(0); + int32x4_t aMag_acc2 = vdupq_n_s32(0); + int32x4_t bMag_acc1 = vdupq_n_s32(0); + int32x4_t bMag_acc2 = vdupq_n_s32(0); + + while (a < aEnd - 31) { + int8x16_t va1 = vld1q_s8(a); + int8x16_t vb1 = vld1q_s8(b); + int16x8_t a1_lo = vmovl_s8(vget_low_s8(va1)); + int16x8_t a1_hi = vmovl_s8(vget_high_s8(va1)); + int16x8_t b1_lo = vmovl_s8(vget_low_s8(vb1)); + int16x8_t b1_hi = vmovl_s8(vget_high_s8(vb1)); + + dot_acc1 = vmlal_s16(dot_acc1, vget_low_s16(a1_lo), vget_low_s16(b1_lo)); + dot_acc1 = vmlal_s16(dot_acc1, vget_high_s16(a1_lo), vget_high_s16(b1_lo)); + dot_acc2 = vmlal_s16(dot_acc2, vget_low_s16(a1_hi), vget_low_s16(b1_hi)); + dot_acc2 = vmlal_s16(dot_acc2, vget_high_s16(a1_hi), vget_high_s16(b1_hi)); + + aMag_acc1 = vmlal_s16(aMag_acc1, vget_low_s16(a1_lo), vget_low_s16(a1_lo)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_high_s16(a1_lo), vget_high_s16(a1_lo)); + aMag_acc2 = vmlal_s16(aMag_acc2, vget_low_s16(a1_hi), vget_low_s16(a1_hi)); + aMag_acc2 = vmlal_s16(aMag_acc2, vget_high_s16(a1_hi), vget_high_s16(a1_hi)); + + bMag_acc1 = vmlal_s16(bMag_acc1, vget_low_s16(b1_lo), vget_low_s16(b1_lo)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_high_s16(b1_lo), vget_high_s16(b1_lo)); + bMag_acc2 = vmlal_s16(bMag_acc2, vget_low_s16(b1_hi), vget_low_s16(b1_hi)); + bMag_acc2 = vmlal_s16(bMag_acc2, vget_high_s16(b1_hi), vget_high_s16(b1_hi)); + + int8x16_t va2 = vld1q_s8(a + 16); + int8x16_t vb2 = vld1q_s8(b + 16); + int16x8_t a2_lo = vmovl_s8(vget_low_s8(va2)); + 
int16x8_t a2_hi = vmovl_s8(vget_high_s8(va2)); + int16x8_t b2_lo = vmovl_s8(vget_low_s8(vb2)); + int16x8_t b2_hi = vmovl_s8(vget_high_s8(vb2)); + + dot_acc1 = vmlal_s16(dot_acc1, vget_low_s16(a2_lo), vget_low_s16(b2_lo)); + dot_acc1 = vmlal_s16(dot_acc1, vget_high_s16(a2_lo), vget_high_s16(b2_lo)); + dot_acc2 = vmlal_s16(dot_acc2, vget_low_s16(a2_hi), vget_low_s16(b2_hi)); + dot_acc2 = vmlal_s16(dot_acc2, vget_high_s16(a2_hi), vget_high_s16(b2_hi)); + + aMag_acc1 = vmlal_s16(aMag_acc1, vget_low_s16(a2_lo), vget_low_s16(a2_lo)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_high_s16(a2_lo), vget_high_s16(a2_lo)); + aMag_acc2 = vmlal_s16(aMag_acc2, vget_low_s16(a2_hi), vget_low_s16(a2_hi)); + aMag_acc2 = vmlal_s16(aMag_acc2, vget_high_s16(a2_hi), vget_high_s16(a2_hi)); + + bMag_acc1 = vmlal_s16(bMag_acc1, vget_low_s16(b2_lo), vget_low_s16(b2_lo)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_high_s16(b2_lo), vget_high_s16(b2_lo)); + bMag_acc2 = vmlal_s16(bMag_acc2, vget_low_s16(b2_hi), vget_low_s16(b2_hi)); + bMag_acc2 = vmlal_s16(bMag_acc2, vget_high_s16(b2_hi), vget_high_s16(b2_hi)); + + a += 32; + b += 32; + } + + while (a < aEnd - 15) { + int8x16_t va = vld1q_s8(a); + int8x16_t vb = vld1q_s8(b); + int16x8_t a_lo = vmovl_s8(vget_low_s8(va)); + int16x8_t a_hi = vmovl_s8(vget_high_s8(va)); + int16x8_t b_lo = vmovl_s8(vget_low_s8(vb)); + int16x8_t b_hi = vmovl_s8(vget_high_s8(vb)); + + dot_acc1 = vmlal_s16(dot_acc1, vget_low_s16(a_lo), vget_low_s16(b_lo)); + dot_acc1 = vmlal_s16(dot_acc1, vget_high_s16(a_lo), vget_high_s16(b_lo)); + dot_acc1 = vmlal_s16(dot_acc1, vget_low_s16(a_hi), vget_low_s16(b_hi)); + dot_acc1 = vmlal_s16(dot_acc1, vget_high_s16(a_hi), vget_high_s16(b_hi)); + + aMag_acc1 = vmlal_s16(aMag_acc1, vget_low_s16(a_lo), vget_low_s16(a_lo)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_high_s16(a_lo), vget_high_s16(a_lo)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_low_s16(a_hi), vget_low_s16(a_hi)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_high_s16(a_hi), 
vget_high_s16(a_hi)); + + bMag_acc1 = vmlal_s16(bMag_acc1, vget_low_s16(b_lo), vget_low_s16(b_lo)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_high_s16(b_lo), vget_high_s16(b_lo)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_low_s16(b_hi), vget_low_s16(b_hi)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_high_s16(b_hi), vget_high_s16(b_hi)); + + a += 16; + b += 16; + } + + int32x4_t dot_sum = vaddq_s32(dot_acc1, dot_acc2); + int32x4_t aMag_sum = vaddq_s32(aMag_acc1, aMag_acc2); + int32x4_t bMag_sum = vaddq_s32(bMag_acc1, bMag_acc2); + + i32 dot = vaddvq_s32(dot_sum); + i32 aMag = vaddvq_s32(aMag_sum); + i32 bMag = vaddvq_s32(bMag_sum); + + while (a < aEnd) { + dot += (i32)*a * (i32)*b; + aMag += (i32)*a * (i32)*a; + bMag += (i32)*b * (i32)*b; + a++; + b++; + } + + return 1.0f - ((f32)dot / (sqrtf((f32)aMag) * sqrtf((f32)bMag))); +} +#endif + +static f32 distance_cosine_int8(const void *a, const void *b, const void *d) { +#ifdef SQLITE_VEC_ENABLE_NEON + if ((*(const size_t *)d) > 15) { + return cosine_int8_neon(a, b, d); + } +#endif + return cosine_int8(a, b, d); +} + // https://github.com/facebookresearch/faiss/blob/77e2e79cd0a680adc343b9840dd865da724c579e/faiss/utils/hamming_distance/common.h#L34 static u8 hamdist_table[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, @@ -511,6 +691,59 @@ static u8 hamdist_table[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; +#ifdef SQLITE_VEC_ENABLE_NEON +static f32 distance_hamming_neon(const u8 *a, const u8 *b, size_t n_bytes) { + const u8 *pEnd = a + n_bytes; + + uint32x4_t acc1 = vdupq_n_u32(0); + uint32x4_t acc2 = vdupq_n_u32(0); + uint32x4_t acc3 = vdupq_n_u32(0); + uint32x4_t acc4 = vdupq_n_u32(0); + + while (a <= pEnd - 64) { + uint8x16_t v1 = vld1q_u8(a); + uint8x16_t v2 = vld1q_u8(b); + acc1 = vaddq_u32(acc1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + + v1 = vld1q_u8(a + 16); + v2 = vld1q_u8(b + 16); + 
acc2 = vaddq_u32(acc2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + + v1 = vld1q_u8(a + 32); + v2 = vld1q_u8(b + 32); + acc3 = vaddq_u32(acc3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + + v1 = vld1q_u8(a + 48); + v2 = vld1q_u8(b + 48); + acc4 = vaddq_u32(acc4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + + a += 64; + b += 64; + } + + while (a <= pEnd - 16) { + uint8x16_t v1 = vld1q_u8(a); + uint8x16_t v2 = vld1q_u8(b); + acc1 = vaddq_u32(acc1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + a += 16; + b += 16; + } + + acc1 = vaddq_u32(acc1, acc2); + acc3 = vaddq_u32(acc3, acc4); + acc1 = vaddq_u32(acc1, acc3); + u32 sum = vaddvq_u32(acc1); + + while (a < pEnd) { + sum += hamdist_table[*a ^ *b]; + a++; + b++; + } + + return (f32)sum; +} +#endif + static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) { int same = 0; for (unsigned long i = 0; i < n; i++) { @@ -555,11 +788,18 @@ static f32 distance_hamming_u64(u64 *a, u64 *b, size_t n) { */ static f32 distance_hamming(const void *a, const void *b, const void *d) { size_t dimensions = *((size_t *)d); + size_t n_bytes = dimensions / CHAR_BIT; + +#ifdef SQLITE_VEC_ENABLE_NEON + if (dimensions >= 128) { + return distance_hamming_neon((const u8 *)a, (const u8 *)b, n_bytes); + } +#endif if ((dimensions % 64) == 0) { - return distance_hamming_u64((u64 *)a, (u64 *)b, dimensions / 8 / CHAR_BIT); + return distance_hamming_u64((u64 *)a, (u64 *)b, n_bytes / sizeof(u64)); } - return distance_hamming_u8((u8 *)a, (u8 *)b, dimensions / CHAR_BIT); + return distance_hamming_u8((u8 *)a, (u8 *)b, n_bytes); } #ifdef SQLITE_VEC_TEST @@ -1065,33 +1305,6 @@ int ensure_vector_match(sqlite3_value *aValue, sqlite3_value *bValue, void **a, int _cmp(const void *a, const void *b) { return (*(i64 *)a - *(i64 *)b); } -struct VecNpyFile { - char *path; - size_t pathLength; -}; -#define SQLITE_VEC_NPY_FILE_NAME "vec0-npy-file" - -#ifndef SQLITE_VEC_OMIT_FS -static void vec_npy_file(sqlite3_context *context, 
int argc, - sqlite3_value **argv) { - assert(argc == 1); - char *path = (char *)sqlite3_value_text(argv[0]); - size_t pathLength = sqlite3_value_bytes(argv[0]); - struct VecNpyFile *f; - - f = sqlite3_malloc(sizeof(*f)); - if (!f) { - sqlite3_result_error_nomem(context); - return; - } - memset(f, 0, sizeof(*f)); - - f->path = path; - f->pathLength = pathLength; - sqlite3_result_pointer(context, f, SQLITE_VEC_NPY_FILE_NAME, sqlite3_free); -} -#endif - #pragma region scalar functions static void vec_f32(sqlite3_context *context, int argc, sqlite3_value **argv) { assert(argc == 1); @@ -2281,12 +2494,53 @@ enum Vec0DistanceMetrics { VEC0_DISTANCE_METRIC_L1 = 3, }; +/** + * Compute distance between two full-precision vectors using the appropriate + * distance function for the given element type and metric. + * Shared utility used by ANN index implementations. + */ +static f32 vec0_distance_full( + const void *a, const void *b, size_t dimensions, + enum VectorElementType elementType, + enum Vec0DistanceMetrics metric) { + switch (elementType) { + case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: + switch (metric) { + case VEC0_DISTANCE_METRIC_L2: + return distance_l2_sqr_float(a, b, &dimensions); + case VEC0_DISTANCE_METRIC_COSINE: + return distance_cosine_float(a, b, &dimensions); + case VEC0_DISTANCE_METRIC_L1: + return (f32)distance_l1_f32(a, b, &dimensions); + } + break; + case SQLITE_VEC_ELEMENT_TYPE_INT8: + switch (metric) { + case VEC0_DISTANCE_METRIC_L2: + return distance_l2_sqr_int8(a, b, &dimensions); + case VEC0_DISTANCE_METRIC_COSINE: + return distance_cosine_int8(a, b, &dimensions); + case VEC0_DISTANCE_METRIC_L1: + return (f32)distance_l1_int8(a, b, &dimensions); + } + break; + case SQLITE_VEC_ELEMENT_TYPE_BIT: + return distance_hamming(a, b, &dimensions); + } + return 0.0f; +} + +enum Vec0IndexType { + VEC0_INDEX_TYPE_FLAT = 1, +}; + struct VectorColumnDefinition { char *name; int name_length; size_t dimensions; enum VectorElementType element_type; enum 
Vec0DistanceMetrics distance_metric; + enum Vec0IndexType index_type; }; struct Vec0PartitionColumnDefinition { @@ -2346,6 +2600,7 @@ int vec0_parse_vector_column(const char *source, int source_length, int nameLength; enum VectorElementType elementType; enum Vec0DistanceMetrics distanceMetric = VEC0_DISTANCE_METRIC_L2; + enum Vec0IndexType indexType = VEC0_INDEX_TYPE_FLAT; int dimensions; vec0_scanner_init(&scanner, source, source_length); @@ -2449,6 +2704,40 @@ int vec0_parse_vector_column(const char *source, int source_length, return SQLITE_ERROR; } } + else if (sqlite3_strnicmp(key, "indexed", keyLength) == 0) { + // expect "by" + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || + token.token_type != TOKEN_TYPE_IDENTIFIER || + sqlite3_strnicmp(token.start, "by", token.end - token.start) != 0) { + return SQLITE_ERROR; + } + // expect index type name + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || + token.token_type != TOKEN_TYPE_IDENTIFIER) { + return SQLITE_ERROR; + } + int indexNameLen = token.end - token.start; + if (sqlite3_strnicmp(token.start, "flat", indexNameLen) == 0) { + indexType = VEC0_INDEX_TYPE_FLAT; + // expect '(' + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || + token.token_type != TOKEN_TYPE_LPAREN) { + return SQLITE_ERROR; + } + // expect ')' + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || + token.token_type != TOKEN_TYPE_RPAREN) { + return SQLITE_ERROR; + } + } else { + // unknown index type + return SQLITE_ERROR; + } + } // unknown key else { return SQLITE_ERROR; @@ -2463,6 +2752,7 @@ int vec0_parse_vector_column(const char *source, int source_length, outColumn->distance_metric = distanceMetric; outColumn->element_type = elementType; outColumn->dimensions = dimensions; + outColumn->index_type = indexType; return SQLITE_OK; } @@ -2660,758 +2950,6 @@ static sqlite3_module vec_eachModule = { #pragma endregion 
-#pragma region vec_npy_each table function - -enum NpyTokenType { - NPY_TOKEN_TYPE_IDENTIFIER, - NPY_TOKEN_TYPE_NUMBER, - NPY_TOKEN_TYPE_LPAREN, - NPY_TOKEN_TYPE_RPAREN, - NPY_TOKEN_TYPE_LBRACE, - NPY_TOKEN_TYPE_RBRACE, - NPY_TOKEN_TYPE_COLON, - NPY_TOKEN_TYPE_COMMA, - NPY_TOKEN_TYPE_STRING, - NPY_TOKEN_TYPE_FALSE, -}; - -struct NpyToken { - enum NpyTokenType token_type; - unsigned char *start; - unsigned char *end; -}; - -int npy_token_next(unsigned char *start, unsigned char *end, - struct NpyToken *out) { - unsigned char *ptr = start; - while (ptr < end) { - unsigned char curr = *ptr; - if (is_whitespace(curr)) { - ptr++; - continue; - } else if (curr == '(') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_LPAREN; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == ')') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_RPAREN; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == '{') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_LBRACE; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == '}') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_RBRACE; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == ':') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_COLON; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == ',') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_COMMA; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == '\'') { - unsigned char *start = ptr; - ptr++; - while (ptr < end) { - if ((*ptr) == '\'') { - break; - } - ptr++; - } - if (ptr >= end || (*ptr) != '\'') { - return VEC0_TOKEN_RESULT_ERROR; - } - out->start = start; - out->end = ++ptr; - out->token_type = NPY_TOKEN_TYPE_STRING; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == 'F' && - strncmp((char *)ptr, "False", strlen("False")) == 0) { - out->start = ptr; - out->end = (ptr + (int)strlen("False")); - ptr = 
out->end; - out->token_type = NPY_TOKEN_TYPE_FALSE; - return VEC0_TOKEN_RESULT_SOME; - } else if (is_digit(curr)) { - unsigned char *start = ptr; - while (ptr < end && (is_digit(*ptr))) { - ptr++; - } - out->start = start; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_NUMBER; - return VEC0_TOKEN_RESULT_SOME; - } else { - return VEC0_TOKEN_RESULT_ERROR; - } - } - return VEC0_TOKEN_RESULT_ERROR; -} - -struct NpyScanner { - unsigned char *start; - unsigned char *end; - unsigned char *ptr; -}; - -void npy_scanner_init(struct NpyScanner *scanner, const unsigned char *source, - int source_length) { - scanner->start = (unsigned char *)source; - scanner->end = (unsigned char *)source + source_length; - scanner->ptr = (unsigned char *)source; -} - -int npy_scanner_next(struct NpyScanner *scanner, struct NpyToken *out) { - int rc = npy_token_next(scanner->start, scanner->end, out); - if (rc == VEC0_TOKEN_RESULT_SOME) { - scanner->start = out->end; - } - return rc; -} - -#define NPY_PARSE_ERROR "Error parsing numpy array: " -int parse_npy_header(sqlite3_vtab *pVTab, const unsigned char *header, - size_t headerLength, - enum VectorElementType *out_element_type, - int *fortran_order, size_t *numElements, - size_t *numDimensions) { - - struct NpyScanner scanner; - struct NpyToken token; - int rc; - npy_scanner_init(&scanner, header, headerLength); - - if (npy_scanner_next(&scanner, &token) != VEC0_TOKEN_RESULT_SOME && - token.token_type != NPY_TOKEN_TYPE_LBRACE) { - vtab_set_error(pVTab, - NPY_PARSE_ERROR "numpy header did not start with '{'"); - return SQLITE_ERROR; - } - while (1) { - rc = npy_scanner_next(&scanner, &token); - if (rc != VEC0_TOKEN_RESULT_SOME) { - vtab_set_error(pVTab, NPY_PARSE_ERROR "expected key in numpy header"); - return SQLITE_ERROR; - } - - if (token.token_type == NPY_TOKEN_TYPE_RBRACE) { - break; - } - if (token.token_type != NPY_TOKEN_TYPE_STRING) { - vtab_set_error(pVTab, NPY_PARSE_ERROR - "expected a string as key in numpy header"); - return 
SQLITE_ERROR; - } - unsigned char *key = token.start; - - rc = npy_scanner_next(&scanner, &token); - if ((rc != VEC0_TOKEN_RESULT_SOME) || - (token.token_type != NPY_TOKEN_TYPE_COLON)) { - vtab_set_error(pVTab, NPY_PARSE_ERROR - "expected a ':' after key in numpy header"); - return SQLITE_ERROR; - } - - if (strncmp((char *)key, "'descr'", strlen("'descr'")) == 0) { - rc = npy_scanner_next(&scanner, &token); - if ((rc != VEC0_TOKEN_RESULT_SOME) || - (token.token_type != NPY_TOKEN_TYPE_STRING)) { - vtab_set_error(pVTab, NPY_PARSE_ERROR - "expected a string value after 'descr' key"); - return SQLITE_ERROR; - } - if (strncmp((char *)token.start, "'maxChunks = 1024; - pCur->chunksBufferSize = - (vector_byte_size(element_type, numDimensions)) * pCur->maxChunks; - pCur->chunksBuffer = sqlite3_malloc(pCur->chunksBufferSize); - if (pCur->chunksBufferSize && !pCur->chunksBuffer) { - return SQLITE_NOMEM; - } - - pCur->currentChunkSize = - fread(pCur->chunksBuffer, vector_byte_size(element_type, numDimensions), - pCur->maxChunks, file); - - pCur->currentChunkIndex = 0; - pCur->elementType = element_type; - pCur->nElements = numElements; - pCur->nDimensions = numDimensions; - pCur->input_type = VEC_NPY_EACH_INPUT_FILE; - - pCur->eof = pCur->currentChunkSize == 0; - pCur->file = file; - return SQLITE_OK; -} -#endif - -int parse_npy_buffer(sqlite3_vtab *pVTab, const unsigned char *buffer, - int bufferLength, void **data, size_t *numElements, - size_t *numDimensions, - enum VectorElementType *element_type) { - - if (bufferLength < 10) { - // IMP: V03312_20150 - vtab_set_error(pVTab, "numpy array too short"); - return SQLITE_ERROR; - } - if (memcmp(NPY_MAGIC, buffer, sizeof(NPY_MAGIC)) != 0) { - // V11954_28792 - vtab_set_error(pVTab, "numpy array does not contain the 'magic' header"); - return SQLITE_ERROR; - } - - u8 major = buffer[6]; - u8 minor = buffer[7]; - uint16_t headerLength = 0; - memcpy(&headerLength, &buffer[8], sizeof(uint16_t)); - - i32 totalHeaderLength = 
sizeof(NPY_MAGIC) + sizeof(major) + sizeof(minor) + - sizeof(headerLength) + headerLength; - i32 dataSize = bufferLength - totalHeaderLength; - - if (dataSize < 0) { - vtab_set_error(pVTab, "numpy array header length is invalid"); - return SQLITE_ERROR; - } - - const unsigned char *header = &buffer[10]; - int fortran_order; - - int rc = parse_npy_header(pVTab, header, headerLength, element_type, - &fortran_order, numElements, numDimensions); - if (rc != SQLITE_OK) { - return rc; - } - - i32 expectedDataSize = - (*numElements * vector_byte_size(*element_type, *numDimensions)); - if (expectedDataSize != dataSize) { - vtab_set_error(pVTab, - "numpy array error: Expected a data size of %d, found %d", - expectedDataSize, dataSize); - return SQLITE_ERROR; - } - - *data = (void *)&buffer[totalHeaderLength]; - return SQLITE_OK; -} - -static int vec_npy_eachConnect(sqlite3 *db, void *pAux, int argc, - const char *const *argv, sqlite3_vtab **ppVtab, - char **pzErr) { - UNUSED_PARAMETER(pAux); - UNUSED_PARAMETER(argc); - UNUSED_PARAMETER(argv); - UNUSED_PARAMETER(pzErr); - vec_npy_each_vtab *pNew; - int rc; - - rc = sqlite3_declare_vtab(db, "CREATE TABLE x(vector, input hidden)"); -#define VEC_NPY_EACH_COLUMN_VECTOR 0 -#define VEC_NPY_EACH_COLUMN_INPUT 1 - if (rc == SQLITE_OK) { - pNew = sqlite3_malloc(sizeof(*pNew)); - *ppVtab = (sqlite3_vtab *)pNew; - if (pNew == 0) - return SQLITE_NOMEM; - memset(pNew, 0, sizeof(*pNew)); - } - return rc; -} - -static int vec_npy_eachDisconnect(sqlite3_vtab *pVtab) { - vec_npy_each_vtab *p = (vec_npy_each_vtab *)pVtab; - sqlite3_free(p); - return SQLITE_OK; -} - -static int vec_npy_eachOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor) { - UNUSED_PARAMETER(p); - vec_npy_each_cursor *pCur; - pCur = sqlite3_malloc(sizeof(*pCur)); - if (pCur == 0) - return SQLITE_NOMEM; - memset(pCur, 0, sizeof(*pCur)); - *ppCursor = &pCur->base; - return SQLITE_OK; -} - -static int vec_npy_eachClose(sqlite3_vtab_cursor *cur) { - vec_npy_each_cursor *pCur = 
(vec_npy_each_cursor *)cur; -#ifndef SQLITE_VEC_OMIT_FS - if (pCur->file) { - fclose(pCur->file); - pCur->file = NULL; - } -#endif - if (pCur->chunksBuffer) { - sqlite3_free(pCur->chunksBuffer); - pCur->chunksBuffer = NULL; - } - if (pCur->vector) { - pCur->vector = NULL; - } - sqlite3_free(pCur); - return SQLITE_OK; -} - -static int vec_npy_eachBestIndex(sqlite3_vtab *pVTab, - sqlite3_index_info *pIdxInfo) { - int hasInput; - for (int i = 0; i < pIdxInfo->nConstraint; i++) { - const struct sqlite3_index_constraint *pCons = &pIdxInfo->aConstraint[i]; - // printf("i=%d iColumn=%d, op=%d, usable=%d\n", i, pCons->iColumn, - // pCons->op, pCons->usable); - switch (pCons->iColumn) { - case VEC_NPY_EACH_COLUMN_INPUT: { - if (pCons->op == SQLITE_INDEX_CONSTRAINT_EQ && pCons->usable) { - hasInput = 1; - pIdxInfo->aConstraintUsage[i].argvIndex = 1; - pIdxInfo->aConstraintUsage[i].omit = 1; - } - break; - } - } - } - if (!hasInput) { - pVTab->zErrMsg = sqlite3_mprintf("input argument is required"); - return SQLITE_ERROR; - } - - pIdxInfo->estimatedCost = (double)100000; - pIdxInfo->estimatedRows = 100000; - - return SQLITE_OK; -} - -static int vec_npy_eachFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum, - const char *idxStr, int argc, - sqlite3_value **argv) { - UNUSED_PARAMETER(idxNum); - UNUSED_PARAMETER(idxStr); - assert(argc == 1); - int rc; - - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)pVtabCursor; - -#ifndef SQLITE_VEC_OMIT_FS - if (pCur->file) { - fclose(pCur->file); - pCur->file = NULL; - } -#endif - if (pCur->chunksBuffer) { - sqlite3_free(pCur->chunksBuffer); - pCur->chunksBuffer = NULL; - } - if (pCur->vector) { - pCur->vector = NULL; - } - -#ifndef SQLITE_VEC_OMIT_FS - struct VecNpyFile *f = NULL; - if ((f = sqlite3_value_pointer(argv[0], SQLITE_VEC_NPY_FILE_NAME))) { - FILE *file = fopen(f->path, "r"); - if (!file) { - vtab_set_error(pVtabCursor->pVtab, "Could not open numpy file"); - return SQLITE_ERROR; - } - - rc = 
parse_npy_file(pVtabCursor->pVtab, file, pCur); - if (rc != SQLITE_OK) { -#ifndef SQLITE_VEC_OMIT_FS - fclose(file); -#endif - return rc; - } - - } else -#endif - { - - const unsigned char *input = sqlite3_value_blob(argv[0]); - int inputLength = sqlite3_value_bytes(argv[0]); - void *data; - size_t numElements; - size_t numDimensions; - enum VectorElementType element_type; - - rc = parse_npy_buffer(pVtabCursor->pVtab, input, inputLength, &data, - &numElements, &numDimensions, &element_type); - if (rc != SQLITE_OK) { - return rc; - } - - pCur->vector = data; - pCur->elementType = element_type; - pCur->nElements = numElements; - pCur->nDimensions = numDimensions; - pCur->input_type = VEC_NPY_EACH_INPUT_BUFFER; - } - - pCur->iRowid = 0; - return SQLITE_OK; -} - -static int vec_npy_eachRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid) { - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; - *pRowid = pCur->iRowid; - return SQLITE_OK; -} - -static int vec_npy_eachEof(sqlite3_vtab_cursor *cur) { - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; - if (pCur->input_type == VEC_NPY_EACH_INPUT_BUFFER) { - return (!pCur->nElements) || (size_t)pCur->iRowid >= pCur->nElements; - } - return pCur->eof; -} - -static int vec_npy_eachNext(sqlite3_vtab_cursor *cur) { - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; - pCur->iRowid++; - if (pCur->input_type == VEC_NPY_EACH_INPUT_BUFFER) { - return SQLITE_OK; - } - -#ifndef SQLITE_VEC_OMIT_FS - // else: input is a file - pCur->currentChunkIndex++; - if (pCur->currentChunkIndex >= pCur->currentChunkSize) { - pCur->currentChunkSize = - fread(pCur->chunksBuffer, - vector_byte_size(pCur->elementType, pCur->nDimensions), - pCur->maxChunks, pCur->file); - if (!pCur->currentChunkSize) { - pCur->eof = 1; - } - pCur->currentChunkIndex = 0; - } -#endif - return SQLITE_OK; -} - -static int vec_npy_eachColumnBuffer(vec_npy_each_cursor *pCur, - sqlite3_context *context, int i) { - switch (i) { - case 
VEC_NPY_EACH_COLUMN_VECTOR: { - sqlite3_result_subtype(context, pCur->elementType); - switch (pCur->elementType) { - case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: { - sqlite3_result_blob( - context, - &((unsigned char *) - pCur->vector)[pCur->iRowid * pCur->nDimensions * sizeof(f32)], - pCur->nDimensions * sizeof(f32), SQLITE_TRANSIENT); - - break; - } - case SQLITE_VEC_ELEMENT_TYPE_INT8: - case SQLITE_VEC_ELEMENT_TYPE_BIT: { - // https://github.com/asg017/sqlite-vec/issues/42 - sqlite3_result_error(context, - "vec_npy_each only supports float32 vectors", -1); - break; - } - } - - break; - } - } - return SQLITE_OK; -} -static int vec_npy_eachColumnFile(vec_npy_each_cursor *pCur, - sqlite3_context *context, int i) { - switch (i) { - case VEC_NPY_EACH_COLUMN_VECTOR: { - switch (pCur->elementType) { - case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: { - sqlite3_result_blob( - context, - &((unsigned char *) - pCur->chunksBuffer)[pCur->currentChunkIndex * - pCur->nDimensions * sizeof(f32)], - pCur->nDimensions * sizeof(f32), SQLITE_TRANSIENT); - break; - } - case SQLITE_VEC_ELEMENT_TYPE_INT8: - case SQLITE_VEC_ELEMENT_TYPE_BIT: { - // https://github.com/asg017/sqlite-vec/issues/42 - sqlite3_result_error(context, - "vec_npy_each only supports float32 vectors", -1); - break; - } - } - break; - } - } - return SQLITE_OK; -} -static int vec_npy_eachColumn(sqlite3_vtab_cursor *cur, - sqlite3_context *context, int i) { - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; - switch (pCur->input_type) { - case VEC_NPY_EACH_INPUT_BUFFER: - return vec_npy_eachColumnBuffer(pCur, context, i); - case VEC_NPY_EACH_INPUT_FILE: - return vec_npy_eachColumnFile(pCur, context, i); - } - return SQLITE_ERROR; -} - -static sqlite3_module vec_npy_eachModule = { - /* iVersion */ 0, - /* xCreate */ 0, - /* xConnect */ vec_npy_eachConnect, - /* xBestIndex */ vec_npy_eachBestIndex, - /* xDisconnect */ vec_npy_eachDisconnect, - /* xDestroy */ 0, - /* xOpen */ vec_npy_eachOpen, - /* xClose */ vec_npy_eachClose, - 
/* xFilter */ vec_npy_eachFilter, - /* xNext */ vec_npy_eachNext, - /* xEof */ vec_npy_eachEof, - /* xColumn */ vec_npy_eachColumn, - /* xRowid */ vec_npy_eachRowid, - /* xUpdate */ 0, - /* xBegin */ 0, - /* xSync */ 0, - /* xCommit */ 0, - /* xRollback */ 0, - /* xFindMethod */ 0, - /* xRename */ 0, - /* xSavepoint */ 0, - /* xRelease */ 0, - /* xRollbackTo */ 0, - /* xShadowName */ 0, -#if SQLITE_VERSION_NUMBER >= 3044000 - /* xIntegrity */ 0, -#endif -}; - -#pragma endregion #pragma region vec0 virtual table @@ -5959,6 +5497,65 @@ int min_idx(const f32 *distances, i32 n, u8 *candidates, i32 *out, i32 k, assert(k > 0); assert(k <= n); +#ifdef SQLITE_VEC_EXPERIMENTAL_MIN_IDX + // Max-heap variant: O(n log k) single-pass. + // out[0..heap_size-1] stores indices; heap ordered by distances descending + // so out[0] is always the index of the LARGEST distance in the top-k. + (void)bTaken; + int heap_size = 0; + + #define HEAP_SIFT_UP(pos) do { \ + int _c = (pos); \ + while (_c > 0) { \ + int _p = (_c - 1) / 2; \ + if (distances[out[_p]] < distances[out[_c]]) { \ + i32 _tmp = out[_p]; out[_p] = out[_c]; out[_c] = _tmp; \ + _c = _p; \ + } else break; \ + } \ + } while(0) + + #define HEAP_SIFT_DOWN(pos, sz) do { \ + int _p = (pos); \ + for (;;) { \ + int _l = 2*_p + 1, _r = 2*_p + 2, _largest = _p; \ + if (_l < (sz) && distances[out[_l]] > distances[out[_largest]]) \ + _largest = _l; \ + if (_r < (sz) && distances[out[_r]] > distances[out[_largest]]) \ + _largest = _r; \ + if (_largest == _p) break; \ + i32 _tmp = out[_p]; out[_p] = out[_largest]; out[_largest] = _tmp; \ + _p = _largest; \ + } \ + } while(0) + + for (int i = 0; i < n; i++) { + if (!bitmap_get(candidates, i)) + continue; + if (heap_size < k) { + out[heap_size] = i; + heap_size++; + HEAP_SIFT_UP(heap_size - 1); + } else if (distances[i] < distances[out[0]]) { + out[0] = i; + HEAP_SIFT_DOWN(0, heap_size); + } + } + + // Heapsort to produce ascending order. 
+ for (int i = heap_size - 1; i > 0; i--) { + i32 tmp = out[0]; out[0] = out[i]; out[i] = tmp; + HEAP_SIFT_DOWN(0, i); + } + + #undef HEAP_SIFT_UP + #undef HEAP_SIFT_DOWN + + *k_used = heap_size; + return SQLITE_OK; + +#else + // Original: O(n*k) repeated linear scan with bitmap. bitmap_clear(bTaken, n); for (int ik = 0; ik < k; ik++) { @@ -5984,6 +5581,7 @@ int min_idx(const f32 *distances, i32 n, u8 *candidates, i32 *out, i32 k, } *k_used = k; return SQLITE_OK; +#endif } int vec0_get_metadata_text_long_value( @@ -9394,652 +8992,6 @@ static sqlite3_module vec0Module = { }; #pragma endregion -static char *POINTER_NAME_STATIC_BLOB_DEF = "vec0-static_blob_def"; -struct static_blob_definition { - void *p; - size_t dimensions; - size_t nvectors; - enum VectorElementType element_type; -}; -static void vec_static_blob_from_raw(sqlite3_context *context, int argc, - sqlite3_value **argv) { - - assert(argc == 4); - struct static_blob_definition *p; - p = sqlite3_malloc(sizeof(*p)); - if (!p) { - sqlite3_result_error_nomem(context); - return; - } - memset(p, 0, sizeof(*p)); - p->p = (void *)sqlite3_value_int64(argv[0]); - p->element_type = SQLITE_VEC_ELEMENT_TYPE_FLOAT32; - p->dimensions = sqlite3_value_int64(argv[2]); - p->nvectors = sqlite3_value_int64(argv[3]); - sqlite3_result_pointer(context, p, POINTER_NAME_STATIC_BLOB_DEF, - sqlite3_free); -} -#pragma region vec_static_blobs() table function - -#define MAX_STATIC_BLOBS 16 - -typedef struct static_blob static_blob; -struct static_blob { - char *name; - void *p; - size_t dimensions; - size_t nvectors; - enum VectorElementType element_type; -}; - -typedef struct vec_static_blob_data vec_static_blob_data; -struct vec_static_blob_data { - static_blob static_blobs[MAX_STATIC_BLOBS]; -}; - -typedef struct vec_static_blobs_vtab vec_static_blobs_vtab; -struct vec_static_blobs_vtab { - sqlite3_vtab base; - vec_static_blob_data *data; -}; - -typedef struct vec_static_blobs_cursor vec_static_blobs_cursor; -struct 
vec_static_blobs_cursor { - sqlite3_vtab_cursor base; - sqlite3_int64 iRowid; -}; - -static int vec_static_blobsConnect(sqlite3 *db, void *pAux, int argc, - const char *const *argv, - sqlite3_vtab **ppVtab, char **pzErr) { - UNUSED_PARAMETER(argc); - UNUSED_PARAMETER(argv); - UNUSED_PARAMETER(pzErr); - - vec_static_blobs_vtab *pNew; -#define VEC_STATIC_BLOBS_NAME 0 -#define VEC_STATIC_BLOBS_DATA 1 -#define VEC_STATIC_BLOBS_DIMENSIONS 2 -#define VEC_STATIC_BLOBS_COUNT 3 - int rc = sqlite3_declare_vtab( - db, "CREATE TABLE x(name, data, dimensions hidden, count hidden)"); - if (rc == SQLITE_OK) { - pNew = sqlite3_malloc(sizeof(*pNew)); - *ppVtab = (sqlite3_vtab *)pNew; - if (pNew == 0) - return SQLITE_NOMEM; - memset(pNew, 0, sizeof(*pNew)); - pNew->data = pAux; - } - return rc; -} - -static int vec_static_blobsDisconnect(sqlite3_vtab *pVtab) { - vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)pVtab; - sqlite3_free(p); - return SQLITE_OK; -} - -static int vec_static_blobsUpdate(sqlite3_vtab *pVTab, int argc, - sqlite3_value **argv, sqlite_int64 *pRowid) { - UNUSED_PARAMETER(pRowid); - vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)pVTab; - // DELETE operation - if (argc == 1 && sqlite3_value_type(argv[0]) != SQLITE_NULL) { - return SQLITE_ERROR; - } - // INSERT operation - else if (argc > 1 && sqlite3_value_type(argv[0]) == SQLITE_NULL) { - const char *key = - (const char *)sqlite3_value_text(argv[2 + VEC_STATIC_BLOBS_NAME]); - int idx = -1; - for (int i = 0; i < MAX_STATIC_BLOBS; i++) { - if (!p->data->static_blobs[i].name) { - p->data->static_blobs[i].name = sqlite3_mprintf("%s", key); - idx = i; - break; - } - } - if (idx < 0) - abort(); - struct static_blob_definition *def = sqlite3_value_pointer( - argv[2 + VEC_STATIC_BLOBS_DATA], POINTER_NAME_STATIC_BLOB_DEF); - p->data->static_blobs[idx].p = def->p; - p->data->static_blobs[idx].dimensions = def->dimensions; - p->data->static_blobs[idx].nvectors = def->nvectors; - 
p->data->static_blobs[idx].element_type = def->element_type; - - return SQLITE_OK; - } - // UPDATE operation - else if (argc > 1 && sqlite3_value_type(argv[0]) != SQLITE_NULL) { - return SQLITE_ERROR; - } - return SQLITE_ERROR; -} - -static int vec_static_blobsOpen(sqlite3_vtab *p, - sqlite3_vtab_cursor **ppCursor) { - UNUSED_PARAMETER(p); - vec_static_blobs_cursor *pCur; - pCur = sqlite3_malloc(sizeof(*pCur)); - if (pCur == 0) - return SQLITE_NOMEM; - memset(pCur, 0, sizeof(*pCur)); - *ppCursor = &pCur->base; - return SQLITE_OK; -} - -static int vec_static_blobsClose(sqlite3_vtab_cursor *cur) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - sqlite3_free(pCur); - return SQLITE_OK; -} - -static int vec_static_blobsBestIndex(sqlite3_vtab *pVTab, - sqlite3_index_info *pIdxInfo) { - UNUSED_PARAMETER(pVTab); - pIdxInfo->idxNum = 1; - pIdxInfo->estimatedCost = (double)10; - pIdxInfo->estimatedRows = 10; - return SQLITE_OK; -} - -static int vec_static_blobsNext(sqlite3_vtab_cursor *cur); -static int vec_static_blobsFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum, - const char *idxStr, int argc, - sqlite3_value **argv) { - UNUSED_PARAMETER(idxNum); - UNUSED_PARAMETER(idxStr); - UNUSED_PARAMETER(argc); - UNUSED_PARAMETER(argv); - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)pVtabCursor; - pCur->iRowid = -1; - vec_static_blobsNext(pVtabCursor); - return SQLITE_OK; -} - -static int vec_static_blobsRowid(sqlite3_vtab_cursor *cur, - sqlite_int64 *pRowid) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - *pRowid = pCur->iRowid; - return SQLITE_OK; -} - -static int vec_static_blobsNext(sqlite3_vtab_cursor *cur) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)pCur->base.pVtab; - pCur->iRowid++; - while (pCur->iRowid < MAX_STATIC_BLOBS) { - if (p->data->static_blobs[pCur->iRowid].name) { - return SQLITE_OK; - } - pCur->iRowid++; - } - return 
SQLITE_OK; -} - -static int vec_static_blobsEof(sqlite3_vtab_cursor *cur) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - return pCur->iRowid >= MAX_STATIC_BLOBS; -} - -static int vec_static_blobsColumn(sqlite3_vtab_cursor *cur, - sqlite3_context *context, int i) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)cur->pVtab; - switch (i) { - case VEC_STATIC_BLOBS_NAME: - sqlite3_result_text(context, p->data->static_blobs[pCur->iRowid].name, -1, - SQLITE_TRANSIENT); - break; - case VEC_STATIC_BLOBS_DATA: - sqlite3_result_null(context); - break; - case VEC_STATIC_BLOBS_DIMENSIONS: - sqlite3_result_int64(context, - p->data->static_blobs[pCur->iRowid].dimensions); - break; - case VEC_STATIC_BLOBS_COUNT: - sqlite3_result_int64(context, p->data->static_blobs[pCur->iRowid].nvectors); - break; - } - return SQLITE_OK; -} - -static sqlite3_module vec_static_blobsModule = { - /* iVersion */ 3, - /* xCreate */ 0, - /* xConnect */ vec_static_blobsConnect, - /* xBestIndex */ vec_static_blobsBestIndex, - /* xDisconnect */ vec_static_blobsDisconnect, - /* xDestroy */ 0, - /* xOpen */ vec_static_blobsOpen, - /* xClose */ vec_static_blobsClose, - /* xFilter */ vec_static_blobsFilter, - /* xNext */ vec_static_blobsNext, - /* xEof */ vec_static_blobsEof, - /* xColumn */ vec_static_blobsColumn, - /* xRowid */ vec_static_blobsRowid, - /* xUpdate */ vec_static_blobsUpdate, - /* xBegin */ 0, - /* xSync */ 0, - /* xCommit */ 0, - /* xRollback */ 0, - /* xFindMethod */ 0, - /* xRename */ 0, - /* xSavepoint */ 0, - /* xRelease */ 0, - /* xRollbackTo */ 0, - /* xShadowName */ 0, -#if SQLITE_VERSION_NUMBER >= 3044000 - /* xIntegrity */ 0 -#endif -}; -#pragma endregion - -#pragma region vec_static_blob_entries() table function - -typedef struct vec_static_blob_entries_vtab vec_static_blob_entries_vtab; -struct vec_static_blob_entries_vtab { - sqlite3_vtab base; - static_blob *blob; -}; -typedef enum { - 
VEC_SBE__QUERYPLAN_FULLSCAN = 1, - VEC_SBE__QUERYPLAN_KNN = 2 -} vec_sbe_query_plan; - -struct sbe_query_knn_data { - i64 k; - i64 k_used; - // Array of rowids of size k. Must be freed with sqlite3_free(). - i32 *rowids; - // Array of distances of size k. Must be freed with sqlite3_free(). - f32 *distances; - i64 current_idx; -}; -void sbe_query_knn_data_clear(struct sbe_query_knn_data *knn_data) { - if (!knn_data) - return; - - if (knn_data->rowids) { - sqlite3_free(knn_data->rowids); - knn_data->rowids = NULL; - } - if (knn_data->distances) { - sqlite3_free(knn_data->distances); - knn_data->distances = NULL; - } -} - -typedef struct vec_static_blob_entries_cursor vec_static_blob_entries_cursor; -struct vec_static_blob_entries_cursor { - sqlite3_vtab_cursor base; - sqlite3_int64 iRowid; - vec_sbe_query_plan query_plan; - struct sbe_query_knn_data *knn_data; -}; - -static int vec_static_blob_entriesConnect(sqlite3 *db, void *pAux, int argc, - const char *const *argv, - sqlite3_vtab **ppVtab, char **pzErr) { - UNUSED_PARAMETER(argc); - UNUSED_PARAMETER(argv); - UNUSED_PARAMETER(pzErr); - vec_static_blob_data *blob_data = pAux; - int idx = -1; - for (int i = 0; i < MAX_STATIC_BLOBS; i++) { - if (!blob_data->static_blobs[i].name) - continue; - if (strncmp(blob_data->static_blobs[i].name, argv[3], - strlen(blob_data->static_blobs[i].name)) == 0) { - idx = i; - break; - } - } - if (idx < 0) - abort(); - vec_static_blob_entries_vtab *pNew; -#define VEC_STATIC_BLOB_ENTRIES_VECTOR 0 -#define VEC_STATIC_BLOB_ENTRIES_DISTANCE 1 -#define VEC_STATIC_BLOB_ENTRIES_K 2 - int rc = sqlite3_declare_vtab( - db, "CREATE TABLE x(vector, distance hidden, k hidden)"); - if (rc == SQLITE_OK) { - pNew = sqlite3_malloc(sizeof(*pNew)); - *ppVtab = (sqlite3_vtab *)pNew; - if (pNew == 0) - return SQLITE_NOMEM; - memset(pNew, 0, sizeof(*pNew)); - pNew->blob = &blob_data->static_blobs[idx]; - } - return rc; -} - -static int vec_static_blob_entriesCreate(sqlite3 *db, void *pAux, int argc, - const 
char *const *argv, - sqlite3_vtab **ppVtab, char **pzErr) { - return vec_static_blob_entriesConnect(db, pAux, argc, argv, ppVtab, pzErr); -} - -static int vec_static_blob_entriesDisconnect(sqlite3_vtab *pVtab) { - vec_static_blob_entries_vtab *p = (vec_static_blob_entries_vtab *)pVtab; - sqlite3_free(p); - return SQLITE_OK; -} - -static int vec_static_blob_entriesOpen(sqlite3_vtab *p, - sqlite3_vtab_cursor **ppCursor) { - UNUSED_PARAMETER(p); - vec_static_blob_entries_cursor *pCur; - pCur = sqlite3_malloc(sizeof(*pCur)); - if (pCur == 0) - return SQLITE_NOMEM; - memset(pCur, 0, sizeof(*pCur)); - *ppCursor = &pCur->base; - return SQLITE_OK; -} - -static int vec_static_blob_entriesClose(sqlite3_vtab_cursor *cur) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - sqlite3_free(pCur->knn_data); - sqlite3_free(pCur); - return SQLITE_OK; -} - -static int vec_static_blob_entriesBestIndex(sqlite3_vtab *pVTab, - sqlite3_index_info *pIdxInfo) { - vec_static_blob_entries_vtab *p = (vec_static_blob_entries_vtab *)pVTab; - int iMatchTerm = -1; - int iLimitTerm = -1; - // int iRowidTerm = -1; // https://github.com/asg017/sqlite-vec/issues/47 - int iKTerm = -1; - - for (int i = 0; i < pIdxInfo->nConstraint; i++) { - if (!pIdxInfo->aConstraint[i].usable) - continue; - - int iColumn = pIdxInfo->aConstraint[i].iColumn; - int op = pIdxInfo->aConstraint[i].op; - if (op == SQLITE_INDEX_CONSTRAINT_MATCH && - iColumn == VEC_STATIC_BLOB_ENTRIES_VECTOR) { - if (iMatchTerm > -1) { - // https://github.com/asg017/sqlite-vec/issues/51 - return SQLITE_ERROR; - } - iMatchTerm = i; - } - if (op == SQLITE_INDEX_CONSTRAINT_LIMIT) { - iLimitTerm = i; - } - if (op == SQLITE_INDEX_CONSTRAINT_EQ && - iColumn == VEC_STATIC_BLOB_ENTRIES_K) { - iKTerm = i; - } - } - if (iMatchTerm >= 0) { - if (iLimitTerm < 0 && iKTerm < 0) { - // https://github.com/asg017/sqlite-vec/issues/51 - return SQLITE_ERROR; - } - if (iLimitTerm >= 0 && iKTerm >= 0) { - return SQLITE_ERROR; // limit 
or k, not both - } - if (pIdxInfo->nOrderBy < 1) { - vtab_set_error(pVTab, "ORDER BY distance required"); - return SQLITE_CONSTRAINT; - } - if (pIdxInfo->nOrderBy > 1) { - // https://github.com/asg017/sqlite-vec/issues/51 - vtab_set_error(pVTab, "more than 1 ORDER BY clause provided"); - return SQLITE_CONSTRAINT; - } - if (pIdxInfo->aOrderBy[0].iColumn != VEC_STATIC_BLOB_ENTRIES_DISTANCE) { - vtab_set_error(pVTab, "ORDER BY must be on the distance column"); - return SQLITE_CONSTRAINT; - } - if (pIdxInfo->aOrderBy[0].desc) { - vtab_set_error(pVTab, - "Only ascending in ORDER BY distance clause is supported, " - "DESC is not supported yet."); - return SQLITE_CONSTRAINT; - } - - pIdxInfo->idxNum = VEC_SBE__QUERYPLAN_KNN; - pIdxInfo->estimatedCost = (double)10; - pIdxInfo->estimatedRows = 10; - - pIdxInfo->orderByConsumed = 1; - pIdxInfo->aConstraintUsage[iMatchTerm].argvIndex = 1; - pIdxInfo->aConstraintUsage[iMatchTerm].omit = 1; - if (iLimitTerm >= 0) { - pIdxInfo->aConstraintUsage[iLimitTerm].argvIndex = 2; - pIdxInfo->aConstraintUsage[iLimitTerm].omit = 1; - } else { - pIdxInfo->aConstraintUsage[iKTerm].argvIndex = 2; - pIdxInfo->aConstraintUsage[iKTerm].omit = 1; - } - - } else { - pIdxInfo->idxNum = VEC_SBE__QUERYPLAN_FULLSCAN; - pIdxInfo->estimatedCost = (double)p->blob->nvectors; - pIdxInfo->estimatedRows = p->blob->nvectors; - } - return SQLITE_OK; -} - -static int vec_static_blob_entriesFilter(sqlite3_vtab_cursor *pVtabCursor, - int idxNum, const char *idxStr, - int argc, sqlite3_value **argv) { - UNUSED_PARAMETER(idxStr); - assert(argc >= 0 && argc <= 3); - vec_static_blob_entries_cursor *pCur = - (vec_static_blob_entries_cursor *)pVtabCursor; - vec_static_blob_entries_vtab *p = - (vec_static_blob_entries_vtab *)pCur->base.pVtab; - - if (idxNum == VEC_SBE__QUERYPLAN_KNN) { - assert(argc == 2); - pCur->query_plan = VEC_SBE__QUERYPLAN_KNN; - struct sbe_query_knn_data *knn_data; - knn_data = sqlite3_malloc(sizeof(*knn_data)); - if (!knn_data) { - return 
SQLITE_NOMEM; - } - memset(knn_data, 0, sizeof(*knn_data)); - - void *queryVector; - size_t dimensions; - enum VectorElementType elementType; - vector_cleanup cleanup; - char *err; - int rc = vector_from_value(argv[0], &queryVector, &dimensions, &elementType, - &cleanup, &err); - if (rc != SQLITE_OK) { - return SQLITE_ERROR; - } - if (elementType != p->blob->element_type) { - return SQLITE_ERROR; - } - if (dimensions != p->blob->dimensions) { - return SQLITE_ERROR; - } - - i64 k = min(sqlite3_value_int64(argv[1]), (i64)p->blob->nvectors); - if (k < 0) { - // HANDLE https://github.com/asg017/sqlite-vec/issues/55 - return SQLITE_ERROR; - } - if (k == 0) { - knn_data->k = 0; - pCur->knn_data = knn_data; - return SQLITE_OK; - } - - size_t bsize = (p->blob->nvectors + 7) & ~7; - - i32 *topk_rowids = sqlite3_malloc(k * sizeof(i32)); - if (!topk_rowids) { - // HANDLE https://github.com/asg017/sqlite-vec/issues/55 - return SQLITE_ERROR; - } - f32 *distances = sqlite3_malloc(bsize * sizeof(f32)); - if (!distances) { - // HANDLE https://github.com/asg017/sqlite-vec/issues/55 - return SQLITE_ERROR; - } - - for (size_t i = 0; i < p->blob->nvectors; i++) { - // https://github.com/asg017/sqlite-vec/issues/52 - float *v = ((float *)p->blob->p) + (i * p->blob->dimensions); - distances[i] = - distance_l2_sqr_float(v, (float *)queryVector, &p->blob->dimensions); - } - u8 *candidates = bitmap_new(bsize); - assert(candidates); - - u8 *taken = bitmap_new(bsize); - assert(taken); - - bitmap_fill(candidates, bsize); - for (size_t i = bsize; i >= p->blob->nvectors; i--) { - bitmap_set(candidates, i, 0); - } - i32 k_used = 0; - min_idx(distances, bsize, candidates, topk_rowids, k, taken, &k_used); - knn_data->current_idx = 0; - knn_data->distances = distances; - knn_data->k = k; - knn_data->rowids = topk_rowids; - - pCur->knn_data = knn_data; - } else { - pCur->query_plan = VEC_SBE__QUERYPLAN_FULLSCAN; - pCur->iRowid = 0; - } - - return SQLITE_OK; -} - -static int 
vec_static_blob_entriesRowid(sqlite3_vtab_cursor *cur, - sqlite_int64 *pRowid) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - switch (pCur->query_plan) { - case VEC_SBE__QUERYPLAN_FULLSCAN: { - *pRowid = pCur->iRowid; - return SQLITE_OK; - } - case VEC_SBE__QUERYPLAN_KNN: { - i32 rowid = ((i32 *)pCur->knn_data->rowids)[pCur->knn_data->current_idx]; - *pRowid = (sqlite3_int64)rowid; - return SQLITE_OK; - } - } - return SQLITE_ERROR; -} - -static int vec_static_blob_entriesNext(sqlite3_vtab_cursor *cur) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - switch (pCur->query_plan) { - case VEC_SBE__QUERYPLAN_FULLSCAN: { - pCur->iRowid++; - return SQLITE_OK; - } - case VEC_SBE__QUERYPLAN_KNN: { - pCur->knn_data->current_idx++; - return SQLITE_OK; - } - } - return SQLITE_ERROR; -} - -static int vec_static_blob_entriesEof(sqlite3_vtab_cursor *cur) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - vec_static_blob_entries_vtab *p = - (vec_static_blob_entries_vtab *)pCur->base.pVtab; - switch (pCur->query_plan) { - case VEC_SBE__QUERYPLAN_FULLSCAN: { - return (size_t)pCur->iRowid >= p->blob->nvectors; - } - case VEC_SBE__QUERYPLAN_KNN: { - return pCur->knn_data->current_idx >= pCur->knn_data->k; - } - } - return SQLITE_ERROR; -} - -static int vec_static_blob_entriesColumn(sqlite3_vtab_cursor *cur, - sqlite3_context *context, int i) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - vec_static_blob_entries_vtab *p = (vec_static_blob_entries_vtab *)cur->pVtab; - - switch (pCur->query_plan) { - case VEC_SBE__QUERYPLAN_FULLSCAN: { - switch (i) { - case VEC_STATIC_BLOB_ENTRIES_VECTOR: - - sqlite3_result_blob( - context, - ((unsigned char *)p->blob->p) + - (pCur->iRowid * p->blob->dimensions * sizeof(float)), - p->blob->dimensions * sizeof(float), SQLITE_TRANSIENT); - sqlite3_result_subtype(context, p->blob->element_type); - break; - } - return 
SQLITE_OK; - } - case VEC_SBE__QUERYPLAN_KNN: { - switch (i) { - case VEC_STATIC_BLOB_ENTRIES_VECTOR: { - i32 rowid = ((i32 *)pCur->knn_data->rowids)[pCur->knn_data->current_idx]; - sqlite3_result_blob(context, - ((unsigned char *)p->blob->p) + - (rowid * p->blob->dimensions * sizeof(float)), - p->blob->dimensions * sizeof(float), - SQLITE_TRANSIENT); - sqlite3_result_subtype(context, p->blob->element_type); - break; - } - } - return SQLITE_OK; - } - } - return SQLITE_ERROR; -} - -static sqlite3_module vec_static_blob_entriesModule = { - /* iVersion */ 3, - /* xCreate */ - vec_static_blob_entriesCreate, // handle rm? - // https://github.com/asg017/sqlite-vec/issues/55 - /* xConnect */ vec_static_blob_entriesConnect, - /* xBestIndex */ vec_static_blob_entriesBestIndex, - /* xDisconnect */ vec_static_blob_entriesDisconnect, - /* xDestroy */ vec_static_blob_entriesDisconnect, - /* xOpen */ vec_static_blob_entriesOpen, - /* xClose */ vec_static_blob_entriesClose, - /* xFilter */ vec_static_blob_entriesFilter, - /* xNext */ vec_static_blob_entriesNext, - /* xEof */ vec_static_blob_entriesEof, - /* xColumn */ vec_static_blob_entriesColumn, - /* xRowid */ vec_static_blob_entriesRowid, - /* xUpdate */ 0, - /* xBegin */ 0, - /* xSync */ 0, - /* xCommit */ 0, - /* xRollback */ 0, - /* xFindMethod */ 0, - /* xRename */ 0, - /* xSavepoint */ 0, - /* xRelease */ 0, - /* xRollbackTo */ 0, - /* xShadowName */ 0, -#if SQLITE_VERSION_NUMBER >= 3044000 - /* xIntegrity */ 0 -#endif -}; -#pragma endregion #ifdef SQLITE_VEC_ENABLE_AVX #define SQLITE_VEC_DEBUG_BUILD_AVX "avx" @@ -10145,55 +9097,4 @@ SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg, return SQLITE_OK; } -#ifndef SQLITE_VEC_OMIT_FS -SQLITE_VEC_API int sqlite3_vec_numpy_init(sqlite3 *db, char **pzErrMsg, - const sqlite3_api_routines *pApi) { - UNUSED_PARAMETER(pzErrMsg); -#ifndef SQLITE_CORE - SQLITE_EXTENSION_INIT2(pApi); -#endif - int rc = SQLITE_OK; - rc = sqlite3_create_function_v2(db, "vec_npy_file", 1, 
SQLITE_RESULT_SUBTYPE, - NULL, vec_npy_file, NULL, NULL, NULL); - if(rc != SQLITE_OK) { - return rc; - } - rc = sqlite3_create_module_v2(db, "vec_npy_each", &vec_npy_eachModule, NULL, NULL); - return rc; -} -#endif -SQLITE_VEC_API int -sqlite3_vec_static_blobs_init(sqlite3 *db, char **pzErrMsg, - const sqlite3_api_routines *pApi) { - UNUSED_PARAMETER(pzErrMsg); -#ifndef SQLITE_CORE - SQLITE_EXTENSION_INIT2(pApi); -#endif - - int rc = SQLITE_OK; - vec_static_blob_data *static_blob_data; - static_blob_data = sqlite3_malloc(sizeof(*static_blob_data)); - if (!static_blob_data) { - return SQLITE_NOMEM; - } - memset(static_blob_data, 0, sizeof(*static_blob_data)); - - rc = sqlite3_create_function_v2( - db, "vec_static_blob_from_raw", 4, - DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, NULL, - vec_static_blob_from_raw, NULL, NULL, NULL); - if (rc != SQLITE_OK) - return rc; - - rc = sqlite3_create_module_v2(db, "vec_static_blobs", &vec_static_blobsModule, - static_blob_data, sqlite3_free); - if (rc != SQLITE_OK) - return rc; - rc = sqlite3_create_module_v2(db, "vec_static_blob_entries", - &vec_static_blob_entriesModule, - static_blob_data, NULL); - if (rc != SQLITE_OK) - return rc; - return rc; -} diff --git a/tests/correctness/test-correctness.py b/tests/correctness/test-correctness.py index cb01f8f..9ed0319 100644 --- a/tests/correctness/test-correctness.py +++ b/tests/correctness/test-correctness.py @@ -48,7 +48,6 @@ import json db = sqlite3.connect(":memory:") db.enable_load_extension(True) db.load_extension("../../dist/vec0") -db.execute("select load_extension('../../dist/vec0', 'sqlite3_vec_fs_read_init')") db.enable_load_extension(False) results = db.execute( @@ -75,17 +74,21 @@ print(b) db.execute('PRAGMA page_size=16384') -print("Loading into sqlite-vec vec0 table...") -t0 = time.time() -db.execute("create virtual table v using vec0(a float[3072], chunk_size=16)") -db.execute('insert into v select rowid, vector from 
vec_npy_each(vec_npy_file("dbpedia_openai_3_large_00.npy"))') -print(time.time() - t0) - print("loading numpy array...") t0 = time.time() base = np.load('dbpedia_openai_3_large_00.npy') print(time.time() - t0) +print("Loading into sqlite-vec vec0 table...") +t0 = time.time() +db.execute("create virtual table v using vec0(a float[3072], chunk_size=16)") +with db: + db.executemany( + "insert into v(rowid, a) values (?, ?)", + [(i, row.tobytes()) for i, row in enumerate(base)], + ) +print(time.time() - t0) + np.random.seed(1) queries = base[np.random.choice(base.shape[0], 20, replace=False), :] diff --git a/tests/fuzz/numpy.c b/tests/fuzz/numpy.c deleted file mode 100644 index 9e2900b..0000000 --- a/tests/fuzz/numpy.c +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include - -#include -#include -#include -#include "sqlite-vec.h" -#include "sqlite3.h" -#include - -extern int sqlite3_vec_numpy_init(sqlite3 *db, char **pzErrMsg, - const sqlite3_api_routines *pApi); - -int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - int rc = SQLITE_OK; - sqlite3 *db; - sqlite3_stmt *stmt; - - rc = sqlite3_open(":memory:", &db); - assert(rc == SQLITE_OK); - rc = sqlite3_vec_init(db, NULL, NULL); - assert(rc == SQLITE_OK); - rc = sqlite3_vec_numpy_init(db, NULL, NULL); - assert(rc == SQLITE_OK); - - rc = sqlite3_prepare_v2(db, "select * from vec_npy_each(?)", -1, &stmt, NULL); - assert(rc == SQLITE_OK); - sqlite3_bind_blob(stmt, 1, data, size, SQLITE_STATIC); - rc = sqlite3_step(stmt); - while (rc == SQLITE_ROW) { - rc = sqlite3_step(stmt); - } - - sqlite3_finalize(stmt); - sqlite3_close(db); - return 0; -} diff --git a/tests/sqlite-vec-internal.h b/tests/sqlite-vec-internal.h index a540849..a02c72a 100644 --- a/tests/sqlite-vec-internal.h +++ b/tests/sqlite-vec-internal.h @@ -3,6 +3,7 @@ #include #include +#include int min_idx( const float *distances, @@ -62,12 +63,17 @@ enum Vec0DistanceMetrics { VEC0_DISTANCE_METRIC_L1 = 3, }; +enum Vec0IndexType { + VEC0_INDEX_TYPE_FLAT = 1, 
+}; + struct VectorColumnDefinition { char *name; int name_length; size_t dimensions; enum VectorElementType element_type; enum Vec0DistanceMetrics distance_metric; + enum Vec0IndexType index_type; }; int vec0_parse_vector_column(const char *source, int source_length, diff --git a/tests/test-loadable.py b/tests/test-loadable.py index bc4eed1..40c6a5e 100644 --- a/tests/test-loadable.py +++ b/tests/test-loadable.py @@ -119,151 +119,9 @@ FUNCTIONS = [ MODULES = [ "vec0", "vec_each", - # "vec_static_blob_entries", - # "vec_static_blobs", ] -def register_numpy(db, name: str, array): - ptr = array.__array_interface__["data"][0] - nvectors, dimensions = array.__array_interface__["shape"] - element_type = array.__array_interface__["typestr"] - - assert element_type == "\x9a\x99\x99>", - }, - { - "vector": b"fff?\xcd\xccL?", - }, - ] - assert execute_all(db, "select rowid, (vector) from z") == [ - { - "rowid": 0, - "vector": b"\xcd\xcc\xcc=\xcd\xcc\xcc=\xcd\xcc\xcc=\xcd\xcc\xcc=", - }, - { - "rowid": 1, - "vector": b"\xcd\xccL>\xcd\xccL>\xcd\xccL>\xcd\xccL>", - }, - { - "rowid": 2, - "vector": b"\x9a\x99\x99>\x9a\x99\x99>\x9a\x99\x99>\x9a\x99\x99>", - }, - { - "rowid": 3, - "vector": b"\xcd\xcc\xcc>\xcd\xcc\xcc>\xcd\xcc\xcc>\xcd\xcc\xcc>", - }, - { - "rowid": 4, - "vector": b"\x00\x00\x00?\x00\x00\x00?\x00\x00\x00?\x00\x00\x00?", - }, - ] - assert execute_all( - db, - "select rowid, vec_to_json(vector) as v from z where vector match ? and k = 3 order by distance;", - [np.array([0.3, 0.3, 0.3, 0.3], dtype=np.float32)], - ) == [ - { - "rowid": 2, - "v": "[0.300000,0.300000,0.300000,0.300000]", - }, - { - "rowid": 3, - "v": "[0.400000,0.400000,0.400000,0.400000]", - }, - { - "rowid": 1, - "v": "[0.200000,0.200000,0.200000,0.200000]", - }, - ] - assert execute_all( - db, - "select rowid, vec_to_json(vector) as v from z where vector match ? 
and k = 3 order by distance;", - [np.array([0.6, 0.6, 0.6, 0.6], dtype=np.float32)], - ) == [ - { - "rowid": 4, - "v": "[0.500000,0.500000,0.500000,0.500000]", - }, - { - "rowid": 3, - "v": "[0.400000,0.400000,0.400000,0.400000]", - }, - { - "rowid": 2, - "v": "[0.300000,0.300000,0.300000,0.300000]", - }, - ] - - def test_limits(): db = connect(EXT_PATH) with _raises( @@ -1618,231 +1476,6 @@ def test_vec_each(): vec_each_f32(None) -import io - - -def to_npy(arr): - buf = io.BytesIO() - np.save(buf, arr) - buf.seek(0) - return buf.read() - - -def test_vec_npy_each(): - db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init") - vec_npy_each = lambda *args: execute_all( - db, "select rowid, * from vec_npy_each(?)", args - ) - assert vec_npy_each(to_npy(np.array([1.1, 2.2, 3.3], dtype=np.float32))) == [ - { - "rowid": 0, - "vector": _f32([1.1, 2.2, 3.3]), - }, - ] - assert vec_npy_each(to_npy(np.array([[1.1, 2.2, 3.3]], dtype=np.float32))) == [ - { - "rowid": 0, - "vector": _f32([1.1, 2.2, 3.3]), - }, - ] - assert vec_npy_each( - to_npy(np.array([[1.1, 2.2, 3.3], [9.9, 8.8, 7.7]], dtype=np.float32)) - ) == [ - { - "rowid": 0, - "vector": _f32([1.1, 2.2, 3.3]), - }, - { - "rowid": 1, - "vector": _f32([9.9, 8.8, 7.7]), - }, - ] - - assert vec_npy_each(to_npy(np.array([], dtype=np.float32))) == [] - - -def test_vec_npy_each_errors(): - db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init") - vec_npy_each = lambda *args: execute_all( - db, "select rowid, * from vec_npy_each(?)", args - ) - - full = b"\x93NUMPY\x01\x00v\x00{'descr': ' 8 bits per byte * 64 bytes = 512 + for (int i = 0; i < 128; i += 2) { + a[i] = 0xFF; + } + d = _test_distance_hamming(a, b, 1024); + assert(d == 512.0f); + } + printf(" All distance_hamming tests passed.\n"); } diff --git a/tmp-static.py b/tmp-static.py deleted file mode 100644 index a3b5f37..0000000 --- a/tmp-static.py +++ /dev/null @@ -1,56 +0,0 @@ -import sqlite3 -import numpy as np - -db = sqlite3.connect(":memory:") - 
-db.enable_load_extension(True) -db.load_extension("./dist/vec0") -db.execute("select load_extension('./dist/vec0', 'sqlite3_vec_raw_init')") -db.enable_load_extension(False) - -x = np.array([[0.1, 0.2, 0.3, 0.4], [0.9, 0.8, 0.7, 0.6]], dtype=np.float32) -y = np.array([[0.2, 0.3], [0.9, 0.8], [0.6, 0.5]], dtype=np.float32) -z = np.array( - [ - [0.1, 0.1, 0.1, 0.1], - [0.2, 0.2, 0.2, 0.2], - [0.3, 0.3, 0.3, 0.3], - [0.4, 0.4, 0.4, 0.4], - [0.5, 0.5, 0.5, 0.5], - ], - dtype=np.float32, -) - - -def register_np(array, name): - ptr = array.__array_interface__["data"][0] - nvectors, dimensions = array.__array_interface__["shape"] - element_type = array.__array_interface__["typestr"] - - assert element_type == " Date: Tue, 31 Mar 2026 01:06:38 -0700 Subject: [PATCH 06/38] Fix Android cross-compilation failing with unsupported '-mavx' flag The Linux AVX auto-detection checked the host's /proc/cpuinfo, which passes on x86 CI runners even when cross-compiling for Android ARM targets. Skip AVX detection when CC contains 'android'. Co-Authored-By: Claude Opus 4.6 (1M context) --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 051590e..31064d2 100644 --- a/Makefile +++ b/Makefile @@ -43,10 +43,12 @@ ifndef OMIT_SIMD CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON endif ifeq ($(shell uname -s),Linux) + ifeq ($(findstring android,$(CC)),) ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),) CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX endif endif + endif endif ifdef USE_BREW_SQLITE From 3358e127f6f1f0458ada24de2b987706387b227a Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Sun, 29 Mar 2026 19:46:23 -0700 Subject: [PATCH 07/38] Add IVF index for vec0 virtual table Add inverted file (IVF) index type: partitions vectors into clusters via k-means, quantizes to int8, and scans only the nearest nprobe partitions at query time. 
Includes shadow table management, insert/delete, KNN integration, compile flag (SQLITE_VEC_ENABLE_IVF), fuzz targets, and tests. Removes superseded ivf-benchmarks/ directory. --- IVF_PLAN.md | 264 ++++++ Makefile | 15 + benchmarks-ann/Makefile | 30 +- benchmarks-ann/bench.py | 42 + sqlite-vec-ivf-kmeans.c | 214 +++++ sqlite-vec-ivf.c | 1445 +++++++++++++++++++++++++++++++ sqlite-vec.c | 231 ++++- tests/fuzz/Makefile | 29 +- tests/fuzz/ivf-cell-overflow.c | 192 ++++ tests/fuzz/ivf-create.c | 36 + tests/fuzz/ivf-create.dict | 16 + tests/fuzz/ivf-kmeans.c | 180 ++++ tests/fuzz/ivf-knn-deep.c | 199 +++++ tests/fuzz/ivf-operations.c | 121 +++ tests/fuzz/ivf-quantize.c | 129 +++ tests/fuzz/ivf-rescore.c | 182 ++++ tests/fuzz/ivf-shadow-corrupt.c | 228 +++++ tests/sqlite-vec-internal.h | 37 + tests/test-ivf-mutations.py | 575 ++++++++++++ tests/test-ivf-quantization.py | 255 ++++++ tests/test-ivf.py | 426 +++++++++ tests/test-unit.c | 419 +++++++++ 22 files changed, 5237 insertions(+), 28 deletions(-) create mode 100644 IVF_PLAN.md create mode 100644 sqlite-vec-ivf-kmeans.c create mode 100644 sqlite-vec-ivf.c create mode 100644 tests/fuzz/ivf-cell-overflow.c create mode 100644 tests/fuzz/ivf-create.c create mode 100644 tests/fuzz/ivf-create.dict create mode 100644 tests/fuzz/ivf-kmeans.c create mode 100644 tests/fuzz/ivf-knn-deep.c create mode 100644 tests/fuzz/ivf-operations.c create mode 100644 tests/fuzz/ivf-quantize.c create mode 100644 tests/fuzz/ivf-rescore.c create mode 100644 tests/fuzz/ivf-shadow-corrupt.c create mode 100644 tests/test-ivf-mutations.py create mode 100644 tests/test-ivf-quantization.py create mode 100644 tests/test-ivf.py diff --git a/IVF_PLAN.md b/IVF_PLAN.md new file mode 100644 index 0000000..91bb85a --- /dev/null +++ b/IVF_PLAN.md @@ -0,0 +1,264 @@ +# IVF Index for sqlite-vec + +## Overview + +IVF (Inverted File Index) is an approximate nearest neighbor index for +sqlite-vec's `vec0` virtual table. 
It partitions vectors into clusters via +k-means, then at query time only scans the nearest clusters instead of all +vectors. Combined with scalar or binary quantization, this gives 5-20x query +speedups over brute-force with tunable recall. + +## SQL API + +### Table Creation + +```sql +CREATE VIRTUAL TABLE vec_items USING vec0( + id INTEGER PRIMARY KEY, + embedding float[768] distance_metric=cosine + INDEXED BY ivf(nlist=128, nprobe=16) +); + +-- With quantization (4x smaller cells, rescore for recall) +CREATE VIRTUAL TABLE vec_items USING vec0( + id INTEGER PRIMARY KEY, + embedding float[768] distance_metric=cosine + INDEXED BY ivf(nlist=128, nprobe=16, quantizer=int8, oversample=4) +); +``` + +### Parameters + +| Parameter | Values | Default | Description | +|-----------|--------|---------|-------------| +| `nlist` | 1-65536, or 0 | 128 | Number of k-means clusters. Rule of thumb: `sqrt(N)` | +| `nprobe` | 1-nlist | 10 | Clusters to search at query time. More = better recall, slower | +| `quantizer` | `none`, `int8`, `binary` | `none` | How vectors are stored in cells | +| `oversample` | >= 1 | 1 | Re-rank `oversample * k` candidates with full-precision distance | + +### Inserting Vectors + +```sql +-- Works immediately, even before training +INSERT INTO vec_items(id, embedding) VALUES (1, :vector); +``` + +Before centroids exist, vectors go to an "unassigned" partition and queries do +brute-force. After training, new inserts are assigned to the nearest centroid. + +### Training (Computing Centroids) + +```sql +-- Run built-in k-means on all vectors +INSERT INTO vec_items(id) VALUES ('compute-centroids'); +``` + +This loads all vectors into memory, runs k-means++ with Lloyd's algorithm, +creates quantized centroids, and redistributes all vectors into cluster cells. +It's a blocking operation — run it once after bulk insert. 
+ +### Manual Centroid Import + +```sql +-- Import externally-computed centroids +INSERT INTO vec_items(id, embedding) VALUES ('set-centroid:0', :centroid_0); +INSERT INTO vec_items(id, embedding) VALUES ('set-centroid:1', :centroid_1); + +-- Assign vectors to imported centroids +INSERT INTO vec_items(id) VALUES ('assign-vectors'); +``` + +### Runtime Parameter Tuning + +```sql +-- Change nprobe without rebuilding the index +INSERT INTO vec_items(id) VALUES ('nprobe=32'); +``` + +### KNN Queries + +```sql +-- Same syntax as standard vec0 +SELECT id, distance +FROM vec_items +WHERE embedding MATCH :query AND k = 10; +``` + +### Other Commands + +```sql +-- Remove centroids, move all vectors back to unassigned +INSERT INTO vec_items(id) VALUES ('clear-centroids'); +``` + +## How It Works + +### Architecture + +``` +User vector (float32) + → quantize to int8/binary (if quantizer != none) + → find nearest centroid (quantized distance) + → store quantized vector in cell blob + → store full vector in KV table (if quantizer != none) + → query: + 1. quantize query vector + 2. find top nprobe centroids by quantized distance + 3. scan cell blobs: quantized distance (fast, small I/O) + 4. if oversample > 1: re-score top N*k with full vectors + 5. return top k +``` + +### Shadow Tables + +For a table `vec_items` with vector column index 0: + +| Table | Schema | Purpose | +|-------|--------|---------| +| `vec_items_ivf_centroids00` | `centroid_id PK, centroid BLOB` | K-means centroids (quantized) | +| `vec_items_ivf_cells00` | `centroid_id, n_vectors, validity BLOB, rowids BLOB, vectors BLOB` | Packed vector cells, 64 vectors max per row. Multiple rows per centroid. Index on centroid_id. 
| +| `vec_items_ivf_rowid_map00` | `rowid PK, cell_id, slot` | Maps vector rowid → cell location for O(1) delete | +| `vec_items_ivf_vectors00` | `rowid PK, vector BLOB` | Full-precision vectors (only when quantizer != none) | + +### Cell Storage + +Cells use packed blob storage identical to vec0's chunk layout: +- **validity**: bitmap (1 bit per slot) marking live vectors +- **rowids**: packed i64 array +- **vectors**: packed array of quantized vectors + +Cells are capped at 64 vectors (~200KB at 768-dim float32, ~48KB for int8, +~6KB for binary). When a cell fills, a new row is created for the same +centroid. This avoids SQLite overflow page traversal which was a 110x +performance bottleneck with unbounded cells. + +### Quantization + +**int8**: Each float32 dimension clamped to [-1,1] and scaled to int8 +[-127,127]. 4x storage reduction. Distance computed via int8 L2. + +**binary**: Sign-bit quantization — each bit is 1 if the float is positive. +32x storage reduction. Distance computed via hamming distance. + +**Oversample re-ranking**: When `oversample > 1`, the quantized scan collects +`oversample * k` candidates, then looks up each candidate's full-precision +vector from the KV table and re-computes exact distance. This recovers nearly +all recall lost from quantization. At oversample=4 with int8, recall matches +non-quantized IVF exactly. + +### K-Means + +Uses Lloyd's algorithm with k-means++ initialization: +1. K-means++ picks initial centroids weighted by distance +2. Lloyd's iterations: assign vectors to nearest centroid, recompute centroids as cluster means +3. Empty cluster handling: reassign to farthest point +4. K-means runs in float32; centroids are quantized before storage + +Training data: recommend 16× nlist vectors. At nlist=1000, that's 16k +vectors — k-means takes ~140s on 768-dim data. 
+ +## Performance + +### 100k vectors (COHERE 768-dim cosine) + +``` + name qry(ms) recall +─────────────────────────────────────────────── + ivf(q=int8,os=4),p=8 5.3ms 0.934 ← 6x faster than flat + ivf(q=int8,os=4),p=16 5.4ms 0.968 + ivf(q=none),p=8 5.3ms 0.934 + ivf(q=binary,os=10),p=16 1.3ms 0.832 ← 26x faster than flat + ivf(q=int8,os=4),p=32 7.4ms 0.990 + ivf(q=none),p=32 15.5ms 0.992 + int8(os=4) 18.7ms 0.996 + bit(os=8) 18.7ms 0.884 + flat 33.7ms 1.000 +``` + +### 1M vectors (COHERE 768-dim cosine) + +``` + name insert train MB qry(ms) recall +────────────────────────────────────────────────────────────────────── + ivf(q=int8,os=4),p=8 163s 142s 4725 16.3ms 0.892 + ivf(q=binary,os=10),p=16 118s 144s 4073 17.7ms 0.830 + ivf(q=int8,os=4),p=16 163s 142s 4725 24.3ms 0.950 + ivf(q=int8,os=4),p=32 163s 142s 4725 41.6ms 0.980 + ivf(q=none),p=8 497s 144s 3101 52.1ms 0.890 + ivf(q=none),p=16 497s 144s 3101 56.6ms 0.950 + bit(os=8) 18s - 3048 83.5ms 0.918 + ivf(q=none),p=32 497s 144s 3101 103.9ms 0.980 + int8(os=4) 19s - 3689 169.1ms 0.994 + flat 20s - 2955 338.0ms 1.000 +``` + +**Best config at 1M: `ivf(quantizer=int8, oversample=4, nprobe=16)`** — +24ms query, 0.95 recall, 14x faster than flat, 7x faster than int8 baseline. + +### Scaling Characteristics + +| Metric | 100k | 1M | Scaling | +|--------|------|-----|---------| +| Flat query | 34ms | 338ms | 10x (linear) | +| IVF int8 p=16 | 5.4ms | 24.3ms | 4.5x (sublinear) | +| IVF insert rate | ~10k/s | ~6k/s | Slight degradation | +| Training (nlist=1000) | 13s | 142s | ~11x | + +## Implementation + +### File Structure + +``` +sqlite-vec-ivf-kmeans.c K-means++ algorithm (pure C, no SQLite deps) +sqlite-vec-ivf.c All IVF logic: parser, shadow tables, insert, + delete, query, centroid commands, quantization +sqlite-vec.c ~50 lines of additions: struct fields, #includes, + dispatch hooks in parse/create/insert/delete/filter +``` + +Both IVF files are `#include`d into `sqlite-vec.c`. No Makefile changes needed. 
+ +### Key Design Decisions + +1. **Fixed-size cells (64 vectors)** instead of one blob per centroid. Avoids + SQLite overflow page traversal which caused 110x insert slowdown. + +2. **Multiple cell rows per centroid** with an index on centroid_id. When a + cell fills, a new row is created. Query scans all rows for probed centroids + via `WHERE centroid_id IN (...)`. + +3. **Always store full vectors** when quantizer != none (in `_ivf_vectors` KV + table). Enables oversample re-ranking and point queries returning original + precision. + +4. **K-means in float32, quantize after**. Simpler than quantized k-means, + and assignment accuracy doesn't suffer much since nprobe compensates. + +5. **NEON SIMD for cosine distance**. Added `cosine_float_neon()` with 4-wide + FMA for dot product + magnitudes. Benefits all vec0 queries, not just IVF. + +6. **Runtime nprobe tuning**. `INSERT INTO t(id) VALUES ('nprobe=N')` changes + the probe count without rebuilding — enables fast parameter sweeps. + +### Optimization History + +| Optimization | Impact | +|-------------|--------| +| Fixed-size cells (64 max) | 110x insert speedup | +| Skip chunk writes for IVF | 2x DB size reduction | +| NEON cosine distance | 2x query speedup + 13% recall improvement (correct metric) | +| Cached prepared statements | Eliminated per-insert prepare/finalize | +| Batched cell reads (IN clause) | Fewer SQLite queries per KNN | +| int8 quantization | 2.5x query speedup at same recall | +| Binary quantization | 32x less cell I/O | +| Oversample re-ranking | Recovers quantization recall loss | + +## Remaining Work + +See `ivf-benchmarks/TODO.md` for the full list. 
Key items: + +- **Cache centroids in memory** — each insert re-reads all centroids from SQLite +- **Runtime oversample** — same pattern as nprobe runtime command +- **SIMD k-means** — training uses scalar distance, could be 4x faster +- **Top-k heap** — replace qsort with min-heap for large nprobe +- **IVF-PQ** — product quantization for better compression/recall tradeoff diff --git a/Makefile b/Makefile index b50751b..2758ee5 100644 --- a/Makefile +++ b/Makefile @@ -206,6 +206,21 @@ test-loadable-watch: test-unit: $(CC) -DSQLITE_CORE -DSQLITE_VEC_TEST -DSQLITE_VEC_ENABLE_RESCORE tests/test-unit.c sqlite-vec.c vendor/sqlite3.c -I./ -Ivendor -o $(prefix)/test-unit && $(prefix)/test-unit +# Standalone sqlite3 CLI with vec0 compiled in. Useful for benchmarking, +# profiling (has debug symbols), and scripting without .load_extension. +# make cli +# dist/sqlite3 :memory: "SELECT vec_version()" +# dist/sqlite3 < script.sql +cli: sqlite-vec.h $(prefix) + $(CC) -O2 -g \ + -DSQLITE_CORE \ + -DSQLITE_EXTRA_INIT=core_init \ + -DSQLITE_THREADSAFE=0 \ + -Ivendor/ -I./ \ + $(CFLAGS) \ + vendor/sqlite3.c vendor/shell.c sqlite-vec.c examples/sqlite3-cli/core_init.c \ + -ldl -lm -o $(prefix)/sqlite3 + fuzz-build: $(MAKE) -C tests/fuzz all diff --git a/benchmarks-ann/Makefile b/benchmarks-ann/Makefile index 0789d38..6081457 100644 --- a/benchmarks-ann/Makefile +++ b/benchmarks-ann/Makefile @@ -8,27 +8,20 @@ BASELINES = \ "brute-int8:type=baseline,variant=int8" \ "brute-bit:type=baseline,variant=bit" -# --- Index-specific configs --- -# Each index branch should add its own configs here. 
Example: -# -# DISKANN_CONFIGS = \ -# "diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \ -# "diskann-R72-int8:type=diskann,R=72,L=128,quantizer=int8" -# -# IVF_CONFIGS = \ -# "ivf-n128-p16:type=ivf,nlist=128,nprobe=16" -# -# ANNOY_CONFIGS = \ -# "annoy-t50:type=annoy,n_trees=50" +# --- IVF configs --- +IVF_CONFIGS = \ + "ivf-n32-p8:type=ivf,nlist=32,nprobe=8" \ + "ivf-n128-p16:type=ivf,nlist=128,nprobe=16" \ + "ivf-n512-p32:type=ivf,nlist=512,nprobe=32" RESCORE_CONFIGS = \ "rescore-bit-os8:type=rescore,quantizer=bit,oversample=8" \ "rescore-bit-os16:type=rescore,quantizer=bit,oversample=16" \ "rescore-int8-os8:type=rescore,quantizer=int8,oversample=8" -ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS) +ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS) $(IVF_CONFIGS) -.PHONY: seed ground-truth bench-smoke bench-rescore bench-10k bench-50k bench-100k bench-all \ +.PHONY: seed ground-truth bench-smoke bench-rescore bench-ivf bench-10k bench-50k bench-100k bench-all \ report clean # --- Data preparation --- @@ -43,7 +36,8 @@ ground-truth: seed # --- Quick smoke test --- bench-smoke: seed $(BENCH) --subset-size 5000 -k 10 -n 20 -o runs/smoke \ - $(BASELINES) + "brute-float:type=baseline,variant=float" \ + "ivf-quick:type=ivf,nlist=16,nprobe=4" bench-rescore: seed $(BENCH) --subset-size 10000 -k 10 -o runs/rescore \ @@ -62,6 +56,12 @@ bench-100k: seed bench-all: bench-10k bench-50k bench-100k +# --- IVF across sizes --- +bench-ivf: seed + $(BENCH) --subset-size 10000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS) + $(BENCH) --subset-size 50000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS) + $(BENCH) --subset-size 100000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS) + # --- Report --- report: @echo "Use: sqlite3 runs//results.db 'SELECT * FROM bench_results ORDER BY recall DESC'" diff --git a/benchmarks-ann/bench.py b/benchmarks-ann/bench.py index c1179d6..c640628 100644 --- a/benchmarks-ann/bench.py +++ b/benchmarks-ann/bench.py @@ -173,6 +173,48 @@ 
INDEX_REGISTRY["rescore"] = { } +# ============================================================================ +# IVF implementation +# ============================================================================ + + +def _ivf_create_table_sql(params): + return ( + f"CREATE VIRTUAL TABLE vec_items USING vec0(" + f" id integer primary key," + f" embedding float[768] distance_metric=cosine" + f" indexed by ivf(" + f" nlist={params['nlist']}," + f" nprobe={params['nprobe']}" + f" )" + f")" + ) + + +def _ivf_post_insert_hook(conn, params): + print(" Training k-means centroids...", flush=True) + t0 = time.perf_counter() + conn.execute("INSERT INTO vec_items(id) VALUES ('compute-centroids')") + conn.commit() + elapsed = time.perf_counter() - t0 + print(f" Training done in {elapsed:.1f}s", flush=True) + return elapsed + + +def _ivf_describe(params): + return f"ivf nlist={params['nlist']:<4} nprobe={params['nprobe']}" + + +INDEX_REGISTRY["ivf"] = { + "defaults": {"nlist": 128, "nprobe": 16}, + "create_table_sql": _ivf_create_table_sql, + "insert_sql": None, + "post_insert_hook": _ivf_post_insert_hook, + "run_query": None, + "describe": _ivf_describe, +} + + # ============================================================================ # Config parsing # ============================================================================ diff --git a/sqlite-vec-ivf-kmeans.c b/sqlite-vec-ivf-kmeans.c new file mode 100644 index 0000000..0faa803 --- /dev/null +++ b/sqlite-vec-ivf-kmeans.c @@ -0,0 +1,214 @@ +/** + * sqlite-vec-ivf-kmeans.c — Pure k-means clustering algorithm. + * + * No SQLite dependency. Operates on float arrays in memory. + * #include'd into sqlite-vec.c after struct definitions. + */ + +#ifndef SQLITE_VEC_IVF_KMEANS_C +#define SQLITE_VEC_IVF_KMEANS_C + +// When opened standalone in an editor, pull in types so the LSP is happy. +// When #include'd from sqlite-vec.c, SQLITE_VEC_H is already defined. 
+#ifndef SQLITE_VEC_H +#include "sqlite-vec.c" // IWYU pragma: keep +#endif + +#include <float.h> +#include <string.h> + +#define VEC0_IVF_KMEANS_MAX_ITER 25 +#define VEC0_IVF_KMEANS_DEFAULT_SEED 0 + +// Simple xorshift32 PRNG +static uint32_t ivf_xorshift32(uint32_t *state) { + uint32_t x = *state; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + *state = x; + return x; +} + +// L2 squared distance between two float vectors +static float ivf_l2_dist(const float *a, const float *b, int D) { + float sum = 0.0f; + for (int d = 0; d < D; d++) { + float diff = a[d] - b[d]; + sum += diff * diff; + } + return sum; +} + +// Find nearest centroid for a single vector. Returns centroid index. +static int ivf_nearest_centroid(const float *vec, const float *centroids, + int D, int k) { + float min_dist = FLT_MAX; + int best = 0; + for (int c = 0; c < k; c++) { + float dist = ivf_l2_dist(vec, &centroids[c * D], D); + if (dist < min_dist) { + min_dist = dist; + best = c; + } + } + return best; +} + +/** + * K-means++ initialization. + * Picks k initial centroids from the data with probability proportional + * to squared distance from nearest existing centroid. 
+ */ +static int ivf_kmeans_init_plusplus(const float *vectors, int N, int D, + int k, uint32_t seed, float *centroids) { + if (N <= 0 || k <= 0 || D <= 0) + return -1; + if (seed == 0) + seed = 42; + + // Pick first centroid randomly + int first = ivf_xorshift32(&seed) % N; + memcpy(centroids, &vectors[first * D], D * sizeof(float)); + + if (k == 1) + return 0; + + // Allocate distance array + float *dists = sqlite3_malloc64((i64)N * sizeof(float)); + if (!dists) + return -1; + + for (int c = 1; c < k; c++) { + // Compute D(x) = distance to nearest existing centroid + double total = 0.0; + for (int i = 0; i < N; i++) { + float d = ivf_l2_dist(&vectors[i * D], &centroids[(c - 1) * D], D); + if (c == 1 || d < dists[i]) { + dists[i] = d; + } + total += dists[i]; + } + + // Weighted random selection + if (total <= 0.0) { + // All distances zero — pick randomly + int pick = ivf_xorshift32(&seed) % N; + memcpy(&centroids[c * D], &vectors[pick * D], D * sizeof(float)); + } else { + double threshold = ((double)ivf_xorshift32(&seed) / (double)0xFFFFFFFF) * total; + double cumulative = 0.0; + int pick = N - 1; + for (int i = 0; i < N; i++) { + cumulative += dists[i]; + if (cumulative >= threshold) { + pick = i; + break; + } + } + memcpy(&centroids[c * D], &vectors[pick * D], D * sizeof(float)); + } + } + + sqlite3_free(dists); + return 0; +} + +/** + * Lloyd's k-means algorithm. 
+ * + * @param vectors N*D float array (row-major) + * @param N number of vectors + * @param D dimensionality + * @param k number of clusters + * @param max_iter maximum iterations + * @param seed PRNG seed for initialization + * @param out_centroids output: k*D float array (caller-allocated) + * @return 0 on success, -1 on error + */ +static int ivf_kmeans(const float *vectors, int N, int D, int k, + int max_iter, uint32_t seed, float *out_centroids) { + if (N <= 0 || D <= 0 || k <= 0) + return -1; + + // Clamp k to N + if (k > N) + k = N; + + // Allocate working memory + int *assignments = sqlite3_malloc64((i64)N * sizeof(int)); + float *new_centroids = sqlite3_malloc64((i64)k * D * sizeof(float)); + int *counts = sqlite3_malloc64((i64)k * sizeof(int)); + + if (!assignments || !new_centroids || !counts) { + sqlite3_free(assignments); + sqlite3_free(new_centroids); + sqlite3_free(counts); + return -1; + } + + memset(assignments, -1, N * sizeof(int)); + + // Initialize centroids via k-means++ + if (ivf_kmeans_init_plusplus(vectors, N, D, k, seed, out_centroids) != 0) { + sqlite3_free(assignments); + sqlite3_free(new_centroids); + sqlite3_free(counts); + return -1; + } + + for (int iter = 0; iter < max_iter; iter++) { + // Assignment step + int changed = 0; + for (int i = 0; i < N; i++) { + int nearest = ivf_nearest_centroid(&vectors[i * D], out_centroids, D, k); + if (nearest != assignments[i]) { + assignments[i] = nearest; + changed++; + } + } + if (changed == 0) + break; + + // Update step + memset(new_centroids, 0, (size_t)k * D * sizeof(float)); + memset(counts, 0, k * sizeof(int)); + + for (int i = 0; i < N; i++) { + int c = assignments[i]; + counts[c]++; + for (int d = 0; d < D; d++) { + new_centroids[c * D + d] += vectors[i * D + d]; + } + } + + for (int c = 0; c < k; c++) { + if (counts[c] == 0) { + // Empty cluster: reassign to farthest point from its nearest centroid + float max_dist = -1.0f; + int farthest = 0; + for (int i = 0; i < N; i++) { + float d = 
ivf_l2_dist(&vectors[i * D], + &out_centroids[assignments[i] * D], D); + if (d > max_dist) { + max_dist = d; + farthest = i; + } + } + memcpy(&out_centroids[c * D], &vectors[farthest * D], + D * sizeof(float)); + } else { + for (int d = 0; d < D; d++) { + out_centroids[c * D + d] = new_centroids[c * D + d] / counts[c]; + } + } + } + } + + sqlite3_free(assignments); + sqlite3_free(new_centroids); + sqlite3_free(counts); + return 0; +} + +#endif /* SQLITE_VEC_IVF_KMEANS_C */ diff --git a/sqlite-vec-ivf.c b/sqlite-vec-ivf.c new file mode 100644 index 0000000..5bc8edb --- /dev/null +++ b/sqlite-vec-ivf.c @@ -0,0 +1,1445 @@ +/** + * sqlite-vec-ivf.c — IVF (Inverted File Index) for sqlite-vec. + * + * #include'd into sqlite-vec.c after struct definitions and before vec0_init(). + * + * Storage: fixed-size packed blob cells (capped at IVF_CELL_MAX_VECTORS). + * Multiple cell rows per centroid. cell_id is auto-increment rowid, + * centroid_id is indexed for lookup. This keeps blobs small (~200KB) + * and avoids expensive overflow page traversal on insert. + */ + +#ifndef SQLITE_VEC_IVF_C +#define SQLITE_VEC_IVF_C + +#ifdef SQLITE_VEC_TEST +#define IVF_STATIC +#else +#define IVF_STATIC static +#endif + +// When opened standalone in an editor, pull in sqlite-vec.c so the LSP +// can resolve all types (vec0_vtab, VectorColumnDefinition, etc.). +// When #include'd from sqlite-vec.c, SQLITE_VEC_H is already defined. 
+#ifndef SQLITE_VEC_H +#include "sqlite-vec.c" // IWYU pragma: keep +#endif + +#define VEC0_IVF_DEFAULT_NLIST 128 +#define VEC0_IVF_DEFAULT_NPROBE 10 +#define VEC0_IVF_MAX_NLIST 65536 +#define VEC0_IVF_CELL_MAX_VECTORS 64 // ~200KB per cell at 768-dim f32 +#define VEC0_IVF_UNASSIGNED_CENTROID_ID (-1) + +#define VEC0_SHADOW_IVF_CENTROIDS_NAME "\"%w\".\"%w_ivf_centroids%02d\"" +#define VEC0_SHADOW_IVF_CELLS_NAME "\"%w\".\"%w_ivf_cells%02d\"" +#define VEC0_SHADOW_IVF_ROWID_MAP_NAME "\"%w\".\"%w_ivf_rowid_map%02d\"" +#define VEC0_SHADOW_IVF_VECTORS_NAME "\"%w\".\"%w_ivf_vectors%02d\"" + +// ============================================================================ +// Parser +// ============================================================================ + +static int vec0_parse_ivf_options(struct Vec0Scanner *scanner, + struct Vec0IvfConfig *config) { + struct Vec0Token token; + int rc; + config->nlist = VEC0_IVF_DEFAULT_NLIST; + config->nprobe = -1; + config->quantizer = VEC0_IVF_QUANTIZER_NONE; + config->oversample = 1; + int nprobe_explicit = 0; + + rc = vec0_scanner_next(scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_LPAREN) + return SQLITE_ERROR; + + rc = vec0_scanner_next(scanner, &token); + if (rc == VEC0_TOKEN_RESULT_SOME && token.token_type == TOKEN_TYPE_RPAREN) { + config->nprobe = VEC0_IVF_DEFAULT_NPROBE; + return SQLITE_OK; + } + + while (1) { + if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_IDENTIFIER) + return SQLITE_ERROR; + char *key = token.start; + int keyLength = token.end - token.start; + + rc = vec0_scanner_next(scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_EQ) + return SQLITE_ERROR; + + // Read value — can be digit or identifier + rc = vec0_scanner_next(scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME) return SQLITE_ERROR; + if (token.token_type != TOKEN_TYPE_DIGIT && + token.token_type != TOKEN_TYPE_IDENTIFIER) + return SQLITE_ERROR; + + 
char *val = token.start; + int valLength = token.end - token.start; + + if (sqlite3_strnicmp(key, "nlist", keyLength) == 0) { + if (token.token_type != TOKEN_TYPE_DIGIT) return SQLITE_ERROR; + int v = atoi(val); + if (v < 0 || v > VEC0_IVF_MAX_NLIST) return SQLITE_ERROR; + config->nlist = v; + } else if (sqlite3_strnicmp(key, "nprobe", keyLength) == 0) { + if (token.token_type != TOKEN_TYPE_DIGIT) return SQLITE_ERROR; + int v = atoi(val); + if (v < 1 || v > VEC0_IVF_MAX_NLIST) return SQLITE_ERROR; + config->nprobe = v; + nprobe_explicit = 1; + } else if (sqlite3_strnicmp(key, "quantizer", keyLength) == 0) { + if (token.token_type != TOKEN_TYPE_IDENTIFIER) return SQLITE_ERROR; + if (sqlite3_strnicmp(val, "none", valLength) == 0) { + config->quantizer = VEC0_IVF_QUANTIZER_NONE; + } else if (sqlite3_strnicmp(val, "int8", valLength) == 0) { + config->quantizer = VEC0_IVF_QUANTIZER_INT8; + } else if (sqlite3_strnicmp(val, "binary", valLength) == 0) { + config->quantizer = VEC0_IVF_QUANTIZER_BINARY; + } else { + return SQLITE_ERROR; + } + } else if (sqlite3_strnicmp(key, "oversample", keyLength) == 0) { + if (token.token_type != TOKEN_TYPE_DIGIT) return SQLITE_ERROR; + int v = atoi(val); + if (v < 1) return SQLITE_ERROR; + config->oversample = v; + } else { + return SQLITE_ERROR; + } + + rc = vec0_scanner_next(scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME) return SQLITE_ERROR; + if (token.token_type == TOKEN_TYPE_RPAREN) break; + if (token.token_type != TOKEN_TYPE_COMMA) return SQLITE_ERROR; + rc = vec0_scanner_next(scanner, &token); + } + + if (config->nprobe < 0) config->nprobe = VEC0_IVF_DEFAULT_NPROBE; + if (config->nlist > 0 && config->nprobe > config->nlist) { + if (nprobe_explicit) return SQLITE_ERROR; + config->nprobe = config->nlist; + } + + // Validation: oversample > 1 only makes sense with quantization + if (config->oversample > 1 && config->quantizer == VEC0_IVF_QUANTIZER_NONE) { + return SQLITE_ERROR; + } + + return SQLITE_OK; +} + +// 
============================================================================ +// Helpers +// ============================================================================ + +/** + * Size of a stored vector in bytes, accounting for quantization. + */ +static int ivf_vec_size(vec0_vtab *p, int col_idx) { + int D = (int)p->vector_columns[col_idx].dimensions; + switch (p->vector_columns[col_idx].ivf.quantizer) { + case VEC0_IVF_QUANTIZER_INT8: return D; + case VEC0_IVF_QUANTIZER_BINARY: return D / 8; + default: return D * (int)sizeof(float); + } +} + +/** + * Size of the full-precision vector in bytes (always float32). + */ +static int ivf_full_vec_size(vec0_vtab *p, int col_idx) { + return (int)(p->vector_columns[col_idx].dimensions * sizeof(float)); +} + +/** + * Quantize float32 vector to int8. + * Uses unit normalization: clamp to [-1,1], scale to [-127,127]. + */ +IVF_STATIC void ivf_quantize_int8(const float *src, int8_t *dst, int D) { + for (int i = 0; i < D; i++) { + float v = src[i]; + if (v > 1.0f) v = 1.0f; + if (v < -1.0f) v = -1.0f; + dst[i] = (int8_t)(v * 127.0f); + } +} + +/** + * Quantize float32 vector to binary (sign-bit quantization). + * Each bit = 1 if src[i] > 0, else 0. + */ +IVF_STATIC void ivf_quantize_binary(const float *src, uint8_t *dst, int D) { + memset(dst, 0, D / 8); + for (int i = 0; i < D; i++) { + if (src[i] > 0.0f) { + dst[i / 8] |= (1 << (i % 8)); + } + } +} + +/** + * Quantize a float32 vector to the target type based on config. + * dst must be pre-allocated to ivf_vec_size() bytes. + * If quantizer=none, copies src as-is. 
+ */ +static void ivf_quantize(vec0_vtab *p, int col_idx, + const float *src, void *dst) { + int D = (int)p->vector_columns[col_idx].dimensions; + switch (p->vector_columns[col_idx].ivf.quantizer) { + case VEC0_IVF_QUANTIZER_INT8: + ivf_quantize_int8(src, (int8_t *)dst, D); + break; + case VEC0_IVF_QUANTIZER_BINARY: + ivf_quantize_binary(src, (uint8_t *)dst, D); + break; + default: + memcpy(dst, src, D * sizeof(float)); + break; + } +} + +// Forward declaration +static float ivf_distance(vec0_vtab *p, int col_idx, const void *a, const void *b); + +/** + * Find nearest centroid. Works with quantized or float centroids. + * vec and centroids must be in the same representation (both quantized or both float). + * vecSize = size of one vector in bytes. + */ +static int ivf_find_nearest_centroid(vec0_vtab *p, int col_idx, + const void *vec, const void *centroids, + int vecSize, int k) { + float min_dist = FLT_MAX; + int best = 0; + const unsigned char *cdata = (const unsigned char *)centroids; + for (int c = 0; c < k; c++) { + float dist = ivf_distance(p, col_idx, vec, cdata + c * vecSize); + if (dist < min_dist) { min_dist = dist; best = c; } + } + return best; +} + +/** + * Compute distance between two vectors using the column's distance_metric. + * Dispatches to SIMD-optimized functions (NEON/AVX) via distance_*_float(). + * For float32 (non-quantized) vectors. + */ +static float ivf_distance_float(vec0_vtab *p, int col_idx, + const float *a, const float *b) { + size_t dims = p->vector_columns[col_idx].dimensions; + switch (p->vector_columns[col_idx].distance_metric) { + case VEC0_DISTANCE_METRIC_COSINE: + return distance_cosine_float(a, b, &dims); + case VEC0_DISTANCE_METRIC_L1: + return (float)distance_l1_f32(a, b, &dims); + case VEC0_DISTANCE_METRIC_L2: + default: + return distance_l2_sqr_float(a, b, &dims); + } +} + +/** + * Compute distance between two quantized vectors. + * For int8: always uses squared L2 on int8 (distance_metric is not consulted). 
+ * For binary: uses hamming distance. 
+ * For none: delegates to ivf_distance_float. + */ +static float ivf_distance(vec0_vtab *p, int col_idx, + const void *a, const void *b) { + size_t dims = p->vector_columns[col_idx].dimensions; + switch (p->vector_columns[col_idx].ivf.quantizer) { + case VEC0_IVF_QUANTIZER_INT8: + return distance_l2_sqr_int8(a, b, &dims); + case VEC0_IVF_QUANTIZER_BINARY: + return distance_hamming(a, b, &dims); + default: + return ivf_distance_float(p, col_idx, (const float *)a, (const float *)b); + } +} + +static int ivf_ensure_stmt(vec0_vtab *p, sqlite3_stmt **pStmt, const char *fmt, + int col_idx) { + if (*pStmt) return SQLITE_OK; + char *zSql = sqlite3_mprintf(fmt, p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, pStmt, NULL); + sqlite3_free(zSql); + return rc; +} + +static int ivf_exec(vec0_vtab *p, const char *fmt, int col_idx) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf(fmt, p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc == SQLITE_OK) { sqlite3_step(stmt); sqlite3_finalize(stmt); } + return SQLITE_OK; +} + +static int ivf_is_trained(vec0_vtab *p, int col_idx) { + if (p->ivfTrainedCache[col_idx] >= 0) return p->ivfTrainedCache[col_idx]; + sqlite3_stmt *stmt = NULL; + int trained = 0; + char *zSql = sqlite3_mprintf( + "SELECT value FROM " VEC0_SHADOW_INFO_NAME " WHERE key = 'ivf_trained_%d'", + p->schemaName, p->tableName, col_idx); + if (!zSql) return 0; + if (sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL) == SQLITE_OK) { + if (sqlite3_step(stmt) == SQLITE_ROW) + trained = (sqlite3_column_int(stmt, 0) == 1); + } + sqlite3_free(zSql); + sqlite3_finalize(stmt); + p->ivfTrainedCache[col_idx] = trained; + return trained; +} + +// ============================================================================ +// Cell operations — fixed-size cells, multiple rows per centroid 
+// ============================================================================ + +/** + * Create a new cell row. Returns the new cell_id (rowid) via *out_cell_id. + */ +static int ivf_cell_create(vec0_vtab *p, int col_idx, i64 centroid_id, + i64 *out_cell_id) { + sqlite3_stmt *stmt = NULL; + int rc; + int cap = VEC0_IVF_CELL_MAX_VECTORS; + int vecSize = ivf_vec_size(p, col_idx); + char *zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_IVF_CELLS_NAME + " (centroid_id, n_vectors, validity, rowids, vectors) VALUES (?, 0, ?, ?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + sqlite3_bind_int64(stmt, 1, centroid_id); + sqlite3_bind_zeroblob(stmt, 2, cap / 8); + sqlite3_bind_zeroblob(stmt, 3, cap * (int)sizeof(i64)); + sqlite3_bind_zeroblob(stmt, 4, cap * vecSize); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) return SQLITE_ERROR; + if (out_cell_id) *out_cell_id = sqlite3_last_insert_rowid(p->db); + return SQLITE_OK; +} + +/** + * Find a cell with space for the given centroid, or create one. + * Returns cell_id (rowid) and current n_vectors. + */ +static int ivf_cell_find_or_create(vec0_vtab *p, int col_idx, i64 centroid_id, + i64 *out_cell_id, int *out_n) { + int rc; + // Find existing cell with space + rc = ivf_ensure_stmt(p, &p->stmtIvfCellMeta[col_idx], + "SELECT rowid, n_vectors FROM " VEC0_SHADOW_IVF_CELLS_NAME + " WHERE centroid_id = ? AND n_vectors < %d LIMIT 1", + col_idx); + // The %d in the format won't work with ivf_ensure_stmt since it only has 3 + // format args. Use a direct approach instead. + sqlite3_finalize(p->stmtIvfCellMeta[col_idx]); + p->stmtIvfCellMeta[col_idx] = NULL; + + char *zSql = sqlite3_mprintf( + "SELECT rowid, n_vectors FROM " VEC0_SHADOW_IVF_CELLS_NAME + " WHERE centroid_id = ? 
AND n_vectors < %d LIMIT 1", + p->schemaName, p->tableName, col_idx, VEC0_IVF_CELL_MAX_VECTORS); + if (!zSql) return SQLITE_NOMEM; + // Cache this manually + if (!p->stmtIvfCellMeta[col_idx]) { + rc = sqlite3_prepare_v2(p->db, zSql, -1, &p->stmtIvfCellMeta[col_idx], NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + } else { + sqlite3_free(zSql); + } + + sqlite3_stmt *stmt = p->stmtIvfCellMeta[col_idx]; + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, centroid_id); + + if (sqlite3_step(stmt) == SQLITE_ROW) { + *out_cell_id = sqlite3_column_int64(stmt, 0); + *out_n = sqlite3_column_int(stmt, 1); + return SQLITE_OK; + } + + // No cell with space — create new one + rc = ivf_cell_create(p, col_idx, centroid_id, out_cell_id); + *out_n = 0; + return rc; +} + +/** + * Insert vector into cell at slot = n_vectors (append). + * Cell must have space (n_vectors < VEC0_IVF_CELL_MAX_VECTORS). + */ +static int ivf_cell_insert(vec0_vtab *p, int col_idx, i64 centroid_id, + i64 rowid, const void *vectorData, int vectorSize) { + int rc; + i64 cell_id; + int n_vectors; + + rc = ivf_cell_find_or_create(p, col_idx, centroid_id, &cell_id, &n_vectors); + if (rc != SQLITE_OK) return rc; + + int slot = n_vectors; + char *cellsTable = p->shadowIvfCellsNames[col_idx]; + + // Set validity bit + sqlite3_blob *blob = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, cellsTable, "validity", + cell_id, 1, &blob); + if (rc != SQLITE_OK) return rc; + unsigned char bx; + sqlite3_blob_read(blob, &bx, 1, slot / 8); + bx |= (1 << (slot % 8)); + sqlite3_blob_write(blob, &bx, 1, slot / 8); + sqlite3_blob_close(blob); + + // Write rowid + rc = sqlite3_blob_open(p->db, p->schemaName, cellsTable, "rowids", + cell_id, 1, &blob); + if (rc == SQLITE_OK) { + sqlite3_blob_write(blob, &rowid, sizeof(i64), slot * (int)sizeof(i64)); + sqlite3_blob_close(blob); + } + + // Write vector + rc = sqlite3_blob_open(p->db, p->schemaName, cellsTable, "vectors", + cell_id, 1, &blob); + if (rc == 
SQLITE_OK) { + sqlite3_blob_write(blob, vectorData, vectorSize, slot * vectorSize); + sqlite3_blob_close(blob); + } + + // Increment n_vectors (cached) + ivf_ensure_stmt(p, &p->stmtIvfCellUpdateN[col_idx], + "UPDATE " VEC0_SHADOW_IVF_CELLS_NAME + " SET n_vectors = n_vectors + 1 WHERE rowid = ?", col_idx); + if (p->stmtIvfCellUpdateN[col_idx]) { + sqlite3_stmt *s = p->stmtIvfCellUpdateN[col_idx]; + sqlite3_reset(s); + sqlite3_bind_int64(s, 1, cell_id); + sqlite3_step(s); + } + + // Insert rowid_map (cached) + ivf_ensure_stmt(p, &p->stmtIvfRowidMapInsert[col_idx], + "INSERT INTO " VEC0_SHADOW_IVF_ROWID_MAP_NAME + " (rowid, cell_id, slot) VALUES (?, ?, ?)", col_idx); + if (p->stmtIvfRowidMapInsert[col_idx]) { + sqlite3_stmt *s = p->stmtIvfRowidMapInsert[col_idx]; + sqlite3_reset(s); + sqlite3_bind_int64(s, 1, rowid); + sqlite3_bind_int64(s, 2, cell_id); + sqlite3_bind_int(s, 3, slot); + sqlite3_step(s); + } + + return SQLITE_OK; +} + +// ============================================================================ +// Shadow tables +// ============================================================================ + +static int ivf_create_shadow_tables(vec0_vtab *p, int col_idx) { + sqlite3_stmt *stmt = NULL; + int rc; + char *zSql; + + zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_IVF_CENTROIDS_NAME + " (centroid_id INTEGER PRIMARY KEY, centroid BLOB NOT NULL)", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK || sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); return SQLITE_ERROR; } + sqlite3_finalize(stmt); + + // cell_id is rowid (auto-increment), centroid_id is indexed + zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_IVF_CELLS_NAME + " (centroid_id INTEGER NOT NULL," + " n_vectors INTEGER NOT NULL DEFAULT 0," + " validity BLOB NOT NULL," + " rowids BLOB NOT NULL," + " vectors BLOB NOT NULL)", + p->schemaName, 
p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK || sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); return SQLITE_ERROR; } + sqlite3_finalize(stmt); + + // Index on centroid_id for cell lookup + zSql = sqlite3_mprintf( + "CREATE INDEX \"%w_ivf_cells%02d_centroid\" ON \"%w_ivf_cells%02d\" (centroid_id)", + p->tableName, col_idx, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK || sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); return SQLITE_ERROR; } + sqlite3_finalize(stmt); + + zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_IVF_ROWID_MAP_NAME + " (rowid INTEGER PRIMARY KEY, cell_id INTEGER NOT NULL, slot INTEGER NOT NULL)", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK || sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); return SQLITE_ERROR; } + sqlite3_finalize(stmt); + + // _ivf_vectors — full-precision KV store (only when quantizer != none) + if (p->vector_columns[col_idx].ivf.quantizer != VEC0_IVF_QUANTIZER_NONE) { + zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_IVF_VECTORS_NAME + " (rowid INTEGER PRIMARY KEY, vector BLOB NOT NULL)", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK || sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); return SQLITE_ERROR; } + sqlite3_finalize(stmt); + } + + zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_INFO_NAME " (key, value) VALUES ('ivf_trained_%d', '0')", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); 
sqlite3_free(zSql); + if (rc != SQLITE_OK || sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); return SQLITE_ERROR; } + sqlite3_finalize(stmt); + + return SQLITE_OK; +} + +static int ivf_drop_shadow_tables(vec0_vtab *p, int col_idx) { + ivf_exec(p, "DROP TABLE IF EXISTS " VEC0_SHADOW_IVF_CENTROIDS_NAME, col_idx); + ivf_exec(p, "DROP TABLE IF EXISTS " VEC0_SHADOW_IVF_CELLS_NAME, col_idx); + ivf_exec(p, "DROP TABLE IF EXISTS " VEC0_SHADOW_IVF_ROWID_MAP_NAME, col_idx); + ivf_exec(p, "DROP TABLE IF EXISTS " VEC0_SHADOW_IVF_VECTORS_NAME, col_idx); + return SQLITE_OK; +} + +// ============================================================================ +// Insert / Delete +// ============================================================================ + +static int ivf_insert(vec0_vtab *p, int col_idx, i64 rowid, + const void *vectorData, int vectorSize) { + UNUSED_PARAMETER(vectorSize); + int quantizer = p->vector_columns[col_idx].ivf.quantizer; + int qvecSize = ivf_vec_size(p, col_idx); + int rc; + + // Quantize the input vector (or copy as-is if no quantization) + void *qvec = sqlite3_malloc(qvecSize); + if (!qvec) return SQLITE_NOMEM; + ivf_quantize(p, col_idx, (const float *)vectorData, qvec); + + if (!ivf_is_trained(p, col_idx)) { + rc = ivf_cell_insert(p, col_idx, VEC0_IVF_UNASSIGNED_CENTROID_ID, + rowid, qvec, qvecSize); + } else { + // Find nearest centroid using quantized distance + int best_centroid = -1; + float min_dist = FLT_MAX; + + rc = ivf_ensure_stmt(p, &p->stmtIvfCentroidsAll[col_idx], + "SELECT centroid_id, centroid FROM " VEC0_SHADOW_IVF_CENTROIDS_NAME, col_idx); + if (rc != SQLITE_OK) { sqlite3_free(qvec); return rc; } + sqlite3_stmt *stmt = p->stmtIvfCentroidsAll[col_idx]; + sqlite3_reset(stmt); + while (sqlite3_step(stmt) == SQLITE_ROW) { + int cid = sqlite3_column_int(stmt, 0); + const void *c = sqlite3_column_blob(stmt, 1); + int cBytes = sqlite3_column_bytes(stmt, 1); + if (!c || cBytes != qvecSize) continue; + float dist = 
ivf_distance(p, col_idx, qvec, c); + if (dist < min_dist) { min_dist = dist; best_centroid = cid; } + } + if (best_centroid < 0) { sqlite3_free(qvec); return SQLITE_ERROR; } + + rc = ivf_cell_insert(p, col_idx, best_centroid, rowid, qvec, qvecSize); + } + + sqlite3_free(qvec); + if (rc != SQLITE_OK) return rc; + + // Store full-precision vector in KV table when quantized + if (quantizer != VEC0_IVF_QUANTIZER_NONE) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_IVF_VECTORS_NAME " (rowid, vector) VALUES (?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + sqlite3_bind_int64(stmt, 1, rowid); + sqlite3_bind_blob(stmt, 2, vectorData, ivf_full_vec_size(p, col_idx), SQLITE_STATIC); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) return SQLITE_ERROR; + } + + return SQLITE_OK; +} + +static int ivf_delete(vec0_vtab *p, int col_idx, i64 rowid) { + int rc; + i64 cell_id = 0; + int slot = -1; + + rc = ivf_ensure_stmt(p, &p->stmtIvfRowidMapLookup[col_idx], + "SELECT cell_id, slot FROM " VEC0_SHADOW_IVF_ROWID_MAP_NAME + " WHERE rowid = ?", col_idx); + if (rc != SQLITE_OK) return rc; + sqlite3_stmt *s = p->stmtIvfRowidMapLookup[col_idx]; + sqlite3_reset(s); + sqlite3_bind_int64(s, 1, rowid); + if (sqlite3_step(s) == SQLITE_ROW) { + cell_id = sqlite3_column_int64(s, 0); + slot = sqlite3_column_int(s, 1); + } + if (slot < 0) return SQLITE_OK; + + // Clear validity bit + char *cellsTable = p->shadowIvfCellsNames[col_idx]; + sqlite3_blob *blob = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, cellsTable, "validity", + cell_id, 1, &blob); + if (rc == SQLITE_OK) { + unsigned char bx; + sqlite3_blob_read(blob, &bx, 1, slot / 8); + bx &= ~(1 << (slot % 8)); + sqlite3_blob_write(blob, &bx, 1, slot / 8); + sqlite3_blob_close(blob); + } + + // Decrement n_vectors + if 
(p->stmtIvfCellUpdateN[col_idx]) { + // This stmt does +1, but we want -1. Use a different cached stmt. + } + // Just use inline for decrement (not hot path) + { + sqlite3_stmt *stmtDec = NULL; + char *zSql = sqlite3_mprintf( + "UPDATE " VEC0_SHADOW_IVF_CELLS_NAME + " SET n_vectors = n_vectors - 1 WHERE rowid = ?", + p->schemaName, p->tableName, col_idx); + if (zSql) { + sqlite3_prepare_v2(p->db, zSql, -1, &stmtDec, NULL); sqlite3_free(zSql); + if (stmtDec) { sqlite3_bind_int64(stmtDec, 1, cell_id); sqlite3_step(stmtDec); sqlite3_finalize(stmtDec); } + } + } + + // Delete from rowid_map + ivf_ensure_stmt(p, &p->stmtIvfRowidMapDelete[col_idx], + "DELETE FROM " VEC0_SHADOW_IVF_ROWID_MAP_NAME " WHERE rowid = ?", col_idx); + if (p->stmtIvfRowidMapDelete[col_idx]) { + sqlite3_stmt *sd = p->stmtIvfRowidMapDelete[col_idx]; + sqlite3_reset(sd); + sqlite3_bind_int64(sd, 1, rowid); + sqlite3_step(sd); + } + + // Delete from _ivf_vectors (full-precision KV) when quantized + if (p->vector_columns[col_idx].ivf.quantizer != VEC0_IVF_QUANTIZER_NONE) { + sqlite3_stmt *stmtDelVec = NULL; + char *zSql = sqlite3_mprintf( + "DELETE FROM " VEC0_SHADOW_IVF_VECTORS_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, col_idx); + if (zSql) { + sqlite3_prepare_v2(p->db, zSql, -1, &stmtDelVec, NULL); sqlite3_free(zSql); + if (stmtDelVec) { sqlite3_bind_int64(stmtDelVec, 1, rowid); sqlite3_step(stmtDelVec); sqlite3_finalize(stmtDelVec); } + } + } + + return SQLITE_OK; +} + +// ============================================================================ +// Point query +// ============================================================================ + +static int ivf_get_vector_data(vec0_vtab *p, i64 rowid, int col_idx, + void **outVector, int *outVectorSize) { + int rc; + int vecSize = ivf_vec_size(p, col_idx); + i64 cell_id = 0; + int slot = -1; + + rc = ivf_ensure_stmt(p, &p->stmtIvfRowidMapLookup[col_idx], + "SELECT cell_id, slot FROM " VEC0_SHADOW_IVF_ROWID_MAP_NAME + " WHERE rowid = 
?", col_idx); + if (rc != SQLITE_OK) return rc; + sqlite3_stmt *s = p->stmtIvfRowidMapLookup[col_idx]; + sqlite3_reset(s); + sqlite3_bind_int64(s, 1, rowid); + if (sqlite3_step(s) != SQLITE_ROW) return SQLITE_EMPTY; + cell_id = sqlite3_column_int64(s, 0); + slot = sqlite3_column_int(s, 1); + + void *buf = sqlite3_malloc(vecSize); + if (!buf) return SQLITE_NOMEM; + + sqlite3_blob *blob = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowIvfCellsNames[col_idx], + "vectors", cell_id, 0, &blob); + if (rc != SQLITE_OK) { sqlite3_free(buf); return rc; } + rc = sqlite3_blob_read(blob, buf, vecSize, slot * vecSize); + sqlite3_blob_close(blob); + if (rc != SQLITE_OK) { sqlite3_free(buf); return rc; } + + *outVector = buf; + if (outVectorSize) *outVectorSize = vecSize; + return SQLITE_OK; +} + +// ============================================================================ +// Centroid commands +// ============================================================================ + +static int ivf_load_all_vectors(vec0_vtab *p, int col_idx, + float **out_vectors, i64 **out_rowids, int *out_N) { + sqlite3_stmt *stmt = NULL; + int rc; + int D = (int)p->vector_columns[col_idx].dimensions; + int vecSize = D * (int)sizeof(float); + int quantizer = p->vector_columns[col_idx].ivf.quantizer; + + // When quantized, load full-precision vectors from _ivf_vectors KV table + if (quantizer != VEC0_IVF_QUANTIZER_NONE) { + int total = 0; + char *zSql = sqlite3_mprintf( + "SELECT count(*) FROM " VEC0_SHADOW_IVF_VECTORS_NAME, + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW) total = sqlite3_column_int(stmt, 0); + sqlite3_finalize(stmt); + if (total == 0) { *out_vectors = NULL; *out_rowids = NULL; *out_N = 0; return SQLITE_OK; } + + float *vectors = sqlite3_malloc64((i64)total * D * sizeof(float)); + i64 *rowids = 
sqlite3_malloc64((i64)total * sizeof(i64)); + if (!vectors || !rowids) { sqlite3_free(vectors); sqlite3_free(rowids); return SQLITE_NOMEM; } + + int idx = 0; + zSql = sqlite3_mprintf( + "SELECT rowid, vector FROM " VEC0_SHADOW_IVF_VECTORS_NAME, + p->schemaName, p->tableName, col_idx); + if (!zSql) { sqlite3_free(vectors); sqlite3_free(rowids); return SQLITE_NOMEM; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc == SQLITE_OK) { + while (sqlite3_step(stmt) == SQLITE_ROW && idx < total) { + rowids[idx] = sqlite3_column_int64(stmt, 0); + const void *blob = sqlite3_column_blob(stmt, 1); + int blobBytes = sqlite3_column_bytes(stmt, 1); + if (blob && blobBytes == vecSize) { + memcpy(&vectors[idx * D], blob, vecSize); + idx++; + } + } + } + sqlite3_finalize(stmt); + *out_vectors = vectors; *out_rowids = rowids; *out_N = idx; + return SQLITE_OK; + } + + // Non-quantized: load from cells (existing path) + + // Count total + int total = 0; + char *zSql = sqlite3_mprintf( + "SELECT COALESCE(SUM(n_vectors),0) FROM " VEC0_SHADOW_IVF_CELLS_NAME, + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW) total = sqlite3_column_int(stmt, 0); + sqlite3_finalize(stmt); + + if (total == 0) { *out_vectors = NULL; *out_rowids = NULL; *out_N = 0; return SQLITE_OK; } + + float *vectors = sqlite3_malloc64((i64)total * D * sizeof(float)); + i64 *rowids = sqlite3_malloc64((i64)total * sizeof(i64)); + if (!vectors || !rowids) { sqlite3_free(vectors); sqlite3_free(rowids); return SQLITE_NOMEM; } + + int idx = 0; + zSql = sqlite3_mprintf( + "SELECT n_vectors, validity, rowids, vectors FROM " VEC0_SHADOW_IVF_CELLS_NAME, + p->schemaName, p->tableName, col_idx); + if (!zSql) { sqlite3_free(vectors); sqlite3_free(rowids); return SQLITE_NOMEM; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); 
sqlite3_free(zSql); + if (rc != SQLITE_OK) { sqlite3_free(vectors); sqlite3_free(rowids); return rc; } + + while (sqlite3_step(stmt) == SQLITE_ROW) { + int n = sqlite3_column_int(stmt, 0); + if (n == 0) continue; + const unsigned char *val = (const unsigned char *)sqlite3_column_blob(stmt, 1); + const i64 *rids = (const i64 *)sqlite3_column_blob(stmt, 2); + const float *vecs = (const float *)sqlite3_column_blob(stmt, 3); + int valBytes = sqlite3_column_bytes(stmt, 1); + int ridsBytes = sqlite3_column_bytes(stmt, 2); + int vecsBytes = sqlite3_column_bytes(stmt, 3); + if (!val || !rids || !vecs) continue; + int cap = valBytes * 8; + // Clamp cap to the number of entries actually backed by the rowids and vectors blobs + if (ridsBytes / (int)sizeof(i64) < cap) cap = ridsBytes / (int)sizeof(i64); + if (vecsBytes / vecSize < cap) cap = vecsBytes / vecSize; + for (int i = 0; i < cap && idx < total; i++) { + if (val[i / 8] & (1 << (i % 8))) { + rowids[idx] = rids[i]; + memcpy(&vectors[idx * D], &vecs[i * D], vecSize); + idx++; + } + } + } + sqlite3_finalize(stmt); + *out_vectors = vectors; *out_rowids = rowids; *out_N = idx; + return SQLITE_OK; +} + +static void ivf_invalidate_cached(vec0_vtab *p, int col_idx) { + sqlite3_finalize(p->stmtIvfCellMeta[col_idx]); p->stmtIvfCellMeta[col_idx] = NULL; + sqlite3_finalize(p->stmtIvfCentroidsAll[col_idx]); p->stmtIvfCentroidsAll[col_idx] = NULL; + sqlite3_finalize(p->stmtIvfCellUpdateN[col_idx]); p->stmtIvfCellUpdateN[col_idx] = NULL; + sqlite3_finalize(p->stmtIvfRowidMapInsert[col_idx]); p->stmtIvfRowidMapInsert[col_idx] = NULL; +} + +static int ivf_cmd_compute_centroids(vec0_vtab *p, int col_idx, int nlist_override, + int max_iter, uint32_t seed) { + int rc; + int D = (int)p->vector_columns[col_idx].dimensions; + int vecSize = D * (int)sizeof(float); + int quantizer = p->vector_columns[col_idx].ivf.quantizer; + int nlist = nlist_override > 0 ? 
nlist_override : p->vector_columns[col_idx].ivf.nlist; + if (nlist <= 0) { vtab_set_error(&p->base, "nlist must be specified"); return SQLITE_ERROR; } + + float *vectors = NULL; i64 *rowids = NULL; int N = 0; + rc = ivf_load_all_vectors(p, col_idx, &vectors, &rowids, &N); + if (rc != SQLITE_OK) return rc; + if (N == 0) { vtab_set_error(&p->base, "No vectors"); sqlite3_free(vectors); sqlite3_free(rowids); return SQLITE_ERROR; } + if (nlist > N) nlist = N; + + float *centroids = sqlite3_malloc64((i64)nlist * D * sizeof(float)); + if (!centroids) { sqlite3_free(vectors); sqlite3_free(rowids); return SQLITE_NOMEM; } + if (ivf_kmeans(vectors, N, D, nlist, max_iter, seed, centroids) != 0) { + sqlite3_free(vectors); sqlite3_free(rowids); sqlite3_free(centroids); return SQLITE_ERROR; + } + + // Compute assignments + int *assignments = sqlite3_malloc64((i64)N * sizeof(int)); + if (!assignments) { sqlite3_free(vectors); sqlite3_free(rowids); sqlite3_free(centroids); return SQLITE_NOMEM; } + // Assignment uses float32 distances (k-means operates in float32 space) + for (int i = 0; i < N; i++) { + float min_d = FLT_MAX; + int best = 0; + for (int c = 0; c < nlist; c++) { + float d = ivf_distance_float(p, col_idx, &vectors[i * D], ¢roids[c * D]); + if (d < min_d) { min_d = d; best = c; } + } + assignments[i] = best; + } + + // Invalidate all cached stmts before dropping/recreating tables + ivf_invalidate_cached(p, col_idx); + + sqlite3_exec(p->db, "SAVEPOINT ivf_train", NULL, NULL, NULL); + sqlite3_stmt *stmt = NULL; + char *zSql; + + // Clear all data + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_CENTROIDS_NAME, col_idx); + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_CELLS_NAME, col_idx); + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_ROWID_MAP_NAME, col_idx); + + // Write centroids (quantized if quantizer is set) + int qvecSize = ivf_vec_size(p, col_idx); + void *qbuf = sqlite3_malloc(qvecSize > vecSize ? 
qvecSize : vecSize); + if (!qbuf) { rc = SQLITE_NOMEM; goto train_error; } + + zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_IVF_CENTROIDS_NAME " (centroid_id, centroid) VALUES (?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) { sqlite3_free(qbuf); rc = SQLITE_NOMEM; goto train_error; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK) { sqlite3_free(qbuf); goto train_error; } + for (int i = 0; i < nlist; i++) { + ivf_quantize(p, col_idx, ¢roids[i * D], qbuf); + sqlite3_reset(stmt); + sqlite3_bind_int(stmt, 1, i); + sqlite3_bind_blob(stmt, 2, qbuf, qvecSize, SQLITE_TRANSIENT); + if (sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); sqlite3_free(qbuf); rc = SQLITE_ERROR; goto train_error; } + } + sqlite3_finalize(stmt); + + // Build cells: group vectors by centroid, create fixed-size cells + { + // Prepare INSERT statements + sqlite3_stmt *stmtCell = NULL; + zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_IVF_CELLS_NAME + " (centroid_id, n_vectors, validity, rowids, vectors) VALUES (?, ?, ?, ?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) { rc = SQLITE_NOMEM; goto train_error; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtCell, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK) goto train_error; + + sqlite3_stmt *stmtMap = NULL; + zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_IVF_ROWID_MAP_NAME " (rowid, cell_id, slot) VALUES (?, ?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) { sqlite3_finalize(stmtCell); rc = SQLITE_NOMEM; goto train_error; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtMap, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK) { sqlite3_finalize(stmtCell); goto train_error; } + + int cap = VEC0_IVF_CELL_MAX_VECTORS; + unsigned char *val = sqlite3_malloc(cap / 8); + i64 *rids = sqlite3_malloc64((i64)cap * sizeof(i64)); + unsigned char *vecs = sqlite3_malloc64((i64)cap * qvecSize); // quantized size + if (!val || !rids || 
!vecs) { + sqlite3_free(val); sqlite3_free(rids); sqlite3_free(vecs); + sqlite3_finalize(stmtCell); sqlite3_finalize(stmtMap); + sqlite3_free(qbuf); + rc = SQLITE_NOMEM; goto train_error; + } + + // Process one centroid at a time + for (int c = 0; c < nlist; c++) { + int slot = 0; + memset(val, 0, cap / 8); + memset(rids, 0, cap * sizeof(i64)); + + for (int i = 0; i < N; i++) { + if (assignments[i] != c) continue; + + if (slot >= cap) { + // Flush current cell + sqlite3_reset(stmtCell); + sqlite3_bind_int(stmtCell, 1, c); + sqlite3_bind_int(stmtCell, 2, slot); + sqlite3_bind_blob(stmtCell, 3, val, cap / 8, SQLITE_TRANSIENT); + sqlite3_bind_blob(stmtCell, 4, rids, cap * (int)sizeof(i64), SQLITE_TRANSIENT); + sqlite3_bind_blob(stmtCell, 5, vecs, cap * qvecSize, SQLITE_TRANSIENT); + sqlite3_step(stmtCell); + i64 flushed_cell_id = sqlite3_last_insert_rowid(p->db); + + for (int s = 0; s < slot; s++) { + sqlite3_reset(stmtMap); + sqlite3_bind_int64(stmtMap, 1, rids[s]); + sqlite3_bind_int64(stmtMap, 2, flushed_cell_id); + sqlite3_bind_int(stmtMap, 3, s); + sqlite3_step(stmtMap); + } + + slot = 0; + memset(val, 0, cap / 8); + memset(rids, 0, cap * sizeof(i64)); + } + + val[slot / 8] |= (1 << (slot % 8)); + rids[slot] = rowids[i]; + // Quantize float32 vector into cell blob + ivf_quantize(p, col_idx, &vectors[i * D], &vecs[slot * qvecSize]); + slot++; + } + + // Flush remaining + if (slot > 0) { + sqlite3_reset(stmtCell); + sqlite3_bind_int(stmtCell, 1, c); + sqlite3_bind_int(stmtCell, 2, slot); + sqlite3_bind_blob(stmtCell, 3, val, cap / 8, SQLITE_TRANSIENT); + sqlite3_bind_blob(stmtCell, 4, rids, cap * (int)sizeof(i64), SQLITE_TRANSIENT); + sqlite3_bind_blob(stmtCell, 5, vecs, cap * qvecSize, SQLITE_TRANSIENT); + sqlite3_step(stmtCell); + i64 flushed_cell_id = sqlite3_last_insert_rowid(p->db); + + for (int s = 0; s < slot; s++) { + sqlite3_reset(stmtMap); + sqlite3_bind_int64(stmtMap, 1, rids[s]); + sqlite3_bind_int64(stmtMap, 2, flushed_cell_id); + 
sqlite3_bind_int(stmtMap, 3, s); + sqlite3_step(stmtMap); + } + } + } + + sqlite3_free(val); sqlite3_free(rids); sqlite3_free(vecs); + sqlite3_finalize(stmtCell); sqlite3_finalize(stmtMap); + } + + sqlite3_free(qbuf); + + // Store full-precision vectors in _ivf_vectors when quantized + if (quantizer != VEC0_IVF_QUANTIZER_NONE) { + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_VECTORS_NAME, col_idx); + zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_IVF_VECTORS_NAME " (rowid, vector) VALUES (?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) { rc = SQLITE_NOMEM; goto train_error; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK) goto train_error; + for (int i = 0; i < N; i++) { + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, rowids[i]); + sqlite3_bind_blob(stmt, 2, &vectors[i * D], vecSize, SQLITE_STATIC); + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + } + + // Set trained = 1 + { + zSql = sqlite3_mprintf( + "INSERT OR REPLACE INTO " VEC0_SHADOW_INFO_NAME " (key, value) VALUES ('ivf_trained_%d', '1')", + p->schemaName, p->tableName, col_idx); + if (zSql) { sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + sqlite3_step(stmt); sqlite3_finalize(stmt); } + } + p->ivfTrainedCache[col_idx] = 1; + + sqlite3_exec(p->db, "RELEASE ivf_train", NULL, NULL, NULL); + sqlite3_free(vectors); sqlite3_free(rowids); sqlite3_free(centroids); sqlite3_free(assignments); + return SQLITE_OK; + +train_error: + sqlite3_exec(p->db, "ROLLBACK TO ivf_train", NULL, NULL, NULL); + sqlite3_exec(p->db, "RELEASE ivf_train", NULL, NULL, NULL); + sqlite3_free(vectors); sqlite3_free(rowids); sqlite3_free(centroids); sqlite3_free(assignments); + return rc; +} + +static int ivf_cmd_set_centroid(vec0_vtab *p, int col_idx, int centroid_id, + const void *vectorData, int vectorSize) { + sqlite3_stmt *stmt = NULL; + int rc; + int D = (int)p->vector_columns[col_idx].dimensions; + if (vectorSize != (int)(D * 
sizeof(float))) { vtab_set_error(&p->base, "Dimension mismatch"); return SQLITE_ERROR; } + + char *zSql = sqlite3_mprintf( + "INSERT OR REPLACE INTO " VEC0_SHADOW_IVF_CENTROIDS_NAME " (centroid_id, centroid) VALUES (?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + sqlite3_bind_int(stmt, 1, centroid_id); + sqlite3_bind_blob(stmt, 2, vectorData, vectorSize, SQLITE_STATIC); + rc = sqlite3_step(stmt); sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) return SQLITE_ERROR; + + zSql = sqlite3_mprintf( + "INSERT OR REPLACE INTO " VEC0_SHADOW_INFO_NAME " (key, value) VALUES ('ivf_trained_%d', '1')", + p->schemaName, p->tableName, col_idx); + if (zSql) { sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + sqlite3_step(stmt); sqlite3_finalize(stmt); } + p->ivfTrainedCache[col_idx] = 1; + sqlite3_finalize(p->stmtIvfCentroidsAll[col_idx]); p->stmtIvfCentroidsAll[col_idx] = NULL; + return SQLITE_OK; +} + +static int ivf_cmd_assign_vectors(vec0_vtab *p, int col_idx) { + if (!ivf_is_trained(p, col_idx)) { vtab_set_error(&p->base, "No centroids"); return SQLITE_ERROR; } + + int D = (int)p->vector_columns[col_idx].dimensions; + int vecSize = D * (int)sizeof(float); + int rc; + sqlite3_stmt *stmt = NULL; + char *zSql; + + // Load centroids + int nlist = 0; + float *centroids = NULL; + zSql = sqlite3_mprintf("SELECT count(*) FROM " VEC0_SHADOW_IVF_CENTROIDS_NAME, + p->schemaName, p->tableName, col_idx); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW) nlist = sqlite3_column_int(stmt, 0); + sqlite3_finalize(stmt); + if (nlist == 0) { vtab_set_error(&p->base, "No centroids"); return SQLITE_ERROR; } + + centroids = sqlite3_malloc64((i64)nlist * D * sizeof(float)); + if (!centroids) return SQLITE_NOMEM; + zSql = sqlite3_mprintf("SELECT 
centroid_id, centroid FROM " VEC0_SHADOW_IVF_CENTROIDS_NAME " ORDER BY centroid_id", + p->schemaName, p->tableName, col_idx); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + { int ci = 0; while (sqlite3_step(stmt) == SQLITE_ROW && ci < nlist) { + const void *b = sqlite3_column_blob(stmt, 1); + int bBytes = sqlite3_column_bytes(stmt, 1); + if (b && bBytes == vecSize) memcpy(¢roids[ci * D], b, vecSize); + ci++; + }} + sqlite3_finalize(stmt); + + // Read unassigned cells, re-insert into trained cells + zSql = sqlite3_mprintf( + "SELECT rowid, n_vectors, validity, rowids, vectors FROM " VEC0_SHADOW_IVF_CELLS_NAME + " WHERE centroid_id = %d", + p->schemaName, p->tableName, col_idx, VEC0_IVF_UNASSIGNED_CENTROID_ID); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + + // Invalidate cached stmts since we'll be modifying cells + ivf_invalidate_cached(p, col_idx); + + while (sqlite3_step(stmt) == SQLITE_ROW) { + int n = sqlite3_column_int(stmt, 1); + const unsigned char *val = (const unsigned char *)sqlite3_column_blob(stmt, 2); + const i64 *rids = (const i64 *)sqlite3_column_blob(stmt, 3); + const float *vecs = (const float *)sqlite3_column_blob(stmt, 4); + int valBytes = sqlite3_column_bytes(stmt, 2); + int ridsBytes = sqlite3_column_bytes(stmt, 3); + int vecsBytes = sqlite3_column_bytes(stmt, 4); + if (!val || !rids || !vecs) continue; + int cap = valBytes * 8; + if (ridsBytes / (int)sizeof(i64) < cap) cap = ridsBytes / (int)sizeof(i64); + if (vecsBytes / vecSize < cap) cap = vecsBytes / vecSize; + + for (int i = 0; i < cap && n > 0; i++) { + if (!(val[i / 8] & (1 << (i % 8)))) continue; + n--; + int cid = ivf_find_nearest_centroid(p, col_idx, &vecs[i * D], centroids, D, nlist); + + // Delete old rowid_map entry + sqlite3_stmt *sd = NULL; + char *zd = sqlite3_mprintf("DELETE FROM " VEC0_SHADOW_IVF_ROWID_MAP_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, col_idx); + if (zd) { sqlite3_prepare_v2(p->db, zd, 
-1, &sd, NULL); sqlite3_free(zd); + sqlite3_bind_int64(sd, 1, rids[i]); sqlite3_step(sd); sqlite3_finalize(sd); } + + ivf_cell_insert(p, col_idx, cid, rids[i], &vecs[i * D], vecSize); + } + } + sqlite3_finalize(stmt); + + // Delete unassigned cells + zSql = sqlite3_mprintf( + "DELETE FROM " VEC0_SHADOW_IVF_CELLS_NAME " WHERE centroid_id = %d", + p->schemaName, p->tableName, col_idx, VEC0_IVF_UNASSIGNED_CENTROID_ID); + if (zSql) { sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + sqlite3_step(stmt); sqlite3_finalize(stmt); } + + sqlite3_free(centroids); + return SQLITE_OK; +} + +static int ivf_cmd_clear_centroids(vec0_vtab *p, int col_idx) { + float *vectors = NULL; i64 *rowids = NULL; int N = 0; + int vecSize = ivf_vec_size(p, col_idx); + int D = (int)p->vector_columns[col_idx].dimensions; + int rc; + sqlite3_stmt *stmt = NULL; + char *zSql; + + rc = ivf_load_all_vectors(p, col_idx, &vectors, &rowids, &N); + if (rc != SQLITE_OK) return rc; + + ivf_invalidate_cached(p, col_idx); + + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_CENTROIDS_NAME, col_idx); + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_CELLS_NAME, col_idx); + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_ROWID_MAP_NAME, col_idx); + + // Re-insert all vectors into unassigned cells + for (int i = 0; i < N; i++) { + ivf_cell_insert(p, col_idx, VEC0_IVF_UNASSIGNED_CENTROID_ID, + rowids[i], &vectors[i * D], vecSize); + } + + zSql = sqlite3_mprintf( + "INSERT OR REPLACE INTO " VEC0_SHADOW_INFO_NAME " (key, value) VALUES ('ivf_trained_%d', '0')", + p->schemaName, p->tableName, col_idx); + if (zSql) { sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + sqlite3_step(stmt); sqlite3_finalize(stmt); } + p->ivfTrainedCache[col_idx] = 0; + + sqlite3_free(vectors); sqlite3_free(rowids); + return SQLITE_OK; +} + +// ============================================================================ +// KNN Query — scan all cells for probed centroids +// 
============================================================================ + +struct IvfCentroidDist { int id; float dist; }; +struct IvfCandidate { i64 rowid; float distance; }; + +static int ivf_candidate_cmp(const void *a, const void *b) { + float da = ((const struct IvfCandidate *)a)->distance; + float db = ((const struct IvfCandidate *)b)->distance; + if (da < db) return -1; + if (da > db) return 1; + return 0; +} + +/** + * Scan cell rows from a prepared statement, computing distances in-memory. + * The statement must return (n_vectors, validity, rowids, vectors) columns. + * queryVecQ is the quantized query (same type as cell vectors). + * qvecSize is the size of one quantized vector in bytes. + */ +static int ivf_scan_cells_from_stmt(vec0_vtab *p, int col_idx, + sqlite3_stmt *stmt, + const void *queryVecQ, int qvecSize, + struct IvfCandidate **candidates, + int *nCandidates, int *cap) { + while (sqlite3_step(stmt) == SQLITE_ROW) { + int n = sqlite3_column_int(stmt, 0); + if (n == 0) continue; + const unsigned char *validity = (const unsigned char *)sqlite3_column_blob(stmt, 1); + const i64 *rowids = (const i64 *)sqlite3_column_blob(stmt, 2); + const unsigned char *vectors = (const unsigned char *)sqlite3_column_blob(stmt, 3); + int valBytes = sqlite3_column_bytes(stmt, 1); + int ridsBytes = sqlite3_column_bytes(stmt, 2); + int vecsBytes = sqlite3_column_bytes(stmt, 3); + if (!validity || !rowids || !vectors) continue; + int cell_cap = valBytes * 8; + if (ridsBytes / (int)sizeof(i64) < cell_cap) cell_cap = ridsBytes / (int)sizeof(i64); + if (vecsBytes / qvecSize < cell_cap) cell_cap = vecsBytes / qvecSize; + + int found = 0; + for (int i = 0; i < cell_cap && found < n; i++) { + if (!(validity[i / 8] & (1 << (i % 8)))) continue; + found++; + if (*nCandidates >= *cap) { + *cap *= 2; + struct IvfCandidate *tmp = sqlite3_realloc64(*candidates, (i64)*cap * sizeof(struct IvfCandidate)); + if (!tmp) return SQLITE_NOMEM; + *candidates = tmp; + } + 
(*candidates)[*nCandidates].rowid = rowids[i]; + (*candidates)[*nCandidates].distance = ivf_distance(p, col_idx, + queryVecQ, &vectors[i * qvecSize]); + (*nCandidates)++; + } + } + return SQLITE_OK; +} + +static int ivf_query_knn(vec0_vtab *p, int col_idx, + const void *queryVector, int queryVectorSize, + i64 k, struct vec0_query_knn_data *knn_data) { + UNUSED_PARAMETER(queryVectorSize); + int rc; + int nprobe = p->vector_columns[col_idx].ivf.nprobe; + int trained = ivf_is_trained(p, col_idx); + int quantizer = p->vector_columns[col_idx].ivf.quantizer; + int oversample = p->vector_columns[col_idx].ivf.oversample; + int qvecSize = ivf_vec_size(p, col_idx); + + // Quantize query vector for scanning + void *queryQ = sqlite3_malloc(qvecSize); + if (!queryQ) return SQLITE_NOMEM; + ivf_quantize(p, col_idx, (const float *)queryVector, queryQ); + + // With oversample, collect more candidates for re-ranking + i64 collect_k = (oversample > 1) ? k * oversample : k; + + int cap = (collect_k < 1024) ? 
1024 : (int)collect_k * 2; + int nCandidates = 0; + struct IvfCandidate *candidates = sqlite3_malloc64((i64)cap * sizeof(struct IvfCandidate)); + if (!candidates) { sqlite3_free(queryQ); return SQLITE_NOMEM; } + + if (trained) { + // Find top nprobe centroids using quantized distance + int nlist = 0; + rc = ivf_ensure_stmt(p, &p->stmtIvfCentroidsAll[col_idx], + "SELECT centroid_id, centroid FROM " VEC0_SHADOW_IVF_CENTROIDS_NAME, col_idx); + if (rc != SQLITE_OK) { sqlite3_free(queryQ); sqlite3_free(candidates); return rc; } + sqlite3_stmt *stmt = p->stmtIvfCentroidsAll[col_idx]; + sqlite3_reset(stmt); + + int centroid_cap = 64; + struct IvfCentroidDist *cd = sqlite3_malloc64(centroid_cap * sizeof(*cd)); + if (!cd) { sqlite3_free(queryQ); sqlite3_free(candidates); return SQLITE_NOMEM; } + + while (sqlite3_step(stmt) == SQLITE_ROW) { + if (nlist >= centroid_cap) { + centroid_cap *= 2; + struct IvfCentroidDist *tmp = sqlite3_realloc64(cd, centroid_cap * sizeof(*cd)); + if (!tmp) { sqlite3_free(cd); sqlite3_free(queryQ); sqlite3_free(candidates); return SQLITE_NOMEM; } + cd = tmp; + } + cd[nlist].id = sqlite3_column_int(stmt, 0); + const void *c = sqlite3_column_blob(stmt, 1); + int cBytes = sqlite3_column_bytes(stmt, 1); + // Compare quantized query with quantized centroid + cd[nlist].dist = (c && cBytes == qvecSize) ? ivf_distance(p, col_idx, queryQ, c) : FLT_MAX; + nlist++; + } + + int actual_nprobe = nprobe < nlist ? 
nprobe : nlist; + for (int i = 0; i < actual_nprobe; i++) { + int min_j = i; + for (int j = i + 1; j < nlist; j++) { + if (cd[j].dist < cd[min_j].dist) min_j = j; + } + if (min_j != i) { struct IvfCentroidDist tmp = cd[i]; cd[i] = cd[min_j]; cd[min_j] = tmp; } + } + + // Scan probed cells + unassigned with quantized distance + { + sqlite3_str *s = sqlite3_str_new(NULL); + sqlite3_str_appendf(s, + "SELECT n_vectors, validity, rowids, vectors FROM " VEC0_SHADOW_IVF_CELLS_NAME + " WHERE centroid_id IN (", + p->schemaName, p->tableName, col_idx); + for (int i = 0; i < actual_nprobe; i++) { + if (i > 0) sqlite3_str_appendall(s, ","); + sqlite3_str_appendf(s, "%d", cd[i].id); + } + sqlite3_str_appendf(s, ",%d)", VEC0_IVF_UNASSIGNED_CENTROID_ID); + char *zSql = sqlite3_str_finish(s); + if (!zSql) { sqlite3_free(cd); sqlite3_free(queryQ); sqlite3_free(candidates); return SQLITE_NOMEM; } + + sqlite3_stmt *stmtScan = NULL; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtScan, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) { sqlite3_free(cd); sqlite3_free(queryQ); sqlite3_free(candidates); return rc; } + + rc = ivf_scan_cells_from_stmt(p, col_idx, stmtScan, queryQ, qvecSize, + &candidates, &nCandidates, &cap); + sqlite3_finalize(stmtScan); + if (rc != SQLITE_OK) { sqlite3_free(cd); sqlite3_free(queryQ); sqlite3_free(candidates); return rc; } + } + + sqlite3_free(cd); + } else { + // Flat mode: scan only unassigned cells + sqlite3_stmt *stmtScan = NULL; + char *zSql = sqlite3_mprintf( + "SELECT n_vectors, validity, rowids, vectors FROM " VEC0_SHADOW_IVF_CELLS_NAME + " WHERE centroid_id = %d", + p->schemaName, p->tableName, col_idx, VEC0_IVF_UNASSIGNED_CENTROID_ID); + if (!zSql) { sqlite3_free(queryQ); sqlite3_free(candidates); return SQLITE_NOMEM; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtScan, NULL); sqlite3_free(zSql); + if (rc == SQLITE_OK) { + rc = ivf_scan_cells_from_stmt(p, col_idx, stmtScan, queryQ, qvecSize, + &candidates, &nCandidates, &cap); + 
sqlite3_finalize(stmtScan); + if (rc != SQLITE_OK) { sqlite3_free(queryQ); sqlite3_free(candidates); return rc; } + } + } + + sqlite3_free(queryQ); + + // Sort candidates by quantized distance + qsort(candidates, nCandidates, sizeof(struct IvfCandidate), ivf_candidate_cmp); + + // Oversample re-ranking: re-score top (oversample*k) with full-precision vectors + if (oversample > 1 && quantizer != VEC0_IVF_QUANTIZER_NONE && nCandidates > 0) { + i64 rescore_n = collect_k < nCandidates ? collect_k : nCandidates; + sqlite3_stmt *stmtVec = NULL; + char *zSql = sqlite3_mprintf( + "SELECT vector FROM " VEC0_SHADOW_IVF_VECTORS_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, col_idx); + if (zSql) { + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtVec, NULL); sqlite3_free(zSql); + if (rc == SQLITE_OK) { + for (i64 i = 0; i < rescore_n; i++) { + sqlite3_reset(stmtVec); + sqlite3_bind_int64(stmtVec, 1, candidates[i].rowid); + if (sqlite3_step(stmtVec) == SQLITE_ROW) { + const float *fullVec = (const float *)sqlite3_column_blob(stmtVec, 0); + int fullVecBytes = sqlite3_column_bytes(stmtVec, 0); + if (fullVec && fullVecBytes == (int)p->vector_columns[col_idx].dimensions * (int)sizeof(float)) { + candidates[i].distance = ivf_distance_float(p, col_idx, + (const float *)queryVector, fullVec); + } + } + } + sqlite3_finalize(stmtVec); + } + } + // Re-sort after re-scoring + qsort(candidates, (size_t)rescore_n, sizeof(struct IvfCandidate), ivf_candidate_cmp); + nCandidates = (int)rescore_n; + } + + qsort(candidates, nCandidates, sizeof(struct IvfCandidate), ivf_candidate_cmp); + i64 nResults = nCandidates < k ? 
nCandidates : k; + + if (nResults == 0) { + knn_data->rowids = NULL; knn_data->distances = NULL; + knn_data->k = k; knn_data->k_used = 0; knn_data->current_idx = 0; + sqlite3_free(candidates); return SQLITE_OK; + } + + knn_data->rowids = sqlite3_malloc64(nResults * sizeof(i64)); + knn_data->distances = sqlite3_malloc64(nResults * sizeof(f32)); + if (!knn_data->rowids || !knn_data->distances) { + sqlite3_free(knn_data->rowids); sqlite3_free(knn_data->distances); + sqlite3_free(candidates); return SQLITE_NOMEM; + } + for (i64 i = 0; i < nResults; i++) { + knn_data->rowids[i] = candidates[i].rowid; + knn_data->distances[i] = candidates[i].distance; + } + knn_data->k = k; knn_data->k_used = nResults; knn_data->current_idx = 0; + sqlite3_free(candidates); + return SQLITE_OK; +} + +// ============================================================================ +// Command dispatch +// ============================================================================ + +static int ivf_handle_command(vec0_vtab *p, const char *command, + int argc, sqlite3_value **argv) { + UNUSED_PARAMETER(argc); + int col_idx = -1; + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_IVF) { col_idx = i; break; } + } + if (col_idx < 0) return SQLITE_EMPTY; + + // nprobe=N — change nprobe at runtime without rebuilding + if (strncmp(command, "nprobe=", 7) == 0) { + int new_nprobe = atoi(command + 7); + if (new_nprobe < 1) { + vtab_set_error(&p->base, "nprobe must be >= 1"); + return SQLITE_ERROR; + } + p->vector_columns[col_idx].ivf.nprobe = new_nprobe; + return SQLITE_OK; + } + + if (strcmp(command, "compute-centroids") == 0) + return ivf_cmd_compute_centroids(p, col_idx, 0, VEC0_IVF_KMEANS_MAX_ITER, VEC0_IVF_KMEANS_DEFAULT_SEED); + + if (strncmp(command, "compute-centroids:", 18) == 0) { + const char *json = command + 18; + int nlist = 0, max_iter = VEC0_IVF_KMEANS_MAX_ITER; + uint32_t seed = VEC0_IVF_KMEANS_DEFAULT_SEED; + const char *pn = 
strstr(json, "\"nlist\":"); if (pn) nlist = atoi(pn + 8); + const char *pi = strstr(json, "\"max_iterations\":"); if (pi) max_iter = atoi(pi + 17); + const char *ps = strstr(json, "\"seed\":"); if (ps) seed = (uint32_t)atoi(ps + 7); + return ivf_cmd_compute_centroids(p, col_idx, nlist, max_iter, seed); + } + + if (strncmp(command, "set-centroid:", 13) == 0) { + int centroid_id = atoi(command + 13); + for (int i = 0; i < (int)(p->numVectorColumns + p->numPartitionColumns + + p->numAuxiliaryColumns + p->numMetadataColumns); i++) { + if (p->user_column_kinds[i] == SQLITE_VEC0_USER_COLUMN_KIND_VECTOR && + p->user_column_idxs[i] == col_idx) { + sqlite3_value *v = argv[2 + VEC0_COLUMN_USERN_START + i]; + if (sqlite3_value_type(v) == SQLITE_NULL) { vtab_set_error(&p->base, "set-centroid requires vector"); return SQLITE_ERROR; } + return ivf_cmd_set_centroid(p, col_idx, centroid_id, sqlite3_value_blob(v), sqlite3_value_bytes(v)); + } + } + return SQLITE_ERROR; + } + + if (strcmp(command, "assign-vectors") == 0) return ivf_cmd_assign_vectors(p, col_idx); + if (strcmp(command, "clear-centroids") == 0) return ivf_cmd_clear_centroids(p, col_idx); + return SQLITE_EMPTY; +} + +#endif /* SQLITE_VEC_IVF_C */ diff --git a/sqlite-vec.c b/sqlite-vec.c index 7079f7e..88f60b9 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -93,6 +93,10 @@ typedef size_t usize; #define COMPILER_SUPPORTS_VTAB_IN 1 #endif +#ifndef SQLITE_VEC_ENABLE_IVF +#define SQLITE_VEC_ENABLE_IVF 1 +#endif + #ifndef SQLITE_SUBTYPE #define SQLITE_SUBTYPE 0x000100000 #endif @@ -2539,6 +2543,7 @@ enum Vec0IndexType { #if SQLITE_VEC_ENABLE_RESCORE VEC0_INDEX_TYPE_RESCORE = 2, #endif + VEC0_INDEX_TYPE_IVF = 3, }; #if SQLITE_VEC_ENABLE_RESCORE @@ -2553,6 +2558,22 @@ struct Vec0RescoreConfig { }; #endif +#if SQLITE_VEC_ENABLE_IVF +enum Vec0IvfQuantizer { + VEC0_IVF_QUANTIZER_NONE = 0, + VEC0_IVF_QUANTIZER_INT8 = 1, + VEC0_IVF_QUANTIZER_BINARY = 2, +}; + +struct Vec0IvfConfig { + int nlist; // number of centroids (0 = 
deferred) + int nprobe; // cells to probe at query time + int quantizer; // VEC0_IVF_QUANTIZER_NONE / INT8 / BINARY + int oversample; // >= 1 (1 = no oversampling) +}; +#else +struct Vec0IvfConfig { char _unused; }; +#endif struct VectorColumnDefinition { char *name; @@ -2564,6 +2585,7 @@ struct VectorColumnDefinition { #if SQLITE_VEC_ENABLE_RESCORE struct Vec0RescoreConfig rescore; #endif + struct Vec0IvfConfig ivf; }; struct Vec0PartitionColumnDefinition { @@ -2715,6 +2737,12 @@ static int vec0_parse_rescore_options(struct Vec0Scanner *scanner, * @return int SQLITE_OK on success, SQLITE_EMPTY is it's not a vector column * definition, SQLITE_ERROR on error. */ +#if SQLITE_VEC_ENABLE_IVF +// Forward declaration — defined in sqlite-vec-ivf.c +static int vec0_parse_ivf_options(struct Vec0Scanner *scanner, + struct Vec0IvfConfig *config); +#endif + int vec0_parse_vector_column(const char *source, int source_length, struct VectorColumnDefinition *outColumn) { // parses a vector column definition like so: @@ -2733,6 +2761,8 @@ int vec0_parse_vector_column(const char *source, int source_length, struct Vec0RescoreConfig rescoreConfig; memset(&rescoreConfig, 0, sizeof(rescoreConfig)); #endif + struct Vec0IvfConfig ivfConfig; + memset(&ivfConfig, 0, sizeof(ivfConfig)); int dimensions; vec0_scanner_init(&scanner, source, source_length); @@ -2891,7 +2921,18 @@ int vec0_parse_vector_column(const char *source, int source_length, } } #endif - else { + else if (sqlite3_strnicmp(token.start, "ivf", indexNameLen) == 0) { +#if SQLITE_VEC_ENABLE_IVF + indexType = VEC0_INDEX_TYPE_IVF; + memset(&ivfConfig, 0, sizeof(ivfConfig)); + rc = vec0_parse_ivf_options(&scanner, &ivfConfig); + if (rc != SQLITE_OK) { + return SQLITE_ERROR; + } +#else + return SQLITE_ERROR; // IVF not compiled in +#endif + } else { // unknown index type return SQLITE_ERROR; } @@ -2914,6 +2955,7 @@ int vec0_parse_vector_column(const char *source, int source_length, #if SQLITE_VEC_ENABLE_RESCORE outColumn->rescore = 
rescoreConfig; #endif + outColumn->ivf = ivfConfig; return SQLITE_OK; } @@ -3279,6 +3321,18 @@ struct vec0_vtab { int chunk_size; +#if SQLITE_VEC_ENABLE_IVF + // IVF cached state per vector column + char *shadowIvfCellsNames[VEC0_MAX_VECTOR_COLUMNS]; // table name for blob_open + int ivfTrainedCache[VEC0_MAX_VECTOR_COLUMNS]; // -1=unknown, 0=no, 1=yes + sqlite3_stmt *stmtIvfCellMeta[VEC0_MAX_VECTOR_COLUMNS]; // SELECT n_vectors, length(validity)*8 FROM cells WHERE cell_id=? + sqlite3_stmt *stmtIvfCellUpdateN[VEC0_MAX_VECTOR_COLUMNS]; // UPDATE cells SET n_vectors=n_vectors+? WHERE cell_id=? + sqlite3_stmt *stmtIvfRowidMapInsert[VEC0_MAX_VECTOR_COLUMNS]; // INSERT INTO rowid_map(rowid,cell_id,slot) VALUES(?,?,?) + sqlite3_stmt *stmtIvfRowidMapLookup[VEC0_MAX_VECTOR_COLUMNS]; // SELECT cell_id,slot FROM rowid_map WHERE rowid=? + sqlite3_stmt *stmtIvfRowidMapDelete[VEC0_MAX_VECTOR_COLUMNS]; // DELETE FROM rowid_map WHERE rowid=? + sqlite3_stmt *stmtIvfCentroidsAll[VEC0_MAX_VECTOR_COLUMNS]; // SELECT centroid_id,centroid FROM centroids +#endif + // select latest chunk from _chunks, getting chunk_id sqlite3_stmt *stmtLatestChunk; @@ -3364,6 +3418,17 @@ void vec0_free_resources(vec0_vtab *p) { p->stmtRowidsUpdatePosition = NULL; sqlite3_finalize(p->stmtRowidsGetChunkPosition); p->stmtRowidsGetChunkPosition = NULL; + +#if SQLITE_VEC_ENABLE_IVF + for (int i = 0; i < VEC0_MAX_VECTOR_COLUMNS; i++) { + sqlite3_finalize(p->stmtIvfCellMeta[i]); p->stmtIvfCellMeta[i] = NULL; + sqlite3_finalize(p->stmtIvfCellUpdateN[i]); p->stmtIvfCellUpdateN[i] = NULL; + sqlite3_finalize(p->stmtIvfRowidMapInsert[i]); p->stmtIvfRowidMapInsert[i] = NULL; + sqlite3_finalize(p->stmtIvfRowidMapLookup[i]); p->stmtIvfRowidMapLookup[i] = NULL; + sqlite3_finalize(p->stmtIvfRowidMapDelete[i]); p->stmtIvfRowidMapDelete[i] = NULL; + sqlite3_finalize(p->stmtIvfCentroidsAll[i]); p->stmtIvfCentroidsAll[i] = NULL; + } +#endif } /** @@ -3386,6 +3451,10 @@ void vec0_free(vec0_vtab *p) { for (int i = 0; i < 
p->numVectorColumns; i++) { sqlite3_free(p->shadowVectorChunksNames[i]); p->shadowVectorChunksNames[i] = NULL; +#if SQLITE_VEC_ENABLE_IVF + sqlite3_free(p->shadowIvfCellsNames[i]); + p->shadowIvfCellsNames[i] = NULL; +#endif #if SQLITE_VEC_ENABLE_RESCORE sqlite3_free(p->shadowRescoreChunksNames[i]); @@ -3674,12 +3743,25 @@ int vec0_result_id(vec0_vtab *p, sqlite3_context *context, i64 rowid) { * will be stored. * @return int SQLITE_OK on success. */ +#if SQLITE_VEC_ENABLE_IVF +// Forward declaration — defined in sqlite-vec-ivf.c (included later) +static int ivf_get_vector_data(vec0_vtab *p, i64 rowid, int col_idx, + void **outVector, int *outVectorSize); +#endif + int vec0_get_vector_data(vec0_vtab *pVtab, i64 rowid, int vector_column_idx, void **outVector, int *outVectorSize) { vec0_vtab *p = pVtab; int rc, brc; i64 chunk_id; i64 chunk_offset; + +#if SQLITE_VEC_ENABLE_IVF + // IVF-indexed columns store vectors in _ivf_cells, not _vector_chunks + if (p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_IVF) { + return ivf_get_vector_data(p, rowid, vector_column_idx, outVector, outVectorSize); + } +#endif size_t size; void *buf = NULL; int blobOffset; @@ -4327,8 +4409,12 @@ int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk int vector_column_idx = p->user_column_idxs[i]; #if SQLITE_VEC_ENABLE_RESCORE - // Rescore columns don't use _vector_chunks for float storage - if (p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_RESCORE) { + // Rescore and IVF columns don't use _vector_chunks for float storage + if (p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_RESCORE +#if SQLITE_VEC_ENABLE_IVF + || p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_IVF +#endif + ) { continue; } #endif @@ -4500,6 +4586,12 @@ void vec0_cursor_clear(vec0_cursor *pCur) { } } +// IVF index implementation — #include'd here after all struct/helper definitions +#if SQLITE_VEC_ENABLE_IVF +#include 
"sqlite-vec-ivf-kmeans.c" +#include "sqlite-vec-ivf.c" +#endif + #define VEC_CONSTRUCTOR_ERROR "vec0 constructor error: " static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, sqlite3_vtab **ppVtab, char **pzErr, bool isCreate) { @@ -4761,6 +4853,34 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } #endif + // IVF indexes do not support auxiliary, metadata, or partition key columns. + { + int has_ivf = 0; + for (int i = 0; i < numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_IVF) { + has_ivf = 1; + break; + } + } + if (has_ivf) { + if (numPartitionColumns > 0) { + *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR + "partition key columns are not supported with IVF indexes"); + goto error; + } + if (numAuxiliaryColumns > 0) { + *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR + "auxiliary columns are not supported with IVF indexes"); + goto error; + } + if (numMetadataColumns > 0) { + *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR + "metadata columns are not supported with IVF indexes"); + goto error; + } + } + } + sqlite3_str *createStr = sqlite3_str_new(NULL); sqlite3_str_appendall(createStr, "CREATE TABLE x("); if (pkColumnName) { @@ -4866,6 +4986,15 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } #endif } +#if SQLITE_VEC_ENABLE_IVF + for (int i = 0; i < pNew->numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; + pNew->shadowIvfCellsNames[i] = + sqlite3_mprintf("%s_ivf_cells%02d", tableName, i); + if (!pNew->shadowIvfCellsNames[i]) goto error; + pNew->ivfTrainedCache[i] = -1; // unknown + } +#endif for (int i = 0; i < pNew->numMetadataColumns; i++) { pNew->shadowMetadataChunksNames[i] = sqlite3_mprintf("%s_metadatachunks%02d", tableName, i); @@ -4989,8 +5118,8 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, for (int i = 0; i < pNew->numVectorColumns; 
i++) { #if SQLITE_VEC_ENABLE_RESCORE - // Rescore columns don't use _vector_chunks - if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) + // Rescore and IVF columns don't use _vector_chunks + if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) continue; #endif char *zSql = sqlite3_mprintf(VEC0_SHADOW_VECTOR_N_CREATE, @@ -5018,6 +5147,18 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } #endif +#if SQLITE_VEC_ENABLE_IVF + // Create IVF shadow tables for IVF-indexed vector columns + for (int i = 0; i < pNew->numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; + rc = ivf_create_shadow_tables(pNew, i); + if (rc != SQLITE_OK) { + *pzErr = sqlite3_mprintf("Could not create IVF shadow tables for column %d", i); + goto error; + } + } +#endif + // See SHADOW_TABLE_ROWID_QUIRK in vec0_new_chunk() — same "rowid PRIMARY KEY" // without INTEGER type issue applies here. for (int i = 0; i < pNew->numMetadataColumns; i++) { @@ -5153,7 +5294,7 @@ static int vec0Destroy(sqlite3_vtab *pVtab) { for (int i = 0; i < p->numVectorColumns; i++) { #if SQLITE_VEC_ENABLE_RESCORE - if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) continue; #endif zSql = sqlite3_mprintf("DROP TABLE \"%w\".\"%w\"", p->schemaName, @@ -5174,6 +5315,14 @@ static int vec0Destroy(sqlite3_vtab *pVtab) { } #endif +#if SQLITE_VEC_ENABLE_IVF + // Drop IVF shadow tables + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; + ivf_drop_shadow_tables(p, i); + } +#endif + if(p->numAuxiliaryColumns > 0) { zSql = sqlite3_mprintf("DROP TABLE " VEC0_SHADOW_AUXILIARY_NAME, p->schemaName, p->tableName); rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); @@ -7186,6 +7335,21 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, } #endif +#if SQLITE_VEC_ENABLE_IVF + 
// IVF dispatch: if vector column has IVF, use IVF query instead of chunk scan + if (vector_column->index_type == VEC0_INDEX_TYPE_IVF) { + rc = ivf_query_knn(p, vectorColumnIdx, queryVector, + (int)vector_column_byte_size(*vector_column), k, knn_data); + if (rc != SQLITE_OK) { + goto cleanup; + } + pCur->knn_data = knn_data; + pCur->query_plan = VEC0_QUERY_PLAN_KNN; + rc = SQLITE_OK; + goto cleanup; + } +#endif + rc = vec0_chunks_iter(p, idxStr, argc, argv, &stmtChunks); if (rc != SQLITE_OK) { // IMP: V06942_23781 @@ -8011,8 +8175,12 @@ int vec0Update_InsertWriteFinalStep(vec0_vtab *p, i64 chunk_rowid, // Go insert the vector data into the vector chunk shadow tables for (int i = 0; i < p->numVectorColumns; i++) { #if SQLITE_VEC_ENABLE_RESCORE - // Rescore columns store float vectors in _rescore_vectors instead - if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) + // Rescore and IVF columns don't use _vector_chunks + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE +#if SQLITE_VEC_ENABLE_IVF + || p->vector_columns[i].index_type == VEC0_INDEX_TYPE_IVF +#endif + ) continue; #endif @@ -8425,6 +8593,18 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, } #endif +#if SQLITE_VEC_ENABLE_IVF + // Step #4: IVF index insert (if any vector column uses IVF) + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; + int vecSize = (int)vector_column_byte_size(p->vector_columns[i]); + rc = ivf_insert(p, i, rowid, vectorDatas[i], vecSize); + if (rc != SQLITE_OK) { + goto cleanup; + } + } +#endif + if(p->numAuxiliaryColumns > 0) { sqlite3_stmt *stmt; sqlite3_str * s = sqlite3_str_new(NULL); @@ -8616,8 +8796,8 @@ int vec0Update_Delete_ClearVectors(vec0_vtab *p, i64 chunk_id, int rc, brc; for (int i = 0; i < p->numVectorColumns; i++) { #if SQLITE_VEC_ENABLE_RESCORE - // Rescore columns don't use _vector_chunks - if (p->vector_columns[i].index_type == 
VEC0_INDEX_TYPE_RESCORE) + // Non-FLAT columns don't use _vector_chunks + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) continue; #endif sqlite3_blob *blobVectors = NULL; @@ -8732,7 +8912,7 @@ int vec0Update_Delete_DeleteChunkIfEmpty(vec0_vtab *p, i64 chunk_id, // Delete from each _vector_chunksNN for (int i = 0; i < p->numVectorColumns; i++) { #if SQLITE_VEC_ENABLE_RESCORE - if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) continue; #endif zSql = sqlite3_mprintf( @@ -9009,6 +9189,15 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { } } +#if SQLITE_VEC_ENABLE_IVF + // 7. delete from IVF index + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; + rc = ivf_delete(p, i, rowid); + if (rc != SQLITE_OK) return rc; + } +#endif + return SQLITE_OK; } @@ -9284,6 +9473,18 @@ static int vec0Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, } // INSERT operation else if (argc > 1 && sqlite3_value_type(argv[0]) == SQLITE_NULL) { +#if SQLITE_VEC_ENABLE_IVF + // Check for IVF command inserts: INSERT INTO t(rowid) VALUES ('compute-centroids') + // The id column holds the command string. 
+ sqlite3_value *idVal = argv[2 + VEC0_COLUMN_ID]; + if (sqlite3_value_type(idVal) == SQLITE_TEXT) { + const char *cmd = (const char *)sqlite3_value_text(idVal); + vec0_vtab *p = (vec0_vtab *)pVTab; + int cmdRc = ivf_handle_command(p, cmd, argc, argv); + if (cmdRc != SQLITE_EMPTY) return cmdRc; // handled (or error) + // SQLITE_EMPTY means not an IVF command — fall through to normal insert + } +#endif return vec0Update_Insert(pVTab, argc, argv, pRowid); } // UPDATE operation @@ -9431,9 +9632,15 @@ static sqlite3_module vec0Module = { #define SQLITE_VEC_DEBUG_BUILD_RESCORE "" #endif +#if SQLITE_VEC_ENABLE_IVF +#define SQLITE_VEC_DEBUG_BUILD_IVF "ivf" +#else +#define SQLITE_VEC_DEBUG_BUILD_IVF "" +#endif + #define SQLITE_VEC_DEBUG_BUILD \ SQLITE_VEC_DEBUG_BUILD_AVX " " SQLITE_VEC_DEBUG_BUILD_NEON " " \ - SQLITE_VEC_DEBUG_BUILD_RESCORE + SQLITE_VEC_DEBUG_BUILD_RESCORE " " SQLITE_VEC_DEBUG_BUILD_IVF #define SQLITE_VEC_DEBUG_STRING \ "Version: " SQLITE_VEC_VERSION "\n" \ diff --git a/tests/fuzz/Makefile b/tests/fuzz/Makefile index 0030c2e..a3405a4 100644 --- a/tests/fuzz/Makefile +++ b/tests/fuzz/Makefile @@ -93,13 +93,40 @@ $(TARGET_DIR)/rescore_quantize_edge: rescore-quantize-edge.c $(FUZZ_SRCS) | $(TA $(TARGET_DIR)/rescore_interleave: rescore-interleave.c $(FUZZ_SRCS) | $(TARGET_DIR) $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE $(FUZZ_SRCS) $< -o $@ +$(TARGET_DIR)/ivf_create: ivf-create.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_operations: ivf-operations.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_quantize: ivf-quantize.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_kmeans: ivf-kmeans.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_shadow_corrupt: ivf-shadow-corrupt.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) 
$(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_knn_deep: ivf-knn-deep.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_cell_overflow: ivf-cell-overflow.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_rescore: ivf-rescore.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + FUZZ_TARGETS = vec0_create exec json numpy \ shadow_corrupt vec0_operations scalar_functions \ vec0_create_full metadata_columns vec_each vec_mismatch \ vec0_delete_completeness \ rescore_operations rescore_create rescore_quantize \ rescore_shadow_corrupt rescore_knn_deep \ - rescore_quantize_edge rescore_interleave + rescore_quantize_edge rescore_interleave \ + ivf_create ivf_operations \ + ivf_quantize ivf_kmeans ivf_shadow_corrupt \ + ivf_knn_deep ivf_cell_overflow ivf_rescore all: $(addprefix $(TARGET_DIR)/,$(FUZZ_TARGETS)) diff --git a/tests/fuzz/ivf-cell-overflow.c b/tests/fuzz/ivf-cell-overflow.c new file mode 100644 index 0000000..4b18ba2 --- /dev/null +++ b/tests/fuzz/ivf-cell-overflow.c @@ -0,0 +1,192 @@ +/** + * Fuzz target: IVF cell overflow and boundary conditions. + * + * Pushes cells past VEC0_IVF_CELL_MAX_VECTORS (64) to trigger cell + * splitting, then exercises blob I/O at slot boundaries. + * + * Targets: + * - Cell splitting when n_vectors reaches cap (64) + * - Blob offset arithmetic: slot * vecSize, slot / 8, slot % 8 + * - Validity bitmap at byte boundaries (slot 7->8, 15->16, etc.) 
+ * - Insert into full cell -> create new cell path + * - Delete from various slot positions (first, last, middle) + * - Multiple cells per centroid + * - assign-vectors command with multi-cell centroids + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 8) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + // Use small dimensions for speed but enough vectors to overflow cells + int dim = (data[0] % 8) + 2; // 2..9 + int nlist = (data[1] % 4) + 1; // 1..4 + // We need >64 vectors to overflow a cell + int num_vecs = (data[2] % 64) + 65; // 65..128 + int delete_pattern = data[3]; // Controls which vectors to delete + + const uint8_t *payload = data + 4; + size_t payload_size = size - 4; + + char sql[256]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] indexed by ivf(nlist=%d, nprobe=%d))", + dim, nlist, nlist); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + // Insert enough vectors to overflow at least one cell + sqlite3_stmt *stmtInsert = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + if (!stmtInsert) { sqlite3_close(db); return 0; } + + size_t offset = 0; + for (int i = 0; i < num_vecs; i++) { + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + for (int d = 0; d < dim; d++) { + if (offset < payload_size) { + vec[d] = ((float)(int8_t)payload[offset++]) / 50.0f; + } else { + // Cluster vectors near specific centroids to ensure some cells overflow + int cluster = i % nlist; + vec[d] = (float)cluster + (float)(i % 10) * 0.01f + d * 0.001f; + } + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, (int64_t)(i + 1)); + 
sqlite3_bind_blob(stmtInsert, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + sqlite3_free(vec); + } + sqlite3_finalize(stmtInsert); + + // Train to assign vectors to centroids (triggers cell building) + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // Delete vectors at boundary positions based on fuzz data + // This tests validity bitmap manipulation at different slot positions + for (int i = 0; i < num_vecs; i++) { + int byte_idx = i / 8; + if (byte_idx < (int)payload_size && (payload[byte_idx] & (1 << (i % 8)))) { + // Use delete_pattern to thin deletions + if ((delete_pattern + i) % 3 == 0) { + char delsql[64]; + snprintf(delsql, sizeof(delsql), "DELETE FROM v WHERE rowid = %d", i + 1); + sqlite3_exec(db, delsql, NULL, NULL, NULL); + } + } + } + + // Insert more vectors after deletions (into cells with holes) + { + sqlite3_stmt *si = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &si, NULL); + if (si) { + for (int i = 0; i < 10; i++) { + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + for (int d = 0; d < dim; d++) + vec[d] = (float)(i + 200) * 0.01f; + sqlite3_reset(si); + sqlite3_bind_int64(si, 1, (int64_t)(num_vecs + i + 1)); + sqlite3_bind_blob(si, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(si); + sqlite3_free(vec); + } + sqlite3_finalize(si); + } + } + + // KNN query that must scan multiple cells per centroid + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = 0.0f; + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
LIMIT 20"); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + } + + // Test assign-vectors with multi-cell state + // First clear centroids + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('clear-centroids')", + NULL, NULL, NULL); + + // Set centroids manually, then assign + for (int c = 0; c < nlist; c++) { + float *cvec = sqlite3_malloc(dim * sizeof(float)); + if (!cvec) break; + for (int d = 0; d < dim; d++) cvec[d] = (float)c + d * 0.1f; + + char cmd[128]; + snprintf(cmd, sizeof(cmd), + "INSERT INTO v(rowid, emb) VALUES ('set-centroid:%d', ?)", c); + sqlite3_stmt *sc = NULL; + sqlite3_prepare_v2(db, cmd, -1, &sc, NULL); + if (sc) { + sqlite3_bind_blob(sc, 1, cvec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(sc); + sqlite3_finalize(sc); + } + sqlite3_free(cvec); + } + + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('assign-vectors')", + NULL, NULL, NULL); + + // Final query after assign-vectors + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = 1.0f; + sqlite3_stmt *sk = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
LIMIT 5", + -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + } + + // Full scan + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-create.c b/tests/fuzz/ivf-create.c new file mode 100644 index 0000000..222b67b --- /dev/null +++ b/tests/fuzz/ivf-create.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + int rc; + sqlite3 *db; + sqlite3_stmt *stmt; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + sqlite3_str *s = sqlite3_str_new(NULL); + assert(s); + sqlite3_str_appendall(s, "CREATE VIRTUAL TABLE v USING vec0(emb float[4] indexed by ivf("); + sqlite3_str_appendf(s, "%.*s", (int)size, data); + sqlite3_str_appendall(s, "))"); + const char *zSql = sqlite3_str_finish(s); + assert(zSql); + + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, NULL); + sqlite3_free((void *)zSql); + if (rc == SQLITE_OK) { + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-create.dict b/tests/fuzz/ivf-create.dict new file mode 100644 index 0000000..9a014e7 --- /dev/null +++ b/tests/fuzz/ivf-create.dict @@ -0,0 +1,16 @@ +"nlist" +"nprobe" +"quantizer" +"oversample" +"binary" +"int8" +"none" +"=" +"," +"(" +")" +"0" +"1" +"128" +"65536" +"65537" diff --git a/tests/fuzz/ivf-kmeans.c b/tests/fuzz/ivf-kmeans.c new file mode 100644 index 0000000..46804d0 --- /dev/null +++ b/tests/fuzz/ivf-kmeans.c @@ -0,0 +1,180 @@ +/** + * Fuzz target: IVF k-means clustering. 
+ * + * Builds a table, inserts fuzz-controlled vectors, then runs + * compute-centroids with fuzz-controlled parameters (nlist, max_iter, seed). + * Targets: + * - kmeans with N < k (clamping), N == 1, k == 1 + * - kmeans with duplicate/identical vectors (all distances zero) + * - kmeans with NaN/Inf vectors + * - Empty cluster reassignment path (farthest-point heuristic) + * - Large nlist relative to N + * - The compute-centroids:{json} command parsing + * - clear-centroids followed by compute-centroids (round-trip) + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 10) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + // Parse fuzz header + // Byte 0-1: dimension (1..128) + // Byte 2: nlist for CREATE (1..64) + // Byte 3: nlist override for compute-centroids (0 = use default) + // Byte 4: max_iter (1..50) + // Byte 5-8: seed + // Byte 9: num_vectors (1..64) + // Remaining: vector float data + + int dim = (data[0] | (data[1] << 8)) % 128 + 1; + int nlist_create = (data[2] % 64) + 1; + int nlist_override = data[3] % 65; // 0 means use table default + int max_iter = (data[4] % 50) + 1; + uint32_t seed = (uint32_t)data[5] | ((uint32_t)data[6] << 8) | + ((uint32_t)data[7] << 16) | ((uint32_t)data[8] << 24); + int num_vecs = (data[9] % 64) + 1; + + const uint8_t *payload = data + 10; + size_t payload_size = size - 10; + + char sql[256]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] indexed by ivf(nlist=%d, nprobe=%d))", + dim, nlist_create, nlist_create); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + // Insert vectors + sqlite3_stmt *stmtInsert = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) 
VALUES (?, ?)", -1, &stmtInsert, NULL); + if (!stmtInsert) { sqlite3_close(db); return 0; } + + size_t offset = 0; + for (int i = 0; i < num_vecs; i++) { + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + + for (int d = 0; d < dim; d++) { + if (offset + 4 <= payload_size) { + memcpy(&vec[d], payload + offset, sizeof(float)); + offset += 4; + } else if (offset < payload_size) { + // Scale to interesting range including values > 1, < -1 + vec[d] = ((float)(int8_t)payload[offset++]) / 5.0f; + } else { + // Reuse earlier bytes to fill remaining dimensions + vec[d] = (float)(i * dim + d) * 0.01f; + } + } + + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, (int64_t)(i + 1)); + sqlite3_bind_blob(stmtInsert, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + sqlite3_free(vec); + } + sqlite3_finalize(stmtInsert); + + // Exercise compute-centroids with JSON options + { + char cmd[256]; + snprintf(cmd, sizeof(cmd), + "INSERT INTO v(rowid) VALUES " + "('compute-centroids:{\"nlist\":%d,\"max_iterations\":%d,\"seed\":%u}')", + nlist_override, max_iter, seed); + sqlite3_exec(db, cmd, NULL, NULL, NULL); + } + + // KNN query after training + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) { + qvec[d] = (d < 3) ? 1.0f : 0.0f; + } + sqlite3_stmt *stmtKnn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
LIMIT 5", + -1, &stmtKnn, NULL); + if (stmtKnn) { + sqlite3_bind_blob(stmtKnn, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + sqlite3_finalize(stmtKnn); + } + sqlite3_free(qvec); + } + } + + // Clear centroids and re-compute to test round-trip + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('clear-centroids')", + NULL, NULL, NULL); + + // Insert a few more vectors in untrained state + { + sqlite3_stmt *si = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &si, NULL); + if (si) { + for (int i = 0; i < 3; i++) { + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + for (int d = 0; d < dim; d++) vec[d] = (float)(i + 100) * 0.1f; + sqlite3_reset(si); + sqlite3_bind_int64(si, 1, (int64_t)(num_vecs + i + 1)); + sqlite3_bind_blob(si, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(si); + sqlite3_free(vec); + } + sqlite3_finalize(si); + } + } + + // Re-train + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // Delete some rows after training, then query + sqlite3_exec(db, "DELETE FROM v WHERE rowid = 1", NULL, NULL, NULL); + sqlite3_exec(db, "DELETE FROM v WHERE rowid = 2", NULL, NULL, NULL); + + // Query after deletes + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = 0.5f; + sqlite3_stmt *stmtKnn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
LIMIT 10", + -1, &stmtKnn, NULL); + if (stmtKnn) { + sqlite3_bind_blob(stmtKnn, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + sqlite3_finalize(stmtKnn); + } + sqlite3_free(qvec); + } + } + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-knn-deep.c b/tests/fuzz/ivf-knn-deep.c new file mode 100644 index 0000000..27d19a1 --- /dev/null +++ b/tests/fuzz/ivf-knn-deep.c @@ -0,0 +1,199 @@ +/** + * Fuzz target: IVF KNN search deep paths. + * + * Exercises the full KNN pipeline with fuzz-controlled: + * - nprobe values (including > nlist, =1, =nlist) + * - Query vectors (including adversarial floats) + * - Mix of trained/untrained state + * - Oversample + rescore path (quantizer=int8 with oversample>1) + * - Multiple interleaved KNN queries + * - Candidate array realloc path (many vectors in probed cells) + * + * Targets: + * - ivf_scan_cells_from_stmt: candidate realloc, distance computation + * - ivf_query_knn: centroid sorting, nprobe selection + * - Oversample rescore: re-ranking with full-precision vectors + * - qsort with NaN distances + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +static uint16_t read_u16(const uint8_t *p) { + return (uint16_t)(p[0] | (p[1] << 8)); +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 16) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + // Header + int dim = (data[0] % 32) + 2; // 2..33 + int nlist = (data[1] % 16) + 1; // 1..16 + int nprobe_initial = (data[2] % 20) + 1; // 1..20 (can be > nlist) + int quantizer_type = data[3] % 3; // 0=none, 1=int8, 2=binary + int oversample = (data[4] % 4) + 1; // 1..4 + int num_vecs = (data[5] % 80) + 4; // 4..83 + int num_queries = (data[6] % 8) + 1; // 1..8 + int k_limit = (data[7] % 20) + 1; // 1..20 + + 
const uint8_t *payload = data + 8; + size_t payload_size = size - 8; + + // For binary quantizer, dimension must be multiple of 8 + if (quantizer_type == 2) { + dim = ((dim + 7) / 8) * 8; + if (dim == 0) dim = 8; + } + + const char *qname; + switch (quantizer_type) { + case 1: qname = "int8"; break; + case 2: qname = "binary"; break; + default: qname = "none"; break; + } + + // Oversample only valid with quantization + if (quantizer_type == 0) oversample = 1; + + // Cap nprobe to nlist for CREATE (parser rejects nprobe > nlist) + int nprobe_create = nprobe_initial <= nlist ? nprobe_initial : nlist; + + char sql[512]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] indexed by ivf(nlist=%d, nprobe=%d, quantizer=%s%s))", + dim, nlist, nprobe_create, qname, + oversample > 1 ? ", oversample=2" : ""); + + // If that fails (e.g. oversample with none), try without oversample + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] indexed by ivf(nlist=%d, nprobe=%d, quantizer=%s))", + dim, nlist, nprobe_create, qname); + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + } + + // Insert vectors + sqlite3_stmt *stmtInsert = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + if (!stmtInsert) { sqlite3_close(db); return 0; } + + size_t offset = 0; + for (int i = 0; i < num_vecs; i++) { + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + for (int d = 0; d < dim; d++) { + if (offset < payload_size) { + vec[d] = ((float)(int8_t)payload[offset++]) / 20.0f; + } else { + vec[d] = (float)((i * dim + d) % 256 - 128) / 128.0f; + } + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, (int64_t)(i + 1)); + sqlite3_bind_blob(stmtInsert, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + 
sqlite3_free(vec); + } + sqlite3_finalize(stmtInsert); + + // Query BEFORE training (flat scan path) + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = 0.5f; + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT %d", k_limit); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + } + + // Train + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // Change nprobe at runtime (can exceed nlist -- tests clamping in query) + { + char cmd[64]; + snprintf(cmd, sizeof(cmd), + "INSERT INTO v(rowid) VALUES ('nprobe=%d')", nprobe_initial); + sqlite3_exec(db, cmd, NULL, NULL, NULL); + } + + // Multiple KNN queries with different fuzz-derived query vectors + for (int q = 0; q < num_queries; q++) { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (!qvec) break; + for (int d = 0; d < dim; d++) { + if (offset < payload_size) { + qvec[d] = ((float)(int8_t)payload[offset++]) / 10.0f; + } else { + qvec[d] = (q == 0) ? 1.0f : 0.0f; + } + } + + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
LIMIT %d", k_limit); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + + // Delete half the vectors then query again + for (int i = 1; i <= num_vecs / 2; i++) { + char delsql[64]; + snprintf(delsql, sizeof(delsql), "DELETE FROM v WHERE rowid = %d", i); + sqlite3_exec(db, delsql, NULL, NULL, NULL); + } + + // Query after mass deletion + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = -0.5f; + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT %d", k_limit); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + } + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-operations.c b/tests/fuzz/ivf-operations.c new file mode 100644 index 0000000..a955870 --- /dev/null +++ b/tests/fuzz/ivf-operations.c @@ -0,0 +1,121 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 6) return 0; + + int rc; + sqlite3 *db; + sqlite3_stmt *stmtInsert = NULL; + sqlite3_stmt *stmtDelete = NULL; + sqlite3_stmt *stmtKnn = NULL; + sqlite3_stmt *stmtScan = NULL; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(emb float[4] indexed by ivf(nlist=4, nprobe=4))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); 
+ sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT 3", + -1, &stmtKnn, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid FROM v", -1, &stmtScan, NULL); + + if (!stmtInsert || !stmtDelete || !stmtKnn || !stmtScan) goto cleanup; + + size_t i = 0; + while (i + 2 <= size) { + uint8_t op = data[i++] % 7; + uint8_t rowid_byte = data[i++]; + int64_t rowid = (int64_t)(rowid_byte % 32) + 1; + + switch (op) { + case 0: { + // INSERT: consume 16 bytes for 4 floats, or use what's left + float vec[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int j = 0; j < 4 && i < size; j++, i++) { + vec[j] = (float)((int8_t)data[i]) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 1: { + // DELETE + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 2: { + // KNN query with a fixed query vector + float qvec[4] = {1.0f, 0.0f, 0.0f, 0.0f}; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 3: { + // Full scan + sqlite3_reset(stmtScan); + while (sqlite3_step(stmtScan) == SQLITE_ROW) {} + break; + } + case 4: { + // compute-centroids command + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('compute-centroids')", + NULL, NULL, NULL); + break; + } + case 5: { + // clear-centroids command + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('clear-centroids')", + NULL, NULL, NULL); + break; + } + case 6: { + // nprobe=N command + if (i < size) { + uint8_t n = data[i++]; + int nprobe = (n % 4) + 1; + char buf[64]; + snprintf(buf, sizeof(buf), + "INSERT INTO v(rowid) VALUES ('nprobe=%d')", nprobe); + sqlite3_exec(db, buf, NULL, NULL, NULL); + } + break; + } + } + } + 
+ // Final operations — must not crash regardless of prior state + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_finalize(stmtScan); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-quantize.c b/tests/fuzz/ivf-quantize.c new file mode 100644 index 0000000..22149ee --- /dev/null +++ b/tests/fuzz/ivf-quantize.c @@ -0,0 +1,129 @@ +/** + * Fuzz target: IVF quantization functions. + * + * Directly exercises ivf_quantize_int8 and ivf_quantize_binary with + * fuzz-controlled dimensions and float data. Targets: + * - ivf_quantize_int8: clamping, int8 overflow boundary + * - ivf_quantize_binary: D not divisible by 8, memset(D/8) undercount + * - Round-trip through CREATE TABLE + INSERT with quantized IVF + */ +#include +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 8) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + // Byte 0: quantizer type (0=int8, 1=binary) + // Byte 1: dimension (1..64, but we test edge cases) + // Byte 2: nlist (1..8) + // Byte 3: num_vectors to insert (1..32) + // Remaining: float data + int qtype = data[0] % 2; + int dim = (data[1] % 64) + 1; + int nlist = (data[2] % 8) + 1; + int num_vecs = (data[3] % 32) + 1; + const uint8_t *payload = data + 4; + size_t payload_size = size - 4; + + // For binary quantizer, D must be multiple of 8 to avoid the D/8 bug + // in production. But we explicitly want to test non-multiples too to + // find the bug. Use dim as-is. + const char *quantizer = qtype ? "binary" : "int8"; + + // Binary quantizer needs D multiple of 8 in current code, but let's + // test both valid and invalid dimensions to see what happens. 
+ // For binary with non-multiple-of-8, the code does memset(dst, 0, D/8) + // which underallocates when D%8 != 0. + char sql[256]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] indexed by ivf(nlist=%d, nprobe=%d, quantizer=%s))", + dim, nlist, nlist, quantizer); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + // Insert vectors with fuzz-controlled float values + sqlite3_stmt *stmtInsert = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + if (!stmtInsert) { sqlite3_close(db); return 0; } + + size_t offset = 0; + for (int i = 0; i < num_vecs && offset < payload_size; i++) { + // Build float vector from fuzz data + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + + for (int d = 0; d < dim; d++) { + if (offset + 4 <= payload_size) { + // Use raw bytes as float -- can produce NaN, Inf, denormals + memcpy(&vec[d], payload + offset, sizeof(float)); + offset += 4; + } else if (offset < payload_size) { + // Partial: use byte as scaled value + vec[d] = ((float)(int8_t)payload[offset++]) / 50.0f; + } else { + vec[d] = 0.0f; + } + } + + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, (int64_t)(i + 1)); + sqlite3_bind_blob(stmtInsert, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + sqlite3_free(vec); + } + sqlite3_finalize(stmtInsert); + + // Trigger compute-centroids to exercise kmeans + quantization together + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // KNN query with fuzz-derived query vector + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) { + if (offset < payload_size) { + qvec[d] = ((float)(int8_t)payload[offset++]) / 10.0f; + } else { + qvec[d] = 1.0f; + } + } + + sqlite3_stmt *stmtKnn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance 
FROM v WHERE emb MATCH ? LIMIT 5", + -1, &stmtKnn, NULL); + if (stmtKnn) { + sqlite3_bind_blob(stmtKnn, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + sqlite3_finalize(stmtKnn); + } + sqlite3_free(qvec); + } + } + + // Full scan + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-rescore.c b/tests/fuzz/ivf-rescore.c new file mode 100644 index 0000000..1c3f34a --- /dev/null +++ b/tests/fuzz/ivf-rescore.c @@ -0,0 +1,182 @@ +/** + * Fuzz target: IVF oversample + rescore path. + * + * Specifically targets the code path where quantizer != none AND + * oversample > 1, which triggers: + * 1. Quantized KNN scan to collect oversample*k candidates + * 2. Full-precision vector lookup from _ivf_vectors table + * 3. Re-scoring with float32 distances + * 4. Re-sort and truncation + * + * This path has the most complex memory management in the KNN query: + * - Two separate distance computations (quantized + float) + * - Cross-table lookups (cells + vectors KV store) + * - Candidate array resizing + * - qsort over partially re-scored arrays + * + * Also tests the int8 + binary quantization round-trip fidelity + * under adversarial float inputs. 
+ */ +#include +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 12) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + // Header + int quantizer_type = (data[0] % 2) + 1; // 1=int8, 2=binary (never none) + int dim = (data[1] % 32) + 8; // 8..39 + int nlist = (data[2] % 8) + 1; // 1..8 + int oversample = (data[3] % 4) + 2; // 2..5 (always > 1) + int num_vecs = (data[4] % 60) + 8; // 8..67 + int k_limit = (data[5] % 15) + 1; // 1..15 + + const uint8_t *payload = data + 6; + size_t payload_size = size - 6; + + // Binary quantizer needs D multiple of 8 + if (quantizer_type == 2) { + dim = ((dim + 7) / 8) * 8; + } + + const char *qname = (quantizer_type == 1) ? "int8" : "binary"; + + char sql[512]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] indexed by ivf(nlist=%d, nprobe=%d, quantizer=%s, oversample=%d))", + dim, nlist, nlist, qname, oversample); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + // Insert vectors with diverse values + sqlite3_stmt *stmtInsert = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + if (!stmtInsert) { sqlite3_close(db); return 0; } + + size_t offset = 0; + for (int i = 0; i < num_vecs; i++) { + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + for (int d = 0; d < dim; d++) { + if (offset + 4 <= payload_size) { + // Use raw bytes as float for adversarial values + memcpy(&vec[d], payload + offset, sizeof(float)); + offset += 4; + // Sanitize: replace NaN/Inf with bounded values to avoid + // poisoning the entire computation. We want edge values, + // not complete nonsense. 
+ if (isnan(vec[d]) || isinf(vec[d])) { + vec[d] = (vec[d] > 0) ? 1e6f : -1e6f; + if (isnan(vec[d])) vec[d] = 0.0f; + } + } else if (offset < payload_size) { + vec[d] = ((float)(int8_t)payload[offset++]) / 30.0f; + } else { + vec[d] = (float)(i * dim + d) * 0.001f; + } + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, (int64_t)(i + 1)); + sqlite3_bind_blob(stmtInsert, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + sqlite3_free(vec); + } + sqlite3_finalize(stmtInsert); + + // Train + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // Multiple KNN queries to exercise rescore path + for (int q = 0; q < 4; q++) { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (!qvec) break; + for (int d = 0; d < dim; d++) { + if (offset < payload_size) { + qvec[d] = ((float)(int8_t)payload[offset++]) / 10.0f; + } else { + qvec[d] = (q == 0) ? 1.0f : (q == 1) ? -1.0f : 0.0f; + } + } + + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT %d", k_limit); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + + // Delete some vectors, then query again (rescore with missing _ivf_vectors rows) + for (int i = 1; i <= num_vecs / 3; i++) { + char delsql[64]; + snprintf(delsql, sizeof(delsql), "DELETE FROM v WHERE rowid = %d", i); + sqlite3_exec(db, delsql, NULL, NULL, NULL); + } + + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = 0.5f; + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
LIMIT %d", k_limit); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + } + + // Retrain after deletions + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // Query after retrain + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = -0.3f; + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT %d", k_limit); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + } + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-shadow-corrupt.c b/tests/fuzz/ivf-shadow-corrupt.c new file mode 100644 index 0000000..1153ac9 --- /dev/null +++ b/tests/fuzz/ivf-shadow-corrupt.c @@ -0,0 +1,228 @@ +/** + * Fuzz target: IVF shadow table corruption. + * + * Creates a trained IVF table, then corrupts IVF shadow table blobs + * (centroids, cells validity/rowids/vectors, rowid_map) with fuzz data. + * Then exercises all read/write paths. Must not crash. 
+ * + * Targets: + * - Cell validity bitmap with wrong size + * - Cell rowids blob with wrong size/alignment + * - Cell vectors blob with wrong size + * - Centroid blob with wrong size + * - n_vectors inconsistent with validity bitmap + * - Missing rowid_map entries + * - KNN scan over corrupted cells + * - Insert/delete with corrupted rowid_map + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 4) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + // Create IVF table and insert enough vectors to train + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[8] indexed by ivf(nlist=2, nprobe=2))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + // Insert 10 vectors + { + sqlite3_stmt *si = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &si, NULL); + if (!si) { sqlite3_close(db); return 0; } + for (int i = 0; i < 10; i++) { + float vec[8]; + for (int d = 0; d < 8; d++) { + vec[d] = (float)(i * 8 + d) * 0.1f; + } + sqlite3_reset(si); + sqlite3_bind_int64(si, 1, i + 1); + sqlite3_bind_blob(si, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(si); + } + sqlite3_finalize(si); + } + + // Train + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // Now corrupt shadow tables based on fuzz input + uint8_t target = data[0] % 10; + const uint8_t *payload = data + 1; + int payload_size = (int)(size - 1); + + // Limit payload to avoid huge allocations + if (payload_size > 4096) payload_size = 4096; + + sqlite3_stmt *stmt = NULL; + + switch (target) { + case 0: { + // Corrupt cell validity blob + rc = sqlite3_prepare_v2(db, + "UPDATE v_ivf_cells00 SET validity = ? 
WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); + sqlite3_step(stmt); sqlite3_finalize(stmt); + } + break; + } + case 1: { + // Corrupt cell rowids blob + rc = sqlite3_prepare_v2(db, + "UPDATE v_ivf_cells00 SET rowids = ? WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); + sqlite3_step(stmt); sqlite3_finalize(stmt); + } + break; + } + case 2: { + // Corrupt cell vectors blob + rc = sqlite3_prepare_v2(db, + "UPDATE v_ivf_cells00 SET vectors = ? WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); + sqlite3_step(stmt); sqlite3_finalize(stmt); + } + break; + } + case 3: { + // Corrupt centroid blob + rc = sqlite3_prepare_v2(db, + "UPDATE v_ivf_centroids00 SET centroid = ? WHERE centroid_id = 0", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); + sqlite3_step(stmt); sqlite3_finalize(stmt); + } + break; + } + case 4: { + // Set n_vectors to a bogus value (larger than cell capacity) + int bogus_n = 99999; + if (payload_size >= 4) { + memcpy(&bogus_n, payload, 4); + bogus_n = abs(bogus_n) % 100000; + } + char sql[128]; + snprintf(sql, sizeof(sql), + "UPDATE v_ivf_cells00 SET n_vectors = %d WHERE rowid = 1", bogus_n); + sqlite3_exec(db, sql, NULL, NULL, NULL); + break; + } + case 5: { + // Delete rowid_map entries (orphan vectors) + sqlite3_exec(db, + "DELETE FROM v_ivf_rowid_map00 WHERE rowid IN (1, 2, 3)", + NULL, NULL, NULL); + break; + } + case 6: { + // Corrupt rowid_map slot values + char sql[128]; + int bogus_slot = payload_size > 0 ? 
(int)payload[0] * 1000 : 99999; + snprintf(sql, sizeof(sql), + "UPDATE v_ivf_rowid_map00 SET slot = %d WHERE rowid = 1", bogus_slot); + sqlite3_exec(db, sql, NULL, NULL, NULL); + break; + } + case 7: { + // Corrupt rowid_map cell_id values + sqlite3_exec(db, + "UPDATE v_ivf_rowid_map00 SET cell_id = 99999 WHERE rowid = 1", + NULL, NULL, NULL); + break; + } + case 8: { + // Delete all centroids (make trained but no centroids) + sqlite3_exec(db, + "DELETE FROM v_ivf_centroids00", + NULL, NULL, NULL); + break; + } + case 9: { + // Set validity to NULL + sqlite3_exec(db, + "UPDATE v_ivf_cells00 SET validity = NULL WHERE rowid = 1", + NULL, NULL, NULL); + break; + } + } + + // Exercise all read paths over corrupted state — must not crash + float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + + // KNN query + { + sqlite3_stmt *sk = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT 5", + -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + } + + // Full scan + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + + // Point query + sqlite3_exec(db, "SELECT * FROM v WHERE rowid = 1", NULL, NULL, NULL); + sqlite3_exec(db, "SELECT * FROM v WHERE rowid = 5", NULL, NULL, NULL); + + // Delete + sqlite3_exec(db, "DELETE FROM v WHERE rowid = 3", NULL, NULL, NULL); + + // Insert after corruption + { + float newvec[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; + sqlite3_stmt *si = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &si, NULL); + if (si) { + sqlite3_bind_int64(si, 1, 100); + sqlite3_bind_blob(si, 2, newvec, sizeof(newvec), SQLITE_STATIC); + sqlite3_step(si); + sqlite3_finalize(si); + } + } + + // compute-centroids over corrupted state + sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // clear-centroids + 
sqlite3_exec(db, + "INSERT INTO v(rowid) VALUES ('clear-centroids')", + NULL, NULL, NULL); + + sqlite3_close(db); + return 0; +} diff --git a/tests/sqlite-vec-internal.h b/tests/sqlite-vec-internal.h index ca04b74..67f1370 100644 --- a/tests/sqlite-vec-internal.h +++ b/tests/sqlite-vec-internal.h @@ -5,6 +5,10 @@ #include #include +#ifndef SQLITE_VEC_ENABLE_IVF +#define SQLITE_VEC_ENABLE_IVF 1 +#endif + int min_idx( const float *distances, int32_t n, @@ -68,8 +72,36 @@ enum Vec0IndexType { #ifdef SQLITE_VEC_ENABLE_RESCORE VEC0_INDEX_TYPE_RESCORE = 2, #endif + VEC0_INDEX_TYPE_IVF = 3, }; +enum Vec0RescoreQuantizerType { + VEC0_RESCORE_QUANTIZER_BIT = 1, + VEC0_RESCORE_QUANTIZER_INT8 = 2, +}; + +struct Vec0RescoreConfig { + enum Vec0RescoreQuantizerType quantizer_type; + int oversample; +}; + +#if SQLITE_VEC_ENABLE_IVF +enum Vec0IvfQuantizer { + VEC0_IVF_QUANTIZER_NONE = 0, + VEC0_IVF_QUANTIZER_INT8 = 1, + VEC0_IVF_QUANTIZER_BINARY = 2, +}; + +struct Vec0IvfConfig { + int nlist; + int nprobe; + int quantizer; + int oversample; +}; +#else +struct Vec0IvfConfig { char _unused; }; +#endif + #ifdef SQLITE_VEC_ENABLE_RESCORE enum Vec0RescoreQuantizerType { VEC0_RESCORE_QUANTIZER_BIT = 1, @@ -93,6 +125,7 @@ struct VectorColumnDefinition { #ifdef SQLITE_VEC_ENABLE_RESCORE struct Vec0RescoreConfig rescore; #endif + struct Vec0IvfConfig ivf; }; int vec0_parse_vector_column(const char *source, int source_length, @@ -114,6 +147,10 @@ void _test_rescore_quantize_float_to_int8(const float *src, int8_t *dst, size_t size_t _test_rescore_quantized_byte_size_bit(size_t dimensions); size_t _test_rescore_quantized_byte_size_int8(size_t dimensions); #endif +#if SQLITE_VEC_ENABLE_IVF +void ivf_quantize_int8(const float *src, int8_t *dst, int D); +void ivf_quantize_binary(const float *src, uint8_t *dst, int D); +#endif #endif #endif /* SQLITE_VEC_INTERNAL_H */ diff --git a/tests/test-ivf-mutations.py b/tests/test-ivf-mutations.py new file mode 100644 index 0000000..5c61119 --- /dev/null 
+++ b/tests/test-ivf-mutations.py @@ -0,0 +1,575 @@ +""" +Thorough IVF mutation tests: insert, delete, update, KNN correctness, +error cases, edge cases, and cell overflow scenarios. +""" +import pytest +import sqlite3 +import struct +import math +from helpers import _f32, exec + + +@pytest.fixture() +def db(): + db = sqlite3.connect(":memory:") + db.row_factory = sqlite3.Row + db.enable_load_extension(True) + db.load_extension("dist/vec0") + db.enable_load_extension(False) + return db + + +def ivf_total_vectors(db, table="t", col=0): + """Count total vectors across all IVF cells.""" + return db.execute( + f"SELECT COALESCE(SUM(n_vectors), 0) FROM {table}_ivf_cells{col:02d}" + ).fetchone()[0] + + +def ivf_unassigned_count(db, table="t", col=0): + return db.execute( + f"SELECT COALESCE(SUM(n_vectors), 0) FROM {table}_ivf_cells{col:02d} WHERE centroid_id = -1" + ).fetchone()[0] + + +def ivf_assigned_count(db, table="t", col=0): + return db.execute( + f"SELECT COALESCE(SUM(n_vectors), 0) FROM {table}_ivf_cells{col:02d} WHERE centroid_id >= 0" + ).fetchone()[0] + + +def knn(db, query, k, table="t", col="v"): + """Run a KNN query and return list of (rowid, distance) tuples.""" + rows = db.execute( + f"SELECT rowid, distance FROM {table} WHERE {col} MATCH ? 
AND k = ?", + [_f32(query), k], + ).fetchall() + return [(r[0], r[1]) for r in rows] + + +# ============================================================================ +# Single row insert + KNN +# ============================================================================ + + +def test_insert_single_row_knn(db): + db.execute("CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf())") + db.execute("INSERT INTO t(rowid, v) VALUES (1, ?)", [_f32([1, 0, 0, 0])]) + results = knn(db, [1, 0, 0, 0], 5) + assert len(results) == 1 + assert results[0][0] == 1 + assert results[0][1] < 0.001 + + +# ============================================================================ +# Batch insert + KNN recall +# ============================================================================ + + +def test_batch_insert_knn_recall(db): + """Insert 200 vectors, train, verify KNN recall with nprobe=nlist.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=8, nprobe=8))" + ) + for i in range(200): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([float(i), 0, 0, 0])], + ) + assert ivf_total_vectors(db) == 200 + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + assert ivf_assigned_count(db) == 200 + + # Query near 100 -- closest should be rowid 100 + results = knn(db, [100.0, 0, 0, 0], 10) + assert len(results) == 10 + assert results[0][0] == 100 + assert results[0][1] < 0.01 + + # All results should be near 100 + rowids = {r[0] for r in results} + assert all(95 <= r <= 105 for r in rowids) + + +# ============================================================================ +# Delete rows, verify they're gone from KNN +# ============================================================================ + + +def test_delete_rows_gone_from_knn(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=2, nprobe=2))" + ) + for i in range(20): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, 
?)", + [i, _f32([float(i), 0, 0, 0])], + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + # Delete rowid 10 + db.execute("DELETE FROM t WHERE rowid = 10") + + results = knn(db, [10.0, 0, 0, 0], 20) + rowids = {r[0] for r in results} + assert 10 not in rowids + + +def test_delete_all_rows_empty_results(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=2, nprobe=2))" + ) + for i in range(10): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([float(i), 0, 0, 0])], + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + for i in range(10): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + assert ivf_total_vectors(db) == 0 + results = knn(db, [5.0, 0, 0, 0], 10) + assert len(results) == 0 + + +# ============================================================================ +# Insert after delete (reuse rowids) +# ============================================================================ + + +def test_insert_after_delete_reuse_rowid(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=2, nprobe=2))" + ) + for i in range(10): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([float(i), 0, 0, 0])], + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + # Delete rowid 5 + db.execute("DELETE FROM t WHERE rowid = 5") + + # Re-insert rowid 5 with a very different vector + db.execute( + "INSERT INTO t(rowid, v) VALUES (5, ?)", [_f32([999.0, 0, 0, 0])] + ) + + # KNN near 999 should find rowid 5 + results = knn(db, [999.0, 0, 0, 0], 1) + assert len(results) >= 1 + assert results[0][0] == 5 + + +# ============================================================================ +# Update vectors (delete + re-insert), verify KNN reflects new values +# ============================================================================ + + +def test_update_vector_via_delete_insert(db): + """vec0 IVF update: delete then 
re-insert with new vector.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=2, nprobe=2))" + ) + for i in range(10): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([float(i), 0, 0, 0])], + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + # "Update" rowid 3: delete and re-insert with new vector + db.execute("DELETE FROM t WHERE rowid = 3") + db.execute( + "INSERT INTO t(rowid, v) VALUES (3, ?)", + [_f32([100.0, 0, 0, 0])], + ) + + # KNN near 100 should find rowid 3 + results = knn(db, [100.0, 0, 0, 0], 1) + assert results[0][0] == 3 + + +# ============================================================================ +# Error cases: IVF + auxiliary/metadata/partition key columns +# ============================================================================ + + +def test_error_ivf_with_auxiliary_column(db): + result = exec( + db, + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(), +extra text)", + ) + assert "error" in result + assert "auxiliary" in result.get("message", "").lower() + + +def test_error_ivf_with_metadata_column(db): + result = exec( + db, + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(), genre text)", + ) + assert "error" in result + assert "metadata" in result.get("message", "").lower() + + +def test_error_ivf_with_partition_key(db): + result = exec( + db, + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(), user_id integer partition key)", + ) + assert "error" in result + assert "partition" in result.get("message", "").lower() + + +def test_flat_with_auxiliary_still_works(db): + """Regression guard: flat-indexed tables with aux columns should still work.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4], +extra text)" + ) + db.execute( + "INSERT INTO t(rowid, v, extra) VALUES (1, ?, 'hello')", + [_f32([1, 0, 0, 0])], + ) + row = db.execute("SELECT extra FROM t WHERE rowid = 1").fetchone() + assert row[0] == 
"hello" + + +def test_flat_with_metadata_still_works(db): + """Regression guard: flat-indexed tables with metadata columns should still work.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4], genre text)" + ) + db.execute( + "INSERT INTO t(rowid, v, genre) VALUES (1, ?, 'rock')", + [_f32([1, 0, 0, 0])], + ) + row = db.execute("SELECT genre FROM t WHERE rowid = 1").fetchone() + assert row[0] == "rock" + + +def test_flat_with_partition_key_still_works(db): + """Regression guard: flat-indexed tables with partition key should still work.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4], user_id integer partition key)" + ) + db.execute( + "INSERT INTO t(rowid, v, user_id) VALUES (1, ?, 42)", + [_f32([1, 0, 0, 0])], + ) + row = db.execute("SELECT user_id FROM t WHERE rowid = 1").fetchone() + assert row[0] == 42 + + +# ============================================================================ +# Edge cases +# ============================================================================ + + +def test_zero_vectors(db): + """Insert zero vectors, verify KNN still works.""" + db.execute("CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf())") + for i in range(5): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([0, 0, 0, 0])], + ) + results = knn(db, [0, 0, 0, 0], 5) + assert len(results) == 5 + # All distances should be 0 + for _, dist in results: + assert dist < 0.001 + + +def test_large_values(db): + """Insert vectors with very large and small values.""" + db.execute("CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf())") + db.execute( + "INSERT INTO t(rowid, v) VALUES (1, ?)", [_f32([1e6, 1e6, 1e6, 1e6])] + ) + db.execute( + "INSERT INTO t(rowid, v) VALUES (2, ?)", [_f32([1e-6, 1e-6, 1e-6, 1e-6])] + ) + db.execute( + "INSERT INTO t(rowid, v) VALUES (3, ?)", [_f32([-1e6, -1e6, -1e6, -1e6])] + ) + + results = knn(db, [1e6, 1e6, 1e6, 1e6], 3) + assert results[0][0] == 1 + + +def 
test_single_row_compute_centroids(db): + """Single row table, compute-centroids should still work.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=1))" + ) + db.execute( + "INSERT INTO t(rowid, v) VALUES (1, ?)", [_f32([1, 2, 3, 4])] + ) + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + assert ivf_assigned_count(db) == 1 + + results = knn(db, [1, 2, 3, 4], 1) + assert len(results) == 1 + assert results[0][0] == 1 + + +# ============================================================================ +# Cell overflow (many vectors in one cell) +# ============================================================================ + + +def test_cell_overflow_many_vectors(db): + """Insert >64 vectors that all go to same centroid. Should create multiple cells.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=0))" + ) + # Insert 100 very similar vectors + for i in range(100): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([1.0 + i * 0.001, 0, 0, 0])], + ) + + # Set a single centroid so all vectors go there + db.execute( + "INSERT INTO t(rowid, v) VALUES ('set-centroid:0', ?)", + [_f32([1.0, 0, 0, 0])], + ) + db.execute("INSERT INTO t(rowid) VALUES ('assign-vectors')") + + assert ivf_assigned_count(db) == 100 + + # Should have more than 1 cell (64 max per cell) + cell_count = db.execute( + "SELECT count(*) FROM t_ivf_cells00 WHERE centroid_id = 0" + ).fetchone()[0] + assert cell_count >= 2 # 100 / 64 = 2 cells needed + + # All vectors should be queryable + results = knn(db, [1.0, 0, 0, 0], 100) + assert len(results) == 100 + + +# ============================================================================ +# Large batch with training +# ============================================================================ + + +def test_large_batch_with_training(db): + """Insert 500, train, insert 500 more, verify total is 1000.""" + db.execute( + "CREATE VIRTUAL TABLE t USING 
vec0(v float[4] indexed by ivf(nlist=16, nprobe=16))" + ) + for i in range(500): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([float(i), 0, 0, 0])], + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + for i in range(500, 1000): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([float(i), 0, 0, 0])], + ) + + assert ivf_total_vectors(db) == 1000 + + # KNN should still work + results = knn(db, [750.0, 0, 0, 0], 5) + assert len(results) == 5 + assert results[0][0] == 750 + + +# ============================================================================ +# KNN after interleaved insert/delete +# ============================================================================ + + +def test_knn_after_interleaved_insert_delete(db): + """Insert 20, train, delete 10 closest to query, verify remaining.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=4, nprobe=4))" + ) + for i in range(20): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([float(i), 0, 0, 0])], + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + # Delete rowids 0-9 (closest to query at 5.0) + for i in range(10): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + results = knn(db, [5.0, 0, 0, 0], 10) + rowids = {r[0] for r in results} + # None of the deleted rowids should appear + assert all(r >= 10 for r in rowids) + assert len(results) == 10 + + +def test_knn_empty_centroids_after_deletes(db): + """Some centroids may become empty after deletes. 
Should not crash.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=4, nprobe=2))" + ) + # Insert vectors at 10 distinct points (0, 10, ..., 90), 4 copies each + for i in range(40): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([float(i % 10) * 10, 0, 0, 0])], + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + # Delete a bunch, potentially emptying some centroids + for i in range(30): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + # Should not crash even with empty centroids + results = knn(db, [50.0, 0, 0, 0], 20) + assert len(results) <= 10 # only 10 left + + +# ============================================================================ +# KNN returns correct distances +# ============================================================================ + + +def test_knn_correct_distances(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=2, nprobe=2))" + ) + db.execute("INSERT INTO t(rowid, v) VALUES (1, ?)", [_f32([0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, v) VALUES (2, ?)", [_f32([3, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, v) VALUES (3, ?)", [_f32([0, 4, 0, 0])]) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + results = knn(db, [0, 0, 0, 0], 3) + result_map = {r[0]: r[1] for r in results} + + # L2 distances (sqrt of sum of squared differences) + assert abs(result_map[1] - 0.0) < 0.01 + assert abs(result_map[2] - 3.0) < 0.01 # sqrt(3^2) = 3 + assert abs(result_map[3] - 4.0) < 0.01 # sqrt(4^2) = 4 + + +# ============================================================================ +# Delete in flat mode leaves no orphan rowid_map entries +# ============================================================================ + + +def test_delete_flat_mode_rowid_map_count(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=4))" + ) + for i in range(5): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", 
+ [i, _f32([float(i), 0, 0, 0])], + ) + + db.execute("DELETE FROM t WHERE rowid = 0") + db.execute("DELETE FROM t WHERE rowid = 2") + db.execute("DELETE FROM t WHERE rowid = 4") + + assert db.execute("SELECT count(*) FROM t_ivf_rowid_map00").fetchone()[0] == 2 + assert ivf_unassigned_count(db) == 2 + + +# ============================================================================ +# Duplicate rowid insert +# ============================================================================ + + +def test_delete_reinsert_as_update(db): + """Simulate update via delete + insert on same rowid.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=2, nprobe=2))" + ) + db.execute("INSERT INTO t(rowid, v) VALUES (1, ?)", [_f32([1, 0, 0, 0])]) + + # Delete then re-insert as "update" + db.execute("DELETE FROM t WHERE rowid = 1") + db.execute("INSERT INTO t(rowid, v) VALUES (1, ?)", [_f32([99, 0, 0, 0])]) + + results = knn(db, [99, 0, 0, 0], 1) + assert len(results) == 1 + assert results[0][0] == 1 + assert results[0][1] < 0.01 + + +def test_duplicate_rowid_insert_fails(db): + """Inserting a duplicate rowid should fail with a constraint error.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=2, nprobe=2))" + ) + db.execute("INSERT INTO t(rowid, v) VALUES (1, ?)", [_f32([1, 0, 0, 0])]) + + result = exec( + db, + "INSERT INTO t(rowid, v) VALUES (1, ?)", + [_f32([99, 0, 0, 0])], + ) + assert "error" in result + + +# ============================================================================ +# Interleaved insert/delete with KNN correctness +# ============================================================================ + + +def test_interleaved_ops_correctness(db): + """Complex sequence of inserts and deletes, verify KNN is always correct.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=4, nprobe=4))" + ) + + # Phase 1: Insert 50 vectors + for i in range(50): + db.execute( + 
"INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([float(i), 0, 0, 0])], + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + # Phase 2: Delete even-numbered rowids + for i in range(0, 50, 2): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + # Phase 3: Insert new vectors with higher rowids + for i in range(50, 75): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([float(i), 0, 0, 0])], + ) + + # Phase 4: Delete some of the new ones + for i in range(60, 70): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + # KNN query: should only find existing vectors + results = knn(db, [25.0, 0, 0, 0], 50) + rowids = {r[0] for r in results} + + # Verify no deleted rowids appear + deleted = set(range(0, 50, 2)) | set(range(60, 70)) + assert len(rowids & deleted) == 0 + + # Verify every result is a live row (25 odd + 25 new - 10 deleted new = 40 alive) + expected_alive = set(range(1, 50, 2)) | set(range(50, 60)) | set(range(70, 75)) + assert rowids.issubset(expected_alive) diff --git a/tests/test-ivf-quantization.py b/tests/test-ivf-quantization.py new file mode 100644 index 0000000..9790680 --- /dev/null +++ b/tests/test-ivf-quantization.py @@ -0,0 +1,255 @@ +import pytest +import sqlite3 +from helpers import _f32, exec + + +@pytest.fixture() +def db(): + db = sqlite3.connect(":memory:") + db.row_factory = sqlite3.Row + db.enable_load_extension(True) + db.load_extension("dist/vec0") + db.enable_load_extension(False) + return db + + +# ============================================================================ +# Parser tests — quantizer and oversample options +# ============================================================================ + + +def test_ivf_quantizer_binary(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + "v float[768] indexed by ivf(nlist=64, quantizer=binary, oversample=10))" + ) + tables = [ + r[0] + for r in db.execute( + "SELECT name FROM sqlite_master WHERE type='table' ORDER BY 1" + ).fetchall() + ] + 
assert "t_ivf_centroids00" in tables + assert "t_ivf_cells00" in tables + + +def test_ivf_quantizer_int8(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + "v float[768] indexed by ivf(nlist=64, quantizer=int8))" + ) + tables = [ + r[0] + for r in db.execute( + "SELECT name FROM sqlite_master WHERE type='table' ORDER BY 1" + ).fetchall() + ] + assert "t_ivf_centroids00" in tables + + +def test_ivf_quantizer_none_explicit(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + "v float[768] indexed by ivf(quantizer=none))" + ) + # Should work — same as no quantizer + tables = [ + r[0] + for r in db.execute( + "SELECT name FROM sqlite_master WHERE type='table' ORDER BY 1" + ).fetchall() + ] + assert "t_ivf_centroids00" in tables + + +def test_ivf_quantizer_all_params(db): + """All params together: nlist, nprobe, quantizer, oversample.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + "v float[768] distance_metric=cosine " + "indexed by ivf(nlist=128, nprobe=16, quantizer=int8, oversample=4))" + ) + tables = [ + r[0] + for r in db.execute( + "SELECT name FROM sqlite_master WHERE type='table' ORDER BY 1" + ).fetchall() + ] + assert "t_ivf_centroids00" in tables + + +def test_ivf_error_oversample_without_quantizer(db): + """oversample > 1 without quantizer should error.""" + result = exec( + db, + "CREATE VIRTUAL TABLE t USING vec0(" + "v float[768] indexed by ivf(oversample=10))", + ) + assert "error" in result + + +def test_ivf_error_unknown_quantizer(db): + result = exec( + db, + "CREATE VIRTUAL TABLE t USING vec0(" + "v float[768] indexed by ivf(quantizer=pq))", + ) + assert "error" in result + + +def test_ivf_oversample_1_without_quantizer_ok(db): + """oversample=1 (default) is fine without quantizer.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + "v float[768] indexed by ivf(nlist=64))" + ) + # Should succeed — oversample defaults to 1 + + +# ============================================================================ +# Functional 
tests — insert, train, query with quantized IVF +# ============================================================================ + + +def test_ivf_int8_insert_and_query(db): + """int8 quantized IVF: insert, train, query.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + "v float[4] indexed by ivf(nlist=2, quantizer=int8, oversample=4))" + ) + for i in range(20): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", [i, _f32([i, 0, 0, 0])] + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + # Should be able to query + rows = db.execute( + "SELECT rowid, distance FROM t WHERE v MATCH ? AND k = 5", + [_f32([10.0, 0, 0, 0])], + ).fetchall() + assert len(rows) == 5 + # Top result should be close to 10 + assert rows[0][0] in range(8, 13) + + # Full vectors should be in _ivf_vectors table + fv_count = db.execute("SELECT count(*) FROM t_ivf_vectors00").fetchone()[0] + assert fv_count == 20 + + +def test_ivf_binary_insert_and_query(db): + """Binary quantized IVF: insert, train, query.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + "v float[8] indexed by ivf(nlist=2, quantizer=binary, oversample=4))" + ) + for i in range(20): + # Vectors with varying sign patterns + v = [(i * 0.1 - 1.0) + j * 0.3 for j in range(8)] + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", [i, _f32(v)] + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + rows = db.execute( + "SELECT rowid FROM t WHERE v MATCH ? 
AND k = 5", + [_f32([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5])], + ).fetchall() + assert len(rows) == 5 + + # Full vectors stored + fv_count = db.execute("SELECT count(*) FROM t_ivf_vectors00").fetchone()[0] + assert fv_count == 20 + + +def test_ivf_int8_cell_sizes_smaller(db): + """Cell blobs should be smaller with int8 quantization.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + "v float[768] indexed by ivf(nlist=2, quantizer=int8, oversample=1))" + ) + for i in range(10): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([float(x) / 768 for x in range(768)])], + ) + + # Cell vectors blob: 10 vectors at int8 = 10 * 768 = 7680 bytes + # vs float32 = 10 * 768 * 4 = 30720 bytes + # But cells have capacity 64, so blob = 64 * 768 = 49152 (int8) vs 64*768*4=196608 (float32) + blob_size = db.execute( + "SELECT length(vectors) FROM t_ivf_cells00 LIMIT 1" + ).fetchone()[0] + # int8: 64 slots * 768 bytes = 49152 + assert blob_size == 64 * 768 + + +def test_ivf_binary_cell_sizes_smaller(db): + """Cell blobs should be much smaller with binary quantization.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + "v float[768] indexed by ivf(nlist=2, quantizer=binary, oversample=1))" + ) + for i in range(10): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([float(x) / 768 for x in range(768)])], + ) + + blob_size = db.execute( + "SELECT length(vectors) FROM t_ivf_cells00 LIMIT 1" + ).fetchone()[0] + # binary: 64 slots * 768/8 bytes = 6144 + assert blob_size == 64 * (768 // 8) + + +def test_ivf_int8_oversample_improves_recall(db): + """Oversample re-ranking should improve recall over oversample=1.""" + # Create two tables: one with oversample=1, one with oversample=10 + db.execute( + "CREATE VIRTUAL TABLE t1 USING vec0(" + "v float[4] indexed by ivf(nlist=4, quantizer=int8, oversample=1))" + ) + db.execute( + "CREATE VIRTUAL TABLE t2 USING vec0(" + "v float[4] indexed by ivf(nlist=4, quantizer=int8, oversample=10))" + ) + for i 
in range(100): + v = _f32([i * 0.1, (i * 0.1) % 3, (i * 0.3) % 5, i * 0.01]) + db.execute("INSERT INTO t1(rowid, v) VALUES (?, ?)", [i, v]) + db.execute("INSERT INTO t2(rowid, v) VALUES (?, ?)", [i, v]) + + db.execute("INSERT INTO t1(rowid) VALUES ('compute-centroids')") + db.execute("INSERT INTO t2(rowid) VALUES ('compute-centroids')") + db.execute("INSERT INTO t1(rowid) VALUES ('nprobe=4')") + db.execute("INSERT INTO t2(rowid) VALUES ('nprobe=4')") + + query = _f32([5.0, 1.5, 2.5, 0.5]) + r1 = db.execute("SELECT rowid FROM t1 WHERE v MATCH ? AND k=10", [query]).fetchall() + r2 = db.execute("SELECT rowid FROM t2 WHERE v MATCH ? AND k=10", [query]).fetchall() + + # Both should return 10 results + assert len(r1) == 10 + assert len(r2) == 10 + # oversample=10 should have at least as good recall (same or better ordering) + + +def test_ivf_quantized_delete(db): + """Delete should remove from both cells and _ivf_vectors.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + "v float[4] indexed by ivf(nlist=2, quantizer=int8, oversample=1))" + ) + for i in range(10): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", [i, _f32([i, 0, 0, 0])] + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + assert db.execute("SELECT count(*) FROM t_ivf_vectors00").fetchone()[0] == 10 + + db.execute("DELETE FROM t WHERE rowid = 5") + # _ivf_vectors should have 9 rows + assert db.execute("SELECT count(*) FROM t_ivf_vectors00").fetchone()[0] == 9 diff --git a/tests/test-ivf.py b/tests/test-ivf.py new file mode 100644 index 0000000..18a7532 --- /dev/null +++ b/tests/test-ivf.py @@ -0,0 +1,426 @@ +import pytest +import sqlite3 +import struct +import math +from helpers import _f32, exec + + +@pytest.fixture() +def db(): + db = sqlite3.connect(":memory:") + db.row_factory = sqlite3.Row + db.enable_load_extension(True) + db.load_extension("dist/vec0") + db.enable_load_extension(False) + return db + + +def ivf_total_vectors(db, table="t", col=0): + """Count total 
vectors across all IVF cells.""" + return db.execute( + f"SELECT COALESCE(SUM(n_vectors), 0) FROM {table}_ivf_cells{col:02d}" + ).fetchone()[0] + + +def ivf_unassigned_count(db, table="t", col=0): + """Count vectors in unassigned cells (centroid_id=-1).""" + return db.execute( + f"SELECT COALESCE(SUM(n_vectors), 0) FROM {table}_ivf_cells{col:02d} WHERE centroid_id = -1" + ).fetchone()[0] + + +def ivf_assigned_count(db, table="t", col=0): + """Count vectors in trained cells (centroid_id >= 0).""" + return db.execute( + f"SELECT COALESCE(SUM(n_vectors), 0) FROM {table}_ivf_cells{col:02d} WHERE centroid_id >= 0" + ).fetchone()[0] + + +# ============================================================================ +# Parser tests +# ============================================================================ + + +def test_ivf_create_defaults(db): + """ivf() with no args uses defaults.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf())" + ) + tables = [ + r[0] + for r in db.execute( + "SELECT name FROM sqlite_master WHERE type='table' ORDER BY 1" + ).fetchall() + ] + assert "t_ivf_centroids00" in tables + assert "t_ivf_cells00" in tables + assert "t_ivf_rowid_map00" in tables + + +def test_ivf_create_custom_params(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=64, nprobe=8))" + ) + tables = [ + r[0] + for r in db.execute( + "SELECT name FROM sqlite_master WHERE type='table' ORDER BY 1" + ).fetchall() + ] + assert "t_ivf_centroids00" in tables + assert "t_ivf_cells00" in tables + + +def test_ivf_create_with_distance_metric(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] distance_metric=cosine indexed by ivf(nlist=16))" + ) + tables = [ + r[0] + for r in db.execute( + "SELECT name FROM sqlite_master WHERE type='table' ORDER BY 1" + ).fetchall() + ] + assert "t_ivf_centroids00" in tables + + +def test_ivf_create_error_unknown_key(db): + result = exec( + db, + "CREATE VIRTUAL TABLE t 
USING vec0(v float[4] indexed by ivf(bogus=1))", + ) + assert "error" in result + + +def test_ivf_create_error_nprobe_gt_nlist(db): + result = exec( + db, + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=4, nprobe=10))", + ) + assert "error" in result + + +# ============================================================================ +# Shadow table tests +# ============================================================================ + + +def test_ivf_shadow_tables_created(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=8))" + ) + trained = db.execute( + "SELECT value FROM t_info WHERE key = 'ivf_trained_0'" + ).fetchone()[0] + assert str(trained) == "0" + + # No cells yet (created lazily on first insert) + count = db.execute( + "SELECT count(*) FROM t_ivf_cells00" + ).fetchone()[0] + assert count == 0 + + +def test_ivf_drop_cleans_up(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=4))" + ) + db.execute("DROP TABLE t") + tables = [ + r[0] + for r in db.execute( + "SELECT name FROM sqlite_master WHERE type='table'" + ).fetchall() + ] + assert not any("ivf" in t for t in tables) + + +# ============================================================================ +# Insert tests (flat mode) +# ============================================================================ + + +def test_ivf_insert_flat_mode(db): + """Before training, vectors go to unassigned cell.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=4))" + ) + db.execute("INSERT INTO t(rowid, v) VALUES (1, ?)", [_f32([1, 2, 3, 4])]) + db.execute("INSERT INTO t(rowid, v) VALUES (2, ?)", [_f32([5, 6, 7, 8])]) + + assert ivf_unassigned_count(db) == 2 + assert ivf_assigned_count(db) == 0 + + # rowid_map should have 2 entries + assert db.execute("SELECT count(*) FROM t_ivf_rowid_map00").fetchone()[0] == 2 + + +def test_ivf_delete_flat_mode(db): + db.execute( + "CREATE VIRTUAL 
TABLE t USING vec0(v float[4] indexed by ivf(nlist=4))" + ) + db.execute("INSERT INTO t(rowid, v) VALUES (1, ?)", [_f32([1, 2, 3, 4])]) + db.execute("INSERT INTO t(rowid, v) VALUES (2, ?)", [_f32([5, 6, 7, 8])]) + db.execute("DELETE FROM t WHERE rowid = 1") + + assert ivf_unassigned_count(db) == 1 + assert db.execute("SELECT count(*) FROM t_ivf_rowid_map00").fetchone()[0] == 1 + + +# ============================================================================ +# KNN flat mode tests +# ============================================================================ + + +def test_ivf_knn_flat_mode(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=4))" + ) + db.execute("INSERT INTO t(rowid, v) VALUES (1, ?)", [_f32([1, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, v) VALUES (2, ?)", [_f32([2, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, v) VALUES (3, ?)", [_f32([9, 0, 0, 0])]) + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE v MATCH ? AND k = 2", + [_f32([1.5, 0, 0, 0])], + ).fetchall() + assert len(rows) == 2 + rowids = {r[0] for r in rows} + assert rowids == {1, 2} + + +def test_ivf_knn_flat_empty(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=4))" + ) + rows = db.execute( + "SELECT rowid FROM t WHERE v MATCH ? 
AND k = 5", + [_f32([1, 0, 0, 0])], + ).fetchall() + assert len(rows) == 0 + + +# ============================================================================ +# compute-centroids tests +# ============================================================================ + + +def test_compute_centroids(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=4))" + ) + for i in range(40): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", + [i, _f32([i % 10, i // 10, 0, 0])], + ) + + assert ivf_unassigned_count(db) == 40 + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + # After training: unassigned cell should be gone (or empty), vectors in trained cells + assert ivf_unassigned_count(db) == 0 + assert ivf_assigned_count(db) == 40 + assert db.execute("SELECT count(*) FROM t_ivf_centroids00").fetchone()[0] == 4 + trained = db.execute( + "SELECT value FROM t_info WHERE key='ivf_trained_0'" + ).fetchone()[0] + assert str(trained) == "1" + + +def test_compute_centroids_recompute(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=2))" + ) + for i in range(20): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", [i, _f32([i, 0, 0, 0])] + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + assert db.execute("SELECT count(*) FROM t_ivf_centroids00").fetchone()[0] == 2 + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + assert db.execute("SELECT count(*) FROM t_ivf_centroids00").fetchone()[0] == 2 + assert ivf_assigned_count(db) == 20 + + +# ============================================================================ +# Insert after training (assigned mode) +# ============================================================================ + + +def test_ivf_insert_after_training(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=2))" + ) + for i in range(20): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, 
?)", [i, _f32([i, 0, 0, 0])] + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + db.execute( + "INSERT INTO t(rowid, v) VALUES (100, ?)", [_f32([5, 0, 0, 0])] + ) + + # Should be in a trained cell, not unassigned + row = db.execute( + "SELECT m.cell_id, c.centroid_id FROM t_ivf_rowid_map00 m " + "JOIN t_ivf_cells00 c ON c.rowid = m.cell_id " + "WHERE m.rowid = 100" + ).fetchone() + assert row is not None + assert row[1] >= 0 # centroid_id >= 0 means trained cell + + +# ============================================================================ +# KNN after training (IVF probe mode) +# ============================================================================ + + +def test_ivf_knn_after_training(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=4, nprobe=4))" + ) + for i in range(100): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", [i, _f32([i, 0, 0, 0])] + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE v MATCH ? AND k = 5", + [_f32([50.0, 0, 0, 0])], + ).fetchall() + assert len(rows) == 5 + assert rows[0][0] == 50 + assert rows[0][1] < 0.01 + + +def test_ivf_knn_k_larger_than_n(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=2, nprobe=2))" + ) + for i in range(5): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", [i, _f32([i, 0, 0, 0])] + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + rows = db.execute( + "SELECT rowid FROM t WHERE v MATCH ? 
AND k = 100", + [_f32([0, 0, 0, 0])], + ).fetchall() + assert len(rows) == 5 + + +# ============================================================================ +# Manual centroid import (set-centroid, assign-vectors) +# ============================================================================ + + +def test_set_centroid_and_assign(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=0))" + ) + for i in range(20): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", [i, _f32([i, 0, 0, 0])] + ) + + db.execute( + "INSERT INTO t(rowid, v) VALUES ('set-centroid:0', ?)", + [_f32([5, 0, 0, 0])], + ) + db.execute( + "INSERT INTO t(rowid, v) VALUES ('set-centroid:1', ?)", + [_f32([15, 0, 0, 0])], + ) + + assert db.execute("SELECT count(*) FROM t_ivf_centroids00").fetchone()[0] == 2 + + db.execute("INSERT INTO t(rowid) VALUES ('assign-vectors')") + + assert ivf_unassigned_count(db) == 0 + assert ivf_assigned_count(db) == 20 + + +# ============================================================================ +# clear-centroids +# ============================================================================ + + +def test_clear_centroids(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=2))" + ) + for i in range(20): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", [i, _f32([i, 0, 0, 0])] + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + assert db.execute("SELECT count(*) FROM t_ivf_centroids00").fetchone()[0] == 2 + + db.execute("INSERT INTO t(rowid) VALUES ('clear-centroids')") + assert db.execute("SELECT count(*) FROM t_ivf_centroids00").fetchone()[0] == 0 + assert ivf_unassigned_count(db) == 20 + trained = db.execute( + "SELECT value FROM t_info WHERE key='ivf_trained_0'" + ).fetchone()[0] + assert str(trained) == "0" + + +# ============================================================================ +# Delete after training +# 
============================================================================ + + +def test_ivf_delete_after_training(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=2))" + ) + for i in range(10): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", [i, _f32([i, 0, 0, 0])] + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + assert ivf_assigned_count(db) == 10 + + db.execute("DELETE FROM t WHERE rowid = 5") + assert ivf_assigned_count(db) == 9 + assert db.execute("SELECT count(*) FROM t_ivf_rowid_map00").fetchone()[0] == 9 + + +# ============================================================================ +# Recall test +# ============================================================================ + + +def test_ivf_recall_nprobe_equals_nlist(db): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(nlist=8, nprobe=8))" + ) + for i in range(100): + db.execute( + "INSERT INTO t(rowid, v) VALUES (?, ?)", [i, _f32([i, 0, 0, 0])] + ) + + db.execute("INSERT INTO t(rowid) VALUES ('compute-centroids')") + + rows = db.execute( + "SELECT rowid FROM t WHERE v MATCH ? 
AND k = 10", + [_f32([50.0, 0, 0, 0])], + ).fetchall() + rowids = {r[0] for r in rows} + + # 45 and 55 are equidistant from 50, so either may appear in top 10 + assert 50 in rowids + assert len(rowids) == 10 + assert all(45 <= r <= 55 for r in rowids) diff --git a/tests/test-unit.c b/tests/test-unit.c index b180625..27a469d 100644 --- a/tests/test-unit.c +++ b/tests/test-unit.c @@ -577,6 +577,182 @@ void test_vec0_parse_vector_column() { assert(rc == SQLITE_ERROR); } +#if SQLITE_VEC_ENABLE_IVF + // IVF: indexed by ivf() — defaults + { + const char *input = "v float[4] indexed by ivf()"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_IVF); + assert(col.dimensions == 4); + assert(col.index_type == VEC0_INDEX_TYPE_IVF); + assert(col.ivf.nlist == 128); // default + assert(col.ivf.nprobe == 10); // default + sqlite3_free(col.name); + } + + // IVF: indexed by ivf(nlist=8) — nprobe auto-clamped to 8 + { + const char *input = "v float[4] indexed by ivf(nlist=8)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_IVF); + assert(col.index_type == VEC0_INDEX_TYPE_IVF); + assert(col.ivf.nlist == 8); + assert(col.ivf.nprobe == 8); // clamped from default 10 + sqlite3_free(col.name); + } + + // IVF: indexed by ivf(nlist=64, nprobe=8) + { + const char *input = "v float[4] indexed by ivf(nlist=64, nprobe=8)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_IVF); + assert(col.ivf.nlist == 64); + assert(col.ivf.nprobe == 8); + sqlite3_free(col.name); + } + + // IVF: with distance_metric before indexed by + { + const char *input = "v float[4] distance_metric=cosine indexed by ivf(nlist=16)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == 
VEC0_INDEX_TYPE_IVF); + assert(col.distance_metric == VEC0_DISTANCE_METRIC_COSINE); + assert(col.index_type == VEC0_INDEX_TYPE_IVF); + assert(col.ivf.nlist == 16); + sqlite3_free(col.name); + } + + // IVF: nlist=0 (deferred) + { + const char *input = "v float[4] indexed by ivf(nlist=0)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.ivf.nlist == 0); + sqlite3_free(col.name); + } + + // IVF error: nprobe > nlist + { + const char *input = "v float[4] indexed by ivf(nlist=4, nprobe=10)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // IVF error: unknown key + { + const char *input = "v float[4] indexed by ivf(bogus=1)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // IVF error: unknown index type (hnsw not supported) + { + const char *input = "v float[4] indexed by hnsw()"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // Not IVF: no ivf config + { + const char *input = "v float[4]"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_FLAT); + sqlite3_free(col.name); + } + + // IVF: quantizer=binary + { + const char *input = "v float[768] indexed by ivf(nlist=128, quantizer=binary)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_IVF); + assert(col.ivf.nlist == 128); + assert(col.ivf.quantizer == VEC0_IVF_QUANTIZER_BINARY); + assert(col.ivf.oversample == 1); + sqlite3_free(col.name); + } + + // IVF: quantizer=int8 + { + const char *input = "v float[768] indexed by ivf(nlist=64, quantizer=int8)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.ivf.quantizer == VEC0_IVF_QUANTIZER_INT8); + sqlite3_free(col.name); 
+ } + + // IVF: quantizer=none (explicit) + { + const char *input = "v float[768] indexed by ivf(quantizer=none)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.ivf.quantizer == VEC0_IVF_QUANTIZER_NONE); + sqlite3_free(col.name); + } + + // IVF: oversample=10 with quantizer + { + const char *input = "v float[768] indexed by ivf(nlist=128, quantizer=binary, oversample=10)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.ivf.quantizer == VEC0_IVF_QUANTIZER_BINARY); + assert(col.ivf.oversample == 10); + assert(col.ivf.nlist == 128); + sqlite3_free(col.name); + } + + // IVF: all params + { + const char *input = "v float[768] distance_metric=cosine indexed by ivf(nlist=256, nprobe=16, quantizer=int8, oversample=4)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.distance_metric == VEC0_DISTANCE_METRIC_COSINE); + assert(col.ivf.nlist == 256); + assert(col.ivf.nprobe == 16); + assert(col.ivf.quantizer == VEC0_IVF_QUANTIZER_INT8); + assert(col.ivf.oversample == 4); + sqlite3_free(col.name); + } + + // IVF error: oversample > 1 without quantizer + { + const char *input = "v float[768] indexed by ivf(oversample=10)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // IVF error: unknown quantizer value + { + const char *input = "v float[768] indexed by ivf(quantizer=pq)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // IVF: quantizer with defaults (nlist=128 default, nprobe=10 default) + { + const char *input = "v float[768] indexed by ivf(quantizer=binary, oversample=5)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.ivf.nlist == 128); + assert(col.ivf.nprobe == 10); + assert(col.ivf.quantizer == VEC0_IVF_QUANTIZER_BINARY); + 
assert(col.ivf.oversample == 5); + sqlite3_free(col.name); + } +#else + // When IVF is disabled, parsing "ivf" should fail + { + const char *input = "v float[4] indexed by ivf()"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } +#endif /* SQLITE_VEC_ENABLE_IVF */ + printf(" All vec0_parse_vector_column tests passed.\n"); } @@ -821,6 +997,38 @@ void test_rescore_quantize_float_to_int8() { float src[8] = {5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f, 5.0f}; _test_rescore_quantize_float_to_int8(src, dst, 8); for (int i = 0; i < 8; i++) { +#if SQLITE_VEC_ENABLE_IVF +void test_ivf_quantize_int8() { + printf("Starting %s...\n", __func__); + + // Basic values in [-1, 1] range + { + float src[] = {0.0f, 1.0f, -1.0f, 0.5f}; + int8_t dst[4]; + ivf_quantize_int8(src, dst, 4); + assert(dst[0] == 0); + assert(dst[1] == 127); + assert(dst[2] == -127); + assert(dst[3] == 63); // 0.5 * 127 = 63.5, truncated to 63 + } + + // Clamping: values beyond [-1, 1] + { + float src[] = {2.0f, -3.0f, 100.0f, -0.01f}; + int8_t dst[4]; + ivf_quantize_int8(src, dst, 4); + assert(dst[0] == 127); // clamped to 1.0 + assert(dst[1] == -127); // clamped to -1.0 + assert(dst[2] == 127); // clamped to 1.0 + assert(dst[3] == (int8_t)(-0.01f * 127.0f)); + } + + // Zero vector + { + float src[] = {0.0f, 0.0f, 0.0f, 0.0f}; + int8_t dst[4]; + ivf_quantize_int8(src, dst, 4); + for (int i = 0; i < 4; i++) { assert(dst[i] == 0); } } @@ -882,6 +1090,103 @@ void test_rescore_quantized_byte_size() { } void test_vec0_parse_vector_column_rescore() { + // Negative zero + { + float src[] = {-0.0f}; + int8_t dst[1]; + ivf_quantize_int8(src, dst, 1); + assert(dst[0] == 0); + } + + // Single element + { + float src[] = {0.75f}; + int8_t dst[1]; + ivf_quantize_int8(src, dst, 1); + assert(dst[0] == (int8_t)(0.75f * 127.0f)); + } + + // Boundary: exactly 1.0 and -1.0 + { + float src[] = {1.0f, -1.0f}; + int8_t dst[2]; + ivf_quantize_int8(src, dst, 2); + assert(dst[0] == 127); 
+ assert(dst[1] == -127); + } + + printf(" All ivf_quantize_int8 tests passed.\n"); +} + +void test_ivf_quantize_binary() { + printf("Starting %s...\n", __func__); + + // Basic sign-bit quantization: positive -> 1, negative/zero -> 0 + { + float src[] = {1.0f, -1.0f, 0.5f, -0.5f, 0.0f, 0.1f, -0.1f, 2.0f}; + uint8_t dst[1]; + ivf_quantize_binary(src, dst, 8); + // bit 0: 1.0 > 0 -> 1 (LSB) + // bit 1: -1.0 -> 0 + // bit 2: 0.5 > 0 -> 1 + // bit 3: -0.5 -> 0 + // bit 4: 0.0 -> 0 (not > 0) + // bit 5: 0.1 > 0 -> 1 + // bit 6: -0.1 -> 0 + // bit 7: 2.0 > 0 -> 1 + // Expected: bits 0,2,5,7 = 0b10100101 = 0xA5 + assert(dst[0] == 0xA5); + } + + // All positive + { + float src[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}; + uint8_t dst[1]; + ivf_quantize_binary(src, dst, 8); + assert(dst[0] == 0xFF); + } + + // All negative + { + float src[] = {-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}; + uint8_t dst[1]; + ivf_quantize_binary(src, dst, 8); + assert(dst[0] == 0x00); + } + + // All zero (zero is NOT > 0, so all bits should be 0) + { + float src[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + uint8_t dst[1]; + ivf_quantize_binary(src, dst, 8); + assert(dst[0] == 0x00); + } + + // Multi-byte: 16 dimensions -> 2 bytes + { + float src[16]; + for (int i = 0; i < 16; i++) src[i] = (i % 2 == 0) ? 
1.0f : -1.0f; + uint8_t dst[2]; + ivf_quantize_binary(src, dst, 16); + // Even indices are positive: bits 0,2,4,6 in each byte + // byte 0: bits 0,2,4,6 = 0b01010101 = 0x55 + // byte 1: same pattern = 0x55 + assert(dst[0] == 0x55); + assert(dst[1] == 0x55); + } + + // Single byte, only first bit set + { + float src[] = {0.1f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f}; + uint8_t dst[1]; + ivf_quantize_binary(src, dst, 8); + assert(dst[0] == 0x01); + } + + printf(" All ivf_quantize_binary tests passed.\n"); +} + +void test_ivf_config_parsing() { printf("Starting %s...\n", __func__); struct VectorColumnDefinition col; int rc; @@ -955,6 +1260,116 @@ void test_vec0_parse_vector_column_rescore() { } #endif /* SQLITE_VEC_ENABLE_RESCORE */ + // Default IVF config + { + const char *s = "v float[4] indexed by ivf()"; + rc = vec0_parse_vector_column(s, (int)strlen(s), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_IVF); + assert(col.ivf.nlist == 128); // default + assert(col.ivf.nprobe == 10); // default + assert(col.ivf.quantizer == 0); // VEC0_IVF_QUANTIZER_NONE + sqlite3_free(col.name); + } + + // Custom nlist and nprobe + { + const char *s = "v float[4] indexed by ivf(nlist=64, nprobe=8)"; + rc = vec0_parse_vector_column(s, (int)strlen(s), &col); + assert(rc == SQLITE_OK); + assert(col.ivf.nlist == 64); + assert(col.ivf.nprobe == 8); + sqlite3_free(col.name); + } + + // nlist=0 (deferred) + { + const char *s = "v float[4] indexed by ivf(nlist=0)"; + rc = vec0_parse_vector_column(s, (int)strlen(s), &col); + assert(rc == SQLITE_OK); + assert(col.ivf.nlist == 0); + sqlite3_free(col.name); + } + + // Quantizer options + { + const char *s = "v float[8] indexed by ivf(quantizer=int8)"; + rc = vec0_parse_vector_column(s, (int)strlen(s), &col); + assert(rc == SQLITE_OK); + assert(col.ivf.quantizer == VEC0_IVF_QUANTIZER_INT8); + sqlite3_free(col.name); + } + + { + const char *s = "v float[8] indexed by ivf(quantizer=binary)"; + rc = 
vec0_parse_vector_column(s, (int)strlen(s), &col); + assert(rc == SQLITE_OK); + assert(col.ivf.quantizer == VEC0_IVF_QUANTIZER_BINARY); + sqlite3_free(col.name); + } + + // nprobe > nlist (explicit) should fail + { + const char *s = "v float[4] indexed by ivf(nlist=4, nprobe=10)"; + rc = vec0_parse_vector_column(s, (int)strlen(s), &col); + assert(rc == SQLITE_ERROR); + } + + // Unknown key + { + const char *s = "v float[4] indexed by ivf(bogus=1)"; + rc = vec0_parse_vector_column(s, (int)strlen(s), &col); + assert(rc == SQLITE_ERROR); + } + + // nlist > max (65536) should fail + { + const char *s = "v float[4] indexed by ivf(nlist=65537)"; + rc = vec0_parse_vector_column(s, (int)strlen(s), &col); + assert(rc == SQLITE_ERROR); + } + + // nlist at max boundary (65536) should succeed + { + const char *s = "v float[4] indexed by ivf(nlist=65536)"; + rc = vec0_parse_vector_column(s, (int)strlen(s), &col); + assert(rc == SQLITE_OK); + assert(col.ivf.nlist == 65536); + sqlite3_free(col.name); + } + + // oversample > 1 without quantization should fail + { + const char *s = "v float[4] indexed by ivf(oversample=4)"; + rc = vec0_parse_vector_column(s, (int)strlen(s), &col); + assert(rc == SQLITE_ERROR); + } + + // oversample with quantizer should succeed + { + const char *s = "v float[8] indexed by ivf(quantizer=int8, oversample=4)"; + rc = vec0_parse_vector_column(s, (int)strlen(s), &col); + assert(rc == SQLITE_OK); + assert(col.ivf.oversample == 4); + assert(col.ivf.quantizer == VEC0_IVF_QUANTIZER_INT8); + sqlite3_free(col.name); + } + + // All options combined + { + const char *s = "v float[8] indexed by ivf(nlist=32, nprobe=4, quantizer=int8, oversample=2)"; + rc = vec0_parse_vector_column(s, (int)strlen(s), &col); + assert(rc == SQLITE_OK); + assert(col.ivf.nlist == 32); + assert(col.ivf.nprobe == 4); + assert(col.ivf.quantizer == VEC0_IVF_QUANTIZER_INT8); + assert(col.ivf.oversample == 2); + sqlite3_free(col.name); + } + + printf(" All ivf_config_parsing tests 
passed.\n"); +} +#endif /* SQLITE_VEC_ENABLE_IVF */ int main() { printf("Starting unit tests...\n"); @@ -982,6 +1397,10 @@ int main() { test_rescore_quantize_float_to_int8(); test_rescore_quantized_byte_size(); test_vec0_parse_vector_column_rescore(); +#if SQLITE_VEC_ENABLE_IVF + test_ivf_quantize_int8(); + test_ivf_quantize_binary(); + test_ivf_config_parsing(); #endif printf("All unit tests passed.\n"); } From 3e26925ce00d35d1791c629b1f7ea5b703dfc138 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Mon, 30 Mar 2026 19:52:12 -0700 Subject: [PATCH 08/38] rm ivf plan file --- IVF_PLAN.md | 264 ---------------------------------------------------- 1 file changed, 264 deletions(-) delete mode 100644 IVF_PLAN.md diff --git a/IVF_PLAN.md b/IVF_PLAN.md deleted file mode 100644 index 91bb85a..0000000 --- a/IVF_PLAN.md +++ /dev/null @@ -1,264 +0,0 @@ -# IVF Index for sqlite-vec - -## Overview - -IVF (Inverted File Index) is an approximate nearest neighbor index for -sqlite-vec's `vec0` virtual table. It partitions vectors into clusters via -k-means, then at query time only scans the nearest clusters instead of all -vectors. Combined with scalar or binary quantization, this gives 5-20x query -speedups over brute-force with tunable recall. - -## SQL API - -### Table Creation - -```sql -CREATE VIRTUAL TABLE vec_items USING vec0( - id INTEGER PRIMARY KEY, - embedding float[768] distance_metric=cosine - INDEXED BY ivf(nlist=128, nprobe=16) -); - --- With quantization (4x smaller cells, rescore for recall) -CREATE VIRTUAL TABLE vec_items USING vec0( - id INTEGER PRIMARY KEY, - embedding float[768] distance_metric=cosine - INDEXED BY ivf(nlist=128, nprobe=16, quantizer=int8, oversample=4) -); -``` - -### Parameters - -| Parameter | Values | Default | Description | -|-----------|--------|---------|-------------| -| `nlist` | 1-65536, or 0 | 128 | Number of k-means clusters. Rule of thumb: `sqrt(N)` | -| `nprobe` | 1-nlist | 10 | Clusters to search at query time. 
More = better recall, slower | -| `quantizer` | `none`, `int8`, `binary` | `none` | How vectors are stored in cells | -| `oversample` | >= 1 | 1 | Re-rank `oversample * k` candidates with full-precision distance | - -### Inserting Vectors - -```sql --- Works immediately, even before training -INSERT INTO vec_items(id, embedding) VALUES (1, :vector); -``` - -Before centroids exist, vectors go to an "unassigned" partition and queries do -brute-force. After training, new inserts are assigned to the nearest centroid. - -### Training (Computing Centroids) - -```sql --- Run built-in k-means on all vectors -INSERT INTO vec_items(id) VALUES ('compute-centroids'); -``` - -This loads all vectors into memory, runs k-means++ with Lloyd's algorithm, -creates quantized centroids, and redistributes all vectors into cluster cells. -It's a blocking operation — run it once after bulk insert. - -### Manual Centroid Import - -```sql --- Import externally-computed centroids -INSERT INTO vec_items(id, embedding) VALUES ('set-centroid:0', :centroid_0); -INSERT INTO vec_items(id, embedding) VALUES ('set-centroid:1', :centroid_1); - --- Assign vectors to imported centroids -INSERT INTO vec_items(id) VALUES ('assign-vectors'); -``` - -### Runtime Parameter Tuning - -```sql --- Change nprobe without rebuilding the index -INSERT INTO vec_items(id) VALUES ('nprobe=32'); -``` - -### KNN Queries - -```sql --- Same syntax as standard vec0 -SELECT id, distance -FROM vec_items -WHERE embedding MATCH :query AND k = 10; -``` - -### Other Commands - -```sql --- Remove centroids, move all vectors back to unassigned -INSERT INTO vec_items(id) VALUES ('clear-centroids'); -``` - -## How It Works - -### Architecture - -``` -User vector (float32) - → quantize to int8/binary (if quantizer != none) - → find nearest centroid (quantized distance) - → store quantized vector in cell blob - → store full vector in KV table (if quantizer != none) - → query: - 1. quantize query vector - 2. 
find top nprobe centroids by quantized distance - 3. scan cell blobs: quantized distance (fast, small I/O) - 4. if oversample > 1: re-score top N*k with full vectors - 5. return top k -``` - -### Shadow Tables - -For a table `vec_items` with vector column index 0: - -| Table | Schema | Purpose | -|-------|--------|---------| -| `vec_items_ivf_centroids00` | `centroid_id PK, centroid BLOB` | K-means centroids (quantized) | -| `vec_items_ivf_cells00` | `centroid_id, n_vectors, validity BLOB, rowids BLOB, vectors BLOB` | Packed vector cells, 64 vectors max per row. Multiple rows per centroid. Index on centroid_id. | -| `vec_items_ivf_rowid_map00` | `rowid PK, cell_id, slot` | Maps vector rowid → cell location for O(1) delete | -| `vec_items_ivf_vectors00` | `rowid PK, vector BLOB` | Full-precision vectors (only when quantizer != none) | - -### Cell Storage - -Cells use packed blob storage identical to vec0's chunk layout: -- **validity**: bitmap (1 bit per slot) marking live vectors -- **rowids**: packed i64 array -- **vectors**: packed array of quantized vectors - -Cells are capped at 64 vectors (~200KB at 768-dim float32, ~48KB for int8, -~6KB for binary). When a cell fills, a new row is created for the same -centroid. This avoids SQLite overflow page traversal which was a 110x -performance bottleneck with unbounded cells. - -### Quantization - -**int8**: Each float32 dimension clamped to [-1,1] and scaled to int8 -[-127,127]. 4x storage reduction. Distance computed via int8 L2. - -**binary**: Sign-bit quantization — each bit is 1 if the float is positive. -32x storage reduction. Distance computed via hamming distance. - -**Oversample re-ranking**: When `oversample > 1`, the quantized scan collects -`oversample * k` candidates, then looks up each candidate's full-precision -vector from the KV table and re-computes exact distance. This recovers nearly -all recall lost from quantization. At oversample=4 with int8, recall matches -non-quantized IVF exactly. 
- -### K-Means - -Uses Lloyd's algorithm with k-means++ initialization: -1. K-means++ picks initial centroids weighted by distance -2. Lloyd's iterations: assign vectors to nearest centroid, recompute centroids as cluster means -3. Empty cluster handling: reassign to farthest point -4. K-means runs in float32; centroids are quantized before storage - -Training data: recommend 16× nlist vectors. At nlist=1000, that's 16k -vectors — k-means takes ~140s on 768-dim data. - -## Performance - -### 100k vectors (COHERE 768-dim cosine) - -``` - name qry(ms) recall -─────────────────────────────────────────────── - ivf(q=int8,os=4),p=8 5.3ms 0.934 ← 6x faster than flat - ivf(q=int8,os=4),p=16 5.4ms 0.968 - ivf(q=none),p=8 5.3ms 0.934 - ivf(q=binary,os=10),p=16 1.3ms 0.832 ← 26x faster than flat - ivf(q=int8,os=4),p=32 7.4ms 0.990 - ivf(q=none),p=32 15.5ms 0.992 - int8(os=4) 18.7ms 0.996 - bit(os=8) 18.7ms 0.884 - flat 33.7ms 1.000 -``` - -### 1M vectors (COHERE 768-dim cosine) - -``` - name insert train MB qry(ms) recall -────────────────────────────────────────────────────────────────────── - ivf(q=int8,os=4),p=8 163s 142s 4725 16.3ms 0.892 - ivf(q=binary,os=10),p=16 118s 144s 4073 17.7ms 0.830 - ivf(q=int8,os=4),p=16 163s 142s 4725 24.3ms 0.950 - ivf(q=int8,os=4),p=32 163s 142s 4725 41.6ms 0.980 - ivf(q=none),p=8 497s 144s 3101 52.1ms 0.890 - ivf(q=none),p=16 497s 144s 3101 56.6ms 0.950 - bit(os=8) 18s - 3048 83.5ms 0.918 - ivf(q=none),p=32 497s 144s 3101 103.9ms 0.980 - int8(os=4) 19s - 3689 169.1ms 0.994 - flat 20s - 2955 338.0ms 1.000 -``` - -**Best config at 1M: `ivf(quantizer=int8, oversample=4, nprobe=16)`** — -24ms query, 0.95 recall, 14x faster than flat, 7x faster than int8 baseline. 
- -### Scaling Characteristics - -| Metric | 100k | 1M | Scaling | -|--------|------|-----|---------| -| Flat query | 34ms | 338ms | 10x (linear) | -| IVF int8 p=16 | 5.4ms | 24.3ms | 4.5x (sublinear) | -| IVF insert rate | ~10k/s | ~6k/s | Slight degradation | -| Training (nlist=1000) | 13s | 142s | ~11x | - -## Implementation - -### File Structure - -``` -sqlite-vec-ivf-kmeans.c K-means++ algorithm (pure C, no SQLite deps) -sqlite-vec-ivf.c All IVF logic: parser, shadow tables, insert, - delete, query, centroid commands, quantization -sqlite-vec.c ~50 lines of additions: struct fields, #includes, - dispatch hooks in parse/create/insert/delete/filter -``` - -Both IVF files are `#include`d into `sqlite-vec.c`. No Makefile changes needed. - -### Key Design Decisions - -1. **Fixed-size cells (64 vectors)** instead of one blob per centroid. Avoids - SQLite overflow page traversal which caused 110x insert slowdown. - -2. **Multiple cell rows per centroid** with an index on centroid_id. When a - cell fills, a new row is created. Query scans all rows for probed centroids - via `WHERE centroid_id IN (...)`. - -3. **Always store full vectors** when quantizer != none (in `_ivf_vectors` KV - table). Enables oversample re-ranking and point queries returning original - precision. - -4. **K-means in float32, quantize after**. Simpler than quantized k-means, - and assignment accuracy doesn't suffer much since nprobe compensates. - -5. **NEON SIMD for cosine distance**. Added `cosine_float_neon()` with 4-wide - FMA for dot product + magnitudes. Benefits all vec0 queries, not just IVF. - -6. **Runtime nprobe tuning**. `INSERT INTO t(id) VALUES ('nprobe=N')` changes - the probe count without rebuilding — enables fast parameter sweeps. 
- -### Optimization History - -| Optimization | Impact | -|-------------|--------| -| Fixed-size cells (64 max) | 110x insert speedup | -| Skip chunk writes for IVF | 2x DB size reduction | -| NEON cosine distance | 2x query speedup + 13% recall improvement (correct metric) | -| Cached prepared statements | Eliminated per-insert prepare/finalize | -| Batched cell reads (IN clause) | Fewer SQLite queries per KNN | -| int8 quantization | 2.5x query speedup at same recall | -| Binary quantization | 32x less cell I/O | -| Oversample re-ranking | Recovers quantization recall loss | - -## Remaining Work - -See `ivf-benchmarks/TODO.md` for the full list. Key items: - -- **Cache centroids in memory** — each insert re-reads all centroids from SQLite -- **Runtime oversample** — same pattern as nprobe runtime command -- **SIMD k-means** — training uses scalar distance, could be 4x faster -- **Top-k heap** — replace qsort with min-heap for large nprobe -- **IVF-PQ** — product quantization for better compression/recall tradeoff From bb3ef78f75cf72efb41f55ff297c186e37011329 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Mon, 30 Mar 2026 23:17:30 -0700 Subject: [PATCH 09/38] Hide IVF behind SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE, default off Rename SQLITE_VEC_ENABLE_IVF to SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE and flip the default from 1 to 0. IVF tests are automatically skipped when the build flag is not set. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec.c | 42 +++++++++++++++++++++--------------------- tests/conftest.py | 24 ++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/sqlite-vec.c b/sqlite-vec.c index 88f60b9..015792b 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -93,8 +93,8 @@ typedef size_t usize; #define COMPILER_SUPPORTS_VTAB_IN 1 #endif -#ifndef SQLITE_VEC_ENABLE_IVF -#define SQLITE_VEC_ENABLE_IVF 1 +#ifndef SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE +#define SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE 0 #endif #ifndef SQLITE_SUBTYPE @@ -2558,7 +2558,7 @@ struct Vec0RescoreConfig { }; #endif -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE enum Vec0IvfQuantizer { VEC0_IVF_QUANTIZER_NONE = 0, VEC0_IVF_QUANTIZER_INT8 = 1, @@ -2737,7 +2737,7 @@ static int vec0_parse_rescore_options(struct Vec0Scanner *scanner, * @return int SQLITE_OK on success, SQLITE_EMPTY is it's not a vector column * definition, SQLITE_ERROR on error. */ -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE // Forward declaration — defined in sqlite-vec-ivf.c static int vec0_parse_ivf_options(struct Vec0Scanner *scanner, struct Vec0IvfConfig *config); @@ -2922,7 +2922,7 @@ int vec0_parse_vector_column(const char *source, int source_length, } #endif else if (sqlite3_strnicmp(token.start, "ivf", indexNameLen) == 0) { -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE indexType = VEC0_INDEX_TYPE_IVF; memset(&ivfConfig, 0, sizeof(ivfConfig)); rc = vec0_parse_ivf_options(&scanner, &ivfConfig); @@ -3321,7 +3321,7 @@ struct vec0_vtab { int chunk_size; -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE // IVF cached state per vector column char *shadowIvfCellsNames[VEC0_MAX_VECTOR_COLUMNS]; // table name for blob_open int ivfTrainedCache[VEC0_MAX_VECTOR_COLUMNS]; // -1=unknown, 0=no, 1=yes @@ -3419,7 +3419,7 @@ void vec0_free_resources(vec0_vtab *p) { sqlite3_finalize(p->stmtRowidsGetChunkPosition); 
p->stmtRowidsGetChunkPosition = NULL; -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE for (int i = 0; i < VEC0_MAX_VECTOR_COLUMNS; i++) { sqlite3_finalize(p->stmtIvfCellMeta[i]); p->stmtIvfCellMeta[i] = NULL; sqlite3_finalize(p->stmtIvfCellUpdateN[i]); p->stmtIvfCellUpdateN[i] = NULL; @@ -3451,7 +3451,7 @@ void vec0_free(vec0_vtab *p) { for (int i = 0; i < p->numVectorColumns; i++) { sqlite3_free(p->shadowVectorChunksNames[i]); p->shadowVectorChunksNames[i] = NULL; -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE sqlite3_free(p->shadowIvfCellsNames[i]); p->shadowIvfCellsNames[i] = NULL; #endif @@ -3743,7 +3743,7 @@ int vec0_result_id(vec0_vtab *p, sqlite3_context *context, i64 rowid) { * will be stored. * @return int SQLITE_OK on success. */ -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE // Forward declaration — defined in sqlite-vec-ivf.c (included later) static int ivf_get_vector_data(vec0_vtab *p, i64 rowid, int col_idx, void **outVector, int *outVectorSize); @@ -3756,7 +3756,7 @@ int vec0_get_vector_data(vec0_vtab *pVtab, i64 rowid, int vector_column_idx, i64 chunk_id; i64 chunk_offset; -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE // IVF-indexed columns store vectors in _ivf_cells, not _vector_chunks if (p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_IVF) { return ivf_get_vector_data(p, rowid, vector_column_idx, outVector, outVectorSize); @@ -4411,7 +4411,7 @@ int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk #if SQLITE_VEC_ENABLE_RESCORE // Rescore and IVF columns don't use _vector_chunks for float storage if (p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_RESCORE -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE || p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_IVF #endif ) { @@ -4587,7 +4587,7 @@ void vec0_cursor_clear(vec0_cursor *pCur) { } // IVF index implementation — 
#include'd here after all struct/helper definitions -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE #include "sqlite-vec-ivf-kmeans.c" #include "sqlite-vec-ivf.c" #endif @@ -4986,7 +4986,7 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } #endif } -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE for (int i = 0; i < pNew->numVectorColumns; i++) { if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; pNew->shadowIvfCellsNames[i] = @@ -5147,7 +5147,7 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } #endif -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE // Create IVF shadow tables for IVF-indexed vector columns for (int i = 0; i < pNew->numVectorColumns; i++) { if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; @@ -5315,7 +5315,7 @@ static int vec0Destroy(sqlite3_vtab *pVtab) { } #endif -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE // Drop IVF shadow tables for (int i = 0; i < p->numVectorColumns; i++) { if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; @@ -7335,7 +7335,7 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, } #endif -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE // IVF dispatch: if vector column has IVF, use IVF query instead of chunk scan if (vector_column->index_type == VEC0_INDEX_TYPE_IVF) { rc = ivf_query_knn(p, vectorColumnIdx, queryVector, @@ -8177,7 +8177,7 @@ int vec0Update_InsertWriteFinalStep(vec0_vtab *p, i64 chunk_rowid, #if SQLITE_VEC_ENABLE_RESCORE // Rescore and IVF columns don't use _vector_chunks if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE || p->vector_columns[i].index_type == VEC0_INDEX_TYPE_IVF #endif ) @@ -8593,7 +8593,7 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, } #endif 
-#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE // Step #4: IVF index insert (if any vector column uses IVF) for (int i = 0; i < p->numVectorColumns; i++) { if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; @@ -9189,7 +9189,7 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { } } -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE // 7. delete from IVF index for (int i = 0; i < p->numVectorColumns; i++) { if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; @@ -9473,7 +9473,7 @@ static int vec0Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, } // INSERT operation else if (argc > 1 && sqlite3_value_type(argv[0]) == SQLITE_NULL) { -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE // Check for IVF command inserts: INSERT INTO t(rowid) VALUES ('compute-centroids') // The id column holds the command string. sqlite3_value *idVal = argv[2 + VEC0_COLUMN_ID]; @@ -9632,7 +9632,7 @@ static sqlite3_module vec0Module = { #define SQLITE_VEC_DEBUG_BUILD_RESCORE "" #endif -#if SQLITE_VEC_ENABLE_IVF +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE #define SQLITE_VEC_DEBUG_BUILD_IVF "ivf" #else #define SQLITE_VEC_DEBUG_BUILD_IVF "" diff --git a/tests/conftest.py b/tests/conftest.py index 9549d37..3a24468 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,29 @@ import pytest import sqlite3 +import os + + +def _vec_debug(): + db = sqlite3.connect(":memory:") + db.enable_load_extension(True) + db.load_extension("dist/vec0") + db.enable_load_extension(False) + return db.execute("SELECT vec_debug()").fetchone()[0] + + +def _has_build_flag(flag): + return flag in _vec_debug().split("Build flags:")[-1] + + +def pytest_collection_modifyitems(config, items): + has_ivf = _has_build_flag("ivf") + if has_ivf: + return + skip_ivf = pytest.mark.skip(reason="IVF not enabled (compile with -DSQLITE_VEC_EXPERIMENTAL_IVF_ENABLE=1)") + ivf_prefixes = ("test-ivf",) + for item in 
items: + if any(item.fspath.basename.startswith(p) for p in ivf_prefixes): + item.add_marker(skip_ivf) @pytest.fixture() From 575371d751d716c1e35edf4cd46e5aa570b909a8 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Sun, 29 Mar 2026 19:46:53 -0700 Subject: [PATCH 10/38] Add DiskANN index for vec0 virtual table Add DiskANN graph-based index: builds a Vamana graph with configurable R (max degree) and L (search list size, separate for insert/query), supports int8 quantization with rescore, lazy reverse-edge replacement, pre-quantized query optimization, and insert buffer reuse. Includes shadow table management, delete support, KNN integration, compile flag (SQLITE_VEC_ENABLE_DISKANN), release-demo workflow, fuzz targets, and tests. Fixes rescore int8 quantization bug. --- .github/workflows/release-demo.yml | 118 ++ Makefile | 2 +- benchmarks-ann/Makefile | 20 +- benchmarks-ann/bench.py | 439 ++++++- benchmarks-ann/schema.sql | 25 + sqlite-vec-diskann.c | 1768 +++++++++++++++++++++++++++ sqlite-vec-rescore.c | 18 +- sqlite-vec.c | 787 +++++++++++- tests/fuzz/Makefile | 35 +- tests/fuzz/diskann-blob-truncate.c | 250 ++++ tests/fuzz/diskann-buffer-flush.c | 164 +++ tests/fuzz/diskann-command-inject.c | 158 +++ tests/fuzz/diskann-create.c | 44 + tests/fuzz/diskann-deep-search.c | 187 +++ tests/fuzz/diskann-delete-stress.c | 175 +++ tests/fuzz/diskann-graph-corrupt.c | 123 ++ tests/fuzz/diskann-int8-quant.c | 164 +++ tests/fuzz/diskann-operations.c | 100 ++ tests/fuzz/diskann-prune-direct.c | 131 ++ tests/fuzz/diskann.dict | 10 + tests/sqlite-vec-internal.h | 85 ++ tests/test-diskann.py | 1160 ++++++++++++++++++ tests/test-unit.c | 722 +++++++++++ 23 files changed, 6550 insertions(+), 135 deletions(-) create mode 100644 .github/workflows/release-demo.yml create mode 100644 sqlite-vec-diskann.c create mode 100644 tests/fuzz/diskann-blob-truncate.c create mode 100644 tests/fuzz/diskann-buffer-flush.c create mode 100644 tests/fuzz/diskann-command-inject.c create mode 100644 
tests/fuzz/diskann-create.c create mode 100644 tests/fuzz/diskann-deep-search.c create mode 100644 tests/fuzz/diskann-delete-stress.c create mode 100644 tests/fuzz/diskann-graph-corrupt.c create mode 100644 tests/fuzz/diskann-int8-quant.c create mode 100644 tests/fuzz/diskann-operations.c create mode 100644 tests/fuzz/diskann-prune-direct.c create mode 100644 tests/fuzz/diskann.dict create mode 100644 tests/test-diskann.py diff --git a/.github/workflows/release-demo.yml b/.github/workflows/release-demo.yml new file mode 100644 index 0000000..2f4b396 --- /dev/null +++ b/.github/workflows/release-demo.yml @@ -0,0 +1,118 @@ +name: "Release Demo (DiskANN)" +on: + push: + branches: [diskann-yolo2] +permissions: + contents: write +jobs: + build-linux-x86_64-extension: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - run: ./scripts/vendor.sh + - run: make loadable static + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-linux-x86_64-extension + path: dist/* + build-linux-aarch64-extension: + runs-on: ubuntu-22.04-arm + steps: + - uses: actions/checkout@v4 + - run: ./scripts/vendor.sh + - run: make loadable static + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-linux-aarch64-extension + path: dist/* + build-macos-x86_64-extension: + runs-on: macos-15-intel + steps: + - uses: actions/checkout@v4 + - run: ./scripts/vendor.sh + - run: make loadable static + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-macos-x86_64-extension + path: dist/* + build-macos-aarch64-extension: + runs-on: macos-14 + steps: + - uses: actions/checkout@v4 + - run: ./scripts/vendor.sh + - run: make loadable static + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-macos-aarch64-extension + path: dist/* + build-windows-x86_64-extension: + runs-on: windows-2022 + steps: + - uses: actions/checkout@v4 + - uses: ilammy/msvc-dev-cmd@v1 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: ./scripts/vendor.sh + 
shell: bash + - run: make sqlite-vec.h + - run: mkdir dist + - run: cl.exe /fPIC -shared /W4 /Ivendor/ /O2 /LD sqlite-vec.c -o dist/vec0.dll + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-windows-x86_64-extension + path: dist/* + dist: + runs-on: ubuntu-latest + needs: + [ + build-linux-x86_64-extension, + build-linux-aarch64-extension, + build-macos-x86_64-extension, + build-macos-aarch64-extension, + build-windows-x86_64-extension, + ] + steps: + - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 + with: + name: sqlite-vec-linux-x86_64-extension + path: dist/linux-x86_64 + - uses: actions/download-artifact@v4 + with: + name: sqlite-vec-linux-aarch64-extension + path: dist/linux-aarch64 + - uses: actions/download-artifact@v4 + with: + name: sqlite-vec-macos-x86_64-extension + path: dist/macos-x86_64 + - uses: actions/download-artifact@v4 + with: + name: sqlite-vec-macos-aarch64-extension + path: dist/macos-aarch64 + - uses: actions/download-artifact@v4 + with: + name: sqlite-vec-windows-x86_64-extension + path: dist/windows-x86_64 + - run: make sqlite-vec.h + - run: | + ./scripts/vendor.sh + make amalgamation + mkdir -p amalgamation + cp dist/sqlite-vec.c sqlite-vec.h amalgamation/ + rm dist/sqlite-vec.c + - uses: asg017/setup-sqlite-dist@73e37b2ffb0b51e64a64eb035da38c958b9ff6c6 + - run: sqlite-dist build --set-version $(cat VERSION) + - name: Create release and upload assets + env: + GH_TOKEN: ${{ github.token }} + run: | + SHORT_SHA=$(echo "${{ github.sha }}" | head -c 10) + TAG="diskann-${SHORT_SHA}" + zip -j "amalgamation/sqlite-vec-amalgamation.zip" amalgamation/sqlite-vec.c amalgamation/sqlite-vec.h + gh release create "$TAG" \ + --title "$TAG" \ + --target "${{ github.sha }}" \ + --prerelease \ + amalgamation/sqlite-vec-amalgamation.zip \ + .sqlite-dist/pip/* diff --git a/Makefile b/Makefile index 2758ee5..89907fa 100644 --- a/Makefile +++ b/Makefile @@ -204,7 +204,7 @@ test-loadable-watch: watchexec --exts c,py,Makefile 
--clear -- make test-loadable test-unit: - $(CC) -DSQLITE_CORE -DSQLITE_VEC_TEST -DSQLITE_VEC_ENABLE_RESCORE tests/test-unit.c sqlite-vec.c vendor/sqlite3.c -I./ -Ivendor -o $(prefix)/test-unit && $(prefix)/test-unit + $(CC) -DSQLITE_CORE -DSQLITE_VEC_TEST -DSQLITE_VEC_ENABLE_RESCORE -DSQLITE_VEC_ENABLE_DISKANN=1 tests/test-unit.c sqlite-vec.c vendor/sqlite3.c -I./ -Ivendor $(CFLAGS) -o $(prefix)/test-unit && $(prefix)/test-unit # Standalone sqlite3 CLI with vec0 compiled in. Useful for benchmarking, # profiling (has debug symbols), and scripting without .load_extension. diff --git a/benchmarks-ann/Makefile b/benchmarks-ann/Makefile index 6081457..ddceb65 100644 --- a/benchmarks-ann/Makefile +++ b/benchmarks-ann/Makefile @@ -19,9 +19,16 @@ RESCORE_CONFIGS = \ "rescore-bit-os16:type=rescore,quantizer=bit,oversample=16" \ "rescore-int8-os8:type=rescore,quantizer=int8,oversample=8" -ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS) $(IVF_CONFIGS) +# --- DiskANN configs --- +DISKANN_CONFIGS = \ + "diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \ + "diskann-R72-binary:type=diskann,R=72,L=128,quantizer=binary" \ + "diskann-R72-int8:type=diskann,R=72,L=128,quantizer=int8" \ + "diskann-R72-L256:type=diskann,R=72,L=256,quantizer=binary" -.PHONY: seed ground-truth bench-smoke bench-rescore bench-ivf bench-10k bench-50k bench-100k bench-all \ +ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS) $(IVF_CONFIGS) $(DISKANN_CONFIGS) + +.PHONY: seed ground-truth bench-smoke bench-rescore bench-ivf bench-diskann bench-10k bench-50k bench-100k bench-all \ report clean # --- Data preparation --- @@ -37,7 +44,8 @@ ground-truth: seed bench-smoke: seed $(BENCH) --subset-size 5000 -k 10 -n 20 -o runs/smoke \ "brute-float:type=baseline,variant=float" \ - "ivf-quick:type=ivf,nlist=16,nprobe=4" + "ivf-quick:type=ivf,nlist=16,nprobe=4" \ + "diskann-quick:type=diskann,R=48,L=64,quantizer=binary" bench-rescore: seed $(BENCH) --subset-size 10000 -k 10 -o runs/rescore \ @@ -62,6 +70,12 @@ 
bench-ivf: seed $(BENCH) --subset-size 50000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS) $(BENCH) --subset-size 100000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS) +# --- DiskANN across sizes --- +bench-diskann: seed + $(BENCH) --subset-size 10000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS) + $(BENCH) --subset-size 50000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS) + $(BENCH) --subset-size 100000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS) + # --- Report --- report: @echo "Use: sqlite3 runs//results.db 'SELECT * FROM bench_results ORDER BY recall DESC'" diff --git a/benchmarks-ann/bench.py b/benchmarks-ann/bench.py index c640628..520db77 100644 --- a/benchmarks-ann/bench.py +++ b/benchmarks-ann/bench.py @@ -6,18 +6,16 @@ across different vec0 configurations. Config format: name:type=,key=val,key=val - Baseline (brute-force) keys: - type=baseline, variant=float|int8|bit, oversample=8 - - Index-specific types can be registered via INDEX_REGISTRY (see below). 
+ Available types: none, vec0-flat, rescore, ivf, diskann Usage: python bench.py --subset-size 10000 \ - "brute-float:type=baseline,variant=float" \ - "brute-int8:type=baseline,variant=int8" \ - "brute-bit:type=baseline,variant=bit" + "raw:type=none" \ + "flat:type=vec0-flat,variant=float" \ + "flat-int8:type=vec0-flat,variant=int8" """ import argparse +from datetime import datetime, timezone import os import sqlite3 import statistics @@ -56,11 +54,118 @@ INDEX_REGISTRY = {} # ============================================================================ -# Baseline implementation +# "none" — regular table, no vec0, manual KNN via vec_distance_cosine() # ============================================================================ -def _baseline_create_table_sql(params): +def _none_create_table_sql(params): + variant = params["variant"] + if variant == "int8": + return ( + "CREATE TABLE vec_items (" + " id INTEGER PRIMARY KEY," + " embedding BLOB NOT NULL," + " embedding_int8 BLOB NOT NULL)" + ) + elif variant == "bit": + return ( + "CREATE TABLE vec_items (" + " id INTEGER PRIMARY KEY," + " embedding BLOB NOT NULL," + " embedding_bq BLOB NOT NULL)" + ) + return ( + "CREATE TABLE vec_items (" + " id INTEGER PRIMARY KEY," + " embedding BLOB NOT NULL)" + ) + + +def _none_insert_sql(params): + variant = params["variant"] + if variant == "int8": + return ( + "INSERT INTO vec_items(id, embedding, embedding_int8) " + "SELECT id, vector, vec_quantize_int8(vector, 'unit') " + "FROM base.train WHERE id >= :lo AND id < :hi" + ) + elif variant == "bit": + return ( + "INSERT INTO vec_items(id, embedding, embedding_bq) " + "SELECT id, vector, vec_quantize_binary(vector) " + "FROM base.train WHERE id >= :lo AND id < :hi" + ) + return ( + "INSERT INTO vec_items(id, embedding) " + "SELECT id, vector FROM base.train WHERE id >= :lo AND id < :hi" + ) + + +def _none_run_query(conn, params, query, k): + variant = params["variant"] + oversample = params.get("oversample", 8) + + if variant 
== "int8": + q_int8 = conn.execute( + "SELECT vec_quantize_int8(:query, 'unit')", {"query": query} + ).fetchone()[0] + return conn.execute( + "WITH coarse AS (" + " SELECT id, embedding FROM (" + " SELECT id, embedding, vec_distance_cosine(vec_int8(:q_int8), vec_int8(embedding_int8)) as dist " + " FROM vec_items ORDER BY dist LIMIT :oversample_k" + " )" + ") " + "SELECT id, vec_distance_cosine(:query, embedding) as distance " + "FROM coarse ORDER BY 2 LIMIT :k", + {"q_int8": q_int8, "query": query, "k": k, "oversample_k": k * oversample}, + ).fetchall() + elif variant == "bit": + q_bit = conn.execute( + "SELECT vec_quantize_binary(:query)", {"query": query} + ).fetchone()[0] + return conn.execute( + "WITH coarse AS (" + " SELECT id, embedding FROM (" + " SELECT id, embedding, vec_distance_hamming(vec_bit(:q_bit), vec_bit(embedding_bq)) as dist " + " FROM vec_items ORDER BY dist LIMIT :oversample_k" + " )" + ") " + "SELECT id, vec_distance_cosine(:query, embedding) as distance " + "FROM coarse ORDER BY 2 LIMIT :k", + {"q_bit": q_bit, "query": query, "k": k, "oversample_k": k * oversample}, + ).fetchall() + + return conn.execute( + "SELECT id, vec_distance_cosine(:query, embedding) as distance " + "FROM vec_items ORDER BY 2 LIMIT :k", + {"query": query, "k": k}, + ).fetchall() + + +def _none_describe(params): + v = params["variant"] + if v in ("int8", "bit"): + return f"none {v} (os={params['oversample']})" + return f"none float" + + +INDEX_REGISTRY["none"] = { + "defaults": {"variant": "float", "oversample": 8}, + "create_table_sql": _none_create_table_sql, + "insert_sql": _none_insert_sql, + "post_insert_hook": None, + "run_query": _none_run_query, + "describe": _none_describe, +} + + +# ============================================================================ +# vec0-flat — vec0 virtual table with brute-force MATCH +# ============================================================================ + + +def _vec0flat_create_table_sql(params): variant = 
params["variant"] extra = "" if variant == "int8": @@ -76,7 +181,7 @@ def _baseline_create_table_sql(params): ) -def _baseline_insert_sql(params): +def _vec0flat_insert_sql(params): variant = params["variant"] if variant == "int8": return ( @@ -93,7 +198,7 @@ def _baseline_insert_sql(params): return None # use default -def _baseline_run_query(conn, params, query, k): +def _vec0flat_run_query(conn, params, query, k): variant = params["variant"] oversample = params.get("oversample", 8) @@ -123,20 +228,20 @@ def _baseline_run_query(conn, params, query, k): return None # use default MATCH -def _baseline_describe(params): +def _vec0flat_describe(params): v = params["variant"] if v in ("int8", "bit"): - return f"baseline {v} (os={params['oversample']})" - return f"baseline {v}" + return f"vec0-flat {v} (os={params['oversample']})" + return f"vec0-flat {v}" -INDEX_REGISTRY["baseline"] = { +INDEX_REGISTRY["vec0-flat"] = { "defaults": {"variant": "float", "oversample": 8}, - "create_table_sql": _baseline_create_table_sql, - "insert_sql": _baseline_insert_sql, + "create_table_sql": _vec0flat_create_table_sql, + "insert_sql": _vec0flat_insert_sql, "post_insert_hook": None, - "run_query": _baseline_run_query, - "describe": _baseline_describe, + "run_query": _vec0flat_run_query, + "describe": _vec0flat_describe, } @@ -215,12 +320,64 @@ INDEX_REGISTRY["ivf"] = { } +# ============================================================================ +# DiskANN implementation +# ============================================================================ + + +def _diskann_create_table_sql(params): + bt = params["buffer_threshold"] + extra = f", buffer_threshold={bt}" if bt > 0 else "" + return ( + f"CREATE VIRTUAL TABLE vec_items USING vec0(" + f" id integer primary key," + f" embedding float[768] distance_metric=cosine" + f" INDEXED BY diskann(" + f" neighbor_quantizer={params['quantizer']}," + f" n_neighbors={params['R']}," + f" search_list_size={params['L']}" + f" {extra}" + f" )" + 
f")" + ) + + +def _diskann_pre_query_hook(conn, params): + L_search = params.get("L_search") + if L_search: + conn.execute( + "INSERT INTO vec_items(id) VALUES (?)", + (f"search_list_size_search={L_search}",), + ) + conn.commit() + print(f" Set search_list_size_search={L_search}") + + +def _diskann_describe(params): + desc = f"diskann q={params['quantizer']:<6} R={params['R']:<3} L={params['L']}" + L_search = params.get("L_search") + if L_search: + desc += f" L_search={L_search}" + return desc + + +INDEX_REGISTRY["diskann"] = { + "defaults": {"R": 72, "L": 128, "quantizer": "binary", "buffer_threshold": 0}, + "create_table_sql": _diskann_create_table_sql, + "insert_sql": None, + "post_insert_hook": None, + "pre_query_hook": _diskann_pre_query_hook, + "run_query": None, + "describe": _diskann_describe, +} + + # ============================================================================ # Config parsing # ============================================================================ INT_KEYS = { - "R", "L", "buffer_threshold", "nlist", "nprobe", "oversample", + "R", "L", "L_search", "buffer_threshold", "nlist", "nprobe", "oversample", "n_trees", "search_k", } @@ -238,7 +395,7 @@ def parse_config(spec): k, v = kv.split("=", 1) raw[k.strip()] = v.strip() - index_type = raw.pop("type", "baseline") + index_type = raw.pop("type", "vec0-flat") if index_type not in INDEX_REGISTRY: raise ValueError( f"Unknown index type: {index_type}. 
" @@ -289,7 +446,7 @@ def insert_loop(conn, sql, subset_size, label=""): return time.perf_counter() - t0 -def open_bench_db(db_path, ext_path, base_db): +def create_bench_db(db_path, ext_path, base_db): if os.path.exists(db_path): os.remove(db_path) conn = sqlite3.connect(db_path) @@ -300,6 +457,19 @@ def open_bench_db(db_path, ext_path, base_db): return conn +def open_existing_bench_db(db_path, ext_path, base_db): + if not os.path.exists(db_path): + raise FileNotFoundError( + f"Index DB not found: {db_path}\n" + f"Build it first with: --phase build" + ) + conn = sqlite3.connect(db_path) + conn.enable_load_extension(True) + conn.load_extension(ext_path) + conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + return conn + + DEFAULT_INSERT_SQL = ( "INSERT INTO vec_items(id, embedding) " "SELECT id, vector FROM base.train WHERE id >= :lo AND id < :hi" @@ -313,7 +483,7 @@ DEFAULT_INSERT_SQL = ( def build_index(base_db, ext_path, name, params, subset_size, out_dir): db_path = os.path.join(out_dir, f"{name}.{subset_size}.db") - conn = open_bench_db(db_path, ext_path, base_db) + conn = create_bench_db(db_path, ext_path, base_db) reg = INDEX_REGISTRY[params["index_type"]] @@ -364,12 +534,16 @@ def _default_match_query(conn, query, k): ).fetchall() -def measure_knn(db_path, ext_path, base_db, params, subset_size, k=10, n=50): +def measure_knn(db_path, ext_path, base_db, params, subset_size, k=10, n=50, + pre_query_hook=None): conn = sqlite3.connect(db_path) conn.enable_load_extension(True) conn.load_extension(ext_path) conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + if pre_query_hook: + pre_query_hook(conn, params) + query_vectors = load_query_vectors(base_db, n) reg = INDEX_REGISTRY[params["index_type"]] @@ -431,6 +605,34 @@ def measure_knn(db_path, ext_path, base_db, params, subset_size, k=10, n=50): # ============================================================================ +def open_results_db(results_path): + db = sqlite3.connect(results_path) + 
db.executescript(open(os.path.join(_SCRIPT_DIR, "schema.sql")).read()) + # Migrate existing DBs that predate the runs table + cols = {r[1] for r in db.execute("PRAGMA table_info(runs)").fetchall()} + if "phase" not in cols: + db.execute("ALTER TABLE runs ADD COLUMN phase TEXT NOT NULL DEFAULT 'both'") + db.commit() + return db + + +def create_run(db, config_name, index_type, subset_size, phase, k=None, n=None): + cur = db.execute( + "INSERT INTO runs (config_name, index_type, subset_size, phase, status, k, n) " + "VALUES (?, ?, ?, ?, 'pending', ?, ?)", + (config_name, index_type, subset_size, phase, k, n), + ) + db.commit() + return cur.lastrowid + + +def update_run(db, run_id, **kwargs): + sets = ", ".join(f"{k} = ?" for k in kwargs) + vals = list(kwargs.values()) + [run_id] + db.execute(f"UPDATE runs SET {sets} WHERE run_id = ?", vals) + db.commit() + + def save_results(results_path, rows): db = sqlite3.connect(results_path) db.executescript(open(os.path.join(_SCRIPT_DIR, "schema.sql")).read()) @@ -500,6 +702,8 @@ def main(): parser.add_argument("--subset-size", type=int, required=True) parser.add_argument("-k", type=int, default=10, help="KNN k (default 10)") parser.add_argument("-n", type=int, default=50, help="number of queries (default 50)") + parser.add_argument("--phase", choices=["build", "query", "both"], default="both", + help="build=build only, query=query existing index, both=default") parser.add_argument("--base-db", default=BASE_DB) parser.add_argument("--ext", default=EXT_PATH) parser.add_argument("-o", "--out-dir", default="runs") @@ -508,55 +712,164 @@ def main(): args = parser.parse_args() os.makedirs(args.out_dir, exist_ok=True) - results_db = args.results_db or os.path.join(args.out_dir, "results.db") + results_db_path = args.results_db or os.path.join(args.out_dir, "results.db") configs = [parse_config(c) for c in args.configs] + results_db = open_results_db(results_db_path) all_results = [] for i, (name, params) in enumerate(configs, 1): reg 
= INDEX_REGISTRY[params["index_type"]] desc = reg["describe"](params) - print(f"\n[{i}/{len(configs)}] {name} ({desc.strip()})") + print(f"\n[{i}/{len(configs)}] {name} ({desc.strip()}) [phase={args.phase}]") - build = build_index( - args.base_db, args.ext, name, params, args.subset_size, args.out_dir - ) - train_str = f" + {build['train_time_s']}s train" if build["train_time_s"] > 0 else "" - print( - f" Build: {build['insert_time_s']}s insert{train_str} " - f"{build['file_size_mb']} MB" - ) + db_path = os.path.join(args.out_dir, f"{name}.{args.subset_size}.db") - print(f" Measuring KNN (k={args.k}, n={args.n})...") - knn = measure_knn( - build["db_path"], args.ext, args.base_db, - params, args.subset_size, k=args.k, n=args.n, - ) - print(f" KNN: mean={knn['mean_ms']}ms recall@{args.k}={knn['recall']}") + if args.phase == "build": + run_id = create_run(results_db, name, params["index_type"], + args.subset_size, "build") + update_run(results_db, run_id, status="inserting") - all_results.append({ - "name": name, - "n_vectors": args.subset_size, - "index_type": params["index_type"], - "config_desc": desc, - "db_path": build["db_path"], - "insert_time_s": build["insert_time_s"], - "train_time_s": build["train_time_s"], - "total_time_s": build["total_time_s"], - "insert_per_vec_ms": build["insert_per_vec_ms"], - "rows": build["rows"], - "file_size_mb": build["file_size_mb"], - "k": args.k, - "n_queries": args.n, - "mean_ms": knn["mean_ms"], - "median_ms": knn["median_ms"], - "p99_ms": knn["p99_ms"], - "total_ms": knn["total_ms"], - "recall": knn["recall"], - }) + build = build_index( + args.base_db, args.ext, name, params, args.subset_size, args.out_dir + ) + train_str = f" + {build['train_time_s']}s train" if build["train_time_s"] > 0 else "" + print( + f" Build: {build['insert_time_s']}s insert{train_str} " + f"{build['file_size_mb']} MB" + ) + update_run(results_db, run_id, + status="built", + db_path=build["db_path"], + insert_time_s=build["insert_time_s"], + 
train_time_s=build["train_time_s"], + total_build_time_s=build["total_time_s"], + rows=build["rows"], + file_size_mb=build["file_size_mb"], + finished_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")) + print(f" Index DB: {build['db_path']}") - print_report(all_results) - save_results(results_db, all_results) - print(f"\nResults saved to {results_db}") + elif args.phase == "query": + if not os.path.exists(db_path): + raise FileNotFoundError( + f"Index DB not found: {db_path}\n" + f"Build it first with: --phase build" + ) + + run_id = create_run(results_db, name, params["index_type"], + args.subset_size, "query", k=args.k, n=args.n) + update_run(results_db, run_id, status="querying") + + pre_hook = reg.get("pre_query_hook") + print(f" Measuring KNN (k={args.k}, n={args.n})...") + knn = measure_knn( + db_path, args.ext, args.base_db, + params, args.subset_size, k=args.k, n=args.n, + pre_query_hook=pre_hook, + ) + print(f" KNN: mean={knn['mean_ms']}ms recall@{args.k}={knn['recall']}") + + qps = round(args.n / (knn["total_ms"] / 1000), 1) if knn["total_ms"] > 0 else 0 + update_run(results_db, run_id, + status="done", + db_path=db_path, + mean_ms=knn["mean_ms"], + median_ms=knn["median_ms"], + p99_ms=knn["p99_ms"], + total_query_ms=knn["total_ms"], + qps=qps, + recall=knn["recall"], + finished_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")) + + file_size_mb = os.path.getsize(db_path) / (1024 * 1024) + all_results.append({ + "name": name, + "n_vectors": args.subset_size, + "index_type": params["index_type"], + "config_desc": desc, + "db_path": db_path, + "insert_time_s": 0, + "train_time_s": 0, + "total_time_s": 0, + "insert_per_vec_ms": 0, + "rows": 0, + "file_size_mb": file_size_mb, + "k": args.k, + "n_queries": args.n, + "mean_ms": knn["mean_ms"], + "median_ms": knn["median_ms"], + "p99_ms": knn["p99_ms"], + "total_ms": knn["total_ms"], + "recall": knn["recall"], + }) + + else: # both + run_id = create_run(results_db, name, 
params["index_type"], + args.subset_size, "both", k=args.k, n=args.n) + update_run(results_db, run_id, status="inserting") + + build = build_index( + args.base_db, args.ext, name, params, args.subset_size, args.out_dir + ) + train_str = f" + {build['train_time_s']}s train" if build["train_time_s"] > 0 else "" + print( + f" Build: {build['insert_time_s']}s insert{train_str} " + f"{build['file_size_mb']} MB" + ) + update_run(results_db, run_id, status="querying", + db_path=build["db_path"], + insert_time_s=build["insert_time_s"], + train_time_s=build["train_time_s"], + total_build_time_s=build["total_time_s"], + rows=build["rows"], + file_size_mb=build["file_size_mb"]) + + print(f" Measuring KNN (k={args.k}, n={args.n})...") + knn = measure_knn( + build["db_path"], args.ext, args.base_db, + params, args.subset_size, k=args.k, n=args.n, + ) + print(f" KNN: mean={knn['mean_ms']}ms recall@{args.k}={knn['recall']}") + + qps = round(args.n / (knn["total_ms"] / 1000), 1) if knn["total_ms"] > 0 else 0 + update_run(results_db, run_id, + status="done", + mean_ms=knn["mean_ms"], + median_ms=knn["median_ms"], + p99_ms=knn["p99_ms"], + total_query_ms=knn["total_ms"], + qps=qps, + recall=knn["recall"], + finished_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")) + + all_results.append({ + "name": name, + "n_vectors": args.subset_size, + "index_type": params["index_type"], + "config_desc": desc, + "db_path": build["db_path"], + "insert_time_s": build["insert_time_s"], + "train_time_s": build["train_time_s"], + "total_time_s": build["total_time_s"], + "insert_per_vec_ms": build["insert_per_vec_ms"], + "rows": build["rows"], + "file_size_mb": build["file_size_mb"], + "k": args.k, + "n_queries": args.n, + "mean_ms": knn["mean_ms"], + "median_ms": knn["median_ms"], + "p99_ms": knn["p99_ms"], + "total_ms": knn["total_ms"], + "recall": knn["recall"], + }) + + results_db.close() + + if all_results: + print_report(all_results) + save_results(results_db_path, all_results) + 
print(f"\nResults saved to {results_db_path}") + elif args.phase == "build": + print(f"\nBuild complete. Results tracked in {results_db_path}") if __name__ == "__main__": diff --git a/benchmarks-ann/schema.sql b/benchmarks-ann/schema.sql index 681df4e..ae8acf3 100644 --- a/benchmarks-ann/schema.sql +++ b/benchmarks-ann/schema.sql @@ -3,6 +3,31 @@ -- "baseline"; index-specific branches add their own types (registered -- via INDEX_REGISTRY in bench.py). +CREATE TABLE IF NOT EXISTS runs ( + run_id INTEGER PRIMARY KEY AUTOINCREMENT, + config_name TEXT NOT NULL, + index_type TEXT NOT NULL, + subset_size INTEGER NOT NULL, + phase TEXT NOT NULL DEFAULT 'both', -- 'build', 'query', or 'both' + status TEXT NOT NULL DEFAULT 'pending', + k INTEGER, + n INTEGER, + db_path TEXT, + insert_time_s REAL, + train_time_s REAL, + total_build_time_s REAL, + rows INTEGER, + file_size_mb REAL, + mean_ms REAL, + median_ms REAL, + p99_ms REAL, + total_query_ms REAL, + qps REAL, + recall REAL, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + finished_at TEXT +); + CREATE TABLE IF NOT EXISTS build_results ( config_name TEXT NOT NULL, index_type TEXT NOT NULL, diff --git a/sqlite-vec-diskann.c b/sqlite-vec-diskann.c new file mode 100644 index 0000000..1a5fd2b --- /dev/null +++ b/sqlite-vec-diskann.c @@ -0,0 +1,1768 @@ +// DiskANN algorithm implementation +// This file is #include'd into sqlite-vec.c — not compiled separately. + +// ============================================================ +// DiskANN node blob encode/decode functions +// ============================================================ + +/** Compute size of validity bitmap in bytes. */ +int diskann_validity_byte_size(int n_neighbors) { + return n_neighbors / CHAR_BIT; +} + +/** Compute size of neighbor IDs blob in bytes. */ +size_t diskann_neighbor_ids_byte_size(int n_neighbors) { + return (size_t)n_neighbors * sizeof(i64); +} + +/** Compute size of quantized vectors blob in bytes. 
*/ +size_t diskann_neighbor_qvecs_byte_size( + int n_neighbors, enum Vec0DiskannQuantizerType quantizer_type, + size_t dimensions) { + return (size_t)n_neighbors * + diskann_quantized_vector_byte_size(quantizer_type, dimensions); +} + +/** + * Create empty blobs for a new DiskANN node (all neighbors invalid). + * Caller must free the returned pointers with sqlite3_free(). + */ +int diskann_node_init( + int n_neighbors, enum Vec0DiskannQuantizerType quantizer_type, + size_t dimensions, + u8 **outValidity, int *outValiditySize, + u8 **outNeighborIds, int *outNeighborIdsSize, + u8 **outNeighborQvecs, int *outNeighborQvecsSize) { + + int validitySize = diskann_validity_byte_size(n_neighbors); + size_t idsSize = diskann_neighbor_ids_byte_size(n_neighbors); + size_t qvecsSize = diskann_neighbor_qvecs_byte_size( + n_neighbors, quantizer_type, dimensions); + + u8 *validity = sqlite3_malloc(validitySize); + u8 *ids = sqlite3_malloc(idsSize); + u8 *qvecs = sqlite3_malloc(qvecsSize); + + if (!validity || !ids || !qvecs) { + sqlite3_free(validity); + sqlite3_free(ids); + sqlite3_free(qvecs); + return SQLITE_NOMEM; + } + + memset(validity, 0, validitySize); + memset(ids, 0, idsSize); + memset(qvecs, 0, qvecsSize); + + *outValidity = validity; *outValiditySize = validitySize; + *outNeighborIds = ids; *outNeighborIdsSize = (int)idsSize; + *outNeighborQvecs = qvecs; *outNeighborQvecsSize = (int)qvecsSize; + return SQLITE_OK; +} + +/** Check if neighbor slot i is valid. */ +int diskann_validity_get(const u8 *validity, int i) { + return (validity[i / CHAR_BIT] >> (i % CHAR_BIT)) & 1; +} + +/** Set neighbor slot i as valid (1) or invalid (0). */ +void diskann_validity_set(u8 *validity, int i, int value) { + if (value) { + validity[i / CHAR_BIT] |= (1 << (i % CHAR_BIT)); + } else { + validity[i / CHAR_BIT] &= ~(1 << (i % CHAR_BIT)); + } +} + +/** Count the number of valid neighbors. 
*/ +int diskann_validity_count(const u8 *validity, int n_neighbors) { + int count = 0; + for (int i = 0; i < n_neighbors; i++) { + count += diskann_validity_get(validity, i); + } + return count; +} + +/** Get the rowid of the neighbor in slot i. */ +i64 diskann_neighbor_id_get(const u8 *neighbor_ids, int i) { + i64 result; + memcpy(&result, neighbor_ids + i * sizeof(i64), sizeof(i64)); + return result; +} + +/** Set the rowid of the neighbor in slot i. */ +void diskann_neighbor_id_set(u8 *neighbor_ids, int i, i64 rowid) { + memcpy(neighbor_ids + i * sizeof(i64), &rowid, sizeof(i64)); +} + +/** Get a pointer to the quantized vector in slot i (read-only). */ +const u8 *diskann_neighbor_qvec_get( + const u8 *qvecs, int i, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions) { + size_t qvec_size = diskann_quantized_vector_byte_size(quantizer_type, dimensions); + return qvecs + (size_t)i * qvec_size; +} + +/** Copy a quantized vector into slot i. */ +void diskann_neighbor_qvec_set( + u8 *qvecs, int i, const u8 *src_qvec, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions) { + size_t qvec_size = diskann_quantized_vector_byte_size(quantizer_type, dimensions); + memcpy(qvecs + (size_t)i * qvec_size, src_qvec, qvec_size); +} + +/** + * Set neighbor slot i with a rowid and quantized vector, and mark as valid. + */ +void diskann_node_set_neighbor( + u8 *validity, u8 *neighbor_ids, u8 *qvecs, int i, + i64 neighbor_rowid, const u8 *neighbor_qvec, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions) { + diskann_validity_set(validity, i, 1); + diskann_neighbor_id_set(neighbor_ids, i, neighbor_rowid); + diskann_neighbor_qvec_set(qvecs, i, neighbor_qvec, quantizer_type, dimensions); +} + +/** + * Clear neighbor slot i (mark invalid, zero out data). 
+ */ +void diskann_node_clear_neighbor( + u8 *validity, u8 *neighbor_ids, u8 *qvecs, int i, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions) { + diskann_validity_set(validity, i, 0); + diskann_neighbor_id_set(neighbor_ids, i, 0); + size_t qvec_size = diskann_quantized_vector_byte_size(quantizer_type, dimensions); + memset(qvecs + (size_t)i * qvec_size, 0, qvec_size); +} + +/** + * Quantize a full-precision float32 vector into the target quantizer format. + * Output buffer must be pre-allocated with diskann_quantized_vector_byte_size() bytes. + */ +int diskann_quantize_vector( + const f32 *src, size_t dimensions, + enum Vec0DiskannQuantizerType quantizer_type, + u8 *out) { + + switch (quantizer_type) { + case VEC0_DISKANN_QUANTIZER_BINARY: { + memset(out, 0, dimensions / CHAR_BIT); + for (size_t i = 0; i < dimensions; i++) { + if (src[i] > 0.0f) { + out[i / CHAR_BIT] |= (1 << (i % CHAR_BIT)); + } + } + return SQLITE_OK; + } + case VEC0_DISKANN_QUANTIZER_INT8: { + f32 step = (1.0f - (-1.0f)) / 255.0f; + for (size_t i = 0; i < dimensions; i++) { + ((i8 *)out)[i] = (i8)(((src[i] - (-1.0f)) / step) - 128.0f); + } + return SQLITE_OK; + } + } + return SQLITE_ERROR; +} + +/** + * Compute approximate distance between a full-precision query vector and a + * quantized neighbor vector. Used during graph traversal. + */ +/** + * Compute distance between a pre-quantized query and a quantized neighbor. + * The caller is responsible for quantizing the query vector once and passing + * the result here for each neighbor comparison. 
+ */ +static f32 diskann_distance_quantized_precomputed( + const u8 *query_quantized, const u8 *quantized_neighbor, + size_t dimensions, + enum Vec0DiskannQuantizerType quantizer_type, + enum Vec0DistanceMetrics distance_metric) { + + switch (quantizer_type) { + case VEC0_DISKANN_QUANTIZER_BINARY: + return distance_hamming(query_quantized, quantized_neighbor, &dimensions); + case VEC0_DISKANN_QUANTIZER_INT8: { + switch (distance_metric) { + case VEC0_DISTANCE_METRIC_L2: + return distance_l2_sqr_int8(query_quantized, quantized_neighbor, &dimensions); + case VEC0_DISTANCE_METRIC_COSINE: + return distance_cosine_int8(query_quantized, quantized_neighbor, &dimensions); + case VEC0_DISTANCE_METRIC_L1: + return (f32)distance_l1_int8(query_quantized, quantized_neighbor, &dimensions); + } + break; + } + } + return FLT_MAX; +} + +/** + * Quantize a float query vector. Returns allocated buffer (caller must free). + */ +static u8 *diskann_quantize_query( + const f32 *query_vector, size_t dimensions, + enum Vec0DiskannQuantizerType quantizer_type) { + size_t qsize = diskann_quantized_vector_byte_size(quantizer_type, dimensions); + u8 *buf = sqlite3_malloc(qsize); + if (!buf) return NULL; + diskann_quantize_vector(query_vector, dimensions, quantizer_type, buf); + return buf; +} + +/** + * Legacy wrapper: quantizes on-the-fly (used by callers that don't pre-quantize). 
/**
 * Legacy wrapper: quantizes on-the-fly (used by callers that don't pre-quantize).
 *
 * Quantizes query_vector into a temporary buffer, computes the quantized
 * distance to quantized_neighbor, then frees the buffer. Returns FLT_MAX if
 * the temporary buffer cannot be allocated.
 *
 * NOTE(review): query_vector is cast to `const f32 *` unconditionally, so
 * this presumably expects a float32 query — confirm against callers.
 */
f32 diskann_distance_quantized(
    const void *query_vector, const u8 *quantized_neighbor,
    size_t dimensions,
    enum Vec0DiskannQuantizerType quantizer_type,
    enum Vec0DistanceMetrics distance_metric) {

  u8 *query_q = diskann_quantize_query((const f32 *)query_vector, dimensions, quantizer_type);
  if (!query_q) return FLT_MAX;
  f32 dist = diskann_distance_quantized_precomputed(
      query_q, quantized_neighbor, dimensions, quantizer_type, distance_metric);
  sqlite3_free(query_q);
  return dist;
}

// ============================================================
// DiskANN medoid / entry point management
// ============================================================

/**
 * Get the current medoid rowid for the given vector column's DiskANN index.
 * Returns SQLITE_OK with *outMedoid set to the medoid rowid.
 * If the graph is empty, returns SQLITE_OK with *outIsEmpty = 1.
 *
 * The medoid is stored in the shadow info table under the key
 * "diskann_medoid_NN" (NN = two-digit column index); a NULL value means the
 * graph is empty, and *outMedoid is not written in that case.
 *
 * Returns SQLITE_NOMEM if the key or SQL text cannot be allocated, the
 * prepare error code if the statement cannot be compiled, and SQLITE_ERROR
 * if stepping does not produce a row (including when the key is missing).
 */
static int diskann_medoid_get(vec0_vtab *p, int vec_col_idx,
                              i64 *outMedoid, int *outIsEmpty) {
  int rc;
  sqlite3_stmt *stmt = NULL;
  char *key = sqlite3_mprintf("diskann_medoid_%02d", vec_col_idx);
  char *zSql = sqlite3_mprintf(
      "SELECT value FROM " VEC0_SHADOW_INFO_NAME " WHERE key = ?",
      p->schemaName, p->tableName);
  if (!key || !zSql) {
    sqlite3_free(key);
    sqlite3_free(zSql);
    return SQLITE_NOMEM;
  }

  rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
  sqlite3_free(zSql);
  if (rc != SQLITE_OK) {
    // key has not been handed to a statement yet, so free it here.
    sqlite3_free(key);
    return rc;
  }

  // sqlite3_free is passed as the destructor: from here on SQLite owns `key`
  // and releases it, so it must not be freed again below.
  sqlite3_bind_text(stmt, 1, key, -1, sqlite3_free);
  rc = sqlite3_step(stmt);
  if (rc == SQLITE_ROW) {
    if (sqlite3_column_type(stmt, 0) == SQLITE_NULL) {
      *outIsEmpty = 1;
    } else {
      *outIsEmpty = 0;
      *outMedoid = sqlite3_column_int64(stmt, 0);
    }
    rc = SQLITE_OK;
  } else {
    // No row (or a step failure): both are reported as a generic error.
    rc = SQLITE_ERROR;
  }
  sqlite3_finalize(stmt);
  return rc;
}

/**
 * Set the medoid rowid for the given vector column's DiskANN index.
 * Pass isEmpty = 1 to mark the graph as empty (NULL medoid).
+ */ +static int diskann_medoid_set(vec0_vtab *p, int vec_col_idx, + i64 medoidRowid, int isEmpty) { + int rc; + sqlite3_stmt *stmt = NULL; + char *key = sqlite3_mprintf("diskann_medoid_%02d", vec_col_idx); + char *zSql = sqlite3_mprintf( + "UPDATE " VEC0_SHADOW_INFO_NAME " SET value = ?2 WHERE key = ?1", + p->schemaName, p->tableName); + if (!key || !zSql) { + sqlite3_free(key); + sqlite3_free(zSql); + return SQLITE_NOMEM; + } + + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) { + sqlite3_free(key); + return rc; + } + + sqlite3_bind_text(stmt, 1, key, -1, sqlite3_free); + if (isEmpty) { + sqlite3_bind_null(stmt, 2); + } else { + sqlite3_bind_int64(stmt, 2, medoidRowid); + } + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR; +} + + +/** + * Called when deleting a vector. If the deleted vector was the medoid, + * pick a new one (the first available vector, or set to empty if none remain). 
+ */ +static int diskann_medoid_handle_delete(vec0_vtab *p, int vec_col_idx, + i64 deletedRowid) { + i64 currentMedoid; + int isEmpty; + int rc = diskann_medoid_get(p, vec_col_idx, ¤tMedoid, &isEmpty); + if (rc != SQLITE_OK) return rc; + + if (!isEmpty && currentMedoid == deletedRowid) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf( + "SELECT rowid FROM " VEC0_SHADOW_VECTORS_N_NAME " WHERE rowid != ?1 LIMIT 1", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + + sqlite3_bind_int64(stmt, 1, deletedRowid); + rc = sqlite3_step(stmt); + if (rc == SQLITE_ROW) { + i64 newMedoid = sqlite3_column_int64(stmt, 0); + sqlite3_finalize(stmt); + return diskann_medoid_set(p, vec_col_idx, newMedoid, 0); + } else { + sqlite3_finalize(stmt); + return diskann_medoid_set(p, vec_col_idx, -1, 1); + } + } + return SQLITE_OK; +} + +// ============================================================ +// DiskANN database I/O helpers +// ============================================================ + +/** + * Read a node's full data from _diskann_nodes. + * Returns blobs that must be freed by the caller with sqlite3_free(). 
+ */ +static int diskann_node_read(vec0_vtab *p, int vec_col_idx, i64 rowid, + u8 **outValidity, int *outValiditySize, + u8 **outNeighborIds, int *outNeighborIdsSize, + u8 **outQvecs, int *outQvecsSize) { + int rc; + if (!p->stmtDiskannNodeRead[vec_col_idx]) { + char *zSql = sqlite3_mprintf( + "SELECT neighbors_validity, neighbor_ids, neighbor_quantized_vectors " + "FROM " VEC0_SHADOW_DISKANN_NODES_N_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, + &p->stmtDiskannNodeRead[vec_col_idx], NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + } + + sqlite3_stmt *stmt = p->stmtDiskannNodeRead[vec_col_idx]; + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, rowid); + + rc = sqlite3_step(stmt); + if (rc != SQLITE_ROW) { + return SQLITE_ERROR; + } + + int vs = sqlite3_column_bytes(stmt, 0); + int is = sqlite3_column_bytes(stmt, 1); + int qs = sqlite3_column_bytes(stmt, 2); + + // Validate blob sizes against config expectations to detect truncated / + // corrupt data before any caller iterates using cfg->n_neighbors. 
+ { + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + struct Vec0DiskannConfig *cfg = &col->diskann; + int expectedVs = diskann_validity_byte_size(cfg->n_neighbors); + int expectedIs = (int)diskann_neighbor_ids_byte_size(cfg->n_neighbors); + int expectedQs = (int)diskann_neighbor_qvecs_byte_size( + cfg->n_neighbors, cfg->quantizer_type, col->dimensions); + if (vs != expectedVs || is != expectedIs || qs != expectedQs) { + return SQLITE_CORRUPT; + } + } + + u8 *v = sqlite3_malloc(vs); + u8 *ids = sqlite3_malloc(is); + u8 *qv = sqlite3_malloc(qs); + if (!v || !ids || !qv) { + sqlite3_free(v); + sqlite3_free(ids); + sqlite3_free(qv); + return SQLITE_NOMEM; + } + + memcpy(v, sqlite3_column_blob(stmt, 0), vs); + memcpy(ids, sqlite3_column_blob(stmt, 1), is); + memcpy(qv, sqlite3_column_blob(stmt, 2), qs); + + *outValidity = v; *outValiditySize = vs; + *outNeighborIds = ids; *outNeighborIdsSize = is; + *outQvecs = qv; *outQvecsSize = qs; + return SQLITE_OK; +} + +/** + * Write (INSERT OR REPLACE) a node's data to _diskann_nodes. 
/**
 * Write (INSERT OR REPLACE) a node's data to _diskann_nodes.
 *
 * The statement is prepared lazily and cached in p->stmtDiskannNodeWrite.
 * The three blobs are bound with SQLITE_TRANSIENT, so SQLite makes its own
 * copies and the caller keeps ownership of the buffers it passed in.
 *
 * Returns SQLITE_OK if the statement runs to completion, SQLITE_NOMEM if the
 * SQL text cannot be allocated, the prepare error code if compilation fails,
 * and SQLITE_ERROR for any other step result.
 */
static int diskann_node_write(vec0_vtab *p, int vec_col_idx, i64 rowid,
                               const u8 *validity, int validitySize,
                               const u8 *neighborIds, int neighborIdsSize,
                               const u8 *qvecs, int qvecsSize) {
  int rc;
  if (!p->stmtDiskannNodeWrite[vec_col_idx]) {
    char *zSql = sqlite3_mprintf(
        "INSERT OR REPLACE INTO " VEC0_SHADOW_DISKANN_NODES_N_NAME
        " (rowid, neighbors_validity, neighbor_ids, neighbor_quantized_vectors) "
        "VALUES (?, ?, ?, ?)",
        p->schemaName, p->tableName, vec_col_idx);
    if (!zSql) return SQLITE_NOMEM;
    rc = sqlite3_prepare_v2(p->db, zSql, -1,
                            &p->stmtDiskannNodeWrite[vec_col_idx], NULL);
    sqlite3_free(zSql);
    if (rc != SQLITE_OK) return rc;
  }

  sqlite3_stmt *stmt = p->stmtDiskannNodeWrite[vec_col_idx];
  // Reset before rebinding: the cached statement may still hold the state of
  // the previous write.
  sqlite3_reset(stmt);
  sqlite3_bind_int64(stmt, 1, rowid);
  sqlite3_bind_blob(stmt, 2, validity, validitySize, SQLITE_TRANSIENT);
  sqlite3_bind_blob(stmt, 3, neighborIds, neighborIdsSize, SQLITE_TRANSIENT);
  sqlite3_bind_blob(stmt, 4, qvecs, qvecsSize, SQLITE_TRANSIENT);

  rc = sqlite3_step(stmt);
  // The specific failure code from sqlite3_step is collapsed to SQLITE_ERROR.
  return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR;
}

/**
 * Read the full-precision vector for a given rowid from _vectors.
 * Caller must free *outVector with sqlite3_free().
+ */ +static int diskann_vector_read(vec0_vtab *p, int vec_col_idx, i64 rowid, + void **outVector, int *outVectorSize) { + int rc; + if (!p->stmtVectorsRead[vec_col_idx]) { + char *zSql = sqlite3_mprintf( + "SELECT vector FROM " VEC0_SHADOW_VECTORS_N_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, + &p->stmtVectorsRead[vec_col_idx], NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + } + + sqlite3_stmt *stmt = p->stmtVectorsRead[vec_col_idx]; + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, rowid); + + rc = sqlite3_step(stmt); + if (rc != SQLITE_ROW) { + return SQLITE_ERROR; + } + + int sz = sqlite3_column_bytes(stmt, 0); + void *vec = sqlite3_malloc(sz); + if (!vec) return SQLITE_NOMEM; + memcpy(vec, sqlite3_column_blob(stmt, 0), sz); + + *outVector = vec; + *outVectorSize = sz; + return SQLITE_OK; +} + +/** + * Write a full-precision vector to _vectors. + */ +static int diskann_vector_write(vec0_vtab *p, int vec_col_idx, i64 rowid, + const void *vector, int vectorSize) { + int rc; + if (!p->stmtVectorsInsert[vec_col_idx]) { + char *zSql = sqlite3_mprintf( + "INSERT OR REPLACE INTO " VEC0_SHADOW_VECTORS_N_NAME + " (rowid, vector) VALUES (?, ?)", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, + &p->stmtVectorsInsert[vec_col_idx], NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + } + + sqlite3_stmt *stmt = p->stmtVectorsInsert[vec_col_idx]; + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, rowid); + sqlite3_bind_blob(stmt, 2, vector, vectorSize, SQLITE_TRANSIENT); + + rc = sqlite3_step(stmt); + return (rc == SQLITE_DONE) ? 
SQLITE_OK : SQLITE_ERROR; +} + +// ============================================================ +// DiskANN search data structures +// ============================================================ + +/** + * A sorted candidate list for greedy beam search. + */ +struct DiskannCandidateList { + struct Vec0DiskannCandidate *items; + int count; + int capacity; +}; + +static int diskann_candidate_list_init(struct DiskannCandidateList *list, int capacity) { + list->items = sqlite3_malloc(capacity * sizeof(struct Vec0DiskannCandidate)); + if (!list->items) return SQLITE_NOMEM; + list->count = 0; + list->capacity = capacity; + return SQLITE_OK; +} + +static void diskann_candidate_list_free(struct DiskannCandidateList *list) { + sqlite3_free(list->items); + list->items = NULL; + list->count = 0; + list->capacity = 0; +} + +/** + * Insert a candidate into the sorted list, maintaining sort order by distance. + * Deduplicates by rowid. If at capacity and new candidate is worse, discards it. + * Returns 1 if inserted, 0 if discarded. 
/**
 * Insert a candidate into the sorted list, maintaining sort order by distance.
 * Deduplicates by rowid. If at capacity and new candidate is worse, discards it.
 * Returns 1 if inserted, 0 if discarded.
 *
 * When rowid is already present the call also returns 1. The stored distance
 * is only replaced when the new one is strictly smaller, and the entry keeps
 * its `visited` flag as it is moved toward the front. Entries inserted fresh
 * start with visited = 0.
 *
 * NOTE(review): a larger distance for an existing rowid is silently ignored,
 * so a re-rank with an exact distance above the approximate one leaves the
 * approximate value in place — confirm this is intended.
 * NOTE(review): with capacity 0 the "worst entry" read below would index
 * items[-1]; presumably callers always pass capacity >= 1 — verify.
 */
static int diskann_candidate_list_insert(
    struct DiskannCandidateList *list, i64 rowid, f32 distance) {

  // Check for duplicate
  for (int i = 0; i < list->count; i++) {
    if (list->items[i].rowid == rowid) {
      // Update distance if better
      if (distance < list->items[i].distance) {
        list->items[i].distance = distance;
        // Re-sort this item into position
        // (a smaller distance can only move it toward index 0, so a single
        // backward insertion-sort pass restores the ordering)
        struct Vec0DiskannCandidate tmp = list->items[i];
        int j = i - 1;
        while (j >= 0 && list->items[j].distance > tmp.distance) {
          list->items[j + 1] = list->items[j];
          j--;
        }
        list->items[j + 1] = tmp;
      }
      return 1;
    }
  }

  // If at capacity, check if new candidate is better than worst
  if (list->count >= list->capacity) {
    // The list is sorted ascending, so the last entry is the worst one.
    if (distance >= list->items[list->count - 1].distance) {
      return 0;  // Discard
    }
    list->count--;  // Make room by dropping the worst
  }

  // Binary search for insertion point
  // (lower bound: first index whose distance is not less than `distance`)
  int lo = 0, hi = list->count;
  while (lo < hi) {
    int mid = (lo + hi) / 2;
    if (list->items[mid].distance < distance) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  }

  // Shift elements to make room
  memmove(&list->items[lo + 1], &list->items[lo],
          (list->count - lo) * sizeof(struct Vec0DiskannCandidate));

  list->items[lo].rowid = rowid;
  list->items[lo].distance = distance;
  list->items[lo].visited = 0;
  list->count++;
  return 1;
}

/**
 * Find the closest unvisited candidate. Returns its index, or -1 if none.
 * Because the list is sorted by distance, the first unvisited entry found by
 * the forward scan is the closest one.
 */
static int diskann_candidate_list_next_unvisited(
    const struct DiskannCandidateList *list) {
  for (int i = 0; i < list->count; i++) {
    if (!list->items[i].visited) return i;
  }
  return -1;
}



/**
 * Simple hash set for tracking visited rowids during search.
 * Uses open addressing with linear probing.
+ */ +struct DiskannVisitedSet { + i64 *slots; + int capacity; + int count; +}; + +static int diskann_visited_set_init(struct DiskannVisitedSet *set, int capacity) { + // Round up to power of 2 + int cap = 16; + while (cap < capacity) cap *= 2; + set->slots = sqlite3_malloc(cap * sizeof(i64)); + if (!set->slots) return SQLITE_NOMEM; + memset(set->slots, 0, cap * sizeof(i64)); + set->capacity = cap; + set->count = 0; + return SQLITE_OK; +} + +static void diskann_visited_set_free(struct DiskannVisitedSet *set) { + sqlite3_free(set->slots); + set->slots = NULL; + set->capacity = 0; + set->count = 0; +} + +static int diskann_visited_set_contains(const struct DiskannVisitedSet *set, i64 rowid) { + if (rowid == 0) return 0; // 0 is our sentinel for empty + int mask = set->capacity - 1; + int idx = (int)(((u64)rowid * 0x9E3779B97F4A7C15ULL) >> 32) & mask; + for (int i = 0; i < set->capacity; i++) { + int slot = (idx + i) & mask; + if (set->slots[slot] == 0) return 0; + if (set->slots[slot] == rowid) return 1; + } + return 0; +} + +static int diskann_visited_set_insert(struct DiskannVisitedSet *set, i64 rowid) { + if (rowid == 0) return 0; + int mask = set->capacity - 1; + int idx = (int)(((u64)rowid * 0x9E3779B97F4A7C15ULL) >> 32) & mask; + for (int i = 0; i < set->capacity; i++) { + int slot = (idx + i) & mask; + if (set->slots[slot] == 0) { + set->slots[slot] = rowid; + set->count++; + return 1; + } + if (set->slots[slot] == rowid) return 0; // Already there + } + return 0; // Full (shouldn't happen with proper sizing) +} + +// ============================================================ +// DiskANN greedy beam search (LM-Search) +// ============================================================ + +/** + * Perform LM-Search: greedy beam search over the DiskANN graph. + * Follows Algorithm 1 from the LM-DiskANN paper. 
/**
 * Perform LM-Search: greedy beam search over the DiskANN graph.
 * Follows Algorithm 1 from the LM-DiskANN paper.
 *
 * Starting from the stored medoid, repeatedly expands the closest unvisited
 * candidate: its neighbors are scored with quantized (approximate) distances
 * and merged into a bounded sorted candidate list, then the expanded node
 * itself is re-scored with its full-precision vector.
 *
 * searchListSize <= 0 selects the column's configured search list size
 * (search_list_size_search if positive, else search_list_size); the list is
 * never smaller than k.
 *
 * On success returns SQLITE_OK and writes up to k results, closest first, to
 * outRowids / outDistances with the count in *outCount (0 for an empty
 * graph). Errors from reading the medoid or its vector, and allocation
 * failures for the working structures, are returned as-is. Nodes whose
 * neighbor blobs cannot be read are skipped rather than failing the search.
 */
static int diskann_search(
    vec0_vtab *p, int vec_col_idx,
    const void *queryVector, size_t dimensions,
    enum VectorElementType elementType,
    int k, int searchListSize,
    i64 *outRowids, f32 *outDistances, int *outCount) {

  struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx];
  struct Vec0DiskannConfig *cfg = &col->diskann;
  int rc;

  if (searchListSize <= 0) {
    searchListSize = cfg->search_list_size_search > 0 ? cfg->search_list_size_search : cfg->search_list_size;
  }
  if (searchListSize < k) {
    searchListSize = k;
  }

  // 1. Get the medoid (entry point)
  i64 medoid;
  int isEmpty;
  rc = diskann_medoid_get(p, vec_col_idx, &medoid, &isEmpty);
  if (rc != SQLITE_OK) return rc;
  if (isEmpty) {
    *outCount = 0;
    return SQLITE_OK;
  }

  // 2. Compute distance from query to medoid using full-precision vector
  void *medoidVector = NULL;
  int medoidVectorSize;
  rc = diskann_vector_read(p, vec_col_idx, medoid, &medoidVector, &medoidVectorSize);
  if (rc != SQLITE_OK) return rc;

  f32 medoidDist = vec0_distance_full(queryVector, medoidVector,
                                       dimensions, elementType,
                                       col->distance_metric);
  sqlite3_free(medoidVector);

  // 3. Initialize candidate list and visited set
  struct DiskannCandidateList candidates;
  rc = diskann_candidate_list_init(&candidates, searchListSize);
  if (rc != SQLITE_OK) return rc;

  struct DiskannVisitedSet visited;
  // NOTE(review): the set is sized at 4x the beam width; the loop below does
  // not check whether an insert into it succeeded — confirm the sizing is
  // sufficient for the number of nodes a search can expand.
  rc = diskann_visited_set_init(&visited, searchListSize * 4);
  if (rc != SQLITE_OK) {
    diskann_candidate_list_free(&candidates);
    return rc;
  }

  // Seed with medoid
  diskann_candidate_list_insert(&candidates, medoid, medoidDist);

  // Pre-quantize query vector once for all quantized distance comparisons
  // (only for float32 queries; a failed allocation leaves this NULL and the
  // loop falls back to quantizing per neighbor)
  u8 *queryQuantized = NULL;
  if (elementType == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
    queryQuantized = diskann_quantize_query(
        (const f32 *)queryVector, dimensions, cfg->quantizer_type);
  }

  // 4. Greedy beam search loop (Algorithm 1 from LM-DiskANN paper)
  while (1) {
    int nextIdx = diskann_candidate_list_next_unvisited(&candidates);
    if (nextIdx < 0) break;

    struct Vec0DiskannCandidate *current = &candidates.items[nextIdx];
    current->visited = 1;
    // Keep the rowid by value: inserts below shift list entries, so `current`
    // must not be dereferenced after this point.
    i64 currentRowid = current->rowid;

    // Read the node's neighbor data
    u8 *validity = NULL, *neighborIds = NULL, *qvecs = NULL;
    int validitySize, neighborIdsSize, qvecsSize;
    rc = diskann_node_read(p, vec_col_idx, currentRowid,
                           &validity, &validitySize,
                           &neighborIds, &neighborIdsSize,
                           &qvecs, &qvecsSize);
    if (rc != SQLITE_OK) {
      continue;  // Skip if node doesn't exist
    }

    // Insert all valid neighbors with approximate (quantized) distances
    for (int i = 0; i < cfg->n_neighbors; i++) {
      if (!diskann_validity_get(validity, i)) continue;

      i64 neighborRowid = diskann_neighbor_id_get(neighborIds, i);

      if (diskann_visited_set_contains(&visited, neighborRowid)) continue;

      const u8 *neighborQvec = diskann_neighbor_qvec_get(
          qvecs, i, cfg->quantizer_type, dimensions);

      f32 approxDist;
      if (queryQuantized) {
        approxDist = diskann_distance_quantized_precomputed(
            queryQuantized, neighborQvec, dimensions,
            cfg->quantizer_type, col->distance_metric);
      } else {
        // NOTE(review): this fallback casts queryVector to float32 inside
        // diskann_distance_quantized; it is also reached for non-float32
        // element types — confirm those are rejected before this point.
        approxDist = diskann_distance_quantized(
            queryVector, neighborQvec, dimensions,
            cfg->quantizer_type, col->distance_metric);
      }

      diskann_candidate_list_insert(&candidates, neighborRowid, approxDist);
    }

    sqlite3_free(validity);
    sqlite3_free(neighborIds);
    sqlite3_free(qvecs);

    // Add to visited set
    diskann_visited_set_insert(&visited, currentRowid);

    // Paper line 13: Re-rank p* using full-precision distance
    // We already have exact distance for medoid; for others, update now
    void *fullVec = NULL;
    int fullVecSize;
    rc = diskann_vector_read(p, vec_col_idx, currentRowid, &fullVec, &fullVecSize);
    if (rc == SQLITE_OK) {
      f32 exactDist = vec0_distance_full(queryVector, fullVec,
                                          dimensions, elementType,
                                          col->distance_metric);
      sqlite3_free(fullVec);
      // Update distance in candidate list and re-sort
      // NOTE(review): if the node was pushed out of the list by the neighbor
      // inserts above, this call may re-add it with visited = 0 — verify.
      diskann_candidate_list_insert(&candidates, currentRowid, exactDist);
    }
  }

  // 5. Output results (candidates are already sorted by distance)
  int resultCount = (candidates.count < k) ? candidates.count : k;
  *outCount = resultCount;
  for (int i = 0; i < resultCount; i++) {
    outRowids[i] = candidates.items[i].rowid;
    outDistances[i] = candidates.items[i].distance;
  }

  sqlite3_free(queryQuantized);
  diskann_candidate_list_free(&candidates);
  diskann_visited_set_free(&visited);
  return SQLITE_OK;
}

// ============================================================
// DiskANN RobustPrune (Algorithm 4 from LM-DiskANN paper)
// ============================================================

/**
 * RobustPrune: Select up to max_neighbors neighbors for node p from a
 * candidate set, using alpha-pruning for diversity.
 *
 * Following Algorithm 4 (LM-Prune):
 *   C = C union N_out(p) \ {p}
 *   N_out(p) = empty
 *   while C not empty:
 *     p* = argmin d(p, c) for c in C
 *     N_out(p).insert(p*)
 *     if |N_out(p)| >= R: break
 *     for each p' in C:
 *       if alpha * d(p*, p') <= d(p, p'): remove p' from C
 */
/**
 * Pure function: given pre-sorted candidates and a distance matrix, select
 * up to max_neighbors using alpha-pruning. inter_distances is a flattened
 * num_candidates x num_candidates matrix where inter_distances[i*num_candidates+j]
 * = d(candidate_i, candidate_j). p_distances[i] = d(p, candidate_i), already sorted.
 * outSelected[i] = 1 if selected. Returns count of selected.
+ */ +int diskann_prune_select( + const f32 *inter_distances, const f32 *p_distances, + int num_candidates, f32 alpha, int max_neighbors, + int *outSelected, int *outCount) { + + if (num_candidates == 0) { + *outCount = 0; + return SQLITE_OK; + } + + u8 *active = sqlite3_malloc(num_candidates); + if (!active) return SQLITE_NOMEM; + memset(active, 1, num_candidates); + memset(outSelected, 0, num_candidates * sizeof(int)); + + int selectedCount = 0; + + for (int round = 0; round < num_candidates && selectedCount < max_neighbors; round++) { + int bestIdx = -1; + for (int i = 0; i < num_candidates; i++) { + if (active[i]) { bestIdx = i; break; } + } + if (bestIdx < 0) break; + + outSelected[bestIdx] = 1; + selectedCount++; + active[bestIdx] = 0; + + for (int i = 0; i < num_candidates; i++) { + if (!active[i]) continue; + f32 dist_best_to_i = inter_distances[bestIdx * num_candidates + i]; + if (alpha * dist_best_to_i <= p_distances[i]) { + active[i] = 0; + } + } + } + + *outCount = selectedCount; + sqlite3_free(active); + return SQLITE_OK; +} + +static int diskann_robust_prune( + vec0_vtab *p, int vec_col_idx, + i64 p_rowid, const void *p_vector, + i64 *candidates, f32 *candidate_distances, int num_candidates, + f32 alpha, int max_neighbors, + i64 *outNeighborRowids, int *outNeighborCount) { + + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + int rc; + + // Remove p itself from candidates + for (int i = 0; i < num_candidates; i++) { + if (candidates[i] == p_rowid) { + candidates[i] = candidates[num_candidates - 1]; + candidate_distances[i] = candidate_distances[num_candidates - 1]; + num_candidates--; + break; + } + } + + if (num_candidates == 0) { + *outNeighborCount = 0; + return SQLITE_OK; + } + + // Sort candidates by distance to p (ascending) - insertion sort + for (int i = 1; i < num_candidates; i++) { + f32 tmpDist = candidate_distances[i]; + i64 tmpRowid = candidates[i]; + int j = i - 1; + while (j >= 0 && candidate_distances[j] > 
tmpDist) { + candidate_distances[j + 1] = candidate_distances[j]; + candidates[j + 1] = candidates[j]; + j--; + } + candidate_distances[j + 1] = tmpDist; + candidates[j + 1] = tmpRowid; + } + + // Active flags + u8 *active = sqlite3_malloc(num_candidates); + if (!active) return SQLITE_NOMEM; + memset(active, 1, num_candidates); + + // Cache full-precision vectors for inter-candidate distance + void **candidateVectors = sqlite3_malloc(num_candidates * sizeof(void *)); + if (!candidateVectors) { + sqlite3_free(active); + return SQLITE_NOMEM; + } + memset(candidateVectors, 0, num_candidates * sizeof(void *)); + + int selectedCount = 0; + + for (int round = 0; round < num_candidates && selectedCount < max_neighbors; round++) { + // Find closest active candidate + int bestIdx = -1; + for (int i = 0; i < num_candidates; i++) { + if (active[i]) { bestIdx = i; break; } + } + if (bestIdx < 0) break; + + // Select this candidate + outNeighborRowids[selectedCount] = candidates[bestIdx]; + selectedCount++; + active[bestIdx] = 0; + + // Load selected candidate's vector + if (!candidateVectors[bestIdx]) { + int vecSize; + rc = diskann_vector_read(p, vec_col_idx, candidates[bestIdx], + &candidateVectors[bestIdx], &vecSize); + if (rc != SQLITE_OK) continue; + } + + // Alpha-prune: remove candidates covered by the selected neighbor + for (int i = 0; i < num_candidates; i++) { + if (!active[i]) continue; + + if (!candidateVectors[i]) { + int vecSize; + rc = diskann_vector_read(p, vec_col_idx, candidates[i], + &candidateVectors[i], &vecSize); + if (rc != SQLITE_OK) continue; + } + + f32 dist_selected_to_i = vec0_distance_full( + candidateVectors[bestIdx], candidateVectors[i], + col->dimensions, col->element_type, col->distance_metric); + + if (alpha * dist_selected_to_i <= candidate_distances[i]) { + active[i] = 0; + } + } + } + + *outNeighborCount = selectedCount; + + for (int i = 0; i < num_candidates; i++) { + sqlite3_free(candidateVectors[i]); + } + 
sqlite3_free(candidateVectors);
+  sqlite3_free(active);
+
+  return SQLITE_OK;
+}
+
+/**
+ * After RobustPrune selects neighbors, build the node blobs and write to DB.
+ * Quantizes each neighbor's vector and packs into the node format.
+ *
+ * Only the first cfg->n_neighbors entries of neighborRowids are used. A
+ * neighbor whose vector cannot be read is skipped; its slot keeps whatever
+ * diskann_node_init() put there. Returns SQLITE_NOMEM, the
+ * diskann_node_init() error, or the result of diskann_node_write().
+ */
+static int diskann_write_pruned_neighbors(
+    vec0_vtab *p, int vec_col_idx, i64 nodeRowid,
+    const i64 *neighborRowids, int neighborCount) {
+
+  struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx];
+  struct Vec0DiskannConfig *cfg = &col->diskann;
+  int rc;
+
+  u8 *validity, *neighborIds, *qvecs;
+  int validitySize, neighborIdsSize, qvecsSize;
+  rc = diskann_node_init(cfg->n_neighbors, cfg->quantizer_type,
+                         col->dimensions,
+                         &validity, &validitySize,
+                         &neighborIds, &neighborIdsSize,
+                         &qvecs, &qvecsSize);
+  if (rc != SQLITE_OK) return rc;
+
+  // Scratch buffer reused for every neighbor's quantized vector.
+  size_t qvecSize = diskann_quantized_vector_byte_size(
+      cfg->quantizer_type, col->dimensions);
+  u8 *qvec = sqlite3_malloc(qvecSize);
+  if (!qvec) {
+    sqlite3_free(validity);
+    sqlite3_free(neighborIds);
+    sqlite3_free(qvecs);
+    return SQLITE_NOMEM;
+  }
+
+  for (int i = 0; i < neighborCount && i < cfg->n_neighbors; i++) {
+    void *neighborVec = NULL;
+    int neighborVecSize;
+    rc = diskann_vector_read(p, vec_col_idx, neighborRowids[i],
+                             &neighborVec, &neighborVecSize);
+    if (rc != SQLITE_OK) continue;  // best effort: leave slot i empty
+
+    if (col->element_type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
+      diskann_quantize_vector((const f32 *)neighborVec, col->dimensions,
+                              cfg->quantizer_type, qvec);
+    } else {
+      // Non-float columns are copied as-is, bounded by the smaller size.
+      memcpy(qvec, neighborVec,
+             qvecSize < (size_t)neighborVecSize ? qvecSize : (size_t)neighborVecSize);
+    }
+
+    diskann_node_set_neighbor(validity, neighborIds, qvecs, i,
+                              neighborRowids[i], qvec,
+                              cfg->quantizer_type, col->dimensions);
+
+    sqlite3_free(neighborVec);
+  }
+  sqlite3_free(qvec);
+
+  rc = diskann_node_write(p, vec_col_idx, nodeRowid,
+                          validity, validitySize,
+                          neighborIds, neighborIdsSize,
+                          qvecs, qvecsSize);
+
+  sqlite3_free(validity);
+  sqlite3_free(neighborIds);
+  sqlite3_free(qvecs);
+  return rc;
+}
+
+// ============================================================
+// DiskANN insert (Algorithm 2 from LM-DiskANN paper)
+// ============================================================
+
+/**
+ * Add a reverse edge: make target_rowid a neighbor of node_rowid.
+ *
+ *  - Target already a neighbor: nothing is written.
+ *  - Free slot available: the target is stored in the first empty slot.
+ *  - Node full: lazy replacement. The farthest current neighbor (by
+ *    quantized distance to the node) is replaced only when the target is
+ *    strictly closer. RobustPrune is deliberately not run on this path.
+ *
+ * target_vector is the target's vector in the column's element type.
+ * Returns SQLITE_OK, SQLITE_NOMEM, or the error from reading the node, reading
+ * the node's vector, or writing the node.
+ */
+static int diskann_add_reverse_edge(
+    vec0_vtab *p, int vec_col_idx,
+    i64 node_rowid, i64 target_rowid, const void *target_vector) {
+
+  struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx];
+  struct Vec0DiskannConfig *cfg = &col->diskann;
+  int rc;
+
+  u8 *validity = NULL, *neighborIds = NULL, *qvecs = NULL;
+  int validitySize, neighborIdsSize, qvecsSize;
+  rc = diskann_node_read(p, vec_col_idx, node_rowid,
+                         &validity, &validitySize,
+                         &neighborIds, &neighborIdsSize,
+                         &qvecs, &qvecsSize);
+  if (rc != SQLITE_OK) return rc;
+
+  int currentCount = diskann_validity_count(validity, cfg->n_neighbors);
+
+  // Check if target is already a neighbor
+  for (int i = 0; i < cfg->n_neighbors; i++) {
+    if (diskann_validity_get(validity, i) &&
+        diskann_neighbor_id_get(neighborIds, i) == target_rowid) {
+      sqlite3_free(validity);
+      sqlite3_free(neighborIds);
+      sqlite3_free(qvecs);
+      return SQLITE_OK;
+    }
+  }
+
+  if (currentCount < cfg->n_neighbors) {
+    // Room available: find first empty slot
+    for (int i = 0; i < cfg->n_neighbors; i++) {
+      if (!diskann_validity_get(validity, i)) {
+        size_t qvecSize = diskann_quantized_vector_byte_size(
+            cfg->quantizer_type, col->dimensions);
+        u8 *qvec = sqlite3_malloc(qvecSize);
+        if (!qvec) {
+          sqlite3_free(validity);
+          sqlite3_free(neighborIds);
+          sqlite3_free(qvecs);
+          return SQLITE_NOMEM;
+        }
+
+        if (col->element_type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
+          diskann_quantize_vector((const f32 *)target_vector, col->dimensions,
+                                  cfg->quantizer_type, qvec);
+        } else {
+          size_t vbs = vector_column_byte_size(*col);
+          memcpy(qvec, target_vector, qvecSize < vbs ? qvecSize : vbs);
+        }
+
+        diskann_node_set_neighbor(validity, neighborIds, qvecs, i,
+                                  target_rowid, qvec,
+                                  cfg->quantizer_type, col->dimensions);
+        sqlite3_free(qvec);
+        break;
+      }
+    }
+
+    rc = diskann_node_write(p, vec_col_idx, node_rowid,
+                            validity, validitySize,
+                            neighborIds, neighborIdsSize,
+                            qvecs, qvecsSize);
+  } else {
+    // Full: lazy replacement — use quantized distances to find the worst
+    // existing neighbor and replace it if target is closer. This avoids
+    // reading all neighbors' float vectors (the expensive RobustPrune path).
+
+    // Quantize the node's vector and the target vector for comparison
+    void *nodeVector = NULL;
+    int nodeVecSize;
+    rc = diskann_vector_read(p, vec_col_idx, node_rowid,
+                             &nodeVector, &nodeVecSize);
+    if (rc != SQLITE_OK) {
+      sqlite3_free(validity);
+      sqlite3_free(neighborIds);
+      sqlite3_free(qvecs);
+      return rc;
+    }
+
+    // Quantize target for node-level comparison
+    size_t qvecSize = diskann_quantized_vector_byte_size(
+        cfg->quantizer_type, col->dimensions);
+    u8 *targetQ = sqlite3_malloc(qvecSize);
+    u8 *nodeQ = sqlite3_malloc(qvecSize);
+    if (!targetQ || !nodeQ) {
+      sqlite3_free(targetQ);
+      sqlite3_free(nodeQ);
+      sqlite3_free(nodeVector);
+      sqlite3_free(validity);
+      sqlite3_free(neighborIds);
+      sqlite3_free(qvecs);
+      return SQLITE_NOMEM;
+    }
+
+    if (col->element_type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
+      diskann_quantize_vector((const f32 *)target_vector, col->dimensions,
+                              cfg->quantizer_type, targetQ);
+      diskann_quantize_vector((const f32 *)nodeVector, col->dimensions,
+                              cfg->quantizer_type, nodeQ);
+    } else {
+      // Non-float columns are copied as-is. Bound each copy by its source
+      // size, exactly like the free-slot path above, so a quantized slot that
+      // is wider than the stored vector never reads past the source buffer.
+      // Zero-fill first so any uncopied tail is deterministic.
+      size_t targetBytes = vector_column_byte_size(*col);
+      size_t nodeBytes = (size_t)nodeVecSize;
+      memset(targetQ, 0, qvecSize);
+      memset(nodeQ, 0, qvecSize);
+      memcpy(targetQ, target_vector,
+             qvecSize < targetBytes ? qvecSize : targetBytes);
+      memcpy(nodeQ, nodeVector, qvecSize < nodeBytes ? qvecSize : nodeBytes);
+    }
+
+    // Compute quantized distance from node to target
+    f32 targetDist = diskann_distance_quantized_precomputed(
+        nodeQ, targetQ, col->dimensions,
+        cfg->quantizer_type, col->distance_metric);
+
+    // Find the worst (farthest) existing neighbor using quantized distances
+    int worstIdx = -1;
+    f32 worstDist = -1.0f;
+    for (int i = 0; i < cfg->n_neighbors; i++) {
+      if (!diskann_validity_get(validity, i)) continue;
+      const u8 *nqvec = diskann_neighbor_qvec_get(
+          qvecs, i, cfg->quantizer_type, col->dimensions);
+      f32 d = diskann_distance_quantized_precomputed(
+          nodeQ, nqvec, col->dimensions,
+          cfg->quantizer_type, col->distance_metric);
+      if (d > worstDist) {
+        worstDist = d;
+        worstIdx = i;
+      }
+    }
+
+    // Replace worst neighbor if target is closer
+    if (worstIdx >= 0 && targetDist < worstDist) {
+      diskann_node_set_neighbor(validity, neighborIds, qvecs, worstIdx,
+                                target_rowid, targetQ,
+                                cfg->quantizer_type, col->dimensions);
+      rc = diskann_node_write(p, vec_col_idx, node_rowid,
+                              validity, validitySize,
+                              neighborIds, neighborIdsSize,
+                              qvecs, qvecsSize);
+    } else {
+      rc = SQLITE_OK;  // target is farther than all existing neighbors, skip
+    }
+
+    sqlite3_free(targetQ);
+    sqlite3_free(nodeQ);
+    sqlite3_free(nodeVector);
+  }
+
+  sqlite3_free(validity);
+  sqlite3_free(neighborIds);
+  sqlite3_free(qvecs);
+  return rc;
+}
+
+// ============================================================
+// DiskANN buffer helpers (for batched inserts)
+// ============================================================
+
+/**
+ * Insert a vector into the _diskann_buffer table.
+ */
+static int diskann_buffer_write(vec0_vtab *p, int vec_col_idx,
+                                i64 rowid, const void *vector, int vectorSize) {
+  sqlite3_stmt *stmt = NULL;
+  char *zSql = sqlite3_mprintf(
+      "INSERT INTO " VEC0_SHADOW_DISKANN_BUFFER_N_NAME
+      " (rowid, vector) VALUES (?, ?)",
+      p->schemaName, p->tableName, vec_col_idx);
+  if (!zSql) return SQLITE_NOMEM;
+  int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
+  sqlite3_free(zSql);
+  if (rc != SQLITE_OK) return rc;
+  sqlite3_bind_int64(stmt, 1, rowid);
+  // SQLITE_STATIC: the blob is not copied, so `vector` must stay valid until
+  // the statement is finalized below.
+  sqlite3_bind_blob(stmt, 2, vector, vectorSize, SQLITE_STATIC);
+  rc = sqlite3_step(stmt);
+  sqlite3_finalize(stmt);
+  return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR;
+}
+
+/**
+ * Delete a vector from the _diskann_buffer table.
+ * Returns SQLITE_OK when the statement completes, SQLITE_ERROR when stepping
+ * fails, SQLITE_NOMEM or the prepare error otherwise.
+ */
+static int diskann_buffer_delete(vec0_vtab *p, int vec_col_idx, i64 rowid) {
+  sqlite3_stmt *stmt = NULL;
+  char *zSql = sqlite3_mprintf(
+      "DELETE FROM " VEC0_SHADOW_DISKANN_BUFFER_N_NAME " WHERE rowid = ?",
+      p->schemaName, p->tableName, vec_col_idx);
+  if (!zSql) return SQLITE_NOMEM;
+  int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
+  sqlite3_free(zSql);
+  if (rc != SQLITE_OK) return rc;
+  sqlite3_bind_int64(stmt, 1, rowid);
+  rc = sqlite3_step(stmt);
+  sqlite3_finalize(stmt);
+  return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR;
+}
+
+/**
+ * Check if a rowid exists in the _diskann_buffer table.
+ * Returns SQLITE_OK and sets *exists to 1 if found, 0 if not.
+ * If the lookup itself fails, *exists is 0 and the SQLite error code is
+ * returned, so callers never mistake a failed lookup for "not buffered".
+ */
+static int diskann_buffer_exists(vec0_vtab *p, int vec_col_idx,
+                                 i64 rowid, int *exists) {
+  sqlite3_stmt *stmt = NULL;
+  char *zSql = sqlite3_mprintf(
+      "SELECT 1 FROM " VEC0_SHADOW_DISKANN_BUFFER_N_NAME " WHERE rowid = ?",
+      p->schemaName, p->tableName, vec_col_idx);
+  if (!zSql) return SQLITE_NOMEM;
+  int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
+  sqlite3_free(zSql);
+  if (rc != SQLITE_OK) return rc;
+  sqlite3_bind_int64(stmt, 1, rowid);
+  rc = sqlite3_step(stmt);
+  *exists = (rc == SQLITE_ROW) ? 1 : 0;
+  // sqlite3_finalize() returns SQLITE_OK after SQLITE_ROW / SQLITE_DONE and
+  // the step's error code when the most recent step failed.
+  return sqlite3_finalize(stmt);
+}
+
+/**
+ * Get the count of rows in the _diskann_buffer table.
+ * Returns SQLITE_OK with *count set, or SQLITE_ERROR if no row is produced.
+ */
+static int diskann_buffer_count(vec0_vtab *p, int vec_col_idx, i64 *count) {
+  sqlite3_stmt *stmt = NULL;
+  char *zSql = sqlite3_mprintf(
+      "SELECT count(*) FROM " VEC0_SHADOW_DISKANN_BUFFER_N_NAME,
+      p->schemaName, p->tableName, vec_col_idx);
+  if (!zSql) return SQLITE_NOMEM;
+  int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
+  sqlite3_free(zSql);
+  if (rc != SQLITE_OK) return rc;
+  rc = sqlite3_step(stmt);
+  if (rc == SQLITE_ROW) {
+    *count = sqlite3_column_int64(stmt, 0);
+    sqlite3_finalize(stmt);
+    return SQLITE_OK;
+  }
+  sqlite3_finalize(stmt);
+  return SQLITE_ERROR;
+}
+
+// Forward declaration: diskann_insert_graph does the actual graph insertion
+static int diskann_insert_graph(vec0_vtab *p, int vec_col_idx,
+                                i64 rowid, const void *vector);
+
+/**
+ * Flush all buffered vectors into the DiskANN graph.
+ * Iterates over _diskann_buffer rows and calls diskann_insert_graph for each.
+ * The buffer is cleared only after every row was read and inserted; on any
+ * failure the buffer is left untouched and the error is returned.
+ */
+static int diskann_flush_buffer(vec0_vtab *p, int vec_col_idx) {
+  sqlite3_stmt *stmt = NULL;
+  char *zSql = sqlite3_mprintf(
+      "SELECT rowid, vector FROM " VEC0_SHADOW_DISKANN_BUFFER_N_NAME,
+      p->schemaName, p->tableName, vec_col_idx);
+  if (!zSql) return SQLITE_NOMEM;
+  int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
+  sqlite3_free(zSql);
+  if (rc != SQLITE_OK) return rc;
+
+  while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) {
+    i64 rowid = sqlite3_column_int64(stmt, 0);
+    const void *vector = sqlite3_column_blob(stmt, 1);
+    // Note: vector is already written to _vectors table, so
+    // diskann_insert_graph will skip re-writing it (vector already exists).
+    // We call the graph-only insert path.
+    int insertRc = diskann_insert_graph(p, vec_col_idx, rowid, vector);
+    if (insertRc != SQLITE_OK) {
+      sqlite3_finalize(stmt);
+      return insertRc;
+    }
+  }
+  // The loop also ends when sqlite3_step() fails. sqlite3_finalize() reports
+  // that failure; bail out so rows that were never inserted into the graph
+  // are not deleted from the buffer below.
+  rc = sqlite3_finalize(stmt);
+  if (rc != SQLITE_OK) return rc;
+
+  // Clear the buffer
+  zSql = sqlite3_mprintf(
+      "DELETE FROM " VEC0_SHADOW_DISKANN_BUFFER_N_NAME,
+      p->schemaName, p->tableName, vec_col_idx);
+  if (!zSql) return SQLITE_NOMEM;
+  rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
+  sqlite3_free(zSql);
+  if (rc != SQLITE_OK) return rc;
+  rc = sqlite3_step(stmt);
+  sqlite3_finalize(stmt);
+  return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR;
+}
+
+/**
+ * Insert a new vector into the DiskANN graph (graph-only path).
+ * The vector must already be written to _vectors table.
+ * This is the core graph insertion logic (Algorithm 2: LM-Insert).
+ *
+ * Steps: if the graph is empty, write an empty node and make it the medoid;
+ * otherwise search for candidates, RobustPrune them down to at most
+ * n_neighbors, write this node, then add reverse edges on each selected
+ * neighbor. Returns SQLITE_OK, SQLITE_NOMEM, or the first failing step's code.
+ */
+static int diskann_insert_graph(vec0_vtab *p, int vec_col_idx,
+                                i64 rowid, const void *vector) {
+  struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx];
+  struct Vec0DiskannConfig *cfg = &col->diskann;
+  int rc;
+
+  // Handle first insert (empty graph)
+  i64 medoid;
+  int isEmpty;
+  rc = diskann_medoid_get(p, vec_col_idx, &medoid, &isEmpty);
+  if (rc != SQLITE_OK) return rc;
+
+  if (isEmpty) {
+    u8 *validity, *neighborIds, *qvecs;
+    int validitySize, neighborIdsSize, qvecsSize;
+    rc = diskann_node_init(cfg->n_neighbors, cfg->quantizer_type,
+                           col->dimensions,
+                           &validity, &validitySize,
+                           &neighborIds, &neighborIdsSize,
+                           &qvecs, &qvecsSize);
+    if (rc != SQLITE_OK) return rc;
+
+    rc = diskann_node_write(p, vec_col_idx, rowid,
+                            validity, validitySize,
+                            neighborIds, neighborIdsSize,
+                            qvecs, qvecsSize);
+    sqlite3_free(validity);
+    sqlite3_free(neighborIds);
+    sqlite3_free(qvecs);
+    if (rc != SQLITE_OK) return rc;
+
+    return diskann_medoid_set(p, vec_col_idx, rowid, 0);
+  }
+
+  // Search for nearest neighbors
+  int L = cfg->search_list_size_insert > 0 ? cfg->search_list_size_insert : cfg->search_list_size;
+  // sqlite3_malloc() takes an int byte count. L is user-configurable, so
+  // refuse sizes whose byte count would not fit instead of letting the
+  // multiplication truncate into a too-small allocation.
+  if (L < 1 || (size_t)L > (size_t)0x7fffffff / sizeof(i64)) {
+    return SQLITE_NOMEM;
+  }
+  i64 *searchRowids = sqlite3_malloc(L * sizeof(i64));
+  f32 *searchDistances = sqlite3_malloc(L * sizeof(f32));
+  if (!searchRowids || !searchDistances) {
+    sqlite3_free(searchRowids);
+    sqlite3_free(searchDistances);
+    return SQLITE_NOMEM;
+  }
+
+  int searchCount;
+  rc = diskann_search(p, vec_col_idx, vector, col->dimensions,
+                      col->element_type, L, L,
+                      searchRowids, searchDistances, &searchCount);
+  if (rc != SQLITE_OK) {
+    sqlite3_free(searchRowids);
+    sqlite3_free(searchDistances);
+    return rc;
+  }
+
+  // RobustPrune to select neighbors for x
+  i64 *selectedNeighbors = sqlite3_malloc(cfg->n_neighbors * sizeof(i64));
+  int selectedCount = 0;
+  if (!selectedNeighbors) {
+    sqlite3_free(searchRowids);
+    sqlite3_free(searchDistances);
+    return SQLITE_NOMEM;
+  }
+
+  rc = diskann_robust_prune(p, vec_col_idx, rowid, vector,
+                            searchRowids, searchDistances, searchCount,
+                            cfg->alpha, cfg->n_neighbors,
+                            selectedNeighbors, &selectedCount);
+  sqlite3_free(searchRowids);
+  sqlite3_free(searchDistances);
+  if (rc != SQLITE_OK) {
+    sqlite3_free(selectedNeighbors);
+    return rc;
+  }
+
+  // Write x's node with selected neighbors
+  rc = diskann_write_pruned_neighbors(p, vec_col_idx, rowid,
+                                      selectedNeighbors, selectedCount);
+  if (rc != SQLITE_OK) {
+    sqlite3_free(selectedNeighbors);
+    return rc;
+  }
+
+  // Add bidirectional edges
+  for (int i = 0; i < selectedCount; i++) {
+    // NOTE(review): reverse-edge failures are ignored here, which looks like
+    // a deliberate best effort (the forward edges are already written) —
+    // confirm before propagating these errors.
+    (void)diskann_add_reverse_edge(p, vec_col_idx,
+                                   selectedNeighbors[i], rowid, vector);
+  }
+
+  sqlite3_free(selectedNeighbors);
+  return SQLITE_OK;
+}
+
+/**
+ * Insert a new vector into the DiskANN index (Algorithm 2: LM-Insert).
+ * When buffer_threshold > 0, vectors are buffered and flushed in batch.
+ */ +static int diskann_insert(vec0_vtab *p, int vec_col_idx, + i64 rowid, const void *vector) { + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + struct Vec0DiskannConfig *cfg = &col->diskann; + int rc; + size_t vectorSize = vector_column_byte_size(*col); + + // 1. Write full-precision vector to _vectors table (always needed for queries) + rc = diskann_vector_write(p, vec_col_idx, rowid, vector, (int)vectorSize); + if (rc != SQLITE_OK) return rc; + + // 2. If buffering is enabled, write to buffer instead of graph + if (cfg->buffer_threshold > 0) { + rc = diskann_buffer_write(p, vec_col_idx, rowid, vector, (int)vectorSize); + if (rc != SQLITE_OK) return rc; + + i64 count; + rc = diskann_buffer_count(p, vec_col_idx, &count); + if (rc != SQLITE_OK) return rc; + + if (count >= cfg->buffer_threshold) { + return diskann_flush_buffer(p, vec_col_idx); + } + return SQLITE_OK; + } + + // 3. Legacy per-row insert directly into graph + return diskann_insert_graph(p, vec_col_idx, rowid, vector); +} + +/** + * Returns 1 if ALL vector columns in this table are DiskANN-indexed. + */ +// ============================================================ +// DiskANN delete (Algorithm 3 from LM-DiskANN paper) +// ============================================================ + +static int diskann_node_delete(vec0_vtab *p, int vec_col_idx, i64 rowid) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf( + "DELETE FROM " VEC0_SHADOW_DISKANN_NODES_N_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + sqlite3_bind_int64(stmt, 1, rowid); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + return (rc == SQLITE_DONE) ? 
SQLITE_OK : SQLITE_ERROR; +} + +static int diskann_vector_delete(vec0_vtab *p, int vec_col_idx, i64 rowid) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf( + "DELETE FROM " VEC0_SHADOW_VECTORS_N_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + sqlite3_bind_int64(stmt, 1, rowid); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR; +} + +/** + * Repair graph after deleting a node. Following Algorithm 3 (LM-Delete): + * For each neighbor n of the deleted node, add deleted node's other neighbors + * to n's candidate set, then remove the deleted node from n's neighbor list. + * Uses simple slot replacement rather than full RobustPrune for performance. + */ +static int diskann_repair_reverse_edges( + vec0_vtab *p, int vec_col_idx, i64 deleted_rowid, + const i64 *deleted_neighbors, int deleted_neighbor_count) { + + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + struct Vec0DiskannConfig *cfg = &col->diskann; + int rc; + + // For each neighbor of the deleted node, fix their neighbor list + for (int dn = 0; dn < deleted_neighbor_count; dn++) { + i64 nodeRowid = deleted_neighbors[dn]; + + u8 *validity = NULL, *neighborIds = NULL, *qvecs = NULL; + int vs, nis, qs; + rc = diskann_node_read(p, vec_col_idx, nodeRowid, + &validity, &vs, &neighborIds, &nis, &qvecs, &qs); + if (rc != SQLITE_OK) continue; + + // Find and clear the deleted node's slot + int clearedSlot = -1; + for (int i = 0; i < cfg->n_neighbors; i++) { + if (diskann_validity_get(validity, i) && + diskann_neighbor_id_get(neighborIds, i) == deleted_rowid) { + diskann_node_clear_neighbor(validity, neighborIds, qvecs, i, + cfg->quantizer_type, col->dimensions); + clearedSlot = i; + break; + } + } + + if (clearedSlot >= 0) { + // Try to fill the cleared 
slot with one of the deleted node's other neighbors + for (int di = 0; di < deleted_neighbor_count; di++) { + i64 candidate = deleted_neighbors[di]; + if (candidate == nodeRowid || candidate == deleted_rowid) continue; + + // Check not already a neighbor + int alreadyNeighbor = 0; + for (int ni = 0; ni < cfg->n_neighbors; ni++) { + if (diskann_validity_get(validity, ni) && + diskann_neighbor_id_get(neighborIds, ni) == candidate) { + alreadyNeighbor = 1; + break; + } + } + if (alreadyNeighbor) continue; + + // Load, quantize, and set + void *candidateVec = NULL; + int cvs; + rc = diskann_vector_read(p, vec_col_idx, candidate, &candidateVec, &cvs); + if (rc != SQLITE_OK) continue; + + size_t qvecSize = diskann_quantized_vector_byte_size( + cfg->quantizer_type, col->dimensions); + u8 *qvec = sqlite3_malloc(qvecSize); + if (qvec) { + if (col->element_type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) { + diskann_quantize_vector((const f32 *)candidateVec, col->dimensions, + cfg->quantizer_type, qvec); + } else { + memcpy(qvec, candidateVec, + qvecSize < (size_t)cvs ? qvecSize : (size_t)cvs); + } + diskann_node_set_neighbor(validity, neighborIds, qvecs, clearedSlot, + candidate, qvec, + cfg->quantizer_type, col->dimensions); + sqlite3_free(qvec); + } + sqlite3_free(candidateVec); + break; + } + + diskann_node_write(p, vec_col_idx, nodeRowid, + validity, vs, neighborIds, nis, qvecs, qs); + } + + sqlite3_free(validity); + sqlite3_free(neighborIds); + sqlite3_free(qvecs); + } + + return SQLITE_OK; +} + +/** + * Delete a vector from the DiskANN graph (Algorithm 3: LM-Delete). + * If the vector is in the buffer (not yet flushed), just remove from buffer. 
+ */
+static int diskann_delete(vec0_vtab *p, int vec_col_idx, i64 rowid) {
+  struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx];
+  struct Vec0DiskannConfig *cfg = &col->diskann;
+  int rc;
+
+  // Check if this rowid is in the buffer (not yet in graph)
+  if (cfg->buffer_threshold > 0) {
+    int inBuffer = 0;
+    rc = diskann_buffer_exists(p, vec_col_idx, rowid, &inBuffer);
+    if (rc != SQLITE_OK) return rc;
+    if (inBuffer) {
+      // Just remove from buffer and _vectors, no graph repair needed
+      rc = diskann_buffer_delete(p, vec_col_idx, rowid);
+      if (rc == SQLITE_OK) {
+        rc = diskann_vector_delete(p, vec_col_idx, rowid);
+      }
+      return rc;
+    }
+  }
+
+  // 1. Read the node to get its neighbor list
+  u8 *delValidity = NULL, *delNeighborIds = NULL, *delQvecs = NULL;
+  int dvs, dnis, dqs;
+  rc = diskann_node_read(p, vec_col_idx, rowid,
+                         &delValidity, &dvs, &delNeighborIds, &dnis,
+                         &delQvecs, &dqs);
+  if (rc != SQLITE_OK) {
+    // NOTE(review): every read failure is treated as "no such node", and the
+    // _vectors row is left in place on this path — confirm that is intended.
+    return SQLITE_OK;  // Node doesn't exist, nothing to do
+  }
+
+  // Copy out the valid neighbor rowids so the node blobs can be released
+  // before the repair pass.
+  i64 *deletedNeighbors = sqlite3_malloc(cfg->n_neighbors * sizeof(i64));
+  int deletedNeighborCount = 0;
+  if (!deletedNeighbors) {
+    sqlite3_free(delValidity);
+    sqlite3_free(delNeighborIds);
+    sqlite3_free(delQvecs);
+    return SQLITE_NOMEM;
+  }
+
+  for (int i = 0; i < cfg->n_neighbors; i++) {
+    if (diskann_validity_get(delValidity, i)) {
+      deletedNeighbors[deletedNeighborCount++] =
+          diskann_neighbor_id_get(delNeighborIds, i);
+    }
+  }
+
+  sqlite3_free(delValidity);
+  sqlite3_free(delNeighborIds);
+  sqlite3_free(delQvecs);
+
+  // 2. Repair reverse edges
+  rc = diskann_repair_reverse_edges(p, vec_col_idx, rowid,
+                                    deletedNeighbors, deletedNeighborCount);
+  sqlite3_free(deletedNeighbors);
+
+  // 3. Delete node and vector
+  if (rc == SQLITE_OK) {
+    rc = diskann_node_delete(p, vec_col_idx, rowid);
+  }
+  if (rc == SQLITE_OK) {
+    rc = diskann_vector_delete(p, vec_col_idx, rowid);
+  }
+
+  // 4. Handle medoid deletion
+  if (rc == SQLITE_OK) {
+    rc = diskann_medoid_handle_delete(p, vec_col_idx, rowid);
+  }
+
+  return rc;
+}
+
+/**
+ * Returns 1 if ALL vector columns in this table are DiskANN-indexed,
+ * 0 otherwise (including when the table has no vector columns).
+ */
+static int vec0_all_columns_diskann(vec0_vtab *p) {
+  for (int i = 0; i < p->numVectorColumns; i++) {
+    if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_DISKANN) return 0;
+  }
+  return p->numVectorColumns > 0;
+}
+
+// ============================================================================
+// Command dispatch
+// ============================================================================
+
+/**
+ * Parse a decimal integer >= 1 that fits in a 32-bit int.
+ * Accepts what atoi() callers could reasonably have relied on (leading
+ * blanks, an optional '+', trailing blanks) but rejects empty input, trailing
+ * garbage such as "12abc", and values that would overflow.
+ * Returns 1 and stores the value in *out on success, 0 otherwise.
+ */
+static int diskann_parse_positive_int(const char *z, int *out) {
+  i64 v = 0;
+  int nDigits = 0;
+  while (*z == ' ' || *z == '\t') z++;
+  if (*z == '+') z++;
+  for (; *z >= '0' && *z <= '9'; z++, nDigits++) {
+    v = v * 10 + (*z - '0');
+    if (v > 0x7fffffff) return 0;  // stop before i64 could overflow
+  }
+  while (*z == ' ' || *z == '\t') z++;
+  if (nDigits == 0 || *z != '\0' || v < 1) return 0;
+  *out = (int)v;
+  return 1;
+}
+
+/**
+ * Apply a runtime "key=value" tuning command to the DiskANN configuration.
+ *
+ * Recognized keys: search_list_size_search, search_list_size_insert and
+ * search_list_size; each value must be an integer >= 1. The command is
+ * applied to the first DiskANN-indexed vector column only.
+ *
+ * Returns SQLITE_OK when applied, SQLITE_ERROR (with the vtab error message
+ * set) for an invalid value, and SQLITE_EMPTY when the table has no DiskANN
+ * column or the command is not one of the keys above.
+ */
+static int diskann_handle_command(vec0_vtab *p, const char *command) {
+  int col_idx = -1;
+  for (int i = 0; i < p->numVectorColumns; i++) {
+    if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_DISKANN) { col_idx = i; break; }
+  }
+  if (col_idx < 0) return SQLITE_EMPTY;
+
+  struct Vec0DiskannConfig *cfg = &p->vector_columns[col_idx].diskann;
+
+  // The two longer keys are tested first: "search_list_size=" is not a prefix
+  // of them, but keeping the specific keys first makes that obvious.
+  if (strncmp(command, "search_list_size_search=", 24) == 0) {
+    int val;
+    if (!diskann_parse_positive_int(command + 24, &val)) {
+      vtab_set_error(&p->base, "search_list_size_search must be >= 1");
+      return SQLITE_ERROR;
+    }
+    cfg->search_list_size_search = val;
+    return SQLITE_OK;
+  }
+  if (strncmp(command, "search_list_size_insert=", 24) == 0) {
+    int val;
+    if (!diskann_parse_positive_int(command + 24, &val)) {
+      vtab_set_error(&p->base, "search_list_size_insert must be >= 1");
+      return SQLITE_ERROR;
+    }
+    cfg->search_list_size_insert = val;
+    return SQLITE_OK;
+  }
+  if (strncmp(command, "search_list_size=", 17) == 0) {
+    int val;
+    if (!diskann_parse_positive_int(command + 17, &val)) {
+      vtab_set_error(&p->base, "search_list_size must be >= 1");
+      return SQLITE_ERROR;
+    }
+    cfg->search_list_size = val;
+    return SQLITE_OK;
+  }
+  return SQLITE_EMPTY;
+}
+
+#ifdef SQLITE_VEC_TEST
+// Expose internal DiskANN data structures and functions for unit testing.
+ +int _test_diskann_candidate_list_init(struct DiskannCandidateList *list, int capacity) { + return diskann_candidate_list_init(list, capacity); +} +void _test_diskann_candidate_list_free(struct DiskannCandidateList *list) { + diskann_candidate_list_free(list); +} +int _test_diskann_candidate_list_insert(struct DiskannCandidateList *list, long long rowid, float distance) { + return diskann_candidate_list_insert(list, (i64)rowid, (f32)distance); +} +int _test_diskann_candidate_list_next_unvisited(const struct DiskannCandidateList *list) { + return diskann_candidate_list_next_unvisited(list); +} +int _test_diskann_candidate_list_count(const struct DiskannCandidateList *list) { + return list->count; +} +long long _test_diskann_candidate_list_rowid(const struct DiskannCandidateList *list, int i) { + return (long long)list->items[i].rowid; +} +float _test_diskann_candidate_list_distance(const struct DiskannCandidateList *list, int i) { + return (float)list->items[i].distance; +} +void _test_diskann_candidate_list_set_visited(struct DiskannCandidateList *list, int i) { + list->items[i].visited = 1; +} + +int _test_diskann_visited_set_init(struct DiskannVisitedSet *set, int capacity) { + return diskann_visited_set_init(set, capacity); +} +void _test_diskann_visited_set_free(struct DiskannVisitedSet *set) { + diskann_visited_set_free(set); +} +int _test_diskann_visited_set_contains(const struct DiskannVisitedSet *set, long long rowid) { + return diskann_visited_set_contains(set, (i64)rowid); +} +int _test_diskann_visited_set_insert(struct DiskannVisitedSet *set, long long rowid) { + return diskann_visited_set_insert(set, (i64)rowid); +} +#endif /* SQLITE_VEC_TEST */ + diff --git a/sqlite-vec-rescore.c b/sqlite-vec-rescore.c index a45f52f..ef4e692 100644 --- a/sqlite-vec-rescore.c +++ b/sqlite-vec-rescore.c @@ -156,21 +156,11 @@ static void rescore_quantize_float_to_bit(const float *src, uint8_t *dst, static void rescore_quantize_float_to_int8(const float *src, int8_t 
*dst, size_t dimensions) { - float vmin = src[0], vmax = src[0]; - for (size_t i = 1; i < dimensions; i++) { - if (src[i] < vmin) vmin = src[i]; - if (src[i] > vmax) vmax = src[i]; - } - float range = vmax - vmin; - if (range == 0.0f) { - memset(dst, 0, dimensions); - return; - } - float scale = 255.0f / range; + float step = 2.0f / 255.0f; for (size_t i = 0; i < dimensions; i++) { - float v = (src[i] - vmin) * scale - 128.0f; - if (v < -128.0f) v = -128.0f; - if (v > 127.0f) v = 127.0f; + float v = (src[i] - (-1.0f)) / step - 128.0f; + if (!(v <= 127.0f)) v = 127.0f; + if (!(v >= -128.0f)) v = -128.0f; dst[i] = (int8_t)v; } } diff --git a/sqlite-vec.c b/sqlite-vec.c index 015792b..5ca7834 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -61,6 +61,10 @@ SQLITE_EXTENSION_INIT1 #define LONGDOUBLE_TYPE long double #endif +#ifndef SQLITE_VEC_ENABLE_DISKANN +#define SQLITE_VEC_ENABLE_DISKANN 1 +#endif + #ifndef _WIN32 #ifndef __EMSCRIPTEN__ #ifndef __COSMOPOLITAN__ @@ -2544,6 +2548,7 @@ enum Vec0IndexType { VEC0_INDEX_TYPE_RESCORE = 2, #endif VEC0_INDEX_TYPE_IVF = 3, + VEC0_INDEX_TYPE_DISKANN = 4, }; #if SQLITE_VEC_ENABLE_RESCORE @@ -2575,6 +2580,75 @@ struct Vec0IvfConfig { struct Vec0IvfConfig { char _unused; }; #endif +// ============================================================ +// DiskANN types and constants +// ============================================================ + +#define VEC0_DISKANN_DEFAULT_N_NEIGHBORS 72 +#define VEC0_DISKANN_MAX_N_NEIGHBORS 256 +#define VEC0_DISKANN_DEFAULT_SEARCH_LIST_SIZE 128 +#define VEC0_DISKANN_DEFAULT_ALPHA 1.2f + +/** + * Quantizer type used for compressing neighbor vectors in the DiskANN graph. + */ +enum Vec0DiskannQuantizerType { + VEC0_DISKANN_QUANTIZER_BINARY = 1, // 1 bit per dimension (1/32 compression) + VEC0_DISKANN_QUANTIZER_INT8 = 2, // 1 byte per dimension (1/4 compression) +}; + +/** + * Configuration for a DiskANN index on a single vector column. 
+ * Parsed from `INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=72)`. + */ +struct Vec0DiskannConfig { + // Quantizer type for neighbor vectors + enum Vec0DiskannQuantizerType quantizer_type; + + // Maximum number of neighbors per node (R in the paper). Must be divisible by 8. + int n_neighbors; + + // Search list size (L in the paper) — unified default for both insert and query. + int search_list_size; + + // Per-path overrides (0 = fall back to search_list_size). + int search_list_size_search; + int search_list_size_insert; + + // Alpha parameter for RobustPrune (distance scaling factor, typically 1.0-1.5) + f32 alpha; + + // Buffer threshold for batched inserts. When > 0, inserts go into a flat + // buffer table and are flushed into the graph when the buffer reaches this + // size. 0 = disabled (legacy per-row insert behavior). + int buffer_threshold; +}; + +/** + * Represents a single candidate during greedy beam search. + * Used in priority queues / sorted arrays during LM-Search. + */ +struct Vec0DiskannCandidate { + i64 rowid; + f32 distance; + int visited; // 1 if this candidate's neighbors have been explored +}; + +/** + * Returns the byte size of a quantized vector for the given quantizer type + * and number of dimensions. 
+ */
+size_t diskann_quantized_vector_byte_size(
+    enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions) {
+  switch (quantizer_type) {
+    case VEC0_DISKANN_QUANTIZER_BINARY:
+      return dimensions / CHAR_BIT;  // 1 bit per dimension
+    case VEC0_DISKANN_QUANTIZER_INT8:
+      return dimensions * sizeof(i8);  // 1 byte per dimension
+  }
+  return 0;
+}
+
 struct VectorColumnDefinition {
   char *name;
   int name_length;
@@ -2586,6 +2660,7 @@ struct VectorColumnDefinition {
   struct Vec0RescoreConfig rescore;
 #endif
   struct Vec0IvfConfig ivf;
+  struct Vec0DiskannConfig diskann;
 };
 
 struct Vec0PartitionColumnDefinition {
@@ -2743,6 +2818,126 @@ static int vec0_parse_ivf_options(struct Vec0Scanner *scanner,
                                   struct Vec0IvfConfig *config);
 #endif
 
+/**
+ * Parse the options inside diskann(...) parentheses; the scanner must be
+ * positioned right before the '('. Option names and quantizer values must
+ * match in full (case-insensitively). Returns SQLITE_OK or SQLITE_ERROR.
+ * Options: neighbor_quantizer = binary | int8 (required);
+ *   n_neighbors (default 72; multiple of 8, at most 256); buffer_threshold
+ *   (>= 0); search_list_size (default 128) or, instead of it, the per-path
+ *   search_list_size_search / search_list_size_insert (each >= 1).
+ */
+static int vec0_parse_diskann_options(struct Vec0Scanner *scanner,
+                                      struct Vec0DiskannConfig *config) {
+  int rc;
+  struct Vec0Token token;
+  int hasQuantizer = 0;
+
+  // Set defaults
+  config->n_neighbors = VEC0_DISKANN_DEFAULT_N_NEIGHBORS;
+  config->search_list_size = VEC0_DISKANN_DEFAULT_SEARCH_LIST_SIZE;
+  config->search_list_size_search = 0;
+  config->search_list_size_insert = 0;
+  config->alpha = VEC0_DISKANN_DEFAULT_ALPHA;
+  config->buffer_threshold = 0;
+  int hasSearchListSize = 0;
+  int hasSearchListSizeSplit = 0;
+
+  // Expect '('
+  rc = vec0_scanner_next(scanner, &token);
+  if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_LPAREN) {
+    return SQLITE_ERROR;
+  }
+
+  while (1) {
+    // key
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc == VEC0_TOKEN_RESULT_SOME && token.token_type == TOKEN_TYPE_RPAREN) {
+      break;  // empty parens or trailing comma
+    }
+    if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_IDENTIFIER) {
+      return SQLITE_ERROR;
+    }
+    char *optKey = token.start;
+    int optKeyLen = token.end - token.start;
+
+    // '='
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_EQ) {
+      return SQLITE_ERROR;
+    }
+
+    // value (identifier or digit)
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc != VEC0_TOKEN_RESULT_SOME) {
+      return SQLITE_ERROR;
+    }
+    char *optVal = token.start;
+    int optValLen = token.end - token.start;
+
+    if (optKeyLen == 18 && sqlite3_strnicmp(optKey, "neighbor_quantizer", optKeyLen) == 0) {
+      if (optValLen == 6 && sqlite3_strnicmp(optVal, "binary", optValLen) == 0) {
+        config->quantizer_type = VEC0_DISKANN_QUANTIZER_BINARY;
+      } else if (optValLen == 4 && sqlite3_strnicmp(optVal, "int8", optValLen) == 0) {
+        config->quantizer_type = VEC0_DISKANN_QUANTIZER_INT8;
+      } else {
+        return SQLITE_ERROR;  // unknown quantizer
+      }
+      hasQuantizer = 1;
+    } else if (optKeyLen == 11 && sqlite3_strnicmp(optKey, "n_neighbors", optKeyLen) == 0) {
+      config->n_neighbors = atoi(optVal);
+      if (config->n_neighbors <= 0 || (config->n_neighbors % 8) != 0 ||
+          config->n_neighbors > VEC0_DISKANN_MAX_N_NEIGHBORS) {
+        return SQLITE_ERROR;
+      }
+    } else if (sqlite3_strnicmp(optKey, "search_list_size_search", optKeyLen) == 0 && optKeyLen == 23) {
+      config->search_list_size_search = atoi(optVal);
+      if (config->search_list_size_search <= 0) {
+        return SQLITE_ERROR;
+      }
+      hasSearchListSizeSplit = 1;
+    } else if (sqlite3_strnicmp(optKey, "search_list_size_insert", optKeyLen) == 0 && optKeyLen == 23) {
+      config->search_list_size_insert = atoi(optVal);
+      if (config->search_list_size_insert <= 0) {
+        return SQLITE_ERROR;
+      }
+      hasSearchListSizeSplit = 1;
+    } else if (optKeyLen == 16 && sqlite3_strnicmp(optKey, "search_list_size", optKeyLen) == 0) {
+      config->search_list_size = atoi(optVal);
+      if (config->search_list_size <= 0) {
+        return SQLITE_ERROR;
+      }
+      hasSearchListSize = 1;
+    } else if (optKeyLen == 16 && sqlite3_strnicmp(optKey, "buffer_threshold", optKeyLen) == 0) {
+      config->buffer_threshold = atoi(optVal);
+      if (config->buffer_threshold < 0) {
+        return SQLITE_ERROR;
+      }
+    } else {
+      return SQLITE_ERROR;  // unknown (or abbreviated) option name
+    }
+
+    // Expect ',' or ')'
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc == VEC0_TOKEN_RESULT_SOME && token.token_type == TOKEN_TYPE_RPAREN) {
+      break;
+    }
+    if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_COMMA) {
+      return SQLITE_ERROR;
+    }
+  }
+
+  if (!hasQuantizer) {
+    return SQLITE_ERROR;  // neighbor_quantizer is required
+  }
+
+  if (hasSearchListSize && hasSearchListSizeSplit) {
+    return SQLITE_ERROR;  // cannot mix search_list_size with search_list_size_search/insert
+  }
+
+  return SQLITE_OK;
+}
+
 int vec0_parse_vector_column(const char *source, int source_length,
                              struct VectorColumnDefinition *outColumn) {
   // parses a vector column definition like so:
@@ -2763,8 +2958,9 @@ int vec0_parse_vector_column(const char *source, int source_length,
 #endif
   struct Vec0IvfConfig ivfConfig;
   memset(&ivfConfig, 0, sizeof(ivfConfig));
+  struct Vec0DiskannConfig diskannConfig;
+  memset(&diskannConfig, 0, sizeof(diskannConfig));
   int dimensions;
-
   vec0_scanner_init(&scanner, source, source_length);
 
   // starts with an identifier
@@ -2931,6 +3127,16 @@ int vec0_parse_vector_column(const char *source, int source_length,
         }
 #else
         return SQLITE_ERROR; // IVF not compiled in
+#endif
+      } else if (sqlite3_strnicmp(token.start, "diskann", indexNameLen) == 0) {
+#if SQLITE_VEC_ENABLE_DISKANN
+        indexType = VEC0_INDEX_TYPE_DISKANN;
+        rc = vec0_parse_diskann_options(&scanner, &diskannConfig);
+        if (rc != SQLITE_OK) {
+          return rc;
+        }
+#else
+        return SQLITE_ERROR;
 #endif
       } else {
         // unknown index type
@@ -2956,6 +3162,7 @@ int vec0_parse_vector_column(const char *source, int source_length,
   outColumn->rescore = rescoreConfig;
 #endif
   outColumn->ivf = ivfConfig;
+  outColumn->diskann = diskannConfig;
   return SQLITE_OK;
 }
 
@@ -3154,6 +3361,7 @@ static sqlite3_module vec_eachModule = {
 
 #pragma endregion
 
+
 #pragma region vec0 virtual table
 
 #define VEC0_COLUMN_ID 0
@@ -3214,6 +3422,9 @@ static sqlite3_module vec_eachModule = { #define VEC0_SHADOW_AUXILIARY_NAME "\"%w\".\"%w_auxiliary\"" #define VEC0_SHADOW_METADATA_N_NAME "\"%w\".\"%w_metadatachunks%02d\"" +#define VEC0_SHADOW_VECTORS_N_NAME "\"%w\".\"%w_vectors%02d\"" +#define VEC0_SHADOW_DISKANN_NODES_N_NAME "\"%w\".\"%w_diskann_nodes%02d\"" +#define VEC0_SHADOW_DISKANN_BUFFER_N_NAME "\"%w\".\"%w_diskann_buffer%02d\"" #define VEC0_SHADOW_METADATA_TEXT_DATA_NAME "\"%w\".\"%w_metadatatext%02d\"" #define VEC_INTERAL_ERROR "Internal sqlite-vec error: " @@ -3388,6 +3599,24 @@ struct vec0_vtab { * Must be cleaned up with sqlite3_finalize(). */ sqlite3_stmt *stmtRowidsGetChunkPosition; + + // === DiskANN additions === +#if SQLITE_VEC_ENABLE_DISKANN + // Shadow table names for DiskANN, per vector column + // e.g., "{schema}"."{table}_vectors{00..15}" + char *shadowVectorsNames[VEC0_MAX_VECTOR_COLUMNS]; + + // e.g., "{schema}"."{table}_diskann_nodes{00..15}" + char *shadowDiskannNodesNames[VEC0_MAX_VECTOR_COLUMNS]; + + // Prepared statements for DiskANN operations (per vector column) + // These will be lazily prepared on first use. 
+  sqlite3_stmt *stmtDiskannNodeRead[VEC0_MAX_VECTOR_COLUMNS];
+  sqlite3_stmt *stmtDiskannNodeWrite[VEC0_MAX_VECTOR_COLUMNS];
+  sqlite3_stmt *stmtDiskannNodeInsert[VEC0_MAX_VECTOR_COLUMNS];
+  sqlite3_stmt *stmtVectorsRead[VEC0_MAX_VECTOR_COLUMNS];
+  sqlite3_stmt *stmtVectorsInsert[VEC0_MAX_VECTOR_COLUMNS];
+#endif
 };
 
 #if SQLITE_VEC_ENABLE_RESCORE
@@ -3427,6 +3656,17 @@ void vec0_free_resources(vec0_vtab *p) {
     sqlite3_finalize(p->stmtIvfRowidMapLookup[i]); p->stmtIvfRowidMapLookup[i] = NULL;
     sqlite3_finalize(p->stmtIvfRowidMapDelete[i]); p->stmtIvfRowidMapDelete[i] = NULL;
     sqlite3_finalize(p->stmtIvfCentroidsAll[i]); p->stmtIvfCentroidsAll[i] = NULL;
   }
 #endif
+#if SQLITE_VEC_ENABLE_DISKANN
+  // Finalized in a dedicated loop, outside the conditional block above, so the
+  // DiskANN statements are released whether or not that block is compiled in.
+  for (int i = 0; i < VEC0_MAX_VECTOR_COLUMNS; i++) {
+    sqlite3_finalize(p->stmtDiskannNodeRead[i]); p->stmtDiskannNodeRead[i] = NULL;
+    sqlite3_finalize(p->stmtDiskannNodeWrite[i]); p->stmtDiskannNodeWrite[i] = NULL;
+    sqlite3_finalize(p->stmtDiskannNodeInsert[i]); p->stmtDiskannNodeInsert[i] = NULL;
+    sqlite3_finalize(p->stmtVectorsRead[i]); p->stmtVectorsRead[i] = NULL;
+    sqlite3_finalize(p->stmtVectorsInsert[i]); p->stmtVectorsInsert[i] = NULL;
+  }
+#endif
 }
@@ -3464,6 +3700,13 @@ void vec0_free(vec0_vtab *p) {
     p->shadowRescoreVectorsNames[i] = NULL;
 #endif
 
+#if SQLITE_VEC_ENABLE_DISKANN
+    sqlite3_free(p->shadowVectorsNames[i]);
+    p->shadowVectorsNames[i] = NULL;
+    sqlite3_free(p->shadowDiskannNodesNames[i]);
+    p->shadowDiskannNodesNames[i] = NULL;
+#endif
+
     sqlite3_free(p->vector_columns[i].name);
     p->vector_columns[i].name = NULL;
   }
@@ -3484,6 +3727,13 @@ void vec0_free(vec0_vtab *p) {
   }
 }
 
+#if SQLITE_VEC_ENABLE_DISKANN
+#include "sqlite-vec-diskann.c"
+#else
+// DiskANN is compiled out: stub that always returns 0 so callers need no #if.
+static int vec0_all_columns_diskann(vec0_vtab *p) { (void)p; return 0; }
+#endif
+
 int vec0_num_defined_user_columns(vec0_vtab *p) {
   return p->numVectorColumns + p->numPartitionColumns + p->numAuxiliaryColumns + p->numMetadataColumns;
 }
@@ -3753,6 +4002,25 @@ int vec0_get_vector_data(vec0_vtab *pVtab, i64 rowid, int vector_column_idx,
                          void **outVector, int *outVectorSize) {
vec0_vtab *p = pVtab; int rc, brc; + +#if SQLITE_VEC_ENABLE_DISKANN + // DiskANN fast path: read from _vectors table + if (p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_DISKANN) { + void *vec = NULL; + int vecSize; + rc = diskann_vector_read(p, vector_column_idx, rowid, &vec, &vecSize); + if (rc != SQLITE_OK) { + vtab_set_error(&pVtab->base, + "Could not fetch vector data for %lld from DiskANN vectors table", + rowid); + return SQLITE_ERROR; + } + *outVector = vec; + if (outVectorSize) *outVectorSize = vecSize; + return SQLITE_OK; + } +#endif + i64 chunk_id; i64 chunk_offset; @@ -4653,6 +4921,26 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, (i64)vecColumn.dimensions, SQLITE_VEC_VEC0_MAX_DIMENSIONS); goto error; } + + // DiskANN validation + if (vecColumn.index_type == VEC0_INDEX_TYPE_DISKANN) { + if (vecColumn.element_type == SQLITE_VEC_ELEMENT_TYPE_BIT) { + sqlite3_free(vecColumn.name); + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "DiskANN index is not supported on bit vector columns"); + goto error; + } + if (vecColumn.diskann.quantizer_type == VEC0_DISKANN_QUANTIZER_BINARY && + (vecColumn.dimensions % CHAR_BIT) != 0) { + sqlite3_free(vecColumn.name); + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "DiskANN with binary quantizer requires dimensions divisible by 8"); + goto error; + } + } + pNew->user_column_kinds[user_column_idx] = SQLITE_VEC0_USER_COLUMN_KIND_VECTOR; pNew->user_column_idxs[user_column_idx] = numVectorColumns; memcpy(&pNew->vector_columns[numVectorColumns], &vecColumn, sizeof(vecColumn)); @@ -4881,6 +5169,31 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } } + // DiskANN columns cannot coexist with aux/metadata/partition columns + for (int i = 0; i < numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_DISKANN) { + if (numAuxiliaryColumns > 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + 
"Auxiliary columns are not supported with DiskANN-indexed vector columns"); + goto error; + } + if (numMetadataColumns > 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "Metadata columns are not supported with DiskANN-indexed vector columns"); + goto error; + } + if (numPartitionColumns > 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "Partition key columns are not supported with DiskANN-indexed vector columns"); + goto error; + } + break; + } + } + sqlite3_str *createStr = sqlite3_str_new(NULL); sqlite3_str_appendall(createStr, "CREATE TABLE x("); if (pkColumnName) { @@ -4984,6 +5297,20 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, goto error; } } +#endif +#if SQLITE_VEC_ENABLE_DISKANN + if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_DISKANN) { + pNew->shadowVectorsNames[i] = + sqlite3_mprintf("%s_vectors%02d", tableName, i); + if (!pNew->shadowVectorsNames[i]) { + goto error; + } + pNew->shadowDiskannNodesNames[i] = + sqlite3_mprintf("%s_diskann_nodes%02d", tableName, i); + if (!pNew->shadowDiskannNodesNames[i]) { + goto error; + } + } #endif } #if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE @@ -5060,7 +5387,32 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } sqlite3_finalize(stmt); - +#if SQLITE_VEC_ENABLE_DISKANN + // Seed medoid entries for DiskANN-indexed columns + for (int i = 0; i < pNew->numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_DISKANN) { + continue; + } + char *key = sqlite3_mprintf("diskann_medoid_%02d", i); + char *zInsert = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_INFO_NAME "(key, value) VALUES (?1, ?2)", + pNew->schemaName, pNew->tableName); + rc = sqlite3_prepare_v2(db, zInsert, -1, &stmt, NULL); + sqlite3_free(zInsert); + if (rc != SQLITE_OK) { + sqlite3_free(key); + sqlite3_finalize(stmt); + goto error; + } + sqlite3_bind_text(stmt, 1, key, -1, sqlite3_free); + sqlite3_bind_null(stmt, 2); // NULL means 
empty graph + if (sqlite3_step(stmt) != SQLITE_DONE) { + sqlite3_finalize(stmt); + goto error; + } + sqlite3_finalize(stmt); + } +#endif // create the _chunks shadow table char *zCreateShadowChunks = NULL; @@ -5118,7 +5470,7 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, for (int i = 0; i < pNew->numVectorColumns; i++) { #if SQLITE_VEC_ENABLE_RESCORE - // Rescore and IVF columns don't use _vector_chunks + // Non-FLAT columns don't use _vector_chunks if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) continue; #endif @@ -5159,6 +5511,84 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } #endif +#if SQLITE_VEC_ENABLE_DISKANN + // Create DiskANN shadow tables for indexed vector columns + for (int i = 0; i < pNew->numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_DISKANN) { + continue; + } + + // Create _vectors{NN} table + { + char *zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_VECTORS_N_NAME + " (rowid INTEGER PRIMARY KEY, vector BLOB NOT NULL);", + pNew->schemaName, pNew->tableName, i); + if (!zSql) { + goto error; + } + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + sqlite3_finalize(stmt); + *pzErr = sqlite3_mprintf( + "Could not create '_vectors%02d' shadow table: %s", i, + sqlite3_errmsg(db)); + goto error; + } + sqlite3_finalize(stmt); + } + + // Create _diskann_nodes{NN} table + { + char *zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_DISKANN_NODES_N_NAME " (" + "rowid INTEGER PRIMARY KEY, " + "neighbors_validity BLOB NOT NULL, " + "neighbor_ids BLOB NOT NULL, " + "neighbor_quantized_vectors BLOB NOT NULL" + ");", + pNew->schemaName, pNew->tableName, i); + if (!zSql) { + goto error; + } + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + 
sqlite3_finalize(stmt); + *pzErr = sqlite3_mprintf( + "Could not create '_diskann_nodes%02d' shadow table: %s", i, + sqlite3_errmsg(db)); + goto error; + } + sqlite3_finalize(stmt); + } + + // Create _diskann_buffer{NN} table (for batched inserts) + { + char *zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_DISKANN_BUFFER_N_NAME " (" + "rowid INTEGER PRIMARY KEY, " + "vector BLOB NOT NULL" + ");", + pNew->schemaName, pNew->tableName, i); + if (!zSql) { + goto error; + } + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + sqlite3_finalize(stmt); + *pzErr = sqlite3_mprintf( + "Could not create '_diskann_buffer%02d' shadow table: %s", i, + sqlite3_errmsg(db)); + goto error; + } + sqlite3_finalize(stmt); + } + } +#endif + // See SHADOW_TABLE_ROWID_QUIRK in vec0_new_chunk() — same "rowid PRIMARY KEY" // without INTEGER type issue applies here. for (int i = 0; i < pNew->numMetadataColumns; i++) { @@ -5293,6 +5723,45 @@ static int vec0Destroy(sqlite3_vtab *pVtab) { sqlite3_finalize(stmt); for (int i = 0; i < p->numVectorColumns; i++) { +#if SQLITE_VEC_ENABLE_DISKANN + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_DISKANN) { + // Drop DiskANN shadow tables + zSql = sqlite3_mprintf("DROP TABLE IF EXISTS " VEC0_SHADOW_VECTORS_N_NAME, + p->schemaName, p->tableName, i); + if (zSql) { + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + goto done; + } + sqlite3_finalize(stmt); + } + zSql = sqlite3_mprintf("DROP TABLE IF EXISTS " VEC0_SHADOW_DISKANN_NODES_N_NAME, + p->schemaName, p->tableName, i); + if (zSql) { + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + goto done; + } + sqlite3_finalize(stmt); + } + zSql = sqlite3_mprintf("DROP 
TABLE IF EXISTS " VEC0_SHADOW_DISKANN_BUFFER_N_NAME,
+                             p->schemaName, p->tableName, i);
+      if (zSql) {
+        rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0);
+        sqlite3_free((void *)zSql);
+        if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
+          rc = SQLITE_ERROR;
+          goto done;
+        }
+        sqlite3_finalize(stmt);
+      }
+      continue;
+    }
+#endif
 #if SQLITE_VEC_ENABLE_RESCORE
     if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT)
       continue;
@@ -7088,6 +7557,202 @@ cleanup:
 #include "sqlite-vec-rescore.c"
 #endif
 
+#if SQLITE_VEC_ENABLE_DISKANN
+/**
+ * Handle a KNN query on a DiskANN-indexed vector column.
+ *
+ * Finds the MATCH (query vector) and k arguments through idxStr, checks the
+ * query vector against the column's element type and dimensions, and runs
+ * diskann_search() for up to k results. The _diskann_buffer shadow table is
+ * then scanned brute-force so buffered (unflushed) vectors can fill free
+ * slots or displace worse graph results. Results are sorted by ascending
+ * distance and attached to the cursor as a VEC0_QUERY_PLAN_KNN plan.
+ *
+ * Returns SQLITE_OK on success (k <= 0 yields an empty result set),
+ * SQLITE_NOMEM on allocation failure, SQLITE_ERROR for an invalid query
+ * vector or a k above INT_MAX, or the error code from diskann_search().
+ */
+static int vec0Filter_knn_diskann(
+    vec0_cursor *pCur, vec0_vtab *p, int idxNum,
+    const char *idxStr, int argc, sqlite3_value **argv) {
+
+  int rc;
+  int vectorColumnIdx = idxNum;
+  struct VectorColumnDefinition *vector_column = &p->vector_columns[vectorColumnIdx];
+  struct vec0_query_knn_data *knn_data;
+
+  knn_data = sqlite3_malloc(sizeof(*knn_data));
+  if (!knn_data) return SQLITE_NOMEM;
+  memset(knn_data, 0, sizeof(*knn_data));
+
+  // Parse query_idx and k_idx from idxStr
+  int query_idx = -1;
+  int k_idx = -1;
+  for (int i = 0; i < argc; i++) {
+    if (idxStr[1 + (i * 4)] == VEC0_IDXSTR_KIND_KNN_MATCH) {
+      query_idx = i;
+    }
+    if (idxStr[1 + (i * 4)] == VEC0_IDXSTR_KIND_KNN_K) {
+      k_idx = i;
+    }
+  }
+  assert(query_idx >= 0);
+  assert(k_idx >= 0);
+
+  // Extract query vector
+  void *queryVector;
+  size_t dimensions;
+  enum VectorElementType elementType;
+  vector_cleanup queryVectorCleanup = vector_cleanup_noop;
+  char *pzError;
+
+  rc = vector_from_value(argv[query_idx], &queryVector, &dimensions,
+                         &elementType, &queryVectorCleanup, &pzError);
+  if (rc != SQLITE_OK) {
+    vtab_set_error(&p->base, "Invalid query vector: %z", pzError);
+    sqlite3_free(knn_data);
+    return SQLITE_ERROR;
+  }
+
+  if (elementType != vector_column->element_type ||
+      dimensions != vector_column->dimensions) {
+    vtab_set_error(&p->base, "Query vector type/dimension mismatch");
+    queryVectorCleanup(queryVector);
+    sqlite3_free(knn_data);
+    return SQLITE_ERROR;
+  }
+
+  i64 k = sqlite3_value_int64(argv[k_idx]);
+  if (k <= 0) {
+    knn_data->k = 0;
+    knn_data->k_used = 0;
+    pCur->knn_data = knn_data;
+    pCur->query_plan = VEC0_QUERY_PLAN_KNN;
+    queryVectorCleanup(queryVector);
+    return SQLITE_OK;
+  }
+
+  // k is later narrowed to int (diskann_search() argument, result counters)
+  // and multiplied into allocation sizes, so reject values that do not fit.
+  if (k > INT_MAX) {
+    vtab_set_error(&p->base, "k value in knn query too large: %lld", k);
+    queryVectorCleanup(queryVector);
+    sqlite3_free(knn_data);
+    return SQLITE_ERROR;
+  }
+
+  // Run DiskANN search
+  i64 *resultRowids = sqlite3_malloc64((sqlite3_uint64)k * sizeof(i64));
+  f32 *resultDistances = sqlite3_malloc64((sqlite3_uint64)k * sizeof(f32));
+  if (!resultRowids || !resultDistances) {
+    sqlite3_free(resultRowids);
+    sqlite3_free(resultDistances);
+    queryVectorCleanup(queryVector);
+    sqlite3_free(knn_data);
+    return SQLITE_NOMEM;
+  }
+
+  int resultCount;
+  rc = diskann_search(p, vectorColumnIdx, queryVector, dimensions,
+                      elementType, (int)k, 0,
+                      resultRowids, resultDistances, &resultCount);
+
+  if (rc != SQLITE_OK) {
+    queryVectorCleanup(queryVector);
+    sqlite3_free(resultRowids);
+    sqlite3_free(resultDistances);
+    sqlite3_free(knn_data);
+    return rc;
+  }
+
+  // Scan _diskann_buffer for any buffered (unflushed) vectors and merge
+  // with graph results. This ensures no recall loss for buffered vectors.
+  {
+    sqlite3_stmt *bufStmt = NULL;
+    // Byte length every well-formed buffered vector of this column must have.
+    const size_t expectedBytes = vector_column_byte_size(*vector_column);
+    char *zSql = sqlite3_mprintf(
+        "SELECT rowid, vector FROM " VEC0_SHADOW_DISKANN_BUFFER_N_NAME,
+        p->schemaName, p->tableName, vectorColumnIdx);
+    if (!zSql) {
+      queryVectorCleanup(queryVector);
+      sqlite3_free(resultRowids);
+      sqlite3_free(resultDistances);
+      sqlite3_free(knn_data);
+      return SQLITE_NOMEM;
+    }
+    int bufRc = sqlite3_prepare_v2(p->db, zSql, -1, &bufStmt, NULL);
+    sqlite3_free(zSql);
+    // On prepare failure the buffer scan is skipped and only the graph
+    // results are returned.
+    if (bufRc == SQLITE_OK) {
+      while (sqlite3_step(bufStmt) == SQLITE_ROW) {
+        i64 bufRowid = sqlite3_column_int64(bufStmt, 0);
+        const void *bufVec = sqlite3_column_blob(bufStmt, 1);
+        // Shadow tables are plain tables and can be modified directly, so
+        // never hand vec0_distance_full() a NULL or wrongly sized blob: it
+        // is given only the dimension count, not the blob length.
+        if (!bufVec ||
+            (size_t)sqlite3_column_bytes(bufStmt, 1) != expectedBytes) {
+          continue;
+        }
+        f32 dist = vec0_distance_full(
+            queryVector, bufVec, dimensions, elementType,
+            vector_column->distance_metric);
+
+        // Check if this buffer vector should replace the worst graph result
+        if (resultCount < (int)k) {
+          // Still have room, just add it
+          resultRowids[resultCount] = bufRowid;
+          resultDistances[resultCount] = dist;
+          resultCount++;
+        } else {
+          // Find worst (largest distance) in results
+          int worstIdx = 0;
+          for (int wi = 1; wi < resultCount; wi++) {
+            if (resultDistances[wi] > resultDistances[worstIdx]) {
+              worstIdx = wi;
+            }
+          }
+          if (dist < resultDistances[worstIdx]) {
+            resultRowids[worstIdx] = bufRowid;
+            resultDistances[worstIdx] = dist;
+          }
+        }
+      }
+      sqlite3_finalize(bufStmt);
+    }
+  }
+
+  queryVectorCleanup(queryVector);
+
+  // Sort results by distance (ascending)
+  for (int si = 0; si < resultCount - 1; si++) {
+    for (int sj = si + 1; sj < resultCount; sj++) {
+      if (resultDistances[sj] < resultDistances[si]) {
+        f32 tmpD = resultDistances[si];
+        resultDistances[si] = resultDistances[sj];
+        resultDistances[sj] = tmpD;
+        i64 tmpR = resultRowids[si];
+        resultRowids[si] = resultRowids[sj];
+        resultRowids[sj] = tmpR;
+      }
+    }
+  }
+
+  knn_data->k = resultCount;
+  knn_data->k_used = resultCount;
+  knn_data->rowids = resultRowids;
+  knn_data->distances = resultDistances;
+  knn_data->current_idx = 0;
+
+  pCur->knn_data = knn_data;
+  pCur->query_plan = VEC0_QUERY_PLAN_KNN;
+
+  return SQLITE_OK;
+}
+#endif /* SQLITE_VEC_ENABLE_DISKANN */
+
 int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum,
                    const char *idxStr, int argc, sqlite3_value **argv) {
   assert(argc == (strlen(idxStr)-1) / 4);
@@ -7098,6 +7732,13 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum,
   struct VectorColumnDefinition *vector_column =
       &p->vector_columns[vectorColumnIdx];
 
+#if SQLITE_VEC_ENABLE_DISKANN
+  // DiskANN dispatch
+  if (vector_column->index_type == VEC0_INDEX_TYPE_DISKANN) {
+    return vec0Filter_knn_diskann(pCur, p, idxNum, idxStr, argc, argv);
+  }
+#endif
+
   struct Array *arrayRowidsIn = NULL;
   sqlite3_stmt *stmtChunks = NULL;
   void *queryVector;
@@ -8567,24 +9208,37 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv,
     goto cleanup;
   }
 
-  // Step #2: Find the next "available" position in the _chunks table for this
-  // row.
-  rc = vec0Update_InsertNextAvailableStep(p, partitionKeyValues,
-                                          &chunk_rowid, &chunk_offset,
-                                          &blobChunksValidity,
-                                          &bufferChunksValidity);
-  if (rc != SQLITE_OK) {
-    goto cleanup;
+  if (!vec0_all_columns_diskann(p)) {
+    // Step #2: Find the next "available" position in the _chunks table for this
+    // row.
+    rc = vec0Update_InsertNextAvailableStep(p, partitionKeyValues,
+                                            &chunk_rowid, &chunk_offset,
+                                            &blobChunksValidity,
+                                            &bufferChunksValidity);
+    if (rc != SQLITE_OK) {
+      goto cleanup;
+    }
+
+    // Step #3: With the next available chunk position, write out all the vectors
+    // to their specified location.
+    rc = vec0Update_InsertWriteFinalStep(p, chunk_rowid, chunk_offset, rowid,
+                                         vectorDatas, blobChunksValidity,
+                                         bufferChunksValidity);
+    if (rc != SQLITE_OK) {
+      goto cleanup;
+    }
   }
 
-  // Step #3: With the next available chunk position, write out all the vectors
-  // to their specified location.
- rc = vec0Update_InsertWriteFinalStep(p, chunk_rowid, chunk_offset, rowid, - vectorDatas, blobChunksValidity, - bufferChunksValidity); - if (rc != SQLITE_OK) { - goto cleanup; +#if SQLITE_VEC_ENABLE_DISKANN + // Step #4: Insert into DiskANN graph for indexed vector columns + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_DISKANN) continue; + rc = diskann_insert(p, i, rowid, vectorDatas[i]); + if (rc != SQLITE_OK) { + goto cleanup; + } } +#endif #if SQLITE_VEC_ENABLE_RESCORE rc = rescore_on_insert(p, chunk_rowid, chunk_offset, rowid, vectorDatas); @@ -9126,29 +9780,43 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { // 4. Zero out vector data in all vector column chunks // 5. Delete value in _rowids table - // 1. get chunk_id and chunk_offset from _rowids - rc = vec0_get_chunk_position(p, rowid, NULL, &chunk_id, &chunk_offset); - if (rc != SQLITE_OK) { - return rc; +#if SQLITE_VEC_ENABLE_DISKANN + // DiskANN graph deletion for indexed columns + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_DISKANN) continue; + rc = diskann_delete(p, i, rowid); + if (rc != SQLITE_OK) { + return rc; + } + } +#endif + + if (!vec0_all_columns_diskann(p)) { + // 1. get chunk_id and chunk_offset from _rowids + rc = vec0_get_chunk_position(p, rowid, NULL, &chunk_id, &chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } + + // 2. clear validity bit + rc = vec0Update_Delete_ClearValidity(p, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } + + // 3. zero out rowid in chunks.rowids + rc = vec0Update_Delete_ClearRowid(p, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } + + // 4. zero out any data in vector chunks tables + rc = vec0Update_Delete_ClearVectors(p, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } } - // 2. 
clear validity bit - rc = vec0Update_Delete_ClearValidity(p, chunk_id, chunk_offset); - if (rc != SQLITE_OK) { - return rc; - } - - // 3. zero out rowid in chunks.rowids - rc = vec0Update_Delete_ClearRowid(p, chunk_id, chunk_offset); - if (rc != SQLITE_OK) { - return rc; - } - - // 4. zero out any data in vector chunks tables - rc = vec0Update_Delete_ClearVectors(p, chunk_id, chunk_offset); - if (rc != SQLITE_OK) { - return rc; - } #if SQLITE_VEC_ENABLE_RESCORE // 4b. zero out quantized data in rescore chunk tables, delete from rescore vectors @@ -9172,20 +9840,22 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { } } - // 7. delete metadata - for(int i = 0; i < p->numMetadataColumns; i++) { - rc = vec0Update_Delete_ClearMetadata(p, i, rowid, chunk_id, chunk_offset); - if (rc != SQLITE_OK) { - return rc; + // 7. delete metadata and reclaim chunk (only when using chunk-based storage) + if (!vec0_all_columns_diskann(p)) { + for(int i = 0; i < p->numMetadataColumns; i++) { + rc = vec0Update_Delete_ClearMetadata(p, i, rowid, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } } - } - // 8. reclaim chunk if fully empty - { - int chunkDeleted; - rc = vec0Update_Delete_DeleteChunkIfEmpty(p, chunk_id, &chunkDeleted); - if (rc != SQLITE_OK) { - return rc; + // 8. 
reclaim chunk if fully empty
+    {
+      int chunkDeleted;
+      rc = vec0Update_Delete_DeleteChunkIfEmpty(p, chunk_id, &chunkDeleted);
+      if (rc != SQLITE_OK) {
+        return rc;
+      }
     }
   }
 
@@ -9481,8 +10151,12 @@ static int vec0Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv,
       const char *cmd = (const char *)sqlite3_value_text(idVal);
       vec0_vtab *p = (vec0_vtab *)pVTab;
       int cmdRc = ivf_handle_command(p, cmd, argc, argv);
+#if SQLITE_VEC_ENABLE_DISKANN
+      if (cmdRc == SQLITE_EMPTY)
+        cmdRc = diskann_handle_command(p, cmd);
+#endif
       if (cmdRc != SQLITE_EMPTY) return cmdRc; // handled (or error)
-      // SQLITE_EMPTY means not an IVF command — fall through to normal insert
+      // SQLITE_EMPTY means not a recognized command — fall through to normal insert
     }
 #endif
     return vec0Update_Insert(pVTab, argc, argv, pRowid);
@@ -9638,9 +10312,16 @@ static sqlite3_module vec0Module = {
 #define SQLITE_VEC_DEBUG_BUILD_IVF ""
 #endif
 
+#if SQLITE_VEC_ENABLE_DISKANN
+#define SQLITE_VEC_DEBUG_BUILD_DISKANN "diskann"
+#else
+#define SQLITE_VEC_DEBUG_BUILD_DISKANN ""
+#endif
+
 #define SQLITE_VEC_DEBUG_BUILD \
   SQLITE_VEC_DEBUG_BUILD_AVX " " SQLITE_VEC_DEBUG_BUILD_NEON " " \
-  SQLITE_VEC_DEBUG_BUILD_RESCORE " " SQLITE_VEC_DEBUG_BUILD_IVF
+  SQLITE_VEC_DEBUG_BUILD_RESCORE " " SQLITE_VEC_DEBUG_BUILD_IVF " " \
+  SQLITE_VEC_DEBUG_BUILD_DISKANN
 
 #define SQLITE_VEC_DEBUG_STRING \
   "Version: " SQLITE_VEC_VERSION "\n" \
diff --git a/tests/fuzz/Makefile b/tests/fuzz/Makefile
index a3405a4..202dc2b 100644
--- a/tests/fuzz/Makefile
+++ b/tests/fuzz/Makefile
@@ -26,7 +26,7 @@ FUZZ_LDFLAGS ?= $(shell \
     echo "-Wl,-ld_classic"; \
   fi)
 
-FUZZ_CFLAGS = $(FUZZ_SANITIZERS) -I ../../ -I ../../vendor -DSQLITE_CORE -g $(FUZZ_LDFLAGS)
+FUZZ_CFLAGS = $(FUZZ_SANITIZERS) -I ../../ -I ../../vendor -DSQLITE_CORE -DSQLITE_VEC_ENABLE_DISKANN=1 -g $(FUZZ_LDFLAGS)
 FUZZ_SRCS = ../../vendor/sqlite3.c ../../sqlite-vec.c
 
 TARGET_DIR = ./targets
@@ -115,6 +115,36 @@ $(TARGET_DIR)/ivf_cell_overflow: ivf-cell-overflow.c $(FUZZ_SRCS) | $(TARGET_DIR
 	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
 
 $(TARGET_DIR)/ivf_rescore: ivf-rescore.c $(FUZZ_SRCS) | $(TARGET_DIR)
 	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
 
+$(TARGET_DIR)/diskann_operations: diskann-operations.c $(FUZZ_SRCS) | $(TARGET_DIR)
+	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
+
+$(TARGET_DIR)/diskann_create: diskann-create.c $(FUZZ_SRCS) | $(TARGET_DIR)
+	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
+
+$(TARGET_DIR)/diskann_graph_corrupt: diskann-graph-corrupt.c $(FUZZ_SRCS) | $(TARGET_DIR)
+	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
+
+$(TARGET_DIR)/diskann_deep_search: diskann-deep-search.c $(FUZZ_SRCS) | $(TARGET_DIR)
+	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
+
+$(TARGET_DIR)/diskann_blob_truncate: diskann-blob-truncate.c $(FUZZ_SRCS) | $(TARGET_DIR)
+	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
+
+$(TARGET_DIR)/diskann_delete_stress: diskann-delete-stress.c $(FUZZ_SRCS) | $(TARGET_DIR)
+	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
+
+$(TARGET_DIR)/diskann_buffer_flush: diskann-buffer-flush.c $(FUZZ_SRCS) | $(TARGET_DIR)
+	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
+
+$(TARGET_DIR)/diskann_int8_quant: diskann-int8-quant.c $(FUZZ_SRCS) | $(TARGET_DIR)
+	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
+
+$(TARGET_DIR)/diskann_prune_direct: diskann-prune-direct.c $(FUZZ_SRCS) | $(TARGET_DIR)
+	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
+
+$(TARGET_DIR)/diskann_command_inject: diskann-command-inject.c $(FUZZ_SRCS) | $(TARGET_DIR)
+	$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
+
 FUZZ_TARGETS = vec0_create exec json numpy \
@@ -127,6 +157,11 @@ FUZZ_TARGETS = vec0_create exec json numpy \
                ivf_create ivf_operations \
                ivf_quantize ivf_kmeans ivf_shadow_corrupt \
-               ivf_knn_deep ivf_cell_overflow ivf_rescore
+               ivf_knn_deep ivf_cell_overflow ivf_rescore \
+               diskann_operations diskann_create diskann_graph_corrupt \
+               diskann_deep_search diskann_blob_truncate \
+               diskann_delete_stress diskann_buffer_flush \
+               diskann_int8_quant diskann_prune_direct \
+               diskann_command_inject
 
 all: $(addprefix
$(TARGET_DIR)/,$(FUZZ_TARGETS))
 
diff --git a/tests/fuzz/diskann-blob-truncate.c b/tests/fuzz/diskann-blob-truncate.c
new file mode 100644
index 0000000..903a0d7
--- /dev/null
+++ b/tests/fuzz/diskann-blob-truncate.c
@@ -0,0 +1,253 @@
+/**
+ * Fuzz target for DiskANN shadow table blob size mismatches.
+ *
+ * The critical vulnerability: diskann_node_read() copies whatever blob size
+ * SQLite returns, but diskann_search/insert/delete index into those blobs
+ * using cfg->n_neighbors * sizeof(i64) etc. If the blob is truncated,
+ * extended, or has wrong size, this causes out-of-bounds reads/writes.
+ *
+ * This fuzzer:
+ *   1. Creates a valid DiskANN graph with several nodes
+ *   2. Uses fuzz data to directly write malformed blobs to shadow tables:
+ *      - Truncated neighbor_ids (fewer bytes than n_neighbors * 8)
+ *      - Truncated validity bitmaps
+ *      - Oversized blobs with garbage trailing data
+ *      - Zero-length blobs
+ *      - Blobs with valid headers but corrupted neighbor rowids
+ *   3. Runs INSERT, DELETE, and KNN operations that traverse the corrupted graph
+ *
+ * Key code paths targeted:
+ *   - diskann_node_read with mismatched blob sizes
+ *   - diskann_validity_get / diskann_neighbor_id_get on truncated blobs
+ *   - diskann_add_reverse_edge reading corrupted neighbor data
+ *   - diskann_repair_reverse_edges traversing corrupted neighbor lists
+ *   - diskann_search iterating neighbors from corrupted blobs
+ */
+#include <stdint.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sqlite-vec.h"
+#include "sqlite3.h"
+#include <assert.h>
+
+/* Pop the next byte of fuzz input; returns def once the input is exhausted. */
+static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) {
+  if (*size == 0) return def;
+  uint8_t b = **data;
+  (*data)++;
+  (*size)--;
+  return b;
+}
+
+/* Fuzzer entry point: seeds a 12-row DiskANN table, overwrites node blobs
+ * with fuzz-chosen malformed values, then runs KNN/INSERT/DELETE/scan. */
+int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  if (size < 32) return 0;
+
+  int rc;
+  sqlite3 *db;
+
+  rc = sqlite3_open(":memory:", &db);
+  assert(rc == SQLITE_OK);
+  rc = sqlite3_vec_init(db, NULL, NULL);
+  assert(rc == SQLITE_OK);
+
+  /* Use binary quantizer, float[16], n_neighbors=8 for predictable blob sizes:
+   *   validity: 8/8 = 1 byte
+   *   neighbor_ids: 8 * 8 = 64 bytes
+   *   qvecs: 8 * (16/8) = 16 bytes (binary: 2 bytes per qvec)
+   */
+  rc = sqlite3_exec(db,
+    "CREATE VIRTUAL TABLE v USING vec0("
+    "emb float[16] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8))",
+    NULL, NULL, NULL);
+  if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
+
+  /* Insert 12 vectors to create a valid graph structure */
+  {
+    sqlite3_stmt *stmt;
+    sqlite3_prepare_v2(db,
+      "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL);
+    for (int i = 1; i <= 12; i++) {
+      float vec[16];
+      for (int j = 0; j < 16; j++) {
+        vec[j] = (float)i * 0.1f + (float)j * 0.01f;
+      }
+      sqlite3_reset(stmt);
+      sqlite3_bind_int64(stmt, 1, i);
+      sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
+      sqlite3_step(stmt);
+    }
+    sqlite3_finalize(stmt);
+  }
+
+  /* Now corrupt shadow table blobs using fuzz data */
+  const char *columns[] = {
+    "neighbors_validity",
+    "neighbor_ids",
+    "neighbor_quantized_vectors"
+  };
+
+  /* Expected sizes for n_neighbors=8, dims=16, binary quantizer */
+  int expected_sizes[] = {1, 64, 16};
+
+  while (size >= 4) {
+    int target_row = (fuzz_byte(&data, &size, 0) % 12) + 1;
+    int col_idx = fuzz_byte(&data, &size, 0) % 3;
+    uint8_t corrupt_mode = fuzz_byte(&data, &size, 0) % 6;
+    uint8_t extra = fuzz_byte(&data, &size, 0);
+
+    char sqlbuf[256];
+    snprintf(sqlbuf, sizeof(sqlbuf),
+      "UPDATE v_diskann_nodes00 SET %s = ? WHERE rowid = ?",
+      columns[col_idx]);
+
+    sqlite3_stmt *writeStmt;
+    rc = sqlite3_prepare_v2(db, sqlbuf, -1, &writeStmt, NULL);
+    if (rc != SQLITE_OK) continue;
+
+    int expected = expected_sizes[col_idx];
+    unsigned char *blob = NULL;
+    int blob_size = 0;
+
+    switch (corrupt_mode) {
+      case 0: {
+        /* Truncated blob: 0 to expected-1 bytes */
+        blob_size = extra % expected;
+        /* blob_size may be 0 here; a zero-length blob is interesting too */
+        blob = sqlite3_malloc(blob_size > 0 ? blob_size : 1);
+        if (!blob) { sqlite3_finalize(writeStmt); continue; }
+        for (int i = 0; i < blob_size; i++) {
+          blob[i] = fuzz_byte(&data, &size, 0);
+        }
+        break;
+      }
+      case 1: {
+        /* Oversized blob: expected + extra bytes */
+        blob_size = expected + (extra % 64);
+        blob = sqlite3_malloc(blob_size);
+        if (!blob) { sqlite3_finalize(writeStmt); continue; }
+        for (int i = 0; i < blob_size; i++) {
+          blob[i] = fuzz_byte(&data, &size, 0xFF);
+        }
+        break;
+      }
+      case 2: {
+        /* Zero-length blob */
+        blob_size = 0;
+        blob = NULL;
+        sqlite3_bind_zeroblob(writeStmt, 1, 0);
+        sqlite3_bind_int64(writeStmt, 2, target_row);
+        sqlite3_step(writeStmt);
+        sqlite3_finalize(writeStmt);
+        continue;
+      }
+      case 3: {
+        /* Correct size but all-ones validity (all slots "valid") with
+         * garbage neighbor IDs -- forces reading non-existent nodes */
+        blob_size = expected;
+        blob = sqlite3_malloc(blob_size);
+        if (!blob) { sqlite3_finalize(writeStmt); continue; }
+        memset(blob, 0xFF, blob_size);
+        break;
+      }
+      case 4: {
+        /* neighbor_ids with very large rowid values (near INT64_MAX) */
+        blob_size = expected;
+        blob = sqlite3_malloc(blob_size);
+        if (!blob) { sqlite3_finalize(writeStmt); continue; }
+        memset(blob, 0x7F, blob_size); /* fills with large positive values */
+        break;
+      }
+      case 5: {
+        /* neighbor_ids with negative rowid values (rowid=0 is sentinel) */
+        blob_size = expected;
+        blob = sqlite3_malloc(blob_size);
+        if (!blob) { sqlite3_finalize(writeStmt); continue; }
+        memset(blob, 0x80, blob_size); /* fills with large negative values */
+        /* Flip some bytes from fuzz data */
+        for (int i = 0; i < blob_size && size > 0; i++) {
+          blob[i] ^= fuzz_byte(&data, &size, 0);
+        }
+        break;
+      }
+    }
+
+    if (blob) {
+      sqlite3_bind_blob(writeStmt, 1, blob, blob_size, SQLITE_TRANSIENT);
+    } else {
+      sqlite3_bind_blob(writeStmt, 1, "", 0, SQLITE_STATIC);
+    }
+    sqlite3_bind_int64(writeStmt, 2, target_row);
+    sqlite3_step(writeStmt);
+    sqlite3_finalize(writeStmt);
+    sqlite3_free(blob);
+  }
+
+  /* Exercise the corrupted graph with various operations */
+
+  /* KNN query */
+  {
+    float qvec[16];
+    for (int j = 0; j < 16; j++) qvec[j] = (float)j * 0.1f;
+    sqlite3_stmt *knnStmt;
+    rc = sqlite3_prepare_v2(db,
+      "SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = 5",
+      -1, &knnStmt, NULL);
+    if (rc == SQLITE_OK) {
+      sqlite3_bind_blob(knnStmt, 1, qvec, sizeof(qvec), SQLITE_STATIC);
+      while (sqlite3_step(knnStmt) == SQLITE_ROW) {}
+      sqlite3_finalize(knnStmt);
+    }
+  }
+
+  /* Insert into corrupted graph (triggers add_reverse_edge on corrupted nodes) */
+  {
+    float vec[16];
+    for (int j = 0; j < 16; j++) vec[j] = 0.5f;
+    sqlite3_stmt *stmt;
+    sqlite3_prepare_v2(db,
+      "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL);
+    if (stmt) {
+      sqlite3_bind_int64(stmt, 1, 100);
+      sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
+      sqlite3_step(stmt);
+      sqlite3_finalize(stmt);
+    }
+  }
+
+  /* Delete from corrupted graph (triggers repair_reverse_edges) */
+  {
+    sqlite3_stmt *stmt;
+    sqlite3_prepare_v2(db,
+      "DELETE FROM v WHERE rowid = ?", -1, &stmt, NULL);
+    if (stmt) {
+      sqlite3_bind_int64(stmt, 1, 5);
+      sqlite3_step(stmt);
+      sqlite3_finalize(stmt);
+    }
+  }
+
+  /* Another KNN to traverse the post-mutation graph */
+  {
+    float qvec[16];
+    for (int j = 0; j < 16; j++) qvec[j] = -0.5f + (float)j * 0.07f;
+    sqlite3_stmt *knnStmt;
+    rc = sqlite3_prepare_v2(db,
+      "SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = 12",
+      -1, &knnStmt, NULL);
+    if (rc == SQLITE_OK) {
+      sqlite3_bind_blob(knnStmt, 1, qvec, sizeof(qvec), SQLITE_STATIC);
+      while (sqlite3_step(knnStmt) == SQLITE_ROW) {}
+      sqlite3_finalize(knnStmt);
+    }
+  }
+
+  /* Full scan */
+  sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL);
+
+  sqlite3_close(db);
+  return 0;
+}
diff --git a/tests/fuzz/diskann-buffer-flush.c b/tests/fuzz/diskann-buffer-flush.c
new file mode 100644
index 0000000..f10e100
--- /dev/null
+++ b/tests/fuzz/diskann-buffer-flush.c
@@ -0,0 +1,164 @@
+/**
+ * Fuzz target for DiskANN buffered insert and flush paths.
+ *
+ * When buffer_threshold > 0, inserts go into a flat buffer table and
+ * are flushed into the graph in batch. This fuzzer exercises:
+ *
+ *   - diskann_buffer_write / diskann_buffer_delete / diskann_buffer_exists
+ *   - diskann_flush_buffer (batch graph insertion)
+ *   - diskann_insert with buffer_threshold (batching logic)
+ *   - Buffer-graph merge in vec0Filter_knn_diskann (unflushed vectors
+ *     must be scanned during KNN and merged with graph results)
+ *   - Delete of a buffered (not yet flushed) vector
+ *   - Delete of a graph vector while buffer has pending inserts
+ *   - Interaction: insert to buffer, query (triggers buffer scan), flush,
+ *     query again (now from graph)
+ *
+ * The buffer merge path in vec0Filter_knn_diskann is particularly
+ * interesting because it does a brute-force scan of buffer vectors and
+ * merges with the top-k from graph search.
+ */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { + if (*size == 0) return def; + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 16) return 0; + + int rc; + sqlite3 *db; + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* buffer_threshold: small (3-8) to trigger frequent flushes */ + int buf_threshold = 3 + (fuzz_byte(&data, &size, 0) % 6); + int dims = 8; + + char sql[512]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] INDEXED BY diskann(" + "neighbor_quantizer=binary, n_neighbors=8, " + "search_list_size=16, buffer_threshold=%d" + "))", dims, buf_threshold); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_stmt *stmtInsert = NULL, *stmtDelete = NULL, *stmtKnn = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
AND k = ?", + -1, &stmtKnn, NULL); + + if (!stmtInsert || !stmtDelete || !stmtKnn) goto cleanup; + + float vec[8]; + int next_rowid = 1; + + while (size >= 2) { + uint8_t op = fuzz_byte(&data, &size, 0) % 6; + uint8_t param = fuzz_byte(&data, &size, 0); + + switch (op) { + case 0: { /* Insert: accumulates in buffer until threshold */ + int64_t rowid = next_rowid++; + if (next_rowid > 64) next_rowid = 1; /* wrap around for reuse */ + for (int j = 0; j < dims; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 1: { /* KNN query while buffer may have unflushed vectors */ + for (int j = 0; j < dims; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f; + } + int k = (param % 10) + 1; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, k); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 2: { /* Delete a potentially-buffered vector */ + int64_t rowid = (int64_t)(param % 64) + 1; + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 3: { /* Insert several at once to trigger flush mid-batch */ + for (int i = 0; i < buf_threshold + 1 && size >= 2; i++) { + int64_t rowid = (int64_t)(fuzz_byte(&data, &size, 0) % 64) + 1; + for (int j = 0; j < dims; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + } + break; + } + case 4: { /* Insert then immediately delete (still in buffer) */ + int64_t rowid = (int64_t)(param % 64) + 1; + for (int j = 0; j < dims; j++) vec[j] = 0.1f * param; + 
sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 5: { /* Query with k=0 and k=1 (boundary) */ + for (int j = 0; j < dims; j++) vec[j] = 0.0f; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, param % 2); /* k=0 or k=1 */ + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + } + } + + /* Final query to exercise post-operation state */ + { + float qvec[8] = {1.0f, -1.0f, 0.5f, -0.5f, 0.0f, 0.0f, 0.0f, 0.0f}; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, 20); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + } + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-command-inject.c b/tests/fuzz/diskann-command-inject.c new file mode 100644 index 0000000..ef62884 --- /dev/null +++ b/tests/fuzz/diskann-command-inject.c @@ -0,0 +1,158 @@ +/** + * Fuzz target for DiskANN runtime command dispatch (diskann_handle_command). + * + * The command handler parses strings like "search_list_size_search=42" and + * modifies live DiskANN config. This fuzzer exercises: + * + * - atoi on fuzz-controlled strings (integer overflow, negative, non-numeric) + * - strncmp boundary with fuzz data (near-matches to valid commands) + * - Changing search_list_size mid-operation (affects subsequent queries) + * - Setting search_list_size to 1 (minimum - single-candidate beam search) + * - Setting search_list_size very large (memory pressure) + * - Interleaving command changes with inserts and queries + * + * Also tests the UPDATE v SET command = ? path through the vtable. 
+ */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { + if (*size == 0) return def; + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 20) return 0; + + int rc; + sqlite3 *db; + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + /* Insert some vectors first */ + { + sqlite3_stmt *stmt; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL); + for (int i = 1; i <= 8; i++) { + float vec[8]; + for (int j = 0; j < 8; j++) vec[j] = (float)i * 0.1f + (float)j * 0.01f; + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, i); + sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + } + + sqlite3_stmt *stmtCmd = NULL; + sqlite3_stmt *stmtInsert = NULL; + sqlite3_stmt *stmtKnn = NULL; + + /* Commands are dispatched via INSERT INTO t(rowid) VALUES ('cmd_string') */ + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid) VALUES (?)", -1, &stmtCmd, NULL); + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
AND k = ?", + -1, &stmtKnn, NULL); + + if (!stmtCmd || !stmtInsert || !stmtKnn) goto cleanup; + + /* Fuzz-driven command + operation interleaving */ + while (size >= 2) { + uint8_t op = fuzz_byte(&data, &size, 0) % 5; + + switch (op) { + case 0: { /* Send fuzz command string */ + int cmd_len = fuzz_byte(&data, &size, 0) % 64; + char cmd[65]; + for (int i = 0; i < cmd_len && size > 0; i++) { + cmd[i] = (char)fuzz_byte(&data, &size, 0); + } + cmd[cmd_len] = '\0'; + sqlite3_reset(stmtCmd); + sqlite3_bind_text(stmtCmd, 1, cmd, -1, SQLITE_TRANSIENT); + sqlite3_step(stmtCmd); /* May fail -- that's expected */ + break; + } + case 1: { /* Send valid-looking command with fuzz value */ + const char *prefixes[] = { + "search_list_size=", + "search_list_size_search=", + "search_list_size_insert=", + }; + int prefix_idx = fuzz_byte(&data, &size, 0) % 3; + int val = (int)(int8_t)fuzz_byte(&data, &size, 0); + + char cmd[128]; + snprintf(cmd, sizeof(cmd), "%s%d", prefixes[prefix_idx], val); + sqlite3_reset(stmtCmd); + sqlite3_bind_text(stmtCmd, 1, cmd, -1, SQLITE_TRANSIENT); + sqlite3_step(stmtCmd); + break; + } + case 2: { /* KNN query (uses whatever search_list_size is set) */ + float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + qvec[0] = (float)((int8_t)fuzz_byte(&data, &size, 127)) / 10.0f; + int k = fuzz_byte(&data, &size, 3) % 10 + 1; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, k); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 3: { /* Insert (uses whatever search_list_size_insert is set) */ + int64_t rowid = (int64_t)(fuzz_byte(&data, &size, 0) % 32) + 1; + float vec[8]; + for (int j = 0; j < 8; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + 
case 4: { /* Set search_list_size to extreme values */ + const char *extreme_cmds[] = { + "search_list_size=1", + "search_list_size=2", + "search_list_size=1000", + "search_list_size_search=1", + "search_list_size_insert=1", + }; + int idx = fuzz_byte(&data, &size, 0) % 5; + sqlite3_reset(stmtCmd); + sqlite3_bind_text(stmtCmd, 1, extreme_cmds[idx], -1, SQLITE_STATIC); + sqlite3_step(stmtCmd); + break; + } + } + } + +cleanup: + sqlite3_finalize(stmtCmd); + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtKnn); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-create.c b/tests/fuzz/diskann-create.c new file mode 100644 index 0000000..1b40a84 --- /dev/null +++ b/tests/fuzz/diskann-create.c @@ -0,0 +1,44 @@ +/** + * Fuzz target for DiskANN CREATE TABLE config parsing. + * Feeds fuzz data as the INDEXED BY diskann(...) option string. + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size > 4096) return 0; /* Limit input size */ + + int rc; + sqlite3 *db; + sqlite3_stmt *stmt; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + sqlite3_str *s = sqlite3_str_new(NULL); + assert(s); + sqlite3_str_appendall(s, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[64] INDEXED BY diskann("); + sqlite3_str_appendf(s, "%.*s", (int)size, data); + sqlite3_str_appendall(s, "))"); + const char *zSql = sqlite3_str_finish(s); + assert(zSql); + + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, NULL); + sqlite3_free((char *)zSql); + if (rc == SQLITE_OK) { + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-deep-search.c b/tests/fuzz/diskann-deep-search.c new file mode 100644 index 0000000..35d548c --- /dev/null +++ b/tests/fuzz/diskann-deep-search.c @@ -0,0 +1,187 @@ +/** + * Fuzz 
target for DiskANN greedy beam search deep paths. + * + * Builds a graph with enough nodes to force multi-hop traversal, then + * uses fuzz data to control: query vector values, k, search_list_size + * overrides, and interleaved insert/delete/query sequences that stress + * the candidate list growth, visited set hash collisions, and the + * re-ranking logic. + * + * Key code paths targeted: + * - diskann_candidate_list_insert (sorted insert, dedup, eviction) + * - diskann_visited_set (hash collisions, capacity) + * - diskann_search (full beam search loop, re-ranking with exact dist) + * - diskann_distance_quantized_precomputed (both binary and int8) + * - Buffer merge in vec0Filter_knn_diskann + */ +#include +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/* Consume one byte from fuzz input, or return default. */ +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { + if (*size == 0) return def; + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +static uint16_t fuzz_u16(const uint8_t **data, size_t *size) { + uint8_t lo = fuzz_byte(data, size, 0); + uint8_t hi = fuzz_byte(data, size, 0); + return (uint16_t)hi << 8 | lo; +} + +static float fuzz_float(const uint8_t **data, size_t *size) { + return (float)((int8_t)fuzz_byte(data, size, 0)) / 10.0f; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 32) return 0; + + /* Use first bytes to pick quantizer type and dimensions */ + uint8_t quantizer_choice = fuzz_byte(&data, &size, 0) % 2; + const char *quantizer = quantizer_choice ? "int8" : "binary"; + + /* Dimensions must be divisible by 8. Pick from {8, 16, 32} */ + int dim_choices[] = {8, 16, 32}; + int dims = dim_choices[fuzz_byte(&data, &size, 0) % 3]; + + /* n_neighbors: 8 or 16 -- small to force full-neighbor scenarios quickly */ + int n_neighbors = (fuzz_byte(&data, &size, 0) % 2) ? 
16 : 8; + + /* search_list_size: small so beam search terminates quickly but still exercises loops */ + int search_list_size = 8 + (fuzz_byte(&data, &size, 0) % 24); + + /* alpha: vary to test RobustPrune pruning logic */ + float alpha_choices[] = {1.0f, 1.2f, 1.5f, 2.0f}; + float alpha = alpha_choices[fuzz_byte(&data, &size, 0) % 4]; + + int rc; + sqlite3 *db; + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + char sql[512]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] INDEXED BY diskann(" + "neighbor_quantizer=%s, n_neighbors=%d, " + "search_list_size=%d" + "))", dims, quantizer, n_neighbors, search_list_size); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_stmt *stmtInsert = NULL, *stmtDelete = NULL, *stmtKnn = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + + char knn_sql[256]; + snprintf(knn_sql, sizeof(knn_sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = ?"); + sqlite3_prepare_v2(db, knn_sql, -1, &stmtKnn, NULL); + + if (!stmtInsert || !stmtDelete || !stmtKnn) goto cleanup; + + /* Phase 1: Seed the graph with enough nodes to create multi-hop structure. + * Insert 2*n_neighbors nodes so the graph is dense enough for search + * to actually traverse multiple hops. 
*/ + int seed_count = n_neighbors * 2; + if (seed_count > 64) seed_count = 64; /* Bound for performance */ + { + float *vec = malloc(dims * sizeof(float)); + if (!vec) goto cleanup; + for (int i = 1; i <= seed_count; i++) { + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_float(&data, &size); + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, i); + sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + } + free(vec); + } + + /* Phase 2: Fuzz-driven operations on the seeded graph */ + float *vec = malloc(dims * sizeof(float)); + if (!vec) goto cleanup; + + while (size >= 2) { + uint8_t op = fuzz_byte(&data, &size, 0) % 5; + uint8_t param = fuzz_byte(&data, &size, 0); + + switch (op) { + case 0: { /* INSERT with fuzz-controlled vector and rowid */ + int64_t rowid = (int64_t)(param % 128) + 1; + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_float(&data, &size); + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 1: { /* DELETE */ + int64_t rowid = (int64_t)(param % 128) + 1; + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 2: { /* KNN with fuzz query vector and variable k */ + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_float(&data, &size); + } + int k = (param % 20) + 1; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, k); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 3: { /* KNN with k > number of nodes (boundary) */ + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_float(&data, &size); + } + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, 1000); /* k >> graph size */ + 
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 4: { /* INSERT duplicate rowid (triggers OR REPLACE path) */ + int64_t rowid = (int64_t)(param % 32) + 1; + for (int j = 0; j < dims; j++) { + vec[j] = (float)(param + j) / 50.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + } + } + free(vec); + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-delete-stress.c b/tests/fuzz/diskann-delete-stress.c new file mode 100644 index 0000000..d10a7ff --- /dev/null +++ b/tests/fuzz/diskann-delete-stress.c @@ -0,0 +1,175 @@ +/** + * Fuzz target for DiskANN delete path and graph connectivity maintenance. + * + * The delete path is the most complex graph mutation: + * 1. Read deleted node's neighbor list + * 2. For each neighbor, remove deleted node from their list + * 3. Try to fill the gap with one of deleted node's other neighbors + * 4. 
Handle medoid deletion (pick new medoid) + * + * Edge cases this targets: + * - Delete the medoid (entry point) -- forces medoid reassignment + * - Delete all nodes except one -- graph degenerates + * - Delete nodes in a chain -- cascading dangling edges + * - Re-insert at deleted rowids -- stale graph edges to old data + * - Delete nonexistent rowids -- should be no-op + * - Insert-delete-insert same rowid rapidly + * - Delete when graph has exactly n_neighbors entries (full nodes) + * + * Key code paths: + * - diskann_delete -> diskann_repair_reverse_edges + * - diskann_medoid_handle_delete + * - diskann_node_clear_neighbor + * - Interaction between delete and concurrent search + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { + if (*size == 0) return def; + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 20) return 0; + + int rc; + sqlite3 *db; + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* int8 quantizer to exercise that distance code path */ + uint8_t quant = fuzz_byte(&data, &size, 0) % 2; + const char *qname = quant ? "int8" : "binary"; + + char sql[256]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[8] INDEXED BY diskann(neighbor_quantizer=%s, n_neighbors=8))", + qname); + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_stmt *stmtInsert = NULL, *stmtDelete = NULL, *stmtKnn = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
AND k = ?", + -1, &stmtKnn, NULL); + + if (!stmtInsert || !stmtDelete || !stmtKnn) goto cleanup; + + /* Phase 1: Build a graph of exactly n_neighbors+2 = 10 nodes. + * This makes every node nearly full, maximizing the chance that + * inserts trigger the "full node" path in add_reverse_edge. */ + for (int i = 1; i <= 10; i++) { + float vec[8]; + for (int j = 0; j < 8; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, (uint8_t)(i*13+j*7))) / 20.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, i); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + } + + /* Phase 2: Fuzz-driven delete-heavy workload */ + while (size >= 2) { + uint8_t op = fuzz_byte(&data, &size, 0); + uint8_t param = fuzz_byte(&data, &size, 0); + + switch (op % 6) { + case 0: /* Delete existing node */ + case 1: { /* (weighted toward deletes) */ + int64_t rowid = (int64_t)(param % 16) + 1; + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 2: { /* Delete then immediately re-insert same rowid */ + int64_t rowid = (int64_t)(param % 10) + 1; + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + + float vec[8]; + for (int j = 0; j < 8; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, (uint8_t)(rowid+j))) / 15.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 3: { /* KNN query on potentially sparse/empty graph */ + float qvec[8]; + for (int j = 0; j < 8; j++) { + qvec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f; + } + int k = (param % 15) + 1; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, k); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + 
case 4: { /* Insert new node */ + int64_t rowid = (int64_t)(param % 32) + 1; + float vec[8]; + for (int j = 0; j < 8; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 5: { /* Delete ALL remaining nodes, then insert fresh */ + for (int i = 1; i <= 32; i++) { + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, i); + sqlite3_step(stmtDelete); + } + /* Now insert one node into empty graph */ + float vec[8] = {1.0f, 0, 0, 0, 0, 0, 0, 0}; + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, 1); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + } + } + + /* Final KNN on whatever state the graph is in */ + { + float qvec[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, 10); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + } + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-graph-corrupt.c b/tests/fuzz/diskann-graph-corrupt.c new file mode 100644 index 0000000..a8dbc19 --- /dev/null +++ b/tests/fuzz/diskann-graph-corrupt.c @@ -0,0 +1,123 @@ +/** + * Fuzz target for DiskANN shadow table corruption resilience. + * Creates and populates a DiskANN table, then corrupts shadow table blobs + * using fuzz data and runs queries. 
+ */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 16) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + /* Insert a few vectors to create graph structure */ + { + sqlite3_stmt *stmt; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL); + for (int i = 1; i <= 10; i++) { + float vec[8]; + for (int j = 0; j < 8; j++) { + vec[j] = (float)i * 0.1f + (float)j * 0.01f; + } + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, i); + sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + } + + /* Corrupt shadow table data using fuzz bytes */ + size_t offset = 0; + + /* Determine which row and column to corrupt */ + int target_row = (data[offset++] % 10) + 1; + int corrupt_type = data[offset++] % 3; /* 0=validity, 1=neighbor_ids, 2=qvecs */ + + const char *column_name; + switch (corrupt_type) { + case 0: column_name = "neighbors_validity"; break; + case 1: column_name = "neighbor_ids"; break; + default: column_name = "neighbor_quantized_vectors"; break; + } + + /* Read the blob, corrupt it, write it back */ + { + sqlite3_stmt *readStmt; + char sqlbuf[256]; + snprintf(sqlbuf, sizeof(sqlbuf), + "SELECT %s FROM v_diskann_nodes00 WHERE rowid = ?", column_name); + rc = sqlite3_prepare_v2(db, sqlbuf, -1, &readStmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_int64(readStmt, 1, target_row); + if (sqlite3_step(readStmt) == SQLITE_ROW) { + const void *blob = sqlite3_column_blob(readStmt, 0); + int blobSize 
= sqlite3_column_bytes(readStmt, 0); + if (blob && blobSize > 0) { + unsigned char *corrupt = sqlite3_malloc(blobSize); + if (corrupt) { + memcpy(corrupt, blob, blobSize); + /* Apply fuzz bytes as XOR corruption */ + size_t remaining = size - offset; + for (size_t i = 0; i < remaining && i < (size_t)blobSize; i++) { + corrupt[i % blobSize] ^= data[offset + i]; + } + /* Write back */ + sqlite3_stmt *writeStmt; + snprintf(sqlbuf, sizeof(sqlbuf), + "UPDATE v_diskann_nodes00 SET %s = ? WHERE rowid = ?", column_name); + rc = sqlite3_prepare_v2(db, sqlbuf, -1, &writeStmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(writeStmt, 1, corrupt, blobSize, SQLITE_TRANSIENT); + sqlite3_bind_int64(writeStmt, 2, target_row); + sqlite3_step(writeStmt); + sqlite3_finalize(writeStmt); + } + sqlite3_free(corrupt); + } + } + } + sqlite3_finalize(readStmt); + } + } + + /* Run queries on corrupted graph -- should not crash */ + { + float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + sqlite3_stmt *knnStmt; + rc = sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = 5", + -1, &knnStmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(knnStmt, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knnStmt) == SQLITE_ROW) {} + sqlite3_finalize(knnStmt); + } + } + + /* Full scan */ + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-int8-quant.c b/tests/fuzz/diskann-int8-quant.c new file mode 100644 index 0000000..f1bd31d --- /dev/null +++ b/tests/fuzz/diskann-int8-quant.c @@ -0,0 +1,164 @@ +/** + * Fuzz target for DiskANN int8 quantizer edge cases. + * + * The binary quantizer is simple (sign bit), but the int8 quantizer has + * interesting arithmetic: + * i8_val = (i8)(((src - (-1.0f)) / step) - 128.0f) + * where step = 2.0f / 255.0f + * + * Edge cases in this formula: + * - src values outside [-1, 1] cause clamping issues (no explicit clamp!) 
+ * - src = NaN, +Inf, -Inf (from corrupted vectors or div-by-zero) + * - src very close to boundaries (-1.0, 1.0) -- rounding + * - The cast to i8 can overflow for extreme src values + * + * Also exercises int8 distance functions: + * - distance_l2_sqr_int8: accumulates squared differences, possible overflow + * - distance_cosine_int8: dot product with normalization + * - distance_l1_int8: absolute differences + * + * This fuzzer also tests the cosine distance metric path which the + * other fuzzers (using L2 default) don't cover. + */ +#include +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { + if (*size == 0) return def; + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +static float fuzz_extreme_float(const uint8_t **data, size_t *size) { + uint8_t mode = fuzz_byte(data, size, 0) % 8; + uint8_t raw = fuzz_byte(data, size, 0); + switch (mode) { + case 0: return (float)((int8_t)raw) / 10.0f; /* Normal range */ + case 1: return (float)((int8_t)raw) * 100.0f; /* Large values */ + case 2: return (float)((int8_t)raw) / 1000.0f; /* Tiny values near 0 */ + case 3: return -1.0f; /* Exact boundary */ + case 4: return 1.0f; /* Exact boundary */ + case 5: return 0.0f; /* Zero */ + case 6: return (float)raw / 255.0f; /* [0, 1] range */ + case 7: return -(float)raw / 255.0f; /* [-1, 0] range */ + } + return 0.0f; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 40) return 0; + + int rc; + sqlite3 *db; + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* Test both distance metrics with int8 quantizer */ + uint8_t metric_choice = fuzz_byte(&data, &size, 0) % 2; + const char *metric = metric_choice ? 
"cosine" : "L2"; + + int dims = 8 + (fuzz_byte(&data, &size, 0) % 3) * 8; /* 8, 16, or 24 */ + + char sql[512]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] distance_metric=%s " + "INDEXED BY diskann(neighbor_quantizer=int8, n_neighbors=8, search_list_size=16))", + dims, metric); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_stmt *stmtInsert = NULL, *stmtKnn = NULL, *stmtDelete = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = ?", + -1, &stmtKnn, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + + if (!stmtInsert || !stmtKnn || !stmtDelete) goto cleanup; + + /* Insert vectors with extreme float values to stress quantization */ + float *vec = malloc(dims * sizeof(float)); + if (!vec) goto cleanup; + + for (int i = 1; i <= 16; i++) { + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_extreme_float(&data, &size); + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, i); + sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + } + + /* Fuzz-driven operations */ + while (size >= 2) { + uint8_t op = fuzz_byte(&data, &size, 0) % 4; + uint8_t param = fuzz_byte(&data, &size, 0); + + switch (op) { + case 0: { /* KNN with extreme query values */ + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_extreme_float(&data, &size); + } + int k = (param % 10) + 1; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, k); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 1: { /* Insert with extreme values */ + int64_t rowid = (int64_t)(param % 32) + 1; + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_extreme_float(&data, &size); + } 
+ sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 2: { /* Delete */ + int64_t rowid = (int64_t)(param % 32) + 1; + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 3: { /* KNN with all-zero or all-same-value query */ + float val = (param % 3 == 0) ? 0.0f : + (param % 3 == 1) ? 1.0f : -1.0f; + for (int j = 0; j < dims; j++) vec[j] = val; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, 5); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + } + } + + free(vec); + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtKnn); + sqlite3_finalize(stmtDelete); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-operations.c b/tests/fuzz/diskann-operations.c new file mode 100644 index 0000000..b36620b --- /dev/null +++ b/tests/fuzz/diskann-operations.c @@ -0,0 +1,100 @@ +/** + * Fuzz target for DiskANN insert/delete/query operation sequences. + * Uses fuzz bytes to drive random operations on a DiskANN-indexed table. 
+ */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 6) return 0; + + int rc; + sqlite3 *db; + sqlite3_stmt *stmtInsert = NULL; + sqlite3_stmt *stmtDelete = NULL; + sqlite3_stmt *stmtKnn = NULL; + sqlite3_stmt *stmtScan = NULL; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = 3", + -1, &stmtKnn, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid FROM v", -1, &stmtScan, NULL); + + if (!stmtInsert || !stmtDelete || !stmtKnn || !stmtScan) goto cleanup; + + size_t i = 0; + while (i + 2 <= size) { + uint8_t op = data[i++] % 4; + uint8_t rowid_byte = data[i++]; + int64_t rowid = (int64_t)(rowid_byte % 32) + 1; + + switch (op) { + case 0: { + /* INSERT: consume 32 bytes for 8 floats, or use what's left */ + float vec[8] = {0}; + for (int j = 0; j < 8 && i < size; j++, i++) { + vec[j] = (float)((int8_t)data[i]) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 1: { + /* DELETE */ + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 2: { + /* KNN query */ + float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + sqlite3_reset(stmtKnn); + 
sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 3: { + /* Full scan */ + sqlite3_reset(stmtScan); + while (sqlite3_step(stmtScan) == SQLITE_ROW) {} + break; + } + } + } + + /* Final operations -- must not crash regardless of prior state */ + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_finalize(stmtScan); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-prune-direct.c b/tests/fuzz/diskann-prune-direct.c new file mode 100644 index 0000000..7a440ad --- /dev/null +++ b/tests/fuzz/diskann-prune-direct.c @@ -0,0 +1,131 @@ +/** + * Fuzz target for DiskANN RobustPrune algorithm (diskann_prune_select). + * + * diskann_prune_select is exposed for testing and takes: + * - inter_distances: flattened NxN matrix of inter-candidate distances + * - p_distances: N distances from node p to each candidate + * - num_candidates, alpha, max_neighbors + * + * This is a pure function that doesn't need a database, so we can + * call it directly with fuzz-controlled inputs. This gives the fuzzer + * maximum speed (no SQLite overhead) to explore: + * + * - alpha boundary: alpha=0 (prunes nothing), alpha=very large (prunes all) + * - max_neighbors = 0, 1, num_candidates, > num_candidates + * - num_candidates = 0, 1, large + * - Distance matrices with: all zeros, all same, negative values, NaN, Inf + * - Non-symmetric distance matrices (should still work) + * - Memory: large num_candidates to stress malloc + * + * Key code paths: + * - diskann_prune_select alpha-pruning loop + * - Boundary: selectedCount reaches max_neighbors exactly + * - All candidates pruned before max_neighbors reached + */ +#include +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/* Declare the test-exposed function. 
+ * diskann_prune_select is not static -- it's a public symbol. */ +extern int diskann_prune_select( + const float *inter_distances, const float *p_distances, + int num_candidates, float alpha, int max_neighbors, + int *outSelected, int *outCount); + +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { + if (*size == 0) return def; + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 8) return 0; + + /* Consume parameters from fuzz data */ + int num_candidates = fuzz_byte(&data, &size, 0) % 33; /* 0..32 */ + int max_neighbors = fuzz_byte(&data, &size, 0) % 17; /* 0..16 */ + + /* Alpha: pick from interesting values */ + uint8_t alpha_idx = fuzz_byte(&data, &size, 0) % 8; + float alpha_values[] = {0.0f, 0.5f, 1.0f, 1.2f, 1.5f, 2.0f, 10.0f, 100.0f}; + float alpha = alpha_values[alpha_idx]; + + if (num_candidates == 0) { + /* Test empty case */ + int outCount = -1; + int rc = diskann_prune_select(NULL, NULL, 0, alpha, max_neighbors, + NULL, &outCount); + assert(rc == 0 /* SQLITE_OK */); + assert(outCount == 0); + return 0; + } + + /* Allocate arrays */ + int n = num_candidates; + float *inter_distances = malloc(n * n * sizeof(float)); + float *p_distances = malloc(n * sizeof(float)); + int *outSelected = malloc(n * sizeof(int)); + if (!inter_distances || !p_distances || !outSelected) { + free(inter_distances); + free(p_distances); + free(outSelected); + return 0; + } + + /* Fill p_distances from fuzz data (sorted ascending for correct input) */ + for (int i = 0; i < n; i++) { + uint8_t raw = fuzz_byte(&data, &size, (uint8_t)(i * 10)); + p_distances[i] = (float)raw / 10.0f; + } + /* Sort p_distances ascending (prune_select expects sorted input) */ + for (int i = 1; i < n; i++) { + float tmp = p_distances[i]; + int j = i - 1; + while (j >= 0 && p_distances[j] > tmp) { + p_distances[j + 1] = p_distances[j]; + j--; + } + p_distances[j + 1] = tmp; + } + + /* Fill 
inter-distance matrix from fuzz data */ + for (int i = 0; i < n * n; i++) { + uint8_t raw = fuzz_byte(&data, &size, (uint8_t)(i % 256)); + inter_distances[i] = (float)raw / 10.0f; + } + /* Make diagonal zero */ + for (int i = 0; i < n; i++) { + inter_distances[i * n + i] = 0.0f; + } + + int outCount = -1; + int rc = diskann_prune_select(inter_distances, p_distances, + n, alpha, max_neighbors, + outSelected, &outCount); + /* Basic sanity: should not crash, count should be valid */ + assert(rc == 0); + assert(outCount >= 0); + assert(outCount <= max_neighbors || max_neighbors == 0); + assert(outCount <= n); + + /* Verify outSelected flags are consistent with outCount */ + int flagCount = 0; + for (int i = 0; i < n; i++) { + if (outSelected[i]) flagCount++; + } + assert(flagCount == outCount); + + free(inter_distances); + free(p_distances); + free(outSelected); + return 0; +} diff --git a/tests/fuzz/diskann.dict b/tests/fuzz/diskann.dict new file mode 100644 index 0000000..31d289d --- /dev/null +++ b/tests/fuzz/diskann.dict @@ -0,0 +1,10 @@ +"neighbor_quantizer" +"binary" +"int8" +"n_neighbors" +"search_list_size" +"search_list_size_search" +"search_list_size_insert" +"alpha" +"=" +"," diff --git a/tests/sqlite-vec-internal.h b/tests/sqlite-vec-internal.h index 67f1370..313add4 100644 --- a/tests/sqlite-vec-internal.h +++ b/tests/sqlite-vec-internal.h @@ -73,6 +73,7 @@ enum Vec0IndexType { VEC0_INDEX_TYPE_RESCORE = 2, #endif VEC0_INDEX_TYPE_IVF = 3, + VEC0_INDEX_TYPE_DISKANN = 4, }; enum Vec0RescoreQuantizerType { @@ -114,6 +115,20 @@ struct Vec0RescoreConfig { }; #endif +enum Vec0DiskannQuantizerType { + VEC0_DISKANN_QUANTIZER_BINARY = 1, + VEC0_DISKANN_QUANTIZER_INT8 = 2, +}; + +struct Vec0DiskannConfig { + enum Vec0DiskannQuantizerType quantizer_type; + int n_neighbors; + int search_list_size; + int search_list_size_search; + int search_list_size_insert; + float alpha; + int buffer_threshold; +}; struct VectorColumnDefinition { char *name; @@ -126,6 +141,7 @@ 
struct VectorColumnDefinition { struct Vec0RescoreConfig rescore; #endif struct Vec0IvfConfig ivf; + struct Vec0DiskannConfig diskann; }; int vec0_parse_vector_column(const char *source, int source_length, @@ -136,6 +152,48 @@ int vec0_parse_partition_key_definition(const char *source, int source_length, int *out_column_name_length, int *out_column_type); +size_t diskann_quantized_vector_byte_size( + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions); + +int diskann_validity_byte_size(int n_neighbors); +size_t diskann_neighbor_ids_byte_size(int n_neighbors); +size_t diskann_neighbor_qvecs_byte_size( + int n_neighbors, enum Vec0DiskannQuantizerType quantizer_type, + size_t dimensions); +int diskann_node_init( + int n_neighbors, enum Vec0DiskannQuantizerType quantizer_type, + size_t dimensions, + unsigned char **outValidity, int *outValiditySize, + unsigned char **outNeighborIds, int *outNeighborIdsSize, + unsigned char **outNeighborQvecs, int *outNeighborQvecsSize); +int diskann_validity_get(const unsigned char *validity, int i); +void diskann_validity_set(unsigned char *validity, int i, int value); +int diskann_validity_count(const unsigned char *validity, int n_neighbors); +long long diskann_neighbor_id_get(const unsigned char *neighbor_ids, int i); +void diskann_neighbor_id_set(unsigned char *neighbor_ids, int i, long long rowid); +const unsigned char *diskann_neighbor_qvec_get( + const unsigned char *qvecs, int i, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions); +void diskann_neighbor_qvec_set( + unsigned char *qvecs, int i, const unsigned char *src_qvec, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions); +void diskann_node_set_neighbor( + unsigned char *validity, unsigned char *neighbor_ids, unsigned char *qvecs, int i, + long long neighbor_rowid, const unsigned char *neighbor_qvec, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions); +void diskann_node_clear_neighbor( + unsigned char *validity, 
unsigned char *neighbor_ids, unsigned char *qvecs, int i, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions); +int diskann_quantize_vector( + const float *src, size_t dimensions, + enum Vec0DiskannQuantizerType quantizer_type, + unsigned char *out); + +int diskann_prune_select( + const float *inter_distances, const float *p_distances, + int num_candidates, float alpha, int max_neighbors, + int *outSelected, int *outCount); + #ifdef SQLITE_VEC_TEST float _test_distance_l2_sqr_float(const float *a, const float *b, size_t dims); float _test_distance_cosine_float(const float *a, const float *b, size_t dims); @@ -151,6 +209,33 @@ size_t _test_rescore_quantized_byte_size_int8(size_t dimensions); void ivf_quantize_int8(const float *src, int8_t *dst, int D); void ivf_quantize_binary(const float *src, uint8_t *dst, int D); #endif +// DiskANN candidate list (opaque struct, use accessors) +struct DiskannCandidateList { + void *items; // opaque + int count; + int capacity; +}; + +int _test_diskann_candidate_list_init(struct DiskannCandidateList *list, int capacity); +void _test_diskann_candidate_list_free(struct DiskannCandidateList *list); +int _test_diskann_candidate_list_insert(struct DiskannCandidateList *list, long long rowid, float distance); +int _test_diskann_candidate_list_next_unvisited(const struct DiskannCandidateList *list); +int _test_diskann_candidate_list_count(const struct DiskannCandidateList *list); +long long _test_diskann_candidate_list_rowid(const struct DiskannCandidateList *list, int i); +float _test_diskann_candidate_list_distance(const struct DiskannCandidateList *list, int i); +void _test_diskann_candidate_list_set_visited(struct DiskannCandidateList *list, int i); + +// DiskANN visited set (opaque struct, use accessors) +struct DiskannVisitedSet { + void *slots; // opaque + int capacity; + int count; +}; + +int _test_diskann_visited_set_init(struct DiskannVisitedSet *set, int capacity); +void _test_diskann_visited_set_free(struct 
DiskannVisitedSet *set); +int _test_diskann_visited_set_contains(const struct DiskannVisitedSet *set, long long rowid); +int _test_diskann_visited_set_insert(struct DiskannVisitedSet *set, long long rowid); #endif #endif /* SQLITE_VEC_INTERNAL_H */ diff --git a/tests/test-diskann.py b/tests/test-diskann.py new file mode 100644 index 0000000..4c049ce --- /dev/null +++ b/tests/test-diskann.py @@ -0,0 +1,1160 @@ +import sqlite3 +import struct +import pytest +from helpers import _f32, exec + + +def test_diskann_create_basic(db): + """Basic DiskANN table creation with binary quantizer should succeed.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + # Table should exist + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like 't%' order by 1" + ).fetchall() + ] + assert "t" in tables + + +def test_diskann_create_int8_quantizer(db): + """DiskANN with int8 quantizer should succeed.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=int8) + ) + """) + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like 't%' order by 1" + ).fetchall() + ] + assert "t" in tables + + +def test_diskann_create_with_options(db): + """DiskANN with custom n_neighbors and search_list_size.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann( + neighbor_quantizer=binary, + n_neighbors=48, + search_list_size=256 + ) + ) + """) + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like 't%' order by 1" + ).fetchall() + ] + assert "t" in tables + + +def test_diskann_create_with_distance_metric(db): + """DiskANN combined with distance_metric should work.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] distance_metric=cosine INDEXED BY diskann(neighbor_quantizer=binary) + ) + 
""") + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like 't%' order by 1" + ).fetchall() + ] + assert "t" in tables + + +def test_diskann_create_error_missing_quantizer(db): + """Error when neighbor_quantizer is not specified.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann(n_neighbors=72) + ) + """) + assert "error" in result + + +def test_diskann_create_error_empty_parens(db): + """Error on empty parens.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann() + ) + """) + assert "error" in result + + +def test_diskann_create_error_unknown_quantizer(db): + """Error on unknown quantizer type.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann(neighbor_quantizer=unknown) + ) + """) + assert "error" in result + + +def test_diskann_create_error_bit_column(db): + """Error: DiskANN not supported on bit vector columns.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb bit[128] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + assert "error" in result + assert "bit" in result["message"].lower() or "DiskANN" in result["message"] + + +def test_diskann_create_error_binary_quantizer_odd_dims(db): + """Error: binary quantizer requires dimensions divisible by 8.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[13] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + assert "error" in result + assert "divisible" in result["message"].lower() + + +def test_diskann_create_error_bad_n_neighbors(db): + """Error: n_neighbors must be divisible by 8.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=13) + ) + """) + assert "error" in result + + +def test_diskann_shadow_tables_created(db): + """DiskANN table should create _vectors00 and 
_diskann_nodes00 shadow tables.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + tables = sorted([ + row[0] + for row in db.execute( + "select name from sqlite_master where type='table' and name like 't_%' order by 1" + ).fetchall() + ]) + assert "t_vectors00" in tables + assert "t_diskann_nodes00" in tables + # DiskANN columns should NOT have _vector_chunks00 + assert "t_vector_chunks00" not in tables + + +def test_diskann_medoid_in_info(db): + """_info table should contain diskann_medoid_00 key with NULL value.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + row = db.execute( + "SELECT key, value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone() + assert row is not None + assert row[0] == "diskann_medoid_00" + assert row[1] is None + + +def test_non_diskann_no_extra_tables(db): + """Non-DiskANN table must NOT create _vectors or _diskann_nodes tables.""" + db.execute("CREATE VIRTUAL TABLE t USING vec0(emb float[64])") + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where type='table' and name like 't_%' order by 1" + ).fetchall() + ] + assert "t_vectors00" not in tables + assert "t_diskann_nodes00" not in tables + assert "t_vector_chunks00" in tables + + +def test_diskann_medoid_initial_null(db): + """Medoid should be NULL initially (empty graph).""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + row = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone() + assert row[0] is None + + +def test_diskann_medoid_set_via_info(db): + """Setting medoid via _info table should be retrievable.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + # Manually set medoid to 
simulate first insert + db.execute("UPDATE t_info SET value = 42 WHERE key = 'diskann_medoid_00'") + row = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone() + assert row[0] == 42 + + # Reset to NULL (empty graph) + db.execute("UPDATE t_info SET value = NULL WHERE key = 'diskann_medoid_00'") + row = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone() + assert row[0] is None + + +def test_diskann_single_insert(db): + """Insert 1 vector. Verify _vectors00, _diskann_nodes00, and medoid.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + db.execute( + "INSERT INTO t(rowid, emb) VALUES (1, ?)", + [_f32([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + # Verify _vectors00 has 1 row + count = db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] + assert count == 1 + + # Verify _diskann_nodes00 has 1 row + count = db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] + assert count == 1 + + # Verify medoid is set + medoid = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone()[0] + assert medoid == 1 + + +def test_diskann_multiple_inserts(db): + """Insert multiple vectors. 
Verify counts and that nodes have neighbors.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + import random + random.seed(42) + for i in range(1, 21): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + # Verify counts + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 20 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 20 + + # Every node after the first should have at least 1 neighbor + rows = db.execute( + "SELECT rowid, neighbors_validity FROM t_diskann_nodes00" + ).fetchall() + nodes_with_neighbors = 0 + for row in rows: + validity = row[1] + has_neighbor = any(b != 0 for b in validity) + if has_neighbor: + nodes_with_neighbors += 1 + # At minimum, nodes 2-20 should have neighbors (node 1 gets neighbors via reverse edges) + assert nodes_with_neighbors >= 19 + + +def test_diskann_bidirectional_edges(db): + """Insert A then B. 
B should be in A's neighbors and A in B's.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute( + "INSERT INTO t(rowid, emb) VALUES (1, ?)", + [_f32([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + db.execute( + "INSERT INTO t(rowid, emb) VALUES (2, ?)", + [_f32([0.9, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + + # Check B(2) is in A(1)'s neighbor list + row_a = db.execute( + "SELECT neighbor_ids FROM t_diskann_nodes00 WHERE rowid = 1" + ).fetchone() + neighbor_ids_a = struct.unpack(f"{len(row_a[0])//8}q", row_a[0]) + assert 2 in neighbor_ids_a + + # Check A(1) is in B(2)'s neighbor list + row_b = db.execute( + "SELECT neighbor_ids FROM t_diskann_nodes00 WHERE rowid = 2" + ).fetchone() + neighbor_ids_b = struct.unpack(f"{len(row_b[0])//8}q", row_b[0]) + assert 1 in neighbor_ids_b + + +def test_diskann_delete_single(db): + """Insert 3, delete 1. Verify counts.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(1, 4): + db.execute( + "INSERT INTO t(rowid, emb) VALUES (?, ?)", + [i, _f32([float(i)] * 8)], + ) + db.execute("DELETE FROM t WHERE rowid = 2") + + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 2 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 2 + + +def test_diskann_delete_no_stale_references(db): + """After delete, no node should reference the deleted rowid.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + import random + random.seed(123) + for i in range(1, 11): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + db.execute("DELETE FROM t WHERE rowid = 5") + + # Scan all remaining nodes and verify rowid 5 is not in any 
neighbor list + rows = db.execute( + "SELECT rowid, neighbors_validity, neighbor_ids FROM t_diskann_nodes00" + ).fetchall() + for row in rows: + validity = row[1] + neighbor_ids_blob = row[2] + n_neighbors = len(validity) * 8 + ids = struct.unpack(f"{n_neighbors}q", neighbor_ids_blob) + for i in range(n_neighbors): + byte_idx = i // 8 + bit_idx = i % 8 + if validity[byte_idx] & (1 << bit_idx): + assert ids[i] != 5, f"Node {row[0]} still references deleted rowid 5" + + +def test_diskann_delete_medoid(db): + """Delete the medoid. Verify a new non-NULL medoid is selected.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(1, 4): + db.execute( + "INSERT INTO t(rowid, emb) VALUES (?, ?)", + [i, _f32([float(i)] * 8)], + ) + + medoid_before = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone()[0] + assert medoid_before == 1 + + db.execute("DELETE FROM t WHERE rowid = 1") + + medoid_after = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone()[0] + assert medoid_after is not None + assert medoid_after != 1 + + +def test_diskann_delete_all(db): + """Delete all vectors. Medoid should be NULL.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(1, 4): + db.execute( + "INSERT INTO t(rowid, emb) VALUES (?, ?)", + [i, _f32([float(i)] * 8)], + ) + for i in range(1, 4): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 0 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 0 + + medoid = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone()[0] + assert medoid is None + + +def test_diskann_insert_delete_insert_cycle(db): + """Insert, delete, insert again. 
No crashes.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1.0] * 8)]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([2.0] * 8)]) + db.execute("DELETE FROM t WHERE rowid = 1") + db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([3.0] * 8)]) + + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 2 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 2 + + +def test_diskann_knn_basic(db): + """Basic KNN query should return results.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1, 0, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([0, 1, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([0.9, 0.1, 0, 0, 0, 0, 0, 0])]) + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=2", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + assert len(rows) == 2 + # Closest should be rowid 1 (exact match) + assert rows[0][0] == 1 + assert rows[0][1] < 0.01 # ~0 distance + + +def test_diskann_knn_distances_sorted(db): + """Returned distances should be in ascending order.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(42) + for i in range(1, 51): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=10", + [_f32([0.0] * 8)], + ).fetchall() + assert len(rows) == 10 + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1], f"Distances not sorted at index {i}" + + +def test_diskann_knn_empty_table(db): + """KNN on empty table should return 0 results.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=5", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + assert len(rows) == 0 + + +def test_diskann_knn_after_delete(db): + """KNN after delete should not return deleted rows.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1, 0, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([0, 1, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([0.5, 0.5, 0, 0, 0, 0, 0, 0])]) + db.execute("DELETE FROM t WHERE rowid = 1") + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=3", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + rowids = [r[0] for r in rows] + assert 1 not in rowids + assert len(rows) == 2 + + +def test_diskann_no_index_still_works(db): + """Tables without INDEXED BY should still work identically.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[4] + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1, 2, 3, 4])]) + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=1", + [_f32([1, 2, 3, 4])], + ).fetchall() + assert len(rows) == 1 + assert rows[0][0] == 1 + + +def test_diskann_drop_table(db): + """DROP TABLE should clean up all shadow tables.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + db.execute("DROP TABLE t") + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like 't%'" + ).fetchall() + ] + assert len(tables) == 0 + + +def test_diskann_create_split_search_list_size(db): + """DiskANN with separate search_list_size_search and search_list_size_insert.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann( + neighbor_quantizer=binary, + search_list_size_search=256, + search_list_size_insert=64 + ) + ) + """) + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like 't%' order by 1" + ).fetchall() + ] + assert "t" in tables + + +def test_diskann_create_error_mixed_search_list_size(db): + """Error when mixing search_list_size with search_list_size_search.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann( + neighbor_quantizer=binary, + search_list_size=128, + search_list_size_search=256 + ) + ) + """) + assert "error" in result + + +def test_diskann_command_search_list_size(db): + """Runtime search_list_size override via command insert.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + import struct, random + random.seed(42) + for i in range(20): + vec = struct.pack("64f", *[random.random() for _ in range(64)]) + db.execute("INSERT INTO t(emb) VALUES (?)", [vec]) + + # Query with default search_list_size + query = struct.pack("64f", *[random.random() for _ in range(64)]) + results_before = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k = 5", [query] + ).fetchall() + assert len(results_before) == 5 + + # Override search_list_size_search at runtime + db.execute("INSERT INTO t(rowid) VALUES ('search_list_size_search=256')") + + # Query should still work + results_after = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k = 5", [query] + ).fetchall() + assert len(results_after) == 5 + + # Override search_list_size_insert at runtime + db.execute("INSERT INTO t(rowid) VALUES ('search_list_size_insert=32')") + + # Inserts should still work + vec = struct.pack("64f", *[random.random() for _ in range(64)]) + db.execute("INSERT INTO t(emb) VALUES (?)", [vec]) + + # Override unified search_list_size + db.execute("INSERT INTO t(rowid) VALUES ('search_list_size=64')") + + results_final = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k = 5", [query] + ).fetchall() + assert len(results_final) == 5 + + +def test_diskann_command_search_list_size_error(db): + """Error on invalid search_list_size command value.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + result = exec(db, "INSERT INTO t(rowid) VALUES ('search_list_size=0')") + assert "error" in result + result = exec(db, "INSERT INTO t(rowid) VALUES ('search_list_size=-1')") + assert "error" in result + + +# ====================================================================== +# Error cases: DiskANN + auxiliary/metadata/partition columns +# ====================================================================== + +def test_diskann_create_error_with_auxiliary_column(db): + """DiskANN tables should not support auxiliary columns.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary), + +extra text + ) + """) + assert "error" in result + assert "auxiliary" in result["message"].lower() or "Auxiliary" in result["message"] + + +def 
test_diskann_create_error_with_metadata_column(db): + """DiskANN tables should not support metadata columns.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary), + metadata_col integer metadata + ) + """) + assert "error" in result + assert "metadata" in result["message"].lower() or "Metadata" in result["message"] + + +def test_diskann_create_error_with_partition_key(db): + """DiskANN tables should not support partition key columns.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary), + user_id text partition key + ) + """) + assert "error" in result + assert "partition" in result["message"].lower() or "Partition" in result["message"] + + +# ====================================================================== +# Insert edge cases +# ====================================================================== + +def test_diskann_insert_no_rowid(db): + """INSERT without explicit rowid (auto-generated) should work.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + db.execute("INSERT INTO t(emb) VALUES (?)", [_f32([1.0] * 8)]) + db.execute("INSERT INTO t(emb) VALUES (?)", [_f32([2.0] * 8)]) + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 2 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 2 + + +def test_diskann_insert_large_batch(db): + """INSERT 500+ vectors, verify all are queryable via KNN.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[16] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(99) + N = 500 + for i in range(1, N + 1): + vec = [random.gauss(0, 1) for _ in range(16)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + assert db.execute("SELECT count(*) FROM 
t_vectors00").fetchone()[0] == N + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == N + + # KNN should return results + query = [random.gauss(0, 1) for _ in range(16)] + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=10", + [_f32(query)], + ).fetchall() + assert len(rows) == 10 + # Distances should be sorted + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1] + + +def test_diskann_insert_zero_vector(db): + """Insert an all-zero vector (edge case for binary quantizer).""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([0.0] * 8)]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([1.0] * 8)]) + count = db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] + assert count == 2 + + # Query with zero vector should find rowid 1 as closest + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=2", + [_f32([0.0] * 8)], + ).fetchall() + assert len(rows) == 2 + assert rows[0][0] == 1 + + +def test_diskann_insert_large_values(db): + """Insert vectors with very large float values.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + import sys + large = sys.float_info.max / 1e300 # Large but not overflowing + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([large] * 8)]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([-large] * 8)]) + db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([0.0] * 8)]) + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 3 + + +def test_diskann_insert_int8_quantizer_knn(db): + """Full insert + query cycle with int8 quantizer.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[16] INDEXED BY diskann(neighbor_quantizer=int8, n_neighbors=8) + ) + """) + import random + random.seed(77) + for i in range(1, 31): + vec = [random.gauss(0, 1) for _ in range(16)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 30 + + # KNN should work + query = [random.gauss(0, 1) for _ in range(16)] + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=5", + [_f32(query)], + ).fetchall() + assert len(rows) == 5 + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1] + + +# ====================================================================== +# Delete edge cases +# ====================================================================== + +def test_diskann_delete_nonexistent(db): + """DELETE of a nonexistent rowid should either be a no-op or return an error, not crash.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1.0] * 8)]) + # Deleting a nonexistent rowid may error but should not crash + result = exec(db, "DELETE FROM t WHERE rowid = 999") + # Whether it succeeds or errors, the existing row should still be there + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 1 + + +def test_diskann_delete_then_reinsert_same_rowid(db): + """Delete rowid 5, then reinsert rowid 5 with a new vector.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(1, 6): + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32([float(i)] * 8)]) + + db.execute("DELETE FROM t WHERE rowid = 5") + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 4 + + # Reinsert with new vector + db.execute("INSERT INTO t(rowid, emb) VALUES (5, ?)", [_f32([99.0] * 8)]) + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 5 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 5 + + +def test_diskann_delete_all_then_insert(db): + """Delete everything, then insert new vectors. 
Graph should rebuild.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(1, 6): + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32([float(i)] * 8)]) + + # Delete all + for i in range(1, 6): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 0 + + medoid = db.execute("SELECT value FROM t_info WHERE key = 'diskann_medoid_00'").fetchone()[0] + assert medoid is None + + # Insert new vectors + for i in range(10, 15): + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32([float(i)] * 8)]) + + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 5 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 5 + + medoid = db.execute("SELECT value FROM t_info WHERE key = 'diskann_medoid_00'").fetchone()[0] + assert medoid is not None + + # KNN should work + rows = db.execute( + "SELECT rowid FROM t WHERE emb MATCH ? 
AND k=3", + [_f32([12.0] * 8)], + ).fetchall() + assert len(rows) == 3 + + +def test_diskann_delete_preserves_graph_connectivity(db): + """After deleting a node, remaining nodes should still be reachable via KNN.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + import random + random.seed(456) + for i in range(1, 21): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + # Delete 5 nodes + for i in [3, 7, 11, 15, 19]: + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + remaining = db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] + assert remaining == 15 + + # Every remaining node should be reachable via KNN (appears somewhere in top-k) + all_rowids = [r[0] for r in db.execute("SELECT rowid FROM t_vectors00").fetchall()] + reachable = set() + for rid in all_rowids: + vec_blob = db.execute("SELECT vector FROM t_vectors00 WHERE rowid = ?", [rid]).fetchone()[0] + rows = db.execute( + "SELECT rowid FROM t WHERE emb MATCH ? 
AND k=5", + [vec_blob], + ).fetchall() + assert len(rows) >= 1 # At least some results + for r in rows: + reachable.add(r[0]) + # Most nodes should be reachable through the graph + assert len(reachable) >= len(all_rowids) * 0.8, \ + f"Only {len(reachable)}/{len(all_rowids)} nodes reachable" + + +# ====================================================================== +# Update scenarios +# ====================================================================== + +def test_diskann_update_vector(db): + """UPDATE a vector on DiskANN table may not be supported; verify it either works or errors cleanly.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1, 0, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([0, 1, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([0, 0, 1, 0, 0, 0, 0, 0])]) + + # UPDATE may not be fully supported for DiskANN yet; verify no crash + result = exec(db, "UPDATE t SET emb = ? WHERE rowid = 1", [_f32([0, 0.9, 0.1, 0, 0, 0, 0, 0])]) + if "error" not in result: + # If UPDATE succeeded, verify KNN reflects the new value + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=3", + [_f32([0, 1, 0, 0, 0, 0, 0, 0])], + ).fetchall() + assert len(rows) == 3 + # rowid 2 should still be closest (exact match) + assert rows[0][0] == 2 + + +# ====================================================================== +# KNN correctness after mutations +# ====================================================================== + +def test_diskann_knn_recall_after_inserts(db): + """Insert N vectors, verify top-1 recall is 100% for exact matches.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(200) + vectors = {} + for i in range(1, 51): + vec = [random.gauss(0, 1) for _ in range(8)] + vectors[i] = vec + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + # Top-1 for each vector should return itself + correct = 0 + for rid, vec in vectors.items(): + rows = db.execute( + "SELECT rowid FROM t WHERE emb MATCH ? AND k=1", + [_f32(vec)], + ).fetchall() + if rows and rows[0][0] == rid: + correct += 1 + + # With binary quantizer, approximate search may not always return exact match + # but should have high recall (at least 80%) + assert correct >= len(vectors) * 0.8, f"Top-1 recall too low: {correct}/{len(vectors)}" + + +def test_diskann_knn_k_larger_than_table(db): + """k=100 on table with 5 rows should return 5.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(1, 6): + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32([float(i)] * 8)]) + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=100", + [_f32([3.0] * 8)], + ).fetchall() + assert len(rows) == 5 + + +def test_diskann_knn_cosine_metric(db): + """KNN with cosine distance metric.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] distance_metric=cosine INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + # Insert orthogonal-ish vectors + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1, 0, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([0, 1, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([0.7, 0.7, 0, 0, 0, 0, 0, 0])]) + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=3", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + assert len(rows) == 3 + # rowid 1 should be closest (exact match in direction) + assert rows[0][0] == 1 + # Distances should be sorted + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1] + + +def test_diskann_knn_after_heavy_churn(db): + """Interleave many inserts and deletes, then query.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(321) + + # Insert 50 vectors + for i in range(1, 51): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + # Delete even-numbered rows + for i in range(2, 51, 2): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + # Insert more vectors with higher rowids + for i in range(51, 76): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + remaining = db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] + assert remaining == 50 # 25 odd + 25 new + + # KNN should still work and return results + query = [random.gauss(0, 1) for _ in range(8)] + rows = 
db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=10", + [_f32(query)], + ).fetchall() + assert len(rows) == 10 + # Distances should be sorted + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1] + + +def test_diskann_knn_batch_recall(db): + """Insert 100+ vectors and verify reasonable recall.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[16] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(42) + N = 150 + vectors = {} + for i in range(1, N + 1): + vec = [random.gauss(0, 1) for _ in range(16)] + vectors[i] = vec + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + # Brute-force top-5 for a query and compare with DiskANN + query = [random.gauss(0, 1) for _ in range(16)] + + # Compute true distances + true_dists = [] + for rid, vec in vectors.items(): + d = sum((a - b) ** 2 for a, b in zip(query, vec)) + true_dists.append((d, rid)) + true_dists.sort() + true_top5 = set(r for _, r in true_dists[:5]) + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=5", + [_f32(query)], + ).fetchall() + result_top5 = set(r[0] for r in rows) + assert len(rows) == 5 + + # At least 3 of top-5 should match (reasonable recall for approximate search) + overlap = len(true_top5 & result_top5) + assert overlap >= 3, f"Recall too low: only {overlap}/5 overlap" + + +# ====================================================================== +# Additional edge cases +# ====================================================================== + +def test_diskann_insert_wrong_dimensions(db): + """INSERT with wrong dimension vector should error.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + result = exec(db, "INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1.0] * 4)]) + assert "error" in result + + +def test_diskann_knn_wrong_query_dimensions(db): + """KNN MATCH with wrong dimension query should error.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1.0] * 8)]) + + result = exec(db, "SELECT rowid FROM t WHERE emb MATCH ? 
AND k=1", [_f32([1.0] * 4)]) + assert "error" in result + + +def test_diskann_graph_connectivity_after_many_deletes(db): + """After many deletes, the graph should still be connected enough for search.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(789) + N = 40 + for i in range(1, N + 1): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + # Delete 30 of 40 nodes + to_delete = list(range(1, 31)) + for i in to_delete: + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + remaining = db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] + assert remaining == 10 + + # Search should still work and return results + query = [random.gauss(0, 1) for _ in range(8)] + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=10", + [_f32(query)], + ).fetchall() + # Should return some results (graph may be fragmented after heavy deletion) + assert len(rows) >= 1 + # Distances should be sorted + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1] + + +def test_diskann_large_batch_insert_500(db): + """Insert 500+ vectors and verify counts and KNN.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(555) + N = 500 + for i in range(1, N + 1): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == N + + query = [random.gauss(0, 1) for _ in range(8)] + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=20", + [_f32(query)], + ).fetchall() + assert len(rows) == 20 + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1] diff --git a/tests/test-unit.c b/tests/test-unit.c index 27a469d..83cedd5 100644 --- a/tests/test-unit.c +++ b/tests/test-unit.c @@ -1187,6 +1187,7 @@ void test_ivf_quantize_binary() { } void test_ivf_config_parsing() { +void test_vec0_parse_vector_column_diskann() { printf("Starting %s...\n", __func__); struct VectorColumnDefinition col; int rc; @@ -1199,6 +1200,34 @@ void test_ivf_config_parsing() { assert(col.index_type == VEC0_INDEX_TYPE_RESCORE); assert(col.rescore.quantizer_type == VEC0_RESCORE_QUANTIZER_BIT); assert(col.rescore.oversample == 8); // default + // Existing syntax (no INDEXED BY) should have diskann.enabled == 0 + { + const char *input = "emb float[128]"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type != VEC0_INDEX_TYPE_DISKANN); + sqlite3_free(col.name); + } + + // With distance_metric but no INDEXED BY + { + const char *input = "emb float[128] distance_metric=cosine"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type != VEC0_INDEX_TYPE_DISKANN); + assert(col.distance_metric == VEC0_DISTANCE_METRIC_COSINE); + sqlite3_free(col.name); + } + + // Basic binary quantizer + { + const char *input = "emb float[128] INDEXED BY diskann(neighbor_quantizer=binary)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_DISKANN); + assert(col.diskann.quantizer_type == VEC0_DISKANN_QUANTIZER_BINARY); + assert(col.diskann.n_neighbors == 72); // default + assert(col.diskann.search_list_size == 128); // default assert(col.dimensions == 128); sqlite3_free(col.name); } @@ -1370,6 +1399,681 @@ void test_ivf_config_parsing() { printf(" All ivf_config_parsing tests 
passed.\n"); } #endif /* SQLITE_VEC_ENABLE_IVF */ + // INT8 quantizer + { + const char *input = "v float[64] INDEXED BY diskann(neighbor_quantizer=int8)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_DISKANN); + assert(col.diskann.quantizer_type == VEC0_DISKANN_QUANTIZER_INT8); + sqlite3_free(col.name); + } + + // Custom n_neighbors + { + const char *input = "emb float[128] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=48)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_DISKANN); + assert(col.diskann.n_neighbors == 48); + sqlite3_free(col.name); + } + + // Custom search_list_size + { + const char *input = "emb float[128] INDEXED BY diskann(neighbor_quantizer=binary, search_list_size=256)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.diskann.search_list_size == 256); + sqlite3_free(col.name); + } + + // Combined with distance_metric (distance_metric first) + { + const char *input = "emb float[128] distance_metric=cosine INDEXED BY diskann(neighbor_quantizer=int8)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.distance_metric == VEC0_DISTANCE_METRIC_COSINE); + assert(col.index_type == VEC0_INDEX_TYPE_DISKANN); + assert(col.diskann.quantizer_type == VEC0_DISKANN_QUANTIZER_INT8); + sqlite3_free(col.name); + } + + // Error: missing neighbor_quantizer (required) + { + const char *input = "emb float[128] INDEXED BY diskann(n_neighbors=72)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // Error: empty parens + { + const char *input = "emb float[128] INDEXED BY diskann()"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // Error: unknown quantizer + { 
+ const char *input = "emb float[128] INDEXED BY diskann(neighbor_quantizer=unknown)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // Error: bad n_neighbors (not divisible by 8) + { + const char *input = "emb float[128] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=13)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // Error: n_neighbors too large + { + const char *input = "emb float[128] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=512)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // Error: missing BY + { + const char *input = "emb float[128] INDEXED diskann(neighbor_quantizer=binary)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // Error: unknown algorithm + { + const char *input = "emb float[128] INDEXED BY hnsw(neighbor_quantizer=binary)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // Error: unknown option key + { + const char *input = "emb float[128] INDEXED BY diskann(neighbor_quantizer=binary, foobar=baz)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // Case insensitivity for keywords + { + const char *input = "emb float[128] indexed by DISKANN(NEIGHBOR_QUANTIZER=BINARY)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.index_type == VEC0_INDEX_TYPE_DISKANN); + assert(col.diskann.quantizer_type == VEC0_DISKANN_QUANTIZER_BINARY); + sqlite3_free(col.name); + } + + // Split search_list_size: search and insert + { + const char *input = "emb float[128] INDEXED BY diskann(neighbor_quantizer=binary, search_list_size_search=256, search_list_size_insert=64)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + 
assert(rc == SQLITE_OK); + assert(col.diskann.search_list_size == 128); // default (unified) + assert(col.diskann.search_list_size_search == 256); + assert(col.diskann.search_list_size_insert == 64); + sqlite3_free(col.name); + } + + // Split search_list_size: only search + { + const char *input = "emb float[128] INDEXED BY diskann(neighbor_quantizer=binary, search_list_size_search=200)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_OK); + assert(col.diskann.search_list_size_search == 200); + assert(col.diskann.search_list_size_insert == 0); + sqlite3_free(col.name); + } + + // Error: cannot mix search_list_size with search_list_size_search + { + const char *input = "emb float[128] INDEXED BY diskann(neighbor_quantizer=binary, search_list_size=128, search_list_size_search=256)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + // Error: cannot mix search_list_size with search_list_size_insert + { + const char *input = "emb float[128] INDEXED BY diskann(neighbor_quantizer=binary, search_list_size=128, search_list_size_insert=64)"; + rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == SQLITE_ERROR); + } + + printf(" All vec0_parse_vector_column_diskann tests passed.\n"); +} + +void test_diskann_validity_bitmap() { + printf("Starting %s...\n", __func__); + + unsigned char validity[3]; // 24 bits + memset(validity, 0, sizeof(validity)); + + // All initially invalid + for (int i = 0; i < 24; i++) { + assert(diskann_validity_get(validity, i) == 0); + } + assert(diskann_validity_count(validity, 24) == 0); + + // Set bit 0 + diskann_validity_set(validity, 0, 1); + assert(diskann_validity_get(validity, 0) == 1); + assert(diskann_validity_count(validity, 24) == 1); + + // Set bit 7 (last bit of first byte) + diskann_validity_set(validity, 7, 1); + assert(diskann_validity_get(validity, 7) == 1); + assert(diskann_validity_count(validity, 24) == 2); + + 
// Set bit 8 (first bit of second byte) + diskann_validity_set(validity, 8, 1); + assert(diskann_validity_get(validity, 8) == 1); + assert(diskann_validity_count(validity, 24) == 3); + + // Set bit 23 (last bit) + diskann_validity_set(validity, 23, 1); + assert(diskann_validity_get(validity, 23) == 1); + assert(diskann_validity_count(validity, 24) == 4); + + // Clear bit 0 + diskann_validity_set(validity, 0, 0); + assert(diskann_validity_get(validity, 0) == 0); + assert(diskann_validity_count(validity, 24) == 3); + + // Other bits unaffected + assert(diskann_validity_get(validity, 7) == 1); + assert(diskann_validity_get(validity, 8) == 1); + + printf(" All diskann_validity_bitmap tests passed.\n"); +} + +void test_diskann_neighbor_ids() { + printf("Starting %s...\n", __func__); + + unsigned char ids[8 * 8]; // 8 slots * 8 bytes each + memset(ids, 0, sizeof(ids)); + + // Set and get slot 0 + diskann_neighbor_id_set(ids, 0, 42); + assert(diskann_neighbor_id_get(ids, 0) == 42); + + // Set and get middle slot + diskann_neighbor_id_set(ids, 3, 12345); + assert(diskann_neighbor_id_get(ids, 3) == 12345); + + // Set and get last slot + diskann_neighbor_id_set(ids, 7, 99999); + assert(diskann_neighbor_id_get(ids, 7) == 99999); + + // Slot 0 still correct + assert(diskann_neighbor_id_get(ids, 0) == 42); + + // Large value + diskann_neighbor_id_set(ids, 1, INT64_MAX); + assert(diskann_neighbor_id_get(ids, 1) == INT64_MAX); + + printf(" All diskann_neighbor_ids tests passed.\n"); +} + +void test_diskann_quantize_binary() { + printf("Starting %s...\n", __func__); + + // 8-dimensional vector: positive values -> 1, negative/zero -> 0 + float src[8] = {1.0f, -1.0f, 0.5f, 0.0f, -0.5f, 0.1f, -0.1f, 100.0f}; + unsigned char out[1]; // 8 bits = 1 byte + + int rc = diskann_quantize_vector(src, 8, VEC0_DISKANN_QUANTIZER_BINARY, out); + assert(rc == 0); + + // Expected bits (LSB first within each byte): + // bit 0: 1.0 > 0 -> 1 + // bit 1: -1.0 > 0 -> 0 + // bit 2: 0.5 > 0 -> 1 + // bit 
3: 0.0 > 0 -> 0 (not strictly greater) + // bit 4: -0.5 > 0 -> 0 + // bit 5: 0.1 > 0 -> 1 + // bit 6: -0.1 > 0 -> 0 + // bit 7: 100.0 > 0 -> 1 + // Expected byte: 1 + 0 + 4 + 0 + 0 + 32 + 0 + 128 = 0b10100101 = 0xA5 + assert(out[0] == 0xA5); + + printf(" All diskann_quantize_binary tests passed.\n"); +} + +void test_diskann_node_init_sizes() { + printf("Starting %s...\n", __func__); + + unsigned char *validity, *ids, *qvecs; + int validitySize, idsSize, qvecsSize; + + // 72 neighbors, binary quantizer, 1024 dims + int rc = diskann_node_init(72, VEC0_DISKANN_QUANTIZER_BINARY, 1024, + &validity, &validitySize, &ids, &idsSize, &qvecs, &qvecsSize); + assert(rc == 0); + assert(validitySize == 9); // 72/8 + assert(idsSize == 576); // 72 * 8 + assert(qvecsSize == 9216); // 72 * (1024/8) + + // All validity bits should be 0 + assert(diskann_validity_count(validity, 72) == 0); + + sqlite3_free(validity); + sqlite3_free(ids); + sqlite3_free(qvecs); + + // 8 neighbors, int8 quantizer, 32 dims + rc = diskann_node_init(8, VEC0_DISKANN_QUANTIZER_INT8, 32, + &validity, &validitySize, &ids, &idsSize, &qvecs, &qvecsSize); + assert(rc == 0); + assert(validitySize == 1); // 8/8 + assert(idsSize == 64); // 8 * 8 + assert(qvecsSize == 256); // 8 * 32 + + sqlite3_free(validity); + sqlite3_free(ids); + sqlite3_free(qvecs); + + printf(" All diskann_node_init_sizes tests passed.\n"); +} + +void test_diskann_node_set_clear_neighbor() { + printf("Starting %s...\n", __func__); + + unsigned char *validity, *ids, *qvecs; + int validitySize, idsSize, qvecsSize; + + // 8 neighbors, binary quantizer, 16 dims (2 bytes per qvec) + int rc = diskann_node_init(8, VEC0_DISKANN_QUANTIZER_BINARY, 16, + &validity, &validitySize, &ids, &idsSize, &qvecs, &qvecsSize); + assert(rc == 0); + + // Create a test quantized vector (2 bytes) + unsigned char test_qvec[2] = {0xAB, 0xCD}; + + // Set neighbor at slot 3 + diskann_node_set_neighbor(validity, ids, qvecs, 3, + 42, test_qvec, VEC0_DISKANN_QUANTIZER_BINARY, 
16); + + // Verify slot 3 is valid + assert(diskann_validity_get(validity, 3) == 1); + assert(diskann_validity_count(validity, 8) == 1); + + // Verify rowid + assert(diskann_neighbor_id_get(ids, 3) == 42); + + // Verify quantized vector + const unsigned char *read_qvec = diskann_neighbor_qvec_get( + qvecs, 3, VEC0_DISKANN_QUANTIZER_BINARY, 16); + assert(read_qvec[0] == 0xAB); + assert(read_qvec[1] == 0xCD); + + // Clear slot 3 + diskann_node_clear_neighbor(validity, ids, qvecs, 3, + VEC0_DISKANN_QUANTIZER_BINARY, 16); + assert(diskann_validity_get(validity, 3) == 0); + assert(diskann_neighbor_id_get(ids, 3) == 0); + assert(diskann_validity_count(validity, 8) == 0); + + sqlite3_free(validity); + sqlite3_free(ids); + sqlite3_free(qvecs); + + printf(" All diskann_node_set_clear_neighbor tests passed.\n"); +} + +void test_diskann_prune_select() { + printf("Starting %s...\n", __func__); + + // Scenario: 5 candidates, sorted by distance to p + // Candidates: A(0), B(1), C(2), D(3), E(4) + // p_distances (already sorted): A=1.0, B=2.0, C=3.0, D=4.0, E=5.0 + // + // Inter-candidate distances (5x5 matrix): + // A B C D E + // A 0.0 1.5 3.0 4.0 5.0 + // B 1.5 0.0 1.5 3.0 4.0 + // C 3.0 1.5 0.0 1.5 3.0 + // D 4.0 3.0 1.5 0.0 1.5 + // E 5.0 4.0 3.0 1.5 0.0 + + float p_distances[5] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f}; + float inter[25] = { + 0.0f, 1.5f, 3.0f, 4.0f, 5.0f, + 1.5f, 0.0f, 1.5f, 3.0f, 4.0f, + 3.0f, 1.5f, 0.0f, 1.5f, 3.0f, + 4.0f, 3.0f, 1.5f, 0.0f, 1.5f, + 5.0f, 4.0f, 3.0f, 1.5f, 0.0f, + }; + int selected[5]; + int count; + + // alpha=1.0, R=3: greedy selection + // Round 1: Pick A (closest). Prune check: + // B: 1.0*1.5 <= 2.0? yes -> pruned + // C: 1.0*3.0 <= 3.0? yes -> pruned + // D: 1.0*4.0 <= 4.0? yes -> pruned + // E: 1.0*5.0 <= 5.0? 
yes -> pruned + // Result: only A selected + { + int rc = diskann_prune_select(inter, p_distances, 5, 1.0f, 3, selected, &count); + assert(rc == 0); + assert(count == 1); + assert(selected[0] == 1); // A + } + + // alpha=1.5, R=3: diversity-aware + // Round 1: Pick A. Prune check: + // B: 1.5*1.5=2.25 <= 2.0? no -> keep + // C: 1.5*3.0=4.5 <= 3.0? no -> keep + // D: 1.5*4.0=6.0 <= 4.0? no -> keep + // E: 1.5*5.0=7.5 <= 5.0? no -> keep + // Round 2: Pick B. Prune check: + // C: 1.5*1.5=2.25 <= 3.0? yes -> pruned + // D: 1.5*3.0=4.5 <= 4.0? no -> keep + // E: 1.5*4.0=6.0 <= 5.0? no -> keep + // Round 3: Pick D. Done, 3 selected. + { + int rc = diskann_prune_select(inter, p_distances, 5, 1.5f, 3, selected, &count); + assert(rc == 0); + assert(count == 3); + assert(selected[0] == 1); // A + assert(selected[1] == 1); // B + assert(selected[3] == 1); // D + assert(selected[2] == 0); // C pruned + assert(selected[4] == 0); // E not reached + } + + // R > num_candidates with very high alpha (no pruning): select all + { + int rc = diskann_prune_select(inter, p_distances, 5, 100.0f, 10, selected, &count); + assert(rc == 0); + assert(count == 5); + } + + // Empty candidate set + { + int rc = diskann_prune_select(NULL, NULL, 0, 1.2f, 3, selected, &count); + assert(rc == 0); + assert(count == 0); + } + + printf(" All diskann_prune_select tests passed.\n"); +} + +void test_diskann_quantized_vector_byte_size() { + printf("Starting %s...\n", __func__); + + // Binary quantizer: 1 bit per dimension, so 128 dims = 16 bytes + assert(diskann_quantized_vector_byte_size(VEC0_DISKANN_QUANTIZER_BINARY, 128) == 16); + assert(diskann_quantized_vector_byte_size(VEC0_DISKANN_QUANTIZER_BINARY, 8) == 1); + assert(diskann_quantized_vector_byte_size(VEC0_DISKANN_QUANTIZER_BINARY, 1024) == 128); + + // INT8 quantizer: 1 byte per dimension + assert(diskann_quantized_vector_byte_size(VEC0_DISKANN_QUANTIZER_INT8, 128) == 128); + assert(diskann_quantized_vector_byte_size(VEC0_DISKANN_QUANTIZER_INT8, 1) 
== 1); + assert(diskann_quantized_vector_byte_size(VEC0_DISKANN_QUANTIZER_INT8, 768) == 768); + + printf(" All diskann_quantized_vector_byte_size tests passed.\n"); +} + +void test_diskann_config_defaults() { + printf("Starting %s...\n", __func__); + + // A freshly zero-initialized VectorColumnDefinition should have diskann.enabled == 0 + struct VectorColumnDefinition col; + memset(&col, 0, sizeof(col)); + assert(col.index_type != VEC0_INDEX_TYPE_DISKANN); + assert(col.diskann.n_neighbors == 0); + assert(col.diskann.search_list_size == 0); + + // Verify parsing a normal vector column still works and diskann is not enabled + { + const char *input = "embedding float[768]"; + int rc = vec0_parse_vector_column(input, (int)strlen(input), &col); + assert(rc == 0 /* SQLITE_OK */); + assert(col.index_type != VEC0_INDEX_TYPE_DISKANN); + sqlite3_free(col.name); + } + + printf(" All diskann_config_defaults tests passed.\n"); +} + +// ====================================================================== +// Additional DiskANN unit tests +// ====================================================================== + +void test_diskann_quantize_int8() { + printf("Starting %s...\n", __func__); + + // INT8 quantization uses fixed range [-1, 1]: + // step = 2.0 / 255.0 + // out[i] = (i8)((src[i] + 1.0) / step - 128.0) + float src[4] = {-1.0f, 0.0f, 0.5f, 1.0f}; + unsigned char out[4]; + + int rc = diskann_quantize_vector(src, 4, VEC0_DISKANN_QUANTIZER_INT8, out); + assert(rc == 0); + + int8_t *signed_out = (int8_t *)out; + // -1.0 -> (0/step) - 128 = -128 + assert(signed_out[0] == -128); + // 0.0 -> (1.0/step) - 128 ~= 127.5 - 128 ~= -0.5 -> (i8)(-0.5) = 0 + assert(signed_out[1] >= -2 && signed_out[1] <= 2); + // 0.5 -> (1.5/step) - 128 ~= 191.25 - 128 = 63.25 -> (i8) 63 + assert(signed_out[2] >= 60 && signed_out[2] <= 66); + // 1.0 -> should be close to 127 (may have float precision issues) + assert(signed_out[3] >= 126 && signed_out[3] <= 127); + + printf(" All 
diskann_quantize_int8 tests passed.\n"); +} + +void test_diskann_quantize_binary_16d() { + printf("Starting %s...\n", __func__); + + // 16-dimensional vector (2 bytes output) + float src[16] = { + 1.0f, -1.0f, 0.5f, -0.5f, // byte 0: bit0=1, bit1=0, bit2=1, bit3=0 + 0.1f, -0.1f, 0.0f, 100.0f, // byte 0: bit4=1, bit5=0, bit6=0, bit7=1 + -1.0f, 1.0f, 1.0f, 1.0f, // byte 1: bit0=0, bit1=1, bit2=1, bit3=1 + -1.0f, -1.0f, 1.0f, -1.0f // byte 1: bit4=0, bit5=0, bit6=1, bit7=0 + }; + unsigned char out[2]; + + int rc = diskann_quantize_vector(src, 16, VEC0_DISKANN_QUANTIZER_BINARY, out); + assert(rc == 0); + + // byte 0: bits 0,2,4,7 set -> 0b10010101 = 0x95 + assert(out[0] == 0x95); + // byte 1: bits 1,2,3,6 set -> 0b01001110 = 0x4E + assert(out[1] == 0x4E); + + printf(" All diskann_quantize_binary_16d tests passed.\n"); +} + +void test_diskann_quantize_binary_all_positive() { + printf("Starting %s...\n", __func__); + + float src[8] = {1.0f, 2.0f, 0.1f, 0.001f, 100.0f, 42.0f, 0.5f, 3.14f}; + unsigned char out[1]; + + int rc = diskann_quantize_vector(src, 8, VEC0_DISKANN_QUANTIZER_BINARY, out); + assert(rc == 0); + assert(out[0] == 0xFF); // All bits set + + printf(" All diskann_quantize_binary_all_positive tests passed.\n"); +} + +void test_diskann_quantize_binary_all_negative() { + printf("Starting %s...\n", __func__); + + float src[8] = {-1.0f, -2.0f, -0.1f, -0.001f, -100.0f, -42.0f, -0.5f, 0.0f}; + unsigned char out[1]; + + int rc = diskann_quantize_vector(src, 8, VEC0_DISKANN_QUANTIZER_BINARY, out); + assert(rc == 0); + assert(out[0] == 0x00); // No bits set (all <= 0) + + printf(" All diskann_quantize_binary_all_negative tests passed.\n"); +} + +void test_diskann_candidate_list_operations() { + printf("Starting %s...\n", __func__); + + struct DiskannCandidateList list; + int rc = _test_diskann_candidate_list_init(&list, 5); + assert(rc == 0); + + // Insert candidates in non-sorted order + _test_diskann_candidate_list_insert(&list, 10, 3.0f); + 
_test_diskann_candidate_list_insert(&list, 20, 1.0f); + _test_diskann_candidate_list_insert(&list, 30, 2.0f); + + assert(_test_diskann_candidate_list_count(&list) == 3); + // Should be sorted by distance + assert(_test_diskann_candidate_list_rowid(&list, 0) == 20); // dist 1.0 + assert(_test_diskann_candidate_list_rowid(&list, 1) == 30); // dist 2.0 + assert(_test_diskann_candidate_list_rowid(&list, 2) == 10); // dist 3.0 + + assert(_test_diskann_candidate_list_distance(&list, 0) == 1.0f); + assert(_test_diskann_candidate_list_distance(&list, 1) == 2.0f); + assert(_test_diskann_candidate_list_distance(&list, 2) == 3.0f); + + // Deduplication: inserting same rowid with better distance should update + _test_diskann_candidate_list_insert(&list, 10, 0.5f); + assert(_test_diskann_candidate_list_count(&list) == 3); // Same count + assert(_test_diskann_candidate_list_rowid(&list, 0) == 10); // Now first + assert(_test_diskann_candidate_list_distance(&list, 0) == 0.5f); + + // Next unvisited: should be index 0 + int idx = _test_diskann_candidate_list_next_unvisited(&list); + assert(idx == 0); + + // Mark visited + _test_diskann_candidate_list_set_visited(&list, 0); + idx = _test_diskann_candidate_list_next_unvisited(&list); + assert(idx == 1); // Skip visited + + // Fill to capacity (5) and try inserting a worse candidate + _test_diskann_candidate_list_insert(&list, 40, 4.0f); + _test_diskann_candidate_list_insert(&list, 50, 5.0f); + assert(_test_diskann_candidate_list_count(&list) == 5); + + // Insert worse than worst -> should be discarded + int inserted = _test_diskann_candidate_list_insert(&list, 60, 10.0f); + assert(inserted == 0); + assert(_test_diskann_candidate_list_count(&list) == 5); + + // Insert better than worst -> should replace worst + inserted = _test_diskann_candidate_list_insert(&list, 60, 3.5f); + assert(inserted == 1); + assert(_test_diskann_candidate_list_count(&list) == 5); + + _test_diskann_candidate_list_free(&list); + + printf(" All 
diskann_candidate_list_operations tests passed.\n"); +} + +void test_diskann_visited_set_operations() { + printf("Starting %s...\n", __func__); + + struct DiskannVisitedSet set; + int rc = _test_diskann_visited_set_init(&set, 32); + assert(rc == 0); + + // Empty set + assert(_test_diskann_visited_set_contains(&set, 1) == 0); + assert(_test_diskann_visited_set_contains(&set, 100) == 0); + + // Insert and check + int inserted = _test_diskann_visited_set_insert(&set, 42); + assert(inserted == 1); + assert(_test_diskann_visited_set_contains(&set, 42) == 1); + assert(_test_diskann_visited_set_contains(&set, 43) == 0); + + // Double insert returns 0 + inserted = _test_diskann_visited_set_insert(&set, 42); + assert(inserted == 0); + + // Insert several + _test_diskann_visited_set_insert(&set, 1); + _test_diskann_visited_set_insert(&set, 2); + _test_diskann_visited_set_insert(&set, 100); + _test_diskann_visited_set_insert(&set, 999); + assert(_test_diskann_visited_set_contains(&set, 1) == 1); + assert(_test_diskann_visited_set_contains(&set, 2) == 1); + assert(_test_diskann_visited_set_contains(&set, 100) == 1); + assert(_test_diskann_visited_set_contains(&set, 999) == 1); + assert(_test_diskann_visited_set_contains(&set, 3) == 0); + + // Sentinel value (rowid 0) should not be insertable + assert(_test_diskann_visited_set_contains(&set, 0) == 0); + inserted = _test_diskann_visited_set_insert(&set, 0); + assert(inserted == 0); + + _test_diskann_visited_set_free(&set); + + printf(" All diskann_visited_set_operations tests passed.\n"); +} + +void test_diskann_prune_select_single_candidate() { + printf("Starting %s...\n", __func__); + + float p_distances[1] = {5.0f}; + float inter[1] = {0.0f}; + int selected[1]; + int count; + + int rc = diskann_prune_select(inter, p_distances, 1, 1.0f, 3, selected, &count); + assert(rc == 0); + assert(count == 1); + assert(selected[0] == 1); + + printf(" All diskann_prune_select_single_candidate tests passed.\n"); +} + +void 
test_diskann_prune_select_all_identical_distances() { + printf("Starting %s...\n", __func__); + + float p_distances[4] = {2.0f, 2.0f, 2.0f, 2.0f}; + // All inter-distances are equal too + float inter[16] = { + 0.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 0.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 0.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 0.0f, + }; + int selected[4]; + int count; + + // alpha=1.0: pick first, then check if alpha * inter[0][j] <= p_dist[j] + // 1.0 * 1.0 <= 2.0? yes, so all are pruned after picking the first + int rc = diskann_prune_select(inter, p_distances, 4, 1.0f, 4, selected, &count); + assert(rc == 0); + assert(count >= 1); // At least one selected + + printf(" All diskann_prune_select_all_identical_distances tests passed.\n"); +} + +void test_diskann_prune_select_max_neighbors_1() { + printf("Starting %s...\n", __func__); + + float p_distances[3] = {1.0f, 2.0f, 3.0f}; + float inter[9] = { + 0.0f, 5.0f, 5.0f, + 5.0f, 0.0f, 5.0f, + 5.0f, 5.0f, 0.0f, + }; + int selected[3]; + int count; + + // R=1: should select exactly 1 + int rc = diskann_prune_select(inter, p_distances, 3, 1.0f, 1, selected, &count); + assert(rc == 0); + assert(count == 1); + assert(selected[0] == 1); // First (closest) is selected + + printf(" All diskann_prune_select_max_neighbors_1 tests passed.\n"); +} int main() { printf("Starting unit tests...\n"); @@ -1402,5 +2106,23 @@ int main() { test_ivf_quantize_binary(); test_ivf_config_parsing(); #endif + test_vec0_parse_vector_column_diskann(); + test_diskann_validity_bitmap(); + test_diskann_neighbor_ids(); + test_diskann_quantize_binary(); + test_diskann_node_init_sizes(); + test_diskann_node_set_clear_neighbor(); + test_diskann_prune_select(); + test_diskann_quantized_vector_byte_size(); + test_diskann_config_defaults(); + test_diskann_quantize_int8(); + test_diskann_quantize_binary_16d(); + test_diskann_quantize_binary_all_positive(); + test_diskann_quantize_binary_all_negative(); + test_diskann_candidate_list_operations(); + 
test_diskann_visited_set_operations(); + test_diskann_prune_select_single_candidate(); + test_diskann_prune_select_all_identical_distances(); + test_diskann_prune_select_max_neighbors_1(); printf("All unit tests passed.\n"); } From fb81c011ff6bae2f9afa2456f55bf24c79173c7b Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Mon, 30 Mar 2026 23:24:36 -0700 Subject: [PATCH 11/38] rm demo gha workflow --- .github/workflows/release-demo.yml | 118 ----------------------------- 1 file changed, 118 deletions(-) delete mode 100644 .github/workflows/release-demo.yml diff --git a/.github/workflows/release-demo.yml b/.github/workflows/release-demo.yml deleted file mode 100644 index 2f4b396..0000000 --- a/.github/workflows/release-demo.yml +++ /dev/null @@ -1,118 +0,0 @@ -name: "Release Demo (DiskANN)" -on: - push: - branches: [diskann-yolo2] -permissions: - contents: write -jobs: - build-linux-x86_64-extension: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v4 - - run: ./scripts/vendor.sh - - run: make loadable static - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-linux-x86_64-extension - path: dist/* - build-linux-aarch64-extension: - runs-on: ubuntu-22.04-arm - steps: - - uses: actions/checkout@v4 - - run: ./scripts/vendor.sh - - run: make loadable static - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-linux-aarch64-extension - path: dist/* - build-macos-x86_64-extension: - runs-on: macos-15-intel - steps: - - uses: actions/checkout@v4 - - run: ./scripts/vendor.sh - - run: make loadable static - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-macos-x86_64-extension - path: dist/* - build-macos-aarch64-extension: - runs-on: macos-14 - steps: - - uses: actions/checkout@v4 - - run: ./scripts/vendor.sh - - run: make loadable static - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-macos-aarch64-extension - path: dist/* - build-windows-x86_64-extension: - runs-on: windows-2022 - steps: - - uses: 
actions/checkout@v4 - - uses: ilammy/msvc-dev-cmd@v1 - - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - run: ./scripts/vendor.sh - shell: bash - - run: make sqlite-vec.h - - run: mkdir dist - - run: cl.exe /fPIC -shared /W4 /Ivendor/ /O2 /LD sqlite-vec.c -o dist/vec0.dll - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-windows-x86_64-extension - path: dist/* - dist: - runs-on: ubuntu-latest - needs: - [ - build-linux-x86_64-extension, - build-linux-aarch64-extension, - build-macos-x86_64-extension, - build-macos-aarch64-extension, - build-windows-x86_64-extension, - ] - steps: - - uses: actions/checkout@v4 - - uses: actions/download-artifact@v4 - with: - name: sqlite-vec-linux-x86_64-extension - path: dist/linux-x86_64 - - uses: actions/download-artifact@v4 - with: - name: sqlite-vec-linux-aarch64-extension - path: dist/linux-aarch64 - - uses: actions/download-artifact@v4 - with: - name: sqlite-vec-macos-x86_64-extension - path: dist/macos-x86_64 - - uses: actions/download-artifact@v4 - with: - name: sqlite-vec-macos-aarch64-extension - path: dist/macos-aarch64 - - uses: actions/download-artifact@v4 - with: - name: sqlite-vec-windows-x86_64-extension - path: dist/windows-x86_64 - - run: make sqlite-vec.h - - run: | - ./scripts/vendor.sh - make amalgamation - mkdir -p amalgamation - cp dist/sqlite-vec.c sqlite-vec.h amalgamation/ - rm dist/sqlite-vec.c - - uses: asg017/setup-sqlite-dist@73e37b2ffb0b51e64a64eb035da38c958b9ff6c6 - - run: sqlite-dist build --set-version $(cat VERSION) - - name: Create release and upload assets - env: - GH_TOKEN: ${{ github.token }} - run: | - SHORT_SHA=$(echo "${{ github.sha }}" | head -c 10) - TAG="diskann-${SHORT_SHA}" - zip -j "amalgamation/sqlite-vec-amalgamation.zip" amalgamation/sqlite-vec.c amalgamation/sqlite-vec.h - gh release create "$TAG" \ - --title "$TAG" \ - --target "${{ github.sha }}" \ - --prerelease \ - amalgamation/sqlite-vec-amalgamation.zip \ - .sqlite-dist/pip/* From 
a248ecd0611f7fe6e68250e01f4f6f31f0a748f7 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 01:26:55 -0700 Subject: [PATCH 12/38] Fix DiskANN command dispatch when IVF is disabled The command insert handler (used for runtime config like search_list_size_search) was gated behind SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE, which defaults to 0. DiskANN commands were unreachable unless IVF was also compiled in. Widen the guard to also activate when SQLITE_VEC_ENABLE_DISKANN is set. Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sqlite-vec.c b/sqlite-vec.c index 5ca7834..abdafe0 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -10143,14 +10143,17 @@ static int vec0Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, } // INSERT operation else if (argc > 1 && sqlite3_value_type(argv[0]) == SQLITE_NULL) { -#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE - // Check for IVF command inserts: INSERT INTO t(rowid) VALUES ('compute-centroids') +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE || SQLITE_VEC_ENABLE_DISKANN + // Check for command inserts: INSERT INTO t(rowid) VALUES ('command-string') // The id column holds the command string. 
sqlite3_value *idVal = argv[2 + VEC0_COLUMN_ID]; if (sqlite3_value_type(idVal) == SQLITE_TEXT) { const char *cmd = (const char *)sqlite3_value_text(idVal); vec0_vtab *p = (vec0_vtab *)pVTab; - int cmdRc = ivf_handle_command(p, cmd, argc, argv); + int cmdRc = SQLITE_EMPTY; +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + cmdRc = ivf_handle_command(p, cmd, argc, argv); +#endif #if SQLITE_VEC_ENABLE_DISKANN if (cmdRc == SQLITE_EMPTY) cmdRc = diskann_handle_command(p, cmd); From 8544081a67c3bd6e6695b4e0b27b133690836916 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 01:29:49 -0700 Subject: [PATCH 13/38] Add comprehensive ANN benchmarking suite (#279) Extend benchmarks-ann/ with results database (SQLite with per-query detail and continuous writes), dataset subfolder organization, --subset-size and --warmup options. Supports systematic comparison across flat, rescore, IVF, and DiskANN index types. --- .gitignore | 3 + benchmarks-ann/.gitignore | 6 + benchmarks-ann/Makefile | 28 +- benchmarks-ann/README.md | 114 ++- benchmarks-ann/bench.py | 946 +++++++++++++----- benchmarks-ann/datasets/cohere10m/Makefile | 27 + .../datasets/cohere10m/build_base_db.py | 134 +++ .../{seed => datasets/cohere1m}/.gitignore | 0 .../{seed => datasets/cohere1m}/Makefile | 0 .../cohere1m}/build_base_db.py | 0 benchmarks-ann/datasets/nyt-1024/Makefile | 30 + .../datasets/nyt-1024/build-base.py | 163 +++ benchmarks-ann/datasets/nyt-1024/queries.txt | 100 ++ benchmarks-ann/datasets/nyt-384/Makefile | 29 + benchmarks-ann/datasets/nyt-384/queries.txt | 100 ++ benchmarks-ann/datasets/nyt-768/Makefile | 37 + .../datasets/nyt-768/build-contents.py | 64 ++ .../datasets/nyt-768/distill-model.py | 13 + benchmarks-ann/datasets/nyt-768/queries.txt | 100 ++ benchmarks-ann/datasets/nyt/.gitignore | 1 + benchmarks-ann/datasets/nyt/Makefile | 30 + benchmarks-ann/datasets/nyt/build-base.py | 165 +++ benchmarks-ann/datasets/nyt/build-contents.py | 52 + benchmarks-ann/datasets/nyt/queries.txt | 100 ++ 
benchmarks-ann/faiss_kmeans.py | 101 ++ benchmarks-ann/results_schema.sql | 76 ++ 26 files changed, 2127 insertions(+), 292 deletions(-) create mode 100644 benchmarks-ann/datasets/cohere10m/Makefile create mode 100644 benchmarks-ann/datasets/cohere10m/build_base_db.py rename benchmarks-ann/{seed => datasets/cohere1m}/.gitignore (100%) rename benchmarks-ann/{seed => datasets/cohere1m}/Makefile (100%) rename benchmarks-ann/{seed => datasets/cohere1m}/build_base_db.py (100%) create mode 100644 benchmarks-ann/datasets/nyt-1024/Makefile create mode 100644 benchmarks-ann/datasets/nyt-1024/build-base.py create mode 100644 benchmarks-ann/datasets/nyt-1024/queries.txt create mode 100644 benchmarks-ann/datasets/nyt-384/Makefile create mode 100644 benchmarks-ann/datasets/nyt-384/queries.txt create mode 100644 benchmarks-ann/datasets/nyt-768/Makefile create mode 100644 benchmarks-ann/datasets/nyt-768/build-contents.py create mode 100644 benchmarks-ann/datasets/nyt-768/distill-model.py create mode 100644 benchmarks-ann/datasets/nyt-768/queries.txt create mode 100644 benchmarks-ann/datasets/nyt/.gitignore create mode 100644 benchmarks-ann/datasets/nyt/Makefile create mode 100644 benchmarks-ann/datasets/nyt/build-base.py create mode 100644 benchmarks-ann/datasets/nyt/build-contents.py create mode 100644 benchmarks-ann/datasets/nyt/queries.txt create mode 100644 benchmarks-ann/faiss_kmeans.py create mode 100644 benchmarks-ann/results_schema.sql diff --git a/.gitignore b/.gitignore index 0268d5d..ef549f4 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,6 @@ poetry.lock memstat.c memstat.* + + +.DS_Store \ No newline at end of file diff --git a/benchmarks-ann/.gitignore b/benchmarks-ann/.gitignore index c418b76..95707b9 100644 --- a/benchmarks-ann/.gitignore +++ b/benchmarks-ann/.gitignore @@ -1,2 +1,8 @@ *.db +*.db-shm +*.db-wal +*.parquet runs/ + +viewer/ +searcher/ \ No newline at end of file diff --git a/benchmarks-ann/Makefile b/benchmarks-ann/Makefile index 
ddceb65..a631478 100644 --- a/benchmarks-ann/Makefile +++ b/benchmarks-ann/Makefile @@ -1,5 +1,5 @@ BENCH = python bench.py -BASE_DB = seed/base.db +BASE_DB = cohere1m/base.db EXT = ../dist/vec0 # --- Baseline (brute-force) configs --- @@ -33,7 +33,7 @@ ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS) $(IVF_CONFIGS) $(DISKANN_CONFIGS) # --- Data preparation --- seed: - $(MAKE) -C seed + $(MAKE) -C cohere1m ground-truth: seed python ground_truth.py --subset-size 10000 @@ -42,43 +42,43 @@ ground-truth: seed # --- Quick smoke test --- bench-smoke: seed - $(BENCH) --subset-size 5000 -k 10 -n 20 -o runs/smoke \ + $(BENCH) --subset-size 5000 -k 10 -n 20 --dataset cohere1m -o runs \ "brute-float:type=baseline,variant=float" \ "ivf-quick:type=ivf,nlist=16,nprobe=4" \ "diskann-quick:type=diskann,R=48,L=64,quantizer=binary" bench-rescore: seed - $(BENCH) --subset-size 10000 -k 10 -o runs/rescore \ + $(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs \ $(RESCORE_CONFIGS) # --- Standard sizes --- bench-10k: seed - $(BENCH) --subset-size 10000 -k 10 -o runs/10k $(ALL_CONFIGS) + $(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs $(ALL_CONFIGS) bench-50k: seed - $(BENCH) --subset-size 50000 -k 10 -o runs/50k $(ALL_CONFIGS) + $(BENCH) --subset-size 50000 -k 10 --dataset cohere1m -o runs $(ALL_CONFIGS) bench-100k: seed - $(BENCH) --subset-size 100000 -k 10 -o runs/100k $(ALL_CONFIGS) + $(BENCH) --subset-size 100000 -k 10 --dataset cohere1m -o runs $(ALL_CONFIGS) bench-all: bench-10k bench-50k bench-100k # --- IVF across sizes --- bench-ivf: seed - $(BENCH) --subset-size 10000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS) - $(BENCH) --subset-size 50000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS) - $(BENCH) --subset-size 100000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS) + $(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(IVF_CONFIGS) + $(BENCH) --subset-size 50000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(IVF_CONFIGS) + 
$(BENCH) --subset-size 100000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(IVF_CONFIGS) # --- DiskANN across sizes --- bench-diskann: seed - $(BENCH) --subset-size 10000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS) - $(BENCH) --subset-size 50000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS) - $(BENCH) --subset-size 100000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS) + $(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(DISKANN_CONFIGS) + $(BENCH) --subset-size 50000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(DISKANN_CONFIGS) + $(BENCH) --subset-size 100000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(DISKANN_CONFIGS) # --- Report --- report: - @echo "Use: sqlite3 runs//results.db 'SELECT * FROM bench_results ORDER BY recall DESC'" + @echo "Use: sqlite3 runs/cohere1m//results.db 'SELECT run_id, config_name, status, recall FROM runs JOIN run_results USING(run_id)'" # --- Cleanup --- clean: diff --git a/benchmarks-ann/README.md b/benchmarks-ann/README.md index 1f7fd5c..88f1c74 100644 --- a/benchmarks-ann/README.md +++ b/benchmarks-ann/README.md @@ -1,81 +1,111 @@ # KNN Benchmarks for sqlite-vec Benchmarking infrastructure for vec0 KNN configurations. Includes brute-force -baselines (float, int8, bit); index-specific branches add their own types -via the `INDEX_REGISTRY` in `bench.py`. +baselines (float, int8, bit), rescore, IVF, and DiskANN index types. + +## Datasets + +Each dataset is a subdirectory containing a `Makefile` and `build_base_db.py` +that produce a `base.db`. The benchmark runner auto-discovers any subdirectory +with a `base.db` file. 
+ +``` +cohere1m/ # Cohere 768d cosine, 1M vectors + Makefile # downloads parquets from Zilliz, builds base.db + build_base_db.py + base.db # (generated) + +cohere10m/ # Cohere 768d cosine, 10M vectors (10 train shards) + Makefile # make -j12 download to fetch all shards in parallel + build_base_db.py + base.db # (generated) +``` + +Every `base.db` has the same schema: + +| Table | Columns | Description | +|-------|---------|-------------| +| `train` | `id INTEGER PRIMARY KEY, vector BLOB` | Indexed vectors (f32 blobs) | +| `query_vectors` | `id INTEGER PRIMARY KEY, vector BLOB` | Query vectors for KNN evaluation | +| `neighbors` | `query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT` | Ground-truth nearest neighbors | + +To add a new dataset, create a directory with a `Makefile` that builds `base.db` +with the tables above. It will be available via `--dataset <name>` automatically. + +### Building datasets + +```bash +# Cohere 1M +cd datasets/cohere1m && make download && make && cd ../.. + +# Cohere 10M (parallel download recommended — 10 train shards + test + neighbors) +cd datasets/cohere10m && make -j12 download && make && cd ../.. +``` ## Prerequisites -- Built `dist/vec0` extension (run `make` from repo root) +- Built `dist/vec0` extension (run `make loadable` from repo root) - Python 3.10+ -- `uv` (for seed data prep): `pip install uv` +- `uv` ## Quick start ```bash -# 1. Download dataset and build seed DB (~3 GB download, ~5 min) -make seed +# 1. Build a dataset +cd datasets/cohere1m && make && cd ../.. -# 2. Run a quick smoke test (5k vectors, ~1 min) +# 2. Quick smoke test (5k vectors) make bench-smoke -# 3. 
Full benchmark at 10k make bench-10k ``` ## Usage -### Direct invocation - ```bash -python bench.py --subset-size 10000 \ +uv run python bench.py --subset-size 10000 -k 10 -n 50 --dataset cohere1m \ "brute-float:type=baseline,variant=float" \ - "brute-int8:type=baseline,variant=int8" \ - "brute-bit:type=baseline,variant=bit" + "rescore-bit-os8:type=rescore,quantizer=bit,oversample=8" ``` ### Config format `name:type=<index_type>,key=val,key=val` -| Index type | Keys | Branch | -|-----------|------|--------| -| `baseline` | `variant` (float/int8/bit), `oversample` | this branch | - -Index branches register additional types in `INDEX_REGISTRY`. See the -docstring in `bench.py` for the extension API. +| Index type | Keys | +|-----------|------| +| `baseline` | `variant` (float/int8/bit), `oversample` | +| `rescore` | `quantizer` (bit/int8), `oversample` | +| `ivf` | `nlist`, `nprobe` | +| `diskann` | `R`, `L`, `quantizer` (binary/int8), `buffer_threshold` | ### Make targets | Target | Description | |--------|-------------| -| `make seed` | Download COHERE 1M dataset | -| `make ground-truth` | Pre-compute ground truth for 10k/50k/100k | -| `make bench-smoke` | Quick 5k baseline test | +| `make seed` | Download and build default dataset | +| `make bench-smoke` | Quick 5k test (3 configs) | | `make bench-10k` | All configs at 10k vectors | | `make bench-50k` | All configs at 50k vectors | | `make bench-100k` | All configs at 100k vectors | | `make bench-all` | 10k + 50k + 100k | +| `make bench-ivf` | Baselines + IVF across 10k/50k/100k | +| `make bench-diskann` | Baselines + DiskANN across 10k/50k/100k | + +## Results DB + +Each run writes to `runs/<dataset>/<subset_size>/results.db` (SQLite, WAL mode). +Progress is written continuously — query from another terminal to monitor: + +```bash +sqlite3 runs/cohere1m/10000/results.db "SELECT run_id, config_name, status FROM runs" +``` + +See `results_schema.sql` for the full schema (tables: `runs`, `run_results`, +`insert_batches`, `queries`). 
## Adding an index type -In your index branch, add an entry to `INDEX_REGISTRY` in `bench.py` and -append your configs to `ALL_CONFIGS` in the `Makefile`. See the existing -`baseline` entry and the comments in both files for the pattern. - -## Results - -Results are stored in `runs//results.db` using the schema in `schema.sql`. - -```bash -sqlite3 runs/10k/results.db " - SELECT config_name, recall, mean_ms, qps - FROM bench_results - ORDER BY recall DESC -" -``` - -## Dataset - -[Zilliz COHERE Medium 1M](https://zilliz.com/learn/datasets-for-vector-database-benchmarks): -768 dimensions, cosine distance, 1M train vectors + 10k query vectors with precomputed neighbors. +Add an entry to `INDEX_REGISTRY` in `bench.py` and append configs to +`ALL_CONFIGS` in the `Makefile`. See existing entries for the pattern. diff --git a/benchmarks-ann/bench.py b/benchmarks-ann/bench.py index 520db77..a4cbbe4 100644 --- a/benchmarks-ann/bench.py +++ b/benchmarks-ann/bench.py @@ -6,7 +6,7 @@ across different vec0 configurations. 
Config format: name:type=,key=val,key=val - Available types: none, vec0-flat, rescore, ivf, diskann + Available types: none, vec0-flat, quantized, rescore, ivf, diskann Usage: python bench.py --subset-size 10000 \ @@ -15,7 +15,7 @@ Usage: "flat-int8:type=vec0-flat,variant=int8" """ import argparse -from datetime import datetime, timezone +import json import os import sqlite3 import statistics @@ -23,9 +23,36 @@ import time _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) EXT_PATH = os.path.join(_SCRIPT_DIR, "..", "dist", "vec0") -BASE_DB = os.path.join(_SCRIPT_DIR, "seed", "base.db") INSERT_BATCH_SIZE = 1000 +_DATASETS_DIR = os.path.join(_SCRIPT_DIR, "datasets") + +DATASETS = { + "cohere1m": {"base_db": os.path.join(_DATASETS_DIR, "cohere1m", "base.db"), "dimensions": 768}, + "cohere10m": {"base_db": os.path.join(_DATASETS_DIR, "cohere10m", "base.db"), "dimensions": 768}, + "nyt": {"base_db": os.path.join(_DATASETS_DIR, "nyt", "base.db"), "dimensions": 256}, + "nyt-768": {"base_db": os.path.join(_DATASETS_DIR, "nyt-768", "base.db"), "dimensions": 768}, + "nyt-1024": {"base_db": os.path.join(_DATASETS_DIR, "nyt-1024", "base.db"), "dimensions": 1024}, + "nyt-384": {"base_db": os.path.join(_DATASETS_DIR, "nyt-384", "base.db"), "dimensions": 384}, +} + + +# ============================================================================ +# Timing helpers +# ============================================================================ + + +def now_ns(): + return time.time_ns() + + +def ns_to_s(ns): + return ns / 1_000_000_000 + + +def ns_to_ms(ns): + return ns / 1_000_000 + # ============================================================================ # Index registry — extension point for ANN index branches @@ -36,7 +63,9 @@ INSERT_BATCH_SIZE = 1000 # "create_table_sql": fn(params) -> SQL string # "insert_sql": fn(params) -> SQL string (or None for default) # "post_insert_hook": fn(conn, params) -> train_time_s (or None) +# "train_sql": fn(params) -> SQL string (or 
None if no training) # "run_query": fn(conn, params, query, k) -> [(id, distance), ...] (or None for default MATCH) +# "query_sql": fn(params) -> SQL string (or None for default MATCH) # "describe": fn(params) -> str (one-line description) # # To add a new index type, add an entry here. Example (in your branch): @@ -59,6 +88,7 @@ INDEX_REGISTRY = {} def _none_create_table_sql(params): + # none uses raw tables — no dimension in DDL variant = params["variant"] if variant == "int8": return ( @@ -138,7 +168,7 @@ def _none_run_query(conn, params, query, k): return conn.execute( "SELECT id, vec_distance_cosine(:query, embedding) as distance " - "FROM vec_items ORDER BY 2 LIMIT :k", + "FROM vec_items WHERE distance IS NOT NULL ORDER BY 2 LIMIT :k", {"query": query, "k": k}, ).fetchall() @@ -155,7 +185,9 @@ INDEX_REGISTRY["none"] = { "create_table_sql": _none_create_table_sql, "insert_sql": _none_insert_sql, "post_insert_hook": None, + "train_sql": None, "run_query": _none_run_query, + "query_sql": None, "describe": _none_describe, } @@ -166,17 +198,18 @@ INDEX_REGISTRY["none"] = { def _vec0flat_create_table_sql(params): + D = params.get("_dimensions", 768) variant = params["variant"] extra = "" if variant == "int8": - extra = ", embedding_int8 int8[768]" + extra = f", embedding_int8 int8[{D}]" elif variant == "bit": - extra = ", embedding_bq bit[768]" + extra = f", embedding_bq bit[{D}]" return ( f"CREATE VIRTUAL TABLE vec_items USING vec0(" f" chunk_size=256," f" id integer primary key," - f" embedding float[768] distance_metric=cosine" + f" embedding float[{D}] distance_metric=cosine" f" {extra})" ) @@ -228,6 +261,32 @@ def _vec0flat_run_query(conn, params, query, k): return None # use default MATCH +def _vec0flat_query_sql(params): + variant = params["variant"] + oversample = params.get("oversample", 8) + if variant == "int8": + return ( + "WITH coarse AS (" + " SELECT id, embedding FROM vec_items" + " WHERE embedding_int8 MATCH vec_quantize_int8(:query, 'unit')" + f" 
LIMIT :k * {oversample}" + ") " + "SELECT id, vec_distance_cosine(embedding, :query) as distance " + "FROM coarse ORDER BY 2 LIMIT :k" + ) + elif variant == "bit": + return ( + "WITH coarse AS (" + " SELECT id, embedding FROM vec_items" + " WHERE embedding_bq MATCH vec_quantize_binary(:query)" + f" LIMIT :k * {oversample}" + ") " + "SELECT id, vec_distance_cosine(embedding, :query) as distance " + "FROM coarse ORDER BY 2 LIMIT :k" + ) + return None + + def _vec0flat_describe(params): v = params["variant"] if v in ("int8", "bit"): @@ -240,24 +299,115 @@ INDEX_REGISTRY["vec0-flat"] = { "create_table_sql": _vec0flat_create_table_sql, "insert_sql": _vec0flat_insert_sql, "post_insert_hook": None, + "train_sql": None, "run_query": _vec0flat_run_query, + "query_sql": _vec0flat_query_sql, "describe": _vec0flat_describe, } +# ============================================================================ +# Quantized-only implementation (no rescoring) +# ============================================================================ + + +def _quantized_create_table_sql(params): + D = params.get("_dimensions", 768) + quantizer = params["quantizer"] + if quantizer == "int8": + col = f"embedding int8[{D}]" + elif quantizer == "bit": + col = f"embedding bit[{D}]" + else: + raise ValueError(f"Unknown quantizer: {quantizer}") + return ( + f"CREATE VIRTUAL TABLE vec_items USING vec0(" + f" chunk_size=256," + f" id integer primary key," + f" {col})" + ) + + +def _quantized_insert_sql(params): + quantizer = params["quantizer"] + if quantizer == "int8": + return ( + "INSERT INTO vec_items(id, embedding) " + "SELECT id, vec_quantize_int8(vector, 'unit') " + "FROM base.train WHERE id >= :lo AND id < :hi" + ) + elif quantizer == "bit": + return ( + "INSERT INTO vec_items(id, embedding) " + "SELECT id, vec_quantize_binary(vector) " + "FROM base.train WHERE id >= :lo AND id < :hi" + ) + return None + + +def _quantized_run_query(conn, params, query, k): + """Search quantized column only — no 
rescoring.""" + quantizer = params["quantizer"] + if quantizer == "int8": + return conn.execute( + "SELECT id, distance FROM vec_items " + "WHERE embedding MATCH vec_quantize_int8(:query, 'unit') AND k = :k", + {"query": query, "k": k}, + ).fetchall() + elif quantizer == "bit": + return conn.execute( + "SELECT id, distance FROM vec_items " + "WHERE embedding MATCH vec_quantize_binary(:query) AND k = :k", + {"query": query, "k": k}, + ).fetchall() + return None + + +def _quantized_query_sql(params): + quantizer = params["quantizer"] + if quantizer == "int8": + return ( + "SELECT id, distance FROM vec_items " + "WHERE embedding MATCH vec_quantize_int8(:query, 'unit') AND k = :k" + ) + elif quantizer == "bit": + return ( + "SELECT id, distance FROM vec_items " + "WHERE embedding MATCH vec_quantize_binary(:query) AND k = :k" + ) + return None + + +def _quantized_describe(params): + return f"quantized {params['quantizer']}" + + +INDEX_REGISTRY["quantized"] = { + "defaults": {"quantizer": "bit"}, + "create_table_sql": _quantized_create_table_sql, + "insert_sql": _quantized_insert_sql, + "post_insert_hook": None, + "train_sql": None, + "run_query": _quantized_run_query, + "query_sql": _quantized_query_sql, + "describe": _quantized_describe, +} + + # ============================================================================ # Rescore implementation # ============================================================================ def _rescore_create_table_sql(params): + D = params.get("_dimensions", 768) quantizer = params.get("quantizer", "bit") oversample = params.get("oversample", 8) return ( f"CREATE VIRTUAL TABLE vec_items USING vec0(" f" chunk_size=256," f" id integer primary key," - f" embedding float[768] distance_metric=cosine" + f" embedding float[{D}] distance_metric=cosine" f" indexed by rescore(quantizer={quantizer}, oversample={oversample}))" ) @@ -273,7 +423,9 @@ INDEX_REGISTRY["rescore"] = { "create_table_sql": _rescore_create_table_sql, "insert_sql": None, 
"post_insert_hook": None, + "train_sql": None, "run_query": None, # default MATCH query works — rescore is automatic + "query_sql": None, "describe": _rescore_describe, } @@ -284,20 +436,25 @@ INDEX_REGISTRY["rescore"] = { def _ivf_create_table_sql(params): + D = params.get("_dimensions", 768) + quantizer = params.get("quantizer", "none") + oversample = params.get("oversample", 1) + parts = [f"nlist={params['nlist']}", f"nprobe={params['nprobe']}"] + if quantizer != "none": + parts.append(f"quantizer={quantizer}") + if oversample > 1: + parts.append(f"oversample={oversample}") + ivf_args = ", ".join(parts) return ( f"CREATE VIRTUAL TABLE vec_items USING vec0(" - f" id integer primary key," - f" embedding float[768] distance_metric=cosine" - f" indexed by ivf(" - f" nlist={params['nlist']}," - f" nprobe={params['nprobe']}" - f" )" - f")" + f"id integer primary key, " + f"embedding float[{D}] distance_metric=cosine " + f"indexed by ivf({ivf_args}))" ) def _ivf_post_insert_hook(conn, params): - print(" Training k-means centroids...", flush=True) + print(" Training k-means centroids (built-in)...", flush=True) t0 = time.perf_counter() conn.execute("INSERT INTO vec_items(id) VALUES ('compute-centroids')") conn.commit() @@ -306,16 +463,118 @@ def _ivf_post_insert_hook(conn, params): return elapsed +def _ivf_faiss_kmeans_hook(conn, params): + """Run FAISS k-means externally, then load centroids via set-centroid commands. + + Called BEFORE any inserts — centroids are loaded first so vectors get + assigned to partitions on insert (no assign-vectors step needed). 
+ """ + import subprocess + import tempfile + + nlist = params["nlist"] + ntrain = params.get("train_sample", 0) or params.get("faiss_kmeans", 10000) + niter = params.get("faiss_niter", 20) + base_db = params.get("_base_db") # injected by build_index + + print(f" Training k-means via FAISS ({nlist} clusters, {ntrain} vectors, {niter} iters)...", + flush=True) + + centroids_db_path = tempfile.mktemp(suffix=".db") + t0 = time.perf_counter() + + result = subprocess.run( + [ + "uv", "run", "--with", "faiss-cpu", "--with", "numpy", + "python", os.path.join(_SCRIPT_DIR, "faiss_kmeans.py"), + "--base-db", base_db, + "--ntrain", str(ntrain), + "--nclusters", str(nlist), + "--niter", str(niter), + "-o", centroids_db_path, + ], + capture_output=True, text=True, + ) + if result.returncode != 0: + print(f" FAISS stderr: {result.stderr}", flush=True) + raise RuntimeError(f"faiss_kmeans.py failed: {result.stderr}") + + faiss_elapsed = time.perf_counter() - t0 + print(f" FAISS k-means done in {faiss_elapsed:.1f}s", flush=True) + + # Load centroids into vec0 via set-centroid commands + print(f" Loading {nlist} centroids into vec0...", flush=True) + cdb = sqlite3.connect(centroids_db_path) + centroids = cdb.execute( + "SELECT centroid_id, centroid FROM centroids ORDER BY centroid_id" + ).fetchall() + meta = dict(cdb.execute("SELECT key, value FROM meta").fetchall()) + cdb.close() + os.remove(centroids_db_path) + + for cid, blob in centroids: + conn.execute( + "INSERT INTO vec_items(id, embedding) VALUES (?, ?)", + (f"set-centroid:{cid}", blob), + ) + conn.commit() + + elapsed = time.perf_counter() - t0 + print(f" Centroids loaded in {elapsed:.1f}s total", flush=True) + + # Stash meta for results tracking + params["_faiss_meta"] = { + "ntrain": meta.get("ntrain"), + "nclusters": meta.get("nclusters"), + "niter": meta.get("niter"), + "faiss_elapsed_s": meta.get("elapsed_s"), + "total_elapsed_s": round(elapsed, 3), + "trainer": "faiss", + } + + return elapsed + + +def 
_ivf_pre_query_hook(conn, params): + """Override nprobe at runtime via command dispatch.""" + nprobe = params.get("nprobe") + if nprobe: + conn.execute( + "INSERT INTO vec_items(id) VALUES (?)", + (f"nprobe={nprobe}",), + ) + conn.commit() + print(f" Set nprobe={nprobe}") + + def _ivf_describe(params): - return f"ivf nlist={params['nlist']:<4} nprobe={params['nprobe']}" + ts = params.get("train_sample", 0) + q = params.get("quantizer", "none") + os_val = params.get("oversample", 1) + fk = params.get("faiss_kmeans", 0) + desc = f"ivf nlist={params['nlist']:<4} nprobe={params['nprobe']}" + if q != "none": + desc += f" q={q}" + if os_val > 1: + desc += f" os={os_val}" + if fk: + desc += f" faiss" + if ts: + desc += f" ts={ts}" + return desc INDEX_REGISTRY["ivf"] = { - "defaults": {"nlist": 128, "nprobe": 16}, + "defaults": {"nlist": 128, "nprobe": 16, "train_sample": 0, + "quantizer": "none", "oversample": 1, + "faiss_kmeans": 0, "faiss_niter": 20}, "create_table_sql": _ivf_create_table_sql, "insert_sql": None, "post_insert_hook": _ivf_post_insert_hook, + "pre_query_hook": _ivf_pre_query_hook, + "train_sql": lambda _: "INSERT INTO vec_items(id) VALUES ('compute-centroids')", "run_query": None, + "query_sql": None, "describe": _ivf_describe, } @@ -326,24 +585,35 @@ INDEX_REGISTRY["ivf"] = { def _diskann_create_table_sql(params): + D = params.get("_dimensions", 768) + parts = [ + f"neighbor_quantizer={params['quantizer']}", + f"n_neighbors={params['R']}", + ] + L_insert = params.get("L_insert", 0) + L_search = params.get("L_search", 0) + if L_insert or L_search: + li = L_insert or params["L"] + ls = L_search or params["L"] + parts.append(f"search_list_size_insert={li}") + parts.append(f"search_list_size_search={ls}") + else: + parts.append(f"search_list_size={params['L']}") bt = params["buffer_threshold"] - extra = f", buffer_threshold={bt}" if bt > 0 else "" + if bt > 0: + parts.append(f"buffer_threshold={bt}") + diskann_args = ", ".join(parts) return ( f"CREATE 
VIRTUAL TABLE vec_items USING vec0(" - f" id integer primary key," - f" embedding float[768] distance_metric=cosine" - f" INDEXED BY diskann(" - f" neighbor_quantizer={params['quantizer']}," - f" n_neighbors={params['R']}," - f" search_list_size={params['L']}" - f" {extra}" - f" )" - f")" + f"id integer primary key, " + f"embedding float[{D}] distance_metric=cosine " + f"indexed by diskann({diskann_args}))" ) def _diskann_pre_query_hook(conn, params): - L_search = params.get("L_search") + """Override search_list_size_search at runtime via command dispatch.""" + L_search = params.get("L_search", 0) if L_search: conn.execute( "INSERT INTO vec_items(id) VALUES (?)", @@ -354,20 +624,27 @@ def _diskann_pre_query_hook(conn, params): def _diskann_describe(params): - desc = f"diskann q={params['quantizer']:<6} R={params['R']:<3} L={params['L']}" - L_search = params.get("L_search") - if L_search: - desc += f" L_search={L_search}" - return desc + L_insert = params.get("L_insert", 0) + L_search = params.get("L_search", 0) + if L_insert or L_search: + li = L_insert or params["L"] + ls = L_search or params["L"] + l_str = f"Li={li} Ls={ls}" + else: + l_str = f"L={params['L']}" + return f"diskann q={params['quantizer']:<6} R={params['R']:<3} {l_str}" INDEX_REGISTRY["diskann"] = { - "defaults": {"R": 72, "L": 128, "quantizer": "binary", "buffer_threshold": 0}, + "defaults": {"R": 72, "L": 128, "L_insert": 0, "L_search": 0, + "quantizer": "binary", "buffer_threshold": 0}, "create_table_sql": _diskann_create_table_sql, "insert_sql": None, "post_insert_hook": None, "pre_query_hook": _diskann_pre_query_hook, + "train_sql": None, "run_query": None, + "query_sql": None, "describe": _diskann_describe, } @@ -377,8 +654,9 @@ INDEX_REGISTRY["diskann"] = { # ============================================================================ INT_KEYS = { - "R", "L", "L_search", "buffer_threshold", "nlist", "nprobe", "oversample", - "n_trees", "search_k", + "R", "L", "L_insert", "L_search", 
"buffer_threshold", + "nlist", "nprobe", "oversample", "n_trees", "search_k", + "train_sample", "faiss_kmeans", "faiss_niter", } @@ -414,6 +692,12 @@ def parse_config(spec): return name, params +def params_to_json(params): + """Serialize params to JSON, excluding internal keys.""" + return json.dumps({k: v for k, v in sorted(params.items()) + if not k.startswith("_") and k != "index_type"}) + + # ============================================================================ # Shared helpers # ============================================================================ @@ -428,31 +712,59 @@ def load_query_vectors(base_db_path, n): return [(r[0], r[1]) for r in rows] -def insert_loop(conn, sql, subset_size, label=""): - t0 = time.perf_counter() - for lo in range(0, subset_size, INSERT_BATCH_SIZE): +def insert_loop(conn, sql, subset_size, label="", results_db=None, run_id=None, + start_from=0): + loop_start_ns = now_ns() + for lo in range(start_from, subset_size, INSERT_BATCH_SIZE): hi = min(lo + INSERT_BATCH_SIZE, subset_size) + batch_start_ns = now_ns() conn.execute(sql, {"lo": lo, "hi": hi}) conn.commit() + batch_end_ns = now_ns() done = hi + + if results_db is not None and run_id is not None: + elapsed_total_ns = batch_end_ns - loop_start_ns + elapsed_total_s = ns_to_s(elapsed_total_ns) + rate = done / elapsed_total_s if elapsed_total_s > 0 else 0 + results_db.execute( + "INSERT INTO insert_batches " + "(run_id, batch_lo, batch_hi, rows_in_batch, " + " started_ns, ended_ns, duration_ns, " + " cumulative_rows, rate_rows_per_s) " + "VALUES (?,?,?,?,?,?,?,?,?)", + ( + run_id, lo, hi, hi - lo, + batch_start_ns, batch_end_ns, + batch_end_ns - batch_start_ns, + done, round(rate, 1), + ), + ) + + if results_db is not None and run_id is not None: + results_db.commit() + if done % 5000 == 0 or done == subset_size: - elapsed = time.perf_counter() - t0 - rate = done / elapsed if elapsed > 0 else 0 + elapsed_total_ns = batch_end_ns - loop_start_ns + elapsed_total_s = 
ns_to_s(elapsed_total_ns) + rate = done / elapsed_total_s if elapsed_total_s > 0 else 0 print( f" [{label}] {done:>8}/{subset_size} " - f"{elapsed:.1f}s {rate:.0f} rows/s", + f"{elapsed_total_s:.1f}s {rate:.0f} rows/s", flush=True, ) - return time.perf_counter() - t0 + + return time.perf_counter() # not used for timing anymore, kept for compat -def create_bench_db(db_path, ext_path, base_db): +def create_bench_db(db_path, ext_path, base_db, page_size=4096): if os.path.exists(db_path): os.remove(db_path) conn = sqlite3.connect(db_path) conn.enable_load_extension(True) conn.load_extension(ext_path) - conn.execute("PRAGMA page_size=8192") + if page_size != 4096: + conn.execute(f"PRAGMA page_size={page_size}") conn.execute(f"ATTACH DATABASE '{base_db}' AS base") return conn @@ -475,49 +787,212 @@ DEFAULT_INSERT_SQL = ( "SELECT id, vector FROM base.train WHERE id >= :lo AND id < :hi" ) +DEFAULT_QUERY_SQL = ( + "SELECT id, distance FROM vec_items " + "WHERE embedding MATCH :query AND k = :k" +) + + +# ============================================================================ +# Results DB helpers +# ============================================================================ + +_RESULTS_SCHEMA_PATH = os.path.join(_SCRIPT_DIR, "results_schema.sql") + + +def open_results_db(out_dir, dataset, subset_size, results_db_name="results.db"): + """Open/create the results DB in WAL mode.""" + sub_dir = os.path.join(out_dir, dataset, str(subset_size)) + os.makedirs(sub_dir, exist_ok=True) + db_path = os.path.join(sub_dir, results_db_name) + db = sqlite3.connect(db_path, timeout=60) + db.execute("PRAGMA journal_mode=WAL") + db.execute("PRAGMA busy_timeout=60000") + # Migrate existing DBs: add phase column before running schema + cols = {r[1] for r in db.execute("PRAGMA table_info(runs)").fetchall()} + if cols and "phase" not in cols: + db.execute("ALTER TABLE runs ADD COLUMN phase TEXT NOT NULL DEFAULT 'both'") + db.commit() + with open(_RESULTS_SCHEMA_PATH) as f: + 
db.executescript(f.read()) + return db, sub_dir + + +def create_run(results_db, config_name, index_type, params, dataset, + subset_size, k, n_queries, phase="both"): + """Insert a new run row and return the run_id.""" + cur = results_db.execute( + "INSERT INTO runs " + "(config_name, index_type, params, dataset, subset_size, " + " k, n_queries, phase, status, created_at_ns) " + "VALUES (?,?,?,?,?,?,?,?,?,?)", + ( + config_name, index_type, params_to_json(params), dataset, + subset_size, k, n_queries, phase, "pending", now_ns(), + ), + ) + results_db.commit() + return cur.lastrowid + + +def update_run_status(results_db, run_id, status): + results_db.execute( + "UPDATE runs SET status=? WHERE run_id=?", (status, run_id) + ) + results_db.commit() + # ============================================================================ # Build # ============================================================================ -def build_index(base_db, ext_path, name, params, subset_size, out_dir): - db_path = os.path.join(out_dir, f"{name}.{subset_size}.db") - conn = create_bench_db(db_path, ext_path, base_db) +def build_index(base_db, ext_path, name, params, subset_size, sub_dir, + results_db=None, run_id=None, k=None): + db_path = os.path.join(sub_dir, f"{name}.{subset_size}.db") + params["_base_db"] = base_db # expose to hooks (e.g. 
FAISS k-means) + page_size = int(params.get("page_size", 4096)) + conn = create_bench_db(db_path, ext_path, base_db, page_size=page_size) reg = INDEX_REGISTRY[params["index_type"]] - conn.execute(reg["create_table_sql"](params)) + create_sql = reg["create_table_sql"](params) + conn.execute(create_sql) label = params["index_type"] print(f" Inserting {subset_size} vectors...") sql_fn = reg.get("insert_sql") - sql = sql_fn(params) if sql_fn else None - if sql is None: - sql = DEFAULT_INSERT_SQL + insert_sql = sql_fn(params) if sql_fn else None + if insert_sql is None: + insert_sql = DEFAULT_INSERT_SQL - insert_time = insert_loop(conn, sql, subset_size, label) + train_sql_fn = reg.get("train_sql") + train_sql = train_sql_fn(params) if train_sql_fn else None - train_time = 0.0 + query_sql_fn = reg.get("query_sql") + query_sql = query_sql_fn(params) if query_sql_fn else None + if query_sql is None: + query_sql = DEFAULT_QUERY_SQL + + # -- Insert + Training phases -- + train_sample = params.get("train_sample", 0) hook = reg.get("post_insert_hook") - if hook: - train_time = hook(conn, params) + faiss_kmeans = params.get("faiss_kmeans", 0) + + train_started_ns = None + train_ended_ns = None + train_duration_ns = None + train_time_s = 0.0 + + if faiss_kmeans: + # FAISS mode: train on base.db first, load centroids, then insert all + if results_db and run_id: + update_run_status(results_db, run_id, "training") + train_started_ns = now_ns() + train_time_s = _ivf_faiss_kmeans_hook(conn, params) + train_ended_ns = now_ns() + train_duration_ns = train_ended_ns - train_started_ns + + # Now insert all vectors (they get assigned on insert) + if results_db and run_id: + update_run_status(results_db, run_id, "inserting") + insert_started_ns = now_ns() + insert_loop(conn, insert_sql, subset_size, label, + results_db=results_db, run_id=run_id) + insert_ended_ns = now_ns() + insert_duration_ns = insert_ended_ns - insert_started_ns + + elif train_sample and hook and train_sample < 
subset_size: + # Built-in k-means: insert sample, train, insert rest + if results_db and run_id: + update_run_status(results_db, run_id, "inserting") + insert_started_ns = now_ns() + + print(f" Inserting {train_sample} vectors (training sample)...") + insert_loop(conn, insert_sql, train_sample, label, + results_db=results_db, run_id=run_id) + insert_paused_ns = now_ns() + + # -- Training on sample -- + if results_db and run_id: + update_run_status(results_db, run_id, "training") + train_started_ns = now_ns() + train_time_s = hook(conn, params) + train_ended_ns = now_ns() + train_duration_ns = train_ended_ns - train_started_ns + + # -- Insert remaining vectors -- + if results_db and run_id: + update_run_status(results_db, run_id, "inserting") + print(f" Inserting remaining {subset_size - train_sample} vectors...") + insert_loop(conn, insert_sql, subset_size, label, + results_db=results_db, run_id=run_id, + start_from=train_sample) + insert_ended_ns = now_ns() + + # Insert time = total wall time minus training time + insert_duration_ns = (insert_paused_ns - insert_started_ns) + \ + (insert_ended_ns - train_ended_ns) + else: + # Standard flow: insert all, then train + if results_db and run_id: + update_run_status(results_db, run_id, "inserting") + insert_started_ns = now_ns() + + insert_loop(conn, insert_sql, subset_size, label, + results_db=results_db, run_id=run_id) + insert_ended_ns = now_ns() + insert_duration_ns = insert_ended_ns - insert_started_ns + + if hook: + if results_db and run_id: + update_run_status(results_db, run_id, "training") + train_started_ns = now_ns() + train_time_s = hook(conn, params) + train_ended_ns = now_ns() + train_duration_ns = train_ended_ns - train_started_ns row_count = conn.execute("SELECT count(*) FROM vec_items").fetchone()[0] conn.close() - file_size_mb = os.path.getsize(db_path) / (1024 * 1024) + file_size_bytes = os.path.getsize(db_path) + + build_duration_ns = insert_duration_ns + (train_duration_ns or 0) + insert_time_s = 
ns_to_s(insert_duration_ns) + + # If FAISS was used for training, record its meta as train_sql + faiss_meta = params.get("_faiss_meta") + if faiss_meta: + train_sql = json.dumps(faiss_meta) + + # Write run_results (build portion) + if results_db and run_id: + results_db.execute( + "INSERT INTO run_results " + "(run_id, insert_started_ns, insert_ended_ns, insert_duration_ns, " + " train_started_ns, train_ended_ns, train_duration_ns, " + " build_duration_ns, db_file_size_bytes, db_file_path, " + " create_sql, insert_sql, train_sql, query_sql, k) " + "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", + ( + run_id, insert_started_ns, insert_ended_ns, insert_duration_ns, + train_started_ns, train_ended_ns, train_duration_ns, + build_duration_ns, file_size_bytes, db_path, + create_sql, insert_sql, train_sql, query_sql, k, + ), + ) + results_db.commit() return { "db_path": db_path, - "insert_time_s": round(insert_time, 3), - "train_time_s": round(train_time, 3), - "total_time_s": round(insert_time + train_time, 3), - "insert_per_vec_ms": round((insert_time / row_count) * 1000, 2) + "insert_time_s": round(insert_time_s, 3), + "train_time_s": round(train_time_s, 3), + "total_time_s": round(insert_time_s + train_time_s, 3), + "insert_per_vec_ms": round((insert_time_s / row_count) * 1000, 2) if row_count else 0, "rows": row_count, - "file_size_mb": round(file_size_mb, 2), + "file_size_mb": round(file_size_bytes / (1024 * 1024), 2), } @@ -535,7 +1010,7 @@ def _default_match_query(conn, query, k): def measure_knn(db_path, ext_path, base_db, params, subset_size, k=10, n=50, - pre_query_hook=None): + results_db=None, run_id=None, pre_query_hook=None, warmup=0): conn = sqlite3.connect(db_path) conn.enable_load_extension(True) conn.load_extension(ext_path) @@ -549,10 +1024,25 @@ def measure_knn(db_path, ext_path, base_db, params, subset_size, k=10, n=50, reg = INDEX_REGISTRY[params["index_type"]] query_fn = reg.get("run_query") + # Warmup: run random queries to populate OS page cache + if 
warmup > 0: + import random + warmup_vecs = [qv for _, qv in query_vectors] + print(f" Warming up with {warmup} queries...", flush=True) + for _ in range(warmup): + wq = random.choice(warmup_vecs) + if query_fn: + query_fn(conn, params, wq, k) + else: + _default_match_query(conn, wq, k) + + if results_db and run_id: + update_run_status(results_db, run_id, "querying") + times_ms = [] recalls = [] - for qid, query in query_vectors: - t0 = time.perf_counter() + for i, (qid, query) in enumerate(query_vectors): + started_ns = now_ns() results = None if query_fn: @@ -560,9 +1050,13 @@ def measure_knn(db_path, ext_path, base_db, params, subset_size, k=10, n=50, if results is None: results = _default_match_query(conn, query, k) - elapsed_ms = (time.perf_counter() - t0) * 1000 - times_ms.append(elapsed_ms) - result_ids = set(r[0] for r in results) + ended_ns = now_ns() + duration_ms = ns_to_ms(ended_ns - started_ns) + times_ms.append(duration_ms) + + result_ids_list = [r[0] for r in results] + result_distances_list = [r[1] for r in results] + result_ids = set(result_ids_list) # Ground truth: use pre-computed neighbors table for full dataset, # otherwise brute-force over the subset @@ -580,91 +1074,62 @@ def measure_knn(db_path, ext_path, base_db, params, subset_size, k=10, n=50, ")", {"query": query, "k": k, "n": subset_size}, ).fetchall() - gt_ids = set(r[0] for r in gt_rows) + gt_ids_list = [r[0] for r in gt_rows] + gt_ids = set(gt_ids_list) if gt_ids: - recalls.append(len(result_ids & gt_ids) / len(gt_ids)) + q_recall = len(result_ids & gt_ids) / len(gt_ids) else: - recalls.append(0.0) + q_recall = 0.0 + recalls.append(q_recall) + + if results_db and run_id: + results_db.execute( + "INSERT INTO queries " + "(run_id, k, query_vector_id, started_ns, ended_ns, duration_ms, " + " result_ids, result_distances, ground_truth_ids, recall) " + "VALUES (?,?,?,?,?,?,?,?,?,?)", + ( + run_id, k, qid, started_ns, ended_ns, round(duration_ms, 4), + json.dumps(result_ids_list), + 
json.dumps(result_distances_list), + json.dumps(gt_ids_list), + round(q_recall, 6), + ), + ) + results_db.commit() conn.close() + mean_ms = round(statistics.mean(times_ms), 2) + median_ms = round(statistics.median(times_ms), 2) + p99_ms = (round(sorted(times_ms)[int(len(times_ms) * 0.99)], 2) + if len(times_ms) > 1 + else round(times_ms[0], 2)) + total_ms = round(sum(times_ms), 2) + recall = round(statistics.mean(recalls), 4) + qps = round(len(times_ms) / (total_ms / 1000), 1) if total_ms > 0 else 0 + + # Update run_results with query aggregates + if results_db and run_id: + results_db.execute( + "UPDATE run_results SET " + "query_mean_ms=?, query_median_ms=?, query_p99_ms=?, " + "query_total_ms=?, qps=?, recall=? " + "WHERE run_id=?", + (mean_ms, median_ms, p99_ms, total_ms, qps, recall, run_id), + ) + update_run_status(results_db, run_id, "done") + return { - "mean_ms": round(statistics.mean(times_ms), 2), - "median_ms": round(statistics.median(times_ms), 2), - "p99_ms": round(sorted(times_ms)[int(len(times_ms) * 0.99)], 2) - if len(times_ms) > 1 - else round(times_ms[0], 2), - "total_ms": round(sum(times_ms), 2), - "recall": round(statistics.mean(recalls), 4), + "mean_ms": mean_ms, + "median_ms": median_ms, + "p99_ms": p99_ms, + "total_ms": total_ms, + "recall": recall, } -# ============================================================================ -# Results persistence -# ============================================================================ - - -def open_results_db(results_path): - db = sqlite3.connect(results_path) - db.executescript(open(os.path.join(_SCRIPT_DIR, "schema.sql")).read()) - # Migrate existing DBs that predate the runs table - cols = {r[1] for r in db.execute("PRAGMA table_info(runs)").fetchall()} - if "phase" not in cols: - db.execute("ALTER TABLE runs ADD COLUMN phase TEXT NOT NULL DEFAULT 'both'") - db.commit() - return db - - -def create_run(db, config_name, index_type, subset_size, phase, k=None, n=None): - cur = db.execute( - 
"INSERT INTO runs (config_name, index_type, subset_size, phase, status, k, n) " - "VALUES (?, ?, ?, ?, 'pending', ?, ?)", - (config_name, index_type, subset_size, phase, k, n), - ) - db.commit() - return cur.lastrowid - - -def update_run(db, run_id, **kwargs): - sets = ", ".join(f"{k} = ?" for k in kwargs) - vals = list(kwargs.values()) + [run_id] - db.execute(f"UPDATE runs SET {sets} WHERE run_id = ?", vals) - db.commit() - - -def save_results(results_path, rows): - db = sqlite3.connect(results_path) - db.executescript(open(os.path.join(_SCRIPT_DIR, "schema.sql")).read()) - for r in rows: - db.execute( - "INSERT OR REPLACE INTO build_results " - "(config_name, index_type, subset_size, db_path, " - " insert_time_s, train_time_s, total_time_s, rows, file_size_mb) " - "VALUES (?,?,?,?,?,?,?,?,?)", - ( - r["name"], r["index_type"], r["n_vectors"], r["db_path"], - r["insert_time_s"], r["train_time_s"], r["total_time_s"], - r["rows"], r["file_size_mb"], - ), - ) - db.execute( - "INSERT OR REPLACE INTO bench_results " - "(config_name, index_type, subset_size, k, n, " - " mean_ms, median_ms, p99_ms, total_ms, qps, recall, db_path) " - "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)", - ( - r["name"], r["index_type"], r["n_vectors"], r["k"], r["n_queries"], - r["mean_ms"], r["median_ms"], r["p99_ms"], r["total_ms"], - round(r["n_queries"] / (r["total_ms"] / 1000), 1) - if r["total_ms"] > 0 else 0, - r["recall"], r["db_path"], - ), - ) - db.commit() - db.close() - - # ============================================================================ # Reporting # ============================================================================ @@ -699,22 +1164,38 @@ def main(): epilog=__doc__, ) parser.add_argument("configs", nargs="+", help="config specs (name:type=X,key=val,...)") - parser.add_argument("--subset-size", type=int, required=True) + parser.add_argument("--subset-size", type=int, default=None, + help="number of vectors to use (default: all)") parser.add_argument("-k", type=int, 
default=10, help="KNN k (default 10)") parser.add_argument("-n", type=int, default=50, help="number of queries (default 50)") parser.add_argument("--phase", choices=["build", "query", "both"], default="both", help="build=build only, query=query existing index, both=default") - parser.add_argument("--base-db", default=BASE_DB) + parser.add_argument("--dataset", default="cohere1m", + choices=list(DATASETS.keys()), + help="dataset name (default: cohere1m)") parser.add_argument("--ext", default=EXT_PATH) - parser.add_argument("-o", "--out-dir", default="runs") - parser.add_argument("--results-db", default=None, - help="path to results DB (default: /results.db)") + parser.add_argument("-o", "--out-dir", default=os.path.join(_SCRIPT_DIR, "runs")) + parser.add_argument("--warmup", type=int, default=0, + help="run N random warmup queries before measuring (default: 0)") + parser.add_argument("--results-db-name", default="results.db", + help="results DB filename (default: results.db)") args = parser.parse_args() - os.makedirs(args.out_dir, exist_ok=True) - results_db_path = args.results_db or os.path.join(args.out_dir, "results.db") + dataset_cfg = DATASETS[args.dataset] + base_db = dataset_cfg["base_db"] + dimensions = dataset_cfg["dimensions"] + + if args.subset_size is None: + _tmp = sqlite3.connect(base_db) + args.subset_size = _tmp.execute("SELECT COUNT(*) FROM train").fetchone()[0] + _tmp.close() + print(f"Using full dataset: {args.subset_size} vectors") + + results_db, sub_dir = open_results_db(args.out_dir, args.dataset, args.subset_size, + results_db_name=args.results_db_name) configs = [parse_config(c) for c in args.configs] - results_db = open_results_db(results_db_path) + for _, params in configs: + params["_dimensions"] = dimensions all_results = [] for i, (name, params) in enumerate(configs, 1): @@ -722,31 +1203,30 @@ def main(): desc = reg["describe"](params) print(f"\n[{i}/{len(configs)}] {name} ({desc.strip()}) [phase={args.phase}]") - db_path = 
os.path.join(args.out_dir, f"{name}.{args.subset_size}.db") + db_path = os.path.join(sub_dir, f"{name}.{args.subset_size}.db") if args.phase == "build": - run_id = create_run(results_db, name, params["index_type"], - args.subset_size, "build") - update_run(results_db, run_id, status="inserting") + run_id = create_run( + results_db, name, params["index_type"], params, + args.dataset, args.subset_size, args.k, args.n, phase="build", + ) - build = build_index( - args.base_db, args.ext, name, params, args.subset_size, args.out_dir - ) - train_str = f" + {build['train_time_s']}s train" if build["train_time_s"] > 0 else "" - print( - f" Build: {build['insert_time_s']}s insert{train_str} " - f"{build['file_size_mb']} MB" - ) - update_run(results_db, run_id, - status="built", - db_path=build["db_path"], - insert_time_s=build["insert_time_s"], - train_time_s=build["train_time_s"], - total_build_time_s=build["total_time_s"], - rows=build["rows"], - file_size_mb=build["file_size_mb"], - finished_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")) - print(f" Index DB: {build['db_path']}") + try: + build = build_index( + base_db, args.ext, name, params, args.subset_size, sub_dir, + results_db=results_db, run_id=run_id, k=args.k, + ) + train_str = f" + {build['train_time_s']}s train" if build["train_time_s"] > 0 else "" + print( + f" Build: {build['insert_time_s']}s insert{train_str} " + f"{build['file_size_mb']} MB" + ) + update_run_status(results_db, run_id, "built") + print(f" Index DB: {build['db_path']}") + except Exception as e: + update_run_status(results_db, run_id, "error") + print(f" ERROR: {e}") + raise elif args.phase == "query": if not os.path.exists(db_path): @@ -755,30 +1235,35 @@ def main(): f"Build it first with: --phase build" ) - run_id = create_run(results_db, name, params["index_type"], - args.subset_size, "query", k=args.k, n=args.n) - update_run(results_db, run_id, status="querying") - - pre_hook = reg.get("pre_query_hook") - print(f" Measuring 
KNN (k={args.k}, n={args.n})...") - knn = measure_knn( - db_path, args.ext, args.base_db, - params, args.subset_size, k=args.k, n=args.n, - pre_query_hook=pre_hook, + run_id = create_run( + results_db, name, params["index_type"], params, + args.dataset, args.subset_size, args.k, args.n, phase="query", ) - print(f" KNN: mean={knn['mean_ms']}ms recall@{args.k}={knn['recall']}") - qps = round(args.n / (knn["total_ms"] / 1000), 1) if knn["total_ms"] > 0 else 0 - update_run(results_db, run_id, - status="done", - db_path=db_path, - mean_ms=knn["mean_ms"], - median_ms=knn["median_ms"], - p99_ms=knn["p99_ms"], - total_query_ms=knn["total_ms"], - qps=qps, - recall=knn["recall"], - finished_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")) + try: + # Create a run_results row so measure_knn can UPDATE it + file_size_bytes = os.path.getsize(db_path) + results_db.execute( + "INSERT INTO run_results " + "(run_id, db_file_size_bytes, db_file_path, k) " + "VALUES (?,?,?,?)", + (run_id, file_size_bytes, db_path, args.k), + ) + results_db.commit() + + pre_hook = reg.get("pre_query_hook") + print(f" Measuring KNN (k={args.k}, n={args.n})...") + knn = measure_knn( + db_path, args.ext, base_db, + params, args.subset_size, k=args.k, n=args.n, + results_db=results_db, run_id=run_id, + pre_query_hook=pre_hook, warmup=args.warmup, + ) + print(f" KNN: mean={knn['mean_ms']}ms recall@{args.k}={knn['recall']}") + except Exception as e: + update_run_status(results_db, run_id, "error") + print(f" ERROR: {e}") + raise file_size_mb = os.path.getsize(db_path) / (1024 * 1024) all_results.append({ @@ -803,43 +1288,35 @@ def main(): }) else: # both - run_id = create_run(results_db, name, params["index_type"], - args.subset_size, "both", k=args.k, n=args.n) - update_run(results_db, run_id, status="inserting") - - build = build_index( - args.base_db, args.ext, name, params, args.subset_size, args.out_dir + run_id = create_run( + results_db, name, params["index_type"], params, + args.dataset, 
args.subset_size, args.k, args.n, phase="both", ) - train_str = f" + {build['train_time_s']}s train" if build["train_time_s"] > 0 else "" - print( - f" Build: {build['insert_time_s']}s insert{train_str} " - f"{build['file_size_mb']} MB" - ) - update_run(results_db, run_id, status="querying", - db_path=build["db_path"], - insert_time_s=build["insert_time_s"], - train_time_s=build["train_time_s"], - total_build_time_s=build["total_time_s"], - rows=build["rows"], - file_size_mb=build["file_size_mb"]) - print(f" Measuring KNN (k={args.k}, n={args.n})...") - knn = measure_knn( - build["db_path"], args.ext, args.base_db, - params, args.subset_size, k=args.k, n=args.n, - ) - print(f" KNN: mean={knn['mean_ms']}ms recall@{args.k}={knn['recall']}") + try: + build = build_index( + base_db, args.ext, name, params, args.subset_size, sub_dir, + results_db=results_db, run_id=run_id, k=args.k, + ) + train_str = f" + {build['train_time_s']}s train" if build["train_time_s"] > 0 else "" + print( + f" Build: {build['insert_time_s']}s insert{train_str} " + f"{build['file_size_mb']} MB" + ) - qps = round(args.n / (knn["total_ms"] / 1000), 1) if knn["total_ms"] > 0 else 0 - update_run(results_db, run_id, - status="done", - mean_ms=knn["mean_ms"], - median_ms=knn["median_ms"], - p99_ms=knn["p99_ms"], - total_query_ms=knn["total_ms"], - qps=qps, - recall=knn["recall"], - finished_at=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")) + pre_hook = reg.get("pre_query_hook") + print(f" Measuring KNN (k={args.k}, n={args.n})...") + knn = measure_knn( + build["db_path"], args.ext, base_db, + params, args.subset_size, k=args.k, n=args.n, + results_db=results_db, run_id=run_id, + pre_query_hook=pre_hook, warmup=args.warmup, + ) + print(f" KNN: mean={knn['mean_ms']}ms recall@{args.k}={knn['recall']}") + except Exception as e: + update_run_status(results_db, run_id, "error") + print(f" ERROR: {e}") + raise all_results.append({ "name": name, @@ -862,14 +1339,11 @@ def main(): "recall": 
knn["recall"], }) - results_db.close() - if all_results: print_report(all_results) - save_results(results_db_path, all_results) - print(f"\nResults saved to {results_db_path}") - elif args.phase == "build": - print(f"\nBuild complete. Results tracked in {results_db_path}") + + print(f"\nResults DB: {os.path.join(sub_dir, 'results.db')}") + results_db.close() if __name__ == "__main__": diff --git a/benchmarks-ann/datasets/cohere10m/Makefile b/benchmarks-ann/datasets/cohere10m/Makefile new file mode 100644 index 0000000..322b21c --- /dev/null +++ b/benchmarks-ann/datasets/cohere10m/Makefile @@ -0,0 +1,27 @@ +BASE_URL = https://assets.zilliz.com/benchmark/cohere_large_10m + +TRAIN_PARQUETS = $(shell printf 'train-%02d-of-10.parquet ' 0 1 2 3 4 5 6 7 8 9) +OTHER_PARQUETS = test.parquet neighbors.parquet +PARQUETS = $(TRAIN_PARQUETS) $(OTHER_PARQUETS) + +.PHONY: all download clean + +all: base.db + +# Use: make -j12 download +download: $(PARQUETS) + +train-%-of-10.parquet: + curl -L -o $@ $(BASE_URL)/$@ + +test.parquet: + curl -L -o $@ $(BASE_URL)/test.parquet + +neighbors.parquet: + curl -L -o $@ $(BASE_URL)/neighbors.parquet + +base.db: $(PARQUETS) build_base_db.py + uv run --with pandas --with pyarrow python build_base_db.py + +clean: + rm -f base.db diff --git a/benchmarks-ann/datasets/cohere10m/build_base_db.py b/benchmarks-ann/datasets/cohere10m/build_base_db.py new file mode 100644 index 0000000..ceaeb22 --- /dev/null +++ b/benchmarks-ann/datasets/cohere10m/build_base_db.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +"""Build base.db from downloaded parquet files (10M dataset, 10 train shards). + +Reads train-00-of-10.parquet .. train-09-of-10.parquet, test.parquet, +neighbors.parquet and creates a SQLite database with tables: + train, query_vectors, neighbors. 
+ +Usage: + uv run --with pandas --with pyarrow python build_base_db.py +""" +import json +import os +import sqlite3 +import struct +import sys +import time + +import pandas as pd + +TRAIN_SHARDS = 10 + + +def float_list_to_blob(floats): + """Pack a list of floats into a little-endian f32 blob.""" + return struct.pack(f"<{len(floats)}f", *floats) + + +def main(): + script_dir = os.path.dirname(os.path.abspath(__file__)) + db_path = os.path.join(script_dir, "base.db") + + train_paths = [ + os.path.join(script_dir, f"train-{i:02d}-of-{TRAIN_SHARDS}.parquet") + for i in range(TRAIN_SHARDS) + ] + test_path = os.path.join(script_dir, "test.parquet") + neighbors_path = os.path.join(script_dir, "neighbors.parquet") + + for p in train_paths + [test_path, neighbors_path]: + if not os.path.exists(p): + print(f"ERROR: {p} not found. Run 'make download' first.") + sys.exit(1) + + if os.path.exists(db_path): + os.remove(db_path) + + conn = sqlite3.connect(db_path) + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA page_size=4096") + + # --- query_vectors (from test.parquet) --- + print("Loading test.parquet (query vectors)...") + t0 = time.perf_counter() + df_test = pd.read_parquet(test_path) + conn.execute( + "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)" + ) + rows = [] + for _, row in df_test.iterrows(): + rows.append((int(row["id"]), float_list_to_blob(row["emb"]))) + conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows) + conn.commit() + print(f" {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s") + + # --- neighbors (from neighbors.parquet) --- + print("Loading neighbors.parquet...") + t0 = time.perf_counter() + df_neighbors = pd.read_parquet(neighbors_path) + conn.execute( + "CREATE TABLE neighbors (" + " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT," + " UNIQUE(query_vector_id, rank))" + ) + rows = [] + for _, row in df_neighbors.iterrows(): + qid = int(row["id"]) + nids = 
row["neighbors_id"] + if isinstance(nids, str): + nids = json.loads(nids) + for rank, nid in enumerate(nids): + rows.append((qid, rank, str(int(nid)))) + conn.executemany( + "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)", + rows, + ) + conn.commit() + print(f" {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s") + + # --- train (from 10 shard parquets) --- + print(f"Loading {TRAIN_SHARDS} train shards (10M vectors, this will take a while)...") + conn.execute( + "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)" + ) + + global_t0 = time.perf_counter() + total_inserted = 0 + batch_size = 10000 + + for shard_idx, train_path in enumerate(train_paths): + print(f" Shard {shard_idx + 1}/{TRAIN_SHARDS}: {os.path.basename(train_path)}") + t0 = time.perf_counter() + df = pd.read_parquet(train_path) + shard_len = len(df) + + for start in range(0, shard_len, batch_size): + chunk = df.iloc[start : start + batch_size] + rows = [] + for _, row in chunk.iterrows(): + rows.append((int(row["id"]), float_list_to_blob(row["emb"]))) + conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows) + conn.commit() + + total_inserted += len(rows) + if total_inserted % 100000 < batch_size: + elapsed = time.perf_counter() - global_t0 + rate = total_inserted / elapsed if elapsed > 0 else 0 + print( + f" {total_inserted:>10} {elapsed:.0f}s {rate:.0f} rows/s", + flush=True, + ) + + shard_elapsed = time.perf_counter() - t0 + print(f" shard done: {shard_len} rows in {shard_elapsed:.1f}s") + + elapsed = time.perf_counter() - global_t0 + print(f" {total_inserted} train vectors in {elapsed:.1f}s") + + conn.close() + size_mb = os.path.getsize(db_path) / (1024 * 1024) + print(f"\nDone: {db_path} ({size_mb:.0f} MB)") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/seed/.gitignore b/benchmarks-ann/datasets/cohere1m/.gitignore similarity index 100% rename from benchmarks-ann/seed/.gitignore rename to 
benchmarks-ann/datasets/cohere1m/.gitignore diff --git a/benchmarks-ann/seed/Makefile b/benchmarks-ann/datasets/cohere1m/Makefile similarity index 100% rename from benchmarks-ann/seed/Makefile rename to benchmarks-ann/datasets/cohere1m/Makefile diff --git a/benchmarks-ann/seed/build_base_db.py b/benchmarks-ann/datasets/cohere1m/build_base_db.py similarity index 100% rename from benchmarks-ann/seed/build_base_db.py rename to benchmarks-ann/datasets/cohere1m/build_base_db.py diff --git a/benchmarks-ann/datasets/nyt-1024/Makefile b/benchmarks-ann/datasets/nyt-1024/Makefile new file mode 100644 index 0000000..0547409 --- /dev/null +++ b/benchmarks-ann/datasets/nyt-1024/Makefile @@ -0,0 +1,30 @@ +MODEL ?= mixedbread-ai/mxbai-embed-large-v1 +K ?= 100 +BATCH_SIZE ?= 256 +DATA_DIR ?= ../nyt/data + +all: base.db + +# Reuse data from ../nyt +$(DATA_DIR): + $(MAKE) -C ../nyt data + +contents.db: $(DATA_DIR) + uv run ../nyt-768/build-contents.py --data-dir $(DATA_DIR) -o $@ + +base.db: contents.db queries.txt + uv run build-base.py \ + --contents-db contents.db \ + --model $(MODEL) \ + --queries-file queries.txt \ + --batch-size $(BATCH_SIZE) \ + --k $(K) \ + -o $@ + +queries.txt: + cp ../nyt/queries.txt $@ + +clean: + rm -f base.db contents.db + +.PHONY: all clean diff --git a/benchmarks-ann/datasets/nyt-1024/build-base.py b/benchmarks-ann/datasets/nyt-1024/build-base.py new file mode 100644 index 0000000..a0a6b22 --- /dev/null +++ b/benchmarks-ann/datasets/nyt-1024/build-base.py @@ -0,0 +1,163 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "sentence-transformers", +# "torch<=2.7", +# "tqdm", +# ] +# /// + +import argparse +import sqlite3 +from array import array +from itertools import batched + +from sentence_transformers import SentenceTransformer +from tqdm import tqdm + + +def main(): + parser = argparse.ArgumentParser( + description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors", + ) + parser.add_argument( + 
"--contents-db", "-c", default=None, + help="Path to contents.db (source of headlines and IDs)", + ) + parser.add_argument( + "--model", "-m", default="mixedbread-ai/mxbai-embed-large-v1", + help="HuggingFace model ID (default: mixedbread-ai/mxbai-embed-large-v1)", + ) + parser.add_argument( + "--queries-file", "-q", default="queries.txt", + help="Path to the queries file (default: queries.txt)", + ) + parser.add_argument( + "--output", "-o", required=True, + help="Path to the output base.db", + ) + parser.add_argument( + "--batch-size", "-b", type=int, default=256, + help="Batch size for embedding (default: 256)", + ) + parser.add_argument( + "--k", "-k", type=int, default=100, + help="Number of nearest neighbors (default: 100)", + ) + parser.add_argument( + "--limit", "-l", type=int, default=0, + help="Limit number of headlines to embed (0 = all)", + ) + parser.add_argument( + "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0", + help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)", + ) + parser.add_argument( + "--skip-neighbors", action="store_true", + help="Skip the brute-force KNN neighbor computation", + ) + args = parser.parse_args() + + import os + vec_path = os.path.expanduser(args.vec_path) + + print(f"Loading model {args.model}...") + model = SentenceTransformer(args.model) + + # Read headlines from contents.db + src = sqlite3.connect(args.contents_db) + limit_clause = f" LIMIT {args.limit}" if args.limit > 0 else "" + headlines = src.execute( + f"SELECT id, headline FROM contents ORDER BY id{limit_clause}" + ).fetchall() + src.close() + print(f"Loaded {len(headlines)} headlines from {args.contents_db}") + + # Read queries + with open(args.queries_file) as f: + queries = [line.strip() for line in f if line.strip()] + print(f"Loaded {len(queries)} queries from {args.queries_file}") + + # Create output database + db = sqlite3.connect(args.output) + db.enable_load_extension(True) + db.load_extension(vec_path) + 
db.enable_load_extension(False) + + db.execute("CREATE TABLE IF NOT EXISTS train(id INTEGER PRIMARY KEY, vector BLOB)") + db.execute("CREATE TABLE IF NOT EXISTS query_vectors(id INTEGER PRIMARY KEY, vector BLOB)") + db.execute( + "CREATE TABLE IF NOT EXISTS neighbors(" + " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT," + " UNIQUE(query_vector_id, rank))" + ) + + # Step 1: Embed headlines -> train table + print("Embedding headlines...") + for batch in tqdm( + batched(headlines, args.batch_size), + total=(len(headlines) + args.batch_size - 1) // args.batch_size, + ): + ids = [r[0] for r in batch] + texts = [r[1] for r in batch] + embeddings = model.encode(texts, normalize_embeddings=True) + + params = [ + (int(rid), array("f", emb.tolist()).tobytes()) + for rid, emb in zip(ids, embeddings) + ] + db.executemany("INSERT INTO train VALUES (?, ?)", params) + db.commit() + + del headlines + n = db.execute("SELECT count(*) FROM train").fetchone()[0] + print(f"Embedded {n} headlines") + + # Step 2: Embed queries -> query_vectors table + print("Embedding queries...") + query_embeddings = model.encode(queries, normalize_embeddings=True) + query_params = [] + for i, emb in enumerate(query_embeddings, 1): + blob = array("f", emb.tolist()).tobytes() + query_params.append((i, blob)) + db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params) + db.commit() + print(f"Embedded {len(queries)} queries") + + if args.skip_neighbors: + db.close() + print(f"Done (skipped neighbors). Wrote {args.output}") + return + + # Step 3: Brute-force KNN via sqlite-vec -> neighbors table + n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0] + print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...") + for query_id, query_blob in tqdm( + db.execute("SELECT id, vector FROM query_vectors").fetchall() + ): + results = db.execute( + """ + SELECT + train.id, + vec_distance_cosine(train.vector, ?) 
AS distance + FROM train + WHERE distance IS NOT NULL + ORDER BY distance ASC + LIMIT ? + """, + (query_blob, args.k), + ).fetchall() + + params = [ + (query_id, rank, str(rid)) + for rank, (rid, _dist) in enumerate(results) + ] + db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params) + + db.commit() + db.close() + print(f"Done. Wrote {args.output}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/datasets/nyt-1024/queries.txt b/benchmarks-ann/datasets/nyt-1024/queries.txt new file mode 100644 index 0000000..9e98f84 --- /dev/null +++ b/benchmarks-ann/datasets/nyt-1024/queries.txt @@ -0,0 +1,100 @@ +latest news on climate change policy +presidential election results and analysis +stock market crash causes +coronavirus vaccine development updates +artificial intelligence breakthrough in healthcare +supreme court ruling on abortion rights +tech companies layoff announcements +earthquake damages in California +cybersecurity breach at major corporation +space exploration mission to Mars +immigration reform legislation debate +renewable energy investment trends +healthcare costs rising across America +protests against police brutality +wildfires destroy homes in the West +Olympic games highlights and records +celebrity scandal rocks Hollywood +breakthrough cancer treatment discovered +housing market bubble concerns +federal reserve interest rate decision +school shooting tragedy response +diplomatic tensions between superpowers +drone strike kills terrorist leader +social media platform faces regulation +archaeological discovery reveals ancient civilization +unemployment rate hits record low +autonomous vehicles testing expansion +streaming service launches original content +opioid crisis intervention programs +trade war tariffs impact economy +infrastructure bill passes Congress +data privacy concerns grow +minimum wage increase proposal +college admissions scandal exposed +NFL player protest during anthem +cryptocurrency regulation debate 
+pandemic lockdown restrictions eased +mass shooting gun control debate +tax reform legislation impact +ransomware attack cripples pipeline +climate activists stage demonstration +sports team wins championship +banking system collapse fears +pharmaceutical company fraud charges +genetic engineering ethical concerns +border wall funding controversy +impeachment proceedings begin +nuclear weapons treaty violation +artificial meat alternative launch +student loan debt forgiveness +venture capital funding decline +facial recognition ban proposed +election interference investigation +pandemic preparedness failures +police reform measures announced +wildfire prevention strategies +ocean pollution crisis worsens +manufacturing jobs returning +pension fund shortfall concerns +antitrust investigation launched +voting rights protection act +mental health awareness campaign +homeless population increasing +space debris collision risk +drug cartel violence escalates +renewable energy jobs growth +infrastructure deterioration report +vaccine mandate legal challenge +cryptocurrency market volatility +autonomous drone delivery service +deep fake technology dangers +Arctic ice melting accelerates +income inequality gap widens +election fraud claims disputed +corporate merger blocked +medical breakthrough extends life +transportation strike disrupts city +racial justice protests spread +carbon emissions reduction goals +financial crisis warning signs +cyberbullying prevention efforts +asteroid near miss with Earth +gene therapy approval granted +labor union organizing drive +surveillance technology expansion +education funding cuts proposed +disaster relief efforts underway +housing affordability crisis +clean water access shortage +artificial intelligence job displacement +trade agreement negotiations +prison reform initiative launched +species extinction accelerates +political corruption scandal +terrorism threat level raised +food safety contamination outbreak +ai model release 
+affordability interest rates +peanut allergies in newbons +breaking bad walter white \ No newline at end of file diff --git a/benchmarks-ann/datasets/nyt-384/Makefile b/benchmarks-ann/datasets/nyt-384/Makefile new file mode 100644 index 0000000..76296a1 --- /dev/null +++ b/benchmarks-ann/datasets/nyt-384/Makefile @@ -0,0 +1,29 @@ +MODEL ?= mixedbread-ai/mxbai-embed-xsmall-v1 +K ?= 100 +BATCH_SIZE ?= 512 +DATA_DIR ?= ../nyt/data + +all: base.db + +$(DATA_DIR): + $(MAKE) -C ../nyt data + +contents.db: $(DATA_DIR) + uv run ../nyt-768/build-contents.py --data-dir $(DATA_DIR) -o $@ + +base.db: contents.db queries.txt + uv run ../nyt-1024/build-base.py \ + --contents-db contents.db \ + --model $(MODEL) \ + --queries-file queries.txt \ + --batch-size $(BATCH_SIZE) \ + --k $(K) \ + -o $@ + +queries.txt: + cp ../nyt/queries.txt $@ + +clean: + rm -f base.db contents.db + +.PHONY: all clean diff --git a/benchmarks-ann/datasets/nyt-384/queries.txt b/benchmarks-ann/datasets/nyt-384/queries.txt new file mode 100644 index 0000000..9e98f84 --- /dev/null +++ b/benchmarks-ann/datasets/nyt-384/queries.txt @@ -0,0 +1,100 @@ +latest news on climate change policy +presidential election results and analysis +stock market crash causes +coronavirus vaccine development updates +artificial intelligence breakthrough in healthcare +supreme court ruling on abortion rights +tech companies layoff announcements +earthquake damages in California +cybersecurity breach at major corporation +space exploration mission to Mars +immigration reform legislation debate +renewable energy investment trends +healthcare costs rising across America +protests against police brutality +wildfires destroy homes in the West +Olympic games highlights and records +celebrity scandal rocks Hollywood +breakthrough cancer treatment discovered +housing market bubble concerns +federal reserve interest rate decision +school shooting tragedy response +diplomatic tensions between superpowers +drone strike kills terrorist 
leader +social media platform faces regulation +archaeological discovery reveals ancient civilization +unemployment rate hits record low +autonomous vehicles testing expansion +streaming service launches original content +opioid crisis intervention programs +trade war tariffs impact economy +infrastructure bill passes Congress +data privacy concerns grow +minimum wage increase proposal +college admissions scandal exposed +NFL player protest during anthem +cryptocurrency regulation debate +pandemic lockdown restrictions eased +mass shooting gun control debate +tax reform legislation impact +ransomware attack cripples pipeline +climate activists stage demonstration +sports team wins championship +banking system collapse fears +pharmaceutical company fraud charges +genetic engineering ethical concerns +border wall funding controversy +impeachment proceedings begin +nuclear weapons treaty violation +artificial meat alternative launch +student loan debt forgiveness +venture capital funding decline +facial recognition ban proposed +election interference investigation +pandemic preparedness failures +police reform measures announced +wildfire prevention strategies +ocean pollution crisis worsens +manufacturing jobs returning +pension fund shortfall concerns +antitrust investigation launched +voting rights protection act +mental health awareness campaign +homeless population increasing +space debris collision risk +drug cartel violence escalates +renewable energy jobs growth +infrastructure deterioration report +vaccine mandate legal challenge +cryptocurrency market volatility +autonomous drone delivery service +deep fake technology dangers +Arctic ice melting accelerates +income inequality gap widens +election fraud claims disputed +corporate merger blocked +medical breakthrough extends life +transportation strike disrupts city +racial justice protests spread +carbon emissions reduction goals +financial crisis warning signs +cyberbullying prevention efforts +asteroid near 
miss with Earth +gene therapy approval granted +labor union organizing drive +surveillance technology expansion +education funding cuts proposed +disaster relief efforts underway +housing affordability crisis +clean water access shortage +artificial intelligence job displacement +trade agreement negotiations +prison reform initiative launched +species extinction accelerates +political corruption scandal +terrorism threat level raised +food safety contamination outbreak +ai model release +affordability interest rates +peanut allergies in newbons +breaking bad walter white \ No newline at end of file diff --git a/benchmarks-ann/datasets/nyt-768/Makefile b/benchmarks-ann/datasets/nyt-768/Makefile new file mode 100644 index 0000000..93bb72a --- /dev/null +++ b/benchmarks-ann/datasets/nyt-768/Makefile @@ -0,0 +1,37 @@ +MODEL ?= bge-base-en-v1.5-768 +K ?= 100 +BATCH_SIZE ?= 512 +DATA_DIR ?= ../nyt/data + +all: base.db + +# Reuse data from ../nyt +$(DATA_DIR): + $(MAKE) -C ../nyt data + +# Distill model (separate step, may take a while) +$(MODEL): + uv run distill-model.py + +contents.db: $(DATA_DIR) + uv run build-contents.py --data-dir $(DATA_DIR) -o $@ + +base.db: contents.db queries.txt $(MODEL) + uv run ../nyt/build-base.py \ + --contents-db contents.db \ + --model $(MODEL) \ + --queries-file queries.txt \ + --batch-size $(BATCH_SIZE) \ + --k $(K) \ + -o $@ + +queries.txt: + cp ../nyt/queries.txt $@ + +clean: + rm -f base.db contents.db + +clean-all: clean + rm -rf $(MODEL) + +.PHONY: all clean clean-all diff --git a/benchmarks-ann/datasets/nyt-768/build-contents.py b/benchmarks-ann/datasets/nyt-768/build-contents.py new file mode 100644 index 0000000..fc829d8 --- /dev/null +++ b/benchmarks-ann/datasets/nyt-768/build-contents.py @@ -0,0 +1,64 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "duckdb", +# ] +# /// + +import argparse +import sqlite3 +import duckdb + + +def main(): + parser = argparse.ArgumentParser( + description="Load NYT headline 
CSVs into a SQLite contents database (most recent 1M, deduplicated)", + ) + parser.add_argument( + "--data-dir", "-d", default="../nyt/data", + help="Directory containing NYT CSV files (default: ../nyt/data)", + ) + parser.add_argument( + "--limit", "-l", type=int, default=1_000_000, + help="Maximum number of headlines to keep (default: 1000000)", + ) + parser.add_argument( + "--output", "-o", required=True, + help="Path to the output SQLite database", + ) + args = parser.parse_args() + + glob_pattern = f"{args.data_dir}/new_york_times_stories_*.csv" + + con = duckdb.connect() + rows = con.execute( + f""" + WITH deduped AS ( + SELECT + headline, + max(pub_date) AS pub_date + FROM read_csv('{glob_pattern}', auto_detect=true, union_by_name=true) + WHERE headline IS NOT NULL AND trim(headline) != '' + GROUP BY headline + ) + SELECT + row_number() OVER (ORDER BY pub_date DESC) AS id, + headline + FROM deduped + ORDER BY pub_date DESC + LIMIT {args.limit} + """ + ).fetchall() + con.close() + + db = sqlite3.connect(args.output) + db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)") + db.executemany("INSERT INTO contents VALUES (?, ?)", rows) + db.commit() + db.close() + + print(f"Wrote {len(rows)} headlines to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/datasets/nyt-768/distill-model.py b/benchmarks-ann/datasets/nyt-768/distill-model.py new file mode 100644 index 0000000..3adca4a --- /dev/null +++ b/benchmarks-ann/datasets/nyt-768/distill-model.py @@ -0,0 +1,13 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "model2vec[distill]", +# "torch<=2.7", +# ] +# /// + +from model2vec.distill import distill + +model = distill(model_name="BAAI/bge-base-en-v1.5", pca_dims=768) +model.save_pretrained("bge-base-en-v1.5-768") +print("Saved distilled model to bge-base-en-v1.5-768/") diff --git a/benchmarks-ann/datasets/nyt-768/queries.txt b/benchmarks-ann/datasets/nyt-768/queries.txt new file mode 
100644 index 0000000..9e98f84 --- /dev/null +++ b/benchmarks-ann/datasets/nyt-768/queries.txt @@ -0,0 +1,100 @@ +latest news on climate change policy +presidential election results and analysis +stock market crash causes +coronavirus vaccine development updates +artificial intelligence breakthrough in healthcare +supreme court ruling on abortion rights +tech companies layoff announcements +earthquake damages in California +cybersecurity breach at major corporation +space exploration mission to Mars +immigration reform legislation debate +renewable energy investment trends +healthcare costs rising across America +protests against police brutality +wildfires destroy homes in the West +Olympic games highlights and records +celebrity scandal rocks Hollywood +breakthrough cancer treatment discovered +housing market bubble concerns +federal reserve interest rate decision +school shooting tragedy response +diplomatic tensions between superpowers +drone strike kills terrorist leader +social media platform faces regulation +archaeological discovery reveals ancient civilization +unemployment rate hits record low +autonomous vehicles testing expansion +streaming service launches original content +opioid crisis intervention programs +trade war tariffs impact economy +infrastructure bill passes Congress +data privacy concerns grow +minimum wage increase proposal +college admissions scandal exposed +NFL player protest during anthem +cryptocurrency regulation debate +pandemic lockdown restrictions eased +mass shooting gun control debate +tax reform legislation impact +ransomware attack cripples pipeline +climate activists stage demonstration +sports team wins championship +banking system collapse fears +pharmaceutical company fraud charges +genetic engineering ethical concerns +border wall funding controversy +impeachment proceedings begin +nuclear weapons treaty violation +artificial meat alternative launch +student loan debt forgiveness +venture capital funding decline +facial 
recognition ban proposed +election interference investigation +pandemic preparedness failures +police reform measures announced +wildfire prevention strategies +ocean pollution crisis worsens +manufacturing jobs returning +pension fund shortfall concerns +antitrust investigation launched +voting rights protection act +mental health awareness campaign +homeless population increasing +space debris collision risk +drug cartel violence escalates +renewable energy jobs growth +infrastructure deterioration report +vaccine mandate legal challenge +cryptocurrency market volatility +autonomous drone delivery service +deep fake technology dangers +Arctic ice melting accelerates +income inequality gap widens +election fraud claims disputed +corporate merger blocked +medical breakthrough extends life +transportation strike disrupts city +racial justice protests spread +carbon emissions reduction goals +financial crisis warning signs +cyberbullying prevention efforts +asteroid near miss with Earth +gene therapy approval granted +labor union organizing drive +surveillance technology expansion +education funding cuts proposed +disaster relief efforts underway +housing affordability crisis +clean water access shortage +artificial intelligence job displacement +trade agreement negotiations +prison reform initiative launched +species extinction accelerates +political corruption scandal +terrorism threat level raised +food safety contamination outbreak +ai model release +affordability interest rates +peanut allergies in newbons +breaking bad walter white \ No newline at end of file diff --git a/benchmarks-ann/datasets/nyt/.gitignore b/benchmarks-ann/datasets/nyt/.gitignore new file mode 100644 index 0000000..adbb97d --- /dev/null +++ b/benchmarks-ann/datasets/nyt/.gitignore @@ -0,0 +1 @@ +data/ \ No newline at end of file diff --git a/benchmarks-ann/datasets/nyt/Makefile b/benchmarks-ann/datasets/nyt/Makefile new file mode 100644 index 0000000..dfaa6e9 --- /dev/null +++ 
b/benchmarks-ann/datasets/nyt/Makefile @@ -0,0 +1,30 @@ +MODEL ?= minishlab/potion-base-8M +K ?= 100 +BATCH_SIZE ?= 512 +DATA_DIR ?= data + +all: base.db contents.db + +# Download NYT headlines CSVs from Kaggle (requires `kaggle` CLI + API token) +$(DATA_DIR): + kaggle datasets download -d johnbandy/new-york-times-headlines -p $(DATA_DIR) --unzip + +contents.db: $(DATA_DIR) + uv run build-contents.py --data-dir $(DATA_DIR) -o $@ + +base.db: contents.db queries.txt + uv run build-base.py \ + --contents-db contents.db \ + --model $(MODEL) \ + --queries-file queries.txt \ + --batch-size $(BATCH_SIZE) \ + --k $(K) \ + -o $@ + +clean: + rm -f base.db contents.db + +clean-all: clean + rm -rf $(DATA_DIR) + +.PHONY: all clean clean-all diff --git a/benchmarks-ann/datasets/nyt/build-base.py b/benchmarks-ann/datasets/nyt/build-base.py new file mode 100644 index 0000000..db00aa2 --- /dev/null +++ b/benchmarks-ann/datasets/nyt/build-base.py @@ -0,0 +1,165 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "model2vec", +# "torch<=2.7", +# "tqdm", +# ] +# /// + +import argparse +import sqlite3 +from array import array +from itertools import batched + +from model2vec import StaticModel +from tqdm import tqdm + + +def main(): + parser = argparse.ArgumentParser( + description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors", + ) + parser.add_argument( + "--contents-db", "-c", default=None, + help="Path to contents.db (source of headlines and IDs)", + ) + parser.add_argument( + "--model", "-m", default="minishlab/potion-base-8M", + help="HuggingFace model ID or local path (default: minishlab/potion-base-8M)", + ) + parser.add_argument( + "--queries-file", "-q", default="queries.txt", + help="Path to the queries file (default: queries.txt)", + ) + parser.add_argument( + "--output", "-o", required=True, + help="Path to the output base.db", + ) + parser.add_argument( + "--batch-size", "-b", type=int, default=512, + help="Batch size for 
embedding (default: 512)", + ) + parser.add_argument( + "--k", "-k", type=int, default=100, + help="Number of nearest neighbors (default: 100)", + ) + parser.add_argument( + "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0", + help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)", + ) + parser.add_argument( + "--rebuild-neighbors", action="store_true", + help="Only rebuild the neighbors table (skip embedding steps)", + ) + args = parser.parse_args() + + import os + vec_path = os.path.expanduser(args.vec_path) + + if args.rebuild_neighbors: + # Skip embedding, just open existing DB and rebuild neighbors + db = sqlite3.connect(args.output) + db.enable_load_extension(True) + db.load_extension(vec_path) + db.enable_load_extension(False) + db.execute("DROP TABLE IF EXISTS neighbors") + db.execute( + "CREATE TABLE neighbors(" + " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT," + " UNIQUE(query_vector_id, rank))" + ) + print(f"Rebuilding neighbors in {args.output}...") + else: + print(f"Loading model {args.model}...") + model = StaticModel.from_pretrained(args.model) + + # Read headlines from contents.db + src = sqlite3.connect(args.contents_db) + headlines = src.execute("SELECT id, headline FROM contents ORDER BY id").fetchall() + src.close() + print(f"Loaded {len(headlines)} headlines from {args.contents_db}") + + # Read queries + with open(args.queries_file) as f: + queries = [line.strip() for line in f if line.strip()] + print(f"Loaded {len(queries)} queries from {args.queries_file}") + + # Create output database + db = sqlite3.connect(args.output) + db.enable_load_extension(True) + db.load_extension(vec_path) + db.enable_load_extension(False) + + db.execute("CREATE TABLE train(id INTEGER PRIMARY KEY, vector BLOB)") + db.execute("CREATE TABLE query_vectors(id INTEGER PRIMARY KEY, vector BLOB)") + db.execute( + "CREATE TABLE neighbors(" + " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT," + " 
UNIQUE(query_vector_id, rank))" + ) + + # Step 1: Embed headlines -> train table + print("Embedding headlines...") + for batch in tqdm( + batched(headlines, args.batch_size), + total=(len(headlines) + args.batch_size - 1) // args.batch_size, + ): + ids = [r[0] for r in batch] + texts = [r[1] for r in batch] + embeddings = model.encode(texts) + + params = [ + (int(rid), array("f", emb.tolist()).tobytes()) + for rid, emb in zip(ids, embeddings) + ] + db.executemany("INSERT INTO train VALUES (?, ?)", params) + db.commit() + + del headlines + n = db.execute("SELECT count(*) FROM train").fetchone()[0] + print(f"Embedded {n} headlines") + + # Step 2: Embed queries -> query_vectors table + print("Embedding queries...") + query_embeddings = model.encode(queries) + query_params = [] + for i, emb in enumerate(query_embeddings, 1): + blob = array("f", emb.tolist()).tobytes() + query_params.append((i, blob)) + db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params) + db.commit() + print(f"Embedded {len(queries)} queries") + + # Step 3: Brute-force KNN via sqlite-vec -> neighbors table + n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0] + print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...") + for query_id, query_blob in tqdm( + db.execute("SELECT id, vector FROM query_vectors").fetchall() + ): + results = db.execute( + """ + SELECT + train.id, + vec_distance_cosine(train.vector, ?) AS distance + FROM train + WHERE distance IS NOT NULL + ORDER BY distance ASC + LIMIT ? + """, + (query_blob, args.k), + ).fetchall() + + params = [ + (query_id, rank, str(rid)) + for rank, (rid, _dist) in enumerate(results) + ] + db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params) + + db.commit() + db.close() + print(f"Done. 
Wrote {args.output}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/datasets/nyt/build-contents.py b/benchmarks-ann/datasets/nyt/build-contents.py new file mode 100644 index 0000000..7e99cb9 --- /dev/null +++ b/benchmarks-ann/datasets/nyt/build-contents.py @@ -0,0 +1,52 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "duckdb", +# ] +# /// + +import argparse +import os +import sqlite3 +import duckdb + + +def main(): + parser = argparse.ArgumentParser( + description="Load NYT headline CSVs into a SQLite contents database via DuckDB", + ) + parser.add_argument( + "--data-dir", "-d", default="data", + help="Directory containing NYT CSV files (default: data)", + ) + parser.add_argument( + "--output", "-o", required=True, + help="Path to the output SQLite database", + ) + args = parser.parse_args() + + glob_pattern = os.path.join(args.data_dir, "new_york_times_stories_*.csv") + + con = duckdb.connect() + rows = con.execute( + f""" + SELECT + row_number() OVER () AS id, + headline + FROM read_csv('{glob_pattern}', auto_detect=true, union_by_name=true) + WHERE headline IS NOT NULL AND headline != '' + """ + ).fetchall() + con.close() + + db = sqlite3.connect(args.output) + db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)") + db.executemany("INSERT INTO contents VALUES (?, ?)", rows) + db.commit() + db.close() + + print(f"Wrote {len(rows)} headlines to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/datasets/nyt/queries.txt b/benchmarks-ann/datasets/nyt/queries.txt new file mode 100644 index 0000000..9e98f84 --- /dev/null +++ b/benchmarks-ann/datasets/nyt/queries.txt @@ -0,0 +1,100 @@ +latest news on climate change policy +presidential election results and analysis +stock market crash causes +coronavirus vaccine development updates +artificial intelligence breakthrough in healthcare +supreme court ruling on abortion rights +tech companies layoff announcements 
+earthquake damages in California +cybersecurity breach at major corporation +space exploration mission to Mars +immigration reform legislation debate +renewable energy investment trends +healthcare costs rising across America +protests against police brutality +wildfires destroy homes in the West +Olympic games highlights and records +celebrity scandal rocks Hollywood +breakthrough cancer treatment discovered +housing market bubble concerns +federal reserve interest rate decision +school shooting tragedy response +diplomatic tensions between superpowers +drone strike kills terrorist leader +social media platform faces regulation +archaeological discovery reveals ancient civilization +unemployment rate hits record low +autonomous vehicles testing expansion +streaming service launches original content +opioid crisis intervention programs +trade war tariffs impact economy +infrastructure bill passes Congress +data privacy concerns grow +minimum wage increase proposal +college admissions scandal exposed +NFL player protest during anthem +cryptocurrency regulation debate +pandemic lockdown restrictions eased +mass shooting gun control debate +tax reform legislation impact +ransomware attack cripples pipeline +climate activists stage demonstration +sports team wins championship +banking system collapse fears +pharmaceutical company fraud charges +genetic engineering ethical concerns +border wall funding controversy +impeachment proceedings begin +nuclear weapons treaty violation +artificial meat alternative launch +student loan debt forgiveness +venture capital funding decline +facial recognition ban proposed +election interference investigation +pandemic preparedness failures +police reform measures announced +wildfire prevention strategies +ocean pollution crisis worsens +manufacturing jobs returning +pension fund shortfall concerns +antitrust investigation launched +voting rights protection act +mental health awareness campaign +homeless population increasing +space 
debris collision risk +drug cartel violence escalates +renewable energy jobs growth +infrastructure deterioration report +vaccine mandate legal challenge +cryptocurrency market volatility +autonomous drone delivery service +deep fake technology dangers +Arctic ice melting accelerates +income inequality gap widens +election fraud claims disputed +corporate merger blocked +medical breakthrough extends life +transportation strike disrupts city +racial justice protests spread +carbon emissions reduction goals +financial crisis warning signs +cyberbullying prevention efforts +asteroid near miss with Earth +gene therapy approval granted +labor union organizing drive +surveillance technology expansion +education funding cuts proposed +disaster relief efforts underway +housing affordability crisis +clean water access shortage +artificial intelligence job displacement +trade agreement negotiations +prison reform initiative launched +species extinction accelerates +political corruption scandal +terrorism threat level raised +food safety contamination outbreak +ai model release +affordability interest rates +peanut allergies in newbons +breaking bad walter white \ No newline at end of file diff --git a/benchmarks-ann/faiss_kmeans.py b/benchmarks-ann/faiss_kmeans.py new file mode 100644 index 0000000..9765a7b --- /dev/null +++ b/benchmarks-ann/faiss_kmeans.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +"""Compute k-means centroids using FAISS and save to a centroids DB. + +Reads the first N vectors from a base.db, runs FAISS k-means, and writes +the centroids to an output SQLite DB as float32 blobs. 
+ +Usage: + python faiss_kmeans.py --base-db datasets/cohere10m/base.db --ntrain 100000 \ + --nclusters 8192 -o centroids.db + +Output schema: + CREATE TABLE centroids ( + centroid_id INTEGER PRIMARY KEY, + centroid BLOB NOT NULL -- float32[D] + ); + CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT); + -- ntrain, nclusters, dimensions, elapsed_s +""" +import argparse +import os +import sqlite3 +import struct +import time + +import faiss +import numpy as np + + +def main(): + parser = argparse.ArgumentParser(description="FAISS k-means centroid computation") + parser.add_argument("--base-db", required=True, help="path to base.db with train table") + parser.add_argument("--ntrain", type=int, required=True, help="number of vectors to train on") + parser.add_argument("--nclusters", type=int, required=True, help="number of clusters (nlist)") + parser.add_argument("--niter", type=int, default=20, help="k-means iterations (default 20)") + parser.add_argument("--seed", type=int, default=42, help="random seed") + parser.add_argument("-o", "--output", required=True, help="output centroids DB path") + args = parser.parse_args() + + # Load vectors + print(f"Loading {args.ntrain} vectors from {args.base_db}...") + conn = sqlite3.connect(args.base_db) + rows = conn.execute( + "SELECT vector FROM train ORDER BY id LIMIT ?", (args.ntrain,) + ).fetchall() + conn.close() + + # Parse float32 blobs to numpy + first_blob = rows[0][0] + D = len(first_blob) // 4 # float32 + print(f" Dimensions: {D}, loaded {len(rows)} vectors") + + vectors = np.zeros((len(rows), D), dtype=np.float32) + for i, (blob,) in enumerate(rows): + vectors[i] = np.frombuffer(blob, dtype=np.float32) + + # Normalize for cosine distance (FAISS k-means on L2 of unit vectors ≈ cosine) + norms = np.linalg.norm(vectors, axis=1, keepdims=True) + norms[norms == 0] = 1 + vectors /= norms + + # Run FAISS k-means + print(f"Running k-means: {args.nclusters} clusters, {args.niter} iterations...") + t0 = time.perf_counter() + 
kmeans = faiss.Kmeans( + D, args.nclusters, + niter=args.niter, + seed=args.seed, + verbose=True, + gpu=False, + ) + kmeans.train(vectors) + elapsed = time.perf_counter() - t0 + print(f" Done in {elapsed:.1f}s") + + centroids = kmeans.centroids # (nclusters, D) float32 + + # Write output DB + if os.path.exists(args.output): + os.remove(args.output) + out = sqlite3.connect(args.output) + out.execute("CREATE TABLE centroids (centroid_id INTEGER PRIMARY KEY, centroid BLOB NOT NULL)") + out.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT)") + + for i in range(args.nclusters): + blob = centroids[i].tobytes() + out.execute("INSERT INTO centroids (centroid_id, centroid) VALUES (?, ?)", (i, blob)) + + out.execute("INSERT INTO meta VALUES ('ntrain', ?)", (str(args.ntrain),)) + out.execute("INSERT INTO meta VALUES ('nclusters', ?)", (str(args.nclusters),)) + out.execute("INSERT INTO meta VALUES ('dimensions', ?)", (str(D),)) + out.execute("INSERT INTO meta VALUES ('niter', ?)", (str(args.niter),)) + out.execute("INSERT INTO meta VALUES ('elapsed_s', ?)", (str(round(elapsed, 3)),)) + out.execute("INSERT INTO meta VALUES ('seed', ?)", (str(args.seed),)) + out.commit() + out.close() + + print(f"Wrote {args.nclusters} centroids to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/results_schema.sql b/benchmarks-ann/results_schema.sql new file mode 100644 index 0000000..7918709 --- /dev/null +++ b/benchmarks-ann/results_schema.sql @@ -0,0 +1,76 @@ +-- Comprehensive results schema for vec0 KNN benchmark runs. 
+-- Created in WAL mode: PRAGMA journal_mode=WAL + +CREATE TABLE IF NOT EXISTS runs ( + run_id INTEGER PRIMARY KEY AUTOINCREMENT, + config_name TEXT NOT NULL, + index_type TEXT NOT NULL, + params TEXT NOT NULL, -- JSON: {"R":48,"L":128,"quantizer":"binary"} + dataset TEXT NOT NULL, -- "cohere1m" + subset_size INTEGER NOT NULL, + k INTEGER NOT NULL, + n_queries INTEGER NOT NULL, + phase TEXT NOT NULL DEFAULT 'both', + -- 'build', 'query', or 'both' + status TEXT NOT NULL DEFAULT 'pending', + -- pending → inserting → training → querying → done | built | error + created_at_ns INTEGER NOT NULL -- time.time_ns() +); + +CREATE TABLE IF NOT EXISTS run_results ( + run_id INTEGER PRIMARY KEY REFERENCES runs(run_id), + insert_started_ns INTEGER, + insert_ended_ns INTEGER, + insert_duration_ns INTEGER, + train_started_ns INTEGER, -- NULL if no training + train_ended_ns INTEGER, + train_duration_ns INTEGER, + build_duration_ns INTEGER, -- insert + train + db_file_size_bytes INTEGER, + db_file_path TEXT, + create_sql TEXT, -- CREATE VIRTUAL TABLE ... + insert_sql TEXT, -- INSERT INTO vec_items ... + train_sql TEXT, -- NULL if no training step + query_sql TEXT, -- SELECT ... WHERE embedding MATCH ... 
+ k INTEGER, -- denormalized from runs for easy filtering + query_mean_ms REAL, -- denormalized aggregates + query_median_ms REAL, + query_p99_ms REAL, + query_total_ms REAL, + qps REAL, + recall REAL +); + +CREATE TABLE IF NOT EXISTS insert_batches ( + batch_id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES runs(run_id), + batch_lo INTEGER NOT NULL, -- start index (inclusive) + batch_hi INTEGER NOT NULL, -- end index (exclusive) + rows_in_batch INTEGER NOT NULL, + started_ns INTEGER NOT NULL, + ended_ns INTEGER NOT NULL, + duration_ns INTEGER NOT NULL, + cumulative_rows INTEGER NOT NULL, -- total rows inserted so far + rate_rows_per_s REAL NOT NULL -- cumulative rate +); + +CREATE TABLE IF NOT EXISTS queries ( + query_id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES runs(run_id), + k INTEGER NOT NULL, + query_vector_id INTEGER NOT NULL, + started_ns INTEGER NOT NULL, + ended_ns INTEGER NOT NULL, + duration_ms REAL NOT NULL, + result_ids TEXT NOT NULL, -- JSON array + result_distances TEXT NOT NULL, -- JSON array + ground_truth_ids TEXT NOT NULL, -- JSON array + recall REAL NOT NULL, + UNIQUE(run_id, k, query_vector_id) +); + +CREATE INDEX IF NOT EXISTS idx_runs_config ON runs(config_name); +CREATE INDEX IF NOT EXISTS idx_runs_type ON runs(index_type); +CREATE INDEX IF NOT EXISTS idx_runs_status ON runs(status); +CREATE INDEX IF NOT EXISTS idx_batches_run ON insert_batches(run_id); +CREATE INDEX IF NOT EXISTS idx_queries_run ON queries(run_id); From 85973b38146291dc9aec8da5c85b47568ad18336 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 01:31:39 -0700 Subject: [PATCH 14/38] v0.1.10-alpha.1 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 1a03094..63759ca 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.9 +0.1.10-alpha.1 \ No newline at end of file From 85cf4153972c2848831655bafd57cb5e03c5b20d Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: 
Tue, 31 Mar 2026 12:57:06 -0700 Subject: [PATCH 15/38] Remove dead typedef macros and harmful u_int*_t redefinitions The UINT32_TYPE/UINT16_TYPE/INT16_TYPE/UINT8_TYPE/INT8_TYPE/LONGDOUBLE_TYPE macros (copied from sqlite3.c) were never used anywhere in sqlite-vec. The u_int8_t/u_int16_t/u_int64_t typedefs redefined standard types using BSD-only types despite stdint.h already being included, breaking builds on musl/Alpine, strict C99, and requiring reactive platform guards. Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec.c | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) diff --git a/sqlite-vec.c b/sqlite-vec.c index abdafe0..e4cfbc1 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -22,61 +22,10 @@ SQLITE_EXTENSION_INIT1 #include "sqlite3.h" #endif -#ifndef UINT32_TYPE -#ifdef HAVE_UINT32_T -#define UINT32_TYPE uint32_t -#else -#define UINT32_TYPE unsigned int -#endif -#endif -#ifndef UINT16_TYPE -#ifdef HAVE_UINT16_T -#define UINT16_TYPE uint16_t -#else -#define UINT16_TYPE unsigned short int -#endif -#endif -#ifndef INT16_TYPE -#ifdef HAVE_INT16_T -#define INT16_TYPE int16_t -#else -#define INT16_TYPE short int -#endif -#endif -#ifndef UINT8_TYPE -#ifdef HAVE_UINT8_T -#define UINT8_TYPE uint8_t -#else -#define UINT8_TYPE unsigned char -#endif -#endif -#ifndef INT8_TYPE -#ifdef HAVE_INT8_T -#define INT8_TYPE int8_t -#else -#define INT8_TYPE signed char -#endif -#endif -#ifndef LONGDOUBLE_TYPE -#define LONGDOUBLE_TYPE long double -#endif - #ifndef SQLITE_VEC_ENABLE_DISKANN #define SQLITE_VEC_ENABLE_DISKANN 1 #endif -#ifndef _WIN32 -#ifndef __EMSCRIPTEN__ -#ifndef __COSMOPOLITAN__ -#ifndef __wasi__ -typedef u_int8_t uint8_t; -typedef u_int16_t uint16_t; -typedef u_int64_t uint64_t; -#endif -#endif -#endif -#endif - typedef int8_t i8; typedef uint8_t u8; typedef int16_t i16; From 3cfc2e0c1f08ed8120547b61c316afa0ee97b837 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 13:14:18 -0700 Subject: [PATCH 16/38] Fix
broken unzip -d line in vendor.sh Remove stray incomplete `unzip -d` command that would error on CI. Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/vendor.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/vendor.sh b/scripts/vendor.sh index 0706aa5..033ea1e 100755 --- a/scripts/vendor.sh +++ b/scripts/vendor.sh @@ -1,7 +1,6 @@ #!/bin/bash mkdir -p vendor curl -o sqlite-amalgamation.zip https://www.sqlite.org/2024/sqlite-amalgamation-3450300.zip -unzip -d unzip sqlite-amalgamation.zip mv sqlite-amalgamation-3450300/* vendor/ rmdir sqlite-amalgamation-3450300 From 07f56e3cbe28cac5b6046da53dd0fffbbc7109a1 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 13:51:08 -0700 Subject: [PATCH 17/38] Fix #if SQLITE_VEC_ENABLE_RESCORE guards wrapping non-rescore logic Six sites used #if SQLITE_VEC_ENABLE_RESCORE to guard _vector_chunks skip logic that applies to ALL non-flat index types. When RESCORE was compiled out, DiskANN and IVF columns would incorrectly access flat chunk tables. Two sites also missed DiskANN in the skip enumeration, which would break mixed flat+DiskANN tables. Fix: replace all six compile-time guards with unconditional runtime `!= VEC0_INDEX_TYPE_FLAT` checks. Also move rescore_on_delete inside the !vec0_all_columns_diskann guard to prevent use of uninitialized chunk_id/chunk_offset, and initialize those variables to 0. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec.c | 51 ++++++++++++++++----------------------------------- 1 file changed, 16 insertions(+), 35 deletions(-) diff --git a/sqlite-vec.c b/sqlite-vec.c index e4cfbc1..db79d0b 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -4625,16 +4625,10 @@ int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk } int vector_column_idx = p->user_column_idxs[i]; -#if SQLITE_VEC_ENABLE_RESCORE - // Rescore and IVF columns don't use _vector_chunks for float storage - if (p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_RESCORE -#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE - || p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_IVF -#endif - ) { + // Non-FLAT columns (rescore, IVF, DiskANN) don't use _vector_chunks + if (p->vector_columns[vector_column_idx].index_type != VEC0_INDEX_TYPE_FLAT) { continue; } -#endif i64 vectorsSize = p->chunk_size * vector_column_byte_size(p->vector_columns[vector_column_idx]); @@ -5418,11 +5412,9 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, sqlite3_finalize(stmt); for (int i = 0; i < pNew->numVectorColumns; i++) { -#if SQLITE_VEC_ENABLE_RESCORE - // Non-FLAT columns don't use _vector_chunks + // Non-FLAT columns (rescore, IVF, DiskANN) don't use _vector_chunks if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) continue; -#endif char *zSql = sqlite3_mprintf(VEC0_SHADOW_VECTOR_N_CREATE, pNew->schemaName, pNew->tableName, i); if (!zSql) { @@ -5711,10 +5703,9 @@ static int vec0Destroy(sqlite3_vtab *pVtab) { continue; } #endif -#if SQLITE_VEC_ENABLE_RESCORE + // Non-FLAT columns (rescore, IVF, DiskANN) don't use _vector_chunks if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) continue; -#endif zSql = sqlite3_mprintf("DROP TABLE \"%w\".\"%w\"", p->schemaName, p->shadowVectorChunksNames[i]); rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); @@ -8764,15 +8755,9 @@ int 
vec0Update_InsertWriteFinalStep(vec0_vtab *p, i64 chunk_rowid, // Go insert the vector data into the vector chunk shadow tables for (int i = 0; i < p->numVectorColumns; i++) { -#if SQLITE_VEC_ENABLE_RESCORE - // Rescore and IVF columns don't use _vector_chunks - if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE -#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE - || p->vector_columns[i].index_type == VEC0_INDEX_TYPE_IVF -#endif - ) + // Non-FLAT columns (rescore, IVF, DiskANN) don't use _vector_chunks + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) continue; -#endif sqlite3_blob *blobVectors; rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowVectorChunksNames[i], @@ -9398,11 +9383,9 @@ int vec0Update_Delete_ClearVectors(vec0_vtab *p, i64 chunk_id, u64 chunk_offset) { int rc, brc; for (int i = 0; i < p->numVectorColumns; i++) { -#if SQLITE_VEC_ENABLE_RESCORE - // Non-FLAT columns don't use _vector_chunks + // Non-FLAT columns (rescore, IVF, DiskANN) don't use _vector_chunks if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) continue; -#endif sqlite3_blob *blobVectors = NULL; size_t n = vector_column_byte_size(p->vector_columns[i]); @@ -9514,10 +9497,9 @@ int vec0Update_Delete_DeleteChunkIfEmpty(vec0_vtab *p, i64 chunk_id, // Delete from each _vector_chunksNN for (int i = 0; i < p->numVectorColumns; i++) { -#if SQLITE_VEC_ENABLE_RESCORE + // Non-FLAT columns (rescore, IVF, DiskANN) don't use _vector_chunks if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) continue; -#endif zSql = sqlite3_mprintf( "DELETE FROM " VEC0_SHADOW_VECTOR_N_NAME " WHERE rowid = ?", p->schemaName, p->tableName, i); @@ -9711,8 +9693,8 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { vec0_vtab *p = (vec0_vtab *)pVTab; int rc; i64 rowid; - i64 chunk_id; - i64 chunk_offset; + i64 chunk_id = 0; + i64 chunk_offset = 0; if (p->pkIsText) { rc = vec0_rowid_from_id(p, idValue, &rowid); @@ -9764,16 +9746,15 @@ int 
vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { if (rc != SQLITE_OK) { return rc; } - } - #if SQLITE_VEC_ENABLE_RESCORE - // 4b. zero out quantized data in rescore chunk tables, delete from rescore vectors - rc = rescore_on_delete(p, chunk_id, chunk_offset, rowid); - if (rc != SQLITE_OK) { - return rc; - } + // 4b. zero out quantized data in rescore chunk tables, delete from rescore vectors + rc = rescore_on_delete(p, chunk_id, chunk_offset, rowid); + if (rc != SQLITE_OK) { + return rc; + } #endif + } // 5. delete from _rowids table rc = vec0Update_Delete_DeleteRowids(p, rowid); From 9df59b4c03e2a882cb16609f825aa1e7726bce1d Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 14:08:08 -0700 Subject: [PATCH 18/38] Temporarily block vector UPDATE for DiskANN and IVF indexes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vec0Update_UpdateVectorColumn writes to flat chunk blobs but does not update DiskANN graph or IVF index structures, silently corrupting KNN results. Now returns a clear error for these index types. Rescore UPDATE is unaffected — it already has a full implementation that updates both quantized chunks and float vectors. Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec.c | 20 ++++++++++++++++++++ tests/test-diskann.py | 15 +++------------ tests/test-ivf-mutations.py | 12 ++++++++++++ 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/sqlite-vec.c b/sqlite-vec.c index db79d0b..53c4635 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -10055,6 +10055,26 @@ int vec0Update_Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv) { continue; } + // Block vector UPDATE for index types that don't implement it — + // the DiskANN graph / IVF lists would become stale. 
+ { + enum Vec0IndexType idx_type = p->vector_columns[vector_idx].index_type; + const char *idx_name = NULL; + if (idx_type == VEC0_INDEX_TYPE_DISKANN) idx_name = "DiskANN"; +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + else if (idx_type == VEC0_INDEX_TYPE_IVF) idx_name = "IVF"; +#endif + if (idx_name) { + vtab_set_error( + &p->base, + "UPDATE on vector column \"%.*s\" is not supported for %s indexes.", + p->vector_columns[vector_idx].name_length, + p->vector_columns[vector_idx].name, + idx_name); + return SQLITE_ERROR; + } + } + rc = vec0Update_UpdateVectorColumn(p, chunk_id, chunk_offset, vector_idx, valueVector, rowid); if (rc != SQLITE_OK) { diff --git a/tests/test-diskann.py b/tests/test-diskann.py index 4c049ce..4fad96b 100644 --- a/tests/test-diskann.py +++ b/tests/test-diskann.py @@ -891,7 +891,7 @@ def test_diskann_delete_preserves_graph_connectivity(db): # ====================================================================== def test_diskann_update_vector(db): - """UPDATE a vector on DiskANN table may not be supported; verify it either works or errors cleanly.""" + """UPDATE a vector on DiskANN table should error (will be implemented soon).""" db.execute(""" CREATE VIRTUAL TABLE t USING vec0( emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) @@ -901,17 +901,8 @@ def test_diskann_update_vector(db): db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([0, 1, 0, 0, 0, 0, 0, 0])]) db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([0, 0, 1, 0, 0, 0, 0, 0])]) - # UPDATE may not be fully supported for DiskANN yet; verify no crash - result = exec(db, "UPDATE t SET emb = ? WHERE rowid = 1", [_f32([0, 0.9, 0.1, 0, 0, 0, 0, 0])]) - if "error" not in result: - # If UPDATE succeeded, verify KNN reflects the new value - rows = db.execute( - "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=3", - [_f32([0, 1, 0, 0, 0, 0, 0, 0])], - ).fetchall() - assert len(rows) == 3 - # rowid 2 should still be closest (exact match) - assert rows[0][0] == 2 + with pytest.raises(sqlite3.OperationalError, match="UPDATE on vector column.*not supported for DiskANN"): + db.execute("UPDATE t SET emb = ? WHERE rowid = 1", [_f32([0, 0.9, 0.1, 0, 0, 0, 0, 0])]) # ====================================================================== diff --git a/tests/test-ivf-mutations.py b/tests/test-ivf-mutations.py index 5c61119..fce1832 100644 --- a/tests/test-ivf-mutations.py +++ b/tests/test-ivf-mutations.py @@ -573,3 +573,15 @@ def test_interleaved_ops_correctness(db): # Verify we get the right count (25 odd + 15 new - 10 deleted new = 30) expected_alive = set(range(1, 50, 2)) | set(range(50, 60)) | set(range(70, 75)) assert rowids.issubset(expected_alive) + + +def test_ivf_update_vector_blocked(db): + """UPDATE on a vector column with IVF index should error (index would become stale).""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(emb float[4] indexed by ivf(nlist=2))" + ) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([0, 1, 0, 0])]) + + with pytest.raises(sqlite3.OperationalError, match="UPDATE on vector column.*not supported for IVF"): + db.execute("UPDATE t SET emb = ? WHERE rowid = 1", [_f32([0, 0, 1, 0])]) From 82f4eb08bfe781b676041f6177af5a1fb2466ff7 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 14:31:49 -0700 Subject: [PATCH 19/38] Add NULL checks after sqlite3_column_blob in rescore and DiskANN sqlite3_column_blob() returns NULL for zero-length blobs or on OOM. Several call sites in rescore KNN and DiskANN node/vector read passed the result directly to memcpy without checking, risking NULL deref on corrupt or empty databases. IVF already had proper NULL checks. 
Adds corruption regression tests that truncate shadow table blobs and verify the query errors cleanly instead of crashing. Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec-diskann.c | 20 ++++++++++++++++---- sqlite-vec-rescore.c | 4 ++++ tests/test-diskann.py | 27 +++++++++++++++++++++++++++ tests/test-rescore.py | 29 +++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/sqlite-vec-diskann.c b/sqlite-vec-diskann.c index 1a5fd2b..7d4da6e 100644 --- a/sqlite-vec-diskann.c +++ b/sqlite-vec-diskann.c @@ -410,9 +410,18 @@ static int diskann_node_read(vec0_vtab *p, int vec_col_idx, i64 rowid, return SQLITE_NOMEM; } - memcpy(v, sqlite3_column_blob(stmt, 0), vs); - memcpy(ids, sqlite3_column_blob(stmt, 1), is); - memcpy(qv, sqlite3_column_blob(stmt, 2), qs); + const void *blobV = sqlite3_column_blob(stmt, 0); + const void *blobIds = sqlite3_column_blob(stmt, 1); + const void *blobQv = sqlite3_column_blob(stmt, 2); + if (!blobV || !blobIds || !blobQv) { + sqlite3_free(v); + sqlite3_free(ids); + sqlite3_free(qv); + return SQLITE_ERROR; + } + memcpy(v, blobV, vs); + memcpy(ids, blobIds, is); + memcpy(qv, blobQv, qs); *outValidity = v; *outValiditySize = vs; *outNeighborIds = ids; *outNeighborIdsSize = is; @@ -480,9 +489,11 @@ static int diskann_vector_read(vec0_vtab *p, int vec_col_idx, i64 rowid, } int sz = sqlite3_column_bytes(stmt, 0); + const void *blob = sqlite3_column_blob(stmt, 0); + if (!blob || sz == 0) return SQLITE_ERROR; void *vec = sqlite3_malloc(sz); if (!vec) return SQLITE_NOMEM; - memcpy(vec, sqlite3_column_blob(stmt, 0), sz); + memcpy(vec, blob, sz); *outVector = vec; *outVectorSize = sz; @@ -1325,6 +1336,7 @@ static int diskann_flush_buffer(vec0_vtab *p, int vec_col_idx) { while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) { i64 rowid = sqlite3_column_int64(stmt, 0); const void *vector = sqlite3_column_blob(stmt, 1); + if (!vector) continue; // Note: vector is already written to _vectors table, so // 
diskann_insert_graph will skip re-writing it (vector already exists). // We call the graph-only insert path. diff --git a/sqlite-vec-rescore.c b/sqlite-vec-rescore.c index ef4e692..ef0a35c 100644 --- a/sqlite-vec-rescore.c +++ b/sqlite-vec-rescore.c @@ -426,6 +426,10 @@ static int rescore_knn(vec0_vtab *p, vec0_cursor *pCur, unsigned char *chunkValidity = (unsigned char *)sqlite3_column_blob(stmtChunks, 1); i64 *chunkRowids = (i64 *)sqlite3_column_blob(stmtChunks, 2); + if (!chunkValidity || !chunkRowids) { + rc = SQLITE_ERROR; + goto cleanup; + } memset(chunk_distances, 0, p->chunk_size * sizeof(f32)); memset(chunk_topk_idxs, 0, k_oversample * sizeof(i32)); diff --git a/tests/test-diskann.py b/tests/test-diskann.py index 4fad96b..d71769c 100644 --- a/tests/test-diskann.py +++ b/tests/test-diskann.py @@ -1149,3 +1149,30 @@ def test_diskann_large_batch_insert_500(db): distances = [r[1] for r in rows] for i in range(len(distances) - 1): assert distances[i] <= distances[i + 1] + + +def test_corrupt_truncated_node_blob(db): + """KNN should error (not crash) when DiskANN node blob is truncated.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(5): + vec = [0.0] * 8 + vec[i % 8] = 1.0 + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i + 1, _f32(vec)]) + + # Corrupt a DiskANN node: truncate neighbor_ids to 1 byte (wrong size) + db.execute( + "UPDATE t_diskann_nodes00 SET neighbor_ids = x'00' WHERE rowid = 1" + ) + + # Should not crash — may return wrong results or error + try: + db.execute( + "SELECT rowid FROM t WHERE emb MATCH ? 
AND k=3", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + except sqlite3.OperationalError: + pass # Error is acceptable — crash is not diff --git a/tests/test-rescore.py b/tests/test-rescore.py index 5025857..1dc6cd7 100644 --- a/tests/test-rescore.py +++ b/tests/test-rescore.py @@ -566,3 +566,32 @@ def test_multiple_vector_columns(db): [float_vec([1.0] * 8)], ).fetchall() assert rows[0]["rowid"] == 2 + + +def test_corrupt_zeroblob_validity(db): + """KNN should error (not crash) when rescore chunk rowids blob is zeroed out.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (1, ?)", + [float_vec([1, 0, 0, 0, 0, 0, 0, 0])], + ) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (2, ?)", + [float_vec([0, 1, 0, 0, 0, 0, 0, 0])], + ) + + # Corrupt: replace rowids with a truncated blob (wrong size) + db.execute("UPDATE t_chunks SET rowids = x'00'") + + # Should not crash — may return wrong results or error + try: + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 1", + [float_vec([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + except sqlite3.OperationalError: + pass # Error is acceptable — crash is not From 5e4c557f931a5103e21ba6a29197a9e475927790 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 14:35:55 -0700 Subject: [PATCH 20/38] Initialize rescore distance variable to FLT_MAX The `dist` variable in rescore KNN quantized distance computation was uninitialized. If the switch on quantizer_type or distance_metric didn't match any case, the uninitialized value would propagate into the top-k heap, potentially returning garbage results. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec-rescore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlite-vec-rescore.c b/sqlite-vec-rescore.c index ef0a35c..1cf67bf 100644 --- a/sqlite-vec-rescore.c +++ b/sqlite-vec-rescore.c @@ -465,7 +465,7 @@ static int rescore_knn(vec0_vtab *p, vec0_cursor *pCur, for (int j = 0; j < p->chunk_size; j++) { if (!bitmap_get(b, j)) continue; - f32 dist; + f32 dist = FLT_MAX; switch (vector_column->rescore.quantizer_type) { case VEC0_RESCORE_QUANTIZER_BIT: { const u8 *base_j = ((u8 *)baseVectors) + (j * (qdim / CHAR_BIT)); From 4bee88384bf6339d4794da1e6644aaca29a5678f Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 14:51:27 -0700 Subject: [PATCH 21/38] Reject IVF binary quantizer when dimensions not divisible by 8 The binary quantizer uses D/8 for buffer sizes and memset, which truncates for non-multiple-of-8 dimensions, causing OOB writes. Rather than using ceiling division, enforce the constraint at table creation time with a clear parse error. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec.c | 3 +++ tests/test-ivf-quantization.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/sqlite-vec.c b/sqlite-vec.c index 53c4635..d12e25d 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -3074,6 +3074,9 @@ int vec0_parse_vector_column(const char *source, int source_length, if (rc != SQLITE_OK) { return SQLITE_ERROR; } + if (ivfConfig.quantizer == VEC0_IVF_QUANTIZER_BINARY && (dimensions % 8) != 0) { + return SQLITE_ERROR; + } #else return SQLITE_ERROR; // IVF not compiled in #endif diff --git a/tests/test-ivf-quantization.py b/tests/test-ivf-quantization.py index 9790680..b4d6ae3 100644 --- a/tests/test-ivf-quantization.py +++ b/tests/test-ivf-quantization.py @@ -253,3 +253,20 @@ def test_ivf_quantized_delete(db): db.execute("DELETE FROM t WHERE rowid = 5") # _ivf_vectors should have 9 rows assert db.execute("SELECT count(*) FROM t_ivf_vectors00").fetchone()[0] == 9 + + +def test_ivf_binary_rejects_non_multiple_of_8_dims(db): + """Binary quantizer requires dimensions divisible by 8.""" + with pytest.raises(sqlite3.OperationalError): + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " v float[12] indexed by ivf(quantizer=binary)" + ")" + ) + + # Dimensions divisible by 8 should work + db.execute( + "CREATE VIRTUAL TABLE t2 USING vec0(" + " v float[16] indexed by ivf(quantizer=binary)" + ")" + ) From 7de925be70b9b2c3cc6e17ffa65eb5e688f12e67 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 14:55:37 -0700 Subject: [PATCH 22/38] Fix int16 overflow in l2_sqr_int8_neon SIMD distance vmulq_s16(diff, diff) produced int16 results, but diff can be up to 255 for int8 vectors (-128 vs 127), and 255^2 = 65025 overflows int16 (max 32767). This caused NaN/wrong results for int8 vectors with large differences. Fix: use vmull_s16 (widening multiply) to produce int32 results directly, avoiding the intermediate int16 overflow. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec.c | 11 +++++++---- tests/test-loadable.py | 8 +++++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/sqlite-vec.c b/sqlite-vec.c index d12e25d..5379f29 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -258,13 +258,16 @@ static f32 l2_sqr_int8_neon(const void *pVect1v, const void *pVect2v, pVect1 += 8; pVect2 += 8; - // widen to protect against overflow + // widen i8 to i16 for subtraction int16x8_t v1_wide = vmovl_s8(v1); int16x8_t v2_wide = vmovl_s8(v2); - int16x8_t diff = vsubq_s16(v1_wide, v2_wide); - int16x8_t squared_diff = vmulq_s16(diff, diff); - int32x4_t sum = vpaddlq_s16(squared_diff); + + // widening multiply: i16*i16 -> i32 to avoid i16 overflow + // (diff can be up to 255, so diff*diff can be up to 65025 > INT16_MAX) + int32x4_t sq_lo = vmull_s16(vget_low_s16(diff), vget_low_s16(diff)); + int32x4_t sq_hi = vmull_s16(vget_high_s16(diff), vget_high_s16(diff)); + int32x4_t sum = vaddq_s32(sq_lo, sq_hi); sum_scalar += vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + vgetq_lane_s32(sum, 2) + vgetq_lane_s32(sum, 3); diff --git a/tests/test-loadable.py b/tests/test-loadable.py index 40c6a5e..1ac0cf3 100644 --- a/tests/test-loadable.py +++ b/tests/test-loadable.py @@ -381,11 +381,17 @@ def test_vec_distance_l2(): x = vec_distance_l2(a_sql_t, b_sql_t, a=transform, b=transform) y = npy_l2(np.array(a), np.array(b)) - assert isclose(x, y, abs_tol=1e-6) + assert isclose(x, y, rel_tol=1e-5, abs_tol=1e-6) check([1.2, 0.1], [0.4, -0.4]) check([-1.2, -0.1], [-0.4, 0.4]) check([1, 2, 3], [-9, -8, -7], dtype=np.int8) + # Extreme int8 values: diff=255, squared=65025 which overflows i16 + # This tests the NEON widening multiply fix (slight float rounding expected) + check([-128] * 8, [127] * 8, dtype=np.int8) + check([-128] * 16, [127] * 16, dtype=np.int8) + check([-128, 127, -128, 127, -128, 127, -128, 127], + [127, -128, 127, -128, 127, -128, 127, -128], dtype=np.int8) def test_vec_length(): 
From 2f4c2e4bdb9a0ef78ee2950ff746cdd45df3c2b1 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 14:57:01 -0700 Subject: [PATCH 23/38] Fix alignment UB in distance_hamming_u64 Casting unaligned blob pointers to u64* is undefined behavior on strict-alignment architectures. Use memcpy to safely load u64 values from potentially unaligned memory (compilers optimize this to native loads on architectures that support unaligned access). Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sqlite-vec.c b/sqlite-vec.c index 5379f29..f8ab4f9 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -734,10 +734,13 @@ static unsigned int __builtin_popcountl(unsigned int x) { #endif #endif -static f32 distance_hamming_u64(u64 *a, u64 *b, size_t n) { +static f32 distance_hamming_u64(const u8 *a, const u8 *b, size_t n) { int same = 0; for (unsigned long i = 0; i < n; i++) { - same += __builtin_popcountl(a[i] ^ b[i]); + u64 va, vb; + memcpy(&va, a + i * sizeof(u64), sizeof(u64)); + memcpy(&vb, b + i * sizeof(u64), sizeof(u64)); + same += __builtin_popcountl(va ^ vb); } return (f32)same; } @@ -761,7 +764,7 @@ static f32 distance_hamming(const void *a, const void *b, const void *d) { #endif if ((dimensions % 64) == 0) { - return distance_hamming_u64((u64 *)a, (u64 *)b, n_bytes / sizeof(u64)); + return distance_hamming_u64((const u8 *)a, (const u8 *)b, n_bytes / sizeof(u64)); } return distance_hamming_u8((u8 *)a, (u8 *)b, n_bytes); } From b00865429b519ba6de0a2e094087052870f3d037 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 17:13:29 -0700 Subject: [PATCH 24/38] Filter deleted nodes from DiskANN search results and add delete tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DiskANN's delete repair only fixes forward edges (nodes the deleted node pointed to). Stale reverse edges can cause deleted rowids to appear in search results. 
Fix: track a 'confirmed' flag on each search candidate, set when the full-precision vector is successfully read during re-ranking. Only confirmed candidates are included in output. Zero additional SQL queries — piggybacks on the existing re-rank vector read. Also adds delete hardening tests: - Rescore: interleaved delete+KNN, rowid_in after deletes, full delete+reinsert cycle - DiskANN: delete+reinsert cycles with KNN verification, interleaved delete+KNN Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec-diskann.c | 27 ++++++--- sqlite-vec.c | 3 +- tests/test-diskann.py | 70 +++++++++++++++++++++++ tests/test-rescore-mutations.py | 98 +++++++++++++++++++++++++++++++++ 4 files changed, 190 insertions(+), 8 deletions(-) diff --git a/sqlite-vec-diskann.c b/sqlite-vec-diskann.c index 7d4da6e..ab9db6a 100644 --- a/sqlite-vec-diskann.c +++ b/sqlite-vec-diskann.c @@ -608,6 +608,7 @@ static int diskann_candidate_list_insert( list->items[lo].rowid = rowid; list->items[lo].distance = distance; list->items[lo].visited = 0; + list->items[lo].confirmed = 0; list->count++; return 1; } @@ -741,8 +742,9 @@ static int diskann_search( return rc; } - // Seed with medoid + // Seed with medoid (confirmed — we already read its vector above) diskann_candidate_list_insert(&candidates, medoid, medoidDist); + candidates.items[0].confirmed = 1; // Pre-quantize query vector once for all quantized distance comparisons u8 *queryQuantized = NULL; @@ -815,16 +817,27 @@ static int diskann_search( sqlite3_free(fullVec); // Update distance in candidate list and re-sort diskann_candidate_list_insert(&candidates, currentRowid, exactDist); + // Mark as confirmed (vector exists, distance is exact) + for (int ci = 0; ci < candidates.count; ci++) { + if (candidates.items[ci].rowid == currentRowid) { + candidates.items[ci].confirmed = 1; + break; + } + } } + // If vector read failed, candidate stays unconfirmed (stale edge to deleted node) } - // 5. 
Output results (candidates are already sorted by distance) - int resultCount = (candidates.count < k) ? candidates.count : k; - *outCount = resultCount; - for (int i = 0; i < resultCount; i++) { - outRowids[i] = candidates.items[i].rowid; - outDistances[i] = candidates.items[i].distance; + // 5. Output results — only include confirmed candidates (whose vectors exist) + int resultCount = 0; + for (int i = 0; i < candidates.count && resultCount < k; i++) { + if (candidates.items[i].confirmed) { + outRowids[resultCount] = candidates.items[i].rowid; + outDistances[resultCount] = candidates.items[i].distance; + resultCount++; + } } + *outCount = resultCount; sqlite3_free(queryQuantized); diskann_candidate_list_free(&candidates); diff --git a/sqlite-vec.c b/sqlite-vec.c index f8ab4f9..cb597dd 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -2586,7 +2586,8 @@ struct Vec0DiskannConfig { struct Vec0DiskannCandidate { i64 rowid; f32 distance; - int visited; // 1 if this candidate's neighbors have been explored + int visited; // 1 if this candidate's neighbors have been explored + int confirmed; // 1 if full-precision vector was successfully read (node exists) }; /** diff --git a/tests/test-diskann.py b/tests/test-diskann.py index d71769c..f2a56a1 100644 --- a/tests/test-diskann.py +++ b/tests/test-diskann.py @@ -1176,3 +1176,73 @@ def test_corrupt_truncated_node_blob(db): ).fetchall() except sqlite3.OperationalError: pass # Error is acceptable — crash is not + + +def test_diskann_delete_reinsert_cycle_knn(db): + """Repeatedly delete and reinsert rows, verify KNN stays correct.""" + import random + random.seed(101) + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + N = 30 + vecs = {} + for i in range(1, N + 1): + v = [random.gauss(0, 1) for _ in range(8)] + vecs[i] = v + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(v)]) + + # 3 cycles: delete half, reinsert with new 
vectors, verify KNN + for cycle in range(3): + to_delete = random.sample(sorted(vecs.keys()), len(vecs) // 2) + for r in to_delete: + db.execute("DELETE FROM t WHERE rowid = ?", [r]) + del vecs[r] + + # Reinsert with new rowids + new_start = 100 + cycle * 50 + for i in range(len(to_delete)): + rid = new_start + i + v = [random.gauss(0, 1) for _ in range(8)] + vecs[rid] = v + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [rid, _f32(v)]) + + # KNN should return only alive rows + query = [0.0] * 8 + rows = db.execute( + "SELECT rowid FROM t WHERE emb MATCH ? AND k=10", + [_f32(query)], + ).fetchall() + returned = {r["rowid"] for r in rows} + assert returned.issubset(set(vecs.keys())), \ + f"Cycle {cycle}: deleted rowid in KNN results" + assert len(rows) >= 1 + + +def test_diskann_delete_interleaved_with_knn(db): + """Delete one row at a time, querying KNN after each delete.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + N = 20 + for i in range(1, N + 1): + vec = [0.0] * 8 + vec[i % 8] = float(i) + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + alive = set(range(1, N + 1)) + for to_del in [1, 5, 10, 15, 20]: + db.execute("DELETE FROM t WHERE rowid = ?", [to_del]) + alive.discard(to_del) + + rows = db.execute( + "SELECT rowid FROM t WHERE emb MATCH ? 
AND k=5", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + returned = {r["rowid"] for r in rows} + assert returned.issubset(alive), \ + f"Deleted rowid {to_del} found in KNN results" diff --git a/tests/test-rescore-mutations.py b/tests/test-rescore-mutations.py index 28495c2..dbb802a 100644 --- a/tests/test-rescore-mutations.py +++ b/tests/test-rescore-mutations.py @@ -443,6 +443,104 @@ def test_insert_batch_recall(db): # ============================================================================ +def test_delete_interleaved_with_knn(db): + """Delete rows one at a time, running KNN after each delete to verify correctness.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + N = 30 + random.seed(42) + vecs = {i: [random.gauss(0, 1) for _ in range(8)] for i in range(1, N + 1)} + for rowid, vec in vecs.items(): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [rowid, float_vec(vec)], + ) + + alive = set(vecs.keys()) + query = [0.0] * 8 + + for to_del in [5, 10, 15, 20, 25]: + db.execute("DELETE FROM t WHERE rowid = ?", [to_del]) + alive.discard(to_del) + + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? 
ORDER BY distance LIMIT 10", + [float_vec(query)], + ).fetchall() + returned = {r["rowid"] for r in rows} + # All returned rows must be alive (not deleted) + assert returned.issubset(alive), f"Deleted rowid found in KNN after deleting {to_del}" + # Count should match alive set (up to k) + assert len(rows) == min(10, len(alive)) + + +def test_delete_with_rowid_in_constraint(db): + """Delete rows and verify KNN with rowid_in filter excludes deleted rows.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=int8)" + ")" + ) + for i in range(1, 11): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i, float_vec([float(i)] * 8)], + ) + + # Delete rows 3, 5, 7 + for r in [3, 5, 7]: + db.execute("DELETE FROM t WHERE rowid = ?", [r]) + + # KNN with rowid IN (1,2,3,4,5) — should only return 1, 2, 4 (3 and 5 deleted) + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? AND k = 5 AND rowid IN (1, 2, 3, 4, 5)", + [float_vec([1.0] * 8)], + ).fetchall() + returned = {r["rowid"] for r in rows} + assert 3 not in returned + assert 5 not in returned + assert returned.issubset({1, 2, 4}) + + +def test_delete_all_then_reinsert_batch(db): + """Delete all rows, reinsert a new batch, verify KNN only returns new rows.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[8] indexed by rescore(quantizer=bit)" + ")" + ) + # First batch + for i in range(1, 21): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i, float_vec([float(i)] * 8)], + ) + + # Delete all + for i in range(1, 21): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + assert db.execute("SELECT count(*) FROM t").fetchone()[0] == 0 + + # Second batch with different rowids and vectors + for i in range(100, 110): + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i, float_vec([float(i - 100)] * 8)], + ) + + rows = db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? 
ORDER BY distance LIMIT 5", + [float_vec([0.0] * 8)], + ).fetchall() + returned = {r["rowid"] for r in rows} + # All returned rowids should be from the second batch + assert returned.issubset(set(range(100, 110))) + + def test_knn_int8_cosine(db): """Rescore with quantizer=int8 and distance_metric=cosine.""" db.execute( From d033bf57283c6d389742c6806da3b662ea3ce6d5 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 17:13:40 -0700 Subject: [PATCH 25/38] Add delete recall benchmark suite New benchmarks-ann/bench-delete/ directory measures KNN recall degradation after random row deletion across index types (flat, rescore, IVF, DiskANN). For each config and delete percentage: builds index, measures baseline recall, copies DB, deletes random rows, measures post-delete recall, VACUUMs and records size savings. Includes Makefile targets, self-contained smoke test with synthetic data, and results DB for analysis. Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks-ann/bench-delete/.gitignore | 3 + benchmarks-ann/bench-delete/Makefile | 41 ++ benchmarks-ann/bench-delete/README.md | 69 +++ benchmarks-ann/bench-delete/bench_delete.py | 593 ++++++++++++++++++++ benchmarks-ann/bench-delete/test_smoke.py | 124 ++++ 5 files changed, 830 insertions(+) create mode 100644 benchmarks-ann/bench-delete/.gitignore create mode 100644 benchmarks-ann/bench-delete/Makefile create mode 100644 benchmarks-ann/bench-delete/README.md create mode 100644 benchmarks-ann/bench-delete/bench_delete.py create mode 100644 benchmarks-ann/bench-delete/test_smoke.py diff --git a/benchmarks-ann/bench-delete/.gitignore b/benchmarks-ann/bench-delete/.gitignore new file mode 100644 index 0000000..0184df8 --- /dev/null +++ b/benchmarks-ann/bench-delete/.gitignore @@ -0,0 +1,3 @@ +runs/ +*.db +__pycache__/ diff --git a/benchmarks-ann/bench-delete/Makefile b/benchmarks-ann/bench-delete/Makefile new file mode 100644 index 0000000..681847b --- /dev/null +++ 
b/benchmarks-ann/bench-delete/Makefile @@ -0,0 +1,41 @@ +BENCH = python bench_delete.py +EXT = ../../dist/vec0 + +# --- Configs to test --- +FLAT = "flat:type=vec0-flat,variant=float" +RESCORE_BIT = "rescore-bit:type=rescore,quantizer=bit,oversample=8" +RESCORE_INT8 = "rescore-int8:type=rescore,quantizer=int8,oversample=8" +DISKANN_R48 = "diskann-R48:type=diskann,R=48,L=128,quantizer=binary" +DISKANN_R72 = "diskann-R72:type=diskann,R=72,L=128,quantizer=binary" + +ALL_CONFIGS = $(FLAT) $(RESCORE_BIT) $(RESCORE_INT8) $(DISKANN_R48) $(DISKANN_R72) + +DELETE_PCTS = 5,10,25,50,75,90 + +.PHONY: smoke bench-10k bench-50k bench-all report clean + +# Quick smoke test (small dataset, few queries) +smoke: + $(BENCH) --subset-size 5000 --delete-pct 10,50 -k 10 -n 20 \ + --dataset cohere1m --ext $(EXT) \ + $(FLAT) $(DISKANN_R48) + +# Standard benchmarks +bench-10k: + $(BENCH) --subset-size 10000 --delete-pct $(DELETE_PCTS) -k 10 -n 50 \ + --dataset cohere1m --ext $(EXT) $(ALL_CONFIGS) + +bench-50k: + $(BENCH) --subset-size 50000 --delete-pct $(DELETE_PCTS) -k 10 -n 50 \ + --dataset cohere1m --ext $(EXT) $(ALL_CONFIGS) + +bench-all: bench-10k bench-50k + +# Query saved results +report: + @echo "Query results:" + @echo " sqlite3 runs/cohere1m/10000/delete_results.db \\" + @echo " \"SELECT config_name, delete_pct, recall, query_mean_ms, vacuum_size_mb FROM delete_runs ORDER BY config_name, delete_pct\"" + +clean: + rm -rf runs/ diff --git a/benchmarks-ann/bench-delete/README.md b/benchmarks-ann/bench-delete/README.md new file mode 100644 index 0000000..8155566 --- /dev/null +++ b/benchmarks-ann/bench-delete/README.md @@ -0,0 +1,69 @@ +# bench-delete: Recall degradation after random deletion + +Measures how KNN recall changes after deleting a random percentage of rows +from different index types (flat, rescore, DiskANN). + +## Quick start + +```bash +# Ensure dataset exists +make -C ../datasets/cohere1m + +# Ensure extension is built +make -C ../.. 
loadable + +# Quick smoke test +make smoke + +# Full benchmark at 10k vectors +make bench-10k +``` + +## Usage + +```bash +python bench_delete.py --subset-size 10000 --delete-pct 10,25,50,75 \ + "flat:type=vec0-flat,variant=float" \ + "diskann-R72:type=diskann,R=72,L=128,quantizer=binary" \ + "rescore-bit:type=rescore,quantizer=bit,oversample=8" +``` + +## What it measures + +For each config and delete percentage: + +| Metric | Description | +|--------|-------------| +| **recall** | KNN recall@k after deletion (ground truth recomputed over surviving rows) | +| **delta** | Recall change vs 0% baseline | +| **query latency** | Mean/median query time after deletion | +| **db_size_mb** | DB file size before VACUUM | +| **vacuum_size_mb** | DB file size after VACUUM (space reclaimed) | +| **delete_time_s** | Wall time for the DELETE operations | + +## How it works + +1. Build index with N vectors (one copy per config) +2. Measure recall at k=10 (pre-delete baseline) +3. For each delete %: + - Copy the master DB + - Delete a random selection of rows (deterministic seed) + - Measure recall (ground truth recomputed over surviving rows only) + - VACUUM and measure size savings +4. 
Print comparison table + +## Expected behavior + +- **Flat index**: Recall should be 1.0 at all delete percentages (brute-force is always exact) +- **Rescore**: Recall should stay close to baseline (quantized scan + rescore is robust) +- **DiskANN**: Recall may degrade at high delete % due to graph fragmentation (dangling edges, broken connectivity) + +## Results DB + +Results are stored in `runs/<dataset>/<subset_size>/delete_results.db`: + +```sql +SELECT config_name, delete_pct, recall, vacuum_size_mb +FROM delete_runs +ORDER BY config_name, delete_pct; +``` diff --git a/benchmarks-ann/bench-delete/bench_delete.py b/benchmarks-ann/bench-delete/bench_delete.py new file mode 100644 index 0000000..802f0a4 --- /dev/null +++ b/benchmarks-ann/bench-delete/bench_delete.py @@ -0,0 +1,593 @@ +#!/usr/bin/env python3 +"""Benchmark: measure recall degradation after random row deletion. + +Given a dataset and index config, this script: + 1. Builds the index (flat + ANN) + 2. Measures recall at k=10 (pre-delete baseline) + 3. Deletes a random % of rows + 4. Measures recall again (post-delete) + 5. 
Records DB size before/after deletion, recall delta, timings + +Usage: + python bench_delete.py --subset-size 10000 --delete-pct 25 \ + "diskann-R48:type=diskann,R=48,L=128,quantizer=binary" + + # Multiple delete percentages in one run: + python bench_delete.py --subset-size 10000 --delete-pct 10,25,50,75 \ + "diskann-R48:type=diskann,R=48,L=128,quantizer=binary" +""" +import argparse +import json +import os +import random +import shutil +import sqlite3 +import statistics +import struct +import time + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_BENCH_DIR = os.path.join(_SCRIPT_DIR, "..") +_ROOT_DIR = os.path.join(_BENCH_DIR, "..") + +EXT_PATH = os.path.join(_ROOT_DIR, "dist", "vec0") +DATASETS_DIR = os.path.join(_BENCH_DIR, "datasets") + +DATASETS = { + "cohere1m": {"base_db": os.path.join(DATASETS_DIR, "cohere1m", "base.db"), "dimensions": 768}, + "cohere10m": {"base_db": os.path.join(DATASETS_DIR, "cohere10m", "base.db"), "dimensions": 768}, + "nyt": {"base_db": os.path.join(DATASETS_DIR, "nyt", "base.db"), "dimensions": 256}, + "nyt-768": {"base_db": os.path.join(DATASETS_DIR, "nyt-768", "base.db"), "dimensions": 768}, + "nyt-1024": {"base_db": os.path.join(DATASETS_DIR, "nyt-1024", "base.db"), "dimensions": 1024}, + "nyt-384": {"base_db": os.path.join(DATASETS_DIR, "nyt-384", "base.db"), "dimensions": 384}, +} + +INSERT_BATCH_SIZE = 1000 + + +# ============================================================================ +# Timing helpers +# ============================================================================ + +def now_ns(): + return time.time_ns() + +def ns_to_s(ns): + return ns / 1_000_000_000 + +def ns_to_ms(ns): + return ns / 1_000_000 + + +# ============================================================================ +# Index registry (subset of bench.py — only types relevant to deletion) +# ============================================================================ + +def _vec0_flat_create(p): + dims = p["dimensions"] + variant = 
p.get("variant", "float") + col = f"embedding float[{dims}]" + if variant == "int8": + col = f"embedding int8[{dims}]" + elif variant == "bit": + col = f"embedding bit[{dims}]" + return f"CREATE VIRTUAL TABLE vec_items USING vec0(id INTEGER PRIMARY KEY, {col})" + +def _rescore_create(p): + dims = p["dimensions"] + q = p.get("quantizer", "bit") + os_val = p.get("oversample", 8) + return ( + f"CREATE VIRTUAL TABLE vec_items USING vec0(" + f"id INTEGER PRIMARY KEY, " + f"embedding float[{dims}] indexed by rescore(quantizer={q}, oversample={os_val}))" + ) + +def _diskann_create(p): + dims = p["dimensions"] + R = p.get("R", 72) + L = p.get("L", 128) + q = p.get("quantizer", "binary") + bt = p.get("buffer_threshold", 0) + sl_insert = p.get("search_list_size_insert", 0) + sl_search = p.get("search_list_size_search", 0) + parts = [ + f"neighbor_quantizer={q}", + f"n_neighbors={R}", + f"buffer_threshold={bt}", + ] + if sl_insert or sl_search: + # Per-path overrides — don't also set search_list_size + if sl_insert: + parts.append(f"search_list_size_insert={sl_insert}") + if sl_search: + parts.append(f"search_list_size_search={sl_search}") + else: + parts.append(f"search_list_size={L}") + opts = ", ".join(parts) + return ( + f"CREATE VIRTUAL TABLE vec_items USING vec0(" + f"id INTEGER PRIMARY KEY, " + f"embedding float[{dims}] indexed by diskann({opts}))" + ) + +def _ivf_create(p): + dims = p["dimensions"] + nlist = p.get("nlist", 128) + nprobe = p.get("nprobe", 16) + q = p.get("quantizer", "none") + os_val = p.get("oversample", 1) + parts = [f"nlist={nlist}", f"nprobe={nprobe}"] + if q != "none": + parts.append(f"quantizer={q}") + if os_val > 1: + parts.append(f"oversample={os_val}") + opts = ", ".join(parts) + return ( + f"CREATE VIRTUAL TABLE vec_items USING vec0(" + f"id INTEGER PRIMARY KEY, " + f"embedding float[{dims}] indexed by ivf({opts}))" + ) + + +INDEX_REGISTRY = { + "vec0-flat": { + "defaults": {"variant": "float"}, + "create_table_sql": _vec0_flat_create, + 
"post_insert_hook": None, + }, + "rescore": { + "defaults": {"quantizer": "bit", "oversample": 8}, + "create_table_sql": _rescore_create, + "post_insert_hook": None, + }, + "ivf": { + "defaults": {"nlist": 128, "nprobe": 16, "quantizer": "none", + "oversample": 1}, + "create_table_sql": _ivf_create, + "post_insert_hook": lambda conn, params: _ivf_train(conn), + }, + "diskann": { + "defaults": {"R": 72, "L": 128, "quantizer": "binary", + "buffer_threshold": 0}, + "create_table_sql": _diskann_create, + "post_insert_hook": None, + }, +} + + +def _ivf_train(conn): + """Trigger built-in k-means training for IVF.""" + t0 = now_ns() + conn.execute("INSERT INTO vec_items(id) VALUES ('compute-centroids')") + conn.commit() + return ns_to_s(now_ns() - t0) + + +# ============================================================================ +# Config parsing (same format as bench.py) +# ============================================================================ + +INT_KEYS = {"R", "L", "oversample", "nlist", "nprobe", "buffer_threshold", + "search_list_size_insert", "search_list_size_search"} + +def parse_config(spec): + if ":" not in spec: + raise ValueError(f"Config must be 'name:key=val,...': {spec}") + name, rest = spec.split(":", 1) + params = {} + for kv in rest.split(","): + k, v = kv.split("=", 1) + k = k.strip() + v = v.strip() + if k in INT_KEYS: + v = int(v) + params[k] = v + index_type = params.pop("type", None) + if not index_type or index_type not in INDEX_REGISTRY: + raise ValueError(f"Unknown index type: {index_type}") + params["index_type"] = index_type + merged = dict(INDEX_REGISTRY[index_type]["defaults"]) + merged.update(params) + return name, merged + + +# ============================================================================ +# DB helpers +# ============================================================================ + +def create_bench_db(db_path, ext_path, base_db, page_size=4096): + if os.path.exists(db_path): + os.remove(db_path) + conn = 
sqlite3.connect(db_path) + conn.execute(f"PRAGMA page_size={page_size}") + conn.execute("PRAGMA journal_mode=WAL") + conn.enable_load_extension(True) + conn.load_extension(ext_path) + conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + return conn + + +def load_query_vectors(base_db, n): + conn = sqlite3.connect(base_db) + rows = conn.execute( + "SELECT id, vector FROM query_vectors LIMIT ?", (n,) + ).fetchall() + conn.close() + return rows + + +def insert_loop(conn, subset_size, label, start_from=0): + insert_sql = ( + "INSERT INTO vec_items(id, embedding) " + "SELECT id, vector FROM base.train " + "WHERE id >= :lo AND id < :hi" + ) + total = 0 + for lo in range(start_from, subset_size, INSERT_BATCH_SIZE): + hi = min(lo + INSERT_BATCH_SIZE, subset_size) + conn.execute(insert_sql, {"lo": lo, "hi": hi}) + conn.commit() + total += hi - lo + if total % 5000 == 0 or total == subset_size - start_from: + print(f" [{label}] inserted {total + start_from}/{subset_size}", flush=True) + + +# ============================================================================ +# Recall measurement +# ============================================================================ + +def measure_recall(conn, base_db, query_vectors, subset_size, k, alive_ids=None): + """Measure KNN recall. 
If alive_ids is provided, ground truth is computed + only over those IDs (to match post-delete state).""" + recalls = [] + times_ms = [] + + for qid, query in query_vectors: + t0 = now_ns() + results = conn.execute( + "SELECT id, distance FROM vec_items " + "WHERE embedding MATCH :query AND k = :k", + {"query": query, "k": k}, + ).fetchall() + t1 = now_ns() + times_ms.append(ns_to_ms(t1 - t0)) + + result_ids = set(r[0] for r in results) + + # Ground truth: brute-force cosine over surviving rows + if alive_ids is not None: + # After deletion — compute GT only over alive IDs + # Use a temp table for the alive set for efficiency + gt_rows = conn.execute( + "SELECT id FROM (" + " SELECT id, vec_distance_l2(vector, :query) as dist " + " FROM base.train WHERE id < :n ORDER BY dist LIMIT :k2" + ")", + {"query": query, "k2": k * 5, "n": subset_size}, + ).fetchall() + # Filter to only alive IDs, take top k + gt_alive = [r[0] for r in gt_rows if r[0] in alive_ids][:k] + gt_ids = set(gt_alive) + else: + gt_rows = conn.execute( + "SELECT id FROM (" + " SELECT id, vec_distance_l2(vector, :query) as dist " + " FROM base.train WHERE id < :n ORDER BY dist LIMIT :k" + ")", + {"query": query, "k": k, "n": subset_size}, + ).fetchall() + gt_ids = set(r[0] for r in gt_rows) + + if gt_ids: + recalls.append(len(result_ids & gt_ids) / len(gt_ids)) + else: + recalls.append(0.0) + + return { + "recall": round(statistics.mean(recalls), 4) if recalls else 0.0, + "mean_ms": round(statistics.mean(times_ms), 2) if times_ms else 0.0, + "median_ms": round(statistics.median(times_ms), 2) if times_ms else 0.0, + } + + +# ============================================================================ +# Delete benchmark core +# ============================================================================ + +def run_delete_benchmark(name, params, base_db, ext_path, subset_size, dims, + delete_pcts, k, n_queries, out_dir, seed_val): + params["dimensions"] = dims + reg = INDEX_REGISTRY[params["index_type"]] 
+ create_sql = reg["create_table_sql"](params) + + results = [] + + # Build once, copy for each delete % + print(f"\n{'='*60}") + print(f"Config: {name} (type={params['index_type']})") + print(f"{'='*60}") + + os.makedirs(out_dir, exist_ok=True) + master_db_path = os.path.join(out_dir, f"{name}.{subset_size}.db") + print(f" Building index ({subset_size} vectors)...") + build_t0 = now_ns() + conn = create_bench_db(master_db_path, ext_path, base_db) + conn.execute(create_sql) + insert_loop(conn, subset_size, name) + hook = reg.get("post_insert_hook") + if hook: + print(f" Training...") + hook(conn, params) + conn.close() + build_time_s = ns_to_s(now_ns() - build_t0) + master_size = os.path.getsize(master_db_path) + print(f" Built in {build_time_s:.1f}s ({master_size / (1024*1024):.1f} MB)") + + # Load query vectors once + query_vectors = load_query_vectors(base_db, n_queries) + + # Measure pre-delete baseline on the master copy + print(f"\n --- 0% deleted (baseline) ---") + conn = sqlite3.connect(master_db_path) + conn.enable_load_extension(True) + conn.load_extension(ext_path) + conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + baseline = measure_recall(conn, base_db, query_vectors, subset_size, k) + conn.close() + print(f" recall={baseline['recall']:.4f} " + f"query={baseline['mean_ms']:.2f}ms") + + results.append({ + "name": name, + "index_type": params["index_type"], + "subset_size": subset_size, + "delete_pct": 0, + "n_deleted": 0, + "n_remaining": subset_size, + "recall": baseline["recall"], + "query_mean_ms": baseline["mean_ms"], + "query_median_ms": baseline["median_ms"], + "db_size_mb": round(master_size / (1024 * 1024), 2), + "build_time_s": round(build_time_s, 1), + "delete_time_s": 0.0, + "vacuum_size_mb": round(master_size / (1024 * 1024), 2), + }) + + # All IDs in the dataset + all_ids = list(range(subset_size)) + + for pct in sorted(delete_pcts): + n_delete = int(subset_size * pct / 100) + print(f"\n --- {pct}% deleted ({n_delete} rows) ---") + + 
# Copy master DB and work on the copy + copy_path = os.path.join(out_dir, f"{name}.{subset_size}.del{pct}.db") + shutil.copy2(master_db_path, copy_path) + # Also copy WAL/SHM if they exist + for suffix in ["-wal", "-shm"]: + src = master_db_path + suffix + if os.path.exists(src): + shutil.copy2(src, copy_path + suffix) + + conn = sqlite3.connect(copy_path) + conn.enable_load_extension(True) + conn.load_extension(ext_path) + conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + + # Pick random IDs to delete (deterministic per pct) + rng = random.Random(seed_val + pct) + to_delete = set(rng.sample(all_ids, n_delete)) + alive_ids = set(all_ids) - to_delete + + # Delete + delete_t0 = now_ns() + batch = [] + for i, rid in enumerate(to_delete): + batch.append(rid) + if len(batch) >= 500 or i == len(to_delete) - 1: + placeholders = ",".join("?" for _ in batch) + conn.execute( + f"DELETE FROM vec_items WHERE id IN ({placeholders})", + batch, + ) + conn.commit() + batch = [] + delete_time_s = ns_to_s(now_ns() - delete_t0) + + remaining = conn.execute("SELECT count(*) FROM vec_items").fetchone()[0] + pre_vacuum_size = os.path.getsize(copy_path) + print(f" deleted {n_delete} rows in {delete_time_s:.2f}s " + f"({remaining} remaining)") + + # Measure post-delete recall + post = measure_recall(conn, base_db, query_vectors, subset_size, k, + alive_ids=alive_ids) + print(f" recall={post['recall']:.4f} " + f"(delta={post['recall'] - baseline['recall']:+.4f}) " + f"query={post['mean_ms']:.2f}ms") + + # VACUUM and measure size savings — close fully, reopen without base + conn.close() + vconn = sqlite3.connect(copy_path) + vconn.execute("VACUUM") + vconn.close() + post_vacuum_size = os.path.getsize(copy_path) + saved_mb = (pre_vacuum_size - post_vacuum_size) / (1024 * 1024) + print(f" size: {pre_vacuum_size/(1024*1024):.1f} MB -> " + f"{post_vacuum_size/(1024*1024):.1f} MB after VACUUM " + f"(saved {saved_mb:.1f} MB)") + + results.append({ + "name": name, + "index_type": 
params["index_type"], + "subset_size": subset_size, + "delete_pct": pct, + "n_deleted": n_delete, + "n_remaining": remaining, + "recall": post["recall"], + "query_mean_ms": post["mean_ms"], + "query_median_ms": post["median_ms"], + "db_size_mb": round(pre_vacuum_size / (1024 * 1024), 2), + "build_time_s": round(build_time_s, 1), + "delete_time_s": round(delete_time_s, 2), + "vacuum_size_mb": round(post_vacuum_size / (1024 * 1024), 2), + }) + + return results + + +# ============================================================================ +# Results DB +# ============================================================================ + +RESULTS_SCHEMA = """\ +CREATE TABLE IF NOT EXISTS delete_runs ( + run_id INTEGER PRIMARY KEY, + config_name TEXT NOT NULL, + index_type TEXT NOT NULL, + params TEXT, + dataset TEXT NOT NULL, + subset_size INTEGER NOT NULL, + delete_pct INTEGER NOT NULL, + n_deleted INTEGER NOT NULL, + n_remaining INTEGER NOT NULL, + k INTEGER NOT NULL, + n_queries INTEGER NOT NULL, + seed INTEGER NOT NULL, + recall REAL, + query_mean_ms REAL, + query_median_ms REAL, + db_size_mb REAL, + vacuum_size_mb REAL, + build_time_s REAL, + delete_time_s REAL, + created_at TEXT DEFAULT (datetime('now')) +); +""" + +def save_results(results, out_dir, dataset, subset_size, params_json, k, n_queries, seed_val): + db_path = os.path.join(out_dir, "delete_results.db") + db = sqlite3.connect(db_path) + db.execute("PRAGMA journal_mode=WAL") + db.executescript(RESULTS_SCHEMA) + for r in results: + db.execute( + "INSERT INTO delete_runs " + "(config_name, index_type, params, dataset, subset_size, " + " delete_pct, n_deleted, n_remaining, k, n_queries, seed, " + " recall, query_mean_ms, query_median_ms, " + " db_size_mb, vacuum_size_mb, build_time_s, delete_time_s) " + "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", + ( + r["name"], r["index_type"], params_json, dataset, r["subset_size"], + r["delete_pct"], r["n_deleted"], r["n_remaining"], k, n_queries, seed_val, + 
r["recall"], r["query_mean_ms"], r["query_median_ms"], + r["db_size_mb"], r["vacuum_size_mb"], r["build_time_s"], r["delete_time_s"], + ), + ) + db.commit() + db.close() + return db_path + + +# ============================================================================ +# Reporting +# ============================================================================ + +def print_report(all_results): + print(f"\n{'name':>22} {'del%':>5} {'deleted':>8} {'remain':>8} " + f"{'recall':>7} {'delta':>7} {'qry(ms)':>8} " + f"{'size(MB)':>9} {'vacuumed':>9} {'del(s)':>7}") + print("-" * 110) + + # Group by config name + configs = {} + for r in all_results: + configs.setdefault(r["name"], []).append(r) + + for name, rows in configs.items(): + baseline_recall = rows[0]["recall"] # 0% delete is always first + for r in rows: + delta = r["recall"] - baseline_recall + delta_str = f"{delta:+.4f}" if r["delete_pct"] > 0 else "-" + print( + f"{r['name']:>22} {r['delete_pct']:>4}% " + f"{r['n_deleted']:>8} {r['n_remaining']:>8} " + f"{r['recall']:>7.4f} {delta_str:>7} {r['query_mean_ms']:>8.2f} " + f"{r['db_size_mb']:>9.1f} {r['vacuum_size_mb']:>9.1f} " + f"{r['delete_time_s']:>7.2f}" + ) + print() + + +# ============================================================================ +# Main +# ============================================================================ + +def main(): + parser = argparse.ArgumentParser( + description="Benchmark recall degradation after random row deletion", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("configs", nargs="+", + help="config specs (name:type=X,key=val,...)") + parser.add_argument("--subset-size", type=int, default=10000, + help="number of vectors to build (default: 10000)") + parser.add_argument("--delete-pct", type=str, default="10,25,50", + help="comma-separated delete percentages (default: 10,25,50)") + parser.add_argument("-k", type=int, default=10, help="KNN k (default 10)") + 
parser.add_argument("-n", type=int, default=50, + help="number of queries (default 50)") + parser.add_argument("--dataset", default="cohere1m", + choices=list(DATASETS.keys())) + parser.add_argument("--ext", default=EXT_PATH) + parser.add_argument("-o", "--out-dir", + default=os.path.join(_SCRIPT_DIR, "runs")) + parser.add_argument("--seed", type=int, default=42, + help="random seed for delete selection (default: 42)") + args = parser.parse_args() + + ds = DATASETS[args.dataset] + base_db = ds["base_db"] + dims = ds["dimensions"] + if not os.path.exists(base_db): + print(f"Error: dataset not found at {base_db}") + print(f"Run: make -C {os.path.dirname(base_db)}") + return 1 + + delete_pcts = [int(x.strip()) for x in args.delete_pct.split(",")] + for p in delete_pcts: + if not 0 < p < 100: + print(f"Error: delete percentage must be 1-99, got {p}") + return 1 + + out_dir = os.path.join(args.out_dir, args.dataset, str(args.subset_size)) + os.makedirs(out_dir, exist_ok=True) + + all_results = [] + for spec in args.configs: + name, params = parse_config(spec) + params_json = json.dumps(params) + results = run_delete_benchmark( + name, params, base_db, args.ext, args.subset_size, dims, + delete_pcts, args.k, args.n, out_dir, args.seed, + ) + all_results.extend(results) + + save_results(results, out_dir, args.dataset, args.subset_size, + params_json, args.k, args.n, args.seed) + + print_report(all_results) + + results_path = os.path.join(out_dir, "delete_results.db") + print(f"\nResults saved to: {results_path}") + print(f"Query: sqlite3 {results_path} " + f"\"SELECT config_name, delete_pct, recall, vacuum_size_mb " + f"FROM delete_runs ORDER BY config_name, delete_pct\"") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks-ann/bench-delete/test_smoke.py b/benchmarks-ann/bench-delete/test_smoke.py new file mode 100644 index 0000000..0caba19 --- /dev/null +++ b/benchmarks-ann/bench-delete/test_smoke.py @@ -0,0 +1,124 @@ 
+#!/usr/bin/env python3 +"""Quick self-contained smoke test using a synthetic dataset. +Creates a tiny base.db in a temp dir, runs the delete benchmark, verifies output. +""" +import os +import random +import sqlite3 +import struct +import sys +import tempfile + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_ROOT_DIR = os.path.join(_SCRIPT_DIR, "..", "..") +EXT_PATH = os.path.join(_ROOT_DIR, "dist", "vec0") + +DIMS = 8 +N_TRAIN = 200 +N_QUERIES = 10 +K_NEIGHBORS = 5 + + +def _f32(vals): + return struct.pack(f"{len(vals)}f", *vals) + + +def make_synthetic_base_db(path): + """Create a minimal base.db with train vectors and query vectors.""" + rng = random.Random(123) + db = sqlite3.connect(path) + db.execute("CREATE TABLE train(id INTEGER PRIMARY KEY, vector BLOB)") + db.execute("CREATE TABLE query_vectors(id INTEGER PRIMARY KEY, vector BLOB)") + + for i in range(N_TRAIN): + vec = [rng.gauss(0, 1) for _ in range(DIMS)] + db.execute("INSERT INTO train VALUES (?, ?)", (i, _f32(vec))) + + for i in range(N_QUERIES): + vec = [rng.gauss(0, 1) for _ in range(DIMS)] + db.execute("INSERT INTO query_vectors VALUES (?, ?)", (i, _f32(vec))) + + db.commit() + db.close() + + +def main(): + if not os.path.exists(EXT_PATH + ".dylib") and not os.path.exists(EXT_PATH + ".so"): + # Try bare path (sqlite handles extension) + pass + + with tempfile.TemporaryDirectory() as tmpdir: + base_db = os.path.join(tmpdir, "base.db") + make_synthetic_base_db(base_db) + + # Patch DATASETS to use our synthetic DB + import bench_delete + bench_delete.DATASETS["synthetic"] = { + "base_db": base_db, + "dimensions": DIMS, + } + + out_dir = os.path.join(tmpdir, "runs") + + # Test flat index + print("=== Testing flat index ===") + name, params = bench_delete.parse_config("flat:type=vec0-flat,variant=float") + params["dimensions"] = DIMS + results = bench_delete.run_delete_benchmark( + name, params, base_db, EXT_PATH, + subset_size=N_TRAIN, dims=DIMS, + delete_pcts=[25, 50], k=K_NEIGHBORS, 
n_queries=N_QUERIES, + out_dir=out_dir, seed_val=42, + ) + + bench_delete.print_report(results) + + # Flat recall should be 1.0 at all delete % + for r in results: + assert r["recall"] == 1.0, \ + f"Flat recall should be 1.0, got {r['recall']} at {r['delete_pct']}%" + print("\n PASS: flat recall is 1.0 at all delete percentages\n") + + # Test DiskANN + print("=== Testing DiskANN ===") + name2, params2 = bench_delete.parse_config( + "diskann:type=diskann,R=8,L=32,quantizer=binary" + ) + params2["dimensions"] = DIMS + results2 = bench_delete.run_delete_benchmark( + name2, params2, base_db, EXT_PATH, + subset_size=N_TRAIN, dims=DIMS, + delete_pcts=[25, 50], k=K_NEIGHBORS, n_queries=N_QUERIES, + out_dir=out_dir, seed_val=42, + ) + + bench_delete.print_report(results2) + + # DiskANN baseline (0%) should have decent recall + baseline = results2[0] + assert baseline["recall"] > 0.0, \ + f"DiskANN baseline recall is zero" + print(f" PASS: DiskANN baseline recall={baseline['recall']}") + + # Test rescore + print("\n=== Testing rescore ===") + name3, params3 = bench_delete.parse_config( + "rescore:type=rescore,quantizer=bit,oversample=4" + ) + params3["dimensions"] = DIMS + results3 = bench_delete.run_delete_benchmark( + name3, params3, base_db, EXT_PATH, + subset_size=N_TRAIN, dims=DIMS, + delete_pcts=[25, 50], k=K_NEIGHBORS, n_queries=N_QUERIES, + out_dir=out_dir, seed_val=42, + ) + + bench_delete.print_report(results3) + print(f" PASS: rescore baseline recall={results3[0]['recall']}") + + print("\n ALL SMOKE TESTS PASSED") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) From d684178a12385184fb1319d91cb9021a24679505 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 17:39:41 -0700 Subject: [PATCH 26/38] Add AVX2-optimized Hamming distance using VPSHUFB popcount Implements distance_hamming_avx2() which processes 32 bytes per iteration using the standard VPSHUFB nibble-lookup popcount pattern. 
Dispatched when SQLITE_VEC_ENABLE_AVX is defined and input >= 32 bytes. Falls back to u64 scalar or u8 byte-at-a-time for smaller inputs. Also adds -mavx2 flag to Makefile for x86-64 targets alongside existing -mavx. Co-Authored-By: Claude Opus 4.6 (1M context) --- Makefile | 4 ++-- sqlite-vec.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 89907fa..175ab16 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ endif ifndef OMIT_SIMD ifeq ($(shell uname -sm),Darwin x86_64) - CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX + CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX endif ifeq ($(shell uname -sm),Darwin arm64) CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON @@ -45,7 +45,7 @@ ifndef OMIT_SIMD ifeq ($(shell uname -s),Linux) ifeq ($(findstring android,$(CC)),) ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),) - CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX + CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX endif endif endif diff --git a/sqlite-vec.c b/sqlite-vec.c index cb597dd..f239d47 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -708,6 +708,58 @@ static f32 distance_hamming_neon(const u8 *a, const u8 *b, size_t n_bytes) { } #endif +#ifdef SQLITE_VEC_ENABLE_AVX +/** + * AVX2 Hamming distance using VPSHUFB-based popcount. + * Processes 32 bytes (256 bits) per iteration. 
+ */ +static f32 distance_hamming_avx2(const u8 *a, const u8 *b, size_t n_bytes) { + const u8 *pEnd = a + n_bytes; + + // VPSHUFB lookup table: popcount of low nibble + const __m256i lookup = _mm256_setr_epi8( + 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4, + 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4); + const __m256i low_mask = _mm256_set1_epi8(0x0f); + + __m256i acc = _mm256_setzero_si256(); + + while (a <= pEnd - 32) { + __m256i va = _mm256_loadu_si256((const __m256i *)a); + __m256i vb = _mm256_loadu_si256((const __m256i *)b); + __m256i xored = _mm256_xor_si256(va, vb); + + // VPSHUFB popcount: split into nibbles, lookup each + __m256i lo = _mm256_and_si256(xored, low_mask); + __m256i hi = _mm256_and_si256(_mm256_srli_epi16(xored, 4), low_mask); + __m256i popcnt = _mm256_add_epi8(_mm256_shuffle_epi8(lookup, lo), + _mm256_shuffle_epi8(lookup, hi)); + + // Horizontal sum: u8 -> u64 via sad against zero + acc = _mm256_add_epi64(acc, _mm256_sad_epu8(popcnt, _mm256_setzero_si256())); + a += 32; + b += 32; + } + + // Horizontal sum of 4 x u64 lanes + u64 tmp[4]; + _mm256_storeu_si256((__m256i *)tmp, acc); + u32 sum = (u32)(tmp[0] + tmp[1] + tmp[2] + tmp[3]); + + // Scalar tail + while (a < pEnd) { + u8 x = *a ^ *b; + x = x - ((x >> 1) & 0x55); + x = (x & 0x33) + ((x >> 2) & 0x33); + sum += (x + (x >> 4)) & 0x0F; + a++; + b++; + } + + return (f32)sum; +} +#endif + static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) { int same = 0; for (unsigned long i = 0; i < n; i++) { @@ -762,6 +814,11 @@ static f32 distance_hamming(const void *a, const void *b, const void *d) { return distance_hamming_neon((const u8 *)a, (const u8 *)b, n_bytes); } #endif +#ifdef SQLITE_VEC_ENABLE_AVX + if (n_bytes >= 32) { + return distance_hamming_avx2((const u8 *)a, (const u8 *)b, n_bytes); + } +#endif if ((dimensions % 64) == 0) { return distance_hamming_u64((const u8 *)a, (const u8 *)b, n_bytes / sizeof(u64)); From f2c9fb8f087c0a37e21e3f05f158207834826abd Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 
2026 17:43:49 -0700 Subject: [PATCH 27/38] Add text PK, WAL concurrency tests, and fix bench-smoke config Infrastructure improvements: - Fix benchmarks-ann Makefile: type=baseline -> type=vec0-flat (baseline was never a valid INDEX_REGISTRY key) - Add DiskANN + text primary key test: insert, KNN, delete, KNN - Add rescore + text primary key test: insert, KNN, delete, KNN - Add WAL concurrency test: reader sees snapshot isolation while writer has an open transaction, KNN works on reader's snapshot Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks-ann/Makefile | 8 +++--- tests/test-diskann.py | 43 +++++++++++++++++++++++++++++ tests/test-insert-delete.py | 54 +++++++++++++++++++++++++++++++++++++ tests/test-rescore.py | 37 +++++++++++++++++++++++++ 4 files changed, 138 insertions(+), 4 deletions(-) diff --git a/benchmarks-ann/Makefile b/benchmarks-ann/Makefile index a631478..9ae456e 100644 --- a/benchmarks-ann/Makefile +++ b/benchmarks-ann/Makefile @@ -4,9 +4,9 @@ EXT = ../dist/vec0 # --- Baseline (brute-force) configs --- BASELINES = \ - "brute-float:type=baseline,variant=float" \ - "brute-int8:type=baseline,variant=int8" \ - "brute-bit:type=baseline,variant=bit" + "brute-float:type=vec0-flat,variant=float" \ + "brute-int8:type=vec0-flat,variant=int8" \ + "brute-bit:type=vec0-flat,variant=bit" # --- IVF configs --- IVF_CONFIGS = \ @@ -43,7 +43,7 @@ ground-truth: seed # --- Quick smoke test --- bench-smoke: seed $(BENCH) --subset-size 5000 -k 10 -n 20 --dataset cohere1m -o runs \ - "brute-float:type=baseline,variant=float" \ + "brute-float:type=vec0-flat,variant=float" \ "ivf-quick:type=ivf,nlist=16,nprobe=4" \ "diskann-quick:type=diskann,R=48,L=64,quantizer=binary" diff --git a/tests/test-diskann.py b/tests/test-diskann.py index f2a56a1..d3f3e86 100644 --- a/tests/test-diskann.py +++ b/tests/test-diskann.py @@ -1246,3 +1246,46 @@ def test_diskann_delete_interleaved_with_knn(db): returned = {r["rowid"] for r in rows} assert returned.issubset(alive), \ 
f"Deleted rowid {to_del} found in KNN results" + + +# ====================================================================== +# Text primary key + DiskANN +# ====================================================================== + + +def test_diskann_text_pk_insert_knn_delete(db): + """DiskANN with text primary key: insert, KNN, delete, KNN again.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + id text primary key, + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + + vecs = { + "alpha": [1, 0, 0, 0, 0, 0, 0, 0], + "beta": [0, 1, 0, 0, 0, 0, 0, 0], + "gamma": [0, 0, 1, 0, 0, 0, 0, 0], + "delta": [0, 0, 0, 1, 0, 0, 0, 0], + "epsilon": [0, 0, 0, 0, 1, 0, 0, 0], + } + for name, vec in vecs.items(): + db.execute("INSERT INTO t(id, emb) VALUES (?, ?)", [name, _f32(vec)]) + + # KNN should return text IDs + rows = db.execute( + "SELECT id, distance FROM t WHERE emb MATCH ? AND k=3", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + assert len(rows) >= 1 + ids = [r["id"] for r in rows] + assert "alpha" in ids # closest to query + + # Delete and verify + db.execute("DELETE FROM t WHERE id = 'alpha'") + rows = db.execute( + "SELECT id FROM t WHERE emb MATCH ? 
AND k=3", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + ids = [r["id"] for r in rows] + assert "alpha" not in ids diff --git a/tests/test-insert-delete.py b/tests/test-insert-delete.py index eb34f84..7e97ea2 100644 --- a/tests/test-insert-delete.py +++ b/tests/test-insert-delete.py @@ -483,3 +483,57 @@ def test_delete_one_chunk_of_two_shrinks_pages(tmp_path): row = db.execute("select emb from v where rowid = ?", [i]).fetchone() assert row[0] == _f32([float(i)] * dims) db.close() + + +def test_wal_concurrent_reader_during_write(tmp_path): + """In WAL mode, a reader should see a consistent snapshot while a writer inserts.""" + dims = 4 + db_path = str(tmp_path / "test.db") + + # Writer: create table, insert initial rows, enable WAL + writer = sqlite3.connect(db_path) + writer.enable_load_extension(True) + writer.load_extension("dist/vec0") + writer.execute("PRAGMA journal_mode=WAL") + writer.execute( + f"CREATE VIRTUAL TABLE v USING vec0(emb float[{dims}])" + ) + for i in range(1, 11): + writer.execute("INSERT INTO v(rowid, emb) VALUES (?, ?)", [i, _f32([float(i)] * dims)]) + writer.commit() + + # Reader: open separate connection, start read + reader = sqlite3.connect(db_path) + reader.enable_load_extension(True) + reader.load_extension("dist/vec0") + + # Reader sees 10 rows + count_before = reader.execute("SELECT count(*) FROM v").fetchone()[0] + assert count_before == 10 + + # Writer inserts more rows (not yet committed) + writer.execute("BEGIN") + for i in range(11, 21): + writer.execute("INSERT INTO v(rowid, emb) VALUES (?, ?)", [i, _f32([float(i)] * dims)]) + + # Reader still sees 10 (WAL snapshot isolation) + count_during = reader.execute("SELECT count(*) FROM v").fetchone()[0] + assert count_during == 10 + + # KNN during writer's transaction should work on reader's snapshot + rows = reader.execute( + "SELECT rowid FROM v WHERE emb MATCH ? 
AND k = 5", + [_f32([1.0] * dims)], + ).fetchall() + assert len(rows) == 5 + assert all(r[0] <= 10 for r in rows) # only original rows + + # Writer commits + writer.commit() + + # Reader sees new rows after re-query (new snapshot) + count_after = reader.execute("SELECT count(*) FROM v").fetchone()[0] + assert count_after == 20 + + writer.close() + reader.close() diff --git a/tests/test-rescore.py b/tests/test-rescore.py index 1dc6cd7..7c9c669 100644 --- a/tests/test-rescore.py +++ b/tests/test-rescore.py @@ -595,3 +595,40 @@ def test_corrupt_zeroblob_validity(db): ).fetchall() except sqlite3.OperationalError: pass # Error is acceptable — crash is not + + +def test_rescore_text_pk_insert_knn_delete(db): + """Rescore with text primary key: insert, KNN, delete, KNN again.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " id text primary key," + " embedding float[128] indexed by rescore(quantizer=bit)" + ")" + ) + + import random + random.seed(99) + vecs = {} + for name in ["alpha", "beta", "gamma", "delta", "epsilon"]: + v = [random.gauss(0, 1) for _ in range(128)] + vecs[name] = v + db.execute("INSERT INTO t(id, embedding) VALUES (?, ?)", [name, float_vec(v)]) + + # KNN should return text IDs + rows = db.execute( + "SELECT id, distance FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 3", + [float_vec(vecs["alpha"])], + ).fetchall() + assert len(rows) >= 1 + ids = [r["id"] for r in rows] + assert "alpha" in ids + + # Delete and verify + db.execute("DELETE FROM t WHERE id = 'alpha'") + rows = db.execute( + "SELECT id FROM t WHERE embedding MATCH ? 
ORDER BY distance LIMIT 3", + [float_vec(vecs["alpha"])], + ).fetchall() + ids = [r["id"] for r in rows] + assert "alpha" not in ids + assert len(rows) >= 1 # other results still returned From 5522e86cd237a3e15276a8d1f03e34fedadd7177 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 17:49:40 -0700 Subject: [PATCH 28/38] Validate validity/rowids blob sizes in rescore KNN path The rescore KNN loop read validity and rowids blobs from the chunks iterator without checking their sizes matched chunk_size expectations. A truncated or corrupt blob could cause OOB reads in bitmap_copy or rowid array access. The flat KNN path already had these checks. Adds corruption tests: truncated rowids blob and truncated validity blob both produce errors instead of crashes. Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec-rescore.c | 8 ++++++++ tests/test-rescore.py | 33 ++++++++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/sqlite-vec-rescore.c b/sqlite-vec-rescore.c index 1cf67bf..5432612 100644 --- a/sqlite-vec-rescore.c +++ b/sqlite-vec-rescore.c @@ -426,10 +426,18 @@ static int rescore_knn(vec0_vtab *p, vec0_cursor *pCur, unsigned char *chunkValidity = (unsigned char *)sqlite3_column_blob(stmtChunks, 1); i64 *chunkRowids = (i64 *)sqlite3_column_blob(stmtChunks, 2); + int validityBytes = sqlite3_column_bytes(stmtChunks, 1); + int rowidsBytes = sqlite3_column_bytes(stmtChunks, 2); if (!chunkValidity || !chunkRowids) { rc = SQLITE_ERROR; goto cleanup; } + // Validate blob sizes match chunk_size expectations + if (validityBytes < (p->chunk_size + 7) / 8 || + rowidsBytes < p->chunk_size * (int)sizeof(i64)) { + rc = SQLITE_ERROR; + goto cleanup; + } memset(chunk_distances, 0, p->chunk_size * sizeof(f32)); memset(chunk_topk_idxs, 0, k_oversample * sizeof(i32)); diff --git a/tests/test-rescore.py b/tests/test-rescore.py index 7c9c669..aa8586e 100644 --- a/tests/test-rescore.py +++ b/tests/test-rescore.py @@ -587,14 +587,37 
@@ def test_corrupt_zeroblob_validity(db): # Corrupt: replace rowids with a truncated blob (wrong size) db.execute("UPDATE t_chunks SET rowids = x'00'") - # Should not crash — may return wrong results or error - try: - rows = db.execute( + # Should error, not crash — blob size validation catches the mismatch + with pytest.raises(sqlite3.OperationalError): + db.execute( "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 1", [float_vec([1, 0, 0, 0, 0, 0, 0, 0])], ).fetchall() - except sqlite3.OperationalError: - pass # Error is acceptable — crash is not + + +def test_corrupt_truncated_validity_blob(db): + """KNN should error when rescore chunk validity blob is truncated.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[128] indexed by rescore(quantizer=bit)" + ")" + ) + for i in range(5): + import random + random.seed(i) + db.execute( + "INSERT INTO t(rowid, embedding) VALUES (?, ?)", + [i + 1, float_vec([random.gauss(0, 1) for _ in range(128)])], + ) + + # Corrupt: truncate validity blob to 1 byte (should be chunk_size/8 = 128 bytes) + db.execute("UPDATE t_chunks SET validity = x'FF'") + + with pytest.raises(sqlite3.OperationalError): + db.execute( + "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 1", + [float_vec([1.0] * 128)], + ).fetchall() def test_rescore_text_pk_insert_knn_delete(db): From c4c23bd8baaf70b079e3ead2675872dd452ba922 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 17:52:12 -0700 Subject: [PATCH 29/38] Reject NaN and Inf in float32 vector input NaN/Inf values in vectors break heap/sort invariants in KNN, causing wrong or unpredictable results. Now rejected at parse time in fvec_from_value() for both blob and JSON text input paths, with a clear error message identifying the offending element index. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec.c | 19 ++++++++++++++++++- tests/test-loadable.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/sqlite-vec.c b/sqlite-vec.c index f239d47..7261436 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -984,8 +984,18 @@ static int fvec_from_value(sqlite3_value *value, f32 **vector, return SQLITE_NOMEM; } memcpy(buf, blob, bytes); + size_t n = bytes / sizeof(f32); + for (size_t i = 0; i < n; i++) { + if (isnan(buf[i]) || isinf(buf[i])) { + *pzErr = sqlite3_mprintf( + "invalid float32 vector: element %d is %s", + (int)i, isnan(buf[i]) ? "NaN" : "Inf"); + sqlite3_free(buf); + return SQLITE_ERROR; + } + } *vector = buf; - *dimensions = bytes / sizeof(f32); + *dimensions = n; *cleanup = sqlite3_free; return SQLITE_OK; } @@ -1053,6 +1063,13 @@ static int fvec_from_value(sqlite3_value *value, f32 **vector, } f32 res = (f32)result; + if (isnan(res) || isinf(res)) { + sqlite3_free(x.z); + *pzErr = sqlite3_mprintf( + "invalid float32 vector: element %d is %s", + (int)x.length, isnan(res) ? 
"NaN" : "Inf"); + return SQLITE_ERROR; + } array_append(&x, (const void *)&res); offset += (endptr - ptr); diff --git a/tests/test-loadable.py b/tests/test-loadable.py index 1ac0cf3..0044144 100644 --- a/tests/test-loadable.py +++ b/tests/test-loadable.py @@ -365,6 +365,34 @@ def test_vec_distance_l1(): ) +def test_vec_reject_nan_inf(): + """NaN and Inf in float32 vectors should be rejected.""" + import struct, math + + # NaN via blob + nan_blob = struct.pack("4f", 1.0, float("nan"), 3.0, 4.0) + with pytest.raises(sqlite3.OperationalError, match="NaN"): + db.execute("SELECT vec_length(?)", [nan_blob]) + + # Inf via blob + inf_blob = struct.pack("4f", 1.0, float("inf"), 3.0, 4.0) + with pytest.raises(sqlite3.OperationalError, match="Inf"): + db.execute("SELECT vec_length(?)", [inf_blob]) + + # -Inf via blob + ninf_blob = struct.pack("4f", 1.0, float("-inf"), 3.0, 4.0) + with pytest.raises(sqlite3.OperationalError, match="Inf"): + db.execute("SELECT vec_length(?)", [ninf_blob]) + + # NaN via JSON + # Note: JSON doesn't have NaN literal, but strtod may parse "NaN" + # This tests the blob path which is the primary input method + + # Valid vectors still work + ok_blob = struct.pack("4f", 1.0, 2.0, 3.0, 4.0) + assert db.execute("SELECT vec_length(?)", [ok_blob]).fetchone()[0] == 4 + + def test_vec_distance_l2(): vec_distance_l2 = lambda *args, a="?", b="?": db.execute( f"select vec_distance_l2({a}, {b})", args From c36a995f1e7f35325db480de296cc70fae59b7cb Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 17:54:38 -0700 Subject: [PATCH 30/38] Propagate diskann_node_write error in delete repair path diskann_repair_reverse_edges() ignored the return code from diskann_node_write() when writing repaired neighbor lists after a node deletion. A failed write would leave the graph inconsistent with no error reported to the caller. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec-diskann.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sqlite-vec-diskann.c b/sqlite-vec-diskann.c index ab9db6a..e0af464 100644 --- a/sqlite-vec-diskann.c +++ b/sqlite-vec-diskann.c @@ -1621,13 +1621,14 @@ static int diskann_repair_reverse_edges( break; } - diskann_node_write(p, vec_col_idx, nodeRowid, - validity, vs, neighborIds, nis, qvecs, qs); + rc = diskann_node_write(p, vec_col_idx, nodeRowid, + validity, vs, neighborIds, nis, qvecs, qs); } sqlite3_free(validity); sqlite3_free(neighborIds); sqlite3_free(qvecs); + if (rc != SQLITE_OK) return rc; } return SQLITE_OK; From 01b4b2a965b7471831d390d38d475595a9acde34 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 18:10:48 -0700 Subject: [PATCH 31/38] Scrub stale reverse edges on DiskANN delete (data leak fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After deleting a node, its rowid and quantized vector remained in other nodes' neighbor blobs via unidirectional reverse edges. This is a data leak — the deleted vector's compressed representation was still readable in shadow tables. Fix: after deleting the node and repairing forward edges, scan all remaining nodes and clear any neighbor slot that references the deleted rowid. Uses a lightweight two-pass approach: first scan reads only validity + neighbor_ids to find affected nodes, then does full read/clear/write only for those nodes. Tradeoff: O(N) scan per delete adds ~1ms/row at 10k vectors, ~10ms at 100k. Recall and query latency are unaffected. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec-diskann.c | 95 +++++++++++++++++++++++++++++++++++++++++++ tests/test-diskann.py | 37 +++++++++++++++++ 2 files changed, 132 insertions(+) diff --git a/sqlite-vec-diskann.c b/sqlite-vec-diskann.c index e0af464..5bd298b 100644 --- a/sqlite-vec-diskann.c +++ b/sqlite-vec-diskann.c @@ -1638,6 +1638,95 @@ static int diskann_repair_reverse_edges( * Delete a vector from the DiskANN graph (Algorithm 3: LM-Delete). * If the vector is in the buffer (not yet flushed), just remove from buffer. */ +/** + * Scan all nodes and clear any neighbor slot referencing deleted_rowid. + * This removes stale reverse edges that the forward-edge repair misses, + * preventing data leaks (deleted rowid + quantized vector lingering in + * other nodes' blobs). + */ +static int diskann_scrub_deleted_rowid( + vec0_vtab *p, int vec_col_idx, i64 deleted_rowid) { + + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + struct Vec0DiskannConfig *cfg = &col->diskann; + int rc; + sqlite3_stmt *stmt = NULL; + + // Lightweight scan: only read validity + neighbor_ids to find matches + char *zSql = sqlite3_mprintf( + "SELECT rowid, neighbors_validity, neighbor_ids " + "FROM " VEC0_SHADOW_DISKANN_NODES_N_NAME, + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + + // Collect rowids that need updating (avoid modifying while iterating) + i64 *dirty = NULL; + int nDirty = 0, capDirty = 0; + + while (sqlite3_step(stmt) == SQLITE_ROW) { + const u8 *validity = (const u8 *)sqlite3_column_blob(stmt, 1); + const u8 *ids = (const u8 *)sqlite3_column_blob(stmt, 2); + int idsBytes = sqlite3_column_bytes(stmt, 2); + if (!validity || !ids) continue; + + int nSlots = idsBytes / (int)sizeof(i64); + if (nSlots > cfg->n_neighbors) nSlots = cfg->n_neighbors; + + for (int i = 0; i < nSlots; i++) { + if 
(!diskann_validity_get(validity, i)) continue; + i64 nid = diskann_neighbor_id_get(ids, i); + if (nid == deleted_rowid) { + i64 nodeRowid = sqlite3_column_int64(stmt, 0); + // Add to dirty list + if (nDirty >= capDirty) { + capDirty = capDirty ? capDirty * 2 : 16; + i64 *tmp = sqlite3_realloc64(dirty, capDirty * sizeof(i64)); + if (!tmp) { sqlite3_free(dirty); sqlite3_finalize(stmt); return SQLITE_NOMEM; } + dirty = tmp; + } + dirty[nDirty++] = nodeRowid; + break; // one match per node is enough + } + } + } + sqlite3_finalize(stmt); + + // Now do full read/clear/write for each dirty node + for (int d = 0; d < nDirty; d++) { + u8 *val = NULL, *nids = NULL, *qvecs = NULL; + int vs, nis, qs; + rc = diskann_node_read(p, vec_col_idx, dirty[d], + &val, &vs, &nids, &nis, &qvecs, &qs); + if (rc != SQLITE_OK) continue; + + int modified = 0; + for (int i = 0; i < cfg->n_neighbors; i++) { + if (diskann_validity_get(val, i) && + diskann_neighbor_id_get(nids, i) == deleted_rowid) { + diskann_node_clear_neighbor(val, nids, qvecs, i, + cfg->quantizer_type, col->dimensions); + modified = 1; + } + } + + if (modified) { + rc = diskann_node_write(p, vec_col_idx, dirty[d], + val, vs, nids, nis, qvecs, qs); + } + + sqlite3_free(val); + sqlite3_free(nids); + sqlite3_free(qvecs); + if (rc != SQLITE_OK) break; + } + + sqlite3_free(dirty); + return rc; +} + static int diskann_delete(vec0_vtab *p, int vec_col_idx, i64 rowid) { struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; struct Vec0DiskannConfig *cfg = &col->diskann; @@ -1706,6 +1795,12 @@ static int diskann_delete(vec0_vtab *p, int vec_col_idx, i64 rowid) { rc = diskann_medoid_handle_delete(p, vec_col_idx, rowid); } + // 5. 
Scrub stale reverse edges — removes deleted rowid + quantized vector + // from any node that still references it (data leak prevention) + if (rc == SQLITE_OK) { + rc = diskann_scrub_deleted_rowid(p, vec_col_idx, rowid); + } + return rc; } diff --git a/tests/test-diskann.py b/tests/test-diskann.py index d3f3e86..16ab872 100644 --- a/tests/test-diskann.py +++ b/tests/test-diskann.py @@ -1289,3 +1289,40 @@ def test_diskann_text_pk_insert_knn_delete(db): ).fetchall() ids = [r["id"] for r in rows] assert "alpha" not in ids + + +def test_diskann_delete_scrubs_all_references(db): + """After DELETE, no shadow table should contain the deleted rowid or its data.""" + import struct + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(20): + vec = struct.pack("8f", *[float(i + d) for d in range(8)]) + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, vec]) + + target = 5 + db.execute("DELETE FROM t WHERE rowid = ?", [target]) + + # Node row itself should be gone + assert db.execute( + "SELECT count(*) FROM t_diskann_nodes00 WHERE rowid=?", [target] + ).fetchone()[0] == 0 + + # Vector should be gone + assert db.execute( + "SELECT count(*) FROM t_vectors00 WHERE rowid=?", [target] + ).fetchone()[0] == 0 + + # No other node should reference the deleted rowid in neighbor_ids + for row in db.execute("SELECT rowid, neighbor_ids FROM t_diskann_nodes00"): + node_rowid = row[0] + ids_blob = row[1] + for j in range(0, len(ids_blob), 8): + nid = struct.unpack(" Date: Tue, 31 Mar 2026 18:27:02 -0700 Subject: [PATCH 32/38] Enable auxiliary columns for rescore, IVF, and DiskANN indexes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The constructor previously rejected auxiliary columns (+col) for all non-flat index types. 
Analysis confirms all code paths already handle aux columns correctly — aux data lives in _auxiliary shadow table, independent of the vector index structures. Remove the three auxiliary column guards. Metadata and partition key guards remain in place (separate analysis needed). Adds 8 snapshot-based tests covering shadow table creation, insert+KNN returning aux values, aux UPDATE, aux DELETE cleanup, and DROP TABLE for both rescore and DiskANN. IVF aux verified with IVF-enabled build. Co-Authored-By: Claude Opus 4.6 (1M context) --- sqlite-vec.c | 16 - tests/__snapshots__/test-auxiliary.ambr | 371 ++++++++++++++++++++++++ tests/test-auxiliary.py | 199 ++++++++++++- tests/test-diskann.py | 13 +- tests/test-ivf-mutations.py | 14 +- tests/test-rescore-mutations.py | 21 +- 6 files changed, 597 insertions(+), 37 deletions(-) diff --git a/sqlite-vec.c b/sqlite-vec.c index 7261436..16c3b4d 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -5149,11 +5149,6 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } } if (hasRescore) { - if (numAuxiliaryColumns > 0) { - *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR - "Auxiliary columns are not supported with rescore indexes"); - goto error; - } if (numMetadataColumns > 0) { *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR "Metadata columns are not supported with rescore indexes"); @@ -5183,11 +5178,6 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, "partition key columns are not supported with IVF indexes"); goto error; } - if (numAuxiliaryColumns > 0) { - *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR - "auxiliary columns are not supported with IVF indexes"); - goto error; - } if (numMetadataColumns > 0) { *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR "metadata columns are not supported with IVF indexes"); @@ -5199,12 +5189,6 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, // DiskANN columns cannot coexist with 
aux/metadata/partition columns for (int i = 0; i < numVectorColumns; i++) { if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_DISKANN) { - if (numAuxiliaryColumns > 0) { - *pzErr = sqlite3_mprintf( - VEC_CONSTRUCTOR_ERROR - "Auxiliary columns are not supported with DiskANN-indexed vector columns"); - goto error; - } if (numMetadataColumns > 0) { *pzErr = sqlite3_mprintf( VEC_CONSTRUCTOR_ERROR diff --git a/tests/__snapshots__/test-auxiliary.ambr b/tests/__snapshots__/test-auxiliary.ambr index 66a3ef3..7313faf 100644 --- a/tests/__snapshots__/test-auxiliary.ambr +++ b/tests/__snapshots__/test-auxiliary.ambr @@ -169,6 +169,200 @@ }), }) # --- +# name: test_diskann_aux_insert_knn[diskann aux select all] + OrderedDict({ + 'sql': 'SELECT rowid, label FROM t ORDER BY rowid', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'label': 'red', + }), + OrderedDict({ + 'rowid': 2, + 'label': 'green', + }), + OrderedDict({ + 'rowid': 3, + 'label': 'blue', + }), + ]), + }) +# --- +# name: test_diskann_aux_insert_knn[diskann aux shadow contents] + dict({ + 't_auxiliary': OrderedDict({ + 'sql': 'select * from t_auxiliary', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'value00': 'red', + }), + OrderedDict({ + 'rowid': 2, + 'value00': 'green', + }), + OrderedDict({ + 'rowid': 3, + 'value00': 'blue', + }), + ]), + }), + 't_chunks': OrderedDict({ + 'sql': 'select * from t_chunks', + 'rows': list([ + ]), + }), + 't_diskann_buffer00': OrderedDict({ + 'sql': 'select * from t_diskann_buffer00', + 'rows': list([ + ]), + }), + 't_diskann_nodes00': OrderedDict({ + 'sql': 'select * from t_diskann_nodes00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'neighbors_validity': b'\x03', + 'neighbor_ids': b'\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 
'neighbor_quantized_vectors': b'\x02\x04\x00\x00\x00\x00\x00\x00', + }), + OrderedDict({ + 'rowid': 2, + 'neighbors_validity': b'\x03', + 'neighbor_ids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'neighbor_quantized_vectors': b'\x01\x04\x00\x00\x00\x00\x00\x00', + }), + OrderedDict({ + 'rowid': 3, + 'neighbors_validity': b'\x03', + 'neighbor_ids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'neighbor_quantized_vectors': b'\x01\x02\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 't_rowids': OrderedDict({ + 'sql': 'select * from t_rowids', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'id': None, + 'chunk_id': None, + 'chunk_offset': None, + }), + OrderedDict({ + 'rowid': 2, + 'id': None, + 'chunk_id': None, + 'chunk_offset': None, + }), + OrderedDict({ + 'rowid': 3, + 'id': None, + 'chunk_id': None, + 'chunk_offset': None, + }), + ]), + }), + 't_vectors00': OrderedDict({ + 'sql': 'select * from t_vectors00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vector': b'\x00\x00\x80?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + OrderedDict({ + 'rowid': 2, + 'vector': b'\x00\x00\x00\x00\x00\x00\x80?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + OrderedDict({ + 'rowid': 3, + 'vector': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + }) +# --- +# name: test_diskann_aux_shadow_tables[diskann aux shadow 
tables] + OrderedDict({ + 'sql': "SELECT name, sql FROM sqlite_master WHERE type='table' AND name LIKE 't_%' ORDER BY name", + 'rows': list([ + OrderedDict({ + 'name': 't_auxiliary', + 'sql': 'CREATE TABLE "t_auxiliary"( rowid integer PRIMARY KEY , value00, value01)', + }), + OrderedDict({ + 'name': 't_chunks', + 'sql': 'CREATE TABLE "t_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,validity BLOB NOT NULL,rowids BLOB NOT NULL)', + }), + OrderedDict({ + 'name': 't_diskann_buffer00', + 'sql': 'CREATE TABLE "t_diskann_buffer00" (rowid INTEGER PRIMARY KEY, vector BLOB NOT NULL)', + }), + OrderedDict({ + 'name': 't_diskann_nodes00', + 'sql': 'CREATE TABLE "t_diskann_nodes00" (rowid INTEGER PRIMARY KEY, neighbors_validity BLOB NOT NULL, neighbor_ids BLOB NOT NULL, neighbor_quantized_vectors BLOB NOT NULL)', + }), + OrderedDict({ + 'name': 't_info', + 'sql': 'CREATE TABLE "t_info" (key text primary key, value any)', + }), + OrderedDict({ + 'name': 't_rowids', + 'sql': 'CREATE TABLE "t_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)', + }), + OrderedDict({ + 'name': 't_vectors00', + 'sql': 'CREATE TABLE "t_vectors00" (rowid INTEGER PRIMARY KEY, vector BLOB NOT NULL)', + }), + ]), + }) +# --- +# name: test_diskann_aux_update_and_delete[diskann aux after update+delete] + OrderedDict({ + 'sql': 'SELECT rowid, label FROM t ORDER BY rowid', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'label': 'item-1', + }), + OrderedDict({ + 'rowid': 2, + 'label': 'UPDATED', + }), + OrderedDict({ + 'rowid': 4, + 'label': 'item-4', + }), + OrderedDict({ + 'rowid': 5, + 'label': 'item-5', + }), + ]), + }) +# --- +# name: test_diskann_aux_update_and_delete[diskann aux shadow after update+delete] + OrderedDict({ + 'sql': 'SELECT rowid, value00 FROM t_auxiliary ORDER BY rowid', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'value00': 'item-1', + }), + OrderedDict({ + 'rowid': 2, + 'value00': 'UPDATED', + }), + OrderedDict({ 
+ 'rowid': 4, + 'value00': 'item-4', + }), + OrderedDict({ + 'rowid': 5, + 'value00': 'item-5', + }), + ]), + }) +# --- # name: test_knn OrderedDict({ 'sql': 'select * from v', @@ -392,6 +586,183 @@ ]), }) # --- +# name: test_rescore_aux_delete[rescore aux after delete] + OrderedDict({ + 'sql': 'SELECT rowid, label FROM t ORDER BY rowid', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'label': 'item-1', + }), + OrderedDict({ + 'rowid': 2, + 'label': 'item-2', + }), + OrderedDict({ + 'rowid': 4, + 'label': 'item-4', + }), + OrderedDict({ + 'rowid': 5, + 'label': 'item-5', + }), + ]), + }) +# --- +# name: test_rescore_aux_delete[rescore aux shadow after delete] + OrderedDict({ + 'sql': 'SELECT rowid, value00 FROM t_auxiliary ORDER BY rowid', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'value00': 'item-1', + }), + OrderedDict({ + 'rowid': 2, + 'value00': 'item-2', + }), + OrderedDict({ + 'rowid': 4, + 'value00': 'item-4', + }), + OrderedDict({ + 'rowid': 5, + 'value00': 'item-5', + }), + ]), + }) +# --- +# name: test_rescore_aux_insert_knn[rescore aux select all] + OrderedDict({ + 'sql': 'SELECT rowid, label FROM t ORDER BY rowid', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'label': 'alpha', + }), + OrderedDict({ + 'rowid': 2, + 'label': 'beta', + }), + OrderedDict({ + 'rowid': 3, + 'label': 'gamma', + }), + ]), + }) +# --- +# name: test_rescore_aux_insert_knn[rescore aux shadow contents] + dict({ + 't_auxiliary': OrderedDict({ + 'sql': 'select * from t_auxiliary', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'value00': 'alpha', + }), + OrderedDict({ + 'rowid': 2, + 'value00': 'beta', + }), + OrderedDict({ + 'rowid': 3, + 'value00': 'gamma', + }), + ]), + }), + 't_chunks': OrderedDict({ + 'sql': 'select * from t_chunks', + 'rows': list([ + OrderedDict({ + 'chunk_id': 1, + 'size': 1024, + 'validity': 
b'\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 't_rescore_chunks00': OrderedDict({ + 'sql': 'select * from t_rescore_chunks00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vectors': 
b'=\xf3<\xef\xf1H\x85\xa57\x16v\xe6/\x86\x7f\xace\x96\x11|1\x18a\xd8\x15\x1c&\x02z\x9e\xeb\x12\xa4\xd7\xd2i\x89\xc4\x18A>\xa2\x9bT\xcd=\xd9i\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 't_rescore_vectors00': OrderedDict({ + 'sql': 'select * from t_rescore_vectors00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vector': 
b']\x1c\x8a>\xc4\x9eX\xbf\xceY\xe3=w\x9b\xed?\nQZ?\xdc\x9f@?\x80u\xa4\xbf\x16q\xfa\xbeB\x9b\\?B\x13\x8f?\x80\xd1\xf9\xbf\x10\x0c\xf9\xbb\xf8\x1c\x1e@G\xbd\xe3>\xea\x03[?\xecc\xcf?\x12\x03\xf3\xbf\xa1\xd4\xb2\xbd\x8f\x99\xb8>\x92\xb5M?\xa1\xc9\xbd?/\x90g?\xe1u)\xbf\x94\xe8\x12\xbe\xd6\xba\x16>\'i\xc4>\xc3?\xca>\t\x0e\x9c?\xf0\xa8\xb8\xbf\x1b\x98\x18?\xa8<\xd9??\xdca?\x89\xad\xd0?\x88\xb8<\xbe<\xfa\xab\xbd#\xfc\xf8\xbf\x1e\x90k?\xa0\xec\x1f?a\x1a\xc4?\xc0yU?\xcc\x11\xec\xbe(\xad\xe5\xbd\xfbx\xd0\xbfA\xd3\x16?1\xc5\xf6\xbe\xdcn\'\xbe\x00\xe6\xa0>\t\xd4\x06\xbeD\xfb\xbe?\x1a,\x10\xc0\x80\x8a\x83<\xd2\x8f\x1a\xc0\xf0\xab0\xbf\xfaD\x0b\xbe\x18`\xce\xbf\xa43\x91>\xd0\x13\xea=\x1cpz\xbf\x9ai\x81>\x1d+\xb2\xbd\xb1:\x91\xbe\r\x9e\xf4;|"\xf2\xbfA\x0c\x16@+\x92\t@\x99\x9e\xfb?&\xb9\xa1?_v[\xbf\x98\xb7\x87?\xfe"\xc7>%]#\xbe\xa0\xf2\xd5\xbe\x9e"\x06\xbe\x1dz\xd6>\x84\xa2\x9d?\xd7\xb3\xec\xbf\xbbJ\xbd?\xbd\xebD\xbf\xdd.\xa3\xbe1\xcd\'\xbe\xa2\xf9\xd6\xbfL\xa7I>\xef\x17\x0f<(0n\xbe\xbe\xaf\xdb>\x7fo\xb5<\xcah\xdf>d\x00f\xbe\xb1\x85`\xbf\x95 
9?\xd1\xeb\xcd>gk\xb8\xbe\x18\xd3\xfe\xbd1\x80\xdb?\x8b\x86\x03?\x1a\xb7\x9d>\xadM\x1f@\x04\xa0\xca>tc\xfc=\x186\x96?7.\x03\xc0V\xfa\xf8?\xf2\xf2\xa3\xbe1\xa1\xa8\xbf\x06\xb1I\xbfs\xcbT=\xda\xe5}>\xcd@\xca\xbe\x1ee\x1a\xbf\x02H\x14\xbf\x99\xef8\xbeG\xd9\x8a?&\xdc\xff>O\xf8\x9e?\xbd+4>\x9d\xa4\xab=PB\x8c>fs\xac>\x8b\xb4\\?q\xe2S\xbf^\x9a@\xbf\xe7\x7f\xc8\xbf\xbb\x9e\x9f?\xc2\xa0\x07@\xe2mT\xbf@\xf1v\x89n\xbf\xfb\xe2T\xbc\xd4\xd2\xff?,o\xaf?\x0c+\xb6\xbf#\x84|\xbf\x80\xc8\xfd?9\x97\xdb>oa\'\xc0\x8f\xa9\x00>[\xc3\x83\xc0d\xe2\xd2\xbf\xba\xeai>1\x14s>\xe3\x11\x99\xbf\xd9j\x81\xbf\xb3\x1e\xe1\xbcS;)?\x86\x987\xbf\t\xf4\xe4\xbc\xb8f\xa4\xbf\x1c\x83k\xbe|*T\xbf\x00\xd8\xa8?\xc4\x966?2\x14\x14?H\xfe>?\xbd\xbb\x7f=\xcb\x1c\x9f\xbe\xe5\xad\x90?U\x085\xbf\xde{\xb2\xbf\x1f\x03\x10\xbf\x19\xd6J>b\xb9\x97=\xc18z\xbe\xe76\xa7\xbf\xed\x80\x98\xbf\xf5\x12\xb7\xbf\x86(\x9f\xbdaY\x16?\x07j\xb1>\x9ea\x8c\xbd\x91(\xb2\xbf\xe1\xa1\x0c\xbe\xc4_\xd1>\x8c\xad\xf2\xbdc\xab\xf4\xbd\x81<\xc6\xbe}\xa7\xaa\xbfk\xe4\xcb>\xcd89>dk\x81\xbe%\xa4\xb0\xbbAU\x11\xbfG\n\x15\xc0\xb6m\xcb?\xafoq>0\x17\xa5\xbe?j\x81>\xbet\xee;\xc0\xc2\\=S\x81\x8c\xbe2T\xca> \xbe\xe2\xbf1\xd2w?\xed\xfd\x08\xc0\x01\x17\xa0\xbf\x99o\xb1\xbfRX\xb7\xbf\x06f\xca\xbeD\x9e\x92?\x86W\t?\x03G}?\xdd\xbfv\xbdd\xf0\x0c\xbe\xf8\x8a\x1c\xbf\xd8\xc9\x9e\xbfy/\x13>\x802\xdc= 5`\xbf\x00\xf3"\xbf\x99\x92\x01>7 \x06\xc0{\xd7\x8d\xbf\xa5/\x8e\xbf\\\x82u?\x17M\x1e\xbf\xcf\xbbk\xbe\xc3\x84i\xbe\xf1\xa4&\xbfD\xb4\x1a\xc0au\x06\xc0:\xbc\x04\xbf\x0cK\xb2?mdD\xbf\xfa\xa4\x9b?\x89w\xd9>\xde\xb7\x8c?!e\x1a>\xc3\x05-\xbf\x11\xce\xdf\xbe!\xf3\x10?\xab!P?\x96\xbc\x9f?]\x1c\x19?f\x97\x88\xbf\xddRM\xbf,)\x8e>7\x14$>}\x8d\x18@O%\xc0\xbf/\xa5C>`B\xe5\xbd\xb71\x1b?s\x11V?3\xa2F=\x13\xaf\x87\xbf\xe2X\x17?\xa7\xc8\x91\xbf\x19^\x83\xbf\xc6w\xa4?[H\xa1\xbf\x17M2\xbf\xfb\x7f\xd5\xbf', + }), + OrderedDict({ + 'rowid': 3, + 'vector': 
b'b\xd9\x8b\xbf9\x81\x96\xbe\x83h\xe9?\\\xa4\x89\xbf\x93\xff\xc1\xbf6\xfe\xa8?\xd8\x19\xc1\xbf*\xf06?\xb2\x0c\xaa?2\\P>\xd9\xf0\x81?K~\xb4\xbe\x05\x00\x85?\xcfg\x8c\xbf?\x93\xca>\xf82Q=\x00\x8e\xa5\xbf\xf3:\x15@\xbc\x9c>\xbf}\x13\x90\xbf5z\x17?w17\xbf\xaf\xde(?<\x00]?M\xff\x00?\n7\xa8\xbe\x83kU\xbf-R\x1a=\xa1\xbc\x8c\xbe\xf7.\xb0>\xf1W$?\xe3\xb1\xd8\xbeZ\x89\x19>\x08\x0b\x19\xbf8\xbfK\xbe\t\x12\xcb=P\xd5\x81\xbf8C!\xbf$\xaa\x1b\xbf1\x8f\x8e>\xbb\x1c2\xbe"\x88R\xbekR\x86?\xb3\xfa\xee\xbe\x1aAN\xbf|\xca,\xbf\x0c{z>\x97;\n=Q>4\xbey\x12\x92\xbf[\xa1]\xbe\t\x93\x9c?\xf5\xbcR?\x1cj]\xbf\xa5w\xa8\xbf\xf5\xc1\x1b\xbej\xb25>5\xf48\xbe\x87\xe4-\xbf x\x8f\xbfoC\x01\xbe\xe7:\x16\xbf\x1c\xf1W?\xf6\x1e\xc8\xbf\xe9&\xd5\xbec^\x19@\x19\n\x98?My\xd0>\xa3\xa4\r?\xfc#\xab>\xf7\x1a\x81\xbf\xbb\xe8\x98\xbf\xb0]>\xbff\x92\xc7=\xb3\x16\x86\xbf4\xdc\x9b\xbe\xe2\xd4C\xbfi\xbb/?r\x0fo\xbf\xb8\xd8g>$\x9cW?\xfd\xb0P?\x05\r\xc0\xbeC\x08\xde?Pz\xcb?\x88\xd5\xe9\xbe\xd4\x07\x0c\xbf\x16s\x7f?\xf9=K\xbd\x9378\xbfI\xd6\xbb=\xe7j\x92\xbe\xeb\xfa\x9f?\x9d\x9d\x83\xbe4\xbbK>\xcf\x82\xab\xbfv\x98\x1c?a"Z\xbf\xaf/\x12?\xfe4\xbc>\x84\xed\x91\xbd\xeb\x857\xc0\x90\x89">\x05t\x92?\x1b\x00(>F>V\xbf\x84\x12\x1e>\xcb\xae\xd8?\xc0S+?\x95Z\x1a?\xbe\x93x\xbf>\xfe\'\xbf`\xa4\x8b?\xca\x08\xba\xbe\x89\xc2\n\xbf\xec2\xb2>\x1c\xb3\x04>w\xc0\x95\xbe\x94\xf0r?\xb5\x08\xc4=\x15~\x84>:\xc78\xbfV-\xdb\xbe\xaf\xde\xb2=\xd8\xc8\xe1\xbe\x06\xf9\x14@^\x16\x92>bk\xb1\xbe', + }), + ]), + }), + 't_rowids': OrderedDict({ + 'sql': 'select * from t_rowids', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 0, + }), + OrderedDict({ + 'rowid': 2, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 1, + }), + OrderedDict({ + 'rowid': 3, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 2, + }), + ]), + }), + }) +# --- +# name: test_rescore_aux_shadow_tables[rescore aux shadow tables] + OrderedDict({ + 'sql': "SELECT name, sql FROM sqlite_master WHERE type='table' AND name LIKE 't_%' ORDER BY 
name", + 'rows': list([ + OrderedDict({ + 'name': 't_auxiliary', + 'sql': 'CREATE TABLE "t_auxiliary"( rowid integer PRIMARY KEY , value00, value01)', + }), + OrderedDict({ + 'name': 't_chunks', + 'sql': 'CREATE TABLE "t_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,validity BLOB NOT NULL,rowids BLOB NOT NULL)', + }), + OrderedDict({ + 'name': 't_info', + 'sql': 'CREATE TABLE "t_info" (key text primary key, value any)', + }), + OrderedDict({ + 'name': 't_rescore_chunks00', + 'sql': 'CREATE TABLE "t_rescore_chunks00"(rowid PRIMARY KEY, vectors BLOB NOT NULL)', + }), + OrderedDict({ + 'name': 't_rescore_vectors00', + 'sql': 'CREATE TABLE "t_rescore_vectors00"(rowid INTEGER PRIMARY KEY, vector BLOB NOT NULL)', + }), + OrderedDict({ + 'name': 't_rowids', + 'sql': 'CREATE TABLE "t_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)', + }), + ]), + }) +# --- # name: test_types OrderedDict({ 'sql': 'select * from v', diff --git a/tests/test-auxiliary.py b/tests/test-auxiliary.py index 807b2b8..dbe9654 100644 --- a/tests/test-auxiliary.py +++ b/tests/test-auxiliary.py @@ -1,5 +1,7 @@ import sqlite3 -from helpers import exec, vec0_shadow_table_contents +import struct +import pytest +from helpers import exec, vec0_shadow_table_contents, _f32 def test_constructor_limit(db, snapshot): @@ -126,3 +128,198 @@ def test_knn(db, snapshot): ) == snapshot(name="illegal KNN w/ aux") +# ====================================================================== +# Auxiliary columns with non-flat indexes +# ====================================================================== + + +def test_rescore_aux_shadow_tables(db, snapshot): + """Rescore + aux column: verify shadow tables are created correctly.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " emb float[128] indexed by rescore(quantizer=bit)," + " +label text," + " +score float" + ")" + ) + assert exec(db, "SELECT name, sql FROM sqlite_master WHERE type='table' 
AND name LIKE 't_%' ORDER BY name") == snapshot( + name="rescore aux shadow tables" + ) + + +def test_rescore_aux_insert_knn(db, snapshot): + """Insert with aux data, KNN should return aux column values.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " emb float[128] indexed by rescore(quantizer=bit)," + " +label text" + ")" + ) + import random + random.seed(77) + data = [ + ("alpha", [random.gauss(0, 1) for _ in range(128)]), + ("beta", [random.gauss(0, 1) for _ in range(128)]), + ("gamma", [random.gauss(0, 1) for _ in range(128)]), + ] + for label, vec in data: + db.execute( + "INSERT INTO t(emb, label) VALUES (?, ?)", + [_f32(vec), label], + ) + + assert exec(db, "SELECT rowid, label FROM t ORDER BY rowid") == snapshot( + name="rescore aux select all" + ) + assert vec0_shadow_table_contents(db, "t", skip_info=True) == snapshot( + name="rescore aux shadow contents" + ) + + # KNN should include aux column, "alpha" closest to its own vector + rows = db.execute( + "SELECT label, distance FROM t WHERE emb MATCH ? ORDER BY distance LIMIT 3", + [_f32(data[0][1])], + ).fetchall() + assert len(rows) == 3 + assert rows[0][0] == "alpha" + + +def test_rescore_aux_update(db): + """UPDATE aux column on rescore table should work without affecting vectors.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " emb float[128] indexed by rescore(quantizer=bit)," + " +label text" + ")" + ) + import random + random.seed(88) + vec = [random.gauss(0, 1) for _ in range(128)] + db.execute("INSERT INTO t(rowid, emb, label) VALUES (1, ?, 'original')", [_f32(vec)]) + db.execute("UPDATE t SET label = 'updated' WHERE rowid = 1") + + assert db.execute("SELECT label FROM t WHERE rowid = 1").fetchone()[0] == "updated" + + # KNN still works with updated aux + rows = db.execute( + "SELECT rowid, label FROM t WHERE emb MATCH ? 
ORDER BY distance LIMIT 1", + [_f32(vec)], + ).fetchall() + assert rows[0][0] == 1 + assert rows[0][1] == "updated" + + +def test_rescore_aux_delete(db, snapshot): + """DELETE should remove aux data from shadow table.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " emb float[128] indexed by rescore(quantizer=bit)," + " +label text" + ")" + ) + import random + random.seed(99) + for i in range(5): + db.execute( + "INSERT INTO t(rowid, emb, label) VALUES (?, ?, ?)", + [i + 1, _f32([random.gauss(0, 1) for _ in range(128)]), f"item-{i+1}"], + ) + + db.execute("DELETE FROM t WHERE rowid = 3") + + assert exec(db, "SELECT rowid, label FROM t ORDER BY rowid") == snapshot( + name="rescore aux after delete" + ) + assert exec(db, "SELECT rowid, value00 FROM t_auxiliary ORDER BY rowid") == snapshot( + name="rescore aux shadow after delete" + ) + + +def test_diskann_aux_shadow_tables(db, snapshot): + """DiskANN + aux column: verify shadow tables are created correctly.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8), + +label text, + +score float + ) + """) + assert exec(db, "SELECT name, sql FROM sqlite_master WHERE type='table' AND name LIKE 't_%' ORDER BY name") == snapshot( + name="diskann aux shadow tables" + ) + + +def test_diskann_aux_insert_knn(db, snapshot): + """DiskANN + aux: insert, KNN, verify aux values returned.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8), + +label text + ) + """) + data = [ + ("red", [1, 0, 0, 0, 0, 0, 0, 0]), + ("green", [0, 1, 0, 0, 0, 0, 0, 0]), + ("blue", [0, 0, 1, 0, 0, 0, 0, 0]), + ] + for label, vec in data: + db.execute("INSERT INTO t(emb, label) VALUES (?, ?)", [_f32(vec), label]) + + assert exec(db, "SELECT rowid, label FROM t ORDER BY rowid") == snapshot( + name="diskann aux select all" + ) + assert vec0_shadow_table_contents(db, "t", skip_info=True) == 
snapshot( + name="diskann aux shadow contents" + ) + + rows = db.execute( + "SELECT label, distance FROM t WHERE emb MATCH ? AND k = 3", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + assert len(rows) >= 1 + assert rows[0][0] == "red" + + +def test_diskann_aux_update_and_delete(db, snapshot): + """DiskANN + aux: update aux column, delete row, verify cleanup.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8), + +label text + ) + """) + for i in range(5): + vec = [0.0] * 8 + vec[i % 8] = 1.0 + db.execute( + "INSERT INTO t(rowid, emb, label) VALUES (?, ?, ?)", + [i + 1, _f32(vec), f"item-{i+1}"], + ) + + db.execute("UPDATE t SET label = 'UPDATED' WHERE rowid = 2") + db.execute("DELETE FROM t WHERE rowid = 3") + + assert exec(db, "SELECT rowid, label FROM t ORDER BY rowid") == snapshot( + name="diskann aux after update+delete" + ) + assert exec(db, "SELECT rowid, value00 FROM t_auxiliary ORDER BY rowid") == snapshot( + name="diskann aux shadow after update+delete" + ) + + +def test_diskann_aux_drop_cleans_all(db): + """DROP TABLE should remove aux shadow table too.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8), + +label text + ) + """) + db.execute("INSERT INTO t(emb, label) VALUES (?, 'test')", [_f32([1]*8)]) + db.execute("DROP TABLE t") + + tables = [r[0] for r in db.execute( + "SELECT name FROM sqlite_master WHERE name LIKE 't_%'" + ).fetchall()] + assert "t_auxiliary" not in tables + diff --git a/tests/test-diskann.py b/tests/test-diskann.py index 16ab872..4369a8b 100644 --- a/tests/test-diskann.py +++ b/tests/test-diskann.py @@ -630,16 +630,19 @@ def test_diskann_command_search_list_size_error(db): # Error cases: DiskANN + auxiliary/metadata/partition columns # ====================================================================== -def test_diskann_create_error_with_auxiliary_column(db): - 
"""DiskANN tables should not support auxiliary columns.""" - result = exec(db, """ +def test_diskann_create_with_auxiliary_column(db): + """DiskANN tables should support auxiliary columns.""" + db.execute(""" CREATE VIRTUAL TABLE t USING vec0( emb float[64] INDEXED BY diskann(neighbor_quantizer=binary), +extra text ) """) - assert "error" in result - assert "auxiliary" in result["message"].lower() or "Auxiliary" in result["message"] + # Auxiliary shadow table should exist + tables = [r[0] for r in db.execute( + "SELECT name FROM sqlite_master WHERE name LIKE 't_%' ORDER BY 1" + ).fetchall()] + assert "t_auxiliary" in tables def test_diskann_create_error_with_metadata_column(db): diff --git a/tests/test-ivf-mutations.py b/tests/test-ivf-mutations.py index fce1832..c20dac3 100644 --- a/tests/test-ivf-mutations.py +++ b/tests/test-ivf-mutations.py @@ -203,13 +203,15 @@ def test_update_vector_via_delete_insert(db): # ============================================================================ -def test_error_ivf_with_auxiliary_column(db): - result = exec( - db, - "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(), +extra text)", +def test_ivf_with_auxiliary_column(db): + """IVF should support auxiliary columns.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(v float[4] indexed by ivf(), +extra text)" ) - assert "error" in result - assert "auxiliary" in result.get("message", "").lower() + tables = [r[0] for r in db.execute( + "SELECT name FROM sqlite_master WHERE name LIKE 't_%' ORDER BY 1" + ).fetchall()] + assert "t_auxiliary" in tables def test_error_ivf_with_metadata_column(db): diff --git a/tests/test-rescore-mutations.py b/tests/test-rescore-mutations.py index dbb802a..6015471 100644 --- a/tests/test-rescore-mutations.py +++ b/tests/test-rescore-mutations.py @@ -32,15 +32,18 @@ def unpack_float_vec(blob): # ============================================================================ -def test_create_error_with_aux_column(db): - """Rescore should 
reject auxiliary columns.""" - with pytest.raises(sqlite3.OperationalError, match="Auxiliary columns"): - db.execute( - "CREATE VIRTUAL TABLE t USING vec0(" - " embedding float[8] indexed by rescore(quantizer=bit)," - " +extra text" - ")" - ) +def test_create_with_aux_column(db): + """Rescore should support auxiliary columns.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " embedding float[128] indexed by rescore(quantizer=bit)," + " +extra text" + ")" + ) + tables = [r[0] for r in db.execute( + "SELECT name FROM sqlite_master WHERE name LIKE 't_%' ORDER BY 1" + ).fetchall()] + assert "t_auxiliary" in tables def test_create_error_with_metadata_column(db): From 6e2c4c6bab0edb2217120d96eb3050a4aa56a6ef Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 22:39:18 -0700 Subject: [PATCH 33/38] Add FTS5-style command column and runtime oversample for rescore Replace the old INSERT INTO t(rowid) VALUES('command') hack with a proper hidden command column named after the table (FTS5 pattern): INSERT INTO t(t) VALUES ('oversample=16') The command column is the first hidden column (before distance and k) to reserve ability for future table-valued function argument use. Schema: CREATE TABLE x(rowid, , "" hidden, distance hidden, k hidden) For backwards compat, pre-v0.1.10 tables (detected via _info shadow table version) skip the command column to avoid name conflicts with user columns that may share the table's name. Verified with legacy fixture DB generated by sqlite-vec v0.1.6. 
Changes: - Add hidden command column to sqlite3_declare_vtab for new tables - Version-gate via _info shadow table for existing tables - Validate at CREATE time that no column name matches table name - Add rescore_handle_command() with oversample=N support - rescore_knn() prefers runtime oversample_search over CREATE default - Remove old rowid-based command dispatch - Migrate all DiskANN/IVF/fuzz tests and benchmarks to new syntax - Add legacy DB fixture (v0.1.6) and 9 backwards-compat tests Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks-ann/bench-delete/bench_delete.py | 2 +- benchmarks-ann/bench.py | 10 +- sqlite-vec-rescore.c | 25 +++- sqlite-vec.c | 141 ++++++++++++++++---- tests/fixtures/legacy-v0.1.6.db | Bin 0 -> 106496 bytes tests/fuzz/diskann-command-inject.c | 8 +- tests/fuzz/ivf-cell-overflow.c | 12 +- tests/fuzz/ivf-kmeans.c | 8 +- tests/fuzz/ivf-knn-deep.c | 6 +- tests/fuzz/ivf-operations.c | 8 +- tests/fuzz/ivf-quantize.c | 4 +- tests/fuzz/ivf-rescore.c | 6 +- tests/fuzz/ivf-shadow-corrupt.c | 10 +- tests/generate_legacy_db.py | 81 +++++++++++ tests/test-diskann.py | 10 +- tests/test-general.py | 12 ++ tests/test-ivf-mutations.py | 26 ++-- tests/test-ivf-quantization.py | 14 +- tests/test-ivf.py | 26 ++-- tests/test-legacy-compat.py | 138 +++++++++++++++++++ tests/test-rescore.py | 70 ++++++++++ 21 files changed, 512 insertions(+), 105 deletions(-) create mode 100644 tests/fixtures/legacy-v0.1.6.db create mode 100644 tests/generate_legacy_db.py create mode 100644 tests/test-legacy-compat.py diff --git a/benchmarks-ann/bench-delete/bench_delete.py b/benchmarks-ann/bench-delete/bench_delete.py index 802f0a4..0ebd2ec 100644 --- a/benchmarks-ann/bench-delete/bench_delete.py +++ b/benchmarks-ann/bench-delete/bench_delete.py @@ -159,7 +159,7 @@ INDEX_REGISTRY = { def _ivf_train(conn): """Trigger built-in k-means training for IVF.""" t0 = now_ns() - conn.execute("INSERT INTO vec_items(id) VALUES ('compute-centroids')") + conn.execute("INSERT INTO 
vec_items(vec_items) VALUES ('compute-centroids')") conn.commit() return ns_to_s(now_ns() - t0) diff --git a/benchmarks-ann/bench.py b/benchmarks-ann/bench.py index a4cbbe4..966c458 100644 --- a/benchmarks-ann/bench.py +++ b/benchmarks-ann/bench.py @@ -456,7 +456,7 @@ def _ivf_create_table_sql(params): def _ivf_post_insert_hook(conn, params): print(" Training k-means centroids (built-in)...", flush=True) t0 = time.perf_counter() - conn.execute("INSERT INTO vec_items(id) VALUES ('compute-centroids')") + conn.execute("INSERT INTO vec_items(vec_items) VALUES ('compute-centroids')") conn.commit() elapsed = time.perf_counter() - t0 print(f" Training done in {elapsed:.1f}s", flush=True) @@ -514,7 +514,7 @@ def _ivf_faiss_kmeans_hook(conn, params): for cid, blob in centroids: conn.execute( - "INSERT INTO vec_items(id, embedding) VALUES (?, ?)", + "INSERT INTO vec_items(vec_items, embedding) VALUES (?, ?)", (f"set-centroid:{cid}", blob), ) conn.commit() @@ -540,7 +540,7 @@ def _ivf_pre_query_hook(conn, params): nprobe = params.get("nprobe") if nprobe: conn.execute( - "INSERT INTO vec_items(id) VALUES (?)", + "INSERT INTO vec_items(vec_items) VALUES (?)", (f"nprobe={nprobe}",), ) conn.commit() @@ -572,7 +572,7 @@ INDEX_REGISTRY["ivf"] = { "insert_sql": None, "post_insert_hook": _ivf_post_insert_hook, "pre_query_hook": _ivf_pre_query_hook, - "train_sql": lambda _: "INSERT INTO vec_items(id) VALUES ('compute-centroids')", + "train_sql": lambda _: "INSERT INTO vec_items(vec_items) VALUES ('compute-centroids')", "run_query": None, "query_sql": None, "describe": _ivf_describe, @@ -616,7 +616,7 @@ def _diskann_pre_query_hook(conn, params): L_search = params.get("L_search", 0) if L_search: conn.execute( - "INSERT INTO vec_items(id) VALUES (?)", + "INSERT INTO vec_items(vec_items) VALUES (?)", (f"search_list_size_search={L_search}",), ) conn.commit() diff --git a/sqlite-vec-rescore.c b/sqlite-vec-rescore.c index 5432612..6a47214 100644 --- a/sqlite-vec-rescore.c +++ 
b/sqlite-vec-rescore.c @@ -351,7 +351,9 @@ static int rescore_knn(vec0_vtab *p, vec0_cursor *pCur, (void)pCur; (void)aMetadataIn; int rc = SQLITE_OK; - int oversample = vector_column->rescore.oversample; + int oversample = vector_column->rescore.oversample_search > 0 + ? vector_column->rescore.oversample_search + : vector_column->rescore.oversample; i64 k_oversample = k * oversample; if (k_oversample > 4096) k_oversample = 4096; @@ -640,6 +642,27 @@ cleanup: return rc; } +/** + * Handle FTS5-style command dispatch for rescore parameters. + * Returns SQLITE_OK if handled, SQLITE_EMPTY if not a rescore command. + */ +static int rescore_handle_command(vec0_vtab *p, const char *command) { + if (strncmp(command, "oversample=", 11) == 0) { + int val = atoi(command + 11); + if (val < 1) { + vtab_set_error(&p->base, "oversample must be >= 1"); + return SQLITE_ERROR; + } + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) { + p->vector_columns[i].rescore.oversample_search = val; + } + } + return SQLITE_OK; + } + return SQLITE_EMPTY; +} + #ifdef SQLITE_VEC_TEST void _test_rescore_quantize_float_to_bit(const float *src, uint8_t *dst, size_t dim) { rescore_quantize_float_to_bit(src, dst, dim); diff --git a/sqlite-vec.c b/sqlite-vec.c index 16c3b4d..40fe0bf 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -2588,7 +2588,8 @@ enum Vec0RescoreQuantizerType { struct Vec0RescoreConfig { enum Vec0RescoreQuantizerType quantizer_type; - int oversample; + int oversample; // CREATE-time default + int oversample_search; // runtime override (0 = use default) }; #endif @@ -3399,8 +3400,9 @@ static sqlite3_module vec_eachModule = { #define VEC0_COLUMN_ID 0 #define VEC0_COLUMN_USERN_START 1 -#define VEC0_COLUMN_OFFSET_DISTANCE 1 -#define VEC0_COLUMN_OFFSET_K 2 +#define VEC0_COLUMN_OFFSET_COMMAND 1 +#define VEC0_COLUMN_OFFSET_DISTANCE 2 +#define VEC0_COLUMN_OFFSET_K 3 #define VEC0_SHADOW_INFO_NAME "\"%w\".\"%w_info\"" @@ -3498,6 
+3500,10 @@ struct vec0_vtab { // Will change the schema of the _rowids table, and insert/query logic. int pkIsText; + // True if the hidden command column (named after the table) exists. + // Tables created before v0.1.10 or without _info table don't have it. + int hasCommandColumn; + // number of defined vector columns. int numVectorColumns; @@ -3777,20 +3783,19 @@ int vec0_num_defined_user_columns(vec0_vtab *p) { * @param p vec0 table * @return int */ -int vec0_column_distance_idx(vec0_vtab *p) { - return VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1) + - VEC0_COLUMN_OFFSET_DISTANCE; +int vec0_column_command_idx(vec0_vtab *p) { + // Command column is the first hidden column (right after user columns) + return VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(p); +} + +int vec0_column_distance_idx(vec0_vtab *p) { + int base = VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(p); + return base + (p->hasCommandColumn ? 1 : 0); } -/** - * @brief Returns the index of the k hidden column for the given vec0 table. - * - * @param p vec0 table - * @return int k column index - */ int vec0_column_k_idx(vec0_vtab *p) { - return VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1) + - VEC0_COLUMN_OFFSET_K; + int base = VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(p); + return base + (p->hasCommandColumn ? 2 : 1); } /** @@ -5205,6 +5210,74 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } } + // Determine whether to add the FTS5-style hidden command column. + // New tables (isCreate) always get it; existing tables only if created + // with v0.1.10+ (which validated no column name == table name). 
+ int hasCommandColumn = 0; + if (isCreate) { + // Validate no user column name conflicts with the table name + const char *tblName = argv[2]; + int tblNameLen = (int)strlen(tblName); + for (int i = 0; i < numVectorColumns; i++) { + if (pNew->vector_columns[i].name_length == tblNameLen && + sqlite3_strnicmp(pNew->vector_columns[i].name, tblName, tblNameLen) == 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "column name '%s' conflicts with table name (reserved for command column)", + tblName); + goto error; + } + } + for (int i = 0; i < numPartitionColumns; i++) { + if (pNew->paritition_columns[i].name_length == tblNameLen && + sqlite3_strnicmp(pNew->paritition_columns[i].name, tblName, tblNameLen) == 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "column name '%s' conflicts with table name (reserved for command column)", + tblName); + goto error; + } + } + for (int i = 0; i < numAuxiliaryColumns; i++) { + if (pNew->auxiliary_columns[i].name_length == tblNameLen && + sqlite3_strnicmp(pNew->auxiliary_columns[i].name, tblName, tblNameLen) == 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "column name '%s' conflicts with table name (reserved for command column)", + tblName); + goto error; + } + } + for (int i = 0; i < numMetadataColumns; i++) { + if (pNew->metadata_columns[i].name_length == tblNameLen && + sqlite3_strnicmp(pNew->metadata_columns[i].name, tblName, tblNameLen) == 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "column name '%s' conflicts with table name (reserved for command column)", + tblName); + goto error; + } + } + hasCommandColumn = 1; + } else { + // xConnect: check _info shadow table for version + sqlite3_stmt *stmtInfo = NULL; + char *zInfoSql = sqlite3_mprintf( + "SELECT value FROM " VEC0_SHADOW_INFO_NAME " WHERE key = 'CREATE_VERSION_PATCH'", + argv[1], argv[2]); + if (zInfoSql) { + int infoRc = sqlite3_prepare_v2(db, zInfoSql, -1, &stmtInfo, NULL); + sqlite3_free(zInfoSql); + if (infoRc == 
SQLITE_OK && sqlite3_step(stmtInfo) == SQLITE_ROW) { + int patch = sqlite3_column_int(stmtInfo, 0); + hasCommandColumn = (patch >= 10); // v0.1.10+ + } + // If _info doesn't exist or has no version, assume old table + sqlite3_finalize(stmtInfo); + } + } + pNew->hasCommandColumn = hasCommandColumn; + sqlite3_str *createStr = sqlite3_str_new(NULL); sqlite3_str_appendall(createStr, "CREATE TABLE x("); if (pkColumnName) { @@ -5246,7 +5319,11 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } } - sqlite3_str_appendall(createStr, " distance hidden, k hidden) "); + if (hasCommandColumn) { + sqlite3_str_appendf(createStr, " \"%w\" hidden, distance hidden, k hidden) ", argv[2]); + } else { + sqlite3_str_appendall(createStr, " distance hidden, k hidden) "); + } if (pkColumnName) { sqlite3_str_appendall(createStr, "without rowid "); } @@ -10161,25 +10238,31 @@ static int vec0Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, } // INSERT operation else if (argc > 1 && sqlite3_value_type(argv[0]) == SQLITE_NULL) { -#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE || SQLITE_VEC_ENABLE_DISKANN - // Check for command inserts: INSERT INTO t(rowid) VALUES ('command-string') - // The id column holds the command string. 
- sqlite3_value *idVal = argv[2 + VEC0_COLUMN_ID]; - if (sqlite3_value_type(idVal) == SQLITE_TEXT) { - const char *cmd = (const char *)sqlite3_value_text(idVal); - vec0_vtab *p = (vec0_vtab *)pVTab; - int cmdRc = SQLITE_EMPTY; + vec0_vtab *p = (vec0_vtab *)pVTab; + // FTS5-style command dispatch via hidden column named after table + if (p->hasCommandColumn) { + sqlite3_value *cmdVal = argv[2 + vec0_column_command_idx(p)]; + if (sqlite3_value_type(cmdVal) == SQLITE_TEXT) { + const char *cmd = (const char *)sqlite3_value_text(cmdVal); + int cmdRc = SQLITE_EMPTY; +#if SQLITE_VEC_ENABLE_RESCORE + cmdRc = rescore_handle_command(p, cmd); +#endif #if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE - cmdRc = ivf_handle_command(p, cmd, argc, argv); + if (cmdRc == SQLITE_EMPTY) + cmdRc = ivf_handle_command(p, cmd, argc, argv); #endif #if SQLITE_VEC_ENABLE_DISKANN - if (cmdRc == SQLITE_EMPTY) - cmdRc = diskann_handle_command(p, cmd); + if (cmdRc == SQLITE_EMPTY) + cmdRc = diskann_handle_command(p, cmd); #endif - if (cmdRc != SQLITE_EMPTY) return cmdRc; // handled (or error) - // SQLITE_EMPTY means not a recognized command — fall through to normal insert + if (cmdRc == SQLITE_EMPTY) { + vtab_set_error(pVTab, "unknown vec0 command: '%s'", cmd); + return SQLITE_ERROR; + } + return cmdRc; + } } -#endif return vec0Update_Insert(pVTab, argc, argv, pRowid); } // UPDATE operation diff --git a/tests/fixtures/legacy-v0.1.6.db b/tests/fixtures/legacy-v0.1.6.db new file mode 100644 index 0000000000000000000000000000000000000000..58bd89d250a0907420336b40abbf14615b1dc244 GIT binary patch literal 106496 zcmeI*Piz$D9mesQo!xhL*6Vp~46F?p#^#?FjIkjOq^cCJ=_)KSHZ=x`MwM$|Y2ARG z*oMR$(p{gb9IA*@MJ|XOQsva5Mv71lz^Ou2R3SyGI3W&A#UY9~HB}@!^Uk}d`?9!I zm14wxUoCp(H}81neLvo{+1c6k(I1aYFO>WwTRhwO1SSI6E-+L#T6z1|cjwRg1i{lf=@+1207OeM=9W9Q& z<_CAE{+_Trb9Q!W;fPa79eUopV=v#EteyGt8&~f^ z|M~9zMoMI7i%(-&{wR|J%lD@$yF@a(Z=YRRtT+AWyKn3wS}B&D?EmdiawH+LJ9pYA z)M)i%f4=ng`O?g(l2ImVMrd^Q%uJmt^#)&=s}$o(U}>P9K=k-B1frGkn*Ddz-R2oT 
zFyDzqKbhCmn@aeQF$Ni~iY+^haEt%qp<8g4`;lDds88H6!vi_NFSk_q3+}X?%;s{b zN}|p@YImjwU;B8k-Q=kc{pR3EarvPozmi|bPlE*-0tg_000IagfB*srAbN6+mij~B*|7RN>>kK`u~928l{8FY=F!^P3D@odZ)Oqm_|LqUh_e z^u4TJ)R!Mw@@x5}{5)8oA%Fk^2q1s}0tg_000IagfWS%!v_IuEK)g`>j=*R z#9E!4ao!+&{onRltl&mN009ILKmY**5I_I{1Q2K-ftzVNWyO{aB7J3A z^&RFhefa7~s_#Ni==DNpQm49pK3H~j9lppNwx%eojLz`MZ`iyjEUB~lT)0)&t8}*M zT&;7B&b2zTI@@)2=&Zhre!Z?Y=+I2))440V3Ft( z7dmWbpI!Ure*Hl$d)i~pKG*1qz9<-WI_mKpr^^oa7JWKJIy?ok$D6a{xALKUAn(h& z@{YVMZ^`TOn*2mwm6zou`Mz9|7v-G%r93HrBFE(+IV^u5_sAWxTdtF=xZByF`~tU-&9FiA^duij68eMW@OQVuQ-{V!g_BVx7tk(V?jWr7R@R%BBQcNG^tFBv`SaF zDpMk*GAWWOg%BzeBB3%a;wl~CsEj3IVY&%Bj8t^}zqGvSwGRz+Ph_eHAbKjJ=C!_w0QUBGb?-}*qjrtp-{zq6h zg>Q|p<#$$aqalC*0tg_000IagfB*srAb`LNDBw75_z4MVYnPr}pU&Mn`*jZJ+@q5~ z009ILKmY**5I_I{1Q0*~0R#|0009ILKmY**5I_I{1Q0*~f#*%2`qvPu;~K6mL>V{g zlu>7ly2Yqh8+E%;Z!l`#sJ9q(k5Ts;b-z)+WYqhO`iDk6WYj-0>VrmIt3`zX0tg_0 z00IagfB*srAb`MgDp36^h|lTp7#0EuAbg0^`2I1@fw%1|>HyQ#6Ab