diff --git a/.gitignore b/.gitignore index 0268d5d..ef549f4 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,6 @@ poetry.lock memstat.c memstat.* + + +.DS_Store \ No newline at end of file diff --git a/Makefile b/Makefile index 1ebdbed..175ab16 100644 --- a/Makefile +++ b/Makefile @@ -37,11 +37,18 @@ endif ifndef OMIT_SIMD ifeq ($(shell uname -sm),Darwin x86_64) - CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX + CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX endif ifeq ($(shell uname -sm),Darwin arm64) CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON endif + ifeq ($(shell uname -s),Linux) + ifeq ($(findstring android,$(CC)),) + ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),) + CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX + endif + endif + endif endif ifdef USE_BREW_SQLITE @@ -155,6 +162,13 @@ clean: rm -rf dist +TARGET_AMALGAMATION=$(prefix)/sqlite-vec.c + +amalgamation: $(TARGET_AMALGAMATION) + +$(TARGET_AMALGAMATION): sqlite-vec.c $(wildcard sqlite-vec-*.c) scripts/amalgamate.py $(prefix) + python3 scripts/amalgamate.py sqlite-vec.c > $@ + FORMAT_FILES=sqlite-vec.h sqlite-vec.c format: $(FORMAT_FILES) clang-format -i $(FORMAT_FILES) @@ -174,7 +188,7 @@ evidence-of: test: sqlite3 :memory: '.read test.sql' -.PHONY: version loadable static test clean gh-release evidence-of install uninstall +.PHONY: version loadable static test clean gh-release evidence-of install uninstall amalgamation publish-release: ./scripts/publish-release.sh @@ -190,7 +204,22 @@ test-loadable-watch: watchexec --exts c,py,Makefile --clear -- make test-loadable test-unit: - $(CC) -DSQLITE_CORE -DSQLITE_VEC_TEST tests/test-unit.c sqlite-vec.c vendor/sqlite3.c -I./ -Ivendor -o $(prefix)/test-unit && $(prefix)/test-unit + $(CC) -DSQLITE_CORE -DSQLITE_VEC_TEST -DSQLITE_VEC_ENABLE_RESCORE -DSQLITE_VEC_ENABLE_DISKANN=1 tests/test-unit.c sqlite-vec.c vendor/sqlite3.c -I./ -Ivendor $(CFLAGS) -o $(prefix)/test-unit && $(prefix)/test-unit + +# Standalone sqlite3 
CLI with vec0 compiled in. Useful for benchmarking, +# profiling (has debug symbols), and scripting without .load_extension. +# make cli +# dist/sqlite3 :memory: "SELECT vec_version()" +# dist/sqlite3 < script.sql +cli: sqlite-vec.h $(prefix) + $(CC) -O2 -g \ + -DSQLITE_CORE \ + -DSQLITE_EXTRA_INIT=core_init \ + -DSQLITE_THREADSAFE=0 \ + -Ivendor/ -I./ \ + $(CFLAGS) \ + vendor/sqlite3.c vendor/shell.c sqlite-vec.c examples/sqlite3-cli/core_init.c \ + -ldl -lm -o $(prefix)/sqlite3 fuzz-build: $(MAKE) -C tests/fuzz all diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..4c3cc19 --- /dev/null +++ b/TODO.md @@ -0,0 +1,73 @@ +# TODO: `ann` base branch + consolidated benchmarks + +## 1. Create `ann` branch with shared code + +### 1.1 Branch setup +- [x] `git checkout -B ann origin/main` +- [x] Cherry-pick `624f998` (vec0_distance_full shared distance dispatch) +- [x] Cherry-pick stdint.h fix for test header +- [ ] Pull NEON cosine optimization from ivf-yolo3 into shared code + - Currently only in ivf branch but is general-purpose (benefits all distance calcs) + - Lives in `distance_cosine_float()` — ~57 lines of ARM NEON vectorized cosine + +### 1.2 Benchmark infrastructure (`benchmarks-ann/`) +- [x] Seed data pipeline (`seed/Makefile`, `seed/build_base_db.py`) +- [x] Ground truth generator (`ground_truth.py`) +- [x] Results schema (`schema.sql`) +- [x] Benchmark runner with `INDEX_REGISTRY` extension point (`bench.py`) + - Baseline configs (float, int8-rescore, bit-rescore) implemented + - Index branches register their types via `INDEX_REGISTRY` dict +- [x] Makefile with baseline targets +- [x] README + +### 1.3 Rebase feature branches onto `ann` +- [x] Rebase `diskann-yolo2` onto `ann` (1 commit: DiskANN implementation) +- [x] Rebase `ivf-yolo3` onto `ann` (1 commit: IVF implementation) +- [x] Rebase `annoy-yolo2` onto `ann` (2 commits: Annoy implementation + schema fix) +- [x] Verify each branch has only its index-specific commits remaining +- [ ] 
Force-push all 4 branches to origin + +--- + +## 2. Per-branch: register index type in benchmarks + +Each index branch should add to `benchmarks-ann/` when rebased onto `ann`: + +### 2.1 Register in `bench.py` + +Add an `INDEX_REGISTRY` entry. Each entry provides: +- `defaults` — default param values +- `create_table_sql(params)` — CREATE VIRTUAL TABLE with INDEXED BY clause +- `insert_sql(params)` — custom insert SQL, or None for default +- `post_insert_hook(conn, params)` — training/building step, returns time +- `run_query(conn, params, query, k)` — custom query, or None for default MATCH +- `describe(params)` — one-line description for report output + +### 2.2 Add configs to `Makefile` + +Append index-specific config variables and targets. Example pattern: + +```makefile +DISKANN_CONFIGS = \ + "diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \ + ... + +ALL_CONFIGS += $(DISKANN_CONFIGS) + +bench-diskann: seed + $(BENCH) --subset-size 10000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS) + ... +``` + +### 2.3 Migrate existing benchmark results/docs + +- Move useful results docs (RESULTS.md, etc.) into `benchmarks-ann/results/` +- Delete redundant per-branch benchmark directories once consolidated infra is proven + +--- + +## 3. 
Future improvements + +- [ ] Reporting script (`report.py`) — query results.db, produce markdown comparison tables +- [ ] Profiling targets in Makefile (lift from ivf-yolo3's Instruments/perf wrappers) +- [ ] Pre-computed ground truth integration (use GT DB files instead of on-the-fly brute-force) diff --git a/VERSION b/VERSION index 699c6c6..99c0cc4 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.8 +0.1.10-alpha.3 \ No newline at end of file diff --git a/benchmarks-ann/.gitignore b/benchmarks-ann/.gitignore new file mode 100644 index 0000000..95707b9 --- /dev/null +++ b/benchmarks-ann/.gitignore @@ -0,0 +1,8 @@ +*.db +*.db-shm +*.db-wal +*.parquet +runs/ + +viewer/ +searcher/ \ No newline at end of file diff --git a/benchmarks-ann/Makefile b/benchmarks-ann/Makefile new file mode 100644 index 0000000..9ae456e --- /dev/null +++ b/benchmarks-ann/Makefile @@ -0,0 +1,85 @@ +BENCH = python bench.py +BASE_DB = cohere1m/base.db +EXT = ../dist/vec0 + +# --- Baseline (brute-force) configs --- +BASELINES = \ + "brute-float:type=vec0-flat,variant=float" \ + "brute-int8:type=vec0-flat,variant=int8" \ + "brute-bit:type=vec0-flat,variant=bit" + +# --- IVF configs --- +IVF_CONFIGS = \ + "ivf-n32-p8:type=ivf,nlist=32,nprobe=8" \ + "ivf-n128-p16:type=ivf,nlist=128,nprobe=16" \ + "ivf-n512-p32:type=ivf,nlist=512,nprobe=32" + +RESCORE_CONFIGS = \ + "rescore-bit-os8:type=rescore,quantizer=bit,oversample=8" \ + "rescore-bit-os16:type=rescore,quantizer=bit,oversample=16" \ + "rescore-int8-os8:type=rescore,quantizer=int8,oversample=8" + +# --- DiskANN configs --- +DISKANN_CONFIGS = \ + "diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \ + "diskann-R72-binary:type=diskann,R=72,L=128,quantizer=binary" \ + "diskann-R72-int8:type=diskann,R=72,L=128,quantizer=int8" \ + "diskann-R72-L256:type=diskann,R=72,L=256,quantizer=binary" + +ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS) $(IVF_CONFIGS) $(DISKANN_CONFIGS) + +.PHONY: seed ground-truth bench-smoke bench-rescore bench-ivf 
bench-diskann bench-10k bench-50k bench-100k bench-all \ + report clean + +# --- Data preparation --- +seed: + $(MAKE) -C cohere1m + +ground-truth: seed + python ground_truth.py --subset-size 10000 + python ground_truth.py --subset-size 50000 + python ground_truth.py --subset-size 100000 + +# --- Quick smoke test --- +bench-smoke: seed + $(BENCH) --subset-size 5000 -k 10 -n 20 --dataset cohere1m -o runs \ + "brute-float:type=vec0-flat,variant=float" \ + "ivf-quick:type=ivf,nlist=16,nprobe=4" \ + "diskann-quick:type=diskann,R=48,L=64,quantizer=binary" + +bench-rescore: seed + $(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs \ + $(RESCORE_CONFIGS) + + +# --- Standard sizes --- +bench-10k: seed + $(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs $(ALL_CONFIGS) + +bench-50k: seed + $(BENCH) --subset-size 50000 -k 10 --dataset cohere1m -o runs $(ALL_CONFIGS) + +bench-100k: seed + $(BENCH) --subset-size 100000 -k 10 --dataset cohere1m -o runs $(ALL_CONFIGS) + +bench-all: bench-10k bench-50k bench-100k + +# --- IVF across sizes --- +bench-ivf: seed + $(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(IVF_CONFIGS) + $(BENCH) --subset-size 50000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(IVF_CONFIGS) + $(BENCH) --subset-size 100000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(IVF_CONFIGS) + +# --- DiskANN across sizes --- +bench-diskann: seed + $(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(DISKANN_CONFIGS) + $(BENCH) --subset-size 50000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(DISKANN_CONFIGS) + $(BENCH) --subset-size 100000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(DISKANN_CONFIGS) + +# --- Report --- +report: + @echo "Use: sqlite3 runs/cohere1m//results.db 'SELECT run_id, config_name, status, recall FROM runs JOIN run_results USING(run_id)'" + +# --- Cleanup --- +clean: + rm -rf runs/ diff --git a/benchmarks-ann/README.md b/benchmarks-ann/README.md new file mode 100644 
index 0000000..88f1c74 --- /dev/null +++ b/benchmarks-ann/README.md @@ -0,0 +1,111 @@ +# KNN Benchmarks for sqlite-vec + +Benchmarking infrastructure for vec0 KNN configurations. Includes brute-force +baselines (float, int8, bit), rescore, IVF, and DiskANN index types. + +## Datasets + +Each dataset is a subdirectory containing a `Makefile` and `build_base_db.py` +that produce a `base.db`. The benchmark runner auto-discovers any subdirectory +with a `base.db` file. + +``` +cohere1m/ # Cohere 768d cosine, 1M vectors + Makefile # downloads parquets from Zilliz, builds base.db + build_base_db.py + base.db # (generated) + +cohere10m/ # Cohere 768d cosine, 10M vectors (10 train shards) + Makefile # make -j12 download to fetch all shards in parallel + build_base_db.py + base.db # (generated) +``` + +Every `base.db` has the same schema: + +| Table | Columns | Description | +|-------|---------|-------------| +| `train` | `id INTEGER PRIMARY KEY, vector BLOB` | Indexed vectors (f32 blobs) | +| `query_vectors` | `id INTEGER PRIMARY KEY, vector BLOB` | Query vectors for KNN evaluation | +| `neighbors` | `query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT` | Ground-truth nearest neighbors | + +To add a new dataset, create a directory with a `Makefile` that builds `base.db` +with the tables above. It will be available via `--dataset <name>` automatically. + +### Building datasets + +```bash +# Cohere 1M +cd cohere1m && make download && make && cd .. + +# Cohere 10M (parallel download recommended — 10 train shards + test + neighbors) +cd cohere10m && make -j12 download && make && cd .. +``` + +## Prerequisites + +- Built `dist/vec0` extension (run `make loadable` from repo root) +- Python 3.10+ +- `uv` + +## Quick start + +```bash +# 1. Build a dataset +cd cohere1m && make && cd .. + +# 2. Quick smoke test (5k vectors) +make bench-smoke + +# 3. 
Full benchmark at 10k +make bench-10k +``` + +## Usage + +```bash +uv run python bench.py --subset-size 10000 -k 10 -n 50 --dataset cohere1m \ + "brute-float:type=baseline,variant=float" \ + "rescore-bit-os8:type=rescore,quantizer=bit,oversample=8" +``` + +### Config format + +`name:type=<index_type>,key=val,key=val` + +| Index type | Keys | +|-----------|------| +| `baseline` | `variant` (float/int8/bit), `oversample` | +| `rescore` | `quantizer` (bit/int8), `oversample` | +| `ivf` | `nlist`, `nprobe` | +| `diskann` | `R`, `L`, `quantizer` (binary/int8), `buffer_threshold` | + +### Make targets + +| Target | Description | +|--------|-------------| +| `make seed` | Download and build default dataset | +| `make bench-smoke` | Quick 5k test (3 configs) | +| `make bench-10k` | All configs at 10k vectors | +| `make bench-50k` | All configs at 50k vectors | +| `make bench-100k` | All configs at 100k vectors | +| `make bench-all` | 10k + 50k + 100k | +| `make bench-ivf` | Baselines + IVF across 10k/50k/100k | +| `make bench-diskann` | Baselines + DiskANN across 10k/50k/100k | + +## Results DB + +Each run writes to `runs/<dataset>/<subset_size>/results.db` (SQLite, WAL mode). +Progress is written continuously — query from another terminal to monitor: + +```bash +sqlite3 runs/cohere1m/10000/results.db "SELECT run_id, config_name, status FROM runs" +``` + +See `results_schema.sql` for the full schema (tables: `runs`, `run_results`, +`insert_batches`, `queries`). + +## Adding an index type + +Add an entry to `INDEX_REGISTRY` in `bench.py` and append configs to +`ALL_CONFIGS` in the `Makefile`. See existing entries for the pattern. 
diff --git a/benchmarks-ann/bench-delete/.gitignore b/benchmarks-ann/bench-delete/.gitignore new file mode 100644 index 0000000..0184df8 --- /dev/null +++ b/benchmarks-ann/bench-delete/.gitignore @@ -0,0 +1,3 @@ +runs/ +*.db +__pycache__/ diff --git a/benchmarks-ann/bench-delete/Makefile b/benchmarks-ann/bench-delete/Makefile new file mode 100644 index 0000000..681847b --- /dev/null +++ b/benchmarks-ann/bench-delete/Makefile @@ -0,0 +1,41 @@ +BENCH = python bench_delete.py +EXT = ../../dist/vec0 + +# --- Configs to test --- +FLAT = "flat:type=vec0-flat,variant=float" +RESCORE_BIT = "rescore-bit:type=rescore,quantizer=bit,oversample=8" +RESCORE_INT8 = "rescore-int8:type=rescore,quantizer=int8,oversample=8" +DISKANN_R48 = "diskann-R48:type=diskann,R=48,L=128,quantizer=binary" +DISKANN_R72 = "diskann-R72:type=diskann,R=72,L=128,quantizer=binary" + +ALL_CONFIGS = $(FLAT) $(RESCORE_BIT) $(RESCORE_INT8) $(DISKANN_R48) $(DISKANN_R72) + +DELETE_PCTS = 5,10,25,50,75,90 + +.PHONY: smoke bench-10k bench-50k bench-all report clean + +# Quick smoke test (small dataset, few queries) +smoke: + $(BENCH) --subset-size 5000 --delete-pct 10,50 -k 10 -n 20 \ + --dataset cohere1m --ext $(EXT) \ + $(FLAT) $(DISKANN_R48) + +# Standard benchmarks +bench-10k: + $(BENCH) --subset-size 10000 --delete-pct $(DELETE_PCTS) -k 10 -n 50 \ + --dataset cohere1m --ext $(EXT) $(ALL_CONFIGS) + +bench-50k: + $(BENCH) --subset-size 50000 --delete-pct $(DELETE_PCTS) -k 10 -n 50 \ + --dataset cohere1m --ext $(EXT) $(ALL_CONFIGS) + +bench-all: bench-10k bench-50k + +# Query saved results +report: + @echo "Query results:" + @echo " sqlite3 runs/cohere1m/10000/delete_results.db \\" + @echo " \"SELECT config_name, delete_pct, recall, query_mean_ms, vacuum_size_mb FROM delete_runs ORDER BY config_name, delete_pct\"" + +clean: + rm -rf runs/ diff --git a/benchmarks-ann/bench-delete/README.md b/benchmarks-ann/bench-delete/README.md new file mode 100644 index 0000000..8155566 --- /dev/null +++ 
b/benchmarks-ann/bench-delete/README.md @@ -0,0 +1,69 @@ +# bench-delete: Recall degradation after random deletion + +Measures how KNN recall changes after deleting a random percentage of rows +from different index types (flat, rescore, DiskANN). + +## Quick start + +```bash +# Ensure dataset exists +make -C ../datasets/cohere1m + +# Ensure extension is built +make -C ../.. loadable + +# Quick smoke test +make smoke + +# Full benchmark at 10k vectors +make bench-10k +``` + +## Usage + +```bash +python bench_delete.py --subset-size 10000 --delete-pct 10,25,50,75 \ + "flat:type=vec0-flat,variant=float" \ + "diskann-R72:type=diskann,R=72,L=128,quantizer=binary" \ + "rescore-bit:type=rescore,quantizer=bit,oversample=8" +``` + +## What it measures + +For each config and delete percentage: + +| Metric | Description | +|--------|-------------| +| **recall** | KNN recall@k after deletion (ground truth recomputed over surviving rows) | +| **delta** | Recall change vs 0% baseline | +| **query latency** | Mean/median query time after deletion | +| **db_size_mb** | DB file size before VACUUM | +| **vacuum_size_mb** | DB file size after VACUUM (space reclaimed) | +| **delete_time_s** | Wall time for the DELETE operations | + +## How it works + +1. Build index with N vectors (one copy per config) +2. Measure recall at k=10 (pre-delete baseline) +3. For each delete %: + - Copy the master DB + - Delete a random selection of rows (deterministic seed) + - Measure recall (ground truth recomputed over surviving rows only) + - VACUUM and measure size savings +4. 
Print comparison table + +## Expected behavior + +- **Flat index**: Recall should be 1.0 at all delete percentages (brute-force is always exact) +- **Rescore**: Recall should stay close to baseline (quantized scan + rescore is robust) +- **DiskANN**: Recall may degrade at high delete % due to graph fragmentation (dangling edges, broken connectivity) + +## Results DB + +Results are stored in `runs/<dataset>/<subset_size>/delete_results.db`: + +```sql +SELECT config_name, delete_pct, recall, vacuum_size_mb +FROM delete_runs +ORDER BY config_name, delete_pct; +``` diff --git a/benchmarks-ann/bench-delete/bench_delete.py b/benchmarks-ann/bench-delete/bench_delete.py new file mode 100644 index 0000000..0ebd2ec --- /dev/null +++ b/benchmarks-ann/bench-delete/bench_delete.py @@ -0,0 +1,593 @@ +#!/usr/bin/env python3 +"""Benchmark: measure recall degradation after random row deletion. + +Given a dataset and index config, this script: + 1. Builds the index (flat + ANN) + 2. Measures recall at k=10 (pre-delete baseline) + 3. Deletes a random % of rows + 4. Measures recall again (post-delete) + 5. 
def now_ns():
    """Return a monotonic timestamp in integer nanoseconds.

    The callers visible in this script only ever subtract two readings to
    time an interval, so the value comes from ``time.perf_counter_ns()``
    rather than the wall clock (``time.time_ns()``): it cannot jump when the
    system clock is adjusted.  The absolute value has no meaning on its own;
    only differences between two readings do.
    """
    return time.perf_counter_ns()


def ns_to_s(ns):
    """Convert a nanosecond count to seconds (float)."""
    return ns / 1_000_000_000


def ns_to_ms(ns):
    """Convert a nanosecond count to milliseconds (float)."""
    return ns / 1_000_000
def _rescore_create(p):
    """Build the CREATE VIRTUAL TABLE statement for a rescore-indexed table.

    Reads ``dimensions`` (required) plus ``quantizer`` (default ``"bit"``)
    and ``oversample`` (default ``8``) from the params dict *p*.
    """
    width = p["dimensions"]
    quantizer = p.get("quantizer", "bit")
    oversample = p.get("oversample", 8)
    column = (
        f"embedding float[{width}] "
        f"indexed by rescore(quantizer={quantizer}, oversample={oversample})"
    )
    return (
        "CREATE VIRTUAL TABLE vec_items USING vec0("
        f"id INTEGER PRIMARY KEY, {column})"
    )


def _diskann_create(p):
    """Build the CREATE VIRTUAL TABLE statement for a DiskANN-indexed table.

    Reads ``dimensions`` (required) and the optional keys ``R``, ``L``,
    ``quantizer``, ``buffer_threshold``, ``search_list_size_insert`` and
    ``search_list_size_search`` from the params dict *p*.
    """
    width = p["dimensions"]
    options = [
        f"neighbor_quantizer={p.get('quantizer', 'binary')}",
        f"n_neighbors={p.get('R', 72)}",
        f"buffer_threshold={p.get('buffer_threshold', 0)}",
    ]
    per_path = [
        f"{key}={p[key]}"
        for key in ("search_list_size_insert", "search_list_size_search")
        if p.get(key, 0)
    ]
    # Per-path overrides replace the shared search_list_size instead of being
    # emitted alongside it.
    options.extend(per_path or [f"search_list_size={p.get('L', 128)}"])
    column = f"embedding float[{width}] indexed by diskann({', '.join(options)})"
    return (
        "CREATE VIRTUAL TABLE vec_items USING vec0("
        f"id INTEGER PRIMARY KEY, {column})"
    )


def _ivf_create(p):
    """Build the CREATE VIRTUAL TABLE statement for an IVF-indexed table.

    ``quantizer`` is only emitted when it is not ``"none"`` and
    ``oversample`` only when it is greater than 1; ``nlist`` and ``nprobe``
    are always emitted (defaults 128 and 16).
    """
    width = p["dimensions"]
    quantizer = p.get("quantizer", "none")
    oversample = p.get("oversample", 1)
    options = [f"nlist={p.get('nlist', 128)}", f"nprobe={p.get('nprobe', 16)}"]
    if quantizer != "none":
        options += [f"quantizer={quantizer}"]
    if oversample > 1:
        options += [f"oversample={oversample}"]
    column = f"embedding float[{width}] indexed by ivf({', '.join(options)})"
    return (
        "CREATE VIRTUAL TABLE vec_items USING vec0("
        f"id INTEGER PRIMARY KEY, {column})"
    )
# Config keys whose values are converted to int while parsing.
INT_KEYS = {"R", "L", "oversample", "nlist", "nprobe", "buffer_threshold",
            "search_list_size_insert", "search_list_size_search"}


def parse_config(spec):
    """Parse a ``name:type=X,key=val,...`` config spec.

    Args:
        spec: config string; everything before the first ``:`` is the config
            name, the rest is a comma-separated list of ``key=val`` pairs.

    Returns:
        ``(name, params)`` where ``params`` is the registry's ``defaults``
        for the index type overlaid with the parsed pairs, plus an
        ``"index_type"`` entry.  Values of keys listed in ``INT_KEYS`` are
        ints; all other values stay strings.

    Raises:
        ValueError: if the spec has no ``:``, a pair is not ``key=val``, an
            integer key has a non-integer value, or ``type`` is missing or
            not present in ``INDEX_REGISTRY``.
    """
    if ":" not in spec:
        raise ValueError(f"Config must be 'name:key=val,...': {spec}")
    name, rest = spec.split(":", 1)
    params = {}
    for kv in rest.split(","):
        k, sep, v = kv.partition("=")
        k = k.strip()
        v = v.strip()
        if not sep or not k:
            # Name the offending pair instead of surfacing a bare
            # tuple-unpacking error.
            raise ValueError(
                f"Config param must be 'key=val', got {kv!r} in: {spec}")
        if k in INT_KEYS:
            try:
                v = int(v)
            except ValueError:
                raise ValueError(
                    f"Config param {k!r} must be an integer, "
                    f"got {v!r} in: {spec}") from None
        params[k] = v
    index_type = params.pop("type", None)
    if not index_type or index_type not in INDEX_REGISTRY:
        raise ValueError(f"Unknown index type: {index_type}")
    params["index_type"] = index_type
    # Start from the registry defaults so explicit params override them.
    merged = dict(INDEX_REGISTRY[index_type]["defaults"])
    merged.update(params)
    return name, merged
def _ground_truth_ids(conn, query, subset_size, k, alive_ids=None):
    """Return the IDs of the k exact L2-nearest ``base.train`` rows.

    Only rows with ``id < subset_size`` are considered.  When *alive_ids* is
    given, rows outside it are skipped, so the result is the true top-k of
    the surviving rows (fewer than k only if fewer than k rows survive).
    """
    if alive_ids is None:
        rows = conn.execute(
            "SELECT id FROM base.train WHERE id < :n "
            "ORDER BY vec_distance_l2(vector, :query) LIMIT :k",
            {"query": query, "k": k, "n": subset_size},
        ).fetchall()
        return {row[0] for row in rows}

    # No LIMIT here: a fixed candidate window (previously k * 5) can contain
    # fewer than k surviving rows once a large share of the table has been
    # deleted, which silently truncates the ground truth.  Walk the full
    # distance ordering and stop as soon as k survivors have been seen.
    survivors = set()
    cursor = conn.execute(
        "SELECT id FROM base.train WHERE id < :n "
        "ORDER BY vec_distance_l2(vector, :query)",
        {"query": query, "n": subset_size},
    )
    try:
        for (row_id,) in cursor:
            if len(survivors) >= k:
                break
            if row_id in alive_ids:
                survivors.add(row_id)
    finally:
        cursor.close()
    return survivors


def measure_recall(conn, base_db, query_vectors, subset_size, k, alive_ids=None):
    """Measure KNN recall and query latency of ``vec_items`` on *conn*.

    Each query vector is run through the ``MATCH ... AND k = :k`` KNN query
    (timed) and the returned IDs are compared with a brute-force L2 ground
    truth over ``base.train`` rows whose ``id < subset_size``.

    Args:
        conn: connection on which ``vec_items``, ``vec_distance_l2`` and the
            attached ``base`` schema are available.
        base_db: not used by this function; kept so existing callers keep
            working.
        query_vectors: iterable of ``(id, vector)`` pairs; only the vector
            is used.
        subset_size: upper bound (exclusive) on the ``base.train`` IDs that
            take part in the ground truth.
        k: number of neighbours requested and compared.
        alive_ids: optional collection of surviving IDs (a set is
            recommended for fast membership tests).  When given, the ground
            truth is the exact top-k among those IDs only, to match the
            post-delete state of the index.

    Returns:
        dict with ``recall`` (mean over queries, rounded to 4 places) and
        ``mean_ms`` / ``median_ms`` query latency (rounded to 2 places).  A
        query with an empty ground truth contributes a recall of 0.0.
    """
    recalls = []
    times_ms = []

    for _qid, query in query_vectors:
        # Monotonic clock: only the elapsed interval matters here.
        t0 = time.perf_counter_ns()
        results = conn.execute(
            "SELECT id, distance FROM vec_items "
            "WHERE embedding MATCH :query AND k = :k",
            {"query": query, "k": k},
        ).fetchall()
        times_ms.append((time.perf_counter_ns() - t0) / 1_000_000)

        result_ids = {row[0] for row in results}
        gt_ids = _ground_truth_ids(conn, query, subset_size, k, alive_ids)

        if gt_ids:
            recalls.append(len(result_ids & gt_ids) / len(gt_ids))
        else:
            recalls.append(0.0)

    return {
        "recall": round(statistics.mean(recalls), 4) if recalls else 0.0,
        "mean_ms": round(statistics.mean(times_ms), 2) if times_ms else 0.0,
        "median_ms": round(statistics.median(times_ms), 2) if times_ms else 0.0,
    }
+ create_sql = reg["create_table_sql"](params) + + results = [] + + # Build once, copy for each delete % + print(f"\n{'='*60}") + print(f"Config: {name} (type={params['index_type']})") + print(f"{'='*60}") + + os.makedirs(out_dir, exist_ok=True) + master_db_path = os.path.join(out_dir, f"{name}.{subset_size}.db") + print(f" Building index ({subset_size} vectors)...") + build_t0 = now_ns() + conn = create_bench_db(master_db_path, ext_path, base_db) + conn.execute(create_sql) + insert_loop(conn, subset_size, name) + hook = reg.get("post_insert_hook") + if hook: + print(f" Training...") + hook(conn, params) + conn.close() + build_time_s = ns_to_s(now_ns() - build_t0) + master_size = os.path.getsize(master_db_path) + print(f" Built in {build_time_s:.1f}s ({master_size / (1024*1024):.1f} MB)") + + # Load query vectors once + query_vectors = load_query_vectors(base_db, n_queries) + + # Measure pre-delete baseline on the master copy + print(f"\n --- 0% deleted (baseline) ---") + conn = sqlite3.connect(master_db_path) + conn.enable_load_extension(True) + conn.load_extension(ext_path) + conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + baseline = measure_recall(conn, base_db, query_vectors, subset_size, k) + conn.close() + print(f" recall={baseline['recall']:.4f} " + f"query={baseline['mean_ms']:.2f}ms") + + results.append({ + "name": name, + "index_type": params["index_type"], + "subset_size": subset_size, + "delete_pct": 0, + "n_deleted": 0, + "n_remaining": subset_size, + "recall": baseline["recall"], + "query_mean_ms": baseline["mean_ms"], + "query_median_ms": baseline["median_ms"], + "db_size_mb": round(master_size / (1024 * 1024), 2), + "build_time_s": round(build_time_s, 1), + "delete_time_s": 0.0, + "vacuum_size_mb": round(master_size / (1024 * 1024), 2), + }) + + # All IDs in the dataset + all_ids = list(range(subset_size)) + + for pct in sorted(delete_pcts): + n_delete = int(subset_size * pct / 100) + print(f"\n --- {pct}% deleted ({n_delete} rows) ---") + + 
# Copy master DB and work on the copy + copy_path = os.path.join(out_dir, f"{name}.{subset_size}.del{pct}.db") + shutil.copy2(master_db_path, copy_path) + # Also copy WAL/SHM if they exist + for suffix in ["-wal", "-shm"]: + src = master_db_path + suffix + if os.path.exists(src): + shutil.copy2(src, copy_path + suffix) + + conn = sqlite3.connect(copy_path) + conn.enable_load_extension(True) + conn.load_extension(ext_path) + conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + + # Pick random IDs to delete (deterministic per pct) + rng = random.Random(seed_val + pct) + to_delete = set(rng.sample(all_ids, n_delete)) + alive_ids = set(all_ids) - to_delete + + # Delete + delete_t0 = now_ns() + batch = [] + for i, rid in enumerate(to_delete): + batch.append(rid) + if len(batch) >= 500 or i == len(to_delete) - 1: + placeholders = ",".join("?" for _ in batch) + conn.execute( + f"DELETE FROM vec_items WHERE id IN ({placeholders})", + batch, + ) + conn.commit() + batch = [] + delete_time_s = ns_to_s(now_ns() - delete_t0) + + remaining = conn.execute("SELECT count(*) FROM vec_items").fetchone()[0] + pre_vacuum_size = os.path.getsize(copy_path) + print(f" deleted {n_delete} rows in {delete_time_s:.2f}s " + f"({remaining} remaining)") + + # Measure post-delete recall + post = measure_recall(conn, base_db, query_vectors, subset_size, k, + alive_ids=alive_ids) + print(f" recall={post['recall']:.4f} " + f"(delta={post['recall'] - baseline['recall']:+.4f}) " + f"query={post['mean_ms']:.2f}ms") + + # VACUUM and measure size savings — close fully, reopen without base + conn.close() + vconn = sqlite3.connect(copy_path) + vconn.execute("VACUUM") + vconn.close() + post_vacuum_size = os.path.getsize(copy_path) + saved_mb = (pre_vacuum_size - post_vacuum_size) / (1024 * 1024) + print(f" size: {pre_vacuum_size/(1024*1024):.1f} MB -> " + f"{post_vacuum_size/(1024*1024):.1f} MB after VACUUM " + f"(saved {saved_mb:.1f} MB)") + + results.append({ + "name": name, + "index_type": 
RESULTS_SCHEMA = """\
CREATE TABLE IF NOT EXISTS delete_runs (
    run_id INTEGER PRIMARY KEY,
    config_name TEXT NOT NULL,
    index_type TEXT NOT NULL,
    params TEXT,
    dataset TEXT NOT NULL,
    subset_size INTEGER NOT NULL,
    delete_pct INTEGER NOT NULL,
    n_deleted INTEGER NOT NULL,
    n_remaining INTEGER NOT NULL,
    k INTEGER NOT NULL,
    n_queries INTEGER NOT NULL,
    seed INTEGER NOT NULL,
    recall REAL,
    query_mean_ms REAL,
    query_median_ms REAL,
    db_size_mb REAL,
    vacuum_size_mb REAL,
    build_time_s REAL,
    delete_time_s REAL,
    created_at TEXT DEFAULT (datetime('now'))
);
"""

def save_results(results, out_dir, dataset, subset_size, params_json, k, n_queries, seed_val):
    """Append benchmark result rows to ``<out_dir>/delete_results.db``.

    The ``delete_runs`` table is created on first use; later calls append.

    Args:
        results: iterable of result dicts as produced by
            ``run_delete_benchmark`` (one per delete percentage).
        out_dir: existing directory that holds the results database.
        dataset: dataset name stored with every row.
        subset_size: not used directly; each row stores its own
            ``r["subset_size"]``.  Kept for interface compatibility.
        params_json: JSON text of the config params, stored verbatim.
        k: KNN k stored with every row.
        n_queries: number of queries stored with every row.
        seed_val: random seed stored with every row.

    Returns:
        Path of the results database.

    Raises:
        KeyError: if a result dict lacks one of the expected keys.
        sqlite3.Error: if the database cannot be opened or written.
    """
    db_path = os.path.join(out_dir, "delete_results.db")
    # Build every row up front so a malformed result dict fails before the
    # database is touched.
    rows = [
        (
            r["name"], r["index_type"], params_json, dataset, r["subset_size"],
            r["delete_pct"], r["n_deleted"], r["n_remaining"], k, n_queries, seed_val,
            r["recall"], r["query_mean_ms"], r["query_median_ms"],
            r["db_size_mb"], r["vacuum_size_mb"], r["build_time_s"], r["delete_time_s"],
        )
        for r in results
    ]
    db = sqlite3.connect(db_path)
    try:
        db.execute("PRAGMA journal_mode=WAL")
        db.executescript(RESULTS_SCHEMA)
        db.executemany(
            "INSERT INTO delete_runs "
            "(config_name, index_type, params, dataset, subset_size, "
            " delete_pct, n_deleted, n_remaining, k, n_queries, seed, "
            " recall, query_mean_ms, query_median_ms, "
            " db_size_mb, vacuum_size_mb, build_time_s, delete_time_s) "
            "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
            rows,
        )
        db.commit()
    finally:
        # Always release the connection, even when the schema or insert
        # step raises.
        db.close()
    return db_path
parser.add_argument("-n", type=int, default=50, + help="number of queries (default 50)") + parser.add_argument("--dataset", default="cohere1m", + choices=list(DATASETS.keys())) + parser.add_argument("--ext", default=EXT_PATH) + parser.add_argument("-o", "--out-dir", + default=os.path.join(_SCRIPT_DIR, "runs")) + parser.add_argument("--seed", type=int, default=42, + help="random seed for delete selection (default: 42)") + args = parser.parse_args() + + ds = DATASETS[args.dataset] + base_db = ds["base_db"] + dims = ds["dimensions"] + if not os.path.exists(base_db): + print(f"Error: dataset not found at {base_db}") + print(f"Run: make -C {os.path.dirname(base_db)}") + return 1 + + delete_pcts = [int(x.strip()) for x in args.delete_pct.split(",")] + for p in delete_pcts: + if not 0 < p < 100: + print(f"Error: delete percentage must be 1-99, got {p}") + return 1 + + out_dir = os.path.join(args.out_dir, args.dataset, str(args.subset_size)) + os.makedirs(out_dir, exist_ok=True) + + all_results = [] + for spec in args.configs: + name, params = parse_config(spec) + params_json = json.dumps(params) + results = run_delete_benchmark( + name, params, base_db, args.ext, args.subset_size, dims, + delete_pcts, args.k, args.n, out_dir, args.seed, + ) + all_results.extend(results) + + save_results(results, out_dir, args.dataset, args.subset_size, + params_json, args.k, args.n, args.seed) + + print_report(all_results) + + results_path = os.path.join(out_dir, "delete_results.db") + print(f"\nResults saved to: {results_path}") + print(f"Query: sqlite3 {results_path} " + f"\"SELECT config_name, delete_pct, recall, vacuum_size_mb " + f"FROM delete_runs ORDER BY config_name, delete_pct\"") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks-ann/bench-delete/test_smoke.py b/benchmarks-ann/bench-delete/test_smoke.py new file mode 100644 index 0000000..0caba19 --- /dev/null +++ b/benchmarks-ann/bench-delete/test_smoke.py @@ -0,0 +1,124 @@ 
def _f32(vals):
    """Pack the sequence *vals* into a bytes object of 32-bit floats.

    The ``struct`` format carries no byte-order prefix, so the platform's
    native byte order is used.
    """
    layout = "%df" % len(vals)
    return struct.pack(layout, *vals)
n_queries=N_QUERIES, + out_dir=out_dir, seed_val=42, + ) + + bench_delete.print_report(results) + + # Flat recall should be 1.0 at all delete % + for r in results: + assert r["recall"] == 1.0, \ + f"Flat recall should be 1.0, got {r['recall']} at {r['delete_pct']}%" + print("\n PASS: flat recall is 1.0 at all delete percentages\n") + + # Test DiskANN + print("=== Testing DiskANN ===") + name2, params2 = bench_delete.parse_config( + "diskann:type=diskann,R=8,L=32,quantizer=binary" + ) + params2["dimensions"] = DIMS + results2 = bench_delete.run_delete_benchmark( + name2, params2, base_db, EXT_PATH, + subset_size=N_TRAIN, dims=DIMS, + delete_pcts=[25, 50], k=K_NEIGHBORS, n_queries=N_QUERIES, + out_dir=out_dir, seed_val=42, + ) + + bench_delete.print_report(results2) + + # DiskANN baseline (0%) should have decent recall + baseline = results2[0] + assert baseline["recall"] > 0.0, \ + f"DiskANN baseline recall is zero" + print(f" PASS: DiskANN baseline recall={baseline['recall']}") + + # Test rescore + print("\n=== Testing rescore ===") + name3, params3 = bench_delete.parse_config( + "rescore:type=rescore,quantizer=bit,oversample=4" + ) + params3["dimensions"] = DIMS + results3 = bench_delete.run_delete_benchmark( + name3, params3, base_db, EXT_PATH, + subset_size=N_TRAIN, dims=DIMS, + delete_pcts=[25, 50], k=K_NEIGHBORS, n_queries=N_QUERIES, + out_dir=out_dir, seed_val=42, + ) + + bench_delete.print_report(results3) + print(f" PASS: rescore baseline recall={results3[0]['recall']}") + + print("\n ALL SMOKE TESTS PASSED") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks-ann/bench.py b/benchmarks-ann/bench.py new file mode 100644 index 0000000..966c458 --- /dev/null +++ b/benchmarks-ann/bench.py @@ -0,0 +1,1350 @@ +#!/usr/bin/env python3 +"""Benchmark runner for sqlite-vec KNN configurations. + +Measures insert time, build/train time, DB size, KNN latency, and recall +across different vec0 configurations. 
# Conversion factors for the nanosecond timestamps produced by now_ns().
_NS_PER_SECOND = 1_000_000_000
_NS_PER_MILLISECOND = 1_000_000


def now_ns():
    """Return the current ``time.time_ns()`` reading (integer nanoseconds)."""
    return time.time_ns()


def ns_to_s(ns):
    """Convert a nanosecond count to seconds (true division)."""
    return ns / _NS_PER_SECOND


def ns_to_ms(ns):
    """Convert a nanosecond count to milliseconds (true division)."""
    return ns / _NS_PER_MILLISECOND
def _none_create_table_sql(params):
    """Return the CREATE TABLE statement for the plain (non-vec0) ``vec_items`` table.

    The table always has ``id`` and ``embedding``. The ``int8`` and ``bit``
    variants add one extra NOT NULL BLOB column for the quantized copy; any
    other variant gets the two-column layout. No dimension appears in the
    DDL because the columns are raw BLOBs.
    """
    # Extra quantized column per variant; variants not listed have none.
    quantized_column = {
        "int8": "embedding_int8",
        "bit": "embedding_bq",
    }.get(params["variant"])

    columns = ["id INTEGER PRIMARY KEY", "embedding BLOB NOT NULL"]
    if quantized_column is not None:
        columns.append(f"{quantized_column} BLOB NOT NULL")
    return "CREATE TABLE vec_items ( " + ", ".join(columns) + ")"
def _none_run_query(conn, params, query, k):
    """Run a manual KNN query against the plain ``vec_items`` table.

    For the ``int8`` and ``bit`` variants the search is two-stage: the query
    is quantized with a SQL call, the ``k * oversample`` nearest rows are
    picked by distance over the quantized column, and those candidates are
    re-ranked with ``vec_distance_cosine`` on the full ``embedding``. Every
    other variant does a single scan ordered by cosine distance.

    Args:
        conn: connection on which the ``vec_*`` SQL functions are available.
        params: config dict; reads ``variant`` and optional ``oversample``
            (default 8).
        query: query vector, bound as ``:query``.
        k: number of rows to return.

    Returns:
        List of ``(id, distance)`` rows ordered by distance.
    """
    # variant -> (SQL that quantizes the query,
    #             bind name carrying the quantized query,
    #             coarse distance expression over the quantized column)
    coarse_plans = {
        "int8": (
            "SELECT vec_quantize_int8(:query, 'unit')",
            "q_int8",
            "vec_distance_cosine(vec_int8(:q_int8), vec_int8(embedding_int8))",
        ),
        "bit": (
            "SELECT vec_quantize_binary(:query)",
            "q_bit",
            "vec_distance_hamming(vec_bit(:q_bit), vec_bit(embedding_bq))",
        ),
    }
    plan = coarse_plans.get(params["variant"])
    oversample = params.get("oversample", 8)

    if plan is None:
        # Exact scan over the full-precision column.
        exact_sql = (
            "SELECT id, vec_distance_cosine(:query, embedding) as distance "
            "FROM vec_items WHERE distance IS NOT NULL ORDER BY 2 LIMIT :k"
        )
        cursor = conn.execute(exact_sql, {"query": query, "k": k})
        return cursor.fetchall()

    quantize_sql, bind_name, coarse_distance = plan
    quantized_query = conn.execute(quantize_sql, {"query": query}).fetchone()[0]

    two_stage_sql = (
        "WITH coarse AS ("
        " SELECT id, embedding FROM ("
        f" SELECT id, embedding, {coarse_distance} as dist "
        " FROM vec_items ORDER BY dist LIMIT :oversample_k"
        " )"
        ") "
        "SELECT id, vec_distance_cosine(:query, embedding) as distance "
        "FROM coarse ORDER BY 2 LIMIT :k"
    )
    bindings = {
        bind_name: quantized_query,
        "query": query,
        "k": k,
        "oversample_k": k * oversample,
    }
    return conn.execute(two_stage_sql, bindings).fetchall()
def _vec0flat_create_table_sql(params):
    """Return the CREATE VIRTUAL TABLE statement for a brute-force vec0 table.

    The table has a float ``embedding`` column with the cosine metric. The
    ``int8`` and ``bit`` variants add a second, quantized vector column of
    the same dimensionality. Dimensions come from ``params["_dimensions"]``
    and default to 768.
    """
    dims = params.get("_dimensions", 768)
    # Optional second vector column, keyed by variant (float has none).
    quantized_templates = {
        "int8": ", embedding_int8 int8[{dims}]",
        "bit": ", embedding_bq bit[{dims}]",
    }
    quantized_column = quantized_templates.get(params["variant"], "").format(dims=dims)

    pieces = [
        "CREATE VIRTUAL TABLE vec_items USING vec0(",
        " chunk_size=256,",
        " id integer primary key,",
        f" embedding float[{dims}] distance_metric=cosine",
        f" {quantized_column})",
    ]
    return "".join(pieces)
def _quantized_create_table_sql(params):
    """Return the CREATE VIRTUAL TABLE statement for a quantized-only vec0 table.

    The single ``embedding`` column is declared as ``int8[D]`` or ``bit[D]``
    according to ``params["quantizer"]``; ``D`` is ``params["_dimensions"]``
    (default 768).

    Raises:
        ValueError: if the quantizer is neither ``"int8"`` nor ``"bit"``.
    """
    dims = params.get("_dimensions", 768)
    quantizer = params["quantizer"]
    # Guard first: only these two element types are supported here.
    if quantizer not in ("int8", "bit"):
        raise ValueError(f"Unknown quantizer: {quantizer}")

    header = "CREATE VIRTUAL TABLE vec_items USING vec0( chunk_size=256, id integer primary key,"
    return "{} embedding {}[{}])".format(header, quantizer, dims)
def _quantized_query_sql(params):
    """Return the KNN query text for the quantized-only table.

    The query vector is quantized inside the statement to match the stored
    column type. Returns ``None`` when ``params["quantizer"]`` is neither
    ``"int8"`` nor ``"bit"``.
    """
    # SQL expression that quantizes the bound :query, keyed by quantizer.
    quantize_call = {
        "int8": "vec_quantize_int8(:query, 'unit')",
        "bit": "vec_quantize_binary(:query)",
    }.get(params["quantizer"])
    if quantize_call is None:
        return None
    return (
        "SELECT id, distance FROM vec_items "
        "WHERE embedding MATCH " + quantize_call + " AND k = :k"
    )
def _ivf_faiss_kmeans_hook(conn, params):
    """Run FAISS k-means externally, then load centroids via set-centroid commands.

    Called BEFORE any inserts — centroids are loaded first so vectors get
    assigned to partitions on insert (no assign-vectors step needed).

    The external ``faiss_kmeans.py`` script writes its centroids to a SQLite
    file inside a private temporary directory, which is removed on every
    exit path (success, subprocess failure, or a failure while reading).

    Args:
        conn: connection holding the ``vec_items`` table.
        params: config dict. Reads ``nlist``, ``train_sample``,
            ``faiss_kmeans``, ``faiss_niter`` and ``_base_db``; writes the
            training metadata to ``params["_faiss_meta"]``.

    Returns:
        Total elapsed seconds (training plus centroid loading).

    Raises:
        RuntimeError: if ``_base_db`` is missing or the script exits non-zero.
    """
    import subprocess
    import tempfile

    nlist = params["nlist"]
    ntrain = params.get("train_sample", 0) or params.get("faiss_kmeans", 10000)
    niter = params.get("faiss_niter", 20)
    base_db = params.get("_base_db")  # injected by build_index
    if not base_db:
        # Fail here with a clear message instead of a TypeError inside subprocess.
        raise RuntimeError("faiss k-means requires params['_base_db'] (set by build_index)")

    print(f" Training k-means via FAISS ({nlist} clusters, {ntrain} vectors, {niter} iters)...",
          flush=True)

    t0 = time.perf_counter()

    # A private directory replaces tempfile.mktemp(): the output path cannot
    # be pre-created by another process, and cleanup happens automatically.
    with tempfile.TemporaryDirectory(prefix="faiss-kmeans-") as scratch_dir:
        centroids_db_path = os.path.join(scratch_dir, "centroids.db")

        result = subprocess.run(
            [
                "uv", "run", "--with", "faiss-cpu", "--with", "numpy",
                "python", os.path.join(_SCRIPT_DIR, "faiss_kmeans.py"),
                "--base-db", base_db,
                "--ntrain", str(ntrain),
                "--nclusters", str(nlist),
                "--niter", str(niter),
                "-o", centroids_db_path,
            ],
            capture_output=True, text=True,
        )
        if result.returncode != 0:
            print(f" FAISS stderr: {result.stderr}", flush=True)
            raise RuntimeError(f"faiss_kmeans.py failed: {result.stderr}")

        faiss_elapsed = time.perf_counter() - t0
        print(f" FAISS k-means done in {faiss_elapsed:.1f}s", flush=True)

        # Load centroids into vec0 via set-centroid commands
        print(f" Loading {nlist} centroids into vec0...", flush=True)
        cdb = sqlite3.connect(centroids_db_path)
        try:
            centroids = cdb.execute(
                "SELECT centroid_id, centroid FROM centroids ORDER BY centroid_id"
            ).fetchall()
            meta = dict(cdb.execute("SELECT key, value FROM meta").fetchall())
        finally:
            # Close before the directory is removed, even if a query fails.
            cdb.close()

    for cid, blob in centroids:
        conn.execute(
            "INSERT INTO vec_items(vec_items, embedding) VALUES (?, ?)",
            (f"set-centroid:{cid}", blob),
        )
    conn.commit()

    elapsed = time.perf_counter() - t0
    print(f" Centroids loaded in {elapsed:.1f}s total", flush=True)

    # Stash meta for results tracking
    params["_faiss_meta"] = {
        "ntrain": meta.get("ntrain"),
        "nclusters": meta.get("nclusters"),
        "niter": meta.get("niter"),
        "faiss_elapsed_s": meta.get("elapsed_s"),
        "total_elapsed_s": round(elapsed, 3),
        "trainer": "faiss",
    }

    return elapsed
def parse_config(spec):
    """Parse ``'name:type=<index_type>,key=val,...'`` into ``(name, params_dict)``.

    Everything before the first ``:`` is the config name. The rest is a
    comma-separated list of ``key=value`` options; empty entries (for
    example from a trailing comma) are ignored. ``type`` selects the
    ``INDEX_REGISTRY`` entry and defaults to ``vec0-flat``. The registry
    entry's defaults are copied and then overridden by the given options;
    keys listed in ``INT_KEYS`` are converted to ``int``. The chosen type is
    stored under ``params["index_type"]``.

    Raises:
        ValueError: for an option without ``=``, an unknown index type, or a
            non-integer value for an integer option.
    """
    name, _, opts_str = spec.partition(":")

    raw = {}
    for token in opts_str.split(","):
        token = token.strip()
        if not token:
            continue  # tolerate "name:", trailing commas and doubled commas
        key, sep, value = token.partition("=")
        if not sep:
            raise ValueError(
                f"Invalid option {token!r} in config {spec!r}: expected key=value"
            )
        raw[key.strip()] = value.strip()

    index_type = raw.pop("type", "vec0-flat")
    if index_type not in INDEX_REGISTRY:
        raise ValueError(
            f"Unknown index type: {index_type}. "
            f"Available: {', '.join(sorted(INDEX_REGISTRY.keys()))}"
        )

    reg = INDEX_REGISTRY[index_type]
    params = dict(reg["defaults"])  # copy so the registry defaults stay untouched
    for key, value in raw.items():
        if key in INT_KEYS:
            try:
                params[key] = int(value)
            except ValueError:
                raise ValueError(
                    f"Option {key!r} in config {spec!r} must be an integer, got {value!r}"
                ) from None
        else:
            params[key] = value
    params["index_type"] = index_type

    return name, params
def _connect_with_base(db_path, ext_path, base_db, page_size=None):
    """Open *db_path*, load the extension, and attach *base_db* as ``base``.

    When *page_size* is given, ``PRAGMA page_size`` is issued after the
    extension is loaded and before the ATTACH. The connection is closed
    again if any setup step fails, so no handle is leaked.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.enable_load_extension(True)
        conn.load_extension(ext_path)
        if page_size is not None:
            # PRAGMA values cannot be bound; int() keeps the text numeric.
            conn.execute(f"PRAGMA page_size={int(page_size)}")
        # Bind the path instead of interpolating it, so a quote character
        # in the path cannot break (or alter) the statement.
        conn.execute("ATTACH DATABASE ? AS base", (str(base_db),))
    except Exception:
        conn.close()
        raise
    return conn


def create_bench_db(db_path, ext_path, base_db, page_size=4096):
    """Create a fresh benchmark DB at *db_path* and return its connection.

    Any existing file at *db_path* is removed first. The extension at
    *ext_path* is loaded, a page size other than 4096 is applied via
    ``PRAGMA page_size``, and *base_db* is attached under the schema name
    ``base``.
    """
    if os.path.exists(db_path):
        os.remove(db_path)
    return _connect_with_base(
        db_path, ext_path, base_db,
        page_size=page_size if page_size != 4096 else None,
    )


def open_existing_bench_db(db_path, ext_path, base_db):
    """Open a previously built benchmark DB and return its connection.

    The extension at *ext_path* is loaded and *base_db* is attached under
    the schema name ``base``.

    Raises:
        FileNotFoundError: if *db_path* does not exist.
    """
    if not os.path.exists(db_path):
        raise FileNotFoundError(
            f"Index DB not found: {db_path}\n"
            f"Build it first with: --phase build"
        )
    return _connect_with_base(db_path, ext_path, base_db)
def update_run_status(results_db, run_id, status):
    """Set the ``status`` column of run *run_id* in ``runs`` and commit."""
    bindings = (status, run_id)
    results_db.execute("UPDATE runs SET status=? WHERE run_id=?", bindings)
    results_db.commit()
FAISS k-means) + page_size = int(params.get("page_size", 4096)) + conn = create_bench_db(db_path, ext_path, base_db, page_size=page_size) + + reg = INDEX_REGISTRY[params["index_type"]] + + create_sql = reg["create_table_sql"](params) + conn.execute(create_sql) + + label = params["index_type"] + print(f" Inserting {subset_size} vectors...") + + sql_fn = reg.get("insert_sql") + insert_sql = sql_fn(params) if sql_fn else None + if insert_sql is None: + insert_sql = DEFAULT_INSERT_SQL + + train_sql_fn = reg.get("train_sql") + train_sql = train_sql_fn(params) if train_sql_fn else None + + query_sql_fn = reg.get("query_sql") + query_sql = query_sql_fn(params) if query_sql_fn else None + if query_sql is None: + query_sql = DEFAULT_QUERY_SQL + + # -- Insert + Training phases -- + train_sample = params.get("train_sample", 0) + hook = reg.get("post_insert_hook") + faiss_kmeans = params.get("faiss_kmeans", 0) + + train_started_ns = None + train_ended_ns = None + train_duration_ns = None + train_time_s = 0.0 + + if faiss_kmeans: + # FAISS mode: train on base.db first, load centroids, then insert all + if results_db and run_id: + update_run_status(results_db, run_id, "training") + train_started_ns = now_ns() + train_time_s = _ivf_faiss_kmeans_hook(conn, params) + train_ended_ns = now_ns() + train_duration_ns = train_ended_ns - train_started_ns + + # Now insert all vectors (they get assigned on insert) + if results_db and run_id: + update_run_status(results_db, run_id, "inserting") + insert_started_ns = now_ns() + insert_loop(conn, insert_sql, subset_size, label, + results_db=results_db, run_id=run_id) + insert_ended_ns = now_ns() + insert_duration_ns = insert_ended_ns - insert_started_ns + + elif train_sample and hook and train_sample < subset_size: + # Built-in k-means: insert sample, train, insert rest + if results_db and run_id: + update_run_status(results_db, run_id, "inserting") + insert_started_ns = now_ns() + + print(f" Inserting {train_sample} vectors (training 
sample)...") + insert_loop(conn, insert_sql, train_sample, label, + results_db=results_db, run_id=run_id) + insert_paused_ns = now_ns() + + # -- Training on sample -- + if results_db and run_id: + update_run_status(results_db, run_id, "training") + train_started_ns = now_ns() + train_time_s = hook(conn, params) + train_ended_ns = now_ns() + train_duration_ns = train_ended_ns - train_started_ns + + # -- Insert remaining vectors -- + if results_db and run_id: + update_run_status(results_db, run_id, "inserting") + print(f" Inserting remaining {subset_size - train_sample} vectors...") + insert_loop(conn, insert_sql, subset_size, label, + results_db=results_db, run_id=run_id, + start_from=train_sample) + insert_ended_ns = now_ns() + + # Insert time = total wall time minus training time + insert_duration_ns = (insert_paused_ns - insert_started_ns) + \ + (insert_ended_ns - train_ended_ns) + else: + # Standard flow: insert all, then train + if results_db and run_id: + update_run_status(results_db, run_id, "inserting") + insert_started_ns = now_ns() + + insert_loop(conn, insert_sql, subset_size, label, + results_db=results_db, run_id=run_id) + insert_ended_ns = now_ns() + insert_duration_ns = insert_ended_ns - insert_started_ns + + if hook: + if results_db and run_id: + update_run_status(results_db, run_id, "training") + train_started_ns = now_ns() + train_time_s = hook(conn, params) + train_ended_ns = now_ns() + train_duration_ns = train_ended_ns - train_started_ns + + row_count = conn.execute("SELECT count(*) FROM vec_items").fetchone()[0] + conn.close() + file_size_bytes = os.path.getsize(db_path) + + build_duration_ns = insert_duration_ns + (train_duration_ns or 0) + insert_time_s = ns_to_s(insert_duration_ns) + + # If FAISS was used for training, record its meta as train_sql + faiss_meta = params.get("_faiss_meta") + if faiss_meta: + train_sql = json.dumps(faiss_meta) + + # Write run_results (build portion) + if results_db and run_id: + results_db.execute( + "INSERT 
INTO run_results " + "(run_id, insert_started_ns, insert_ended_ns, insert_duration_ns, " + " train_started_ns, train_ended_ns, train_duration_ns, " + " build_duration_ns, db_file_size_bytes, db_file_path, " + " create_sql, insert_sql, train_sql, query_sql, k) " + "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", + ( + run_id, insert_started_ns, insert_ended_ns, insert_duration_ns, + train_started_ns, train_ended_ns, train_duration_ns, + build_duration_ns, file_size_bytes, db_path, + create_sql, insert_sql, train_sql, query_sql, k, + ), + ) + results_db.commit() + + return { + "db_path": db_path, + "insert_time_s": round(insert_time_s, 3), + "train_time_s": round(train_time_s, 3), + "total_time_s": round(insert_time_s + train_time_s, 3), + "insert_per_vec_ms": round((insert_time_s / row_count) * 1000, 2) + if row_count + else 0, + "rows": row_count, + "file_size_mb": round(file_size_bytes / (1024 * 1024), 2), + } + + +# ============================================================================ +# KNN measurement +# ============================================================================ + + +def _default_match_query(conn, query, k): + return conn.execute( + "SELECT id, distance FROM vec_items " + "WHERE embedding MATCH :query AND k = :k", + {"query": query, "k": k}, + ).fetchall() + + +def measure_knn(db_path, ext_path, base_db, params, subset_size, k=10, n=50, + results_db=None, run_id=None, pre_query_hook=None, warmup=0): + conn = sqlite3.connect(db_path) + conn.enable_load_extension(True) + conn.load_extension(ext_path) + conn.execute(f"ATTACH DATABASE '{base_db}' AS base") + + if pre_query_hook: + pre_query_hook(conn, params) + + query_vectors = load_query_vectors(base_db, n) + + reg = INDEX_REGISTRY[params["index_type"]] + query_fn = reg.get("run_query") + + # Warmup: run random queries to populate OS page cache + if warmup > 0: + import random + warmup_vecs = [qv for _, qv in query_vectors] + print(f" Warming up with {warmup} queries...", flush=True) + for _ in 
range(warmup): + wq = random.choice(warmup_vecs) + if query_fn: + query_fn(conn, params, wq, k) + else: + _default_match_query(conn, wq, k) + + if results_db and run_id: + update_run_status(results_db, run_id, "querying") + + times_ms = [] + recalls = [] + for i, (qid, query) in enumerate(query_vectors): + started_ns = now_ns() + + results = None + if query_fn: + results = query_fn(conn, params, query, k) + if results is None: + results = _default_match_query(conn, query, k) + + ended_ns = now_ns() + duration_ms = ns_to_ms(ended_ns - started_ns) + times_ms.append(duration_ms) + + result_ids_list = [r[0] for r in results] + result_distances_list = [r[1] for r in results] + result_ids = set(result_ids_list) + + # Ground truth: use pre-computed neighbors table for full dataset, + # otherwise brute-force over the subset + if subset_size >= 1000000: + gt_rows = conn.execute( + "SELECT CAST(neighbors_id AS INTEGER) FROM base.neighbors " + "WHERE query_vector_id = :qid AND rank < :k", + {"qid": qid, "k": k}, + ).fetchall() + else: + gt_rows = conn.execute( + "SELECT id FROM (" + " SELECT id, vec_distance_cosine(vector, :query) as dist " + " FROM base.train WHERE id < :n ORDER BY dist LIMIT :k" + ")", + {"query": query, "k": k, "n": subset_size}, + ).fetchall() + gt_ids_list = [r[0] for r in gt_rows] + gt_ids = set(gt_ids_list) + + if gt_ids: + q_recall = len(result_ids & gt_ids) / len(gt_ids) + else: + q_recall = 0.0 + recalls.append(q_recall) + + if results_db and run_id: + results_db.execute( + "INSERT INTO queries " + "(run_id, k, query_vector_id, started_ns, ended_ns, duration_ms, " + " result_ids, result_distances, ground_truth_ids, recall) " + "VALUES (?,?,?,?,?,?,?,?,?,?)", + ( + run_id, k, qid, started_ns, ended_ns, round(duration_ms, 4), + json.dumps(result_ids_list), + json.dumps(result_distances_list), + json.dumps(gt_ids_list), + round(q_recall, 6), + ), + ) + results_db.commit() + + conn.close() + + mean_ms = round(statistics.mean(times_ms), 2) + median_ms = 
round(statistics.median(times_ms), 2) + p99_ms = (round(sorted(times_ms)[int(len(times_ms) * 0.99)], 2) + if len(times_ms) > 1 + else round(times_ms[0], 2)) + total_ms = round(sum(times_ms), 2) + recall = round(statistics.mean(recalls), 4) + qps = round(len(times_ms) / (total_ms / 1000), 1) if total_ms > 0 else 0 + + # Update run_results with query aggregates + if results_db and run_id: + results_db.execute( + "UPDATE run_results SET " + "query_mean_ms=?, query_median_ms=?, query_p99_ms=?, " + "query_total_ms=?, qps=?, recall=? " + "WHERE run_id=?", + (mean_ms, median_ms, p99_ms, total_ms, qps, recall, run_id), + ) + update_run_status(results_db, run_id, "done") + + return { + "mean_ms": mean_ms, + "median_ms": median_ms, + "p99_ms": p99_ms, + "total_ms": total_ms, + "recall": recall, + } + + +# ============================================================================ +# Reporting +# ============================================================================ + + +def print_report(all_results): + print( + f"\n{'name':>20} {'N':>7} {'type':>10} {'config':>28} " + f"{'ins(s)':>7} {'train':>6} {'MB':>7} " + f"{'qry(ms)':>8} {'recall':>7}" + ) + print("-" * 115) + for r in all_results: + train = f"{r['train_time_s']:.1f}" if r["train_time_s"] > 0 else "-" + print( + f"{r['name']:>20} {r['n_vectors']:>7} {r['index_type']:>10} " + f"{r['config_desc']:>28} " + f"{r['insert_time_s']:>7.1f} {train:>6} {r['file_size_mb']:>7.1f} " + f"{r['mean_ms']:>8.2f} {r['recall']:>7.4f}" + ) + + +# ============================================================================ +# Main +# ============================================================================ + + +def main(): + parser = argparse.ArgumentParser( + description="Benchmark runner for sqlite-vec KNN configurations", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("configs", nargs="+", help="config specs (name:type=X,key=val,...)") + parser.add_argument("--subset-size", 
type=int, default=None, + help="number of vectors to use (default: all)") + parser.add_argument("-k", type=int, default=10, help="KNN k (default 10)") + parser.add_argument("-n", type=int, default=50, help="number of queries (default 50)") + parser.add_argument("--phase", choices=["build", "query", "both"], default="both", + help="build=build only, query=query existing index, both=default") + parser.add_argument("--dataset", default="cohere1m", + choices=list(DATASETS.keys()), + help="dataset name (default: cohere1m)") + parser.add_argument("--ext", default=EXT_PATH) + parser.add_argument("-o", "--out-dir", default=os.path.join(_SCRIPT_DIR, "runs")) + parser.add_argument("--warmup", type=int, default=0, + help="run N random warmup queries before measuring (default: 0)") + parser.add_argument("--results-db-name", default="results.db", + help="results DB filename (default: results.db)") + args = parser.parse_args() + + dataset_cfg = DATASETS[args.dataset] + base_db = dataset_cfg["base_db"] + dimensions = dataset_cfg["dimensions"] + + if args.subset_size is None: + _tmp = sqlite3.connect(base_db) + args.subset_size = _tmp.execute("SELECT COUNT(*) FROM train").fetchone()[0] + _tmp.close() + print(f"Using full dataset: {args.subset_size} vectors") + + results_db, sub_dir = open_results_db(args.out_dir, args.dataset, args.subset_size, + results_db_name=args.results_db_name) + configs = [parse_config(c) for c in args.configs] + for _, params in configs: + params["_dimensions"] = dimensions + + all_results = [] + for i, (name, params) in enumerate(configs, 1): + reg = INDEX_REGISTRY[params["index_type"]] + desc = reg["describe"](params) + print(f"\n[{i}/{len(configs)}] {name} ({desc.strip()}) [phase={args.phase}]") + + db_path = os.path.join(sub_dir, f"{name}.{args.subset_size}.db") + + if args.phase == "build": + run_id = create_run( + results_db, name, params["index_type"], params, + args.dataset, args.subset_size, args.k, args.n, phase="build", + ) + + try: + build = 
build_index( + base_db, args.ext, name, params, args.subset_size, sub_dir, + results_db=results_db, run_id=run_id, k=args.k, + ) + train_str = f" + {build['train_time_s']}s train" if build["train_time_s"] > 0 else "" + print( + f" Build: {build['insert_time_s']}s insert{train_str} " + f"{build['file_size_mb']} MB" + ) + update_run_status(results_db, run_id, "built") + print(f" Index DB: {build['db_path']}") + except Exception as e: + update_run_status(results_db, run_id, "error") + print(f" ERROR: {e}") + raise + + elif args.phase == "query": + if not os.path.exists(db_path): + raise FileNotFoundError( + f"Index DB not found: {db_path}\n" + f"Build it first with: --phase build" + ) + + run_id = create_run( + results_db, name, params["index_type"], params, + args.dataset, args.subset_size, args.k, args.n, phase="query", + ) + + try: + # Create a run_results row so measure_knn can UPDATE it + file_size_bytes = os.path.getsize(db_path) + results_db.execute( + "INSERT INTO run_results " + "(run_id, db_file_size_bytes, db_file_path, k) " + "VALUES (?,?,?,?)", + (run_id, file_size_bytes, db_path, args.k), + ) + results_db.commit() + + pre_hook = reg.get("pre_query_hook") + print(f" Measuring KNN (k={args.k}, n={args.n})...") + knn = measure_knn( + db_path, args.ext, base_db, + params, args.subset_size, k=args.k, n=args.n, + results_db=results_db, run_id=run_id, + pre_query_hook=pre_hook, warmup=args.warmup, + ) + print(f" KNN: mean={knn['mean_ms']}ms recall@{args.k}={knn['recall']}") + except Exception as e: + update_run_status(results_db, run_id, "error") + print(f" ERROR: {e}") + raise + + file_size_mb = os.path.getsize(db_path) / (1024 * 1024) + all_results.append({ + "name": name, + "n_vectors": args.subset_size, + "index_type": params["index_type"], + "config_desc": desc, + "db_path": db_path, + "insert_time_s": 0, + "train_time_s": 0, + "total_time_s": 0, + "insert_per_vec_ms": 0, + "rows": 0, + "file_size_mb": file_size_mb, + "k": args.k, + "n_queries": args.n, +
"mean_ms": knn["mean_ms"], + "median_ms": knn["median_ms"], + "p99_ms": knn["p99_ms"], + "total_ms": knn["total_ms"], + "recall": knn["recall"], + }) + + else: # both + run_id = create_run( + results_db, name, params["index_type"], params, + args.dataset, args.subset_size, args.k, args.n, phase="both", + ) + + try: + build = build_index( + base_db, args.ext, name, params, args.subset_size, sub_dir, + results_db=results_db, run_id=run_id, k=args.k, + ) + train_str = f" + {build['train_time_s']}s train" if build["train_time_s"] > 0 else "" + print( + f" Build: {build['insert_time_s']}s insert{train_str} " + f"{build['file_size_mb']} MB" + ) + + pre_hook = reg.get("pre_query_hook") + print(f" Measuring KNN (k={args.k}, n={args.n})...") + knn = measure_knn( + build["db_path"], args.ext, base_db, + params, args.subset_size, k=args.k, n=args.n, + results_db=results_db, run_id=run_id, + pre_query_hook=pre_hook, warmup=args.warmup, + ) + print(f" KNN: mean={knn['mean_ms']}ms recall@{args.k}={knn['recall']}") + except Exception as e: + update_run_status(results_db, run_id, "error") + print(f" ERROR: {e}") + raise + + all_results.append({ + "name": name, + "n_vectors": args.subset_size, + "index_type": params["index_type"], + "config_desc": desc, + "db_path": build["db_path"], + "insert_time_s": build["insert_time_s"], + "train_time_s": build["train_time_s"], + "total_time_s": build["total_time_s"], + "insert_per_vec_ms": build["insert_per_vec_ms"], + "rows": build["rows"], + "file_size_mb": build["file_size_mb"], + "k": args.k, + "n_queries": args.n, + "mean_ms": knn["mean_ms"], + "median_ms": knn["median_ms"], + "p99_ms": knn["p99_ms"], + "total_ms": knn["total_ms"], + "recall": knn["recall"], + }) + + if all_results: + print_report(all_results) + + print(f"\nResults DB: {os.path.join(sub_dir, args.results_db_name)}") # report the file actually opened; honors --results-db-name + results_db.close() + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/datasets/cohere10m/Makefile b/benchmarks-ann/datasets/cohere10m/Makefile
new file mode 100644 index 0000000..322b21c --- /dev/null +++ b/benchmarks-ann/datasets/cohere10m/Makefile @@ -0,0 +1,27 @@ +BASE_URL = https://assets.zilliz.com/benchmark/cohere_large_10m + +TRAIN_PARQUETS = $(shell printf 'train-%02d-of-10.parquet ' 0 1 2 3 4 5 6 7 8 9) +OTHER_PARQUETS = test.parquet neighbors.parquet +PARQUETS = $(TRAIN_PARQUETS) $(OTHER_PARQUETS) + +.PHONY: all download clean + +all: base.db + +# Use: make -j12 download +download: $(PARQUETS) + +train-%-of-10.parquet: + curl -L -o $@ $(BASE_URL)/$@ + +test.parquet: + curl -L -o $@ $(BASE_URL)/test.parquet + +neighbors.parquet: + curl -L -o $@ $(BASE_URL)/neighbors.parquet + +base.db: $(PARQUETS) build_base_db.py + uv run --with pandas --with pyarrow python build_base_db.py + +clean: + rm -f base.db diff --git a/benchmarks-ann/datasets/cohere10m/build_base_db.py b/benchmarks-ann/datasets/cohere10m/build_base_db.py new file mode 100644 index 0000000..ceaeb22 --- /dev/null +++ b/benchmarks-ann/datasets/cohere10m/build_base_db.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +"""Build base.db from downloaded parquet files (10M dataset, 10 train shards). + +Reads train-00-of-10.parquet .. train-09-of-10.parquet, test.parquet, +neighbors.parquet and creates a SQLite database with tables: + train, query_vectors, neighbors. 
+ +Usage: + uv run --with pandas --with pyarrow python build_base_db.py +""" +import json +import os +import sqlite3 +import struct +import sys +import time + +import pandas as pd + +TRAIN_SHARDS = 10 + + +def float_list_to_blob(floats): + """Pack a list of floats into a little-endian f32 blob.""" + return struct.pack(f"<{len(floats)}f", *floats) + + +def main(): + script_dir = os.path.dirname(os.path.abspath(__file__)) + db_path = os.path.join(script_dir, "base.db") + + train_paths = [ + os.path.join(script_dir, f"train-{i:02d}-of-{TRAIN_SHARDS}.parquet") + for i in range(TRAIN_SHARDS) + ] + test_path = os.path.join(script_dir, "test.parquet") + neighbors_path = os.path.join(script_dir, "neighbors.parquet") + + for p in train_paths + [test_path, neighbors_path]: + if not os.path.exists(p): + print(f"ERROR: {p} not found. Run 'make download' first.") + sys.exit(1) + + if os.path.exists(db_path): + os.remove(db_path) + + conn = sqlite3.connect(db_path) + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA page_size=4096") + + # --- query_vectors (from test.parquet) --- + print("Loading test.parquet (query vectors)...") + t0 = time.perf_counter() + df_test = pd.read_parquet(test_path) + conn.execute( + "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)" + ) + rows = [] + for _, row in df_test.iterrows(): + rows.append((int(row["id"]), float_list_to_blob(row["emb"]))) + conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows) + conn.commit() + print(f" {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s") + + # --- neighbors (from neighbors.parquet) --- + print("Loading neighbors.parquet...") + t0 = time.perf_counter() + df_neighbors = pd.read_parquet(neighbors_path) + conn.execute( + "CREATE TABLE neighbors (" + " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT," + " UNIQUE(query_vector_id, rank))" + ) + rows = [] + for _, row in df_neighbors.iterrows(): + qid = int(row["id"]) + nids = 
row["neighbors_id"] + if isinstance(nids, str): + nids = json.loads(nids) + for rank, nid in enumerate(nids): + rows.append((qid, rank, str(int(nid)))) + conn.executemany( + "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)", + rows, + ) + conn.commit() + print(f" {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s") + + # --- train (from 10 shard parquets) --- + print(f"Loading {TRAIN_SHARDS} train shards (10M vectors, this will take a while)...") + conn.execute( + "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)" + ) + + global_t0 = time.perf_counter() + total_inserted = 0 + batch_size = 10000 + + for shard_idx, train_path in enumerate(train_paths): + print(f" Shard {shard_idx + 1}/{TRAIN_SHARDS}: {os.path.basename(train_path)}") + t0 = time.perf_counter() + df = pd.read_parquet(train_path) + shard_len = len(df) + + for start in range(0, shard_len, batch_size): + chunk = df.iloc[start : start + batch_size] + rows = [] + for _, row in chunk.iterrows(): + rows.append((int(row["id"]), float_list_to_blob(row["emb"]))) + conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows) + conn.commit() + + total_inserted += len(rows) + if total_inserted % 100000 < batch_size: + elapsed = time.perf_counter() - global_t0 + rate = total_inserted / elapsed if elapsed > 0 else 0 + print( + f" {total_inserted:>10} {elapsed:.0f}s {rate:.0f} rows/s", + flush=True, + ) + + shard_elapsed = time.perf_counter() - t0 + print(f" shard done: {shard_len} rows in {shard_elapsed:.1f}s") + + elapsed = time.perf_counter() - global_t0 + print(f" {total_inserted} train vectors in {elapsed:.1f}s") + + conn.close() + size_mb = os.path.getsize(db_path) / (1024 * 1024) + print(f"\nDone: {db_path} ({size_mb:.0f} MB)") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/datasets/cohere1m/.gitignore b/benchmarks-ann/datasets/cohere1m/.gitignore new file mode 100644 index 0000000..8efed50 --- /dev/null +++ 
b/benchmarks-ann/datasets/cohere1m/.gitignore @@ -0,0 +1,2 @@ +*.parquet +base.db diff --git a/benchmarks-ann/datasets/cohere1m/Makefile b/benchmarks-ann/datasets/cohere1m/Makefile new file mode 100644 index 0000000..186bf66 --- /dev/null +++ b/benchmarks-ann/datasets/cohere1m/Makefile @@ -0,0 +1,24 @@ +BASE_URL = https://assets.zilliz.com/benchmark/cohere_medium_1m + +PARQUETS = train.parquet test.parquet neighbors.parquet + +.PHONY: all download clean # base.db is a real file target; keeping it out of .PHONY lets make skip the rebuild when it is up to date + +all: base.db + +download: $(PARQUETS) + +train.parquet: + curl -L -o $@ $(BASE_URL)/train.parquet + +test.parquet: + curl -L -o $@ $(BASE_URL)/test.parquet + +neighbors.parquet: + curl -L -o $@ $(BASE_URL)/neighbors.parquet + +base.db: $(PARQUETS) build_base_db.py + uv run --with pandas --with pyarrow python build_base_db.py + +clean: + rm -f base.db diff --git a/benchmarks-ann/datasets/cohere1m/build_base_db.py b/benchmarks-ann/datasets/cohere1m/build_base_db.py new file mode 100644 index 0000000..33d280d --- /dev/null +++ b/benchmarks-ann/datasets/cohere1m/build_base_db.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +"""Build base.db from downloaded parquet files. + +Reads train.parquet, test.parquet, neighbors.parquet and creates a SQLite +database with tables: train, query_vectors, neighbors.
+ +Usage: + uv run --with pandas --with pyarrow python build_base_db.py +""" +import json +import os +import sqlite3 +import struct +import sys +import time + +import pandas as pd + + +def float_list_to_blob(floats): + """Pack a list of floats into a little-endian f32 blob.""" + return struct.pack(f"<{len(floats)}f", *floats) + + +def main(): + seed_dir = os.path.dirname(os.path.abspath(__file__)) + db_path = os.path.join(seed_dir, "base.db") + + train_path = os.path.join(seed_dir, "train.parquet") + test_path = os.path.join(seed_dir, "test.parquet") + neighbors_path = os.path.join(seed_dir, "neighbors.parquet") + + for p in (train_path, test_path, neighbors_path): + if not os.path.exists(p): + print(f"ERROR: {p} not found. Run 'make download' first.") + sys.exit(1) + + if os.path.exists(db_path): + os.remove(db_path) + + conn = sqlite3.connect(db_path) + conn.execute("PRAGMA journal_mode=WAL") + conn.execute("PRAGMA page_size=4096") + + # --- query_vectors (from test.parquet) --- + print("Loading test.parquet (query vectors)...") + t0 = time.perf_counter() + df_test = pd.read_parquet(test_path) + conn.execute( + "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)" + ) + rows = [] + for _, row in df_test.iterrows(): + rows.append((int(row["id"]), float_list_to_blob(row["emb"]))) + conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows) + conn.commit() + print(f" {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s") + + # --- neighbors (from neighbors.parquet) --- + print("Loading neighbors.parquet...") + t0 = time.perf_counter() + df_neighbors = pd.read_parquet(neighbors_path) + conn.execute( + "CREATE TABLE neighbors (" + " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT," + " UNIQUE(query_vector_id, rank))" + ) + rows = [] + for _, row in df_neighbors.iterrows(): + qid = int(row["id"]) + # neighbors_id may be a numpy array or JSON string + nids = row["neighbors_id"] + if isinstance(nids, str): + nids = 
json.loads(nids) + for rank, nid in enumerate(nids): + rows.append((qid, rank, str(int(nid)))) + conn.executemany( + "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)", + rows, + ) + conn.commit() + print(f" {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s") + + # --- train (from train.parquet) --- + print("Loading train.parquet (1M vectors, this takes a few minutes)...") + t0 = time.perf_counter() + conn.execute( + "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)" + ) + + batch_size = 10000 + df_iter = pd.read_parquet(train_path) + total = len(df_iter) + + for start in range(0, total, batch_size): + chunk = df_iter.iloc[start : start + batch_size] + rows = [] + for _, row in chunk.iterrows(): + rows.append((int(row["id"]), float_list_to_blob(row["emb"]))) + conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows) + conn.commit() + + done = min(start + batch_size, total) + elapsed = time.perf_counter() - t0 + rate = done / elapsed if elapsed > 0 else 0 + eta = (total - done) / rate if rate > 0 else 0 + print( + f" {done:>8}/{total} {elapsed:.0f}s {rate:.0f} rows/s eta {eta:.0f}s", + flush=True, + ) + + elapsed = time.perf_counter() - t0 + print(f" {total} train vectors in {elapsed:.1f}s") + + conn.close() + size_mb = os.path.getsize(db_path) / (1024 * 1024) + print(f"\nDone: {db_path} ({size_mb:.0f} MB)") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/datasets/nyt-1024/Makefile b/benchmarks-ann/datasets/nyt-1024/Makefile new file mode 100644 index 0000000..0547409 --- /dev/null +++ b/benchmarks-ann/datasets/nyt-1024/Makefile @@ -0,0 +1,30 @@ +MODEL ?= mixedbread-ai/mxbai-embed-large-v1 +K ?= 100 +BATCH_SIZE ?= 256 +DATA_DIR ?= ../nyt/data + +all: base.db + +# Reuse data from ../nyt +$(DATA_DIR): + $(MAKE) -C ../nyt data + +contents.db: $(DATA_DIR) + uv run ../nyt-768/build-contents.py --data-dir $(DATA_DIR) -o $@ + +base.db: contents.db queries.txt + uv run build-base.py \ + 
--contents-db contents.db \ + --model $(MODEL) \ + --queries-file queries.txt \ + --batch-size $(BATCH_SIZE) \ + --k $(K) \ + -o $@ + +queries.txt: + cp ../nyt/queries.txt $@ + +clean: + rm -f base.db contents.db + +.PHONY: all clean diff --git a/benchmarks-ann/datasets/nyt-1024/build-base.py b/benchmarks-ann/datasets/nyt-1024/build-base.py new file mode 100644 index 0000000..a0a6b22 --- /dev/null +++ b/benchmarks-ann/datasets/nyt-1024/build-base.py @@ -0,0 +1,163 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "sentence-transformers", +# "torch<=2.7", +# "tqdm", +# ] +# /// + +import argparse +import sqlite3 +from array import array +from itertools import batched + +from sentence_transformers import SentenceTransformer +from tqdm import tqdm + + +def main(): + parser = argparse.ArgumentParser( + description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors", + ) + parser.add_argument( + "--contents-db", "-c", required=True, # fail fast: sqlite3.connect(None) would otherwise raise only after the slow model load + help="Path to contents.db (source of headlines and IDs)", + ) + parser.add_argument( + "--model", "-m", default="mixedbread-ai/mxbai-embed-large-v1", + help="HuggingFace model ID (default: mixedbread-ai/mxbai-embed-large-v1)", + ) + parser.add_argument( + "--queries-file", "-q", default="queries.txt", + help="Path to the queries file (default: queries.txt)", + ) + parser.add_argument( + "--output", "-o", required=True, + help="Path to the output base.db", + ) + parser.add_argument( + "--batch-size", "-b", type=int, default=256, + help="Batch size for embedding (default: 256)", + ) + parser.add_argument( + "--k", "-k", type=int, default=100, + help="Number of nearest neighbors (default: 100)", + ) + parser.add_argument( + "--limit", "-l", type=int, default=0, + help="Limit number of headlines to embed (0 = all)", + ) + parser.add_argument( + "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0", + help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)", + ) +
parser.add_argument( + "--skip-neighbors", action="store_true", + help="Skip the brute-force KNN neighbor computation", + ) + args = parser.parse_args() + + import os + vec_path = os.path.expanduser(args.vec_path) + + print(f"Loading model {args.model}...") + model = SentenceTransformer(args.model) + + # Read headlines from contents.db + src = sqlite3.connect(args.contents_db) + limit_clause = f" LIMIT {args.limit}" if args.limit > 0 else "" + headlines = src.execute( + f"SELECT id, headline FROM contents ORDER BY id{limit_clause}" + ).fetchall() + src.close() + print(f"Loaded {len(headlines)} headlines from {args.contents_db}") + + # Read queries + with open(args.queries_file) as f: + queries = [line.strip() for line in f if line.strip()] + print(f"Loaded {len(queries)} queries from {args.queries_file}") + + # Create output database + db = sqlite3.connect(args.output) + db.enable_load_extension(True) + db.load_extension(vec_path) + db.enable_load_extension(False) + + db.execute("CREATE TABLE IF NOT EXISTS train(id INTEGER PRIMARY KEY, vector BLOB)") + db.execute("CREATE TABLE IF NOT EXISTS query_vectors(id INTEGER PRIMARY KEY, vector BLOB)") + db.execute( + "CREATE TABLE IF NOT EXISTS neighbors(" + " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT," + " UNIQUE(query_vector_id, rank))" + ) + + # Step 1: Embed headlines -> train table + print("Embedding headlines...") + for batch in tqdm( + batched(headlines, args.batch_size), + total=(len(headlines) + args.batch_size - 1) // args.batch_size, + ): + ids = [r[0] for r in batch] + texts = [r[1] for r in batch] + embeddings = model.encode(texts, normalize_embeddings=True) + + params = [ + (int(rid), array("f", emb.tolist()).tobytes()) + for rid, emb in zip(ids, embeddings) + ] + db.executemany("INSERT INTO train VALUES (?, ?)", params) + db.commit() + + del headlines + n = db.execute("SELECT count(*) FROM train").fetchone()[0] + print(f"Embedded {n} headlines") + + # Step 2: Embed queries -> query_vectors 
table + print("Embedding queries...") + query_embeddings = model.encode(queries, normalize_embeddings=True) + query_params = [] + for i, emb in enumerate(query_embeddings, 1): + blob = array("f", emb.tolist()).tobytes() + query_params.append((i, blob)) + db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params) + db.commit() + print(f"Embedded {len(queries)} queries") + + if args.skip_neighbors: + db.close() + print(f"Done (skipped neighbors). Wrote {args.output}") + return + + # Step 3: Brute-force KNN via sqlite-vec -> neighbors table + n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0] + print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...") + for query_id, query_blob in tqdm( + db.execute("SELECT id, vector FROM query_vectors").fetchall() + ): + results = db.execute( + """ + SELECT + train.id, + vec_distance_cosine(train.vector, ?) AS distance + FROM train + WHERE distance IS NOT NULL + ORDER BY distance ASC + LIMIT ? + """, + (query_blob, args.k), + ).fetchall() + + params = [ + (query_id, rank, str(rid)) + for rank, (rid, _dist) in enumerate(results) + ] + db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params) + + db.commit() + db.close() + print(f"Done. 
Wrote {args.output}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/datasets/nyt-1024/queries.txt b/benchmarks-ann/datasets/nyt-1024/queries.txt new file mode 100644 index 0000000..9e98f84 --- /dev/null +++ b/benchmarks-ann/datasets/nyt-1024/queries.txt @@ -0,0 +1,100 @@ +latest news on climate change policy +presidential election results and analysis +stock market crash causes +coronavirus vaccine development updates +artificial intelligence breakthrough in healthcare +supreme court ruling on abortion rights +tech companies layoff announcements +earthquake damages in California +cybersecurity breach at major corporation +space exploration mission to Mars +immigration reform legislation debate +renewable energy investment trends +healthcare costs rising across America +protests against police brutality +wildfires destroy homes in the West +Olympic games highlights and records +celebrity scandal rocks Hollywood +breakthrough cancer treatment discovered +housing market bubble concerns +federal reserve interest rate decision +school shooting tragedy response +diplomatic tensions between superpowers +drone strike kills terrorist leader +social media platform faces regulation +archaeological discovery reveals ancient civilization +unemployment rate hits record low +autonomous vehicles testing expansion +streaming service launches original content +opioid crisis intervention programs +trade war tariffs impact economy +infrastructure bill passes Congress +data privacy concerns grow +minimum wage increase proposal +college admissions scandal exposed +NFL player protest during anthem +cryptocurrency regulation debate +pandemic lockdown restrictions eased +mass shooting gun control debate +tax reform legislation impact +ransomware attack cripples pipeline +climate activists stage demonstration +sports team wins championship +banking system collapse fears +pharmaceutical company fraud charges +genetic engineering ethical concerns +border wall funding 
controversy +impeachment proceedings begin +nuclear weapons treaty violation +artificial meat alternative launch +student loan debt forgiveness +venture capital funding decline +facial recognition ban proposed +election interference investigation +pandemic preparedness failures +police reform measures announced +wildfire prevention strategies +ocean pollution crisis worsens +manufacturing jobs returning +pension fund shortfall concerns +antitrust investigation launched +voting rights protection act +mental health awareness campaign +homeless population increasing +space debris collision risk +drug cartel violence escalates +renewable energy jobs growth +infrastructure deterioration report +vaccine mandate legal challenge +cryptocurrency market volatility +autonomous drone delivery service +deep fake technology dangers +Arctic ice melting accelerates +income inequality gap widens +election fraud claims disputed +corporate merger blocked +medical breakthrough extends life +transportation strike disrupts city +racial justice protests spread +carbon emissions reduction goals +financial crisis warning signs +cyberbullying prevention efforts +asteroid near miss with Earth +gene therapy approval granted +labor union organizing drive +surveillance technology expansion +education funding cuts proposed +disaster relief efforts underway +housing affordability crisis +clean water access shortage +artificial intelligence job displacement +trade agreement negotiations +prison reform initiative launched +species extinction accelerates +political corruption scandal +terrorism threat level raised +food safety contamination outbreak +ai model release +affordability interest rates +peanut allergies in newbons +breaking bad walter white \ No newline at end of file diff --git a/benchmarks-ann/datasets/nyt-384/Makefile b/benchmarks-ann/datasets/nyt-384/Makefile new file mode 100644 index 0000000..76296a1 --- /dev/null +++ b/benchmarks-ann/datasets/nyt-384/Makefile @@ -0,0 +1,29 @@ 
+MODEL ?= mixedbread-ai/mxbai-embed-xsmall-v1 +K ?= 100 +BATCH_SIZE ?= 512 +DATA_DIR ?= ../nyt/data + +all: base.db + +$(DATA_DIR): + $(MAKE) -C ../nyt data + +contents.db: $(DATA_DIR) + uv run ../nyt-768/build-contents.py --data-dir $(DATA_DIR) -o $@ + +base.db: contents.db queries.txt + uv run ../nyt-1024/build-base.py \ + --contents-db contents.db \ + --model $(MODEL) \ + --queries-file queries.txt \ + --batch-size $(BATCH_SIZE) \ + --k $(K) \ + -o $@ + +queries.txt: + cp ../nyt/queries.txt $@ + +clean: + rm -f base.db contents.db + +.PHONY: all clean diff --git a/benchmarks-ann/datasets/nyt-384/queries.txt b/benchmarks-ann/datasets/nyt-384/queries.txt new file mode 100644 index 0000000..9e98f84 --- /dev/null +++ b/benchmarks-ann/datasets/nyt-384/queries.txt @@ -0,0 +1,100 @@ +latest news on climate change policy +presidential election results and analysis +stock market crash causes +coronavirus vaccine development updates +artificial intelligence breakthrough in healthcare +supreme court ruling on abortion rights +tech companies layoff announcements +earthquake damages in California +cybersecurity breach at major corporation +space exploration mission to Mars +immigration reform legislation debate +renewable energy investment trends +healthcare costs rising across America +protests against police brutality +wildfires destroy homes in the West +Olympic games highlights and records +celebrity scandal rocks Hollywood +breakthrough cancer treatment discovered +housing market bubble concerns +federal reserve interest rate decision +school shooting tragedy response +diplomatic tensions between superpowers +drone strike kills terrorist leader +social media platform faces regulation +archaeological discovery reveals ancient civilization +unemployment rate hits record low +autonomous vehicles testing expansion +streaming service launches original content +opioid crisis intervention programs +trade war tariffs impact economy +infrastructure bill passes Congress +data 
privacy concerns grow +minimum wage increase proposal +college admissions scandal exposed +NFL player protest during anthem +cryptocurrency regulation debate +pandemic lockdown restrictions eased +mass shooting gun control debate +tax reform legislation impact +ransomware attack cripples pipeline +climate activists stage demonstration +sports team wins championship +banking system collapse fears +pharmaceutical company fraud charges +genetic engineering ethical concerns +border wall funding controversy +impeachment proceedings begin +nuclear weapons treaty violation +artificial meat alternative launch +student loan debt forgiveness +venture capital funding decline +facial recognition ban proposed +election interference investigation +pandemic preparedness failures +police reform measures announced +wildfire prevention strategies +ocean pollution crisis worsens +manufacturing jobs returning +pension fund shortfall concerns +antitrust investigation launched +voting rights protection act +mental health awareness campaign +homeless population increasing +space debris collision risk +drug cartel violence escalates +renewable energy jobs growth +infrastructure deterioration report +vaccine mandate legal challenge +cryptocurrency market volatility +autonomous drone delivery service +deep fake technology dangers +Arctic ice melting accelerates +income inequality gap widens +election fraud claims disputed +corporate merger blocked +medical breakthrough extends life +transportation strike disrupts city +racial justice protests spread +carbon emissions reduction goals +financial crisis warning signs +cyberbullying prevention efforts +asteroid near miss with Earth +gene therapy approval granted +labor union organizing drive +surveillance technology expansion +education funding cuts proposed +disaster relief efforts underway +housing affordability crisis +clean water access shortage +artificial intelligence job displacement +trade agreement negotiations +prison reform 
def main():
    """Build a SQLite ``contents`` table from the NYT headline CSVs.

    Reads every ``new_york_times_stories_*.csv`` under ``--data-dir`` with
    DuckDB, drops NULL/blank headlines, collapses duplicate headlines (keeping
    the latest ``pub_date`` seen for each), and writes the ``--limit`` most
    recent ones to ``--output`` as ``contents(id, headline)``. ``id`` 1 is the
    most recent headline.
    """
    parser = argparse.ArgumentParser(
        description="Load NYT headline CSVs into a SQLite contents database (most recent 1M, deduplicated)",
    )
    parser.add_argument(
        "--data-dir", "-d", default="../nyt/data",
        help="Directory containing NYT CSV files (default: ../nyt/data)",
    )
    parser.add_argument(
        "--limit", "-l", type=int, default=1_000_000,
        help="Maximum number of headlines to keep (default: 1000000)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output SQLite database",
    )
    args = parser.parse_args()

    glob_pattern = f"{args.data_dir}/new_york_times_stories_*.csv"
    # The pattern is embedded in a SQL string literal below, so a single quote
    # in the directory name must be doubled to keep the statement well-formed.
    sql_glob = glob_pattern.replace("'", "''")

    con = duckdb.connect()
    try:
        # `headline` is unique after the GROUP BY, so using it as a tie-breaker
        # makes the numbering deterministic. Ordering the outer query by `id`
        # guarantees the LIMIT keeps exactly ids 1..limit; ordering by pub_date
        # again could resolve ties differently from the window function.
        rows = con.execute(
            f"""
            WITH deduped AS (
                SELECT
                    headline,
                    max(pub_date) AS pub_date
                FROM read_csv('{sql_glob}', auto_detect=true, union_by_name=true)
                WHERE headline IS NOT NULL AND trim(headline) != ''
                GROUP BY headline
            )
            SELECT
                row_number() OVER (ORDER BY pub_date DESC, headline) AS id,
                headline
            FROM deduped
            ORDER BY id
            LIMIT {args.limit}
            """
        ).fetchall()
    finally:
        con.close()

    db = sqlite3.connect(args.output)
    try:
        db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
        db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
        db.commit()
    finally:
        db.close()

    print(f"Wrote {len(rows)} headlines to {args.output}")
ruling on abortion rights +tech companies layoff announcements +earthquake damages in California +cybersecurity breach at major corporation +space exploration mission to Mars +immigration reform legislation debate +renewable energy investment trends +healthcare costs rising across America +protests against police brutality +wildfires destroy homes in the West +Olympic games highlights and records +celebrity scandal rocks Hollywood +breakthrough cancer treatment discovered +housing market bubble concerns +federal reserve interest rate decision +school shooting tragedy response +diplomatic tensions between superpowers +drone strike kills terrorist leader +social media platform faces regulation +archaeological discovery reveals ancient civilization +unemployment rate hits record low +autonomous vehicles testing expansion +streaming service launches original content +opioid crisis intervention programs +trade war tariffs impact economy +infrastructure bill passes Congress +data privacy concerns grow +minimum wage increase proposal +college admissions scandal exposed +NFL player protest during anthem +cryptocurrency regulation debate +pandemic lockdown restrictions eased +mass shooting gun control debate +tax reform legislation impact +ransomware attack cripples pipeline +climate activists stage demonstration +sports team wins championship +banking system collapse fears +pharmaceutical company fraud charges +genetic engineering ethical concerns +border wall funding controversy +impeachment proceedings begin +nuclear weapons treaty violation +artificial meat alternative launch +student loan debt forgiveness +venture capital funding decline +facial recognition ban proposed +election interference investigation +pandemic preparedness failures +police reform measures announced +wildfire prevention strategies +ocean pollution crisis worsens +manufacturing jobs returning +pension fund shortfall concerns +antitrust investigation launched +voting rights protection act +mental 
health awareness campaign +homeless population increasing +space debris collision risk +drug cartel violence escalates +renewable energy jobs growth +infrastructure deterioration report +vaccine mandate legal challenge +cryptocurrency market volatility +autonomous drone delivery service +deep fake technology dangers +Arctic ice melting accelerates +income inequality gap widens +election fraud claims disputed +corporate merger blocked +medical breakthrough extends life +transportation strike disrupts city +racial justice protests spread +carbon emissions reduction goals +financial crisis warning signs +cyberbullying prevention efforts +asteroid near miss with Earth +gene therapy approval granted +labor union organizing drive +surveillance technology expansion +education funding cuts proposed +disaster relief efforts underway +housing affordability crisis +clean water access shortage +artificial intelligence job displacement +trade agreement negotiations +prison reform initiative launched +species extinction accelerates +political corruption scandal +terrorism threat level raised +food safety contamination outbreak +ai model release +affordability interest rates +peanut allergies in newbons +breaking bad walter white \ No newline at end of file diff --git a/benchmarks-ann/datasets/nyt/.gitignore b/benchmarks-ann/datasets/nyt/.gitignore new file mode 100644 index 0000000..adbb97d --- /dev/null +++ b/benchmarks-ann/datasets/nyt/.gitignore @@ -0,0 +1 @@ +data/ \ No newline at end of file diff --git a/benchmarks-ann/datasets/nyt/Makefile b/benchmarks-ann/datasets/nyt/Makefile new file mode 100644 index 0000000..dfaa6e9 --- /dev/null +++ b/benchmarks-ann/datasets/nyt/Makefile @@ -0,0 +1,30 @@ +MODEL ?= minishlab/potion-base-8M +K ?= 100 +BATCH_SIZE ?= 512 +DATA_DIR ?= data + +all: base.db contents.db + +# Download NYT headlines CSVs from Kaggle (requires `kaggle` CLI + API token) +$(DATA_DIR): + kaggle datasets download -d johnbandy/new-york-times-headlines -p $(DATA_DIR) 
def _open_vec_db(path, vec_path):
    """Open the SQLite database at *path* and load the sqlite-vec extension into it."""
    db = sqlite3.connect(path)
    db.enable_load_extension(True)
    db.load_extension(vec_path)
    # Extension loading is only needed for vec0 itself; switch it back off.
    db.enable_load_extension(False)
    return db


def _create_neighbors_table(db):
    """Create the ``neighbors`` table: one row per (query, rank) pair."""
    db.execute(
        "CREATE TABLE neighbors("
        " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
        " UNIQUE(query_vector_id, rank))"
    )


def main():
    """Build base.db: train vectors, query vectors and brute-force neighbors.

    Normal mode embeds every headline from ``--contents-db`` into ``train``,
    embeds each non-blank line of ``--queries-file`` into ``query_vectors``
    (ids start at 1), then fills ``neighbors`` with the ``--k`` closest train
    ids per query by cosine distance (rank starts at 0).

    With ``--rebuild-neighbors`` the embedding steps are skipped: the existing
    ``--output`` database is opened and only ``neighbors`` is dropped and
    recomputed.

    Exits with a usage error when ``--contents-db`` is missing in normal mode.
    """
    parser = argparse.ArgumentParser(
        description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors",
    )
    parser.add_argument(
        "--contents-db", "-c", default=None,
        help="Path to contents.db (source of headlines and IDs)",
    )
    parser.add_argument(
        "--model", "-m", default="minishlab/potion-base-8M",
        help="HuggingFace model ID or local path (default: minishlab/potion-base-8M)",
    )
    parser.add_argument(
        "--queries-file", "-q", default="queries.txt",
        help="Path to the queries file (default: queries.txt)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output base.db",
    )
    parser.add_argument(
        "--batch-size", "-b", type=int, default=512,
        help="Batch size for embedding (default: 512)",
    )
    parser.add_argument(
        "--k", "-k", type=int, default=100,
        help="Number of nearest neighbors (default: 100)",
    )
    parser.add_argument(
        "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0",
        help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)",
    )
    parser.add_argument(
        "--rebuild-neighbors", action="store_true",
        help="Only rebuild the neighbors table (skip embedding steps)",
    )
    args = parser.parse_args()

    # --contents-db defaults to None because rebuild mode does not need it.
    # Fail with a usage message here instead of a TypeError from
    # sqlite3.connect(None) after the model has already been loaded.
    if not args.rebuild_neighbors and args.contents_db is None:
        parser.error("--contents-db is required unless --rebuild-neighbors is given")

    import os
    vec_path = os.path.expanduser(args.vec_path)

    if args.rebuild_neighbors:
        # Skip embedding, just open existing DB and rebuild neighbors
        db = _open_vec_db(args.output, vec_path)
        db.execute("DROP TABLE IF EXISTS neighbors")
        _create_neighbors_table(db)
        print(f"Rebuilding neighbors in {args.output}...")
    else:
        print(f"Loading model {args.model}...")
        model = StaticModel.from_pretrained(args.model)

        # Read headlines from contents.db
        src = sqlite3.connect(args.contents_db)
        try:
            headlines = src.execute(
                "SELECT id, headline FROM contents ORDER BY id"
            ).fetchall()
        finally:
            src.close()
        print(f"Loaded {len(headlines)} headlines from {args.contents_db}")

        # Read queries, ignoring blank lines
        with open(args.queries_file) as f:
            queries = [line.strip() for line in f if line.strip()]
        print(f"Loaded {len(queries)} queries from {args.queries_file}")

        # Create output database
        db = _open_vec_db(args.output, vec_path)
        db.execute("CREATE TABLE train(id INTEGER PRIMARY KEY, vector BLOB)")
        db.execute("CREATE TABLE query_vectors(id INTEGER PRIMARY KEY, vector BLOB)")
        _create_neighbors_table(db)

        # Step 1: Embed headlines -> train table
        print("Embedding headlines...")
        n_batches = (len(headlines) + args.batch_size - 1) // args.batch_size
        for batch in tqdm(batched(headlines, args.batch_size), total=n_batches):
            ids = [row[0] for row in batch]
            texts = [row[1] for row in batch]
            embeddings = model.encode(texts)

            # Each vector is stored as a packed float32 blob.
            params = [
                (int(rid), array("f", emb.tolist()).tobytes())
                for rid, emb in zip(ids, embeddings)
            ]
            db.executemany("INSERT INTO train VALUES (?, ?)", params)
            db.commit()

        del headlines
        n = db.execute("SELECT count(*) FROM train").fetchone()[0]
        print(f"Embedded {n} headlines")

        # Step 2: Embed queries -> query_vectors table
        print("Embedding queries...")
        query_embeddings = model.encode(queries)
        query_params = [
            (i, array("f", emb.tolist()).tobytes())
            for i, emb in enumerate(query_embeddings, 1)
        ]
        db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params)
        db.commit()
        print(f"Embedded {len(queries)} queries")

    # Step 3: Brute-force KNN via sqlite-vec -> neighbors table
    n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0]
    print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...")
    # The query vectors are fetched in full before looping because the loop
    # body writes to the same connection.
    for query_id, query_blob in tqdm(
        db.execute("SELECT id, vector FROM query_vectors").fetchall()
    ):
        results = db.execute(
            """
            SELECT
                train.id,
                vec_distance_cosine(train.vector, ?) AS distance
            FROM train
            WHERE distance IS NOT NULL
            ORDER BY distance ASC
            LIMIT ?
            """,
            (query_blob, args.k),
        ).fetchall()

        params = [
            (query_id, rank, str(rid))
            for rank, (rid, _dist) in enumerate(results)
        ]
        db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params)

    db.commit()
    db.close()
    print(f"Done. Wrote {args.output}")
+earthquake damages in California +cybersecurity breach at major corporation +space exploration mission to Mars +immigration reform legislation debate +renewable energy investment trends +healthcare costs rising across America +protests against police brutality +wildfires destroy homes in the West +Olympic games highlights and records +celebrity scandal rocks Hollywood +breakthrough cancer treatment discovered +housing market bubble concerns +federal reserve interest rate decision +school shooting tragedy response +diplomatic tensions between superpowers +drone strike kills terrorist leader +social media platform faces regulation +archaeological discovery reveals ancient civilization +unemployment rate hits record low +autonomous vehicles testing expansion +streaming service launches original content +opioid crisis intervention programs +trade war tariffs impact economy +infrastructure bill passes Congress +data privacy concerns grow +minimum wage increase proposal +college admissions scandal exposed +NFL player protest during anthem +cryptocurrency regulation debate +pandemic lockdown restrictions eased +mass shooting gun control debate +tax reform legislation impact +ransomware attack cripples pipeline +climate activists stage demonstration +sports team wins championship +banking system collapse fears +pharmaceutical company fraud charges +genetic engineering ethical concerns +border wall funding controversy +impeachment proceedings begin +nuclear weapons treaty violation +artificial meat alternative launch +student loan debt forgiveness +venture capital funding decline +facial recognition ban proposed +election interference investigation +pandemic preparedness failures +police reform measures announced +wildfire prevention strategies +ocean pollution crisis worsens +manufacturing jobs returning +pension fund shortfall concerns +antitrust investigation launched +voting rights protection act +mental health awareness campaign +homeless population increasing +space 
debris collision risk +drug cartel violence escalates +renewable energy jobs growth +infrastructure deterioration report +vaccine mandate legal challenge +cryptocurrency market volatility +autonomous drone delivery service +deep fake technology dangers +Arctic ice melting accelerates +income inequality gap widens +election fraud claims disputed +corporate merger blocked +medical breakthrough extends life +transportation strike disrupts city +racial justice protests spread +carbon emissions reduction goals +financial crisis warning signs +cyberbullying prevention efforts +asteroid near miss with Earth +gene therapy approval granted +labor union organizing drive +surveillance technology expansion +education funding cuts proposed +disaster relief efforts underway +housing affordability crisis +clean water access shortage +artificial intelligence job displacement +trade agreement negotiations +prison reform initiative launched +species extinction accelerates +political corruption scandal +terrorism threat level raised +food safety contamination outbreak +ai model release +affordability interest rates +peanut allergies in newbons +breaking bad walter white \ No newline at end of file diff --git a/benchmarks-ann/faiss_kmeans.py b/benchmarks-ann/faiss_kmeans.py new file mode 100644 index 0000000..9765a7b --- /dev/null +++ b/benchmarks-ann/faiss_kmeans.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +"""Compute k-means centroids using FAISS and save to a centroids DB. + +Reads the first N vectors from a base.db, runs FAISS k-means, and writes +the centroids to an output SQLite DB as float32 blobs. 
def main():
    """Train FAISS k-means on the first N train vectors and save the centroids.

    Loads up to ``--ntrain`` float32 vectors from the ``train`` table of
    ``--base-db`` (ordered by id), L2-normalizes them, runs
    ``faiss.Kmeans`` with ``--nclusters`` clusters, and writes the result to
    ``--output`` (replacing any existing file) as a ``centroids`` table plus a
    ``meta`` key/value table describing the run.

    Exits with an error message when the train table yields no vectors.
    """
    parser = argparse.ArgumentParser(description="FAISS k-means centroid computation")
    parser.add_argument("--base-db", required=True, help="path to base.db with train table")
    parser.add_argument("--ntrain", type=int, required=True, help="number of vectors to train on")
    parser.add_argument("--nclusters", type=int, required=True, help="number of clusters (nlist)")
    parser.add_argument("--niter", type=int, default=20, help="k-means iterations (default 20)")
    parser.add_argument("--seed", type=int, default=42, help="random seed")
    parser.add_argument("-o", "--output", required=True, help="output centroids DB path")
    args = parser.parse_args()

    # Load vectors
    print(f"Loading {args.ntrain} vectors from {args.base_db}...")
    conn = sqlite3.connect(args.base_db)
    try:
        rows = conn.execute(
            "SELECT vector FROM train ORDER BY id LIMIT ?", (args.ntrain,)
        ).fetchall()
    finally:
        conn.close()

    if not rows:
        # Without this guard the dimension probe below dies with a bare IndexError.
        raise SystemExit(f"No vectors found in train table of {args.base_db}")

    # Parse float32 blobs to numpy; the first blob fixes the dimensionality.
    n_loaded = len(rows)
    D = len(rows[0][0]) // 4  # float32
    print(f"  Dimensions: {D}, loaded {n_loaded} vectors")

    vectors = np.zeros((n_loaded, D), dtype=np.float32)
    for i, (blob,) in enumerate(rows):
        vectors[i] = np.frombuffer(blob, dtype=np.float32)

    # Normalize for cosine distance (FAISS k-means on L2 of unit vectors ≈ cosine)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1  # leave all-zero vectors untouched instead of dividing by 0
    vectors /= norms

    # Run FAISS k-means
    print(f"Running k-means: {args.nclusters} clusters, {args.niter} iterations...")
    t0 = time.perf_counter()
    kmeans = faiss.Kmeans(
        D, args.nclusters,
        niter=args.niter,
        seed=args.seed,
        verbose=True,
        gpu=False,
    )
    kmeans.train(vectors)
    elapsed = time.perf_counter() - t0
    print(f"  Done in {elapsed:.1f}s")

    centroids = kmeans.centroids  # (nclusters, D) float32

    # Write output DB
    if os.path.exists(args.output):
        os.remove(args.output)
    out = sqlite3.connect(args.output)
    try:
        out.execute("CREATE TABLE centroids (centroid_id INTEGER PRIMARY KEY, centroid BLOB NOT NULL)")
        out.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT)")

        out.executemany(
            "INSERT INTO centroids (centroid_id, centroid) VALUES (?, ?)",
            ((i, centroids[i].tobytes()) for i in range(args.nclusters)),
        )

        # 'ntrain' records the number of vectors actually trained on, which is
        # smaller than --ntrain when the train table holds fewer rows.
        out.executemany(
            "INSERT INTO meta VALUES (?, ?)",
            [
                ("ntrain", str(n_loaded)),
                ("nclusters", str(args.nclusters)),
                ("dimensions", str(D)),
                ("niter", str(args.niter)),
                ("elapsed_s", str(round(elapsed, 3))),
                ("seed", str(args.seed)),
            ],
        )
        out.commit()
    finally:
        out.close()

    print(f"Wrote {args.nclusters} centroids to {args.output}")
def gen_ground_truth_subset(base_db, ext_path, subset_size, n_queries, k, out_path):
    """Build ground truth by brute-force KNN over the first `subset_size` vectors.

    Creates ``out_path`` from scratch (an existing file is removed), copies the
    train vectors with ``id < subset_size`` from ``base_db`` into a temporary
    cosine vec0 table, runs a k-NN query for each of the first ``n_queries``
    query vectors (ordered by id), and stores the hits in ``ground_truth``
    with rank starting at 0.

    Raises:
        ValueError: if the train table of ``base_db`` is empty.
    """
    if os.path.exists(out_path):
        os.remove(out_path)

    conn = sqlite3.connect(out_path)
    try:
        conn.enable_load_extension(True)
        conn.load_extension(ext_path)
        conn.enable_load_extension(False)

        conn.execute(
            "CREATE TABLE ground_truth ("
            "  query_vector_id INTEGER NOT NULL,"
            "  rank INTEGER NOT NULL,"
            "  neighbor_id INTEGER NOT NULL,"
            "  distance REAL NOT NULL,"
            "  PRIMARY KEY (query_vector_id, rank)"
            ")"
        )

        # Bind the path instead of splicing it into the SQL text, so a quote
        # in the file name cannot break the statement.
        conn.execute("ATTACH DATABASE ? AS base", (base_db,))

        # Take the dimensionality from the stored float32 blobs rather than
        # assuming 768, so base DBs built with other models work too.
        probe = conn.execute("SELECT length(vector) FROM base.train LIMIT 1").fetchone()
        if probe is None:
            raise ValueError(f"train table in {base_db} is empty")
        dimensions = probe[0] // 4  # float32

        print(f"  Building temp vec0 table with {subset_size} vectors...")
        conn.execute(
            "CREATE VIRTUAL TABLE tmp_vec USING vec0("
            "  id integer primary key,"
            f"  embedding float[{dimensions}] distance_metric=cosine"
            ")"
        )

        t0 = time.perf_counter()
        # NOTE(review): `id < :n` yields exactly subset_size rows only when
        # train ids start at 0 -- confirm against how the base DB is seeded.
        conn.execute(
            "INSERT INTO tmp_vec(id, embedding) "
            "SELECT id, vector FROM base.train WHERE id < :n",
            {"n": subset_size},
        )
        conn.commit()
        build_time = time.perf_counter() - t0
        print(f"  Temp table built in {build_time:.1f}s")

        query_vectors = conn.execute(
            "SELECT id, vector FROM base.query_vectors ORDER BY id LIMIT :n",
            {"n": n_queries},
        ).fetchall()
        n_total = len(query_vectors)

        print(f"  Running brute-force KNN for {n_total} queries, k={k}...")
        t0 = time.perf_counter()

        for i, (qid, qvec) in enumerate(query_vectors):
            results = conn.execute(
                "SELECT id, distance FROM tmp_vec "
                "WHERE embedding MATCH :query AND k = :k",
                {"query": qvec, "k": k},
            ).fetchall()

            conn.executemany(
                "INSERT INTO ground_truth(query_vector_id, rank, neighbor_id, distance) "
                "VALUES (?, ?, ?, ?)",
                [(qid, rank, nid, dist) for rank, (nid, dist) in enumerate(results)],
            )

            # Progress line on the first query and every tenth one after it.
            if (i + 1) % 10 == 0 or i == 0:
                elapsed = time.perf_counter() - t0
                eta = (elapsed / (i + 1)) * (n_total - i - 1)
                print(
                    f"    {i+1}/{n_total} queries "
                    f"elapsed={elapsed:.1f}s eta={eta:.1f}s",
                    flush=True,
                )

        conn.commit()
        conn.execute("DROP TABLE tmp_vec")
        conn.execute("DETACH DATABASE base")
        conn.commit()

        elapsed = time.perf_counter() - t0
        total_rows = conn.execute("SELECT count(*) FROM ground_truth").fetchone()[0]
    finally:
        conn.close()
    print(f"  Ground truth: {total_rows} rows in {elapsed:.1f}s -> {out_path}")


def gen_ground_truth_full(base_db, n_queries, k, out_path):
    """Convert the existing neighbors table for the full 1M dataset.

    Creates ``out_path`` from scratch (an existing file is removed) and copies
    rows from ``base_db``'s ``neighbors`` table into ``ground_truth`` for the
    first ``n_queries`` query vectors (ordered by id) and ranks below ``k``.
    ``neighbors_id`` is cast from text to integer. ``distance`` is left NULL
    because the copy does not populate it.
    """
    if os.path.exists(out_path):
        os.remove(out_path)

    conn = sqlite3.connect(out_path)
    try:
        # Bind the path instead of splicing it into the SQL text.
        conn.execute("ATTACH DATABASE ? AS base", (base_db,))

        conn.execute(
            "CREATE TABLE ground_truth ("
            "  query_vector_id INTEGER NOT NULL,"
            "  rank INTEGER NOT NULL,"
            "  neighbor_id INTEGER NOT NULL,"
            "  distance REAL,"
            "  PRIMARY KEY (query_vector_id, rank)"
            ")"
        )

        # Pick the queries the same way gen_ground_truth_subset does (first
        # n by id). A plain `query_vector_id < n` drops the n-th query
        # whenever query ids start at 1 instead of 0.
        conn.execute(
            "INSERT INTO ground_truth(query_vector_id, rank, neighbor_id) "
            "SELECT query_vector_id, rank, CAST(neighbors_id AS INTEGER) "
            "FROM base.neighbors "
            "WHERE query_vector_id IN ("
            "  SELECT id FROM base.query_vectors ORDER BY id LIMIT :n"
            ") AND rank < :k",
            {"n": n_queries, "k": k},
        )
        conn.commit()

        total_rows = conn.execute("SELECT count(*) FROM ground_truth").fetchone()[0]
        conn.execute("DETACH DATABASE base")
    finally:
        conn.close()
    print(f"  Ground truth (full): {total_rows} rows -> {out_path}")


def main():
    """Parse CLI arguments and write ground_truth.{subset_size}.db.

    Subset sizes at or above ``FULL_DATASET_SIZE`` reuse the neighbors already
    stored in the base DB; smaller subsets are recomputed by brute force.
    """
    parser = argparse.ArgumentParser(description="Generate per-subset ground truth")
    parser.add_argument(
        "--subset-size", type=int, required=True, help="number of vectors in subset"
    )
    parser.add_argument("-n", type=int, default=100, help="number of query vectors")
    parser.add_argument("-k", type=int, default=100, help="max k for ground truth")
    parser.add_argument("--base-db", default=BASE_DB)
    parser.add_argument("--ext", default=EXT_PATH)
    parser.add_argument(
        "-o", "--out-dir", default=os.path.join(_SCRIPT_DIR, "seed"),
        help="output directory for ground_truth.{N}.db",
    )
    args = parser.parse_args()

    os.makedirs(args.out_dir, exist_ok=True)
    out_path = os.path.join(args.out_dir, f"ground_truth.{args.subset_size}.db")

    if args.subset_size >= FULL_DATASET_SIZE:
        gen_ground_truth_full(args.base_db, args.n, args.k, out_path)
    else:
        gen_ground_truth_subset(
            args.base_db, args.ext, args.subset_size, args.n, args.k, out_path
        )
+ +Usage: + cd benchmarks-ann + uv run profile.py --subset-size 50000 -n 50 \\ + "baseline-int8:type=baseline,variant=int8,oversample=8" \\ + "rescore-int8:type=rescore,quantizer=int8,oversample=8" +""" + +import argparse +import os +import re +import shutil +import subprocess +import sys +import tempfile + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_ROOT = os.path.join(_SCRIPT_DIR, "..") + +sys.path.insert(0, _SCRIPT_DIR) +from bench import ( + BASE_DB, + DEFAULT_INSERT_SQL, + INDEX_REGISTRY, + INSERT_BATCH_SIZE, + parse_config, +) + +SQLITE3_PATH = os.path.join(_PROJECT_ROOT, "dist", "sqlite3") +EXT_PATH = os.path.join(_PROJECT_ROOT, "dist", "vec0") + + +# ============================================================================ +# SQL generation +# ============================================================================ + + +def _query_sql_for_config(params, query_id, k): + """Return a SQL query string for a single KNN query by query_vector id.""" + index_type = params["index_type"] + qvec = f"(SELECT vector FROM base.query_vectors WHERE id = {query_id})" + + if index_type == "baseline": + variant = params.get("variant", "float") + oversample = params.get("oversample", 8) + oversample_k = k * oversample + + if variant == "int8": + return ( + f"WITH coarse AS (" + f" SELECT id, embedding FROM vec_items" + f" WHERE embedding_int8 MATCH vec_quantize_int8({qvec}, 'unit')" + f" LIMIT {oversample_k}" + f") " + f"SELECT id, vec_distance_cosine(embedding, {qvec}) as distance " + f"FROM coarse ORDER BY 2 LIMIT {k};" + ) + elif variant == "bit": + return ( + f"WITH coarse AS (" + f" SELECT id, embedding FROM vec_items" + f" WHERE embedding_bq MATCH vec_quantize_binary({qvec})" + f" LIMIT {oversample_k}" + f") " + f"SELECT id, vec_distance_cosine(embedding, {qvec}) as distance " + f"FROM coarse ORDER BY 2 LIMIT {k};" + ) + + # Default MATCH query (baseline-float, rescore, and others) + return ( + f"SELECT id, distance FROM vec_items" + f" 
WHERE embedding MATCH {qvec} AND k = {k};" + ) + + +def generate_sql(db_path, params, subset_size, n_queries, k, repeats): + """Generate a complete SQL workload: load ext, create table, insert, query.""" + lines = [] + lines.append(".bail on") + lines.append(f".load {EXT_PATH}") + lines.append(f"ATTACH DATABASE '{os.path.abspath(BASE_DB)}' AS base;") + lines.append("PRAGMA page_size=8192;") + + # Create table + reg = INDEX_REGISTRY[params["index_type"]] + lines.append(reg["create_table_sql"](params) + ";") + + # Inserts + sql_fn = reg.get("insert_sql") + insert_sql = sql_fn(params) if sql_fn else None + if insert_sql is None: + insert_sql = DEFAULT_INSERT_SQL + for lo in range(0, subset_size, INSERT_BATCH_SIZE): + hi = min(lo + INSERT_BATCH_SIZE, subset_size) + stmt = insert_sql.replace(":lo", str(lo)).replace(":hi", str(hi)) + lines.append(stmt + ";") + if hi % 10000 == 0 or hi == subset_size: + lines.append("-- progress: inserted %d/%d" % (hi, subset_size)) + + # Queries (repeated) + lines.append("-- BEGIN QUERIES") + for _rep in range(repeats): + for qid in range(n_queries): + lines.append(_query_sql_for_config(params, qid, k)) + + return "\n".join(lines) + + +# ============================================================================ +# Profiling with macOS `sample` +# ============================================================================ + + +def run_profile(sqlite3_path, db_path, sql_file, sample_output, duration=120): + """Run sqlite3 under macOS `sample` profiler. + + Starts sqlite3 directly with stdin from the SQL file, then immediately + attaches `sample` to its PID with -mayDie (tolerates process exit). + The workload must be long enough for sample to attach and capture useful data. 
+ """ + sql_fd = open(sql_file, "r") + proc = subprocess.Popen( + [sqlite3_path, db_path], + stdin=sql_fd, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + ) + + pid = proc.pid + print(f" sqlite3 PID: {pid}") + + # Attach sample immediately (1ms interval, -mayDie tolerates process exit) + sample_proc = subprocess.Popen( + ["sample", str(pid), str(duration), "1", "-mayDie", "-file", sample_output], + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + ) + + # Wait for sqlite3 to finish + _, stderr = proc.communicate() + sql_fd.close() + rc = proc.returncode + if rc != 0: + print(f" sqlite3 failed (rc={rc}):", file=sys.stderr) + print(f" {stderr.decode().strip()}", file=sys.stderr) + sample_proc.kill() + return False + + # Wait for sample to finish + sample_proc.wait() + return True + + +# ============================================================================ +# Parse `sample` output +# ============================================================================ + +# Tree-drawing characters used by macOS `sample` to represent hierarchy. +# We replace them with spaces so indentation depth reflects tree depth. +_TREE_CHARS_RE = re.compile(r"[+!:|]") + +# After tree chars are replaced with spaces, each call-graph line looks like: +# " 800 rescore_knn (in vec0.dylib) + 3808,3640,... [0x1a,0x2b,...] file.c:123" +# We extract just (indent, count, symbol, module) — everything after "(in ...)" +# is decoration we don't need. +_LEADING_RE = re.compile(r"^(\s+)(\d+)\s+(.+)") + + +def _extract_symbol_and_module(rest): + """Given the text after 'count ', extract (symbol, module). + + Handles patterns like: + 'rescore_knn (in vec0.dylib) + 3808,3640,... [0x...]' + 'pread (in libsystem_kernel.dylib) + 8 [0x...]' + '??? (in ) [0x...]' + 'start (in dyld) + 2840 [0x198650274]' + 'Thread_26759239 DispatchQueue_1: ...' 
+ """ + # Try to find "(in ...)" to split symbol from module + m = re.match(r"^(.+?)\s+\(in\s+(.+?)\)", rest) + if m: + return m.group(1).strip(), m.group(2).strip() + # No module — return whole thing as symbol, strip trailing junk + sym = re.sub(r"\s+\[0x[0-9a-f].*", "", rest).strip() + return sym, "" + + +def _parse_call_graph_lines(text): + """Parse call-graph section into list of (depth, count, symbol, module).""" + entries = [] + for raw_line in text.split("\n"): + # Strip tree-drawing characters, replace with spaces to preserve depth + line = _TREE_CHARS_RE.sub(" ", raw_line) + m = _LEADING_RE.match(line) + if not m: + continue + depth = len(m.group(1)) + count = int(m.group(2)) + rest = m.group(3) + symbol, module = _extract_symbol_and_module(rest) + entries.append((depth, count, symbol, module)) + return entries + + +def parse_sample_output(filepath): + """Parse `sample` call-graph output, compute exclusive (self) samples per function. + + Returns dict of {display_name: self_sample_count}. 
+ """ + with open(filepath, "r") as f: + text = f.read() + + # Find "Call graph:" section + cg_start = text.find("Call graph:") + if cg_start == -1: + print(" Warning: no 'Call graph:' section found in sample output") + return {} + + # End at "Total number in stack" or EOF + cg_end = text.find("\nTotal number in stack", cg_start) + if cg_end == -1: + cg_end = len(text) + + entries = _parse_call_graph_lines(text[cg_start:cg_end]) + + if not entries: + print(" Warning: no call graph entries parsed") + return {} + + # Compute self (exclusive) samples per function: + # self = count - sum(direct_children_counts) + self_samples = {} + for i, (depth, count, sym, mod) in enumerate(entries): + children_sum = 0 + child_depth = None + for j in range(i + 1, len(entries)): + j_depth = entries[j][0] + if j_depth <= depth: + break + if child_depth is None: + child_depth = j_depth + if j_depth == child_depth: + children_sum += entries[j][1] + + self_count = count - children_sum + if self_count > 0: + key = f"{sym} ({mod})" if mod else sym + self_samples[key] = self_samples.get(key, 0) + self_count + + return self_samples + + +# ============================================================================ +# Display +# ============================================================================ + + +def print_profile(title, self_samples, top_n=20): + total = sum(self_samples.values()) + if total == 0: + print(f"\n=== {title} (no samples) ===") + return + + sorted_syms = sorted(self_samples.items(), key=lambda x: -x[1]) + + print(f"\n=== {title} (top {top_n}, {total} total self-samples) ===") + for sym, count in sorted_syms[:top_n]: + pct = 100.0 * count / total + print(f" {pct:5.1f}% {count:>6} {sym}") + + +# ============================================================================ +# Main +# ============================================================================ + + +def main(): + parser = argparse.ArgumentParser( + description="CPU profiling for sqlite-vec KNN 
configurations", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "configs", nargs="+", help="config specs (name:type=X,key=val,...)" + ) + parser.add_argument("--subset-size", type=int, required=True) + parser.add_argument("-k", type=int, default=10, help="KNN k (default 10)") + parser.add_argument( + "-n", type=int, default=50, help="number of distinct queries (default 50)" + ) + parser.add_argument( + "--repeats", + type=int, + default=10, + help="repeat query set N times for more samples (default 10)", + ) + parser.add_argument( + "--top", type=int, default=20, help="show top N functions (default 20)" + ) + parser.add_argument("--base-db", default=BASE_DB) + parser.add_argument("--sqlite3", default=SQLITE3_PATH) + parser.add_argument( + "--keep-temp", + action="store_true", + help="keep temp directory with DBs, SQL, and sample output", + ) + args = parser.parse_args() + + # Check prerequisites + if not os.path.exists(args.base_db): + print(f"Error: base DB not found at {args.base_db}", file=sys.stderr) + print("Run 'make seed' in benchmarks-ann/ first.", file=sys.stderr) + sys.exit(1) + + if not shutil.which("sample"): + print("Error: macOS 'sample' tool not found.", file=sys.stderr) + sys.exit(1) + + # Build CLI + print("Building dist/sqlite3...") + result = subprocess.run( + ["make", "cli"], cwd=_PROJECT_ROOT, capture_output=True, text=True + ) + if result.returncode != 0: + print(f"Error: make cli failed:\n{result.stderr}", file=sys.stderr) + sys.exit(1) + print(" done.") + + if not os.path.exists(args.sqlite3): + print(f"Error: sqlite3 not found at {args.sqlite3}", file=sys.stderr) + sys.exit(1) + + configs = [parse_config(c) for c in args.configs] + + tmpdir = tempfile.mkdtemp(prefix="sqlite-vec-profile-") + print(f"Working directory: {tmpdir}") + + all_profiles = [] + + for i, (name, params) in enumerate(configs, 1): + reg = INDEX_REGISTRY[params["index_type"]] + desc = reg["describe"](params) + 
print(f"\n[{i}/{len(configs)}] {name} ({desc})") + + # Generate SQL workload + db_path = os.path.join(tmpdir, f"{name}.db") + sql_text = generate_sql( + db_path, params, args.subset_size, args.n, args.k, args.repeats + ) + sql_file = os.path.join(tmpdir, f"{name}.sql") + with open(sql_file, "w") as f: + f.write(sql_text) + + total_queries = args.n * args.repeats + print( + f" SQL workload: {args.subset_size} inserts + " + f"{total_queries} queries ({args.n} x {args.repeats} repeats)" + ) + + # Profile + sample_file = os.path.join(tmpdir, f"{name}.sample.txt") + print(f" Profiling...") + ok = run_profile(args.sqlite3, db_path, sql_file, sample_file) + if not ok: + print(f" FAILED — skipping {name}") + all_profiles.append((name, desc, {})) + continue + + if not os.path.exists(sample_file): + print(f" Warning: sample output not created") + all_profiles.append((name, desc, {})) + continue + + # Parse + self_samples = parse_sample_output(sample_file) + all_profiles.append((name, desc, self_samples)) + + # Show individual profile + print_profile(f"{name} ({desc})", self_samples, args.top) + + # Side-by-side comparison if multiple configs + if len(all_profiles) > 1: + print("\n" + "=" * 80) + print("COMPARISON") + print("=" * 80) + + # Collect all symbols that appear in top-N of any config + all_syms = set() + for _name, _desc, prof in all_profiles: + sorted_syms = sorted(prof.items(), key=lambda x: -x[1]) + for sym, _count in sorted_syms[: args.top]: + all_syms.add(sym) + + # Build comparison table + rows = [] + for sym in all_syms: + row = [sym] + for _name, _desc, prof in all_profiles: + total = sum(prof.values()) + count = prof.get(sym, 0) + pct = 100.0 * count / total if total > 0 else 0.0 + row.append((pct, count)) + max_pct = max(r[0] for r in row[1:]) + rows.append((max_pct, row)) + + rows.sort(key=lambda x: -x[0]) + + # Header + header = f"{'function':>40}" + for name, desc, _ in all_profiles: + header += f" {name:>14}" + print(header) + print("-" * len(header)) 
+ + for _sort_key, row in rows[: args.top * 2]: + sym = row[0] + display_sym = sym if len(sym) <= 40 else sym[:37] + "..." + line = f"{display_sym:>40}" + for pct, count in row[1:]: + if count > 0: + line += f" {pct:>13.1f}%" + else: + line += f" {'-':>14}" + print(line) + + if args.keep_temp: + print(f"\nTemp files kept at: {tmpdir}") + else: + shutil.rmtree(tmpdir) + print(f"\nTemp files cleaned up. Use --keep-temp to preserve.") + + +if __name__ == "__main__": + main() diff --git a/benchmarks-ann/results_schema.sql b/benchmarks-ann/results_schema.sql new file mode 100644 index 0000000..7918709 --- /dev/null +++ b/benchmarks-ann/results_schema.sql @@ -0,0 +1,76 @@ +-- Comprehensive results schema for vec0 KNN benchmark runs. +-- Created in WAL mode: PRAGMA journal_mode=WAL + +CREATE TABLE IF NOT EXISTS runs ( + run_id INTEGER PRIMARY KEY AUTOINCREMENT, + config_name TEXT NOT NULL, + index_type TEXT NOT NULL, + params TEXT NOT NULL, -- JSON: {"R":48,"L":128,"quantizer":"binary"} + dataset TEXT NOT NULL, -- "cohere1m" + subset_size INTEGER NOT NULL, + k INTEGER NOT NULL, + n_queries INTEGER NOT NULL, + phase TEXT NOT NULL DEFAULT 'both', + -- 'build', 'query', or 'both' + status TEXT NOT NULL DEFAULT 'pending', + -- pending → inserting → training → querying → done | built | error + created_at_ns INTEGER NOT NULL -- time.time_ns() +); + +CREATE TABLE IF NOT EXISTS run_results ( + run_id INTEGER PRIMARY KEY REFERENCES runs(run_id), + insert_started_ns INTEGER, + insert_ended_ns INTEGER, + insert_duration_ns INTEGER, + train_started_ns INTEGER, -- NULL if no training + train_ended_ns INTEGER, + train_duration_ns INTEGER, + build_duration_ns INTEGER, -- insert + train + db_file_size_bytes INTEGER, + db_file_path TEXT, + create_sql TEXT, -- CREATE VIRTUAL TABLE ... + insert_sql TEXT, -- INSERT INTO vec_items ... + train_sql TEXT, -- NULL if no training step + query_sql TEXT, -- SELECT ... WHERE embedding MATCH ... 
+ k INTEGER, -- denormalized from runs for easy filtering + query_mean_ms REAL, -- denormalized aggregates + query_median_ms REAL, + query_p99_ms REAL, + query_total_ms REAL, + qps REAL, + recall REAL +); + +CREATE TABLE IF NOT EXISTS insert_batches ( + batch_id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES runs(run_id), + batch_lo INTEGER NOT NULL, -- start index (inclusive) + batch_hi INTEGER NOT NULL, -- end index (exclusive) + rows_in_batch INTEGER NOT NULL, + started_ns INTEGER NOT NULL, + ended_ns INTEGER NOT NULL, + duration_ns INTEGER NOT NULL, + cumulative_rows INTEGER NOT NULL, -- total rows inserted so far + rate_rows_per_s REAL NOT NULL -- cumulative rate +); + +CREATE TABLE IF NOT EXISTS queries ( + query_id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES runs(run_id), + k INTEGER NOT NULL, + query_vector_id INTEGER NOT NULL, + started_ns INTEGER NOT NULL, + ended_ns INTEGER NOT NULL, + duration_ms REAL NOT NULL, + result_ids TEXT NOT NULL, -- JSON array + result_distances TEXT NOT NULL, -- JSON array + ground_truth_ids TEXT NOT NULL, -- JSON array + recall REAL NOT NULL, + UNIQUE(run_id, k, query_vector_id) +); + +CREATE INDEX IF NOT EXISTS idx_runs_config ON runs(config_name); +CREATE INDEX IF NOT EXISTS idx_runs_type ON runs(index_type); +CREATE INDEX IF NOT EXISTS idx_runs_status ON runs(status); +CREATE INDEX IF NOT EXISTS idx_batches_run ON insert_batches(run_id); +CREATE INDEX IF NOT EXISTS idx_queries_run ON queries(run_id); diff --git a/benchmarks-ann/schema.sql b/benchmarks-ann/schema.sql new file mode 100644 index 0000000..ae8acf3 --- /dev/null +++ b/benchmarks-ann/schema.sql @@ -0,0 +1,60 @@ +-- Canonical results schema for vec0 KNN benchmark comparisons. +-- The index_type column is a free-form TEXT field. Baseline configs use +-- "baseline"; index-specific branches add their own types (registered +-- via INDEX_REGISTRY in bench.py). 
+ +CREATE TABLE IF NOT EXISTS runs ( + run_id INTEGER PRIMARY KEY AUTOINCREMENT, + config_name TEXT NOT NULL, + index_type TEXT NOT NULL, + subset_size INTEGER NOT NULL, + phase TEXT NOT NULL DEFAULT 'both', -- 'build', 'query', or 'both' + status TEXT NOT NULL DEFAULT 'pending', + k INTEGER, + n INTEGER, + db_path TEXT, + insert_time_s REAL, + train_time_s REAL, + total_build_time_s REAL, + rows INTEGER, + file_size_mb REAL, + mean_ms REAL, + median_ms REAL, + p99_ms REAL, + total_query_ms REAL, + qps REAL, + recall REAL, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + finished_at TEXT +); + +CREATE TABLE IF NOT EXISTS build_results ( + config_name TEXT NOT NULL, + index_type TEXT NOT NULL, + subset_size INTEGER NOT NULL, + db_path TEXT NOT NULL, + insert_time_s REAL NOT NULL, + train_time_s REAL, -- NULL when no training/build step is needed + total_time_s REAL NOT NULL, + rows INTEGER NOT NULL, + file_size_mb REAL NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + PRIMARY KEY (config_name, subset_size) +); + +CREATE TABLE IF NOT EXISTS bench_results ( + config_name TEXT NOT NULL, + index_type TEXT NOT NULL, + subset_size INTEGER NOT NULL, + k INTEGER NOT NULL, + n INTEGER NOT NULL, + mean_ms REAL NOT NULL, + median_ms REAL NOT NULL, + p99_ms REAL NOT NULL, + total_ms REAL NOT NULL, + qps REAL NOT NULL, + recall REAL NOT NULL, + db_path TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + PRIMARY KEY (config_name, subset_size, k) +); diff --git a/benchmarks/exhaustive-memory/bench.py b/benchmarks/exhaustive-memory/bench.py index c9da831..7c969d6 100644 --- a/benchmarks/exhaustive-memory/bench.py +++ b/benchmarks/exhaustive-memory/bench.py @@ -248,59 +248,6 @@ def bench_libsql(base, query, page_size, k) -> BenchResult: return BenchResult(f"libsql ({page_size})", build_time, times) -def register_np(db, array, name): - ptr = array.__array_interface__["data"][0] - nvectors, dimensions = array.__array_interface__["shape"] - 
element_type = array.__array_interface__["typestr"] - - assert element_type == " BenchResult: - print(f"sqlite-vec static...") - - db = sqlite3.connect(":memory:") - db.enable_load_extension(True) - db.load_extension("../../dist/vec0") - - - - t = time.time() - register_np(db, base, "base") - build_time = time.time() - t - - times = [] - results = [] - for ( - idx, - q, - ) in enumerate(query): - t0 = time.time() - result = db.execute( - """ - select - rowid - from base - where vector match ? - and k = ? - order by distance - """, - [q.tobytes(), k], - ).fetchall() - assert len(result) == k - times.append(time.time() - t0) - return BenchResult(f"sqlite-vec static", build_time, times) - def bench_faiss(base, query, k) -> BenchResult: import faiss dimensions = base.shape[1] @@ -438,8 +385,6 @@ def suite(name, base, query, k, benchmarks): for b in benchmarks: if b == "faiss": results.append(bench_faiss(base, query, k=k)) - elif b == "vec-static": - results.append(bench_sqlite_vec_static(base, query, k=k)) elif b.startswith("vec-scalar"): _, page_size = b.split('.') results.append(bench_sqlite_vec_scalar(base, query, page_size, k=k)) @@ -541,7 +486,7 @@ def parse_args(): help="Number of queries to use. 
Defaults all", ) parser.add_argument( - "-x", help="type of runs to make", default="faiss,vec-scalar.4096,vec-static,vec-vec0.4096.16,usearch,duckdb,hnswlib,numpy" + "-x", help="type of runs to make", default="faiss,vec-scalar.4096,vec-vec0.4096.16,usearch,duckdb,hnswlib,numpy" ) args = parser.parse_args() diff --git a/benchmarks/profiling/build-from-npy.sql b/benchmarks/profiling/build-from-npy.sql index 134df70..92ef59c 100644 --- a/benchmarks/profiling/build-from-npy.sql +++ b/benchmarks/profiling/build-from-npy.sql @@ -8,10 +8,3 @@ create virtual table vec_items using vec0( embedding float[1536] ); --- 65s (limit 1e5), ~615MB on disk -insert into vec_items - select - rowid, - vector - from vec_npy_each(vec_npy_file('examples/dbpedia-openai/data/vectors.npy')) - limit 1e5; diff --git a/benchmarks/self-params/build.py b/benchmarks/self-params/build.py index bc6e388..c5d9fc1 100644 --- a/benchmarks/self-params/build.py +++ b/benchmarks/self-params/build.py @@ -6,7 +6,6 @@ def connect(path): db = sqlite3.connect(path) db.enable_load_extension(True) db.load_extension("../dist/vec0") - db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')") db.enable_load_extension(False) return db @@ -18,8 +17,6 @@ page_sizes = [ # 4096, 8192, chunk_sizes = [128, 256, 1024, 2048] types = ["f32", "int8", "bit"] -SRC = "../examples/dbpedia-openai/data/vectors.npy" - for page_size in page_sizes: for chunk_size in chunk_sizes: for t in types: @@ -42,15 +39,8 @@ for page_size in page_sizes: func = "vec_quantize_i8(vector, 'unit')" if t == "bit": func = "vec_quantize_binary(vector)" - db.execute( - f""" - insert into vec_items - select rowid, {func} - from vec_npy_each(vec_npy_file(?)) - limit 100000 - """, - [SRC], - ) + # TODO: replace with non-npy data loading + pass elapsed = time.time() - t0 print(elapsed) diff --git a/bindings/go/ncruces/go-sqlite3.patch b/bindings/go/ncruces/go-sqlite3.patch index f202bc3..03bead9 100644 --- 
def strip_lsp_block(content):
    """Remove the LSP-support pattern:
        #ifndef SQLITE_VEC_H
        #include "sqlite-vec.c" // ...
        #endif

    Every occurrence is removed; content without the pattern is returned
    unchanged.
    """
    pattern = re.compile(
        r'^\s*#ifndef\s+SQLITE_VEC_H\s*\n'
        r'\s*#include\s+"sqlite-vec\.c"[^\n]*\n'
        r'\s*#endif[^\n]*\n',
        re.MULTILINE,
    )
    return pattern.sub('', content)


def strip_include_guard(content, guard_macro):
    """Remove the include guard pair:
        #ifndef GUARD_MACRO
        #define GUARD_MACRO
        ...content...
        (trailing #endif removed)

    If the opening #ifndef/#define pair is not found, `content` is returned
    unchanged: dropping the final #endif on its own would unbalance whatever
    conditional it actually closes.
    """
    # Strip the #ifndef / #define pair at the top
    header_pattern = re.compile(
        r'^\s*#ifndef\s+' + re.escape(guard_macro) + r'\s*\n'
        r'\s*#define\s+' + re.escape(guard_macro) + r'\s*\n',
        re.MULTILINE,
    )
    content, removed = header_pattern.subn('', content, count=1)
    if not removed:
        return content

    # Strip the trailing #endif (last one in file that closes the guard)
    lines = content.rstrip('\n').split('\n')
    for i in range(len(lines) - 1, -1, -1):
        if re.match(r'^\s*#endif', lines[i]):
            lines.pop(i)
            break

    return '\n'.join(lines) + '\n'


def detect_include_guard(content):
    """Detect an include guard macro like SQLITE_VEC_IVF_C.

    The guard must open the file, optionally after one leading block comment.
    Returns the macro name, or None when no guard is found.
    """
    m = re.match(
        r'\s*(?:/\*[\s\S]*?\*/\s*)?'  # optional block comment
        r'#ifndef\s+(SQLITE_VEC_\w+_C)\s*\n'
        r'#define\s+\1',
        content,
    )
    return m.group(1) if m else None


def inline_include(match, base_dir):
    """Replace an #include "sqlite-vec-*.c" with the file's contents.

    `match` is a regex match whose group 1 is the included filename, resolved
    relative to `base_dir`. A missing file produces a warning on stderr and
    leaves the original #include text in place.
    """
    filename = match.group(1)
    filepath = os.path.join(base_dir, filename)

    if not os.path.exists(filepath):
        print(f"Warning: {filepath} not found, leaving #include in place", file=sys.stderr)
        return match.group(0)

    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Strip LSP-support block
    content = strip_lsp_block(content)

    # Strip include guard if present
    guard = detect_include_guard(content)
    if guard:
        content = strip_include_guard(content, guard)

    separator = '/' * 78
    header = f'\n{separator}\n// Begin inlined: {filename}\n{separator}\n\n'
    footer = f'\n{separator}\n// End inlined: {filename}\n{separator}\n'

    return header + content.strip('\n') + footer


def amalgamate(input_path):
    """Return the text of `input_path` with sqlite-vec-*.c includes inlined.

    Included files are looked up in the directory containing `input_path`.
    """
    base_dir = os.path.dirname(os.path.abspath(input_path))

    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Replace #include "sqlite-vec-*.c" with inlined contents
    include_pattern = re.compile(r'^#include\s+"(sqlite-vec-[^"]+\.c)"\s*$', re.MULTILINE)
    content = include_pattern.sub(lambda m: inline_include(m, base_dir), content)

    return content


def main():
    """CLI entry point: write the amalgamation of argv[1] to stdout."""
    if len(sys.argv) != 2:
        # Name the expected argument so the usage line is actionable.
        print(f"Usage: {sys.argv[0]} INPUT.c", file=sys.stderr)
        sys.exit(1)

    result = amalgamate(sys.argv[1])
    sys.stdout.write(result)


if __name__ == '__main__':
    main()
- -### `vec_npy_each(vector)` {#vec_npy_each} - -xxx - - -```sql --- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone() -select - rowid, - vector, - vec_type(vector), - vec_to_json(vector) -from vec_npy_each( - X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040' -) -/* -┌───────┬─────────────┬──────────────────┬─────────────────────┐ -│ rowid │ vector │ vec_type(vector) │ vec_to_json(vector) │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 0 │ X'0000803F' │ 'float32' │ '[1.000000]' │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 1 │ X'00000040' │ 'float32' │ '[2.000000]' │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 2 │ X'00004040' │ 'float32' │ '[3.000000]' │ -└───────┴─────────────┴──────────────────┴─────────────────────┘ - -*/ - - --- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone() -select - rowid, - vector, - vec_type(vector), - vec_to_json(vector) -from vec_npy_each( - X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040' -) -/* -┌───────┬─────────────┬──────────────────┬─────────────────────┐ -│ rowid │ vector │ vec_type(vector) │ vec_to_json(vector) │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 0 │ X'0000803F' │ 'float32' │ '[1.000000]' │ -├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 1 │ X'00000040' │ 'float32' │ '[2.000000]' │ 
-├───────┼─────────────┼──────────────────┼─────────────────────┤ -│ 2 │ X'00004040' │ 'float32' │ '[3.000000]' │ -└───────┴─────────────┴──────────────────┴─────────────────────┘ - -*/ - - - ``` ## Meta {#meta} diff --git a/site/compiling.md b/site/compiling.md index 9ce3c83..b3b2e33 100644 --- a/site/compiling.md +++ b/site/compiling.md @@ -59,5 +59,4 @@ The current compile-time flags are: - `SQLITE_VEC_ENABLE_AVX`, enables AVX CPU instructions for some vector search operations - `SQLITE_VEC_ENABLE_NEON`, enables NEON CPU instructions for some vector search operations -- `SQLITE_VEC_OMIT_FS`, removes some obsure SQL functions and features that use the filesystem, meant for some WASM builds where there's no available filesystem - `SQLITE_VEC_STATIC`, meant for statically linking `sqlite-vec` diff --git a/sqlite-dist.toml b/sqlite-dist.toml index ab1c08a..23bdf6e 100644 --- a/sqlite-dist.toml +++ b/sqlite-dist.toml @@ -1,6 +1,6 @@ [package] name = "sqlite-vec" -license = "MIT OR Apache" +license = "MIT OR Apache-2.0" homepage = "https://alexgarcia.xyz/sqlite-vec" repo = "https://github.com/asg017/sqlite-vec" description = "A vector search SQLite extension." diff --git a/sqlite-vec-diskann.c b/sqlite-vec-diskann.c new file mode 100644 index 0000000..5bd298b --- /dev/null +++ b/sqlite-vec-diskann.c @@ -0,0 +1,1889 @@ +// DiskANN algorithm implementation +// This file is #include'd into sqlite-vec.c — not compiled separately. + +// ============================================================ +// DiskANN node blob encode/decode functions +// ============================================================ + +/** Compute size of validity bitmap in bytes. */ +int diskann_validity_byte_size(int n_neighbors) { + return n_neighbors / CHAR_BIT; +} + +/** Compute size of neighbor IDs blob in bytes. */ +size_t diskann_neighbor_ids_byte_size(int n_neighbors) { + return (size_t)n_neighbors * sizeof(i64); +} + +/** Compute size of quantized vectors blob in bytes. 
*/ +size_t diskann_neighbor_qvecs_byte_size( + int n_neighbors, enum Vec0DiskannQuantizerType quantizer_type, + size_t dimensions) { + return (size_t)n_neighbors * + diskann_quantized_vector_byte_size(quantizer_type, dimensions); +} + +/** + * Create empty blobs for a new DiskANN node (all neighbors invalid). + * Caller must free the returned pointers with sqlite3_free(). + */ +int diskann_node_init( + int n_neighbors, enum Vec0DiskannQuantizerType quantizer_type, + size_t dimensions, + u8 **outValidity, int *outValiditySize, + u8 **outNeighborIds, int *outNeighborIdsSize, + u8 **outNeighborQvecs, int *outNeighborQvecsSize) { + + int validitySize = diskann_validity_byte_size(n_neighbors); + size_t idsSize = diskann_neighbor_ids_byte_size(n_neighbors); + size_t qvecsSize = diskann_neighbor_qvecs_byte_size( + n_neighbors, quantizer_type, dimensions); + + u8 *validity = sqlite3_malloc(validitySize); + u8 *ids = sqlite3_malloc(idsSize); + u8 *qvecs = sqlite3_malloc(qvecsSize); + + if (!validity || !ids || !qvecs) { + sqlite3_free(validity); + sqlite3_free(ids); + sqlite3_free(qvecs); + return SQLITE_NOMEM; + } + + memset(validity, 0, validitySize); + memset(ids, 0, idsSize); + memset(qvecs, 0, qvecsSize); + + *outValidity = validity; *outValiditySize = validitySize; + *outNeighborIds = ids; *outNeighborIdsSize = (int)idsSize; + *outNeighborQvecs = qvecs; *outNeighborQvecsSize = (int)qvecsSize; + return SQLITE_OK; +} + +/** Check if neighbor slot i is valid. */ +int diskann_validity_get(const u8 *validity, int i) { + return (validity[i / CHAR_BIT] >> (i % CHAR_BIT)) & 1; +} + +/** Set neighbor slot i as valid (1) or invalid (0). */ +void diskann_validity_set(u8 *validity, int i, int value) { + if (value) { + validity[i / CHAR_BIT] |= (1 << (i % CHAR_BIT)); + } else { + validity[i / CHAR_BIT] &= ~(1 << (i % CHAR_BIT)); + } +} + +/** Count the number of valid neighbors. 
*/ +int diskann_validity_count(const u8 *validity, int n_neighbors) { + int count = 0; + for (int i = 0; i < n_neighbors; i++) { + count += diskann_validity_get(validity, i); + } + return count; +} + +/** Get the rowid of the neighbor in slot i. */ +i64 diskann_neighbor_id_get(const u8 *neighbor_ids, int i) { + i64 result; + memcpy(&result, neighbor_ids + i * sizeof(i64), sizeof(i64)); + return result; +} + +/** Set the rowid of the neighbor in slot i. */ +void diskann_neighbor_id_set(u8 *neighbor_ids, int i, i64 rowid) { + memcpy(neighbor_ids + i * sizeof(i64), &rowid, sizeof(i64)); +} + +/** Get a pointer to the quantized vector in slot i (read-only). */ +const u8 *diskann_neighbor_qvec_get( + const u8 *qvecs, int i, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions) { + size_t qvec_size = diskann_quantized_vector_byte_size(quantizer_type, dimensions); + return qvecs + (size_t)i * qvec_size; +} + +/** Copy a quantized vector into slot i. */ +void diskann_neighbor_qvec_set( + u8 *qvecs, int i, const u8 *src_qvec, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions) { + size_t qvec_size = diskann_quantized_vector_byte_size(quantizer_type, dimensions); + memcpy(qvecs + (size_t)i * qvec_size, src_qvec, qvec_size); +} + +/** + * Set neighbor slot i with a rowid and quantized vector, and mark as valid. + */ +void diskann_node_set_neighbor( + u8 *validity, u8 *neighbor_ids, u8 *qvecs, int i, + i64 neighbor_rowid, const u8 *neighbor_qvec, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions) { + diskann_validity_set(validity, i, 1); + diskann_neighbor_id_set(neighbor_ids, i, neighbor_rowid); + diskann_neighbor_qvec_set(qvecs, i, neighbor_qvec, quantizer_type, dimensions); +} + +/** + * Clear neighbor slot i (mark invalid, zero out data). 
+ */ +void diskann_node_clear_neighbor( + u8 *validity, u8 *neighbor_ids, u8 *qvecs, int i, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions) { + diskann_validity_set(validity, i, 0); + diskann_neighbor_id_set(neighbor_ids, i, 0); + size_t qvec_size = diskann_quantized_vector_byte_size(quantizer_type, dimensions); + memset(qvecs + (size_t)i * qvec_size, 0, qvec_size); +} + +/** + * Quantize a full-precision float32 vector into the target quantizer format. + * Output buffer must be pre-allocated with diskann_quantized_vector_byte_size() bytes. + */ +int diskann_quantize_vector( + const f32 *src, size_t dimensions, + enum Vec0DiskannQuantizerType quantizer_type, + u8 *out) { + + switch (quantizer_type) { + case VEC0_DISKANN_QUANTIZER_BINARY: { + memset(out, 0, dimensions / CHAR_BIT); + for (size_t i = 0; i < dimensions; i++) { + if (src[i] > 0.0f) { + out[i / CHAR_BIT] |= (1 << (i % CHAR_BIT)); + } + } + return SQLITE_OK; + } + case VEC0_DISKANN_QUANTIZER_INT8: { + f32 step = (1.0f - (-1.0f)) / 255.0f; + for (size_t i = 0; i < dimensions; i++) { + ((i8 *)out)[i] = (i8)(((src[i] - (-1.0f)) / step) - 128.0f); + } + return SQLITE_OK; + } + } + return SQLITE_ERROR; +} + +/** + * Compute approximate distance between a full-precision query vector and a + * quantized neighbor vector. Used during graph traversal. + */ +/** + * Compute distance between a pre-quantized query and a quantized neighbor. + * The caller is responsible for quantizing the query vector once and passing + * the result here for each neighbor comparison. 
+ */ +static f32 diskann_distance_quantized_precomputed( + const u8 *query_quantized, const u8 *quantized_neighbor, + size_t dimensions, + enum Vec0DiskannQuantizerType quantizer_type, + enum Vec0DistanceMetrics distance_metric) { + + switch (quantizer_type) { + case VEC0_DISKANN_QUANTIZER_BINARY: + return distance_hamming(query_quantized, quantized_neighbor, &dimensions); + case VEC0_DISKANN_QUANTIZER_INT8: { + switch (distance_metric) { + case VEC0_DISTANCE_METRIC_L2: + return distance_l2_sqr_int8(query_quantized, quantized_neighbor, &dimensions); + case VEC0_DISTANCE_METRIC_COSINE: + return distance_cosine_int8(query_quantized, quantized_neighbor, &dimensions); + case VEC0_DISTANCE_METRIC_L1: + return (f32)distance_l1_int8(query_quantized, quantized_neighbor, &dimensions); + } + break; + } + } + return FLT_MAX; +} + +/** + * Quantize a float query vector. Returns allocated buffer (caller must free). + */ +static u8 *diskann_quantize_query( + const f32 *query_vector, size_t dimensions, + enum Vec0DiskannQuantizerType quantizer_type) { + size_t qsize = diskann_quantized_vector_byte_size(quantizer_type, dimensions); + u8 *buf = sqlite3_malloc(qsize); + if (!buf) return NULL; + diskann_quantize_vector(query_vector, dimensions, quantizer_type, buf); + return buf; +} + +/** + * Legacy wrapper: quantizes on-the-fly (used by callers that don't pre-quantize). 
+ */ +f32 diskann_distance_quantized( + const void *query_vector, const u8 *quantized_neighbor, + size_t dimensions, + enum Vec0DiskannQuantizerType quantizer_type, + enum Vec0DistanceMetrics distance_metric) { + + u8 *query_q = diskann_quantize_query((const f32 *)query_vector, dimensions, quantizer_type); + if (!query_q) return FLT_MAX; + f32 dist = diskann_distance_quantized_precomputed( + query_q, quantized_neighbor, dimensions, quantizer_type, distance_metric); + sqlite3_free(query_q); + return dist; +} + +// ============================================================ +// DiskANN medoid / entry point management +// ============================================================ + +/** + * Get the current medoid rowid for the given vector column's DiskANN index. + * Returns SQLITE_OK with *outMedoid set to the medoid rowid. + * If the graph is empty, returns SQLITE_OK with *outIsEmpty = 1. + */ +static int diskann_medoid_get(vec0_vtab *p, int vec_col_idx, + i64 *outMedoid, int *outIsEmpty) { + int rc; + sqlite3_stmt *stmt = NULL; + char *key = sqlite3_mprintf("diskann_medoid_%02d", vec_col_idx); + char *zSql = sqlite3_mprintf( + "SELECT value FROM " VEC0_SHADOW_INFO_NAME " WHERE key = ?", + p->schemaName, p->tableName); + if (!key || !zSql) { + sqlite3_free(key); + sqlite3_free(zSql); + return SQLITE_NOMEM; + } + + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) { + sqlite3_free(key); + return rc; + } + + sqlite3_bind_text(stmt, 1, key, -1, sqlite3_free); + rc = sqlite3_step(stmt); + if (rc == SQLITE_ROW) { + if (sqlite3_column_type(stmt, 0) == SQLITE_NULL) { + *outIsEmpty = 1; + } else { + *outIsEmpty = 0; + *outMedoid = sqlite3_column_int64(stmt, 0); + } + rc = SQLITE_OK; + } else { + rc = SQLITE_ERROR; + } + sqlite3_finalize(stmt); + return rc; +} + +/** + * Set the medoid rowid for the given vector column's DiskANN index. + * Pass isEmpty = 1 to mark the graph as empty (NULL medoid). 
+ */ +static int diskann_medoid_set(vec0_vtab *p, int vec_col_idx, + i64 medoidRowid, int isEmpty) { + int rc; + sqlite3_stmt *stmt = NULL; + char *key = sqlite3_mprintf("diskann_medoid_%02d", vec_col_idx); + char *zSql = sqlite3_mprintf( + "UPDATE " VEC0_SHADOW_INFO_NAME " SET value = ?2 WHERE key = ?1", + p->schemaName, p->tableName); + if (!key || !zSql) { + sqlite3_free(key); + sqlite3_free(zSql); + return SQLITE_NOMEM; + } + + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) { + sqlite3_free(key); + return rc; + } + + sqlite3_bind_text(stmt, 1, key, -1, sqlite3_free); + if (isEmpty) { + sqlite3_bind_null(stmt, 2); + } else { + sqlite3_bind_int64(stmt, 2, medoidRowid); + } + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR; +} + + +/** + * Called when deleting a vector. If the deleted vector was the medoid, + * pick a new one (the first available vector, or set to empty if none remain). 
+ */ +static int diskann_medoid_handle_delete(vec0_vtab *p, int vec_col_idx, + i64 deletedRowid) { + i64 currentMedoid; + int isEmpty; + int rc = diskann_medoid_get(p, vec_col_idx, ¤tMedoid, &isEmpty); + if (rc != SQLITE_OK) return rc; + + if (!isEmpty && currentMedoid == deletedRowid) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf( + "SELECT rowid FROM " VEC0_SHADOW_VECTORS_N_NAME " WHERE rowid != ?1 LIMIT 1", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + + sqlite3_bind_int64(stmt, 1, deletedRowid); + rc = sqlite3_step(stmt); + if (rc == SQLITE_ROW) { + i64 newMedoid = sqlite3_column_int64(stmt, 0); + sqlite3_finalize(stmt); + return diskann_medoid_set(p, vec_col_idx, newMedoid, 0); + } else { + sqlite3_finalize(stmt); + return diskann_medoid_set(p, vec_col_idx, -1, 1); + } + } + return SQLITE_OK; +} + +// ============================================================ +// DiskANN database I/O helpers +// ============================================================ + +/** + * Read a node's full data from _diskann_nodes. + * Returns blobs that must be freed by the caller with sqlite3_free(). 
+ */ +static int diskann_node_read(vec0_vtab *p, int vec_col_idx, i64 rowid, + u8 **outValidity, int *outValiditySize, + u8 **outNeighborIds, int *outNeighborIdsSize, + u8 **outQvecs, int *outQvecsSize) { + int rc; + if (!p->stmtDiskannNodeRead[vec_col_idx]) { + char *zSql = sqlite3_mprintf( + "SELECT neighbors_validity, neighbor_ids, neighbor_quantized_vectors " + "FROM " VEC0_SHADOW_DISKANN_NODES_N_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, + &p->stmtDiskannNodeRead[vec_col_idx], NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + } + + sqlite3_stmt *stmt = p->stmtDiskannNodeRead[vec_col_idx]; + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, rowid); + + rc = sqlite3_step(stmt); + if (rc != SQLITE_ROW) { + return SQLITE_ERROR; + } + + int vs = sqlite3_column_bytes(stmt, 0); + int is = sqlite3_column_bytes(stmt, 1); + int qs = sqlite3_column_bytes(stmt, 2); + + // Validate blob sizes against config expectations to detect truncated / + // corrupt data before any caller iterates using cfg->n_neighbors. 
+ { + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + struct Vec0DiskannConfig *cfg = &col->diskann; + int expectedVs = diskann_validity_byte_size(cfg->n_neighbors); + int expectedIs = (int)diskann_neighbor_ids_byte_size(cfg->n_neighbors); + int expectedQs = (int)diskann_neighbor_qvecs_byte_size( + cfg->n_neighbors, cfg->quantizer_type, col->dimensions); + if (vs != expectedVs || is != expectedIs || qs != expectedQs) { + return SQLITE_CORRUPT; + } + } + + u8 *v = sqlite3_malloc(vs); + u8 *ids = sqlite3_malloc(is); + u8 *qv = sqlite3_malloc(qs); + if (!v || !ids || !qv) { + sqlite3_free(v); + sqlite3_free(ids); + sqlite3_free(qv); + return SQLITE_NOMEM; + } + + const void *blobV = sqlite3_column_blob(stmt, 0); + const void *blobIds = sqlite3_column_blob(stmt, 1); + const void *blobQv = sqlite3_column_blob(stmt, 2); + if (!blobV || !blobIds || !blobQv) { + sqlite3_free(v); + sqlite3_free(ids); + sqlite3_free(qv); + return SQLITE_ERROR; + } + memcpy(v, blobV, vs); + memcpy(ids, blobIds, is); + memcpy(qv, blobQv, qs); + + *outValidity = v; *outValiditySize = vs; + *outNeighborIds = ids; *outNeighborIdsSize = is; + *outQvecs = qv; *outQvecsSize = qs; + return SQLITE_OK; +} + +/** + * Write (INSERT OR REPLACE) a node's data to _diskann_nodes. 
+ */ +static int diskann_node_write(vec0_vtab *p, int vec_col_idx, i64 rowid, + const u8 *validity, int validitySize, + const u8 *neighborIds, int neighborIdsSize, + const u8 *qvecs, int qvecsSize) { + int rc; + if (!p->stmtDiskannNodeWrite[vec_col_idx]) { + char *zSql = sqlite3_mprintf( + "INSERT OR REPLACE INTO " VEC0_SHADOW_DISKANN_NODES_N_NAME + " (rowid, neighbors_validity, neighbor_ids, neighbor_quantized_vectors) " + "VALUES (?, ?, ?, ?)", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, + &p->stmtDiskannNodeWrite[vec_col_idx], NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + } + + sqlite3_stmt *stmt = p->stmtDiskannNodeWrite[vec_col_idx]; + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, rowid); + sqlite3_bind_blob(stmt, 2, validity, validitySize, SQLITE_TRANSIENT); + sqlite3_bind_blob(stmt, 3, neighborIds, neighborIdsSize, SQLITE_TRANSIENT); + sqlite3_bind_blob(stmt, 4, qvecs, qvecsSize, SQLITE_TRANSIENT); + + rc = sqlite3_step(stmt); + return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR; +} + +/** + * Read the full-precision vector for a given rowid from _vectors. + * Caller must free *outVector with sqlite3_free(). 
+ */ +static int diskann_vector_read(vec0_vtab *p, int vec_col_idx, i64 rowid, + void **outVector, int *outVectorSize) { + int rc; + if (!p->stmtVectorsRead[vec_col_idx]) { + char *zSql = sqlite3_mprintf( + "SELECT vector FROM " VEC0_SHADOW_VECTORS_N_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, + &p->stmtVectorsRead[vec_col_idx], NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + } + + sqlite3_stmt *stmt = p->stmtVectorsRead[vec_col_idx]; + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, rowid); + + rc = sqlite3_step(stmt); + if (rc != SQLITE_ROW) { + return SQLITE_ERROR; + } + + int sz = sqlite3_column_bytes(stmt, 0); + const void *blob = sqlite3_column_blob(stmt, 0); + if (!blob || sz == 0) return SQLITE_ERROR; + void *vec = sqlite3_malloc(sz); + if (!vec) return SQLITE_NOMEM; + memcpy(vec, blob, sz); + + *outVector = vec; + *outVectorSize = sz; + return SQLITE_OK; +} + +/** + * Write a full-precision vector to _vectors. + */ +static int diskann_vector_write(vec0_vtab *p, int vec_col_idx, i64 rowid, + const void *vector, int vectorSize) { + int rc; + if (!p->stmtVectorsInsert[vec_col_idx]) { + char *zSql = sqlite3_mprintf( + "INSERT OR REPLACE INTO " VEC0_SHADOW_VECTORS_N_NAME + " (rowid, vector) VALUES (?, ?)", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, + &p->stmtVectorsInsert[vec_col_idx], NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + } + + sqlite3_stmt *stmt = p->stmtVectorsInsert[vec_col_idx]; + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, rowid); + sqlite3_bind_blob(stmt, 2, vector, vectorSize, SQLITE_TRANSIENT); + + rc = sqlite3_step(stmt); + return (rc == SQLITE_DONE) ? 
SQLITE_OK : SQLITE_ERROR; +} + +// ============================================================ +// DiskANN search data structures +// ============================================================ + +/** + * A sorted candidate list for greedy beam search. + */ +struct DiskannCandidateList { + struct Vec0DiskannCandidate *items; + int count; + int capacity; +}; + +static int diskann_candidate_list_init(struct DiskannCandidateList *list, int capacity) { + list->items = sqlite3_malloc(capacity * sizeof(struct Vec0DiskannCandidate)); + if (!list->items) return SQLITE_NOMEM; + list->count = 0; + list->capacity = capacity; + return SQLITE_OK; +} + +static void diskann_candidate_list_free(struct DiskannCandidateList *list) { + sqlite3_free(list->items); + list->items = NULL; + list->count = 0; + list->capacity = 0; +} + +/** + * Insert a candidate into the sorted list, maintaining sort order by distance. + * Deduplicates by rowid. If at capacity and new candidate is worse, discards it. + * Returns 1 if inserted, 0 if discarded. 
+ */ +static int diskann_candidate_list_insert( + struct DiskannCandidateList *list, i64 rowid, f32 distance) { + + // Check for duplicate + for (int i = 0; i < list->count; i++) { + if (list->items[i].rowid == rowid) { + // Update distance if better + if (distance < list->items[i].distance) { + list->items[i].distance = distance; + // Re-sort this item into position + struct Vec0DiskannCandidate tmp = list->items[i]; + int j = i - 1; + while (j >= 0 && list->items[j].distance > tmp.distance) { + list->items[j + 1] = list->items[j]; + j--; + } + list->items[j + 1] = tmp; + } + return 1; + } + } + + // If at capacity, check if new candidate is better than worst + if (list->count >= list->capacity) { + if (distance >= list->items[list->count - 1].distance) { + return 0; // Discard + } + list->count--; // Make room by dropping the worst + } + + // Binary search for insertion point + int lo = 0, hi = list->count; + while (lo < hi) { + int mid = (lo + hi) / 2; + if (list->items[mid].distance < distance) { + lo = mid + 1; + } else { + hi = mid; + } + } + + // Shift elements to make room + memmove(&list->items[lo + 1], &list->items[lo], + (list->count - lo) * sizeof(struct Vec0DiskannCandidate)); + + list->items[lo].rowid = rowid; + list->items[lo].distance = distance; + list->items[lo].visited = 0; + list->items[lo].confirmed = 0; + list->count++; + return 1; +} + +/** + * Find the closest unvisited candidate. Returns its index, or -1 if none. + */ +static int diskann_candidate_list_next_unvisited( + const struct DiskannCandidateList *list) { + for (int i = 0; i < list->count; i++) { + if (!list->items[i].visited) return i; + } + return -1; +} + + + +/** + * Simple hash set for tracking visited rowids during search. + * Uses open addressing with linear probing. 
+ */ +struct DiskannVisitedSet { + i64 *slots; + int capacity; + int count; +}; + +static int diskann_visited_set_init(struct DiskannVisitedSet *set, int capacity) { + // Round up to power of 2 + int cap = 16; + while (cap < capacity) cap *= 2; + set->slots = sqlite3_malloc(cap * sizeof(i64)); + if (!set->slots) return SQLITE_NOMEM; + memset(set->slots, 0, cap * sizeof(i64)); + set->capacity = cap; + set->count = 0; + return SQLITE_OK; +} + +static void diskann_visited_set_free(struct DiskannVisitedSet *set) { + sqlite3_free(set->slots); + set->slots = NULL; + set->capacity = 0; + set->count = 0; +} + +static int diskann_visited_set_contains(const struct DiskannVisitedSet *set, i64 rowid) { + if (rowid == 0) return 0; // 0 is our sentinel for empty + int mask = set->capacity - 1; + int idx = (int)(((u64)rowid * 0x9E3779B97F4A7C15ULL) >> 32) & mask; + for (int i = 0; i < set->capacity; i++) { + int slot = (idx + i) & mask; + if (set->slots[slot] == 0) return 0; + if (set->slots[slot] == rowid) return 1; + } + return 0; +} + +static int diskann_visited_set_insert(struct DiskannVisitedSet *set, i64 rowid) { + if (rowid == 0) return 0; + int mask = set->capacity - 1; + int idx = (int)(((u64)rowid * 0x9E3779B97F4A7C15ULL) >> 32) & mask; + for (int i = 0; i < set->capacity; i++) { + int slot = (idx + i) & mask; + if (set->slots[slot] == 0) { + set->slots[slot] = rowid; + set->count++; + return 1; + } + if (set->slots[slot] == rowid) return 0; // Already there + } + return 0; // Full (shouldn't happen with proper sizing) +} + +// ============================================================ +// DiskANN greedy beam search (LM-Search) +// ============================================================ + +/** + * Perform LM-Search: greedy beam search over the DiskANN graph. + * Follows Algorithm 1 from the LM-DiskANN paper. 
+ */ +static int diskann_search( + vec0_vtab *p, int vec_col_idx, + const void *queryVector, size_t dimensions, + enum VectorElementType elementType, + int k, int searchListSize, + i64 *outRowids, f32 *outDistances, int *outCount) { + + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + struct Vec0DiskannConfig *cfg = &col->diskann; + int rc; + + if (searchListSize <= 0) { + searchListSize = cfg->search_list_size_search > 0 ? cfg->search_list_size_search : cfg->search_list_size; + } + if (searchListSize < k) { + searchListSize = k; + } + + // 1. Get the medoid (entry point) + i64 medoid; + int isEmpty; + rc = diskann_medoid_get(p, vec_col_idx, &medoid, &isEmpty); + if (rc != SQLITE_OK) return rc; + if (isEmpty) { + *outCount = 0; + return SQLITE_OK; + } + + // 2. Compute distance from query to medoid using full-precision vector + void *medoidVector = NULL; + int medoidVectorSize; + rc = diskann_vector_read(p, vec_col_idx, medoid, &medoidVector, &medoidVectorSize); + if (rc != SQLITE_OK) return rc; + + f32 medoidDist = vec0_distance_full(queryVector, medoidVector, + dimensions, elementType, + col->distance_metric); + sqlite3_free(medoidVector); + + // 3. 
Initialize candidate list and visited set + struct DiskannCandidateList candidates; + rc = diskann_candidate_list_init(&candidates, searchListSize); + if (rc != SQLITE_OK) return rc; + + struct DiskannVisitedSet visited; + rc = diskann_visited_set_init(&visited, searchListSize * 4); + if (rc != SQLITE_OK) { + diskann_candidate_list_free(&candidates); + return rc; + } + + // Seed with medoid (confirmed — we already read its vector above) + diskann_candidate_list_insert(&candidates, medoid, medoidDist); + candidates.items[0].confirmed = 1; + + // Pre-quantize query vector once for all quantized distance comparisons + u8 *queryQuantized = NULL; + if (elementType == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) { + queryQuantized = diskann_quantize_query( + (const f32 *)queryVector, dimensions, cfg->quantizer_type); + } + + // 4. Greedy beam search loop (Algorithm 1 from LM-DiskANN paper) + while (1) { + int nextIdx = diskann_candidate_list_next_unvisited(&candidates); + if (nextIdx < 0) break; + + struct Vec0DiskannCandidate *current = &candidates.items[nextIdx]; + current->visited = 1; + i64 currentRowid = current->rowid; + + // Read the node's neighbor data + u8 *validity = NULL, *neighborIds = NULL, *qvecs = NULL; + int validitySize, neighborIdsSize, qvecsSize; + rc = diskann_node_read(p, vec_col_idx, currentRowid, + &validity, &validitySize, + &neighborIds, &neighborIdsSize, + &qvecs, &qvecsSize); + if (rc != SQLITE_OK) { + continue; // Skip if node doesn't exist + } + + // Insert all valid neighbors with approximate (quantized) distances + for (int i = 0; i < cfg->n_neighbors; i++) { + if (!diskann_validity_get(validity, i)) continue; + + i64 neighborRowid = diskann_neighbor_id_get(neighborIds, i); + + if (diskann_visited_set_contains(&visited, neighborRowid)) continue; + + const u8 *neighborQvec = diskann_neighbor_qvec_get( + qvecs, i, cfg->quantizer_type, dimensions); + + f32 approxDist; + if (queryQuantized) { + approxDist = diskann_distance_quantized_precomputed( + 
queryQuantized, neighborQvec, dimensions, + cfg->quantizer_type, col->distance_metric); + } else { + approxDist = diskann_distance_quantized( + queryVector, neighborQvec, dimensions, + cfg->quantizer_type, col->distance_metric); + } + + diskann_candidate_list_insert(&candidates, neighborRowid, approxDist); + } + + sqlite3_free(validity); + sqlite3_free(neighborIds); + sqlite3_free(qvecs); + + // Add to visited set + diskann_visited_set_insert(&visited, currentRowid); + + // Paper line 13: Re-rank p* using full-precision distance + // We already have exact distance for medoid; for others, update now + void *fullVec = NULL; + int fullVecSize; + rc = diskann_vector_read(p, vec_col_idx, currentRowid, &fullVec, &fullVecSize); + if (rc == SQLITE_OK) { + f32 exactDist = vec0_distance_full(queryVector, fullVec, + dimensions, elementType, + col->distance_metric); + sqlite3_free(fullVec); + // Update distance in candidate list and re-sort + diskann_candidate_list_insert(&candidates, currentRowid, exactDist); + // Mark as confirmed (vector exists, distance is exact) + for (int ci = 0; ci < candidates.count; ci++) { + if (candidates.items[ci].rowid == currentRowid) { + candidates.items[ci].confirmed = 1; + break; + } + } + } + // If vector read failed, candidate stays unconfirmed (stale edge to deleted node) + } + + // 5. 
Output results — only include confirmed candidates (whose vectors exist) + int resultCount = 0; + for (int i = 0; i < candidates.count && resultCount < k; i++) { + if (candidates.items[i].confirmed) { + outRowids[resultCount] = candidates.items[i].rowid; + outDistances[resultCount] = candidates.items[i].distance; + resultCount++; + } + } + *outCount = resultCount; + + sqlite3_free(queryQuantized); + diskann_candidate_list_free(&candidates); + diskann_visited_set_free(&visited); + return SQLITE_OK; +} + +// ============================================================ +// DiskANN RobustPrune (Algorithm 4 from LM-DiskANN paper) +// ============================================================ + +/** + * RobustPrune: Select up to max_neighbors neighbors for node p from a + * candidate set, using alpha-pruning for diversity. + * + * Following Algorithm 4 (LM-Prune): + * C = C union N_out(p) \ {p} + * N_out(p) = empty + * while C not empty: + * p* = argmin d(p, c) for c in C + * N_out(p).insert(p*) + * if |N_out(p)| >= R: break + * for each p' in C: + * if alpha * d(p*, p') <= d(p, p'): remove p' from C + */ +/** + * Pure function: given pre-sorted candidates and a distance matrix, select + * up to max_neighbors using alpha-pruning. inter_distances is a flattened + * num_candidates x num_candidates matrix where inter_distances[i*num_candidates+j] + * = d(candidate_i, candidate_j). p_distances[i] = d(p, candidate_i), already sorted. + * outSelected[i] = 1 if selected. Returns count of selected. 
+ */ +int diskann_prune_select( + const f32 *inter_distances, const f32 *p_distances, + int num_candidates, f32 alpha, int max_neighbors, + int *outSelected, int *outCount) { + + if (num_candidates == 0) { + *outCount = 0; + return SQLITE_OK; + } + + u8 *active = sqlite3_malloc(num_candidates); + if (!active) return SQLITE_NOMEM; + memset(active, 1, num_candidates); + memset(outSelected, 0, num_candidates * sizeof(int)); + + int selectedCount = 0; + + for (int round = 0; round < num_candidates && selectedCount < max_neighbors; round++) { + int bestIdx = -1; + for (int i = 0; i < num_candidates; i++) { + if (active[i]) { bestIdx = i; break; } + } + if (bestIdx < 0) break; + + outSelected[bestIdx] = 1; + selectedCount++; + active[bestIdx] = 0; + + for (int i = 0; i < num_candidates; i++) { + if (!active[i]) continue; + f32 dist_best_to_i = inter_distances[bestIdx * num_candidates + i]; + if (alpha * dist_best_to_i <= p_distances[i]) { + active[i] = 0; + } + } + } + + *outCount = selectedCount; + sqlite3_free(active); + return SQLITE_OK; +} + +static int diskann_robust_prune( + vec0_vtab *p, int vec_col_idx, + i64 p_rowid, const void *p_vector, + i64 *candidates, f32 *candidate_distances, int num_candidates, + f32 alpha, int max_neighbors, + i64 *outNeighborRowids, int *outNeighborCount) { + + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + int rc; + + // Remove p itself from candidates + for (int i = 0; i < num_candidates; i++) { + if (candidates[i] == p_rowid) { + candidates[i] = candidates[num_candidates - 1]; + candidate_distances[i] = candidate_distances[num_candidates - 1]; + num_candidates--; + break; + } + } + + if (num_candidates == 0) { + *outNeighborCount = 0; + return SQLITE_OK; + } + + // Sort candidates by distance to p (ascending) - insertion sort + for (int i = 1; i < num_candidates; i++) { + f32 tmpDist = candidate_distances[i]; + i64 tmpRowid = candidates[i]; + int j = i - 1; + while (j >= 0 && candidate_distances[j] > 
tmpDist) { + candidate_distances[j + 1] = candidate_distances[j]; + candidates[j + 1] = candidates[j]; + j--; + } + candidate_distances[j + 1] = tmpDist; + candidates[j + 1] = tmpRowid; + } + + // Active flags + u8 *active = sqlite3_malloc(num_candidates); + if (!active) return SQLITE_NOMEM; + memset(active, 1, num_candidates); + + // Cache full-precision vectors for inter-candidate distance + void **candidateVectors = sqlite3_malloc(num_candidates * sizeof(void *)); + if (!candidateVectors) { + sqlite3_free(active); + return SQLITE_NOMEM; + } + memset(candidateVectors, 0, num_candidates * sizeof(void *)); + + int selectedCount = 0; + + for (int round = 0; round < num_candidates && selectedCount < max_neighbors; round++) { + // Find closest active candidate + int bestIdx = -1; + for (int i = 0; i < num_candidates; i++) { + if (active[i]) { bestIdx = i; break; } + } + if (bestIdx < 0) break; + + // Select this candidate + outNeighborRowids[selectedCount] = candidates[bestIdx]; + selectedCount++; + active[bestIdx] = 0; + + // Load selected candidate's vector + if (!candidateVectors[bestIdx]) { + int vecSize; + rc = diskann_vector_read(p, vec_col_idx, candidates[bestIdx], + &candidateVectors[bestIdx], &vecSize); + if (rc != SQLITE_OK) continue; + } + + // Alpha-prune: remove candidates covered by the selected neighbor + for (int i = 0; i < num_candidates; i++) { + if (!active[i]) continue; + + if (!candidateVectors[i]) { + int vecSize; + rc = diskann_vector_read(p, vec_col_idx, candidates[i], + &candidateVectors[i], &vecSize); + if (rc != SQLITE_OK) continue; + } + + f32 dist_selected_to_i = vec0_distance_full( + candidateVectors[bestIdx], candidateVectors[i], + col->dimensions, col->element_type, col->distance_metric); + + if (alpha * dist_selected_to_i <= candidate_distances[i]) { + active[i] = 0; + } + } + } + + *outNeighborCount = selectedCount; + + for (int i = 0; i < num_candidates; i++) { + sqlite3_free(candidateVectors[i]); + } + 
sqlite3_free(candidateVectors); + sqlite3_free(active); + + return SQLITE_OK; +} + +/** + * After RobustPrune selects neighbors, build the node blobs and write to DB. + * Quantizes each neighbor's vector and packs into the node format. + */ +static int diskann_write_pruned_neighbors( + vec0_vtab *p, int vec_col_idx, i64 nodeRowid, + const i64 *neighborRowids, int neighborCount) { + + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + struct Vec0DiskannConfig *cfg = &col->diskann; + int rc; + + u8 *validity, *neighborIds, *qvecs; + int validitySize, neighborIdsSize, qvecsSize; + rc = diskann_node_init(cfg->n_neighbors, cfg->quantizer_type, + col->dimensions, + &validity, &validitySize, + &neighborIds, &neighborIdsSize, + &qvecs, &qvecsSize); + if (rc != SQLITE_OK) return rc; + + size_t qvecSize = diskann_quantized_vector_byte_size( + cfg->quantizer_type, col->dimensions); + u8 *qvec = sqlite3_malloc(qvecSize); + if (!qvec) { + sqlite3_free(validity); + sqlite3_free(neighborIds); + sqlite3_free(qvecs); + return SQLITE_NOMEM; + } + + for (int i = 0; i < neighborCount && i < cfg->n_neighbors; i++) { + void *neighborVec = NULL; + int neighborVecSize; + rc = diskann_vector_read(p, vec_col_idx, neighborRowids[i], + &neighborVec, &neighborVecSize); + if (rc != SQLITE_OK) continue; + + if (col->element_type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) { + diskann_quantize_vector((const f32 *)neighborVec, col->dimensions, + cfg->quantizer_type, qvec); + } else { + memcpy(qvec, neighborVec, + qvecSize < (size_t)neighborVecSize ? 
qvecSize : (size_t)neighborVecSize); + } + + diskann_node_set_neighbor(validity, neighborIds, qvecs, i, + neighborRowids[i], qvec, + cfg->quantizer_type, col->dimensions); + + sqlite3_free(neighborVec); + } + sqlite3_free(qvec); + + rc = diskann_node_write(p, vec_col_idx, nodeRowid, + validity, validitySize, + neighborIds, neighborIdsSize, + qvecs, qvecsSize); + + sqlite3_free(validity); + sqlite3_free(neighborIds); + sqlite3_free(qvecs); + return rc; +} + +// ============================================================ +// DiskANN insert (Algorithm 2 from LM-DiskANN paper) +// ============================================================ + +/** + * Add a reverse edge: make target_rowid a neighbor of node_rowid. + * If node is full, run RobustPrune. + */ +static int diskann_add_reverse_edge( + vec0_vtab *p, int vec_col_idx, + i64 node_rowid, i64 target_rowid, const void *target_vector) { + + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + struct Vec0DiskannConfig *cfg = &col->diskann; + int rc; + + u8 *validity = NULL, *neighborIds = NULL, *qvecs = NULL; + int validitySize, neighborIdsSize, qvecsSize; + rc = diskann_node_read(p, vec_col_idx, node_rowid, + &validity, &validitySize, + &neighborIds, &neighborIdsSize, + &qvecs, &qvecsSize); + if (rc != SQLITE_OK) return rc; + + int currentCount = diskann_validity_count(validity, cfg->n_neighbors); + + // Check if target is already a neighbor + for (int i = 0; i < cfg->n_neighbors; i++) { + if (diskann_validity_get(validity, i) && + diskann_neighbor_id_get(neighborIds, i) == target_rowid) { + sqlite3_free(validity); + sqlite3_free(neighborIds); + sqlite3_free(qvecs); + return SQLITE_OK; + } + } + + if (currentCount < cfg->n_neighbors) { + // Room available: find first empty slot + for (int i = 0; i < cfg->n_neighbors; i++) { + if (!diskann_validity_get(validity, i)) { + size_t qvecSize = diskann_quantized_vector_byte_size( + cfg->quantizer_type, col->dimensions); + u8 *qvec = 
sqlite3_malloc(qvecSize); + if (!qvec) { + sqlite3_free(validity); + sqlite3_free(neighborIds); + sqlite3_free(qvecs); + return SQLITE_NOMEM; + } + + if (col->element_type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) { + diskann_quantize_vector((const f32 *)target_vector, col->dimensions, + cfg->quantizer_type, qvec); + } else { + size_t vbs = vector_column_byte_size(*col); + memcpy(qvec, target_vector, qvecSize < vbs ? qvecSize : vbs); + } + + diskann_node_set_neighbor(validity, neighborIds, qvecs, i, + target_rowid, qvec, + cfg->quantizer_type, col->dimensions); + sqlite3_free(qvec); + break; + } + } + + rc = diskann_node_write(p, vec_col_idx, node_rowid, + validity, validitySize, + neighborIds, neighborIdsSize, + qvecs, qvecsSize); + } else { + // Full: lazy replacement — use quantized distances to find the worst + // existing neighbor and replace it if target is closer. This avoids + // reading all neighbors' float vectors (the expensive RobustPrune path). + + // Quantize the node's vector and the target vector for comparison + void *nodeVector = NULL; + int nodeVecSize; + rc = diskann_vector_read(p, vec_col_idx, node_rowid, + &nodeVector, &nodeVecSize); + if (rc != SQLITE_OK) { + sqlite3_free(validity); + sqlite3_free(neighborIds); + sqlite3_free(qvecs); + return rc; + } + + // Quantize target for node-level comparison + size_t qvecSize = diskann_quantized_vector_byte_size( + cfg->quantizer_type, col->dimensions); + u8 *targetQ = sqlite3_malloc(qvecSize); + u8 *nodeQ = sqlite3_malloc(qvecSize); + if (!targetQ || !nodeQ) { + sqlite3_free(targetQ); + sqlite3_free(nodeQ); + sqlite3_free(nodeVector); + sqlite3_free(validity); + sqlite3_free(neighborIds); + sqlite3_free(qvecs); + return SQLITE_NOMEM; + } + + if (col->element_type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) { + diskann_quantize_vector((const f32 *)target_vector, col->dimensions, + cfg->quantizer_type, targetQ); + diskann_quantize_vector((const f32 *)nodeVector, col->dimensions, + cfg->quantizer_type, nodeQ); + } 
else { + memcpy(targetQ, target_vector, qvecSize); + memcpy(nodeQ, nodeVector, qvecSize); + } + + // Compute quantized distance from node to target + f32 targetDist = diskann_distance_quantized_precomputed( + nodeQ, targetQ, col->dimensions, + cfg->quantizer_type, col->distance_metric); + + // Find the worst (farthest) existing neighbor using quantized distances + int worstIdx = -1; + f32 worstDist = -1.0f; + for (int i = 0; i < cfg->n_neighbors; i++) { + if (!diskann_validity_get(validity, i)) continue; + const u8 *nqvec = diskann_neighbor_qvec_get( + qvecs, i, cfg->quantizer_type, col->dimensions); + f32 d = diskann_distance_quantized_precomputed( + nodeQ, nqvec, col->dimensions, + cfg->quantizer_type, col->distance_metric); + if (d > worstDist) { + worstDist = d; + worstIdx = i; + } + } + + // Replace worst neighbor if target is closer + if (worstIdx >= 0 && targetDist < worstDist) { + diskann_node_set_neighbor(validity, neighborIds, qvecs, worstIdx, + target_rowid, targetQ, + cfg->quantizer_type, col->dimensions); + rc = diskann_node_write(p, vec_col_idx, node_rowid, + validity, validitySize, + neighborIds, neighborIdsSize, + qvecs, qvecsSize); + } else { + rc = SQLITE_OK; // target is farther than all existing neighbors, skip + } + + sqlite3_free(targetQ); + sqlite3_free(nodeQ); + sqlite3_free(nodeVector); + } + + sqlite3_free(validity); + sqlite3_free(neighborIds); + sqlite3_free(qvecs); + return rc; +} + +// ============================================================ +// DiskANN buffer helpers (for batched inserts) +// ============================================================ + +/** + * Insert a vector into the _diskann_buffer table. 
+ */ +static int diskann_buffer_write(vec0_vtab *p, int vec_col_idx, + i64 rowid, const void *vector, int vectorSize) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_DISKANN_BUFFER_N_NAME + " (rowid, vector) VALUES (?, ?)", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + sqlite3_bind_int64(stmt, 1, rowid); + sqlite3_bind_blob(stmt, 2, vector, vectorSize, SQLITE_STATIC); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR; +} + +/** + * Delete a vector from the _diskann_buffer table. + */ +static int diskann_buffer_delete(vec0_vtab *p, int vec_col_idx, i64 rowid) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf( + "DELETE FROM " VEC0_SHADOW_DISKANN_BUFFER_N_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + sqlite3_bind_int64(stmt, 1, rowid); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR; +} + +/** + * Check if a rowid exists in the _diskann_buffer table. + * Returns SQLITE_OK and sets *exists to 1 if found, 0 if not. + */ +static int diskann_buffer_exists(vec0_vtab *p, int vec_col_idx, + i64 rowid, int *exists) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf( + "SELECT 1 FROM " VEC0_SHADOW_DISKANN_BUFFER_N_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + sqlite3_bind_int64(stmt, 1, rowid); + rc = sqlite3_step(stmt); + *exists = (rc == SQLITE_ROW) ? 
1 : 0; + sqlite3_finalize(stmt); + return SQLITE_OK; +} + +/** + * Get the count of rows in the _diskann_buffer table. + */ +static int diskann_buffer_count(vec0_vtab *p, int vec_col_idx, i64 *count) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf( + "SELECT count(*) FROM " VEC0_SHADOW_DISKANN_BUFFER_N_NAME, + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + rc = sqlite3_step(stmt); + if (rc == SQLITE_ROW) { + *count = sqlite3_column_int64(stmt, 0); + sqlite3_finalize(stmt); + return SQLITE_OK; + } + sqlite3_finalize(stmt); + return SQLITE_ERROR; +} + +// Forward declaration: diskann_insert_graph does the actual graph insertion +static int diskann_insert_graph(vec0_vtab *p, int vec_col_idx, + i64 rowid, const void *vector); + +/** + * Flush all buffered vectors into the DiskANN graph. + * Iterates over _diskann_buffer rows and calls diskann_insert_graph for each. + */ +static int diskann_flush_buffer(vec0_vtab *p, int vec_col_idx) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf( + "SELECT rowid, vector FROM " VEC0_SHADOW_DISKANN_BUFFER_N_NAME, + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + + while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) { + i64 rowid = sqlite3_column_int64(stmt, 0); + const void *vector = sqlite3_column_blob(stmt, 1); + if (!vector) continue; + // Note: vector is already written to _vectors table, so + // diskann_insert_graph will skip re-writing it (vector already exists). + // We call the graph-only insert path. 
+ int insertRc = diskann_insert_graph(p, vec_col_idx, rowid, vector); + if (insertRc != SQLITE_OK) { + sqlite3_finalize(stmt); + return insertRc; + } + } + sqlite3_finalize(stmt); + + // Clear the buffer + zSql = sqlite3_mprintf( + "DELETE FROM " VEC0_SHADOW_DISKANN_BUFFER_N_NAME, + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR; +} + +/** + * Insert a new vector into the DiskANN graph (graph-only path). + * The vector must already be written to _vectors table. + * This is the core graph insertion logic (Algorithm 2: LM-Insert). + */ +static int diskann_insert_graph(vec0_vtab *p, int vec_col_idx, + i64 rowid, const void *vector) { + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + struct Vec0DiskannConfig *cfg = &col->diskann; + int rc; + + // Handle first insert (empty graph) + i64 medoid; + int isEmpty; + rc = diskann_medoid_get(p, vec_col_idx, &medoid, &isEmpty); + if (rc != SQLITE_OK) return rc; + + if (isEmpty) { + u8 *validity, *neighborIds, *qvecs; + int validitySize, neighborIdsSize, qvecsSize; + rc = diskann_node_init(cfg->n_neighbors, cfg->quantizer_type, + col->dimensions, + &validity, &validitySize, + &neighborIds, &neighborIdsSize, + &qvecs, &qvecsSize); + if (rc != SQLITE_OK) return rc; + + rc = diskann_node_write(p, vec_col_idx, rowid, + validity, validitySize, + neighborIds, neighborIdsSize, + qvecs, qvecsSize); + sqlite3_free(validity); + sqlite3_free(neighborIds); + sqlite3_free(qvecs); + if (rc != SQLITE_OK) return rc; + + return diskann_medoid_set(p, vec_col_idx, rowid, 0); + } + + // Search for nearest neighbors + int L = cfg->search_list_size_insert > 0 ? 
cfg->search_list_size_insert : cfg->search_list_size; + i64 *searchRowids = sqlite3_malloc(L * sizeof(i64)); + f32 *searchDistances = sqlite3_malloc(L * sizeof(f32)); + if (!searchRowids || !searchDistances) { + sqlite3_free(searchRowids); + sqlite3_free(searchDistances); + return SQLITE_NOMEM; + } + + int searchCount; + rc = diskann_search(p, vec_col_idx, vector, col->dimensions, + col->element_type, L, L, + searchRowids, searchDistances, &searchCount); + if (rc != SQLITE_OK) { + sqlite3_free(searchRowids); + sqlite3_free(searchDistances); + return rc; + } + + // RobustPrune to select neighbors for x + i64 *selectedNeighbors = sqlite3_malloc(cfg->n_neighbors * sizeof(i64)); + int selectedCount = 0; + if (!selectedNeighbors) { + sqlite3_free(searchRowids); + sqlite3_free(searchDistances); + return SQLITE_NOMEM; + } + + rc = diskann_robust_prune(p, vec_col_idx, rowid, vector, + searchRowids, searchDistances, searchCount, + cfg->alpha, cfg->n_neighbors, + selectedNeighbors, &selectedCount); + sqlite3_free(searchRowids); + sqlite3_free(searchDistances); + if (rc != SQLITE_OK) { + sqlite3_free(selectedNeighbors); + return rc; + } + + // Write x's node with selected neighbors + rc = diskann_write_pruned_neighbors(p, vec_col_idx, rowid, + selectedNeighbors, selectedCount); + if (rc != SQLITE_OK) { + sqlite3_free(selectedNeighbors); + return rc; + } + + // Add bidirectional edges + for (int i = 0; i < selectedCount; i++) { + diskann_add_reverse_edge(p, vec_col_idx, + selectedNeighbors[i], rowid, vector); + } + + sqlite3_free(selectedNeighbors); + return SQLITE_OK; +} + +/** + * Insert a new vector into the DiskANN index (Algorithm 2: LM-Insert). + * When buffer_threshold > 0, vectors are buffered and flushed in batch. 
+ */
+static int diskann_insert(vec0_vtab *p, int vec_col_idx,
+                          i64 rowid, const void *vector) {
+  struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx];
+  struct Vec0DiskannConfig *cfg = &col->diskann;
+  int rc;
+  size_t vectorSize = vector_column_byte_size(*col);
+
+  // 1. Write full-precision vector to _vectors table (always needed for queries)
+  rc = diskann_vector_write(p, vec_col_idx, rowid, vector, (int)vectorSize);
+  if (rc != SQLITE_OK) return rc;
+
+  // 2. If buffering is enabled, write to buffer instead of graph
+  if (cfg->buffer_threshold > 0) {
+    rc = diskann_buffer_write(p, vec_col_idx, rowid, vector, (int)vectorSize);
+    if (rc != SQLITE_OK) return rc;
+
+    // Re-count after every insert; once the threshold is reached the whole
+    // buffer is flushed into the graph in one go.
+    i64 count;
+    rc = diskann_buffer_count(p, vec_col_idx, &count);
+    if (rc != SQLITE_OK) return rc;
+
+    if (count >= cfg->buffer_threshold) {
+      return diskann_flush_buffer(p, vec_col_idx);
+    }
+    return SQLITE_OK;
+  }
+
+  // 3. Legacy per-row insert directly into graph
+  return diskann_insert_graph(p, vec_col_idx, rowid, vector);
+}
+
+/**
+ * Returns 1 if ALL vector columns in this table are DiskANN-indexed.
+ */
+// NOTE(review): the comment above is not attached to any function at this
+// position; it reads like the description of vec0_all_columns_diskann().
+// ============================================================
+// DiskANN delete (Algorithm 3 from LM-DiskANN paper)
+// ============================================================
+
+// Delete the graph node row for `rowid` from the DiskANN nodes shadow table.
+// Returns SQLITE_OK when the DELETE statement completes (SQLITE_DONE),
+// SQLITE_NOMEM if the SQL text cannot be allocated, the prepare error, or
+// SQLITE_ERROR for any other step result.
+static int diskann_node_delete(vec0_vtab *p, int vec_col_idx, i64 rowid) {
+  sqlite3_stmt *stmt = NULL;
+  char *zSql = sqlite3_mprintf(
+      "DELETE FROM " VEC0_SHADOW_DISKANN_NODES_N_NAME " WHERE rowid = ?",
+      p->schemaName, p->tableName, vec_col_idx);
+  if (!zSql) return SQLITE_NOMEM;
+  int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
+  sqlite3_free(zSql);
+  if (rc != SQLITE_OK) return rc;
+  sqlite3_bind_int64(stmt, 1, rowid);
+  rc = sqlite3_step(stmt);
+  sqlite3_finalize(stmt);
+  return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR;
+}
+
+// Delete the full-precision vector row for `rowid` from the vectors shadow
+// table. Same return convention as diskann_node_delete().
+static int diskann_vector_delete(vec0_vtab *p, int vec_col_idx, i64 rowid) {
+  sqlite3_stmt *stmt = NULL;
+  char *zSql = sqlite3_mprintf(
+      "DELETE FROM " VEC0_SHADOW_VECTORS_N_NAME " WHERE rowid = ?",
+      p->schemaName, p->tableName, vec_col_idx);
+  if (!zSql) return SQLITE_NOMEM;
+  int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
+  sqlite3_free(zSql);
+  if (rc != SQLITE_OK) return rc;
+  sqlite3_bind_int64(stmt, 1, rowid);
+  rc = sqlite3_step(stmt);
+  sqlite3_finalize(stmt);
+  return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR;
+}
+
+/**
+ * Repair graph after deleting a node. Following Algorithm 3 (LM-Delete):
+ * For each neighbor n of the deleted node, add deleted node's other neighbors
+ * to n's candidate set, then remove the deleted node from n's neighbor list.
+ * Uses simple slot replacement rather than full RobustPrune for performance.
+ *
+ * Concretely: for every rowid in deleted_neighbors, the slot pointing at
+ * deleted_rowid (if any) is cleared and then refilled with the first other
+ * entry of deleted_neighbors that is not already a neighbor and whose vector
+ * can be read. Nodes that cannot be read are skipped. A node is rewritten
+ * only when a slot was cleared; a failed rewrite aborts with that error.
+ */
+static int diskann_repair_reverse_edges(
+    vec0_vtab *p, int vec_col_idx, i64 deleted_rowid,
+    const i64 *deleted_neighbors, int deleted_neighbor_count) {
+
+  struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx];
+  struct Vec0DiskannConfig *cfg = &col->diskann;
+  int rc;
+
+  // For each neighbor of the deleted node, fix their neighbor list
+  for (int dn = 0; dn < deleted_neighbor_count; dn++) {
+    i64 nodeRowid = deleted_neighbors[dn];
+
+    u8 *validity = NULL, *neighborIds = NULL, *qvecs = NULL;
+    int vs, nis, qs;
+    rc = diskann_node_read(p, vec_col_idx, nodeRowid,
+                           &validity, &vs, &neighborIds, &nis, &qvecs, &qs);
+    if (rc != SQLITE_OK) continue; // unreadable neighbor: skip, keep going
+
+    // Find and clear the deleted node's slot
+    int clearedSlot = -1;
+    for (int i = 0; i < cfg->n_neighbors; i++) {
+      if (diskann_validity_get(validity, i) &&
+          diskann_neighbor_id_get(neighborIds, i) == deleted_rowid) {
+        diskann_node_clear_neighbor(validity, neighborIds, qvecs, i,
+                                    cfg->quantizer_type, col->dimensions);
+        clearedSlot = i;
+        break;
+      }
+    }
+
+    if (clearedSlot >= 0) {
+      // Try to fill the cleared slot with one of the deleted node's other neighbors
+      for (int di = 0; di < deleted_neighbor_count; di++) {
+        i64 candidate = deleted_neighbors[di];
+        if (candidate == nodeRowid || candidate == deleted_rowid) continue;
+
+        // Check not already a neighbor
+        int alreadyNeighbor = 0;
+        for (int ni = 0; ni < cfg->n_neighbors; ni++) {
+          if (diskann_validity_get(validity, ni) &&
+              diskann_neighbor_id_get(neighborIds, ni) == candidate) {
+            alreadyNeighbor = 1;
+            break;
+          }
+        }
+        if (alreadyNeighbor) continue;
+
+        // Load, quantize, and set
+        void *candidateVec = NULL;
+        int cvs;
+        rc = diskann_vector_read(p, vec_col_idx, candidate, &candidateVec, &cvs);
+        if (rc != SQLITE_OK) continue; // try the next candidate instead
+
+        size_t qvecSize = diskann_quantized_vector_byte_size(
+            cfg->quantizer_type, col->dimensions);
+        u8 *qvec = sqlite3_malloc(qvecSize);
+        // On allocation failure the slot simply stays empty; no error is
+        // reported for it.
+        if (qvec) {
+          if (col->element_type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
+            diskann_quantize_vector((const f32 *)candidateVec, col->dimensions,
+                                    cfg->quantizer_type, qvec);
+          } else {
+            memcpy(qvec, candidateVec,
+                   qvecSize < (size_t)cvs ? qvecSize : (size_t)cvs);
+          }
+          diskann_node_set_neighbor(validity, neighborIds, qvecs, clearedSlot,
+                                    candidate, qvec,
+                                    cfg->quantizer_type, col->dimensions);
+          sqlite3_free(qvec);
+        }
+        sqlite3_free(candidateVec);
+        break; // only one slot was cleared, so one replacement is enough
+      }
+
+      // This assignment also overwrites any stale error left in rc by a
+      // skipped candidate above.
+      rc = diskann_node_write(p, vec_col_idx, nodeRowid,
+                              validity, vs, neighborIds, nis, qvecs, qs);
+    }
+
+    sqlite3_free(validity);
+    sqlite3_free(neighborIds);
+    sqlite3_free(qvecs);
+    if (rc != SQLITE_OK) return rc;
+  }
+
+  return SQLITE_OK;
+}
+
+/**
+ * Delete a vector from the DiskANN graph (Algorithm 3: LM-Delete).
+ * If the vector is in the buffer (not yet flushed), just remove from buffer.
+ */
+/**
+ * Scan all nodes and clear any neighbor slot referencing deleted_rowid.
+ * This removes stale reverse edges that the forward-edge repair misses,
+ * preventing data leaks (deleted rowid + quantized vector lingering in
+ * other nodes' blobs).
+ */ +static int diskann_scrub_deleted_rowid( + vec0_vtab *p, int vec_col_idx, i64 deleted_rowid) { + + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + struct Vec0DiskannConfig *cfg = &col->diskann; + int rc; + sqlite3_stmt *stmt = NULL; + + // Lightweight scan: only read validity + neighbor_ids to find matches + char *zSql = sqlite3_mprintf( + "SELECT rowid, neighbors_validity, neighbor_ids " + "FROM " VEC0_SHADOW_DISKANN_NODES_N_NAME, + p->schemaName, p->tableName, vec_col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + + // Collect rowids that need updating (avoid modifying while iterating) + i64 *dirty = NULL; + int nDirty = 0, capDirty = 0; + + while (sqlite3_step(stmt) == SQLITE_ROW) { + const u8 *validity = (const u8 *)sqlite3_column_blob(stmt, 1); + const u8 *ids = (const u8 *)sqlite3_column_blob(stmt, 2); + int idsBytes = sqlite3_column_bytes(stmt, 2); + if (!validity || !ids) continue; + + int nSlots = idsBytes / (int)sizeof(i64); + if (nSlots > cfg->n_neighbors) nSlots = cfg->n_neighbors; + + for (int i = 0; i < nSlots; i++) { + if (!diskann_validity_get(validity, i)) continue; + i64 nid = diskann_neighbor_id_get(ids, i); + if (nid == deleted_rowid) { + i64 nodeRowid = sqlite3_column_int64(stmt, 0); + // Add to dirty list + if (nDirty >= capDirty) { + capDirty = capDirty ? 
capDirty * 2 : 16; + i64 *tmp = sqlite3_realloc64(dirty, capDirty * sizeof(i64)); + if (!tmp) { sqlite3_free(dirty); sqlite3_finalize(stmt); return SQLITE_NOMEM; } + dirty = tmp; + } + dirty[nDirty++] = nodeRowid; + break; // one match per node is enough + } + } + } + sqlite3_finalize(stmt); + + // Now do full read/clear/write for each dirty node + for (int d = 0; d < nDirty; d++) { + u8 *val = NULL, *nids = NULL, *qvecs = NULL; + int vs, nis, qs; + rc = diskann_node_read(p, vec_col_idx, dirty[d], + &val, &vs, &nids, &nis, &qvecs, &qs); + if (rc != SQLITE_OK) continue; + + int modified = 0; + for (int i = 0; i < cfg->n_neighbors; i++) { + if (diskann_validity_get(val, i) && + diskann_neighbor_id_get(nids, i) == deleted_rowid) { + diskann_node_clear_neighbor(val, nids, qvecs, i, + cfg->quantizer_type, col->dimensions); + modified = 1; + } + } + + if (modified) { + rc = diskann_node_write(p, vec_col_idx, dirty[d], + val, vs, nids, nis, qvecs, qs); + } + + sqlite3_free(val); + sqlite3_free(nids); + sqlite3_free(qvecs); + if (rc != SQLITE_OK) break; + } + + sqlite3_free(dirty); + return rc; +} + +static int diskann_delete(vec0_vtab *p, int vec_col_idx, i64 rowid) { + struct VectorColumnDefinition *col = &p->vector_columns[vec_col_idx]; + struct Vec0DiskannConfig *cfg = &col->diskann; + int rc; + + // Check if this rowid is in the buffer (not yet in graph) + if (cfg->buffer_threshold > 0) { + int inBuffer = 0; + rc = diskann_buffer_exists(p, vec_col_idx, rowid, &inBuffer); + if (rc != SQLITE_OK) return rc; + if (inBuffer) { + // Just remove from buffer and _vectors, no graph repair needed + rc = diskann_buffer_delete(p, vec_col_idx, rowid); + if (rc == SQLITE_OK) { + rc = diskann_vector_delete(p, vec_col_idx, rowid); + } + return rc; + } + } + + // 1. 
Read the node to get its neighbor list + u8 *delValidity = NULL, *delNeighborIds = NULL, *delQvecs = NULL; + int dvs, dnis, dqs; + rc = diskann_node_read(p, vec_col_idx, rowid, + &delValidity, &dvs, &delNeighborIds, &dnis, + &delQvecs, &dqs); + if (rc != SQLITE_OK) { + return SQLITE_OK; // Node doesn't exist, nothing to do + } + + i64 *deletedNeighbors = sqlite3_malloc(cfg->n_neighbors * sizeof(i64)); + int deletedNeighborCount = 0; + if (!deletedNeighbors) { + sqlite3_free(delValidity); + sqlite3_free(delNeighborIds); + sqlite3_free(delQvecs); + return SQLITE_NOMEM; + } + + for (int i = 0; i < cfg->n_neighbors; i++) { + if (diskann_validity_get(delValidity, i)) { + deletedNeighbors[deletedNeighborCount++] = + diskann_neighbor_id_get(delNeighborIds, i); + } + } + + sqlite3_free(delValidity); + sqlite3_free(delNeighborIds); + sqlite3_free(delQvecs); + + // 2. Repair reverse edges + rc = diskann_repair_reverse_edges(p, vec_col_idx, rowid, + deletedNeighbors, deletedNeighborCount); + sqlite3_free(deletedNeighbors); + + // 3. Delete node and vector + if (rc == SQLITE_OK) { + rc = diskann_node_delete(p, vec_col_idx, rowid); + } + if (rc == SQLITE_OK) { + rc = diskann_vector_delete(p, vec_col_idx, rowid); + } + + // 4. Handle medoid deletion + if (rc == SQLITE_OK) { + rc = diskann_medoid_handle_delete(p, vec_col_idx, rowid); + } + + // 5. 
Scrub stale reverse edges — removes deleted rowid + quantized vector + // from any node that still references it (data leak prevention) + if (rc == SQLITE_OK) { + rc = diskann_scrub_deleted_rowid(p, vec_col_idx, rowid); + } + + return rc; +} + +static int vec0_all_columns_diskann(vec0_vtab *p) { + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_DISKANN) return 0; + } + return p->numVectorColumns > 0; +} + +// ============================================================================ +// Command dispatch +// ============================================================================ + +static int diskann_handle_command(vec0_vtab *p, const char *command) { + int col_idx = -1; + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_DISKANN) { col_idx = i; break; } + } + if (col_idx < 0) return SQLITE_EMPTY; + + struct Vec0DiskannConfig *cfg = &p->vector_columns[col_idx].diskann; + + if (strncmp(command, "search_list_size_search=", 24) == 0) { + int val = atoi(command + 24); + if (val < 1) { vtab_set_error(&p->base, "search_list_size_search must be >= 1"); return SQLITE_ERROR; } + cfg->search_list_size_search = val; + return SQLITE_OK; + } + if (strncmp(command, "search_list_size_insert=", 24) == 0) { + int val = atoi(command + 24); + if (val < 1) { vtab_set_error(&p->base, "search_list_size_insert must be >= 1"); return SQLITE_ERROR; } + cfg->search_list_size_insert = val; + return SQLITE_OK; + } + if (strncmp(command, "search_list_size=", 17) == 0) { + int val = atoi(command + 17); + if (val < 1) { vtab_set_error(&p->base, "search_list_size must be >= 1"); return SQLITE_ERROR; } + cfg->search_list_size = val; + return SQLITE_OK; + } + return SQLITE_EMPTY; +} + +#ifdef SQLITE_VEC_TEST +// Expose internal DiskANN data structures and functions for unit testing. 
+
+// Thin non-static wrappers around the static candidate-list helpers. They
+// use plain C types (long long / float) at the boundary and cast to the
+// internal i64 / f32 types.
+int _test_diskann_candidate_list_init(struct DiskannCandidateList *list, int capacity) {
+  return diskann_candidate_list_init(list, capacity);
+}
+void _test_diskann_candidate_list_free(struct DiskannCandidateList *list) {
+  diskann_candidate_list_free(list);
+}
+int _test_diskann_candidate_list_insert(struct DiskannCandidateList *list, long long rowid, float distance) {
+  return diskann_candidate_list_insert(list, (i64)rowid, (f32)distance);
+}
+int _test_diskann_candidate_list_next_unvisited(const struct DiskannCandidateList *list) {
+  return diskann_candidate_list_next_unvisited(list);
+}
+// Direct field accessors. The index `i` is not bounds-checked.
+int _test_diskann_candidate_list_count(const struct DiskannCandidateList *list) {
+  return list->count;
+}
+long long _test_diskann_candidate_list_rowid(const struct DiskannCandidateList *list, int i) {
+  return (long long)list->items[i].rowid;
+}
+float _test_diskann_candidate_list_distance(const struct DiskannCandidateList *list, int i) {
+  return (float)list->items[i].distance;
+}
+void _test_diskann_candidate_list_set_visited(struct DiskannCandidateList *list, int i) {
+  list->items[i].visited = 1;
+}
+
+// Wrappers around the static visited-set helpers.
+int _test_diskann_visited_set_init(struct DiskannVisitedSet *set, int capacity) {
+  return diskann_visited_set_init(set, capacity);
+}
+void _test_diskann_visited_set_free(struct DiskannVisitedSet *set) {
+  diskann_visited_set_free(set);
+}
+int _test_diskann_visited_set_contains(const struct DiskannVisitedSet *set, long long rowid) {
+  return diskann_visited_set_contains(set, (i64)rowid);
+}
+int _test_diskann_visited_set_insert(struct DiskannVisitedSet *set, long long rowid) {
+  return diskann_visited_set_insert(set, (i64)rowid);
+}
+#endif /* SQLITE_VEC_TEST */
+
diff --git a/sqlite-vec-ivf-kmeans.c b/sqlite-vec-ivf-kmeans.c
new file mode 100644
index 0000000..0faa803
--- /dev/null
+++ b/sqlite-vec-ivf-kmeans.c
@@ -0,0 +1,214 @@
+/**
+ * sqlite-vec-ivf-kmeans.c — Pure k-means clustering algorithm.
+ *
+ * No SQLite database access (only the sqlite3_malloc64/sqlite3_free
+ * allocator is used). Operates on float arrays in memory.
+ * #include'd into sqlite-vec.c after struct definitions. + */ + +#ifndef SQLITE_VEC_IVF_KMEANS_C +#define SQLITE_VEC_IVF_KMEANS_C + +// When opened standalone in an editor, pull in types so the LSP is happy. +// When #include'd from sqlite-vec.c, SQLITE_VEC_H is already defined. +#ifndef SQLITE_VEC_H +#include "sqlite-vec.c" // IWYU pragma: keep +#endif + +#include +#include + +#define VEC0_IVF_KMEANS_MAX_ITER 25 +#define VEC0_IVF_KMEANS_DEFAULT_SEED 0 + +// Simple xorshift32 PRNG +static uint32_t ivf_xorshift32(uint32_t *state) { + uint32_t x = *state; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + *state = x; + return x; +} + +// L2 squared distance between two float vectors +static float ivf_l2_dist(const float *a, const float *b, int D) { + float sum = 0.0f; + for (int d = 0; d < D; d++) { + float diff = a[d] - b[d]; + sum += diff * diff; + } + return sum; +} + +// Find nearest centroid for a single vector. Returns centroid index. +static int ivf_nearest_centroid(const float *vec, const float *centroids, + int D, int k) { + float min_dist = FLT_MAX; + int best = 0; + for (int c = 0; c < k; c++) { + float dist = ivf_l2_dist(vec, ¢roids[c * D], D); + if (dist < min_dist) { + min_dist = dist; + best = c; + } + } + return best; +} + +/** + * K-means++ initialization. + * Picks k initial centroids from the data with probability proportional + * to squared distance from nearest existing centroid. 
+ */ +static int ivf_kmeans_init_plusplus(const float *vectors, int N, int D, + int k, uint32_t seed, float *centroids) { + if (N <= 0 || k <= 0 || D <= 0) + return -1; + if (seed == 0) + seed = 42; + + // Pick first centroid randomly + int first = ivf_xorshift32(&seed) % N; + memcpy(centroids, &vectors[first * D], D * sizeof(float)); + + if (k == 1) + return 0; + + // Allocate distance array + float *dists = sqlite3_malloc64((i64)N * sizeof(float)); + if (!dists) + return -1; + + for (int c = 1; c < k; c++) { + // Compute D(x) = distance to nearest existing centroid + double total = 0.0; + for (int i = 0; i < N; i++) { + float d = ivf_l2_dist(&vectors[i * D], ¢roids[(c - 1) * D], D); + if (c == 1 || d < dists[i]) { + dists[i] = d; + } + total += dists[i]; + } + + // Weighted random selection + if (total <= 0.0) { + // All distances zero — pick randomly + int pick = ivf_xorshift32(&seed) % N; + memcpy(¢roids[c * D], &vectors[pick * D], D * sizeof(float)); + } else { + double threshold = ((double)ivf_xorshift32(&seed) / (double)0xFFFFFFFF) * total; + double cumulative = 0.0; + int pick = N - 1; + for (int i = 0; i < N; i++) { + cumulative += dists[i]; + if (cumulative >= threshold) { + pick = i; + break; + } + } + memcpy(¢roids[c * D], &vectors[pick * D], D * sizeof(float)); + } + } + + sqlite3_free(dists); + return 0; +} + +/** + * Lloyd's k-means algorithm. 
+ * + * @param vectors N*D float array (row-major) + * @param N number of vectors + * @param D dimensionality + * @param k number of clusters + * @param max_iter maximum iterations + * @param seed PRNG seed for initialization + * @param out_centroids output: k*D float array (caller-allocated) + * @return 0 on success, -1 on error + */ +static int ivf_kmeans(const float *vectors, int N, int D, int k, + int max_iter, uint32_t seed, float *out_centroids) { + if (N <= 0 || D <= 0 || k <= 0) + return -1; + + // Clamp k to N + if (k > N) + k = N; + + // Allocate working memory + int *assignments = sqlite3_malloc64((i64)N * sizeof(int)); + float *new_centroids = sqlite3_malloc64((i64)k * D * sizeof(float)); + int *counts = sqlite3_malloc64((i64)k * sizeof(int)); + + if (!assignments || !new_centroids || !counts) { + sqlite3_free(assignments); + sqlite3_free(new_centroids); + sqlite3_free(counts); + return -1; + } + + memset(assignments, -1, N * sizeof(int)); + + // Initialize centroids via k-means++ + if (ivf_kmeans_init_plusplus(vectors, N, D, k, seed, out_centroids) != 0) { + sqlite3_free(assignments); + sqlite3_free(new_centroids); + sqlite3_free(counts); + return -1; + } + + for (int iter = 0; iter < max_iter; iter++) { + // Assignment step + int changed = 0; + for (int i = 0; i < N; i++) { + int nearest = ivf_nearest_centroid(&vectors[i * D], out_centroids, D, k); + if (nearest != assignments[i]) { + assignments[i] = nearest; + changed++; + } + } + if (changed == 0) + break; + + // Update step + memset(new_centroids, 0, (size_t)k * D * sizeof(float)); + memset(counts, 0, k * sizeof(int)); + + for (int i = 0; i < N; i++) { + int c = assignments[i]; + counts[c]++; + for (int d = 0; d < D; d++) { + new_centroids[c * D + d] += vectors[i * D + d]; + } + } + + for (int c = 0; c < k; c++) { + if (counts[c] == 0) { + // Empty cluster: reassign to farthest point from its nearest centroid + float max_dist = -1.0f; + int farthest = 0; + for (int i = 0; i < N; i++) { + float d = 
ivf_l2_dist(&vectors[i * D], + &out_centroids[assignments[i] * D], D); + if (d > max_dist) { + max_dist = d; + farthest = i; + } + } + memcpy(&out_centroids[c * D], &vectors[farthest * D], + D * sizeof(float)); + } else { + for (int d = 0; d < D; d++) { + out_centroids[c * D + d] = new_centroids[c * D + d] / counts[c]; + } + } + } + } + + sqlite3_free(assignments); + sqlite3_free(new_centroids); + sqlite3_free(counts); + return 0; +} + +#endif /* SQLITE_VEC_IVF_KMEANS_C */ diff --git a/sqlite-vec-ivf.c b/sqlite-vec-ivf.c new file mode 100644 index 0000000..5bc8edb --- /dev/null +++ b/sqlite-vec-ivf.c @@ -0,0 +1,1445 @@ +/** + * sqlite-vec-ivf.c — IVF (Inverted File Index) for sqlite-vec. + * + * #include'd into sqlite-vec.c after struct definitions and before vec0_init(). + * + * Storage: fixed-size packed blob cells (capped at IVF_CELL_MAX_VECTORS). + * Multiple cell rows per centroid. cell_id is auto-increment rowid, + * centroid_id is indexed for lookup. This keeps blobs small (~200KB) + * and avoids expensive overflow page traversal on insert. + */ + +#ifndef SQLITE_VEC_IVF_C +#define SQLITE_VEC_IVF_C + +#ifdef SQLITE_VEC_TEST +#define IVF_STATIC +#else +#define IVF_STATIC static +#endif + +// When opened standalone in an editor, pull in sqlite-vec.c so the LSP +// can resolve all types (vec0_vtab, VectorColumnDefinition, etc.). +// When #include'd from sqlite-vec.c, SQLITE_VEC_H is already defined. 
+#ifndef SQLITE_VEC_H
+#include "sqlite-vec.c" // IWYU pragma: keep
+#endif
+
+#define VEC0_IVF_DEFAULT_NLIST 128
+#define VEC0_IVF_DEFAULT_NPROBE 10
+#define VEC0_IVF_MAX_NLIST 65536
+#define VEC0_IVF_CELL_MAX_VECTORS 64 // ~200KB per cell at 768-dim f32
+#define VEC0_IVF_UNASSIGNED_CENTROID_ID (-1)
+
+#define VEC0_SHADOW_IVF_CENTROIDS_NAME "\"%w\".\"%w_ivf_centroids%02d\""
+#define VEC0_SHADOW_IVF_CELLS_NAME "\"%w\".\"%w_ivf_cells%02d\""
+#define VEC0_SHADOW_IVF_ROWID_MAP_NAME "\"%w\".\"%w_ivf_rowid_map%02d\""
+#define VEC0_SHADOW_IVF_VECTORS_NAME "\"%w\".\"%w_ivf_vectors%02d\""
+
+// ============================================================================
+// Parser
+// ============================================================================
+
+// Case-insensitive comparison of a token (not NUL-terminated, `len` bytes)
+// against a whole keyword. The lengths must match as well: comparing only
+// the first `len` bytes would accept any prefix ("n" as "nlist").
+static int ivf_token_equals(const char *tok, int len, const char *word) {
+  return len == (int)strlen(word) && sqlite3_strnicmp(tok, word, len) == 0;
+}
+
+/**
+ * Parse the parenthesized IVF option list, e.g.
+ * "(nlist=128, nprobe=10, quantizer=int8, oversample=4)", into *config.
+ *
+ * Defaults: nlist = VEC0_IVF_DEFAULT_NLIST, quantizer = none,
+ * oversample = 1, nprobe = VEC0_IVF_DEFAULT_NPROBE. A defaulted nprobe is
+ * capped to nlist; an explicit nprobe larger than a non-zero nlist is an
+ * error. oversample > 1 requires a quantizer other than none.
+ *
+ * Keys and quantizer names are matched case-insensitively and must be
+ * spelled out in full.
+ *
+ * Returns SQLITE_OK on success, SQLITE_ERROR on any syntax or range error.
+ */
+static int vec0_parse_ivf_options(struct Vec0Scanner *scanner,
+                                  struct Vec0IvfConfig *config) {
+  struct Vec0Token token;
+  int rc;
+  config->nlist = VEC0_IVF_DEFAULT_NLIST;
+  config->nprobe = -1; // sentinel: "not set yet", resolved after parsing
+  config->quantizer = VEC0_IVF_QUANTIZER_NONE;
+  config->oversample = 1;
+  int nprobe_explicit = 0;
+
+  rc = vec0_scanner_next(scanner, &token);
+  if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_LPAREN)
+    return SQLITE_ERROR;
+
+  // "()" — empty option list, all defaults.
+  rc = vec0_scanner_next(scanner, &token);
+  if (rc == VEC0_TOKEN_RESULT_SOME && token.token_type == TOKEN_TYPE_RPAREN) {
+    config->nprobe = VEC0_IVF_DEFAULT_NPROBE;
+    return SQLITE_OK;
+  }
+
+  while (1) {
+    if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_IDENTIFIER)
+      return SQLITE_ERROR;
+    char *key = token.start;
+    int keyLength = token.end - token.start;
+
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_EQ)
+      return SQLITE_ERROR;
+
+    // Read value — can be digit or identifier
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc != VEC0_TOKEN_RESULT_SOME) return SQLITE_ERROR;
+    if (token.token_type != TOKEN_TYPE_DIGIT &&
+        token.token_type != TOKEN_TYPE_IDENTIFIER)
+      return SQLITE_ERROR;
+
+    char *val = token.start;
+    int valLength = token.end - token.start;
+
+    if (ivf_token_equals(key, keyLength, "nlist")) {
+      if (token.token_type != TOKEN_TYPE_DIGIT) return SQLITE_ERROR;
+      int v = atoi(val);
+      if (v < 0 || v > VEC0_IVF_MAX_NLIST) return SQLITE_ERROR;
+      config->nlist = v;
+    } else if (ivf_token_equals(key, keyLength, "nprobe")) {
+      if (token.token_type != TOKEN_TYPE_DIGIT) return SQLITE_ERROR;
+      int v = atoi(val);
+      if (v < 1 || v > VEC0_IVF_MAX_NLIST) return SQLITE_ERROR;
+      config->nprobe = v;
+      nprobe_explicit = 1;
+    } else if (ivf_token_equals(key, keyLength, "quantizer")) {
+      if (token.token_type != TOKEN_TYPE_IDENTIFIER) return SQLITE_ERROR;
+      if (ivf_token_equals(val, valLength, "none")) {
+        config->quantizer = VEC0_IVF_QUANTIZER_NONE;
+      } else if (ivf_token_equals(val, valLength, "int8")) {
+        config->quantizer = VEC0_IVF_QUANTIZER_INT8;
+      } else if (ivf_token_equals(val, valLength, "binary")) {
+        config->quantizer = VEC0_IVF_QUANTIZER_BINARY;
+      } else {
+        return SQLITE_ERROR;
+      }
+    } else if (ivf_token_equals(key, keyLength, "oversample")) {
+      if (token.token_type != TOKEN_TYPE_DIGIT) return SQLITE_ERROR;
+      int v = atoi(val);
+      if (v < 1) return SQLITE_ERROR;
+      config->oversample = v;
+    } else {
+      return SQLITE_ERROR;
+    }
+
+    // After a value: ')' ends the list, ',' introduces the next key.
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc != VEC0_TOKEN_RESULT_SOME) return SQLITE_ERROR;
+    if (token.token_type == TOKEN_TYPE_RPAREN) break;
+    if (token.token_type != TOKEN_TYPE_COMMA) return SQLITE_ERROR;
+    rc = vec0_scanner_next(scanner, &token);
+  }
+
+  if (config->nprobe < 0) config->nprobe = VEC0_IVF_DEFAULT_NPROBE;
+  if (config->nlist > 0 && config->nprobe > config->nlist) {
+    // An explicit nprobe > nlist is a user error; a defaulted one is capped.
+    if (nprobe_explicit) return SQLITE_ERROR;
+    config->nprobe = config->nlist;
+  }
+
+  // Validation: oversample > 1 only makes sense with quantization
+  if (config->oversample > 1 && config->quantizer == VEC0_IVF_QUANTIZER_NONE) {
+    return SQLITE_ERROR;
+  }
+
+  return SQLITE_OK;
+}
+
+//
============================================================================ +// Helpers +// ============================================================================ + +/** + * Size of a stored vector in bytes, accounting for quantization. + */ +static int ivf_vec_size(vec0_vtab *p, int col_idx) { + int D = (int)p->vector_columns[col_idx].dimensions; + switch (p->vector_columns[col_idx].ivf.quantizer) { + case VEC0_IVF_QUANTIZER_INT8: return D; + case VEC0_IVF_QUANTIZER_BINARY: return D / 8; + default: return D * (int)sizeof(float); + } +} + +/** + * Size of the full-precision vector in bytes (always float32). + */ +static int ivf_full_vec_size(vec0_vtab *p, int col_idx) { + return (int)(p->vector_columns[col_idx].dimensions * sizeof(float)); +} + +/** + * Quantize float32 vector to int8. + * Uses unit normalization: clamp to [-1,1], scale to [-127,127]. + */ +IVF_STATIC void ivf_quantize_int8(const float *src, int8_t *dst, int D) { + for (int i = 0; i < D; i++) { + float v = src[i]; + if (v > 1.0f) v = 1.0f; + if (v < -1.0f) v = -1.0f; + dst[i] = (int8_t)(v * 127.0f); + } +} + +/** + * Quantize float32 vector to binary (sign-bit quantization). + * Each bit = 1 if src[i] > 0, else 0. + */ +IVF_STATIC void ivf_quantize_binary(const float *src, uint8_t *dst, int D) { + memset(dst, 0, D / 8); + for (int i = 0; i < D; i++) { + if (src[i] > 0.0f) { + dst[i / 8] |= (1 << (i % 8)); + } + } +} + +/** + * Quantize a float32 vector to the target type based on config. + * dst must be pre-allocated to ivf_vec_size() bytes. + * If quantizer=none, copies src as-is. 
+ */ +static void ivf_quantize(vec0_vtab *p, int col_idx, + const float *src, void *dst) { + int D = (int)p->vector_columns[col_idx].dimensions; + switch (p->vector_columns[col_idx].ivf.quantizer) { + case VEC0_IVF_QUANTIZER_INT8: + ivf_quantize_int8(src, (int8_t *)dst, D); + break; + case VEC0_IVF_QUANTIZER_BINARY: + ivf_quantize_binary(src, (uint8_t *)dst, D); + break; + default: + memcpy(dst, src, D * sizeof(float)); + break; + } +} + +// Forward declaration +static float ivf_distance(vec0_vtab *p, int col_idx, const void *a, const void *b); + +/** + * Find nearest centroid. Works with quantized or float centroids. + * vec and centroids must be in the same representation (both quantized or both float). + * vecSize = size of one vector in bytes. + */ +static int ivf_find_nearest_centroid(vec0_vtab *p, int col_idx, + const void *vec, const void *centroids, + int vecSize, int k) { + float min_dist = FLT_MAX; + int best = 0; + const unsigned char *cdata = (const unsigned char *)centroids; + for (int c = 0; c < k; c++) { + float dist = ivf_distance(p, col_idx, vec, cdata + c * vecSize); + if (dist < min_dist) { min_dist = dist; best = c; } + } + return best; +} + +/** + * Compute distance between two vectors using the column's distance_metric. + * Dispatches to SIMD-optimized functions (NEON/AVX) via distance_*_float(). + * For float32 (non-quantized) vectors. + */ +static float ivf_distance_float(vec0_vtab *p, int col_idx, + const float *a, const float *b) { + size_t dims = p->vector_columns[col_idx].dimensions; + switch (p->vector_columns[col_idx].distance_metric) { + case VEC0_DISTANCE_METRIC_COSINE: + return distance_cosine_float(a, b, &dims); + case VEC0_DISTANCE_METRIC_L1: + return (float)distance_l1_f32(a, b, &dims); + case VEC0_DISTANCE_METRIC_L2: + default: + return distance_l2_sqr_float(a, b, &dims); + } +} + +/** + * Compute distance between two quantized vectors. + * For int8: uses L2 or cosine on int8. + * For binary: uses hamming distance. 
+ * For none: delegates to ivf_distance_float. + */ +static float ivf_distance(vec0_vtab *p, int col_idx, + const void *a, const void *b) { + size_t dims = p->vector_columns[col_idx].dimensions; + switch (p->vector_columns[col_idx].ivf.quantizer) { + case VEC0_IVF_QUANTIZER_INT8: + return distance_l2_sqr_int8(a, b, &dims); + case VEC0_IVF_QUANTIZER_BINARY: + return distance_hamming(a, b, &dims); + default: + return ivf_distance_float(p, col_idx, (const float *)a, (const float *)b); + } +} + +static int ivf_ensure_stmt(vec0_vtab *p, sqlite3_stmt **pStmt, const char *fmt, + int col_idx) { + if (*pStmt) return SQLITE_OK; + char *zSql = sqlite3_mprintf(fmt, p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, pStmt, NULL); + sqlite3_free(zSql); + return rc; +} + +static int ivf_exec(vec0_vtab *p, const char *fmt, int col_idx) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf(fmt, p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc == SQLITE_OK) { sqlite3_step(stmt); sqlite3_finalize(stmt); } + return SQLITE_OK; +} + +static int ivf_is_trained(vec0_vtab *p, int col_idx) { + if (p->ivfTrainedCache[col_idx] >= 0) return p->ivfTrainedCache[col_idx]; + sqlite3_stmt *stmt = NULL; + int trained = 0; + char *zSql = sqlite3_mprintf( + "SELECT value FROM " VEC0_SHADOW_INFO_NAME " WHERE key = 'ivf_trained_%d'", + p->schemaName, p->tableName, col_idx); + if (!zSql) return 0; + if (sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL) == SQLITE_OK) { + if (sqlite3_step(stmt) == SQLITE_ROW) + trained = (sqlite3_column_int(stmt, 0) == 1); + } + sqlite3_free(zSql); + sqlite3_finalize(stmt); + p->ivfTrainedCache[col_idx] = trained; + return trained; +} + +// ============================================================================ +// Cell operations — fixed-size cells, multiple rows per centroid 
+// ============================================================================ + +/** + * Create a new cell row. Returns the new cell_id (rowid) via *out_cell_id. + */ +static int ivf_cell_create(vec0_vtab *p, int col_idx, i64 centroid_id, + i64 *out_cell_id) { + sqlite3_stmt *stmt = NULL; + int rc; + int cap = VEC0_IVF_CELL_MAX_VECTORS; + int vecSize = ivf_vec_size(p, col_idx); + char *zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_IVF_CELLS_NAME + " (centroid_id, n_vectors, validity, rowids, vectors) VALUES (?, 0, ?, ?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + sqlite3_bind_int64(stmt, 1, centroid_id); + sqlite3_bind_zeroblob(stmt, 2, cap / 8); + sqlite3_bind_zeroblob(stmt, 3, cap * (int)sizeof(i64)); + sqlite3_bind_zeroblob(stmt, 4, cap * vecSize); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) return SQLITE_ERROR; + if (out_cell_id) *out_cell_id = sqlite3_last_insert_rowid(p->db); + return SQLITE_OK; +} + +/** + * Find a cell with space for the given centroid, or create one. + * Returns cell_id (rowid) and current n_vectors. + */ +static int ivf_cell_find_or_create(vec0_vtab *p, int col_idx, i64 centroid_id, + i64 *out_cell_id, int *out_n) { + int rc; + // Find existing cell with space + rc = ivf_ensure_stmt(p, &p->stmtIvfCellMeta[col_idx], + "SELECT rowid, n_vectors FROM " VEC0_SHADOW_IVF_CELLS_NAME + " WHERE centroid_id = ? AND n_vectors < %d LIMIT 1", + col_idx); + // The %d in the format won't work with ivf_ensure_stmt since it only has 3 + // format args. Use a direct approach instead. + sqlite3_finalize(p->stmtIvfCellMeta[col_idx]); + p->stmtIvfCellMeta[col_idx] = NULL; + + char *zSql = sqlite3_mprintf( + "SELECT rowid, n_vectors FROM " VEC0_SHADOW_IVF_CELLS_NAME + " WHERE centroid_id = ? 
AND n_vectors < %d LIMIT 1", + p->schemaName, p->tableName, col_idx, VEC0_IVF_CELL_MAX_VECTORS); + if (!zSql) return SQLITE_NOMEM; + // Cache this manually + if (!p->stmtIvfCellMeta[col_idx]) { + rc = sqlite3_prepare_v2(p->db, zSql, -1, &p->stmtIvfCellMeta[col_idx], NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + } else { + sqlite3_free(zSql); + } + + sqlite3_stmt *stmt = p->stmtIvfCellMeta[col_idx]; + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, centroid_id); + + if (sqlite3_step(stmt) == SQLITE_ROW) { + *out_cell_id = sqlite3_column_int64(stmt, 0); + *out_n = sqlite3_column_int(stmt, 1); + return SQLITE_OK; + } + + // No cell with space — create new one + rc = ivf_cell_create(p, col_idx, centroid_id, out_cell_id); + *out_n = 0; + return rc; +} + +/** + * Insert vector into cell at slot = n_vectors (append). + * Cell must have space (n_vectors < VEC0_IVF_CELL_MAX_VECTORS). + */ +static int ivf_cell_insert(vec0_vtab *p, int col_idx, i64 centroid_id, + i64 rowid, const void *vectorData, int vectorSize) { + int rc; + i64 cell_id; + int n_vectors; + + rc = ivf_cell_find_or_create(p, col_idx, centroid_id, &cell_id, &n_vectors); + if (rc != SQLITE_OK) return rc; + + int slot = n_vectors; + char *cellsTable = p->shadowIvfCellsNames[col_idx]; + + // Set validity bit + sqlite3_blob *blob = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, cellsTable, "validity", + cell_id, 1, &blob); + if (rc != SQLITE_OK) return rc; + unsigned char bx; + sqlite3_blob_read(blob, &bx, 1, slot / 8); + bx |= (1 << (slot % 8)); + sqlite3_blob_write(blob, &bx, 1, slot / 8); + sqlite3_blob_close(blob); + + // Write rowid + rc = sqlite3_blob_open(p->db, p->schemaName, cellsTable, "rowids", + cell_id, 1, &blob); + if (rc == SQLITE_OK) { + sqlite3_blob_write(blob, &rowid, sizeof(i64), slot * (int)sizeof(i64)); + sqlite3_blob_close(blob); + } + + // Write vector + rc = sqlite3_blob_open(p->db, p->schemaName, cellsTable, "vectors", + cell_id, 1, &blob); + if (rc == 
SQLITE_OK) { + sqlite3_blob_write(blob, vectorData, vectorSize, slot * vectorSize); + sqlite3_blob_close(blob); + } + + // Increment n_vectors (cached) + ivf_ensure_stmt(p, &p->stmtIvfCellUpdateN[col_idx], + "UPDATE " VEC0_SHADOW_IVF_CELLS_NAME + " SET n_vectors = n_vectors + 1 WHERE rowid = ?", col_idx); + if (p->stmtIvfCellUpdateN[col_idx]) { + sqlite3_stmt *s = p->stmtIvfCellUpdateN[col_idx]; + sqlite3_reset(s); + sqlite3_bind_int64(s, 1, cell_id); + sqlite3_step(s); + } + + // Insert rowid_map (cached) + ivf_ensure_stmt(p, &p->stmtIvfRowidMapInsert[col_idx], + "INSERT INTO " VEC0_SHADOW_IVF_ROWID_MAP_NAME + " (rowid, cell_id, slot) VALUES (?, ?, ?)", col_idx); + if (p->stmtIvfRowidMapInsert[col_idx]) { + sqlite3_stmt *s = p->stmtIvfRowidMapInsert[col_idx]; + sqlite3_reset(s); + sqlite3_bind_int64(s, 1, rowid); + sqlite3_bind_int64(s, 2, cell_id); + sqlite3_bind_int(s, 3, slot); + sqlite3_step(s); + } + + return SQLITE_OK; +} + +// ============================================================================ +// Shadow tables +// ============================================================================ + +static int ivf_create_shadow_tables(vec0_vtab *p, int col_idx) { + sqlite3_stmt *stmt = NULL; + int rc; + char *zSql; + + zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_IVF_CENTROIDS_NAME + " (centroid_id INTEGER PRIMARY KEY, centroid BLOB NOT NULL)", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK || sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); return SQLITE_ERROR; } + sqlite3_finalize(stmt); + + // cell_id is rowid (auto-increment), centroid_id is indexed + zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_IVF_CELLS_NAME + " (centroid_id INTEGER NOT NULL," + " n_vectors INTEGER NOT NULL DEFAULT 0," + " validity BLOB NOT NULL," + " rowids BLOB NOT NULL," + " vectors BLOB NOT NULL)", + p->schemaName, 
p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK || sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); return SQLITE_ERROR; } + sqlite3_finalize(stmt); + + // Index on centroid_id for cell lookup + zSql = sqlite3_mprintf( + "CREATE INDEX \"%w_ivf_cells%02d_centroid\" ON \"%w_ivf_cells%02d\" (centroid_id)", + p->tableName, col_idx, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK || sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); return SQLITE_ERROR; } + sqlite3_finalize(stmt); + + zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_IVF_ROWID_MAP_NAME + " (rowid INTEGER PRIMARY KEY, cell_id INTEGER NOT NULL, slot INTEGER NOT NULL)", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK || sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); return SQLITE_ERROR; } + sqlite3_finalize(stmt); + + // _ivf_vectors — full-precision KV store (only when quantizer != none) + if (p->vector_columns[col_idx].ivf.quantizer != VEC0_IVF_QUANTIZER_NONE) { + zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_IVF_VECTORS_NAME + " (rowid INTEGER PRIMARY KEY, vector BLOB NOT NULL)", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK || sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); return SQLITE_ERROR; } + sqlite3_finalize(stmt); + } + + zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_INFO_NAME " (key, value) VALUES ('ivf_trained_%d', '0')", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); 
sqlite3_free(zSql); + if (rc != SQLITE_OK || sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); return SQLITE_ERROR; } + sqlite3_finalize(stmt); + + return SQLITE_OK; +} + +static int ivf_drop_shadow_tables(vec0_vtab *p, int col_idx) { + ivf_exec(p, "DROP TABLE IF EXISTS " VEC0_SHADOW_IVF_CENTROIDS_NAME, col_idx); + ivf_exec(p, "DROP TABLE IF EXISTS " VEC0_SHADOW_IVF_CELLS_NAME, col_idx); + ivf_exec(p, "DROP TABLE IF EXISTS " VEC0_SHADOW_IVF_ROWID_MAP_NAME, col_idx); + ivf_exec(p, "DROP TABLE IF EXISTS " VEC0_SHADOW_IVF_VECTORS_NAME, col_idx); + return SQLITE_OK; +} + +// ============================================================================ +// Insert / Delete +// ============================================================================ + +static int ivf_insert(vec0_vtab *p, int col_idx, i64 rowid, + const void *vectorData, int vectorSize) { + UNUSED_PARAMETER(vectorSize); + int quantizer = p->vector_columns[col_idx].ivf.quantizer; + int qvecSize = ivf_vec_size(p, col_idx); + int rc; + + // Quantize the input vector (or copy as-is if no quantization) + void *qvec = sqlite3_malloc(qvecSize); + if (!qvec) return SQLITE_NOMEM; + ivf_quantize(p, col_idx, (const float *)vectorData, qvec); + + if (!ivf_is_trained(p, col_idx)) { + rc = ivf_cell_insert(p, col_idx, VEC0_IVF_UNASSIGNED_CENTROID_ID, + rowid, qvec, qvecSize); + } else { + // Find nearest centroid using quantized distance + int best_centroid = -1; + float min_dist = FLT_MAX; + + rc = ivf_ensure_stmt(p, &p->stmtIvfCentroidsAll[col_idx], + "SELECT centroid_id, centroid FROM " VEC0_SHADOW_IVF_CENTROIDS_NAME, col_idx); + if (rc != SQLITE_OK) { sqlite3_free(qvec); return rc; } + sqlite3_stmt *stmt = p->stmtIvfCentroidsAll[col_idx]; + sqlite3_reset(stmt); + while (sqlite3_step(stmt) == SQLITE_ROW) { + int cid = sqlite3_column_int(stmt, 0); + const void *c = sqlite3_column_blob(stmt, 1); + int cBytes = sqlite3_column_bytes(stmt, 1); + if (!c || cBytes != qvecSize) continue; + float dist = 
ivf_distance(p, col_idx, qvec, c); + if (dist < min_dist) { min_dist = dist; best_centroid = cid; } + } + if (best_centroid < 0) { sqlite3_free(qvec); return SQLITE_ERROR; } + + rc = ivf_cell_insert(p, col_idx, best_centroid, rowid, qvec, qvecSize); + } + + sqlite3_free(qvec); + if (rc != SQLITE_OK) return rc; + + // Store full-precision vector in KV table when quantized + if (quantizer != VEC0_IVF_QUANTIZER_NONE) { + sqlite3_stmt *stmt = NULL; + char *zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_IVF_VECTORS_NAME " (rowid, vector) VALUES (?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + sqlite3_bind_int64(stmt, 1, rowid); + sqlite3_bind_blob(stmt, 2, vectorData, ivf_full_vec_size(p, col_idx), SQLITE_STATIC); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) return SQLITE_ERROR; + } + + return SQLITE_OK; +} + +static int ivf_delete(vec0_vtab *p, int col_idx, i64 rowid) { + int rc; + i64 cell_id = 0; + int slot = -1; + + rc = ivf_ensure_stmt(p, &p->stmtIvfRowidMapLookup[col_idx], + "SELECT cell_id, slot FROM " VEC0_SHADOW_IVF_ROWID_MAP_NAME + " WHERE rowid = ?", col_idx); + if (rc != SQLITE_OK) return rc; + sqlite3_stmt *s = p->stmtIvfRowidMapLookup[col_idx]; + sqlite3_reset(s); + sqlite3_bind_int64(s, 1, rowid); + if (sqlite3_step(s) == SQLITE_ROW) { + cell_id = sqlite3_column_int64(s, 0); + slot = sqlite3_column_int(s, 1); + } + if (slot < 0) return SQLITE_OK; + + // Clear validity bit + char *cellsTable = p->shadowIvfCellsNames[col_idx]; + sqlite3_blob *blob = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, cellsTable, "validity", + cell_id, 1, &blob); + if (rc == SQLITE_OK) { + unsigned char bx; + sqlite3_blob_read(blob, &bx, 1, slot / 8); + bx &= ~(1 << (slot % 8)); + sqlite3_blob_write(blob, &bx, 1, slot / 8); + sqlite3_blob_close(blob); + } + + // Decrement n_vectors + if 
(p->stmtIvfCellUpdateN[col_idx]) { + // This stmt does +1, but we want -1. Use a different cached stmt. + } + // Just use inline for decrement (not hot path) + { + sqlite3_stmt *stmtDec = NULL; + char *zSql = sqlite3_mprintf( + "UPDATE " VEC0_SHADOW_IVF_CELLS_NAME + " SET n_vectors = n_vectors - 1 WHERE rowid = ?", + p->schemaName, p->tableName, col_idx); + if (zSql) { + sqlite3_prepare_v2(p->db, zSql, -1, &stmtDec, NULL); sqlite3_free(zSql); + if (stmtDec) { sqlite3_bind_int64(stmtDec, 1, cell_id); sqlite3_step(stmtDec); sqlite3_finalize(stmtDec); } + } + } + + // Delete from rowid_map + ivf_ensure_stmt(p, &p->stmtIvfRowidMapDelete[col_idx], + "DELETE FROM " VEC0_SHADOW_IVF_ROWID_MAP_NAME " WHERE rowid = ?", col_idx); + if (p->stmtIvfRowidMapDelete[col_idx]) { + sqlite3_stmt *sd = p->stmtIvfRowidMapDelete[col_idx]; + sqlite3_reset(sd); + sqlite3_bind_int64(sd, 1, rowid); + sqlite3_step(sd); + } + + // Delete from _ivf_vectors (full-precision KV) when quantized + if (p->vector_columns[col_idx].ivf.quantizer != VEC0_IVF_QUANTIZER_NONE) { + sqlite3_stmt *stmtDelVec = NULL; + char *zSql = sqlite3_mprintf( + "DELETE FROM " VEC0_SHADOW_IVF_VECTORS_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, col_idx); + if (zSql) { + sqlite3_prepare_v2(p->db, zSql, -1, &stmtDelVec, NULL); sqlite3_free(zSql); + if (stmtDelVec) { sqlite3_bind_int64(stmtDelVec, 1, rowid); sqlite3_step(stmtDelVec); sqlite3_finalize(stmtDelVec); } + } + } + + return SQLITE_OK; +} + +// ============================================================================ +// Point query +// ============================================================================ + +static int ivf_get_vector_data(vec0_vtab *p, i64 rowid, int col_idx, + void **outVector, int *outVectorSize) { + int rc; + int vecSize = ivf_vec_size(p, col_idx); + i64 cell_id = 0; + int slot = -1; + + rc = ivf_ensure_stmt(p, &p->stmtIvfRowidMapLookup[col_idx], + "SELECT cell_id, slot FROM " VEC0_SHADOW_IVF_ROWID_MAP_NAME + " WHERE rowid = 
?", col_idx); + if (rc != SQLITE_OK) return rc; + sqlite3_stmt *s = p->stmtIvfRowidMapLookup[col_idx]; + sqlite3_reset(s); + sqlite3_bind_int64(s, 1, rowid); + if (sqlite3_step(s) != SQLITE_ROW) return SQLITE_EMPTY; + cell_id = sqlite3_column_int64(s, 0); + slot = sqlite3_column_int(s, 1); + + void *buf = sqlite3_malloc(vecSize); + if (!buf) return SQLITE_NOMEM; + + sqlite3_blob *blob = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowIvfCellsNames[col_idx], + "vectors", cell_id, 0, &blob); + if (rc != SQLITE_OK) { sqlite3_free(buf); return rc; } + rc = sqlite3_blob_read(blob, buf, vecSize, slot * vecSize); + sqlite3_blob_close(blob); + if (rc != SQLITE_OK) { sqlite3_free(buf); return rc; } + + *outVector = buf; + if (outVectorSize) *outVectorSize = vecSize; + return SQLITE_OK; +} + +// ============================================================================ +// Centroid commands +// ============================================================================ + +static int ivf_load_all_vectors(vec0_vtab *p, int col_idx, + float **out_vectors, i64 **out_rowids, int *out_N) { + sqlite3_stmt *stmt = NULL; + int rc; + int D = (int)p->vector_columns[col_idx].dimensions; + int vecSize = D * (int)sizeof(float); + int quantizer = p->vector_columns[col_idx].ivf.quantizer; + + // When quantized, load full-precision vectors from _ivf_vectors KV table + if (quantizer != VEC0_IVF_QUANTIZER_NONE) { + int total = 0; + char *zSql = sqlite3_mprintf( + "SELECT count(*) FROM " VEC0_SHADOW_IVF_VECTORS_NAME, + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW) total = sqlite3_column_int(stmt, 0); + sqlite3_finalize(stmt); + if (total == 0) { *out_vectors = NULL; *out_rowids = NULL; *out_N = 0; return SQLITE_OK; } + + float *vectors = sqlite3_malloc64((i64)total * D * sizeof(float)); + i64 *rowids = 
sqlite3_malloc64((i64)total * sizeof(i64)); + if (!vectors || !rowids) { sqlite3_free(vectors); sqlite3_free(rowids); return SQLITE_NOMEM; } + + int idx = 0; + zSql = sqlite3_mprintf( + "SELECT rowid, vector FROM " VEC0_SHADOW_IVF_VECTORS_NAME, + p->schemaName, p->tableName, col_idx); + if (!zSql) { sqlite3_free(vectors); sqlite3_free(rowids); return SQLITE_NOMEM; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc == SQLITE_OK) { + while (sqlite3_step(stmt) == SQLITE_ROW && idx < total) { + rowids[idx] = sqlite3_column_int64(stmt, 0); + const void *blob = sqlite3_column_blob(stmt, 1); + int blobBytes = sqlite3_column_bytes(stmt, 1); + if (blob && blobBytes == vecSize) { + memcpy(&vectors[idx * D], blob, vecSize); + idx++; + } + } + } + sqlite3_finalize(stmt); + *out_vectors = vectors; *out_rowids = rowids; *out_N = idx; + return SQLITE_OK; + } + + // Non-quantized: load from cells (existing path) + + // Count total + int total = 0; + char *zSql = sqlite3_mprintf( + "SELECT COALESCE(SUM(n_vectors),0) FROM " VEC0_SHADOW_IVF_CELLS_NAME, + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW) total = sqlite3_column_int(stmt, 0); + sqlite3_finalize(stmt); + + if (total == 0) { *out_vectors = NULL; *out_rowids = NULL; *out_N = 0; return SQLITE_OK; } + + float *vectors = sqlite3_malloc64((i64)total * D * sizeof(float)); + i64 *rowids = sqlite3_malloc64((i64)total * sizeof(i64)); + if (!vectors || !rowids) { sqlite3_free(vectors); sqlite3_free(rowids); return SQLITE_NOMEM; } + + int idx = 0; + zSql = sqlite3_mprintf( + "SELECT n_vectors, validity, rowids, vectors FROM " VEC0_SHADOW_IVF_CELLS_NAME, + p->schemaName, p->tableName, col_idx); + if (!zSql) { sqlite3_free(vectors); sqlite3_free(rowids); return SQLITE_NOMEM; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); 
sqlite3_free(zSql); + if (rc != SQLITE_OK) { sqlite3_free(vectors); sqlite3_free(rowids); return rc; } + + while (sqlite3_step(stmt) == SQLITE_ROW) { + int n = sqlite3_column_int(stmt, 0); + if (n == 0) continue; + const unsigned char *val = (const unsigned char *)sqlite3_column_blob(stmt, 1); + const i64 *rids = (const i64 *)sqlite3_column_blob(stmt, 2); + const float *vecs = (const float *)sqlite3_column_blob(stmt, 3); + int valBytes = sqlite3_column_bytes(stmt, 1); + int ridsBytes = sqlite3_column_bytes(stmt, 2); + int vecsBytes = sqlite3_column_bytes(stmt, 3); + if (!val || !rids || !vecs) continue; + int cap = valBytes * 8; + // Clamp cap to the number of entries actually backed by the rowids and vectors blobs + if (ridsBytes / (int)sizeof(i64) < cap) cap = ridsBytes / (int)sizeof(i64); + if (vecsBytes / vecSize < cap) cap = vecsBytes / vecSize; + for (int i = 0; i < cap && idx < total; i++) { + if (val[i / 8] & (1 << (i % 8))) { + rowids[idx] = rids[i]; + memcpy(&vectors[idx * D], &vecs[i * D], vecSize); + idx++; + } + } + } + sqlite3_finalize(stmt); + *out_vectors = vectors; *out_rowids = rowids; *out_N = idx; + return SQLITE_OK; +} + +static void ivf_invalidate_cached(vec0_vtab *p, int col_idx) { + sqlite3_finalize(p->stmtIvfCellMeta[col_idx]); p->stmtIvfCellMeta[col_idx] = NULL; + sqlite3_finalize(p->stmtIvfCentroidsAll[col_idx]); p->stmtIvfCentroidsAll[col_idx] = NULL; + sqlite3_finalize(p->stmtIvfCellUpdateN[col_idx]); p->stmtIvfCellUpdateN[col_idx] = NULL; + sqlite3_finalize(p->stmtIvfRowidMapInsert[col_idx]); p->stmtIvfRowidMapInsert[col_idx] = NULL; +} + +static int ivf_cmd_compute_centroids(vec0_vtab *p, int col_idx, int nlist_override, + int max_iter, uint32_t seed) { + int rc; + int D = (int)p->vector_columns[col_idx].dimensions; + int vecSize = D * (int)sizeof(float); + int quantizer = p->vector_columns[col_idx].ivf.quantizer; + int nlist = nlist_override > 0 ? 
nlist_override : p->vector_columns[col_idx].ivf.nlist; + if (nlist <= 0) { vtab_set_error(&p->base, "nlist must be specified"); return SQLITE_ERROR; } + + float *vectors = NULL; i64 *rowids = NULL; int N = 0; + rc = ivf_load_all_vectors(p, col_idx, &vectors, &rowids, &N); + if (rc != SQLITE_OK) return rc; + if (N == 0) { vtab_set_error(&p->base, "No vectors"); sqlite3_free(vectors); sqlite3_free(rowids); return SQLITE_ERROR; } + if (nlist > N) nlist = N; + + float *centroids = sqlite3_malloc64((i64)nlist * D * sizeof(float)); + if (!centroids) { sqlite3_free(vectors); sqlite3_free(rowids); return SQLITE_NOMEM; } + if (ivf_kmeans(vectors, N, D, nlist, max_iter, seed, centroids) != 0) { + sqlite3_free(vectors); sqlite3_free(rowids); sqlite3_free(centroids); return SQLITE_ERROR; + } + + // Compute assignments + int *assignments = sqlite3_malloc64((i64)N * sizeof(int)); + if (!assignments) { sqlite3_free(vectors); sqlite3_free(rowids); sqlite3_free(centroids); return SQLITE_NOMEM; } + // Assignment uses float32 distances (k-means operates in float32 space) + for (int i = 0; i < N; i++) { + float min_d = FLT_MAX; + int best = 0; + for (int c = 0; c < nlist; c++) { + float d = ivf_distance_float(p, col_idx, &vectors[i * D], ¢roids[c * D]); + if (d < min_d) { min_d = d; best = c; } + } + assignments[i] = best; + } + + // Invalidate all cached stmts before dropping/recreating tables + ivf_invalidate_cached(p, col_idx); + + sqlite3_exec(p->db, "SAVEPOINT ivf_train", NULL, NULL, NULL); + sqlite3_stmt *stmt = NULL; + char *zSql; + + // Clear all data + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_CENTROIDS_NAME, col_idx); + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_CELLS_NAME, col_idx); + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_ROWID_MAP_NAME, col_idx); + + // Write centroids (quantized if quantizer is set) + int qvecSize = ivf_vec_size(p, col_idx); + void *qbuf = sqlite3_malloc(qvecSize > vecSize ? 
qvecSize : vecSize); + if (!qbuf) { rc = SQLITE_NOMEM; goto train_error; } + + zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_IVF_CENTROIDS_NAME " (centroid_id, centroid) VALUES (?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) { sqlite3_free(qbuf); rc = SQLITE_NOMEM; goto train_error; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK) { sqlite3_free(qbuf); goto train_error; } + for (int i = 0; i < nlist; i++) { + ivf_quantize(p, col_idx, ¢roids[i * D], qbuf); + sqlite3_reset(stmt); + sqlite3_bind_int(stmt, 1, i); + sqlite3_bind_blob(stmt, 2, qbuf, qvecSize, SQLITE_TRANSIENT); + if (sqlite3_step(stmt) != SQLITE_DONE) { sqlite3_finalize(stmt); sqlite3_free(qbuf); rc = SQLITE_ERROR; goto train_error; } + } + sqlite3_finalize(stmt); + + // Build cells: group vectors by centroid, create fixed-size cells + { + // Prepare INSERT statements + sqlite3_stmt *stmtCell = NULL; + zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_IVF_CELLS_NAME + " (centroid_id, n_vectors, validity, rowids, vectors) VALUES (?, ?, ?, ?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) { rc = SQLITE_NOMEM; goto train_error; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtCell, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK) goto train_error; + + sqlite3_stmt *stmtMap = NULL; + zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_IVF_ROWID_MAP_NAME " (rowid, cell_id, slot) VALUES (?, ?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) { sqlite3_finalize(stmtCell); rc = SQLITE_NOMEM; goto train_error; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtMap, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK) { sqlite3_finalize(stmtCell); goto train_error; } + + int cap = VEC0_IVF_CELL_MAX_VECTORS; + unsigned char *val = sqlite3_malloc(cap / 8); + i64 *rids = sqlite3_malloc64((i64)cap * sizeof(i64)); + unsigned char *vecs = sqlite3_malloc64((i64)cap * qvecSize); // quantized size + if (!val || !rids || 
!vecs) { + sqlite3_free(val); sqlite3_free(rids); sqlite3_free(vecs); + sqlite3_finalize(stmtCell); sqlite3_finalize(stmtMap); + sqlite3_free(qbuf); + rc = SQLITE_NOMEM; goto train_error; + } + + // Process one centroid at a time + for (int c = 0; c < nlist; c++) { + int slot = 0; + memset(val, 0, cap / 8); + memset(rids, 0, cap * sizeof(i64)); + + for (int i = 0; i < N; i++) { + if (assignments[i] != c) continue; + + if (slot >= cap) { + // Flush current cell + sqlite3_reset(stmtCell); + sqlite3_bind_int(stmtCell, 1, c); + sqlite3_bind_int(stmtCell, 2, slot); + sqlite3_bind_blob(stmtCell, 3, val, cap / 8, SQLITE_TRANSIENT); + sqlite3_bind_blob(stmtCell, 4, rids, cap * (int)sizeof(i64), SQLITE_TRANSIENT); + sqlite3_bind_blob(stmtCell, 5, vecs, cap * qvecSize, SQLITE_TRANSIENT); + sqlite3_step(stmtCell); + i64 flushed_cell_id = sqlite3_last_insert_rowid(p->db); + + for (int s = 0; s < slot; s++) { + sqlite3_reset(stmtMap); + sqlite3_bind_int64(stmtMap, 1, rids[s]); + sqlite3_bind_int64(stmtMap, 2, flushed_cell_id); + sqlite3_bind_int(stmtMap, 3, s); + sqlite3_step(stmtMap); + } + + slot = 0; + memset(val, 0, cap / 8); + memset(rids, 0, cap * sizeof(i64)); + } + + val[slot / 8] |= (1 << (slot % 8)); + rids[slot] = rowids[i]; + // Quantize float32 vector into cell blob + ivf_quantize(p, col_idx, &vectors[i * D], &vecs[slot * qvecSize]); + slot++; + } + + // Flush remaining + if (slot > 0) { + sqlite3_reset(stmtCell); + sqlite3_bind_int(stmtCell, 1, c); + sqlite3_bind_int(stmtCell, 2, slot); + sqlite3_bind_blob(stmtCell, 3, val, cap / 8, SQLITE_TRANSIENT); + sqlite3_bind_blob(stmtCell, 4, rids, cap * (int)sizeof(i64), SQLITE_TRANSIENT); + sqlite3_bind_blob(stmtCell, 5, vecs, cap * qvecSize, SQLITE_TRANSIENT); + sqlite3_step(stmtCell); + i64 flushed_cell_id = sqlite3_last_insert_rowid(p->db); + + for (int s = 0; s < slot; s++) { + sqlite3_reset(stmtMap); + sqlite3_bind_int64(stmtMap, 1, rids[s]); + sqlite3_bind_int64(stmtMap, 2, flushed_cell_id); + 
sqlite3_bind_int(stmtMap, 3, s); + sqlite3_step(stmtMap); + } + } + } + + sqlite3_free(val); sqlite3_free(rids); sqlite3_free(vecs); + sqlite3_finalize(stmtCell); sqlite3_finalize(stmtMap); + } + + sqlite3_free(qbuf); + + // Store full-precision vectors in _ivf_vectors when quantized + if (quantizer != VEC0_IVF_QUANTIZER_NONE) { + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_VECTORS_NAME, col_idx); + zSql = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_IVF_VECTORS_NAME " (rowid, vector) VALUES (?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) { rc = SQLITE_NOMEM; goto train_error; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK) goto train_error; + for (int i = 0; i < N; i++) { + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, rowids[i]); + sqlite3_bind_blob(stmt, 2, &vectors[i * D], vecSize, SQLITE_STATIC); + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + } + + // Set trained = 1 + { + zSql = sqlite3_mprintf( + "INSERT OR REPLACE INTO " VEC0_SHADOW_INFO_NAME " (key, value) VALUES ('ivf_trained_%d', '1')", + p->schemaName, p->tableName, col_idx); + if (zSql) { sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + sqlite3_step(stmt); sqlite3_finalize(stmt); } + } + p->ivfTrainedCache[col_idx] = 1; + + sqlite3_exec(p->db, "RELEASE ivf_train", NULL, NULL, NULL); + sqlite3_free(vectors); sqlite3_free(rowids); sqlite3_free(centroids); sqlite3_free(assignments); + return SQLITE_OK; + +train_error: + sqlite3_exec(p->db, "ROLLBACK TO ivf_train", NULL, NULL, NULL); + sqlite3_exec(p->db, "RELEASE ivf_train", NULL, NULL, NULL); + sqlite3_free(vectors); sqlite3_free(rowids); sqlite3_free(centroids); sqlite3_free(assignments); + return rc; +} + +static int ivf_cmd_set_centroid(vec0_vtab *p, int col_idx, int centroid_id, + const void *vectorData, int vectorSize) { + sqlite3_stmt *stmt = NULL; + int rc; + int D = (int)p->vector_columns[col_idx].dimensions; + if (vectorSize != (int)(D * 
sizeof(float))) { vtab_set_error(&p->base, "Dimension mismatch"); return SQLITE_ERROR; } + + char *zSql = sqlite3_mprintf( + "INSERT OR REPLACE INTO " VEC0_SHADOW_IVF_CENTROIDS_NAME " (centroid_id, centroid) VALUES (?, ?)", + p->schemaName, p->tableName, col_idx); + if (!zSql) return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc != SQLITE_OK) return rc; + sqlite3_bind_int(stmt, 1, centroid_id); + sqlite3_bind_blob(stmt, 2, vectorData, vectorSize, SQLITE_STATIC); + rc = sqlite3_step(stmt); sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) return SQLITE_ERROR; + + zSql = sqlite3_mprintf( + "INSERT OR REPLACE INTO " VEC0_SHADOW_INFO_NAME " (key, value) VALUES ('ivf_trained_%d', '1')", + p->schemaName, p->tableName, col_idx); + if (zSql) { sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + sqlite3_step(stmt); sqlite3_finalize(stmt); } + p->ivfTrainedCache[col_idx] = 1; + sqlite3_finalize(p->stmtIvfCentroidsAll[col_idx]); p->stmtIvfCentroidsAll[col_idx] = NULL; + return SQLITE_OK; +} + +static int ivf_cmd_assign_vectors(vec0_vtab *p, int col_idx) { + if (!ivf_is_trained(p, col_idx)) { vtab_set_error(&p->base, "No centroids"); return SQLITE_ERROR; } + + int D = (int)p->vector_columns[col_idx].dimensions; + int vecSize = D * (int)sizeof(float); + int rc; + sqlite3_stmt *stmt = NULL; + char *zSql; + + // Load centroids + int nlist = 0; + float *centroids = NULL; + zSql = sqlite3_mprintf("SELECT count(*) FROM " VEC0_SHADOW_IVF_CENTROIDS_NAME, + p->schemaName, p->tableName, col_idx); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + if (rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW) nlist = sqlite3_column_int(stmt, 0); + sqlite3_finalize(stmt); + if (nlist == 0) { vtab_set_error(&p->base, "No centroids"); return SQLITE_ERROR; } + + centroids = sqlite3_malloc64((i64)nlist * D * sizeof(float)); + if (!centroids) return SQLITE_NOMEM; + zSql = sqlite3_mprintf("SELECT 
centroid_id, centroid FROM " VEC0_SHADOW_IVF_CENTROIDS_NAME " ORDER BY centroid_id", + p->schemaName, p->tableName, col_idx); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + { int ci = 0; while (sqlite3_step(stmt) == SQLITE_ROW && ci < nlist) { + const void *b = sqlite3_column_blob(stmt, 1); + int bBytes = sqlite3_column_bytes(stmt, 1); + if (b && bBytes == vecSize) memcpy(&centroids[ci * D], b, vecSize); + ci++; + }} + sqlite3_finalize(stmt); + + // Read unassigned cells, re-insert into trained cells + zSql = sqlite3_mprintf( + "SELECT rowid, n_vectors, validity, rowids, vectors FROM " VEC0_SHADOW_IVF_CELLS_NAME + " WHERE centroid_id = %d", + p->schemaName, p->tableName, col_idx, VEC0_IVF_UNASSIGNED_CENTROID_ID); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + + // Invalidate cached stmts since we'll be modifying cells + ivf_invalidate_cached(p, col_idx); + + while (sqlite3_step(stmt) == SQLITE_ROW) { + int n = sqlite3_column_int(stmt, 1); + const unsigned char *val = (const unsigned char *)sqlite3_column_blob(stmt, 2); + const i64 *rids = (const i64 *)sqlite3_column_blob(stmt, 3); + const float *vecs = (const float *)sqlite3_column_blob(stmt, 4); + int valBytes = sqlite3_column_bytes(stmt, 2); + int ridsBytes = sqlite3_column_bytes(stmt, 3); + int vecsBytes = sqlite3_column_bytes(stmt, 4); + if (!val || !rids || !vecs) continue; + int cap = valBytes * 8; + if (ridsBytes / (int)sizeof(i64) < cap) cap = ridsBytes / (int)sizeof(i64); + if (vecsBytes / vecSize < cap) cap = vecsBytes / vecSize; + + for (int i = 0; i < cap && n > 0; i++) { + if (!(val[i / 8] & (1 << (i % 8)))) continue; + n--; + int cid = ivf_find_nearest_centroid(p, col_idx, &vecs[i * D], centroids, D, nlist); + + // Delete old rowid_map entry + sqlite3_stmt *sd = NULL; + char *zd = sqlite3_mprintf("DELETE FROM " VEC0_SHADOW_IVF_ROWID_MAP_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, col_idx); + if (zd) { sqlite3_prepare_v2(p->db, zd, 
-1, &sd, NULL); sqlite3_free(zd); + sqlite3_bind_int64(sd, 1, rids[i]); sqlite3_step(sd); sqlite3_finalize(sd); } + + ivf_cell_insert(p, col_idx, cid, rids[i], &vecs[i * D], vecSize); + } + } + sqlite3_finalize(stmt); + + // Delete unassigned cells + zSql = sqlite3_mprintf( + "DELETE FROM " VEC0_SHADOW_IVF_CELLS_NAME " WHERE centroid_id = %d", + p->schemaName, p->tableName, col_idx, VEC0_IVF_UNASSIGNED_CENTROID_ID); + if (zSql) { sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + sqlite3_step(stmt); sqlite3_finalize(stmt); } + + sqlite3_free(centroids); + return SQLITE_OK; +} + +static int ivf_cmd_clear_centroids(vec0_vtab *p, int col_idx) { + float *vectors = NULL; i64 *rowids = NULL; int N = 0; + int vecSize = ivf_vec_size(p, col_idx); + int D = (int)p->vector_columns[col_idx].dimensions; + int rc; + sqlite3_stmt *stmt = NULL; + char *zSql; + + rc = ivf_load_all_vectors(p, col_idx, &vectors, &rowids, &N); + if (rc != SQLITE_OK) return rc; + + ivf_invalidate_cached(p, col_idx); + + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_CENTROIDS_NAME, col_idx); + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_CELLS_NAME, col_idx); + ivf_exec(p, "DELETE FROM " VEC0_SHADOW_IVF_ROWID_MAP_NAME, col_idx); + + // Re-insert all vectors into unassigned cells + for (int i = 0; i < N; i++) { + ivf_cell_insert(p, col_idx, VEC0_IVF_UNASSIGNED_CENTROID_ID, + rowids[i], &vectors[i * D], vecSize); + } + + zSql = sqlite3_mprintf( + "INSERT OR REPLACE INTO " VEC0_SHADOW_INFO_NAME " (key, value) VALUES ('ivf_trained_%d', '0')", + p->schemaName, p->tableName, col_idx); + if (zSql) { sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); sqlite3_free(zSql); + sqlite3_step(stmt); sqlite3_finalize(stmt); } + p->ivfTrainedCache[col_idx] = 0; + + sqlite3_free(vectors); sqlite3_free(rowids); + return SQLITE_OK; +} + +// ============================================================================ +// KNN Query — scan all cells for probed centroids +// 
============================================================================ + +struct IvfCentroidDist { int id; float dist; }; +struct IvfCandidate { i64 rowid; float distance; }; + +static int ivf_candidate_cmp(const void *a, const void *b) { + float da = ((const struct IvfCandidate *)a)->distance; + float db = ((const struct IvfCandidate *)b)->distance; + if (da < db) return -1; + if (da > db) return 1; + return 0; +} + +/** + * Scan cell rows from a prepared statement, computing distances in-memory. + * The statement must return (n_vectors, validity, rowids, vectors) columns. + * queryVecQ is the quantized query (same type as cell vectors). + * qvecSize is the size of one quantized vector in bytes. + */ +static int ivf_scan_cells_from_stmt(vec0_vtab *p, int col_idx, + sqlite3_stmt *stmt, + const void *queryVecQ, int qvecSize, + struct IvfCandidate **candidates, + int *nCandidates, int *cap) { + while (sqlite3_step(stmt) == SQLITE_ROW) { + int n = sqlite3_column_int(stmt, 0); + if (n == 0) continue; + const unsigned char *validity = (const unsigned char *)sqlite3_column_blob(stmt, 1); + const i64 *rowids = (const i64 *)sqlite3_column_blob(stmt, 2); + const unsigned char *vectors = (const unsigned char *)sqlite3_column_blob(stmt, 3); + int valBytes = sqlite3_column_bytes(stmt, 1); + int ridsBytes = sqlite3_column_bytes(stmt, 2); + int vecsBytes = sqlite3_column_bytes(stmt, 3); + if (!validity || !rowids || !vectors) continue; + int cell_cap = valBytes * 8; + if (ridsBytes / (int)sizeof(i64) < cell_cap) cell_cap = ridsBytes / (int)sizeof(i64); + if (vecsBytes / qvecSize < cell_cap) cell_cap = vecsBytes / qvecSize; + + int found = 0; + for (int i = 0; i < cell_cap && found < n; i++) { + if (!(validity[i / 8] & (1 << (i % 8)))) continue; + found++; + if (*nCandidates >= *cap) { + *cap *= 2; + struct IvfCandidate *tmp = sqlite3_realloc64(*candidates, (i64)*cap * sizeof(struct IvfCandidate)); + if (!tmp) return SQLITE_NOMEM; + *candidates = tmp; + } + 
(*candidates)[*nCandidates].rowid = rowids[i]; + (*candidates)[*nCandidates].distance = ivf_distance(p, col_idx, + queryVecQ, &vectors[i * qvecSize]); + (*nCandidates)++; + } + } + return SQLITE_OK; +} + +static int ivf_query_knn(vec0_vtab *p, int col_idx, + const void *queryVector, int queryVectorSize, + i64 k, struct vec0_query_knn_data *knn_data) { + UNUSED_PARAMETER(queryVectorSize); + int rc; + int nprobe = p->vector_columns[col_idx].ivf.nprobe; + int trained = ivf_is_trained(p, col_idx); + int quantizer = p->vector_columns[col_idx].ivf.quantizer; + int oversample = p->vector_columns[col_idx].ivf.oversample; + int qvecSize = ivf_vec_size(p, col_idx); + + // Quantize query vector for scanning + void *queryQ = sqlite3_malloc(qvecSize); + if (!queryQ) return SQLITE_NOMEM; + ivf_quantize(p, col_idx, (const float *)queryVector, queryQ); + + // With oversample, collect more candidates for re-ranking + i64 collect_k = (oversample > 1) ? k * oversample : k; + + int cap = (collect_k < 1024) ? 
1024 : (int)collect_k * 2; + int nCandidates = 0; + struct IvfCandidate *candidates = sqlite3_malloc64((i64)cap * sizeof(struct IvfCandidate)); + if (!candidates) { sqlite3_free(queryQ); return SQLITE_NOMEM; } + + if (trained) { + // Find top nprobe centroids using quantized distance + int nlist = 0; + rc = ivf_ensure_stmt(p, &p->stmtIvfCentroidsAll[col_idx], + "SELECT centroid_id, centroid FROM " VEC0_SHADOW_IVF_CENTROIDS_NAME, col_idx); + if (rc != SQLITE_OK) { sqlite3_free(queryQ); sqlite3_free(candidates); return rc; } + sqlite3_stmt *stmt = p->stmtIvfCentroidsAll[col_idx]; + sqlite3_reset(stmt); + + int centroid_cap = 64; + struct IvfCentroidDist *cd = sqlite3_malloc64(centroid_cap * sizeof(*cd)); + if (!cd) { sqlite3_free(queryQ); sqlite3_free(candidates); return SQLITE_NOMEM; } + + while (sqlite3_step(stmt) == SQLITE_ROW) { + if (nlist >= centroid_cap) { + centroid_cap *= 2; + struct IvfCentroidDist *tmp = sqlite3_realloc64(cd, centroid_cap * sizeof(*cd)); + if (!tmp) { sqlite3_free(cd); sqlite3_free(queryQ); sqlite3_free(candidates); return SQLITE_NOMEM; } + cd = tmp; + } + cd[nlist].id = sqlite3_column_int(stmt, 0); + const void *c = sqlite3_column_blob(stmt, 1); + int cBytes = sqlite3_column_bytes(stmt, 1); + // Compare quantized query with quantized centroid + cd[nlist].dist = (c && cBytes == qvecSize) ? ivf_distance(p, col_idx, queryQ, c) : FLT_MAX; + nlist++; + } + + int actual_nprobe = nprobe < nlist ? 
nprobe : nlist; + for (int i = 0; i < actual_nprobe; i++) { + int min_j = i; + for (int j = i + 1; j < nlist; j++) { + if (cd[j].dist < cd[min_j].dist) min_j = j; + } + if (min_j != i) { struct IvfCentroidDist tmp = cd[i]; cd[i] = cd[min_j]; cd[min_j] = tmp; } + } + + // Scan probed cells + unassigned with quantized distance + { + sqlite3_str *s = sqlite3_str_new(NULL); + sqlite3_str_appendf(s, + "SELECT n_vectors, validity, rowids, vectors FROM " VEC0_SHADOW_IVF_CELLS_NAME + " WHERE centroid_id IN (", + p->schemaName, p->tableName, col_idx); + for (int i = 0; i < actual_nprobe; i++) { + if (i > 0) sqlite3_str_appendall(s, ","); + sqlite3_str_appendf(s, "%d", cd[i].id); + } + sqlite3_str_appendf(s, ",%d)", VEC0_IVF_UNASSIGNED_CENTROID_ID); + char *zSql = sqlite3_str_finish(s); + if (!zSql) { sqlite3_free(cd); sqlite3_free(queryQ); sqlite3_free(candidates); return SQLITE_NOMEM; } + + sqlite3_stmt *stmtScan = NULL; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtScan, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) { sqlite3_free(cd); sqlite3_free(queryQ); sqlite3_free(candidates); return rc; } + + rc = ivf_scan_cells_from_stmt(p, col_idx, stmtScan, queryQ, qvecSize, + &candidates, &nCandidates, &cap); + sqlite3_finalize(stmtScan); + if (rc != SQLITE_OK) { sqlite3_free(cd); sqlite3_free(queryQ); sqlite3_free(candidates); return rc; } + } + + sqlite3_free(cd); + } else { + // Flat mode: scan only unassigned cells + sqlite3_stmt *stmtScan = NULL; + char *zSql = sqlite3_mprintf( + "SELECT n_vectors, validity, rowids, vectors FROM " VEC0_SHADOW_IVF_CELLS_NAME + " WHERE centroid_id = %d", + p->schemaName, p->tableName, col_idx, VEC0_IVF_UNASSIGNED_CENTROID_ID); + if (!zSql) { sqlite3_free(queryQ); sqlite3_free(candidates); return SQLITE_NOMEM; } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtScan, NULL); sqlite3_free(zSql); + if (rc == SQLITE_OK) { + rc = ivf_scan_cells_from_stmt(p, col_idx, stmtScan, queryQ, qvecSize, + &candidates, &nCandidates, &cap); + 
sqlite3_finalize(stmtScan); + if (rc != SQLITE_OK) { sqlite3_free(queryQ); sqlite3_free(candidates); return rc; } + } + } + + sqlite3_free(queryQ); + + // Sort candidates by quantized distance + qsort(candidates, nCandidates, sizeof(struct IvfCandidate), ivf_candidate_cmp); + + // Oversample re-ranking: re-score top (oversample*k) with full-precision vectors + if (oversample > 1 && quantizer != VEC0_IVF_QUANTIZER_NONE && nCandidates > 0) { + i64 rescore_n = collect_k < nCandidates ? collect_k : nCandidates; + sqlite3_stmt *stmtVec = NULL; + char *zSql = sqlite3_mprintf( + "SELECT vector FROM " VEC0_SHADOW_IVF_VECTORS_NAME " WHERE rowid = ?", + p->schemaName, p->tableName, col_idx); + if (zSql) { + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtVec, NULL); sqlite3_free(zSql); + if (rc == SQLITE_OK) { + for (i64 i = 0; i < rescore_n; i++) { + sqlite3_reset(stmtVec); + sqlite3_bind_int64(stmtVec, 1, candidates[i].rowid); + if (sqlite3_step(stmtVec) == SQLITE_ROW) { + const float *fullVec = (const float *)sqlite3_column_blob(stmtVec, 0); + int fullVecBytes = sqlite3_column_bytes(stmtVec, 0); + if (fullVec && fullVecBytes == (int)p->vector_columns[col_idx].dimensions * (int)sizeof(float)) { + candidates[i].distance = ivf_distance_float(p, col_idx, + (const float *)queryVector, fullVec); + } + } + } + sqlite3_finalize(stmtVec); + } + } + // Re-sort after re-scoring + qsort(candidates, (size_t)rescore_n, sizeof(struct IvfCandidate), ivf_candidate_cmp); + nCandidates = (int)rescore_n; + } + + qsort(candidates, nCandidates, sizeof(struct IvfCandidate), ivf_candidate_cmp); + i64 nResults = nCandidates < k ? 
nCandidates : k; + + if (nResults == 0) { + knn_data->rowids = NULL; knn_data->distances = NULL; + knn_data->k = k; knn_data->k_used = 0; knn_data->current_idx = 0; + sqlite3_free(candidates); return SQLITE_OK; + } + + knn_data->rowids = sqlite3_malloc64(nResults * sizeof(i64)); + knn_data->distances = sqlite3_malloc64(nResults * sizeof(f32)); + if (!knn_data->rowids || !knn_data->distances) { + sqlite3_free(knn_data->rowids); sqlite3_free(knn_data->distances); + sqlite3_free(candidates); return SQLITE_NOMEM; + } + for (i64 i = 0; i < nResults; i++) { + knn_data->rowids[i] = candidates[i].rowid; + knn_data->distances[i] = candidates[i].distance; + } + knn_data->k = k; knn_data->k_used = nResults; knn_data->current_idx = 0; + sqlite3_free(candidates); + return SQLITE_OK; +} + +// ============================================================================ +// Command dispatch +// ============================================================================ + +static int ivf_handle_command(vec0_vtab *p, const char *command, + int argc, sqlite3_value **argv) { + UNUSED_PARAMETER(argc); + int col_idx = -1; + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_IVF) { col_idx = i; break; } + } + if (col_idx < 0) return SQLITE_EMPTY; + + // nprobe=N — change nprobe at runtime without rebuilding + if (strncmp(command, "nprobe=", 7) == 0) { + int new_nprobe = atoi(command + 7); + if (new_nprobe < 1) { + vtab_set_error(&p->base, "nprobe must be >= 1"); + return SQLITE_ERROR; + } + p->vector_columns[col_idx].ivf.nprobe = new_nprobe; + return SQLITE_OK; + } + + if (strcmp(command, "compute-centroids") == 0) + return ivf_cmd_compute_centroids(p, col_idx, 0, VEC0_IVF_KMEANS_MAX_ITER, VEC0_IVF_KMEANS_DEFAULT_SEED); + + if (strncmp(command, "compute-centroids:", 18) == 0) { + const char *json = command + 18; + int nlist = 0, max_iter = VEC0_IVF_KMEANS_MAX_ITER; + uint32_t seed = VEC0_IVF_KMEANS_DEFAULT_SEED; + const char *pn = 
strstr(json, "\"nlist\":"); if (pn) nlist = atoi(pn + 8); + const char *pi = strstr(json, "\"max_iterations\":"); if (pi) max_iter = atoi(pi + 17); + const char *ps = strstr(json, "\"seed\":"); if (ps) seed = (uint32_t)atoi(ps + 7); + return ivf_cmd_compute_centroids(p, col_idx, nlist, max_iter, seed); + } + + if (strncmp(command, "set-centroid:", 13) == 0) { + int centroid_id = atoi(command + 13); + for (int i = 0; i < (int)(p->numVectorColumns + p->numPartitionColumns + + p->numAuxiliaryColumns + p->numMetadataColumns); i++) { + if (p->user_column_kinds[i] == SQLITE_VEC0_USER_COLUMN_KIND_VECTOR && + p->user_column_idxs[i] == col_idx) { + sqlite3_value *v = argv[2 + VEC0_COLUMN_USERN_START + i]; + if (sqlite3_value_type(v) == SQLITE_NULL) { vtab_set_error(&p->base, "set-centroid requires vector"); return SQLITE_ERROR; } + return ivf_cmd_set_centroid(p, col_idx, centroid_id, sqlite3_value_blob(v), sqlite3_value_bytes(v)); + } + } + return SQLITE_ERROR; + } + + if (strcmp(command, "assign-vectors") == 0) return ivf_cmd_assign_vectors(p, col_idx); + if (strcmp(command, "clear-centroids") == 0) return ivf_cmd_clear_centroids(p, col_idx); + return SQLITE_EMPTY; +} + +#endif /* SQLITE_VEC_IVF_C */ diff --git a/sqlite-vec-rescore.c b/sqlite-vec-rescore.c new file mode 100644 index 0000000..6a47214 --- /dev/null +++ b/sqlite-vec-rescore.c @@ -0,0 +1,687 @@ +/** + * sqlite-vec-rescore.c — Rescore index logic for sqlite-vec. + * + * This file is #included into sqlite-vec.c after the vec0_vtab definition. + * All functions receive a vec0_vtab *p and access p->vector_columns[i].rescore. 
+ * + * Shadow tables per rescore-enabled vector column: + * _rescore_chunks{NN} — quantized vectors in chunk layout (for coarse scan) + * _rescore_vectors{NN} — float vectors keyed by rowid (for fast rescore lookup) + */ + +// ============================================================================ +// Shadow table lifecycle +// ============================================================================ + +static int rescore_create_tables(vec0_vtab *p, sqlite3 *db, char **pzErr) { + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_RESCORE) + continue; + + // Quantized chunk table (same layout as _vector_chunks) + char *zSql = sqlite3_mprintf( + "CREATE TABLE \"%w\".\"%w_rescore_chunks%02d\"" + "(rowid PRIMARY KEY, vectors BLOB NOT NULL)", + p->schemaName, p->tableName, i); + if (!zSql) + return SQLITE_NOMEM; + sqlite3_stmt *stmt; + int rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + *pzErr = sqlite3_mprintf( + "Could not create '_rescore_chunks%02d' shadow table: %s", i, + sqlite3_errmsg(db)); + sqlite3_finalize(stmt); + return SQLITE_ERROR; + } + sqlite3_finalize(stmt); + + // Float vector table (rowid-keyed for fast random access) + zSql = sqlite3_mprintf( + "CREATE TABLE \"%w\".\"%w_rescore_vectors%02d\"" + "(rowid INTEGER PRIMARY KEY, vector BLOB NOT NULL)", + p->schemaName, p->tableName, i); + if (!zSql) + return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + *pzErr = sqlite3_mprintf( + "Could not create '_rescore_vectors%02d' shadow table: %s", i, + sqlite3_errmsg(db)); + sqlite3_finalize(stmt); + return SQLITE_ERROR; + } + sqlite3_finalize(stmt); + } + return SQLITE_OK; +} + +static int rescore_drop_tables(vec0_vtab *p) { + for (int i = 0; i < p->numVectorColumns; i++) { + sqlite3_stmt *stmt; + int rc; 
+ char *zSql; + + if (p->shadowRescoreChunksNames[i]) { + zSql = sqlite3_mprintf("DROP TABLE IF EXISTS \"%w\".\"%w\"", + p->schemaName, p->shadowRescoreChunksNames[i]); + if (!zSql) + return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + sqlite3_finalize(stmt); + return SQLITE_ERROR; + } + sqlite3_finalize(stmt); + } + + if (p->shadowRescoreVectorsNames[i]) { + zSql = sqlite3_mprintf("DROP TABLE IF EXISTS \"%w\".\"%w\"", + p->schemaName, p->shadowRescoreVectorsNames[i]); + if (!zSql) + return SQLITE_NOMEM; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + sqlite3_finalize(stmt); + return SQLITE_ERROR; + } + sqlite3_finalize(stmt); + } + } + return SQLITE_OK; +} + +static size_t rescore_quantized_byte_size(struct VectorColumnDefinition *col) { + switch (col->rescore.quantizer_type) { + case VEC0_RESCORE_QUANTIZER_BIT: + return col->dimensions / CHAR_BIT; + case VEC0_RESCORE_QUANTIZER_INT8: + return col->dimensions; + default: + return 0; + } +} + +/** + * Insert a new chunk row into each _rescore_chunks{NN} table with a zeroblob. 
+ */ +static int rescore_new_chunk(vec0_vtab *p, i64 chunk_rowid) { + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_RESCORE) + continue; + size_t quantized_size = + rescore_quantized_byte_size(&p->vector_columns[i]); + i64 blob_size = (i64)p->chunk_size * (i64)quantized_size; + + char *zSql = sqlite3_mprintf( + "INSERT INTO \"%w\".\"%w\"(_rowid_, rowid, vectors) VALUES (?, ?, ?)", + p->schemaName, p->shadowRescoreChunksNames[i]); + if (!zSql) + return SQLITE_NOMEM; + sqlite3_stmt *stmt; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) { + sqlite3_finalize(stmt); + return rc; + } + sqlite3_bind_int64(stmt, 1, chunk_rowid); + sqlite3_bind_int64(stmt, 2, chunk_rowid); + sqlite3_bind_zeroblob64(stmt, 3, blob_size); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) + return rc; + } + return SQLITE_OK; +} + +// ============================================================================ +// Quantization +// ============================================================================ + +static void rescore_quantize_float_to_bit(const float *src, uint8_t *dst, + size_t dimensions) { + memset(dst, 0, dimensions / CHAR_BIT); + for (size_t i = 0; i < dimensions; i++) { + if (src[i] >= 0.0f) { + dst[i / CHAR_BIT] |= (1 << (i % CHAR_BIT)); + } + } +} + +static void rescore_quantize_float_to_int8(const float *src, int8_t *dst, + size_t dimensions) { + float step = 2.0f / 255.0f; + for (size_t i = 0; i < dimensions; i++) { + float v = (src[i] - (-1.0f)) / step - 128.0f; + if (!(v <= 127.0f)) v = 127.0f; + if (!(v >= -128.0f)) v = -128.0f; + dst[i] = (int8_t)v; + } +} + +// ============================================================================ +// Insert path +// ============================================================================ + +/** + * Quantize float vector to _rescore_chunks and store in _rescore_vectors. 
+ */ +static int rescore_on_insert(vec0_vtab *p, i64 chunk_rowid, i64 chunk_offset, + i64 rowid, void *vectorDatas[]) { + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_RESCORE) + continue; + + struct VectorColumnDefinition *col = &p->vector_columns[i]; + size_t qsize = rescore_quantized_byte_size(col); + size_t fsize = vector_column_byte_size(*col); + int rc; + + // 1. Write quantized vector to _rescore_chunks blob + { + void *qbuf = sqlite3_malloc(qsize); + if (!qbuf) + return SQLITE_NOMEM; + + switch (col->rescore.quantizer_type) { + case VEC0_RESCORE_QUANTIZER_BIT: + rescore_quantize_float_to_bit((const float *)vectorDatas[i], + (uint8_t *)qbuf, col->dimensions); + break; + case VEC0_RESCORE_QUANTIZER_INT8: + rescore_quantize_float_to_int8((const float *)vectorDatas[i], + (int8_t *)qbuf, col->dimensions); + break; + } + + sqlite3_blob *blob = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, + p->shadowRescoreChunksNames[i], "vectors", + chunk_rowid, 1, &blob); + if (rc != SQLITE_OK) { + sqlite3_free(qbuf); + return rc; + } + rc = sqlite3_blob_write(blob, qbuf, qsize, chunk_offset * qsize); + sqlite3_free(qbuf); + int brc = sqlite3_blob_close(blob); + if (rc != SQLITE_OK) + return rc; + if (brc != SQLITE_OK) + return brc; + } + + // 2. 
Insert float vector into _rescore_vectors (rowid-keyed) + { + char *zSql = sqlite3_mprintf( + "INSERT INTO \"%w\".\"%w\"(rowid, vector) VALUES (?, ?)", + p->schemaName, p->shadowRescoreVectorsNames[i]); + if (!zSql) + return SQLITE_NOMEM; + sqlite3_stmt *stmt; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) { + sqlite3_finalize(stmt); + return rc; + } + sqlite3_bind_int64(stmt, 1, rowid); + sqlite3_bind_blob(stmt, 2, vectorDatas[i], fsize, SQLITE_TRANSIENT); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) + return SQLITE_ERROR; + } + } + return SQLITE_OK; +} + +// ============================================================================ +// Delete path +// ============================================================================ + +/** + * Zero out quantized vector in _rescore_chunks and delete from _rescore_vectors. + */ +static int rescore_on_delete(vec0_vtab *p, i64 chunk_id, u64 chunk_offset, + i64 rowid) { + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_RESCORE) + continue; + int rc; + + // 1. Zero out quantized data in _rescore_chunks + { + size_t qsize = rescore_quantized_byte_size(&p->vector_columns[i]); + void *zeroBuf = sqlite3_malloc(qsize); + if (!zeroBuf) + return SQLITE_NOMEM; + memset(zeroBuf, 0, qsize); + + sqlite3_blob *blob = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, + p->shadowRescoreChunksNames[i], "vectors", + chunk_id, 1, &blob); + if (rc != SQLITE_OK) { + sqlite3_free(zeroBuf); + return rc; + } + rc = sqlite3_blob_write(blob, zeroBuf, qsize, chunk_offset * qsize); + sqlite3_free(zeroBuf); + int brc = sqlite3_blob_close(blob); + if (rc != SQLITE_OK) + return rc; + if (brc != SQLITE_OK) + return brc; + } + + // 2. 
Delete from _rescore_vectors + { + char *zSql = sqlite3_mprintf( + "DELETE FROM \"%w\".\"%w\" WHERE rowid = ?", + p->schemaName, p->shadowRescoreVectorsNames[i]); + if (!zSql) + return SQLITE_NOMEM; + sqlite3_stmt *stmt; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) + return rc; + sqlite3_bind_int64(stmt, 1, rowid); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) + return SQLITE_ERROR; + } + } + return SQLITE_OK; +} + +/** + * Delete a chunk row from _rescore_chunks{NN} tables. + * (_rescore_vectors rows were already deleted per-row in rescore_on_delete) + */ +static int rescore_delete_chunk(vec0_vtab *p, i64 chunk_id) { + for (int i = 0; i < p->numVectorColumns; i++) { + if (!p->shadowRescoreChunksNames[i]) + continue; + char *zSql = sqlite3_mprintf( + "DELETE FROM \"%w\".\"%w\" WHERE rowid = ?", + p->schemaName, p->shadowRescoreChunksNames[i]); + if (!zSql) + return SQLITE_NOMEM; + sqlite3_stmt *stmt; + int rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) + return rc; + sqlite3_bind_int64(stmt, 1, chunk_id); + rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); + if (rc != SQLITE_DONE) + return SQLITE_ERROR; + } + return SQLITE_OK; +} + +// ============================================================================ +// KNN rescore query +// ============================================================================ + +/** + * Phase 1: Coarse scan of quantized chunks → top k*oversample candidates (rowids). + * Phase 2: For each candidate, blob_open _rescore_vectors by rowid, read float + * vector, compute float distance. Sort, return top k. + * + * Phase 2 is fast because _rescore_vectors has INTEGER PRIMARY KEY, so + * sqlite3_blob_open/reopen addresses rows directly by rowid — no index lookup. 
+ */
+static int rescore_knn(vec0_vtab *p, vec0_cursor *pCur,
+                       struct VectorColumnDefinition *vector_column,
+                       int vectorColumnIdx, struct Array *arrayRowidsIn,
+                       struct Array *aMetadataIn, const char *idxStr, int argc,
+                       sqlite3_value **argv, void *queryVector, i64 k,
+                       struct vec0_query_knn_data *knn_data) {
+  // pCur and aMetadataIn are accepted but not used anywhere in this function.
+  (void)pCur;
+  (void)aMetadataIn;
+  int rc = SQLITE_OK;
+  // A positive runtime override (oversample_search) wins over the CREATE-time
+  // oversample. The candidate pool is capped at 4096 entries.
+  int oversample = vector_column->rescore.oversample_search > 0
+                       ? vector_column->rescore.oversample_search
+                       : vector_column->rescore.oversample;
+  i64 k_oversample = k * oversample;
+  if (k_oversample > 4096)
+    k_oversample = 4096;
+
+  size_t qdim = vector_column->dimensions;
+  size_t qsize = rescore_quantized_byte_size(vector_column);
+  size_t fsize = vector_column_byte_size(*vector_column);
+
+  // Quantize the query vector
+  void *quantizedQuery = sqlite3_malloc(qsize);
+  if (!quantizedQuery)
+    return SQLITE_NOMEM;
+
+  // No default case: for any other quantizer_type the buffer stays unwritten,
+  // and the distance switch in phase 1 then never reads it.
+  switch (vector_column->rescore.quantizer_type) {
+  case VEC0_RESCORE_QUANTIZER_BIT:
+    rescore_quantize_float_to_bit((const float *)queryVector,
+                                  (uint8_t *)quantizedQuery, qdim);
+    break;
+  case VEC0_RESCORE_QUANTIZER_INT8:
+    rescore_quantize_float_to_int8((const float *)queryVector,
+                                   (int8_t *)quantizedQuery, qdim);
+    break;
+  }
+
+  // Phase 1: Scan quantized chunks for k*oversample candidates
+  sqlite3_stmt *stmtChunks = NULL;
+  rc = vec0_chunks_iter(p, idxStr, argc, argv, &stmtChunks);
+  if (rc != SQLITE_OK) {
+    sqlite3_free(quantizedQuery);
+    return rc;
+  }
+
+  // Scratch buffers. From here on every exit goes through `cleanup`, which
+  // frees all of them (sqlite3_free(NULL) is a no-op for the ones that failed).
+  i64 *cand_rowids = sqlite3_malloc(k_oversample * sizeof(i64));
+  f32 *cand_distances = sqlite3_malloc(k_oversample * sizeof(f32));
+  i64 *tmp_rowids = sqlite3_malloc(k_oversample * sizeof(i64));
+  f32 *tmp_distances = sqlite3_malloc(k_oversample * sizeof(f32));
+  f32 *chunk_distances = sqlite3_malloc(p->chunk_size * sizeof(f32));
+  i32 *chunk_topk_idxs = sqlite3_malloc(k_oversample * sizeof(i32));
+  u8 *b = sqlite3_malloc(p->chunk_size / CHAR_BIT);
+  u8 *bTaken = sqlite3_malloc(p->chunk_size / CHAR_BIT);
+  u8 *bmRowids = NULL;
+  void *baseVectors = sqlite3_malloc((i64)p->chunk_size * (i64)qsize);
+
+  if (!cand_rowids || !cand_distances || !tmp_rowids || !tmp_distances ||
+      !chunk_distances || !chunk_topk_idxs || !b || !bTaken || !baseVectors) {
+    rc = SQLITE_NOMEM;
+    goto cleanup;
+  }
+  memset(cand_rowids, 0, k_oversample * sizeof(i64));
+  memset(cand_distances, 0, k_oversample * sizeof(f32));
+
+  // The rowid-filter bitmap is only needed when a rowid list was supplied.
+  if (arrayRowidsIn) {
+    bmRowids = sqlite3_malloc(p->chunk_size / CHAR_BIT);
+    if (!bmRowids) {
+      rc = SQLITE_NOMEM;
+      goto cleanup;
+    }
+  }
+
+  i64 cand_used = 0;
+
+  while (1) {
+    rc = sqlite3_step(stmtChunks);
+    if (rc == SQLITE_DONE)
+      break;
+    // Any other step result is reported to the caller as SQLITE_ERROR.
+    if (rc != SQLITE_ROW) {
+      rc = SQLITE_ERROR;
+      goto cleanup;
+    }
+
+    // Row layout read here: column 0 = chunk id, column 1 = validity bitmap,
+    // column 2 = rowids blob.
+    i64 chunk_id = sqlite3_column_int64(stmtChunks, 0);
+    unsigned char *chunkValidity =
+        (unsigned char *)sqlite3_column_blob(stmtChunks, 1);
+    i64 *chunkRowids = (i64 *)sqlite3_column_blob(stmtChunks, 2);
+    int validityBytes = sqlite3_column_bytes(stmtChunks, 1);
+    int rowidsBytes = sqlite3_column_bytes(stmtChunks, 2);
+    if (!chunkValidity || !chunkRowids) {
+      rc = SQLITE_ERROR;
+      goto cleanup;
+    }
+    // Validate blob sizes match chunk_size expectations
+    if (validityBytes < (p->chunk_size + 7) / 8 ||
+        rowidsBytes < p->chunk_size * (int)sizeof(i64)) {
+      rc = SQLITE_ERROR;
+      goto cleanup;
+    }
+
+    memset(chunk_distances, 0, p->chunk_size * sizeof(f32));
+    memset(chunk_topk_idxs, 0, k_oversample * sizeof(i32));
+    bitmap_copy(b, chunkValidity, p->chunk_size);
+
+    // Narrow the working bitmap `b` to slots whose rowid is in arrayRowidsIn.
+    // NOTE(review): bsearch() needs arrayRowidsIn->z sorted consistently with
+    // _cmp; that ordering is not established in this function - confirm at
+    // the caller.
+    if (arrayRowidsIn) {
+      bitmap_clear(bmRowids, p->chunk_size);
+      for (int j = 0; j < p->chunk_size; j++) {
+        if (!bitmap_get(chunkValidity, j))
+          continue;
+        i64 rid = chunkRowids[j];
+        void *found = bsearch(&rid, arrayRowidsIn->z, arrayRowidsIn->length,
+                              sizeof(i64), _cmp);
+        bitmap_set(bmRowids, j, found ? 1 : 0);
+      }
+      bitmap_and_inplace(b, bmRowids, p->chunk_size);
+    }
+
+    // Read quantized vectors
+    sqlite3_blob *blobQ = NULL;
+    rc = sqlite3_blob_open(p->db, p->schemaName,
+                           p->shadowRescoreChunksNames[vectorColumnIdx],
+                           "vectors", chunk_id, 0, &blobQ);
+    if (rc != SQLITE_OK)
+      goto cleanup;
+    rc = sqlite3_blob_read(blobQ, baseVectors,
+                           (i64)p->chunk_size * (i64)qsize, 0);
+    sqlite3_blob_close(blobQ);
+    if (rc != SQLITE_OK)
+      goto cleanup;
+
+    // Compute quantized distances
+    for (int j = 0; j < p->chunk_size; j++) {
+      if (!bitmap_get(b, j))
+        continue;
+      // Stays FLT_MAX when no quantizer/metric case below matches.
+      f32 dist = FLT_MAX;
+      switch (vector_column->rescore.quantizer_type) {
+      case VEC0_RESCORE_QUANTIZER_BIT: {
+        // Bit-packed rows: slot j starts at byte j * (qdim / CHAR_BIT).
+        const u8 *base_j = ((u8 *)baseVectors) + (j * (qdim / CHAR_BIT));
+        dist = distance_hamming(base_j, (u8 *)quantizedQuery, &qdim);
+        break;
+      }
+      case VEC0_RESCORE_QUANTIZER_INT8: {
+        // One byte per dimension: slot j starts at byte j * qdim.
+        const i8 *base_j = ((i8 *)baseVectors) + (j * qdim);
+        switch (vector_column->distance_metric) {
+        case VEC0_DISTANCE_METRIC_L2:
+          dist = distance_l2_sqr_int8(base_j, (i8 *)quantizedQuery, &qdim);
+          break;
+        case VEC0_DISTANCE_METRIC_COSINE:
+          dist = distance_cosine_int8(base_j, (i8 *)quantizedQuery, &qdim);
+          break;
+        case VEC0_DISTANCE_METRIC_L1:
+          dist = (f32)distance_l1_int8(base_j, (i8 *)quantizedQuery, &qdim);
+          break;
+        }
+        break;
+      }
+      }
+      chunk_distances[j] = dist;
+    }
+
+    // NOTE(review): min_idx and merge_sorted_lists are defined outside this
+    // view. Presumably min_idx picks the smallest-distance slots among those
+    // set in `b`, and merge_sorted_lists merges them with the running
+    // candidates into tmp_* - confirm against their definitions.
+    int used1;
+    min_idx(chunk_distances, p->chunk_size, b, chunk_topk_idxs,
+            min(k_oversample, p->chunk_size), bTaken, &used1);
+
+    i64 merged_used;
+    merge_sorted_lists(cand_distances, cand_rowids, cand_used, chunk_distances,
+                       chunkRowids, chunk_topk_idxs,
+                       min(min(k_oversample, p->chunk_size), used1),
+                       tmp_distances, tmp_rowids, k_oversample, &merged_used);
+
+    // The merged result becomes the running candidate list for the next chunk.
+    for (i64 j = 0; j < merged_used; j++) {
+      cand_rowids[j] = tmp_rowids[j];
+      cand_distances[j] = tmp_distances[j];
+    }
+    cand_used = merged_used;
+  }
+  // The loop exits with rc == SQLITE_DONE; normalize to SQLITE_OK.
+  rc = SQLITE_OK;
+
+  // Phase 2: Rescore candidates using _rescore_vectors (rowid-keyed)
+  if (cand_used == 0) {
+    // No candidates: publish an empty result set.
+    knn_data->current_idx = 0;
+    knn_data->k = 0;
+    knn_data->rowids = NULL;
+    knn_data->distances = NULL;
+    knn_data->k_used = 0;
+    goto cleanup;
+  }
+  {
+    // float_distances and fBuf are local to this scope, so each error path
+    // below frees them itself before jumping to `cleanup`.
+    f32 *float_distances = sqlite3_malloc(cand_used * sizeof(f32));
+    void *fBuf = sqlite3_malloc(fsize);
+    if (!float_distances || !fBuf) {
+      sqlite3_free(float_distances);
+      sqlite3_free(fBuf);
+      rc = SQLITE_NOMEM;
+      goto cleanup;
+    }
+
+    // Open blob on _rescore_vectors, then reopen for each candidate rowid.
+    // blob_reopen is O(1) for INTEGER PRIMARY KEY tables.
+    sqlite3_blob *blobFloat = NULL;
+    rc = sqlite3_blob_open(p->db, p->schemaName,
+                           p->shadowRescoreVectorsNames[vectorColumnIdx],
+                           "vector", cand_rowids[0], 0, &blobFloat);
+    if (rc != SQLITE_OK) {
+      sqlite3_free(float_distances);
+      sqlite3_free(fBuf);
+      goto cleanup;
+    }
+
+    rc = sqlite3_blob_read(blobFloat, fBuf, fsize, 0);
+    if (rc != SQLITE_OK) {
+      sqlite3_blob_close(blobFloat);
+      sqlite3_free(float_distances);
+      sqlite3_free(fBuf);
+      goto cleanup;
+    }
+    float_distances[0] =
+        vec0_distance_full(fBuf, queryVector, vector_column->dimensions,
+                           vector_column->element_type,
+                           vector_column->distance_metric);
+
+    for (i64 j = 1; j < cand_used; j++) {
+      rc = sqlite3_blob_reopen(blobFloat, cand_rowids[j]);
+      if (rc != SQLITE_OK) {
+        sqlite3_blob_close(blobFloat);
+        sqlite3_free(float_distances);
+        sqlite3_free(fBuf);
+        goto cleanup;
+      }
+      rc = sqlite3_blob_read(blobFloat, fBuf, fsize, 0);
+      if (rc != SQLITE_OK) {
+        sqlite3_blob_close(blobFloat);
+        sqlite3_free(float_distances);
+        sqlite3_free(fBuf);
+        goto cleanup;
+      }
+      float_distances[j] =
+          vec0_distance_full(fBuf, queryVector, vector_column->dimensions,
+                             vector_column->element_type,
+                             vector_column->distance_metric);
+    }
+    sqlite3_blob_close(blobFloat);
+    sqlite3_free(fBuf);
+
+    // Sort by float distance
+    // (selection sort; cand_rowids is permuted in step with float_distances)
+    for (i64 a = 0; a + 1 < cand_used; a++) {
+      i64 minIdx = a;
+      for (i64 c = a + 1; c < cand_used; c++) {
+        if (float_distances[c] < float_distances[minIdx])
+          minIdx = c;
+      }
+      if (minIdx != a) {
+        f32 td = float_distances[a];
+        float_distances[a] = float_distances[minIdx];
+        float_distances[minIdx] = td;
+        i64 tr = cand_rowids[a];
+        cand_rowids[a] = cand_rowids[minIdx];
+        cand_rowids[minIdx] = tr;
+      }
+    }
+
+    // Keep only the best min(k, cand_used) rescored candidates.
+    i64 result_k = min(k, cand_used);
+    i64 *out_rowids = sqlite3_malloc(result_k * sizeof(i64));
+    f32 *out_distances = sqlite3_malloc(result_k * sizeof(f32));
+    if (!out_rowids || !out_distances) {
+      sqlite3_free(out_rowids);
+      sqlite3_free(out_distances);
+      sqlite3_free(float_distances);
+      rc = SQLITE_NOMEM;
+      goto cleanup;
+    }
+    for (i64 j = 0; j < result_k; j++) {
+      out_rowids[j] = cand_rowids[j];
+      out_distances[j] = float_distances[j];
+    }
+
+    // out_rowids / out_distances are stored in knn_data and not freed here.
+    knn_data->current_idx = 0;
+    knn_data->k = result_k;
+    knn_data->rowids = out_rowids;
+    knn_data->distances = out_distances;
+    knn_data->k_used = result_k;
+
+    sqlite3_free(float_distances);
+  }
+
+cleanup:
+  // Shared exit for success and for every failure after stmtChunks was created.
+  sqlite3_finalize(stmtChunks);
+  sqlite3_free(quantizedQuery);
+  sqlite3_free(cand_rowids);
+  sqlite3_free(cand_distances);
+  sqlite3_free(tmp_rowids);
+  sqlite3_free(tmp_distances);
+  sqlite3_free(chunk_distances);
+  sqlite3_free(chunk_topk_idxs);
+  sqlite3_free(b);
+  sqlite3_free(bTaken);
+  sqlite3_free(bmRowids);
+  sqlite3_free(baseVectors);
+  return rc;
+}
+
+/**
+ * Handle FTS5-style command dispatch for rescore parameters.
+ * Returns SQLITE_OK if handled, SQLITE_EMPTY if not a rescore command.
+ */
+static int rescore_handle_command(vec0_vtab *p, const char *command) {
+  // Only "oversample=<n>" is recognized; anything else is left to the caller.
+  if (strncmp(command, "oversample=", 11) != 0) {
+    return SQLITE_EMPTY;
+  }
+
+  // Parse with strtol() rather than atoi(): atoi() has undefined behavior when
+  // the value does not fit in an int, and it silently accepts trailing garbage
+  // such as "oversample=8abc". Trailing whitespace is still tolerated.
+  const char *text = command + 11;
+  char *end = NULL;
+  long parsed = strtol(text, &end, 10);
+  int sawDigits = (end != text);
+  while (*end == ' ' || *end == '\t' || *end == '\n' || *end == '\r') {
+    end++;
+  }
+  if (!sawDigits || *end != '\0' || parsed < 1 || parsed > INT_MAX) {
+    vtab_set_error(&p->base, "oversample must be >= 1");
+    return SQLITE_ERROR;
+  }
+
+  // Store the runtime override on every rescore-indexed vector column; other
+  // index types are left untouched.
+  for (int i = 0; i < p->numVectorColumns; i++) {
+    if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) {
+      p->vector_columns[i].rescore.oversample_search = (int)parsed;
+    }
+  }
+  return SQLITE_OK;
+}
+
+#ifdef SQLITE_VEC_TEST +void _test_rescore_quantize_float_to_bit(const float *src, uint8_t *dst, size_t dim) { + rescore_quantize_float_to_bit(src, dst, dim); +} +void _test_rescore_quantize_float_to_int8(const float *src, int8_t *dst, size_t dim) { + rescore_quantize_float_to_int8(src, dst, dim); +} +size_t _test_rescore_quantized_byte_size_bit(size_t dimensions) { + struct VectorColumnDefinition col; + memset(&col, 0, sizeof(col)); + col.dimensions = dimensions; + col.rescore.quantizer_type = VEC0_RESCORE_QUANTIZER_BIT; + return rescore_quantized_byte_size(&col); +} +size_t _test_rescore_quantized_byte_size_int8(size_t dimensions) { + struct VectorColumnDefinition col; + memset(&col, 0, sizeof(col)); + col.dimensions = dimensions; + col.rescore.quantizer_type = VEC0_RESCORE_QUANTIZER_INT8; + return rescore_quantized_byte_size(&col); +} +#endif diff --git a/sqlite-vec.c b/sqlite-vec.c index c1874a7..dc33c67 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -11,7 +11,7 @@ #include #include -#ifndef SQLITE_VEC_OMIT_FS +#ifdef SQLITE_VEC_DEBUG #include #endif @@ -22,55 +22,8 @@ SQLITE_EXTENSION_INIT1 #include "sqlite3.h" #endif -#ifndef UINT32_TYPE -#ifdef HAVE_UINT32_T -#define UINT32_TYPE uint32_t -#else -#define UINT32_TYPE unsigned int -#endif -#endif -#ifndef UINT16_TYPE -#ifdef HAVE_UINT16_T -#define UINT16_TYPE uint16_t -#else -#define UINT16_TYPE unsigned short int -#endif -#endif -#ifndef INT16_TYPE -#ifdef HAVE_INT16_T -#define INT16_TYPE int16_t -#else -#define INT16_TYPE
short int -#endif -#endif -#ifndef UINT8_TYPE -#ifdef HAVE_UINT8_T -#define UINT8_TYPE uint8_t -#else -#define UINT8_TYPE unsigned char -#endif -#endif -#ifndef INT8_TYPE -#ifdef HAVE_INT8_T -#define INT8_TYPE int8_t -#else -#define INT8_TYPE signed char -#endif -#endif -#ifndef LONGDOUBLE_TYPE -#define LONGDOUBLE_TYPE long double -#endif - -#ifndef _WIN32 -#ifndef __EMSCRIPTEN__ -#ifndef __COSMOPOLITAN__ -#ifndef __wasi__ -typedef u_int8_t uint8_t; -typedef u_int16_t uint16_t; -typedef u_int64_t uint64_t; -#endif -#endif -#endif +#ifndef SQLITE_VEC_ENABLE_DISKANN +#define SQLITE_VEC_ENABLE_DISKANN 1 #endif typedef int8_t i8; @@ -93,6 +46,10 @@ typedef size_t usize; #define COMPILER_SUPPORTS_VTAB_IN 1 #endif +#ifndef SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE +#define SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE 0 +#endif + #ifndef SQLITE_SUBTYPE #define SQLITE_SUBTYPE 0x000100000 #endif @@ -112,6 +69,10 @@ typedef size_t usize; #define countof(x) (sizeof(x) / sizeof((x)[0])) #define min(a, b) (((a) <= (b)) ? 
(a) : (b)) +#ifndef SQLITE_VEC_ENABLE_RESCORE +#define SQLITE_VEC_ENABLE_RESCORE 1 +#endif + enum VectorElementType { // clang-format off SQLITE_VEC_ELEMENT_TYPE_FLOAT32 = 223 + 0, @@ -224,6 +185,63 @@ static f32 l2_sqr_float_neon(const void *pVect1v, const void *pVect2v, return sqrt(sum_scalar); } +static f32 cosine_float_neon(const void *pVect1v, const void *pVect2v, + const void *qty_ptr) { + f32 *pVect1 = (f32 *)pVect1v; + f32 *pVect2 = (f32 *)pVect2v; + size_t qty = *((size_t *)qty_ptr); + size_t qty16 = qty >> 4; + const f32 *pEnd1 = pVect1 + (qty16 << 4); + + float32x4_t dot0 = vdupq_n_f32(0), dot1 = vdupq_n_f32(0); + float32x4_t dot2 = vdupq_n_f32(0), dot3 = vdupq_n_f32(0); + float32x4_t amag0 = vdupq_n_f32(0), amag1 = vdupq_n_f32(0); + float32x4_t amag2 = vdupq_n_f32(0), amag3 = vdupq_n_f32(0); + float32x4_t bmag0 = vdupq_n_f32(0), bmag1 = vdupq_n_f32(0); + float32x4_t bmag2 = vdupq_n_f32(0), bmag3 = vdupq_n_f32(0); + + while (pVect1 < pEnd1) { + float32x4_t v1, v2; + v1 = vld1q_f32(pVect1); pVect1 += 4; + v2 = vld1q_f32(pVect2); pVect2 += 4; + dot0 = vfmaq_f32(dot0, v1, v2); + amag0 = vfmaq_f32(amag0, v1, v1); + bmag0 = vfmaq_f32(bmag0, v2, v2); + + v1 = vld1q_f32(pVect1); pVect1 += 4; + v2 = vld1q_f32(pVect2); pVect2 += 4; + dot1 = vfmaq_f32(dot1, v1, v2); + amag1 = vfmaq_f32(amag1, v1, v1); + bmag1 = vfmaq_f32(bmag1, v2, v2); + + v1 = vld1q_f32(pVect1); pVect1 += 4; + v2 = vld1q_f32(pVect2); pVect2 += 4; + dot2 = vfmaq_f32(dot2, v1, v2); + amag2 = vfmaq_f32(amag2, v1, v1); + bmag2 = vfmaq_f32(bmag2, v2, v2); + + v1 = vld1q_f32(pVect1); pVect1 += 4; + v2 = vld1q_f32(pVect2); pVect2 += 4; + dot3 = vfmaq_f32(dot3, v1, v2); + amag3 = vfmaq_f32(amag3, v1, v1); + bmag3 = vfmaq_f32(bmag3, v2, v2); + } + + f32 dot_s = vaddvq_f32(vaddq_f32(vaddq_f32(dot0, dot1), vaddq_f32(dot2, dot3))); + f32 amag_s = vaddvq_f32(vaddq_f32(vaddq_f32(amag0, amag1), vaddq_f32(amag2, amag3))); + f32 bmag_s = vaddvq_f32(vaddq_f32(vaddq_f32(bmag0, bmag1), vaddq_f32(bmag2, bmag3))); + 
+ const f32 *pEnd2 = pVect1 + (qty - (qty16 << 4)); + while (pVect1 < pEnd2) { + dot_s += *pVect1 * *pVect2; + amag_s += *pVect1 * *pVect1; + bmag_s += *pVect2 * *pVect2; + pVect1++; pVect2++; + } + + return 1.0f - (dot_s / (sqrtf(amag_s) * sqrtf(bmag_s))); +} + static f32 l2_sqr_int8_neon(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { i8 *pVect1 = (i8 *)pVect1v; @@ -240,13 +258,16 @@ static f32 l2_sqr_int8_neon(const void *pVect1v, const void *pVect2v, pVect1 += 8; pVect2 += 8; - // widen to protect against overflow + // widen i8 to i16 for subtraction int16x8_t v1_wide = vmovl_s8(v1); int16x8_t v2_wide = vmovl_s8(v2); - int16x8_t diff = vsubq_s16(v1_wide, v2_wide); - int16x8_t squared_diff = vmulq_s16(diff, diff); - int32x4_t sum = vpaddlq_s16(squared_diff); + + // widening multiply: i16*i16 -> i32 to avoid i16 overflow + // (diff can be up to 255, so diff*diff can be up to 65025 > INT16_MAX) + int32x4_t sq_lo = vmull_s16(vget_low_s16(diff), vget_low_s16(diff)); + int32x4_t sq_hi = vmull_s16(vget_high_s16(diff), vget_high_s16(diff)); + int32x4_t sum = vaddq_s32(sq_lo, sq_hi); sum_scalar += vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1) + vgetq_lane_s32(sum, 2) + vgetq_lane_s32(sum, 3); @@ -462,6 +483,11 @@ static double distance_l1_f32(const void *a, const void *b, const void *d) { static f32 distance_cosine_float(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { +#ifdef SQLITE_VEC_ENABLE_NEON + if ((*(const size_t *)qty_ptr) > 16) { + return cosine_float_neon(pVect1v, pVect2v, qty_ptr); + } +#endif f32 *pVect1 = (f32 *)pVect1v; f32 *pVect2 = (f32 *)pVect2v; size_t qty = *((size_t *)qty_ptr); @@ -478,8 +504,7 @@ static f32 distance_cosine_float(const void *pVect1v, const void *pVect2v, } return 1 - (dot / (sqrt(aMag) * sqrt(bMag))); } -static f32 distance_cosine_int8(const void *pA, const void *pB, - const void *pD) { +static f32 cosine_int8(const void *pA, const void *pB, const void *pD) { i8 *a = (i8 *)pA; i8 *b = (i8 *)pB; 
size_t d = *((size_t *)pD); @@ -497,6 +522,125 @@ static f32 distance_cosine_int8(const void *pA, const void *pB, return 1 - (dot / (sqrt(aMag) * sqrt(bMag))); } +#ifdef SQLITE_VEC_ENABLE_NEON +static f32 cosine_int8_neon(const void *pA, const void *pB, const void *pD) { + const i8 *a = (const i8 *)pA; + const i8 *b = (const i8 *)pB; + size_t d = *((const size_t *)pD); + const i8 *aEnd = a + d; + + int32x4_t dot_acc1 = vdupq_n_s32(0); + int32x4_t dot_acc2 = vdupq_n_s32(0); + int32x4_t aMag_acc1 = vdupq_n_s32(0); + int32x4_t aMag_acc2 = vdupq_n_s32(0); + int32x4_t bMag_acc1 = vdupq_n_s32(0); + int32x4_t bMag_acc2 = vdupq_n_s32(0); + + while (a < aEnd - 31) { + int8x16_t va1 = vld1q_s8(a); + int8x16_t vb1 = vld1q_s8(b); + int16x8_t a1_lo = vmovl_s8(vget_low_s8(va1)); + int16x8_t a1_hi = vmovl_s8(vget_high_s8(va1)); + int16x8_t b1_lo = vmovl_s8(vget_low_s8(vb1)); + int16x8_t b1_hi = vmovl_s8(vget_high_s8(vb1)); + + dot_acc1 = vmlal_s16(dot_acc1, vget_low_s16(a1_lo), vget_low_s16(b1_lo)); + dot_acc1 = vmlal_s16(dot_acc1, vget_high_s16(a1_lo), vget_high_s16(b1_lo)); + dot_acc2 = vmlal_s16(dot_acc2, vget_low_s16(a1_hi), vget_low_s16(b1_hi)); + dot_acc2 = vmlal_s16(dot_acc2, vget_high_s16(a1_hi), vget_high_s16(b1_hi)); + + aMag_acc1 = vmlal_s16(aMag_acc1, vget_low_s16(a1_lo), vget_low_s16(a1_lo)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_high_s16(a1_lo), vget_high_s16(a1_lo)); + aMag_acc2 = vmlal_s16(aMag_acc2, vget_low_s16(a1_hi), vget_low_s16(a1_hi)); + aMag_acc2 = vmlal_s16(aMag_acc2, vget_high_s16(a1_hi), vget_high_s16(a1_hi)); + + bMag_acc1 = vmlal_s16(bMag_acc1, vget_low_s16(b1_lo), vget_low_s16(b1_lo)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_high_s16(b1_lo), vget_high_s16(b1_lo)); + bMag_acc2 = vmlal_s16(bMag_acc2, vget_low_s16(b1_hi), vget_low_s16(b1_hi)); + bMag_acc2 = vmlal_s16(bMag_acc2, vget_high_s16(b1_hi), vget_high_s16(b1_hi)); + + int8x16_t va2 = vld1q_s8(a + 16); + int8x16_t vb2 = vld1q_s8(b + 16); + int16x8_t a2_lo = vmovl_s8(vget_low_s8(va2)); + 
int16x8_t a2_hi = vmovl_s8(vget_high_s8(va2)); + int16x8_t b2_lo = vmovl_s8(vget_low_s8(vb2)); + int16x8_t b2_hi = vmovl_s8(vget_high_s8(vb2)); + + dot_acc1 = vmlal_s16(dot_acc1, vget_low_s16(a2_lo), vget_low_s16(b2_lo)); + dot_acc1 = vmlal_s16(dot_acc1, vget_high_s16(a2_lo), vget_high_s16(b2_lo)); + dot_acc2 = vmlal_s16(dot_acc2, vget_low_s16(a2_hi), vget_low_s16(b2_hi)); + dot_acc2 = vmlal_s16(dot_acc2, vget_high_s16(a2_hi), vget_high_s16(b2_hi)); + + aMag_acc1 = vmlal_s16(aMag_acc1, vget_low_s16(a2_lo), vget_low_s16(a2_lo)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_high_s16(a2_lo), vget_high_s16(a2_lo)); + aMag_acc2 = vmlal_s16(aMag_acc2, vget_low_s16(a2_hi), vget_low_s16(a2_hi)); + aMag_acc2 = vmlal_s16(aMag_acc2, vget_high_s16(a2_hi), vget_high_s16(a2_hi)); + + bMag_acc1 = vmlal_s16(bMag_acc1, vget_low_s16(b2_lo), vget_low_s16(b2_lo)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_high_s16(b2_lo), vget_high_s16(b2_lo)); + bMag_acc2 = vmlal_s16(bMag_acc2, vget_low_s16(b2_hi), vget_low_s16(b2_hi)); + bMag_acc2 = vmlal_s16(bMag_acc2, vget_high_s16(b2_hi), vget_high_s16(b2_hi)); + + a += 32; + b += 32; + } + + while (a < aEnd - 15) { + int8x16_t va = vld1q_s8(a); + int8x16_t vb = vld1q_s8(b); + int16x8_t a_lo = vmovl_s8(vget_low_s8(va)); + int16x8_t a_hi = vmovl_s8(vget_high_s8(va)); + int16x8_t b_lo = vmovl_s8(vget_low_s8(vb)); + int16x8_t b_hi = vmovl_s8(vget_high_s8(vb)); + + dot_acc1 = vmlal_s16(dot_acc1, vget_low_s16(a_lo), vget_low_s16(b_lo)); + dot_acc1 = vmlal_s16(dot_acc1, vget_high_s16(a_lo), vget_high_s16(b_lo)); + dot_acc1 = vmlal_s16(dot_acc1, vget_low_s16(a_hi), vget_low_s16(b_hi)); + dot_acc1 = vmlal_s16(dot_acc1, vget_high_s16(a_hi), vget_high_s16(b_hi)); + + aMag_acc1 = vmlal_s16(aMag_acc1, vget_low_s16(a_lo), vget_low_s16(a_lo)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_high_s16(a_lo), vget_high_s16(a_lo)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_low_s16(a_hi), vget_low_s16(a_hi)); + aMag_acc1 = vmlal_s16(aMag_acc1, vget_high_s16(a_hi), 
vget_high_s16(a_hi)); + + bMag_acc1 = vmlal_s16(bMag_acc1, vget_low_s16(b_lo), vget_low_s16(b_lo)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_high_s16(b_lo), vget_high_s16(b_lo)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_low_s16(b_hi), vget_low_s16(b_hi)); + bMag_acc1 = vmlal_s16(bMag_acc1, vget_high_s16(b_hi), vget_high_s16(b_hi)); + + a += 16; + b += 16; + } + + int32x4_t dot_sum = vaddq_s32(dot_acc1, dot_acc2); + int32x4_t aMag_sum = vaddq_s32(aMag_acc1, aMag_acc2); + int32x4_t bMag_sum = vaddq_s32(bMag_acc1, bMag_acc2); + + i32 dot = vaddvq_s32(dot_sum); + i32 aMag = vaddvq_s32(aMag_sum); + i32 bMag = vaddvq_s32(bMag_sum); + + while (a < aEnd) { + dot += (i32)*a * (i32)*b; + aMag += (i32)*a * (i32)*a; + bMag += (i32)*b * (i32)*b; + a++; + b++; + } + + return 1.0f - ((f32)dot / (sqrtf((f32)aMag) * sqrtf((f32)bMag))); +} +#endif + +static f32 distance_cosine_int8(const void *a, const void *b, const void *d) { +#ifdef SQLITE_VEC_ENABLE_NEON + if ((*(const size_t *)d) > 15) { + return cosine_int8_neon(a, b, d); + } +#endif + return cosine_int8(a, b, d); +} + // https://github.com/facebookresearch/faiss/blob/77e2e79cd0a680adc343b9840dd865da724c579e/faiss/utils/hamming_distance/common.h#L34 static u8 hamdist_table[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, @@ -511,6 +655,111 @@ static u8 hamdist_table[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; +#ifdef SQLITE_VEC_ENABLE_NEON +static f32 distance_hamming_neon(const u8 *a, const u8 *b, size_t n_bytes) { + const u8 *pEnd = a + n_bytes; + + uint32x4_t acc1 = vdupq_n_u32(0); + uint32x4_t acc2 = vdupq_n_u32(0); + uint32x4_t acc3 = vdupq_n_u32(0); + uint32x4_t acc4 = vdupq_n_u32(0); + + while (a <= pEnd - 64) { + uint8x16_t v1 = vld1q_u8(a); + uint8x16_t v2 = vld1q_u8(b); + acc1 = vaddq_u32(acc1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + + v1 = vld1q_u8(a + 16); + v2 = vld1q_u8(b + 16); + 
acc2 = vaddq_u32(acc2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + + v1 = vld1q_u8(a + 32); + v2 = vld1q_u8(b + 32); + acc3 = vaddq_u32(acc3, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + + v1 = vld1q_u8(a + 48); + v2 = vld1q_u8(b + 48); + acc4 = vaddq_u32(acc4, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + + a += 64; + b += 64; + } + + while (a <= pEnd - 16) { + uint8x16_t v1 = vld1q_u8(a); + uint8x16_t v2 = vld1q_u8(b); + acc1 = vaddq_u32(acc1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(veorq_u8(v1, v2))))); + a += 16; + b += 16; + } + + acc1 = vaddq_u32(acc1, acc2); + acc3 = vaddq_u32(acc3, acc4); + acc1 = vaddq_u32(acc1, acc3); + u32 sum = vaddvq_u32(acc1); + + while (a < pEnd) { + sum += hamdist_table[*a ^ *b]; + a++; + b++; + } + + return (f32)sum; +} +#endif + +#ifdef SQLITE_VEC_ENABLE_AVX +/** + * AVX2 Hamming distance using VPSHUFB-based popcount. + * Processes 32 bytes (256 bits) per iteration. + */ +static f32 distance_hamming_avx2(const u8 *a, const u8 *b, size_t n_bytes) { + const u8 *pEnd = a + n_bytes; + + // VPSHUFB lookup table: popcount of low nibble + const __m256i lookup = _mm256_setr_epi8( + 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4, + 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4); + const __m256i low_mask = _mm256_set1_epi8(0x0f); + + __m256i acc = _mm256_setzero_si256(); + + while (a <= pEnd - 32) { + __m256i va = _mm256_loadu_si256((const __m256i *)a); + __m256i vb = _mm256_loadu_si256((const __m256i *)b); + __m256i xored = _mm256_xor_si256(va, vb); + + // VPSHUFB popcount: split into nibbles, lookup each + __m256i lo = _mm256_and_si256(xored, low_mask); + __m256i hi = _mm256_and_si256(_mm256_srli_epi16(xored, 4), low_mask); + __m256i popcnt = _mm256_add_epi8(_mm256_shuffle_epi8(lookup, lo), + _mm256_shuffle_epi8(lookup, hi)); + + // Horizontal sum: u8 -> u64 via sad against zero + acc = _mm256_add_epi64(acc, _mm256_sad_epu8(popcnt, _mm256_setzero_si256())); + a += 32; + b += 32; + } + + // Horizontal sum of 4 x u64 lanes + u64 tmp[4]; + 
_mm256_storeu_si256((__m256i *)tmp, acc); + u32 sum = (u32)(tmp[0] + tmp[1] + tmp[2] + tmp[3]); + + // Scalar tail + while (a < pEnd) { + u8 x = *a ^ *b; + x = x - ((x >> 1) & 0x55); + x = (x & 0x33) + ((x >> 2) & 0x33); + sum += (x + (x >> 4)) & 0x0F; + a++; + b++; + } + + return (f32)sum; +} +#endif + static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) { int same = 0; for (unsigned long i = 0; i < n; i++) { @@ -537,10 +786,13 @@ static unsigned int __builtin_popcountl(unsigned int x) { #endif #endif -static f32 distance_hamming_u64(u64 *a, u64 *b, size_t n) { +static f32 distance_hamming_u64(const u8 *a, const u8 *b, size_t n) { int same = 0; for (unsigned long i = 0; i < n; i++) { - same += __builtin_popcountl(a[i] ^ b[i]); + u64 va, vb; + memcpy(&va, a + i * sizeof(u64), sizeof(u64)); + memcpy(&vb, b + i * sizeof(u64), sizeof(u64)); + same += __builtin_popcountl(va ^ vb); } return (f32)same; } @@ -555,11 +807,23 @@ static f32 distance_hamming_u64(u64 *a, u64 *b, size_t n) { */ static f32 distance_hamming(const void *a, const void *b, const void *d) { size_t dimensions = *((size_t *)d); + size_t n_bytes = dimensions / CHAR_BIT; + +#ifdef SQLITE_VEC_ENABLE_NEON + if (dimensions >= 128) { + return distance_hamming_neon((const u8 *)a, (const u8 *)b, n_bytes); + } +#endif +#ifdef SQLITE_VEC_ENABLE_AVX + if (n_bytes >= 32) { + return distance_hamming_avx2((const u8 *)a, (const u8 *)b, n_bytes); + } +#endif if ((dimensions % 64) == 0) { - return distance_hamming_u64((u64 *)a, (u64 *)b, dimensions / 8 / CHAR_BIT); + return distance_hamming_u64((const u8 *)a, (const u8 *)b, n_bytes / sizeof(u64)); } - return distance_hamming_u8((u8 *)a, (u8 *)b, dimensions / CHAR_BIT); + return distance_hamming_u8((u8 *)a, (u8 *)b, n_bytes); } #ifdef SQLITE_VEC_TEST @@ -720,8 +984,18 @@ static int fvec_from_value(sqlite3_value *value, f32 **vector, return SQLITE_NOMEM; } memcpy(buf, blob, bytes); + size_t n = bytes / sizeof(f32); + for (size_t i = 0; i < n; i++) { + if (isnan(buf[i]) 
|| isinf(buf[i])) { + *pzErr = sqlite3_mprintf( + "invalid float32 vector: element %d is %s", + (int)i, isnan(buf[i]) ? "NaN" : "Inf"); + sqlite3_free(buf); + return SQLITE_ERROR; + } + } *vector = buf; - *dimensions = bytes / sizeof(f32); + *dimensions = n; *cleanup = sqlite3_free; return SQLITE_OK; } @@ -789,6 +1063,13 @@ static int fvec_from_value(sqlite3_value *value, f32 **vector, } f32 res = (f32)result; + if (isnan(res) || isinf(res)) { + sqlite3_free(x.z); + *pzErr = sqlite3_mprintf( + "invalid float32 vector: element %d is %s", + (int)x.length, isnan(res) ? "NaN" : "Inf"); + return SQLITE_ERROR; + } array_append(&x, (const void *)&res); offset += (endptr - ptr); @@ -1065,33 +1346,15 @@ int ensure_vector_match(sqlite3_value *aValue, sqlite3_value *bValue, void **a,
-int _cmp(const void *a, const void *b) { return (*(i64 *)a - *(i64 *)b); }
+/**
+ * Three-way comparator over i64 keys for qsort()/bsearch().
+ * Compares rather than subtracts: the i64 difference can overflow, and
+ * narrowing it to int can flip or zero the sign (2^32 - 0 narrows to 0).
+ */
+int _cmp(const void *a, const void *b) {
+  i64 lhs = *(const i64 *)a;
+  i64 rhs = *(const i64 *)b;
+  return (lhs > rhs) - (lhs < rhs);
+}
-struct VecNpyFile { - char *path; - size_t pathLength; -}; -#define SQLITE_VEC_NPY_FILE_NAME "vec0-npy-file" - -#ifndef SQLITE_VEC_OMIT_FS -static void vec_npy_file(sqlite3_context *context, int argc, - sqlite3_value **argv) { - assert(argc == 1); - char *path = (char *)sqlite3_value_text(argv[0]); - size_t pathLength = sqlite3_value_bytes(argv[0]); - struct VecNpyFile *f; - - f = sqlite3_malloc(sizeof(*f)); - if (!f) { - sqlite3_result_error_nomem(context); - return; - } - memset(f, 0, sizeof(*f)); - - f->path = path; - f->pathLength = pathLength; - sqlite3_result_pointer(context, f, SQLITE_VEC_NPY_FILE_NAME, sqlite3_free); -} -#endif - #pragma region scalar functions static void vec_f32(sqlite3_context *context, int argc, sqlite3_value **argv) { assert(argc == 1); @@ -2281,12 +2535,163 @@ enum Vec0DistanceMetrics { VEC0_DISTANCE_METRIC_L1 = 3, }; +/** + * Compute distance between two full-precision vectors using the appropriate + * distance function for the given element type and metric. + * Shared utility used by ANN index implementations.
+ */ +static f32 vec0_distance_full( + const void *a, const void *b, size_t dimensions, + enum VectorElementType elementType, + enum Vec0DistanceMetrics metric) { + switch (elementType) { + case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: + switch (metric) { + case VEC0_DISTANCE_METRIC_L2: + return distance_l2_sqr_float(a, b, &dimensions); + case VEC0_DISTANCE_METRIC_COSINE: + return distance_cosine_float(a, b, &dimensions); + case VEC0_DISTANCE_METRIC_L1: + return (f32)distance_l1_f32(a, b, &dimensions); + } + break; + case SQLITE_VEC_ELEMENT_TYPE_INT8: + switch (metric) { + case VEC0_DISTANCE_METRIC_L2: + return distance_l2_sqr_int8(a, b, &dimensions); + case VEC0_DISTANCE_METRIC_COSINE: + return distance_cosine_int8(a, b, &dimensions); + case VEC0_DISTANCE_METRIC_L1: + return (f32)distance_l1_int8(a, b, &dimensions); + } + break; + case SQLITE_VEC_ELEMENT_TYPE_BIT: + return distance_hamming(a, b, &dimensions); + } + return 0.0f; +} + +enum Vec0IndexType { + VEC0_INDEX_TYPE_FLAT = 1, +#if SQLITE_VEC_ENABLE_RESCORE + VEC0_INDEX_TYPE_RESCORE = 2, +#endif + VEC0_INDEX_TYPE_IVF = 3, + VEC0_INDEX_TYPE_DISKANN = 4, +}; + +#if SQLITE_VEC_ENABLE_RESCORE +enum Vec0RescoreQuantizerType { + VEC0_RESCORE_QUANTIZER_BIT = 1, + VEC0_RESCORE_QUANTIZER_INT8 = 2, +}; + +struct Vec0RescoreConfig { + enum Vec0RescoreQuantizerType quantizer_type; + int oversample; // CREATE-time default + int oversample_search; // runtime override (0 = use default) +}; +#endif + +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE +enum Vec0IvfQuantizer { + VEC0_IVF_QUANTIZER_NONE = 0, + VEC0_IVF_QUANTIZER_INT8 = 1, + VEC0_IVF_QUANTIZER_BINARY = 2, +}; + +struct Vec0IvfConfig { + int nlist; // number of centroids (0 = deferred) + int nprobe; // cells to probe at query time + int quantizer; // VEC0_IVF_QUANTIZER_NONE / INT8 / BINARY + int oversample; // >= 1 (1 = no oversampling) +}; +#else +struct Vec0IvfConfig { char _unused; }; +#endif + +// ============================================================ +// DiskANN types 
and constants +// ============================================================ + +#define VEC0_DISKANN_DEFAULT_N_NEIGHBORS 72 +#define VEC0_DISKANN_MAX_N_NEIGHBORS 256 +#define VEC0_DISKANN_DEFAULT_SEARCH_LIST_SIZE 128 +#define VEC0_DISKANN_DEFAULT_ALPHA 1.2f + +/** + * Quantizer type used for compressing neighbor vectors in the DiskANN graph. + */ +enum Vec0DiskannQuantizerType { + VEC0_DISKANN_QUANTIZER_BINARY = 1, // 1 bit per dimension (1/32 compression) + VEC0_DISKANN_QUANTIZER_INT8 = 2, // 1 byte per dimension (1/4 compression) +}; + +/** + * Configuration for a DiskANN index on a single vector column. + * Parsed from `INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=72)`. + */ +struct Vec0DiskannConfig { + // Quantizer type for neighbor vectors + enum Vec0DiskannQuantizerType quantizer_type; + + // Maximum number of neighbors per node (R in the paper). Must be divisible by 8. + int n_neighbors; + + // Search list size (L in the paper) — unified default for both insert and query. + int search_list_size; + + // Per-path overrides (0 = fall back to search_list_size). + int search_list_size_search; + int search_list_size_insert; + + // Alpha parameter for RobustPrune (distance scaling factor, typically 1.0-1.5) + f32 alpha; + + // Buffer threshold for batched inserts. When > 0, inserts go into a flat + // buffer table and are flushed into the graph when the buffer reaches this + // size. 0 = disabled (legacy per-row insert behavior). + int buffer_threshold; +}; + +/** + * Represents a single candidate during greedy beam search. + * Used in priority queues / sorted arrays during LM-Search. + */ +struct Vec0DiskannCandidate { + i64 rowid; + f32 distance; + int visited; // 1 if this candidate's neighbors have been explored + int confirmed; // 1 if full-precision vector was successfully read (node exists) +}; + +/** + * Returns the byte size of a quantized vector for the given quantizer type + * and number of dimensions. 
+ */ +size_t diskann_quantized_vector_byte_size( + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions) { + switch (quantizer_type) { + case VEC0_DISKANN_QUANTIZER_BINARY: + return dimensions / CHAR_BIT; // 1 bit per dimension + case VEC0_DISKANN_QUANTIZER_INT8: + return dimensions * sizeof(i8); // 1 byte per dimension + } + return 0; +} + struct VectorColumnDefinition { char *name; int name_length; size_t dimensions; enum VectorElementType element_type; enum Vec0DistanceMetrics distance_metric; + enum Vec0IndexType index_type; +#if SQLITE_VEC_ENABLE_RESCORE + struct Vec0RescoreConfig rescore; +#endif + struct Vec0IvfConfig ivf; + struct Vec0DiskannConfig diskann; }; struct Vec0PartitionColumnDefinition { @@ -2323,6 +2728,131 @@ size_t vector_column_byte_size(struct VectorColumnDefinition column) { return vector_byte_size(column.element_type, column.dimensions); }
+#if SQLITE_VEC_ENABLE_RESCORE
+/**
+ * Case-insensitive match of the scanner token [tok, tok + tokLen) against the
+ * NUL-terminated option word `word`. The lengths must match as well: a bare
+ * sqlite3_strnicmp(tok, word, tokLen) == 0 also accepts every prefix of
+ * `word` (e.g. "q" for "quantizer").
+ */
+static int vec0_rescore_token_eq(const char *tok, int tokLen,
+                                 const char *word) {
+  return tokLen == (int)strlen(word) &&
+         sqlite3_strnicmp(tok, word, tokLen) == 0;
+}
+
+/**
+ * @brief Parse rescore options from an "INDEXED BY rescore(...)" clause.
+ *
+ * Recognized options (names and values are case-insensitive and must match
+ * in full; abbreviations are rejected):
+ *   quantizer  = bit | int8   (required)
+ *   oversample = 1..128       (optional, default 8)
+ *
+ * @param scanner Scanner positioned right after the opening '(' of rescore(...)
+ * @param outConfig Output rescore config
+ * @param pzErr Error message output
+ * @return int SQLITE_OK on success, SQLITE_ERROR on error.
+ */
+static int vec0_parse_rescore_options(struct Vec0Scanner *scanner,
+                                      struct Vec0RescoreConfig *outConfig,
+                                      char **pzErr) {
+  struct Vec0Token token;
+  int rc;
+  int hasQuantizer = 0;
+  outConfig->oversample = 8;
+  outConfig->quantizer_type = 0;
+
+  while (1) {
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc == VEC0_TOKEN_RESULT_EOF) {
+      break;
+    }
+    // ')' closes rescore options
+    if (rc == VEC0_TOKEN_RESULT_SOME && token.token_type == TOKEN_TYPE_RPAREN) {
+      break;
+    }
+    if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_IDENTIFIER) {
+      *pzErr = sqlite3_mprintf("Expected option name in rescore(...)");
+      return SQLITE_ERROR;
+    }
+
+    char *key = token.start;
+    int keyLength = token.end - token.start;
+
+    // expect '='
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_EQ) {
+      *pzErr = sqlite3_mprintf("Expected '=' after option name in rescore(...)");
+      return SQLITE_ERROR;
+    }
+
+    // value
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc != VEC0_TOKEN_RESULT_SOME) {
+      *pzErr = sqlite3_mprintf("Expected value after '=' in rescore(...)");
+      return SQLITE_ERROR;
+    }
+
+    if (vec0_rescore_token_eq(key, keyLength, "quantizer")) {
+      if (token.token_type != TOKEN_TYPE_IDENTIFIER) {
+        *pzErr = sqlite3_mprintf("Expected identifier for quantizer value in rescore(...)");
+        return SQLITE_ERROR;
+      }
+      int valLen = token.end - token.start;
+      if (vec0_rescore_token_eq(token.start, valLen, "bit")) {
+        outConfig->quantizer_type = VEC0_RESCORE_QUANTIZER_BIT;
+      } else if (vec0_rescore_token_eq(token.start, valLen, "int8")) {
+        outConfig->quantizer_type = VEC0_RESCORE_QUANTIZER_INT8;
+      } else {
+        *pzErr = sqlite3_mprintf("Unknown quantizer type '%.*s' in rescore(...). Expected 'bit' or 'int8'.", valLen, token.start);
+        return SQLITE_ERROR;
+      }
+      hasQuantizer = 1;
+    } else if (vec0_rescore_token_eq(key, keyLength, "oversample")) {
+      if (token.token_type != TOKEN_TYPE_DIGIT) {
+        *pzErr = sqlite3_mprintf("Expected integer for oversample value in rescore(...)");
+        return SQLITE_ERROR;
+      }
+      // strtol + saturation instead of atoi: atoi has undefined behavior when
+      // the digits do not fit in an int.
+      long parsed = strtol(token.start, NULL, 10);
+      outConfig->oversample = parsed > INT_MAX ? INT_MAX : (int)parsed;
+      if (outConfig->oversample <= 0 || outConfig->oversample > 128) {
+        *pzErr = sqlite3_mprintf("oversample in rescore(...) must be between 1 and 128, got %d", outConfig->oversample);
+        return SQLITE_ERROR;
+      }
+    } else {
+      *pzErr = sqlite3_mprintf("Unknown option '%.*s' in rescore(...)", keyLength, key);
+      return SQLITE_ERROR;
+    }
+
+    // optional comma between options
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc == VEC0_TOKEN_RESULT_EOF) {
+      break;
+    }
+    if (rc == VEC0_TOKEN_RESULT_SOME && token.token_type == TOKEN_TYPE_RPAREN) {
+      break;
+    }
+    if (rc == VEC0_TOKEN_RESULT_SOME && token.token_type == TOKEN_TYPE_COMMA) {
+      continue;
+    }
+    // If it's not a comma or rparen, it might be the next key — push back isn't
+    // possible with this scanner, so we'll treat unexpected tokens as errors
+    *pzErr = sqlite3_mprintf("Unexpected token in rescore(...) options");
+    return SQLITE_ERROR;
+  }
+
+  if (!hasQuantizer) {
+    *pzErr = sqlite3_mprintf("rescore(...) requires a 'quantizer' option (quantizer=bit or quantizer=int8)");
+    return SQLITE_ERROR;
+  }
+
+  return SQLITE_OK;
+}
+#endif /* SQLITE_VEC_ENABLE_RESCORE */
+
 /** * @brief Parse an vec0 vtab argv[i] column definition and see if * it's a vector column defintion, ex `contents_embedding float[768]`. @@ -2333,6 +2843,132 @@ size_t vector_column_byte_size(struct VectorColumnDefinition column) { * @return int SQLITE_OK on success, SQLITE_EMPTY is it's not a vector column * definition, SQLITE_ERROR on error.
*/
+#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE
+// Forward declaration — defined in sqlite-vec-ivf.c
+static int vec0_parse_ivf_options(struct Vec0Scanner *scanner,
+                                  struct Vec0IvfConfig *config);
+#endif
+
+/**
+ * Parse the options inside diskann(...) parentheses.
+ * Scanner should be positioned right before the '(' token.
+ *
+ * Recognized options (names and quantizer values are matched
+ * case-insensitively and must match in full, not just as a prefix):
+ *   neighbor_quantizer = binary | int8   (required)
+ *   n_neighbors = <int>                  (optional, defaults to
+ *                                         VEC0_DISKANN_DEFAULT_N_NEIGHBORS;
+ *                                         positive multiple of 8, at most
+ *                                         VEC0_DISKANN_MAX_N_NEIGHBORS)
+ *   search_list_size = <int>             (optional, defaults to
+ *                                         VEC0_DISKANN_DEFAULT_SEARCH_LIST_SIZE;
+ *                                         must be positive)
+ *   search_list_size_search = <int>      (optional, must be positive)
+ *   search_list_size_insert = <int>      (optional, must be positive)
+ *   buffer_threshold = <int>             (optional, default 0, non-negative)
+ *
+ * search_list_size cannot be combined with search_list_size_search or
+ * search_list_size_insert.
+ *
+ * @param scanner Scanner over the column definition source.
+ * @param config  Receives the defaults and every parsed option.
+ * @return SQLITE_OK on success, SQLITE_ERROR on any syntax or validation
+ *         error (no error message is produced).
+ */
+static int vec0_parse_diskann_options(struct Vec0Scanner *scanner,
+                                      struct Vec0DiskannConfig *config) {
+  int rc;
+  struct Vec0Token token;
+  int hasQuantizer = 0;
+  int hasSearchListSize = 0;
+  int hasSearchListSizeSplit = 0;
+
+  // Set defaults
+  config->n_neighbors = VEC0_DISKANN_DEFAULT_N_NEIGHBORS;
+  config->search_list_size = VEC0_DISKANN_DEFAULT_SEARCH_LIST_SIZE;
+  config->search_list_size_search = 0;
+  config->search_list_size_insert = 0;
+  config->alpha = VEC0_DISKANN_DEFAULT_ALPHA;
+  config->buffer_threshold = 0;
+
+  // Expect '('
+  rc = vec0_scanner_next(scanner, &token);
+  if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_LPAREN) {
+    return SQLITE_ERROR;
+  }
+
+  while (1) {
+    // key
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc == VEC0_TOKEN_RESULT_SOME && token.token_type == TOKEN_TYPE_RPAREN) {
+      break; // empty parens or trailing comma
+    }
+    if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_IDENTIFIER) {
+      return SQLITE_ERROR;
+    }
+    char *optKey = token.start;
+    int optKeyLen = token.end - token.start;
+
+    // '='
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_EQ) {
+      return SQLITE_ERROR;
+    }
+
+    // value (identifier or digit)
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc != VEC0_TOKEN_RESULT_SOME) {
+      return SQLITE_ERROR;
+    }
+    char *optVal = token.start;
+    int optValLen = token.end - token.start;
+    // Numeric options go through atoi(), which turns non-numeric text into 0;
+    // checking the token type first stops e.g. buffer_threshold=abc from being
+    // silently accepted as 0.
+    int valIsNumber = token.token_type == TOKEN_TYPE_DIGIT;
+
+    // sqlite3_strnicmp() only looks at the first N bytes, so every comparison
+    // below is paired with a length check; otherwise any prefix of an option
+    // name or quantizer value ("n=8", "neighbor_quantizer=b") would match.
+    if (optKeyLen == (int)strlen("neighbor_quantizer") &&
+        sqlite3_strnicmp(optKey, "neighbor_quantizer", optKeyLen) == 0) {
+      if (optValLen == (int)strlen("binary") &&
+          sqlite3_strnicmp(optVal, "binary", optValLen) == 0) {
+        config->quantizer_type = VEC0_DISKANN_QUANTIZER_BINARY;
+      } else if (optValLen == (int)strlen("int8") &&
+                 sqlite3_strnicmp(optVal, "int8", optValLen) == 0) {
+        config->quantizer_type = VEC0_DISKANN_QUANTIZER_INT8;
+      } else {
+        return SQLITE_ERROR; // unknown quantizer
+      }
+      hasQuantizer = 1;
+    } else if (optKeyLen == (int)strlen("n_neighbors") &&
+               sqlite3_strnicmp(optKey, "n_neighbors", optKeyLen) == 0) {
+      if (!valIsNumber) {
+        return SQLITE_ERROR;
+      }
+      config->n_neighbors = atoi(optVal);
+      if (config->n_neighbors <= 0 || (config->n_neighbors % 8) != 0 ||
+          config->n_neighbors > VEC0_DISKANN_MAX_N_NEIGHBORS) {
+        return SQLITE_ERROR;
+      }
+    } else if (optKeyLen == (int)strlen("search_list_size_search") &&
+               sqlite3_strnicmp(optKey, "search_list_size_search", optKeyLen) == 0) {
+      if (!valIsNumber) {
+        return SQLITE_ERROR;
+      }
+      config->search_list_size_search = atoi(optVal);
+      if (config->search_list_size_search <= 0) {
+        return SQLITE_ERROR;
+      }
+      hasSearchListSizeSplit = 1;
+    } else if (optKeyLen == (int)strlen("search_list_size_insert") &&
+               sqlite3_strnicmp(optKey, "search_list_size_insert", optKeyLen) == 0) {
+      if (!valIsNumber) {
+        return SQLITE_ERROR;
+      }
+      config->search_list_size_insert = atoi(optVal);
+      if (config->search_list_size_insert <= 0) {
+        return SQLITE_ERROR;
+      }
+      hasSearchListSizeSplit = 1;
+    } else if (optKeyLen == (int)strlen("search_list_size") &&
+               sqlite3_strnicmp(optKey, "search_list_size", optKeyLen) == 0) {
+      if (!valIsNumber) {
+        return SQLITE_ERROR;
+      }
+      config->search_list_size = atoi(optVal);
+      if (config->search_list_size <= 0) {
+        return SQLITE_ERROR;
+      }
+      hasSearchListSize = 1;
+    } else if (optKeyLen == (int)strlen("buffer_threshold") &&
+               sqlite3_strnicmp(optKey, "buffer_threshold", optKeyLen) == 0) {
+      if (!valIsNumber) {
+        return SQLITE_ERROR;
+      }
+      config->buffer_threshold = atoi(optVal);
+      if (config->buffer_threshold < 0) {
+        return SQLITE_ERROR;
+      }
+    } else {
+      return SQLITE_ERROR; // unknown option
+    }
+
+    // Expect ',' or ')'
+    rc = vec0_scanner_next(scanner, &token);
+    if (rc == VEC0_TOKEN_RESULT_SOME && token.token_type == TOKEN_TYPE_RPAREN) {
+      break;
+    }
+    if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_COMMA) {
+      return SQLITE_ERROR;
+    }
+  }
+
+  if (!hasQuantizer) {
+    return SQLITE_ERROR; // neighbor_quantizer is required
+  }
+
+  if (hasSearchListSize && hasSearchListSizeSplit) {
+    return SQLITE_ERROR; // cannot mix search_list_size with search_list_size_search/insert
+  }
+
+  return SQLITE_OK;
+}
+
 int vec0_parse_vector_column(const char *source, int source_length,
                              struct VectorColumnDefinition *outColumn) {
   // parses a vector column definition like so:
@@ -2346,8 +2982,16 @@ int vec0_parse_vector_column(const char *source, int source_length,
   int nameLength;
   enum VectorElementType elementType;
   enum Vec0DistanceMetrics distanceMetric = VEC0_DISTANCE_METRIC_L2;
+  enum Vec0IndexType indexType = VEC0_INDEX_TYPE_FLAT;
+#if SQLITE_VEC_ENABLE_RESCORE
+  struct Vec0RescoreConfig rescoreConfig;
+  memset(&rescoreConfig, 0, sizeof(rescoreConfig));
+#endif
+  struct Vec0IvfConfig ivfConfig;
+  memset(&ivfConfig, 0, sizeof(ivfConfig));
+  struct Vec0DiskannConfig diskannConfig;
+  memset(&diskannConfig, 0, sizeof(diskannConfig));
   int dimensions;
-
   vec0_scanner_init(&scanner, source, source_length);
 
   // starts with an identifier
@@ -2449,6 +3093,90 @@ int vec0_parse_vector_column(const char *source, int source_length,
         return SQLITE_ERROR;
       }
     }
+    // INDEXED BY flat() | rescore(...)
+ else if (sqlite3_strnicmp(key, "indexed", keyLength) == 0) { + // expect "by" + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || + token.token_type != TOKEN_TYPE_IDENTIFIER || + sqlite3_strnicmp(token.start, "by", token.end - token.start) != 0) { + return SQLITE_ERROR; + } + // expect index type name + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || + token.token_type != TOKEN_TYPE_IDENTIFIER) { + return SQLITE_ERROR; + } + int indexNameLen = token.end - token.start; + if (sqlite3_strnicmp(token.start, "flat", indexNameLen) == 0) { + indexType = VEC0_INDEX_TYPE_FLAT; + // expect '(' + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || + token.token_type != TOKEN_TYPE_LPAREN) { + return SQLITE_ERROR; + } + // expect ')' + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || + token.token_type != TOKEN_TYPE_RPAREN) { + return SQLITE_ERROR; + } + } +#if SQLITE_VEC_ENABLE_RESCORE + else if (sqlite3_strnicmp(token.start, "rescore", indexNameLen) == 0) { + indexType = VEC0_INDEX_TYPE_RESCORE; + if (elementType != SQLITE_VEC_ELEMENT_TYPE_FLOAT32) { + return SQLITE_ERROR; + } + // expect '(' + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_LPAREN) { + return SQLITE_ERROR; + } + char *rescoreErr = NULL; + rc = vec0_parse_rescore_options(&scanner, &rescoreConfig, &rescoreErr); + if (rc != SQLITE_OK) { + if (rescoreErr) sqlite3_free(rescoreErr); + return SQLITE_ERROR; + } + // validate dimensions for bit quantizer + if (rescoreConfig.quantizer_type == VEC0_RESCORE_QUANTIZER_BIT && + (dimensions % CHAR_BIT) != 0) { + return SQLITE_ERROR; + } + } +#endif + else if (sqlite3_strnicmp(token.start, "ivf", indexNameLen) == 0) { +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + indexType = VEC0_INDEX_TYPE_IVF; + memset(&ivfConfig, 0, sizeof(ivfConfig)); + rc = vec0_parse_ivf_options(&scanner, &ivfConfig); + if 
(rc != SQLITE_OK) { + return SQLITE_ERROR; + } + if (ivfConfig.quantizer == VEC0_IVF_QUANTIZER_BINARY && (dimensions % 8) != 0) { + return SQLITE_ERROR; + } +#else + return SQLITE_ERROR; // IVF not compiled in +#endif + } else if (sqlite3_strnicmp(token.start, "diskann", indexNameLen) == 0) { +#if SQLITE_VEC_ENABLE_DISKANN + indexType = VEC0_INDEX_TYPE_DISKANN; + rc = vec0_parse_diskann_options(&scanner, &diskannConfig); + if (rc != SQLITE_OK) { + return rc; + } +#else + return SQLITE_ERROR; +#endif + } else { + // unknown index type + return SQLITE_ERROR; + } + } // unknown key else { return SQLITE_ERROR; @@ -2463,6 +3191,12 @@ int vec0_parse_vector_column(const char *source, int source_length, outColumn->distance_metric = distanceMetric; outColumn->element_type = elementType; outColumn->dimensions = dimensions; + outColumn->index_type = indexType; +#if SQLITE_VEC_ENABLE_RESCORE + outColumn->rescore = rescoreConfig; +#endif + outColumn->ivf = ivfConfig; + outColumn->diskann = diskannConfig; return SQLITE_OK; } @@ -2660,765 +3394,15 @@ static sqlite3_module vec_eachModule = { #pragma endregion -#pragma region vec_npy_each table function -enum NpyTokenType { - NPY_TOKEN_TYPE_IDENTIFIER, - NPY_TOKEN_TYPE_NUMBER, - NPY_TOKEN_TYPE_LPAREN, - NPY_TOKEN_TYPE_RPAREN, - NPY_TOKEN_TYPE_LBRACE, - NPY_TOKEN_TYPE_RBRACE, - NPY_TOKEN_TYPE_COLON, - NPY_TOKEN_TYPE_COMMA, - NPY_TOKEN_TYPE_STRING, - NPY_TOKEN_TYPE_FALSE, -}; - -struct NpyToken { - enum NpyTokenType token_type; - unsigned char *start; - unsigned char *end; -}; - -int npy_token_next(unsigned char *start, unsigned char *end, - struct NpyToken *out) { - unsigned char *ptr = start; - while (ptr < end) { - unsigned char curr = *ptr; - if (is_whitespace(curr)) { - ptr++; - continue; - } else if (curr == '(') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_LPAREN; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == ')') { - out->start = ptr++; - out->end = ptr; - out->token_type = 
NPY_TOKEN_TYPE_RPAREN; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == '{') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_LBRACE; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == '}') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_RBRACE; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == ':') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_COLON; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == ',') { - out->start = ptr++; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_COMMA; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == '\'') { - unsigned char *start = ptr; - ptr++; - while (ptr < end) { - if ((*ptr) == '\'') { - break; - } - ptr++; - } - if (ptr >= end || (*ptr) != '\'') { - return VEC0_TOKEN_RESULT_ERROR; - } - out->start = start; - out->end = ++ptr; - out->token_type = NPY_TOKEN_TYPE_STRING; - return VEC0_TOKEN_RESULT_SOME; - } else if (curr == 'F' && - strncmp((char *)ptr, "False", strlen("False")) == 0) { - out->start = ptr; - out->end = (ptr + (int)strlen("False")); - ptr = out->end; - out->token_type = NPY_TOKEN_TYPE_FALSE; - return VEC0_TOKEN_RESULT_SOME; - } else if (is_digit(curr)) { - unsigned char *start = ptr; - while (ptr < end && (is_digit(*ptr))) { - ptr++; - } - out->start = start; - out->end = ptr; - out->token_type = NPY_TOKEN_TYPE_NUMBER; - return VEC0_TOKEN_RESULT_SOME; - } else { - return VEC0_TOKEN_RESULT_ERROR; - } - } - return VEC0_TOKEN_RESULT_ERROR; -} - -struct NpyScanner { - unsigned char *start; - unsigned char *end; - unsigned char *ptr; -}; - -void npy_scanner_init(struct NpyScanner *scanner, const unsigned char *source, - int source_length) { - scanner->start = (unsigned char *)source; - scanner->end = (unsigned char *)source + source_length; - scanner->ptr = (unsigned char *)source; -} - -int npy_scanner_next(struct NpyScanner *scanner, struct NpyToken *out) { - int rc = npy_token_next(scanner->start, 
scanner->end, out); - if (rc == VEC0_TOKEN_RESULT_SOME) { - scanner->start = out->end; - } - return rc; -} - -#define NPY_PARSE_ERROR "Error parsing numpy array: " -int parse_npy_header(sqlite3_vtab *pVTab, const unsigned char *header, - size_t headerLength, - enum VectorElementType *out_element_type, - int *fortran_order, size_t *numElements, - size_t *numDimensions) { - - struct NpyScanner scanner; - struct NpyToken token; - int rc; - npy_scanner_init(&scanner, header, headerLength); - - if (npy_scanner_next(&scanner, &token) != VEC0_TOKEN_RESULT_SOME && - token.token_type != NPY_TOKEN_TYPE_LBRACE) { - vtab_set_error(pVTab, - NPY_PARSE_ERROR "numpy header did not start with '{'"); - return SQLITE_ERROR; - } - while (1) { - rc = npy_scanner_next(&scanner, &token); - if (rc != VEC0_TOKEN_RESULT_SOME) { - vtab_set_error(pVTab, NPY_PARSE_ERROR "expected key in numpy header"); - return SQLITE_ERROR; - } - - if (token.token_type == NPY_TOKEN_TYPE_RBRACE) { - break; - } - if (token.token_type != NPY_TOKEN_TYPE_STRING) { - vtab_set_error(pVTab, NPY_PARSE_ERROR - "expected a string as key in numpy header"); - return SQLITE_ERROR; - } - unsigned char *key = token.start; - - rc = npy_scanner_next(&scanner, &token); - if ((rc != VEC0_TOKEN_RESULT_SOME) || - (token.token_type != NPY_TOKEN_TYPE_COLON)) { - vtab_set_error(pVTab, NPY_PARSE_ERROR - "expected a ':' after key in numpy header"); - return SQLITE_ERROR; - } - - if (strncmp((char *)key, "'descr'", strlen("'descr'")) == 0) { - rc = npy_scanner_next(&scanner, &token); - if ((rc != VEC0_TOKEN_RESULT_SOME) || - (token.token_type != NPY_TOKEN_TYPE_STRING)) { - vtab_set_error(pVTab, NPY_PARSE_ERROR - "expected a string value after 'descr' key"); - return SQLITE_ERROR; - } - if (strncmp((char *)token.start, "'maxChunks = 1024; - pCur->chunksBufferSize = - (vector_byte_size(element_type, numDimensions)) * pCur->maxChunks; - pCur->chunksBuffer = sqlite3_malloc(pCur->chunksBufferSize); - if (pCur->chunksBufferSize && 
!pCur->chunksBuffer) { - return SQLITE_NOMEM; - } - - pCur->currentChunkSize = - fread(pCur->chunksBuffer, vector_byte_size(element_type, numDimensions), - pCur->maxChunks, file); - - pCur->currentChunkIndex = 0; - pCur->elementType = element_type; - pCur->nElements = numElements; - pCur->nDimensions = numDimensions; - pCur->input_type = VEC_NPY_EACH_INPUT_FILE; - - pCur->eof = pCur->currentChunkSize == 0; - pCur->file = file; - return SQLITE_OK; -} -#endif - -int parse_npy_buffer(sqlite3_vtab *pVTab, const unsigned char *buffer, - int bufferLength, void **data, size_t *numElements, - size_t *numDimensions, - enum VectorElementType *element_type) { - - if (bufferLength < 10) { - // IMP: V03312_20150 - vtab_set_error(pVTab, "numpy array too short"); - return SQLITE_ERROR; - } - if (memcmp(NPY_MAGIC, buffer, sizeof(NPY_MAGIC)) != 0) { - // V11954_28792 - vtab_set_error(pVTab, "numpy array does not contain the 'magic' header"); - return SQLITE_ERROR; - } - - u8 major = buffer[6]; - u8 minor = buffer[7]; - uint16_t headerLength = 0; - memcpy(&headerLength, &buffer[8], sizeof(uint16_t)); - - i32 totalHeaderLength = sizeof(NPY_MAGIC) + sizeof(major) + sizeof(minor) + - sizeof(headerLength) + headerLength; - i32 dataSize = bufferLength - totalHeaderLength; - - if (dataSize < 0) { - vtab_set_error(pVTab, "numpy array header length is invalid"); - return SQLITE_ERROR; - } - - const unsigned char *header = &buffer[10]; - int fortran_order; - - int rc = parse_npy_header(pVTab, header, headerLength, element_type, - &fortran_order, numElements, numDimensions); - if (rc != SQLITE_OK) { - return rc; - } - - i32 expectedDataSize = - (*numElements * vector_byte_size(*element_type, *numDimensions)); - if (expectedDataSize != dataSize) { - vtab_set_error(pVTab, - "numpy array error: Expected a data size of %d, found %d", - expectedDataSize, dataSize); - return SQLITE_ERROR; - } - - *data = (void *)&buffer[totalHeaderLength]; - return SQLITE_OK; -} - -static int 
vec_npy_eachConnect(sqlite3 *db, void *pAux, int argc, - const char *const *argv, sqlite3_vtab **ppVtab, - char **pzErr) { - UNUSED_PARAMETER(pAux); - UNUSED_PARAMETER(argc); - UNUSED_PARAMETER(argv); - UNUSED_PARAMETER(pzErr); - vec_npy_each_vtab *pNew; - int rc; - - rc = sqlite3_declare_vtab(db, "CREATE TABLE x(vector, input hidden)"); -#define VEC_NPY_EACH_COLUMN_VECTOR 0 -#define VEC_NPY_EACH_COLUMN_INPUT 1 - if (rc == SQLITE_OK) { - pNew = sqlite3_malloc(sizeof(*pNew)); - *ppVtab = (sqlite3_vtab *)pNew; - if (pNew == 0) - return SQLITE_NOMEM; - memset(pNew, 0, sizeof(*pNew)); - } - return rc; -} - -static int vec_npy_eachDisconnect(sqlite3_vtab *pVtab) { - vec_npy_each_vtab *p = (vec_npy_each_vtab *)pVtab; - sqlite3_free(p); - return SQLITE_OK; -} - -static int vec_npy_eachOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor) { - UNUSED_PARAMETER(p); - vec_npy_each_cursor *pCur; - pCur = sqlite3_malloc(sizeof(*pCur)); - if (pCur == 0) - return SQLITE_NOMEM; - memset(pCur, 0, sizeof(*pCur)); - *ppCursor = &pCur->base; - return SQLITE_OK; -} - -static int vec_npy_eachClose(sqlite3_vtab_cursor *cur) { - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; -#ifndef SQLITE_VEC_OMIT_FS - if (pCur->file) { - fclose(pCur->file); - pCur->file = NULL; - } -#endif - if (pCur->chunksBuffer) { - sqlite3_free(pCur->chunksBuffer); - pCur->chunksBuffer = NULL; - } - if (pCur->vector) { - pCur->vector = NULL; - } - sqlite3_free(pCur); - return SQLITE_OK; -} - -static int vec_npy_eachBestIndex(sqlite3_vtab *pVTab, - sqlite3_index_info *pIdxInfo) { - int hasInput; - for (int i = 0; i < pIdxInfo->nConstraint; i++) { - const struct sqlite3_index_constraint *pCons = &pIdxInfo->aConstraint[i]; - // printf("i=%d iColumn=%d, op=%d, usable=%d\n", i, pCons->iColumn, - // pCons->op, pCons->usable); - switch (pCons->iColumn) { - case VEC_NPY_EACH_COLUMN_INPUT: { - if (pCons->op == SQLITE_INDEX_CONSTRAINT_EQ && pCons->usable) { - hasInput = 1; - pIdxInfo->aConstraintUsage[i].argvIndex = 
1; - pIdxInfo->aConstraintUsage[i].omit = 1; - } - break; - } - } - } - if (!hasInput) { - pVTab->zErrMsg = sqlite3_mprintf("input argument is required"); - return SQLITE_ERROR; - } - - pIdxInfo->estimatedCost = (double)100000; - pIdxInfo->estimatedRows = 100000; - - return SQLITE_OK; -} - -static int vec_npy_eachFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum, - const char *idxStr, int argc, - sqlite3_value **argv) { - UNUSED_PARAMETER(idxNum); - UNUSED_PARAMETER(idxStr); - assert(argc == 1); - int rc; - - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)pVtabCursor; - -#ifndef SQLITE_VEC_OMIT_FS - if (pCur->file) { - fclose(pCur->file); - pCur->file = NULL; - } -#endif - if (pCur->chunksBuffer) { - sqlite3_free(pCur->chunksBuffer); - pCur->chunksBuffer = NULL; - } - if (pCur->vector) { - pCur->vector = NULL; - } - -#ifndef SQLITE_VEC_OMIT_FS - struct VecNpyFile *f = NULL; - if ((f = sqlite3_value_pointer(argv[0], SQLITE_VEC_NPY_FILE_NAME))) { - FILE *file = fopen(f->path, "r"); - if (!file) { - vtab_set_error(pVtabCursor->pVtab, "Could not open numpy file"); - return SQLITE_ERROR; - } - - rc = parse_npy_file(pVtabCursor->pVtab, file, pCur); - if (rc != SQLITE_OK) { -#ifndef SQLITE_VEC_OMIT_FS - fclose(file); -#endif - return rc; - } - - } else -#endif - { - - const unsigned char *input = sqlite3_value_blob(argv[0]); - int inputLength = sqlite3_value_bytes(argv[0]); - void *data; - size_t numElements; - size_t numDimensions; - enum VectorElementType element_type; - - rc = parse_npy_buffer(pVtabCursor->pVtab, input, inputLength, &data, - &numElements, &numDimensions, &element_type); - if (rc != SQLITE_OK) { - return rc; - } - - pCur->vector = data; - pCur->elementType = element_type; - pCur->nElements = numElements; - pCur->nDimensions = numDimensions; - pCur->input_type = VEC_NPY_EACH_INPUT_BUFFER; - } - - pCur->iRowid = 0; - return SQLITE_OK; -} - -static int vec_npy_eachRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid) { - vec_npy_each_cursor *pCur = 
(vec_npy_each_cursor *)cur; - *pRowid = pCur->iRowid; - return SQLITE_OK; -} - -static int vec_npy_eachEof(sqlite3_vtab_cursor *cur) { - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; - if (pCur->input_type == VEC_NPY_EACH_INPUT_BUFFER) { - return (!pCur->nElements) || (size_t)pCur->iRowid >= pCur->nElements; - } - return pCur->eof; -} - -static int vec_npy_eachNext(sqlite3_vtab_cursor *cur) { - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; - pCur->iRowid++; - if (pCur->input_type == VEC_NPY_EACH_INPUT_BUFFER) { - return SQLITE_OK; - } - -#ifndef SQLITE_VEC_OMIT_FS - // else: input is a file - pCur->currentChunkIndex++; - if (pCur->currentChunkIndex >= pCur->currentChunkSize) { - pCur->currentChunkSize = - fread(pCur->chunksBuffer, - vector_byte_size(pCur->elementType, pCur->nDimensions), - pCur->maxChunks, pCur->file); - if (!pCur->currentChunkSize) { - pCur->eof = 1; - } - pCur->currentChunkIndex = 0; - } -#endif - return SQLITE_OK; -} - -static int vec_npy_eachColumnBuffer(vec_npy_each_cursor *pCur, - sqlite3_context *context, int i) { - switch (i) { - case VEC_NPY_EACH_COLUMN_VECTOR: { - sqlite3_result_subtype(context, pCur->elementType); - switch (pCur->elementType) { - case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: { - sqlite3_result_blob( - context, - &((unsigned char *) - pCur->vector)[pCur->iRowid * pCur->nDimensions * sizeof(f32)], - pCur->nDimensions * sizeof(f32), SQLITE_TRANSIENT); - - break; - } - case SQLITE_VEC_ELEMENT_TYPE_INT8: - case SQLITE_VEC_ELEMENT_TYPE_BIT: { - // https://github.com/asg017/sqlite-vec/issues/42 - sqlite3_result_error(context, - "vec_npy_each only supports float32 vectors", -1); - break; - } - } - - break; - } - } - return SQLITE_OK; -} -static int vec_npy_eachColumnFile(vec_npy_each_cursor *pCur, - sqlite3_context *context, int i) { - switch (i) { - case VEC_NPY_EACH_COLUMN_VECTOR: { - switch (pCur->elementType) { - case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: { - sqlite3_result_blob( - context, - &((unsigned char *) - 
pCur->chunksBuffer)[pCur->currentChunkIndex * - pCur->nDimensions * sizeof(f32)], - pCur->nDimensions * sizeof(f32), SQLITE_TRANSIENT); - break; - } - case SQLITE_VEC_ELEMENT_TYPE_INT8: - case SQLITE_VEC_ELEMENT_TYPE_BIT: { - // https://github.com/asg017/sqlite-vec/issues/42 - sqlite3_result_error(context, - "vec_npy_each only supports float32 vectors", -1); - break; - } - } - break; - } - } - return SQLITE_OK; -} -static int vec_npy_eachColumn(sqlite3_vtab_cursor *cur, - sqlite3_context *context, int i) { - vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; - switch (pCur->input_type) { - case VEC_NPY_EACH_INPUT_BUFFER: - return vec_npy_eachColumnBuffer(pCur, context, i); - case VEC_NPY_EACH_INPUT_FILE: - return vec_npy_eachColumnFile(pCur, context, i); - } - return SQLITE_ERROR; -} - -static sqlite3_module vec_npy_eachModule = { - /* iVersion */ 0, - /* xCreate */ 0, - /* xConnect */ vec_npy_eachConnect, - /* xBestIndex */ vec_npy_eachBestIndex, - /* xDisconnect */ vec_npy_eachDisconnect, - /* xDestroy */ 0, - /* xOpen */ vec_npy_eachOpen, - /* xClose */ vec_npy_eachClose, - /* xFilter */ vec_npy_eachFilter, - /* xNext */ vec_npy_eachNext, - /* xEof */ vec_npy_eachEof, - /* xColumn */ vec_npy_eachColumn, - /* xRowid */ vec_npy_eachRowid, - /* xUpdate */ 0, - /* xBegin */ 0, - /* xSync */ 0, - /* xCommit */ 0, - /* xRollback */ 0, - /* xFindMethod */ 0, - /* xRename */ 0, - /* xSavepoint */ 0, - /* xRelease */ 0, - /* xRollbackTo */ 0, - /* xShadowName */ 0, -#if SQLITE_VERSION_NUMBER >= 3044000 - /* xIntegrity */ 0, -#endif -}; - -#pragma endregion #pragma region vec0 virtual table #define VEC0_COLUMN_ID 0 #define VEC0_COLUMN_USERN_START 1 -#define VEC0_COLUMN_OFFSET_DISTANCE 1 -#define VEC0_COLUMN_OFFSET_K 2 +#define VEC0_COLUMN_OFFSET_COMMAND 1 +#define VEC0_COLUMN_OFFSET_DISTANCE 2 +#define VEC0_COLUMN_OFFSET_K 3 #define VEC0_SHADOW_INFO_NAME "\"%w\".\"%w_info\"" @@ -3473,6 +3457,9 @@ static sqlite3_module vec_npy_eachModule = { #define 
VEC0_SHADOW_AUXILIARY_NAME "\"%w\".\"%w_auxiliary\"" #define VEC0_SHADOW_METADATA_N_NAME "\"%w\".\"%w_metadatachunks%02d\"" +#define VEC0_SHADOW_VECTORS_N_NAME "\"%w\".\"%w_vectors%02d\"" +#define VEC0_SHADOW_DISKANN_NODES_N_NAME "\"%w\".\"%w_diskann_nodes%02d\"" +#define VEC0_SHADOW_DISKANN_BUFFER_N_NAME "\"%w\".\"%w_diskann_buffer%02d\"" #define VEC0_SHADOW_METADATA_TEXT_DATA_NAME "\"%w\".\"%w_metadatatext%02d\"" #define VEC_INTERAL_ERROR "Internal sqlite-vec error: " @@ -3513,6 +3500,10 @@ struct vec0_vtab { // Will change the schema of the _rowids table, and insert/query logic. int pkIsText; + // True if the hidden command column (named after the table) exists. + // Tables created before v0.1.10 or without _info table don't have it. + int hasCommandColumn; + // number of defined vector columns. int numVectorColumns; @@ -3555,6 +3546,19 @@ struct vec0_vtab { // The first numVectorColumns entries must be freed with sqlite3_free() char *shadowVectorChunksNames[VEC0_MAX_VECTOR_COLUMNS]; +#if SQLITE_VEC_ENABLE_RESCORE + // Name of all rescore chunk shadow tables, ie `_rescore_chunks00` + // Only populated for vector columns with rescore enabled. + // Must be freed with sqlite3_free() + char *shadowRescoreChunksNames[VEC0_MAX_VECTOR_COLUMNS]; + + // Name of all rescore vector shadow tables, ie `_rescore_vectors00` + // Rowid-keyed table for fast random-access float vector reads during rescore. + // Only populated for vector columns with rescore enabled. + // Must be freed with sqlite3_free() + char *shadowRescoreVectorsNames[VEC0_MAX_VECTOR_COLUMNS]; +#endif + // Name of all metadata chunk shadow tables, ie `_metadatachunks00` // Only the first numMetadataColumns entries will be available. 
// The first numMetadataColumns entries must be freed with sqlite3_free() @@ -3567,6 +3571,18 @@ struct vec0_vtab { int chunk_size; +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + // IVF cached state per vector column + char *shadowIvfCellsNames[VEC0_MAX_VECTOR_COLUMNS]; // table name for blob_open + int ivfTrainedCache[VEC0_MAX_VECTOR_COLUMNS]; // -1=unknown, 0=no, 1=yes + sqlite3_stmt *stmtIvfCellMeta[VEC0_MAX_VECTOR_COLUMNS]; // SELECT n_vectors, length(validity)*8 FROM cells WHERE cell_id=? + sqlite3_stmt *stmtIvfCellUpdateN[VEC0_MAX_VECTOR_COLUMNS]; // UPDATE cells SET n_vectors=n_vectors+? WHERE cell_id=? + sqlite3_stmt *stmtIvfRowidMapInsert[VEC0_MAX_VECTOR_COLUMNS]; // INSERT INTO rowid_map(rowid,cell_id,slot) VALUES(?,?,?) + sqlite3_stmt *stmtIvfRowidMapLookup[VEC0_MAX_VECTOR_COLUMNS]; // SELECT cell_id,slot FROM rowid_map WHERE rowid=? + sqlite3_stmt *stmtIvfRowidMapDelete[VEC0_MAX_VECTOR_COLUMNS]; // DELETE FROM rowid_map WHERE rowid=? + sqlite3_stmt *stmtIvfCentroidsAll[VEC0_MAX_VECTOR_COLUMNS]; // SELECT centroid_id,centroid FROM centroids +#endif + // select latest chunk from _chunks, getting chunk_id sqlite3_stmt *stmtLatestChunk; @@ -3622,8 +3638,38 @@ struct vec0_vtab { * Must be cleaned up with sqlite3_finalize(). */ sqlite3_stmt *stmtRowidsGetChunkPosition; + + // === DiskANN additions === +#if SQLITE_VEC_ENABLE_DISKANN + // Shadow table names for DiskANN, per vector column + // e.g., "{schema}"."{table}_vectors{00..15}" + char *shadowVectorsNames[VEC0_MAX_VECTOR_COLUMNS]; + + // e.g., "{schema}"."{table}_diskann_nodes{00..15}" + char *shadowDiskannNodesNames[VEC0_MAX_VECTOR_COLUMNS]; + + // Prepared statements for DiskANN operations (per vector column) + // These will be lazily prepared on first use. 
+ sqlite3_stmt *stmtDiskannNodeRead[VEC0_MAX_VECTOR_COLUMNS]; + sqlite3_stmt *stmtDiskannNodeWrite[VEC0_MAX_VECTOR_COLUMNS]; + sqlite3_stmt *stmtDiskannNodeInsert[VEC0_MAX_VECTOR_COLUMNS]; + sqlite3_stmt *stmtVectorsRead[VEC0_MAX_VECTOR_COLUMNS]; + sqlite3_stmt *stmtVectorsInsert[VEC0_MAX_VECTOR_COLUMNS]; +#endif }; +#if SQLITE_VEC_ENABLE_RESCORE +// Forward declarations for rescore functions (defined in sqlite-vec-rescore.c, +// included later after all helpers they depend on are defined). +static int rescore_create_tables(vec0_vtab *p, sqlite3 *db, char **pzErr); +static int rescore_drop_tables(vec0_vtab *p); +static int rescore_new_chunk(vec0_vtab *p, i64 chunk_rowid); +static int rescore_on_insert(vec0_vtab *p, i64 chunk_rowid, i64 chunk_offset, + i64 rowid, void *vectorDatas[]); +static int rescore_on_delete(vec0_vtab *p, i64 chunk_id, u64 chunk_offset, i64 rowid); +static int rescore_delete_chunk(vec0_vtab *p, i64 chunk_id); +#endif + /** * @brief Finalize all the sqlite3_stmt members in a vec0_vtab. 
*
@@ -3640,6 +3686,30 @@ void vec0_free_resources(vec0_vtab *p) {
   p->stmtRowidsUpdatePosition = NULL;
   sqlite3_finalize(p->stmtRowidsGetChunkPosition);
   p->stmtRowidsGetChunkPosition = NULL;
+
+#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE
+  for (int i = 0; i < VEC0_MAX_VECTOR_COLUMNS; i++) {
+    sqlite3_finalize(p->stmtIvfCellMeta[i]); p->stmtIvfCellMeta[i] = NULL;
+    sqlite3_finalize(p->stmtIvfCellUpdateN[i]); p->stmtIvfCellUpdateN[i] = NULL;
+    sqlite3_finalize(p->stmtIvfRowidMapInsert[i]); p->stmtIvfRowidMapInsert[i] = NULL;
+    sqlite3_finalize(p->stmtIvfRowidMapLookup[i]); p->stmtIvfRowidMapLookup[i] = NULL;
+    sqlite3_finalize(p->stmtIvfRowidMapDelete[i]); p->stmtIvfRowidMapDelete[i] = NULL;
+    sqlite3_finalize(p->stmtIvfCentroidsAll[i]); p->stmtIvfCentroidsAll[i] = NULL;
+  }
+#endif
+
+#if SQLITE_VEC_ENABLE_DISKANN
+  // Kept outside the IVF guard above: the DiskANN statement arrays exist
+  // whenever SQLITE_VEC_ENABLE_DISKANN is set, so finalizing them only inside
+  // the IVF block would leave them unfinalized in DiskANN-only builds.
+  for (int i = 0; i < VEC0_MAX_VECTOR_COLUMNS; i++) {
+    sqlite3_finalize(p->stmtDiskannNodeRead[i]); p->stmtDiskannNodeRead[i] = NULL;
+    sqlite3_finalize(p->stmtDiskannNodeWrite[i]); p->stmtDiskannNodeWrite[i] = NULL;
+    sqlite3_finalize(p->stmtDiskannNodeInsert[i]); p->stmtDiskannNodeInsert[i] = NULL;
+    sqlite3_finalize(p->stmtVectorsRead[i]); p->stmtVectorsRead[i] = NULL;
+    sqlite3_finalize(p->stmtVectorsInsert[i]); p->stmtVectorsInsert[i] = NULL;
+  }
+#endif
 }
 
 /**
@@ -3662,6 +3726,25 @@ void vec0_free(vec0_vtab *p) {
   for (int i = 0; i < p->numVectorColumns; i++) {
     sqlite3_free(p->shadowVectorChunksNames[i]);
     p->shadowVectorChunksNames[i] = NULL;
+#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE
+    sqlite3_free(p->shadowIvfCellsNames[i]);
+    p->shadowIvfCellsNames[i] = NULL;
+#endif
+
+#if SQLITE_VEC_ENABLE_RESCORE
+    sqlite3_free(p->shadowRescoreChunksNames[i]);
+    p->shadowRescoreChunksNames[i] = NULL;
+
+    sqlite3_free(p->shadowRescoreVectorsNames[i]);
+    p->shadowRescoreVectorsNames[i] = NULL;
+#endif
+
+#if SQLITE_VEC_ENABLE_DISKANN
+    sqlite3_free(p->shadowVectorsNames[i]);
+    p->shadowVectorsNames[i] = NULL;
+    sqlite3_free(p->shadowDiskannNodesNames[i]);
+    p->shadowDiskannNodesNames[i] = NULL;
+#endif
sqlite3_free(p->vector_columns[i].name); p->vector_columns[i].name = NULL; @@ -3683,6 +3766,12 @@ void vec0_free(vec0_vtab *p) { } } +#if SQLITE_VEC_ENABLE_DISKANN +#include "sqlite-vec-diskann.c" +#else +static int vec0_all_columns_diskann(vec0_vtab *p) { (void)p; return 0; } +#endif + int vec0_num_defined_user_columns(vec0_vtab *p) { return p->numVectorColumns + p->numPartitionColumns + p->numAuxiliaryColumns + p->numMetadataColumns; } @@ -3694,20 +3783,19 @@ int vec0_num_defined_user_columns(vec0_vtab *p) { * @param p vec0 table * @return int */ -int vec0_column_distance_idx(vec0_vtab *p) { - return VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1) + - VEC0_COLUMN_OFFSET_DISTANCE; +int vec0_column_command_idx(vec0_vtab *p) { + // Command column is the first hidden column (right after user columns) + return VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(p); +} + +int vec0_column_distance_idx(vec0_vtab *p) { + int base = VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(p); + return base + (p->hasCommandColumn ? 1 : 0); } -/** - * @brief Returns the index of the k hidden column for the given vec0 table. - * - * @param p vec0 table - * @return int k column index - */ int vec0_column_k_idx(vec0_vtab *p) { - return VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1) + - VEC0_COLUMN_OFFSET_K; + int base = VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(p); + return base + (p->hasCommandColumn ? 2 : 1); } /** @@ -3942,12 +4030,44 @@ int vec0_result_id(vec0_vtab *p, sqlite3_context *context, i64 rowid) { * will be stored. * @return int SQLITE_OK on success. 
*/ +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE +// Forward declaration — defined in sqlite-vec-ivf.c (included later) +static int ivf_get_vector_data(vec0_vtab *p, i64 rowid, int col_idx, + void **outVector, int *outVectorSize); +#endif + int vec0_get_vector_data(vec0_vtab *pVtab, i64 rowid, int vector_column_idx, void **outVector, int *outVectorSize) { vec0_vtab *p = pVtab; int rc, brc; + +#if SQLITE_VEC_ENABLE_DISKANN + // DiskANN fast path: read from _vectors table + if (p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_DISKANN) { + void *vec = NULL; + int vecSize; + rc = diskann_vector_read(p, vector_column_idx, rowid, &vec, &vecSize); + if (rc != SQLITE_OK) { + vtab_set_error(&pVtab->base, + "Could not fetch vector data for %lld from DiskANN vectors table", + rowid); + return SQLITE_ERROR; + } + *outVector = vec; + if (outVectorSize) *outVectorSize = vecSize; + return SQLITE_OK; + } +#endif + i64 chunk_id; i64 chunk_offset; + +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + // IVF-indexed columns store vectors in _ivf_cells, not _vector_chunks + if (p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_IVF) { + return ivf_get_vector_data(p, rowid, vector_column_idx, outVector, outVectorSize); + } +#endif size_t size; void *buf = NULL; int blobOffset; @@ -3955,6 +4075,41 @@ int vec0_get_vector_data(vec0_vtab *pVtab, i64 rowid, int vector_column_idx, assert((vector_column_idx >= 0) && (vector_column_idx < pVtab->numVectorColumns)); +#if SQLITE_VEC_ENABLE_RESCORE + // Rescore columns store float vectors in _rescore_vectors (rowid-keyed) + if (p->vector_columns[vector_column_idx].index_type == VEC0_INDEX_TYPE_RESCORE) { + size = vector_column_byte_size(p->vector_columns[vector_column_idx]); + rc = sqlite3_blob_open(p->db, p->schemaName, + p->shadowRescoreVectorsNames[vector_column_idx], + "vector", rowid, 0, &vectorBlob); + if (rc != SQLITE_OK) { + vtab_set_error(&pVtab->base, + "Could not fetch vector data for %lld from rescore vectors", + 
rowid); + rc = SQLITE_ERROR; + goto cleanup; + } + buf = sqlite3_malloc(size); + if (!buf) { + rc = SQLITE_NOMEM; + goto cleanup; + } + rc = sqlite3_blob_read(vectorBlob, buf, size, 0); + if (rc != SQLITE_OK) { + sqlite3_free(buf); + buf = NULL; + rc = SQLITE_ERROR; + goto cleanup; + } + *outVector = buf; + if (outVectorSize) { + *outVectorSize = size; + } + rc = SQLITE_OK; + goto cleanup; + } +#endif /* SQLITE_VEC_ENABLE_RESCORE */ + rc = vec0_get_chunk_position(pVtab, rowid, NULL, &chunk_id, &chunk_offset); if (rc == SQLITE_EMPTY) { vtab_set_error(&pVtab->base, "Could not find a row with rowid %lld", rowid); @@ -4558,6 +4713,12 @@ int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk continue; } int vector_column_idx = p->user_column_idxs[i]; + + // Non-FLAT columns (rescore, IVF, DiskANN) don't use _vector_chunks + if (p->vector_columns[vector_column_idx].index_type != VEC0_INDEX_TYPE_FLAT) { + continue; + } + i64 vectorsSize = p->chunk_size * vector_column_byte_size(p->vector_columns[vector_column_idx]); @@ -4588,6 +4749,14 @@ int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk } } +#if SQLITE_VEC_ENABLE_RESCORE + // Create new rescore chunks for each rescore-enabled vector column + rc = rescore_new_chunk(p, rowid); + if (rc != SQLITE_OK) { + return rc; + } +#endif + // Step 3: Create new metadata chunks for each metadata column for (int i = 0; i < vec0_num_defined_user_columns(p); i++) { if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_METADATA) { @@ -4717,6 +4886,12 @@ void vec0_cursor_clear(vec0_cursor *pCur) { } } +// IVF index implementation — #include'd here after all struct/helper definitions +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE +#include "sqlite-vec-ivf-kmeans.c" +#include "sqlite-vec-ivf.c" +#endif + #define VEC_CONSTRUCTOR_ERROR "vec0 constructor error: " static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, sqlite3_vtab **ppVtab, char **pzErr, bool 
isCreate) { @@ -4778,6 +4953,26 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, (i64)vecColumn.dimensions, SQLITE_VEC_VEC0_MAX_DIMENSIONS); goto error; } + + // DiskANN validation + if (vecColumn.index_type == VEC0_INDEX_TYPE_DISKANN) { + if (vecColumn.element_type == SQLITE_VEC_ELEMENT_TYPE_BIT) { + sqlite3_free(vecColumn.name); + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "DiskANN index is not supported on bit vector columns"); + goto error; + } + if (vecColumn.diskann.quantizer_type == VEC0_DISKANN_QUANTIZER_BINARY && + (vecColumn.dimensions % CHAR_BIT) != 0) { + sqlite3_free(vecColumn.name); + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "DiskANN with binary quantizer requires dimensions divisible by 8"); + goto error; + } + } + pNew->user_column_kinds[user_column_idx] = SQLITE_VEC0_USER_COLUMN_KIND_VECTOR; pNew->user_column_idxs[user_column_idx] = numVectorColumns; memcpy(&pNew->vector_columns[numVectorColumns], &vecColumn, sizeof(vecColumn)); @@ -4949,6 +5144,140 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, goto error; } +#if SQLITE_VEC_ENABLE_RESCORE + { + int hasRescore = 0; + for (int i = 0; i < numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) { + hasRescore = 1; + break; + } + } + if (hasRescore) { + if (numMetadataColumns > 0) { + *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR + "Metadata columns are not supported with rescore indexes"); + goto error; + } + if (numPartitionColumns > 0) { + *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR + "Partition key columns are not supported with rescore indexes"); + goto error; + } + } + } +#endif + + // IVF indexes do not support auxiliary, metadata, or partition key columns. 
+ { + int has_ivf = 0; + for (int i = 0; i < numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_IVF) { + has_ivf = 1; + break; + } + } + if (has_ivf) { + if (numPartitionColumns > 0) { + *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR + "partition key columns are not supported with IVF indexes"); + goto error; + } + if (numMetadataColumns > 0) { + *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR + "metadata columns are not supported with IVF indexes"); + goto error; + } + } + } + + // DiskANN columns cannot coexist with aux/metadata/partition columns + for (int i = 0; i < numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_DISKANN) { + if (numMetadataColumns > 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "Metadata columns are not supported with DiskANN-indexed vector columns"); + goto error; + } + if (numPartitionColumns > 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "Partition key columns are not supported with DiskANN-indexed vector columns"); + goto error; + } + break; + } + } + + // Determine whether to add the FTS5-style hidden command column. + // New tables (isCreate) always get it; existing tables only if created + // with v0.1.10+ (which validated no column name == table name). 
+ int hasCommandColumn = 0; + if (isCreate) { + // Validate no user column name conflicts with the table name + const char *tblName = argv[2]; + int tblNameLen = (int)strlen(tblName); + for (int i = 0; i < numVectorColumns; i++) { + if (pNew->vector_columns[i].name_length == tblNameLen && + sqlite3_strnicmp(pNew->vector_columns[i].name, tblName, tblNameLen) == 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "column name '%s' conflicts with table name (reserved for command column)", + tblName); + goto error; + } + } + for (int i = 0; i < numPartitionColumns; i++) { + if (pNew->paritition_columns[i].name_length == tblNameLen && + sqlite3_strnicmp(pNew->paritition_columns[i].name, tblName, tblNameLen) == 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "column name '%s' conflicts with table name (reserved for command column)", + tblName); + goto error; + } + } + for (int i = 0; i < numAuxiliaryColumns; i++) { + if (pNew->auxiliary_columns[i].name_length == tblNameLen && + sqlite3_strnicmp(pNew->auxiliary_columns[i].name, tblName, tblNameLen) == 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "column name '%s' conflicts with table name (reserved for command column)", + tblName); + goto error; + } + } + for (int i = 0; i < numMetadataColumns; i++) { + if (pNew->metadata_columns[i].name_length == tblNameLen && + sqlite3_strnicmp(pNew->metadata_columns[i].name, tblName, tblNameLen) == 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "column name '%s' conflicts with table name (reserved for command column)", + tblName); + goto error; + } + } + hasCommandColumn = 1; + } else { + // xConnect: check _info shadow table for version + sqlite3_stmt *stmtInfo = NULL; + char *zInfoSql = sqlite3_mprintf( + "SELECT value FROM " VEC0_SHADOW_INFO_NAME " WHERE key = 'CREATE_VERSION_PATCH'", + argv[1], argv[2]); + if (zInfoSql) { + int infoRc = sqlite3_prepare_v2(db, zInfoSql, -1, &stmtInfo, NULL); + sqlite3_free(zInfoSql); + if (infoRc == 
SQLITE_OK && sqlite3_step(stmtInfo) == SQLITE_ROW) { + int patch = sqlite3_column_int(stmtInfo, 0); + hasCommandColumn = (patch >= 10); // v0.1.10+ + } + // If _info doesn't exist or has no version, assume old table + sqlite3_finalize(stmtInfo); + } + } + pNew->hasCommandColumn = hasCommandColumn; + sqlite3_str *createStr = sqlite3_str_new(NULL); sqlite3_str_appendall(createStr, "CREATE TABLE x("); if (pkColumnName) { @@ -4990,7 +5319,11 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } } - sqlite3_str_appendall(createStr, " distance hidden, k hidden) "); + if (hasCommandColumn) { + sqlite3_str_appendf(createStr, " \"%w\" hidden, distance hidden, k hidden) ", argv[2]); + } else { + sqlite3_str_appendall(createStr, " distance hidden, k hidden) "); + } if (pkColumnName) { sqlite3_str_appendall(createStr, "without rowid "); } @@ -5039,7 +5372,44 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, if (!pNew->shadowVectorChunksNames[i]) { goto error; } +#if SQLITE_VEC_ENABLE_RESCORE + if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) { + pNew->shadowRescoreChunksNames[i] = + sqlite3_mprintf("%s_rescore_chunks%02d", tableName, i); + if (!pNew->shadowRescoreChunksNames[i]) { + goto error; + } + pNew->shadowRescoreVectorsNames[i] = + sqlite3_mprintf("%s_rescore_vectors%02d", tableName, i); + if (!pNew->shadowRescoreVectorsNames[i]) { + goto error; + } + } +#endif +#if SQLITE_VEC_ENABLE_DISKANN + if (pNew->vector_columns[i].index_type == VEC0_INDEX_TYPE_DISKANN) { + pNew->shadowVectorsNames[i] = + sqlite3_mprintf("%s_vectors%02d", tableName, i); + if (!pNew->shadowVectorsNames[i]) { + goto error; + } + pNew->shadowDiskannNodesNames[i] = + sqlite3_mprintf("%s_diskann_nodes%02d", tableName, i); + if (!pNew->shadowDiskannNodesNames[i]) { + goto error; + } + } +#endif } +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + for (int i = 0; i < pNew->numVectorColumns; i++) { + if 
(pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; + pNew->shadowIvfCellsNames[i] = + sqlite3_mprintf("%s_ivf_cells%02d", tableName, i); + if (!pNew->shadowIvfCellsNames[i]) goto error; + pNew->ivfTrainedCache[i] = -1; // unknown + } +#endif for (int i = 0; i < pNew->numMetadataColumns; i++) { pNew->shadowMetadataChunksNames[i] = sqlite3_mprintf("%s_metadatachunks%02d", tableName, i); @@ -5105,7 +5475,32 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } sqlite3_finalize(stmt); - +#if SQLITE_VEC_ENABLE_DISKANN + // Seed medoid entries for DiskANN-indexed columns + for (int i = 0; i < pNew->numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_DISKANN) { + continue; + } + char *key = sqlite3_mprintf("diskann_medoid_%02d", i); + char *zInsert = sqlite3_mprintf( + "INSERT INTO " VEC0_SHADOW_INFO_NAME "(key, value) VALUES (?1, ?2)", + pNew->schemaName, pNew->tableName); + rc = sqlite3_prepare_v2(db, zInsert, -1, &stmt, NULL); + sqlite3_free(zInsert); + if (rc != SQLITE_OK) { + sqlite3_free(key); + sqlite3_finalize(stmt); + goto error; + } + sqlite3_bind_text(stmt, 1, key, -1, sqlite3_free); + sqlite3_bind_null(stmt, 2); // NULL means empty graph + if (sqlite3_step(stmt) != SQLITE_DONE) { + sqlite3_finalize(stmt); + goto error; + } + sqlite3_finalize(stmt); + } +#endif // create the _chunks shadow table char *zCreateShadowChunks = NULL; @@ -5162,6 +5557,9 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, sqlite3_finalize(stmt); for (int i = 0; i < pNew->numVectorColumns; i++) { + // Non-FLAT columns (rescore, IVF, DiskANN) don't use _vector_chunks + if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) + continue; char *zSql = sqlite3_mprintf(VEC0_SHADOW_VECTOR_N_CREATE, pNew->schemaName, pNew->tableName, i); if (!zSql) { @@ -5180,6 +5578,103 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, 
sqlite3_finalize(stmt); } +#if SQLITE_VEC_ENABLE_RESCORE + rc = rescore_create_tables(pNew, db, pzErr); + if (rc != SQLITE_OK) { + goto error; + } +#endif + +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + // Create IVF shadow tables for IVF-indexed vector columns + for (int i = 0; i < pNew->numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; + rc = ivf_create_shadow_tables(pNew, i); + if (rc != SQLITE_OK) { + *pzErr = sqlite3_mprintf("Could not create IVF shadow tables for column %d", i); + goto error; + } + } +#endif + +#if SQLITE_VEC_ENABLE_DISKANN + // Create DiskANN shadow tables for indexed vector columns + for (int i = 0; i < pNew->numVectorColumns; i++) { + if (pNew->vector_columns[i].index_type != VEC0_INDEX_TYPE_DISKANN) { + continue; + } + + // Create _vectors{NN} table + { + char *zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_VECTORS_N_NAME + " (rowid INTEGER PRIMARY KEY, vector BLOB NOT NULL);", + pNew->schemaName, pNew->tableName, i); + if (!zSql) { + goto error; + } + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + sqlite3_finalize(stmt); + *pzErr = sqlite3_mprintf( + "Could not create '_vectors%02d' shadow table: %s", i, + sqlite3_errmsg(db)); + goto error; + } + sqlite3_finalize(stmt); + } + + // Create _diskann_nodes{NN} table + { + char *zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_DISKANN_NODES_N_NAME " (" + "rowid INTEGER PRIMARY KEY, " + "neighbors_validity BLOB NOT NULL, " + "neighbor_ids BLOB NOT NULL, " + "neighbor_quantized_vectors BLOB NOT NULL" + ");", + pNew->schemaName, pNew->tableName, i); + if (!zSql) { + goto error; + } + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + sqlite3_finalize(stmt); + *pzErr = sqlite3_mprintf( + "Could not create '_diskann_nodes%02d' shadow table: %s", i, + 
sqlite3_errmsg(db)); + goto error; + } + sqlite3_finalize(stmt); + } + + // Create _diskann_buffer{NN} table (for batched inserts) + { + char *zSql = sqlite3_mprintf( + "CREATE TABLE " VEC0_SHADOW_DISKANN_BUFFER_N_NAME " (" + "rowid INTEGER PRIMARY KEY, " + "vector BLOB NOT NULL" + ");", + pNew->schemaName, pNew->tableName, i); + if (!zSql) { + goto error; + } + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, 0); + sqlite3_free(zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + sqlite3_finalize(stmt); + *pzErr = sqlite3_mprintf( + "Could not create '_diskann_buffer%02d' shadow table: %s", i, + sqlite3_errmsg(db)); + goto error; + } + sqlite3_finalize(stmt); + } + } +#endif + // See SHADOW_TABLE_ROWID_QUIRK in vec0_new_chunk() — same "rowid PRIMARY KEY" // without INTEGER type issue applies here. for (int i = 0; i < pNew->numMetadataColumns; i++) { @@ -5314,6 +5809,48 @@ static int vec0Destroy(sqlite3_vtab *pVtab) { sqlite3_finalize(stmt); for (int i = 0; i < p->numVectorColumns; i++) { +#if SQLITE_VEC_ENABLE_DISKANN + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_DISKANN) { + // Drop DiskANN shadow tables + zSql = sqlite3_mprintf("DROP TABLE IF EXISTS " VEC0_SHADOW_VECTORS_N_NAME, + p->schemaName, p->tableName, i); + if (zSql) { + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + goto done; + } + sqlite3_finalize(stmt); + } + zSql = sqlite3_mprintf("DROP TABLE IF EXISTS " VEC0_SHADOW_DISKANN_NODES_N_NAME, + p->schemaName, p->tableName, i); + if (zSql) { + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + goto done; + } + sqlite3_finalize(stmt); + } + zSql = sqlite3_mprintf("DROP TABLE IF EXISTS " VEC0_SHADOW_DISKANN_BUFFER_N_NAME, + p->schemaName, p->tableName, i); + if (zSql) { + rc = 
sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + goto done; + } + sqlite3_finalize(stmt); + } + continue; + } +#endif + // Non-FLAT columns (rescore, IVF, DiskANN) don't use _vector_chunks + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) + continue; zSql = sqlite3_mprintf("DROP TABLE \"%w\".\"%w\"", p->schemaName, p->shadowVectorChunksNames[i]); rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); @@ -5325,6 +5862,21 @@ static int vec0Destroy(sqlite3_vtab *pVtab) { sqlite3_finalize(stmt); } +#if SQLITE_VEC_ENABLE_RESCORE + rc = rescore_drop_tables(p); + if (rc != SQLITE_OK) { + goto done; + } +#endif + +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + // Drop IVF shadow tables + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; + ivf_drop_shadow_tables(p, i); + } +#endif + if(p->numAuxiliaryColumns > 0) { zSql = sqlite3_mprintf("DROP TABLE " VEC0_SHADOW_AUXILIARY_NAME, p->schemaName, p->tableName); rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); @@ -5959,6 +6511,65 @@ int min_idx(const f32 *distances, i32 n, u8 *candidates, i32 *out, i32 k, assert(k > 0); assert(k <= n); +#ifdef SQLITE_VEC_EXPERIMENTAL_MIN_IDX + // Max-heap variant: O(n log k) single-pass. + // out[0..heap_size-1] stores indices; heap ordered by distances descending + // so out[0] is always the index of the LARGEST distance in the top-k. 
+ (void)bTaken; + int heap_size = 0; + + #define HEAP_SIFT_UP(pos) do { \ + int _c = (pos); \ + while (_c > 0) { \ + int _p = (_c - 1) / 2; \ + if (distances[out[_p]] < distances[out[_c]]) { \ + i32 _tmp = out[_p]; out[_p] = out[_c]; out[_c] = _tmp; \ + _c = _p; \ + } else break; \ + } \ + } while(0) + + #define HEAP_SIFT_DOWN(pos, sz) do { \ + int _p = (pos); \ + for (;;) { \ + int _l = 2*_p + 1, _r = 2*_p + 2, _largest = _p; \ + if (_l < (sz) && distances[out[_l]] > distances[out[_largest]]) \ + _largest = _l; \ + if (_r < (sz) && distances[out[_r]] > distances[out[_largest]]) \ + _largest = _r; \ + if (_largest == _p) break; \ + i32 _tmp = out[_p]; out[_p] = out[_largest]; out[_largest] = _tmp; \ + _p = _largest; \ + } \ + } while(0) + + for (int i = 0; i < n; i++) { + if (!bitmap_get(candidates, i)) + continue; + if (heap_size < k) { + out[heap_size] = i; + heap_size++; + HEAP_SIFT_UP(heap_size - 1); + } else if (distances[i] < distances[out[0]]) { + out[0] = i; + HEAP_SIFT_DOWN(0, heap_size); + } + } + + // Heapsort to produce ascending order. + for (int i = heap_size - 1; i > 0; i--) { + i32 tmp = out[0]; out[0] = out[i]; out[i] = tmp; + HEAP_SIFT_DOWN(0, i); + } + + #undef HEAP_SIFT_UP + #undef HEAP_SIFT_DOWN + + *k_used = heap_size; + return SQLITE_OK; + +#else + // Original: O(n*k) repeated linear scan with bitmap. bitmap_clear(bTaken, n); for (int ik = 0; ik < k; ik++) { @@ -5984,6 +6595,7 @@ int min_idx(const f32 *distances, i32 n, u8 *candidates, i32 *out, i32 k, } *k_used = k; return SQLITE_OK; +#endif } int vec0_get_metadata_text_long_value( @@ -7026,6 +7638,175 @@ cleanup: return rc; } +#if SQLITE_VEC_ENABLE_RESCORE +#include "sqlite-vec-rescore.c" +#endif + +#if SQLITE_VEC_ENABLE_DISKANN +/** + * Handle a KNN query using the DiskANN graph search. 
+ */ +static int vec0Filter_knn_diskann( + vec0_cursor *pCur, vec0_vtab *p, int idxNum, + const char *idxStr, int argc, sqlite3_value **argv) { + + int rc; + int vectorColumnIdx = idxNum; + struct VectorColumnDefinition *vector_column = &p->vector_columns[vectorColumnIdx]; + struct vec0_query_knn_data *knn_data; + + knn_data = sqlite3_malloc(sizeof(*knn_data)); + if (!knn_data) return SQLITE_NOMEM; + memset(knn_data, 0, sizeof(*knn_data)); + + // Parse query_idx and k_idx from idxStr + int query_idx = -1; + int k_idx = -1; + for (int i = 0; i < argc; i++) { + if (idxStr[1 + (i * 4)] == VEC0_IDXSTR_KIND_KNN_MATCH) { + query_idx = i; + } + if (idxStr[1 + (i * 4)] == VEC0_IDXSTR_KIND_KNN_K) { + k_idx = i; + } + } + assert(query_idx >= 0); + assert(k_idx >= 0); + + // Extract query vector + void *queryVector; + size_t dimensions; + enum VectorElementType elementType; + vector_cleanup queryVectorCleanup = vector_cleanup_noop; + char *pzError; + + rc = vector_from_value(argv[query_idx], &queryVector, &dimensions, + &elementType, &queryVectorCleanup, &pzError); + if (rc != SQLITE_OK) { + vtab_set_error(&p->base, "Invalid query vector: %z", pzError); + sqlite3_free(knn_data); + return SQLITE_ERROR; + } + + if (elementType != vector_column->element_type || + dimensions != vector_column->dimensions) { + vtab_set_error(&p->base, "Query vector type/dimension mismatch"); + queryVectorCleanup(queryVector); + sqlite3_free(knn_data); + return SQLITE_ERROR; + } + + i64 k = sqlite3_value_int64(argv[k_idx]); + if (k <= 0) { + knn_data->k = 0; + knn_data->k_used = 0; + pCur->knn_data = knn_data; + pCur->query_plan = VEC0_QUERY_PLAN_KNN; + queryVectorCleanup(queryVector); + return SQLITE_OK; + } + + // Run DiskANN search + i64 *resultRowids = sqlite3_malloc(k * sizeof(i64)); + f32 *resultDistances = sqlite3_malloc(k * sizeof(f32)); + if (!resultRowids || !resultDistances) { + sqlite3_free(resultRowids); + sqlite3_free(resultDistances); + queryVectorCleanup(queryVector); + 
sqlite3_free(knn_data); + return SQLITE_NOMEM; + } + + int resultCount; + rc = diskann_search(p, vectorColumnIdx, queryVector, dimensions, + elementType, (int)k, 0, + resultRowids, resultDistances, &resultCount); + + if (rc != SQLITE_OK) { + queryVectorCleanup(queryVector); + sqlite3_free(resultRowids); + sqlite3_free(resultDistances); + sqlite3_free(knn_data); + return rc; + } + + // Scan _diskann_buffer for any buffered (unflushed) vectors and merge + // with graph results. This ensures no recall loss for buffered vectors. + { + sqlite3_stmt *bufStmt = NULL; + char *zSql = sqlite3_mprintf( + "SELECT rowid, vector FROM " VEC0_SHADOW_DISKANN_BUFFER_N_NAME, + p->schemaName, p->tableName, vectorColumnIdx); + if (!zSql) { + queryVectorCleanup(queryVector); + sqlite3_free(resultRowids); + sqlite3_free(resultDistances); + sqlite3_free(knn_data); + return SQLITE_NOMEM; + } + int bufRc = sqlite3_prepare_v2(p->db, zSql, -1, &bufStmt, NULL); + sqlite3_free(zSql); + if (bufRc == SQLITE_OK) { + while (sqlite3_step(bufStmt) == SQLITE_ROW) { + i64 bufRowid = sqlite3_column_int64(bufStmt, 0); + const void *bufVec = sqlite3_column_blob(bufStmt, 1); + f32 dist = vec0_distance_full( + queryVector, bufVec, dimensions, elementType, + vector_column->distance_metric); + + // Check if this buffer vector should replace the worst graph result + if (resultCount < (int)k) { + // Still have room, just add it + resultRowids[resultCount] = bufRowid; + resultDistances[resultCount] = dist; + resultCount++; + } else { + // Find worst (largest distance) in results + int worstIdx = 0; + for (int wi = 1; wi < resultCount; wi++) { + if (resultDistances[wi] > resultDistances[worstIdx]) { + worstIdx = wi; + } + } + if (dist < resultDistances[worstIdx]) { + resultRowids[worstIdx] = bufRowid; + resultDistances[worstIdx] = dist; + } + } + } + sqlite3_finalize(bufStmt); + } + } + + queryVectorCleanup(queryVector); + + // Sort results by distance (ascending) + for (int si = 0; si < resultCount - 1; si++) { 
+ for (int sj = si + 1; sj < resultCount; sj++) { + if (resultDistances[sj] < resultDistances[si]) { + f32 tmpD = resultDistances[si]; + resultDistances[si] = resultDistances[sj]; + resultDistances[sj] = tmpD; + i64 tmpR = resultRowids[si]; + resultRowids[si] = resultRowids[sj]; + resultRowids[sj] = tmpR; + } + } + } + + knn_data->k = resultCount; + knn_data->k_used = resultCount; + knn_data->rowids = resultRowids; + knn_data->distances = resultDistances; + knn_data->current_idx = 0; + + pCur->knn_data = knn_data; + pCur->query_plan = VEC0_QUERY_PLAN_KNN; + + return SQLITE_OK; +} +#endif /* SQLITE_VEC_ENABLE_DISKANN */ + int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, const char *idxStr, int argc, sqlite3_value **argv) { assert(argc == (strlen(idxStr)-1) / 4); @@ -7036,6 +7817,13 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, struct VectorColumnDefinition *vector_column = &p->vector_columns[vectorColumnIdx]; +#if SQLITE_VEC_ENABLE_DISKANN + // DiskANN dispatch + if (vector_column->index_type == VEC0_INDEX_TYPE_DISKANN) { + return vec0Filter_knn_diskann(pCur, p, idxNum, idxStr, argc, argv); + } +#endif + struct Array *arrayRowidsIn = NULL; sqlite3_stmt *stmtChunks = NULL; void *queryVector; @@ -7258,6 +8046,36 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, } #endif +#if SQLITE_VEC_ENABLE_RESCORE + // Dispatch to rescore KNN path if this vector column has rescore enabled + if (vector_column->index_type == VEC0_INDEX_TYPE_RESCORE) { + rc = rescore_knn(p, pCur, vector_column, vectorColumnIdx, arrayRowidsIn, + aMetadataIn, idxStr, argc, argv, queryVector, k, knn_data); + if (rc != SQLITE_OK) { + goto cleanup; + } + pCur->knn_data = knn_data; + pCur->query_plan = VEC0_QUERY_PLAN_KNN; + rc = SQLITE_OK; + goto cleanup; + } +#endif + +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + // IVF dispatch: if vector column has IVF, use IVF query instead of chunk scan + if (vector_column->index_type == VEC0_INDEX_TYPE_IVF) { + rc = 
ivf_query_knn(p, vectorColumnIdx, queryVector, + (int)vector_column_byte_size(*vector_column), k, knn_data); + if (rc != SQLITE_OK) { + goto cleanup; + } + pCur->knn_data = knn_data; + pCur->query_plan = VEC0_QUERY_PLAN_KNN; + rc = SQLITE_OK; + goto cleanup; + } +#endif + rc = vec0_chunks_iter(p, idxStr, argc, argv, &stmtChunks); if (rc != SQLITE_OK) { // IMP: V06942_23781 @@ -8082,6 +8900,10 @@ int vec0Update_InsertWriteFinalStep(vec0_vtab *p, i64 chunk_rowid, // Go insert the vector data into the vector chunk shadow tables for (int i = 0; i < p->numVectorColumns; i++) { + // Non-FLAT columns (rescore, IVF, DiskANN) don't use _vector_chunks + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) + continue; + sqlite3_blob *blobVectors; rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowVectorChunksNames[i], "vectors", chunk_rowid, 1, &blobVectors); @@ -8339,6 +9161,9 @@ int vec0_write_metadata_value(vec0_vtab *p, int metadata_column_idx, i64 rowid, * * @return int SQLITE_OK on success, otherwise error code on failure */ +// Forward declaration: needed for INSERT OR REPLACE handling in vec0Update_Insert +int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue); + int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, sqlite_int64 *pRowid) { UNUSED_PARAMETER(argc); @@ -8459,30 +9284,100 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, goto cleanup; } + // Handle INSERT OR REPLACE: if the conflict resolution is REPLACE and the + // row already exists, delete the existing row first before inserting. 
+ if (sqlite3_vtab_on_conflict(p->db) == SQLITE_REPLACE) { + sqlite3_value *idValue = argv[2 + VEC0_COLUMN_ID]; + int idType = sqlite3_value_type(idValue); + int existingRowExists = 0; + + if (p->pkIsText && idType == SQLITE_TEXT) { + i64 existingRowid; + rc = vec0_rowid_from_id(p, idValue, &existingRowid); + if (rc == SQLITE_OK) { + existingRowExists = 1; + } else if (rc == SQLITE_EMPTY) { + rc = SQLITE_OK; // row doesn't exist, proceed with normal insert + } else { + goto cleanup; + } + } else if (!p->pkIsText && idType == SQLITE_INTEGER) { + i64 existingRowid = sqlite3_value_int64(idValue); + i64 chunk_id_tmp, chunk_offset_tmp; + rc = vec0_get_chunk_position(p, existingRowid, NULL, &chunk_id_tmp, &chunk_offset_tmp); + if (rc == SQLITE_OK) { + existingRowExists = 1; + } else if (rc == SQLITE_EMPTY) { + rc = SQLITE_OK; // row doesn't exist, proceed with normal insert + } else { + goto cleanup; + } + } + + if (existingRowExists) { + rc = vec0Update_Delete(pVTab, idValue); + if (rc != SQLITE_OK) { + goto cleanup; + } + } + } + // Step #1: Insert/get a rowid for this row, from the _rowids table. rc = vec0Update_InsertRowidStep(p, argv[2 + VEC0_COLUMN_ID], &rowid); if (rc != SQLITE_OK) { goto cleanup; } - // Step #2: Find the next "available" position in the _chunks table for this - // row. - rc = vec0Update_InsertNextAvailableStep(p, partitionKeyValues, - &chunk_rowid, &chunk_offset, - &blobChunksValidity, - &bufferChunksValidity); - if (rc != SQLITE_OK) { - goto cleanup; + if (!vec0_all_columns_diskann(p)) { + // Step #2: Find the next "available" position in the _chunks table for this + // row. + rc = vec0Update_InsertNextAvailableStep(p, partitionKeyValues, + &chunk_rowid, &chunk_offset, + &blobChunksValidity, + &bufferChunksValidity); + if (rc != SQLITE_OK) { + goto cleanup; + } + + // Step #3: With the next available chunk position, write out all the vectors + // to their specified location. 
+ rc = vec0Update_InsertWriteFinalStep(p, chunk_rowid, chunk_offset, rowid, + vectorDatas, blobChunksValidity, + bufferChunksValidity); + if (rc != SQLITE_OK) { + goto cleanup; + } } - // Step #3: With the next available chunk position, write out all the vectors - // to their specified location. - rc = vec0Update_InsertWriteFinalStep(p, chunk_rowid, chunk_offset, rowid, - vectorDatas, blobChunksValidity, - bufferChunksValidity); +#if SQLITE_VEC_ENABLE_DISKANN + // Step #4: Insert into DiskANN graph for indexed vector columns + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_DISKANN) continue; + rc = diskann_insert(p, i, rowid, vectorDatas[i]); + if (rc != SQLITE_OK) { + goto cleanup; + } + } +#endif + +#if SQLITE_VEC_ENABLE_RESCORE + rc = rescore_on_insert(p, chunk_rowid, chunk_offset, rowid, vectorDatas); if (rc != SQLITE_OK) { goto cleanup; } +#endif + +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + // Step #4: IVF index insert (if any vector column uses IVF) + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; + int vecSize = (int)vector_column_byte_size(p->vector_columns[i]); + rc = ivf_insert(p, i, rowid, vectorDatas[i], vecSize); + if (rc != SQLITE_OK) { + goto cleanup; + } + } +#endif if(p->numAuxiliaryColumns > 0) { sqlite3_stmt *stmt; @@ -8674,6 +9569,9 @@ int vec0Update_Delete_ClearVectors(vec0_vtab *p, i64 chunk_id, u64 chunk_offset) { int rc, brc; for (int i = 0; i < p->numVectorColumns; i++) { + // Non-FLAT columns (rescore, IVF, DiskANN) don't use _vector_chunks + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) + continue; sqlite3_blob *blobVectors = NULL; size_t n = vector_column_byte_size(p->vector_columns[i]); @@ -8785,6 +9683,9 @@ int vec0Update_Delete_DeleteChunkIfEmpty(vec0_vtab *p, i64 chunk_id, // Delete from each _vector_chunksNN for (int i = 0; i < p->numVectorColumns; i++) { + // Non-FLAT columns (rescore, 
IVF, DiskANN) don't use _vector_chunks + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_FLAT) + continue; zSql = sqlite3_mprintf( "DELETE FROM " VEC0_SHADOW_VECTOR_N_NAME " WHERE rowid = ?", p->schemaName, p->tableName, i); @@ -8801,6 +9702,12 @@ int vec0Update_Delete_DeleteChunkIfEmpty(vec0_vtab *p, i64 chunk_id, return SQLITE_ERROR; } +#if SQLITE_VEC_ENABLE_RESCORE + rc = rescore_delete_chunk(p, chunk_id); + if (rc != SQLITE_OK) + return rc; +#endif + // Delete from each _metadatachunksNN for (int i = 0; i < p->numMetadataColumns; i++) { zSql = sqlite3_mprintf( @@ -8944,11 +9851,17 @@ int vec0Update_Delete_ClearMetadata(vec0_vtab *p, int metadata_idx, i64 rowid, i } sqlite3_bind_int64(stmt, 1, rowid); rc = sqlite3_step(stmt); + sqlite3_finalize(stmt); if(rc != SQLITE_DONE) { rc = SQLITE_ERROR; goto done; } - sqlite3_finalize(stmt); + // Fix for https://github.com/asg017/sqlite-vec/issues/274 + // sqlite3_step returns SQLITE_DONE (101) on DML success, but the + // `done:` epilogue treats anything other than SQLITE_OK as an error. + // Without this, SQLITE_DONE propagates up to vec0Update_Delete, + // which aborts the DELETE scan and silently drops remaining rows. + rc = SQLITE_OK; } break; } @@ -8966,8 +9879,8 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { vec0_vtab *p = (vec0_vtab *)pVTab; int rc; i64 rowid; - i64 chunk_id; - i64 chunk_offset; + i64 chunk_id = 0; + i64 chunk_offset = 0; if (p->pkIsText) { rc = vec0_rowid_from_id(p, idValue, &rowid); @@ -8984,28 +9897,49 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { // 4. Zero out vector data in all vector column chunks // 5. Delete value in _rowids table - // 1. 
get chunk_id and chunk_offset from _rowids - rc = vec0_get_chunk_position(p, rowid, NULL, &chunk_id, &chunk_offset); - if (rc != SQLITE_OK) { - return rc; +#if SQLITE_VEC_ENABLE_DISKANN + // DiskANN graph deletion for indexed columns + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_DISKANN) continue; + rc = diskann_delete(p, i, rowid); + if (rc != SQLITE_OK) { + return rc; + } } +#endif - // 2. clear validity bit - rc = vec0Update_Delete_ClearValidity(p, chunk_id, chunk_offset); - if (rc != SQLITE_OK) { - return rc; - } + if (!vec0_all_columns_diskann(p)) { + // 1. get chunk_id and chunk_offset from _rowids + rc = vec0_get_chunk_position(p, rowid, NULL, &chunk_id, &chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } - // 3. zero out rowid in chunks.rowids - rc = vec0Update_Delete_ClearRowid(p, chunk_id, chunk_offset); - if (rc != SQLITE_OK) { - return rc; - } + // 2. clear validity bit + rc = vec0Update_Delete_ClearValidity(p, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } - // 4. zero out any data in vector chunks tables - rc = vec0Update_Delete_ClearVectors(p, chunk_id, chunk_offset); - if (rc != SQLITE_OK) { - return rc; + // 3. zero out rowid in chunks.rowids + rc = vec0Update_Delete_ClearRowid(p, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } + + // 4. zero out any data in vector chunks tables + rc = vec0Update_Delete_ClearVectors(p, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } + +#if SQLITE_VEC_ENABLE_RESCORE + // 4b. zero out quantized data in rescore chunk tables, delete from rescore vectors + rc = rescore_on_delete(p, chunk_id, chunk_offset, rowid); + if (rc != SQLITE_OK) { + return rc; + } +#endif } // 5. delete from _rowids table @@ -9022,22 +9956,33 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { } } - // 7. 
delete metadata - for(int i = 0; i < p->numMetadataColumns; i++) { - rc = vec0Update_Delete_ClearMetadata(p, i, rowid, chunk_id, chunk_offset); - if (rc != SQLITE_OK) { - return rc; + // 7. delete metadata and reclaim chunk (only when using chunk-based storage) + if (!vec0_all_columns_diskann(p)) { + for(int i = 0; i < p->numMetadataColumns; i++) { + rc = vec0Update_Delete_ClearMetadata(p, i, rowid, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } + } + + // 8. reclaim chunk if fully empty + { + int chunkDeleted; + rc = vec0Update_Delete_DeleteChunkIfEmpty(p, chunk_id, &chunkDeleted); + if (rc != SQLITE_OK) { + return rc; + } } } - // 8. reclaim chunk if fully empty - { - int chunkDeleted; - rc = vec0Update_Delete_DeleteChunkIfEmpty(p, chunk_id, &chunkDeleted); - if (rc != SQLITE_OK) { - return rc; - } +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + // 7. delete from IVF index + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type != VEC0_INDEX_TYPE_IVF) continue; + rc = ivf_delete(p, i, rowid); + if (rc != SQLITE_OK) return rc; } +#endif return SQLITE_OK; } @@ -9065,8 +10010,11 @@ int vec0Update_UpdateAuxColumn(vec0_vtab *p, int auxiliary_column_idx, sqlite3_v } int vec0Update_UpdateVectorColumn(vec0_vtab *p, i64 chunk_id, i64 chunk_offset, - int i, sqlite3_value *valueVector) { + int i, sqlite3_value *valueVector, i64 rowid) { int rc; +#if !SQLITE_VEC_ENABLE_RESCORE + UNUSED_PARAMETER(rowid); +#endif sqlite3_blob *blobVectors = NULL; @@ -9110,6 +10058,59 @@ int vec0Update_UpdateVectorColumn(vec0_vtab *p, i64 chunk_id, i64 chunk_offset, goto cleanup; } +#if SQLITE_VEC_ENABLE_RESCORE + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) { + // For rescore columns, update _rescore_vectors and _rescore_chunks + struct VectorColumnDefinition *col = &p->vector_columns[i]; + size_t qsize = rescore_quantized_byte_size(col); + size_t fsize = vector_column_byte_size(*col); + + // 1. 
Update quantized chunk + { + void *qbuf = sqlite3_malloc(qsize); + if (!qbuf) { rc = SQLITE_NOMEM; goto cleanup; } + switch (col->rescore.quantizer_type) { + case VEC0_RESCORE_QUANTIZER_BIT: + rescore_quantize_float_to_bit((const float *)vector, (uint8_t *)qbuf, col->dimensions); + break; + case VEC0_RESCORE_QUANTIZER_INT8: + rescore_quantize_float_to_int8((const float *)vector, (int8_t *)qbuf, col->dimensions); + break; + } + sqlite3_blob *blobQ = NULL; + rc = sqlite3_blob_open(p->db, p->schemaName, + p->shadowRescoreChunksNames[i], "vectors", + chunk_id, 1, &blobQ); + if (rc != SQLITE_OK) { sqlite3_free(qbuf); goto cleanup; } + rc = sqlite3_blob_write(blobQ, qbuf, qsize, chunk_offset * qsize); + sqlite3_free(qbuf); + int brc2 = sqlite3_blob_close(blobQ); + if (rc != SQLITE_OK) goto cleanup; + if (brc2 != SQLITE_OK) { rc = brc2; goto cleanup; } + } + + // 2. Update float vector in _rescore_vectors (keyed by user rowid) + { + char *zSql = sqlite3_mprintf( + "UPDATE \"%w\".\"%w\" SET vector = ? WHERE rowid = ?", + p->schemaName, p->shadowRescoreVectorsNames[i]); + if (!zSql) { rc = SQLITE_NOMEM; goto cleanup; } + sqlite3_stmt *stmtUp; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtUp, NULL); + sqlite3_free(zSql); + if (rc != SQLITE_OK) goto cleanup; + sqlite3_bind_blob(stmtUp, 1, vector, fsize, SQLITE_TRANSIENT); + sqlite3_bind_int64(stmtUp, 2, rowid); + rc = sqlite3_step(stmtUp); + sqlite3_finalize(stmtUp); + if (rc != SQLITE_DONE) { rc = SQLITE_ERROR; goto cleanup; } + } + + rc = SQLITE_OK; + goto cleanup; + } +#endif + rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowVectorChunksNames[i], "vectors", chunk_id, 1, &blobVectors); if (rc != SQLITE_OK) { @@ -9240,8 +10241,28 @@ int vec0Update_Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv) { continue; } + // Block vector UPDATE for index types that don't implement it — + // the DiskANN graph / IVF lists would become stale. 
+ { + enum Vec0IndexType idx_type = p->vector_columns[vector_idx].index_type; + const char *idx_name = NULL; + if (idx_type == VEC0_INDEX_TYPE_DISKANN) idx_name = "DiskANN"; +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + else if (idx_type == VEC0_INDEX_TYPE_IVF) idx_name = "IVF"; +#endif + if (idx_name) { + vtab_set_error( + &p->base, + "UPDATE on vector column \"%.*s\" is not supported for %s indexes.", + p->vector_columns[vector_idx].name_length, + p->vector_columns[vector_idx].name, + idx_name); + return SQLITE_ERROR; + } + } + rc = vec0Update_UpdateVectorColumn(p, chunk_id, chunk_offset, vector_idx, - valueVector); + valueVector, rowid); if (rc != SQLITE_OK) { return SQLITE_ERROR; } @@ -9258,6 +10279,31 @@ static int vec0Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, } // INSERT operation else if (argc > 1 && sqlite3_value_type(argv[0]) == SQLITE_NULL) { + vec0_vtab *p = (vec0_vtab *)pVTab; + // FTS5-style command dispatch via hidden column named after table + if (p->hasCommandColumn) { + sqlite3_value *cmdVal = argv[2 + vec0_column_command_idx(p)]; + if (sqlite3_value_type(cmdVal) == SQLITE_TEXT) { + const char *cmd = (const char *)sqlite3_value_text(cmdVal); + int cmdRc = SQLITE_EMPTY; +#if SQLITE_VEC_ENABLE_RESCORE + cmdRc = rescore_handle_command(p, cmd); +#endif +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + if (cmdRc == SQLITE_EMPTY) + cmdRc = ivf_handle_command(p, cmd, argc, argv); +#endif +#if SQLITE_VEC_ENABLE_DISKANN + if (cmdRc == SQLITE_EMPTY) + cmdRc = diskann_handle_command(p, cmd); +#endif + if (cmdRc == SQLITE_EMPTY) { + vtab_set_error(pVTab, "unknown vec0 command: '%s'", cmd); + return SQLITE_ERROR; + } + return cmdRc; + } + } return vec0Update_Insert(pVTab, argc, argv, pRowid); } // UPDATE operation @@ -9357,6 +10403,163 @@ static int vec0Rollback(sqlite3_vtab *pVTab) { return SQLITE_OK; } +/** + * xRename implementation for vec0. 
+ * Renames all shadow tables to match the new virtual table name, + * then updates cached table names and finalizes stale prepared statements. + */ +static int vec0Rename(sqlite3_vtab *pVtab, const char *zNew) { + vec0_vtab *p = (vec0_vtab *)pVtab; + int rc = SQLITE_OK; + + // Build a single SQL string with ALTER TABLE RENAME for every shadow table. + sqlite3_str *s = sqlite3_str_new(p->db); + + // Core shadow tables (always present) + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_info\" RENAME TO \"%w_info\";", + p->schemaName, p->tableName, zNew); + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_rowids\" RENAME TO \"%w_rowids\";", + p->schemaName, p->tableName, zNew); + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_chunks\" RENAME TO \"%w_chunks\";", + p->schemaName, p->tableName, zNew); + + // Auxiliary shadow table (only if auxiliary columns exist) + if (p->numAuxiliaryColumns > 0) { + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_auxiliary\" RENAME TO \"%w_auxiliary\";", + p->schemaName, p->tableName, zNew); + } + + // Per-vector-column shadow tables + for (int i = 0; i < p->numVectorColumns; i++) { + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_vector_chunks%02d\" RENAME TO \"%w_vector_chunks%02d\";", + p->schemaName, p->tableName, i, zNew, i); + +#if SQLITE_VEC_ENABLE_RESCORE + if (p->shadowRescoreChunksNames[i]) { + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_rescore_chunks%02d\" RENAME TO \"%w_rescore_chunks%02d\";", + p->schemaName, p->tableName, i, zNew, i); + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_rescore_vectors%02d\" RENAME TO \"%w_rescore_vectors%02d\";", + p->schemaName, p->tableName, i, zNew, i); + } +#endif + +#if SQLITE_VEC_ENABLE_DISKANN + if (p->shadowVectorsNames[i]) { + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_vectors%02d\" RENAME TO \"%w_vectors%02d\";", + p->schemaName, p->tableName, i, zNew, i); + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_diskann_nodes%02d\" RENAME TO 
\"%w_diskann_nodes%02d\";", + p->schemaName, p->tableName, i, zNew, i); + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_diskann_buffer%02d\" RENAME TO \"%w_diskann_buffer%02d\";", + p->schemaName, p->tableName, i, zNew, i); + } +#endif + } + +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->shadowIvfCellsNames[i]) { + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_ivf_cells%02d\" RENAME TO \"%w_ivf_cells%02d\";", + p->schemaName, p->tableName, i, zNew, i); + } + } +#endif + + // Per-metadata-column shadow tables + for (int i = 0; i < p->numMetadataColumns; i++) { + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_metadatachunks%02d\" RENAME TO \"%w_metadatachunks%02d\";", + p->schemaName, p->tableName, i, zNew, i); + if (p->metadata_columns[i].kind == VEC0_METADATA_COLUMN_KIND_TEXT) { + sqlite3_str_appendf(s, + "ALTER TABLE \"%w\".\"%w_metadatatext%02d\" RENAME TO \"%w_metadatatext%02d\";", + p->schemaName, p->tableName, i, zNew, i); + } + } + + char *zSql = sqlite3_str_finish(s); + if (!zSql) { + return SQLITE_NOMEM; + } + + rc = sqlite3_exec(p->db, zSql, 0, 0, 0); + sqlite3_free(zSql); + if (rc != SQLITE_OK) { + return rc; + } + + // Finalize all prepared statements — they reference old table names. 
+ vec0_free_resources(p); + + // Update cached table name + sqlite3_free(p->tableName); + p->tableName = sqlite3_mprintf("%s", zNew); + if (!p->tableName) return SQLITE_NOMEM; + + // Update cached shadow table names + sqlite3_free(p->shadowRowidsName); + p->shadowRowidsName = sqlite3_mprintf("%s_rowids", zNew); + + sqlite3_free(p->shadowChunksName); + p->shadowChunksName = sqlite3_mprintf("%s_chunks", zNew); + + for (int i = 0; i < p->numVectorColumns; i++) { + sqlite3_free(p->shadowVectorChunksNames[i]); + p->shadowVectorChunksNames[i] = + sqlite3_mprintf("%s_vector_chunks%02d", zNew, i); + +#if SQLITE_VEC_ENABLE_RESCORE + if (p->shadowRescoreChunksNames[i]) { + sqlite3_free(p->shadowRescoreChunksNames[i]); + p->shadowRescoreChunksNames[i] = + sqlite3_mprintf("%s_rescore_chunks%02d", zNew, i); + sqlite3_free(p->shadowRescoreVectorsNames[i]); + p->shadowRescoreVectorsNames[i] = + sqlite3_mprintf("%s_rescore_vectors%02d", zNew, i); + } +#endif + +#if SQLITE_VEC_ENABLE_DISKANN + if (p->shadowVectorsNames[i]) { + sqlite3_free(p->shadowVectorsNames[i]); + p->shadowVectorsNames[i] = + sqlite3_mprintf("%s_vectors%02d", zNew, i); + sqlite3_free(p->shadowDiskannNodesNames[i]); + p->shadowDiskannNodesNames[i] = + sqlite3_mprintf("%s_diskann_nodes%02d", zNew, i); + } +#endif + } + +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->shadowIvfCellsNames[i]) { + sqlite3_free(p->shadowIvfCellsNames[i]); + p->shadowIvfCellsNames[i] = + sqlite3_mprintf("%s_ivf_cells%02d", zNew, i); + } + } +#endif + + for (int i = 0; i < p->numMetadataColumns; i++) { + sqlite3_free(p->shadowMetadataChunksNames[i]); + p->shadowMetadataChunksNames[i] = + sqlite3_mprintf("%s_metadatachunks%02d", zNew, i); + } + + return SQLITE_OK; +} + static sqlite3_module vec0Module = { /* iVersion */ 3, /* xCreate */ vec0Create, @@ -9377,7 +10580,7 @@ static sqlite3_module vec0Module = { /* xCommit */ vec0Commit, /* xRollback */ vec0Rollback, /* xFindFunction */ 0, - 
/* xRename */ 0, // https://github.com/asg017/sqlite-vec/issues/43 + /* xRename */ vec0Rename, /* xSavepoint */ 0, /* xRelease */ 0, /* xRollbackTo */ 0, @@ -9388,652 +10591,6 @@ static sqlite3_module vec0Module = { }; #pragma endregion -static char *POINTER_NAME_STATIC_BLOB_DEF = "vec0-static_blob_def"; -struct static_blob_definition { - void *p; - size_t dimensions; - size_t nvectors; - enum VectorElementType element_type; -}; -static void vec_static_blob_from_raw(sqlite3_context *context, int argc, - sqlite3_value **argv) { - - assert(argc == 4); - struct static_blob_definition *p; - p = sqlite3_malloc(sizeof(*p)); - if (!p) { - sqlite3_result_error_nomem(context); - return; - } - memset(p, 0, sizeof(*p)); - p->p = (void *)sqlite3_value_int64(argv[0]); - p->element_type = SQLITE_VEC_ELEMENT_TYPE_FLOAT32; - p->dimensions = sqlite3_value_int64(argv[2]); - p->nvectors = sqlite3_value_int64(argv[3]); - sqlite3_result_pointer(context, p, POINTER_NAME_STATIC_BLOB_DEF, - sqlite3_free); -} -#pragma region vec_static_blobs() table function - -#define MAX_STATIC_BLOBS 16 - -typedef struct static_blob static_blob; -struct static_blob { - char *name; - void *p; - size_t dimensions; - size_t nvectors; - enum VectorElementType element_type; -}; - -typedef struct vec_static_blob_data vec_static_blob_data; -struct vec_static_blob_data { - static_blob static_blobs[MAX_STATIC_BLOBS]; -}; - -typedef struct vec_static_blobs_vtab vec_static_blobs_vtab; -struct vec_static_blobs_vtab { - sqlite3_vtab base; - vec_static_blob_data *data; -}; - -typedef struct vec_static_blobs_cursor vec_static_blobs_cursor; -struct vec_static_blobs_cursor { - sqlite3_vtab_cursor base; - sqlite3_int64 iRowid; -}; - -static int vec_static_blobsConnect(sqlite3 *db, void *pAux, int argc, - const char *const *argv, - sqlite3_vtab **ppVtab, char **pzErr) { - UNUSED_PARAMETER(argc); - UNUSED_PARAMETER(argv); - UNUSED_PARAMETER(pzErr); - - vec_static_blobs_vtab *pNew; -#define VEC_STATIC_BLOBS_NAME 0 -#define 
VEC_STATIC_BLOBS_DATA 1 -#define VEC_STATIC_BLOBS_DIMENSIONS 2 -#define VEC_STATIC_BLOBS_COUNT 3 - int rc = sqlite3_declare_vtab( - db, "CREATE TABLE x(name, data, dimensions hidden, count hidden)"); - if (rc == SQLITE_OK) { - pNew = sqlite3_malloc(sizeof(*pNew)); - *ppVtab = (sqlite3_vtab *)pNew; - if (pNew == 0) - return SQLITE_NOMEM; - memset(pNew, 0, sizeof(*pNew)); - pNew->data = pAux; - } - return rc; -} - -static int vec_static_blobsDisconnect(sqlite3_vtab *pVtab) { - vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)pVtab; - sqlite3_free(p); - return SQLITE_OK; -} - -static int vec_static_blobsUpdate(sqlite3_vtab *pVTab, int argc, - sqlite3_value **argv, sqlite_int64 *pRowid) { - UNUSED_PARAMETER(pRowid); - vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)pVTab; - // DELETE operation - if (argc == 1 && sqlite3_value_type(argv[0]) != SQLITE_NULL) { - return SQLITE_ERROR; - } - // INSERT operation - else if (argc > 1 && sqlite3_value_type(argv[0]) == SQLITE_NULL) { - const char *key = - (const char *)sqlite3_value_text(argv[2 + VEC_STATIC_BLOBS_NAME]); - int idx = -1; - for (int i = 0; i < MAX_STATIC_BLOBS; i++) { - if (!p->data->static_blobs[i].name) { - p->data->static_blobs[i].name = sqlite3_mprintf("%s", key); - idx = i; - break; - } - } - if (idx < 0) - abort(); - struct static_blob_definition *def = sqlite3_value_pointer( - argv[2 + VEC_STATIC_BLOBS_DATA], POINTER_NAME_STATIC_BLOB_DEF); - p->data->static_blobs[idx].p = def->p; - p->data->static_blobs[idx].dimensions = def->dimensions; - p->data->static_blobs[idx].nvectors = def->nvectors; - p->data->static_blobs[idx].element_type = def->element_type; - - return SQLITE_OK; - } - // UPDATE operation - else if (argc > 1 && sqlite3_value_type(argv[0]) != SQLITE_NULL) { - return SQLITE_ERROR; - } - return SQLITE_ERROR; -} - -static int vec_static_blobsOpen(sqlite3_vtab *p, - sqlite3_vtab_cursor **ppCursor) { - UNUSED_PARAMETER(p); - vec_static_blobs_cursor *pCur; - pCur = sqlite3_malloc(sizeof(*pCur)); 
- if (pCur == 0) - return SQLITE_NOMEM; - memset(pCur, 0, sizeof(*pCur)); - *ppCursor = &pCur->base; - return SQLITE_OK; -} - -static int vec_static_blobsClose(sqlite3_vtab_cursor *cur) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - sqlite3_free(pCur); - return SQLITE_OK; -} - -static int vec_static_blobsBestIndex(sqlite3_vtab *pVTab, - sqlite3_index_info *pIdxInfo) { - UNUSED_PARAMETER(pVTab); - pIdxInfo->idxNum = 1; - pIdxInfo->estimatedCost = (double)10; - pIdxInfo->estimatedRows = 10; - return SQLITE_OK; -} - -static int vec_static_blobsNext(sqlite3_vtab_cursor *cur); -static int vec_static_blobsFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum, - const char *idxStr, int argc, - sqlite3_value **argv) { - UNUSED_PARAMETER(idxNum); - UNUSED_PARAMETER(idxStr); - UNUSED_PARAMETER(argc); - UNUSED_PARAMETER(argv); - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)pVtabCursor; - pCur->iRowid = -1; - vec_static_blobsNext(pVtabCursor); - return SQLITE_OK; -} - -static int vec_static_blobsRowid(sqlite3_vtab_cursor *cur, - sqlite_int64 *pRowid) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - *pRowid = pCur->iRowid; - return SQLITE_OK; -} - -static int vec_static_blobsNext(sqlite3_vtab_cursor *cur) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)pCur->base.pVtab; - pCur->iRowid++; - while (pCur->iRowid < MAX_STATIC_BLOBS) { - if (p->data->static_blobs[pCur->iRowid].name) { - return SQLITE_OK; - } - pCur->iRowid++; - } - return SQLITE_OK; -} - -static int vec_static_blobsEof(sqlite3_vtab_cursor *cur) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - return pCur->iRowid >= MAX_STATIC_BLOBS; -} - -static int vec_static_blobsColumn(sqlite3_vtab_cursor *cur, - sqlite3_context *context, int i) { - vec_static_blobs_cursor *pCur = (vec_static_blobs_cursor *)cur; - vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)cur->pVtab; - 
switch (i) { - case VEC_STATIC_BLOBS_NAME: - sqlite3_result_text(context, p->data->static_blobs[pCur->iRowid].name, -1, - SQLITE_TRANSIENT); - break; - case VEC_STATIC_BLOBS_DATA: - sqlite3_result_null(context); - break; - case VEC_STATIC_BLOBS_DIMENSIONS: - sqlite3_result_int64(context, - p->data->static_blobs[pCur->iRowid].dimensions); - break; - case VEC_STATIC_BLOBS_COUNT: - sqlite3_result_int64(context, p->data->static_blobs[pCur->iRowid].nvectors); - break; - } - return SQLITE_OK; -} - -static sqlite3_module vec_static_blobsModule = { - /* iVersion */ 3, - /* xCreate */ 0, - /* xConnect */ vec_static_blobsConnect, - /* xBestIndex */ vec_static_blobsBestIndex, - /* xDisconnect */ vec_static_blobsDisconnect, - /* xDestroy */ 0, - /* xOpen */ vec_static_blobsOpen, - /* xClose */ vec_static_blobsClose, - /* xFilter */ vec_static_blobsFilter, - /* xNext */ vec_static_blobsNext, - /* xEof */ vec_static_blobsEof, - /* xColumn */ vec_static_blobsColumn, - /* xRowid */ vec_static_blobsRowid, - /* xUpdate */ vec_static_blobsUpdate, - /* xBegin */ 0, - /* xSync */ 0, - /* xCommit */ 0, - /* xRollback */ 0, - /* xFindMethod */ 0, - /* xRename */ 0, - /* xSavepoint */ 0, - /* xRelease */ 0, - /* xRollbackTo */ 0, - /* xShadowName */ 0, -#if SQLITE_VERSION_NUMBER >= 3044000 - /* xIntegrity */ 0 -#endif -}; -#pragma endregion - -#pragma region vec_static_blob_entries() table function - -typedef struct vec_static_blob_entries_vtab vec_static_blob_entries_vtab; -struct vec_static_blob_entries_vtab { - sqlite3_vtab base; - static_blob *blob; -}; -typedef enum { - VEC_SBE__QUERYPLAN_FULLSCAN = 1, - VEC_SBE__QUERYPLAN_KNN = 2 -} vec_sbe_query_plan; - -struct sbe_query_knn_data { - i64 k; - i64 k_used; - // Array of rowids of size k. Must be freed with sqlite3_free(). - i32 *rowids; - // Array of distances of size k. Must be freed with sqlite3_free(). 
- f32 *distances; - i64 current_idx; -}; -void sbe_query_knn_data_clear(struct sbe_query_knn_data *knn_data) { - if (!knn_data) - return; - - if (knn_data->rowids) { - sqlite3_free(knn_data->rowids); - knn_data->rowids = NULL; - } - if (knn_data->distances) { - sqlite3_free(knn_data->distances); - knn_data->distances = NULL; - } -} - -typedef struct vec_static_blob_entries_cursor vec_static_blob_entries_cursor; -struct vec_static_blob_entries_cursor { - sqlite3_vtab_cursor base; - sqlite3_int64 iRowid; - vec_sbe_query_plan query_plan; - struct sbe_query_knn_data *knn_data; -}; - -static int vec_static_blob_entriesConnect(sqlite3 *db, void *pAux, int argc, - const char *const *argv, - sqlite3_vtab **ppVtab, char **pzErr) { - UNUSED_PARAMETER(argc); - UNUSED_PARAMETER(argv); - UNUSED_PARAMETER(pzErr); - vec_static_blob_data *blob_data = pAux; - int idx = -1; - for (int i = 0; i < MAX_STATIC_BLOBS; i++) { - if (!blob_data->static_blobs[i].name) - continue; - if (strncmp(blob_data->static_blobs[i].name, argv[3], - strlen(blob_data->static_blobs[i].name)) == 0) { - idx = i; - break; - } - } - if (idx < 0) - abort(); - vec_static_blob_entries_vtab *pNew; -#define VEC_STATIC_BLOB_ENTRIES_VECTOR 0 -#define VEC_STATIC_BLOB_ENTRIES_DISTANCE 1 -#define VEC_STATIC_BLOB_ENTRIES_K 2 - int rc = sqlite3_declare_vtab( - db, "CREATE TABLE x(vector, distance hidden, k hidden)"); - if (rc == SQLITE_OK) { - pNew = sqlite3_malloc(sizeof(*pNew)); - *ppVtab = (sqlite3_vtab *)pNew; - if (pNew == 0) - return SQLITE_NOMEM; - memset(pNew, 0, sizeof(*pNew)); - pNew->blob = &blob_data->static_blobs[idx]; - } - return rc; -} - -static int vec_static_blob_entriesCreate(sqlite3 *db, void *pAux, int argc, - const char *const *argv, - sqlite3_vtab **ppVtab, char **pzErr) { - return vec_static_blob_entriesConnect(db, pAux, argc, argv, ppVtab, pzErr); -} - -static int vec_static_blob_entriesDisconnect(sqlite3_vtab *pVtab) { - vec_static_blob_entries_vtab *p = (vec_static_blob_entries_vtab *)pVtab; - 
sqlite3_free(p); - return SQLITE_OK; -} - -static int vec_static_blob_entriesOpen(sqlite3_vtab *p, - sqlite3_vtab_cursor **ppCursor) { - UNUSED_PARAMETER(p); - vec_static_blob_entries_cursor *pCur; - pCur = sqlite3_malloc(sizeof(*pCur)); - if (pCur == 0) - return SQLITE_NOMEM; - memset(pCur, 0, sizeof(*pCur)); - *ppCursor = &pCur->base; - return SQLITE_OK; -} - -static int vec_static_blob_entriesClose(sqlite3_vtab_cursor *cur) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - sqlite3_free(pCur->knn_data); - sqlite3_free(pCur); - return SQLITE_OK; -} - -static int vec_static_blob_entriesBestIndex(sqlite3_vtab *pVTab, - sqlite3_index_info *pIdxInfo) { - vec_static_blob_entries_vtab *p = (vec_static_blob_entries_vtab *)pVTab; - int iMatchTerm = -1; - int iLimitTerm = -1; - // int iRowidTerm = -1; // https://github.com/asg017/sqlite-vec/issues/47 - int iKTerm = -1; - - for (int i = 0; i < pIdxInfo->nConstraint; i++) { - if (!pIdxInfo->aConstraint[i].usable) - continue; - - int iColumn = pIdxInfo->aConstraint[i].iColumn; - int op = pIdxInfo->aConstraint[i].op; - if (op == SQLITE_INDEX_CONSTRAINT_MATCH && - iColumn == VEC_STATIC_BLOB_ENTRIES_VECTOR) { - if (iMatchTerm > -1) { - // https://github.com/asg017/sqlite-vec/issues/51 - return SQLITE_ERROR; - } - iMatchTerm = i; - } - if (op == SQLITE_INDEX_CONSTRAINT_LIMIT) { - iLimitTerm = i; - } - if (op == SQLITE_INDEX_CONSTRAINT_EQ && - iColumn == VEC_STATIC_BLOB_ENTRIES_K) { - iKTerm = i; - } - } - if (iMatchTerm >= 0) { - if (iLimitTerm < 0 && iKTerm < 0) { - // https://github.com/asg017/sqlite-vec/issues/51 - return SQLITE_ERROR; - } - if (iLimitTerm >= 0 && iKTerm >= 0) { - return SQLITE_ERROR; // limit or k, not both - } - if (pIdxInfo->nOrderBy < 1) { - vtab_set_error(pVTab, "ORDER BY distance required"); - return SQLITE_CONSTRAINT; - } - if (pIdxInfo->nOrderBy > 1) { - // https://github.com/asg017/sqlite-vec/issues/51 - vtab_set_error(pVTab, "more than 1 ORDER BY clause provided"); - 
return SQLITE_CONSTRAINT; - } - if (pIdxInfo->aOrderBy[0].iColumn != VEC_STATIC_BLOB_ENTRIES_DISTANCE) { - vtab_set_error(pVTab, "ORDER BY must be on the distance column"); - return SQLITE_CONSTRAINT; - } - if (pIdxInfo->aOrderBy[0].desc) { - vtab_set_error(pVTab, - "Only ascending in ORDER BY distance clause is supported, " - "DESC is not supported yet."); - return SQLITE_CONSTRAINT; - } - - pIdxInfo->idxNum = VEC_SBE__QUERYPLAN_KNN; - pIdxInfo->estimatedCost = (double)10; - pIdxInfo->estimatedRows = 10; - - pIdxInfo->orderByConsumed = 1; - pIdxInfo->aConstraintUsage[iMatchTerm].argvIndex = 1; - pIdxInfo->aConstraintUsage[iMatchTerm].omit = 1; - if (iLimitTerm >= 0) { - pIdxInfo->aConstraintUsage[iLimitTerm].argvIndex = 2; - pIdxInfo->aConstraintUsage[iLimitTerm].omit = 1; - } else { - pIdxInfo->aConstraintUsage[iKTerm].argvIndex = 2; - pIdxInfo->aConstraintUsage[iKTerm].omit = 1; - } - - } else { - pIdxInfo->idxNum = VEC_SBE__QUERYPLAN_FULLSCAN; - pIdxInfo->estimatedCost = (double)p->blob->nvectors; - pIdxInfo->estimatedRows = p->blob->nvectors; - } - return SQLITE_OK; -} - -static int vec_static_blob_entriesFilter(sqlite3_vtab_cursor *pVtabCursor, - int idxNum, const char *idxStr, - int argc, sqlite3_value **argv) { - UNUSED_PARAMETER(idxStr); - assert(argc >= 0 && argc <= 3); - vec_static_blob_entries_cursor *pCur = - (vec_static_blob_entries_cursor *)pVtabCursor; - vec_static_blob_entries_vtab *p = - (vec_static_blob_entries_vtab *)pCur->base.pVtab; - - if (idxNum == VEC_SBE__QUERYPLAN_KNN) { - assert(argc == 2); - pCur->query_plan = VEC_SBE__QUERYPLAN_KNN; - struct sbe_query_knn_data *knn_data; - knn_data = sqlite3_malloc(sizeof(*knn_data)); - if (!knn_data) { - return SQLITE_NOMEM; - } - memset(knn_data, 0, sizeof(*knn_data)); - - void *queryVector; - size_t dimensions; - enum VectorElementType elementType; - vector_cleanup cleanup; - char *err; - int rc = vector_from_value(argv[0], &queryVector, &dimensions, &elementType, - &cleanup, &err); - if (rc != 
SQLITE_OK) { - return SQLITE_ERROR; - } - if (elementType != p->blob->element_type) { - return SQLITE_ERROR; - } - if (dimensions != p->blob->dimensions) { - return SQLITE_ERROR; - } - - i64 k = min(sqlite3_value_int64(argv[1]), (i64)p->blob->nvectors); - if (k < 0) { - // HANDLE https://github.com/asg017/sqlite-vec/issues/55 - return SQLITE_ERROR; - } - if (k == 0) { - knn_data->k = 0; - pCur->knn_data = knn_data; - return SQLITE_OK; - } - - size_t bsize = (p->blob->nvectors + 7) & ~7; - - i32 *topk_rowids = sqlite3_malloc(k * sizeof(i32)); - if (!topk_rowids) { - // HANDLE https://github.com/asg017/sqlite-vec/issues/55 - return SQLITE_ERROR; - } - f32 *distances = sqlite3_malloc(bsize * sizeof(f32)); - if (!distances) { - // HANDLE https://github.com/asg017/sqlite-vec/issues/55 - return SQLITE_ERROR; - } - - for (size_t i = 0; i < p->blob->nvectors; i++) { - // https://github.com/asg017/sqlite-vec/issues/52 - float *v = ((float *)p->blob->p) + (i * p->blob->dimensions); - distances[i] = - distance_l2_sqr_float(v, (float *)queryVector, &p->blob->dimensions); - } - u8 *candidates = bitmap_new(bsize); - assert(candidates); - - u8 *taken = bitmap_new(bsize); - assert(taken); - - bitmap_fill(candidates, bsize); - for (size_t i = bsize; i >= p->blob->nvectors; i--) { - bitmap_set(candidates, i, 0); - } - i32 k_used = 0; - min_idx(distances, bsize, candidates, topk_rowids, k, taken, &k_used); - knn_data->current_idx = 0; - knn_data->distances = distances; - knn_data->k = k; - knn_data->rowids = topk_rowids; - - pCur->knn_data = knn_data; - } else { - pCur->query_plan = VEC_SBE__QUERYPLAN_FULLSCAN; - pCur->iRowid = 0; - } - - return SQLITE_OK; -} - -static int vec_static_blob_entriesRowid(sqlite3_vtab_cursor *cur, - sqlite_int64 *pRowid) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - switch (pCur->query_plan) { - case VEC_SBE__QUERYPLAN_FULLSCAN: { - *pRowid = pCur->iRowid; - return SQLITE_OK; - } - case VEC_SBE__QUERYPLAN_KNN: { - 
i32 rowid = ((i32 *)pCur->knn_data->rowids)[pCur->knn_data->current_idx]; - *pRowid = (sqlite3_int64)rowid; - return SQLITE_OK; - } - } - return SQLITE_ERROR; -} - -static int vec_static_blob_entriesNext(sqlite3_vtab_cursor *cur) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - switch (pCur->query_plan) { - case VEC_SBE__QUERYPLAN_FULLSCAN: { - pCur->iRowid++; - return SQLITE_OK; - } - case VEC_SBE__QUERYPLAN_KNN: { - pCur->knn_data->current_idx++; - return SQLITE_OK; - } - } - return SQLITE_ERROR; -} - -static int vec_static_blob_entriesEof(sqlite3_vtab_cursor *cur) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - vec_static_blob_entries_vtab *p = - (vec_static_blob_entries_vtab *)pCur->base.pVtab; - switch (pCur->query_plan) { - case VEC_SBE__QUERYPLAN_FULLSCAN: { - return (size_t)pCur->iRowid >= p->blob->nvectors; - } - case VEC_SBE__QUERYPLAN_KNN: { - return pCur->knn_data->current_idx >= pCur->knn_data->k; - } - } - return SQLITE_ERROR; -} - -static int vec_static_blob_entriesColumn(sqlite3_vtab_cursor *cur, - sqlite3_context *context, int i) { - vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur; - vec_static_blob_entries_vtab *p = (vec_static_blob_entries_vtab *)cur->pVtab; - - switch (pCur->query_plan) { - case VEC_SBE__QUERYPLAN_FULLSCAN: { - switch (i) { - case VEC_STATIC_BLOB_ENTRIES_VECTOR: - - sqlite3_result_blob( - context, - ((unsigned char *)p->blob->p) + - (pCur->iRowid * p->blob->dimensions * sizeof(float)), - p->blob->dimensions * sizeof(float), SQLITE_TRANSIENT); - sqlite3_result_subtype(context, p->blob->element_type); - break; - } - return SQLITE_OK; - } - case VEC_SBE__QUERYPLAN_KNN: { - switch (i) { - case VEC_STATIC_BLOB_ENTRIES_VECTOR: { - i32 rowid = ((i32 *)pCur->knn_data->rowids)[pCur->knn_data->current_idx]; - sqlite3_result_blob(context, - ((unsigned char *)p->blob->p) + - (rowid * p->blob->dimensions * sizeof(float)), - p->blob->dimensions 
* sizeof(float), - SQLITE_TRANSIENT); - sqlite3_result_subtype(context, p->blob->element_type); - break; - } - } - return SQLITE_OK; - } - } - return SQLITE_ERROR; -} - -static sqlite3_module vec_static_blob_entriesModule = { - /* iVersion */ 3, - /* xCreate */ - vec_static_blob_entriesCreate, // handle rm? - // https://github.com/asg017/sqlite-vec/issues/55 - /* xConnect */ vec_static_blob_entriesConnect, - /* xBestIndex */ vec_static_blob_entriesBestIndex, - /* xDisconnect */ vec_static_blob_entriesDisconnect, - /* xDestroy */ vec_static_blob_entriesDisconnect, - /* xOpen */ vec_static_blob_entriesOpen, - /* xClose */ vec_static_blob_entriesClose, - /* xFilter */ vec_static_blob_entriesFilter, - /* xNext */ vec_static_blob_entriesNext, - /* xEof */ vec_static_blob_entriesEof, - /* xColumn */ vec_static_blob_entriesColumn, - /* xRowid */ vec_static_blob_entriesRowid, - /* xUpdate */ 0, - /* xBegin */ 0, - /* xSync */ 0, - /* xCommit */ 0, - /* xRollback */ 0, - /* xFindMethod */ 0, - /* xRename */ 0, - /* xSavepoint */ 0, - /* xRelease */ 0, - /* xRollbackTo */ 0, - /* xShadowName */ 0, -#if SQLITE_VERSION_NUMBER >= 3044000 - /* xIntegrity */ 0 -#endif -}; -#pragma endregion #ifdef SQLITE_VEC_ENABLE_AVX #define SQLITE_VEC_DEBUG_BUILD_AVX "avx" @@ -10045,9 +10602,28 @@ static sqlite3_module vec_static_blob_entriesModule = { #else #define SQLITE_VEC_DEBUG_BUILD_NEON "" #endif +#if SQLITE_VEC_ENABLE_RESCORE +#define SQLITE_VEC_DEBUG_BUILD_RESCORE "rescore" +#else +#define SQLITE_VEC_DEBUG_BUILD_RESCORE "" +#endif + +#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE +#define SQLITE_VEC_DEBUG_BUILD_IVF "ivf" +#else +#define SQLITE_VEC_DEBUG_BUILD_IVF "" +#endif + +#if SQLITE_VEC_ENABLE_DISKANN +#define SQLITE_VEC_DEBUG_BUILD_DISKANN "diskann" +#else +#define SQLITE_VEC_DEBUG_BUILD_DISKANN "" +#endif #define SQLITE_VEC_DEBUG_BUILD \ - SQLITE_VEC_DEBUG_BUILD_AVX " " SQLITE_VEC_DEBUG_BUILD_NEON + SQLITE_VEC_DEBUG_BUILD_AVX " " SQLITE_VEC_DEBUG_BUILD_NEON " " \ + 
SQLITE_VEC_DEBUG_BUILD_RESCORE " " SQLITE_VEC_DEBUG_BUILD_IVF " " \ + SQLITE_VEC_DEBUG_BUILD_DISKANN #define SQLITE_VEC_DEBUG_STRING \ "Version: " SQLITE_VEC_VERSION "\n" \ @@ -10139,55 +10715,4 @@ SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg, return SQLITE_OK; } -#ifndef SQLITE_VEC_OMIT_FS -SQLITE_VEC_API int sqlite3_vec_numpy_init(sqlite3 *db, char **pzErrMsg, - const sqlite3_api_routines *pApi) { - UNUSED_PARAMETER(pzErrMsg); -#ifndef SQLITE_CORE - SQLITE_EXTENSION_INIT2(pApi); -#endif - int rc = SQLITE_OK; - rc = sqlite3_create_function_v2(db, "vec_npy_file", 1, SQLITE_RESULT_SUBTYPE, - NULL, vec_npy_file, NULL, NULL, NULL); - if(rc != SQLITE_OK) { - return rc; - } - rc = sqlite3_create_module_v2(db, "vec_npy_each", &vec_npy_eachModule, NULL, NULL); - return rc; -} -#endif -SQLITE_VEC_API int -sqlite3_vec_static_blobs_init(sqlite3 *db, char **pzErrMsg, - const sqlite3_api_routines *pApi) { - UNUSED_PARAMETER(pzErrMsg); -#ifndef SQLITE_CORE - SQLITE_EXTENSION_INIT2(pApi); -#endif - - int rc = SQLITE_OK; - vec_static_blob_data *static_blob_data; - static_blob_data = sqlite3_malloc(sizeof(*static_blob_data)); - if (!static_blob_data) { - return SQLITE_NOMEM; - } - memset(static_blob_data, 0, sizeof(*static_blob_data)); - - rc = sqlite3_create_function_v2( - db, "vec_static_blob_from_raw", 4, - DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, NULL, - vec_static_blob_from_raw, NULL, NULL, NULL); - if (rc != SQLITE_OK) - return rc; - - rc = sqlite3_create_module_v2(db, "vec_static_blobs", &vec_static_blobsModule, - static_blob_data, sqlite3_free); - if (rc != SQLITE_OK) - return rc; - rc = sqlite3_create_module_v2(db, "vec_static_blob_entries", - &vec_static_blob_entriesModule, - static_blob_data, NULL); - if (rc != SQLITE_OK) - return rc; - return rc; -} diff --git a/tests/__snapshots__/test-auxiliary.ambr b/tests/__snapshots__/test-auxiliary.ambr index 66a3ef3..7313faf 100644 --- a/tests/__snapshots__/test-auxiliary.ambr +++ 
b/tests/__snapshots__/test-auxiliary.ambr @@ -169,6 +169,200 @@ }), }) # --- +# name: test_diskann_aux_insert_knn[diskann aux select all] + OrderedDict({ + 'sql': 'SELECT rowid, label FROM t ORDER BY rowid', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'label': 'red', + }), + OrderedDict({ + 'rowid': 2, + 'label': 'green', + }), + OrderedDict({ + 'rowid': 3, + 'label': 'blue', + }), + ]), + }) +# --- +# name: test_diskann_aux_insert_knn[diskann aux shadow contents] + dict({ + 't_auxiliary': OrderedDict({ + 'sql': 'select * from t_auxiliary', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'value00': 'red', + }), + OrderedDict({ + 'rowid': 2, + 'value00': 'green', + }), + OrderedDict({ + 'rowid': 3, + 'value00': 'blue', + }), + ]), + }), + 't_chunks': OrderedDict({ + 'sql': 'select * from t_chunks', + 'rows': list([ + ]), + }), + 't_diskann_buffer00': OrderedDict({ + 'sql': 'select * from t_diskann_buffer00', + 'rows': list([ + ]), + }), + 't_diskann_nodes00': OrderedDict({ + 'sql': 'select * from t_diskann_nodes00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'neighbors_validity': b'\x03', + 'neighbor_ids': b'\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'neighbor_quantized_vectors': b'\x02\x04\x00\x00\x00\x00\x00\x00', + }), + OrderedDict({ + 'rowid': 2, + 'neighbors_validity': b'\x03', + 'neighbor_ids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'neighbor_quantized_vectors': b'\x01\x04\x00\x00\x00\x00\x00\x00', + }), + OrderedDict({ + 'rowid': 3, + 'neighbors_validity': b'\x03', + 'neighbor_ids': 
b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'neighbor_quantized_vectors': b'\x01\x02\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 't_rowids': OrderedDict({ + 'sql': 'select * from t_rowids', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'id': None, + 'chunk_id': None, + 'chunk_offset': None, + }), + OrderedDict({ + 'rowid': 2, + 'id': None, + 'chunk_id': None, + 'chunk_offset': None, + }), + OrderedDict({ + 'rowid': 3, + 'id': None, + 'chunk_id': None, + 'chunk_offset': None, + }), + ]), + }), + 't_vectors00': OrderedDict({ + 'sql': 'select * from t_vectors00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vector': b'\x00\x00\x80?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + OrderedDict({ + 'rowid': 2, + 'vector': b'\x00\x00\x00\x00\x00\x00\x80?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + OrderedDict({ + 'rowid': 3, + 'vector': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x80?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + }) +# --- +# name: test_diskann_aux_shadow_tables[diskann aux shadow tables] + OrderedDict({ + 'sql': "SELECT name, sql FROM sqlite_master WHERE type='table' AND name LIKE 't_%' ORDER BY name", + 'rows': list([ + OrderedDict({ + 'name': 't_auxiliary', + 'sql': 'CREATE TABLE "t_auxiliary"( rowid integer PRIMARY KEY , value00, value01)', + }), + OrderedDict({ + 'name': 't_chunks', + 'sql': 'CREATE TABLE "t_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,validity BLOB NOT NULL,rowids BLOB NOT NULL)', + }), + OrderedDict({ + 'name': 't_diskann_buffer00', + 'sql': 'CREATE TABLE "t_diskann_buffer00" (rowid INTEGER 
PRIMARY KEY, vector BLOB NOT NULL)', + }), + OrderedDict({ + 'name': 't_diskann_nodes00', + 'sql': 'CREATE TABLE "t_diskann_nodes00" (rowid INTEGER PRIMARY KEY, neighbors_validity BLOB NOT NULL, neighbor_ids BLOB NOT NULL, neighbor_quantized_vectors BLOB NOT NULL)', + }), + OrderedDict({ + 'name': 't_info', + 'sql': 'CREATE TABLE "t_info" (key text primary key, value any)', + }), + OrderedDict({ + 'name': 't_rowids', + 'sql': 'CREATE TABLE "t_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)', + }), + OrderedDict({ + 'name': 't_vectors00', + 'sql': 'CREATE TABLE "t_vectors00" (rowid INTEGER PRIMARY KEY, vector BLOB NOT NULL)', + }), + ]), + }) +# --- +# name: test_diskann_aux_update_and_delete[diskann aux after update+delete] + OrderedDict({ + 'sql': 'SELECT rowid, label FROM t ORDER BY rowid', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'label': 'item-1', + }), + OrderedDict({ + 'rowid': 2, + 'label': 'UPDATED', + }), + OrderedDict({ + 'rowid': 4, + 'label': 'item-4', + }), + OrderedDict({ + 'rowid': 5, + 'label': 'item-5', + }), + ]), + }) +# --- +# name: test_diskann_aux_update_and_delete[diskann aux shadow after update+delete] + OrderedDict({ + 'sql': 'SELECT rowid, value00 FROM t_auxiliary ORDER BY rowid', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'value00': 'item-1', + }), + OrderedDict({ + 'rowid': 2, + 'value00': 'UPDATED', + }), + OrderedDict({ + 'rowid': 4, + 'value00': 'item-4', + }), + OrderedDict({ + 'rowid': 5, + 'value00': 'item-5', + }), + ]), + }) +# --- # name: test_knn OrderedDict({ 'sql': 'select * from v', @@ -392,6 +586,183 @@ ]), }) # --- +# name: test_rescore_aux_delete[rescore aux after delete] + OrderedDict({ + 'sql': 'SELECT rowid, label FROM t ORDER BY rowid', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'label': 'item-1', + }), + OrderedDict({ + 'rowid': 2, + 'label': 'item-2', + }), + OrderedDict({ + 'rowid': 4, + 'label': 'item-4', + }), + OrderedDict({ + 'rowid': 5, + 'label': 
'item-5', + }), + ]), + }) +# --- +# name: test_rescore_aux_delete[rescore aux shadow after delete] + OrderedDict({ + 'sql': 'SELECT rowid, value00 FROM t_auxiliary ORDER BY rowid', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'value00': 'item-1', + }), + OrderedDict({ + 'rowid': 2, + 'value00': 'item-2', + }), + OrderedDict({ + 'rowid': 4, + 'value00': 'item-4', + }), + OrderedDict({ + 'rowid': 5, + 'value00': 'item-5', + }), + ]), + }) +# --- +# name: test_rescore_aux_insert_knn[rescore aux select all] + OrderedDict({ + 'sql': 'SELECT rowid, label FROM t ORDER BY rowid', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'label': 'alpha', + }), + OrderedDict({ + 'rowid': 2, + 'label': 'beta', + }), + OrderedDict({ + 'rowid': 3, + 'label': 'gamma', + }), + ]), + }) +# --- +# name: test_rescore_aux_insert_knn[rescore aux shadow contents] + dict({ + 't_auxiliary': OrderedDict({ + 'sql': 'select * from t_auxiliary', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'value00': 'alpha', + }), + OrderedDict({ + 'rowid': 2, + 'value00': 'beta', + }), + OrderedDict({ + 'rowid': 3, + 'value00': 'gamma', + }), + ]), + }), + 't_chunks': OrderedDict({ + 'sql': 'select * from t_chunks', + 'rows': list([ + OrderedDict({ + 'chunk_id': 1, + 'size': 1024, + 'validity': b'\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'rowids': 
b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 't_rescore_chunks00': OrderedDict({ + 'sql': 'select * from t_rescore_chunks00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vectors': b'=\xf3<\xef\xf1H\x85\xa57\x16v\xe6/\x86\x7f\xace\x96\x11|1\x18a\xd8\x15\x1c&\x02z\x9e\xeb\x12\xa4\xd7\xd2i\x89\xc4\x18A>\xa2\x9bT\xcd=\xd9i\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 't_rescore_vectors00': OrderedDict({ + 'sql': 'select * from t_rescore_vectors00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vector': b']\x1c\x8a>\xc4\x9eX\xbf\xceY\xe3=w\x9b\xed?\nQZ?\xdc\x9f@?\x80u\xa4\xbf\x16q\xfa\xbeB\x9b\\?B\x13\x8f?\x80\xd1\xf9\xbf\x10\x0c\xf9\xbb\xf8\x1c\x1e@G\xbd\xe3>\xea\x03[?\xecc\xcf?\x12\x03\xf3\xbf\xa1\xd4\xb2\xbd\x8f\x99\xb8>\x92\xb5M?\xa1\xc9\xbd?/\x90g?\xe1u)\xbf\x94\xe8\x12\xbe\xd6\xba\x16>\'i\xc4>\xc3?\xca>\t\x0e\x9c?\xf0\xa8\xb8\xbf\x1b\x98\x18?\xa8<\xd9??\xdca?\x89\xad\xd0?\x88\xb8<\xbe<\xfa\xab\xbd#\xfc\xf8\xbf\x1e\x90k?\xa0\xec\x1f?a\x1a\xc4?\xc0yU?\xcc\x11\xec\xbe(\xad\xe5\xbd\xfbx\xd0\xbfA\xd3\x16?1\xc5\xf6\xbe\xdcn\'\xbe\x00\xe6\xa0>\t\xd4\x06\xbeD\xfb\xbe?\x1a,\x10\xc0\x80\x8a\x83<\xd2\x8f\x1a\xc0\xf0\xab0\xbf\xfaD\x0b\xbe\x18`\xce\xbf\xa43\x91>\xd0\x13\xea=\x1cpz\xbf\x9ai\x81>\x1d+\xb2\xbd\xb1:\x91\xbe\r\x9e\xf4;|"\xf2\xbfA\x0c\x16@+\x92\t@\x99\x9e\xfb?&\xb9\xa1?_v[\xbf\x98\xb7\x87?\xfe"\xc7>%]#\xbe\xa0\xf2\xd5\xbe\x9e"\x06\xbe\x1dz\xd6>\x84\xa2\x9d?\xd7\xb3\xec\xbf\xbbJ\xbd?\xbd\xebD\xbf\xdd.\xa3\xbe1\xcd\'\xbe\xa2\xf9\xd6\xbfL\xa7I>\xef\x17\x0f<(0n\xbe\xbe\xaf\xdb>\x7fo\xb5<\xcah\xdf>d\x00f\xbe\xb1\x85`\xbf\x95 
9?\xd1\xeb\xcd>gk\xb8\xbe\x18\xd3\xfe\xbd1\x80\xdb?\x8b\x86\x03?\x1a\xb7\x9d>\xadM\x1f@\x04\xa0\xca>tc\xfc=\x186\x96?7.\x03\xc0V\xfa\xf8?\xf2\xf2\xa3\xbe1\xa1\xa8\xbf\x06\xb1I\xbfs\xcbT=\xda\xe5}>\xcd@\xca\xbe\x1ee\x1a\xbf\x02H\x14\xbf\x99\xef8\xbeG\xd9\x8a?&\xdc\xff>O\xf8\x9e?\xbd+4>\x9d\xa4\xab=PB\x8c>fs\xac>\x8b\xb4\\?q\xe2S\xbf^\x9a@\xbf\xe7\x7f\xc8\xbf\xbb\x9e\x9f?\xc2\xa0\x07@\xe2mT\xbf@\xf1v\x89n\xbf\xfb\xe2T\xbc\xd4\xd2\xff?,o\xaf?\x0c+\xb6\xbf#\x84|\xbf\x80\xc8\xfd?9\x97\xdb>oa\'\xc0\x8f\xa9\x00>[\xc3\x83\xc0d\xe2\xd2\xbf\xba\xeai>1\x14s>\xe3\x11\x99\xbf\xd9j\x81\xbf\xb3\x1e\xe1\xbcS;)?\x86\x987\xbf\t\xf4\xe4\xbc\xb8f\xa4\xbf\x1c\x83k\xbe|*T\xbf\x00\xd8\xa8?\xc4\x966?2\x14\x14?H\xfe>?\xbd\xbb\x7f=\xcb\x1c\x9f\xbe\xe5\xad\x90?U\x085\xbf\xde{\xb2\xbf\x1f\x03\x10\xbf\x19\xd6J>b\xb9\x97=\xc18z\xbe\xe76\xa7\xbf\xed\x80\x98\xbf\xf5\x12\xb7\xbf\x86(\x9f\xbdaY\x16?\x07j\xb1>\x9ea\x8c\xbd\x91(\xb2\xbf\xe1\xa1\x0c\xbe\xc4_\xd1>\x8c\xad\xf2\xbdc\xab\xf4\xbd\x81<\xc6\xbe}\xa7\xaa\xbfk\xe4\xcb>\xcd89>dk\x81\xbe%\xa4\xb0\xbbAU\x11\xbfG\n\x15\xc0\xb6m\xcb?\xafoq>0\x17\xa5\xbe?j\x81>\xbet\xee;\xc0\xc2\\=S\x81\x8c\xbe2T\xca> \xbe\xe2\xbf1\xd2w?\xed\xfd\x08\xc0\x01\x17\xa0\xbf\x99o\xb1\xbfRX\xb7\xbf\x06f\xca\xbeD\x9e\x92?\x86W\t?\x03G}?\xdd\xbfv\xbdd\xf0\x0c\xbe\xf8\x8a\x1c\xbf\xd8\xc9\x9e\xbfy/\x13>\x802\xdc= 5`\xbf\x00\xf3"\xbf\x99\x92\x01>7 \x06\xc0{\xd7\x8d\xbf\xa5/\x8e\xbf\\\x82u?\x17M\x1e\xbf\xcf\xbbk\xbe\xc3\x84i\xbe\xf1\xa4&\xbfD\xb4\x1a\xc0au\x06\xc0:\xbc\x04\xbf\x0cK\xb2?mdD\xbf\xfa\xa4\x9b?\x89w\xd9>\xde\xb7\x8c?!e\x1a>\xc3\x05-\xbf\x11\xce\xdf\xbe!\xf3\x10?\xab!P?\x96\xbc\x9f?]\x1c\x19?f\x97\x88\xbf\xddRM\xbf,)\x8e>7\x14$>}\x8d\x18@O%\xc0\xbf/\xa5C>`B\xe5\xbd\xb71\x1b?s\x11V?3\xa2F=\x13\xaf\x87\xbf\xe2X\x17?\xa7\xc8\x91\xbf\x19^\x83\xbf\xc6w\xa4?[H\xa1\xbf\x17M2\xbf\xfb\x7f\xd5\xbf', + }), + OrderedDict({ + 'rowid': 3, + 'vector': 
b'b\xd9\x8b\xbf9\x81\x96\xbe\x83h\xe9?\\\xa4\x89\xbf\x93\xff\xc1\xbf6\xfe\xa8?\xd8\x19\xc1\xbf*\xf06?\xb2\x0c\xaa?2\\P>\xd9\xf0\x81?K~\xb4\xbe\x05\x00\x85?\xcfg\x8c\xbf?\x93\xca>\xf82Q=\x00\x8e\xa5\xbf\xf3:\x15@\xbc\x9c>\xbf}\x13\x90\xbf5z\x17?w17\xbf\xaf\xde(?<\x00]?M\xff\x00?\n7\xa8\xbe\x83kU\xbf-R\x1a=\xa1\xbc\x8c\xbe\xf7.\xb0>\xf1W$?\xe3\xb1\xd8\xbeZ\x89\x19>\x08\x0b\x19\xbf8\xbfK\xbe\t\x12\xcb=P\xd5\x81\xbf8C!\xbf$\xaa\x1b\xbf1\x8f\x8e>\xbb\x1c2\xbe"\x88R\xbekR\x86?\xb3\xfa\xee\xbe\x1aAN\xbf|\xca,\xbf\x0c{z>\x97;\n=Q>4\xbey\x12\x92\xbf[\xa1]\xbe\t\x93\x9c?\xf5\xbcR?\x1cj]\xbf\xa5w\xa8\xbf\xf5\xc1\x1b\xbej\xb25>5\xf48\xbe\x87\xe4-\xbf x\x8f\xbfoC\x01\xbe\xe7:\x16\xbf\x1c\xf1W?\xf6\x1e\xc8\xbf\xe9&\xd5\xbec^\x19@\x19\n\x98?My\xd0>\xa3\xa4\r?\xfc#\xab>\xf7\x1a\x81\xbf\xbb\xe8\x98\xbf\xb0]>\xbff\x92\xc7=\xb3\x16\x86\xbf4\xdc\x9b\xbe\xe2\xd4C\xbfi\xbb/?r\x0fo\xbf\xb8\xd8g>$\x9cW?\xfd\xb0P?\x05\r\xc0\xbeC\x08\xde?Pz\xcb?\x88\xd5\xe9\xbe\xd4\x07\x0c\xbf\x16s\x7f?\xf9=K\xbd\x9378\xbfI\xd6\xbb=\xe7j\x92\xbe\xeb\xfa\x9f?\x9d\x9d\x83\xbe4\xbbK>\xcf\x82\xab\xbfv\x98\x1c?a"Z\xbf\xaf/\x12?\xfe4\xbc>\x84\xed\x91\xbd\xeb\x857\xc0\x90\x89">\x05t\x92?\x1b\x00(>F>V\xbf\x84\x12\x1e>\xcb\xae\xd8?\xc0S+?\x95Z\x1a?\xbe\x93x\xbf>\xfe\'\xbf`\xa4\x8b?\xca\x08\xba\xbe\x89\xc2\n\xbf\xec2\xb2>\x1c\xb3\x04>w\xc0\x95\xbe\x94\xf0r?\xb5\x08\xc4=\x15~\x84>:\xc78\xbfV-\xdb\xbe\xaf\xde\xb2=\xd8\xc8\xe1\xbe\x06\xf9\x14@^\x16\x92>bk\xb1\xbe', + }), + ]), + }), + 't_rowids': OrderedDict({ + 'sql': 'select * from t_rowids', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 0, + }), + OrderedDict({ + 'rowid': 2, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 1, + }), + OrderedDict({ + 'rowid': 3, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 2, + }), + ]), + }), + }) +# --- +# name: test_rescore_aux_shadow_tables[rescore aux shadow tables] + OrderedDict({ + 'sql': "SELECT name, sql FROM sqlite_master WHERE type='table' AND name LIKE 't_%' ORDER BY 
name", + 'rows': list([ + OrderedDict({ + 'name': 't_auxiliary', + 'sql': 'CREATE TABLE "t_auxiliary"( rowid integer PRIMARY KEY , value00, value01)', + }), + OrderedDict({ + 'name': 't_chunks', + 'sql': 'CREATE TABLE "t_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,validity BLOB NOT NULL,rowids BLOB NOT NULL)', + }), + OrderedDict({ + 'name': 't_info', + 'sql': 'CREATE TABLE "t_info" (key text primary key, value any)', + }), + OrderedDict({ + 'name': 't_rescore_chunks00', + 'sql': 'CREATE TABLE "t_rescore_chunks00"(rowid PRIMARY KEY, vectors BLOB NOT NULL)', + }), + OrderedDict({ + 'name': 't_rescore_vectors00', + 'sql': 'CREATE TABLE "t_rescore_vectors00"(rowid INTEGER PRIMARY KEY, vector BLOB NOT NULL)', + }), + OrderedDict({ + 'name': 't_rowids', + 'sql': 'CREATE TABLE "t_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)', + }), + ]), + }) +# --- # name: test_types OrderedDict({ 'sql': 'select * from v', diff --git a/tests/__snapshots__/test-metadata.ambr b/tests/__snapshots__/test-metadata.ambr index ff7b112..e5ffaf2 100644 --- a/tests/__snapshots__/test-metadata.ambr +++ b/tests/__snapshots__/test-metadata.ambr @@ -27,8 +27,8 @@ OrderedDict({ 'chunk_id': 1, 'size': 8, - 'validity': b'\x06', - 'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'validity': b'\x02', + 'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -37,7 +37,7 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'data': b'\x06', + 'data': b'\x02', }), ]), }), @@ -46,7 +46,7 @@ 'rows': 
list([ OrderedDict({ 'rowid': 1, - 'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -55,7 +55,7 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x9a\x99\x99\x99\x99\x99\x01@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -64,17 +64,13 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00test2\x00\x00\x00\x00\x00\x00\x00\r\x00\x00\x00123456789012\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'data': 
b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00test2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), 'v_metadatatext03': OrderedDict({ 'sql': 'select * from v_metadatatext03', 'rows': list([ - OrderedDict({ - 'rowid': 3, - 'data': '1234567890123', - }), ]), }), 'v_rowids': OrderedDict({ @@ -86,12 +82,6 @@ 'chunk_id': 1, 'chunk_offset': 1, }), - OrderedDict({ - 'rowid': 3, - 'id': None, - 'chunk_id': 1, - 'chunk_offset': 2, - }), ]), }), 'v_vector_chunks00': OrderedDict({ @@ -99,7 +89,7 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'vectors': b'\x00\x00\x00\x00""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'vectors': b'\x00\x00\x00\x00""""\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -370,14 +360,6 @@ 'f': 2.2, 't': 'test2', }), - OrderedDict({ - 'rowid': 3, - 'vector': b'3333', - 'b': 1, - 'n': 3, - 'f': 3.3, - 't': '1234567890123', - }), ]), }) # --- diff --git a/tests/conftest.py b/tests/conftest.py index 9549d37..3a24468 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,29 @@ import pytest import sqlite3 +import os + + +def _vec_debug(): + db = sqlite3.connect(":memory:") + db.enable_load_extension(True) + db.load_extension("dist/vec0") + db.enable_load_extension(False) + return db.execute("SELECT vec_debug()").fetchone()[0] + + +def _has_build_flag(flag): + return flag in _vec_debug().split("Build flags:")[-1] + + +def pytest_collection_modifyitems(config, items): + has_ivf = _has_build_flag("ivf") + if has_ivf: + return + 
skip_ivf = pytest.mark.skip(reason="IVF not enabled (compile with -DSQLITE_VEC_EXPERIMENTAL_IVF_ENABLE=1)") + ivf_prefixes = ("test-ivf",) + for item in items: + if any(item.fspath.basename.startswith(p) for p in ivf_prefixes): + item.add_marker(skip_ivf) @pytest.fixture() diff --git a/tests/correctness/test-correctness.py b/tests/correctness/test-correctness.py index cb01f8f..9ed0319 100644 --- a/tests/correctness/test-correctness.py +++ b/tests/correctness/test-correctness.py @@ -48,7 +48,6 @@ import json db = sqlite3.connect(":memory:") db.enable_load_extension(True) db.load_extension("../../dist/vec0") -db.execute("select load_extension('../../dist/vec0', 'sqlite3_vec_fs_read_init')") db.enable_load_extension(False) results = db.execute( @@ -75,17 +74,21 @@ print(b) db.execute('PRAGMA page_size=16384') -print("Loading into sqlite-vec vec0 table...") -t0 = time.time() -db.execute("create virtual table v using vec0(a float[3072], chunk_size=16)") -db.execute('insert into v select rowid, vector from vec_npy_each(vec_npy_file("dbpedia_openai_3_large_00.npy"))') -print(time.time() - t0) - print("loading numpy array...") t0 = time.time() base = np.load('dbpedia_openai_3_large_00.npy') print(time.time() - t0) +print("Loading into sqlite-vec vec0 table...") +t0 = time.time() +db.execute("create virtual table v using vec0(a float[3072], chunk_size=16)") +with db: + db.executemany( + "insert into v(rowid, a) values (?, ?)", + [(i, row.tobytes()) for i, row in enumerate(base)], + ) +print(time.time() - t0) + np.random.seed(1) queries = base[np.random.choice(base.shape[0], 20, replace=False), :] diff --git a/tests/fixtures/legacy-v0.1.6.db b/tests/fixtures/legacy-v0.1.6.db new file mode 100644 index 0000000..58bd89d Binary files /dev/null and b/tests/fixtures/legacy-v0.1.6.db differ diff --git a/tests/fuzz/.gitignore b/tests/fuzz/.gitignore index 757d1ac..b9c7d30 100644 --- a/tests/fuzz/.gitignore +++ b/tests/fuzz/.gitignore @@ -1,2 +1,7 @@ *.dSYM targets/ +corpus/ 
+crash-* +leak-* +timeout-* +*.log diff --git a/tests/fuzz/Makefile b/tests/fuzz/Makefile index 21629ef..202dc2b 100644 --- a/tests/fuzz/Makefile +++ b/tests/fuzz/Makefile @@ -26,7 +26,7 @@ FUZZ_LDFLAGS ?= $(shell \ echo "-Wl,-ld_classic"; \ fi) -FUZZ_CFLAGS = $(FUZZ_SANITIZERS) -I ../../ -I ../../vendor -DSQLITE_CORE -g $(FUZZ_LDFLAGS) +FUZZ_CFLAGS = $(FUZZ_SANITIZERS) -I ../../ -I ../../vendor -DSQLITE_CORE -DSQLITE_VEC_ENABLE_DISKANN=1 -g $(FUZZ_LDFLAGS) FUZZ_SRCS = ../../vendor/sqlite3.c ../../sqlite-vec.c TARGET_DIR = ./targets @@ -72,10 +72,94 @@ $(TARGET_DIR)/vec_mismatch: vec-mismatch.c $(FUZZ_SRCS) | $(TARGET_DIR) $(TARGET_DIR)/vec0_delete_completeness: vec0-delete-completeness.c $(FUZZ_SRCS) | $(TARGET_DIR) $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ +$(TARGET_DIR)/rescore_operations: rescore-operations.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/rescore_create: rescore-create.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/rescore_quantize: rescore-quantize.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE -DSQLITE_VEC_TEST $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/rescore_shadow_corrupt: rescore-shadow-corrupt.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/rescore_knn_deep: rescore-knn-deep.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/rescore_quantize_edge: rescore-quantize-edge.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE -DSQLITE_VEC_TEST $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/rescore_interleave: rescore-interleave.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) -DSQLITE_VEC_ENABLE_RESCORE $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_create: 
ivf-create.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_operations: ivf-operations.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_quantize: ivf-quantize.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_kmeans: ivf-kmeans.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_shadow_corrupt: ivf-shadow-corrupt.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_knn_deep: ivf-knn-deep.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_cell_overflow: ivf-cell-overflow.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/ivf_rescore: ivf-rescore.c $(FUZZ_SRCS) | $(TARGET_DIR) +$(TARGET_DIR)/diskann_operations: diskann-operations.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/diskann_create: diskann-create.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/diskann_graph_corrupt: diskann-graph-corrupt.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/diskann_deep_search: diskann-deep-search.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/diskann_blob_truncate: diskann-blob-truncate.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/diskann_delete_stress: diskann-delete-stress.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/diskann_buffer_flush: diskann-buffer-flush.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/diskann_int8_quant: diskann-int8-quant.c $(FUZZ_SRCS) | $(TARGET_DIR) + 
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/diskann_prune_direct: diskann-prune-direct.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + +$(TARGET_DIR)/diskann_command_inject: diskann-command-inject.c $(FUZZ_SRCS) | $(TARGET_DIR) + $(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@ + FUZZ_TARGETS = vec0_create exec json numpy \ shadow_corrupt vec0_operations scalar_functions \ vec0_create_full metadata_columns vec_each vec_mismatch \ - vec0_delete_completeness + vec0_delete_completeness \ + rescore_operations rescore_create rescore_quantize \ + rescore_shadow_corrupt rescore_knn_deep \ + rescore_quantize_edge rescore_interleave \ + ivf_create ivf_operations \ + ivf_quantize ivf_kmeans ivf_shadow_corrupt \ + ivf_knn_deep ivf_cell_overflow ivf_rescore + diskann_operations diskann_create diskann_graph_corrupt \ + diskann_deep_search diskann_blob_truncate \ + diskann_delete_stress diskann_buffer_flush \ + diskann_int8_quant diskann_prune_direct \ + diskann_command_inject all: $(addprefix $(TARGET_DIR)/,$(FUZZ_TARGETS)) diff --git a/tests/fuzz/diskann-blob-truncate.c b/tests/fuzz/diskann-blob-truncate.c new file mode 100644 index 0000000..903a0d7 --- /dev/null +++ b/tests/fuzz/diskann-blob-truncate.c @@ -0,0 +1,250 @@ +/** + * Fuzz target for DiskANN shadow table blob size mismatches. + * + * The critical vulnerability: diskann_node_read() copies whatever blob size + * SQLite returns, but diskann_search/insert/delete index into those blobs + * using cfg->n_neighbors * sizeof(i64) etc. If the blob is truncated, + * extended, or has wrong size, this causes out-of-bounds reads/writes. + * + * This fuzzer: + * 1. Creates a valid DiskANN graph with several nodes + * 2. 
Uses fuzz data to directly write malformed blobs to shadow tables: + * - Truncated neighbor_ids (fewer bytes than n_neighbors * 8) + * - Truncated validity bitmaps + * - Oversized blobs with garbage trailing data + * - Zero-length blobs + * - Blobs with valid headers but corrupted neighbor rowids + * 3. Runs INSERT, DELETE, and KNN operations that traverse the corrupted graph + * + * Key code paths targeted: + * - diskann_node_read with mismatched blob sizes + * - diskann_validity_get / diskann_neighbor_id_get on truncated blobs + * - diskann_add_reverse_edge reading corrupted neighbor data + * - diskann_repair_reverse_edges traversing corrupted neighbor lists + * - diskann_search iterating neighbors from corrupted blobs + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { + if (*size == 0) return def; + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 32) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* Use binary quantizer, float[16], n_neighbors=8 for predictable blob sizes: + * validity: 8/8 = 1 byte + * neighbor_ids: 8 * 8 = 64 bytes + * qvecs: 8 * (16/8) = 16 bytes (binary: 2 bytes per qvec) + */ + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + /* Insert 12 vectors to create a valid graph structure */ + { + sqlite3_stmt *stmt; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL); + for (int i = 1; i <= 12; i++) { + float vec[16]; + for (int j = 0; j < 16; j++) { + vec[j] = (float)i * 0.1f + (float)j * 
0.01f; + } + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, i); + sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + } + + /* Now corrupt shadow table blobs using fuzz data */ + const char *columns[] = { + "neighbors_validity", + "neighbor_ids", + "neighbor_quantized_vectors" + }; + + /* Expected sizes for n_neighbors=8, dims=16, binary quantizer */ + int expected_sizes[] = {1, 64, 16}; + + while (size >= 4) { + int target_row = (fuzz_byte(&data, &size, 0) % 12) + 1; + int col_idx = fuzz_byte(&data, &size, 0) % 3; + uint8_t corrupt_mode = fuzz_byte(&data, &size, 0) % 6; + uint8_t extra = fuzz_byte(&data, &size, 0); + + char sqlbuf[256]; + snprintf(sqlbuf, sizeof(sqlbuf), + "UPDATE v_diskann_nodes00 SET %s = ? WHERE rowid = ?", + columns[col_idx]); + + sqlite3_stmt *writeStmt; + rc = sqlite3_prepare_v2(db, sqlbuf, -1, &writeStmt, NULL); + if (rc != SQLITE_OK) continue; + + int expected = expected_sizes[col_idx]; + unsigned char *blob = NULL; + int blob_size = 0; + + switch (corrupt_mode) { + case 0: { + /* Truncated blob: 0 to expected-1 bytes */ + blob_size = extra % expected; + if (blob_size == 0) blob_size = 0; /* zero-length is interesting */ + blob = sqlite3_malloc(blob_size > 0 ? 
blob_size : 1); + if (!blob) { sqlite3_finalize(writeStmt); continue; } + for (int i = 0; i < blob_size; i++) { + blob[i] = fuzz_byte(&data, &size, 0); + } + break; + } + case 1: { + /* Oversized blob: expected + extra bytes */ + blob_size = expected + (extra % 64); + blob = sqlite3_malloc(blob_size); + if (!blob) { sqlite3_finalize(writeStmt); continue; } + for (int i = 0; i < blob_size; i++) { + blob[i] = fuzz_byte(&data, &size, 0xFF); + } + break; + } + case 2: { + /* Zero-length blob */ + blob_size = 0; + blob = NULL; + sqlite3_bind_zeroblob(writeStmt, 1, 0); + sqlite3_bind_int64(writeStmt, 2, target_row); + sqlite3_step(writeStmt); + sqlite3_finalize(writeStmt); + continue; + } + case 3: { + /* Correct size but all-ones validity (all slots "valid") with + * garbage neighbor IDs -- forces reading non-existent nodes */ + blob_size = expected; + blob = sqlite3_malloc(blob_size); + if (!blob) { sqlite3_finalize(writeStmt); continue; } + memset(blob, 0xFF, blob_size); + break; + } + case 4: { + /* neighbor_ids with very large rowid values (near INT64_MAX) */ + blob_size = expected; + blob = sqlite3_malloc(blob_size); + if (!blob) { sqlite3_finalize(writeStmt); continue; } + memset(blob, 0x7F, blob_size); /* fills with large positive values */ + break; + } + case 5: { + /* neighbor_ids with negative rowid values (rowid=0 is sentinel) */ + blob_size = expected; + blob = sqlite3_malloc(blob_size); + if (!blob) { sqlite3_finalize(writeStmt); continue; } + memset(blob, 0x80, blob_size); /* fills with large negative values */ + /* Flip some bytes from fuzz data */ + for (int i = 0; i < blob_size && size > 0; i++) { + blob[i] ^= fuzz_byte(&data, &size, 0); + } + break; + } + } + + if (blob) { + sqlite3_bind_blob(writeStmt, 1, blob, blob_size, SQLITE_TRANSIENT); + } else { + sqlite3_bind_blob(writeStmt, 1, "", 0, SQLITE_STATIC); + } + sqlite3_bind_int64(writeStmt, 2, target_row); + sqlite3_step(writeStmt); + sqlite3_finalize(writeStmt); + sqlite3_free(blob); + } + + /* 
Exercise the corrupted graph with various operations */ + + /* KNN query */ + { + float qvec[16]; + for (int j = 0; j < 16; j++) qvec[j] = (float)j * 0.1f; + sqlite3_stmt *knnStmt; + rc = sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = 5", + -1, &knnStmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(knnStmt, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knnStmt) == SQLITE_ROW) {} + sqlite3_finalize(knnStmt); + } + } + + /* Insert into corrupted graph (triggers add_reverse_edge on corrupted nodes) */ + { + float vec[16]; + for (int j = 0; j < 16; j++) vec[j] = 0.5f; + sqlite3_stmt *stmt; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL); + if (stmt) { + sqlite3_bind_int64(stmt, 1, 100); + sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + } + } + + /* Delete from corrupted graph (triggers repair_reverse_edges) */ + { + sqlite3_stmt *stmt; + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmt, NULL); + if (stmt) { + sqlite3_bind_int64(stmt, 1, 5); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + } + } + + /* Another KNN to traverse the post-mutation graph */ + { + float qvec[16]; + for (int j = 0; j < 16; j++) qvec[j] = -0.5f + (float)j * 0.07f; + sqlite3_stmt *knnStmt; + rc = sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
AND k = 12", + -1, &knnStmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(knnStmt, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knnStmt) == SQLITE_ROW) {} + sqlite3_finalize(knnStmt); + } + } + + /* Full scan */ + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-buffer-flush.c b/tests/fuzz/diskann-buffer-flush.c new file mode 100644 index 0000000..f10e100 --- /dev/null +++ b/tests/fuzz/diskann-buffer-flush.c @@ -0,0 +1,164 @@ +/** + * Fuzz target for DiskANN buffered insert and flush paths. + * + * When buffer_threshold > 0, inserts go into a flat buffer table and + * are flushed into the graph in batch. This fuzzer exercises: + * + * - diskann_buffer_write / diskann_buffer_delete / diskann_buffer_exists + * - diskann_flush_buffer (batch graph insertion) + * - diskann_insert with buffer_threshold (batching logic) + * - Buffer-graph merge in vec0Filter_knn_diskann (unflushed vectors + * must be scanned during KNN and merged with graph results) + * - Delete of a buffered (not yet flushed) vector + * - Delete of a graph vector while buffer has pending inserts + * - Interaction: insert to buffer, query (triggers buffer scan), flush, + * query again (now from graph) + * + * The buffer merge path in vec0Filter_knn_diskann is particularly + * interesting because it does a brute-force scan of buffer vectors and + * merges with the top-k from graph search. 
+ */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { /* Pop one input byte, advancing *data and shrinking *size */ + if (*size == 0) return def; /* input exhausted: return the caller-supplied default */ + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { /* Fuzz entry point: replays the input as insert/query/delete ops on a buffered DiskANN table */ + if (size < 16) return 0; + + int rc; + sqlite3 *db; + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* buffer_threshold: small (3-8) to trigger frequent flushes */ + int buf_threshold = 3 + (fuzz_byte(&data, &size, 0) % 6); + int dims = 8; /* keep equal to the length of vec[] below, whose blobs are bound with sizeof(vec) */ + + char sql[512]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] INDEXED BY diskann(" + "neighbor_quantizer=binary, n_neighbors=8, " + "search_list_size=16, buffer_threshold=%d" + "))", dims, buf_threshold); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_stmt *stmtInsert = NULL, *stmtDelete = NULL, *stmtKnn = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
AND k = ?", + -1, &stmtKnn, NULL); + + if (!stmtInsert || !stmtDelete || !stmtKnn) goto cleanup; + + float vec[8]; /* scratch vector reused by every op below */ + int next_rowid = 1; + + while (size >= 2) { /* each iteration starts by reading an opcode byte and a parameter byte */ + uint8_t op = fuzz_byte(&data, &size, 0) % 6; + uint8_t param = fuzz_byte(&data, &size, 0); + + switch (op) { + case 0: { /* Insert: accumulates in buffer until threshold */ + int64_t rowid = next_rowid++; + if (next_rowid > 64) next_rowid = 1; /* wrap around for reuse */ + for (int j = 0; j < dims; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); /* result ignored: failures are acceptable in a fuzz run */ + break; + } + case 1: { /* KNN query while buffer may have unflushed vectors */ + for (int j = 0; j < dims; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f; + } + int k = (param % 10) + 1; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, k); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 2: { /* Delete a potentially-buffered vector */ + int64_t rowid = (int64_t)(param % 64) + 1; + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 3: { /* Insert several at once to trigger flush mid-batch */ + for (int i = 0; i < buf_threshold + 1 && size >= 2; i++) { + int64_t rowid = (int64_t)(fuzz_byte(&data, &size, 0) % 64) + 1; + for (int j = 0; j < dims; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + } + break; + } + case 4: { /* 
Insert then immediately delete (still in buffer) */ + int64_t rowid = (int64_t)(param % 64) + 1; + for (int j = 0; j < dims; j++) vec[j] = 0.1f * param; + 
sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 5: { /* Query with k=0 and k=1 (boundary) */ + for (int j = 0; j < dims; j++) vec[j] = 0.0f; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, param % 2); /* k=0 or k=1 */ + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + } + } + + /* Final query to exercise post-operation state */ + { + float qvec[8] = {1.0f, -1.0f, 0.5f, -0.5f, 0.0f, 0.0f, 0.0f, 0.0f}; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, 20); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + } + +cleanup: /* reached with NULL statements when a prepare failed; sqlite3_finalize(NULL) is a harmless no-op */ + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-command-inject.c b/tests/fuzz/diskann-command-inject.c new file mode 100644 index 0000000..22661bf --- /dev/null +++ b/tests/fuzz/diskann-command-inject.c @@ -0,0 +1,158 @@ +/** + * Fuzz target for DiskANN runtime command dispatch (diskann_handle_command). + * + * The command handler parses strings like "search_list_size_search=42" and + * modifies live DiskANN config. 
This fuzzer exercises: + * + * - atoi on fuzz-controlled strings (integer overflow, negative, non-numeric) + * - strncmp boundary with fuzz data (near-matches to valid commands) + * - Changing search_list_size mid-operation (affects subsequent queries) + * - Setting search_list_size to 1 (minimum - single-candidate beam search) + * - Setting search_list_size very large (memory pressure) + * - Interleaving command changes with inserts and queries + * + * NOTE(review): an "UPDATE v SET command = ?" path was listed here, but this target only issues INSERT INTO v(v) VALUES (?) -- confirm. 
+ */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { /* Pop one input byte, advancing *data and shrinking *size */ + if (*size == 0) return def; /* input exhausted: return the caller-supplied default */ + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { /* Fuzz entry point: interleaves command strings with inserts and KNN queries */ + if (size < 20) return 0; + + int rc; + sqlite3 *db; + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + /* Insert some vectors first (rowids 1..8) */ + { + sqlite3_stmt *stmt; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL); /* parameter 1 is the rowid bound below; v(v) is the command channel, see stmtCmd */ + for (int i = 1; i <= 8; i++) { + float vec[8]; + for (int j = 0; j < 8; j++) vec[j] = (float)i * 0.1f + (float)j * 0.01f; + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, i); + sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + } + + sqlite3_stmt *stmtCmd = NULL; + sqlite3_stmt *stmtInsert = NULL; + sqlite3_stmt *stmtKnn = NULL; + + /* Commands are dispatched via INSERT INTO t(t) VALUES ('cmd_string') */ + sqlite3_prepare_v2(db, + "INSERT INTO v(v) VALUES (?)", -1, &stmtCmd, NULL); + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); /* vector insert keyed by rowid, matching the int64 bound in case 3 */ + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
AND k = ?", + -1, &stmtKnn, NULL); + + if (!stmtCmd || !stmtInsert || !stmtKnn) goto cleanup; + + /* Fuzz-driven command + operation interleaving */ + while (size >= 2) { + uint8_t op = fuzz_byte(&data, &size, 0) % 5; + + switch (op) { + case 0: { /* Send fuzz command string */ + int cmd_len = fuzz_byte(&data, &size, 0) % 64; + char cmd[65] = {0}; /* zero-filled so the text is terminated even when input runs out before cmd_len bytes are read */ + for (int i = 0; i < cmd_len && size > 0; i++) { + cmd[i] = (char)fuzz_byte(&data, &size, 0); + } + cmd[cmd_len] = '\0'; + sqlite3_reset(stmtCmd); + sqlite3_bind_text(stmtCmd, 1, cmd, -1, SQLITE_TRANSIENT); /* -1: length is taken up to the first NUL */ + sqlite3_step(stmtCmd); /* May fail -- that's expected */ + break; + } + case 1: { /* Send valid-looking command with fuzz value */ + const char *prefixes[] = { + "search_list_size=", + "search_list_size_search=", + "search_list_size_insert=", + }; + int prefix_idx = fuzz_byte(&data, &size, 0) % 3; + int val = (int)(int8_t)fuzz_byte(&data, &size, 0); /* signed reinterpretation: values span -128..127 */ + + char cmd[128]; + snprintf(cmd, sizeof(cmd), "%s%d", prefixes[prefix_idx], val); + sqlite3_reset(stmtCmd); + sqlite3_bind_text(stmtCmd, 1, cmd, -1, SQLITE_TRANSIENT); + sqlite3_step(stmtCmd); + break; + } + case 2: { /* KNN query (uses whatever search_list_size is set) */ + float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + qvec[0] = (float)((int8_t)fuzz_byte(&data, &size, 127)) / 10.0f; + int k = fuzz_byte(&data, &size, 3) % 10 + 1; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, k); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 3: { /* Insert (uses whatever search_list_size_insert is set) */ + int64_t rowid = (int64_t)(fuzz_byte(&data, &size, 0) % 32) + 1; + float vec[8]; + for (int j = 0; j < 8; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 
10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + 
case 4: { /* Set search_list_size to extreme values */ + const char *extreme_cmds[] = { + "search_list_size=1", + "search_list_size=2", + "search_list_size=1000", + "search_list_size_search=1", + "search_list_size_insert=1", + }; + int idx = fuzz_byte(&data, &size, 0) % 5; + sqlite3_reset(stmtCmd); + sqlite3_bind_text(stmtCmd, 1, extreme_cmds[idx], -1, SQLITE_STATIC); + sqlite3_step(stmtCmd); + break; + } + } + } + +cleanup: /* reached with NULL statements when a prepare failed; sqlite3_finalize(NULL) is a harmless no-op */ + sqlite3_finalize(stmtCmd); + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtKnn); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-create.c b/tests/fuzz/diskann-create.c new file mode 100644 index 0000000..1b40a84 --- /dev/null +++ b/tests/fuzz/diskann-create.c @@ -0,0 +1,44 @@ +/** + * Fuzz target for DiskANN CREATE TABLE config parsing. + * Feeds fuzz data as the INDEXED BY diskann(...) option string. + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size > 4096) return 0; /* Limit input size */ + + int rc; + sqlite3 *db; + sqlite3_stmt *stmt; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + sqlite3_str *s = sqlite3_str_new(NULL); + assert(s); + sqlite3_str_appendall(s, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[64] INDEXED BY diskann("); + sqlite3_str_appendf(s, "%.*s", (int)size, data); /* data is not NUL-terminated; the precision caps the read at size bytes */ + sqlite3_str_appendall(s, "))"); + const char *zSql = sqlite3_str_finish(s); + assert(zSql); + + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, NULL); + sqlite3_free((char *)zSql); + if (rc == SQLITE_OK) { + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-deep-search.c 
b/tests/fuzz/diskann-deep-search.c new file mode 100644 index 0000000..35d548c --- /dev/null +++ b/tests/fuzz/diskann-deep-search.c @@ -0,0 +1,187 @@ +/** + * Fuzz 
target for DiskANN greedy beam search deep paths. + * + * Builds a graph with enough nodes to force multi-hop traversal, then + * uses fuzz data to control: query vector values, k, search_list_size + * overrides, and interleaved insert/delete/query sequences that stress + * the candidate list growth, visited set hash collisions, and the + * re-ranking logic. + * + * Key code paths targeted: + * - diskann_candidate_list_insert (sorted insert, dedup, eviction) + * - diskann_visited_set (hash collisions, capacity) + * - diskann_search (full beam search loop, re-ranking with exact dist) + * - diskann_distance_quantized_precomputed (both binary and int8) + * - Buffer merge in vec0Filter_knn_diskann + */ +#include +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/* Consume one byte from fuzz input, or return default. */ +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { + if (*size == 0) return def; + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +static uint16_t fuzz_u16(const uint8_t **data, size_t *size) { /* Two bytes, low byte first. NOTE(review): no caller in this file */ + uint8_t lo = fuzz_byte(data, size, 0); + uint8_t hi = fuzz_byte(data, size, 0); + return (uint16_t)hi << 8 | lo; +} + +static float fuzz_float(const uint8_t **data, size_t *size) { /* One byte read as int8 and scaled by 1/10: range [-12.8, 12.7] */ + return (float)((int8_t)fuzz_byte(data, size, 0)) / 10.0f; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { /* Fuzz entry point: seeds a graph, then replays the input as insert/delete/KNN ops */ + if (size < 32) return 0; + + /* Use first bytes to pick quantizer type and dimensions */ + uint8_t quantizer_choice = fuzz_byte(&data, &size, 0) % 2; + const char *quantizer = quantizer_choice ? "int8" : "binary"; + + /* Dimensions must be divisible by 8. 
Pick from {8, 16, 32} */ + int dim_choices[] = {8, 16, 32}; + int dims = dim_choices[fuzz_byte(&data, &size, 0) % 3]; + + /* n_neighbors: 8 or 16 -- small to force full-neighbor scenarios quickly */ + int n_neighbors = (fuzz_byte(&data, &size, 0) % 2) ? 
16 : 8; + + /* search_list_size: small so beam search terminates quickly but still exercises loops */ + int search_list_size = 8 + (fuzz_byte(&data, &size, 0) % 24); + + /* alpha: vary to test RobustPrune pruning logic. NOTE(review): alpha is never passed to the CREATE statement below, so today this only consumes one input byte -- confirm intent */ + float alpha_choices[] = {1.0f, 1.2f, 1.5f, 2.0f}; + float alpha = alpha_choices[fuzz_byte(&data, &size, 0) % 4]; + + int rc; + sqlite3 *db; + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + char sql[512]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] INDEXED BY diskann(" + "neighbor_quantizer=%s, n_neighbors=%d, " + "search_list_size=%d" + "))", dims, quantizer, n_neighbors, search_list_size); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_stmt *stmtInsert = NULL, *stmtDelete = NULL, *stmtKnn = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + + char knn_sql[256]; + snprintf(knn_sql, sizeof(knn_sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = ?"); + sqlite3_prepare_v2(db, knn_sql, -1, &stmtKnn, NULL); + + if (!stmtInsert || !stmtDelete || !stmtKnn) goto cleanup; + + /* Phase 1: Seed the graph with enough nodes to create multi-hop structure. + * Insert 2*n_neighbors nodes so the graph is dense enough for search + * to actually traverse multiple hops. 
*/ + int seed_count = n_neighbors * 2; + if (seed_count > 64) seed_count = 64; /* Bound for performance (not reached today: n_neighbors is 8 or 16, so seed_count is 16 or 32) */ + { + float *vec = malloc(dims * sizeof(float)); + if (!vec) goto cleanup; + for (int i = 1; i <= seed_count; i++) { + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_float(&data, &size); + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, i); + sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + } + free(vec); + } + + /* Phase 2: Fuzz-driven operations on the seeded graph */ + float *vec = malloc(dims * sizeof(float)); + if (!vec) goto cleanup; + + while (size >= 2) { /* each iteration starts by reading an opcode byte and a parameter byte */ + uint8_t op = fuzz_byte(&data, &size, 0) % 5; + uint8_t param = fuzz_byte(&data, &size, 0); + + switch (op) { + case 0: { /* INSERT with fuzz-controlled vector and rowid */ + int64_t rowid = (int64_t)(param % 128) + 1; + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_float(&data, &size); + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 1: { /* DELETE */ + int64_t rowid = (int64_t)(param % 128) + 1; + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 2: { /* KNN with fuzz query vector and variable k */ + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_float(&data, &size); + } + int k = (param % 20) + 1; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, k); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 3: { /* KNN with k > number of nodes (boundary) */ + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_float(&data, &size); + } + sqlite3_reset(stmtKnn); + 
sqlite3_bind_blob(stmtKnn, 1, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, 1000); /* k >> graph size */ + 
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 4: { /* INSERT at a rowid in 1..32, likely already seeded. NOTE(review): stmtInsert is a plain INSERT with no OR REPLACE clause -- confirm which path a duplicate rowid actually takes */ + int64_t rowid = (int64_t)(param % 32) + 1; + for (int j = 0; j < dims; j++) { + vec[j] = (float)(param + j) / 50.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + } + } + free(vec); + +cleanup: /* reached with NULL statements when a prepare failed; sqlite3_finalize(NULL) is a harmless no-op */ + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-delete-stress.c b/tests/fuzz/diskann-delete-stress.c new file mode 100644 index 0000000..d10a7ff --- /dev/null +++ b/tests/fuzz/diskann-delete-stress.c @@ -0,0 +1,175 @@ +/** + * Fuzz target for DiskANN delete path and graph connectivity maintenance. + * + * The delete path is the most complex graph mutation: + * 1. Read deleted node's neighbor list + * 2. For each neighbor, remove deleted node from their list + * 3. Try to fill the gap with one of deleted node's other neighbors + * 4. 
Handle medoid deletion (pick new medoid) + * + * Edge cases this targets: + * - Delete the medoid (entry point) -- forces medoid reassignment + * - Delete all nodes except one -- graph degenerates + * - Delete nodes in a chain -- cascading dangling edges + * - Re-insert at deleted rowids -- stale graph edges to old data + * - Delete nonexistent rowids -- should be no-op + * - Insert-delete-insert same rowid rapidly + * - Delete when graph has exactly n_neighbors entries (full nodes) + * + * Key code paths: + * - diskann_delete -> diskann_repair_reverse_edges + * - diskann_medoid_handle_delete + * - diskann_node_clear_neighbor + * - Interaction between delete and interleaved search (this target is single-threaded) + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { /* Pop one input byte, advancing *data and shrinking *size */ + if (*size == 0) return def; /* input exhausted: return the caller-supplied default */ + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { /* Fuzz entry point: builds a 10-node graph, then replays a delete-heavy op stream */ + if (size < 20) return 0; + + int rc; + sqlite3 *db; + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* Quantizer comes from the first input byte: int8 (to exercise that distance code path) or binary */ + uint8_t quant = fuzz_byte(&data, &size, 0) % 2; + const char *qname = quant ? 
"int8" : "binary"; + + char sql[256]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[8] INDEXED BY diskann(neighbor_quantizer=%s, n_neighbors=8))", + qname); + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_stmt *stmtInsert = NULL, *stmtDelete = NULL, *stmtKnn = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
AND k = ?", + -1, &stmtKnn, NULL); + + if (!stmtInsert || !stmtDelete || !stmtKnn) goto cleanup; + + /* Phase 1: Build a graph of exactly n_neighbors+2 = 10 nodes. + * This makes every node nearly full, maximizing the chance that + * inserts trigger the "full node" path in add_reverse_edge. */ + for (int i = 1; i <= 10; i++) { + float vec[8]; + for (int j = 0; j < 8; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, (uint8_t)(i*13+j*7))) / 20.0f; /* the default varies with i and j, so vectors still differ once input runs out */ + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, i); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + } + + /* Phase 2: Fuzz-driven delete-heavy workload */ + while (size >= 2) { + uint8_t op = fuzz_byte(&data, &size, 0); + uint8_t param = fuzz_byte(&data, &size, 0); + + switch (op % 6) { + case 0: /* Delete a rowid in 1..16 (may or may not currently exist) */ + case 1: { /* (weighted toward deletes) */ + int64_t rowid = (int64_t)(param % 16) + 1; + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 2: { /* Delete then immediately re-insert same rowid */ + int64_t rowid = (int64_t)(param % 10) + 1; + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + + float vec[8]; + for (int j = 0; j < 8; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, (uint8_t)(rowid+j))) / 15.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 3: { /* KNN query on potentially sparse/empty graph */ + float qvec[8]; + for (int j = 0; j < 8; j++) { + qvec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f; + } + int k = (param % 15) + 1; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), 
SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, k); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + 
case 4: { /* Insert at a rowid in 1..32 (may collide with a row that already exists) */ + int64_t rowid = (int64_t)(param % 32) + 1; + float vec[8]; + for (int j = 0; j < 8; j++) { + vec[j] = (float)((int8_t)fuzz_byte(&data, &size, 0)) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 5: { /* Delete ALL remaining nodes, then insert fresh */ + for (int i = 1; i <= 32; i++) { /* 1..32 covers every rowid this target ever inserts */ + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, i); + sqlite3_step(stmtDelete); + } + /* Now insert one node into empty graph */ + float vec[8] = {1.0f, 0, 0, 0, 0, 0, 0, 0}; + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, 1); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + } + } + + /* Final KNN on whatever state the graph is in */ + { + float qvec[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, 10); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + } + +cleanup: /* reached with NULL statements when a prepare failed; sqlite3_finalize(NULL) is a harmless no-op */ + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-graph-corrupt.c b/tests/fuzz/diskann-graph-corrupt.c new file mode 100644 index 0000000..a8dbc19 --- /dev/null +++ b/tests/fuzz/diskann-graph-corrupt.c @@ -0,0 +1,123 @@ +/** + * Fuzz target for DiskANN shadow table corruption resilience. + * Creates and populates a DiskANN table, then corrupts shadow table blobs + * using fuzz data and runs queries. 
+ */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { /* Fuzz entry point: XOR-corrupts one shadow-table blob, then queries the table */ + if (size < 16) return 0; /* also guarantees the two direct data[] reads below are in bounds */ + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + /* Insert a few vectors to create graph structure */ + { + sqlite3_stmt *stmt; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmt, NULL); + for (int i = 1; i <= 10; i++) { + float vec[8]; + for (int j = 0; j < 8; j++) { + vec[j] = (float)i * 0.1f + (float)j * 0.01f; /* deterministic values: the input only drives the corruption step */ + } + sqlite3_reset(stmt); + sqlite3_bind_int64(stmt, 1, i); + sqlite3_bind_blob(stmt, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + } + + /* Corrupt shadow table data using fuzz bytes */ + size_t offset = 0; + + /* Determine which row and column to corrupt */ + int target_row = (data[offset++] % 10) + 1; + int corrupt_type = data[offset++] % 3; /* 0=validity, 1=neighbor_ids, 2=qvecs */ + + const char *column_name; + switch (corrupt_type) { + case 0: column_name = "neighbors_validity"; break; + case 1: column_name = "neighbor_ids"; break; + default: column_name = "neighbor_quantized_vectors"; break; + } + + /* Read the blob, corrupt it, write it back */ + { + sqlite3_stmt *readStmt; + char sqlbuf[256]; + snprintf(sqlbuf, sizeof(sqlbuf), + "SELECT %s FROM v_diskann_nodes00 WHERE rowid = ?", column_name); /* column_name is one of the 
three literals above, never input text */ + rc = sqlite3_prepare_v2(db, sqlbuf, -1, &readStmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_int64(readStmt, 1, target_row); + if (sqlite3_step(readStmt) == SQLITE_ROW) { + const void *blob = sqlite3_column_blob(readStmt, 0); + int blobSize 
= sqlite3_column_bytes(readStmt, 0); + if (blob && blobSize > 0) { + unsigned char *corrupt = sqlite3_malloc(blobSize); + if (corrupt) { + memcpy(corrupt, blob, blobSize); /* work on a private copy of the column value */ + /* Apply fuzz bytes as XOR corruption */ + size_t remaining = size - offset; + for (size_t i = 0; i < remaining && i < (size_t)blobSize; i++) { /* touches at most the first min(remaining, blobSize) bytes */ + corrupt[i % blobSize] ^= data[offset + i]; /* the modulo is a no-op here: the loop condition already keeps i below blobSize */ + } + /* Write back */ + sqlite3_stmt *writeStmt; + snprintf(sqlbuf, sizeof(sqlbuf), + "UPDATE v_diskann_nodes00 SET %s = ? WHERE rowid = ?", column_name); + rc = sqlite3_prepare_v2(db, sqlbuf, -1, &writeStmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(writeStmt, 1, corrupt, blobSize, SQLITE_TRANSIENT); + sqlite3_bind_int64(writeStmt, 2, target_row); + sqlite3_step(writeStmt); + sqlite3_finalize(writeStmt); + } + sqlite3_free(corrupt); + } + } + } + sqlite3_finalize(readStmt); + } + } + + /* Run queries on corrupted graph -- should not crash */ + { + float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + sqlite3_stmt *knnStmt; + rc = sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = 5", + -1, &knnStmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(knnStmt, 1, qvec, sizeof(qvec), SQLITE_STATIC); /* SQLITE_STATIC: qvec stays in scope until the finalize below */ + while (sqlite3_step(knnStmt) == SQLITE_ROW) {} + sqlite3_finalize(knnStmt); + } + } + + /* Full scan */ + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-int8-quant.c b/tests/fuzz/diskann-int8-quant.c new file mode 100644 index 0000000..f1bd31d --- /dev/null +++ b/tests/fuzz/diskann-int8-quant.c @@ -0,0 +1,164 @@ +/** + * Fuzz target for DiskANN int8 quantizer edge cases. 
+ * + * The binary quantizer is simple (sign bit), but the int8 quantizer has + * interesting arithmetic: + * i8_val = (i8)(((src - (-1.0f)) / step) - 128.0f) + * where step = 2.0f / 255.0f + * + * Edge cases in this formula: + * - src values outside [-1, 1] cause clamping issues (no explicit clamp!) 
+ * - src = NaN, +Inf, -Inf (from corrupted vectors or div-by-zero) + * - src very close to boundaries (-1.0, 1.0) -- rounding + * - The cast to i8 can overflow for extreme src values + * + * Also exercises int8 distance functions: + * - distance_l2_sqr_int8: accumulates squared differences, possible overflow + * - distance_cosine_int8: dot product with normalization + * - distance_l1_int8: absolute differences + * + * This fuzzer also tests the cosine distance metric path which the + * other fuzzers (using L2 default) don't cover. + */ +#include +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { + if (*size == 0) return def; + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +static float fuzz_extreme_float(const uint8_t **data, size_t *size) { + uint8_t mode = fuzz_byte(data, size, 0) % 8; + uint8_t raw = fuzz_byte(data, size, 0); + switch (mode) { + case 0: return (float)((int8_t)raw) / 10.0f; /* Normal range */ + case 1: return (float)((int8_t)raw) * 100.0f; /* Large values */ + case 2: return (float)((int8_t)raw) / 1000.0f; /* Tiny values near 0 */ + case 3: return -1.0f; /* Exact boundary */ + case 4: return 1.0f; /* Exact boundary */ + case 5: return 0.0f; /* Zero */ + case 6: return (float)raw / 255.0f; /* [0, 1] range */ + case 7: return -(float)raw / 255.0f; /* [-1, 0] range */ + } + return 0.0f; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 40) return 0; + + int rc; + sqlite3 *db; + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* Test both distance metrics with int8 quantizer */ + uint8_t metric_choice = fuzz_byte(&data, &size, 0) % 2; + const char *metric = metric_choice ? 
"cosine" : "L2"; + + int dims = 8 + (fuzz_byte(&data, &size, 0) % 3) * 8; /* 8, 16, or 24 */ + + char sql[512]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] distance_metric=%s " + "INDEXED BY diskann(neighbor_quantizer=int8, n_neighbors=8, search_list_size=16))", + dims, metric); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_stmt *stmtInsert = NULL, *stmtKnn = NULL, *stmtDelete = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = ?", + -1, &stmtKnn, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + + if (!stmtInsert || !stmtKnn || !stmtDelete) goto cleanup; + + /* Insert vectors with extreme float values to stress quantization */ + float *vec = malloc(dims * sizeof(float)); + if (!vec) goto cleanup; + + for (int i = 1; i <= 16; i++) { + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_extreme_float(&data, &size); + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, i); + sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + } + + /* Fuzz-driven operations */ + while (size >= 2) { + uint8_t op = fuzz_byte(&data, &size, 0) % 4; + uint8_t param = fuzz_byte(&data, &size, 0); + + switch (op) { + case 0: { /* KNN with extreme query values */ + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_extreme_float(&data, &size); + } + int k = (param % 10) + 1; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, k); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 1: { /* Insert with extreme values */ + int64_t rowid = (int64_t)(param % 32) + 1; + for (int j = 0; j < dims; j++) { + vec[j] = fuzz_extreme_float(&data, &size); + } 
+ sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 2: { /* Delete */ + int64_t rowid = (int64_t)(param % 32) + 1; + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 3: { /* KNN with all-zero or all-same-value query */ + float val = (param % 3 == 0) ? 0.0f : + (param % 3 == 1) ? 1.0f : -1.0f; + for (int j = 0; j < dims; j++) vec[j] = val; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, vec, dims * sizeof(float), SQLITE_TRANSIENT); + sqlite3_bind_int(stmtKnn, 2, 5); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + } + } + + free(vec); + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtKnn); + sqlite3_finalize(stmtDelete); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-operations.c b/tests/fuzz/diskann-operations.c new file mode 100644 index 0000000..b36620b --- /dev/null +++ b/tests/fuzz/diskann-operations.c @@ -0,0 +1,100 @@ +/** + * Fuzz target for DiskANN insert/delete/query operation sequences. + * Uses fuzz bytes to drive random operations on a DiskANN-indexed table. 
+ */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 6) return 0; + + int rc; + sqlite3 *db; + sqlite3_stmt *stmtInsert = NULL; + sqlite3_stmt *stmtDelete = NULL; + sqlite3_stmt *stmtKnn = NULL; + sqlite3_stmt *stmtScan = NULL; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? AND k = 3", + -1, &stmtKnn, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid FROM v", -1, &stmtScan, NULL); + + if (!stmtInsert || !stmtDelete || !stmtKnn || !stmtScan) goto cleanup; + + size_t i = 0; + while (i + 2 <= size) { + uint8_t op = data[i++] % 4; + uint8_t rowid_byte = data[i++]; + int64_t rowid = (int64_t)(rowid_byte % 32) + 1; + + switch (op) { + case 0: { + /* INSERT: consume up to 8 bytes (one int8 per float component), or use what's left */ + float vec[8] = {0}; + for (int j = 0; j < 8 && i < size; j++, i++) { + vec[j] = (float)((int8_t)data[i]) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 1: { + /* DELETE */ + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 2: { + /* KNN query */ + float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + sqlite3_reset(stmtKnn); + 
sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 3: { + /* Full scan */ + sqlite3_reset(stmtScan); + while (sqlite3_step(stmtScan) == SQLITE_ROW) {} + break; + } + } + } + + /* Final operations -- must not crash regardless of prior state */ + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_finalize(stmtScan); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/diskann-prune-direct.c b/tests/fuzz/diskann-prune-direct.c new file mode 100644 index 0000000..7a440ad --- /dev/null +++ b/tests/fuzz/diskann-prune-direct.c @@ -0,0 +1,131 @@ +/** + * Fuzz target for DiskANN RobustPrune algorithm (diskann_prune_select). + * + * diskann_prune_select is exposed for testing and takes: + * - inter_distances: flattened NxN matrix of inter-candidate distances + * - p_distances: N distances from node p to each candidate + * - num_candidates, alpha, max_neighbors + * + * This is a pure function that doesn't need a database, so we can + * call it directly with fuzz-controlled inputs. This gives the fuzzer + * maximum speed (no SQLite overhead) to explore: + * + * - alpha boundary: alpha=0 (prunes nothing), alpha=very large (prunes all) + * - max_neighbors = 0, 1, num_candidates, > num_candidates + * - num_candidates = 0, 1, large + * - Distance matrices with: all zeros, all same, negative values, NaN, Inf + * - Non-symmetric distance matrices (should still work) + * - Memory: large num_candidates to stress malloc + * + * Key code paths: + * - diskann_prune_select alpha-pruning loop + * - Boundary: selectedCount reaches max_neighbors exactly + * - All candidates pruned before max_neighbors reached + */ +#include +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/* Declare the test-exposed function. 
+ * diskann_prune_select is not static -- it's a public symbol. */ +extern int diskann_prune_select( + const float *inter_distances, const float *p_distances, + int num_candidates, float alpha, int max_neighbors, + int *outSelected, int *outCount); + +static uint8_t fuzz_byte(const uint8_t **data, size_t *size, uint8_t def) { + if (*size == 0) return def; + uint8_t b = **data; + (*data)++; + (*size)--; + return b; +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 8) return 0; + + /* Consume parameters from fuzz data */ + int num_candidates = fuzz_byte(&data, &size, 0) % 33; /* 0..32 */ + int max_neighbors = fuzz_byte(&data, &size, 0) % 17; /* 0..16 */ + + /* Alpha: pick from interesting values */ + uint8_t alpha_idx = fuzz_byte(&data, &size, 0) % 8; + float alpha_values[] = {0.0f, 0.5f, 1.0f, 1.2f, 1.5f, 2.0f, 10.0f, 100.0f}; + float alpha = alpha_values[alpha_idx]; + + if (num_candidates == 0) { + /* Test empty case */ + int outCount = -1; + int rc = diskann_prune_select(NULL, NULL, 0, alpha, max_neighbors, + NULL, &outCount); + assert(rc == 0 /* SQLITE_OK */); + assert(outCount == 0); + return 0; + } + + /* Allocate arrays */ + int n = num_candidates; + float *inter_distances = malloc(n * n * sizeof(float)); + float *p_distances = malloc(n * sizeof(float)); + int *outSelected = malloc(n * sizeof(int)); + if (!inter_distances || !p_distances || !outSelected) { + free(inter_distances); + free(p_distances); + free(outSelected); + return 0; + } + + /* Fill p_distances from fuzz data (sorted ascending for correct input) */ + for (int i = 0; i < n; i++) { + uint8_t raw = fuzz_byte(&data, &size, (uint8_t)(i * 10)); + p_distances[i] = (float)raw / 10.0f; + } + /* Sort p_distances ascending (prune_select expects sorted input) */ + for (int i = 1; i < n; i++) { + float tmp = p_distances[i]; + int j = i - 1; + while (j >= 0 && p_distances[j] > tmp) { + p_distances[j + 1] = p_distances[j]; + j--; + } + p_distances[j + 1] = tmp; + } + + /* Fill 
inter-distance matrix from fuzz data */ + for (int i = 0; i < n * n; i++) { + uint8_t raw = fuzz_byte(&data, &size, (uint8_t)(i % 256)); + inter_distances[i] = (float)raw / 10.0f; + } + /* Make diagonal zero */ + for (int i = 0; i < n; i++) { + inter_distances[i * n + i] = 0.0f; + } + + int outCount = -1; + int rc = diskann_prune_select(inter_distances, p_distances, + n, alpha, max_neighbors, + outSelected, &outCount); + /* Basic sanity: should not crash, count should be valid */ + assert(rc == 0); + assert(outCount >= 0); + assert(outCount <= max_neighbors || max_neighbors == 0); + assert(outCount <= n); + + /* Verify outSelected flags are consistent with outCount */ + int flagCount = 0; + for (int i = 0; i < n; i++) { + if (outSelected[i]) flagCount++; + } + assert(flagCount == outCount); + + free(inter_distances); + free(p_distances); + free(outSelected); + return 0; +} diff --git a/tests/fuzz/diskann.dict b/tests/fuzz/diskann.dict new file mode 100644 index 0000000..31d289d --- /dev/null +++ b/tests/fuzz/diskann.dict @@ -0,0 +1,10 @@ +"neighbor_quantizer" +"binary" +"int8" +"n_neighbors" +"search_list_size" +"search_list_size_search" +"search_list_size_insert" +"alpha" +"=" +"," diff --git a/tests/fuzz/ivf-cell-overflow.c b/tests/fuzz/ivf-cell-overflow.c new file mode 100644 index 0000000..65ae6b2 --- /dev/null +++ b/tests/fuzz/ivf-cell-overflow.c @@ -0,0 +1,192 @@ +/** + * Fuzz target: IVF cell overflow and boundary conditions. + * + * Pushes cells past VEC0_IVF_CELL_MAX_VECTORS (64) to trigger cell + * splitting, then exercises blob I/O at slot boundaries. + * + * Targets: + * - Cell splitting when n_vectors reaches cap (64) + * - Blob offset arithmetic: slot * vecSize, slot / 8, slot % 8 + * - Validity bitmap at byte boundaries (slot 7->8, 15->16, etc.) 
+ * - Insert into full cell -> create new cell path + * - Delete from various slot positions (first, last, middle) + * - Multiple cells per centroid + * - assign-vectors command with multi-cell centroids + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 8) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + // Use small dimensions for speed but enough vectors to overflow cells + int dim = (data[0] % 8) + 2; // 2..9 + int nlist = (data[1] % 4) + 1; // 1..4 + // We need >64 vectors to overflow a cell + int num_vecs = (data[2] % 64) + 65; // 65..128 + int delete_pattern = data[3]; // Controls which vectors to delete + + const uint8_t *payload = data + 4; + size_t payload_size = size - 4; + + char sql[256]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] indexed by ivf(nlist=%d, nprobe=%d))", + dim, nlist, nlist); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + // Insert enough vectors to overflow at least one cell + sqlite3_stmt *stmtInsert = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(v, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + if (!stmtInsert) { sqlite3_close(db); return 0; } + + size_t offset = 0; + for (int i = 0; i < num_vecs; i++) { + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + for (int d = 0; d < dim; d++) { + if (offset < payload_size) { + vec[d] = ((float)(int8_t)payload[offset++]) / 50.0f; + } else { + // Cluster vectors near specific centroids to ensure some cells overflow + int cluster = i % nlist; + vec[d] = (float)cluster + (float)(i % 10) * 0.01f + d * 0.001f; + } + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, (int64_t)(i + 1)); + 
sqlite3_bind_blob(stmtInsert, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + sqlite3_free(vec); + } + sqlite3_finalize(stmtInsert); + + // Train to assign vectors to centroids (triggers cell building) + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // Delete vectors at boundary positions based on fuzz data + // This tests validity bitmap manipulation at different slot positions + for (int i = 0; i < num_vecs; i++) { + int byte_idx = i / 8; + if (byte_idx < (int)payload_size && (payload[byte_idx] & (1 << (i % 8)))) { + // Use delete_pattern to thin deletions + if ((delete_pattern + i) % 3 == 0) { + char delsql[64]; + snprintf(delsql, sizeof(delsql), "DELETE FROM v WHERE rowid = %d", i + 1); + sqlite3_exec(db, delsql, NULL, NULL, NULL); + } + } + } + + // Insert more vectors after deletions (into cells with holes) + { + sqlite3_stmt *si = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(v, emb) VALUES (?, ?)", -1, &si, NULL); + if (si) { + for (int i = 0; i < 10; i++) { + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + for (int d = 0; d < dim; d++) + vec[d] = (float)(i + 200) * 0.01f; + sqlite3_reset(si); + sqlite3_bind_int64(si, 1, (int64_t)(num_vecs + i + 1)); + sqlite3_bind_blob(si, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(si); + sqlite3_free(vec); + } + sqlite3_finalize(si); + } + } + + // KNN query that must scan multiple cells per centroid + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = 0.0f; + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
LIMIT 20"); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + } + + // Test assign-vectors with multi-cell state + // First clear centroids + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('clear-centroids')", + NULL, NULL, NULL); + + // Set centroids manually, then assign + for (int c = 0; c < nlist; c++) { + float *cvec = sqlite3_malloc(dim * sizeof(float)); + if (!cvec) break; + for (int d = 0; d < dim; d++) cvec[d] = (float)c + d * 0.1f; + + char cmd[128]; + snprintf(cmd, sizeof(cmd), + "INSERT INTO v(v, emb) VALUES ('set-centroid:%d', ?)", c); + sqlite3_stmt *sc = NULL; + sqlite3_prepare_v2(db, cmd, -1, &sc, NULL); + if (sc) { + sqlite3_bind_blob(sc, 1, cvec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(sc); + sqlite3_finalize(sc); + } + sqlite3_free(cvec); + } + + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('assign-vectors')", + NULL, NULL, NULL); + + // Final query after assign-vectors + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = 1.0f; + sqlite3_stmt *sk = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
LIMIT 5", + -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + } + + // Full scan + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-create.c b/tests/fuzz/ivf-create.c new file mode 100644 index 0000000..222b67b --- /dev/null +++ b/tests/fuzz/ivf-create.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + int rc; + sqlite3 *db; + sqlite3_stmt *stmt; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + sqlite3_str *s = sqlite3_str_new(NULL); + assert(s); + sqlite3_str_appendall(s, "CREATE VIRTUAL TABLE v USING vec0(emb float[4] indexed by ivf("); + sqlite3_str_appendf(s, "%.*s", (int)size, data); + sqlite3_str_appendall(s, "))"); + const char *zSql = sqlite3_str_finish(s); + assert(zSql); + + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, NULL); + sqlite3_free((void *)zSql); + if (rc == SQLITE_OK) { + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-create.dict b/tests/fuzz/ivf-create.dict new file mode 100644 index 0000000..9a014e7 --- /dev/null +++ b/tests/fuzz/ivf-create.dict @@ -0,0 +1,16 @@ +"nlist" +"nprobe" +"quantizer" +"oversample" +"binary" +"int8" +"none" +"=" +"," +"(" +")" +"0" +"1" +"128" +"65536" +"65537" diff --git a/tests/fuzz/ivf-kmeans.c b/tests/fuzz/ivf-kmeans.c new file mode 100644 index 0000000..1d37184 --- /dev/null +++ b/tests/fuzz/ivf-kmeans.c @@ -0,0 +1,180 @@ +/** + * Fuzz target: IVF k-means clustering. 
+ * + * Builds a table, inserts fuzz-controlled vectors, then runs + * compute-centroids with fuzz-controlled parameters (nlist, max_iter, seed). + * Targets: + * - kmeans with N < k (clamping), N == 1, k == 1 + * - kmeans with duplicate/identical vectors (all distances zero) + * - kmeans with NaN/Inf vectors + * - Empty cluster reassignment path (farthest-point heuristic) + * - Large nlist relative to N + * - The compute-centroids:{json} command parsing + * - clear-centroids followed by compute-centroids (round-trip) + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 10) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + // Parse fuzz header + // Byte 0-1: dimension (1..128) + // Byte 2: nlist for CREATE (1..64) + // Byte 3: nlist override for compute-centroids (0 = use default) + // Byte 4: max_iter (1..50) + // Byte 5-8: seed + // Byte 9: num_vectors (1..64) + // Remaining: vector float data + + int dim = (data[0] | (data[1] << 8)) % 128 + 1; + int nlist_create = (data[2] % 64) + 1; + int nlist_override = data[3] % 65; // 0 means use table default + int max_iter = (data[4] % 50) + 1; + uint32_t seed = (uint32_t)data[5] | ((uint32_t)data[6] << 8) | + ((uint32_t)data[7] << 16) | ((uint32_t)data[8] << 24); + int num_vecs = (data[9] % 64) + 1; + + const uint8_t *payload = data + 10; + size_t payload_size = size - 10; + + char sql[256]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] indexed by ivf(nlist=%d, nprobe=%d))", + dim, nlist_create, nlist_create); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + // Insert vectors + sqlite3_stmt *stmtInsert = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(v, emb) 
VALUES (?, ?)", -1, &stmtInsert, NULL); + if (!stmtInsert) { sqlite3_close(db); return 0; } + + size_t offset = 0; + for (int i = 0; i < num_vecs; i++) { + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + + for (int d = 0; d < dim; d++) { + if (offset + 4 <= payload_size) { + memcpy(&vec[d], payload + offset, sizeof(float)); + offset += 4; + } else if (offset < payload_size) { + // Scale to interesting range including values > 1, < -1 + vec[d] = ((float)(int8_t)payload[offset++]) / 5.0f; + } else { + // Payload exhausted: fill remaining dimensions with deterministic synthetic values + vec[d] = (float)(i * dim + d) * 0.01f; + } + } + + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, (int64_t)(i + 1)); + sqlite3_bind_blob(stmtInsert, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + sqlite3_free(vec); + } + sqlite3_finalize(stmtInsert); + + // Exercise compute-centroids with JSON options + { + char cmd[256]; + snprintf(cmd, sizeof(cmd), + "INSERT INTO v(rowid) VALUES " + "('compute-centroids:{\"nlist\":%d,\"max_iterations\":%d,\"seed\":%u}')", + nlist_override, max_iter, seed); + sqlite3_exec(db, cmd, NULL, NULL, NULL); + } + + // KNN query after training + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) { + qvec[d] = (d < 3) ? 1.0f : 0.0f; + } + sqlite3_stmt *stmtKnn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
LIMIT 5", + -1, &stmtKnn, NULL); + if (stmtKnn) { + sqlite3_bind_blob(stmtKnn, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + sqlite3_finalize(stmtKnn); + } + sqlite3_free(qvec); + } + } + + // Clear centroids and re-compute to test round-trip + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('clear-centroids')", + NULL, NULL, NULL); + + // Insert a few more vectors in untrained state + { + sqlite3_stmt *si = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(v, emb) VALUES (?, ?)", -1, &si, NULL); + if (si) { + for (int i = 0; i < 3; i++) { + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + for (int d = 0; d < dim; d++) vec[d] = (float)(i + 100) * 0.1f; + sqlite3_reset(si); + sqlite3_bind_int64(si, 1, (int64_t)(num_vecs + i + 1)); + sqlite3_bind_blob(si, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(si); + sqlite3_free(vec); + } + sqlite3_finalize(si); + } + } + + // Re-train + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // Delete some rows after training, then query + sqlite3_exec(db, "DELETE FROM v WHERE rowid = 1", NULL, NULL, NULL); + sqlite3_exec(db, "DELETE FROM v WHERE rowid = 2", NULL, NULL, NULL); + + // Query after deletes + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = 0.5f; + sqlite3_stmt *stmtKnn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
LIMIT 10", + -1, &stmtKnn, NULL); + if (stmtKnn) { + sqlite3_bind_blob(stmtKnn, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + sqlite3_finalize(stmtKnn); + } + sqlite3_free(qvec); + } + } + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-knn-deep.c b/tests/fuzz/ivf-knn-deep.c new file mode 100644 index 0000000..f5adb1e --- /dev/null +++ b/tests/fuzz/ivf-knn-deep.c @@ -0,0 +1,199 @@ +/** + * Fuzz target: IVF KNN search deep paths. + * + * Exercises the full KNN pipeline with fuzz-controlled: + * - nprobe values (including > nlist, =1, =nlist) + * - Query vectors (including adversarial floats) + * - Mix of trained/untrained state + * - Oversample + rescore path (quantizer=int8 with oversample>1) + * - Multiple interleaved KNN queries + * - Candidate array realloc path (many vectors in probed cells) + * + * Targets: + * - ivf_scan_cells_from_stmt: candidate realloc, distance computation + * - ivf_query_knn: centroid sorting, nprobe selection + * - Oversample rescore: re-ranking with full-precision vectors + * - qsort with NaN distances + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +static uint16_t read_u16(const uint8_t *p) { + return (uint16_t)(p[0] | (p[1] << 8)); +} + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 16) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + // Header + int dim = (data[0] % 32) + 2; // 2..33 + int nlist = (data[1] % 16) + 1; // 1..16 + int nprobe_initial = (data[2] % 20) + 1; // 1..20 (can be > nlist) + int quantizer_type = data[3] % 3; // 0=none, 1=int8, 2=binary + int oversample = (data[4] % 4) + 1; // 1..4 + int num_vecs = (data[5] % 80) + 4; // 4..83 + int num_queries = (data[6] % 8) + 1; // 1..8 + int k_limit = (data[7] % 20) + 1; // 1..20 + + 
const uint8_t *payload = data + 8; + size_t payload_size = size - 8; + + // For binary quantizer, dimension must be multiple of 8 + if (quantizer_type == 2) { + dim = ((dim + 7) / 8) * 8; + if (dim == 0) dim = 8; + } + + const char *qname; + switch (quantizer_type) { + case 1: qname = "int8"; break; + case 2: qname = "binary"; break; + default: qname = "none"; break; + } + + // Oversample only valid with quantization + if (quantizer_type == 0) oversample = 1; + + // Cap nprobe to nlist for CREATE (parser rejects nprobe > nlist) + int nprobe_create = nprobe_initial <= nlist ? nprobe_initial : nlist; + + char sql[512]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] indexed by ivf(nlist=%d, nprobe=%d, quantizer=%s%s))", + dim, nlist, nprobe_create, qname, + oversample > 1 ? ", oversample=2" : ""); + + // If that fails (e.g. oversample with none), try without oversample + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] indexed by ivf(nlist=%d, nprobe=%d, quantizer=%s))", + dim, nlist, nprobe_create, qname); + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + } + + // Insert vectors + sqlite3_stmt *stmtInsert = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(v, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + if (!stmtInsert) { sqlite3_close(db); return 0; } + + size_t offset = 0; + for (int i = 0; i < num_vecs; i++) { + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + for (int d = 0; d < dim; d++) { + if (offset < payload_size) { + vec[d] = ((float)(int8_t)payload[offset++]) / 20.0f; + } else { + vec[d] = (float)((i * dim + d) % 256 - 128) / 128.0f; + } + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, (int64_t)(i + 1)); + sqlite3_bind_blob(stmtInsert, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + 
sqlite3_free(vec); + } + sqlite3_finalize(stmtInsert); + + // Query BEFORE training (flat scan path) + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = 0.5f; + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT %d", k_limit); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + } + + // Train + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // Change nprobe at runtime (can exceed nlist -- tests clamping in query) + { + char cmd[64]; + snprintf(cmd, sizeof(cmd), + "INSERT INTO v(v) VALUES ('nprobe=%d')", nprobe_initial); + sqlite3_exec(db, cmd, NULL, NULL, NULL); + } + + // Multiple KNN queries with different fuzz-derived query vectors + for (int q = 0; q < num_queries; q++) { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (!qvec) break; + for (int d = 0; d < dim; d++) { + if (offset < payload_size) { + qvec[d] = ((float)(int8_t)payload[offset++]) / 10.0f; + } else { + qvec[d] = (q == 0) ? 1.0f : 0.0f; + } + } + + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
LIMIT %d", k_limit); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + + // Delete half the vectors then query again + for (int i = 1; i <= num_vecs / 2; i++) { + char delsql[64]; + snprintf(delsql, sizeof(delsql), "DELETE FROM v WHERE rowid = %d", i); + sqlite3_exec(db, delsql, NULL, NULL, NULL); + } + + // Query after mass deletion + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = -0.5f; + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT %d", k_limit); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + } + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-operations.c b/tests/fuzz/ivf-operations.c new file mode 100644 index 0000000..c8d0c01 --- /dev/null +++ b/tests/fuzz/ivf-operations.c @@ -0,0 +1,121 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 6) return 0; + + int rc; + sqlite3 *db; + sqlite3_stmt *stmtInsert = NULL; + sqlite3_stmt *stmtDelete = NULL; + sqlite3_stmt *stmtKnn = NULL; + sqlite3_stmt *stmtScan = NULL; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(emb float[4] indexed by ivf(nlist=4, nprobe=4))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_prepare_v2(db, + "INSERT INTO v(v, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + 
sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT 3", + -1, &stmtKnn, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid FROM v", -1, &stmtScan, NULL); + + if (!stmtInsert || !stmtDelete || !stmtKnn || !stmtScan) goto cleanup; + + size_t i = 0; + while (i + 2 <= size) { + uint8_t op = data[i++] % 7; + uint8_t rowid_byte = data[i++]; + int64_t rowid = (int64_t)(rowid_byte % 32) + 1; + + switch (op) { + case 0: { + // INSERT: consume up to 4 bytes (one int8 per float component), or use what's left + float vec[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (int j = 0; j < 4 && i < size; j++, i++) { + vec[j] = (float)((int8_t)data[i]) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 1: { + // DELETE + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 2: { + // KNN query with a fixed query vector + float qvec[4] = {1.0f, 0.0f, 0.0f, 0.0f}; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 3: { + // Full scan + sqlite3_reset(stmtScan); + while (sqlite3_step(stmtScan) == SQLITE_ROW) {} + break; + } + case 4: { + // compute-centroids command + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('compute-centroids')", + NULL, NULL, NULL); + break; + } + case 5: { + // clear-centroids command + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('clear-centroids')", + NULL, NULL, NULL); + break; + } + case 6: { + // nprobe=N command + if (i < size) { + uint8_t n = data[i++]; + int nprobe = (n % 4) + 1; + char buf[64]; + snprintf(buf, sizeof(buf), + "INSERT INTO v(v) VALUES ('nprobe=%d')", nprobe); + sqlite3_exec(db, buf, NULL, NULL, NULL); + } + break; + } + } + } + + // Final 
operations — must not crash regardless of prior state + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_finalize(stmtScan); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-quantize.c b/tests/fuzz/ivf-quantize.c new file mode 100644 index 0000000..bc8800b --- /dev/null +++ b/tests/fuzz/ivf-quantize.c @@ -0,0 +1,129 @@ +/** + * Fuzz target: IVF quantization functions. + * + * Directly exercises ivf_quantize_int8 and ivf_quantize_binary with + * fuzz-controlled dimensions and float data. Targets: + * - ivf_quantize_int8: clamping, int8 overflow boundary + * - ivf_quantize_binary: D not divisible by 8, memset(D/8) undercount + * - Round-trip through CREATE TABLE + INSERT with quantized IVF + */ +#include +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 8) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + // Byte 0: quantizer type (0=int8, 1=binary) + // Byte 1: dimension (1..64, but we test edge cases) + // Byte 2: nlist (1..8) + // Byte 3: num_vectors to insert (1..32) + // Remaining: float data + int qtype = data[0] % 2; + int dim = (data[1] % 64) + 1; + int nlist = (data[2] % 8) + 1; + int num_vecs = (data[3] % 32) + 1; + const uint8_t *payload = data + 4; + size_t payload_size = size - 4; + + // For binary quantizer, D must be multiple of 8 to avoid the D/8 bug + // in production. But we explicitly want to test non-multiples too to + // find the bug. Use dim as-is. + const char *quantizer = qtype ? "binary" : "int8"; + + // Binary quantizer needs D multiple of 8 in current code, but let's + // test both valid and invalid dimensions to see what happens. 
+ // For binary with non-multiple-of-8, the code does memset(dst, 0, D/8) + // which underallocates when D%8 != 0. + char sql[256]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] indexed by ivf(nlist=%d, nprobe=%d, quantizer=%s))", + dim, nlist, nlist, quantizer); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + // Insert vectors with fuzz-controlled float values + sqlite3_stmt *stmtInsert = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(v, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + if (!stmtInsert) { sqlite3_close(db); return 0; } + + size_t offset = 0; + for (int i = 0; i < num_vecs && offset < payload_size; i++) { + // Build float vector from fuzz data + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + + for (int d = 0; d < dim; d++) { + if (offset + 4 <= payload_size) { + // Use raw bytes as float -- can produce NaN, Inf, denormals + memcpy(&vec[d], payload + offset, sizeof(float)); + offset += 4; + } else if (offset < payload_size) { + // Partial: use byte as scaled value + vec[d] = ((float)(int8_t)payload[offset++]) / 50.0f; + } else { + vec[d] = 0.0f; + } + } + + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, (int64_t)(i + 1)); + sqlite3_bind_blob(stmtInsert, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + sqlite3_free(vec); + } + sqlite3_finalize(stmtInsert); + + // Trigger compute-centroids to exercise kmeans + quantization together + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // KNN query with fuzz-derived query vector + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) { + if (offset < payload_size) { + qvec[d] = ((float)(int8_t)payload[offset++]) / 10.0f; + } else { + qvec[d] = 1.0f; + } + } + + sqlite3_stmt *stmtKnn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v 
WHERE emb MATCH ? LIMIT 5", + -1, &stmtKnn, NULL); + if (stmtKnn) { + sqlite3_bind_blob(stmtKnn, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + sqlite3_finalize(stmtKnn); + } + sqlite3_free(qvec); + } + } + + // Full scan + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-rescore.c b/tests/fuzz/ivf-rescore.c new file mode 100644 index 0000000..3cddf88 --- /dev/null +++ b/tests/fuzz/ivf-rescore.c @@ -0,0 +1,182 @@ +/** + * Fuzz target: IVF oversample + rescore path. + * + * Specifically targets the code path where quantizer != none AND + * oversample > 1, which triggers: + * 1. Quantized KNN scan to collect oversample*k candidates + * 2. Full-precision vector lookup from _ivf_vectors table + * 3. Re-scoring with float32 distances + * 4. Re-sort and truncation + * + * This path has the most complex memory management in the KNN query: + * - Two separate distance computations (quantized + float) + * - Cross-table lookups (cells + vectors KV store) + * - Candidate array resizing + * - qsort over partially re-scored arrays + * + * Also tests the int8 + binary quantization round-trip fidelity + * under adversarial float inputs. 
+ */ +#include +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 12) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + // Header + int quantizer_type = (data[0] % 2) + 1; // 1=int8, 2=binary (never none) + int dim = (data[1] % 32) + 8; // 8..39 + int nlist = (data[2] % 8) + 1; // 1..8 + int oversample = (data[3] % 4) + 2; // 2..5 (always > 1) + int num_vecs = (data[4] % 60) + 8; // 8..67 + int k_limit = (data[5] % 15) + 1; // 1..15 + + const uint8_t *payload = data + 6; + size_t payload_size = size - 6; + + // Binary quantizer needs D multiple of 8 + if (quantizer_type == 2) { + dim = ((dim + 7) / 8) * 8; + } + + const char *qname = (quantizer_type == 1) ? "int8" : "binary"; + + char sql[512]; + snprintf(sql, sizeof(sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[%d] indexed by ivf(nlist=%d, nprobe=%d, quantizer=%s, oversample=%d))", + dim, nlist, nlist, qname, oversample); + + rc = sqlite3_exec(db, sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + // Insert vectors with diverse values + sqlite3_stmt *stmtInsert = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(v, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + if (!stmtInsert) { sqlite3_close(db); return 0; } + + size_t offset = 0; + for (int i = 0; i < num_vecs; i++) { + float *vec = sqlite3_malloc(dim * sizeof(float)); + if (!vec) break; + for (int d = 0; d < dim; d++) { + if (offset + 4 <= payload_size) { + // Use raw bytes as float for adversarial values + memcpy(&vec[d], payload + offset, sizeof(float)); + offset += 4; + // Sanitize: replace NaN/Inf with bounded values to avoid + // poisoning the entire computation. We want edge values, + // not complete nonsense. 
+ if (isnan(vec[d]) || isinf(vec[d])) { + vec[d] = (vec[d] > 0) ? 1e6f : -1e6f; + if (isnan(vec[d])) vec[d] = 0.0f; + } + } else if (offset < payload_size) { + vec[d] = ((float)(int8_t)payload[offset++]) / 30.0f; + } else { + vec[d] = (float)(i * dim + d) * 0.001f; + } + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, (int64_t)(i + 1)); + sqlite3_bind_blob(stmtInsert, 2, vec, dim * sizeof(float), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + sqlite3_free(vec); + } + sqlite3_finalize(stmtInsert); + + // Train + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // Multiple KNN queries to exercise rescore path + for (int q = 0; q < 4; q++) { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (!qvec) break; + for (int d = 0; d < dim; d++) { + if (offset < payload_size) { + qvec[d] = ((float)(int8_t)payload[offset++]) / 10.0f; + } else { + qvec[d] = (q == 0) ? 1.0f : (q == 1) ? -1.0f : 0.0f; + } + } + + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT %d", k_limit); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + + // Delete some vectors, then query again (rescore with missing _ivf_vectors rows) + for (int i = 1; i <= num_vecs / 3; i++) { + char delsql[64]; + snprintf(delsql, sizeof(delsql), "DELETE FROM v WHERE rowid = %d", i); + sqlite3_exec(db, delsql, NULL, NULL, NULL); + } + + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = 0.5f; + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
LIMIT %d", k_limit); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + } + + // Retrain after deletions + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // Query after retrain + { + float *qvec = sqlite3_malloc(dim * sizeof(float)); + if (qvec) { + for (int d = 0; d < dim; d++) qvec[d] = -0.3f; + sqlite3_stmt *sk = NULL; + snprintf(sql, sizeof(sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT %d", k_limit); + sqlite3_prepare_v2(db, sql, -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, dim * sizeof(float), SQLITE_TRANSIENT); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + sqlite3_free(qvec); + } + } + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/ivf-shadow-corrupt.c b/tests/fuzz/ivf-shadow-corrupt.c new file mode 100644 index 0000000..74d72c3 --- /dev/null +++ b/tests/fuzz/ivf-shadow-corrupt.c @@ -0,0 +1,228 @@ +/** + * Fuzz target: IVF shadow table corruption. + * + * Creates a trained IVF table, then corrupts IVF shadow table blobs + * (centroids, cells validity/rowids/vectors, rowid_map) with fuzz data. + * Then exercises all read/write paths. Must not crash. 
+ * + * Targets: + * - Cell validity bitmap with wrong size + * - Cell rowids blob with wrong size/alignment + * - Cell vectors blob with wrong size + * - Centroid blob with wrong size + * - n_vectors inconsistent with validity bitmap + * - Missing rowid_map entries + * - KNN scan over corrupted cells + * - Insert/delete with corrupted rowid_map + */ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 4) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + // Create IVF table and insert enough vectors to train + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[8] indexed by ivf(nlist=2, nprobe=2))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + // Insert 10 vectors + { + sqlite3_stmt *si = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(v, emb) VALUES (?, ?)", -1, &si, NULL); + if (!si) { sqlite3_close(db); return 0; } + for (int i = 0; i < 10; i++) { + float vec[8]; + for (int d = 0; d < 8; d++) { + vec[d] = (float)(i * 8 + d) * 0.1f; + } + sqlite3_reset(si); + sqlite3_bind_int64(si, 1, i + 1); + sqlite3_bind_blob(si, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(si); + } + sqlite3_finalize(si); + } + + // Train + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // Now corrupt shadow tables based on fuzz input + uint8_t target = data[0] % 10; + const uint8_t *payload = data + 1; + int payload_size = (int)(size - 1); + + // Limit payload to avoid huge allocations + if (payload_size > 4096) payload_size = 4096; + + sqlite3_stmt *stmt = NULL; + + switch (target) { + case 0: { + // Corrupt cell validity blob + rc = sqlite3_prepare_v2(db, + "UPDATE v_ivf_cells00 SET validity = ? 
WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); + sqlite3_step(stmt); sqlite3_finalize(stmt); + } + break; + } + case 1: { + // Corrupt cell rowids blob + rc = sqlite3_prepare_v2(db, + "UPDATE v_ivf_cells00 SET rowids = ? WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); + sqlite3_step(stmt); sqlite3_finalize(stmt); + } + break; + } + case 2: { + // Corrupt cell vectors blob + rc = sqlite3_prepare_v2(db, + "UPDATE v_ivf_cells00 SET vectors = ? WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); + sqlite3_step(stmt); sqlite3_finalize(stmt); + } + break; + } + case 3: { + // Corrupt centroid blob + rc = sqlite3_prepare_v2(db, + "UPDATE v_ivf_centroids00 SET centroid = ? WHERE centroid_id = 0", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); + sqlite3_step(stmt); sqlite3_finalize(stmt); + } + break; + } + case 4: { + // Set n_vectors to a bogus value (larger than cell capacity) + int bogus_n = 99999; + if (payload_size >= 4) { + memcpy(&bogus_n, payload, 4); + bogus_n = abs(bogus_n) % 100000; + } + char sql[128]; + snprintf(sql, sizeof(sql), + "UPDATE v_ivf_cells00 SET n_vectors = %d WHERE rowid = 1", bogus_n); + sqlite3_exec(db, sql, NULL, NULL, NULL); + break; + } + case 5: { + // Delete rowid_map entries (orphan vectors) + sqlite3_exec(db, + "DELETE FROM v_ivf_rowid_map00 WHERE rowid IN (1, 2, 3)", + NULL, NULL, NULL); + break; + } + case 6: { + // Corrupt rowid_map slot values + char sql[128]; + int bogus_slot = payload_size > 0 ? 
(int)payload[0] * 1000 : 99999; + snprintf(sql, sizeof(sql), + "UPDATE v_ivf_rowid_map00 SET slot = %d WHERE rowid = 1", bogus_slot); + sqlite3_exec(db, sql, NULL, NULL, NULL); + break; + } + case 7: { + // Corrupt rowid_map cell_id values + sqlite3_exec(db, + "UPDATE v_ivf_rowid_map00 SET cell_id = 99999 WHERE rowid = 1", + NULL, NULL, NULL); + break; + } + case 8: { + // Delete all centroids (make trained but no centroids) + sqlite3_exec(db, + "DELETE FROM v_ivf_centroids00", + NULL, NULL, NULL); + break; + } + case 9: { + // Set validity to NULL + sqlite3_exec(db, + "UPDATE v_ivf_cells00 SET validity = NULL WHERE rowid = 1", + NULL, NULL, NULL); + break; + } + } + + // Exercise all read paths over corrupted state — must not crash + float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + + // KNN query + { + sqlite3_stmt *sk = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT 5", + -1, &sk, NULL); + if (sk) { + sqlite3_bind_blob(sk, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(sk) == SQLITE_ROW) {} + sqlite3_finalize(sk); + } + } + + // Full scan + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + + // Point query + sqlite3_exec(db, "SELECT * FROM v WHERE rowid = 1", NULL, NULL, NULL); + sqlite3_exec(db, "SELECT * FROM v WHERE rowid = 5", NULL, NULL, NULL); + + // Delete + sqlite3_exec(db, "DELETE FROM v WHERE rowid = 3", NULL, NULL, NULL); + + // Insert after corruption + { + float newvec[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; + sqlite3_stmt *si = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(v, emb) VALUES (?, ?)", -1, &si, NULL); + if (si) { + sqlite3_bind_int64(si, 1, 100); + sqlite3_bind_blob(si, 2, newvec, sizeof(newvec), SQLITE_STATIC); + sqlite3_step(si); + sqlite3_finalize(si); + } + } + + // compute-centroids over corrupted state + sqlite3_exec(db, + "INSERT INTO v(v) VALUES ('compute-centroids')", + NULL, NULL, NULL); + + // clear-centroids + sqlite3_exec(db, + 
"INSERT INTO v(v) VALUES ('clear-centroids')", + NULL, NULL, NULL); + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/numpy.c b/tests/fuzz/rescore-create.c similarity index 51% rename from tests/fuzz/numpy.c rename to tests/fuzz/rescore-create.c index 9e2900b..3e69d6d 100644 --- a/tests/fuzz/numpy.c +++ b/tests/fuzz/rescore-create.c @@ -1,6 +1,5 @@ #include #include - #include #include #include @@ -8,9 +7,6 @@ #include "sqlite3.h" #include -extern int sqlite3_vec_numpy_init(sqlite3 *db, char **pzErrMsg, - const sqlite3_api_routines *pApi); - int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { int rc = SQLITE_OK; sqlite3 *db; @@ -20,17 +16,20 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { assert(rc == SQLITE_OK); rc = sqlite3_vec_init(db, NULL, NULL); assert(rc == SQLITE_OK); - rc = sqlite3_vec_numpy_init(db, NULL, NULL); - assert(rc == SQLITE_OK); - rc = sqlite3_prepare_v2(db, "select * from vec_npy_each(?)", -1, &stmt, NULL); - assert(rc == SQLITE_OK); - sqlite3_bind_blob(stmt, 1, data, size, SQLITE_STATIC); - rc = sqlite3_step(stmt); - while (rc == SQLITE_ROW) { - rc = sqlite3_step(stmt); + sqlite3_str *s = sqlite3_str_new(NULL); + assert(s); + sqlite3_str_appendall(s, "CREATE VIRTUAL TABLE v USING vec0(emb float[128] indexed by rescore("); + sqlite3_str_appendf(s, "%.*s", (int)size, data); + sqlite3_str_appendall(s, "))"); + const char *zSql = sqlite3_str_finish(s); + assert(zSql); + + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, NULL); + sqlite3_free((void *)zSql); + if (rc == SQLITE_OK) { + sqlite3_step(stmt); } - sqlite3_finalize(stmt); sqlite3_close(db); return 0; diff --git a/tests/fuzz/rescore-create.dict b/tests/fuzz/rescore-create.dict new file mode 100644 index 0000000..a8adf71 --- /dev/null +++ b/tests/fuzz/rescore-create.dict @@ -0,0 +1,20 @@ +"rescore" +"quantizer" +"bit" +"int8" +"oversample" +"indexed" +"by" +"float" +"(" +")" +"," +"=" +"[" +"]" +"1" +"8" +"16" +"128" +"256" +"1024" diff --git 
a/tests/fuzz/rescore-interleave.c b/tests/fuzz/rescore-interleave.c new file mode 100644 index 0000000..74e8b8d --- /dev/null +++ b/tests/fuzz/rescore-interleave.c @@ -0,0 +1,151 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/** + * Fuzz target: interleaved insert/update/delete/KNN operations on rescore + * tables with BOTH quantizer types, exercising the int8 quantizer path + * and the update code path that the existing rescore-operations.c misses. + * + * Key differences from rescore-operations.c: + * - Tests BOTH bit and int8 quantizers (the existing target only tests bit) + * - Fuzz-controlled query vectors (not fixed [1,0,0,...]) + * - Exercises the UPDATE path (line 9080+ in sqlite-vec.c) + * - Tests with 16 dimensions (more realistic, exercises more of the + * quantization loop) + * - Interleaves KNN between mutations to stress the blob_reopen path + * when _rescore_vectors rows have been deleted/modified + */ +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 8) return 0; + + int rc; + sqlite3 *db; + sqlite3_stmt *stmtInsert = NULL; + sqlite3_stmt *stmtUpdate = NULL; + sqlite3_stmt *stmtDelete = NULL; + sqlite3_stmt *stmtKnn = NULL; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* Use first byte to pick quantizer */ + int use_int8 = data[0] & 1; + data++; size--; + + const char *create_sql = use_int8 + ? "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] indexed by rescore(quantizer=int8))" + : "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] indexed by rescore(quantizer=bit))"; + + rc = sqlite3_exec(db, create_sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "UPDATE v SET emb = ? 
WHERE rowid = ?", -1, &stmtUpdate, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? " + "ORDER BY distance LIMIT 5", -1, &stmtKnn, NULL); + + if (!stmtInsert || !stmtUpdate || !stmtDelete || !stmtKnn) + goto cleanup; + + size_t i = 0; + while (i + 2 <= size) { + uint8_t op = data[i++] % 5; /* 5 operations now */ + uint8_t rowid_byte = data[i++]; + int64_t rowid = (int64_t)(rowid_byte % 24) + 1; + + switch (op) { + case 0: { + /* INSERT: consume bytes for 16 floats */ + float vec[16] = {0}; + for (int j = 0; j < 16 && i < size; j++, i++) { + vec[j] = (float)((int8_t)data[i]) / 8.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 1: { + /* DELETE */ + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 2: { + /* KNN with fuzz-controlled query vector */ + float qvec[16] = {0}; + for (int j = 0; j < 16 && i < size; j++, i++) { + qvec[j] = (float)((int8_t)data[i]) / 4.0f; + } + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) { + (void)sqlite3_column_int64(stmtKnn, 0); + (void)sqlite3_column_double(stmtKnn, 1); + } + break; + } + case 3: { + /* UPDATE: modify an existing vector (exercises rescore update path) */ + float vec[16] = {0}; + for (int j = 0; j < 16 && i < size; j++, i++) { + vec[j] = (float)((int8_t)data[i]) / 6.0f; + } + sqlite3_reset(stmtUpdate); + sqlite3_bind_blob(stmtUpdate, 1, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_bind_int64(stmtUpdate, 2, rowid); + sqlite3_step(stmtUpdate); + break; + } + case 4: { + /* INSERT then immediately UPDATE same row (stresses blob lifecycle) */ + float vec1[16] = {0}; + float vec2[16] = {0}; + for 
(int j = 0; j < 16 && i < size; j++, i++) { + vec1[j] = (float)((int8_t)data[i]) / 10.0f; + vec2[j] = -vec1[j]; /* opposite direction */ + } + /* Insert */ + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec1, sizeof(vec1), SQLITE_TRANSIENT); + if (sqlite3_step(stmtInsert) == SQLITE_DONE) { + /* Only update if insert succeeded (rowid might already exist) */ + sqlite3_reset(stmtUpdate); + sqlite3_bind_blob(stmtUpdate, 1, vec2, sizeof(vec2), SQLITE_TRANSIENT); + sqlite3_bind_int64(stmtUpdate, 2, rowid); + sqlite3_step(stmtUpdate); + } + break; + } + } + } + + /* Final consistency check: full scan must not crash */ + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtUpdate); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/rescore-knn-deep.c b/tests/fuzz/rescore-knn-deep.c new file mode 100644 index 0000000..8ff3c37 --- /dev/null +++ b/tests/fuzz/rescore-knn-deep.c @@ -0,0 +1,178 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/** + * Fuzz target: deep exercise of rescore KNN with fuzz-controlled query vectors + * and both quantizer types (bit + int8), multiple distance metrics. + * + * The existing rescore-operations.c only tests bit quantizer with a fixed + * query vector. 
This target: + * - Tests both bit and int8 quantizers + * - Uses fuzz-controlled query vectors (hits NaN/Inf/denormal paths) + * - Tests all distance metrics with int8 (L2, cosine, L1) + * - Exercises large LIMIT values (oversample multiplication) + * - Tests KNN with rowid IN constraints + * - Exercises the insert->query->update->query->delete->query cycle + */ +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 20) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* Use first 4 bytes for configuration */ + uint8_t config = data[0]; + uint8_t num_inserts = (data[1] % 20) + 3; /* 3..22 inserts */ + uint8_t limit_val = (data[2] % 50) + 1; /* 1..50 for LIMIT */ + uint8_t metric_choice = data[3] % 3; + data += 4; + size -= 4; + + int use_int8 = config & 1; + const char *metric_str; + switch (metric_choice) { + case 0: metric_str = ""; break; /* default L2 */ + case 1: metric_str = " distance_metric=cosine"; break; + case 2: metric_str = " distance_metric=l1"; break; + default: metric_str = ""; break; + } + + /* Build CREATE TABLE statement */ + char create_sql[256]; + if (use_int8) { + snprintf(create_sql, sizeof(create_sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] indexed by rescore(quantizer=int8)%s)", metric_str); + } else { + /* bit quantizer ignores distance_metric for the coarse pass (always hamming), + but the float rescore phase uses the specified metric */ + snprintf(create_sql, sizeof(create_sql), + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] indexed by rescore(quantizer=bit)%s)", metric_str); + } + + rc = sqlite3_exec(db, create_sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + /* Insert vectors using fuzz data */ + { + sqlite3_stmt *ins = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &ins, NULL); + if (!ins) { 
sqlite3_close(db); return 0; } + + size_t cursor = 0; + for (int i = 0; i < num_inserts && cursor + 1 < size; i++) { + float vec[16]; + for (int j = 0; j < 16; j++) { + if (cursor < size) { + /* Map fuzz byte to float -- includes potential for + interesting float values via reinterpretation */ + int8_t sb = (int8_t)data[cursor++]; + vec[j] = (float)sb / 5.0f; + } else { + vec[j] = 0.0f; + } + } + sqlite3_reset(ins); + sqlite3_bind_int64(ins, 1, (sqlite3_int64)(i + 1)); + sqlite3_bind_blob(ins, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(ins); + } + sqlite3_finalize(ins); + } + + /* Build a fuzz-controlled query vector from remaining data */ + float qvec[16] = {0}; + { + size_t cursor = 0; + for (int j = 0; j < 16 && cursor < size; j++) { + int8_t sb = (int8_t)data[cursor++]; + qvec[j] = (float)sb / 3.0f; + } + } + + /* KNN query with fuzz-controlled vector and LIMIT */ + { + char knn_sql[256]; + snprintf(knn_sql, sizeof(knn_sql), + "SELECT rowid, distance FROM v WHERE emb MATCH ? " + "ORDER BY distance LIMIT %d", (int)limit_val); + + sqlite3_stmt *knn = NULL; + sqlite3_prepare_v2(db, knn_sql, -1, &knn, NULL); + if (knn) { + sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knn) == SQLITE_ROW) { + /* Read results to ensure distance computation didn't produce garbage + that crashes the cursor iteration */ + (void)sqlite3_column_int64(knn, 0); + (void)sqlite3_column_double(knn, 1); + } + sqlite3_finalize(knn); + } + } + + /* Update some vectors, then query again */ + { + float uvec[16]; + for (int j = 0; j < 16; j++) uvec[j] = qvec[15 - j]; /* reverse of query */ + sqlite3_stmt *upd = NULL; + sqlite3_prepare_v2(db, + "UPDATE v SET emb = ? 
WHERE rowid = 1", -1, &upd, NULL); + if (upd) { + sqlite3_bind_blob(upd, 1, uvec, sizeof(uvec), SQLITE_STATIC); + sqlite3_step(upd); + sqlite3_finalize(upd); + } + } + + /* Second KNN after update */ + { + sqlite3_stmt *knn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? " + "ORDER BY distance LIMIT 10", -1, &knn, NULL); + if (knn) { + sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knn) == SQLITE_ROW) {} + sqlite3_finalize(knn); + } + } + + /* Delete half the rows, then KNN again */ + for (int i = 1; i <= num_inserts; i += 2) { + char del_sql[64]; + snprintf(del_sql, sizeof(del_sql), + "DELETE FROM v WHERE rowid = %d", i); + sqlite3_exec(db, del_sql, NULL, NULL, NULL); + } + + /* Third KNN after deletes -- exercises distance computation over + zeroed-out slots in the quantized chunk */ + { + sqlite3_stmt *knn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? " + "ORDER BY distance LIMIT 5", -1, &knn, NULL); + if (knn) { + sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knn) == SQLITE_ROW) {} + sqlite3_finalize(knn); + } + } + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/rescore-operations.c b/tests/fuzz/rescore-operations.c new file mode 100644 index 0000000..4bb7ff1 --- /dev/null +++ b/tests/fuzz/rescore-operations.c @@ -0,0 +1,96 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 6) return 0; + + int rc; + sqlite3 *db; + sqlite3_stmt *stmtInsert = NULL; + sqlite3_stmt *stmtDelete = NULL; + sqlite3_stmt *stmtKnn = NULL; + sqlite3_stmt *stmtScan = NULL; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + rc = sqlite3_exec(db, + "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[8] indexed 
by rescore(quantizer=bit))", + NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &stmtInsert, NULL); + sqlite3_prepare_v2(db, + "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? ORDER BY distance LIMIT 3", + -1, &stmtKnn, NULL); + sqlite3_prepare_v2(db, + "SELECT rowid FROM v", -1, &stmtScan, NULL); + + if (!stmtInsert || !stmtDelete || !stmtKnn || !stmtScan) goto cleanup; + + size_t i = 0; + while (i + 2 <= size) { + uint8_t op = data[i++] % 4; + uint8_t rowid_byte = data[i++]; + int64_t rowid = (int64_t)(rowid_byte % 32) + 1; + + switch (op) { + case 0: { + // INSERT: consume 32 bytes for 8 floats, or use what's left + float vec[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + for (int j = 0; j < 8 && i < size; j++, i++) { + vec[j] = (float)((int8_t)data[i]) / 10.0f; + } + sqlite3_reset(stmtInsert); + sqlite3_bind_int64(stmtInsert, 1, rowid); + sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(stmtInsert); + break; + } + case 1: { + // DELETE + sqlite3_reset(stmtDelete); + sqlite3_bind_int64(stmtDelete, 1, rowid); + sqlite3_step(stmtDelete); + break; + } + case 2: { + // KNN query with a fixed query vector + float qvec[8] = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + sqlite3_reset(stmtKnn); + sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(stmtKnn) == SQLITE_ROW) {} + break; + } + case 3: { + // Full scan + sqlite3_reset(stmtScan); + while (sqlite3_step(stmtScan) == SQLITE_ROW) {} + break; + } + } + } + + // Final operations -- must not crash regardless of prior state + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + +cleanup: + sqlite3_finalize(stmtInsert); + sqlite3_finalize(stmtDelete); + sqlite3_finalize(stmtKnn); + sqlite3_finalize(stmtScan); + sqlite3_close(db); + return 0; +} 
diff --git a/tests/fuzz/rescore-quantize-edge.c b/tests/fuzz/rescore-quantize-edge.c new file mode 100644 index 0000000..4ab9e20 --- /dev/null +++ b/tests/fuzz/rescore-quantize-edge.c @@ -0,0 +1,177 @@ +#include +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/* Test wrappers from sqlite-vec-rescore.c (SQLITE_VEC_TEST build) */ +extern void _test_rescore_quantize_float_to_bit(const float *src, uint8_t *dst, size_t dim); +extern void _test_rescore_quantize_float_to_int8(const float *src, int8_t *dst, size_t dim); +extern size_t _test_rescore_quantized_byte_size_bit(size_t dimensions); +extern size_t _test_rescore_quantized_byte_size_int8(size_t dimensions); + +/** + * Fuzz target: edge cases in rescore quantization functions. + * + * The existing rescore-quantize.c only tests dimensions that are multiples of 8 + * and never passes special float values. This target: + * + * - Tests rescore_quantized_byte_size with arbitrary dimension values + * (including 0, 1, 7, MAX values -- looking for integer division issues) + * - Passes raw float reinterpretation of fuzz bytes (NaN, Inf, denormals, + * negative zero -- these are the values that break min/max/range logic) + * - Tests the int8 quantizer with all-identical values (range=0 branch) + * - Tests the int8 quantizer with extreme ranges (overflow in scale calc) + * - Tests bit quantizer with exact float threshold (0.0f boundary) + */ +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 8) return 0; + + uint8_t mode = data[0] % 5; + data++; size--; + + switch (mode) { + case 0: { + /* Test rescore_quantized_byte_size with fuzz-controlled dimensions. + This function does dimensions / CHAR_BIT for bit, dimensions for int8. + We're checking it doesn't do anything weird with edge values. 
*/ + if (size < sizeof(size_t)) return 0; + size_t dim; + memcpy(&dim, data, sizeof(dim)); + + /* These should never crash, just return values */ + size_t bit_size = _test_rescore_quantized_byte_size_bit(dim); + size_t int8_size = _test_rescore_quantized_byte_size_int8(dim); + + /* Verify basic invariants */ + (void)bit_size; + (void)int8_size; + break; + } + + case 1: { + /* Bit quantize with raw reinterpreted floats (NaN, Inf, denormal). + The key check: src[i] >= 0.0f -- NaN comparison is always false, + so NaN should produce 0-bits. But denormals and -0.0f are tricky. */ + size_t num_floats = size / sizeof(float); + if (num_floats == 0) return 0; + /* Round to multiple of 8 for bit quantizer */ + size_t dim = (num_floats / 8) * 8; + if (dim == 0) return 0; + + const float *src = (const float *)data; + size_t bit_bytes = dim / 8; + uint8_t *dst = (uint8_t *)malloc(bit_bytes); + if (!dst) return 0; + + _test_rescore_quantize_float_to_bit(src, dst, dim); + + /* Verify: for each bit, if src >= 0 then bit should be set */ + for (size_t i = 0; i < dim; i++) { + int bit_set = (dst[i / 8] >> (i % 8)) & 1; + if (src[i] >= 0.0f) { + assert(bit_set == 1); + } else if (src[i] < 0.0f) { + /* Definitely negative -- bit must be 0 */ + assert(bit_set == 0); + } + /* NaN: comparison is false, so bit_set should be 0 */ + } + + free(dst); + break; + } + + case 2: { + /* Int8 quantize with raw reinterpreted floats. 
+ The dangerous paths: + - All values identical (range == 0) -> memset path + - vmin/vmax with NaN (NaN < anything is false, NaN > anything is false) + - Extreme range causing scale = 255/range to be Inf or 0 + - denormals near the clamping boundaries */ + size_t num_floats = size / sizeof(float); + if (num_floats == 0) return 0; + + const float *src = (const float *)data; + int8_t *dst = (int8_t *)malloc(num_floats); + if (!dst) return 0; + + _test_rescore_quantize_float_to_int8(src, dst, num_floats); + + /* Output must always be in [-128, 127] (trivially true for int8_t, + but check the actual clamping logic worked) */ + for (size_t i = 0; i < num_floats; i++) { + assert(dst[i] >= -128 && dst[i] <= 127); + } + + free(dst); + break; + } + + case 3: { + /* Int8 quantize stress: all-same values (range=0 branch) */ + size_t dim = (size < 64) ? size : 64; + if (dim == 0) return 0; + + float *src = (float *)malloc(dim * sizeof(float)); + int8_t *dst = (int8_t *)malloc(dim); + if (!src || !dst) { free(src); free(dst); return 0; } + + /* Fill with a single value derived from fuzz data */ + float val; + memcpy(&val, data, sizeof(float) < size ? sizeof(float) : size); + for (size_t i = 0; i < dim; i++) src[i] = val; + + _test_rescore_quantize_float_to_int8(src, dst, dim); + + /* All outputs should be 0 when range == 0 */ + for (size_t i = 0; i < dim; i++) { + assert(dst[i] == 0); + } + + free(src); + free(dst); + break; + } + + case 4: { + /* Int8 quantize with extreme range: one huge positive, one huge negative. + Tests scale = 255.0f / range overflow to Inf, then v * Inf = Inf, + then clamping to [-128, 127]. 
*/ + if (size < 2 * sizeof(float)) return 0; + + float extreme[2]; + memcpy(extreme, data, 2 * sizeof(float)); + + /* Only test if both are finite (NaN/Inf tested in case 2) */ + if (!isfinite(extreme[0]) || !isfinite(extreme[1])) return 0; + + /* Build a vector with these two extreme values plus some fuzz */ + size_t dim = 16; + float src[16]; + src[0] = extreme[0]; + src[1] = extreme[1]; + for (size_t i = 2; i < dim; i++) { + if (2 * sizeof(float) + (i - 2) < size) { + src[i] = (float)((int8_t)data[2 * sizeof(float) + (i - 2)]) * 1000.0f; + } else { + src[i] = 0.0f; + } + } + + int8_t dst[16]; + _test_rescore_quantize_float_to_int8(src, dst, dim); + + for (size_t i = 0; i < dim; i++) { + assert(dst[i] >= -128 && dst[i] <= 127); + } + break; + } + } + + return 0; +} diff --git a/tests/fuzz/rescore-quantize.c b/tests/fuzz/rescore-quantize.c new file mode 100644 index 0000000..6aad445 --- /dev/null +++ b/tests/fuzz/rescore-quantize.c @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/* These are SQLITE_VEC_TEST wrappers defined in sqlite-vec-rescore.c */ +extern void _test_rescore_quantize_float_to_bit(const float *src, uint8_t *dst, size_t dim); +extern void _test_rescore_quantize_float_to_int8(const float *src, int8_t *dst, size_t dim); + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + /* Need at least 4 bytes for one float */ + if (size < 4) return 0; + + /* Use the input as an array of floats. Dimensions must be a multiple of 8 + * for the bit quantizer. 
*/ + size_t num_floats = size / sizeof(float); + if (num_floats == 0) return 0; + + /* Round down to multiple of 8 for bit quantizer compatibility */ + size_t dim = (num_floats / 8) * 8; + if (dim == 0) dim = 8; + if (dim > num_floats) return 0; + + const float *src = (const float *)data; + + /* Allocate output buffers */ + size_t bit_bytes = dim / 8; + uint8_t *bit_dst = (uint8_t *)malloc(bit_bytes); + int8_t *int8_dst = (int8_t *)malloc(dim); + if (!bit_dst || !int8_dst) { + free(bit_dst); + free(int8_dst); + return 0; + } + + /* Test bit quantization */ + _test_rescore_quantize_float_to_bit(src, bit_dst, dim); + + /* Test int8 quantization */ + _test_rescore_quantize_float_to_int8(src, int8_dst, dim); + + /* Verify int8 output is in range */ + for (size_t i = 0; i < dim; i++) { + assert(int8_dst[i] >= -128 && int8_dst[i] <= 127); + } + + free(bit_dst); + free(int8_dst); + return 0; +} diff --git a/tests/fuzz/rescore-shadow-corrupt.c b/tests/fuzz/rescore-shadow-corrupt.c new file mode 100644 index 0000000..edd87ef --- /dev/null +++ b/tests/fuzz/rescore-shadow-corrupt.c @@ -0,0 +1,230 @@ +#include +#include +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +/** + * Fuzz target: corrupt rescore shadow tables then exercise KNN/read/write. + * + * This targets the dangerous code paths in rescore_knn (Phase 1 + 2): + * - sqlite3_blob_read into baseVectors with potentially wrong-sized blobs + * - distance computation on corrupted/partial quantized data + * - blob_reopen on _rescore_vectors with missing/corrupted rows + * - insert/delete after corruption (blob_write to wrong offsets) + * + * The existing shadow-corrupt.c only tests vec0 without rescore. 
+ */ +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size < 4) return 0; + + int rc; + sqlite3 *db; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + /* Pick quantizer type from first byte */ + int use_int8 = data[0] & 1; + int target = (data[1] % 8); + const uint8_t *payload = data + 2; + int payload_size = (int)(size - 2); + + const char *create_sql = use_int8 + ? "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] indexed by rescore(quantizer=int8))" + : "CREATE VIRTUAL TABLE v USING vec0(" + "emb float[16] indexed by rescore(quantizer=bit))"; + + rc = sqlite3_exec(db, create_sql, NULL, NULL, NULL); + if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + + /* Insert several vectors so there's a full chunk to corrupt */ + { + sqlite3_stmt *ins = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1, &ins, NULL); + if (!ins) { sqlite3_close(db); return 0; } + + for (int i = 1; i <= 8; i++) { + float vec[16]; + for (int j = 0; j < 16; j++) vec[j] = (float)(i * 10 + j) / 100.0f; + sqlite3_reset(ins); + sqlite3_bind_int64(ins, 1, i); + sqlite3_bind_blob(ins, 2, vec, sizeof(vec), SQLITE_TRANSIENT); + sqlite3_step(ins); + } + sqlite3_finalize(ins); + } + + /* Now corrupt rescore shadow tables based on fuzz input */ + sqlite3_stmt *stmt = NULL; + + switch (target) { + case 0: { + /* Corrupt _rescore_chunks00 vectors blob with fuzz data */ + rc = sqlite3_prepare_v2(db, + "UPDATE v_rescore_chunks00 SET vectors = ? WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + stmt = NULL; + } + break; + } + case 1: { + /* Corrupt _rescore_vectors00 vector blob for a specific row */ + rc = sqlite3_prepare_v2(db, + "UPDATE v_rescore_vectors00 SET vector = ? 
WHERE rowid = 3", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + stmt = NULL; + } + break; + } + case 2: { + /* Truncate the quantized chunk blob to wrong size */ + rc = sqlite3_prepare_v2(db, + "UPDATE v_rescore_chunks00 SET vectors = X'DEADBEEF' WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_step(stmt); + sqlite3_finalize(stmt); + stmt = NULL; + } + break; + } + case 3: { + /* Delete rows from _rescore_vectors (orphan the float vectors) */ + sqlite3_exec(db, + "DELETE FROM v_rescore_vectors00 WHERE rowid IN (2, 4, 6)", + NULL, NULL, NULL); + break; + } + case 4: { + /* Delete the chunk row entirely from _rescore_chunks */ + sqlite3_exec(db, + "DELETE FROM v_rescore_chunks00 WHERE rowid = 1", + NULL, NULL, NULL); + break; + } + case 5: { + /* Set vectors to NULL in _rescore_chunks */ + sqlite3_exec(db, + "UPDATE v_rescore_chunks00 SET vectors = NULL WHERE rowid = 1", + NULL, NULL, NULL); + break; + } + case 6: { + /* Set vector to NULL in _rescore_vectors */ + sqlite3_exec(db, + "UPDATE v_rescore_vectors00 SET vector = NULL WHERE rowid = 3", + NULL, NULL, NULL); + break; + } + case 7: { + /* Corrupt BOTH tables with fuzz data */ + int half = payload_size / 2; + rc = sqlite3_prepare_v2(db, + "UPDATE v_rescore_chunks00 SET vectors = ? WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload, half, SQLITE_STATIC); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + stmt = NULL; + } + rc = sqlite3_prepare_v2(db, + "UPDATE v_rescore_vectors00 SET vector = ? 
WHERE rowid = 1", + -1, &stmt, NULL); + if (rc == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, payload + half, + payload_size - half, SQLITE_STATIC); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + stmt = NULL; + } + break; + } + } + + /* Exercise ALL read/write paths -- NONE should crash */ + + /* KNN query (triggers rescore_knn Phase 1 + Phase 2) */ + { + float qvec[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1}; + sqlite3_stmt *knn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? " + "ORDER BY distance LIMIT 5", -1, &knn, NULL); + if (knn) { + sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knn) == SQLITE_ROW) {} + sqlite3_finalize(knn); + } + } + + /* Full scan (triggers reading from _rescore_vectors) */ + sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL); + + /* Point lookups */ + sqlite3_exec(db, "SELECT * FROM v WHERE rowid = 1", NULL, NULL, NULL); + sqlite3_exec(db, "SELECT * FROM v WHERE rowid = 3", NULL, NULL, NULL); + + /* Insert after corruption */ + { + float vec[16] = {0}; + sqlite3_stmt *ins = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO v(rowid, emb) VALUES (99, ?)", -1, &ins, NULL); + if (ins) { + sqlite3_bind_blob(ins, 1, vec, sizeof(vec), SQLITE_STATIC); + sqlite3_step(ins); + sqlite3_finalize(ins); + } + } + + /* Delete after corruption */ + sqlite3_exec(db, "DELETE FROM v WHERE rowid = 5", NULL, NULL, NULL); + + /* Update after corruption */ + { + float vec[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1}; + sqlite3_stmt *upd = NULL; + sqlite3_prepare_v2(db, + "UPDATE v SET emb = ? WHERE rowid = 1", -1, &upd, NULL); + if (upd) { + sqlite3_bind_blob(upd, 1, vec, sizeof(vec), SQLITE_STATIC); + sqlite3_step(upd); + sqlite3_finalize(upd); + } + } + + /* KNN again after modifications to corrupted state */ + { + float qvec[16] = {0,0,0,0, 0,0,0,0, 1,1,1,1, 1,1,1,1}; + sqlite3_stmt *knn = NULL; + sqlite3_prepare_v2(db, + "SELECT rowid, distance FROM v WHERE emb MATCH ? 
" + "ORDER BY distance LIMIT 3", -1, &knn, NULL); + if (knn) { + sqlite3_bind_blob(knn, 1, qvec, sizeof(qvec), SQLITE_STATIC); + while (sqlite3_step(knn) == SQLITE_ROW) {} + sqlite3_finalize(knn); + } + } + + sqlite3_exec(db, "DROP TABLE v", NULL, NULL, NULL); + sqlite3_close(db); + return 0; +} diff --git a/tests/generate_legacy_db.py b/tests/generate_legacy_db.py new file mode 100644 index 0000000..4611690 --- /dev/null +++ b/tests/generate_legacy_db.py @@ -0,0 +1,81 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = ["sqlite-vec==0.1.6"] +# /// +"""Generate a legacy sqlite-vec database for backwards-compat testing. + +Usage: + uv run --script generate_legacy_db.py + +Creates tests/fixtures/legacy-v0.1.6.db with a vec0 table containing +test data that can be read by the current version of sqlite-vec. +""" +import sqlite3 +import sqlite_vec +import struct +import os + +FIXTURE_DIR = os.path.join(os.path.dirname(__file__), "fixtures") +DB_PATH = os.path.join(FIXTURE_DIR, "legacy-v0.1.6.db") + +DIMS = 4 +N_ROWS = 50 + + +def _f32(vals): + return struct.pack(f"{len(vals)}f", *vals) + + +def main(): + os.makedirs(FIXTURE_DIR, exist_ok=True) + if os.path.exists(DB_PATH): + os.remove(DB_PATH) + + db = sqlite3.connect(DB_PATH) + db.enable_load_extension(True) + sqlite_vec.load(db) + + # Print version for verification + version = db.execute("SELECT vec_version()").fetchone()[0] + print(f"sqlite-vec version: {version}") + + # Create a basic vec0 table — flat index, no fancy features + db.execute(f"CREATE VIRTUAL TABLE legacy_vectors USING vec0(emb float[{DIMS}])") + + # Insert test data: vectors where element[0] == rowid for easy verification + for i in range(1, N_ROWS + 1): + vec = [float(i), 0.0, 0.0, 0.0] + db.execute("INSERT INTO legacy_vectors(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + db.commit() + + # Verify + count = db.execute("SELECT count(*) FROM legacy_vectors").fetchone()[0] + print(f"Inserted {count} rows") + + # Test KNN works + query = 
_f32([1.0, 0.0, 0.0, 0.0]) + rows = db.execute( + "SELECT rowid, distance FROM legacy_vectors WHERE emb MATCH ? AND k = 5", + [query], + ).fetchall() + print(f"KNN top 5: {[(r[0], round(r[1], 4)) for r in rows]}") + assert rows[0][0] == 1 # closest to [1,0,0,0] + assert len(rows) == 5 + + # Also create a table with name == column name (the conflict case) + # This was allowed in old versions — new code must not break on reconnect + db.execute("CREATE VIRTUAL TABLE emb USING vec0(emb float[4])") + for i in range(1, 11): + db.execute("INSERT INTO emb(rowid, emb) VALUES (?, ?)", [i, _f32([float(i), 0, 0, 0])]) + db.commit() + + count2 = db.execute("SELECT count(*) FROM emb").fetchone()[0] + print(f"Table 'emb' with column 'emb': {count2} rows (name conflict case)") + + db.close() + print(f"\nGenerated: {DB_PATH}") + + +if __name__ == "__main__": + main() diff --git a/tests/sqlite-vec-internal.h b/tests/sqlite-vec-internal.h index a540849..313add4 100644 --- a/tests/sqlite-vec-internal.h +++ b/tests/sqlite-vec-internal.h @@ -3,6 +3,11 @@ #include #include +#include + +#ifndef SQLITE_VEC_ENABLE_IVF +#define SQLITE_VEC_ENABLE_IVF 1 +#endif int min_idx( const float *distances, @@ -62,12 +67,81 @@ enum Vec0DistanceMetrics { VEC0_DISTANCE_METRIC_L1 = 3, }; +enum Vec0IndexType { + VEC0_INDEX_TYPE_FLAT = 1, +#ifdef SQLITE_VEC_ENABLE_RESCORE + VEC0_INDEX_TYPE_RESCORE = 2, +#endif + VEC0_INDEX_TYPE_IVF = 3, + VEC0_INDEX_TYPE_DISKANN = 4, +}; + +enum Vec0RescoreQuantizerType { + VEC0_RESCORE_QUANTIZER_BIT = 1, + VEC0_RESCORE_QUANTIZER_INT8 = 2, +}; + +struct Vec0RescoreConfig { + enum Vec0RescoreQuantizerType quantizer_type; + int oversample; +}; + +#if SQLITE_VEC_ENABLE_IVF +enum Vec0IvfQuantizer { + VEC0_IVF_QUANTIZER_NONE = 0, + VEC0_IVF_QUANTIZER_INT8 = 1, + VEC0_IVF_QUANTIZER_BINARY = 2, +}; + +struct Vec0IvfConfig { + int nlist; + int nprobe; + int quantizer; + int oversample; +}; +#else +struct Vec0IvfConfig { char _unused; }; +#endif + +#ifdef SQLITE_VEC_ENABLE_RESCORE 
+enum Vec0RescoreQuantizerType { + VEC0_RESCORE_QUANTIZER_BIT = 1, + VEC0_RESCORE_QUANTIZER_INT8 = 2, +}; + +struct Vec0RescoreConfig { + enum Vec0RescoreQuantizerType quantizer_type; + int oversample; +}; +#endif + +enum Vec0DiskannQuantizerType { + VEC0_DISKANN_QUANTIZER_BINARY = 1, + VEC0_DISKANN_QUANTIZER_INT8 = 2, +}; + +struct Vec0DiskannConfig { + enum Vec0DiskannQuantizerType quantizer_type; + int n_neighbors; + int search_list_size; + int search_list_size_search; + int search_list_size_insert; + float alpha; + int buffer_threshold; +}; + struct VectorColumnDefinition { char *name; int name_length; size_t dimensions; enum VectorElementType element_type; enum Vec0DistanceMetrics distance_metric; + enum Vec0IndexType index_type; +#ifdef SQLITE_VEC_ENABLE_RESCORE + struct Vec0RescoreConfig rescore; +#endif + struct Vec0IvfConfig ivf; + struct Vec0DiskannConfig diskann; }; int vec0_parse_vector_column(const char *source, int source_length, @@ -78,10 +152,90 @@ int vec0_parse_partition_key_definition(const char *source, int source_length, int *out_column_name_length, int *out_column_type); +size_t diskann_quantized_vector_byte_size( + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions); + +int diskann_validity_byte_size(int n_neighbors); +size_t diskann_neighbor_ids_byte_size(int n_neighbors); +size_t diskann_neighbor_qvecs_byte_size( + int n_neighbors, enum Vec0DiskannQuantizerType quantizer_type, + size_t dimensions); +int diskann_node_init( + int n_neighbors, enum Vec0DiskannQuantizerType quantizer_type, + size_t dimensions, + unsigned char **outValidity, int *outValiditySize, + unsigned char **outNeighborIds, int *outNeighborIdsSize, + unsigned char **outNeighborQvecs, int *outNeighborQvecsSize); +int diskann_validity_get(const unsigned char *validity, int i); +void diskann_validity_set(unsigned char *validity, int i, int value); +int diskann_validity_count(const unsigned char *validity, int n_neighbors); +long long diskann_neighbor_id_get(const 
unsigned char *neighbor_ids, int i); +void diskann_neighbor_id_set(unsigned char *neighbor_ids, int i, long long rowid); +const unsigned char *diskann_neighbor_qvec_get( + const unsigned char *qvecs, int i, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions); +void diskann_neighbor_qvec_set( + unsigned char *qvecs, int i, const unsigned char *src_qvec, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions); +void diskann_node_set_neighbor( + unsigned char *validity, unsigned char *neighbor_ids, unsigned char *qvecs, int i, + long long neighbor_rowid, const unsigned char *neighbor_qvec, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions); +void diskann_node_clear_neighbor( + unsigned char *validity, unsigned char *neighbor_ids, unsigned char *qvecs, int i, + enum Vec0DiskannQuantizerType quantizer_type, size_t dimensions); +int diskann_quantize_vector( + const float *src, size_t dimensions, + enum Vec0DiskannQuantizerType quantizer_type, + unsigned char *out); + +int diskann_prune_select( + const float *inter_distances, const float *p_distances, + int num_candidates, float alpha, int max_neighbors, + int *outSelected, int *outCount); + #ifdef SQLITE_VEC_TEST float _test_distance_l2_sqr_float(const float *a, const float *b, size_t dims); float _test_distance_cosine_float(const float *a, const float *b, size_t dims); float _test_distance_hamming(const unsigned char *a, const unsigned char *b, size_t dims); + +#ifdef SQLITE_VEC_ENABLE_RESCORE +void _test_rescore_quantize_float_to_bit(const float *src, uint8_t *dst, size_t dim); +void _test_rescore_quantize_float_to_int8(const float *src, int8_t *dst, size_t dim); +size_t _test_rescore_quantized_byte_size_bit(size_t dimensions); +size_t _test_rescore_quantized_byte_size_int8(size_t dimensions); +#endif +#if SQLITE_VEC_ENABLE_IVF +void ivf_quantize_int8(const float *src, int8_t *dst, int D); +void ivf_quantize_binary(const float *src, uint8_t *dst, int D); +#endif +// DiskANN 
candidate list (opaque struct, use accessors) +struct DiskannCandidateList { + void *items; // opaque + int count; + int capacity; +}; + +int _test_diskann_candidate_list_init(struct DiskannCandidateList *list, int capacity); +void _test_diskann_candidate_list_free(struct DiskannCandidateList *list); +int _test_diskann_candidate_list_insert(struct DiskannCandidateList *list, long long rowid, float distance); +int _test_diskann_candidate_list_next_unvisited(const struct DiskannCandidateList *list); +int _test_diskann_candidate_list_count(const struct DiskannCandidateList *list); +long long _test_diskann_candidate_list_rowid(const struct DiskannCandidateList *list, int i); +float _test_diskann_candidate_list_distance(const struct DiskannCandidateList *list, int i); +void _test_diskann_candidate_list_set_visited(struct DiskannCandidateList *list, int i); + +// DiskANN visited set (opaque struct, use accessors) +struct DiskannVisitedSet { + void *slots; // opaque + int capacity; + int count; +}; + +int _test_diskann_visited_set_init(struct DiskannVisitedSet *set, int capacity); +void _test_diskann_visited_set_free(struct DiskannVisitedSet *set); +int _test_diskann_visited_set_contains(const struct DiskannVisitedSet *set, long long rowid); +int _test_diskann_visited_set_insert(struct DiskannVisitedSet *set, long long rowid); #endif #endif /* SQLITE_VEC_INTERNAL_H */ diff --git a/tests/test-auxiliary.py b/tests/test-auxiliary.py index 807b2b8..dbe9654 100644 --- a/tests/test-auxiliary.py +++ b/tests/test-auxiliary.py @@ -1,5 +1,7 @@ import sqlite3 -from helpers import exec, vec0_shadow_table_contents +import struct +import pytest +from helpers import exec, vec0_shadow_table_contents, _f32 def test_constructor_limit(db, snapshot): @@ -126,3 +128,198 @@ def test_knn(db, snapshot): ) == snapshot(name="illegal KNN w/ aux") +# ====================================================================== +# Auxiliary columns with non-flat indexes +# 
====================================================================== + + +def test_rescore_aux_shadow_tables(db, snapshot): + """Rescore + aux column: verify shadow tables are created correctly.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " emb float[128] indexed by rescore(quantizer=bit)," + " +label text," + " +score float" + ")" + ) + assert exec(db, "SELECT name, sql FROM sqlite_master WHERE type='table' AND name LIKE 't_%' ORDER BY name") == snapshot( + name="rescore aux shadow tables" + ) + + +def test_rescore_aux_insert_knn(db, snapshot): + """Insert with aux data, KNN should return aux column values.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " emb float[128] indexed by rescore(quantizer=bit)," + " +label text" + ")" + ) + import random + random.seed(77) + data = [ + ("alpha", [random.gauss(0, 1) for _ in range(128)]), + ("beta", [random.gauss(0, 1) for _ in range(128)]), + ("gamma", [random.gauss(0, 1) for _ in range(128)]), + ] + for label, vec in data: + db.execute( + "INSERT INTO t(emb, label) VALUES (?, ?)", + [_f32(vec), label], + ) + + assert exec(db, "SELECT rowid, label FROM t ORDER BY rowid") == snapshot( + name="rescore aux select all" + ) + assert vec0_shadow_table_contents(db, "t", skip_info=True) == snapshot( + name="rescore aux shadow contents" + ) + + # KNN should include aux column, "alpha" closest to its own vector + rows = db.execute( + "SELECT label, distance FROM t WHERE emb MATCH ? 
ORDER BY distance LIMIT 3", + [_f32(data[0][1])], + ).fetchall() + assert len(rows) == 3 + assert rows[0][0] == "alpha" + + +def test_rescore_aux_update(db): + """UPDATE aux column on rescore table should work without affecting vectors.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " emb float[128] indexed by rescore(quantizer=bit)," + " +label text" + ")" + ) + import random + random.seed(88) + vec = [random.gauss(0, 1) for _ in range(128)] + db.execute("INSERT INTO t(rowid, emb, label) VALUES (1, ?, 'original')", [_f32(vec)]) + db.execute("UPDATE t SET label = 'updated' WHERE rowid = 1") + + assert db.execute("SELECT label FROM t WHERE rowid = 1").fetchone()[0] == "updated" + + # KNN still works with updated aux + rows = db.execute( + "SELECT rowid, label FROM t WHERE emb MATCH ? ORDER BY distance LIMIT 1", + [_f32(vec)], + ).fetchall() + assert rows[0][0] == 1 + assert rows[0][1] == "updated" + + +def test_rescore_aux_delete(db, snapshot): + """DELETE should remove aux data from shadow table.""" + db.execute( + "CREATE VIRTUAL TABLE t USING vec0(" + " emb float[128] indexed by rescore(quantizer=bit)," + " +label text" + ")" + ) + import random + random.seed(99) + for i in range(5): + db.execute( + "INSERT INTO t(rowid, emb, label) VALUES (?, ?, ?)", + [i + 1, _f32([random.gauss(0, 1) for _ in range(128)]), f"item-{i+1}"], + ) + + db.execute("DELETE FROM t WHERE rowid = 3") + + assert exec(db, "SELECT rowid, label FROM t ORDER BY rowid") == snapshot( + name="rescore aux after delete" + ) + assert exec(db, "SELECT rowid, value00 FROM t_auxiliary ORDER BY rowid") == snapshot( + name="rescore aux shadow after delete" + ) + + +def test_diskann_aux_shadow_tables(db, snapshot): + """DiskANN + aux column: verify shadow tables are created correctly.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8), + +label text, + +score float + ) + """) + assert exec(db, "SELECT name, sql 
FROM sqlite_master WHERE type='table' AND name LIKE 't_%' ORDER BY name") == snapshot( + name="diskann aux shadow tables" + ) + + +def test_diskann_aux_insert_knn(db, snapshot): + """DiskANN + aux: insert, KNN, verify aux values returned.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8), + +label text + ) + """) + data = [ + ("red", [1, 0, 0, 0, 0, 0, 0, 0]), + ("green", [0, 1, 0, 0, 0, 0, 0, 0]), + ("blue", [0, 0, 1, 0, 0, 0, 0, 0]), + ] + for label, vec in data: + db.execute("INSERT INTO t(emb, label) VALUES (?, ?)", [_f32(vec), label]) + + assert exec(db, "SELECT rowid, label FROM t ORDER BY rowid") == snapshot( + name="diskann aux select all" + ) + assert vec0_shadow_table_contents(db, "t", skip_info=True) == snapshot( + name="diskann aux shadow contents" + ) + + rows = db.execute( + "SELECT label, distance FROM t WHERE emb MATCH ? AND k = 3", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + assert len(rows) >= 1 + assert rows[0][0] == "red" + + +def test_diskann_aux_update_and_delete(db, snapshot): + """DiskANN + aux: update aux column, delete row, verify cleanup.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8), + +label text + ) + """) + for i in range(5): + vec = [0.0] * 8 + vec[i % 8] = 1.0 + db.execute( + "INSERT INTO t(rowid, emb, label) VALUES (?, ?, ?)", + [i + 1, _f32(vec), f"item-{i+1}"], + ) + + db.execute("UPDATE t SET label = 'UPDATED' WHERE rowid = 2") + db.execute("DELETE FROM t WHERE rowid = 3") + + assert exec(db, "SELECT rowid, label FROM t ORDER BY rowid") == snapshot( + name="diskann aux after update+delete" + ) + assert exec(db, "SELECT rowid, value00 FROM t_auxiliary ORDER BY rowid") == snapshot( + name="diskann aux shadow after update+delete" + ) + + +def test_diskann_aux_drop_cleans_all(db): + """DROP TABLE should remove aux shadow table too.""" + db.execute(""" + 
CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8), + +label text + ) + """) + db.execute("INSERT INTO t(emb, label) VALUES (?, 'test')", [_f32([1]*8)]) + db.execute("DROP TABLE t") + + tables = [r[0] for r in db.execute( + "SELECT name FROM sqlite_master WHERE name LIKE 't_%'" + ).fetchall()] + assert "t_auxiliary" not in tables + diff --git a/tests/test-diskann.py b/tests/test-diskann.py new file mode 100644 index 0000000..4e65160 --- /dev/null +++ b/tests/test-diskann.py @@ -0,0 +1,1331 @@ +import sqlite3 +import struct +import pytest +from helpers import _f32, exec + + +def test_diskann_create_basic(db): + """Basic DiskANN table creation with binary quantizer should succeed.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + # Table should exist + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like 't%' order by 1" + ).fetchall() + ] + assert "t" in tables + + +def test_diskann_create_int8_quantizer(db): + """DiskANN with int8 quantizer should succeed.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=int8) + ) + """) + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like 't%' order by 1" + ).fetchall() + ] + assert "t" in tables + + +def test_diskann_create_with_options(db): + """DiskANN with custom n_neighbors and search_list_size.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann( + neighbor_quantizer=binary, + n_neighbors=48, + search_list_size=256 + ) + ) + """) + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like 't%' order by 1" + ).fetchall() + ] + assert "t" in tables + + +def test_diskann_create_with_distance_metric(db): + """DiskANN combined with distance_metric should work.""" + 
db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] distance_metric=cosine INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like 't%' order by 1" + ).fetchall() + ] + assert "t" in tables + + +def test_diskann_create_error_missing_quantizer(db): + """Error when neighbor_quantizer is not specified.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann(n_neighbors=72) + ) + """) + assert "error" in result + + +def test_diskann_create_error_empty_parens(db): + """Error on empty parens.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann() + ) + """) + assert "error" in result + + +def test_diskann_create_error_unknown_quantizer(db): + """Error on unknown quantizer type.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann(neighbor_quantizer=unknown) + ) + """) + assert "error" in result + + +def test_diskann_create_error_bit_column(db): + """Error: DiskANN not supported on bit vector columns.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb bit[128] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + assert "error" in result + assert "bit" in result["message"].lower() or "DiskANN" in result["message"] + + +def test_diskann_create_error_binary_quantizer_odd_dims(db): + """Error: binary quantizer requires dimensions divisible by 8.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[13] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + assert "error" in result + assert "divisible" in result["message"].lower() + + +def test_diskann_create_error_bad_n_neighbors(db): + """Error: n_neighbors must be divisible by 8.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=13) + ) + 
""") + assert "error" in result + + +def test_diskann_shadow_tables_created(db): + """DiskANN table should create _vectors00 and _diskann_nodes00 shadow tables.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + tables = sorted([ + row[0] + for row in db.execute( + "select name from sqlite_master where type='table' and name like 't_%' order by 1" + ).fetchall() + ]) + assert "t_vectors00" in tables + assert "t_diskann_nodes00" in tables + # DiskANN columns should NOT have _vector_chunks00 + assert "t_vector_chunks00" not in tables + + +def test_diskann_medoid_in_info(db): + """_info table should contain diskann_medoid_00 key with NULL value.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + row = db.execute( + "SELECT key, value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone() + assert row is not None + assert row[0] == "diskann_medoid_00" + assert row[1] is None + + +def test_non_diskann_no_extra_tables(db): + """Non-DiskANN table must NOT create _vectors or _diskann_nodes tables.""" + db.execute("CREATE VIRTUAL TABLE t USING vec0(emb float[64])") + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where type='table' and name like 't_%' order by 1" + ).fetchall() + ] + assert "t_vectors00" not in tables + assert "t_diskann_nodes00" not in tables + assert "t_vector_chunks00" in tables + + +def test_diskann_medoid_initial_null(db): + """Medoid should be NULL initially (empty graph).""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + row = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone() + assert row[0] is None + + +def test_diskann_medoid_set_via_info(db): + """Setting medoid via _info table should be retrievable.""" + db.execute(""" + CREATE 
VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + # Manually set medoid to simulate first insert + db.execute("UPDATE t_info SET value = 42 WHERE key = 'diskann_medoid_00'") + row = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone() + assert row[0] == 42 + + # Reset to NULL (empty graph) + db.execute("UPDATE t_info SET value = NULL WHERE key = 'diskann_medoid_00'") + row = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone() + assert row[0] is None + + +def test_diskann_single_insert(db): + """Insert 1 vector. Verify _vectors00, _diskann_nodes00, and medoid.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + db.execute( + "INSERT INTO t(rowid, emb) VALUES (1, ?)", + [_f32([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + # Verify _vectors00 has 1 row + count = db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] + assert count == 1 + + # Verify _diskann_nodes00 has 1 row + count = db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] + assert count == 1 + + # Verify medoid is set + medoid = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone()[0] + assert medoid == 1 + + +def test_diskann_multiple_inserts(db): + """Insert multiple vectors. 
Verify counts and that nodes have neighbors.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + import random + random.seed(42) + for i in range(1, 21): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + # Verify counts + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 20 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 20 + + # Every node after the first should have at least 1 neighbor + rows = db.execute( + "SELECT rowid, neighbors_validity FROM t_diskann_nodes00" + ).fetchall() + nodes_with_neighbors = 0 + for row in rows: + validity = row[1] + has_neighbor = any(b != 0 for b in validity) + if has_neighbor: + nodes_with_neighbors += 1 + # At minimum, nodes 2-20 should have neighbors (node 1 gets neighbors via reverse edges) + assert nodes_with_neighbors >= 19 + + +def test_diskann_bidirectional_edges(db): + """Insert A then B. 
B should be in A's neighbors and A in B's.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute( + "INSERT INTO t(rowid, emb) VALUES (1, ?)", + [_f32([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + db.execute( + "INSERT INTO t(rowid, emb) VALUES (2, ?)", + [_f32([0.9, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])], + ) + + # Check B(2) is in A(1)'s neighbor list + row_a = db.execute( + "SELECT neighbor_ids FROM t_diskann_nodes00 WHERE rowid = 1" + ).fetchone() + neighbor_ids_a = struct.unpack(f"{len(row_a[0])//8}q", row_a[0]) + assert 2 in neighbor_ids_a + + # Check A(1) is in B(2)'s neighbor list + row_b = db.execute( + "SELECT neighbor_ids FROM t_diskann_nodes00 WHERE rowid = 2" + ).fetchone() + neighbor_ids_b = struct.unpack(f"{len(row_b[0])//8}q", row_b[0]) + assert 1 in neighbor_ids_b + + +def test_diskann_delete_single(db): + """Insert 3, delete 1. Verify counts.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(1, 4): + db.execute( + "INSERT INTO t(rowid, emb) VALUES (?, ?)", + [i, _f32([float(i)] * 8)], + ) + db.execute("DELETE FROM t WHERE rowid = 2") + + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 2 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 2 + + +def test_diskann_delete_no_stale_references(db): + """After delete, no node should reference the deleted rowid.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + import random + random.seed(123) + for i in range(1, 11): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + db.execute("DELETE FROM t WHERE rowid = 5") + + # Scan all remaining nodes and verify rowid 5 is not in any 
neighbor list + rows = db.execute( + "SELECT rowid, neighbors_validity, neighbor_ids FROM t_diskann_nodes00" + ).fetchall() + for row in rows: + validity = row[1] + neighbor_ids_blob = row[2] + n_neighbors = len(validity) * 8 + ids = struct.unpack(f"{n_neighbors}q", neighbor_ids_blob) + for i in range(n_neighbors): + byte_idx = i // 8 + bit_idx = i % 8 + if validity[byte_idx] & (1 << bit_idx): + assert ids[i] != 5, f"Node {row[0]} still references deleted rowid 5" + + +def test_diskann_delete_medoid(db): + """Delete the medoid. Verify a new non-NULL medoid is selected.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(1, 4): + db.execute( + "INSERT INTO t(rowid, emb) VALUES (?, ?)", + [i, _f32([float(i)] * 8)], + ) + + medoid_before = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone()[0] + assert medoid_before == 1 + + db.execute("DELETE FROM t WHERE rowid = 1") + + medoid_after = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone()[0] + assert medoid_after is not None + assert medoid_after != 1 + + +def test_diskann_delete_all(db): + """Delete all vectors. Medoid should be NULL.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(1, 4): + db.execute( + "INSERT INTO t(rowid, emb) VALUES (?, ?)", + [i, _f32([float(i)] * 8)], + ) + for i in range(1, 4): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 0 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 0 + + medoid = db.execute( + "SELECT value FROM t_info WHERE key = 'diskann_medoid_00'" + ).fetchone()[0] + assert medoid is None + + +def test_diskann_insert_delete_insert_cycle(db): + """Insert, delete, insert again. 
No crashes.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1.0] * 8)]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([2.0] * 8)]) + db.execute("DELETE FROM t WHERE rowid = 1") + db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([3.0] * 8)]) + + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 2 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 2 + + +def test_diskann_knn_basic(db): + """Basic KNN query should return results.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1, 0, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([0, 1, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([0.9, 0.1, 0, 0, 0, 0, 0, 0])]) + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=2", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + assert len(rows) == 2 + # Closest should be rowid 1 (exact match) + assert rows[0][0] == 1 + assert rows[0][1] < 0.01 # ~0 distance + + +def test_diskann_knn_distances_sorted(db): + """Returned distances should be in ascending order.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(42) + for i in range(1, 51): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=10", + [_f32([0.0] * 8)], + ).fetchall() + assert len(rows) == 10 + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1], f"Distances not sorted at index {i}" + + +def test_diskann_knn_empty_table(db): + """KNN on empty table should return 0 results.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=5", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + assert len(rows) == 0 + + +def test_diskann_knn_after_delete(db): + """KNN after delete should not return deleted rows.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1, 0, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([0, 1, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([0.5, 0.5, 0, 0, 0, 0, 0, 0])]) + db.execute("DELETE FROM t WHERE rowid = 1") + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=3", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + rowids = [r[0] for r in rows] + assert 1 not in rowids + assert len(rows) == 2 + + +def test_diskann_no_index_still_works(db): + """Tables without INDEXED BY should still work identically.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[4] + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1, 2, 3, 4])]) + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=1", + [_f32([1, 2, 3, 4])], + ).fetchall() + assert len(rows) == 1 + assert rows[0][0] == 1 + + +def test_diskann_drop_table(db): + """DROP TABLE should clean up all shadow tables.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + db.execute("DROP TABLE t") + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like 't%'" + ).fetchall() + ] + assert len(tables) == 0 + + +def test_diskann_create_split_search_list_size(db): + """DiskANN with separate search_list_size_search and search_list_size_insert.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann( + neighbor_quantizer=binary, + search_list_size_search=256, + search_list_size_insert=64 + ) + ) + """) + tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like 't%' order by 1" + ).fetchall() + ] + assert "t" in tables + + +def test_diskann_create_error_mixed_search_list_size(db): + """Error when mixing search_list_size with search_list_size_search.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[128] INDEXED BY diskann( + neighbor_quantizer=binary, + search_list_size=128, + search_list_size_search=256 + ) + ) + """) + assert "error" in result + + +def test_diskann_command_search_list_size(db): + """Runtime search_list_size override via command insert.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + import struct, random + random.seed(42) + for i in range(20): + vec = struct.pack("64f", *[random.random() for _ in range(64)]) + db.execute("INSERT INTO t(emb) VALUES (?)", [vec]) + + # Query with default search_list_size + query = struct.pack("64f", *[random.random() for _ in range(64)]) + results_before = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k = 5", [query] + ).fetchall() + assert len(results_before) == 5 + + # Override search_list_size_search at runtime + db.execute("INSERT INTO t(t) VALUES ('search_list_size_search=256')") + + # Query should still work + results_after = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k = 5", [query] + ).fetchall() + assert len(results_after) == 5 + + # Override search_list_size_insert at runtime + db.execute("INSERT INTO t(t) VALUES ('search_list_size_insert=32')") + + # Inserts should still work + vec = struct.pack("64f", *[random.random() for _ in range(64)]) + db.execute("INSERT INTO t(emb) VALUES (?)", [vec]) + + # Override unified search_list_size + db.execute("INSERT INTO t(t) VALUES ('search_list_size=64')") + + results_final = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k = 5", [query] + ).fetchall() + assert len(results_final) == 5 + + +def test_diskann_command_search_list_size_error(db): + """Error on invalid search_list_size command value.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + result = exec(db, "INSERT INTO t(t) VALUES ('search_list_size=0')") + assert "error" in result + result = exec(db, "INSERT INTO t(t) VALUES ('search_list_size=-1')") + assert "error" in result + + +# ====================================================================== +# Error cases: DiskANN + auxiliary/metadata/partition columns +# ====================================================================== + +def test_diskann_create_with_auxiliary_column(db): + """DiskANN tables should support auxiliary columns.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary), + +extra text + ) + """) + # Auxiliary shadow table should exist + tables = [r[0] for r in db.execute( + "SELECT name FROM sqlite_master WHERE name LIKE 't_%' ORDER BY 1" + ).fetchall()] + assert "t_auxiliary" in tables + 
+ +def test_diskann_create_error_with_metadata_column(db): + """DiskANN tables should not support metadata columns.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary), + metadata_col integer metadata + ) + """) + assert "error" in result + assert "metadata" in result["message"].lower() or "Metadata" in result["message"] + + +def test_diskann_create_error_with_partition_key(db): + """DiskANN tables should not support partition key columns.""" + result = exec(db, """ + CREATE VIRTUAL TABLE t USING vec0( + emb float[64] INDEXED BY diskann(neighbor_quantizer=binary), + user_id text partition key + ) + """) + assert "error" in result + assert "partition" in result["message"].lower() or "Partition" in result["message"] + + +# ====================================================================== +# Insert edge cases +# ====================================================================== + +def test_diskann_insert_no_rowid(db): + """INSERT without explicit rowid (auto-generated) should work.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + db.execute("INSERT INTO t(emb) VALUES (?)", [_f32([1.0] * 8)]) + db.execute("INSERT INTO t(emb) VALUES (?)", [_f32([2.0] * 8)]) + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 2 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 2 + + +def test_diskann_insert_large_batch(db): + """INSERT 500+ vectors, verify all are queryable via KNN.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[16] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(99) + N = 500 + for i in range(1, N + 1): + vec = [random.gauss(0, 1) for _ in range(16)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + assert db.execute("SELECT count(*) FROM 
t_vectors00").fetchone()[0] == N + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == N + + # KNN should return results + query = [random.gauss(0, 1) for _ in range(16)] + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=10", + [_f32(query)], + ).fetchall() + assert len(rows) == 10 + # Distances should be sorted + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1] + + +def test_diskann_insert_zero_vector(db): + """Insert an all-zero vector (edge case for binary quantizer).""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([0.0] * 8)]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([1.0] * 8)]) + count = db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] + assert count == 2 + + # Query with zero vector should find rowid 1 as closest + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=2", + [_f32([0.0] * 8)], + ).fetchall() + assert len(rows) == 2 + assert rows[0][0] == 1 + + +def test_diskann_insert_large_values(db): + """Insert vectors with very large float values.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary) + ) + """) + import sys + large = sys.float_info.max / 1e300 # Large but not overflowing + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([large] * 8)]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([-large] * 8)]) + db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([0.0] * 8)]) + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 3 + + +def test_diskann_insert_int8_quantizer_knn(db): + """Full insert + query cycle with int8 quantizer.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[16] INDEXED BY diskann(neighbor_quantizer=int8, n_neighbors=8) + ) + """) + import random + random.seed(77) + for i in range(1, 31): + vec = [random.gauss(0, 1) for _ in range(16)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 30 + + # KNN should work + query = [random.gauss(0, 1) for _ in range(16)] + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=5", + [_f32(query)], + ).fetchall() + assert len(rows) == 5 + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1] + + +# ====================================================================== +# Delete edge cases +# ====================================================================== + +def test_diskann_delete_nonexistent(db): + """DELETE of a nonexistent rowid should either be a no-op or return an error, not crash.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1.0] * 8)]) + # Deleting a nonexistent rowid may error but should not crash + result = exec(db, "DELETE FROM t WHERE rowid = 999") + # Whether it succeeds or errors, the existing row should still be there + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 1 + + +def test_diskann_delete_then_reinsert_same_rowid(db): + """Delete rowid 5, then reinsert rowid 5 with a new vector.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(1, 6): + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32([float(i)] * 8)]) + + db.execute("DELETE FROM t WHERE rowid = 5") + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 4 + + # Reinsert with new vector + db.execute("INSERT INTO t(rowid, emb) VALUES (5, ?)", [_f32([99.0] * 8)]) + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 5 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 5 + + +def test_diskann_delete_all_then_insert(db): + """Delete everything, then insert new vectors. 
Graph should rebuild.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(1, 6): + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32([float(i)] * 8)]) + + # Delete all + for i in range(1, 6): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 0 + + medoid = db.execute("SELECT value FROM t_info WHERE key = 'diskann_medoid_00'").fetchone()[0] + assert medoid is None + + # Insert new vectors + for i in range(10, 15): + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32([float(i)] * 8)]) + + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == 5 + assert db.execute("SELECT count(*) FROM t_diskann_nodes00").fetchone()[0] == 5 + + medoid = db.execute("SELECT value FROM t_info WHERE key = 'diskann_medoid_00'").fetchone()[0] + assert medoid is not None + + # KNN should work + rows = db.execute( + "SELECT rowid FROM t WHERE emb MATCH ? 
AND k=3", + [_f32([12.0] * 8)], + ).fetchall() + assert len(rows) == 3 + + +def test_diskann_delete_preserves_graph_connectivity(db): + """After deleting a node, remaining nodes should still be reachable via KNN.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + import random + random.seed(456) + for i in range(1, 21): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + # Delete 5 nodes + for i in [3, 7, 11, 15, 19]: + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + remaining = db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] + assert remaining == 15 + + # Every remaining node should be reachable via KNN (appears somewhere in top-k) + all_rowids = [r[0] for r in db.execute("SELECT rowid FROM t_vectors00").fetchall()] + reachable = set() + for rid in all_rowids: + vec_blob = db.execute("SELECT vector FROM t_vectors00 WHERE rowid = ?", [rid]).fetchone()[0] + rows = db.execute( + "SELECT rowid FROM t WHERE emb MATCH ? 
AND k=5", + [vec_blob], + ).fetchall() + assert len(rows) >= 1 # At least some results + for r in rows: + reachable.add(r[0]) + # Most nodes should be reachable through the graph + assert len(reachable) >= len(all_rowids) * 0.8, \ + f"Only {len(reachable)}/{len(all_rowids)} nodes reachable" + + +# ====================================================================== +# Update scenarios +# ====================================================================== + +def test_diskann_update_vector(db): + """UPDATE a vector on DiskANN table should error (will be implemented soon).""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1, 0, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([0, 1, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([0, 0, 1, 0, 0, 0, 0, 0])]) + + with pytest.raises(sqlite3.OperationalError, match="UPDATE on vector column.*not supported for DiskANN"): + db.execute("UPDATE t SET emb = ? 
WHERE rowid = 1", [_f32([0, 0.9, 0.1, 0, 0, 0, 0, 0])]) + + +# ====================================================================== +# KNN correctness after mutations +# ====================================================================== + +def test_diskann_knn_recall_after_inserts(db): + """Insert N vectors, verify top-1 recall is 100% for exact matches.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(200) + vectors = {} + for i in range(1, 51): + vec = [random.gauss(0, 1) for _ in range(8)] + vectors[i] = vec + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + # Top-1 for each vector should return itself + correct = 0 + for rid, vec in vectors.items(): + rows = db.execute( + "SELECT rowid FROM t WHERE emb MATCH ? AND k=1", + [_f32(vec)], + ).fetchall() + if rows and rows[0][0] == rid: + correct += 1 + + # With binary quantizer, approximate search may not always return exact match + # but should have high recall (at least 80%) + assert correct >= len(vectors) * 0.8, f"Top-1 recall too low: {correct}/{len(vectors)}" + + +def test_diskann_knn_k_larger_than_table(db): + """k=100 on table with 5 rows should return 5.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + for i in range(1, 6): + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32([float(i)] * 8)]) + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=100", + [_f32([3.0] * 8)], + ).fetchall() + assert len(rows) == 5 + + +def test_diskann_knn_cosine_metric(db): + """KNN with cosine distance metric.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] distance_metric=cosine INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + # Insert orthogonal-ish vectors + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1, 0, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (2, ?)", [_f32([0, 1, 0, 0, 0, 0, 0, 0])]) + db.execute("INSERT INTO t(rowid, emb) VALUES (3, ?)", [_f32([0.7, 0.7, 0, 0, 0, 0, 0, 0])]) + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=3", + [_f32([1, 0, 0, 0, 0, 0, 0, 0])], + ).fetchall() + assert len(rows) == 3 + # rowid 1 should be closest (exact match in direction) + assert rows[0][0] == 1 + # Distances should be sorted + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1] + + +def test_diskann_knn_after_heavy_churn(db): + """Interleave many inserts and deletes, then query.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(321) + + # Insert 50 vectors + for i in range(1, 51): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + # Delete even-numbered rows + for i in range(2, 51, 2): + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + # Insert more vectors with higher rowids + for i in range(51, 76): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + remaining = db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] + assert remaining == 50 # 25 odd + 25 new + + # KNN should still work and return results + query = [random.gauss(0, 1) for _ in range(8)] + rows = 
db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=10", + [_f32(query)], + ).fetchall() + assert len(rows) == 10 + # Distances should be sorted + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1] + + +def test_diskann_knn_batch_recall(db): + """Insert 100+ vectors and verify reasonable recall.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[16] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(42) + N = 150 + vectors = {} + for i in range(1, N + 1): + vec = [random.gauss(0, 1) for _ in range(16)] + vectors[i] = vec + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + # Brute-force top-5 for a query and compare with DiskANN + query = [random.gauss(0, 1) for _ in range(16)] + + # Compute true distances + true_dists = [] + for rid, vec in vectors.items(): + d = sum((a - b) ** 2 for a, b in zip(query, vec)) + true_dists.append((d, rid)) + true_dists.sort() + true_top5 = set(r for _, r in true_dists[:5]) + + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
AND k=5", + [_f32(query)], + ).fetchall() + result_top5 = set(r[0] for r in rows) + assert len(rows) == 5 + + # At least 3 of top-5 should match (reasonable recall for approximate search) + overlap = len(true_top5 & result_top5) + assert overlap >= 3, f"Recall too low: only {overlap}/5 overlap" + + +# ====================================================================== +# Additional edge cases +# ====================================================================== + +def test_diskann_insert_wrong_dimensions(db): + """INSERT with wrong dimension vector should error.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + result = exec(db, "INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1.0] * 4)]) + assert "error" in result + + +def test_diskann_knn_wrong_query_dimensions(db): + """KNN MATCH with wrong dimension query should error.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8) + ) + """) + db.execute("INSERT INTO t(rowid, emb) VALUES (1, ?)", [_f32([1.0] * 8)]) + + result = exec(db, "SELECT rowid FROM t WHERE emb MATCH ? 
AND k=1", [_f32([1.0] * 4)]) + assert "error" in result + + +def test_diskann_graph_connectivity_after_many_deletes(db): + """After many deletes, the graph should still be connected enough for search.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(789) + N = 40 + for i in range(1, N + 1): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + # Delete 30 of 40 nodes + to_delete = list(range(1, 31)) + for i in to_delete: + db.execute("DELETE FROM t WHERE rowid = ?", [i]) + + remaining = db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] + assert remaining == 10 + + # Search should still work and return results + query = [random.gauss(0, 1) for _ in range(8)] + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? AND k=10", + [_f32(query)], + ).fetchall() + # Should return some results (graph may be fragmented after heavy deletion) + assert len(rows) >= 1 + # Distances should be sorted + distances = [r[1] for r in rows] + for i in range(len(distances) - 1): + assert distances[i] <= distances[i + 1] + + +def test_diskann_large_batch_insert_500(db): + """Insert 500+ vectors and verify counts and KNN.""" + db.execute(""" + CREATE VIRTUAL TABLE t USING vec0( + emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=16) + ) + """) + import random + random.seed(555) + N = 500 + for i in range(1, N + 1): + vec = [random.gauss(0, 1) for _ in range(8)] + db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i, _f32(vec)]) + + assert db.execute("SELECT count(*) FROM t_vectors00").fetchone()[0] == N + + query = [random.gauss(0, 1) for _ in range(8)] + rows = db.execute( + "SELECT rowid, distance FROM t WHERE emb MATCH ? 
def test_corrupt_truncated_node_blob(db):
    """KNN should error (not crash) when DiskANN node blob is truncated.

    Five one-hot vectors are inserted, then the ``neighbor_ids`` blob of the
    node with rowid 1 is overwritten with a single byte. The follow-up KNN
    query is allowed to return rows or to raise ``sqlite3.OperationalError``;
    the only requirement is that the call comes back to Python at all.
    """
    db.execute("""
        CREATE VIRTUAL TABLE t USING vec0(
            emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)
        )
    """)
    for rowid in range(1, 6):
        one_hot = [0.0] * 8
        one_hot[rowid - 1] = 1.0
        db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [rowid, _f32(one_hot)])

    # Corrupt a DiskANN node: truncate neighbor_ids to 1 byte (wrong size)
    db.execute(
        "UPDATE t_diskann_nodes00 SET neighbor_ids = x'00' WHERE rowid = 1"
    )

    # Should not crash — may return wrong results or error
    try:
        db.execute(
            "SELECT rowid FROM t WHERE emb MATCH ? AND k=3",
            [_f32([1, 0, 0, 0, 0, 0, 0, 0])],
        ).fetchall()
    except sqlite3.OperationalError:
        pass  # Error is acceptable — crash is not


def test_diskann_delete_reinsert_cycle_knn(db):
    """Repeatedly delete and reinsert rows, verify KNN stays correct.

    Starts with 30 random rows, then runs three cycles of: delete half of the
    live rows, insert the same number of fresh rows under new rowids, and
    check that a k=10 KNN query returns at least one row and only live rowids.
    """
    import random

    # A private generator yields the same values as random.seed(101) on the
    # module-level generator, but does not reseed state shared with other tests.
    rng = random.Random(101)

    db.execute("""
        CREATE VIRTUAL TABLE t USING vec0(
            emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)
        )
    """)

    def insert_random(rowid):
        """Insert a fresh 8-dim gaussian vector under *rowid*."""
        vec = [rng.gauss(0, 1) for _ in range(8)]
        db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [rowid, _f32(vec)])

    n_rows = 30
    alive = set()  # rowids currently present in the table
    for rowid in range(1, n_rows + 1):
        insert_random(rowid)
        alive.add(rowid)

    # 3 cycles: delete half, reinsert with new vectors, verify KNN
    for cycle in range(3):
        # sorted() gives sample() a deterministic population order.
        to_delete = rng.sample(sorted(alive), len(alive) // 2)
        for rowid in to_delete:
            db.execute("DELETE FROM t WHERE rowid = ?", [rowid])
            alive.remove(rowid)

        # Reinsert with new rowids; each cycle gets its own 50-wide id range
        # so new ids never collide with the initial 1..30 or earlier cycles.
        new_start = 100 + cycle * 50
        for rowid in range(new_start, new_start + len(to_delete)):
            insert_random(rowid)
            alive.add(rowid)

        # KNN should return only alive rows
        query = [0.0] * 8
        rows = db.execute(
            "SELECT rowid FROM t WHERE emb MATCH ? AND k=10",
            [_f32(query)],
        ).fetchall()
        returned = {r["rowid"] for r in rows}
        assert returned.issubset(alive), \
            f"Cycle {cycle}: deleted rowid in KNN results"
        assert len(rows) >= 1


def test_diskann_delete_interleaved_with_knn(db):
    """Delete one row at a time, querying KNN after each delete.

    Twenty rows are inserted, each with a single non-zero component. Rowids
    1, 5, 10, 15 and 20 are then deleted one by one, and after every delete a
    k=5 KNN query must return only rowids that are still present.
    """
    db.execute("""
        CREATE VIRTUAL TABLE t USING vec0(
            emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)
        )
    """)
    n_rows = 20
    for rowid in range(1, n_rows + 1):
        vec = [0.0] * 8
        vec[rowid % 8] = float(rowid)
        db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [rowid, _f32(vec)])

    alive = set(range(1, n_rows + 1))
    for to_del in [1, 5, 10, 15, 20]:
        db.execute("DELETE FROM t WHERE rowid = ?", [to_del])
        alive.discard(to_del)

        rows = db.execute(
            "SELECT rowid FROM t WHERE emb MATCH ? AND k=5",
            [_f32([1, 0, 0, 0, 0, 0, 0, 0])],
        ).fetchall()
        returned = {r["rowid"] for r in rows}
        assert returned.issubset(alive), \
            f"Deleted rowid {to_del} found in KNN results"


# ======================================================================
# Text primary key + DiskANN
# ======================================================================


def test_diskann_text_pk_insert_knn_delete(db):
    """DiskANN with text primary key: insert, KNN, delete, KNN again.

    Five one-hot vectors are stored under text ids. A query equal to the
    "alpha" vector must return "alpha" among its k=3 results; after "alpha"
    is deleted the same query must no longer return it.
    """
    db.execute("""
        CREATE VIRTUAL TABLE t USING vec0(
            id text primary key,
            emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)
        )
    """)

    vecs = {
        "alpha": [1, 0, 0, 0, 0, 0, 0, 0],
        "beta": [0, 1, 0, 0, 0, 0, 0, 0],
        "gamma": [0, 0, 1, 0, 0, 0, 0, 0],
        "delta": [0, 0, 0, 1, 0, 0, 0, 0],
        "epsilon": [0, 0, 0, 0, 1, 0, 0, 0],
    }
    for name, vec in vecs.items():
        db.execute("INSERT INTO t(id, emb) VALUES (?, ?)", [name, _f32(vec)])

    # KNN should return text IDs
    rows = db.execute(
        "SELECT id, distance FROM t WHERE emb MATCH ? AND k=3",
        [_f32([1, 0, 0, 0, 0, 0, 0, 0])],
    ).fetchall()
    assert len(rows) >= 1
    ids = [r["id"] for r in rows]
    assert "alpha" in ids  # closest to query

    # Delete and verify
    db.execute("DELETE FROM t WHERE id = 'alpha'")
    rows = db.execute(
        "SELECT id FROM t WHERE emb MATCH ? AND k=3",
        [_f32([1, 0, 0, 0, 0, 0, 0, 0])],
    ).fetchall()
    ids = [r["id"] for r in rows]
    assert "alpha" not in ids