Add comprehensive ANN benchmarking suite (#279)

Extend benchmarks-ann/ with results database (SQLite with per-query detail and continuous writes), dataset subfolder organization, --subset-size and --warmup options. Supports systematic comparison across flat, rescore, IVF, and DiskANN index types.
2026-06-14 15:25:18 +02:00 · 2026-03-31 01:29:49 -07:00 · 2026-03-31 01:29:49 -07:00 · 8544081a67
commit 8544081a67
parent a248ecd061
26 changed files with 2127 additions and 292 deletions
--- a/benchmarks-ann/.gitignore
+++ b/benchmarks-ann/.gitignore
@ -1,2 +1,8 @@
 *.db
+*.db-shm
+*.db-wal
+*.parquet
 runs/
+
+viewer/
+searcher/
--- a/benchmarks-ann/Makefile
+++ b/benchmarks-ann/Makefile
@ -1,5 +1,5 @@
 BENCH = python bench.py
-BASE_DB = seed/base.db
+BASE_DB = cohere1m/base.db
 EXT = ../dist/vec0

 # --- Baseline (brute-force) configs ---
@ -33,7 +33,7 @@ ALL_CONFIGS = $(BASELINES) $(RESCORE_CONFIGS) $(IVF_CONFIGS) $(DISKANN_CONFIGS)

 # --- Data preparation ---
 seed:
-	$(MAKE) -C seed
+	$(MAKE) -C cohere1m

 ground-truth: seed
 	python ground_truth.py --subset-size 10000
@ -42,43 +42,43 @@ ground-truth: seed

 # --- Quick smoke test ---
 bench-smoke: seed
-	$(BENCH) --subset-size 5000 -k 10 -n 20 -o runs/smoke \
+	$(BENCH) --subset-size 5000 -k 10 -n 20 --dataset cohere1m -o runs \
 		"brute-float:type=baseline,variant=float" \
 		"ivf-quick:type=ivf,nlist=16,nprobe=4" \
 		"diskann-quick:type=diskann,R=48,L=64,quantizer=binary"

 bench-rescore: seed
-	$(BENCH) --subset-size 10000 -k 10 -o runs/rescore \
+	$(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs \
 		$(RESCORE_CONFIGS)


 # --- Standard sizes ---
 bench-10k: seed
-	$(BENCH) --subset-size 10000 -k 10 -o runs/10k $(ALL_CONFIGS)
+	$(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs $(ALL_CONFIGS)

 bench-50k: seed
-	$(BENCH) --subset-size 50000 -k 10 -o runs/50k $(ALL_CONFIGS)
+	$(BENCH) --subset-size 50000 -k 10 --dataset cohere1m -o runs $(ALL_CONFIGS)

 bench-100k: seed
-	$(BENCH) --subset-size 100000 -k 10 -o runs/100k $(ALL_CONFIGS)
+	$(BENCH) --subset-size 100000 -k 10 --dataset cohere1m -o runs $(ALL_CONFIGS)

 bench-all: bench-10k bench-50k bench-100k

 # --- IVF across sizes ---
 bench-ivf: seed
-	$(BENCH) --subset-size 10000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS)
-	$(BENCH) --subset-size 50000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS)
-	$(BENCH) --subset-size 100000 -k 10 -o runs/ivf $(BASELINES) $(IVF_CONFIGS)
+	$(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(IVF_CONFIGS)
+	$(BENCH) --subset-size 50000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(IVF_CONFIGS)
+	$(BENCH) --subset-size 100000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(IVF_CONFIGS)

 # --- DiskANN across sizes ---
 bench-diskann: seed
-	$(BENCH) --subset-size 10000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS)
-	$(BENCH) --subset-size 50000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS)
-	$(BENCH) --subset-size 100000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS)
+	$(BENCH) --subset-size 10000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(DISKANN_CONFIGS)
+	$(BENCH) --subset-size 50000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(DISKANN_CONFIGS)
+	$(BENCH) --subset-size 100000 -k 10 --dataset cohere1m -o runs $(BASELINES) $(DISKANN_CONFIGS)

 # --- Report ---
 report:
-	@echo "Use: sqlite3 runs/<dir>/results.db 'SELECT * FROM bench_results ORDER BY recall DESC'"
+	@echo "Use: sqlite3 runs/cohere1m/<size>/results.db 'SELECT run_id, config_name, status, recall FROM runs JOIN run_results USING(run_id)'"

 # --- Cleanup ---
 clean:
--- a/benchmarks-ann/README.md
+++ b/benchmarks-ann/README.md
@ -1,81 +1,111 @@
 # KNN Benchmarks for sqlite-vec

 Benchmarking infrastructure for vec0 KNN configurations. Includes brute-force
-baselines (float, int8, bit); index-specific branches add their own types
-via the `INDEX_REGISTRY` in `bench.py`.
+baselines (float, int8, bit), rescore, IVF, and DiskANN index types.
+
+## Datasets
+
+Each dataset is a subdirectory containing a `Makefile` and `build_base_db.py`
+that produce a `base.db`. The benchmark runner auto-discovers any subdirectory
+with a `base.db` file.
+
+```
+cohere1m/           # Cohere 768d cosine, 1M vectors
+  Makefile          # downloads parquets from Zilliz, builds base.db
+  build_base_db.py
+  base.db           # (generated)
+
+cohere10m/          # Cohere 768d cosine, 10M vectors (10 train shards)
+  Makefile          # make -j12 download to fetch all shards in parallel
+  build_base_db.py
+  base.db           # (generated)
+```
+
+Every `base.db` has the same schema:
+
+| Table | Columns | Description |
+|-------|---------|-------------|
+| `train` | `id INTEGER PRIMARY KEY, vector BLOB` | Indexed vectors (f32 blobs) |
+| `query_vectors` | `id INTEGER PRIMARY KEY, vector BLOB` | Query vectors for KNN evaluation |
+| `neighbors` | `query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT` | Ground-truth nearest neighbors |
+
+To add a new dataset, create a directory with a `Makefile` that builds `base.db`
+with the tables above. It will be available via `--dataset <dirname>` automatically.
+
+### Building datasets
+
+```bash
+# Cohere 1M
+cd cohere1m && make download && make && cd ..
+
+# Cohere 10M (parallel download recommended — 10 train shards + test + neighbors)
+cd cohere10m && make -j12 download && make && cd ..
+```

 ## Prerequisites

- Built `dist/vec0` extension (run `make` from repo root)
+- Built `dist/vec0` extension (run `make loadable` from repo root)
 - Python 3.10+
- `uv` (for seed data prep): `pip install uv`
+- `uv`

 ## Quick start

 ```bash
-# 1. Download dataset and build seed DB (~3 GB download, ~5 min)
-make seed
+# 1. Build a dataset
+cd cohere1m && make && cd ..

-# 2. Run a quick smoke test (5k vectors, ~1 min)
+# 2. Quick smoke test (5k vectors)
 make bench-smoke

-# 3. Run full benchmark at 10k
+# 3. Full benchmark at 10k
 make bench-10k
 ```

 ## Usage

-### Direct invocation
-
 ```bash
-python bench.py --subset-size 10000 \
+uv run python bench.py --subset-size 10000 -k 10 -n 50 --dataset cohere1m \
  "brute-float:type=baseline,variant=float" \
-  "brute-int8:type=baseline,variant=int8" \
-  "brute-bit:type=baseline,variant=bit"
+  "rescore-bit-os8:type=rescore,quantizer=bit,oversample=8"
 ```

 ### Config format

 `name:type=<index_type>,key=val,key=val`

-| Index type | Keys | Branch |
-|-----------|------|--------|
-| `baseline` | `variant` (float/int8/bit), `oversample` | this branch |
-
-Index branches register additional types in `INDEX_REGISTRY`. See the
-docstring in `bench.py` for the extension API.
+| Index type | Keys |
+|-----------|------|
+| `baseline` | `variant` (float/int8/bit), `oversample` |
+| `rescore` | `quantizer` (bit/int8), `oversample` |
+| `ivf` | `nlist`, `nprobe` |
+| `diskann` | `R`, `L`, `quantizer` (binary/int8), `buffer_threshold` |

 ### Make targets

 | Target | Description |
 |--------|-------------|
-| `make seed` | Download COHERE 1M dataset |
-| `make ground-truth` | Pre-compute ground truth for 10k/50k/100k |
-| `make bench-smoke` | Quick 5k baseline test |
+| `make seed` | Download and build default dataset |
+| `make bench-smoke` | Quick 5k test (3 configs) |
 | `make bench-10k` | All configs at 10k vectors |
 | `make bench-50k` | All configs at 50k vectors |
 | `make bench-100k` | All configs at 100k vectors |
 | `make bench-all` | 10k + 50k + 100k |
+| `make bench-ivf` | Baselines + IVF across 10k/50k/100k |
+| `make bench-diskann` | Baselines + DiskANN across 10k/50k/100k |
+
+## Results DB
+
+Each run writes to `runs/<dataset>/<subset_size>/results.db` (SQLite, WAL mode).
+Progress is written continuously — query from another terminal to monitor:
+
+```bash
+sqlite3 runs/cohere1m/10000/results.db "SELECT run_id, config_name, status FROM runs"
+```
+
+See `results_schema.sql` for the full schema (tables: `runs`, `run_results`,
+`insert_batches`, `queries`).

 ## Adding an index type

-In your index branch, add an entry to `INDEX_REGISTRY` in `bench.py` and
-append your configs to `ALL_CONFIGS` in the `Makefile`. See the existing
-`baseline` entry and the comments in both files for the pattern.
-
-## Results
-
-Results are stored in `runs/<dir>/results.db` using the schema in `schema.sql`.
-
-```bash
-sqlite3 runs/10k/results.db "
-  SELECT config_name, recall, mean_ms, qps
-  FROM bench_results
-  ORDER BY recall DESC
-"
-```
-
-## Dataset
-
-[Zilliz COHERE Medium 1M](https://zilliz.com/learn/datasets-for-vector-database-benchmarks):
-768 dimensions, cosine distance, 1M train vectors + 10k query vectors with precomputed neighbors.
+Add an entry to `INDEX_REGISTRY` in `bench.py` and append configs to
+`ALL_CONFIGS` in the `Makefile`. See existing entries for the pattern.
--- a/benchmarks-ann/bench.py
+++ b/benchmarks-ann/bench.py
--- a/benchmarks-ann/datasets/cohere10m/Makefile
+++ b/benchmarks-ann/datasets/cohere10m/Makefile
@ -0,0 +1,27 @@
+BASE_URL = https://assets.zilliz.com/benchmark/cohere_large_10m
+
+TRAIN_PARQUETS = $(shell printf 'train-%02d-of-10.parquet ' 0 1 2 3 4 5 6 7 8 9)
+OTHER_PARQUETS = test.parquet neighbors.parquet
+PARQUETS = $(TRAIN_PARQUETS) $(OTHER_PARQUETS)
+
+.PHONY: all download clean
+
+all: base.db
+
+# Use: make -j12 download
+download: $(PARQUETS)
+
+train-%-of-10.parquet:
+	curl -L -o $@ $(BASE_URL)/$@
+
+test.parquet:
+	curl -L -o $@ $(BASE_URL)/test.parquet
+
+neighbors.parquet:
+	curl -L -o $@ $(BASE_URL)/neighbors.parquet
+
+base.db: $(PARQUETS) build_base_db.py
+	uv run --with pandas --with pyarrow python build_base_db.py
+
+clean:
+	rm -f base.db
--- a/benchmarks-ann/datasets/cohere10m/build_base_db.py
+++ b/benchmarks-ann/datasets/cohere10m/build_base_db.py
@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+"""Build base.db from downloaded parquet files (10M dataset, 10 train shards).
+
+Reads train-00-of-10.parquet .. train-09-of-10.parquet, test.parquet,
+neighbors.parquet and creates a SQLite database with tables:
+  train, query_vectors, neighbors.
+
+Usage:
+  uv run --with pandas --with pyarrow python build_base_db.py
+"""
+import json
+import os
+import sqlite3
+import struct
+import sys
+import time
+
+import pandas as pd
+
+TRAIN_SHARDS = 10
+
+
+def float_list_to_blob(floats):
+    """Pack a list of floats into a little-endian f32 blob."""
+    return struct.pack(f"<{len(floats)}f", *floats)
+
+
+def main():
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    db_path = os.path.join(script_dir, "base.db")
+
+    train_paths = [
+        os.path.join(script_dir, f"train-{i:02d}-of-{TRAIN_SHARDS}.parquet")
+        for i in range(TRAIN_SHARDS)
+    ]
+    test_path = os.path.join(script_dir, "test.parquet")
+    neighbors_path = os.path.join(script_dir, "neighbors.parquet")
+
+    for p in train_paths + [test_path, neighbors_path]:
+        if not os.path.exists(p):
+            print(f"ERROR: {p} not found. Run 'make download' first.")
+            sys.exit(1)
+
+    if os.path.exists(db_path):
+        os.remove(db_path)
+
+    conn = sqlite3.connect(db_path)
+    conn.execute("PRAGMA journal_mode=WAL")
+    conn.execute("PRAGMA page_size=4096")
+
+    # --- query_vectors (from test.parquet) ---
+    print("Loading test.parquet (query vectors)...")
+    t0 = time.perf_counter()
+    df_test = pd.read_parquet(test_path)
+    conn.execute(
+        "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)"
+    )
+    rows = []
+    for _, row in df_test.iterrows():
+        rows.append((int(row["id"]), float_list_to_blob(row["emb"])))
+    conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows)
+    conn.commit()
+    print(f"  {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s")
+
+    # --- neighbors (from neighbors.parquet) ---
+    print("Loading neighbors.parquet...")
+    t0 = time.perf_counter()
+    df_neighbors = pd.read_parquet(neighbors_path)
+    conn.execute(
+        "CREATE TABLE neighbors ("
+        "  query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
+        "  UNIQUE(query_vector_id, rank))"
+    )
+    rows = []
+    for _, row in df_neighbors.iterrows():
+        qid = int(row["id"])
+        nids = row["neighbors_id"]
+        if isinstance(nids, str):
+            nids = json.loads(nids)
+        for rank, nid in enumerate(nids):
+            rows.append((qid, rank, str(int(nid))))
+    conn.executemany(
+        "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)",
+        rows,
+    )
+    conn.commit()
+    print(f"  {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s")
+
+    # --- train (from 10 shard parquets) ---
+    print(f"Loading {TRAIN_SHARDS} train shards (10M vectors, this will take a while)...")
+    conn.execute(
+        "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)"
+    )
+
+    global_t0 = time.perf_counter()
+    total_inserted = 0
+    batch_size = 10000
+
+    for shard_idx, train_path in enumerate(train_paths):
+        print(f"  Shard {shard_idx + 1}/{TRAIN_SHARDS}: {os.path.basename(train_path)}")
+        t0 = time.perf_counter()
+        df = pd.read_parquet(train_path)
+        shard_len = len(df)
+
+        for start in range(0, shard_len, batch_size):
+            chunk = df.iloc[start : start + batch_size]
+            rows = []
+            for _, row in chunk.iterrows():
+                rows.append((int(row["id"]), float_list_to_blob(row["emb"])))
+            conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows)
+            conn.commit()
+
+            total_inserted += len(rows)
+            if total_inserted % 100000 < batch_size:
+                elapsed = time.perf_counter() - global_t0
+                rate = total_inserted / elapsed if elapsed > 0 else 0
+                print(
+                    f"    {total_inserted:>10}  {elapsed:.0f}s  {rate:.0f} rows/s",
+                    flush=True,
+                )
+
+        shard_elapsed = time.perf_counter() - t0
+        print(f"    shard done: {shard_len} rows in {shard_elapsed:.1f}s")
+
+    elapsed = time.perf_counter() - global_t0
+    print(f"  {total_inserted} train vectors in {elapsed:.1f}s")
+
+    conn.close()
+    size_mb = os.path.getsize(db_path) / (1024 * 1024)
+    print(f"\nDone: {db_path} ({size_mb:.0f} MB)")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks-ann/datasets/cohere1m/.gitignore
+++ b/benchmarks-ann/datasets/cohere1m/.gitignore
--- a/benchmarks-ann/datasets/cohere1m/Makefile
+++ b/benchmarks-ann/datasets/cohere1m/Makefile
--- a/benchmarks-ann/datasets/cohere1m/build_base_db.py
+++ b/benchmarks-ann/datasets/cohere1m/build_base_db.py
--- a/benchmarks-ann/datasets/nyt-1024/Makefile
+++ b/benchmarks-ann/datasets/nyt-1024/Makefile
@ -0,0 +1,30 @@
+MODEL ?= mixedbread-ai/mxbai-embed-large-v1
+K ?= 100
+BATCH_SIZE ?= 256
+DATA_DIR ?= ../nyt/data
+
+all: base.db
+
+# Reuse data from ../nyt
+$(DATA_DIR):
+	$(MAKE) -C ../nyt data
+
+contents.db: $(DATA_DIR)
+	uv run ../nyt-768/build-contents.py --data-dir $(DATA_DIR) -o $@
+
+base.db: contents.db queries.txt
+	uv run build-base.py \
+		--contents-db contents.db \
+		--model $(MODEL) \
+		--queries-file queries.txt \
+		--batch-size $(BATCH_SIZE) \
+		--k $(K) \
+		-o $@
+
+queries.txt:
+	cp ../nyt/queries.txt $@
+
+clean:
+	rm -f base.db contents.db
+
+.PHONY: all clean
--- a/benchmarks-ann/datasets/nyt-1024/build-base.py
+++ b/benchmarks-ann/datasets/nyt-1024/build-base.py
@ -0,0 +1,163 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "sentence-transformers",
+#     "torch<=2.7",
+#     "tqdm",
+# ]
+# ///
+
+import argparse
+import sqlite3
+from array import array
+from itertools import batched
+
+from sentence_transformers import SentenceTransformer
+from tqdm import tqdm
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors",
+    )
+    parser.add_argument(
+        "--contents-db", "-c", default=None,
+        help="Path to contents.db (source of headlines and IDs)",
+    )
+    parser.add_argument(
+        "--model", "-m", default="mixedbread-ai/mxbai-embed-large-v1",
+        help="HuggingFace model ID (default: mixedbread-ai/mxbai-embed-large-v1)",
+    )
+    parser.add_argument(
+        "--queries-file", "-q", default="queries.txt",
+        help="Path to the queries file (default: queries.txt)",
+    )
+    parser.add_argument(
+        "--output", "-o", required=True,
+        help="Path to the output base.db",
+    )
+    parser.add_argument(
+        "--batch-size", "-b", type=int, default=256,
+        help="Batch size for embedding (default: 256)",
+    )
+    parser.add_argument(
+        "--k", "-k", type=int, default=100,
+        help="Number of nearest neighbors (default: 100)",
+    )
+    parser.add_argument(
+        "--limit", "-l", type=int, default=0,
+        help="Limit number of headlines to embed (0 = all)",
+    )
+    parser.add_argument(
+        "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0",
+        help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)",
+    )
+    parser.add_argument(
+        "--skip-neighbors", action="store_true",
+        help="Skip the brute-force KNN neighbor computation",
+    )
+    args = parser.parse_args()
+
+    import os
+    vec_path = os.path.expanduser(args.vec_path)
+
+    print(f"Loading model {args.model}...")
+    model = SentenceTransformer(args.model)
+
+    # Read headlines from contents.db
+    src = sqlite3.connect(args.contents_db)
+    limit_clause = f" LIMIT {args.limit}" if args.limit > 0 else ""
+    headlines = src.execute(
+        f"SELECT id, headline FROM contents ORDER BY id{limit_clause}"
+    ).fetchall()
+    src.close()
+    print(f"Loaded {len(headlines)} headlines from {args.contents_db}")
+
+    # Read queries
+    with open(args.queries_file) as f:
+        queries = [line.strip() for line in f if line.strip()]
+    print(f"Loaded {len(queries)} queries from {args.queries_file}")
+
+    # Create output database
+    db = sqlite3.connect(args.output)
+    db.enable_load_extension(True)
+    db.load_extension(vec_path)
+    db.enable_load_extension(False)
+
+    db.execute("CREATE TABLE IF NOT EXISTS train(id INTEGER PRIMARY KEY, vector BLOB)")
+    db.execute("CREATE TABLE IF NOT EXISTS query_vectors(id INTEGER PRIMARY KEY, vector BLOB)")
+    db.execute(
+        "CREATE TABLE IF NOT EXISTS neighbors("
+        "  query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
+        "  UNIQUE(query_vector_id, rank))"
+    )
+
+    # Step 1: Embed headlines -> train table
+    print("Embedding headlines...")
+    for batch in tqdm(
+        batched(headlines, args.batch_size),
+        total=(len(headlines) + args.batch_size - 1) // args.batch_size,
+    ):
+        ids = [r[0] for r in batch]
+        texts = [r[1] for r in batch]
+        embeddings = model.encode(texts, normalize_embeddings=True)
+
+        params = [
+            (int(rid), array("f", emb.tolist()).tobytes())
+            for rid, emb in zip(ids, embeddings)
+        ]
+        db.executemany("INSERT INTO train VALUES (?, ?)", params)
+        db.commit()
+
+    del headlines
+    n = db.execute("SELECT count(*) FROM train").fetchone()[0]
+    print(f"Embedded {n} headlines")
+
+    # Step 2: Embed queries -> query_vectors table
+    print("Embedding queries...")
+    query_embeddings = model.encode(queries, normalize_embeddings=True)
+    query_params = []
+    for i, emb in enumerate(query_embeddings, 1):
+        blob = array("f", emb.tolist()).tobytes()
+        query_params.append((i, blob))
+    db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params)
+    db.commit()
+    print(f"Embedded {len(queries)} queries")
+
+    if args.skip_neighbors:
+        db.close()
+        print(f"Done (skipped neighbors). Wrote {args.output}")
+        return
+
+    # Step 3: Brute-force KNN via sqlite-vec -> neighbors table
+    n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0]
+    print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...")
+    for query_id, query_blob in tqdm(
+        db.execute("SELECT id, vector FROM query_vectors").fetchall()
+    ):
+        results = db.execute(
+            """
+            SELECT
+                train.id,
+                vec_distance_cosine(train.vector, ?) AS distance
+            FROM train
+            WHERE distance IS NOT NULL
+            ORDER BY distance ASC
+            LIMIT ?
+            """,
+            (query_blob, args.k),
+        ).fetchall()
+
+        params = [
+            (query_id, rank, str(rid))
+            for rank, (rid, _dist) in enumerate(results)
+        ]
+        db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params)
+
+    db.commit()
+    db.close()
+    print(f"Done. Wrote {args.output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks-ann/datasets/nyt-1024/queries.txt
+++ b/benchmarks-ann/datasets/nyt-1024/queries.txt
@ -0,0 +1,100 @@
+latest news on climate change policy
+presidential election results and analysis
+stock market crash causes
+coronavirus vaccine development updates
+artificial intelligence breakthrough in healthcare
+supreme court ruling on abortion rights
+tech companies layoff announcements
+earthquake damages in California
+cybersecurity breach at major corporation
+space exploration mission to Mars
+immigration reform legislation debate
+renewable energy investment trends
+healthcare costs rising across America
+protests against police brutality
+wildfires destroy homes in the West
+Olympic games highlights and records
+celebrity scandal rocks Hollywood
+breakthrough cancer treatment discovered
+housing market bubble concerns
+federal reserve interest rate decision
+school shooting tragedy response
+diplomatic tensions between superpowers
+drone strike kills terrorist leader
+social media platform faces regulation
+archaeological discovery reveals ancient civilization
+unemployment rate hits record low
+autonomous vehicles testing expansion
+streaming service launches original content
+opioid crisis intervention programs
+trade war tariffs impact economy
+infrastructure bill passes Congress
+data privacy concerns grow
+minimum wage increase proposal
+college admissions scandal exposed
+NFL player protest during anthem
+cryptocurrency regulation debate
+pandemic lockdown restrictions eased
+mass shooting gun control debate
+tax reform legislation impact
+ransomware attack cripples pipeline
+climate activists stage demonstration
+sports team wins championship
+banking system collapse fears
+pharmaceutical company fraud charges
+genetic engineering ethical concerns
+border wall funding controversy
+impeachment proceedings begin
+nuclear weapons treaty violation
+artificial meat alternative launch
+student loan debt forgiveness
+venture capital funding decline
+facial recognition ban proposed
+election interference investigation
+pandemic preparedness failures
+police reform measures announced
+wildfire prevention strategies
+ocean pollution crisis worsens
+manufacturing jobs returning
+pension fund shortfall concerns
+antitrust investigation launched
+voting rights protection act
+mental health awareness campaign
+homeless population increasing
+space debris collision risk
+drug cartel violence escalates
+renewable energy jobs growth
+infrastructure deterioration report
+vaccine mandate legal challenge
+cryptocurrency market volatility
+autonomous drone delivery service
+deep fake technology dangers
+Arctic ice melting accelerates
+income inequality gap widens
+election fraud claims disputed
+corporate merger blocked
+medical breakthrough extends life
+transportation strike disrupts city
+racial justice protests spread
+carbon emissions reduction goals
+financial crisis warning signs
+cyberbullying prevention efforts
+asteroid near miss with Earth
+gene therapy approval granted
+labor union organizing drive
+surveillance technology expansion
+education funding cuts proposed
+disaster relief efforts underway
+housing affordability crisis
+clean water access shortage
+artificial intelligence job displacement
+trade agreement negotiations
+prison reform initiative launched
+species extinction accelerates
+political corruption scandal
+terrorism threat level raised
+food safety contamination outbreak
+ai model release
+affordability interest rates
+peanut allergies in newbons
+breaking bad walter white
--- a/benchmarks-ann/datasets/nyt-384/Makefile
+++ b/benchmarks-ann/datasets/nyt-384/Makefile
@ -0,0 +1,29 @@
+MODEL ?= mixedbread-ai/mxbai-embed-xsmall-v1
+K ?= 100
+BATCH_SIZE ?= 512
+DATA_DIR ?= ../nyt/data
+
+all: base.db
+
+$(DATA_DIR):
+	$(MAKE) -C ../nyt data
+
+contents.db: $(DATA_DIR)
+	uv run ../nyt-768/build-contents.py --data-dir $(DATA_DIR) -o $@
+
+base.db: contents.db queries.txt
+	uv run ../nyt-1024/build-base.py \
+		--contents-db contents.db \
+		--model $(MODEL) \
+		--queries-file queries.txt \
+		--batch-size $(BATCH_SIZE) \
+		--k $(K) \
+		-o $@
+
+queries.txt:
+	cp ../nyt/queries.txt $@
+
+clean:
+	rm -f base.db contents.db
+
+.PHONY: all clean
--- a/benchmarks-ann/datasets/nyt-384/queries.txt
+++ b/benchmarks-ann/datasets/nyt-384/queries.txt
@ -0,0 +1,100 @@
+latest news on climate change policy
+presidential election results and analysis
+stock market crash causes
+coronavirus vaccine development updates
+artificial intelligence breakthrough in healthcare
+supreme court ruling on abortion rights
+tech companies layoff announcements
+earthquake damages in California
+cybersecurity breach at major corporation
+space exploration mission to Mars
+immigration reform legislation debate
+renewable energy investment trends
+healthcare costs rising across America
+protests against police brutality
+wildfires destroy homes in the West
+Olympic games highlights and records
+celebrity scandal rocks Hollywood
+breakthrough cancer treatment discovered
+housing market bubble concerns
+federal reserve interest rate decision
+school shooting tragedy response
+diplomatic tensions between superpowers
+drone strike kills terrorist leader
+social media platform faces regulation
+archaeological discovery reveals ancient civilization
+unemployment rate hits record low
+autonomous vehicles testing expansion
+streaming service launches original content
+opioid crisis intervention programs
+trade war tariffs impact economy
+infrastructure bill passes Congress
+data privacy concerns grow
+minimum wage increase proposal
+college admissions scandal exposed
+NFL player protest during anthem
+cryptocurrency regulation debate
+pandemic lockdown restrictions eased
+mass shooting gun control debate
+tax reform legislation impact
+ransomware attack cripples pipeline
+climate activists stage demonstration
+sports team wins championship
+banking system collapse fears
+pharmaceutical company fraud charges
+genetic engineering ethical concerns
+border wall funding controversy
+impeachment proceedings begin
+nuclear weapons treaty violation
+artificial meat alternative launch
+student loan debt forgiveness
+venture capital funding decline
+facial recognition ban proposed
+election interference investigation
+pandemic preparedness failures
+police reform measures announced
+wildfire prevention strategies
+ocean pollution crisis worsens
+manufacturing jobs returning
+pension fund shortfall concerns
+antitrust investigation launched
+voting rights protection act
+mental health awareness campaign
+homeless population increasing
+space debris collision risk
+drug cartel violence escalates
+renewable energy jobs growth
+infrastructure deterioration report
+vaccine mandate legal challenge
+cryptocurrency market volatility
+autonomous drone delivery service
+deep fake technology dangers
+Arctic ice melting accelerates
+income inequality gap widens
+election fraud claims disputed
+corporate merger blocked
+medical breakthrough extends life
+transportation strike disrupts city
+racial justice protests spread
+carbon emissions reduction goals
+financial crisis warning signs
+cyberbullying prevention efforts
+asteroid near miss with Earth
+gene therapy approval granted
+labor union organizing drive
+surveillance technology expansion
+education funding cuts proposed
+disaster relief efforts underway
+housing affordability crisis
+clean water access shortage
+artificial intelligence job displacement
+trade agreement negotiations
+prison reform initiative launched
+species extinction accelerates
+political corruption scandal
+terrorism threat level raised
+food safety contamination outbreak
+ai model release
+affordability interest rates
+peanut allergies in newbons
+breaking bad walter white
--- a/benchmarks-ann/datasets/nyt-768/Makefile
+++ b/benchmarks-ann/datasets/nyt-768/Makefile
@ -0,0 +1,37 @@
+MODEL ?= bge-base-en-v1.5-768
+K ?= 100
+BATCH_SIZE ?= 512
+DATA_DIR ?= ../nyt/data
+
+all: base.db
+
+# Reuse data from ../nyt
+$(DATA_DIR):
+	$(MAKE) -C ../nyt data
+
+# Distill model (separate step, may take a while)
+$(MODEL):
+	uv run distill-model.py
+
+contents.db: $(DATA_DIR)
+	uv run build-contents.py --data-dir $(DATA_DIR) -o $@
+
+base.db: contents.db queries.txt $(MODEL)
+	uv run ../nyt/build-base.py \
+		--contents-db contents.db \
+		--model $(MODEL) \
+		--queries-file queries.txt \
+		--batch-size $(BATCH_SIZE) \
+		--k $(K) \
+		-o $@
+
+queries.txt:
+	cp ../nyt/queries.txt $@
+
+clean:
+	rm -f base.db contents.db
+
+clean-all: clean
+	rm -rf $(MODEL)
+
+.PHONY: all clean clean-all
--- a/benchmarks-ann/datasets/nyt-768/build-contents.py
+++ b/benchmarks-ann/datasets/nyt-768/build-contents.py
@ -0,0 +1,64 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "duckdb",
+# ]
+# ///
+
+import argparse
+import sqlite3
+import duckdb
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Load NYT headline CSVs into a SQLite contents database (most recent 1M, deduplicated)",
+    )
+    parser.add_argument(
+        "--data-dir", "-d", default="../nyt/data",
+        help="Directory containing NYT CSV files (default: ../nyt/data)",
+    )
+    parser.add_argument(
+        "--limit", "-l", type=int, default=1_000_000,
+        help="Maximum number of headlines to keep (default: 1000000)",
+    )
+    parser.add_argument(
+        "--output", "-o", required=True,
+        help="Path to the output SQLite database",
+    )
+    args = parser.parse_args()
+
+    glob_pattern = f"{args.data_dir}/new_york_times_stories_*.csv"
+
+    con = duckdb.connect()
+    rows = con.execute(
+        f"""
+        WITH deduped AS (
+            SELECT
+                headline,
+                max(pub_date) AS pub_date
+            FROM read_csv('{glob_pattern}', auto_detect=true, union_by_name=true)
+            WHERE headline IS NOT NULL AND trim(headline) != ''
+            GROUP BY headline
+        )
+        SELECT
+            row_number() OVER (ORDER BY pub_date DESC) AS id,
+            headline
+        FROM deduped
+        ORDER BY pub_date DESC
+        LIMIT {args.limit}
+        """
+    ).fetchall()
+    con.close()
+
+    db = sqlite3.connect(args.output)
+    db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
+    db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
+    db.commit()
+    db.close()
+
+    print(f"Wrote {len(rows)} headlines to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks-ann/datasets/nyt-768/distill-model.py
+++ b/benchmarks-ann/datasets/nyt-768/distill-model.py
@ -0,0 +1,13 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "model2vec[distill]",
+#     "torch<=2.7",
+# ]
+# ///
+
+from model2vec.distill import distill
+
+model = distill(model_name="BAAI/bge-base-en-v1.5", pca_dims=768)
+model.save_pretrained("bge-base-en-v1.5-768")
+print("Saved distilled model to bge-base-en-v1.5-768/")
--- a/benchmarks-ann/datasets/nyt-768/queries.txt
+++ b/benchmarks-ann/datasets/nyt-768/queries.txt
@ -0,0 +1,100 @@
+latest news on climate change policy
+presidential election results and analysis
+stock market crash causes
+coronavirus vaccine development updates
+artificial intelligence breakthrough in healthcare
+supreme court ruling on abortion rights
+tech companies layoff announcements
+earthquake damages in California
+cybersecurity breach at major corporation
+space exploration mission to Mars
+immigration reform legislation debate
+renewable energy investment trends
+healthcare costs rising across America
+protests against police brutality
+wildfires destroy homes in the West
+Olympic games highlights and records
+celebrity scandal rocks Hollywood
+breakthrough cancer treatment discovered
+housing market bubble concerns
+federal reserve interest rate decision
+school shooting tragedy response
+diplomatic tensions between superpowers
+drone strike kills terrorist leader
+social media platform faces regulation
+archaeological discovery reveals ancient civilization
+unemployment rate hits record low
+autonomous vehicles testing expansion
+streaming service launches original content
+opioid crisis intervention programs
+trade war tariffs impact economy
+infrastructure bill passes Congress
+data privacy concerns grow
+minimum wage increase proposal
+college admissions scandal exposed
+NFL player protest during anthem
+cryptocurrency regulation debate
+pandemic lockdown restrictions eased
+mass shooting gun control debate
+tax reform legislation impact
+ransomware attack cripples pipeline
+climate activists stage demonstration
+sports team wins championship
+banking system collapse fears
+pharmaceutical company fraud charges
+genetic engineering ethical concerns
+border wall funding controversy
+impeachment proceedings begin
+nuclear weapons treaty violation
+artificial meat alternative launch
+student loan debt forgiveness
+venture capital funding decline
+facial recognition ban proposed
+election interference investigation
+pandemic preparedness failures
+police reform measures announced
+wildfire prevention strategies
+ocean pollution crisis worsens
+manufacturing jobs returning
+pension fund shortfall concerns
+antitrust investigation launched
+voting rights protection act
+mental health awareness campaign
+homeless population increasing
+space debris collision risk
+drug cartel violence escalates
+renewable energy jobs growth
+infrastructure deterioration report
+vaccine mandate legal challenge
+cryptocurrency market volatility
+autonomous drone delivery service
+deep fake technology dangers
+Arctic ice melting accelerates
+income inequality gap widens
+election fraud claims disputed
+corporate merger blocked
+medical breakthrough extends life
+transportation strike disrupts city
+racial justice protests spread
+carbon emissions reduction goals
+financial crisis warning signs
+cyberbullying prevention efforts
+asteroid near miss with Earth
+gene therapy approval granted
+labor union organizing drive
+surveillance technology expansion
+education funding cuts proposed
+disaster relief efforts underway
+housing affordability crisis
+clean water access shortage
+artificial intelligence job displacement
+trade agreement negotiations
+prison reform initiative launched
+species extinction accelerates
+political corruption scandal
+terrorism threat level raised
+food safety contamination outbreak
+ai model release
+affordability interest rates
+peanut allergies in newbons
+breaking bad walter white
--- a/benchmarks-ann/datasets/nyt/.gitignore
+++ b/benchmarks-ann/datasets/nyt/.gitignore
@ -0,0 +1 @@
+data/
--- a/benchmarks-ann/datasets/nyt/Makefile
+++ b/benchmarks-ann/datasets/nyt/Makefile
@ -0,0 +1,30 @@
+MODEL ?= minishlab/potion-base-8M
+K ?= 100
+BATCH_SIZE ?= 512
+DATA_DIR ?= data
+
+all: base.db contents.db
+
+# Download NYT headlines CSVs from Kaggle (requires `kaggle` CLI + API token)
+$(DATA_DIR):
+	kaggle datasets download -d johnbandy/new-york-times-headlines -p $(DATA_DIR) --unzip
+
+contents.db: $(DATA_DIR)
+	uv run build-contents.py --data-dir $(DATA_DIR) -o $@
+
+base.db: contents.db queries.txt
+	uv run build-base.py \
+		--contents-db contents.db \
+		--model $(MODEL) \
+		--queries-file queries.txt \
+		--batch-size $(BATCH_SIZE) \
+		--k $(K) \
+		-o $@
+
+clean:
+	rm -f base.db contents.db
+
+clean-all: clean
+	rm -rf $(DATA_DIR)
+
+.PHONY: all clean clean-all
--- a/benchmarks-ann/datasets/nyt/build-base.py
+++ b/benchmarks-ann/datasets/nyt/build-base.py
@ -0,0 +1,165 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "model2vec",
+#     "torch<=2.7",
+#     "tqdm",
+# ]
+# ///
+
+import argparse
+import sqlite3
+from array import array
+from itertools import batched
+
+from model2vec import StaticModel
+from tqdm import tqdm
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors",
+    )
+    parser.add_argument(
+        "--contents-db", "-c", default=None,
+        help="Path to contents.db (source of headlines and IDs)",
+    )
+    parser.add_argument(
+        "--model", "-m", default="minishlab/potion-base-8M",
+        help="HuggingFace model ID or local path (default: minishlab/potion-base-8M)",
+    )
+    parser.add_argument(
+        "--queries-file", "-q", default="queries.txt",
+        help="Path to the queries file (default: queries.txt)",
+    )
+    parser.add_argument(
+        "--output", "-o", required=True,
+        help="Path to the output base.db",
+    )
+    parser.add_argument(
+        "--batch-size", "-b", type=int, default=512,
+        help="Batch size for embedding (default: 512)",
+    )
+    parser.add_argument(
+        "--k", "-k", type=int, default=100,
+        help="Number of nearest neighbors (default: 100)",
+    )
+    parser.add_argument(
+        "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0",
+        help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)",
+    )
+    parser.add_argument(
+        "--rebuild-neighbors", action="store_true",
+        help="Only rebuild the neighbors table (skip embedding steps)",
+    )
+    args = parser.parse_args()
+
+    import os
+    vec_path = os.path.expanduser(args.vec_path)
+
+    if args.rebuild_neighbors:
+        # Skip embedding, just open existing DB and rebuild neighbors
+        db = sqlite3.connect(args.output)
+        db.enable_load_extension(True)
+        db.load_extension(vec_path)
+        db.enable_load_extension(False)
+        db.execute("DROP TABLE IF EXISTS neighbors")
+        db.execute(
+            "CREATE TABLE neighbors("
+            "  query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
+            "  UNIQUE(query_vector_id, rank))"
+        )
+        print(f"Rebuilding neighbors in {args.output}...")
+    else:
+        print(f"Loading model {args.model}...")
+        model = StaticModel.from_pretrained(args.model)
+
+        # Read headlines from contents.db
+        src = sqlite3.connect(args.contents_db)
+        headlines = src.execute("SELECT id, headline FROM contents ORDER BY id").fetchall()
+        src.close()
+        print(f"Loaded {len(headlines)} headlines from {args.contents_db}")
+
+        # Read queries
+        with open(args.queries_file) as f:
+            queries = [line.strip() for line in f if line.strip()]
+        print(f"Loaded {len(queries)} queries from {args.queries_file}")
+
+        # Create output database
+        db = sqlite3.connect(args.output)
+        db.enable_load_extension(True)
+        db.load_extension(vec_path)
+        db.enable_load_extension(False)
+
+        db.execute("CREATE TABLE train(id INTEGER PRIMARY KEY, vector BLOB)")
+        db.execute("CREATE TABLE query_vectors(id INTEGER PRIMARY KEY, vector BLOB)")
+        db.execute(
+            "CREATE TABLE neighbors("
+            "  query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
+            "  UNIQUE(query_vector_id, rank))"
+        )
+
+        # Step 1: Embed headlines -> train table
+        print("Embedding headlines...")
+        for batch in tqdm(
+            batched(headlines, args.batch_size),
+            total=(len(headlines) + args.batch_size - 1) // args.batch_size,
+        ):
+            ids = [r[0] for r in batch]
+            texts = [r[1] for r in batch]
+            embeddings = model.encode(texts)
+
+            params = [
+                (int(rid), array("f", emb.tolist()).tobytes())
+                for rid, emb in zip(ids, embeddings)
+            ]
+            db.executemany("INSERT INTO train VALUES (?, ?)", params)
+            db.commit()
+
+        del headlines
+        n = db.execute("SELECT count(*) FROM train").fetchone()[0]
+        print(f"Embedded {n} headlines")
+
+        # Step 2: Embed queries -> query_vectors table
+        print("Embedding queries...")
+        query_embeddings = model.encode(queries)
+        query_params = []
+        for i, emb in enumerate(query_embeddings, 1):
+            blob = array("f", emb.tolist()).tobytes()
+            query_params.append((i, blob))
+        db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params)
+        db.commit()
+        print(f"Embedded {len(queries)} queries")
+
+    # Step 3: Brute-force KNN via sqlite-vec -> neighbors table
+    n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0]
+    print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...")
+    for query_id, query_blob in tqdm(
+        db.execute("SELECT id, vector FROM query_vectors").fetchall()
+    ):
+        results = db.execute(
+            """
+            SELECT
+                train.id,
+                vec_distance_cosine(train.vector, ?) AS distance
+            FROM train
+            WHERE distance IS NOT NULL
+            ORDER BY distance ASC
+            LIMIT ?
+            """,
+            (query_blob, args.k),
+        ).fetchall()
+
+        params = [
+            (query_id, rank, str(rid))
+            for rank, (rid, _dist) in enumerate(results)
+        ]
+        db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params)
+
+    db.commit()
+    db.close()
+    print(f"Done. Wrote {args.output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks-ann/datasets/nyt/build-contents.py
+++ b/benchmarks-ann/datasets/nyt/build-contents.py
@ -0,0 +1,52 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "duckdb",
+# ]
+# ///
+
+import argparse
+import os
+import sqlite3
+import duckdb
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Load NYT headline CSVs into a SQLite contents database via DuckDB",
+    )
+    parser.add_argument(
+        "--data-dir", "-d", default="data",
+        help="Directory containing NYT CSV files (default: data)",
+    )
+    parser.add_argument(
+        "--output", "-o", required=True,
+        help="Path to the output SQLite database",
+    )
+    args = parser.parse_args()
+
+    glob_pattern = os.path.join(args.data_dir, "new_york_times_stories_*.csv")
+
+    con = duckdb.connect()
+    rows = con.execute(
+        f"""
+        SELECT
+            row_number() OVER () AS id,
+            headline
+        FROM read_csv('{glob_pattern}', auto_detect=true, union_by_name=true)
+        WHERE headline IS NOT NULL AND headline != ''
+        """
+    ).fetchall()
+    con.close()
+
+    db = sqlite3.connect(args.output)
+    db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
+    db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
+    db.commit()
+    db.close()
+
+    print(f"Wrote {len(rows)} headlines to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks-ann/datasets/nyt/queries.txt
+++ b/benchmarks-ann/datasets/nyt/queries.txt
@ -0,0 +1,100 @@
+latest news on climate change policy
+presidential election results and analysis
+stock market crash causes
+coronavirus vaccine development updates
+artificial intelligence breakthrough in healthcare
+supreme court ruling on abortion rights
+tech companies layoff announcements
+earthquake damages in California
+cybersecurity breach at major corporation
+space exploration mission to Mars
+immigration reform legislation debate
+renewable energy investment trends
+healthcare costs rising across America
+protests against police brutality
+wildfires destroy homes in the West
+Olympic games highlights and records
+celebrity scandal rocks Hollywood
+breakthrough cancer treatment discovered
+housing market bubble concerns
+federal reserve interest rate decision
+school shooting tragedy response
+diplomatic tensions between superpowers
+drone strike kills terrorist leader
+social media platform faces regulation
+archaeological discovery reveals ancient civilization
+unemployment rate hits record low
+autonomous vehicles testing expansion
+streaming service launches original content
+opioid crisis intervention programs
+trade war tariffs impact economy
+infrastructure bill passes Congress
+data privacy concerns grow
+minimum wage increase proposal
+college admissions scandal exposed
+NFL player protest during anthem
+cryptocurrency regulation debate
+pandemic lockdown restrictions eased
+mass shooting gun control debate
+tax reform legislation impact
+ransomware attack cripples pipeline
+climate activists stage demonstration
+sports team wins championship
+banking system collapse fears
+pharmaceutical company fraud charges
+genetic engineering ethical concerns
+border wall funding controversy
+impeachment proceedings begin
+nuclear weapons treaty violation
+artificial meat alternative launch
+student loan debt forgiveness
+venture capital funding decline
+facial recognition ban proposed
+election interference investigation
+pandemic preparedness failures
+police reform measures announced
+wildfire prevention strategies
+ocean pollution crisis worsens
+manufacturing jobs returning
+pension fund shortfall concerns
+antitrust investigation launched
+voting rights protection act
+mental health awareness campaign
+homeless population increasing
+space debris collision risk
+drug cartel violence escalates
+renewable energy jobs growth
+infrastructure deterioration report
+vaccine mandate legal challenge
+cryptocurrency market volatility
+autonomous drone delivery service
+deep fake technology dangers
+Arctic ice melting accelerates
+income inequality gap widens
+election fraud claims disputed
+corporate merger blocked
+medical breakthrough extends life
+transportation strike disrupts city
+racial justice protests spread
+carbon emissions reduction goals
+financial crisis warning signs
+cyberbullying prevention efforts
+asteroid near miss with Earth
+gene therapy approval granted
+labor union organizing drive
+surveillance technology expansion
+education funding cuts proposed
+disaster relief efforts underway
+housing affordability crisis
+clean water access shortage
+artificial intelligence job displacement
+trade agreement negotiations
+prison reform initiative launched
+species extinction accelerates
+political corruption scandal
+terrorism threat level raised
+food safety contamination outbreak
+ai model release
+affordability interest rates
+peanut allergies in newbons
+breaking bad walter white
--- a/benchmarks-ann/faiss_kmeans.py
+++ b/benchmarks-ann/faiss_kmeans.py
@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+"""Compute k-means centroids using FAISS and save to a centroids DB.
+
+Reads the first N vectors from a base.db, runs FAISS k-means, and writes
+the centroids to an output SQLite DB as float32 blobs.
+
+Usage:
+  python faiss_kmeans.py --base-db datasets/cohere10m/base.db --ntrain 100000 \
+    --nclusters 8192 -o centroids.db
+
+Output schema:
+  CREATE TABLE centroids (
+    centroid_id INTEGER PRIMARY KEY,
+    centroid BLOB NOT NULL          -- float32[D]
+  );
+  CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT);
+    -- ntrain, nclusters, dimensions, elapsed_s
+"""
+import argparse
+import os
+import sqlite3
+import struct
+import time
+
+import faiss
+import numpy as np
+
+
+def main():
+    parser = argparse.ArgumentParser(description="FAISS k-means centroid computation")
+    parser.add_argument("--base-db", required=True, help="path to base.db with train table")
+    parser.add_argument("--ntrain", type=int, required=True, help="number of vectors to train on")
+    parser.add_argument("--nclusters", type=int, required=True, help="number of clusters (nlist)")
+    parser.add_argument("--niter", type=int, default=20, help="k-means iterations (default 20)")
+    parser.add_argument("--seed", type=int, default=42, help="random seed")
+    parser.add_argument("-o", "--output", required=True, help="output centroids DB path")
+    args = parser.parse_args()
+
+    # Load vectors
+    print(f"Loading {args.ntrain} vectors from {args.base_db}...")
+    conn = sqlite3.connect(args.base_db)
+    rows = conn.execute(
+        "SELECT vector FROM train ORDER BY id LIMIT ?", (args.ntrain,)
+    ).fetchall()
+    conn.close()
+
+    # Parse float32 blobs to numpy
+    first_blob = rows[0][0]
+    D = len(first_blob) // 4  # float32
+    print(f"  Dimensions: {D}, loaded {len(rows)} vectors")
+
+    vectors = np.zeros((len(rows), D), dtype=np.float32)
+    for i, (blob,) in enumerate(rows):
+        vectors[i] = np.frombuffer(blob, dtype=np.float32)
+
+    # Normalize for cosine distance (FAISS k-means on L2 of unit vectors ≈ cosine)
+    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
+    norms[norms == 0] = 1
+    vectors /= norms
+
+    # Run FAISS k-means
+    print(f"Running k-means: {args.nclusters} clusters, {args.niter} iterations...")
+    t0 = time.perf_counter()
+    kmeans = faiss.Kmeans(
+        D, args.nclusters,
+        niter=args.niter,
+        seed=args.seed,
+        verbose=True,
+        gpu=False,
+    )
+    kmeans.train(vectors)
+    elapsed = time.perf_counter() - t0
+    print(f"  Done in {elapsed:.1f}s")
+
+    centroids = kmeans.centroids  # (nclusters, D) float32
+
+    # Write output DB
+    if os.path.exists(args.output):
+        os.remove(args.output)
+    out = sqlite3.connect(args.output)
+    out.execute("CREATE TABLE centroids (centroid_id INTEGER PRIMARY KEY, centroid BLOB NOT NULL)")
+    out.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT)")
+
+    for i in range(args.nclusters):
+        blob = centroids[i].tobytes()
+        out.execute("INSERT INTO centroids (centroid_id, centroid) VALUES (?, ?)", (i, blob))
+
+    out.execute("INSERT INTO meta VALUES ('ntrain', ?)", (str(args.ntrain),))
+    out.execute("INSERT INTO meta VALUES ('nclusters', ?)", (str(args.nclusters),))
+    out.execute("INSERT INTO meta VALUES ('dimensions', ?)", (str(D),))
+    out.execute("INSERT INTO meta VALUES ('niter', ?)", (str(args.niter),))
+    out.execute("INSERT INTO meta VALUES ('elapsed_s', ?)", (str(round(elapsed, 3)),))
+    out.execute("INSERT INTO meta VALUES ('seed', ?)", (str(args.seed),))
+    out.commit()
+    out.close()
+
+    print(f"Wrote {args.nclusters} centroids to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks-ann/results_schema.sql
+++ b/benchmarks-ann/results_schema.sql
@ -0,0 +1,76 @@
+-- Comprehensive results schema for vec0 KNN benchmark runs.
+-- Created in WAL mode: PRAGMA journal_mode=WAL
+
+CREATE TABLE IF NOT EXISTS runs (
+  run_id          INTEGER PRIMARY KEY AUTOINCREMENT,
+  config_name     TEXT    NOT NULL,
+  index_type      TEXT    NOT NULL,
+  params          TEXT    NOT NULL,   -- JSON: {"R":48,"L":128,"quantizer":"binary"}
+  dataset         TEXT    NOT NULL,   -- "cohere1m"
+  subset_size     INTEGER NOT NULL,
+  k               INTEGER NOT NULL,
+  n_queries       INTEGER NOT NULL,
+  phase           TEXT    NOT NULL DEFAULT 'both',
+    -- 'build', 'query', or 'both'
+  status          TEXT    NOT NULL DEFAULT 'pending',
+    -- pending → inserting → training → querying → done | built | error
+  created_at_ns   INTEGER NOT NULL    -- time.time_ns()
+);
+
+CREATE TABLE IF NOT EXISTS run_results (
+  run_id              INTEGER PRIMARY KEY REFERENCES runs(run_id),
+  insert_started_ns   INTEGER,
+  insert_ended_ns     INTEGER,
+  insert_duration_ns  INTEGER,
+  train_started_ns    INTEGER,       -- NULL if no training
+  train_ended_ns      INTEGER,
+  train_duration_ns   INTEGER,
+  build_duration_ns   INTEGER,       -- insert + train
+  db_file_size_bytes  INTEGER,
+  db_file_path        TEXT,
+  create_sql          TEXT,           -- CREATE VIRTUAL TABLE ...
+  insert_sql          TEXT,           -- INSERT INTO vec_items ...
+  train_sql           TEXT,           -- NULL if no training step
+  query_sql           TEXT,           -- SELECT ... WHERE embedding MATCH ...
+  k                   INTEGER,       -- denormalized from runs for easy filtering
+  query_mean_ms       REAL,          -- denormalized aggregates
+  query_median_ms     REAL,
+  query_p99_ms        REAL,
+  query_total_ms      REAL,
+  qps                 REAL,
+  recall              REAL
+);
+
+CREATE TABLE IF NOT EXISTS insert_batches (
+  batch_id        INTEGER PRIMARY KEY AUTOINCREMENT,
+  run_id          INTEGER NOT NULL REFERENCES runs(run_id),
+  batch_lo        INTEGER NOT NULL,   -- start index (inclusive)
+  batch_hi        INTEGER NOT NULL,   -- end index (exclusive)
+  rows_in_batch   INTEGER NOT NULL,
+  started_ns      INTEGER NOT NULL,
+  ended_ns        INTEGER NOT NULL,
+  duration_ns     INTEGER NOT NULL,
+  cumulative_rows INTEGER NOT NULL,   -- total rows inserted so far
+  rate_rows_per_s REAL    NOT NULL    -- cumulative rate
+);
+
+CREATE TABLE IF NOT EXISTS queries (
+  query_id          INTEGER PRIMARY KEY AUTOINCREMENT,
+  run_id            INTEGER NOT NULL REFERENCES runs(run_id),
+  k                 INTEGER NOT NULL,
+  query_vector_id   INTEGER NOT NULL,
+  started_ns        INTEGER NOT NULL,
+  ended_ns          INTEGER NOT NULL,
+  duration_ms       REAL    NOT NULL,
+  result_ids        TEXT    NOT NULL,  -- JSON array
+  result_distances  TEXT    NOT NULL,  -- JSON array
+  ground_truth_ids  TEXT    NOT NULL,  -- JSON array
+  recall            REAL    NOT NULL,
+  UNIQUE(run_id, k, query_vector_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_runs_config  ON runs(config_name);
+CREATE INDEX IF NOT EXISTS idx_runs_type    ON runs(index_type);
+CREATE INDEX IF NOT EXISTS idx_runs_status  ON runs(status);
+CREATE INDEX IF NOT EXISTS idx_batches_run  ON insert_batches(run_id);
+CREATE INDEX IF NOT EXISTS idx_queries_run  ON queries(run_id);