Add comprehensive ANN benchmarking suite

Extend benchmarks-ann/ with results database (SQLite with per-query detail
and continuous writes), dataset subfolder organization, --subset-size and
--warmup options. Supports systematic comparison across flat, rescore, IVF,
and DiskANN index types.
This commit is contained in:
Alex Garcia 2026-03-29 19:47:12 -07:00
parent a248ecd061
commit dbbb4b98f7
26 changed files with 2127 additions and 292 deletions

View file

@ -0,0 +1,27 @@
# Download the Cohere 10M benchmark parquet shards and build base.db.
BASE_URL := https://assets.zilliz.com/benchmark/cohere_large_10m

# train-00-of-10.parquet .. train-09-of-10.parquet, without forking a shell
# ( := so the list is computed once, not on every expansion).
TRAIN_PARQUETS := $(foreach i,00 01 02 03 04 05 06 07 08 09,train-$(i)-of-10.parquet)
OTHER_PARQUETS := test.parquet neighbors.parquet
PARQUETS := $(TRAIN_PARQUETS) $(OTHER_PARQUETS)

# Delete half-written targets when a recipe fails (e.g. interrupted curl),
# so they don't look up to date on the next run.
.DELETE_ON_ERROR:
.PHONY: all download clean

all: base.db

# Use: make -j12 download
download: $(PARQUETS)

# Every parquet lives at $(BASE_URL)/<name>. -f makes curl fail on HTTP
# errors instead of saving an error page; download to a temp file and
# rename so an interrupted transfer never produces a "fresh" target.
train-%-of-10.parquet:
	curl -fL -o $@.tmp $(BASE_URL)/$@
	mv -f $@.tmp $@

test.parquet neighbors.parquet:
	curl -fL -o $@.tmp $(BASE_URL)/$@
	mv -f $@.tmp $@

base.db: $(PARQUETS) build_base_db.py
	uv run --with pandas --with pyarrow python build_base_db.py

clean:
	rm -f base.db

View file

@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""Build base.db from downloaded parquet files (10M dataset, 10 train shards).
Reads train-00-of-10.parquet .. train-09-of-10.parquet, test.parquet,
neighbors.parquet and creates a SQLite database with tables:
train, query_vectors, neighbors.
Usage:
uv run --with pandas --with pyarrow python build_base_db.py
"""
import json
import os
import sqlite3
import struct
import sys
import time
import pandas as pd
# Number of parquet shards that make up the 10M train set.
TRAIN_SHARDS = 10


def float_list_to_blob(floats):
    """Serialize a sequence of floats into a little-endian float32 blob."""
    return b"".join(struct.pack("<f", value) for value in floats)
def main():
    """Assemble base.db from the downloaded parquet files.

    Creates three tables:
      train         -- 10M (id, f32-blob vector) rows from the train shards
      query_vectors -- (id, f32-blob vector) rows from test.parquet
      neighbors     -- ground truth (query_vector_id, rank, neighbors_id)

    Exits with status 1 if any input parquet is missing.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    db_path = os.path.join(script_dir, "base.db")
    train_paths = [
        os.path.join(script_dir, f"train-{i:02d}-of-{TRAIN_SHARDS}.parquet")
        for i in range(TRAIN_SHARDS)
    ]
    test_path = os.path.join(script_dir, "test.parquet")
    neighbors_path = os.path.join(script_dir, "neighbors.parquet")
    for p in train_paths + [test_path, neighbors_path]:
        if not os.path.exists(p):
            print(f"ERROR: {p} not found. Run 'make download' first.")
            sys.exit(1)
    # Always rebuild from scratch.
    if os.path.exists(db_path):
        os.remove(db_path)
    conn = sqlite3.connect(db_path)
    # BUGFIX: page_size must be set before the database enters WAL mode
    # (and before the first page is written); in the original order the
    # pragma was silently ignored.
    conn.execute("PRAGMA page_size=4096")
    conn.execute("PRAGMA journal_mode=WAL")
    # --- query_vectors (from test.parquet) ---
    print("Loading test.parquet (query vectors)...")
    t0 = time.perf_counter()
    df_test = pd.read_parquet(test_path)
    conn.execute(
        "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)"
    )
    # Column-wise zip is much faster than DataFrame.iterrows().
    rows = [
        (int(qid), float_list_to_blob(emb))
        for qid, emb in zip(df_test["id"], df_test["emb"])
    ]
    conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows)
    conn.commit()
    print(f" {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s")
    # --- neighbors (from neighbors.parquet) ---
    print("Loading neighbors.parquet...")
    t0 = time.perf_counter()
    df_neighbors = pd.read_parquet(neighbors_path)
    conn.execute(
        "CREATE TABLE neighbors ("
        " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
        " UNIQUE(query_vector_id, rank))"
    )
    rows = []
    for qid, nids in zip(df_neighbors["id"], df_neighbors["neighbors_id"]):
        qid = int(qid)
        # neighbors_id may be a numpy array or a JSON-encoded string.
        if isinstance(nids, str):
            nids = json.loads(nids)
        for rank, nid in enumerate(nids):
            rows.append((qid, rank, str(int(nid))))
    conn.executemany(
        "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)",
        rows,
    )
    conn.commit()
    print(f" {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s")
    # --- train (from 10 shard parquets) ---
    print(f"Loading {TRAIN_SHARDS} train shards (10M vectors, this will take a while)...")
    conn.execute(
        "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)"
    )
    global_t0 = time.perf_counter()
    total_inserted = 0
    batch_size = 10000
    for shard_idx, train_path in enumerate(train_paths):
        print(f" Shard {shard_idx + 1}/{TRAIN_SHARDS}: {os.path.basename(train_path)}")
        t0 = time.perf_counter()
        df = pd.read_parquet(train_path)
        shard_len = len(df)
        for start in range(0, shard_len, batch_size):
            chunk = df.iloc[start : start + batch_size]
            rows = [
                (int(rid), float_list_to_blob(emb))
                for rid, emb in zip(chunk["id"], chunk["emb"])
            ]
            conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows)
            conn.commit()
            total_inserted += len(rows)
            # Emit a progress line roughly every 100k rows.
            if total_inserted % 100000 < batch_size:
                elapsed = time.perf_counter() - global_t0
                rate = total_inserted / elapsed if elapsed > 0 else 0
                print(
                    f" {total_inserted:>10} {elapsed:.0f}s {rate:.0f} rows/s",
                    flush=True,
                )
        shard_elapsed = time.perf_counter() - t0
        print(f" shard done: {shard_len} rows in {shard_elapsed:.1f}s")
    elapsed = time.perf_counter() - global_t0
    print(f" {total_inserted} train vectors in {elapsed:.1f}s")
    conn.close()
    size_mb = os.path.getsize(db_path) / (1024 * 1024)
    print(f"\nDone: {db_path} ({size_mb:.0f} MB)")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,2 @@
*.parquet
base.db

View file

@ -0,0 +1,24 @@
# Download the Cohere 1M benchmark parquets and build base.db.
BASE_URL := https://assets.zilliz.com/benchmark/cohere_medium_1m
PARQUETS := train.parquet test.parquet neighbors.parquet

# Remove half-written targets when a recipe fails (interrupted curl etc.).
.DELETE_ON_ERROR:
# BUGFIX: base.db is a real file and must NOT be .PHONY — listing it there
# forced the (expensive) build to rerun on every invocation.
.PHONY: all download clean

all: base.db

download: $(PARQUETS)

# All three parquets live at $(BASE_URL)/<name>. -f fails on HTTP errors
# instead of saving an error page; temp-file + rename keeps downloads atomic.
$(PARQUETS):
	curl -fL -o $@.tmp $(BASE_URL)/$@
	mv -f $@.tmp $@

base.db: $(PARQUETS) build_base_db.py
	uv run --with pandas --with pyarrow python build_base_db.py

clean:
	rm -f base.db

View file

@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""Build base.db from downloaded parquet files.
Reads train.parquet, test.parquet, neighbors.parquet and creates a SQLite
database with tables: train, query_vectors, neighbors.
Usage:
uv run --with pandas --with pyarrow python build_base_db.py
"""
import json
import os
import sqlite3
import struct
import sys
import time
import pandas as pd
def float_list_to_blob(floats):
    """Encode `floats` as a packed little-endian float32 byte blob."""
    layout = struct.Struct(f"<{len(floats)}f")
    return layout.pack(*floats)
def main():
    """Assemble base.db (train / query_vectors / neighbors tables) from
    train.parquet, test.parquet and neighbors.parquet.

    Exits with status 1 if any input parquet is missing.
    """
    seed_dir = os.path.dirname(os.path.abspath(__file__))
    db_path = os.path.join(seed_dir, "base.db")
    train_path = os.path.join(seed_dir, "train.parquet")
    test_path = os.path.join(seed_dir, "test.parquet")
    neighbors_path = os.path.join(seed_dir, "neighbors.parquet")
    for p in (train_path, test_path, neighbors_path):
        if not os.path.exists(p):
            print(f"ERROR: {p} not found. Run 'make download' first.")
            sys.exit(1)
    # Always rebuild from scratch.
    if os.path.exists(db_path):
        os.remove(db_path)
    conn = sqlite3.connect(db_path)
    # BUGFIX: page_size must be set before switching to WAL (and before the
    # first page is written); in the original order it was silently ignored.
    conn.execute("PRAGMA page_size=4096")
    conn.execute("PRAGMA journal_mode=WAL")
    # --- query_vectors (from test.parquet) ---
    print("Loading test.parquet (query vectors)...")
    t0 = time.perf_counter()
    df_test = pd.read_parquet(test_path)
    conn.execute(
        "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)"
    )
    # Column-wise zip is much faster than DataFrame.iterrows().
    rows = [
        (int(qid), float_list_to_blob(emb))
        for qid, emb in zip(df_test["id"], df_test["emb"])
    ]
    conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows)
    conn.commit()
    print(f" {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s")
    # --- neighbors (from neighbors.parquet) ---
    print("Loading neighbors.parquet...")
    t0 = time.perf_counter()
    df_neighbors = pd.read_parquet(neighbors_path)
    conn.execute(
        "CREATE TABLE neighbors ("
        " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
        " UNIQUE(query_vector_id, rank))"
    )
    rows = []
    for qid, nids in zip(df_neighbors["id"], df_neighbors["neighbors_id"]):
        qid = int(qid)
        # neighbors_id may be a numpy array or JSON string
        if isinstance(nids, str):
            nids = json.loads(nids)
        for rank, nid in enumerate(nids):
            rows.append((qid, rank, str(int(nid))))
    conn.executemany(
        "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)",
        rows,
    )
    conn.commit()
    print(f" {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s")
    # --- train (from train.parquet) ---
    print("Loading train.parquet (1M vectors, this takes a few minutes)...")
    t0 = time.perf_counter()
    conn.execute(
        "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)"
    )
    batch_size = 10000
    # (renamed from df_iter: read_parquet returns a whole DataFrame, not an iterator)
    df_train = pd.read_parquet(train_path)
    total = len(df_train)
    for start in range(0, total, batch_size):
        chunk = df_train.iloc[start : start + batch_size]
        rows = [
            (int(rid), float_list_to_blob(emb))
            for rid, emb in zip(chunk["id"], chunk["emb"])
        ]
        conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows)
        conn.commit()
        done = min(start + batch_size, total)
        elapsed = time.perf_counter() - t0
        rate = done / elapsed if elapsed > 0 else 0
        eta = (total - done) / rate if rate > 0 else 0
        print(
            f" {done:>8}/{total} {elapsed:.0f}s {rate:.0f} rows/s eta {eta:.0f}s",
            flush=True,
        )
    elapsed = time.perf_counter() - t0
    print(f" {total} train vectors in {elapsed:.1f}s")
    conn.close()
    size_mb = os.path.getsize(db_path) / (1024 * 1024)
    print(f"\nDone: {db_path} ({size_mb:.0f} MB)")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,30 @@
MODEL ?= mixedbread-ai/mxbai-embed-large-v1
K ?= 100
BATCH_SIZE ?= 256
DATA_DIR ?= ../nyt/data

# Remove half-written outputs when a recipe fails.
.DELETE_ON_ERROR:
.PHONY: all clean

all: base.db

# Reuse data from ../nyt
$(DATA_DIR):
	$(MAKE) -C ../nyt data

# Order-only prerequisite: the data directory only needs to exist — its
# mtime changes whenever files are added, which would otherwise force
# spurious rebuilds of contents.db.
contents.db: | $(DATA_DIR)
	uv run ../nyt-768/build-contents.py --data-dir $(DATA_DIR) -o $@

base.db: contents.db queries.txt
	uv run build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

queries.txt:
	cp ../nyt/queries.txt $@

clean:
	rm -f base.db contents.db

View file

@ -0,0 +1,163 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "sentence-transformers",
# "torch<=2.7",
# "tqdm",
# ]
# ///
import argparse
import sqlite3
from array import array
from itertools import batched
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
def main():
    """Build base.db: embed headlines into `train`, embed the benchmark
    queries into `query_vectors`, then (unless --skip-neighbors) compute
    exact brute-force KNN ground truth into `neighbors` via sqlite-vec."""
    parser = argparse.ArgumentParser(
        description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors",
    )
    parser.add_argument(
        "--contents-db", "-c", default=None,
        help="Path to contents.db (source of headlines and IDs)",
    )
    parser.add_argument(
        "--model", "-m", default="mixedbread-ai/mxbai-embed-large-v1",
        help="HuggingFace model ID (default: mixedbread-ai/mxbai-embed-large-v1)",
    )
    parser.add_argument(
        "--queries-file", "-q", default="queries.txt",
        help="Path to the queries file (default: queries.txt)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output base.db",
    )
    parser.add_argument(
        "--batch-size", "-b", type=int, default=256,
        help="Batch size for embedding (default: 256)",
    )
    parser.add_argument(
        "--k", "-k", type=int, default=100,
        help="Number of nearest neighbors (default: 100)",
    )
    parser.add_argument(
        "--limit", "-l", type=int, default=0,
        help="Limit number of headlines to embed (0 = all)",
    )
    parser.add_argument(
        "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0",
        help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)",
    )
    parser.add_argument(
        "--skip-neighbors", action="store_true",
        help="Skip the brute-force KNN neighbor computation",
    )
    args = parser.parse_args()
    import os
    vec_path = os.path.expanduser(args.vec_path)
    print(f"Loading model {args.model}...")
    model = SentenceTransformer(args.model)
    # Read headlines from contents.db
    # (args.limit is argparse type=int, so the f-string cannot inject SQL)
    src = sqlite3.connect(args.contents_db)
    limit_clause = f" LIMIT {args.limit}" if args.limit > 0 else ""
    headlines = src.execute(
        f"SELECT id, headline FROM contents ORDER BY id{limit_clause}"
    ).fetchall()
    src.close()
    print(f"Loaded {len(headlines)} headlines from {args.contents_db}")
    # Read queries: one per line, blank lines skipped
    with open(args.queries_file) as f:
        queries = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(queries)} queries from {args.queries_file}")
    # Create output database; sqlite-vec supplies vec_distance_cosine()
    db = sqlite3.connect(args.output)
    db.enable_load_extension(True)
    db.load_extension(vec_path)
    db.enable_load_extension(False)
    db.execute("CREATE TABLE IF NOT EXISTS train(id INTEGER PRIMARY KEY, vector BLOB)")
    db.execute("CREATE TABLE IF NOT EXISTS query_vectors(id INTEGER PRIMARY KEY, vector BLOB)")
    db.execute(
        "CREATE TABLE IF NOT EXISTS neighbors("
        " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
        " UNIQUE(query_vector_id, rank))"
    )
    # Step 1: Embed headlines -> train table (one commit per batch)
    print("Embedding headlines...")
    for batch in tqdm(
        batched(headlines, args.batch_size),
        total=(len(headlines) + args.batch_size - 1) // args.batch_size,
    ):
        ids = [r[0] for r in batch]
        texts = [r[1] for r in batch]
        embeddings = model.encode(texts, normalize_embeddings=True)
        # Each embedding is stored as an f32 blob (array typecode "f").
        params = [
            (int(rid), array("f", emb.tolist()).tobytes())
            for rid, emb in zip(ids, embeddings)
        ]
        db.executemany("INSERT INTO train VALUES (?, ?)", params)
        db.commit()
    del headlines
    n = db.execute("SELECT count(*) FROM train").fetchone()[0]
    print(f"Embedded {n} headlines")
    # Step 2: Embed queries -> query_vectors table (ids are 1-based line order)
    print("Embedding queries...")
    query_embeddings = model.encode(queries, normalize_embeddings=True)
    query_params = []
    for i, emb in enumerate(query_embeddings, 1):
        blob = array("f", emb.tolist()).tobytes()
        query_params.append((i, blob))
    db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params)
    db.commit()
    print(f"Embedded {len(queries)} queries")
    if args.skip_neighbors:
        db.close()
        print(f"Done (skipped neighbors). Wrote {args.output}")
        return
    # Step 3: Brute-force KNN via sqlite-vec -> neighbors table
    n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0]
    print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...")
    for query_id, query_blob in tqdm(
        db.execute("SELECT id, vector FROM query_vectors").fetchall()
    ):
        # NOTE(review): this appears to rely on SQLite's lenient resolution
        # of the result alias `distance` inside WHERE (nonstandard SQL) —
        # confirm against the SQLite version in use.
        results = db.execute(
            """
            SELECT
              train.id,
              vec_distance_cosine(train.vector, ?) AS distance
            FROM train
            WHERE distance IS NOT NULL
            ORDER BY distance ASC
            LIMIT ?
            """,
            (query_blob, args.k),
        ).fetchall()
        params = [
            (query_id, rank, str(rid))
            for rank, (rid, _dist) in enumerate(results)
        ]
        db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params)
        db.commit()
    db.close()
    print(f"Done. Wrote {args.output}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,100 @@
latest news on climate change policy
presidential election results and analysis
stock market crash causes
coronavirus vaccine development updates
artificial intelligence breakthrough in healthcare
supreme court ruling on abortion rights
tech companies layoff announcements
earthquake damages in California
cybersecurity breach at major corporation
space exploration mission to Mars
immigration reform legislation debate
renewable energy investment trends
healthcare costs rising across America
protests against police brutality
wildfires destroy homes in the West
Olympic games highlights and records
celebrity scandal rocks Hollywood
breakthrough cancer treatment discovered
housing market bubble concerns
federal reserve interest rate decision
school shooting tragedy response
diplomatic tensions between superpowers
drone strike kills terrorist leader
social media platform faces regulation
archaeological discovery reveals ancient civilization
unemployment rate hits record low
autonomous vehicles testing expansion
streaming service launches original content
opioid crisis intervention programs
trade war tariffs impact economy
infrastructure bill passes Congress
data privacy concerns grow
minimum wage increase proposal
college admissions scandal exposed
NFL player protest during anthem
cryptocurrency regulation debate
pandemic lockdown restrictions eased
mass shooting gun control debate
tax reform legislation impact
ransomware attack cripples pipeline
climate activists stage demonstration
sports team wins championship
banking system collapse fears
pharmaceutical company fraud charges
genetic engineering ethical concerns
border wall funding controversy
impeachment proceedings begin
nuclear weapons treaty violation
artificial meat alternative launch
student loan debt forgiveness
venture capital funding decline
facial recognition ban proposed
election interference investigation
pandemic preparedness failures
police reform measures announced
wildfire prevention strategies
ocean pollution crisis worsens
manufacturing jobs returning
pension fund shortfall concerns
antitrust investigation launched
voting rights protection act
mental health awareness campaign
homeless population increasing
space debris collision risk
drug cartel violence escalates
renewable energy jobs growth
infrastructure deterioration report
vaccine mandate legal challenge
cryptocurrency market volatility
autonomous drone delivery service
deep fake technology dangers
Arctic ice melting accelerates
income inequality gap widens
election fraud claims disputed
corporate merger blocked
medical breakthrough extends life
transportation strike disrupts city
racial justice protests spread
carbon emissions reduction goals
financial crisis warning signs
cyberbullying prevention efforts
asteroid near miss with Earth
gene therapy approval granted
labor union organizing drive
surveillance technology expansion
education funding cuts proposed
disaster relief efforts underway
housing affordability crisis
clean water access shortage
artificial intelligence job displacement
trade agreement negotiations
prison reform initiative launched
species extinction accelerates
political corruption scandal
terrorism threat level raised
food safety contamination outbreak
ai model release
affordability interest rates
peanut allergies in newbons
breaking bad walter white

View file

@ -0,0 +1,29 @@
MODEL ?= mixedbread-ai/mxbai-embed-xsmall-v1
K ?= 100
BATCH_SIZE ?= 512
DATA_DIR ?= ../nyt/data

# Remove half-written outputs when a recipe fails.
.DELETE_ON_ERROR:
.PHONY: all clean

all: base.db

# Reuse data from ../nyt
$(DATA_DIR):
	$(MAKE) -C ../nyt data

# Order-only: only the directory's existence matters; its mtime changes as
# files are added and would otherwise retrigger this rule.
contents.db: | $(DATA_DIR)
	uv run ../nyt-768/build-contents.py --data-dir $(DATA_DIR) -o $@

base.db: contents.db queries.txt
	uv run ../nyt-1024/build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

queries.txt:
	cp ../nyt/queries.txt $@

clean:
	rm -f base.db contents.db

View file

@ -0,0 +1,100 @@
latest news on climate change policy
presidential election results and analysis
stock market crash causes
coronavirus vaccine development updates
artificial intelligence breakthrough in healthcare
supreme court ruling on abortion rights
tech companies layoff announcements
earthquake damages in California
cybersecurity breach at major corporation
space exploration mission to Mars
immigration reform legislation debate
renewable energy investment trends
healthcare costs rising across America
protests against police brutality
wildfires destroy homes in the West
Olympic games highlights and records
celebrity scandal rocks Hollywood
breakthrough cancer treatment discovered
housing market bubble concerns
federal reserve interest rate decision
school shooting tragedy response
diplomatic tensions between superpowers
drone strike kills terrorist leader
social media platform faces regulation
archaeological discovery reveals ancient civilization
unemployment rate hits record low
autonomous vehicles testing expansion
streaming service launches original content
opioid crisis intervention programs
trade war tariffs impact economy
infrastructure bill passes Congress
data privacy concerns grow
minimum wage increase proposal
college admissions scandal exposed
NFL player protest during anthem
cryptocurrency regulation debate
pandemic lockdown restrictions eased
mass shooting gun control debate
tax reform legislation impact
ransomware attack cripples pipeline
climate activists stage demonstration
sports team wins championship
banking system collapse fears
pharmaceutical company fraud charges
genetic engineering ethical concerns
border wall funding controversy
impeachment proceedings begin
nuclear weapons treaty violation
artificial meat alternative launch
student loan debt forgiveness
venture capital funding decline
facial recognition ban proposed
election interference investigation
pandemic preparedness failures
police reform measures announced
wildfire prevention strategies
ocean pollution crisis worsens
manufacturing jobs returning
pension fund shortfall concerns
antitrust investigation launched
voting rights protection act
mental health awareness campaign
homeless population increasing
space debris collision risk
drug cartel violence escalates
renewable energy jobs growth
infrastructure deterioration report
vaccine mandate legal challenge
cryptocurrency market volatility
autonomous drone delivery service
deep fake technology dangers
Arctic ice melting accelerates
income inequality gap widens
election fraud claims disputed
corporate merger blocked
medical breakthrough extends life
transportation strike disrupts city
racial justice protests spread
carbon emissions reduction goals
financial crisis warning signs
cyberbullying prevention efforts
asteroid near miss with Earth
gene therapy approval granted
labor union organizing drive
surveillance technology expansion
education funding cuts proposed
disaster relief efforts underway
housing affordability crisis
clean water access shortage
artificial intelligence job displacement
trade agreement negotiations
prison reform initiative launched
species extinction accelerates
political corruption scandal
terrorism threat level raised
food safety contamination outbreak
ai model release
affordability interest rates
peanut allergies in newbons
breaking bad walter white

View file

@ -0,0 +1,37 @@
MODEL ?= bge-base-en-v1.5-768
K ?= 100
BATCH_SIZE ?= 512
DATA_DIR ?= ../nyt/data

# Remove half-written outputs when a recipe fails.
.DELETE_ON_ERROR:
.PHONY: all clean clean-all

all: base.db

# Reuse data from ../nyt
$(DATA_DIR):
	$(MAKE) -C ../nyt data

# Distill model (separate step, may take a while). NOTE: if MODEL is
# overridden to a remote HuggingFace ID, this local-directory rule no
# longer applies.
$(MODEL):
	uv run distill-model.py

# Order-only: directories' mtimes churn as files are added and must not
# retrigger downstream rebuilds.
contents.db: | $(DATA_DIR)
	uv run build-contents.py --data-dir $(DATA_DIR) -o $@

base.db: contents.db queries.txt | $(MODEL)
	uv run ../nyt/build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

queries.txt:
	cp ../nyt/queries.txt $@

clean:
	rm -f base.db contents.db

# Guard the destructive recipe against MODEL being overridden to empty.
clean-all: clean
	$(if $(strip $(MODEL)),,$(error MODEL is empty))
	rm -rf $(MODEL)

View file

@ -0,0 +1,64 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "duckdb",
# ]
# ///
import argparse
import sqlite3
import duckdb
def main():
    """Load NYT headline CSVs into a SQLite contents DB: deduplicate by
    headline (keeping the most recent pub_date), order newest-first, keep
    at most --limit rows, and assign sequential 1-based ids."""
    parser = argparse.ArgumentParser(
        description="Load NYT headline CSVs into a SQLite contents database (most recent 1M, deduplicated)",
    )
    parser.add_argument(
        "--data-dir", "-d", default="../nyt/data",
        help="Directory containing NYT CSV files (default: ../nyt/data)",
    )
    parser.add_argument(
        "--limit", "-l", type=int, default=1_000_000,
        help="Maximum number of headlines to keep (default: 1000000)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output SQLite database",
    )
    args = parser.parse_args()
    glob_pattern = f"{args.data_dir}/new_york_times_stories_*.csv"
    # BUGFIX: escape single quotes so a data dir containing ' cannot break
    # (or inject into) the SQL literal; force the limit to int so the
    # f-string interpolation stays safe.
    safe_glob = glob_pattern.replace("'", "''")
    limit = int(args.limit)
    con = duckdb.connect()
    rows = con.execute(
        f"""
        WITH deduped AS (
            SELECT
                headline,
                max(pub_date) AS pub_date
            FROM read_csv('{safe_glob}', auto_detect=true, union_by_name=true)
            WHERE headline IS NOT NULL AND trim(headline) != ''
            GROUP BY headline
        )
        SELECT
            row_number() OVER (ORDER BY pub_date DESC) AS id,
            headline
        FROM deduped
        ORDER BY pub_date DESC
        LIMIT {limit}
        """
    ).fetchall()
    con.close()
    db = sqlite3.connect(args.output)
    db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
    db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
    db.commit()
    db.close()
    print(f"Wrote {len(rows)} headlines to {args.output}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,13 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "model2vec[distill]",
# "torch<=2.7",
# ]
# ///
# Distill BAAI/bge-base-en-v1.5 into a static model2vec embedding model,
# keeping 768 output dimensions via PCA (downloads the base model; slow).
from model2vec.distill import distill
model = distill(model_name="BAAI/bge-base-en-v1.5", pca_dims=768)
# The saved directory doubles as the Makefile's $(MODEL) target path.
model.save_pretrained("bge-base-en-v1.5-768")
print("Saved distilled model to bge-base-en-v1.5-768/")

View file

@ -0,0 +1,100 @@
latest news on climate change policy
presidential election results and analysis
stock market crash causes
coronavirus vaccine development updates
artificial intelligence breakthrough in healthcare
supreme court ruling on abortion rights
tech companies layoff announcements
earthquake damages in California
cybersecurity breach at major corporation
space exploration mission to Mars
immigration reform legislation debate
renewable energy investment trends
healthcare costs rising across America
protests against police brutality
wildfires destroy homes in the West
Olympic games highlights and records
celebrity scandal rocks Hollywood
breakthrough cancer treatment discovered
housing market bubble concerns
federal reserve interest rate decision
school shooting tragedy response
diplomatic tensions between superpowers
drone strike kills terrorist leader
social media platform faces regulation
archaeological discovery reveals ancient civilization
unemployment rate hits record low
autonomous vehicles testing expansion
streaming service launches original content
opioid crisis intervention programs
trade war tariffs impact economy
infrastructure bill passes Congress
data privacy concerns grow
minimum wage increase proposal
college admissions scandal exposed
NFL player protest during anthem
cryptocurrency regulation debate
pandemic lockdown restrictions eased
mass shooting gun control debate
tax reform legislation impact
ransomware attack cripples pipeline
climate activists stage demonstration
sports team wins championship
banking system collapse fears
pharmaceutical company fraud charges
genetic engineering ethical concerns
border wall funding controversy
impeachment proceedings begin
nuclear weapons treaty violation
artificial meat alternative launch
student loan debt forgiveness
venture capital funding decline
facial recognition ban proposed
election interference investigation
pandemic preparedness failures
police reform measures announced
wildfire prevention strategies
ocean pollution crisis worsens
manufacturing jobs returning
pension fund shortfall concerns
antitrust investigation launched
voting rights protection act
mental health awareness campaign
homeless population increasing
space debris collision risk
drug cartel violence escalates
renewable energy jobs growth
infrastructure deterioration report
vaccine mandate legal challenge
cryptocurrency market volatility
autonomous drone delivery service
deep fake technology dangers
Arctic ice melting accelerates
income inequality gap widens
election fraud claims disputed
corporate merger blocked
medical breakthrough extends life
transportation strike disrupts city
racial justice protests spread
carbon emissions reduction goals
financial crisis warning signs
cyberbullying prevention efforts
asteroid near miss with Earth
gene therapy approval granted
labor union organizing drive
surveillance technology expansion
education funding cuts proposed
disaster relief efforts underway
housing affordability crisis
clean water access shortage
artificial intelligence job displacement
trade agreement negotiations
prison reform initiative launched
species extinction accelerates
political corruption scandal
terrorism threat level raised
food safety contamination outbreak
ai model release
affordability interest rates
peanut allergies in newbons
breaking bad walter white

View file

@ -0,0 +1 @@
data/

View file

@ -0,0 +1,30 @@
MODEL ?= minishlab/potion-base-8M
K ?= 100
BATCH_SIZE ?= 512
DATA_DIR ?= data

# Remove half-written outputs when a recipe fails.
.DELETE_ON_ERROR:
.PHONY: all clean clean-all

all: base.db contents.db

# Download NYT headlines CSVs from Kaggle (requires `kaggle` CLI + API token)
$(DATA_DIR):
	kaggle datasets download -d johnbandy/new-york-times-headlines -p $(DATA_DIR) --unzip

# Order-only: the directory's mtime changes as files are added, which would
# otherwise retrigger this rule spuriously.
contents.db: | $(DATA_DIR)
	uv run build-contents.py --data-dir $(DATA_DIR) -o $@

base.db: contents.db queries.txt
	uv run build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

clean:
	rm -f base.db contents.db

# Guard the destructive recipe against DATA_DIR being overridden to empty.
clean-all: clean
	$(if $(strip $(DATA_DIR)),,$(error DATA_DIR is empty))
	rm -rf $(DATA_DIR)

View file

@ -0,0 +1,165 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "model2vec",
# "torch<=2.7",
# "tqdm",
# ]
# ///
import argparse
import sqlite3
from array import array
from itertools import batched
from model2vec import StaticModel
from tqdm import tqdm
def main():
    """Build base.db: embed headlines into `train`, embed queries into
    `query_vectors`, then compute brute-force KNN ground truth into
    `neighbors` via the sqlite-vec extension. With --rebuild-neighbors,
    reuse an existing base.db and only recompute the neighbors table."""
    parser = argparse.ArgumentParser(
        description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors",
    )
    parser.add_argument(
        "--contents-db", "-c", default=None,
        help="Path to contents.db (source of headlines and IDs)",
    )
    parser.add_argument(
        "--model", "-m", default="minishlab/potion-base-8M",
        help="HuggingFace model ID or local path (default: minishlab/potion-base-8M)",
    )
    parser.add_argument(
        "--queries-file", "-q", default="queries.txt",
        help="Path to the queries file (default: queries.txt)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output base.db",
    )
    parser.add_argument(
        "--batch-size", "-b", type=int, default=512,
        help="Batch size for embedding (default: 512)",
    )
    parser.add_argument(
        "--k", "-k", type=int, default=100,
        help="Number of nearest neighbors (default: 100)",
    )
    parser.add_argument(
        "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0",
        help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)",
    )
    parser.add_argument(
        "--rebuild-neighbors", action="store_true",
        help="Only rebuild the neighbors table (skip embedding steps)",
    )
    args = parser.parse_args()
    import os
    vec_path = os.path.expanduser(args.vec_path)
    if args.rebuild_neighbors:
        # Skip embedding, just open existing DB and rebuild neighbors
        db = sqlite3.connect(args.output)
        db.enable_load_extension(True)
        db.load_extension(vec_path)
        db.enable_load_extension(False)
        db.execute("DROP TABLE IF EXISTS neighbors")
        db.execute(
            "CREATE TABLE neighbors("
            " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
            " UNIQUE(query_vector_id, rank))"
        )
        print(f"Rebuilding neighbors in {args.output}...")
    else:
        print(f"Loading model {args.model}...")
        model = StaticModel.from_pretrained(args.model)
        # Read headlines from contents.db
        src = sqlite3.connect(args.contents_db)
        headlines = src.execute("SELECT id, headline FROM contents ORDER BY id").fetchall()
        src.close()
        print(f"Loaded {len(headlines)} headlines from {args.contents_db}")
        # Read queries: one per line, blank lines skipped
        with open(args.queries_file) as f:
            queries = [line.strip() for line in f if line.strip()]
        print(f"Loaded {len(queries)} queries from {args.queries_file}")
        # Create output database; sqlite-vec supplies vec_distance_cosine()
        db = sqlite3.connect(args.output)
        db.enable_load_extension(True)
        db.load_extension(vec_path)
        db.enable_load_extension(False)
        db.execute("CREATE TABLE train(id INTEGER PRIMARY KEY, vector BLOB)")
        db.execute("CREATE TABLE query_vectors(id INTEGER PRIMARY KEY, vector BLOB)")
        db.execute(
            "CREATE TABLE neighbors("
            " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
            " UNIQUE(query_vector_id, rank))"
        )
        # Step 1: Embed headlines -> train table (one commit per batch)
        print("Embedding headlines...")
        for batch in tqdm(
            batched(headlines, args.batch_size),
            total=(len(headlines) + args.batch_size - 1) // args.batch_size,
        ):
            ids = [r[0] for r in batch]
            texts = [r[1] for r in batch]
            embeddings = model.encode(texts)
            # Each embedding is stored as an f32 blob (array typecode "f").
            params = [
                (int(rid), array("f", emb.tolist()).tobytes())
                for rid, emb in zip(ids, embeddings)
            ]
            db.executemany("INSERT INTO train VALUES (?, ?)", params)
            db.commit()
        del headlines
        n = db.execute("SELECT count(*) FROM train").fetchone()[0]
        print(f"Embedded {n} headlines")
        # Step 2: Embed queries -> query_vectors table (1-based line-order ids)
        print("Embedding queries...")
        query_embeddings = model.encode(queries)
        query_params = []
        for i, emb in enumerate(query_embeddings, 1):
            blob = array("f", emb.tolist()).tobytes()
            query_params.append((i, blob))
        db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params)
        db.commit()
        print(f"Embedded {len(queries)} queries")
    # Step 3: Brute-force KNN via sqlite-vec -> neighbors table
    # (runs in both modes; `db` is open in either branch above)
    n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0]
    print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...")
    for query_id, query_blob in tqdm(
        db.execute("SELECT id, vector FROM query_vectors").fetchall()
    ):
        # NOTE(review): this appears to rely on SQLite's lenient resolution
        # of the result alias `distance` inside WHERE (nonstandard SQL) —
        # confirm against the SQLite version in use.
        results = db.execute(
            """
            SELECT
              train.id,
              vec_distance_cosine(train.vector, ?) AS distance
            FROM train
            WHERE distance IS NOT NULL
            ORDER BY distance ASC
            LIMIT ?
            """,
            (query_blob, args.k),
        ).fetchall()
        params = [
            (query_id, rank, str(rid))
            for rank, (rid, _dist) in enumerate(results)
        ]
        db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params)
        db.commit()
    db.close()
    print(f"Done. Wrote {args.output}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,52 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "duckdb",
# ]
# ///
import argparse
import os
import sqlite3
import duckdb
def main():
    """Load NYT headline CSVs into a SQLite ``contents`` database.

    Globs ``new_york_times_stories_*.csv`` under ``--data-dir``, extracts
    the non-empty ``headline`` column via DuckDB (``union_by_name`` tolerates
    schema drift across yearly files), and writes ``(id, headline)`` rows
    into a fresh ``contents`` table at ``--output``.
    """
    parser = argparse.ArgumentParser(
        description="Load NYT headline CSVs into a SQLite contents database via DuckDB",
    )
    parser.add_argument(
        "--data-dir", "-d", default="data",
        help="Directory containing NYT CSV files (default: data)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output SQLite database",
    )
    args = parser.parse_args()

    glob_pattern = os.path.join(args.data_dir, "new_york_times_stories_*.csv")

    con = duckdb.connect()
    try:
        # Bind the glob as a prepared-statement parameter instead of
        # interpolating it into the SQL text, so a path containing a quote
        # (or any other SQL metacharacter) cannot break the statement.
        rows = con.execute(
            """
            SELECT
              row_number() OVER () AS id,
              headline
            FROM read_csv(?, auto_detect=true, union_by_name=true)
            WHERE headline IS NOT NULL AND headline != ''
            """,
            [glob_pattern],
        ).fetchall()
    finally:
        # Close the DuckDB connection even if the query fails.
        con.close()

    db = sqlite3.connect(args.output)
    try:
        db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
        db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
        db.commit()
    finally:
        db.close()
    print(f"Wrote {len(rows)} headlines to {args.output}")
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,100 @@
latest news on climate change policy
presidential election results and analysis
stock market crash causes
coronavirus vaccine development updates
artificial intelligence breakthrough in healthcare
supreme court ruling on abortion rights
tech companies layoff announcements
earthquake damages in California
cybersecurity breach at major corporation
space exploration mission to Mars
immigration reform legislation debate
renewable energy investment trends
healthcare costs rising across America
protests against police brutality
wildfires destroy homes in the West
Olympic games highlights and records
celebrity scandal rocks Hollywood
breakthrough cancer treatment discovered
housing market bubble concerns
federal reserve interest rate decision
school shooting tragedy response
diplomatic tensions between superpowers
drone strike kills terrorist leader
social media platform faces regulation
archaeological discovery reveals ancient civilization
unemployment rate hits record low
autonomous vehicles testing expansion
streaming service launches original content
opioid crisis intervention programs
trade war tariffs impact economy
infrastructure bill passes Congress
data privacy concerns grow
minimum wage increase proposal
college admissions scandal exposed
NFL player protest during anthem
cryptocurrency regulation debate
pandemic lockdown restrictions eased
mass shooting gun control debate
tax reform legislation impact
ransomware attack cripples pipeline
climate activists stage demonstration
sports team wins championship
banking system collapse fears
pharmaceutical company fraud charges
genetic engineering ethical concerns
border wall funding controversy
impeachment proceedings begin
nuclear weapons treaty violation
artificial meat alternative launch
student loan debt forgiveness
venture capital funding decline
facial recognition ban proposed
election interference investigation
pandemic preparedness failures
police reform measures announced
wildfire prevention strategies
ocean pollution crisis worsens
manufacturing jobs returning
pension fund shortfall concerns
antitrust investigation launched
voting rights protection act
mental health awareness campaign
homeless population increasing
space debris collision risk
drug cartel violence escalates
renewable energy jobs growth
infrastructure deterioration report
vaccine mandate legal challenge
cryptocurrency market volatility
autonomous drone delivery service
deep fake technology dangers
Arctic ice melting accelerates
income inequality gap widens
election fraud claims disputed
corporate merger blocked
medical breakthrough extends life
transportation strike disrupts city
racial justice protests spread
carbon emissions reduction goals
financial crisis warning signs
cyberbullying prevention efforts
asteroid near miss with Earth
gene therapy approval granted
labor union organizing drive
surveillance technology expansion
education funding cuts proposed
disaster relief efforts underway
housing affordability crisis
clean water access shortage
artificial intelligence job displacement
trade agreement negotiations
prison reform initiative launched
species extinction accelerates
political corruption scandal
terrorism threat level raised
food safety contamination outbreak
ai model release
affordability interest rates
peanut allergies in newborns
breaking bad walter white