Add comprehensive ANN benchmarking suite

Extend benchmarks-ann/ with results database (SQLite with per-query detail and continuous writes), dataset subfolder organization, --subset-size and --warmup options. Supports systematic comparison across flat, rescore, IVF, and DiskANN index types.
2026-07-26 17:11:08 +02:00 · 2026-03-29 19:47:12 -07:00 · 2026-03-29 19:47:12 -07:00 · dbbb4b98f7
commit dbbb4b98f7
parent a248ecd061
26 changed files with 2127 additions and 292 deletions
--- a/benchmarks-ann/datasets/nyt/.gitignore
+++ b/benchmarks-ann/datasets/nyt/.gitignore
@ -0,0 +1 @@
+data/
--- a/benchmarks-ann/datasets/nyt/Makefile
+++ b/benchmarks-ann/datasets/nyt/Makefile
@ -0,0 +1,30 @@
+MODEL ?= minishlab/potion-base-8M
+K ?= 100
+BATCH_SIZE ?= 512
+DATA_DIR ?= data
+
+# If a recipe fails, delete its (possibly half-written) target so the next
+# run does not mistake a partial database for an up-to-date one.
+.DELETE_ON_ERROR:
+
+all: base.db contents.db
+
+# Download NYT headlines CSVs from Kaggle (requires `kaggle` CLI + API token)
+$(DATA_DIR):
+	kaggle datasets download -d johnbandy/new-york-times-headlines -p $(DATA_DIR) --unzip
+
+# $(DATA_DIR) is an order-only prerequisite: it must exist, but a change to the
+# directory's mtime (any file added or removed inside it) must not force a rebuild.
+# The stale target is removed first because build-contents.py issues a plain
+# CREATE TABLE, which fails when the database already contains the table.
+contents.db: build-contents.py | $(DATA_DIR)
+	rm -f $@
+	uv run build-contents.py --data-dir $(DATA_DIR) -o $@
+
+# Same reasoning as above: build-base.py creates its tables with a plain
+# CREATE TABLE, so a rebuild must start from a fresh file.
+base.db: build-base.py contents.db queries.txt
+	rm -f $@
+	uv run build-base.py \
+		--contents-db contents.db \
+		--model $(MODEL) \
+		--queries-file queries.txt \
+		--batch-size $(BATCH_SIZE) \
+		--k $(K) \
+		-o $@
+
+clean:
+	rm -f base.db contents.db
+
+clean-all: clean
+	rm -rf $(DATA_DIR)
+
+.PHONY: all clean clean-all
--- a/benchmarks-ann/datasets/nyt/build-base.py
+++ b/benchmarks-ann/datasets/nyt/build-base.py
@ -0,0 +1,165 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "model2vec",
+#     "torch<=2.7",
+#     "tqdm",
+# ]
+# ///
+
+import argparse
+import sqlite3
+from array import array
+from itertools import batched
+
+from model2vec import StaticModel
+from tqdm import tqdm
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors",
+    )
+    parser.add_argument(
+        "--contents-db", "-c", default=None,
+        help="Path to contents.db (source of headlines and IDs)",
+    )
+    parser.add_argument(
+        "--model", "-m", default="minishlab/potion-base-8M",
+        help="HuggingFace model ID or local path (default: minishlab/potion-base-8M)",
+    )
+    parser.add_argument(
+        "--queries-file", "-q", default="queries.txt",
+        help="Path to the queries file (default: queries.txt)",
+    )
+    parser.add_argument(
+        "--output", "-o", required=True,
+        help="Path to the output base.db",
+    )
+    parser.add_argument(
+        "--batch-size", "-b", type=int, default=512,
+        help="Batch size for embedding (default: 512)",
+    )
+    parser.add_argument(
+        "--k", "-k", type=int, default=100,
+        help="Number of nearest neighbors (default: 100)",
+    )
+    parser.add_argument(
+        "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0",
+        help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)",
+    )
+    parser.add_argument(
+        "--rebuild-neighbors", action="store_true",
+        help="Only rebuild the neighbors table (skip embedding steps)",
+    )
+    args = parser.parse_args()
+
+    import os
+    vec_path = os.path.expanduser(args.vec_path)
+
+    if args.rebuild_neighbors:
+        # Skip embedding, just open existing DB and rebuild neighbors
+        db = sqlite3.connect(args.output)
+        db.enable_load_extension(True)
+        db.load_extension(vec_path)
+        db.enable_load_extension(False)
+        db.execute("DROP TABLE IF EXISTS neighbors")
+        db.execute(
+            "CREATE TABLE neighbors("
+            "  query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
+            "  UNIQUE(query_vector_id, rank))"
+        )
+        print(f"Rebuilding neighbors in {args.output}...")
+    else:
+        print(f"Loading model {args.model}...")
+        model = StaticModel.from_pretrained(args.model)
+
+        # Read headlines from contents.db
+        src = sqlite3.connect(args.contents_db)
+        headlines = src.execute("SELECT id, headline FROM contents ORDER BY id").fetchall()
+        src.close()
+        print(f"Loaded {len(headlines)} headlines from {args.contents_db}")
+
+        # Read queries
+        with open(args.queries_file) as f:
+            queries = [line.strip() for line in f if line.strip()]
+        print(f"Loaded {len(queries)} queries from {args.queries_file}")
+
+        # Create output database
+        db = sqlite3.connect(args.output)
+        db.enable_load_extension(True)
+        db.load_extension(vec_path)
+        db.enable_load_extension(False)
+
+        db.execute("CREATE TABLE train(id INTEGER PRIMARY KEY, vector BLOB)")
+        db.execute("CREATE TABLE query_vectors(id INTEGER PRIMARY KEY, vector BLOB)")
+        db.execute(
+            "CREATE TABLE neighbors("
+            "  query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
+            "  UNIQUE(query_vector_id, rank))"
+        )
+
+        # Step 1: Embed headlines -> train table
+        print("Embedding headlines...")
+        for batch in tqdm(
+            batched(headlines, args.batch_size),
+            total=(len(headlines) + args.batch_size - 1) // args.batch_size,
+        ):
+            ids = [r[0] for r in batch]
+            texts = [r[1] for r in batch]
+            embeddings = model.encode(texts)
+
+            params = [
+                (int(rid), array("f", emb.tolist()).tobytes())
+                for rid, emb in zip(ids, embeddings)
+            ]
+            db.executemany("INSERT INTO train VALUES (?, ?)", params)
+            db.commit()
+
+        del headlines
+        n = db.execute("SELECT count(*) FROM train").fetchone()[0]
+        print(f"Embedded {n} headlines")
+
+        # Step 2: Embed queries -> query_vectors table
+        print("Embedding queries...")
+        query_embeddings = model.encode(queries)
+        query_params = []
+        for i, emb in enumerate(query_embeddings, 1):
+            blob = array("f", emb.tolist()).tobytes()
+            query_params.append((i, blob))
+        db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params)
+        db.commit()
+        print(f"Embedded {len(queries)} queries")
+
+    # Step 3: Brute-force KNN via sqlite-vec -> neighbors table
+    n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0]
+    print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...")
+    for query_id, query_blob in tqdm(
+        db.execute("SELECT id, vector FROM query_vectors").fetchall()
+    ):
+        results = db.execute(
+            """
+            SELECT
+                train.id,
+                vec_distance_cosine(train.vector, ?) AS distance
+            FROM train
+            WHERE distance IS NOT NULL
+            ORDER BY distance ASC
+            LIMIT ?
+            """,
+            (query_blob, args.k),
+        ).fetchall()
+
+        params = [
+            (query_id, rank, str(rid))
+            for rank, (rid, _dist) in enumerate(results)
+        ]
+        db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params)
+
+    db.commit()
+    db.close()
+    print(f"Done. Wrote {args.output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks-ann/datasets/nyt/build-contents.py
+++ b/benchmarks-ann/datasets/nyt/build-contents.py
@ -0,0 +1,52 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "duckdb",
+# ]
+# ///
+
+import argparse
+import os
+import sqlite3
+import duckdb
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Load NYT headline CSVs into a SQLite contents database via DuckDB",
+    )
+    parser.add_argument(
+        "--data-dir", "-d", default="data",
+        help="Directory containing NYT CSV files (default: data)",
+    )
+    parser.add_argument(
+        "--output", "-o", required=True,
+        help="Path to the output SQLite database",
+    )
+    args = parser.parse_args()
+
+    glob_pattern = os.path.join(args.data_dir, "new_york_times_stories_*.csv")
+
+    con = duckdb.connect()
+    rows = con.execute(
+        f"""
+        SELECT
+            row_number() OVER () AS id,
+            headline
+        FROM read_csv('{glob_pattern}', auto_detect=true, union_by_name=true)
+        WHERE headline IS NOT NULL AND headline != ''
+        """
+    ).fetchall()
+    con.close()
+
+    db = sqlite3.connect(args.output)
+    db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
+    db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
+    db.commit()
+    db.close()
+
+    print(f"Wrote {len(rows)} headlines to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks-ann/datasets/nyt/queries.txt
+++ b/benchmarks-ann/datasets/nyt/queries.txt
@ -0,0 +1,100 @@
+latest news on climate change policy
+presidential election results and analysis
+stock market crash causes
+coronavirus vaccine development updates
+artificial intelligence breakthrough in healthcare
+supreme court ruling on abortion rights
+tech companies layoff announcements
+earthquake damages in California
+cybersecurity breach at major corporation
+space exploration mission to Mars
+immigration reform legislation debate
+renewable energy investment trends
+healthcare costs rising across America
+protests against police brutality
+wildfires destroy homes in the West
+Olympic games highlights and records
+celebrity scandal rocks Hollywood
+breakthrough cancer treatment discovered
+housing market bubble concerns
+federal reserve interest rate decision
+school shooting tragedy response
+diplomatic tensions between superpowers
+drone strike kills terrorist leader
+social media platform faces regulation
+archaeological discovery reveals ancient civilization
+unemployment rate hits record low
+autonomous vehicles testing expansion
+streaming service launches original content
+opioid crisis intervention programs
+trade war tariffs impact economy
+infrastructure bill passes Congress
+data privacy concerns grow
+minimum wage increase proposal
+college admissions scandal exposed
+NFL player protest during anthem
+cryptocurrency regulation debate
+pandemic lockdown restrictions eased
+mass shooting gun control debate
+tax reform legislation impact
+ransomware attack cripples pipeline
+climate activists stage demonstration
+sports team wins championship
+banking system collapse fears
+pharmaceutical company fraud charges
+genetic engineering ethical concerns
+border wall funding controversy
+impeachment proceedings begin
+nuclear weapons treaty violation
+artificial meat alternative launch
+student loan debt forgiveness
+venture capital funding decline
+facial recognition ban proposed
+election interference investigation
+pandemic preparedness failures
+police reform measures announced
+wildfire prevention strategies
+ocean pollution crisis worsens
+manufacturing jobs returning
+pension fund shortfall concerns
+antitrust investigation launched
+voting rights protection act
+mental health awareness campaign
+homeless population increasing
+space debris collision risk
+drug cartel violence escalates
+renewable energy jobs growth
+infrastructure deterioration report
+vaccine mandate legal challenge
+cryptocurrency market volatility
+autonomous drone delivery service
+deep fake technology dangers
+Arctic ice melting accelerates
+income inequality gap widens
+election fraud claims disputed
+corporate merger blocked
+medical breakthrough extends life
+transportation strike disrupts city
+racial justice protests spread
+carbon emissions reduction goals
+financial crisis warning signs
+cyberbullying prevention efforts
+asteroid near miss with Earth
+gene therapy approval granted
+labor union organizing drive
+surveillance technology expansion
+education funding cuts proposed
+disaster relief efforts underway
+housing affordability crisis
+clean water access shortage
+artificial intelligence job displacement
+trade agreement negotiations
+prison reform initiative launched
+species extinction accelerates
+political corruption scandal
+terrorism threat level raised
+food safety contamination outbreak
+ai model release
+affordability interest rates
+peanut allergies in newborns
+breaking bad walter white