Add comprehensive ANN benchmarking suite

Extend benchmarks-ann/ with results database (SQLite with per-query detail and continuous writes), dataset subfolder organization, --subset-size and --warmup options. Supports systematic comparison across flat, rescore, IVF, and DiskANN index types.
2026-04-25 16:56:27 +02:00 · 2026-03-29 19:47:12 -07:00 · 2026-03-29 19:47:12 -07:00 · dbbb4b98f7
commit dbbb4b98f7
parent a248ecd061
26 changed files with 2127 additions and 292 deletions
--- a/benchmarks-ann/seed/build_base_db.py
+++ b/benchmarks-ann/seed/build_base_db.py
@ -1,121 +0,0 @@
-#!/usr/bin/env python3
-"""Build base.db from downloaded parquet files.
-
-Reads train.parquet, test.parquet, neighbors.parquet and creates a SQLite
-database with tables: train, query_vectors, neighbors.
-
-Usage:
-  uv run --with pandas --with pyarrow python build_base_db.py
-"""
-import json
-import os
-import sqlite3
-import struct
-import sys
-import time
-
-import pandas as pd
-
-
-def float_list_to_blob(floats):
-    """Pack a list of floats into a little-endian f32 blob."""
-    return struct.pack(f"<{len(floats)}f", *floats)
-
-
-def main():
-    seed_dir = os.path.dirname(os.path.abspath(__file__))
-    db_path = os.path.join(seed_dir, "base.db")
-
-    train_path = os.path.join(seed_dir, "train.parquet")
-    test_path = os.path.join(seed_dir, "test.parquet")
-    neighbors_path = os.path.join(seed_dir, "neighbors.parquet")
-
-    for p in (train_path, test_path, neighbors_path):
-        if not os.path.exists(p):
-            print(f"ERROR: {p} not found. Run 'make download' first.")
-            sys.exit(1)
-
-    if os.path.exists(db_path):
-        os.remove(db_path)
-
-    conn = sqlite3.connect(db_path)
-    conn.execute("PRAGMA journal_mode=WAL")
-    conn.execute("PRAGMA page_size=4096")
-
-    # --- query_vectors (from test.parquet) ---
-    print("Loading test.parquet (query vectors)...")
-    t0 = time.perf_counter()
-    df_test = pd.read_parquet(test_path)
-    conn.execute(
-        "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)"
-    )
-    rows = []
-    for _, row in df_test.iterrows():
-        rows.append((int(row["id"]), float_list_to_blob(row["emb"])))
-    conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows)
-    conn.commit()
-    print(f"  {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s")
-
-    # --- neighbors (from neighbors.parquet) ---
-    print("Loading neighbors.parquet...")
-    t0 = time.perf_counter()
-    df_neighbors = pd.read_parquet(neighbors_path)
-    conn.execute(
-        "CREATE TABLE neighbors ("
-        "  query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
-        "  UNIQUE(query_vector_id, rank))"
-    )
-    rows = []
-    for _, row in df_neighbors.iterrows():
-        qid = int(row["id"])
-        # neighbors_id may be a numpy array or JSON string
-        nids = row["neighbors_id"]
-        if isinstance(nids, str):
-            nids = json.loads(nids)
-        for rank, nid in enumerate(nids):
-            rows.append((qid, rank, str(int(nid))))
-    conn.executemany(
-        "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)",
-        rows,
-    )
-    conn.commit()
-    print(f"  {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s")
-
-    # --- train (from train.parquet) ---
-    print("Loading train.parquet (1M vectors, this takes a few minutes)...")
-    t0 = time.perf_counter()
-    conn.execute(
-        "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)"
-    )
-
-    batch_size = 10000
-    df_iter = pd.read_parquet(train_path)
-    total = len(df_iter)
-
-    for start in range(0, total, batch_size):
-        chunk = df_iter.iloc[start : start + batch_size]
-        rows = []
-        for _, row in chunk.iterrows():
-            rows.append((int(row["id"]), float_list_to_blob(row["emb"])))
-        conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows)
-        conn.commit()
-
-        done = min(start + batch_size, total)
-        elapsed = time.perf_counter() - t0
-        rate = done / elapsed if elapsed > 0 else 0
-        eta = (total - done) / rate if rate > 0 else 0
-        print(
-            f"    {done:>8}/{total}  {elapsed:.0f}s  {rate:.0f} rows/s  eta {eta:.0f}s",
-            flush=True,
-        )
-
-    elapsed = time.perf_counter() - t0
-    print(f"  {total} train vectors in {elapsed:.1f}s")
-
-    conn.close()
-    size_mb = os.path.getsize(db_path) / (1024 * 1024)
-    print(f"\nDone: {db_path} ({size_mb:.0f} MB)")
-
-
-if __name__ == "__main__":
-    main()