mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 16:56:27 +02:00
Add comprehensive ANN benchmarking suite
Extend benchmarks-ann/ with results database (SQLite with per-query detail and continuous writes), dataset subfolder organization, --subset-size and --warmup options. Supports systematic comparison across flat, rescore, IVF, and DiskANN index types.
This commit is contained in:
parent
a248ecd061
commit
dbbb4b98f7
26 changed files with 2127 additions and 292 deletions
|
|
@@ -1,121 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Build base.db from downloaded parquet files.
|
||||
|
||||
Reads train.parquet, test.parquet, neighbors.parquet and creates a SQLite
|
||||
database with tables: train, query_vectors, neighbors.
|
||||
|
||||
Usage:
|
||||
uv run --with pandas --with pyarrow python build_base_db.py
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import struct
|
||||
import sys
|
||||
import time
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def float_list_to_blob(floats):
    """Pack floats into a little-endian f32 blob.

    Args:
        floats: Any iterable of numbers (list, tuple, array, generator, ...).
            Each element is stored as a 4-byte IEEE-754 single-precision
            value, so inputs with more precision are rounded by ``struct``.

    Returns:
        A ``bytes`` object of length ``4 * n`` where ``n`` is the number of
        elements; an empty input produces ``b""``.
    """
    # Materialise once so unsized iterables (generators, map objects) work
    # as well as sequences: the format string needs the element count.
    values = tuple(floats)
    return struct.pack(f"<{len(values)}f", *values)
|
||||
|
||||
|
||||
def _remove_db_files(db_path):
    """Delete a previous database file together with its WAL/SHM sidecars.

    A run that crashed while the database was in WAL mode can leave
    ``<db>-wal`` and ``<db>-shm`` files behind; they must not be paired with
    the freshly created database, so they are removed as well.
    """
    for suffix in ("", "-wal", "-shm"):
        try:
            os.remove(db_path + suffix)
        except FileNotFoundError:
            pass


def _load_query_vectors(conn, test_path):
    """Create ``query_vectors`` from the ``id`` and ``emb`` columns of test.parquet."""
    print("Loading test.parquet (query vectors)...")
    t0 = time.perf_counter()
    df_test = pd.read_parquet(test_path)
    conn.execute(
        "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)"
    )
    # Zip the columns instead of using iterrows(), which builds a Series per row.
    rows = [
        (int(vec_id), float_list_to_blob(emb))
        for vec_id, emb in zip(df_test["id"], df_test["emb"])
    ]
    conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows)
    conn.commit()
    print(f"  {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s")


def _load_neighbors(conn, neighbors_path):
    """Create ``neighbors`` with one row per (query id, rank) from neighbors.parquet."""
    print("Loading neighbors.parquet...")
    t0 = time.perf_counter()
    df_neighbors = pd.read_parquet(neighbors_path)
    conn.execute(
        "CREATE TABLE neighbors ("
        "  query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
        "  UNIQUE(query_vector_id, rank))"
    )
    rows = []
    for qid, nids in zip(df_neighbors["id"], df_neighbors["neighbors_id"]):
        # neighbors_id may be an array-like of ids or a JSON-encoded string.
        if isinstance(nids, str):
            nids = json.loads(nids)
        qid = int(qid)
        # rank is the 0-based position of the neighbor within its list.
        rows.extend((qid, rank, str(int(nid))) for rank, nid in enumerate(nids))
    conn.executemany(
        "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)",
        rows,
    )
    conn.commit()
    print(f"  {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s")


def _load_train(conn, train_path, batch_size=10000):
    """Create ``train`` from train.parquet, committing and reporting per batch."""
    print("Loading train.parquet (1M vectors, this takes a few minutes)...")
    t0 = time.perf_counter()
    conn.execute(
        "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)"
    )

    # The parquet file is read into memory in full; only the inserts are batched.
    df_train = pd.read_parquet(train_path)
    total = len(df_train)

    for start in range(0, total, batch_size):
        chunk = df_train.iloc[start : start + batch_size]
        rows = [
            (int(vec_id), float_list_to_blob(emb))
            for vec_id, emb in zip(chunk["id"], chunk["emb"])
        ]
        conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows)
        conn.commit()

        done = min(start + batch_size, total)
        elapsed = time.perf_counter() - t0
        rate = done / elapsed if elapsed > 0 else 0
        eta = (total - done) / rate if rate > 0 else 0
        print(
            f"  {done:>8}/{total}  {elapsed:.0f}s  {rate:.0f} rows/s  eta {eta:.0f}s",
            flush=True,
        )

    elapsed = time.perf_counter() - t0
    print(f"  {total} train vectors in {elapsed:.1f}s")


def main():
    """Build base.db next to this script from the downloaded parquet files.

    Expects train.parquet, test.parquet and neighbors.parquet in the script's
    directory. Any existing base.db is replaced. The resulting database has
    the tables ``query_vectors``, ``neighbors`` and ``train``.

    Exits with status 1 if one of the parquet files is missing.
    """
    seed_dir = os.path.dirname(os.path.abspath(__file__))
    db_path = os.path.join(seed_dir, "base.db")

    train_path = os.path.join(seed_dir, "train.parquet")
    test_path = os.path.join(seed_dir, "test.parquet")
    neighbors_path = os.path.join(seed_dir, "neighbors.parquet")

    for p in (train_path, test_path, neighbors_path):
        if not os.path.exists(p):
            print(f"ERROR: {p} not found. Run 'make download' first.", file=sys.stderr)
            sys.exit(1)

    _remove_db_files(db_path)

    conn = sqlite3.connect(db_path)
    try:
        # page_size must be set before switching to WAL: SQLite cannot change
        # the page size once the database is in WAL mode.
        conn.execute("PRAGMA page_size=4096")
        conn.execute("PRAGMA journal_mode=WAL")

        _load_query_vectors(conn, test_path)
        _load_neighbors(conn, neighbors_path)
        _load_train(conn, train_path)
    finally:
        # Always release the connection, even when a load step fails.
        conn.close()

    size_mb = os.path.getsize(db_path) / (1024 * 1024)
    print(f"\nDone: {db_path} ({size_mb:.0f} MB)")
|
||||
|
||||
|
||||
# Build the database only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue