Add comprehensive ANN benchmarking suite

Extend benchmarks-ann/ with results database (SQLite with per-query detail
and continuous writes), dataset subfolder organization, --subset-size and
--warmup options. Supports systematic comparison across flat, rescore, IVF,
and DiskANN index types.
This commit is contained in:
Alex Garcia 2026-03-29 19:47:12 -07:00
parent a248ecd061
commit dbbb4b98f7
26 changed files with 2127 additions and 292 deletions

View file

@ -0,0 +1 @@
data/

View file

@ -0,0 +1,30 @@
# Builds the NYT-headlines benchmark dataset:
#   contents.db - headlines loaded from the downloaded CSVs
#   base.db     - embedded train/query vectors plus brute-force neighbors

# User-tunable knobs; override on the command line, e.g. `make K=10`.
MODEL ?= minishlab/potion-base-8M
K ?= 100
BATCH_SIZE ?= 512
DATA_DIR ?= data

# Delete a half-written database when its recipe fails, so a broken file is
# never mistaken for an up-to-date target on the next run.
.DELETE_ON_ERROR:

all: base.db contents.db

# Download NYT headlines CSVs from Kaggle (requires `kaggle` CLI + API token)
$(DATA_DIR):
	kaggle datasets download -d johnbandy/new-york-times-headlines -p $(DATA_DIR) --unzip

# $(DATA_DIR) is an order-only prerequisite: it must exist, but a change to the
# directory's timestamp alone must not force the database to be rebuilt.
# The old file is removed first because build-contents.py issues a plain
# CREATE TABLE, which fails if the table already exists in the output.
contents.db: | $(DATA_DIR)
	rm -f $@
	uv run build-contents.py --data-dir $(DATA_DIR) -o $@

# The old file is removed first because build-base.py issues plain
# CREATE TABLE statements, which fail if the tables already exist.
base.db: contents.db queries.txt
	rm -f $@
	uv run build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

# Remove the generated databases but keep the downloaded CSVs.
clean:
	rm -f base.db contents.db

# Remove the generated databases and the downloaded data directory.
clean-all: clean
	rm -rf $(DATA_DIR)

.PHONY: all clean clean-all

View file

@ -0,0 +1,165 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "model2vec",
# "torch<=2.7",
# "tqdm",
# ]
# ///
import argparse
import sqlite3
from array import array
from itertools import batched
from model2vec import StaticModel
from tqdm import tqdm
def main():
parser = argparse.ArgumentParser(
description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors",
)
parser.add_argument(
"--contents-db", "-c", default=None,
help="Path to contents.db (source of headlines and IDs)",
)
parser.add_argument(
"--model", "-m", default="minishlab/potion-base-8M",
help="HuggingFace model ID or local path (default: minishlab/potion-base-8M)",
)
parser.add_argument(
"--queries-file", "-q", default="queries.txt",
help="Path to the queries file (default: queries.txt)",
)
parser.add_argument(
"--output", "-o", required=True,
help="Path to the output base.db",
)
parser.add_argument(
"--batch-size", "-b", type=int, default=512,
help="Batch size for embedding (default: 512)",
)
parser.add_argument(
"--k", "-k", type=int, default=100,
help="Number of nearest neighbors (default: 100)",
)
parser.add_argument(
"--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0",
help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)",
)
parser.add_argument(
"--rebuild-neighbors", action="store_true",
help="Only rebuild the neighbors table (skip embedding steps)",
)
args = parser.parse_args()
import os
vec_path = os.path.expanduser(args.vec_path)
if args.rebuild_neighbors:
# Skip embedding, just open existing DB and rebuild neighbors
db = sqlite3.connect(args.output)
db.enable_load_extension(True)
db.load_extension(vec_path)
db.enable_load_extension(False)
db.execute("DROP TABLE IF EXISTS neighbors")
db.execute(
"CREATE TABLE neighbors("
" query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
" UNIQUE(query_vector_id, rank))"
)
print(f"Rebuilding neighbors in {args.output}...")
else:
print(f"Loading model {args.model}...")
model = StaticModel.from_pretrained(args.model)
# Read headlines from contents.db
src = sqlite3.connect(args.contents_db)
headlines = src.execute("SELECT id, headline FROM contents ORDER BY id").fetchall()
src.close()
print(f"Loaded {len(headlines)} headlines from {args.contents_db}")
# Read queries
with open(args.queries_file) as f:
queries = [line.strip() for line in f if line.strip()]
print(f"Loaded {len(queries)} queries from {args.queries_file}")
# Create output database
db = sqlite3.connect(args.output)
db.enable_load_extension(True)
db.load_extension(vec_path)
db.enable_load_extension(False)
db.execute("CREATE TABLE train(id INTEGER PRIMARY KEY, vector BLOB)")
db.execute("CREATE TABLE query_vectors(id INTEGER PRIMARY KEY, vector BLOB)")
db.execute(
"CREATE TABLE neighbors("
" query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
" UNIQUE(query_vector_id, rank))"
)
# Step 1: Embed headlines -> train table
print("Embedding headlines...")
for batch in tqdm(
batched(headlines, args.batch_size),
total=(len(headlines) + args.batch_size - 1) // args.batch_size,
):
ids = [r[0] for r in batch]
texts = [r[1] for r in batch]
embeddings = model.encode(texts)
params = [
(int(rid), array("f", emb.tolist()).tobytes())
for rid, emb in zip(ids, embeddings)
]
db.executemany("INSERT INTO train VALUES (?, ?)", params)
db.commit()
del headlines
n = db.execute("SELECT count(*) FROM train").fetchone()[0]
print(f"Embedded {n} headlines")
# Step 2: Embed queries -> query_vectors table
print("Embedding queries...")
query_embeddings = model.encode(queries)
query_params = []
for i, emb in enumerate(query_embeddings, 1):
blob = array("f", emb.tolist()).tobytes()
query_params.append((i, blob))
db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params)
db.commit()
print(f"Embedded {len(queries)} queries")
# Step 3: Brute-force KNN via sqlite-vec -> neighbors table
n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0]
print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...")
for query_id, query_blob in tqdm(
db.execute("SELECT id, vector FROM query_vectors").fetchall()
):
results = db.execute(
"""
SELECT
train.id,
vec_distance_cosine(train.vector, ?) AS distance
FROM train
WHERE distance IS NOT NULL
ORDER BY distance ASC
LIMIT ?
""",
(query_blob, args.k),
).fetchall()
params = [
(query_id, rank, str(rid))
for rank, (rid, _dist) in enumerate(results)
]
db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params)
db.commit()
db.close()
print(f"Done. Wrote {args.output}")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,52 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "duckdb",
# ]
# ///
import argparse
import os
import sqlite3
import duckdb
def main():
parser = argparse.ArgumentParser(
description="Load NYT headline CSVs into a SQLite contents database via DuckDB",
)
parser.add_argument(
"--data-dir", "-d", default="data",
help="Directory containing NYT CSV files (default: data)",
)
parser.add_argument(
"--output", "-o", required=True,
help="Path to the output SQLite database",
)
args = parser.parse_args()
glob_pattern = os.path.join(args.data_dir, "new_york_times_stories_*.csv")
con = duckdb.connect()
rows = con.execute(
f"""
SELECT
row_number() OVER () AS id,
headline
FROM read_csv('{glob_pattern}', auto_detect=true, union_by_name=true)
WHERE headline IS NOT NULL AND headline != ''
"""
).fetchall()
con.close()
db = sqlite3.connect(args.output)
db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
db.commit()
db.close()
print(f"Wrote {len(rows)} headlines to {args.output}")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,100 @@
latest news on climate change policy
presidential election results and analysis
stock market crash causes
coronavirus vaccine development updates
artificial intelligence breakthrough in healthcare
supreme court ruling on abortion rights
tech companies layoff announcements
earthquake damages in California
cybersecurity breach at major corporation
space exploration mission to Mars
immigration reform legislation debate
renewable energy investment trends
healthcare costs rising across America
protests against police brutality
wildfires destroy homes in the West
Olympic games highlights and records
celebrity scandal rocks Hollywood
breakthrough cancer treatment discovered
housing market bubble concerns
federal reserve interest rate decision
school shooting tragedy response
diplomatic tensions between superpowers
drone strike kills terrorist leader
social media platform faces regulation
archaeological discovery reveals ancient civilization
unemployment rate hits record low
autonomous vehicles testing expansion
streaming service launches original content
opioid crisis intervention programs
trade war tariffs impact economy
infrastructure bill passes Congress
data privacy concerns grow
minimum wage increase proposal
college admissions scandal exposed
NFL player protest during anthem
cryptocurrency regulation debate
pandemic lockdown restrictions eased
mass shooting gun control debate
tax reform legislation impact
ransomware attack cripples pipeline
climate activists stage demonstration
sports team wins championship
banking system collapse fears
pharmaceutical company fraud charges
genetic engineering ethical concerns
border wall funding controversy
impeachment proceedings begin
nuclear weapons treaty violation
artificial meat alternative launch
student loan debt forgiveness
venture capital funding decline
facial recognition ban proposed
election interference investigation
pandemic preparedness failures
police reform measures announced
wildfire prevention strategies
ocean pollution crisis worsens
manufacturing jobs returning
pension fund shortfall concerns
antitrust investigation launched
voting rights protection act
mental health awareness campaign
homeless population increasing
space debris collision risk
drug cartel violence escalates
renewable energy jobs growth
infrastructure deterioration report
vaccine mandate legal challenge
cryptocurrency market volatility
autonomous drone delivery service
deep fake technology dangers
Arctic ice melting accelerates
income inequality gap widens
election fraud claims disputed
corporate merger blocked
medical breakthrough extends life
transportation strike disrupts city
racial justice protests spread
carbon emissions reduction goals
financial crisis warning signs
cyberbullying prevention efforts
asteroid near miss with Earth
gene therapy approval granted
labor union organizing drive
surveillance technology expansion
education funding cuts proposed
disaster relief efforts underway
housing affordability crisis
clean water access shortage
artificial intelligence job displacement
trade agreement negotiations
prison reform initiative launched
species extinction accelerates
political corruption scandal
terrorism threat level raised
food safety contamination outbreak
ai model release
affordability interest rates
peanut allergies in newbons
breaking bad walter white