Add comprehensive ANN benchmarking suite

Extend benchmarks-ann/ with results database (SQLite with per-query detail
and continuous writes), dataset subfolder organization, --subset-size and
--warmup options. Supports systematic comparison across flat, rescore, IVF,
and DiskANN index types.
This commit is contained in:
Alex Garcia 2026-03-29 19:47:12 -07:00
parent a248ecd061
commit dbbb4b98f7
26 changed files with 2127 additions and 292 deletions

View file

@ -0,0 +1,27 @@
# Download the Cohere 10M benchmark parquet shards and build base.db.
BASE_URL := https://assets.zilliz.com/benchmark/cohere_large_10m

# train-00-of-10.parquet .. train-09-of-10.parquet, without forking a shell
# ( := so the list is computed once, not on every expansion).
TRAIN_PARQUETS := $(foreach i,00 01 02 03 04 05 06 07 08 09,train-$(i)-of-10.parquet)
OTHER_PARQUETS := test.parquet neighbors.parquet
PARQUETS := $(TRAIN_PARQUETS) $(OTHER_PARQUETS)

# Delete half-written targets when a recipe fails (e.g. interrupted curl),
# so they don't look up to date on the next run.
.DELETE_ON_ERROR:
.PHONY: all download clean

all: base.db

# Use: make -j12 download
download: $(PARQUETS)

# Every parquet lives at $(BASE_URL)/<name>. -f makes curl fail on HTTP
# errors instead of saving an error page; download to a temp file and
# rename so an interrupted transfer never produces a "fresh" target.
train-%-of-10.parquet:
	curl -fL -o $@.tmp $(BASE_URL)/$@
	mv -f $@.tmp $@

test.parquet neighbors.parquet:
	curl -fL -o $@.tmp $(BASE_URL)/$@
	mv -f $@.tmp $@

base.db: $(PARQUETS) build_base_db.py
	uv run --with pandas --with pyarrow python build_base_db.py

clean:
	rm -f base.db

View file

@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""Build base.db from downloaded parquet files (10M dataset, 10 train shards).
Reads train-00-of-10.parquet .. train-09-of-10.parquet, test.parquet,
neighbors.parquet and creates a SQLite database with tables:
train, query_vectors, neighbors.
Usage:
uv run --with pandas --with pyarrow python build_base_db.py
"""
import json
import os
import sqlite3
import struct
import sys
import time
import pandas as pd
# Number of parquet shards that make up the 10M train set.
TRAIN_SHARDS = 10


def float_list_to_blob(floats):
    """Serialize a sequence of floats into a little-endian float32 blob."""
    return b"".join(struct.pack("<f", value) for value in floats)
def main():
    """Assemble base.db from the downloaded parquet files.

    Creates three tables:
      train         -- 10M (id, f32-blob vector) rows from the train shards
      query_vectors -- (id, f32-blob vector) rows from test.parquet
      neighbors     -- ground truth (query_vector_id, rank, neighbors_id)

    Exits with status 1 if any input parquet is missing.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    db_path = os.path.join(script_dir, "base.db")
    train_paths = [
        os.path.join(script_dir, f"train-{i:02d}-of-{TRAIN_SHARDS}.parquet")
        for i in range(TRAIN_SHARDS)
    ]
    test_path = os.path.join(script_dir, "test.parquet")
    neighbors_path = os.path.join(script_dir, "neighbors.parquet")
    for p in train_paths + [test_path, neighbors_path]:
        if not os.path.exists(p):
            print(f"ERROR: {p} not found. Run 'make download' first.")
            sys.exit(1)
    # Always rebuild from scratch.
    if os.path.exists(db_path):
        os.remove(db_path)
    conn = sqlite3.connect(db_path)
    # BUGFIX: page_size must be set before the database enters WAL mode
    # (and before the first page is written); in the original order the
    # pragma was silently ignored.
    conn.execute("PRAGMA page_size=4096")
    conn.execute("PRAGMA journal_mode=WAL")
    # --- query_vectors (from test.parquet) ---
    print("Loading test.parquet (query vectors)...")
    t0 = time.perf_counter()
    df_test = pd.read_parquet(test_path)
    conn.execute(
        "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)"
    )
    # Column-wise zip is much faster than DataFrame.iterrows().
    rows = [
        (int(qid), float_list_to_blob(emb))
        for qid, emb in zip(df_test["id"], df_test["emb"])
    ]
    conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows)
    conn.commit()
    print(f" {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s")
    # --- neighbors (from neighbors.parquet) ---
    print("Loading neighbors.parquet...")
    t0 = time.perf_counter()
    df_neighbors = pd.read_parquet(neighbors_path)
    conn.execute(
        "CREATE TABLE neighbors ("
        " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
        " UNIQUE(query_vector_id, rank))"
    )
    rows = []
    for qid, nids in zip(df_neighbors["id"], df_neighbors["neighbors_id"]):
        qid = int(qid)
        # neighbors_id may be a numpy array or a JSON-encoded string.
        if isinstance(nids, str):
            nids = json.loads(nids)
        for rank, nid in enumerate(nids):
            rows.append((qid, rank, str(int(nid))))
    conn.executemany(
        "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)",
        rows,
    )
    conn.commit()
    print(f" {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s")
    # --- train (from 10 shard parquets) ---
    print(f"Loading {TRAIN_SHARDS} train shards (10M vectors, this will take a while)...")
    conn.execute(
        "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)"
    )
    global_t0 = time.perf_counter()
    total_inserted = 0
    batch_size = 10000
    for shard_idx, train_path in enumerate(train_paths):
        print(f" Shard {shard_idx + 1}/{TRAIN_SHARDS}: {os.path.basename(train_path)}")
        t0 = time.perf_counter()
        df = pd.read_parquet(train_path)
        shard_len = len(df)
        for start in range(0, shard_len, batch_size):
            chunk = df.iloc[start : start + batch_size]
            rows = [
                (int(rid), float_list_to_blob(emb))
                for rid, emb in zip(chunk["id"], chunk["emb"])
            ]
            conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows)
            conn.commit()
            total_inserted += len(rows)
            # Emit a progress line roughly every 100k rows.
            if total_inserted % 100000 < batch_size:
                elapsed = time.perf_counter() - global_t0
                rate = total_inserted / elapsed if elapsed > 0 else 0
                print(
                    f" {total_inserted:>10} {elapsed:.0f}s {rate:.0f} rows/s",
                    flush=True,
                )
        shard_elapsed = time.perf_counter() - t0
        print(f" shard done: {shard_len} rows in {shard_elapsed:.1f}s")
    elapsed = time.perf_counter() - global_t0
    print(f" {total_inserted} train vectors in {elapsed:.1f}s")
    conn.close()
    size_mb = os.path.getsize(db_path) / (1024 * 1024)
    print(f"\nDone: {db_path} ({size_mb:.0f} MB)")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,2 @@
*.parquet
base.db

View file

@ -0,0 +1,24 @@
# Download the Cohere 1M benchmark parquets and build base.db.
BASE_URL := https://assets.zilliz.com/benchmark/cohere_medium_1m
PARQUETS := train.parquet test.parquet neighbors.parquet

# Remove half-written targets when a recipe fails (interrupted curl etc.).
.DELETE_ON_ERROR:
# BUGFIX: base.db is a real file and must NOT be .PHONY — listing it there
# forced the (expensive) build to rerun on every invocation.
.PHONY: all download clean

all: base.db

download: $(PARQUETS)

# All three parquets live at $(BASE_URL)/<name>. -f fails on HTTP errors
# instead of saving an error page; temp-file + rename keeps downloads atomic.
$(PARQUETS):
	curl -fL -o $@.tmp $(BASE_URL)/$@
	mv -f $@.tmp $@

base.db: $(PARQUETS) build_base_db.py
	uv run --with pandas --with pyarrow python build_base_db.py

clean:
	rm -f base.db

View file

@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""Build base.db from downloaded parquet files.
Reads train.parquet, test.parquet, neighbors.parquet and creates a SQLite
database with tables: train, query_vectors, neighbors.
Usage:
uv run --with pandas --with pyarrow python build_base_db.py
"""
import json
import os
import sqlite3
import struct
import sys
import time
import pandas as pd
def float_list_to_blob(floats):
    """Encode `floats` as a packed little-endian float32 byte blob."""
    layout = struct.Struct(f"<{len(floats)}f")
    return layout.pack(*floats)
def main():
    """Assemble base.db (train / query_vectors / neighbors tables) from
    train.parquet, test.parquet and neighbors.parquet.

    Exits with status 1 if any input parquet is missing.
    """
    seed_dir = os.path.dirname(os.path.abspath(__file__))
    db_path = os.path.join(seed_dir, "base.db")
    train_path = os.path.join(seed_dir, "train.parquet")
    test_path = os.path.join(seed_dir, "test.parquet")
    neighbors_path = os.path.join(seed_dir, "neighbors.parquet")
    for p in (train_path, test_path, neighbors_path):
        if not os.path.exists(p):
            print(f"ERROR: {p} not found. Run 'make download' first.")
            sys.exit(1)
    # Always rebuild from scratch.
    if os.path.exists(db_path):
        os.remove(db_path)
    conn = sqlite3.connect(db_path)
    # BUGFIX: page_size must be set before switching to WAL (and before the
    # first page is written); in the original order it was silently ignored.
    conn.execute("PRAGMA page_size=4096")
    conn.execute("PRAGMA journal_mode=WAL")
    # --- query_vectors (from test.parquet) ---
    print("Loading test.parquet (query vectors)...")
    t0 = time.perf_counter()
    df_test = pd.read_parquet(test_path)
    conn.execute(
        "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)"
    )
    # Column-wise zip is much faster than DataFrame.iterrows().
    rows = [
        (int(qid), float_list_to_blob(emb))
        for qid, emb in zip(df_test["id"], df_test["emb"])
    ]
    conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows)
    conn.commit()
    print(f" {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s")
    # --- neighbors (from neighbors.parquet) ---
    print("Loading neighbors.parquet...")
    t0 = time.perf_counter()
    df_neighbors = pd.read_parquet(neighbors_path)
    conn.execute(
        "CREATE TABLE neighbors ("
        " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
        " UNIQUE(query_vector_id, rank))"
    )
    rows = []
    for qid, nids in zip(df_neighbors["id"], df_neighbors["neighbors_id"]):
        qid = int(qid)
        # neighbors_id may be a numpy array or JSON string
        if isinstance(nids, str):
            nids = json.loads(nids)
        for rank, nid in enumerate(nids):
            rows.append((qid, rank, str(int(nid))))
    conn.executemany(
        "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)",
        rows,
    )
    conn.commit()
    print(f" {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s")
    # --- train (from train.parquet) ---
    print("Loading train.parquet (1M vectors, this takes a few minutes)...")
    t0 = time.perf_counter()
    conn.execute(
        "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)"
    )
    batch_size = 10000
    # (renamed from df_iter: read_parquet returns a whole DataFrame, not an iterator)
    df_train = pd.read_parquet(train_path)
    total = len(df_train)
    for start in range(0, total, batch_size):
        chunk = df_train.iloc[start : start + batch_size]
        rows = [
            (int(rid), float_list_to_blob(emb))
            for rid, emb in zip(chunk["id"], chunk["emb"])
        ]
        conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows)
        conn.commit()
        done = min(start + batch_size, total)
        elapsed = time.perf_counter() - t0
        rate = done / elapsed if elapsed > 0 else 0
        eta = (total - done) / rate if rate > 0 else 0
        print(
            f" {done:>8}/{total} {elapsed:.0f}s {rate:.0f} rows/s eta {eta:.0f}s",
            flush=True,
        )
    elapsed = time.perf_counter() - t0
    print(f" {total} train vectors in {elapsed:.1f}s")
    conn.close()
    size_mb = os.path.getsize(db_path) / (1024 * 1024)
    print(f"\nDone: {db_path} ({size_mb:.0f} MB)")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,30 @@
MODEL ?= mixedbread-ai/mxbai-embed-large-v1
K ?= 100
BATCH_SIZE ?= 256
DATA_DIR ?= ../nyt/data

# Remove half-written outputs when a recipe fails.
.DELETE_ON_ERROR:
.PHONY: all clean

all: base.db

# Reuse data from ../nyt
$(DATA_DIR):
	$(MAKE) -C ../nyt data

# Order-only prerequisite: the data directory only needs to exist — its
# mtime changes whenever files are added, which would otherwise force
# spurious rebuilds of contents.db.
contents.db: | $(DATA_DIR)
	uv run ../nyt-768/build-contents.py --data-dir $(DATA_DIR) -o $@

base.db: contents.db queries.txt
	uv run build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

queries.txt:
	cp ../nyt/queries.txt $@

clean:
	rm -f base.db contents.db

View file

@ -0,0 +1,163 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "sentence-transformers",
# "torch<=2.7",
# "tqdm",
# ]
# ///
import argparse
import sqlite3
from array import array
from itertools import batched
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
def main():
    """Build base.db: embed headlines into `train`, embed the benchmark
    queries into `query_vectors`, then (unless --skip-neighbors) compute
    exact brute-force KNN ground truth into `neighbors` via sqlite-vec."""
    parser = argparse.ArgumentParser(
        description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors",
    )
    parser.add_argument(
        "--contents-db", "-c", default=None,
        help="Path to contents.db (source of headlines and IDs)",
    )
    parser.add_argument(
        "--model", "-m", default="mixedbread-ai/mxbai-embed-large-v1",
        help="HuggingFace model ID (default: mixedbread-ai/mxbai-embed-large-v1)",
    )
    parser.add_argument(
        "--queries-file", "-q", default="queries.txt",
        help="Path to the queries file (default: queries.txt)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output base.db",
    )
    parser.add_argument(
        "--batch-size", "-b", type=int, default=256,
        help="Batch size for embedding (default: 256)",
    )
    parser.add_argument(
        "--k", "-k", type=int, default=100,
        help="Number of nearest neighbors (default: 100)",
    )
    parser.add_argument(
        "--limit", "-l", type=int, default=0,
        help="Limit number of headlines to embed (0 = all)",
    )
    parser.add_argument(
        "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0",
        help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)",
    )
    parser.add_argument(
        "--skip-neighbors", action="store_true",
        help="Skip the brute-force KNN neighbor computation",
    )
    args = parser.parse_args()
    import os
    vec_path = os.path.expanduser(args.vec_path)
    print(f"Loading model {args.model}...")
    model = SentenceTransformer(args.model)
    # Read headlines from contents.db
    # (args.limit is argparse type=int, so the f-string cannot inject SQL)
    src = sqlite3.connect(args.contents_db)
    limit_clause = f" LIMIT {args.limit}" if args.limit > 0 else ""
    headlines = src.execute(
        f"SELECT id, headline FROM contents ORDER BY id{limit_clause}"
    ).fetchall()
    src.close()
    print(f"Loaded {len(headlines)} headlines from {args.contents_db}")
    # Read queries: one per line, blank lines skipped
    with open(args.queries_file) as f:
        queries = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(queries)} queries from {args.queries_file}")
    # Create output database; sqlite-vec supplies vec_distance_cosine()
    db = sqlite3.connect(args.output)
    db.enable_load_extension(True)
    db.load_extension(vec_path)
    db.enable_load_extension(False)
    db.execute("CREATE TABLE IF NOT EXISTS train(id INTEGER PRIMARY KEY, vector BLOB)")
    db.execute("CREATE TABLE IF NOT EXISTS query_vectors(id INTEGER PRIMARY KEY, vector BLOB)")
    db.execute(
        "CREATE TABLE IF NOT EXISTS neighbors("
        " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
        " UNIQUE(query_vector_id, rank))"
    )
    # Step 1: Embed headlines -> train table (one commit per batch)
    print("Embedding headlines...")
    for batch in tqdm(
        batched(headlines, args.batch_size),
        total=(len(headlines) + args.batch_size - 1) // args.batch_size,
    ):
        ids = [r[0] for r in batch]
        texts = [r[1] for r in batch]
        embeddings = model.encode(texts, normalize_embeddings=True)
        # Each embedding is stored as an f32 blob (array typecode "f").
        params = [
            (int(rid), array("f", emb.tolist()).tobytes())
            for rid, emb in zip(ids, embeddings)
        ]
        db.executemany("INSERT INTO train VALUES (?, ?)", params)
        db.commit()
    del headlines
    n = db.execute("SELECT count(*) FROM train").fetchone()[0]
    print(f"Embedded {n} headlines")
    # Step 2: Embed queries -> query_vectors table (ids are 1-based line order)
    print("Embedding queries...")
    query_embeddings = model.encode(queries, normalize_embeddings=True)
    query_params = []
    for i, emb in enumerate(query_embeddings, 1):
        blob = array("f", emb.tolist()).tobytes()
        query_params.append((i, blob))
    db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params)
    db.commit()
    print(f"Embedded {len(queries)} queries")
    if args.skip_neighbors:
        db.close()
        print(f"Done (skipped neighbors). Wrote {args.output}")
        return
    # Step 3: Brute-force KNN via sqlite-vec -> neighbors table
    n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0]
    print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...")
    for query_id, query_blob in tqdm(
        db.execute("SELECT id, vector FROM query_vectors").fetchall()
    ):
        # NOTE(review): this appears to rely on SQLite's lenient resolution
        # of the result alias `distance` inside WHERE (nonstandard SQL) —
        # confirm against the SQLite version in use.
        results = db.execute(
            """
            SELECT
              train.id,
              vec_distance_cosine(train.vector, ?) AS distance
            FROM train
            WHERE distance IS NOT NULL
            ORDER BY distance ASC
            LIMIT ?
            """,
            (query_blob, args.k),
        ).fetchall()
        params = [
            (query_id, rank, str(rid))
            for rank, (rid, _dist) in enumerate(results)
        ]
        db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params)
        db.commit()
    db.close()
    print(f"Done. Wrote {args.output}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,100 @@
latest news on climate change policy
presidential election results and analysis
stock market crash causes
coronavirus vaccine development updates
artificial intelligence breakthrough in healthcare
supreme court ruling on abortion rights
tech companies layoff announcements
earthquake damages in California
cybersecurity breach at major corporation
space exploration mission to Mars
immigration reform legislation debate
renewable energy investment trends
healthcare costs rising across America
protests against police brutality
wildfires destroy homes in the West
Olympic games highlights and records
celebrity scandal rocks Hollywood
breakthrough cancer treatment discovered
housing market bubble concerns
federal reserve interest rate decision
school shooting tragedy response
diplomatic tensions between superpowers
drone strike kills terrorist leader
social media platform faces regulation
archaeological discovery reveals ancient civilization
unemployment rate hits record low
autonomous vehicles testing expansion
streaming service launches original content
opioid crisis intervention programs
trade war tariffs impact economy
infrastructure bill passes Congress
data privacy concerns grow
minimum wage increase proposal
college admissions scandal exposed
NFL player protest during anthem
cryptocurrency regulation debate
pandemic lockdown restrictions eased
mass shooting gun control debate
tax reform legislation impact
ransomware attack cripples pipeline
climate activists stage demonstration
sports team wins championship
banking system collapse fears
pharmaceutical company fraud charges
genetic engineering ethical concerns
border wall funding controversy
impeachment proceedings begin
nuclear weapons treaty violation
artificial meat alternative launch
student loan debt forgiveness
venture capital funding decline
facial recognition ban proposed
election interference investigation
pandemic preparedness failures
police reform measures announced
wildfire prevention strategies
ocean pollution crisis worsens
manufacturing jobs returning
pension fund shortfall concerns
antitrust investigation launched
voting rights protection act
mental health awareness campaign
homeless population increasing
space debris collision risk
drug cartel violence escalates
renewable energy jobs growth
infrastructure deterioration report
vaccine mandate legal challenge
cryptocurrency market volatility
autonomous drone delivery service
deep fake technology dangers
Arctic ice melting accelerates
income inequality gap widens
election fraud claims disputed
corporate merger blocked
medical breakthrough extends life
transportation strike disrupts city
racial justice protests spread
carbon emissions reduction goals
financial crisis warning signs
cyberbullying prevention efforts
asteroid near miss with Earth
gene therapy approval granted
labor union organizing drive
surveillance technology expansion
education funding cuts proposed
disaster relief efforts underway
housing affordability crisis
clean water access shortage
artificial intelligence job displacement
trade agreement negotiations
prison reform initiative launched
species extinction accelerates
political corruption scandal
terrorism threat level raised
food safety contamination outbreak
ai model release
affordability interest rates
peanut allergies in newbons
breaking bad walter white

View file

@ -0,0 +1,29 @@
MODEL ?= mixedbread-ai/mxbai-embed-xsmall-v1
K ?= 100
BATCH_SIZE ?= 512
DATA_DIR ?= ../nyt/data

# Remove half-written outputs when a recipe fails.
.DELETE_ON_ERROR:
.PHONY: all clean

all: base.db

# Reuse data from ../nyt
$(DATA_DIR):
	$(MAKE) -C ../nyt data

# Order-only: only the directory's existence matters; its mtime changes as
# files are added and would otherwise retrigger this rule.
contents.db: | $(DATA_DIR)
	uv run ../nyt-768/build-contents.py --data-dir $(DATA_DIR) -o $@

base.db: contents.db queries.txt
	uv run ../nyt-1024/build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

queries.txt:
	cp ../nyt/queries.txt $@

clean:
	rm -f base.db contents.db

View file

@ -0,0 +1,100 @@
latest news on climate change policy
presidential election results and analysis
stock market crash causes
coronavirus vaccine development updates
artificial intelligence breakthrough in healthcare
supreme court ruling on abortion rights
tech companies layoff announcements
earthquake damages in California
cybersecurity breach at major corporation
space exploration mission to Mars
immigration reform legislation debate
renewable energy investment trends
healthcare costs rising across America
protests against police brutality
wildfires destroy homes in the West
Olympic games highlights and records
celebrity scandal rocks Hollywood
breakthrough cancer treatment discovered
housing market bubble concerns
federal reserve interest rate decision
school shooting tragedy response
diplomatic tensions between superpowers
drone strike kills terrorist leader
social media platform faces regulation
archaeological discovery reveals ancient civilization
unemployment rate hits record low
autonomous vehicles testing expansion
streaming service launches original content
opioid crisis intervention programs
trade war tariffs impact economy
infrastructure bill passes Congress
data privacy concerns grow
minimum wage increase proposal
college admissions scandal exposed
NFL player protest during anthem
cryptocurrency regulation debate
pandemic lockdown restrictions eased
mass shooting gun control debate
tax reform legislation impact
ransomware attack cripples pipeline
climate activists stage demonstration
sports team wins championship
banking system collapse fears
pharmaceutical company fraud charges
genetic engineering ethical concerns
border wall funding controversy
impeachment proceedings begin
nuclear weapons treaty violation
artificial meat alternative launch
student loan debt forgiveness
venture capital funding decline
facial recognition ban proposed
election interference investigation
pandemic preparedness failures
police reform measures announced
wildfire prevention strategies
ocean pollution crisis worsens
manufacturing jobs returning
pension fund shortfall concerns
antitrust investigation launched
voting rights protection act
mental health awareness campaign
homeless population increasing
space debris collision risk
drug cartel violence escalates
renewable energy jobs growth
infrastructure deterioration report
vaccine mandate legal challenge
cryptocurrency market volatility
autonomous drone delivery service
deep fake technology dangers
Arctic ice melting accelerates
income inequality gap widens
election fraud claims disputed
corporate merger blocked
medical breakthrough extends life
transportation strike disrupts city
racial justice protests spread
carbon emissions reduction goals
financial crisis warning signs
cyberbullying prevention efforts
asteroid near miss with Earth
gene therapy approval granted
labor union organizing drive
surveillance technology expansion
education funding cuts proposed
disaster relief efforts underway
housing affordability crisis
clean water access shortage
artificial intelligence job displacement
trade agreement negotiations
prison reform initiative launched
species extinction accelerates
political corruption scandal
terrorism threat level raised
food safety contamination outbreak
ai model release
affordability interest rates
peanut allergies in newbons
breaking bad walter white

View file

@ -0,0 +1,37 @@
MODEL ?= bge-base-en-v1.5-768
K ?= 100
BATCH_SIZE ?= 512
DATA_DIR ?= ../nyt/data

# Remove half-written outputs when a recipe fails.
.DELETE_ON_ERROR:
.PHONY: all clean clean-all

all: base.db

# Reuse data from ../nyt
$(DATA_DIR):
	$(MAKE) -C ../nyt data

# Distill model (separate step, may take a while). NOTE: if MODEL is
# overridden to a remote HuggingFace ID, this local-directory rule no
# longer applies.
$(MODEL):
	uv run distill-model.py

# Order-only: directories' mtimes churn as files are added and must not
# retrigger downstream rebuilds.
contents.db: | $(DATA_DIR)
	uv run build-contents.py --data-dir $(DATA_DIR) -o $@

base.db: contents.db queries.txt | $(MODEL)
	uv run ../nyt/build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

queries.txt:
	cp ../nyt/queries.txt $@

clean:
	rm -f base.db contents.db

# Guard the destructive recipe against MODEL being overridden to empty.
clean-all: clean
	$(if $(strip $(MODEL)),,$(error MODEL is empty))
	rm -rf $(MODEL)

View file

@ -0,0 +1,64 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "duckdb",
# ]
# ///
import argparse
import sqlite3
import duckdb
def main():
    """Load NYT headline CSVs into a SQLite contents DB: deduplicate by
    headline (keeping the most recent pub_date), order newest-first, keep
    at most --limit rows, and assign sequential 1-based ids."""
    parser = argparse.ArgumentParser(
        description="Load NYT headline CSVs into a SQLite contents database (most recent 1M, deduplicated)",
    )
    parser.add_argument(
        "--data-dir", "-d", default="../nyt/data",
        help="Directory containing NYT CSV files (default: ../nyt/data)",
    )
    parser.add_argument(
        "--limit", "-l", type=int, default=1_000_000,
        help="Maximum number of headlines to keep (default: 1000000)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output SQLite database",
    )
    args = parser.parse_args()
    glob_pattern = f"{args.data_dir}/new_york_times_stories_*.csv"
    # BUGFIX: escape single quotes so a data dir containing ' cannot break
    # (or inject into) the SQL literal; force the limit to int so the
    # f-string interpolation stays safe.
    safe_glob = glob_pattern.replace("'", "''")
    limit = int(args.limit)
    con = duckdb.connect()
    rows = con.execute(
        f"""
        WITH deduped AS (
            SELECT
                headline,
                max(pub_date) AS pub_date
            FROM read_csv('{safe_glob}', auto_detect=true, union_by_name=true)
            WHERE headline IS NOT NULL AND trim(headline) != ''
            GROUP BY headline
        )
        SELECT
            row_number() OVER (ORDER BY pub_date DESC) AS id,
            headline
        FROM deduped
        ORDER BY pub_date DESC
        LIMIT {limit}
        """
    ).fetchall()
    con.close()
    db = sqlite3.connect(args.output)
    db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
    db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
    db.commit()
    db.close()
    print(f"Wrote {len(rows)} headlines to {args.output}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,13 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "model2vec[distill]",
# "torch<=2.7",
# ]
# ///
# Distill BAAI/bge-base-en-v1.5 into a static model2vec embedding model,
# keeping 768 output dimensions via PCA (downloads the base model; slow).
from model2vec.distill import distill
model = distill(model_name="BAAI/bge-base-en-v1.5", pca_dims=768)
# The saved directory doubles as the Makefile's $(MODEL) target path.
model.save_pretrained("bge-base-en-v1.5-768")
print("Saved distilled model to bge-base-en-v1.5-768/")

View file

@ -0,0 +1,100 @@
latest news on climate change policy
presidential election results and analysis
stock market crash causes
coronavirus vaccine development updates
artificial intelligence breakthrough in healthcare
supreme court ruling on abortion rights
tech companies layoff announcements
earthquake damages in California
cybersecurity breach at major corporation
space exploration mission to Mars
immigration reform legislation debate
renewable energy investment trends
healthcare costs rising across America
protests against police brutality
wildfires destroy homes in the West
Olympic games highlights and records
celebrity scandal rocks Hollywood
breakthrough cancer treatment discovered
housing market bubble concerns
federal reserve interest rate decision
school shooting tragedy response
diplomatic tensions between superpowers
drone strike kills terrorist leader
social media platform faces regulation
archaeological discovery reveals ancient civilization
unemployment rate hits record low
autonomous vehicles testing expansion
streaming service launches original content
opioid crisis intervention programs
trade war tariffs impact economy
infrastructure bill passes Congress
data privacy concerns grow
minimum wage increase proposal
college admissions scandal exposed
NFL player protest during anthem
cryptocurrency regulation debate
pandemic lockdown restrictions eased
mass shooting gun control debate
tax reform legislation impact
ransomware attack cripples pipeline
climate activists stage demonstration
sports team wins championship
banking system collapse fears
pharmaceutical company fraud charges
genetic engineering ethical concerns
border wall funding controversy
impeachment proceedings begin
nuclear weapons treaty violation
artificial meat alternative launch
student loan debt forgiveness
venture capital funding decline
facial recognition ban proposed
election interference investigation
pandemic preparedness failures
police reform measures announced
wildfire prevention strategies
ocean pollution crisis worsens
manufacturing jobs returning
pension fund shortfall concerns
antitrust investigation launched
voting rights protection act
mental health awareness campaign
homeless population increasing
space debris collision risk
drug cartel violence escalates
renewable energy jobs growth
infrastructure deterioration report
vaccine mandate legal challenge
cryptocurrency market volatility
autonomous drone delivery service
deep fake technology dangers
Arctic ice melting accelerates
income inequality gap widens
election fraud claims disputed
corporate merger blocked
medical breakthrough extends life
transportation strike disrupts city
racial justice protests spread
carbon emissions reduction goals
financial crisis warning signs
cyberbullying prevention efforts
asteroid near miss with Earth
gene therapy approval granted
labor union organizing drive
surveillance technology expansion
education funding cuts proposed
disaster relief efforts underway
housing affordability crisis
clean water access shortage
artificial intelligence job displacement
trade agreement negotiations
prison reform initiative launched
species extinction accelerates
political corruption scandal
terrorism threat level raised
food safety contamination outbreak
ai model release
affordability interest rates
peanut allergies in newbons
breaking bad walter white

View file

@ -0,0 +1 @@
data/

View file

@ -0,0 +1,30 @@
MODEL ?= minishlab/potion-base-8M
K ?= 100
BATCH_SIZE ?= 512
DATA_DIR ?= data

# Remove half-written outputs when a recipe fails.
.DELETE_ON_ERROR:
.PHONY: all clean clean-all

all: base.db contents.db

# Download NYT headlines CSVs from Kaggle (requires `kaggle` CLI + API token)
$(DATA_DIR):
	kaggle datasets download -d johnbandy/new-york-times-headlines -p $(DATA_DIR) --unzip

# Order-only: the directory's mtime changes as files are added, which would
# otherwise retrigger this rule spuriously.
contents.db: | $(DATA_DIR)
	uv run build-contents.py --data-dir $(DATA_DIR) -o $@

base.db: contents.db queries.txt
	uv run build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

clean:
	rm -f base.db contents.db

# Guard the destructive recipe against DATA_DIR being overridden to empty.
clean-all: clean
	$(if $(strip $(DATA_DIR)),,$(error DATA_DIR is empty))
	rm -rf $(DATA_DIR)

View file

@ -0,0 +1,165 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "model2vec",
# "torch<=2.7",
# "tqdm",
# ]
# ///
import argparse
import sqlite3
from array import array
from itertools import batched
from model2vec import StaticModel
from tqdm import tqdm
def main():
    """Build base.db: embed headlines into `train`, embed queries into
    `query_vectors`, then compute brute-force KNN ground truth into
    `neighbors` via the sqlite-vec extension. With --rebuild-neighbors,
    reuse an existing base.db and only recompute the neighbors table."""
    parser = argparse.ArgumentParser(
        description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors",
    )
    parser.add_argument(
        "--contents-db", "-c", default=None,
        help="Path to contents.db (source of headlines and IDs)",
    )
    parser.add_argument(
        "--model", "-m", default="minishlab/potion-base-8M",
        help="HuggingFace model ID or local path (default: minishlab/potion-base-8M)",
    )
    parser.add_argument(
        "--queries-file", "-q", default="queries.txt",
        help="Path to the queries file (default: queries.txt)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output base.db",
    )
    parser.add_argument(
        "--batch-size", "-b", type=int, default=512,
        help="Batch size for embedding (default: 512)",
    )
    parser.add_argument(
        "--k", "-k", type=int, default=100,
        help="Number of nearest neighbors (default: 100)",
    )
    parser.add_argument(
        "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0",
        help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)",
    )
    parser.add_argument(
        "--rebuild-neighbors", action="store_true",
        help="Only rebuild the neighbors table (skip embedding steps)",
    )
    args = parser.parse_args()
    import os
    vec_path = os.path.expanduser(args.vec_path)
    if args.rebuild_neighbors:
        # Skip embedding, just open existing DB and rebuild neighbors
        db = sqlite3.connect(args.output)
        db.enable_load_extension(True)
        db.load_extension(vec_path)
        db.enable_load_extension(False)
        db.execute("DROP TABLE IF EXISTS neighbors")
        db.execute(
            "CREATE TABLE neighbors("
            " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
            " UNIQUE(query_vector_id, rank))"
        )
        print(f"Rebuilding neighbors in {args.output}...")
    else:
        print(f"Loading model {args.model}...")
        model = StaticModel.from_pretrained(args.model)
        # Read headlines from contents.db
        src = sqlite3.connect(args.contents_db)
        headlines = src.execute("SELECT id, headline FROM contents ORDER BY id").fetchall()
        src.close()
        print(f"Loaded {len(headlines)} headlines from {args.contents_db}")
        # Read queries: one per line, blank lines skipped
        with open(args.queries_file) as f:
            queries = [line.strip() for line in f if line.strip()]
        print(f"Loaded {len(queries)} queries from {args.queries_file}")
        # Create output database; sqlite-vec supplies vec_distance_cosine()
        db = sqlite3.connect(args.output)
        db.enable_load_extension(True)
        db.load_extension(vec_path)
        db.enable_load_extension(False)
        db.execute("CREATE TABLE train(id INTEGER PRIMARY KEY, vector BLOB)")
        db.execute("CREATE TABLE query_vectors(id INTEGER PRIMARY KEY, vector BLOB)")
        db.execute(
            "CREATE TABLE neighbors("
            " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
            " UNIQUE(query_vector_id, rank))"
        )
        # Step 1: Embed headlines -> train table (one commit per batch)
        print("Embedding headlines...")
        for batch in tqdm(
            batched(headlines, args.batch_size),
            total=(len(headlines) + args.batch_size - 1) // args.batch_size,
        ):
            ids = [r[0] for r in batch]
            texts = [r[1] for r in batch]
            embeddings = model.encode(texts)
            # Each embedding is stored as an f32 blob (array typecode "f").
            params = [
                (int(rid), array("f", emb.tolist()).tobytes())
                for rid, emb in zip(ids, embeddings)
            ]
            db.executemany("INSERT INTO train VALUES (?, ?)", params)
            db.commit()
        del headlines
        n = db.execute("SELECT count(*) FROM train").fetchone()[0]
        print(f"Embedded {n} headlines")
        # Step 2: Embed queries -> query_vectors table (1-based line-order ids)
        print("Embedding queries...")
        query_embeddings = model.encode(queries)
        query_params = []
        for i, emb in enumerate(query_embeddings, 1):
            blob = array("f", emb.tolist()).tobytes()
            query_params.append((i, blob))
        db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params)
        db.commit()
        print(f"Embedded {len(queries)} queries")
    # Step 3: Brute-force KNN via sqlite-vec -> neighbors table
    # (runs in both modes; `db` is open in either branch above)
    n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0]
    print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...")
    for query_id, query_blob in tqdm(
        db.execute("SELECT id, vector FROM query_vectors").fetchall()
    ):
        # NOTE(review): this appears to rely on SQLite's lenient resolution
        # of the result alias `distance` inside WHERE (nonstandard SQL) —
        # confirm against the SQLite version in use.
        results = db.execute(
            """
            SELECT
              train.id,
              vec_distance_cosine(train.vector, ?) AS distance
            FROM train
            WHERE distance IS NOT NULL
            ORDER BY distance ASC
            LIMIT ?
            """,
            (query_blob, args.k),
        ).fetchall()
        params = [
            (query_id, rank, str(rid))
            for rank, (rid, _dist) in enumerate(results)
        ]
        db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params)
        db.commit()
    db.close()
    print(f"Done. Wrote {args.output}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,52 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "duckdb",
# ]
# ///
import argparse
import os
import sqlite3
import duckdb
def main():
    """Load NYT headline CSVs into a SQLite ``contents`` database.

    Globs ``new_york_times_stories_*.csv`` under ``--data-dir``, extracts
    the non-empty ``headline`` column via DuckDB (``union_by_name`` tolerates
    schema drift across yearly files), and writes ``(id, headline)`` rows
    into a fresh ``contents`` table at ``--output``.
    """
    parser = argparse.ArgumentParser(
        description="Load NYT headline CSVs into a SQLite contents database via DuckDB",
    )
    parser.add_argument(
        "--data-dir", "-d", default="data",
        help="Directory containing NYT CSV files (default: data)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output SQLite database",
    )
    args = parser.parse_args()

    glob_pattern = os.path.join(args.data_dir, "new_york_times_stories_*.csv")

    con = duckdb.connect()
    try:
        # Bind the glob as a prepared-statement parameter instead of
        # interpolating it into the SQL text, so a path containing a quote
        # (or any other SQL metacharacter) cannot break the statement.
        rows = con.execute(
            """
            SELECT
              row_number() OVER () AS id,
              headline
            FROM read_csv(?, auto_detect=true, union_by_name=true)
            WHERE headline IS NOT NULL AND headline != ''
            """,
            [glob_pattern],
        ).fetchall()
    finally:
        # Close the DuckDB connection even if the query fails.
        con.close()

    db = sqlite3.connect(args.output)
    try:
        db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
        db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
        db.commit()
    finally:
        db.close()
    print(f"Wrote {len(rows)} headlines to {args.output}")
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,100 @@
latest news on climate change policy
presidential election results and analysis
stock market crash causes
coronavirus vaccine development updates
artificial intelligence breakthrough in healthcare
supreme court ruling on abortion rights
tech companies layoff announcements
earthquake damages in California
cybersecurity breach at major corporation
space exploration mission to Mars
immigration reform legislation debate
renewable energy investment trends
healthcare costs rising across America
protests against police brutality
wildfires destroy homes in the West
Olympic games highlights and records
celebrity scandal rocks Hollywood
breakthrough cancer treatment discovered
housing market bubble concerns
federal reserve interest rate decision
school shooting tragedy response
diplomatic tensions between superpowers
drone strike kills terrorist leader
social media platform faces regulation
archaeological discovery reveals ancient civilization
unemployment rate hits record low
autonomous vehicles testing expansion
streaming service launches original content
opioid crisis intervention programs
trade war tariffs impact economy
infrastructure bill passes Congress
data privacy concerns grow
minimum wage increase proposal
college admissions scandal exposed
NFL player protest during anthem
cryptocurrency regulation debate
pandemic lockdown restrictions eased
mass shooting gun control debate
tax reform legislation impact
ransomware attack cripples pipeline
climate activists stage demonstration
sports team wins championship
banking system collapse fears
pharmaceutical company fraud charges
genetic engineering ethical concerns
border wall funding controversy
impeachment proceedings begin
nuclear weapons treaty violation
artificial meat alternative launch
student loan debt forgiveness
venture capital funding decline
facial recognition ban proposed
election interference investigation
pandemic preparedness failures
police reform measures announced
wildfire prevention strategies
ocean pollution crisis worsens
manufacturing jobs returning
pension fund shortfall concerns
antitrust investigation launched
voting rights protection act
mental health awareness campaign
homeless population increasing
space debris collision risk
drug cartel violence escalates
renewable energy jobs growth
infrastructure deterioration report
vaccine mandate legal challenge
cryptocurrency market volatility
autonomous drone delivery service
deep fake technology dangers
Arctic ice melting accelerates
income inequality gap widens
election fraud claims disputed
corporate merger blocked
medical breakthrough extends life
transportation strike disrupts city
racial justice protests spread
carbon emissions reduction goals
financial crisis warning signs
cyberbullying prevention efforts
asteroid near miss with Earth
gene therapy approval granted
labor union organizing drive
surveillance technology expansion
education funding cuts proposed
disaster relief efforts underway
housing affordability crisis
clean water access shortage
artificial intelligence job displacement
trade agreement negotiations
prison reform initiative launched
species extinction accelerates
political corruption scandal
terrorism threat level raised
food safety contamination outbreak
ai model release
affordability interest rates
peanut allergies in newborns
breaking bad walter white