mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 08:46:49 +02:00
Add comprehensive ANN benchmarking suite
Extend benchmarks-ann/ with results database (SQLite with per-query detail and continuous writes), dataset subfolder organization, --subset-size and --warmup options. Supports systematic comparison across flat, rescore, IVF, and DiskANN index types.
This commit is contained in:
parent
a248ecd061
commit
dbbb4b98f7
26 changed files with 2127 additions and 292 deletions
27
benchmarks-ann/datasets/cohere10m/Makefile
Normal file
27
benchmarks-ann/datasets/cohere10m/Makefile
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
# Download URLs for the Cohere 10M benchmark dataset (Zilliz mirror).
BASE_URL = https://assets.zilliz.com/benchmark/cohere_large_10m

# Ten training shards: train-00-of-10.parquet .. train-09-of-10.parquet.
TRAIN_PARQUETS = $(shell printf 'train-%02d-of-10.parquet ' 0 1 2 3 4 5 6 7 8 9)
OTHER_PARQUETS = test.parquet neighbors.parquet
PARQUETS = $(TRAIN_PARQUETS) $(OTHER_PARQUETS)

.PHONY: all download clean

all: base.db

# Use: make -j12 download
download: $(PARQUETS)

# Pattern rule: fetch any training shard by its own filename.
train-%-of-10.parquet:
	curl -L -o $@ $(BASE_URL)/$@

test.parquet:
	curl -L -o $@ $(BASE_URL)/test.parquet

neighbors.parquet:
	curl -L -o $@ $(BASE_URL)/neighbors.parquet

# Build the SQLite base database from the downloaded parquet files.
base.db: $(PARQUETS) build_base_db.py
	uv run --with pandas --with pyarrow python build_base_db.py

# Remove only the built database; downloaded parquets are kept.
clean:
	rm -f base.db
134
benchmarks-ann/datasets/cohere10m/build_base_db.py
Normal file
134
benchmarks-ann/datasets/cohere10m/build_base_db.py
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Build base.db from downloaded parquet files (10M dataset, 10 train shards).
|
||||
|
||||
Reads train-00-of-10.parquet .. train-09-of-10.parquet, test.parquet,
|
||||
neighbors.parquet and creates a SQLite database with tables:
|
||||
train, query_vectors, neighbors.
|
||||
|
||||
Usage:
|
||||
uv run --with pandas --with pyarrow python build_base_db.py
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import struct
|
||||
import sys
|
||||
import time
|
||||
|
||||
import pandas as pd
|
||||
|
||||
TRAIN_SHARDS = 10
|
||||
|
||||
|
||||
def float_list_to_blob(floats):
    """Serialize a sequence of floats as a packed little-endian f32 blob."""
    fmt = "<%df" % len(floats)
    return struct.pack(fmt, *floats)
||||
|
||||
|
||||
def main():
    """Build base.db from the 10 train shards plus test/neighbors parquets.

    Creates three tables:
      * train(id, vector)           -- 10M base vectors as little-endian f32 blobs
      * query_vectors(id, vector)   -- test-set query vectors
      * neighbors(query_vector_id, rank, neighbors_id) -- ground-truth KNN

    Exits with status 1 if any input parquet is missing.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    db_path = os.path.join(script_dir, "base.db")

    train_paths = [
        os.path.join(script_dir, f"train-{i:02d}-of-{TRAIN_SHARDS}.parquet")
        for i in range(TRAIN_SHARDS)
    ]
    test_path = os.path.join(script_dir, "test.parquet")
    neighbors_path = os.path.join(script_dir, "neighbors.parquet")

    # Fail fast with a clear message if any input is missing.
    for p in train_paths + [test_path, neighbors_path]:
        if not os.path.exists(p):
            print(f"ERROR: {p} not found. Run 'make download' first.")
            sys.exit(1)

    # Always rebuild from scratch.
    if os.path.exists(db_path):
        os.remove(db_path)

    conn = sqlite3.connect(db_path)
    # BUGFIX: page_size must be set before the database file is initialized.
    # "PRAGMA journal_mode=WAL" writes the first page and freezes the page
    # size, so the previous order (WAL first, page_size second) made the
    # page_size pragma a silent no-op. Set page_size first, then enter WAL.
    conn.execute("PRAGMA page_size=4096")
    conn.execute("PRAGMA journal_mode=WAL")

    # --- query_vectors (from test.parquet) ---
    print("Loading test.parquet (query vectors)...")
    t0 = time.perf_counter()
    df_test = pd.read_parquet(test_path)
    conn.execute(
        "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)"
    )
    rows = []
    for _, row in df_test.iterrows():
        rows.append((int(row["id"]), float_list_to_blob(row["emb"])))
    conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows)
    conn.commit()
    print(f" {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s")

    # --- neighbors (from neighbors.parquet) ---
    print("Loading neighbors.parquet...")
    t0 = time.perf_counter()
    df_neighbors = pd.read_parquet(neighbors_path)
    conn.execute(
        "CREATE TABLE neighbors ("
        " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
        " UNIQUE(query_vector_id, rank))"
    )
    rows = []
    for _, row in df_neighbors.iterrows():
        qid = int(row["id"])
        # neighbors_id may arrive as an array-like or a JSON-encoded string.
        nids = row["neighbors_id"]
        if isinstance(nids, str):
            nids = json.loads(nids)
        for rank, nid in enumerate(nids):
            rows.append((qid, rank, str(int(nid))))
    conn.executemany(
        "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)",
        rows,
    )
    conn.commit()
    print(f" {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s")

    # --- train (from 10 shard parquets) ---
    print(f"Loading {TRAIN_SHARDS} train shards (10M vectors, this will take a while)...")
    conn.execute(
        "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)"
    )

    global_t0 = time.perf_counter()
    total_inserted = 0
    batch_size = 10000

    for shard_idx, train_path in enumerate(train_paths):
        print(f" Shard {shard_idx + 1}/{TRAIN_SHARDS}: {os.path.basename(train_path)}")
        t0 = time.perf_counter()
        df = pd.read_parquet(train_path)
        shard_len = len(df)

        # Insert in batches with a commit per batch so progress is durable
        # and memory for the parameter list stays bounded.
        for start in range(0, shard_len, batch_size):
            chunk = df.iloc[start : start + batch_size]
            rows = []
            for _, row in chunk.iterrows():
                rows.append((int(row["id"]), float_list_to_blob(row["emb"])))
            conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows)
            conn.commit()

            total_inserted += len(rows)
            # Print progress roughly once per 100k rows (fires when a batch
            # crosses a 100k boundary).
            if total_inserted % 100000 < batch_size:
                elapsed = time.perf_counter() - global_t0
                rate = total_inserted / elapsed if elapsed > 0 else 0
                print(
                    f" {total_inserted:>10} {elapsed:.0f}s {rate:.0f} rows/s",
                    flush=True,
                )

        shard_elapsed = time.perf_counter() - t0
        print(f" shard done: {shard_len} rows in {shard_elapsed:.1f}s")

    elapsed = time.perf_counter() - global_t0
    print(f" {total_inserted} train vectors in {elapsed:.1f}s")

    conn.close()
    size_mb = os.path.getsize(db_path) / (1024 * 1024)
    print(f"\nDone: {db_path} ({size_mb:.0f} MB)")


if __name__ == "__main__":
    main()
|
||||
2
benchmarks-ann/datasets/cohere1m/.gitignore
vendored
Normal file
2
benchmarks-ann/datasets/cohere1m/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
*.parquet
|
||||
base.db
|
||||
24
benchmarks-ann/datasets/cohere1m/Makefile
Normal file
24
benchmarks-ann/datasets/cohere1m/Makefile
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
# Download URLs for the Cohere 1M benchmark dataset (Zilliz mirror).
BASE_URL = https://assets.zilliz.com/benchmark/cohere_medium_1m

PARQUETS = train.parquet test.parquet neighbors.parquet

# BUGFIX: base.db was previously listed in .PHONY, which forced the expensive
# database rebuild on every `make` run even when base.db was newer than its
# prerequisites. base.db is a real file target; only all/download/clean are phony.
.PHONY: all download clean

all: base.db

download: $(PARQUETS)

train.parquet:
	curl -L -o $@ $(BASE_URL)/train.parquet

test.parquet:
	curl -L -o $@ $(BASE_URL)/test.parquet

neighbors.parquet:
	curl -L -o $@ $(BASE_URL)/neighbors.parquet

# Build the SQLite base database from the downloaded parquet files.
base.db: $(PARQUETS) build_base_db.py
	uv run --with pandas --with pyarrow python build_base_db.py

# Remove only the built database; downloaded parquets are kept.
clean:
	rm -f base.db
||||
121
benchmarks-ann/datasets/cohere1m/build_base_db.py
Normal file
121
benchmarks-ann/datasets/cohere1m/build_base_db.py
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Build base.db from downloaded parquet files.
|
||||
|
||||
Reads train.parquet, test.parquet, neighbors.parquet and creates a SQLite
|
||||
database with tables: train, query_vectors, neighbors.
|
||||
|
||||
Usage:
|
||||
uv run --with pandas --with pyarrow python build_base_db.py
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import struct
|
||||
import sys
|
||||
import time
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def float_list_to_blob(floats):
    """Encode an iterable of floats as a little-endian float32 byte blob."""
    values = list(floats)
    return struct.pack(f"<{len(values)}f", *values)
|
||||
|
||||
|
||||
def main():
    """Build base.db from train/test/neighbors parquets (1M dataset).

    Creates three tables:
      * train(id, vector)           -- base vectors as little-endian f32 blobs
      * query_vectors(id, vector)   -- test-set query vectors
      * neighbors(query_vector_id, rank, neighbors_id) -- ground-truth KNN

    Exits with status 1 if any input parquet is missing.
    """
    seed_dir = os.path.dirname(os.path.abspath(__file__))
    db_path = os.path.join(seed_dir, "base.db")

    train_path = os.path.join(seed_dir, "train.parquet")
    test_path = os.path.join(seed_dir, "test.parquet")
    neighbors_path = os.path.join(seed_dir, "neighbors.parquet")

    # Fail fast with a clear message if any input is missing.
    for p in (train_path, test_path, neighbors_path):
        if not os.path.exists(p):
            print(f"ERROR: {p} not found. Run 'make download' first.")
            sys.exit(1)

    # Always rebuild from scratch.
    if os.path.exists(db_path):
        os.remove(db_path)

    conn = sqlite3.connect(db_path)
    # BUGFIX: page_size must be set before the database file is initialized.
    # "PRAGMA journal_mode=WAL" writes the first page and freezes the page
    # size, so the previous order (WAL first, page_size second) made the
    # page_size pragma a silent no-op. Set page_size first, then enter WAL.
    conn.execute("PRAGMA page_size=4096")
    conn.execute("PRAGMA journal_mode=WAL")

    # --- query_vectors (from test.parquet) ---
    print("Loading test.parquet (query vectors)...")
    t0 = time.perf_counter()
    df_test = pd.read_parquet(test_path)
    conn.execute(
        "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)"
    )
    rows = []
    for _, row in df_test.iterrows():
        rows.append((int(row["id"]), float_list_to_blob(row["emb"])))
    conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows)
    conn.commit()
    print(f" {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s")

    # --- neighbors (from neighbors.parquet) ---
    print("Loading neighbors.parquet...")
    t0 = time.perf_counter()
    df_neighbors = pd.read_parquet(neighbors_path)
    conn.execute(
        "CREATE TABLE neighbors ("
        " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
        " UNIQUE(query_vector_id, rank))"
    )
    rows = []
    for _, row in df_neighbors.iterrows():
        qid = int(row["id"])
        # neighbors_id may be a numpy array or JSON string
        nids = row["neighbors_id"]
        if isinstance(nids, str):
            nids = json.loads(nids)
        for rank, nid in enumerate(nids):
            rows.append((qid, rank, str(int(nid))))
    conn.executemany(
        "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)",
        rows,
    )
    conn.commit()
    print(f" {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s")

    # --- train (from train.parquet) ---
    print("Loading train.parquet (1M vectors, this takes a few minutes)...")
    t0 = time.perf_counter()
    conn.execute(
        "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)"
    )

    batch_size = 10000
    df_iter = pd.read_parquet(train_path)
    total = len(df_iter)

    # Insert in batches with a commit per batch so progress is durable and
    # the parameter list stays bounded in memory.
    for start in range(0, total, batch_size):
        chunk = df_iter.iloc[start : start + batch_size]
        rows = []
        for _, row in chunk.iterrows():
            rows.append((int(row["id"]), float_list_to_blob(row["emb"])))
        conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows)
        conn.commit()

        done = min(start + batch_size, total)
        elapsed = time.perf_counter() - t0
        rate = done / elapsed if elapsed > 0 else 0
        eta = (total - done) / rate if rate > 0 else 0
        print(
            f" {done:>8}/{total} {elapsed:.0f}s {rate:.0f} rows/s eta {eta:.0f}s",
            flush=True,
        )

    elapsed = time.perf_counter() - t0
    print(f" {total} train vectors in {elapsed:.1f}s")

    conn.close()
    size_mb = os.path.getsize(db_path) / (1024 * 1024)
    print(f"\nDone: {db_path} ({size_mb:.0f} MB)")


if __name__ == "__main__":
    main()
|
||||
30
benchmarks-ann/datasets/nyt-1024/Makefile
Normal file
30
benchmarks-ann/datasets/nyt-1024/Makefile
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
# Embedding model used to build the 1024-dim NYT headline vectors.
MODEL ?= mixedbread-ai/mxbai-embed-large-v1
# Number of ground-truth nearest neighbors to compute per query.
K ?= 100
BATCH_SIZE ?= 256
# Raw NYT CSVs are shared with the ../nyt dataset.
DATA_DIR ?= ../nyt/data

all: base.db

# Reuse data from ../nyt
$(DATA_DIR):
	$(MAKE) -C ../nyt data

# Deduplicated headlines SQLite DB, built with the shared nyt-768 script.
contents.db: $(DATA_DIR)
	uv run ../nyt-768/build-contents.py --data-dir $(DATA_DIR) -o $@

# Embed headlines + queries and compute brute-force KNN ground truth.
base.db: contents.db queries.txt
	uv run build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

queries.txt:
	cp ../nyt/queries.txt $@

clean:
	rm -f base.db contents.db

.PHONY: all clean
||||
163
benchmarks-ann/datasets/nyt-1024/build-base.py
Normal file
163
benchmarks-ann/datasets/nyt-1024/build-base.py
Normal file
|
|
@ -0,0 +1,163 @@
|
|||
# /// script
|
||||
# requires-python = ">=3.12"
|
||||
# dependencies = [
|
||||
# "sentence-transformers",
|
||||
# "torch<=2.7",
|
||||
# "tqdm",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import sqlite3
|
||||
from array import array
|
||||
from itertools import batched
|
||||
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def main():
    """Build base.db: embed headlines and queries, then compute brute-force KNN.

    Tables created (IF NOT EXISTS, so rerunning on an existing db appends):
      * train(id, vector)          -- one f32 blob per headline embedding
      * query_vectors(id, vector)  -- one f32 blob per query; ids start at 1
      * neighbors(query_vector_id, rank, neighbors_id) -- ground truth, rank
        starting at 0, ordered by ascending cosine distance
    """
    parser = argparse.ArgumentParser(
        description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors",
    )
    parser.add_argument(
        "--contents-db", "-c", default=None,
        help="Path to contents.db (source of headlines and IDs)",
    )
    parser.add_argument(
        "--model", "-m", default="mixedbread-ai/mxbai-embed-large-v1",
        help="HuggingFace model ID (default: mixedbread-ai/mxbai-embed-large-v1)",
    )
    parser.add_argument(
        "--queries-file", "-q", default="queries.txt",
        help="Path to the queries file (default: queries.txt)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output base.db",
    )
    parser.add_argument(
        "--batch-size", "-b", type=int, default=256,
        help="Batch size for embedding (default: 256)",
    )
    parser.add_argument(
        "--k", "-k", type=int, default=100,
        help="Number of nearest neighbors (default: 100)",
    )
    parser.add_argument(
        "--limit", "-l", type=int, default=0,
        help="Limit number of headlines to embed (0 = all)",
    )
    parser.add_argument(
        "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0",
        help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)",
    )
    parser.add_argument(
        "--skip-neighbors", action="store_true",
        help="Skip the brute-force KNN neighbor computation",
    )
    args = parser.parse_args()

    # Local import: os is only needed here, to expand ~ in --vec-path.
    import os
    vec_path = os.path.expanduser(args.vec_path)

    print(f"Loading model {args.model}...")
    model = SentenceTransformer(args.model)

    # Read headlines from contents.db.
    # args.limit is an int (argparse type=int), so interpolating it into the
    # SQL string is safe.
    src = sqlite3.connect(args.contents_db)
    limit_clause = f" LIMIT {args.limit}" if args.limit > 0 else ""
    headlines = src.execute(
        f"SELECT id, headline FROM contents ORDER BY id{limit_clause}"
    ).fetchall()
    src.close()
    print(f"Loaded {len(headlines)} headlines from {args.contents_db}")

    # Read queries (one per line, blank lines skipped).
    with open(args.queries_file) as f:
        queries = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(queries)} queries from {args.queries_file}")

    # Create output database with the sqlite-vec extension loaded
    # (needed for vec_distance_cosine in step 3).
    db = sqlite3.connect(args.output)
    db.enable_load_extension(True)
    db.load_extension(vec_path)
    db.enable_load_extension(False)

    db.execute("CREATE TABLE IF NOT EXISTS train(id INTEGER PRIMARY KEY, vector BLOB)")
    db.execute("CREATE TABLE IF NOT EXISTS query_vectors(id INTEGER PRIMARY KEY, vector BLOB)")
    db.execute(
        "CREATE TABLE IF NOT EXISTS neighbors("
        " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
        " UNIQUE(query_vector_id, rank))"
    )

    # Step 1: Embed headlines -> train table, committing once per batch so
    # progress is durable.
    print("Embedding headlines...")
    for batch in tqdm(
        batched(headlines, args.batch_size),
        total=(len(headlines) + args.batch_size - 1) // args.batch_size,
    ):
        ids = [r[0] for r in batch]
        texts = [r[1] for r in batch]
        embeddings = model.encode(texts, normalize_embeddings=True)

        # Store each embedding as a packed f32 blob keyed by headline id.
        params = [
            (int(rid), array("f", emb.tolist()).tobytes())
            for rid, emb in zip(ids, embeddings)
        ]
        db.executemany("INSERT INTO train VALUES (?, ?)", params)
        db.commit()

    # Free the headline list before the KNN pass.
    del headlines
    n = db.execute("SELECT count(*) FROM train").fetchone()[0]
    print(f"Embedded {n} headlines")

    # Step 2: Embed queries -> query_vectors table (ids are 1-based).
    print("Embedding queries...")
    query_embeddings = model.encode(queries, normalize_embeddings=True)
    query_params = []
    for i, emb in enumerate(query_embeddings, 1):
        blob = array("f", emb.tolist()).tobytes()
        query_params.append((i, blob))
    db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params)
    db.commit()
    print(f"Embedded {len(queries)} queries")

    if args.skip_neighbors:
        db.close()
        print(f"Done (skipped neighbors). Wrote {args.output}")
        return

    # Step 3: Brute-force KNN via sqlite-vec -> neighbors table.
    n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0]
    print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...")
    for query_id, query_blob in tqdm(
        db.execute("SELECT id, vector FROM query_vectors").fetchall()
    ):
        # NOTE(review): the WHERE clause references the "distance" result
        # alias; SQLite accepts this as a compatibility extension — confirm
        # against the target SQLite version.
        results = db.execute(
            """
            SELECT
              train.id,
              vec_distance_cosine(train.vector, ?) AS distance
            FROM train
            WHERE distance IS NOT NULL
            ORDER BY distance ASC
            LIMIT ?
            """,
            (query_blob, args.k),
        ).fetchall()

        params = [
            (query_id, rank, str(rid))
            for rank, (rid, _dist) in enumerate(results)
        ]
        db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params)

    # Single commit after all queries; neighbor rows for all queries land
    # in one transaction.
    db.commit()
    db.close()
    print(f"Done. Wrote {args.output}")


if __name__ == "__main__":
    main()
|
||||
100
benchmarks-ann/datasets/nyt-1024/queries.txt
Normal file
100
benchmarks-ann/datasets/nyt-1024/queries.txt
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
latest news on climate change policy
|
||||
presidential election results and analysis
|
||||
stock market crash causes
|
||||
coronavirus vaccine development updates
|
||||
artificial intelligence breakthrough in healthcare
|
||||
supreme court ruling on abortion rights
|
||||
tech companies layoff announcements
|
||||
earthquake damages in California
|
||||
cybersecurity breach at major corporation
|
||||
space exploration mission to Mars
|
||||
immigration reform legislation debate
|
||||
renewable energy investment trends
|
||||
healthcare costs rising across America
|
||||
protests against police brutality
|
||||
wildfires destroy homes in the West
|
||||
Olympic games highlights and records
|
||||
celebrity scandal rocks Hollywood
|
||||
breakthrough cancer treatment discovered
|
||||
housing market bubble concerns
|
||||
federal reserve interest rate decision
|
||||
school shooting tragedy response
|
||||
diplomatic tensions between superpowers
|
||||
drone strike kills terrorist leader
|
||||
social media platform faces regulation
|
||||
archaeological discovery reveals ancient civilization
|
||||
unemployment rate hits record low
|
||||
autonomous vehicles testing expansion
|
||||
streaming service launches original content
|
||||
opioid crisis intervention programs
|
||||
trade war tariffs impact economy
|
||||
infrastructure bill passes Congress
|
||||
data privacy concerns grow
|
||||
minimum wage increase proposal
|
||||
college admissions scandal exposed
|
||||
NFL player protest during anthem
|
||||
cryptocurrency regulation debate
|
||||
pandemic lockdown restrictions eased
|
||||
mass shooting gun control debate
|
||||
tax reform legislation impact
|
||||
ransomware attack cripples pipeline
|
||||
climate activists stage demonstration
|
||||
sports team wins championship
|
||||
banking system collapse fears
|
||||
pharmaceutical company fraud charges
|
||||
genetic engineering ethical concerns
|
||||
border wall funding controversy
|
||||
impeachment proceedings begin
|
||||
nuclear weapons treaty violation
|
||||
artificial meat alternative launch
|
||||
student loan debt forgiveness
|
||||
venture capital funding decline
|
||||
facial recognition ban proposed
|
||||
election interference investigation
|
||||
pandemic preparedness failures
|
||||
police reform measures announced
|
||||
wildfire prevention strategies
|
||||
ocean pollution crisis worsens
|
||||
manufacturing jobs returning
|
||||
pension fund shortfall concerns
|
||||
antitrust investigation launched
|
||||
voting rights protection act
|
||||
mental health awareness campaign
|
||||
homeless population increasing
|
||||
space debris collision risk
|
||||
drug cartel violence escalates
|
||||
renewable energy jobs growth
|
||||
infrastructure deterioration report
|
||||
vaccine mandate legal challenge
|
||||
cryptocurrency market volatility
|
||||
autonomous drone delivery service
|
||||
deep fake technology dangers
|
||||
Arctic ice melting accelerates
|
||||
income inequality gap widens
|
||||
election fraud claims disputed
|
||||
corporate merger blocked
|
||||
medical breakthrough extends life
|
||||
transportation strike disrupts city
|
||||
racial justice protests spread
|
||||
carbon emissions reduction goals
|
||||
financial crisis warning signs
|
||||
cyberbullying prevention efforts
|
||||
asteroid near miss with Earth
|
||||
gene therapy approval granted
|
||||
labor union organizing drive
|
||||
surveillance technology expansion
|
||||
education funding cuts proposed
|
||||
disaster relief efforts underway
|
||||
housing affordability crisis
|
||||
clean water access shortage
|
||||
artificial intelligence job displacement
|
||||
trade agreement negotiations
|
||||
prison reform initiative launched
|
||||
species extinction accelerates
|
||||
political corruption scandal
|
||||
terrorism threat level raised
|
||||
food safety contamination outbreak
|
||||
ai model release
|
||||
affordability interest rates
|
||||
peanut allergies in newborns
|
||||
breaking bad walter white
|
||||
29
benchmarks-ann/datasets/nyt-384/Makefile
Normal file
29
benchmarks-ann/datasets/nyt-384/Makefile
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
# Embedding model used to build the 384-dim NYT headline vectors.
MODEL ?= mixedbread-ai/mxbai-embed-xsmall-v1
# Number of ground-truth nearest neighbors to compute per query.
K ?= 100
BATCH_SIZE ?= 512
# Raw NYT CSVs are shared with the ../nyt dataset.
DATA_DIR ?= ../nyt/data

all: base.db

$(DATA_DIR):
	$(MAKE) -C ../nyt data

# Deduplicated headlines SQLite DB, built with the shared nyt-768 script.
contents.db: $(DATA_DIR)
	uv run ../nyt-768/build-contents.py --data-dir $(DATA_DIR) -o $@

# Embed headlines + queries and compute brute-force KNN ground truth,
# reusing the shared build script from ../nyt-1024.
base.db: contents.db queries.txt
	uv run ../nyt-1024/build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

queries.txt:
	cp ../nyt/queries.txt $@

clean:
	rm -f base.db contents.db

.PHONY: all clean
|
||||
100
benchmarks-ann/datasets/nyt-384/queries.txt
Normal file
100
benchmarks-ann/datasets/nyt-384/queries.txt
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
latest news on climate change policy
|
||||
presidential election results and analysis
|
||||
stock market crash causes
|
||||
coronavirus vaccine development updates
|
||||
artificial intelligence breakthrough in healthcare
|
||||
supreme court ruling on abortion rights
|
||||
tech companies layoff announcements
|
||||
earthquake damages in California
|
||||
cybersecurity breach at major corporation
|
||||
space exploration mission to Mars
|
||||
immigration reform legislation debate
|
||||
renewable energy investment trends
|
||||
healthcare costs rising across America
|
||||
protests against police brutality
|
||||
wildfires destroy homes in the West
|
||||
Olympic games highlights and records
|
||||
celebrity scandal rocks Hollywood
|
||||
breakthrough cancer treatment discovered
|
||||
housing market bubble concerns
|
||||
federal reserve interest rate decision
|
||||
school shooting tragedy response
|
||||
diplomatic tensions between superpowers
|
||||
drone strike kills terrorist leader
|
||||
social media platform faces regulation
|
||||
archaeological discovery reveals ancient civilization
|
||||
unemployment rate hits record low
|
||||
autonomous vehicles testing expansion
|
||||
streaming service launches original content
|
||||
opioid crisis intervention programs
|
||||
trade war tariffs impact economy
|
||||
infrastructure bill passes Congress
|
||||
data privacy concerns grow
|
||||
minimum wage increase proposal
|
||||
college admissions scandal exposed
|
||||
NFL player protest during anthem
|
||||
cryptocurrency regulation debate
|
||||
pandemic lockdown restrictions eased
|
||||
mass shooting gun control debate
|
||||
tax reform legislation impact
|
||||
ransomware attack cripples pipeline
|
||||
climate activists stage demonstration
|
||||
sports team wins championship
|
||||
banking system collapse fears
|
||||
pharmaceutical company fraud charges
|
||||
genetic engineering ethical concerns
|
||||
border wall funding controversy
|
||||
impeachment proceedings begin
|
||||
nuclear weapons treaty violation
|
||||
artificial meat alternative launch
|
||||
student loan debt forgiveness
|
||||
venture capital funding decline
|
||||
facial recognition ban proposed
|
||||
election interference investigation
|
||||
pandemic preparedness failures
|
||||
police reform measures announced
|
||||
wildfire prevention strategies
|
||||
ocean pollution crisis worsens
|
||||
manufacturing jobs returning
|
||||
pension fund shortfall concerns
|
||||
antitrust investigation launched
|
||||
voting rights protection act
|
||||
mental health awareness campaign
|
||||
homeless population increasing
|
||||
space debris collision risk
|
||||
drug cartel violence escalates
|
||||
renewable energy jobs growth
|
||||
infrastructure deterioration report
|
||||
vaccine mandate legal challenge
|
||||
cryptocurrency market volatility
|
||||
autonomous drone delivery service
|
||||
deep fake technology dangers
|
||||
Arctic ice melting accelerates
|
||||
income inequality gap widens
|
||||
election fraud claims disputed
|
||||
corporate merger blocked
|
||||
medical breakthrough extends life
|
||||
transportation strike disrupts city
|
||||
racial justice protests spread
|
||||
carbon emissions reduction goals
|
||||
financial crisis warning signs
|
||||
cyberbullying prevention efforts
|
||||
asteroid near miss with Earth
|
||||
gene therapy approval granted
|
||||
labor union organizing drive
|
||||
surveillance technology expansion
|
||||
education funding cuts proposed
|
||||
disaster relief efforts underway
|
||||
housing affordability crisis
|
||||
clean water access shortage
|
||||
artificial intelligence job displacement
|
||||
trade agreement negotiations
|
||||
prison reform initiative launched
|
||||
species extinction accelerates
|
||||
political corruption scandal
|
||||
terrorism threat level raised
|
||||
food safety contamination outbreak
|
||||
ai model release
|
||||
affordability interest rates
|
||||
peanut allergies in newborns
|
||||
breaking bad walter white
|
||||
37
benchmarks-ann/datasets/nyt-768/Makefile
Normal file
37
benchmarks-ann/datasets/nyt-768/Makefile
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
# Locally-distilled model directory, produced by distill-model.py.
MODEL ?= bge-base-en-v1.5-768
# Number of ground-truth nearest neighbors to compute per query.
K ?= 100
BATCH_SIZE ?= 512
# Raw NYT CSVs are shared with the ../nyt dataset.
DATA_DIR ?= ../nyt/data

all: base.db

# Reuse data from ../nyt
$(DATA_DIR):
	$(MAKE) -C ../nyt data

# Distill model (separate step, may take a while)
$(MODEL):
	uv run distill-model.py

contents.db: $(DATA_DIR)
	uv run build-contents.py --data-dir $(DATA_DIR) -o $@

# Embed headlines + queries with the distilled model and compute KNN
# ground truth, reusing the shared build script from ../nyt.
base.db: contents.db queries.txt $(MODEL)
	uv run ../nyt/build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

queries.txt:
	cp ../nyt/queries.txt $@

clean:
	rm -f base.db contents.db

# NOTE(review): rm -rf on a variable — assumes MODEL stays a local directory
# name and is not overridden to a path outside this folder; confirm before
# overriding MODEL on the command line.
clean-all: clean
	rm -rf $(MODEL)

.PHONY: all clean clean-all
||||
64
benchmarks-ann/datasets/nyt-768/build-contents.py
Normal file
64
benchmarks-ann/datasets/nyt-768/build-contents.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
# /// script
|
||||
# requires-python = ">=3.12"
|
||||
# dependencies = [
|
||||
# "duckdb",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import sqlite3
|
||||
import duckdb
|
||||
|
||||
|
||||
def main():
    """Load NYT headline CSVs into a SQLite contents DB.

    Deduplicates headlines (keeping each headline's most recent pub_date),
    orders newest-first, keeps at most --limit rows, and writes them to a
    contents(id, headline) table with 1-based ids in that order.
    """
    parser = argparse.ArgumentParser(
        description="Load NYT headline CSVs into a SQLite contents database (most recent 1M, deduplicated)",
    )
    parser.add_argument(
        "--data-dir", "-d", default="../nyt/data",
        help="Directory containing NYT CSV files (default: ../nyt/data)",
    )
    parser.add_argument(
        "--limit", "-l", type=int, default=1_000_000,
        help="Maximum number of headlines to keep (default: 1000000)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output SQLite database",
    )
    args = parser.parse_args()

    # Glob over all monthly story CSVs in the data directory.
    glob_pattern = f"{args.data_dir}/new_york_times_stories_*.csv"

    # NOTE(review): glob_pattern and args.limit are interpolated directly into
    # the SQL string. limit is an int (argparse type=int), but a --data-dir
    # containing a single quote would break the query; acceptable for a local
    # CLI tool, but worth confirming.
    con = duckdb.connect()
    rows = con.execute(
        f"""
        WITH deduped AS (
          SELECT
            headline,
            max(pub_date) AS pub_date
          FROM read_csv('{glob_pattern}', auto_detect=true, union_by_name=true)
          WHERE headline IS NOT NULL AND trim(headline) != ''
          GROUP BY headline
        )
        SELECT
          row_number() OVER (ORDER BY pub_date DESC) AS id,
          headline
        FROM deduped
        ORDER BY pub_date DESC
        LIMIT {args.limit}
        """
    ).fetchall()
    con.close()

    # Write (id, headline) rows into the output SQLite database.
    db = sqlite3.connect(args.output)
    db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
    db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
    db.commit()
    db.close()

    print(f"Wrote {len(rows)} headlines to {args.output}")


if __name__ == "__main__":
    main()
|
||||
13
benchmarks-ann/datasets/nyt-768/distill-model.py
Normal file
13
benchmarks-ann/datasets/nyt-768/distill-model.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
# /// script
|
||||
# requires-python = ">=3.12"
|
||||
# dependencies = [
|
||||
# "model2vec[distill]",
|
||||
# "torch<=2.7",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
# Distill BAAI/bge-base-en-v1.5 into a static model2vec embedding model with
# 768 PCA dimensions, and save it locally so the Makefile's $(MODEL) target
# can use the directory as an embedding model.
from model2vec.distill import distill

model = distill(model_name="BAAI/bge-base-en-v1.5", pca_dims=768)
model.save_pretrained("bge-base-en-v1.5-768")
print("Saved distilled model to bge-base-en-v1.5-768/")
|
||||
100
benchmarks-ann/datasets/nyt-768/queries.txt
Normal file
100
benchmarks-ann/datasets/nyt-768/queries.txt
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
latest news on climate change policy
|
||||
presidential election results and analysis
|
||||
stock market crash causes
|
||||
coronavirus vaccine development updates
|
||||
artificial intelligence breakthrough in healthcare
|
||||
supreme court ruling on abortion rights
|
||||
tech companies layoff announcements
|
||||
earthquake damages in California
|
||||
cybersecurity breach at major corporation
|
||||
space exploration mission to Mars
|
||||
immigration reform legislation debate
|
||||
renewable energy investment trends
|
||||
healthcare costs rising across America
|
||||
protests against police brutality
|
||||
wildfires destroy homes in the West
|
||||
Olympic games highlights and records
|
||||
celebrity scandal rocks Hollywood
|
||||
breakthrough cancer treatment discovered
|
||||
housing market bubble concerns
|
||||
federal reserve interest rate decision
|
||||
school shooting tragedy response
|
||||
diplomatic tensions between superpowers
|
||||
drone strike kills terrorist leader
|
||||
social media platform faces regulation
|
||||
archaeological discovery reveals ancient civilization
|
||||
unemployment rate hits record low
|
||||
autonomous vehicles testing expansion
|
||||
streaming service launches original content
|
||||
opioid crisis intervention programs
|
||||
trade war tariffs impact economy
|
||||
infrastructure bill passes Congress
|
||||
data privacy concerns grow
|
||||
minimum wage increase proposal
|
||||
college admissions scandal exposed
|
||||
NFL player protest during anthem
|
||||
cryptocurrency regulation debate
|
||||
pandemic lockdown restrictions eased
|
||||
mass shooting gun control debate
|
||||
tax reform legislation impact
|
||||
ransomware attack cripples pipeline
|
||||
climate activists stage demonstration
|
||||
sports team wins championship
|
||||
banking system collapse fears
|
||||
pharmaceutical company fraud charges
|
||||
genetic engineering ethical concerns
|
||||
border wall funding controversy
|
||||
impeachment proceedings begin
|
||||
nuclear weapons treaty violation
|
||||
artificial meat alternative launch
|
||||
student loan debt forgiveness
|
||||
venture capital funding decline
|
||||
facial recognition ban proposed
|
||||
election interference investigation
|
||||
pandemic preparedness failures
|
||||
police reform measures announced
|
||||
wildfire prevention strategies
|
||||
ocean pollution crisis worsens
|
||||
manufacturing jobs returning
|
||||
pension fund shortfall concerns
|
||||
antitrust investigation launched
|
||||
voting rights protection act
|
||||
mental health awareness campaign
|
||||
homeless population increasing
|
||||
space debris collision risk
|
||||
drug cartel violence escalates
|
||||
renewable energy jobs growth
|
||||
infrastructure deterioration report
|
||||
vaccine mandate legal challenge
|
||||
cryptocurrency market volatility
|
||||
autonomous drone delivery service
|
||||
deep fake technology dangers
|
||||
Arctic ice melting accelerates
|
||||
income inequality gap widens
|
||||
election fraud claims disputed
|
||||
corporate merger blocked
|
||||
medical breakthrough extends life
|
||||
transportation strike disrupts city
|
||||
racial justice protests spread
|
||||
carbon emissions reduction goals
|
||||
financial crisis warning signs
|
||||
cyberbullying prevention efforts
|
||||
asteroid near miss with Earth
|
||||
gene therapy approval granted
|
||||
labor union organizing drive
|
||||
surveillance technology expansion
|
||||
education funding cuts proposed
|
||||
disaster relief efforts underway
|
||||
housing affordability crisis
|
||||
clean water access shortage
|
||||
artificial intelligence job displacement
|
||||
trade agreement negotiations
|
||||
prison reform initiative launched
|
||||
species extinction accelerates
|
||||
political corruption scandal
|
||||
terrorism threat level raised
|
||||
food safety contamination outbreak
|
||||
ai model release
|
||||
affordability interest rates
|
||||
peanut allergies in newborns
|
||||
breaking bad walter white
|
||||
1
benchmarks-ann/datasets/nyt/.gitignore
vendored
Normal file
1
benchmarks-ann/datasets/nyt/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
data/
|
||||
30
benchmarks-ann/datasets/nyt/Makefile
Normal file
30
benchmarks-ann/datasets/nyt/Makefile
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
# Build pipeline for the NYT-headlines ANN benchmark dataset:
#   data/ (Kaggle CSVs) -> contents.db (headlines) -> base.db (vectors + ground truth)
# All knobs can be overridden on the command line, e.g. `make MODEL=... K=10`.
MODEL ?= minishlab/potion-base-8M
K ?= 100
BATCH_SIZE ?= 512
DATA_DIR ?= data

all: base.db contents.db

# Download NYT headlines CSVs from Kaggle (requires `kaggle` CLI + API token)
$(DATA_DIR):
	kaggle datasets download -d johnbandy/new-york-times-headlines -p $(DATA_DIR) --unzip

# Load the raw CSVs into a SQLite database of (id, headline) rows.
contents.db: $(DATA_DIR)
	uv run build-contents.py --data-dir $(DATA_DIR) -o $@

# Embed headlines and queries, then compute brute-force K-NN ground truth.
base.db: contents.db queries.txt
	uv run build-base.py \
		--contents-db contents.db \
		--model $(MODEL) \
		--queries-file queries.txt \
		--batch-size $(BATCH_SIZE) \
		--k $(K) \
		-o $@

clean:
	rm -f base.db contents.db

# clean-all additionally removes the downloaded Kaggle data.
clean-all: clean
	rm -rf $(DATA_DIR)

.PHONY: all clean clean-all
|
||||
165
benchmarks-ann/datasets/nyt/build-base.py
Normal file
165
benchmarks-ann/datasets/nyt/build-base.py
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
# /// script
|
||||
# requires-python = ">=3.12"
|
||||
# dependencies = [
|
||||
# "model2vec",
|
||||
# "torch<=2.7",
|
||||
# "tqdm",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import sqlite3
|
||||
from array import array
|
||||
from itertools import batched
|
||||
|
||||
from model2vec import StaticModel
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def main() -> None:
    """Build base.db: train vectors, query vectors, and brute-force KNN ground truth.

    Pipeline (normal mode):
      1. Embed every headline from contents.db into the `train` table.
      2. Embed each line of the queries file into the `query_vectors` table.
      3. For each query vector, compute exact cosine K-NN over `train` via the
         sqlite-vec extension and record ranked ids in the `neighbors` table.

    With --rebuild-neighbors, steps 1-2 are skipped and only the `neighbors`
    table is dropped and recomputed from the vectors already in the output DB.
    """
    parser = argparse.ArgumentParser(
        description="Build base.db with train vectors, query vectors, and brute-force KNN neighbors",
    )
    parser.add_argument(
        "--contents-db", "-c", default=None,
        help="Path to contents.db (source of headlines and IDs)",
    )
    parser.add_argument(
        "--model", "-m", default="minishlab/potion-base-8M",
        help="HuggingFace model ID or local path (default: minishlab/potion-base-8M)",
    )
    parser.add_argument(
        "--queries-file", "-q", default="queries.txt",
        help="Path to the queries file (default: queries.txt)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output base.db",
    )
    parser.add_argument(
        "--batch-size", "-b", type=int, default=512,
        help="Batch size for embedding (default: 512)",
    )
    parser.add_argument(
        "--k", "-k", type=int, default=100,
        help="Number of nearest neighbors (default: 100)",
    )
    parser.add_argument(
        "--vec-path", "-v", default="~/projects/sqlite-vec/dist/vec0",
        help="Path to sqlite-vec extension (default: ~/projects/sqlite-vec/dist/vec0)",
    )
    parser.add_argument(
        "--rebuild-neighbors", action="store_true",
        help="Only rebuild the neighbors table (skip embedding steps)",
    )
    args = parser.parse_args()

    import os
    # The default --vec-path starts with "~", so expand it before loading.
    vec_path = os.path.expanduser(args.vec_path)

    if args.rebuild_neighbors:
        # Skip embedding, just open existing DB and rebuild neighbors.
        # NOTE(review): sqlite3.connect creates an empty DB if args.output is
        # missing; in that case step 3 below finds 0 queries and writes nothing.
        db = sqlite3.connect(args.output)
        db.enable_load_extension(True)
        db.load_extension(vec_path)
        db.enable_load_extension(False)
        db.execute("DROP TABLE IF EXISTS neighbors")
        db.execute(
            "CREATE TABLE neighbors("
            " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
            " UNIQUE(query_vector_id, rank))"
        )
        print(f"Rebuilding neighbors in {args.output}...")
    else:
        print(f"Loading model {args.model}...")
        model = StaticModel.from_pretrained(args.model)

        # Read headlines from contents.db (requires --contents-db in this mode).
        src = sqlite3.connect(args.contents_db)
        headlines = src.execute("SELECT id, headline FROM contents ORDER BY id").fetchall()
        src.close()
        print(f"Loaded {len(headlines)} headlines from {args.contents_db}")

        # Read queries: one per line, blank lines skipped.
        with open(args.queries_file) as f:
            queries = [line.strip() for line in f if line.strip()]
        print(f"Loaded {len(queries)} queries from {args.queries_file}")

        # Create output database with sqlite-vec loaded (needed in step 3).
        db = sqlite3.connect(args.output)
        db.enable_load_extension(True)
        db.load_extension(vec_path)
        db.enable_load_extension(False)

        db.execute("CREATE TABLE train(id INTEGER PRIMARY KEY, vector BLOB)")
        db.execute("CREATE TABLE query_vectors(id INTEGER PRIMARY KEY, vector BLOB)")
        db.execute(
            "CREATE TABLE neighbors("
            " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
            " UNIQUE(query_vector_id, rank))"
        )

        # Step 1: Embed headlines -> train table.
        # Vectors are stored as little-endian float32 blobs (array("f")), the
        # format sqlite-vec expects. Commit per batch to bound memory.
        print("Embedding headlines...")
        for batch in tqdm(
            batched(headlines, args.batch_size),
            total=(len(headlines) + args.batch_size - 1) // args.batch_size,
        ):
            ids = [r[0] for r in batch]
            texts = [r[1] for r in batch]
            embeddings = model.encode(texts)

            params = [
                (int(rid), array("f", emb.tolist()).tobytes())
                for rid, emb in zip(ids, embeddings)
            ]
            db.executemany("INSERT INTO train VALUES (?, ?)", params)
            db.commit()

        del headlines
        n = db.execute("SELECT count(*) FROM train").fetchone()[0]
        print(f"Embedded {n} headlines")

        # Step 2: Embed queries -> query_vectors table (ids are 1-based).
        print("Embedding queries...")
        query_embeddings = model.encode(queries)
        query_params = []
        for i, emb in enumerate(query_embeddings, 1):
            blob = array("f", emb.tolist()).tobytes()
            query_params.append((i, blob))
        db.executemany("INSERT INTO query_vectors VALUES (?, ?)", query_params)
        db.commit()
        print(f"Embedded {len(queries)} queries")

    # Step 3: Brute-force KNN via sqlite-vec -> neighbors table.
    # Runs in both modes: on the freshly built DB or on an existing one
    # when --rebuild-neighbors was given.
    n_queries = db.execute("SELECT count(*) FROM query_vectors").fetchone()[0]
    print(f"Computing {args.k}-NN for {n_queries} queries via sqlite-vec...")
    for query_id, query_blob in tqdm(
        db.execute("SELECT id, vector FROM query_vectors").fetchall()
    ):
        # Exact scan: cosine distance against every train vector, keep top k.
        results = db.execute(
            """
            SELECT
              train.id,
              vec_distance_cosine(train.vector, ?) AS distance
            FROM train
            WHERE distance IS NOT NULL
            ORDER BY distance ASC
            LIMIT ?
            """,
            (query_blob, args.k),
        ).fetchall()

        # rank is 0-based; neighbor ids are stored as TEXT (str of the rowid).
        params = [
            (query_id, rank, str(rid))
            for rank, (rid, _dist) in enumerate(results)
        ]
        db.executemany("INSERT INTO neighbors VALUES (?, ?, ?)", params)

    db.commit()
    db.close()
    print(f"Done. Wrote {args.output}")


if __name__ == "__main__":
    main()
|
||||
52
benchmarks-ann/datasets/nyt/build-contents.py
Normal file
52
benchmarks-ann/datasets/nyt/build-contents.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
# /// script
|
||||
# requires-python = ">=3.12"
|
||||
# dependencies = [
|
||||
# "duckdb",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sqlite3
|
||||
import duckdb
|
||||
|
||||
|
||||
def main():
    """Extract NYT headlines from CSV shards into a SQLite `contents` table.

    DuckDB does the heavy lifting: it globs the CSV files, unions them by
    column name, filters out NULL/empty headlines, and assigns sequential
    ids. The resulting (id, headline) rows are then copied into a fresh
    SQLite database at --output.
    """
    parser = argparse.ArgumentParser(
        description="Load NYT headline CSVs into a SQLite contents database via DuckDB",
    )
    parser.add_argument(
        "--data-dir", "-d", default="data",
        help="Directory containing NYT CSV files (default: data)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output SQLite database",
    )
    args = parser.parse_args()

    csv_glob = os.path.join(args.data_dir, "new_york_times_stories_*.csv")

    # Read every shard at once; union_by_name tolerates differing column order.
    duck = duckdb.connect()
    headline_rows = duck.execute(
        f"""
        SELECT
          row_number() OVER () AS id,
          headline
        FROM read_csv('{csv_glob}', auto_detect=true, union_by_name=true)
        WHERE headline IS NOT NULL AND headline != ''
        """
    ).fetchall()
    duck.close()

    # Copy the rows into the SQLite contents database.
    sink = sqlite3.connect(args.output)
    sink.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
    sink.executemany("INSERT INTO contents VALUES (?, ?)", headline_rows)
    sink.commit()
    sink.close()

    print(f"Wrote {len(headline_rows)} headlines to {args.output}")


if __name__ == "__main__":
    main()
|
||||
100
benchmarks-ann/datasets/nyt/queries.txt
Normal file
100
benchmarks-ann/datasets/nyt/queries.txt
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
latest news on climate change policy
|
||||
presidential election results and analysis
|
||||
stock market crash causes
|
||||
coronavirus vaccine development updates
|
||||
artificial intelligence breakthrough in healthcare
|
||||
supreme court ruling on abortion rights
|
||||
tech companies layoff announcements
|
||||
earthquake damages in California
|
||||
cybersecurity breach at major corporation
|
||||
space exploration mission to Mars
|
||||
immigration reform legislation debate
|
||||
renewable energy investment trends
|
||||
healthcare costs rising across America
|
||||
protests against police brutality
|
||||
wildfires destroy homes in the West
|
||||
Olympic games highlights and records
|
||||
celebrity scandal rocks Hollywood
|
||||
breakthrough cancer treatment discovered
|
||||
housing market bubble concerns
|
||||
federal reserve interest rate decision
|
||||
school shooting tragedy response
|
||||
diplomatic tensions between superpowers
|
||||
drone strike kills terrorist leader
|
||||
social media platform faces regulation
|
||||
archaeological discovery reveals ancient civilization
|
||||
unemployment rate hits record low
|
||||
autonomous vehicles testing expansion
|
||||
streaming service launches original content
|
||||
opioid crisis intervention programs
|
||||
trade war tariffs impact economy
|
||||
infrastructure bill passes Congress
|
||||
data privacy concerns grow
|
||||
minimum wage increase proposal
|
||||
college admissions scandal exposed
|
||||
NFL player protest during anthem
|
||||
cryptocurrency regulation debate
|
||||
pandemic lockdown restrictions eased
|
||||
mass shooting gun control debate
|
||||
tax reform legislation impact
|
||||
ransomware attack cripples pipeline
|
||||
climate activists stage demonstration
|
||||
sports team wins championship
|
||||
banking system collapse fears
|
||||
pharmaceutical company fraud charges
|
||||
genetic engineering ethical concerns
|
||||
border wall funding controversy
|
||||
impeachment proceedings begin
|
||||
nuclear weapons treaty violation
|
||||
artificial meat alternative launch
|
||||
student loan debt forgiveness
|
||||
venture capital funding decline
|
||||
facial recognition ban proposed
|
||||
election interference investigation
|
||||
pandemic preparedness failures
|
||||
police reform measures announced
|
||||
wildfire prevention strategies
|
||||
ocean pollution crisis worsens
|
||||
manufacturing jobs returning
|
||||
pension fund shortfall concerns
|
||||
antitrust investigation launched
|
||||
voting rights protection act
|
||||
mental health awareness campaign
|
||||
homeless population increasing
|
||||
space debris collision risk
|
||||
drug cartel violence escalates
|
||||
renewable energy jobs growth
|
||||
infrastructure deterioration report
|
||||
vaccine mandate legal challenge
|
||||
cryptocurrency market volatility
|
||||
autonomous drone delivery service
|
||||
deep fake technology dangers
|
||||
Arctic ice melting accelerates
|
||||
income inequality gap widens
|
||||
election fraud claims disputed
|
||||
corporate merger blocked
|
||||
medical breakthrough extends life
|
||||
transportation strike disrupts city
|
||||
racial justice protests spread
|
||||
carbon emissions reduction goals
|
||||
financial crisis warning signs
|
||||
cyberbullying prevention efforts
|
||||
asteroid near miss with Earth
|
||||
gene therapy approval granted
|
||||
labor union organizing drive
|
||||
surveillance technology expansion
|
||||
education funding cuts proposed
|
||||
disaster relief efforts underway
|
||||
housing affordability crisis
|
||||
clean water access shortage
|
||||
artificial intelligence job displacement
|
||||
trade agreement negotiations
|
||||
prison reform initiative launched
|
||||
species extinction accelerates
|
||||
political corruption scandal
|
||||
terrorism threat level raised
|
||||
food safety contamination outbreak
|
||||
ai model release
|
||||
affordability interest rates
|
||||
peanut allergies in newborns
|
||||
breaking bad walter white
|
||||
Loading…
Add table
Add a link
Reference in a new issue