sqlite-vec/benchmarks-ann/datasets/nyt-768/build-contents.py
Alex Garcia 8544081a67
Add comprehensive ANN benchmarking suite (#279)
Extend benchmarks-ann/ with results database (SQLite with per-query detail
and continuous writes), dataset subfolder organization, --subset-size and
--warmup options. Supports systematic comparison across flat, rescore, IVF,
and DiskANN index types.
2026-03-31 01:29:49 -07:00

64 lines
1.7 KiB
Python

# /// script
# requires-python = ">=3.12"
# dependencies = [
# "duckdb",
# ]
# ///
import argparse
import sqlite3
import duckdb
def main():
parser = argparse.ArgumentParser(
description="Load NYT headline CSVs into a SQLite contents database (most recent 1M, deduplicated)",
)
parser.add_argument(
"--data-dir", "-d", default="../nyt/data",
help="Directory containing NYT CSV files (default: ../nyt/data)",
)
parser.add_argument(
"--limit", "-l", type=int, default=1_000_000,
help="Maximum number of headlines to keep (default: 1000000)",
)
parser.add_argument(
"--output", "-o", required=True,
help="Path to the output SQLite database",
)
args = parser.parse_args()
glob_pattern = f"{args.data_dir}/new_york_times_stories_*.csv"
con = duckdb.connect()
rows = con.execute(
f"""
WITH deduped AS (
SELECT
headline,
max(pub_date) AS pub_date
FROM read_csv('{glob_pattern}', auto_detect=true, union_by_name=true)
WHERE headline IS NOT NULL AND trim(headline) != ''
GROUP BY headline
)
SELECT
row_number() OVER (ORDER BY pub_date DESC) AS id,
headline
FROM deduped
ORDER BY pub_date DESC
LIMIT {args.limit}
"""
).fetchall()
con.close()
db = sqlite3.connect(args.output)
db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
db.commit()
db.close()
print(f"Wrote {len(rows)} headlines to {args.output}")
if __name__ == "__main__":
main()