mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 08:46:49 +02:00
Extend benchmarks-ann/ with results database (SQLite with per-query detail and continuous writes), dataset subfolder organization, --subset-size and --warmup options. Supports systematic comparison across flat, rescore, IVF, and DiskANN index types.
64 lines
1.7 KiB
Python
# /// script
|
|
# requires-python = ">=3.12"
|
|
# dependencies = [
|
|
# "duckdb",
|
|
# ]
|
|
# ///
|
|
|
|
import argparse
|
|
import sqlite3
|
|
import duckdb
|
|
|
|
|
|
def main():
    """Load NYT headline CSVs into a SQLite contents database.

    Reads every ``new_york_times_stories_*.csv`` file under ``--data-dir``
    with DuckDB, deduplicates headlines (keeping each headline's most
    recent ``pub_date``), keeps the most recent ``--limit`` of them, and
    writes the result to a ``contents(id, headline)`` table in the
    ``--output`` SQLite file. Row ids are 1..N in descending pub_date
    order (id 1 = most recent).
    """
    parser = argparse.ArgumentParser(
        description="Load NYT headline CSVs into a SQLite contents database (most recent 1M, deduplicated)",
    )
    parser.add_argument(
        "--data-dir", "-d", default="../nyt/data",
        help="Directory containing NYT CSV files (default: ../nyt/data)",
    )
    parser.add_argument(
        "--limit", "-l", type=int, default=1_000_000,
        help="Maximum number of headlines to keep (default: 1000000)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output SQLite database",
    )
    args = parser.parse_args()

    # The glob must be spliced into the SQL text (read_csv takes a string
    # literal); escape single quotes so a data dir such as "o'brien/data"
    # cannot break out of the literal.
    glob_pattern = f"{args.data_dir}/new_york_times_stories_*.csv".replace("'", "''")

    con = duckdb.connect()
    try:
        rows = con.execute(
            f"""
            WITH deduped AS (
                SELECT
                    headline,
                    max(pub_date) AS pub_date
                FROM read_csv('{glob_pattern}', auto_detect=true, union_by_name=true)
                WHERE headline IS NOT NULL AND trim(headline) != ''
                GROUP BY headline
            )
            SELECT
                row_number() OVER (ORDER BY pub_date DESC) AS id,
                headline
            FROM deduped
            ORDER BY pub_date DESC
            LIMIT ?
            """,
            [args.limit],  # bound as a parameter rather than interpolated
        ).fetchall()
    finally:
        # Release DuckDB resources even if the query fails.
        con.close()

    db = sqlite3.connect(args.output)
    try:
        db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
        db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
        db.commit()
    finally:
        # Ensure the SQLite handle is closed on both success and error.
        db.close()

    print(f"Wrote {len(rows)} headlines to {args.output}")
# Entry point: run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()