mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-26 09:16:22 +02:00
Add comprehensive ANN benchmarking suite
Extend benchmarks-ann/ with results database (SQLite with per-query detail and continuous writes), dataset subfolder organization, --subset-size and --warmup options. Supports systematic comparison across flat, rescore, IVF, and DiskANN index types.
This commit is contained in:
parent
a248ecd061
commit
dbbb4b98f7
26 changed files with 2127 additions and 292 deletions
64
benchmarks-ann/datasets/nyt-768/build-contents.py
Normal file
64
benchmarks-ann/datasets/nyt-768/build-contents.py
Normal file
|
|
@@ -0,0 +1,64 @@
|
|||
# /// script
|
||||
# requires-python = ">=3.12"
|
||||
# dependencies = [
|
||||
# "duckdb",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import sqlite3
|
||||
import duckdb
|
||||
|
||||
|
||||
def main():
    """Build a SQLite ``contents`` table of deduplicated NYT headlines.

    Reads every ``new_york_times_stories_*.csv`` file under ``--data-dir``
    with DuckDB, collapses duplicate headlines (keeping each headline's most
    recent ``pub_date``), keeps the ``--limit`` most recent headlines, and
    writes them to the SQLite database at ``--output`` as
    ``contents(id, headline)``. Ids are assigned 1..N from most recent to
    least recent.

    Raises:
        sqlite3.OperationalError: if the output database already contains a
            ``contents`` table (the table is created unconditionally).
    """
    parser = argparse.ArgumentParser(
        description="Load NYT headline CSVs into a SQLite contents database (most recent 1M, deduplicated)",
    )
    parser.add_argument(
        "--data-dir", "-d", default="../nyt/data",
        help="Directory containing NYT CSV files (default: ../nyt/data)",
    )
    parser.add_argument(
        "--limit", "-l", type=int, default=1_000_000,
        help="Maximum number of headlines to keep (default: 1000000)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output SQLite database",
    )
    args = parser.parse_args()

    glob_pattern = f"{args.data_dir}/new_york_times_stories_*.csv"
    # The pattern is embedded in a SQL string literal below; double any single
    # quote so a directory name containing "'" cannot end the literal early.
    glob_literal = glob_pattern.replace("'", "''")

    # `headline` is unique after the GROUP BY, so using it as a tie-breaker
    # makes the id assignment a total order. Ordering the final result by `id`
    # (rather than repeating `pub_date DESC`) guarantees that LIMIT keeps
    # exactly ids 1..N even when several headlines share a pub_date.
    # `args.limit` is an int (argparse `type=int`), so interpolating it is safe.
    query = f"""
        WITH deduped AS (
            SELECT
                headline,
                max(pub_date) AS pub_date
            FROM read_csv('{glob_literal}', auto_detect=true, union_by_name=true)
            WHERE headline IS NOT NULL AND trim(headline) != ''
            GROUP BY headline
        )
        SELECT
            row_number() OVER (ORDER BY pub_date DESC, headline) AS id,
            headline
        FROM deduped
        ORDER BY id
        LIMIT {args.limit}
        """

    con = duckdb.connect()
    try:
        rows = con.execute(query).fetchall()
    finally:
        # Release the DuckDB connection even if the CSV scan fails.
        con.close()

    db = sqlite3.connect(args.output)
    try:
        db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
        db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
        db.commit()
    finally:
        # Close the SQLite handle even if table creation or the insert fails.
        db.close()

    print(f"Wrote {len(rows)} headlines to {args.output}")
|
||||
|
||||
|
||||
# Script entry point: run the import only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue