mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 08:46:49 +02:00
Add comprehensive ANN benchmarking suite (#279)
Extend benchmarks-ann/ with a results database (SQLite, with per-query detail and continuous writes), dataset subfolder organization, and --subset-size and --warmup options. Supports systematic comparison across flat, rescore, IVF, and DiskANN index types.
This commit is contained in:
parent
a248ecd061
commit
8544081a67
26 changed files with 2127 additions and 292 deletions
52
benchmarks-ann/datasets/nyt/build-contents.py
Normal file
52
benchmarks-ann/datasets/nyt/build-contents.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
# /// script
|
||||
# requires-python = ">=3.12"
|
||||
# dependencies = [
|
||||
# "duckdb",
|
||||
# ]
|
||||
# ///
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sqlite3
|
||||
import duckdb
|
||||
|
||||
|
||||
def main():
    """Load NYT headline CSVs into a SQLite ``contents`` table via DuckDB.

    Command-line options:
        --data-dir / -d: directory containing the
            ``new_york_times_stories_*.csv`` files (default: ``data``).
        --output / -o: path of the SQLite database to write (required).

    Every row whose ``headline`` is neither NULL nor the empty string is
    stored in ``contents(id INTEGER PRIMARY KEY, headline TEXT)``, where
    ``id`` is DuckDB's ``row_number()`` over the combined CSV rows. A summary
    line with the number of headlines written is printed at the end.

    Raises:
        sqlite3.OperationalError: if the output database already contains a
            ``contents`` table (the table is created unconditionally).
    """
    parser = argparse.ArgumentParser(
        description="Load NYT headline CSVs into a SQLite contents database via DuckDB",
    )
    parser.add_argument(
        "--data-dir", "-d", default="data",
        help="Directory containing NYT CSV files (default: data)",
    )
    parser.add_argument(
        "--output", "-o", required=True,
        help="Path to the output SQLite database",
    )
    args = parser.parse_args()

    glob_pattern = os.path.join(args.data_dir, "new_york_times_stories_*.csv")
    # The pattern is embedded in a SQL string literal below, so double any
    # single quotes; otherwise a data dir such as "o'brien" breaks the query.
    quoted_pattern = glob_pattern.replace("'", "''")

    con = duckdb.connect()
    try:
        # NOTE(review): row_number() has no ORDER BY, so ids follow whatever
        # row order DuckDB produces for the globbed files; confirm that this
        # is stable enough for downstream consumers.
        rows = con.execute(
            f"""
            SELECT
                row_number() OVER () AS id,
                headline
            FROM read_csv('{quoted_pattern}', auto_detect=true, union_by_name=true)
            WHERE headline IS NOT NULL AND headline != ''
            """
        ).fetchall()
    finally:
        # Release the DuckDB connection even when the query fails.
        con.close()

    db = sqlite3.connect(args.output)
    try:
        db.execute("CREATE TABLE contents(id INTEGER PRIMARY KEY, headline TEXT)")
        db.executemany("INSERT INTO contents VALUES (?, ?)", rows)
        db.commit()
    finally:
        # Close the SQLite handle even when table creation or an insert fails.
        db.close()

    print(f"Wrote {len(rows)} headlines to {args.output}")
|
||||
|
||||
|
||||
# Run the loader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue