mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 00:36:56 +02:00
Add ANN search support for vec0 virtual table (#273)
Add approximate nearest neighbor infrastructure to vec0: shared distance dispatch (vec0_distance_full), flat index type with parser, NEON-optimized cosine/Hamming for float32/int8, amalgamation script, and benchmark suite (benchmarks-ann/) with ground-truth generation and profiling tools. Remove unused vec_npy_each/vec_static_blobs code, fix missing stdint.h include.
This commit is contained in:
parent
e9f598abfa
commit
0de765f457
27 changed files with 2177 additions and 2116 deletions
14
Makefile
14
Makefile
|
|
@ -42,6 +42,11 @@ ifndef OMIT_SIMD
|
||||||
ifeq ($(shell uname -sm),Darwin arm64)
|
ifeq ($(shell uname -sm),Darwin arm64)
|
||||||
CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON
|
CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(shell uname -s),Linux)
|
||||||
|
ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),)
|
||||||
|
CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX
|
||||||
|
endif
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef USE_BREW_SQLITE
|
ifdef USE_BREW_SQLITE
|
||||||
|
|
@ -155,6 +160,13 @@ clean:
|
||||||
rm -rf dist
|
rm -rf dist
|
||||||
|
|
||||||
|
|
||||||
|
TARGET_AMALGAMATION=$(prefix)/sqlite-vec.c
|
||||||
|
|
||||||
|
amalgamation: $(TARGET_AMALGAMATION)
|
||||||
|
|
||||||
|
$(TARGET_AMALGAMATION): sqlite-vec.c $(wildcard sqlite-vec-*.c) scripts/amalgamate.py $(prefix)
|
||||||
|
python3 scripts/amalgamate.py sqlite-vec.c > $@
|
||||||
|
|
||||||
FORMAT_FILES=sqlite-vec.h sqlite-vec.c
|
FORMAT_FILES=sqlite-vec.h sqlite-vec.c
|
||||||
format: $(FORMAT_FILES)
|
format: $(FORMAT_FILES)
|
||||||
clang-format -i $(FORMAT_FILES)
|
clang-format -i $(FORMAT_FILES)
|
||||||
|
|
@ -174,7 +186,7 @@ evidence-of:
|
||||||
test:
|
test:
|
||||||
sqlite3 :memory: '.read test.sql'
|
sqlite3 :memory: '.read test.sql'
|
||||||
|
|
||||||
.PHONY: version loadable static test clean gh-release evidence-of install uninstall
|
.PHONY: version loadable static test clean gh-release evidence-of install uninstall amalgamation
|
||||||
|
|
||||||
publish-release:
|
publish-release:
|
||||||
./scripts/publish-release.sh
|
./scripts/publish-release.sh
|
||||||
|
|
|
||||||
73
TODO.md
Normal file
73
TODO.md
Normal file
|
|
@ -0,0 +1,73 @@
|
||||||
|
# TODO: `ann` base branch + consolidated benchmarks
|
||||||
|
|
||||||
|
## 1. Create `ann` branch with shared code
|
||||||
|
|
||||||
|
### 1.1 Branch setup
|
||||||
|
- [x] `git checkout -B ann origin/main`
|
||||||
|
- [x] Cherry-pick `624f998` (vec0_distance_full shared distance dispatch)
|
||||||
|
- [x] Cherry-pick stdint.h fix for test header
|
||||||
|
- [ ] Pull NEON cosine optimization from ivf-yolo3 into shared code
|
||||||
|
- Currently only in ivf branch but is general-purpose (benefits all distance calcs)
|
||||||
|
- Lives in `distance_cosine_float()` — ~57 lines of ARM NEON vectorized cosine
|
||||||
|
|
||||||
|
### 1.2 Benchmark infrastructure (`benchmarks-ann/`)
|
||||||
|
- [x] Seed data pipeline (`seed/Makefile`, `seed/build_base_db.py`)
|
||||||
|
- [x] Ground truth generator (`ground_truth.py`)
|
||||||
|
- [x] Results schema (`schema.sql`)
|
||||||
|
- [x] Benchmark runner with `INDEX_REGISTRY` extension point (`bench.py`)
|
||||||
|
- Baseline configs (float, int8-rescore, bit-rescore) implemented
|
||||||
|
- Index branches register their types via `INDEX_REGISTRY` dict
|
||||||
|
- [x] Makefile with baseline targets
|
||||||
|
- [x] README
|
||||||
|
|
||||||
|
### 1.3 Rebase feature branches onto `ann`
|
||||||
|
- [x] Rebase `diskann-yolo2` onto `ann` (1 commit: DiskANN implementation)
|
||||||
|
- [x] Rebase `ivf-yolo3` onto `ann` (1 commit: IVF implementation)
|
||||||
|
- [x] Rebase `annoy-yolo2` onto `ann` (2 commits: Annoy implementation + schema fix)
|
||||||
|
- [x] Verify each branch has only its index-specific commits remaining
|
||||||
|
- [ ] Force-push all 4 branches to origin
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Per-branch: register index type in benchmarks
|
||||||
|
|
||||||
|
Each index branch should add to `benchmarks-ann/` when rebased onto `ann`:
|
||||||
|
|
||||||
|
### 2.1 Register in `bench.py`
|
||||||
|
|
||||||
|
Add an `INDEX_REGISTRY` entry. Each entry provides:
|
||||||
|
- `defaults` — default param values
|
||||||
|
- `create_table_sql(params)` — CREATE VIRTUAL TABLE with INDEXED BY clause
|
||||||
|
- `insert_sql(params)` — custom insert SQL, or None for default
|
||||||
|
- `post_insert_hook(conn, params)` — training/building step, returns time
|
||||||
|
- `run_query(conn, params, query, k)` — custom query, or None for default MATCH
|
||||||
|
- `describe(params)` — one-line description for report output
|
||||||
|
|
||||||
|
### 2.2 Add configs to `Makefile`
|
||||||
|
|
||||||
|
Append index-specific config variables and targets. Example pattern:
|
||||||
|
|
||||||
|
```makefile
|
||||||
|
DISKANN_CONFIGS = \
|
||||||
|
"diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \
|
||||||
|
...
|
||||||
|
|
||||||
|
ALL_CONFIGS += $(DISKANN_CONFIGS)
|
||||||
|
|
||||||
|
bench-diskann: seed
|
||||||
|
$(BENCH) --subset-size 10000 -k 10 -o runs/diskann $(BASELINES) $(DISKANN_CONFIGS)
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2.3 Migrate existing benchmark results/docs
|
||||||
|
|
||||||
|
- Move useful results docs (RESULTS.md, etc.) into `benchmarks-ann/results/`
|
||||||
|
- Delete redundant per-branch benchmark directories once consolidated infra is proven
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Future improvements
|
||||||
|
|
||||||
|
- [ ] Reporting script (`report.py`) — query results.db, produce markdown comparison tables
|
||||||
|
- [ ] Profiling targets in Makefile (lift from ivf-yolo3's Instruments/perf wrappers)
|
||||||
|
- [ ] Pre-computed ground truth integration (use GT DB files instead of on-the-fly brute-force)
|
||||||
2
benchmarks-ann/.gitignore
vendored
Normal file
2
benchmarks-ann/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
*.db
|
||||||
|
runs/
|
||||||
61
benchmarks-ann/Makefile
Normal file
61
benchmarks-ann/Makefile
Normal file
|
|
@ -0,0 +1,61 @@
|
||||||
|
BENCH = python bench.py
|
||||||
|
BASE_DB = seed/base.db
|
||||||
|
EXT = ../dist/vec0
|
||||||
|
|
||||||
|
# --- Baseline (brute-force) configs ---
|
||||||
|
BASELINES = \
|
||||||
|
"brute-float:type=baseline,variant=float" \
|
||||||
|
"brute-int8:type=baseline,variant=int8" \
|
||||||
|
"brute-bit:type=baseline,variant=bit"
|
||||||
|
|
||||||
|
# --- Index-specific configs ---
|
||||||
|
# Each index branch should add its own configs here. Example:
|
||||||
|
#
|
||||||
|
# DISKANN_CONFIGS = \
|
||||||
|
# "diskann-R48-binary:type=diskann,R=48,L=128,quantizer=binary" \
|
||||||
|
# "diskann-R72-int8:type=diskann,R=72,L=128,quantizer=int8"
|
||||||
|
#
|
||||||
|
# IVF_CONFIGS = \
|
||||||
|
# "ivf-n128-p16:type=ivf,nlist=128,nprobe=16"
|
||||||
|
#
|
||||||
|
# ANNOY_CONFIGS = \
|
||||||
|
# "annoy-t50:type=annoy,n_trees=50"
|
||||||
|
|
||||||
|
ALL_CONFIGS = $(BASELINES)
|
||||||
|
|
||||||
|
.PHONY: seed ground-truth bench-smoke bench-10k bench-50k bench-100k bench-all \
|
||||||
|
report clean
|
||||||
|
|
||||||
|
# --- Data preparation ---
|
||||||
|
seed:
|
||||||
|
$(MAKE) -C seed
|
||||||
|
|
||||||
|
ground-truth: seed
|
||||||
|
python ground_truth.py --subset-size 10000
|
||||||
|
python ground_truth.py --subset-size 50000
|
||||||
|
python ground_truth.py --subset-size 100000
|
||||||
|
|
||||||
|
# --- Quick smoke test ---
|
||||||
|
bench-smoke: seed
|
||||||
|
$(BENCH) --subset-size 5000 -k 10 -n 20 -o runs/smoke \
|
||||||
|
$(BASELINES)
|
||||||
|
|
||||||
|
# --- Standard sizes ---
|
||||||
|
bench-10k: seed
|
||||||
|
$(BENCH) --subset-size 10000 -k 10 -o runs/10k $(ALL_CONFIGS)
|
||||||
|
|
||||||
|
bench-50k: seed
|
||||||
|
$(BENCH) --subset-size 50000 -k 10 -o runs/50k $(ALL_CONFIGS)
|
||||||
|
|
||||||
|
bench-100k: seed
|
||||||
|
$(BENCH) --subset-size 100000 -k 10 -o runs/100k $(ALL_CONFIGS)
|
||||||
|
|
||||||
|
bench-all: bench-10k bench-50k bench-100k
|
||||||
|
|
||||||
|
# --- Report ---
|
||||||
|
report:
|
||||||
|
@echo "Use: sqlite3 runs/<dir>/results.db 'SELECT * FROM bench_results ORDER BY recall DESC'"
|
||||||
|
|
||||||
|
# --- Cleanup ---
|
||||||
|
clean:
|
||||||
|
rm -rf runs/
|
||||||
81
benchmarks-ann/README.md
Normal file
81
benchmarks-ann/README.md
Normal file
|
|
@ -0,0 +1,81 @@
|
||||||
|
# KNN Benchmarks for sqlite-vec
|
||||||
|
|
||||||
|
Benchmarking infrastructure for vec0 KNN configurations. Includes brute-force
|
||||||
|
baselines (float, int8, bit); index-specific branches add their own types
|
||||||
|
via the `INDEX_REGISTRY` in `bench.py`.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- Built `dist/vec0` extension (run `make` from repo root)
|
||||||
|
- Python 3.10+
|
||||||
|
- `uv` (for seed data prep): `pip install uv`
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Download dataset and build seed DB (~3 GB download, ~5 min)
|
||||||
|
make seed
|
||||||
|
|
||||||
|
# 2. Run a quick smoke test (5k vectors, ~1 min)
|
||||||
|
make bench-smoke
|
||||||
|
|
||||||
|
# 3. Run full benchmark at 10k
|
||||||
|
make bench-10k
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Direct invocation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python bench.py --subset-size 10000 \
|
||||||
|
"brute-float:type=baseline,variant=float" \
|
||||||
|
"brute-int8:type=baseline,variant=int8" \
|
||||||
|
"brute-bit:type=baseline,variant=bit"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Config format
|
||||||
|
|
||||||
|
`name:type=<index_type>,key=val,key=val`
|
||||||
|
|
||||||
|
| Index type | Keys | Branch |
|
||||||
|
|-----------|------|--------|
|
||||||
|
| `baseline` | `variant` (float/int8/bit), `oversample` | this branch |
|
||||||
|
|
||||||
|
Index branches register additional types in `INDEX_REGISTRY`. See the
|
||||||
|
comment block above `INDEX_REGISTRY` in `bench.py` for the extension API.
|
||||||
|
|
||||||
|
### Make targets
|
||||||
|
|
||||||
|
| Target | Description |
|
||||||
|
|--------|-------------|
|
||||||
|
| `make seed` | Download the COHERE 1M dataset and build the seed DB |
|
||||||
|
| `make ground-truth` | Pre-compute ground truth for 10k/50k/100k |
|
||||||
|
| `make bench-smoke` | Quick 5k baseline test |
|
||||||
|
| `make bench-10k` | All configs at 10k vectors |
|
||||||
|
| `make bench-50k` | All configs at 50k vectors |
|
||||||
|
| `make bench-100k` | All configs at 100k vectors |
|
||||||
|
| `make bench-all` | 10k + 50k + 100k |
|
||||||
|
|
||||||
|
## Adding an index type
|
||||||
|
|
||||||
|
In your index branch, add an entry to `INDEX_REGISTRY` in `bench.py` and
|
||||||
|
append your configs to `ALL_CONFIGS` in the `Makefile`. See the existing
|
||||||
|
`baseline` entry and the comments in both files for the pattern.
|
||||||
|
|
||||||
|
## Results
|
||||||
|
|
||||||
|
Results are stored in `runs/<dir>/results.db` using the schema in `schema.sql`.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sqlite3 runs/10k/results.db "
|
||||||
|
SELECT config_name, recall, mean_ms, qps
|
||||||
|
FROM bench_results
|
||||||
|
ORDER BY recall DESC
|
||||||
|
"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dataset
|
||||||
|
|
||||||
|
[Zilliz COHERE Medium 1M](https://zilliz.com/learn/datasets-for-vector-database-benchmarks):
|
||||||
|
768 dimensions, cosine distance, 1M train vectors + 10k query vectors with precomputed neighbors.
|
||||||
488
benchmarks-ann/bench.py
Normal file
488
benchmarks-ann/bench.py
Normal file
|
|
@ -0,0 +1,488 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Benchmark runner for sqlite-vec KNN configurations.
|
||||||
|
|
||||||
|
Measures insert time, build/train time, DB size, KNN latency, and recall
|
||||||
|
across different vec0 configurations.
|
||||||
|
|
||||||
|
Config format: name:type=<index_type>,key=val,key=val
|
||||||
|
|
||||||
|
Baseline (brute-force) keys:
|
||||||
|
type=baseline, variant=float|int8|bit, oversample=8
|
||||||
|
|
||||||
|
Index-specific types can be registered via INDEX_REGISTRY (see below).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python bench.py --subset-size 10000 \
|
||||||
|
"brute-float:type=baseline,variant=float" \
|
||||||
|
"brute-int8:type=baseline,variant=int8" \
|
||||||
|
"brute-bit:type=baseline,variant=bit"
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import statistics
|
||||||
|
import time
|
||||||
|
|
||||||
|
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
EXT_PATH = os.path.join(_SCRIPT_DIR, "..", "dist", "vec0")
|
||||||
|
BASE_DB = os.path.join(_SCRIPT_DIR, "seed", "base.db")
|
||||||
|
INSERT_BATCH_SIZE = 1000
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Index registry — extension point for ANN index branches
|
||||||
|
# ============================================================================
|
||||||
|
#
|
||||||
|
# Each index type provides a dict with:
|
||||||
|
# "defaults": dict of default params
|
||||||
|
# "create_table_sql": fn(params) -> SQL string
|
||||||
|
# "insert_sql": fn(params) -> SQL string (or None for default)
|
||||||
|
# "post_insert_hook": fn(conn, params) -> train_time_s (or None)
|
||||||
|
# "run_query": fn(conn, params, query, k) -> [(id, distance), ...] (or None for default MATCH)
|
||||||
|
# "describe": fn(params) -> str (one-line description)
|
||||||
|
#
|
||||||
|
# To add a new index type, add an entry here. Example (in your branch):
|
||||||
|
#
|
||||||
|
# INDEX_REGISTRY["diskann"] = {
|
||||||
|
# "defaults": {"R": 72, "L": 128, "quantizer": "binary", "buffer_threshold": 0},
|
||||||
|
# "create_table_sql": lambda p: f"CREATE VIRTUAL TABLE vec_items USING vec0(...)",
|
||||||
|
# "insert_sql": None,
|
||||||
|
# "post_insert_hook": None,
|
||||||
|
# "run_query": None,
|
||||||
|
# "describe": lambda p: f"diskann q={p['quantizer']} R={p['R']} L={p['L']}",
|
||||||
|
# }
|
||||||
|
|
||||||
|
INDEX_REGISTRY = {}
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Baseline implementation
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def _baseline_create_table_sql(params):
    """Return the CREATE VIRTUAL TABLE statement for a brute-force baseline.

    The table always has an integer primary key ``id`` and a 768-dimension
    float ``embedding`` column using cosine distance. The ``int8`` and ``bit``
    variants add a second, quantized column next to it.

    Args:
        params: config dict; ``params["variant"]`` must be ``"float"``,
            ``"int8"`` or ``"bit"``.

    Returns:
        The SQL string creating the ``vec_items`` vec0 table.

    Raises:
        KeyError: if ``params`` has no ``"variant"`` entry.
        ValueError: if the variant is not one of the supported values.
    """
    extra_columns = {
        "float": "",
        "int8": ", embedding_int8 int8[768]",
        "bit": ", embedding_bq bit[768]",
    }
    variant = params["variant"]
    # Reject unknown variants up front: silently falling back to the float
    # layout would benchmark the wrong configuration under a misleading label.
    if variant not in extra_columns:
        raise ValueError(
            f"Unknown baseline variant: {variant!r}. "
            f"Available: {', '.join(sorted(extra_columns))}"
        )
    extra = extra_columns[variant]
    return (
        "CREATE VIRTUAL TABLE vec_items USING vec0("
        " chunk_size=256,"
        " id integer primary key,"
        " embedding float[768] distance_metric=cosine"
        f" {extra})"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _baseline_insert_sql(params):
    """Return the batch INSERT statement for a quantized baseline variant.

    For the ``int8`` and ``bit`` variants the statement copies ``id`` and
    ``vector`` from ``base.train`` for ids in ``[:lo, :hi)`` and fills the
    extra quantized column with the matching ``vec_quantize_*`` call.

    Returns:
        The SQL string, or ``None`` for any other variant so that the caller
        falls back to its default insert statement.
    """
    quantized = {
        "int8": ("embedding_int8", "vec_quantize_int8(vector, 'unit')"),
        "bit": ("embedding_bq", "vec_quantize_binary(vector)"),
    }.get(params["variant"])
    if quantized is None:
        return None  # use default

    column, quantize_expr = quantized
    return (
        f"INSERT INTO vec_items(id, embedding, {column}) "
        f"SELECT id, vector, {quantize_expr} "
        "FROM base.train WHERE id >= :lo AND id < :hi"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _baseline_run_query(conn, params, query, k):
    """Run a two-stage (coarse + rescore) KNN query for quantized baselines.

    For the ``int8`` and ``bit`` variants, the quantized column is searched
    for ``k * oversample`` candidates, which are then re-ranked with
    ``vec_distance_cosine`` on the float ``embedding`` column and cut to ``k``.

    Args:
        conn: connection on which the query is executed.
        params: config dict; reads ``"variant"`` and optional ``"oversample"``
            (defaults to 8).
        query: query vector, bound as ``:query``.
        k: number of neighbors to return.

    Returns:
        The fetched ``(id, distance)`` rows, or ``None`` for any other variant
        so that the caller uses its default MATCH query.
    """
    variant = params["variant"]
    oversample = params.get("oversample", 8)

    match_clause = {
        "int8": "embedding_int8 MATCH vec_quantize_int8(:query, 'unit')",
        "bit": "embedding_bq MATCH vec_quantize_binary(:query)",
    }.get(variant)
    if match_clause is None:
        return None  # use default MATCH

    sql = (
        "WITH coarse AS ("
        " SELECT id, embedding FROM vec_items"
        f" WHERE {match_clause}"
        " LIMIT :oversample_k"
        ") "
        "SELECT id, vec_distance_cosine(embedding, :query) as distance "
        "FROM coarse ORDER BY 2 LIMIT :k"
    )
    bindings = {"query": query, "k": k, "oversample_k": k * oversample}
    return conn.execute(sql, bindings).fetchall()
|
||||||
|
|
||||||
|
|
||||||
|
def _baseline_describe(params):
    """Return a one-line description of a baseline config for report output.

    Quantized variants (``int8``, ``bit``) include the oversample factor used
    by the rescoring query; other variants are described by name only.

    Args:
        params: config dict; reads ``"variant"`` and, for quantized variants,
            the optional ``"oversample"`` entry.

    Returns:
        A string such as ``"baseline int8 (os=8)"`` or ``"baseline float"``.
    """
    variant = params["variant"]
    if variant in ("int8", "bit"):
        # Same fallback as _baseline_run_query, so describing a params dict
        # without an explicit oversample cannot raise KeyError.
        oversample = params.get("oversample", 8)
        return f"baseline {variant} (os={oversample})"
    return f"baseline {variant}"
|
||||||
|
|
||||||
|
|
||||||
|
INDEX_REGISTRY["baseline"] = {
|
||||||
|
"defaults": {"variant": "float", "oversample": 8},
|
||||||
|
"create_table_sql": _baseline_create_table_sql,
|
||||||
|
"insert_sql": _baseline_insert_sql,
|
||||||
|
"post_insert_hook": None,
|
||||||
|
"run_query": _baseline_run_query,
|
||||||
|
"describe": _baseline_describe,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Config parsing
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
INT_KEYS = {
|
||||||
|
"R", "L", "buffer_threshold", "nlist", "nprobe", "oversample",
|
||||||
|
"n_trees", "search_k",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_config(spec):
    """Parse 'name:type=baseline,key=val,...' into (name, params_dict).

    The part before the first ``:`` is the config name; the rest is a
    comma-separated list of ``key=value`` options. ``type`` selects the entry
    in ``INDEX_REGISTRY`` (default ``"baseline"``); the remaining options
    override that entry's ``"defaults"``. Values whose key is listed in
    ``INT_KEYS`` are converted to ``int``; all others stay strings.

    Args:
        spec: the config spec string, e.g.
            ``"brute-int8:type=baseline,variant=int8"``.

    Returns:
        ``(name, params)`` where ``params`` also carries the resolved
        ``"index_type"``.

    Raises:
        ValueError: if an option is not of the form ``key=value``, if the
            index type is not registered, or if an integer option has a
            non-integer value.
    """
    name, _, opts_str = spec.partition(":")

    raw = {}
    for item in opts_str.split(","):
        item = item.strip()
        if not item:
            # Tolerate empty segments such as a trailing comma.
            continue
        key, sep, value = item.partition("=")
        key = key.strip()
        if not sep or not key:
            raise ValueError(
                f"Malformed option {item!r} in config {spec!r}; "
                f"expected key=value"
            )
        raw[key] = value.strip()

    index_type = raw.pop("type", "baseline")
    if index_type not in INDEX_REGISTRY:
        raise ValueError(
            f"Unknown index type: {index_type}. "
            f"Available: {', '.join(sorted(INDEX_REGISTRY.keys()))}"
        )

    # Copy the defaults so per-config overrides never leak into the registry.
    params = dict(INDEX_REGISTRY[index_type]["defaults"])
    for key, value in raw.items():
        if key in INT_KEYS:
            try:
                params[key] = int(value)
            except ValueError:
                raise ValueError(
                    f"Option {key!r} in config {spec!r} must be an integer, "
                    f"got {value!r}"
                ) from None
        else:
            params[key] = value
    params["index_type"] = index_type

    return name, params
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Shared helpers
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def load_query_vectors(base_db_path, n):
    """Load the first ``n`` query vectors (ordered by id) from the base DB.

    Args:
        base_db_path: path of the SQLite database holding ``query_vectors``.
        n: maximum number of rows to load.

    Returns:
        A list of ``(id, vector)`` tuples.
    """
    conn = sqlite3.connect(base_db_path)
    try:
        rows = conn.execute(
            "SELECT id, vector FROM query_vectors ORDER BY id LIMIT :n", {"n": n}
        ).fetchall()
    finally:
        # Close even when the query fails (e.g. missing table) so the
        # connection is not leaked.
        conn.close()
    return [(row[0], row[1]) for row in rows]
|
||||||
|
|
||||||
|
|
||||||
|
def insert_loop(conn, sql, subset_size, label="", batch_size=None):
    """Execute ``sql`` over ``[0, subset_size)`` in id batches and time it.

    Each batch binds ``:lo`` / ``:hi`` (half-open id range) and is committed
    on its own. Progress is printed whenever the number of processed ids is a
    multiple of 5000 and once more at the end.

    Args:
        conn: connection providing ``execute(sql, params)`` and ``commit()``.
        sql: statement with ``:lo`` and ``:hi`` named parameters.
        subset_size: exclusive upper bound of the id range.
        label: tag shown in the progress lines.
        batch_size: ids per batch; defaults to ``INSERT_BATCH_SIZE``.

    Returns:
        Elapsed wall-clock time in seconds.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size is None:
        batch_size = INSERT_BATCH_SIZE
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    t0 = time.perf_counter()
    for lo in range(0, subset_size, batch_size):
        hi = min(lo + batch_size, subset_size)
        conn.execute(sql, {"lo": lo, "hi": hi})
        conn.commit()
        if hi % 5000 == 0 or hi == subset_size:
            elapsed = time.perf_counter() - t0
            rate = hi / elapsed if elapsed > 0 else 0
            print(
                f" [{label}] {hi:>8}/{subset_size} "
                f"{elapsed:.1f}s {rate:.0f} rows/s",
                flush=True,
            )
    return time.perf_counter() - t0
|
||||||
|
|
||||||
|
|
||||||
|
def open_bench_db(db_path, ext_path, base_db):
    """Create a fresh benchmark database with the extension and base DB ready.

    Any existing file at ``db_path`` is removed first. The returned connection
    has the extension at ``ext_path`` loaded, ``page_size`` set to 8192 and
    ``base_db`` attached under the schema name ``base``.

    Args:
        db_path: path of the database file to (re)create.
        ext_path: path of the loadable extension.
        base_db: path of the seed database to attach as ``base``.

    Returns:
        The open ``sqlite3.Connection``; the caller is responsible for
        closing it.
    """
    try:
        os.remove(db_path)
    except FileNotFoundError:
        pass

    conn = sqlite3.connect(db_path)
    try:
        conn.enable_load_extension(True)
        conn.load_extension(ext_path)
        conn.execute("PRAGMA page_size=8192")
        # Bind the path instead of interpolating it so that paths containing
        # a single quote do not break (or alter) the statement.
        conn.execute("ATTACH DATABASE ? AS base", (base_db,))
    except Exception:
        # Do not leak the handle when setup fails part-way.
        conn.close()
        raise
    return conn
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_INSERT_SQL = (
|
||||||
|
"INSERT INTO vec_items(id, embedding) "
|
||||||
|
"SELECT id, vector FROM base.train WHERE id >= :lo AND id < :hi"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Build
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def build_index(base_db, ext_path, name, params, subset_size, out_dir):
    """Build one benchmark database and return its build statistics.

    Creates ``<out_dir>/<name>.<subset_size>.db``, creates the ``vec_items``
    table through the registry entry for ``params["index_type"]``, inserts
    ids in ``[0, subset_size)`` from the attached base DB, then runs the
    entry's optional ``post_insert_hook``.

    Args:
        base_db: path of the seed database.
        ext_path: path of the loadable extension.
        name: config name, used in the output file name.
        params: parsed config dict (must contain ``"index_type"``).
        subset_size: number of ids to insert.
        out_dir: directory for the output database.

    Returns:
        A dict with ``db_path``, ``insert_time_s``, ``train_time_s``,
        ``total_time_s``, ``insert_per_vec_ms``, ``rows`` and
        ``file_size_mb``.
    """
    db_path = os.path.join(out_dir, f"{name}.{subset_size}.db")
    # Resolve the registry entry before opening the DB so a bad index type
    # fails without leaving an open connection behind.
    reg = INDEX_REGISTRY[params["index_type"]]

    conn = open_bench_db(db_path, ext_path, base_db)
    try:
        conn.execute(reg["create_table_sql"](params))

        label = params["index_type"]
        print(f" Inserting {subset_size} vectors...")

        sql_fn = reg.get("insert_sql")
        sql = sql_fn(params) if sql_fn else None
        if sql is None:
            sql = DEFAULT_INSERT_SQL

        insert_time = insert_loop(conn, sql, subset_size, label)

        train_time = 0.0
        hook = reg.get("post_insert_hook")
        if hook:
            # A hook that returns nothing counts as zero training time rather
            # than crashing the rounding below.
            train_time = hook(conn, params) or 0.0

        row_count = conn.execute("SELECT count(*) FROM vec_items").fetchone()[0]
    finally:
        conn.close()

    # Measured after close, as in the original flow.
    file_size_mb = os.path.getsize(db_path) / (1024 * 1024)

    return {
        "db_path": db_path,
        "insert_time_s": round(insert_time, 3),
        "train_time_s": round(train_time, 3),
        "total_time_s": round(insert_time + train_time, 3),
        "insert_per_vec_ms": round((insert_time / row_count) * 1000, 2)
        if row_count
        else 0,
        "rows": row_count,
        "file_size_mb": round(file_size_mb, 2),
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# KNN measurement
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def _default_match_query(conn, query, k):
    """Run the plain vec0 KNN query on ``vec_items.embedding``.

    Binds ``query`` as ``:query`` and ``k`` as ``:k`` and returns the fetched
    ``(id, distance)`` rows.
    """
    sql = (
        "SELECT id, distance FROM vec_items "
        "WHERE embedding MATCH :query AND k = :k"
    )
    cursor = conn.execute(sql, {"query": query, "k": k})
    return cursor.fetchall()
|
||||||
|
|
||||||
|
|
||||||
|
def measure_knn(db_path, ext_path, base_db, params, subset_size, k=10, n=50):
    """Measure KNN latency and recall for one built benchmark database.

    Loads up to ``n`` query vectors from ``base_db``, runs each through the
    registry entry's ``run_query`` (falling back to the default MATCH query
    when the entry has none or it returns ``None``) and times only that call.
    Recall is computed against ground truth obtained per query: the
    ``base.neighbors`` table when ``subset_size >= 1000000``, otherwise a
    brute-force cosine scan of ``base.train`` restricted to ``id < subset_size``.

    Args:
        db_path: path of the benchmark database to query.
        ext_path: path of the loadable extension.
        base_db: path of the seed database (attached as ``base``).
        params: parsed config dict (must contain ``"index_type"``).
        subset_size: number of vectors the index was built with.
        k: number of neighbors per query.
        n: maximum number of query vectors to run.

    Returns:
        A dict with ``mean_ms``, ``median_ms``, ``p99_ms``, ``total_ms`` and
        ``recall``.

    Raises:
        ValueError: if no query vectors could be loaded.
    """
    reg = INDEX_REGISTRY[params["index_type"]]
    query_fn = reg.get("run_query")

    query_vectors = load_query_vectors(base_db, n)
    if not query_vectors:
        # Fail early with a clear message instead of an opaque
        # StatisticsError from the summary step below.
        raise ValueError(f"No query vectors loaded from {base_db} (n={n})")

    times_ms = []
    recalls = []
    conn = sqlite3.connect(db_path)
    try:
        conn.enable_load_extension(True)
        conn.load_extension(ext_path)
        # Bound parameter: safe for paths containing quotes.
        conn.execute("ATTACH DATABASE ? AS base", (base_db,))

        for qid, query in query_vectors:
            t0 = time.perf_counter()

            results = query_fn(conn, params, query, k) if query_fn else None
            if results is None:
                results = _default_match_query(conn, query, k)

            # Stop the clock before the ground-truth lookup so it is not
            # counted as query latency.
            times_ms.append((time.perf_counter() - t0) * 1000)
            result_ids = {row[0] for row in results}

            # Ground truth: use pre-computed neighbors table for full dataset,
            # otherwise brute-force over the subset
            if subset_size >= 1000000:
                # NOTE(review): `rank < :k` presumably relies on 0-based
                # ranks in base.neighbors — confirm against the seed schema.
                gt_rows = conn.execute(
                    "SELECT CAST(neighbors_id AS INTEGER) FROM base.neighbors "
                    "WHERE query_vector_id = :qid AND rank < :k",
                    {"qid": qid, "k": k},
                ).fetchall()
            else:
                gt_rows = conn.execute(
                    "SELECT id FROM ("
                    " SELECT id, vec_distance_cosine(vector, :query) as dist "
                    " FROM base.train WHERE id < :n ORDER BY dist LIMIT :k"
                    ")",
                    {"query": query, "k": k, "n": subset_size},
                ).fetchall()
            gt_ids = {row[0] for row in gt_rows}

            if gt_ids:
                recalls.append(len(result_ids & gt_ids) / len(gt_ids))
            else:
                recalls.append(0.0)
    finally:
        conn.close()

    # int(len * 0.99) is always a valid index for a non-empty list (0 for a
    # single sample), so no special case is needed.
    ordered = sorted(times_ms)
    p99 = ordered[int(len(ordered) * 0.99)]

    return {
        "mean_ms": round(statistics.mean(times_ms), 2),
        "median_ms": round(statistics.median(times_ms), 2),
        "p99_ms": round(p99, 2),
        "total_ms": round(sum(times_ms), 2),
        "recall": round(statistics.mean(recalls), 4),
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Results persistence
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def save_results(results_path, rows):
    """Persist benchmark results into the results database.

    Applies ``schema.sql`` (located next to this script) to the database at
    ``results_path``, then writes one ``build_results`` row and one
    ``bench_results`` row per entry using ``INSERT OR REPLACE``. Queries per
    second is derived from ``n_queries`` and ``total_ms`` (0 when
    ``total_ms`` is not positive).

    Args:
        results_path: path of the SQLite results database.
        rows: iterable of result dicts as assembled by ``main``.
    """
    # Read the schema with a context manager so the file handle is closed.
    schema_path = os.path.join(_SCRIPT_DIR, "schema.sql")
    with open(schema_path, encoding="utf-8") as schema_file:
        schema_sql = schema_file.read()

    db = sqlite3.connect(results_path)
    try:
        db.executescript(schema_sql)
        for r in rows:
            db.execute(
                "INSERT OR REPLACE INTO build_results "
                "(config_name, index_type, subset_size, db_path, "
                " insert_time_s, train_time_s, total_time_s, rows, file_size_mb) "
                "VALUES (?,?,?,?,?,?,?,?,?)",
                (
                    r["name"], r["index_type"], r["n_vectors"], r["db_path"],
                    r["insert_time_s"], r["train_time_s"], r["total_time_s"],
                    r["rows"], r["file_size_mb"],
                ),
            )
            qps = (
                round(r["n_queries"] / (r["total_ms"] / 1000), 1)
                if r["total_ms"] > 0
                else 0
            )
            db.execute(
                "INSERT OR REPLACE INTO bench_results "
                "(config_name, index_type, subset_size, k, n, "
                " mean_ms, median_ms, p99_ms, total_ms, qps, recall, db_path) "
                "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
                (
                    r["name"], r["index_type"], r["n_vectors"], r["k"], r["n_queries"],
                    r["mean_ms"], r["median_ms"], r["p99_ms"], r["total_ms"],
                    qps,
                    r["recall"], r["db_path"],
                ),
            )
        db.commit()
    finally:
        # Close the connection even if a statement fails.
        db.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Reporting
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def print_report(all_results):
    """Print a fixed-width summary table, one line per benchmark result.

    Each result dict must provide ``name``, ``n_vectors``, ``index_type``,
    ``config_desc``, ``insert_time_s``, ``train_time_s``, ``file_size_mb``,
    ``mean_ms`` and ``recall``. A training time that is not positive is
    shown as ``-``.
    """
    header = (
        f"\n{'name':>20} {'N':>7} {'type':>10} {'config':>28} "
        f"{'ins(s)':>7} {'train':>6} {'MB':>7} "
        f"{'qry(ms)':>8} {'recall':>7}"
    )
    print(header)
    print("-" * 115)

    for row in all_results:
        if row["train_time_s"] > 0:
            train = f"{row['train_time_s']:.1f}"
        else:
            train = "-"
        line = (
            f"{row['name']:>20} {row['n_vectors']:>7} {row['index_type']:>10} "
            f"{row['config_desc']:>28} "
            f"{row['insert_time_s']:>7.1f} {train:>6} {row['file_size_mb']:>7.1f} "
            f"{row['mean_ms']:>8.2f} {row['recall']:>7.4f}"
        )
        print(line)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Main
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: build, measure and record every config.

    Parses the CLI arguments, then for each config spec builds its database,
    measures KNN latency/recall, and collects the combined metrics. After all
    configs have run, prints the summary table and saves the results to the
    results database (``<out-dir>/results.db`` unless ``--results-db`` is
    given).
    """
    cli = argparse.ArgumentParser(
        description="Benchmark runner for sqlite-vec KNN configurations",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    cli.add_argument("configs", nargs="+", help="config specs (name:type=X,key=val,...)")
    cli.add_argument("--subset-size", type=int, required=True)
    cli.add_argument("-k", type=int, default=10, help="KNN k (default 10)")
    cli.add_argument("-n", type=int, default=50, help="number of queries (default 50)")
    cli.add_argument("--base-db", default=BASE_DB)
    cli.add_argument("--ext", default=EXT_PATH)
    cli.add_argument("-o", "--out-dir", default="runs")
    cli.add_argument(
        "--results-db",
        default=None,
        help="path to results DB (default: <out-dir>/results.db)",
    )
    args = cli.parse_args()

    os.makedirs(args.out_dir, exist_ok=True)
    results_db = args.results_db
    if not results_db:
        results_db = os.path.join(args.out_dir, "results.db")
    configs = [parse_config(spec) for spec in args.configs]
    total = len(configs)

    # Keys copied verbatim from the build and KNN result dicts.
    build_keys = (
        "db_path", "insert_time_s", "train_time_s", "total_time_s",
        "insert_per_vec_ms", "rows", "file_size_mb",
    )
    knn_keys = ("mean_ms", "median_ms", "p99_ms", "total_ms", "recall")

    all_results = []
    for position, (name, params) in enumerate(configs, start=1):
        index_type = params["index_type"]
        desc = INDEX_REGISTRY[index_type]["describe"](params)
        print(f"\n[{position}/{total}] {name} ({desc.strip()})")

        build = build_index(
            args.base_db, args.ext, name, params, args.subset_size, args.out_dir
        )
        if build["train_time_s"] > 0:
            train_str = f" + {build['train_time_s']}s train"
        else:
            train_str = ""
        print(
            f" Build: {build['insert_time_s']}s insert{train_str} "
            f"{build['file_size_mb']} MB"
        )

        print(f" Measuring KNN (k={args.k}, n={args.n})...")
        knn = measure_knn(
            build["db_path"], args.ext, args.base_db,
            params, args.subset_size, k=args.k, n=args.n,
        )
        print(f" KNN: mean={knn['mean_ms']}ms recall@{args.k}={knn['recall']}")

        record = {
            "name": name,
            "n_vectors": args.subset_size,
            "index_type": index_type,
            "config_desc": desc,
            "k": args.k,
            "n_queries": args.n,
        }
        record.update({key: build[key] for key in build_keys})
        record.update({key: knn[key] for key in knn_keys})
        all_results.append(record)

    print_report(all_results)
    save_results(results_db, all_results)
    print(f"\nResults saved to {results_db}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
168
benchmarks-ann/ground_truth.py
Normal file
168
benchmarks-ann/ground_truth.py
Normal file
|
|
@ -0,0 +1,168 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Compute per-subset ground truth for ANN benchmarks.
|
||||||
|
|
||||||
|
For subset sizes < 1M, builds a temporary vec0 float table with the first N
|
||||||
|
vectors and runs brute-force KNN to get correct ground truth per subset.
|
||||||
|
|
||||||
|
For 1M (the full dataset), converts the existing `neighbors` table.
|
||||||
|
|
||||||
|
Output: ground_truth.{subset_size}.db with table:
|
||||||
|
ground_truth(query_vector_id, rank, neighbor_id, distance)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python ground_truth.py --subset-size 50000
|
||||||
|
python ground_truth.py --subset-size 1000000
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Directory containing this script; the default paths below are resolved against it.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Default extension path handed to sqlite3's load_extension() (--ext overrides it).
EXT_PATH = os.path.join(_SCRIPT_DIR, "..", "dist", "vec0")
# Default seed database (--base-db overrides it); this script reads its
# train, query_vectors and neighbors tables.
BASE_DB = os.path.join(_SCRIPT_DIR, "seed", "base.db")
# Subset sizes at or above this value reuse the precomputed neighbors table
# instead of running brute-force KNN.
FULL_DATASET_SIZE = 1_000_000
|
||||||
|
|
||||||
|
|
||||||
|
def gen_ground_truth_subset(base_db, ext_path, subset_size, n_queries, k, out_path):
    """Build ground truth by brute-force KNN over the first `subset_size` vectors.

    Creates a fresh SQLite database at `out_path` (any existing file is
    removed), copies the rows of `base.train` whose id is below `subset_size`
    into a temporary vec0 table, runs one KNN query per query vector and
    stores the results in a `ground_truth` table.

    Args:
        base_db: Path to the seed database providing `train` and
            `query_vectors`.
        ext_path: Path of the vec0 loadable extension.
        subset_size: Upper (exclusive) bound on the `train.id` values used.
        n_queries: Number of query vectors to evaluate, taken in id order.
        k: Number of neighbors requested per query.
        out_path: Destination database file.
    """
    if os.path.exists(out_path):
        os.remove(out_path)

    conn = sqlite3.connect(out_path)
    try:
        conn.enable_load_extension(True)
        conn.load_extension(ext_path)

        conn.execute(
            "CREATE TABLE ground_truth ("
            " query_vector_id INTEGER NOT NULL,"
            " rank INTEGER NOT NULL,"
            " neighbor_id INTEGER NOT NULL,"
            " distance REAL NOT NULL,"
            " PRIMARY KEY (query_vector_id, rank)"
            ")"
        )

        # Bind the path rather than interpolating it, so a quote character in
        # the path cannot break (or alter) the ATTACH statement.
        conn.execute("ATTACH DATABASE ? AS base", (base_db,))

        print(f" Building temp vec0 table with {subset_size} vectors...")
        conn.execute(
            "CREATE VIRTUAL TABLE tmp_vec USING vec0("
            " id integer primary key,"
            " embedding float[768] distance_metric=cosine"
            ")"
        )

        t0 = time.perf_counter()
        conn.execute(
            "INSERT INTO tmp_vec(id, embedding) "
            "SELECT id, vector FROM base.train WHERE id < :n",
            {"n": subset_size},
        )
        conn.commit()
        build_time = time.perf_counter() - t0
        print(f" Temp table built in {build_time:.1f}s")

        query_vectors = conn.execute(
            "SELECT id, vector FROM base.query_vectors ORDER BY id LIMIT :n",
            {"n": n_queries},
        ).fetchall()
        n_total = len(query_vectors)

        print(f" Running brute-force KNN for {n_total} queries, k={k}...")
        t0 = time.perf_counter()

        for i, (qid, qvec) in enumerate(query_vectors):
            results = conn.execute(
                "SELECT id, distance FROM tmp_vec "
                "WHERE embedding MATCH :query AND k = :k",
                {"query": qvec, "k": k},
            ).fetchall()

            # The position in the result list is stored as the 0-based rank.
            conn.executemany(
                "INSERT INTO ground_truth(query_vector_id, rank, neighbor_id, distance) "
                "VALUES (?, ?, ?, ?)",
                [(qid, rank, nid, dist) for rank, (nid, dist) in enumerate(results)],
            )

            if (i + 1) % 10 == 0 or i == 0:
                elapsed = time.perf_counter() - t0
                eta = (elapsed / (i + 1)) * (n_total - i - 1)
                print(
                    f" {i+1}/{n_total} queries "
                    f"elapsed={elapsed:.1f}s eta={eta:.1f}s",
                    flush=True,
                )

        conn.commit()
        conn.execute("DROP TABLE tmp_vec")
        conn.execute("DETACH DATABASE base")
        conn.commit()

        elapsed = time.perf_counter() - t0
        total_rows = conn.execute("SELECT count(*) FROM ground_truth").fetchone()[0]
    finally:
        # Always release the file handle, including when a query fails midway.
        conn.close()

    print(f" Ground truth: {total_rows} rows in {elapsed:.1f}s -> {out_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def gen_ground_truth_full(base_db, n_queries, k, out_path):
    """Convert the existing neighbors table for the full 1M dataset.

    Creates a fresh SQLite database at `out_path` (any existing file is
    removed) and copies the rows of `base.neighbors` with
    `query_vector_id < n_queries` and `rank < k` into a `ground_truth` table.
    The `distance` column is left NULL because the source table has none.

    Args:
        base_db: Path to the seed database providing `neighbors`.
        n_queries: Exclusive upper bound on `query_vector_id`.
        k: Exclusive upper bound on `rank`.
        out_path: Destination database file.
    """
    if os.path.exists(out_path):
        os.remove(out_path)

    conn = sqlite3.connect(out_path)
    try:
        # Bind the path rather than interpolating it, so a quote character in
        # the path cannot break (or alter) the ATTACH statement.
        conn.execute("ATTACH DATABASE ? AS base", (base_db,))

        conn.execute(
            "CREATE TABLE ground_truth ("
            " query_vector_id INTEGER NOT NULL,"
            " rank INTEGER NOT NULL,"
            " neighbor_id INTEGER NOT NULL,"
            " distance REAL,"
            " PRIMARY KEY (query_vector_id, rank)"
            ")"
        )

        # neighbors_id is cast because the seed table declares it as TEXT here.
        conn.execute(
            "INSERT INTO ground_truth(query_vector_id, rank, neighbor_id) "
            "SELECT query_vector_id, rank, CAST(neighbors_id AS INTEGER) "
            "FROM base.neighbors "
            "WHERE query_vector_id < :n AND rank < :k",
            {"n": n_queries, "k": k},
        )
        conn.commit()

        total_rows = conn.execute("SELECT count(*) FROM ground_truth").fetchone()[0]
        conn.execute("DETACH DATABASE base")
    finally:
        # Always release the file handle, including when a statement fails.
        conn.close()

    print(f" Ground truth (full): {total_rows} rows -> {out_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse CLI arguments and write ground_truth.{subset_size}.db.

    Subset sizes at or above FULL_DATASET_SIZE reuse the precomputed
    `neighbors` table; smaller subsets are computed by brute-force KNN.
    """
    parser = argparse.ArgumentParser(description="Generate per-subset ground truth")
    parser.add_argument(
        "--subset-size", type=int, required=True, help="number of vectors in subset"
    )
    parser.add_argument("-n", type=int, default=100, help="number of query vectors")
    parser.add_argument("-k", type=int, default=100, help="max k for ground truth")
    parser.add_argument("--base-db", default=BASE_DB)
    parser.add_argument("--ext", default=EXT_PATH)
    parser.add_argument(
        "-o", "--out-dir", default=os.path.join(_SCRIPT_DIR, "seed"),
        help="output directory for ground_truth.{N}.db",
    )
    args = parser.parse_args()

    # ATTACH silently creates a missing database file, which would surface
    # later as a confusing "no such table" error; fail early instead.
    if not os.path.exists(args.base_db):
        parser.error(
            f"base DB not found at {args.base_db}; "
            "run 'make seed' in benchmarks-ann/ first"
        )

    os.makedirs(args.out_dir, exist_ok=True)
    out_path = os.path.join(args.out_dir, f"ground_truth.{args.subset_size}.db")

    if args.subset_size >= FULL_DATASET_SIZE:
        gen_ground_truth_full(args.base_db, args.n, args.k, out_path)
    else:
        gen_ground_truth_subset(
            args.base_db, args.ext, args.subset_size, args.n, args.k, out_path
        )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
440
benchmarks-ann/profile.py
Normal file
440
benchmarks-ann/profile.py
Normal file
|
|
@ -0,0 +1,440 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""CPU profiling for sqlite-vec KNN configurations using macOS `sample` tool.
|
||||||
|
|
||||||
|
Builds dist/sqlite3 (with -g3), generates a SQL workload (inserts + repeated
|
||||||
|
KNN queries) for each config, profiles the sqlite3 process with `sample`, and
|
||||||
|
prints the top-N hottest functions by self (exclusive) CPU samples.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
cd benchmarks-ann
|
||||||
|
uv run profile.py --subset-size 50000 -n 50 \\
|
||||||
|
"baseline-int8:type=baseline,variant=int8,oversample=8" \\
|
||||||
|
"rescore-int8:type=rescore,quantizer=int8,oversample=8"
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
_PROJECT_ROOT = os.path.join(_SCRIPT_DIR, "..")
|
||||||
|
|
||||||
|
sys.path.insert(0, _SCRIPT_DIR)
|
||||||
|
from bench import (
|
||||||
|
BASE_DB,
|
||||||
|
DEFAULT_INSERT_SQL,
|
||||||
|
INDEX_REGISTRY,
|
||||||
|
INSERT_BATCH_SIZE,
|
||||||
|
parse_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
SQLITE3_PATH = os.path.join(_PROJECT_ROOT, "dist", "sqlite3")
|
||||||
|
EXT_PATH = os.path.join(_PROJECT_ROOT, "dist", "vec0")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# SQL generation
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def _query_sql_for_config(params, query_id, k):
|
||||||
|
"""Return a SQL query string for a single KNN query by query_vector id."""
|
||||||
|
index_type = params["index_type"]
|
||||||
|
qvec = f"(SELECT vector FROM base.query_vectors WHERE id = {query_id})"
|
||||||
|
|
||||||
|
if index_type == "baseline":
|
||||||
|
variant = params.get("variant", "float")
|
||||||
|
oversample = params.get("oversample", 8)
|
||||||
|
oversample_k = k * oversample
|
||||||
|
|
||||||
|
if variant == "int8":
|
||||||
|
return (
|
||||||
|
f"WITH coarse AS ("
|
||||||
|
f" SELECT id, embedding FROM vec_items"
|
||||||
|
f" WHERE embedding_int8 MATCH vec_quantize_int8({qvec}, 'unit')"
|
||||||
|
f" LIMIT {oversample_k}"
|
||||||
|
f") "
|
||||||
|
f"SELECT id, vec_distance_cosine(embedding, {qvec}) as distance "
|
||||||
|
f"FROM coarse ORDER BY 2 LIMIT {k};"
|
||||||
|
)
|
||||||
|
elif variant == "bit":
|
||||||
|
return (
|
||||||
|
f"WITH coarse AS ("
|
||||||
|
f" SELECT id, embedding FROM vec_items"
|
||||||
|
f" WHERE embedding_bq MATCH vec_quantize_binary({qvec})"
|
||||||
|
f" LIMIT {oversample_k}"
|
||||||
|
f") "
|
||||||
|
f"SELECT id, vec_distance_cosine(embedding, {qvec}) as distance "
|
||||||
|
f"FROM coarse ORDER BY 2 LIMIT {k};"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Default MATCH query (baseline-float, rescore, and others)
|
||||||
|
return (
|
||||||
|
f"SELECT id, distance FROM vec_items"
|
||||||
|
f" WHERE embedding MATCH {qvec} AND k = {k};"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_sql(db_path, params, subset_size, n_queries, k, repeats, base_db=None):
    """Generate a complete SQL workload: load ext, create table, insert, query.

    Args:
        db_path: Not used when building the script; the target database is
            chosen by whoever runs the script through the sqlite3 CLI.
        params: Parsed config; `params["index_type"]` selects the
            INDEX_REGISTRY entry supplying the CREATE TABLE / INSERT SQL.
        subset_size: Number of rows to insert, in batches of
            INSERT_BATCH_SIZE.
        n_queries: Number of distinct query ids (0 .. n_queries - 1).
        k: Neighbors requested per query.
        repeats: How many times the whole query set is emitted.
        base_db: Seed database to attach as `base`; defaults to BASE_DB.

    Returns:
        The workload as a single newline-joined string.
    """
    source_db = os.path.abspath(BASE_DB if base_db is None else base_db)
    # Double any single quote so the path stays a valid SQL string literal.
    attach_literal = source_db.replace("'", "''")

    lines = [
        ".bail on",
        f".load {EXT_PATH}",
        f"ATTACH DATABASE '{attach_literal}' AS base;",
        "PRAGMA page_size=8192;",
    ]

    # Create table
    reg = INDEX_REGISTRY[params["index_type"]]
    lines.append(reg["create_table_sql"](params) + ";")

    # Inserts: an index type may supply its own statement; otherwise use the default.
    sql_fn = reg.get("insert_sql")
    insert_sql = sql_fn(params) if sql_fn else None
    if insert_sql is None:
        insert_sql = DEFAULT_INSERT_SQL
    for lo in range(0, subset_size, INSERT_BATCH_SIZE):
        hi = min(lo + INSERT_BATCH_SIZE, subset_size)
        # The script is fed to the sqlite3 CLI, so the :lo/:hi placeholders
        # are substituted textually rather than bound.
        stmt = insert_sql.replace(":lo", str(lo)).replace(":hi", str(hi))
        lines.append(stmt + ";")
        if hi % 10000 == 0 or hi == subset_size:
            lines.append("-- progress: inserted %d/%d" % (hi, subset_size))

    # Queries (repeated): the query set is identical on every repeat, so
    # build it once and append it `repeats` times.
    lines.append("-- BEGIN QUERIES")
    query_set = [_query_sql_for_config(params, qid, k) for qid in range(n_queries)]
    for _rep in range(repeats):
        lines.extend(query_set)

    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Profiling with macOS `sample`
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def run_profile(sqlite3_path, db_path, sql_file, sample_output, duration=120):
    """Run sqlite3 under macOS `sample` profiler.

    Starts sqlite3 directly with stdin from the SQL file, then immediately
    attaches `sample` to its PID with -mayDie (tolerates process exit).
    The workload must be long enough for sample to attach and capture useful data.

    Args:
        sqlite3_path: Path of the sqlite3 CLI binary to run.
        db_path: Database file passed to sqlite3.
        sql_file: Path of the SQL script fed to sqlite3 on stdin.
        sample_output: File that `sample` writes its report to.
        duration: Maximum sampling time passed to `sample`.

    Returns:
        True if sqlite3 exited with status 0, False otherwise.
    """
    # The context manager closes the script handle on every exit path.
    with open(sql_file, "r") as sql_fd:
        proc = subprocess.Popen(
            [sqlite3_path, db_path],
            stdin=sql_fd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )

        pid = proc.pid
        print(f" sqlite3 PID: {pid}")

        # Attach sample immediately (1ms interval, -mayDie tolerates process exit).
        # Its stderr is discarded: it was never read, and an unread pipe can
        # fill up and block the child while we wait on it.
        try:
            sample_proc = subprocess.Popen(
                ["sample", str(pid), str(duration), "1", "-mayDie", "-file", sample_output],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except OSError:
            # Do not leave the workload running if the profiler cannot start.
            proc.kill()
            proc.wait()
            raise

        # Wait for sqlite3 to finish
        _, stderr = proc.communicate()

    rc = proc.returncode
    if rc != 0:
        print(f" sqlite3 failed (rc={rc}):", file=sys.stderr)
        print(f" {stderr.decode(errors='replace').strip()}", file=sys.stderr)
        sample_proc.kill()
        # Reap the killed profiler so it does not linger as a zombie.
        sample_proc.wait()
        return False

    # Wait for sample to finish
    sample_proc.wait()
    return True
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Parse `sample` output
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Tree-drawing characters used by macOS `sample` to represent hierarchy.
# We replace them with spaces so indentation depth reflects tree depth.
# Note that the substitution is applied to the whole line, so '+', '!', ':'
# and '|' appearing later in the line are blanked out as well.
_TREE_CHARS_RE = re.compile(r"[+!:|]")

# After tree chars are replaced with spaces, each call-graph line looks like:
# " 800 rescore_knn (in vec0.dylib) + 3808,3640,... [0x1a,0x2b,...] file.c:123"
# We extract just (indent, count, symbol, module) — everything after "(in ...)"
# is decoration we don't need.
# Groups: 1 = leading whitespace (at least one character is required),
# 2 = sample count, 3 = the remainder of the line.
_LEADING_RE = re.compile(r"^(\s+)(\d+)\s+(.+)")
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_symbol_and_module(rest):
|
||||||
|
"""Given the text after 'count ', extract (symbol, module).
|
||||||
|
|
||||||
|
Handles patterns like:
|
||||||
|
'rescore_knn (in vec0.dylib) + 3808,3640,... [0x...]'
|
||||||
|
'pread (in libsystem_kernel.dylib) + 8 [0x...]'
|
||||||
|
'??? (in <unknown binary>) [0x...]'
|
||||||
|
'start (in dyld) + 2840 [0x198650274]'
|
||||||
|
'Thread_26759239 DispatchQueue_1: ...'
|
||||||
|
"""
|
||||||
|
# Try to find "(in ...)" to split symbol from module
|
||||||
|
m = re.match(r"^(.+?)\s+\(in\s+(.+?)\)", rest)
|
||||||
|
if m:
|
||||||
|
return m.group(1).strip(), m.group(2).strip()
|
||||||
|
# No module — return whole thing as symbol, strip trailing junk
|
||||||
|
sym = re.sub(r"\s+\[0x[0-9a-f].*", "", rest).strip()
|
||||||
|
return sym, ""
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_call_graph_lines(text):
    """Turn call-graph text into a list of (depth, count, symbol, module).

    Lines that do not start with whitespace followed by a number are
    ignored. `depth` is the width of the leading whitespace once the
    tree-drawing characters have been blanked out.
    """
    parsed = []
    for raw_line in text.split("\n"):
        # Blank the tree-drawing characters so the indent width encodes depth.
        hit = _LEADING_RE.match(_TREE_CHARS_RE.sub(" ", raw_line))
        if hit:
            indent, samples, remainder = hit.groups()
            symbol, module = _extract_symbol_and_module(remainder)
            parsed.append((len(indent), int(samples), symbol, module))
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def parse_sample_output(filepath):
    """Compute exclusive (self) sample counts from a `sample` report.

    Only the text between "Call graph:" and "Total number in stack" (or the
    end of the file) is considered. For each entry, self samples are its
    count minus the counts of its direct children; entries with a positive
    remainder are summed per display name.

    Returns dict of {display_name: self_sample_count}; empty when the
    section is missing or nothing could be parsed.
    """
    with open(filepath, "r") as f:
        report = f.read()

    start = report.find("Call graph:")
    if start == -1:
        print(" Warning: no 'Call graph:' section found in sample output")
        return {}

    stop = report.find("\nTotal number in stack", start)
    section = report[start:] if stop == -1 else report[start:stop]

    entries = _parse_call_graph_lines(section)
    if not entries:
        print(" Warning: no call graph entries parsed")
        return {}

    n_entries = len(entries)
    totals = {}
    for idx, (depth, count, sym, mod) in enumerate(entries):
        # Walk the entries nested under this one; the first deeper entry
        # fixes the depth at which direct children live.
        direct_depth = None
        nested = 0
        cursor = idx + 1
        while cursor < n_entries and entries[cursor][0] > depth:
            sub_depth = entries[cursor][0]
            if direct_depth is None:
                direct_depth = sub_depth
            if sub_depth == direct_depth:
                nested += entries[cursor][1]
            cursor += 1

        own = count - nested
        if own <= 0:
            continue
        label = f"{sym} ({mod})" if mod else sym
        totals[label] = totals.get(label, 0) + own

    return totals
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Display
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def print_profile(title, self_samples, top_n=20):
    """Print the `top_n` entries of `self_samples` with their share of the total.

    `self_samples` maps a display name to its self-sample count. When the
    counts sum to zero a single "(no samples)" header is printed instead.
    """
    total = sum(self_samples.values())
    if total == 0:
        print(f"\n=== {title} (no samples) ===")
        return

    # Highest count first; ties keep their original order.
    ranked = sorted(self_samples.items(), key=lambda item: item[1], reverse=True)

    print(f"\n=== {title} (top {top_n}, {total} total self-samples) ===")
    for sym, count in ranked[:top_n]:
        pct = 100.0 * count / total
        print(f" {pct:5.1f}% {count:>6} {sym}")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Main
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def _print_comparison(all_profiles, top_n):
    """Print a side-by-side table of self-sample percentages across configs."""
    print("\n" + "=" * 80)
    print("COMPARISON")
    print("=" * 80)

    # Collect all symbols that appear in top-N of any config
    all_syms = set()
    for _name, _desc, prof in all_profiles:
        ranked = sorted(prof.items(), key=lambda x: -x[1])
        all_syms.update(sym for sym, _count in ranked[:top_n])

    # Build comparison table: one (pct, count) cell per config for each symbol.
    totals = [sum(prof.values()) for _name, _desc, prof in all_profiles]
    rows = []
    for sym in all_syms:
        cells = []
        for (_name, _desc, prof), total in zip(all_profiles, totals):
            count = prof.get(sym, 0)
            pct = 100.0 * count / total if total > 0 else 0.0
            cells.append((pct, count))
        rows.append((max(pct for pct, _count in cells), sym, cells))

    # Hottest first; the symbol name breaks ties so the output does not
    # depend on set iteration order.
    rows.sort(key=lambda r: (-r[0], r[1]))

    # Header
    header = f"{'function':>40}"
    for name, _desc, _prof in all_profiles:
        header += f" {name:>14}"
    print(header)
    print("-" * len(header))

    for _max_pct, sym, cells in rows[: top_n * 2]:
        display_sym = sym if len(sym) <= 40 else sym[:37] + "..."
        line = f"{display_sym:>40}"
        for pct, count in cells:
            if count > 0:
                line += f" {pct:>13.1f}%"
            else:
                line += f" {'-':>14}"
        print(line)


def main():
    """Profile each requested config and print per-config and comparison tables.

    For every config spec this generates a SQL workload in a temporary
    directory, runs it through the sqlite3 CLI under `sample`, parses the
    report and prints the hottest functions. With more than one config a
    comparison table follows. The temporary directory is removed afterwards
    unless --keep-temp is given.
    """
    # generate_sql() reads the module-level BASE_DB, so --base-db only takes
    # effect if that name is rebound before the workload is generated.
    global BASE_DB

    parser = argparse.ArgumentParser(
        description="CPU profiling for sqlite-vec KNN configurations",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "configs", nargs="+", help="config specs (name:type=X,key=val,...)"
    )
    parser.add_argument("--subset-size", type=int, required=True)
    parser.add_argument("-k", type=int, default=10, help="KNN k (default 10)")
    parser.add_argument(
        "-n", type=int, default=50, help="number of distinct queries (default 50)"
    )
    parser.add_argument(
        "--repeats",
        type=int,
        default=10,
        help="repeat query set N times for more samples (default 10)",
    )
    parser.add_argument(
        "--top", type=int, default=20, help="show top N functions (default 20)"
    )
    parser.add_argument("--base-db", default=BASE_DB)
    parser.add_argument("--sqlite3", default=SQLITE3_PATH)
    parser.add_argument(
        "--keep-temp",
        action="store_true",
        help="keep temp directory with DBs, SQL, and sample output",
    )
    args = parser.parse_args()
    BASE_DB = args.base_db

    # Check prerequisites
    if not os.path.exists(args.base_db):
        print(f"Error: base DB not found at {args.base_db}", file=sys.stderr)
        print("Run 'make seed' in benchmarks-ann/ first.", file=sys.stderr)
        sys.exit(1)

    if not shutil.which("sample"):
        print("Error: macOS 'sample' tool not found.", file=sys.stderr)
        sys.exit(1)

    # Build CLI
    print("Building dist/sqlite3...")
    result = subprocess.run(
        ["make", "cli"], cwd=_PROJECT_ROOT, capture_output=True, text=True
    )
    if result.returncode != 0:
        print(f"Error: make cli failed:\n{result.stderr}", file=sys.stderr)
        sys.exit(1)
    print(" done.")

    if not os.path.exists(args.sqlite3):
        print(f"Error: sqlite3 not found at {args.sqlite3}", file=sys.stderr)
        sys.exit(1)

    configs = [parse_config(c) for c in args.configs]

    tmpdir = tempfile.mkdtemp(prefix="sqlite-vec-profile-")
    print(f"Working directory: {tmpdir}")

    # try/finally so the temp directory (which holds full benchmark DBs) is
    # not leaked when a config raises midway.
    try:
        all_profiles = []

        for i, (name, params) in enumerate(configs, 1):
            reg = INDEX_REGISTRY[params["index_type"]]
            desc = reg["describe"](params)
            print(f"\n[{i}/{len(configs)}] {name} ({desc})")

            # Generate SQL workload
            db_path = os.path.join(tmpdir, f"{name}.db")
            sql_text = generate_sql(
                db_path, params, args.subset_size, args.n, args.k, args.repeats
            )
            sql_file = os.path.join(tmpdir, f"{name}.sql")
            with open(sql_file, "w") as f:
                f.write(sql_text)

            total_queries = args.n * args.repeats
            print(
                f" SQL workload: {args.subset_size} inserts + "
                f"{total_queries} queries ({args.n} x {args.repeats} repeats)"
            )

            # Profile
            sample_file = os.path.join(tmpdir, f"{name}.sample.txt")
            print(" Profiling...")
            ok = run_profile(args.sqlite3, db_path, sql_file, sample_file)
            if not ok:
                print(f" FAILED — skipping {name}")
                all_profiles.append((name, desc, {}))
                continue

            if not os.path.exists(sample_file):
                print(" Warning: sample output not created")
                all_profiles.append((name, desc, {}))
                continue

            # Parse
            self_samples = parse_sample_output(sample_file)
            all_profiles.append((name, desc, self_samples))

            # Show individual profile
            print_profile(f"{name} ({desc})", self_samples, args.top)

        # Side-by-side comparison if multiple configs
        if len(all_profiles) > 1:
            _print_comparison(all_profiles, args.top)
    finally:
        if args.keep_temp:
            print(f"\nTemp files kept at: {tmpdir}")
        else:
            shutil.rmtree(tmpdir, ignore_errors=True)
            print("\nTemp files cleaned up. Use --keep-temp to preserve.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
35
benchmarks-ann/schema.sql
Normal file
35
benchmarks-ann/schema.sql
Normal file
|
|
@ -0,0 +1,35 @@
|
||||||
|
-- Canonical results schema for vec0 KNN benchmark comparisons.
|
||||||
|
-- The index_type column is a free-form TEXT field. Baseline configs use
|
||||||
|
-- "baseline"; index-specific branches add their own types (registered
|
||||||
|
-- via INDEX_REGISTRY in bench.py).
|
||||||
|
|
||||||
|
-- One row per (config_name, subset_size) describing how an index was built.
-- Column meanings below are inferred from their names; confirm against the
-- code that writes this table.
CREATE TABLE IF NOT EXISTS build_results (
  config_name TEXT NOT NULL,
  index_type TEXT NOT NULL,
  subset_size INTEGER NOT NULL,
  db_path TEXT NOT NULL,
  insert_time_s REAL NOT NULL,
  train_time_s REAL, -- NULL when no training/build step is needed
  total_time_s REAL NOT NULL,
  rows INTEGER NOT NULL,
  file_size_mb REAL NOT NULL,
  -- Filled in by SQLite at insert time when the writer omits it.
  created_at TEXT NOT NULL DEFAULT (datetime('now')),
  PRIMARY KEY (config_name, subset_size)
);
|
||||||
|
|
||||||
|
-- One row per (config_name, subset_size, k) holding KNN query measurements.
-- NOTE(review): n is presumably the number of queries and qps the
-- queries-per-second rate; confirm against the code that writes this table.
CREATE TABLE IF NOT EXISTS bench_results (
  config_name TEXT NOT NULL,
  index_type TEXT NOT NULL,
  subset_size INTEGER NOT NULL,
  k INTEGER NOT NULL,
  n INTEGER NOT NULL,
  mean_ms REAL NOT NULL,
  median_ms REAL NOT NULL,
  p99_ms REAL NOT NULL,
  total_ms REAL NOT NULL,
  qps REAL NOT NULL,
  recall REAL NOT NULL,
  db_path TEXT NOT NULL,
  -- Filled in by SQLite at insert time when the writer omits it.
  created_at TEXT NOT NULL DEFAULT (datetime('now')),
  PRIMARY KEY (config_name, subset_size, k)
);
|
||||||
2
benchmarks-ann/seed/.gitignore
vendored
Normal file
2
benchmarks-ann/seed/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
*.parquet
|
||||||
|
base.db
|
||||||
24
benchmarks-ann/seed/Makefile
Normal file
24
benchmarks-ann/seed/Makefile
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
BASE_URL = https://assets.zilliz.com/benchmark/cohere_medium_1m

PARQUETS = train.parquet test.parquet neighbors.parquet

# base.db is a real file: listing it as phony would force the slow rebuild
# on every invocation, so only the non-file targets are declared here.
.PHONY: all download clean

# Delete a target whose recipe failed, so an interrupted download or build
# is not mistaken for an up-to-date file on the next run.
.DELETE_ON_ERROR:

all: base.db

download: $(PARQUETS)

# -f makes curl exit non-zero on HTTP errors instead of saving the error
# page under the .parquet name.
train.parquet:
	curl -fL -o $@ $(BASE_URL)/train.parquet

test.parquet:
	curl -fL -o $@ $(BASE_URL)/test.parquet

neighbors.parquet:
	curl -fL -o $@ $(BASE_URL)/neighbors.parquet

base.db: $(PARQUETS) build_base_db.py
	uv run --with pandas --with pyarrow python build_base_db.py

# build_base_db.py uses WAL mode, so remove its sidecar files as well.
clean:
	rm -f base.db base.db-wal base.db-shm
|
||||||
121
benchmarks-ann/seed/build_base_db.py
Normal file
121
benchmarks-ann/seed/build_base_db.py
Normal file
|
|
@ -0,0 +1,121 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Build base.db from downloaded parquet files.
|
||||||
|
|
||||||
|
Reads train.parquet, test.parquet, neighbors.parquet and creates a SQLite
|
||||||
|
database with tables: train, query_vectors, neighbors.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
uv run --with pandas --with pyarrow python build_base_db.py
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import struct
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def float_list_to_blob(floats):
    """Serialize a sequence of floats as consecutive little-endian float32 values."""
    count = len(floats)
    layout = "<%df" % count
    return struct.pack(layout, *floats)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Build base.db next to this script from the three downloaded parquet files.

    Creates the tables `query_vectors` (from test.parquet), `neighbors`
    (from neighbors.parquet, one row per query/rank pair) and `train`
    (from train.parquet, inserted in batches). Any existing base.db is
    replaced. Exits with status 1 if a parquet file is missing.
    """
    seed_dir = os.path.dirname(os.path.abspath(__file__))
    db_path = os.path.join(seed_dir, "base.db")

    train_path = os.path.join(seed_dir, "train.parquet")
    test_path = os.path.join(seed_dir, "test.parquet")
    neighbors_path = os.path.join(seed_dir, "neighbors.parquet")

    for p in (train_path, test_path, neighbors_path):
        if not os.path.exists(p):
            print(f"ERROR: {p} not found. Run 'make download' first.")
            sys.exit(1)

    # Remove the old database together with any WAL sidecar files left by an
    # interrupted run, so they cannot be paired with the new database.
    for suffix in ("", "-wal", "-shm"):
        stale = db_path + suffix
        if os.path.exists(stale):
            os.remove(stale)

    conn = sqlite3.connect(db_path)
    try:
        # page_size has to be set before journal_mode=WAL: it is fixed once
        # the database is created and cannot be changed in WAL mode.
        conn.execute("PRAGMA page_size=4096")
        conn.execute("PRAGMA journal_mode=WAL")

        # --- query_vectors (from test.parquet) ---
        print("Loading test.parquet (query vectors)...")
        t0 = time.perf_counter()
        df_test = pd.read_parquet(test_path)
        conn.execute(
            "CREATE TABLE query_vectors (id INTEGER PRIMARY KEY, vector BLOB)"
        )
        # Column-wise iteration avoids building a Series per row (iterrows).
        rows = [
            (int(vec_id), float_list_to_blob(emb))
            for vec_id, emb in zip(df_test["id"], df_test["emb"])
        ]
        conn.executemany("INSERT INTO query_vectors (id, vector) VALUES (?, ?)", rows)
        conn.commit()
        print(f" {len(rows)} query vectors in {time.perf_counter() - t0:.1f}s")

        # --- neighbors (from neighbors.parquet) ---
        print("Loading neighbors.parquet...")
        t0 = time.perf_counter()
        df_neighbors = pd.read_parquet(neighbors_path)
        conn.execute(
            "CREATE TABLE neighbors ("
            " query_vector_id INTEGER, rank INTEGER, neighbors_id TEXT,"
            " UNIQUE(query_vector_id, rank))"
        )
        rows = []
        for qid, nids in zip(df_neighbors["id"], df_neighbors["neighbors_id"]):
            qid = int(qid)
            # neighbors_id may be a numpy array or JSON string
            if isinstance(nids, str):
                nids = json.loads(nids)
            rows.extend((qid, rank, str(int(nid))) for rank, nid in enumerate(nids))
        conn.executemany(
            "INSERT INTO neighbors (query_vector_id, rank, neighbors_id) VALUES (?, ?, ?)",
            rows,
        )
        conn.commit()
        print(f" {len(rows)} neighbor rows in {time.perf_counter() - t0:.1f}s")

        # --- train (from train.parquet) ---
        print("Loading train.parquet (1M vectors, this takes a few minutes)...")
        t0 = time.perf_counter()
        conn.execute(
            "CREATE TABLE train (id INTEGER PRIMARY KEY, vector BLOB)"
        )

        batch_size = 10000
        # The whole file is read at once; only the inserts are batched.
        df_train = pd.read_parquet(train_path)
        total = len(df_train)

        for start in range(0, total, batch_size):
            chunk = df_train.iloc[start : start + batch_size]
            rows = [
                (int(vec_id), float_list_to_blob(emb))
                for vec_id, emb in zip(chunk["id"], chunk["emb"])
            ]
            conn.executemany("INSERT INTO train (id, vector) VALUES (?, ?)", rows)
            conn.commit()

            done = min(start + batch_size, total)
            elapsed = time.perf_counter() - t0
            rate = done / elapsed if elapsed > 0 else 0
            eta = (total - done) / rate if rate > 0 else 0
            print(
                f" {done:>8}/{total} {elapsed:.0f}s {rate:.0f} rows/s eta {eta:.0f}s",
                flush=True,
            )

        elapsed = time.perf_counter() - t0
        print(f" {total} train vectors in {elapsed:.1f}s")
    finally:
        # Always release the database handle, including when a load fails.
        conn.close()

    size_mb = os.path.getsize(db_path) / (1024 * 1024)
    print(f"\nDone: {db_path} ({size_mb:.0f} MB)")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -248,59 +248,6 @@ def bench_libsql(base, query, page_size, k) -> BenchResult:
|
||||||
return BenchResult(f"libsql ({page_size})", build_time, times)
|
return BenchResult(f"libsql ({page_size})", build_time, times)
|
||||||
|
|
||||||
|
|
||||||
def register_np(db, array, name):
|
|
||||||
ptr = array.__array_interface__["data"][0]
|
|
||||||
nvectors, dimensions = array.__array_interface__["shape"]
|
|
||||||
element_type = array.__array_interface__["typestr"]
|
|
||||||
|
|
||||||
assert element_type == "<f4"
|
|
||||||
|
|
||||||
name_escaped = db.execute("select printf('%w', ?)", [name]).fetchone()[0]
|
|
||||||
|
|
||||||
db.execute(
|
|
||||||
"insert into temp.vec_static_blobs(name, data) select ?, vec_static_blob_from_raw(?, ?, ?, ?)",
|
|
||||||
[name, ptr, element_type, dimensions, nvectors],
|
|
||||||
)
|
|
||||||
|
|
||||||
db.execute(
|
|
||||||
f'create virtual table "{name_escaped}" using vec_static_blob_entries({name_escaped})'
|
|
||||||
)
|
|
||||||
|
|
||||||
def bench_sqlite_vec_static(base, query, k) -> BenchResult:
|
|
||||||
print(f"sqlite-vec static...")
|
|
||||||
|
|
||||||
db = sqlite3.connect(":memory:")
|
|
||||||
db.enable_load_extension(True)
|
|
||||||
db.load_extension("../../dist/vec0")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
t = time.time()
|
|
||||||
register_np(db, base, "base")
|
|
||||||
build_time = time.time() - t
|
|
||||||
|
|
||||||
times = []
|
|
||||||
results = []
|
|
||||||
for (
|
|
||||||
idx,
|
|
||||||
q,
|
|
||||||
) in enumerate(query):
|
|
||||||
t0 = time.time()
|
|
||||||
result = db.execute(
|
|
||||||
"""
|
|
||||||
select
|
|
||||||
rowid
|
|
||||||
from base
|
|
||||||
where vector match ?
|
|
||||||
and k = ?
|
|
||||||
order by distance
|
|
||||||
""",
|
|
||||||
[q.tobytes(), k],
|
|
||||||
).fetchall()
|
|
||||||
assert len(result) == k
|
|
||||||
times.append(time.time() - t0)
|
|
||||||
return BenchResult(f"sqlite-vec static", build_time, times)
|
|
||||||
|
|
||||||
def bench_faiss(base, query, k) -> BenchResult:
|
def bench_faiss(base, query, k) -> BenchResult:
|
||||||
import faiss
|
import faiss
|
||||||
dimensions = base.shape[1]
|
dimensions = base.shape[1]
|
||||||
|
|
@ -438,8 +385,6 @@ def suite(name, base, query, k, benchmarks):
|
||||||
for b in benchmarks:
|
for b in benchmarks:
|
||||||
if b == "faiss":
|
if b == "faiss":
|
||||||
results.append(bench_faiss(base, query, k=k))
|
results.append(bench_faiss(base, query, k=k))
|
||||||
elif b == "vec-static":
|
|
||||||
results.append(bench_sqlite_vec_static(base, query, k=k))
|
|
||||||
elif b.startswith("vec-scalar"):
|
elif b.startswith("vec-scalar"):
|
||||||
_, page_size = b.split('.')
|
_, page_size = b.split('.')
|
||||||
results.append(bench_sqlite_vec_scalar(base, query, page_size, k=k))
|
results.append(bench_sqlite_vec_scalar(base, query, page_size, k=k))
|
||||||
|
|
@ -541,7 +486,7 @@ def parse_args():
|
||||||
help="Number of queries to use. Defaults all",
|
help="Number of queries to use. Defaults all",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-x", help="type of runs to make", default="faiss,vec-scalar.4096,vec-static,vec-vec0.4096.16,usearch,duckdb,hnswlib,numpy"
|
"-x", help="type of runs to make", default="faiss,vec-scalar.4096,vec-vec0.4096.16,usearch,duckdb,hnswlib,numpy"
|
||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
|
||||||
|
|
@ -8,10 +8,3 @@ create virtual table vec_items using vec0(
|
||||||
embedding float[1536]
|
embedding float[1536]
|
||||||
);
|
);
|
||||||
|
|
||||||
-- 65s (limit 1e5), ~615MB on disk
|
|
||||||
insert into vec_items
|
|
||||||
select
|
|
||||||
rowid,
|
|
||||||
vector
|
|
||||||
from vec_npy_each(vec_npy_file('examples/dbpedia-openai/data/vectors.npy'))
|
|
||||||
limit 1e5;
|
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,6 @@ def connect(path):
|
||||||
db = sqlite3.connect(path)
|
db = sqlite3.connect(path)
|
||||||
db.enable_load_extension(True)
|
db.enable_load_extension(True)
|
||||||
db.load_extension("../dist/vec0")
|
db.load_extension("../dist/vec0")
|
||||||
db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
|
|
||||||
db.enable_load_extension(False)
|
db.enable_load_extension(False)
|
||||||
return db
|
return db
|
||||||
|
|
||||||
|
|
@ -18,8 +17,6 @@ page_sizes = [ # 4096, 8192,
|
||||||
chunk_sizes = [128, 256, 1024, 2048]
|
chunk_sizes = [128, 256, 1024, 2048]
|
||||||
types = ["f32", "int8", "bit"]
|
types = ["f32", "int8", "bit"]
|
||||||
|
|
||||||
SRC = "../examples/dbpedia-openai/data/vectors.npy"
|
|
||||||
|
|
||||||
for page_size in page_sizes:
|
for page_size in page_sizes:
|
||||||
for chunk_size in chunk_sizes:
|
for chunk_size in chunk_sizes:
|
||||||
for t in types:
|
for t in types:
|
||||||
|
|
@ -42,15 +39,8 @@ for page_size in page_sizes:
|
||||||
func = "vec_quantize_i8(vector, 'unit')"
|
func = "vec_quantize_i8(vector, 'unit')"
|
||||||
if t == "bit":
|
if t == "bit":
|
||||||
func = "vec_quantize_binary(vector)"
|
func = "vec_quantize_binary(vector)"
|
||||||
db.execute(
|
# TODO: replace with non-npy data loading
|
||||||
f"""
|
pass
|
||||||
insert into vec_items
|
|
||||||
select rowid, {func}
|
|
||||||
from vec_npy_each(vec_npy_file(?))
|
|
||||||
limit 100000
|
|
||||||
""",
|
|
||||||
[SRC],
|
|
||||||
)
|
|
||||||
elapsed = time.time() - t0
|
elapsed = time.time() - t0
|
||||||
print(elapsed)
|
print(elapsed)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,6 @@ index ed2aaec..4cc0b0e 100755
|
||||||
-Wl,--initial-memory=327680 \
|
-Wl,--initial-memory=327680 \
|
||||||
-D_HAVE_SQLITE_CONFIG_H \
|
-D_HAVE_SQLITE_CONFIG_H \
|
||||||
-DSQLITE_CUSTOM_INCLUDE=sqlite_opt.h \
|
-DSQLITE_CUSTOM_INCLUDE=sqlite_opt.h \
|
||||||
+ -DSQLITE_VEC_OMIT_FS=1 \
|
|
||||||
$(awk '{print "-Wl,--export="$0}' exports.txt)
|
$(awk '{print "-Wl,--export="$0}' exports.txt)
|
||||||
|
|
||||||
"$BINARYEN/wasm-ctor-eval" -g -c _initialize sqlite3.wasm -o sqlite3.tmp
|
"$BINARYEN/wasm-ctor-eval" -g -c _initialize sqlite3.wasm -o sqlite3.tmp
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
from typing import List
|
from typing import List
|
||||||
from struct import pack
|
from struct import pack
|
||||||
from sqlite3 import Connection
|
|
||||||
|
|
||||||
|
|
||||||
def serialize_float32(vector: List[float]) -> bytes:
|
def serialize_float32(vector: List[float]) -> bytes:
|
||||||
|
|
@ -13,33 +12,3 @@ def serialize_int8(vector: List[int]) -> bytes:
|
||||||
return pack("%sb" % len(vector), *vector)
|
return pack("%sb" % len(vector), *vector)
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
import numpy.typing as npt
|
|
||||||
|
|
||||||
def register_numpy(db: Connection, name: str, array: npt.NDArray):
|
|
||||||
"""ayoo"""
|
|
||||||
|
|
||||||
ptr = array.__array_interface__["data"][0]
|
|
||||||
nvectors, dimensions = array.__array_interface__["shape"]
|
|
||||||
element_type = array.__array_interface__["typestr"]
|
|
||||||
|
|
||||||
assert element_type == "<f4"
|
|
||||||
|
|
||||||
name_escaped = db.execute("select printf('%w', ?)", [name]).fetchone()[0]
|
|
||||||
|
|
||||||
db.execute(
|
|
||||||
"""
|
|
||||||
insert into temp.vec_static_blobs(name, data)
|
|
||||||
select ?, vec_static_blob_from_raw(?, ?, ?, ?)
|
|
||||||
""",
|
|
||||||
[name, ptr, element_type, dimensions, nvectors],
|
|
||||||
)
|
|
||||||
|
|
||||||
db.execute(
|
|
||||||
f'create virtual table "{name_escaped}" using vec_static_blob_entries({name_escaped})'
|
|
||||||
)
|
|
||||||
|
|
||||||
except ImportError:
|
|
||||||
|
|
||||||
def register_numpy(db: Connection, name: str, array):
|
|
||||||
raise Exception("numpy package is required for register_numpy")
|
|
||||||
|
|
|
||||||
119
scripts/amalgamate.py
Normal file
119
scripts/amalgamate.py
Normal file
|
|
@ -0,0 +1,119 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Amalgamate sqlite-vec into a single distributable .c file.
|
||||||
|
|
||||||
|
Reads the dev sqlite-vec.c and inlines any #include "sqlite-vec-*.c" files,
|
||||||
|
stripping LSP-support blocks and per-file include guards.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/amalgamate.py sqlite-vec.c > dist/sqlite-vec.c
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def strip_lsp_block(content):
    """Remove every occurrence of the LSP-support pattern from ``content``.

    The pattern is the three-line sequence::

        #ifndef SQLITE_VEC_H
        #include "sqlite-vec.c" // ...
        #endif

    Args:
        content: Text of a C source file.

    Returns:
        ``content`` with each occurrence of the pattern deleted. Text that
        does not contain the pattern is returned unchanged.
    """
    pattern = re.compile(
        r'^\s*#ifndef\s+SQLITE_VEC_H\s*\n'
        r'\s*#include\s+"sqlite-vec\.c"[^\n]*\n'
        # Accept end-of-text as well as a newline, so a block that closes the
        # input without a trailing newline is still removed.
        r'\s*#endif[^\n]*(?:\n|\Z)',
        re.MULTILINE,
    )
    return pattern.sub('', content)
|
||||||
|
|
||||||
|
|
||||||
|
def strip_include_guard(content, guard_macro):
    """Remove the include guard named ``guard_macro`` from ``content``.

    The guard is the pair::

        #ifndef GUARD_MACRO
        #define GUARD_MACRO
        ...content...
        #endif

    The opening ``#ifndef``/``#define`` lines are removed first; the closing
    ``#endif`` is taken to be the last ``#endif`` line in the text.

    Args:
        content: Text of a C source file.
        guard_macro: Name of the guard macro; matched literally.

    Returns:
        The text without the guard, ending in exactly one newline. If the
        opening pair is not present, ``content`` is returned unchanged.
    """
    header_pattern = re.compile(
        r'^\s*#ifndef\s+' + re.escape(guard_macro) + r'\s*\n'
        r'\s*#define\s+' + re.escape(guard_macro) + r'\s*\n',
        re.MULTILINE,
    )
    content, removed = header_pattern.subn('', content, count=1)
    if not removed:
        # Without the opening pair there is no guard to close. Dropping the
        # last #endif anyway would unbalance some other conditional block.
        return content

    # The guard's closing #endif is assumed to be the last one in the file.
    lines = content.rstrip('\n').split('\n')
    for index in range(len(lines) - 1, -1, -1):
        if re.match(r'^\s*#endif', lines[index]):
            del lines[index]
            break

    return '\n'.join(lines) + '\n'
|
||||||
|
|
||||||
|
|
||||||
|
def detect_include_guard(content):
    """Detect an include guard macro like SQLITE_VEC_IVF_C.

    The guard must open the text, optionally preceded by whitespace and a
    single ``/* ... */`` block comment, as an ``#ifndef NAME`` line followed
    immediately by a bare ``#define NAME`` line. ``NAME`` must have the form
    ``SQLITE_VEC_<something>_C``.

    Args:
        content: Text of a C source file.

    Returns:
        The guard macro name, or ``None`` when the text does not start with
        such a guard.
    """
    m = re.match(
        r'\s*(?:/\*[\s\S]*?\*/\s*)?'  # optional block comment
        r'#ifndef\s+(SQLITE_VEC_\w+_C)\s*\n'
        # The #define line must end right after the macro name. Otherwise a
        # longer macro (NAME_EXTRA) or a value-carrying define (NAME 1) would
        # be reported, and strip_include_guard only removes a bare define.
        r'#define\s+\1[ \t]*\r?\n',
        content,
    )
    return m.group(1) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def inline_include(match, base_dir):
    """Expand one matched ``#include "sqlite-vec-*.c"`` line.

    Args:
        match: Regex match whose group 1 is the included file name and whose
            group 0 is the whole matched ``#include`` text.
        base_dir: Directory the file name is resolved against.

    Returns:
        The file's text with its LSP-support block and include guard (if one
        is detected) removed, wrapped in "Begin inlined" / "End inlined"
        banner comments. If the file does not exist, a warning is printed to
        stderr and the matched text is returned untouched.
    """
    filename = match.group(1)
    filepath = os.path.join(base_dir, filename)

    if os.path.exists(filepath):
        with open(filepath, 'r') as source:
            body = source.read()

        # Drop the LSP block first, then the guard if the file has one.
        body = strip_lsp_block(body)
        guard = detect_include_guard(body)
        if guard:
            body = strip_include_guard(body, guard)

        rule = '/' * 78
        pieces = [
            f'\n{rule}\n// Begin inlined: {filename}\n{rule}\n\n',
            body.strip('\n'),
            f'\n{rule}\n// End inlined: {filename}\n{rule}\n',
        ]
        return ''.join(pieces)

    print(f"Warning: {filepath} not found, leaving #include in place", file=sys.stderr)
    return match.group(0)
|
||||||
|
|
||||||
|
|
||||||
|
def amalgamate(input_path):
    """Return the text of ``input_path`` with fragment includes expanded.

    Each line of the form ``#include "sqlite-vec-*.c"`` is replaced by the
    result of ``inline_include``; file names are resolved relative to the
    directory that contains ``input_path``. All other text is left as read.

    Args:
        input_path: Path of the C source file to amalgamate.

    Returns:
        The amalgamated source text.
    """
    source_dir = os.path.dirname(os.path.abspath(input_path))

    with open(input_path, 'r') as handle:
        text = handle.read()

    def expand(found):
        # Resolve the include against the directory of the input file.
        return inline_include(found, source_dir)

    return re.sub(
        r'^#include\s+"(sqlite-vec-[^"]+\.c)"\s*$',
        expand,
        text,
        flags=re.MULTILINE,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: amalgamate one file and write it to stdout.

    Expects exactly one argument, the input file path. With any other
    argument count a usage line is printed to stderr and the process exits
    with status 1.
    """
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print(f"Usage: {sys.argv[0]} <input-file>", file=sys.stderr)
        sys.exit(1)

    (input_path,) = arguments
    sys.stdout.write(amalgamate(input_path))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
|
|
@ -568,65 +568,6 @@ select 'todo';
|
||||||
-- 'todo'
|
-- 'todo'
|
||||||
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
## NumPy Utilities {#numpy}
|
|
||||||
|
|
||||||
Functions to read data from or work with [NumPy arrays](https://numpy.org/doc/stable/reference/generated/numpy.array.html).
|
|
||||||
|
|
||||||
### `vec_npy_each(vector)` {#vec_npy_each}
|
|
||||||
|
|
||||||
xxx
|
|
||||||
|
|
||||||
|
|
||||||
```sql
|
|
||||||
-- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone()
|
|
||||||
select
|
|
||||||
rowid,
|
|
||||||
vector,
|
|
||||||
vec_type(vector),
|
|
||||||
vec_to_json(vector)
|
|
||||||
from vec_npy_each(
|
|
||||||
X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040'
|
|
||||||
)
|
|
||||||
/*
|
|
||||||
┌───────┬─────────────┬──────────────────┬─────────────────────┐
|
|
||||||
│ rowid │ vector │ vec_type(vector) │ vec_to_json(vector) │
|
|
||||||
├───────┼─────────────┼──────────────────┼─────────────────────┤
|
|
||||||
│ 0 │ X'0000803F' │ 'float32' │ '[1.000000]' │
|
|
||||||
├───────┼─────────────┼──────────────────┼─────────────────────┤
|
|
||||||
│ 1 │ X'00000040' │ 'float32' │ '[2.000000]' │
|
|
||||||
├───────┼─────────────┼──────────────────┼─────────────────────┤
|
|
||||||
│ 2 │ X'00004040' │ 'float32' │ '[3.000000]' │
|
|
||||||
└───────┴─────────────┴──────────────────┴─────────────────────┘
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
-- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone()
|
|
||||||
select
|
|
||||||
rowid,
|
|
||||||
vector,
|
|
||||||
vec_type(vector),
|
|
||||||
vec_to_json(vector)
|
|
||||||
from vec_npy_each(
|
|
||||||
X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040'
|
|
||||||
)
|
|
||||||
/*
|
|
||||||
┌───────┬─────────────┬──────────────────┬─────────────────────┐
|
|
||||||
│ rowid │ vector │ vec_type(vector) │ vec_to_json(vector) │
|
|
||||||
├───────┼─────────────┼──────────────────┼─────────────────────┤
|
|
||||||
│ 0 │ X'0000803F' │ 'float32' │ '[1.000000]' │
|
|
||||||
├───────┼─────────────┼──────────────────┼─────────────────────┤
|
|
||||||
│ 1 │ X'00000040' │ 'float32' │ '[2.000000]' │
|
|
||||||
├───────┼─────────────┼──────────────────┼─────────────────────┤
|
|
||||||
│ 2 │ X'00004040' │ 'float32' │ '[3.000000]' │
|
|
||||||
└───────┴─────────────┴──────────────────┴─────────────────────┘
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Meta {#meta}
|
## Meta {#meta}
|
||||||
|
|
|
||||||
|
|
@ -59,5 +59,4 @@ The current compile-time flags are:
|
||||||
|
|
||||||
- `SQLITE_VEC_ENABLE_AVX`, enables AVX CPU instructions for some vector search operations
|
- `SQLITE_VEC_ENABLE_AVX`, enables AVX CPU instructions for some vector search operations
|
||||||
- `SQLITE_VEC_ENABLE_NEON`, enables NEON CPU instructions for some vector search operations
|
- `SQLITE_VEC_ENABLE_NEON`, enables NEON CPU instructions for some vector search operations
|
||||||
- `SQLITE_VEC_OMIT_FS`, removes some obsure SQL functions and features that use the filesystem, meant for some WASM builds where there's no available filesystem
|
|
||||||
- `SQLITE_VEC_STATIC`, meant for statically linking `sqlite-vec`
|
- `SQLITE_VEC_STATIC`, meant for statically linking `sqlite-vec`
|
||||||
|
|
|
||||||
1863
sqlite-vec.c
1863
sqlite-vec.c
File diff suppressed because it is too large
Load diff
|
|
@ -48,7 +48,6 @@ import json
|
||||||
db = sqlite3.connect(":memory:")
|
db = sqlite3.connect(":memory:")
|
||||||
db.enable_load_extension(True)
|
db.enable_load_extension(True)
|
||||||
db.load_extension("../../dist/vec0")
|
db.load_extension("../../dist/vec0")
|
||||||
db.execute("select load_extension('../../dist/vec0', 'sqlite3_vec_fs_read_init')")
|
|
||||||
db.enable_load_extension(False)
|
db.enable_load_extension(False)
|
||||||
|
|
||||||
results = db.execute(
|
results = db.execute(
|
||||||
|
|
@ -75,17 +74,21 @@ print(b)
|
||||||
|
|
||||||
db.execute('PRAGMA page_size=16384')
|
db.execute('PRAGMA page_size=16384')
|
||||||
|
|
||||||
print("Loading into sqlite-vec vec0 table...")
|
|
||||||
t0 = time.time()
|
|
||||||
db.execute("create virtual table v using vec0(a float[3072], chunk_size=16)")
|
|
||||||
db.execute('insert into v select rowid, vector from vec_npy_each(vec_npy_file("dbpedia_openai_3_large_00.npy"))')
|
|
||||||
print(time.time() - t0)
|
|
||||||
|
|
||||||
print("loading numpy array...")
|
print("loading numpy array...")
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
base = np.load('dbpedia_openai_3_large_00.npy')
|
base = np.load('dbpedia_openai_3_large_00.npy')
|
||||||
print(time.time() - t0)
|
print(time.time() - t0)
|
||||||
|
|
||||||
|
print("Loading into sqlite-vec vec0 table...")
|
||||||
|
t0 = time.time()
|
||||||
|
db.execute("create virtual table v using vec0(a float[3072], chunk_size=16)")
|
||||||
|
with db:
|
||||||
|
db.executemany(
|
||||||
|
"insert into v(rowid, a) values (?, ?)",
|
||||||
|
[(i, row.tobytes()) for i, row in enumerate(base)],
|
||||||
|
)
|
||||||
|
print(time.time() - t0)
|
||||||
|
|
||||||
np.random.seed(1)
|
np.random.seed(1)
|
||||||
queries = base[np.random.choice(base.shape[0], 20, replace=False), :]
|
queries = base[np.random.choice(base.shape[0], 20, replace=False), :]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,37 +0,0 @@
|
||||||
#include <stdint.h>
|
|
||||||
#include <stddef.h>
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include "sqlite-vec.h"
|
|
||||||
#include "sqlite3.h"
|
|
||||||
#include <assert.h>
|
|
||||||
|
|
||||||
extern int sqlite3_vec_numpy_init(sqlite3 *db, char **pzErrMsg,
|
|
||||||
const sqlite3_api_routines *pApi);
|
|
||||||
|
|
||||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
|
||||||
int rc = SQLITE_OK;
|
|
||||||
sqlite3 *db;
|
|
||||||
sqlite3_stmt *stmt;
|
|
||||||
|
|
||||||
rc = sqlite3_open(":memory:", &db);
|
|
||||||
assert(rc == SQLITE_OK);
|
|
||||||
rc = sqlite3_vec_init(db, NULL, NULL);
|
|
||||||
assert(rc == SQLITE_OK);
|
|
||||||
rc = sqlite3_vec_numpy_init(db, NULL, NULL);
|
|
||||||
assert(rc == SQLITE_OK);
|
|
||||||
|
|
||||||
rc = sqlite3_prepare_v2(db, "select * from vec_npy_each(?)", -1, &stmt, NULL);
|
|
||||||
assert(rc == SQLITE_OK);
|
|
||||||
sqlite3_bind_blob(stmt, 1, data, size, SQLITE_STATIC);
|
|
||||||
rc = sqlite3_step(stmt);
|
|
||||||
while (rc == SQLITE_ROW) {
|
|
||||||
rc = sqlite3_step(stmt);
|
|
||||||
}
|
|
||||||
|
|
||||||
sqlite3_finalize(stmt);
|
|
||||||
sqlite3_close(db);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
int min_idx(
|
int min_idx(
|
||||||
const float *distances,
|
const float *distances,
|
||||||
|
|
@ -62,12 +63,17 @@ enum Vec0DistanceMetrics {
|
||||||
VEC0_DISTANCE_METRIC_L1 = 3,
|
VEC0_DISTANCE_METRIC_L1 = 3,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum Vec0IndexType {
|
||||||
|
VEC0_INDEX_TYPE_FLAT = 1,
|
||||||
|
};
|
||||||
|
|
||||||
struct VectorColumnDefinition {
|
struct VectorColumnDefinition {
|
||||||
char *name;
|
char *name;
|
||||||
int name_length;
|
int name_length;
|
||||||
size_t dimensions;
|
size_t dimensions;
|
||||||
enum VectorElementType element_type;
|
enum VectorElementType element_type;
|
||||||
enum Vec0DistanceMetrics distance_metric;
|
enum Vec0DistanceMetrics distance_metric;
|
||||||
|
enum Vec0IndexType index_type;
|
||||||
};
|
};
|
||||||
|
|
||||||
int vec0_parse_vector_column(const char *source, int source_length,
|
int vec0_parse_vector_column(const char *source, int source_length,
|
||||||
|
|
|
||||||
|
|
@ -119,148 +119,6 @@ FUNCTIONS = [
|
||||||
MODULES = [
|
MODULES = [
|
||||||
"vec0",
|
"vec0",
|
||||||
"vec_each",
|
"vec_each",
|
||||||
# "vec_static_blob_entries",
|
|
||||||
# "vec_static_blobs",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def register_numpy(db, name: str, array):
|
|
||||||
ptr = array.__array_interface__["data"][0]
|
|
||||||
nvectors, dimensions = array.__array_interface__["shape"]
|
|
||||||
element_type = array.__array_interface__["typestr"]
|
|
||||||
|
|
||||||
assert element_type == "<f4"
|
|
||||||
|
|
||||||
name_escaped = db.execute("select printf('%w', ?)", [name]).fetchone()[0]
|
|
||||||
|
|
||||||
db.execute(
|
|
||||||
"""
|
|
||||||
insert into temp.vec_static_blobs(name, data)
|
|
||||||
select ?, vec_static_blob_from_raw(?, ?, ?, ?)
|
|
||||||
""",
|
|
||||||
[name, ptr, element_type, dimensions, nvectors],
|
|
||||||
)
|
|
||||||
|
|
||||||
db.execute(
|
|
||||||
f'create virtual table "{name_escaped}" using vec_static_blob_entries({name_escaped})'
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_vec_static_blob_entries():
|
|
||||||
db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_static_blobs_init")
|
|
||||||
|
|
||||||
x = np.array([[0.1, 0.2, 0.3, 0.4], [0.9, 0.8, 0.7, 0.6]], dtype=np.float32)
|
|
||||||
y = np.array([[0.2, 0.3], [0.9, 0.8], [0.6, 0.5]], dtype=np.float32)
|
|
||||||
z = np.array(
|
|
||||||
[
|
|
||||||
[0.1, 0.1, 0.1, 0.1],
|
|
||||||
[0.2, 0.2, 0.2, 0.2],
|
|
||||||
[0.3, 0.3, 0.3, 0.3],
|
|
||||||
[0.4, 0.4, 0.4, 0.4],
|
|
||||||
[0.5, 0.5, 0.5, 0.5],
|
|
||||||
],
|
|
||||||
dtype=np.float32,
|
|
||||||
)
|
|
||||||
|
|
||||||
register_numpy(db, "x", x)
|
|
||||||
register_numpy(db, "y", y)
|
|
||||||
register_numpy(db, "z", z)
|
|
||||||
assert execute_all(
|
|
||||||
db, "select *, dimensions, count from temp.vec_static_blobs;"
|
|
||||||
) == [
|
|
||||||
{
|
|
||||||
"count": 2,
|
|
||||||
"data": None,
|
|
||||||
"dimensions": 4,
|
|
||||||
"name": "x",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"count": 3,
|
|
||||||
"data": None,
|
|
||||||
"dimensions": 2,
|
|
||||||
"name": "y",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"count": 5,
|
|
||||||
"data": None,
|
|
||||||
"dimensions": 4,
|
|
||||||
"name": "z",
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
assert execute_all(db, "select vec_to_json(vector) from x;") == [
|
|
||||||
{
|
|
||||||
"vec_to_json(vector)": "[0.100000,0.200000,0.300000,0.400000]",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"vec_to_json(vector)": "[0.900000,0.800000,0.700000,0.600000]",
|
|
||||||
},
|
|
||||||
]
|
|
||||||
assert execute_all(db, "select (vector) from y limit 2;") == [
|
|
||||||
{
|
|
||||||
"vector": b"\xcd\xccL>\x9a\x99\x99>",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"vector": b"fff?\xcd\xccL?",
|
|
||||||
},
|
|
||||||
]
|
|
||||||
assert execute_all(db, "select rowid, (vector) from z") == [
|
|
||||||
{
|
|
||||||
"rowid": 0,
|
|
||||||
"vector": b"\xcd\xcc\xcc=\xcd\xcc\xcc=\xcd\xcc\xcc=\xcd\xcc\xcc=",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"rowid": 1,
|
|
||||||
"vector": b"\xcd\xccL>\xcd\xccL>\xcd\xccL>\xcd\xccL>",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"rowid": 2,
|
|
||||||
"vector": b"\x9a\x99\x99>\x9a\x99\x99>\x9a\x99\x99>\x9a\x99\x99>",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"rowid": 3,
|
|
||||||
"vector": b"\xcd\xcc\xcc>\xcd\xcc\xcc>\xcd\xcc\xcc>\xcd\xcc\xcc>",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"rowid": 4,
|
|
||||||
"vector": b"\x00\x00\x00?\x00\x00\x00?\x00\x00\x00?\x00\x00\x00?",
|
|
||||||
},
|
|
||||||
]
|
|
||||||
assert execute_all(
|
|
||||||
db,
|
|
||||||
"select rowid, vec_to_json(vector) as v from z where vector match ? and k = 3 order by distance;",
|
|
||||||
[np.array([0.3, 0.3, 0.3, 0.3], dtype=np.float32)],
|
|
||||||
) == [
|
|
||||||
{
|
|
||||||
"rowid": 2,
|
|
||||||
"v": "[0.300000,0.300000,0.300000,0.300000]",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"rowid": 3,
|
|
||||||
"v": "[0.400000,0.400000,0.400000,0.400000]",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"rowid": 1,
|
|
||||||
"v": "[0.200000,0.200000,0.200000,0.200000]",
|
|
||||||
},
|
|
||||||
]
|
|
||||||
assert execute_all(
|
|
||||||
db,
|
|
||||||
"select rowid, vec_to_json(vector) as v from z where vector match ? and k = 3 order by distance;",
|
|
||||||
[np.array([0.6, 0.6, 0.6, 0.6], dtype=np.float32)],
|
|
||||||
) == [
|
|
||||||
{
|
|
||||||
"rowid": 4,
|
|
||||||
"v": "[0.500000,0.500000,0.500000,0.500000]",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"rowid": 3,
|
|
||||||
"v": "[0.400000,0.400000,0.400000,0.400000]",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"rowid": 2,
|
|
||||||
"v": "[0.300000,0.300000,0.300000,0.300000]",
|
|
||||||
},
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1618,231 +1476,6 @@ def test_vec_each():
|
||||||
vec_each_f32(None)
|
vec_each_f32(None)
|
||||||
|
|
||||||
|
|
||||||
import io
|
|
||||||
|
|
||||||
|
|
||||||
def to_npy(arr):
|
|
||||||
buf = io.BytesIO()
|
|
||||||
np.save(buf, arr)
|
|
||||||
buf.seek(0)
|
|
||||||
return buf.read()
|
|
||||||
|
|
||||||
|
|
||||||
def test_vec_npy_each():
|
|
||||||
db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init")
|
|
||||||
vec_npy_each = lambda *args: execute_all(
|
|
||||||
db, "select rowid, * from vec_npy_each(?)", args
|
|
||||||
)
|
|
||||||
assert vec_npy_each(to_npy(np.array([1.1, 2.2, 3.3], dtype=np.float32))) == [
|
|
||||||
{
|
|
||||||
"rowid": 0,
|
|
||||||
"vector": _f32([1.1, 2.2, 3.3]),
|
|
||||||
},
|
|
||||||
]
|
|
||||||
assert vec_npy_each(to_npy(np.array([[1.1, 2.2, 3.3]], dtype=np.float32))) == [
|
|
||||||
{
|
|
||||||
"rowid": 0,
|
|
||||||
"vector": _f32([1.1, 2.2, 3.3]),
|
|
||||||
},
|
|
||||||
]
|
|
||||||
assert vec_npy_each(
|
|
||||||
to_npy(np.array([[1.1, 2.2, 3.3], [9.9, 8.8, 7.7]], dtype=np.float32))
|
|
||||||
) == [
|
|
||||||
{
|
|
||||||
"rowid": 0,
|
|
||||||
"vector": _f32([1.1, 2.2, 3.3]),
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"rowid": 1,
|
|
||||||
"vector": _f32([9.9, 8.8, 7.7]),
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
assert vec_npy_each(to_npy(np.array([], dtype=np.float32))) == []
|
|
||||||
|
|
||||||
|
|
||||||
def test_vec_npy_each_errors():
|
|
||||||
db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init")
|
|
||||||
vec_npy_each = lambda *args: execute_all(
|
|
||||||
db, "select rowid, * from vec_npy_each(?)", args
|
|
||||||
)
|
|
||||||
|
|
||||||
full = b"\x93NUMPY\x01\x00v\x00{'descr': '<f4', 'fortran_order': False, 'shape': (2, 4), } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
|
|
||||||
# EVIDENCE-OF: V03312_20150 numpy validation too short
|
|
||||||
with _raises("numpy array too short"):
|
|
||||||
vec_npy_each(b"")
|
|
||||||
# EVIDENCE-OF: V11954_28792 numpy validate magic
|
|
||||||
with _raises("numpy array does not contain the 'magic' header"):
|
|
||||||
vec_npy_each(b"\x93NUMPX\x01\x00v\x00")
|
|
||||||
|
|
||||||
with _raises("numpy array header length is invalid"):
|
|
||||||
vec_npy_each(b"\x93NUMPY\x01\x00v\x00")
|
|
||||||
|
|
||||||
with _raises("numpy header did not start with '{'"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00c'descr': '<f4', 'fortran_order': False, 'shape': (2, 4), } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises("expected key in numpy header"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{ \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises("expected a string as key in numpy header"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{False: '<f4', 'fortran_order': False, 'shape': (2, 4), } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises("expected a ':' after key in numpy header"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'descr' \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
with _raises("expected a ':' after key in numpy header"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'descr' False \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises("expected a string value after 'descr' key"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'descr': \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises("Only '<f4' values are supported in sqlite-vec numpy functions"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'descr': '=f4', 'fortran_order': False, 'shape': (2, 4), } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises(
|
|
||||||
"Only fortran_order = False is supported in sqlite-vec numpy functions"
|
|
||||||
):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'descr': '<f4', 'fortran_order': True, 'shape': (2, 4), } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises(
|
|
||||||
"Error parsing numpy array: Expected left parenthesis '(' after shape key"
|
|
||||||
):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'shape': 2, 'descr': '<f4', 'fortran_order': False, } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises(
|
|
||||||
"Error parsing numpy array: Expected an initial number in shape value"
|
|
||||||
):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'shape': (, 'descr': '<f4', 'fortran_order': False, } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises("Error parsing numpy array: Expected comma after first shape value"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'shape': (2), 'descr': '<f4', 'fortran_order': False, } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises(
|
|
||||||
"Error parsing numpy array: unexpected header EOF while parsing shape"
|
|
||||||
):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'shape': (2, \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises("Error parsing numpy array: unknown type in shape value"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'shape': (2, 'nope' \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises(
|
|
||||||
"Error parsing numpy array: expected right parenthesis after shape value"
|
|
||||||
):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'shape': (2,4 ( \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises("Error parsing numpy array: unknown key in numpy header"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'no': '<f4', 'fortran_order': False, 'shape': (2, 4), } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises("Error parsing numpy array: unknown extra token after value"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'descr': '<f4' 'asdf', 'fortran_order': False, 'shape': (2, 4), } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises("numpy array error: Expected a data size of 32, found 31"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'descr': '<f4', 'fortran_order': False, 'shape': (2, 4), } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3"
|
|
||||||
)
|
|
||||||
|
|
||||||
# with _raises("XXX"):
|
|
||||||
# vec_npy_each(b"\x93NUMPY\x01\x00v\x00{'descr': '<f4', 'fortran_order': False, 'shape': (2, 4), } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@")
|
|
||||||
|
|
||||||
|
|
||||||
import tempfile
|
|
||||||
|
|
||||||
|
|
||||||
def test_vec_npy_each_errors_files():
|
|
||||||
db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init")
|
|
||||||
|
|
||||||
def vec_npy_each(data):
|
|
||||||
with tempfile.NamedTemporaryFile(delete_on_close=False) as f:
|
|
||||||
f.write(data)
|
|
||||||
f.close()
|
|
||||||
try:
|
|
||||||
return execute_all(
|
|
||||||
db, "select rowid, * from vec_npy_each(vec_npy_file(?))", [f.name]
|
|
||||||
)
|
|
||||||
finally:
|
|
||||||
f.close()
|
|
||||||
|
|
||||||
with _raises("Could not open numpy file"):
|
|
||||||
db.execute('select * from vec_npy_each(vec_npy_file("not exist"))')
|
|
||||||
|
|
||||||
with _raises("numpy array file too short"):
|
|
||||||
vec_npy_each(b"\x93NUMPY\x01\x00v")
|
|
||||||
|
|
||||||
with _raises("numpy array file does not contain the 'magic' header"):
|
|
||||||
vec_npy_each(b"\x93XUMPY\x01\x00v\x00")
|
|
||||||
|
|
||||||
with _raises("numpy array file header length is invalid"):
|
|
||||||
vec_npy_each(b"\x93NUMPY\x01\x00v\x00")
|
|
||||||
|
|
||||||
with _raises(
|
|
||||||
"Error parsing numpy array: Only fortran_order = False is supported in sqlite-vec numpy functions"
|
|
||||||
):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'descr': '<f4', 'fortran_order': True, 'shape': (2, 4), } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3@"
|
|
||||||
)
|
|
||||||
|
|
||||||
with _raises("numpy array file error: Expected a data size of 32, found 31"):
|
|
||||||
vec_npy_each(
|
|
||||||
b"\x93NUMPY\x01\x00v\x00{'descr': '<f4', 'fortran_order': False, 'shape': (2, 4), } \n\xcd\xcc\x8c?\xcd\xcc\x0c@33S@\xcd\xcc\x8c@ff\x1eA\xcd\xcc\x0cAff\xf6@33\xd3"
|
|
||||||
)
|
|
||||||
|
|
||||||
assert vec_npy_each(to_npy(np.array([1.1, 2.2, 3.3], dtype=np.float32))) == [
|
|
||||||
{
|
|
||||||
"rowid": 0,
|
|
||||||
"vector": _f32([1.1, 2.2, 3.3]),
|
|
||||||
},
|
|
||||||
]
|
|
||||||
assert vec_npy_each(
|
|
||||||
to_npy(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]], dtype=np.float32))
|
|
||||||
) == [
|
|
||||||
{
|
|
||||||
"rowid": 0,
|
|
||||||
"vector": _f32([1.1, 2.2, 3.3]),
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"rowid": 1,
|
|
||||||
"vector": _f32([4.4, 5.5, 6.6]),
|
|
||||||
},
|
|
||||||
]
|
|
||||||
assert vec_npy_each(to_npy(np.array([], dtype=np.float32))) == []
|
|
||||||
x1025 = vec_npy_each(to_npy(np.array([[0.1, 0.2, 0.3]] * 1025, dtype=np.float32)))
|
|
||||||
assert len(x1025) == 1025
|
|
||||||
|
|
||||||
# np.array([[.1, .2, 3]] * 99, dtype=np.float32).shape
|
|
||||||
|
|
||||||
|
|
||||||
def test_vec0_constructor():
|
def test_vec0_constructor():
|
||||||
vec_constructor_error_prefix = "vec0 constructor error: {}"
|
vec_constructor_error_prefix = "vec0 constructor error: {}"
|
||||||
vec_col_error_prefix = "vec0 constructor error: could not parse vector column '{}'"
|
vec_col_error_prefix = "vec0 constructor error: could not parse vector column '{}'"
|
||||||
|
|
@ -1923,6 +1556,54 @@ def test_vec0_constructor():
|
||||||
db.execute("create virtual table v using vec0(4)")
|
db.execute("create virtual table v using vec0(4)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec0_indexed_by_flat():
|
||||||
|
db.execute("drop table if exists t_ibf")
|
||||||
|
db.execute("drop table if exists t_ibf2")
|
||||||
|
db.execute("drop table if exists t_ibf3")
|
||||||
|
db.execute("drop table if exists t_ibf4")
|
||||||
|
|
||||||
|
# indexed by flat() should succeed and behave identically to no index clause
|
||||||
|
db.execute("create virtual table t_ibf using vec0(emb float[4] indexed by flat())")
|
||||||
|
db.execute(
|
||||||
|
"insert into t_ibf(rowid, emb) values (1, X'00000000000000000000000000000000')"
|
||||||
|
)
|
||||||
|
rows = db.execute("select rowid from t_ibf where emb match X'00000000000000000000000000000000' and k = 1").fetchall()
|
||||||
|
assert len(rows) == 1
|
||||||
|
assert rows[0][0] == 1
|
||||||
|
db.execute("drop table t_ibf")
|
||||||
|
|
||||||
|
# indexed by flat() with distance_metric
|
||||||
|
db.execute(
|
||||||
|
"create virtual table t_ibf2 using vec0(emb float[4] distance_metric=cosine indexed by flat())"
|
||||||
|
)
|
||||||
|
db.execute("drop table t_ibf2")
|
||||||
|
|
||||||
|
# indexed by flat() on int8
|
||||||
|
db.execute("create virtual table t_ibf3 using vec0(emb int8[4] indexed by flat())")
|
||||||
|
db.execute("drop table t_ibf3")
|
||||||
|
|
||||||
|
# indexed by flat() on bit
|
||||||
|
db.execute("create virtual table t_ibf4 using vec0(emb bit[8] indexed by flat())")
|
||||||
|
db.execute("drop table t_ibf4")
|
||||||
|
|
||||||
|
# Error: unknown index type
|
||||||
|
with _raises(
|
||||||
|
"vec0 constructor error: could not parse vector column 'emb float[4] indexed by unknown()'",
|
||||||
|
sqlite3.DatabaseError,
|
||||||
|
):
|
||||||
|
db.execute("create virtual table v using vec0(emb float[4] indexed by unknown())")
|
||||||
|
|
||||||
|
# Error: indexed by (missing type)
|
||||||
|
with _raises(
|
||||||
|
"vec0 constructor error: could not parse vector column 'emb float[4] indexed by'",
|
||||||
|
sqlite3.DatabaseError,
|
||||||
|
):
|
||||||
|
db.execute("create virtual table v using vec0(emb float[4] indexed by)")
|
||||||
|
|
||||||
|
if db.in_transaction:
|
||||||
|
db.rollback()
|
||||||
|
|
||||||
|
|
||||||
def test_vec0_create_errors():
|
def test_vec0_create_errors():
|
||||||
# EVIDENCE-OF: V17740_01811 vec0 create _chunks error handling
|
# EVIDENCE-OF: V17740_01811 vec0 create _chunks error handling
|
||||||
db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_CREATE_TABLE, "t1_chunks"))
|
db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_CREATE_TABLE, "t1_chunks"))
|
||||||
|
|
|
||||||
|
|
@ -500,6 +500,83 @@ void test_vec0_parse_vector_column() {
|
||||||
assert(rc == SQLITE_ERROR);
|
assert(rc == SQLITE_ERROR);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// indexed by flat()
|
||||||
|
{
|
||||||
|
const char *input = "emb float[768] indexed by flat()";
|
||||||
|
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
|
||||||
|
assert(rc == SQLITE_OK);
|
||||||
|
assert(col.index_type == VEC0_INDEX_TYPE_FLAT);
|
||||||
|
assert(col.dimensions == 768);
|
||||||
|
sqlite3_free(col.name);
|
||||||
|
}
|
||||||
|
|
||||||
|
// indexed by flat() with distance_metric
|
||||||
|
{
|
||||||
|
const char *input = "emb float[768] distance_metric=cosine indexed by flat()";
|
||||||
|
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
|
||||||
|
assert(rc == SQLITE_OK);
|
||||||
|
assert(col.index_type == VEC0_INDEX_TYPE_FLAT);
|
||||||
|
assert(col.distance_metric == VEC0_DISTANCE_METRIC_COSINE);
|
||||||
|
sqlite3_free(col.name);
|
||||||
|
}
|
||||||
|
|
||||||
|
// indexed by flat() on int8
|
||||||
|
{
|
||||||
|
const char *input = "emb int8[256] indexed by flat()";
|
||||||
|
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
|
||||||
|
assert(rc == SQLITE_OK);
|
||||||
|
assert(col.index_type == VEC0_INDEX_TYPE_FLAT);
|
||||||
|
assert(col.element_type == SQLITE_VEC_ELEMENT_TYPE_INT8);
|
||||||
|
sqlite3_free(col.name);
|
||||||
|
}
|
||||||
|
|
||||||
|
// indexed by flat() on bit
|
||||||
|
{
|
||||||
|
const char *input = "emb bit[64] indexed by flat()";
|
||||||
|
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
|
||||||
|
assert(rc == SQLITE_OK);
|
||||||
|
assert(col.index_type == VEC0_INDEX_TYPE_FLAT);
|
||||||
|
assert(col.element_type == SQLITE_VEC_ELEMENT_TYPE_BIT);
|
||||||
|
sqlite3_free(col.name);
|
||||||
|
}
|
||||||
|
|
||||||
|
// default index_type is FLAT
|
||||||
|
{
|
||||||
|
const char *input = "emb float[768]";
|
||||||
|
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
|
||||||
|
assert(rc == SQLITE_OK);
|
||||||
|
assert(col.index_type == VEC0_INDEX_TYPE_FLAT);
|
||||||
|
sqlite3_free(col.name);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Error: indexed by (missing type name)
|
||||||
|
{
|
||||||
|
const char *input = "emb float[768] indexed by";
|
||||||
|
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
|
||||||
|
assert(rc == SQLITE_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Error: indexed by unknown()
|
||||||
|
{
|
||||||
|
const char *input = "emb float[768] indexed by unknown()";
|
||||||
|
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
|
||||||
|
assert(rc == SQLITE_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Error: indexed by flat (missing parens)
|
||||||
|
{
|
||||||
|
const char *input = "emb float[768] indexed by flat";
|
||||||
|
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
|
||||||
|
assert(rc == SQLITE_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Error: indexed flat() (missing "by")
|
||||||
|
{
|
||||||
|
const char *input = "emb float[768] indexed flat()";
|
||||||
|
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
|
||||||
|
assert(rc == SQLITE_ERROR);
|
||||||
|
}
|
||||||
|
|
||||||
printf(" All vec0_parse_vector_column tests passed.\n");
|
printf(" All vec0_parse_vector_column tests passed.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -656,6 +733,30 @@ void test_distance_hamming() {
|
||||||
assert(d == 16.0f);
|
assert(d == 16.0f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Large vector (256 bits = 32 bytes) — exercises NEON path on ARM
|
||||||
|
{
|
||||||
|
unsigned char a[32];
|
||||||
|
unsigned char b[32];
|
||||||
|
memset(a, 0xFF, 32);
|
||||||
|
memset(b, 0x00, 32);
|
||||||
|
d = _test_distance_hamming(a, b, 256);
|
||||||
|
assert(d == 256.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Large vector (1024 bits = 128 bytes) — exercises 64-byte NEON loop
|
||||||
|
{
|
||||||
|
unsigned char a[128];
|
||||||
|
unsigned char b[128];
|
||||||
|
memset(a, 0x00, 128);
|
||||||
|
memset(b, 0x00, 128);
|
||||||
|
// Set every other byte to 0xFF in a, 0x00 in b -> 8 bits per byte * 64 bytes = 512
|
||||||
|
for (int i = 0; i < 128; i += 2) {
|
||||||
|
a[i] = 0xFF;
|
||||||
|
}
|
||||||
|
d = _test_distance_hamming(a, b, 1024);
|
||||||
|
assert(d == 512.0f);
|
||||||
|
}
|
||||||
|
|
||||||
printf(" All distance_hamming tests passed.\n");
|
printf(" All distance_hamming tests passed.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,56 +0,0 @@
|
||||||
import sqlite3
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
db = sqlite3.connect(":memory:")
|
|
||||||
|
|
||||||
db.enable_load_extension(True)
|
|
||||||
db.load_extension("./dist/vec0")
|
|
||||||
db.execute("select load_extension('./dist/vec0', 'sqlite3_vec_raw_init')")
|
|
||||||
db.enable_load_extension(False)
|
|
||||||
|
|
||||||
x = np.array([[0.1, 0.2, 0.3, 0.4], [0.9, 0.8, 0.7, 0.6]], dtype=np.float32)
|
|
||||||
y = np.array([[0.2, 0.3], [0.9, 0.8], [0.6, 0.5]], dtype=np.float32)
|
|
||||||
z = np.array(
|
|
||||||
[
|
|
||||||
[0.1, 0.1, 0.1, 0.1],
|
|
||||||
[0.2, 0.2, 0.2, 0.2],
|
|
||||||
[0.3, 0.3, 0.3, 0.3],
|
|
||||||
[0.4, 0.4, 0.4, 0.4],
|
|
||||||
[0.5, 0.5, 0.5, 0.5],
|
|
||||||
],
|
|
||||||
dtype=np.float32,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def register_np(array, name):
|
|
||||||
ptr = array.__array_interface__["data"][0]
|
|
||||||
nvectors, dimensions = array.__array_interface__["shape"]
|
|
||||||
element_type = array.__array_interface__["typestr"]
|
|
||||||
|
|
||||||
assert element_type == "<f4"
|
|
||||||
|
|
||||||
name_escaped = db.execute("select printf('%w', ?)", [name]).fetchone()[0]
|
|
||||||
|
|
||||||
db.execute(
|
|
||||||
"insert into temp.vec_static_blobs(name, data) select ?, vec_static_blob_from_raw(?, ?, ?, ?)",
|
|
||||||
[name, ptr, element_type, dimensions, nvectors],
|
|
||||||
)
|
|
||||||
|
|
||||||
db.execute(
|
|
||||||
f'create virtual table "{name_escaped}" using vec_static_blob_entries({name_escaped})'
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
register_np(x, "x")
|
|
||||||
register_np(y, "y")
|
|
||||||
register_np(z, "z")
|
|
||||||
print(db.execute("select *, dimensions, count from temp.vec_static_blobs;").fetchall())
|
|
||||||
|
|
||||||
print(db.execute("select vec_to_json(vector) from x;").fetchall())
|
|
||||||
print(db.execute("select (vector) from y limit 2;").fetchall())
|
|
||||||
print(
|
|
||||||
db.execute(
|
|
||||||
"select (vector) from z where vector match ? and k = 2 order by distance;",
|
|
||||||
[np.array([0.3, 0.3, 0.3, 0.3], dtype=np.float32)],
|
|
||||||
).fetchall()
|
|
||||||
)
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue