Initial commit

This commit is contained in:
Alex Garcia 2024-04-20 13:38:58 -07:00
commit 4c8ad629e0
28 changed files with 6758 additions and 0 deletions

24
.gitignore vendored Normal file
View file

@ -0,0 +1,24 @@
/target
.vscode
sift/
*.tar.gz
*.db
*.bin
*.out
venv/
vendor/
dist/
*.pyc
*.db-journal
*.svg
alexandria/
openai/
examples/supabase-dbpedia
examples/ann-filtering
examples/dbpedia-openai
examples/imdb
sqlite-vec.h

296
Makefile Normal file
View file

@ -0,0 +1,296 @@
# Build metadata stamped into artifacts at compile time.
COMMIT=$(shell git rev-parse HEAD)
VERSION=$(shell cat VERSION)
DATE=$(shell date +'%FT%TZ%z')
# Detect host OS to choose the loadable-extension suffix below.
ifeq ($(shell uname -s),Darwin)
CONFIG_DARWIN=y
else ifeq ($(OS),Windows_NT)
CONFIG_WINDOWS=y
else
CONFIG_LINUX=y
endif
ifdef CONFIG_DARWIN
LOADABLE_EXTENSION=dylib
endif
ifdef CONFIG_LINUX
LOADABLE_EXTENSION=so
endif
ifdef CONFIG_WINDOWS
LOADABLE_EXTENSION=dll
endif
# Allow `make python=python3.11 ...` to pick the interpreter used for tests.
ifdef python
PYTHON=$(python)
else
PYTHON=python3
endif
# SIMD flags; pass OMIT_SIMD=1 to disable.
# NOTE(review): flags are only added on Darwin (x86_64 AVX / arm64 NEON);
# Linux builds get no SIMD flags here — confirm that is intentional.
ifndef OMIT_SIMD
ifeq ($(shell uname -sm),Darwin x86_64)
CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX
endif
ifeq ($(shell uname -sm),Darwin arm64)
CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON
endif
endif
# Wheel renaming needs to know when cross-building for macOS ARM.
ifdef IS_MACOS_ARM
RENAME_WHEELS_ARGS=--is-macos-arm
else
RENAME_WHEELS_ARGS=
endif
# All build artifacts land under $(prefix) (default: dist/).
prefix=dist
$(prefix):
	mkdir -p $(prefix)
TARGET_LOADABLE=$(prefix)/vec0.$(LOADABLE_EXTENSION)
TARGET_STATIC=$(prefix)/libsqlite_vec0.a
TARGET_STATIC_H=$(prefix)/sqlite-vec.h
TARGET_CLI=$(prefix)/sqlite3
# Friendly aliases for the main artifacts.
loadable: $(TARGET_LOADABLE)
static: $(TARGET_STATIC)
cli: $(TARGET_CLI)
all: loadable static cli
# Loadable extension: one shared object built straight from sqlite-vec.c.
$(TARGET_LOADABLE): sqlite-vec.c sqlite-vec.h $(prefix)
	gcc \
	-fPIC -shared \
	-Wall -Wextra \
	-Ivendor/ \
	-O3 \
	$(CFLAGS) \
	$< -o $@
# Static library for embedders compiling with -DSQLITE_CORE.
$(TARGET_STATIC): sqlite-vec.c sqlite-vec.h $(prefix)
	gcc -Ivendor/sqlite -Ivendor/vec $(CFLAGS) -DSQLITE_CORE \
	-O3 -c $< -o $(prefix)/.objs/vec.o
	ar rcs $@ $(prefix)/.objs/vec.o
$(TARGET_STATIC_H): sqlite-vec.h $(prefix)
	cp $< $@
# Hidden working directories under dist/ for objects, archives, scratch files.
OBJS_DIR=$(prefix)/.objs
LIBS_DIR=$(prefix)/.libs
BUILD_DIR=$(prefix)/.build
$(OBJS_DIR): $(prefix)
	mkdir -p $@
$(LIBS_DIR): $(prefix)
	mkdir -p $@
$(BUILD_DIR): $(prefix)
	mkdir -p $@
# Vendored SQLite amalgamation with introspection features for the custom CLI.
$(OBJS_DIR)/sqlite3.o: vendor/sqlite3.c $(OBJS_DIR)
	gcc -c -g3 -O3 -DSQLITE_EXTRA_INIT=core_init -DSQLITE_CORE -DSQLITE_ENABLE_STMT_SCANSTATUS -DSQLITE_ENABLE_BYTECODE_VTAB -DSQLITE_ENABLE_EXPLAIN_COMMENTS -I./vendor $< -o $@
$(LIBS_DIR)/sqlite3.a: $(OBJS_DIR)/sqlite3.o $(LIBS_DIR)
	ar rcs $@ $<
# Patch the shell source so the CLI banner can show this is a custom build.
$(BUILD_DIR)/shell-new.c: vendor/shell.c $(BUILD_DIR)
	sed 's/\/\*extra-version-info\*\//EXTRA_TODO/g' $< > $@
$(OBJS_DIR)/shell.o: $(BUILD_DIR)/shell-new.c $(OBJS_DIR)
	gcc -c -g3 -O3 \
	-DHAVE_EDITLINE=1 -I./vendor \
	-DSQLITE_ENABLE_STMT_SCANSTATUS -DSQLITE_ENABLE_BYTECODE_VTAB -DSQLITE_ENABLE_EXPLAIN_COMMENTS \
	-DEXTRA_TODO="\"CUSTOM BUILD: sqlite-vec\n\"" \
	$< -o $@
$(LIBS_DIR)/shell.a: $(OBJS_DIR)/shell.o $(LIBS_DIR)
	ar rcs $@ $<
$(OBJS_DIR)/sqlite-vec.o: sqlite-vec.c $(OBJS_DIR)
	gcc -c -g3 -I./vendor $(CFLAGS) $< -o $@
$(LIBS_DIR)/sqlite-vec.a: $(OBJS_DIR)/sqlite-vec.o $(LIBS_DIR)
	ar rcs $@ $<
# Custom sqlite3 CLI with sqlite-vec auto-loaded via SQLITE_EXTRA_INIT=core_init.
$(TARGET_CLI): $(LIBS_DIR)/sqlite-vec.a $(LIBS_DIR)/shell.a $(LIBS_DIR)/sqlite3.a examples/sqlite3-cli/core_init.c $(prefix)
	gcc -g3 \
	-Ivendor/sqlite -I./ \
	-DSQLITE_CORE \
	-DSQLITE_THREADSAFE=0 -DSQLITE_ENABLE_FTS4 \
	-DSQLITE_ENABLE_STMT_SCANSTATUS -DSQLITE_ENABLE_BYTECODE_VTAB -DSQLITE_ENABLE_EXPLAIN_COMMENTS \
	-DSQLITE_EXTRA_INIT=core_init \
	$(CFLAGS) \
	-lreadline -DHAVE_EDITLINE=1 \
	-ldl -lm -lreadline \
	$(LIBS_DIR)/shell.a $(LIBS_DIR)/sqlite3.a $(LIBS_DIR)/sqlite-vec.a examples/sqlite3-cli/core_init.c -o $@
# Render the public header from its template, stamping version/date/commit
# via envsubst (the ${VERSION}/${DATE}/${SOURCE} placeholders in the .tmpl).
sqlite-vec.h: sqlite-vec.h.tmpl VERSION
	VERSION=$(shell cat VERSION) \
	DATE=$(shell date -r VERSION +'%FT%TZ%z') \
	SOURCE=$(shell git log -n 1 --pretty=format:%H -- VERSION) \
	envsubst < $< > $@
clean:
	rm -rf dist
FORMAT_FILES=sqlite-vec.h sqlite-vec.c
format: $(FORMAT_FILES)
	clang-format -i $(FORMAT_FILES)
	black tests/test-loadable.py
# lint needs bash for process substitution (<(...)).
lint: SHELL:=/bin/bash
lint:
	diff -u <(cat $(FORMAT_FILES)) <(clang-format $(FORMAT_FILES))
test:
	sqlite3 :memory: '.read test.sql'
# Declare every command-style target as phony so a same-named file can never
# shadow it. The previous list was missing most of them (format, lint, cli,
# all, the packaging targets, ...) and listed a nonexistent `gh-release`.
.PHONY: version loadable static cli all test clean publish-release \
	format lint test-loadable test-loadable-snapshot-update test-loadable-watch \
	python datasette sqlite-utils node deno wasm site \
	ruby
publish-release:
	./scripts/publish_release.sh
TARGET_WHEELS=$(prefix)/wheels
INTERMEDIATE_PYPACKAGE_EXTENSION=bindings/python/sqlite_vec/
$(TARGET_WHEELS): $(prefix)
	mkdir -p $(TARGET_WHEELS)
# Version-stamped binding files, rendered from .tmpl templates with envsubst.
bindings/ruby/lib/version.rb: bindings/ruby/lib/version.rb.tmpl VERSION
	VERSION=$(VERSION) envsubst < $< > $@
bindings/python/sqlite_vec/version.py: bindings/python/sqlite_vec/version.py.tmpl VERSION
	VERSION=$(VERSION) envsubst < $< > $@
	echo "✅ generated $@"
bindings/datasette/datasette_sqlite_vec/version.py: bindings/datasette/datasette_sqlite_vec/version.py.tmpl VERSION
	VERSION=$(VERSION) envsubst < $< > $@
	echo "✅ generated $@"
# Python wheel: copy the compiled extension into the package, then build.
python: $(TARGET_WHEELS) $(TARGET_LOADABLE) bindings/python/setup.py bindings/python/sqlite_vec/__init__.py scripts/rename-wheels.py
	cp $(TARGET_LOADABLE) $(INTERMEDIATE_PYPACKAGE_EXTENSION)
	rm $(TARGET_WHEELS)/*.wheel || true
	pip3 wheel bindings/python/ -w $(TARGET_WHEELS)
	python3 scripts/rename-wheels.py $(TARGET_WHEELS) $(RENAME_WHEELS_ARGS)
	echo "✅ generated python wheel"
datasette: $(TARGET_WHEELS) bindings/datasette/setup.py bindings/datasette/datasette_sqlite_vec/__init__.py
	rm $(TARGET_WHEELS)/datasette* || true
	pip3 wheel bindings/datasette/ --no-deps -w $(TARGET_WHEELS)
bindings/sqlite-utils/pyproject.toml: bindings/sqlite-utils/pyproject.toml.tmpl VERSION
	VERSION=$(VERSION) envsubst < $< > $@
	echo "✅ generated $@"
bindings/sqlite-utils/sqlite_utils_sqlite_vec/version.py: bindings/sqlite-utils/sqlite_utils_sqlite_vec/version.py.tmpl VERSION
	VERSION=$(VERSION) envsubst < $< > $@
	echo "✅ generated $@"
sqlite-utils: $(TARGET_WHEELS) bindings/sqlite-utils/pyproject.toml bindings/sqlite-utils/sqlite_utils_sqlite_vec/version.py
	python3 -m build bindings/sqlite-utils -w -o $(TARGET_WHEELS)
node: VERSION bindings/node/platform-package.README.md.tmpl bindings/node/platform-package.package.json.tmpl bindings/node/sqlite-vec/package.json.tmpl scripts/node_generate_platform_packages.sh
	scripts/node_generate_platform_packages.sh
deno: VERSION bindings/deno/deno.json.tmpl
	scripts/deno_generate_package.sh
# Regenerate every version-stamped file after bumping VERSION.
# Fixes: the datasette version.py was invoked twice (duplicated line), and
# recursive invocations now use $(MAKE) so -n/-j/jobserver flags propagate.
version:
	$(MAKE) bindings/ruby/lib/version.rb
	$(MAKE) bindings/python/sqlite_vec/version.py
	$(MAKE) bindings/datasette/datasette_sqlite_vec/version.py
	$(MAKE) bindings/sqlite-utils/pyproject.toml bindings/sqlite-utils/sqlite_utils_sqlite_vec/version.py
	$(MAKE) node
	$(MAKE) deno
test-loadable: loadable
	$(PYTHON) -m pytest -vv -s tests/test-loadable.py
test-loadable-snapshot-update: loadable
	$(PYTHON) -m pytest -vv tests/test-loadable.py --snapshot-update
# Re-run the loadable tests whenever sources change (requires watchexec).
test-loadable-watch:
	watchexec -w sqlite-vec.c -w tests/test-loadable.py -w Makefile --clear -- make test-loadable
# ███████████████████████████████ WASM SECTION ███████████████████████████████
WASM_DIR=$(prefix)/.wasm
$(WASM_DIR): $(prefix)
	mkdir -p $@
SQLITE_WASM_VERSION=3450300
SQLITE_WASM_YEAR=2024
SQLITE_WASM_SRCZIP=$(BUILD_DIR)/sqlite-src.zip
SQLITE_WASM_COMPILED_SQLITE3C=$(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/sqlite3.c
SQLITE_WASM_COMPILED_MJS=$(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ext/wasm/jswasm/sqlite3.mjs
SQLITE_WASM_COMPILED_WASM=$(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ext/wasm/jswasm/sqlite3.wasm
TARGET_WASM_LIB=$(WASM_DIR)/libsqlite_vec.wasm.a
TARGET_WASM_MJS=$(WASM_DIR)/sqlite3.mjs
TARGET_WASM_WASM=$(WASM_DIR)/sqlite3.wasm
TARGET_WASM=$(TARGET_WASM_MJS) $(TARGET_WASM_WASM)
# Download + build the official SQLite source tree to reuse its wasm toolchain.
$(SQLITE_WASM_SRCZIP): $(BUILD_DIR)
	curl -o $@ https://www.sqlite.org/$(SQLITE_WASM_YEAR)/sqlite-src-$(SQLITE_WASM_VERSION).zip
$(SQLITE_WASM_COMPILED_SQLITE3C): $(SQLITE_WASM_SRCZIP) $(BUILD_DIR)
	unzip -q -o $< -d $(BUILD_DIR)
	(cd $(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ && ./configure --enable-all && make sqlite3.c)
# Static wasm archive of sqlite-vec, injected via sqlite3_wasm_extra_init.
$(TARGET_WASM_LIB): examples/wasm/wasm.c sqlite-vec.c $(BUILD_DIR) $(WASM_DIR)
	emcc -O3 -I./ -Ivendor -DSQLITE_CORE -c examples/wasm/wasm.c -o $(BUILD_DIR)/wasm.wasm.o
	emcc -O3 -I./ -Ivendor -DSQLITE_CORE -c sqlite-vec.c -o $(BUILD_DIR)/sqlite-vec.wasm.o
	emar rcs $@ $(BUILD_DIR)/wasm.wasm.o $(BUILD_DIR)/sqlite-vec.wasm.o
$(SQLITE_WASM_COMPILED_MJS) $(SQLITE_WASM_COMPILED_WASM): $(SQLITE_WASM_COMPILED_SQLITE3C) $(TARGET_WASM_LIB)
	(cd $(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ext/wasm && \
	make sqlite3_wasm_extra_init.c=../../../../.wasm/libsqlite_vec.wasm.a "emcc.flags=-s EXTRA_EXPORTED_RUNTIME_METHODS=['ENV'] -s FETCH")
$(TARGET_WASM_MJS): $(SQLITE_WASM_COMPILED_MJS)
	cp $< $@
$(TARGET_WASM_WASM): $(SQLITE_WASM_COMPILED_WASM)
	cp $< $@
wasm: $(TARGET_WASM)
# ███████████████████████████████ END WASM ███████████████████████████████
# ███████████████████████████████ SITE SECTION ███████████████████████████████
# Demo site: fetch the wasm-toolkit widget from npm, copy in the wasm build.
WASM_TOOLKIT_NPM_TARGZ=$(BUILD_DIR)/sqlite-wasm-toolkit-npm.tar.gz
SITE_DIR=$(prefix)/.site
TARGET_SITE=$(prefix)/.site/index.html
$(WASM_TOOLKIT_NPM_TARGZ):
	curl -o $@ -q https://registry.npmjs.org/@alex.garcia/sqlite-wasm-toolkit/-/sqlite-wasm-toolkit-0.0.1-alpha.7.tgz
$(SITE_DIR)/slim.js $(SITE_DIR)/slim.css: $(WASM_TOOLKIT_NPM_TARGZ)
	tar -xvzf $< -C $(SITE_DIR) --strip-components=2 package/dist/slim.js package/dist/slim.css
$(SITE_DIR):
	mkdir -p $(SITE_DIR)
# $(TARGET_WASM_MJS) $(TARGET_WASM_WASM)
$(TARGET_SITE): site/index.html $(SITE_DIR)/slim.js $(SITE_DIR)/slim.css
	cp $(TARGET_WASM_MJS) $(SITE_DIR)
	cp $(TARGET_WASM_WASM) $(SITE_DIR)
	cp $< $@
site: $(TARGET_SITE)
# ███████████████████████████████ END SITE ███████████████████████████████

73
README.md Normal file
View file

@ -0,0 +1,73 @@
# `sqlite-vec`
An extremely small, "fast enough" vector search SQLite extension that runs
anywhere! A successor to [sqlite-vss](https://github.com/asg017/sqlite-vss)
> [!IMPORTANT]
> *`sqlite-vec` is a work-in-progress and not ready for general usage! I plan to launch a "beta" version in the next month or so. Watch this repo for updates.*
- Store and query float, int8, and binary vectors in `vec0` virtual tables
- Pre-filter vectors with `rowid IN (...)` subqueries
- Written in pure C, no dependencies,
runs anywhere SQLite runs (Linux/MacOS/Windows, in the browser with WASM,
Raspberry Pis, etc.)
## Sample usage
```sql
.load ./vec0
create virtual table vec_examples using vec0(
sample_embedding float[8]
);
-- vectors can be provided as JSON or in a compact binary format
insert into vec_examples(rowid, sample_embedding)
values
(1, '[-0.200, 0.250, 0.341, -0.211, 0.645, 0.935, -0.316, -0.924]'),
(2, '[0.443, -0.501, 0.355, -0.771, 0.707, -0.708, -0.185, 0.362]'),
(3, '[0.716, -0.927, 0.134, 0.052, -0.669, 0.793, -0.634, -0.162]'),
(4, '[-0.710, 0.330, 0.656, 0.041, -0.990, 0.726, 0.385, -0.958]');
-- KNN style query goes brrrr
select
rowid,
distance
from vec_examples
where sample_embedding match '[0.890, 0.544, 0.825, 0.961, 0.358, 0.0196, 0.521, 0.175]'
order by distance
limit 2;
/*
┌───────┬──────────────────┐
│ rowid │ distance │
├───────┼──────────────────┤
│ 2 │ 2.38687372207642 │
│ 1 │ 2.38978505134583 │
└───────┴──────────────────┘
*/
```
## Roadmap
Not currently implemented, but planned in the future (after initial beta version):
- Approximate nearest neighbors search (IVF and HNSW)
- Metadata filtering + custom internal partitioning
- More vector types (float16, int16, sparse, etc.) and distance functions
Additionally, there will be pre-compiled and pre-packaged packages of `sqlite-vec` for the following platforms:
- `pip` for Python
- `npm` for Node.js / Deno / Bun
- `gem` for Ruby
- `cargo` for Rust
- A single `.c` and `.h` amalgamation for C/C++
- Go module for Golang (requires CGO)
- Datasette and sqlite-utils plugins
- Pre-compiled loadable extensions on Github releases
## Support
Is your company interested in sponsoring `sqlite-vec` development? Send me an email to get more info: https://alexgarcia.xyz

1
VERSION Normal file
View file

@ -0,0 +1 @@
0.0.1-alpha.0

0
benchmarks/README.md Normal file
View file

View file

@ -0,0 +1,17 @@
```
python3 bench/bench.py \
-n "sift1m" \
-i sift/sift_base.fvecs \
-q sift/sift_query.fvecs \
--sample 10000 --qsample 100 \
-k 10
```
```
python3 bench/bench.py \
-n "sift1m" \
-i sift/sift_base.fvecs \
-q sift/sift_query.fvecs \
--sample 10000 --qsample 100 \
-k 10
```

View file

@ -0,0 +1,403 @@
import numpy as np
import numpy.typing as npt
import time
import hnswlib
import sqlite3
import faiss
import lancedb
import pandas as pd
# import chromadb
from usearch.index import Index, search, MetricKind
from dataclasses import dataclass
from typing import List
@dataclass
class BenchResult:
    """Timing results for one benchmarked tool."""

    # Display label for the tool/configuration.
    tool: str
    # NOTE(review): despite the "_ms" names, callers store *seconds* here and
    # convert via duration() when rendering — confirm before renaming.
    build_time_ms: float
    query_times_ms: List[float]
def duration(seconds: float):
    """Format a duration given in seconds as a whole-millisecond string."""
    return f"{int(seconds * 1000)}ms"
def cosine_similarity(
    vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True
) -> npt.NDArray[np.float32]:
    """Cosine similarity of `vec` against every row of `mat`.

    With do_norm=False the raw dot products are returned instead.
    """
    dots = mat @ vec
    if not do_norm:
        return dots
    return dots / (np.linalg.norm(vec) * np.linalg.norm(mat, axis=1))
def topk(
    vec: npt.NDArray[np.float32],
    mat: npt.NDArray[np.float32],
    k: int = 5,
    do_norm: bool = True,
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]:
    """Return (indices, similarities) of the k rows of `mat` most similar to
    `vec`, sorted by descending cosine similarity."""
    sim = cosine_similarity(vec, mat, do_norm=do_norm)
    # Rather than sorting all similarities and taking the top K, it's faster to
    # argpartition and then just sort the top K.
    # The difference is O(N logN) vs O(N + k logk)
    indices = np.argpartition(-sim, kth=k)[:k]
    top_indices = np.argsort(-sim[indices])
    # BUG FIX: the similarities must be gathered through `indices` first;
    # `sim[top_indices]` indexed the FULL array with subset positions 0..k-1,
    # returning unrelated similarity values.
    return indices[top_indices], sim[indices][top_indices]
def ivecs_read(fname):
    """Read a SIFT-style .ivecs file.

    Each record is [dim, v0..v_{dim-1}] as int32; every record is assumed to
    share the dimension of the first one. Returns an (n, dim) int32 array.
    """
    raw = np.fromfile(fname, dtype="int32")
    dim = raw[0]
    records = raw.reshape(-1, dim + 1)
    return records[:, 1:].copy()
def fvecs_read(fname):
    """Read a SIFT-style .fvecs file by reinterpreting the .ivecs layout's
    payload bytes as float32 (the leading dim field per record is identical)."""
    return ivecs_read(fname).view("float32")
def bench_hnsw(base, query):
    """Benchmark approximate KNN with an hnswlib HNSW index.

    NOTE(review): unlike the other bench_* helpers this prints timings and
    returns raw results instead of a BenchResult, and hard-codes dim=128,
    k=5, and inner-product space — confirm before comparing its numbers.
    """
    t0 = time.time()
    p = hnswlib.Index(space="ip", dim=128)  # possible options are l2, cosine or ip
    # NOTE: Use default settings from the README.
    print("buildings hnsw")
    p.init_index(max_elements=base.shape[0], ef_construction=200, M=16)
    ids = np.arange(base.shape[0])
    p.add_items(base, ids)
    p.set_ef(50)
    print("build time", time.time() - t0)
    results = []
    times = []
    t = time.time()
    for idx, q in enumerate(query):
        t0 = time.time()
        result = p.knn_query(q, k=5)
        # Print the first few result sets for a quick eyeball check.
        if idx < 5:
            print(result[0])
        results.append(result)
        times.append(time.time() - t0)
    print(time.time() - t)
    print("hnsw avg", np.mean(times))
    return results
def bench_hnsw_bf(base, query, k) -> BenchResult:
    """Benchmark hnswlib's brute-force (exact) index with L2 distance."""
    print("hnswlib-bf")
    dimensions = base.shape[1]
    t0 = time.time()
    p = hnswlib.BFIndex(space="l2", dim=dimensions)
    p.init_index(max_elements=base.shape[0])
    ids = np.arange(base.shape[0])
    p.add_items(base, ids)
    build_time = time.time() - t0
    results = []
    times = []
    t = time.time()
    for idx, q in enumerate(query):
        t0 = time.time()
        result = p.knn_query(q, k=k)
        results.append(result)
        times.append(time.time() - t0)
    return BenchResult("hnswlib-bf", build_time, times)
def bench_numpy(base, query, k) -> BenchResult:
    """Benchmark the pure-numpy topk() baseline (no index, zero build time)."""
    print("numpy")
    times = []
    results = []
    for idx, q in enumerate(query):
        t0 = time.time()
        result = topk(q, base, k=k)
        results.append(result)
        times.append(time.time() - t0)
    return BenchResult("numpy", 0, times)
def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult:
    """Benchmark sqlite-vec's vec0 virtual table in an in-memory database.

    `page_size` (SQLite PRAGMA) and `chunk_size` (vec0 option) both affect
    the storage layout and therefore scan speed. Requires ./dist/vec0.
    """
    dimensions = base.shape[1]
    print(f"sqlite-vec {page_size} {chunk_size}")
    db = sqlite3.connect(":memory:")
    db.execute(f"PRAGMA page_size = {page_size}")
    db.enable_load_extension(True)
    db.load_extension("./dist/vec0")
    db.execute(
        f"""
        create virtual table vec_sift1m using vec0(
          chunk_size={chunk_size},
          vector float[{dimensions}]
        )
        """
    )
    t = time.time()
    # Insert all base vectors in one transaction, as raw float32 blobs.
    with db:
        db.executemany(
            "insert into vec_sift1m(vector) values (?)",
            list(map(lambda x: [x.tobytes()], base)),
        )
    build_time = time.time() - t
    times = []
    results = []
    # NOTE(review): `result` is fetched but never appended to `results`.
    for (
        idx,
        q,
    ) in enumerate(query):
        t0 = time.time()
        result = db.execute(
            """
            select
              rowid,
              distance
            from vec_sift1m
            where vector match ?
              and k = ?
            order by distance
            """,
            [q.tobytes(), k],
        ).fetchall()
        times.append(time.time() - t0)
    return BenchResult(f"sqlite-vec vec0 ({page_size}|{chunk_size})", build_time, times)
def bench_sqlite_normal(base, query, page_size, k) -> BenchResult:
    """Benchmark a plain SQLite table with vec_distance_l2 + ORDER BY/LIMIT
    (full scan, no vec0 virtual table) as a baseline.

    NOTE(review): the page_size pragma is issued *after* connecting but before
    any table is created, same as above; requires ./dist/vec0.
    """
    print(f"sqlite-normal")
    db = sqlite3.connect(":memory:")
    db.enable_load_extension(True)
    db.load_extension("./dist/vec0")
    db.execute(f"PRAGMA page_size={page_size}")
    db.execute(f"create table sift1m(vector);")
    t = time.time()
    with db:
        db.executemany(
            "insert into sift1m(vector) values (?)",
            list(map(lambda x: [x.tobytes()], base)),
        )
    build_time = time.time() - t
    times = []
    results = []
    t = time.time()
    for (
        idx,
        q,
    ) in enumerate(query):
        t0 = time.time()
        result = db.execute(
            """
            select
              rowid,
              vec_distance_l2(?, vector) as distance
            from sift1m
            order by distance
            limit ?
            """,
            [q.tobytes(), k],
        ).fetchall()
        times.append(time.time() - t0)
    return BenchResult(f"sqlite-vec normal ({page_size})", build_time, times)
def bench_faiss(base, query, k) -> BenchResult:
    """Benchmark faiss IndexFlatL2 (exact brute-force L2 search)."""
    dimensions = base.shape[1]
    print("faiss")
    t = time.time()
    index = faiss.IndexFlatL2(dimensions)
    index.add(base)
    build_time = time.time() - t
    times = []
    results = []
    t = time.time()
    for idx, q in enumerate(query):
        t0 = time.time()
        # faiss expects a batch; wrap the single query vector in a 2D array.
        distances, rowids = index.search(x=np.array([q]), k=k)
        results.append(rowids)
        times.append(time.time() - t0)
    print("faiss avg", duration(np.mean(times)))
    return BenchResult("faiss", build_time, times)
def bench_lancedb(base, query, k) -> BenchResult:
    """Benchmark LanceDB's default vector search.

    NOTE(review): connects to an on-disk database directory named "a" in the
    CWD and creates table "t"; a re-run with the directory present may fail.
    """
    dimensions = base.shape[1]
    db = lancedb.connect("a")
    data = [{"vector": row.reshape(1, -1)[0]} for row in base]
    # Create a DataFrame where each row is a 1D array
    df = pd.DataFrame(data=data, columns=["vector"])
    t = time.time()
    db.create_table("t", data=df)
    build_time = time.time() - t
    tbl = db.open_table("t")
    times = []
    for q in query:
        t0 = time.time()
        result = tbl.search(q).limit(k).to_arrow()
        times.append(time.time() - t0)
    return BenchResult("lancedb", build_time, times)
# def bench_chroma(base, query, k):
# chroma_client = chromadb.Client()
# collection = chroma_client.create_collection(name="my_collection")
#
# t = time.time()
# # chroma doesn't allow for more than 41666 vectors to be inserted at once (???)
# i = 0
# collection.add(embeddings=base, ids=[str(x) for x in range(len(base))])
# print("chroma build time: ", duration(time.time() - t))
# times = []
# for q in query:
# t0 = time.time()
# result = collection.query(
# query_embeddings=[q.tolist()],
# n_results=k,
# )
# print(result)
# times.append(time.time() - t0)
# print("chroma avg", duration(np.mean(times)))
def bench_usearch_npy(base, query, k) -> BenchResult:
    """Benchmark usearch's exact search over the raw numpy matrix (no index,
    so build time is reported as 0)."""
    times = []
    for q in query:
        t0 = time.time()
        # result = index.search(q, exact=True)
        result = search(base, q, k, MetricKind.L2sq, exact=True)
        times.append(time.time() - t0)
    return BenchResult("usearch numpy exact=True", 0, times)
def bench_usearch_special(base, query, k) -> BenchResult:
    """Benchmark exact search through a usearch Index built over `base`.

    NOTE(review): `k` is accepted but not passed to index.search(), so
    usearch's default result count is used — confirm intent.
    """
    dimensions = base.shape[1]
    index = Index(ndim=dimensions)
    t = time.time()
    index.add(np.arange(len(base)), base)
    build_time = time.time() - t
    times = []
    for q in query:
        t0 = time.time()
        result = index.search(q, exact=True)
        times.append(time.time() - t0)
    # BUG FIX: corrected the reported tool label ("usuearch" -> "usearch").
    return BenchResult("usearch index exact=True", build_time, times)
from rich.console import Console
from rich.table import Table
def suite(name, base, query, k):
    """Run every enabled benchmark over (base, query) and print a rich table.

    Commented-out calls are alternate configurations kept for reference.
    """
    print(f"Starting benchmark suite: {name} {base.shape}, k={k}")
    results = []
    # n = bench_chroma(base[:40000], query, k=k)
    # n = bench_usearch_npy(base, query, k=k)
    # n = bench_usearch_special(base, query, k=k)
    results.append(bench_faiss(base, query, k=k))
    results.append(bench_hnsw_bf(base, query, k=k))
    # n = bench_sqlite_vec(base, query, 4096, 1024, k=k)
    # n = bench_sqlite_vec(base, query, 32768, 1024, k=k)
    results.append(bench_sqlite_vec(base, query, 32768, 256, k=k))
    # n = bench_sqlite_vec(base, query, 16384, 64, k=k)
    # n = bench_sqlite_vec(base, query, 16384, 32, k=k)
    results.append(bench_sqlite_normal(base, query, 8192, k=k))
    results.append(bench_lancedb(base, query, k=k))
    results.append(bench_numpy(base, query, k=k))
    # h = bench_hnsw(base, query)
    table = Table(
        title=f"{name}: {base.shape[0]:,} {base.shape[1]}-dimension vectors, k={k}"
    )
    table.add_column("Tool")
    table.add_column("Build Time (ms)", justify="right")
    table.add_column("Query time (ms)", justify="right")
    # NOTE(review): the *_ms fields hold seconds; duration() converts to "Nms".
    for res in results:
        table.add_row(
            res.tool, duration(res.build_time_ms), duration(np.mean(res.query_times_ms))
        )
    console = Console()
    console.print(table)
import argparse
def parse_args():
    """Build and evaluate the CLI argument parser for the benchmark runner."""
    parser = argparse.ArgumentParser(description="Benchmark processing script.")
    # Required arguments
    parser.add_argument("-n", "--name", required=True, help="Name of the benchmark.")
    parser.add_argument("-i", "--input", required=True, help="Path to input file (.npy).")
    parser.add_argument("-k", type=int, required=True, help="Parameter k to use in benchmark.")
    # Optional arguments
    parser.add_argument("-q", "--query", required=False, help="Path to query file (.npy).")
    parser.add_argument(
        "--sample", type=int, required=False, help="Number of entries in base to use. Defaults all"
    )
    parser.add_argument(
        "--qsample", type=int, required=False, help="Number of queries to use. Defaults all"
    )
    return parser.parse_args()
from pathlib import Path
def cli_read_input(input):
    """Load a base/query vector file, dispatching on file extension.

    Supports .fvecs (SIFT format, via fvecs_read) and .npy (raw float32
    dump). Raises Exception for any other extension.
    """
    input_path = Path(input)
    if input_path.suffix == ".fvecs":
        return fvecs_read(input_path)
    # BUG FIX: was `input_path.suffx` (typo), which raised AttributeError
    # for every .npy input instead of reading the file.
    if input_path.suffix == ".npy":
        return np.fromfile(input_path, dtype="float32")
    raise Exception("unknown filetype", input)
def cli_read_query(query, base):
    """Load the query vectors, or sample 100 random distinct rows from `base`
    when no query file was provided."""
    if query is None:
        return base[np.random.choice(base.shape[0], 100, replace=False), :]
    return cli_read_input(query)
def main():
    """CLI entry point: load vectors, optionally subsample, run the suite."""
    args = parse_args()
    base = cli_read_input(args.input)[: args.sample]
    queries = cli_read_query(args.query, base)[: args.qsample]
    suite(args.name, base, queries, args.k)
from sys import argv

# Historical manual invocation, kept for reference:
# base = fvecs_read("sift/sift_base.fvecs")  # [:100000]
# query = fvecs_read("sift/sift_query.fvecs")[:100]
# print(base.shape)
# k = int(argv[1]) if len(argv) > 1 else 5
# suite("sift1m", base, query, k)
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,17 @@
.timer on
-- Larger pages pack more 1536-dim float vectors per vec0 chunk; 32KiB was
-- the setting kept after experimenting with the values below.
pragma page_size = 32768;
--pragma page_size = 16384;
--pragma page_size = 16384;
--pragma page_size = 4096;
create virtual table vec_items using vec0(
  embedding float[1536]
);
-- 65s (limit 1e5), ~615MB on disk
-- Bulk-load vectors straight from a .npy file via the vec_npy_* table functions.
insert into vec_items
  select
    rowid,
    vector
  from vec_npy_each(vec_npy_file('examples/dbpedia-openai/data/vectors.npy'))
  limit 1e5;

View file

@ -0,0 +1,31 @@
.timer on
-- The same KNN query repeated 5x so .timer shows cold- vs warm-cache latency.
-- :k is bound by the caller; the probe vector is row 100's own embedding.
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
  and k = :k
order by distance;
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
  and k = :k
order by distance;
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
  and k = :k
order by distance;
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
  and k = :k
order by distance;
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
  and k = :k
order by distance;

View file

@ -0,0 +1,85 @@
import sqlite3
import time


def connect(path):
    """Open `path` with the sqlite-vec extension loaded from ../dist/vec0,
    plus its npy-reading entrypoint (sqlite3_vec_fs_read_init)."""
    db = sqlite3.connect(path)
    db.enable_load_extension(True)
    db.load_extension("../dist/vec0")
    db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
    db.enable_load_extension(False)
    return db


# Insert-speed benchmark grid: SQLite page size x vec0 chunk size x element
# type; each combination writes its own database under dbs/.
page_sizes = [  # 4096, 8192,
    16384,
    32768,
]
chunk_sizes = [128, 256, 1024, 2048]
types = ["f32", "int8", "bit"]
SRC = "../examples/dbpedia-openai/data/vectors.npy"
for page_size in page_sizes:
    for chunk_size in chunk_sizes:
        for t in types:
            print(f"{t} page_size={page_size}, chunk_size={chunk_size}")
            t0 = time.time()
            db = connect(f"dbs/test.{page_size}.{chunk_size}.{t}.db")
            db.execute(f"pragma page_size = {page_size}")
            with db:
                db.execute(
                    f"""
                    create virtual table vec_items using vec0(
                      embedding {t}[1536],
                      chunk_size={chunk_size}
                    )
                    """
                )
                # Quantize on the fly for the non-float column types.
                func = "vector"
                if t == "int8":
                    func = "vec_quantize_i8(vector, 'unit')"
                if t == "bit":
                    func = "vec_quantize_binary(vector)"
                db.execute(
                    f"""
                    insert into vec_items
                      select rowid, {func}
                      from vec_npy_each(vec_npy_file(?))
                      limit 100000
                    """,
                    [SRC],
                )
            elapsed = time.time() - t0
            print(elapsed)
# Recorded results from a previous run, kept for reference.
"""
# for 100_000
page_size=4096, chunk_size=256
3.5894200801849365
page_size=4096, chunk_size=1024
60.70046401023865
page_size=4096, chunk_size=2048
201.04426288604736
page_size=8192, chunk_size=256
7.034514904022217
page_size=8192, chunk_size=1024
9.983598947525024
page_size=8192, chunk_size=2048
12.318921089172363
page_size=16384, chunk_size=256
4.97080397605896
page_size=16384, chunk_size=1024
6.051195859909058
page_size=16384, chunk_size=2048
8.492683172225952
page_size=32768, chunk_size=256
5.906642198562622
page_size=32768, chunk_size=1024
5.876632213592529
page_size=32768, chunk_size=2048
5.420510292053223
"""

View file

@ -0,0 +1,83 @@
import sqlite3
import time
from random import randrange
from statistics import mean


def connect(path):
    """Open `path` with the sqlite-vec extension loaded from ../dist/vec0,
    plus its npy-reading entrypoint (sqlite3_vec_fs_read_init)."""
    print(path)
    db = sqlite3.connect(path)
    db.enable_load_extension(True)
    db.load_extension("../dist/vec0")
    db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
    db.enable_load_extension(False)
    return db


# Query-latency benchmark over the databases built by the insert script:
# 20 random-probe KNN queries per (type, page_size, chunk_size) database.
page_sizes = [  # 4096, 8192,
    16384,
    32768,
]
chunk_sizes = [128, 256, 1024, 2048]
types = ["f32", "int8", "bit"]
types.reverse()
for t in types:
    for page_size in page_sizes:
        for chunk_size in chunk_sizes:
            print(f"page_size={page_size}, chunk_size={chunk_size}")
            # Quantize the probe vector the same way the stored column was.
            func = "embedding"
            if t == "int8":
                func = "vec_quantize_i8(embedding, 'unit')"
            if t == "bit":
                func = "vec_quantize_binary(embedding)"
            times = []
            trials = 20
            db = connect(f"dbs/test.{page_size}.{chunk_size}.{t}.db")
            for trial in range(trials):
                t0 = time.time()
                results = db.execute(
                    f"""
                    select rowid
                    from vec_items
                    where embedding match (select {func} from vec_items where rowid = ?)
                      and k = 10
                    order by distance
                    """,
                    [randrange(100000)],
                ).fetchall()
                times.append(time.time() - t0)
            print(mean(times))
# Recorded results from a previous run, kept for reference.
"""
page_size=4096, chunk_size=256
0.2635102152824402
page_size=4096, chunk_size=1024
0.2609449863433838
page_size=4096, chunk_size=2048
0.275589919090271
page_size=8192, chunk_size=256
0.18621582984924318
page_size=8192, chunk_size=1024
0.20939643383026124
page_size=8192, chunk_size=2048
0.22376316785812378
page_size=16384, chunk_size=256
0.16012665033340454
page_size=16384, chunk_size=1024
0.18346318006515502
page_size=16384, chunk_size=2048
0.18224761486053467
page_size=32768, chunk_size=256
0.14202518463134767
page_size=32768, chunk_size=1024
0.15340715646743774
page_size=32768, chunk_size=2048
0.18018823862075806
"""

View file

@ -0,0 +1,24 @@
import sqlite3
import time


def connect(path):
    """Open `path` with the sqlite-vec extension loaded from ../dist/vec0,
    plus its npy-reading entrypoint (sqlite3_vec_fs_read_init)."""
    db = sqlite3.connect(path)
    db.enable_load_extension(True)
    db.load_extension("../dist/vec0")
    db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
    db.enable_load_extension(False)
    return db


# Sanity-check each generated benchmark database: effective page size and
# number of stored rows (vec_items_rowids is vec0's internal shadow table).
page_sizes = [4096, 8192, 16384, 32768]
chunk_sizes = [256, 1024, 2048]
for page_size in page_sizes:
    for chunk_size in chunk_sizes:
        print(f"page_size={page_size}, chunk_size={chunk_size}")
        t0 = time.time()
        db = connect(f"dbs/test.{page_size}.{chunk_size}.db")
        print(db.execute("pragma page_size").fetchone()[0])
        print(db.execute("select count(*) from vec_items_rowids").fetchone()[0])

View file

@ -0,0 +1,5 @@
# `sqlite-vec` statically compiled in the SQLite CLI
You can compile your own version of the `sqlite3` CLI with `sqlite-vec` builtin. The process is not well documented, but the special `SQLITE_EXTRA_INIT` compile option can be used to "inject" code at initialization time. See the `Makefile` at the root of this project for some more info.
The `core_init.c` file here demonstrates auto-loading the `sqlite-vec` entrypoints at startup.

View file

@ -0,0 +1,8 @@
#include "sqlite3.h"
#include "sqlite-vec.h"
#include <stdio.h>

/* SQLITE_EXTRA_INIT hook for the custom CLI build: auto-register both
** sqlite-vec entrypoints so every new connection gets them. Returns
** SQLITE_OK on success, or the first failing auto_extension result. */
int core_init(const char *dummy) {
  int rc = sqlite3_auto_extension((void *)sqlite3_vec_init);
  if(rc != SQLITE_OK) return rc;
  return sqlite3_auto_extension((void *)sqlite3_vec_fs_read_init);
}

5
examples/wasm/README.md Normal file
View file

@ -0,0 +1,5 @@
# `sqlite-vec` statically compiled into WASM builds
You can compile your own version of SQLite's WASM build with `sqlite-vec` builtin. Dynamically loading SQLite extensions is not supported in the official WASM build yet, but you can statically compile extensions in. It's not well documented, but the `sqlite3_wasm_extra_init` option in the SQLite `ext/wasm` Makefile allows you to inject your own code at initialization time. See the `Makefile` at the root of the project for more info.
The `wasm.c` file here demonstrates auto-loading the `sqlite-vec` entrypoints at startup.

6
examples/wasm/wasm.c Normal file
View file

@ -0,0 +1,6 @@
#include "sqlite3.h"
#include "sqlite-vec.h"

/* Hook called by SQLite's WASM build at startup (see ext/wasm's
** sqlite3_wasm_extra_init.c option): register sqlite-vec so every new
** database connection gets the vec functions automatically. */
int sqlite3_wasm_extra_init(const char *unused) {
  (void)unused;
  /* BUG FIX: the function is declared `int` but had no return statement,
  ** which is undefined behavior when the caller reads the result. Also gave
  ** the parameter a name (an unnamed parameter is not valid pre-C23 C). */
  return sqlite3_auto_extension((void (*)(void)) sqlite3_vec_init);
}

23
site/index.html Normal file
View file

@ -0,0 +1,23 @@
<html>
<body>
<h1>sqlite-vec</h1>
<div id="target">
</div>
<link rel="stylesheet" href="./slim.css"/>
<!-- Demo page: loads the custom sqlite3 WASM build (sqlite-vec statically
     linked in), appends vec_version() to the heading, then mounts the
     sqlite-wasm-toolkit query widget into #target. -->
<script type="module">
import {attach} from "./slim.js";
import {default as init} from "./sqlite3.mjs";
const sqlite3 = await init();
const v = new sqlite3.oo1.DB(":memory:").selectValue('select vec_version()');
document.querySelector('h1').textContent += `(${v})`;
attach(document.body.querySelector('#target'), sqlite3,
`select
sqlite_version(),
vec_version(),
vec_to_json(X'00000000000080bf');
`);
</script>
</body>
</html>

4607
sqlite-vec.c Normal file

File diff suppressed because it is too large Load diff

11
sqlite-vec.h.tmpl Normal file
View file

@ -0,0 +1,11 @@
#include "sqlite3ext.h"
/* Template for sqlite-vec.h: the ${...} placeholders are substituted by
** envsubst from the Makefile's `sqlite-vec.h` target. */
#define SQLITE_VEC_VERSION "v${VERSION}"
#define SQLITE_VEC_DATE "${DATE}"
#define SQLITE_VEC_SOURCE "${SOURCE}"
/* Main extension entrypoint. */
int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg,
                     const sqlite3_api_routines *pApi);
/* Secondary entrypoint for the filesystem-reading functions. */
int sqlite3_vec_fs_read_init(sqlite3 *db, char **pzErrMsg,
                             const sqlite3_api_routines *pApi);

1
tests/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
target/

16
tests/Cargo.lock generated Normal file
View file

@ -0,0 +1,16 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "cc"
version = "1.0.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5"
[[package]]
name = "tests"
version = "0.1.0"
dependencies = [
"cc",
]

15
tests/Cargo.toml Normal file
View file

@ -0,0 +1,15 @@
[package]
name = "tests"
version = "0.1.0"
edition = "2021"
[dependencies]
[build-dependencies]
cc = "1.0"
[[bin]]
name = "unittest"
path = "unittest.rs"

13
tests/build.rs Normal file
View file

@ -0,0 +1,13 @@
use std::env;
use std::path::{Path, PathBuf};
use std::process::Command;

// Cargo build script: compile ../sqlite-vec.c with the `cc` crate and link
// it statically into the test binary.
// NOTE(review): the env/Path/Command imports and the `usleep.c` rerun hint
// appear unused/stale in this file — confirm before removing.
fn main() {
    cc::Build::new()
        .file("../sqlite-vec.c")
        .include(".")
        .static_flag(true)
        .compile("sqlite-vec-internal");
    println!("cargo:rerun-if-changed=usleep.c");
    println!("cargo:rerun-if-changed=build.rs");
}

View file

@ -0,0 +1,12 @@
#include <stdlib.h>
/* BUG FIX: int32_t requires <stdint.h>; <stdlib.h> alone does not
** guarantee the fixed-width integer typedefs are visible. */
#include <stdint.h>

/* Writes into `out` the indices of the `k` smallest values in `distances`. */
int min_idx(
  // list of distances, size n
  const float *distances,
  // number of entries in distances
  int32_t n,
  // output array of size k, the indices of the lowest k values in distances
  int32_t *out,
  // output number of elements
  int32_t k
);

49
tests/test-correctness.py Normal file
View file

@ -0,0 +1,49 @@
import sqlite3
import json

# Recall check: run vec0 KNN (k=100) for 10 SIFT1M query vectors and compare
# each result list against the ground-truth neighbors (k100) stored in the
# attached sift1m-base.db, printing mean recall@100.
db = sqlite3.connect("test2.db")
db.enable_load_extension(True)
db.load_extension("dist/vec0")
db.enable_load_extension(False)
db.row_factory = sqlite3.Row
db.execute('attach database "sift1m-base.db" as sift1m')
# def test_sift1m():
rows = db.execute(
    '''
    with q as (
      select rowid, vector, k100 from sift1m.sift1m_query limit 10
    ),
    results as (
      select
        q.rowid as query_rowid,
        vec_sift1m.rowid as vec_rowid,
        distance,
        k100 as k100_groundtruth
      from q
      join vec_sift1m
      where
        vec_sift1m.vector match q.vector
        and k = 100
      order by distance
    )
    select
      query_rowid,
      json_group_array(vec_rowid order by distance) as topk,
      k100_groundtruth,
      json_group_array(vec_rowid order by distance) == k100_groundtruth
    from results
    group by 1;
    ''').fetchall()
results = []
for row in rows:
    actual = json.loads(row["topk"])
    expected = json.loads(row["k100_groundtruth"])
    # Fraction of the 100 ground-truth neighbors recovered (recall@100).
    ncorrect = sum([x in expected for x in actual])
    results.append(ncorrect / 100.0)
from statistics import mean

print(mean(results))

874
tests/test-loadable.py Normal file
View file

@ -0,0 +1,874 @@
# ruff: noqa: E731
import re
from typing import List
import sqlite3
import unittest
from random import random
import struct
import inspect
import pytest
import json
import numpy as np
from math import isclose
EXT_PATH = "./dist/vec0"
def bitmap_full(n: int) -> bytes:
    """Return an n-bit bitmap with every bit set (n must be divisible by 8).

    BUG FIX: the return annotation said ``bytearray`` but the function
    returns immutable ``bytes`` (and tests compare against bytes literals).
    """
    assert (n % 8) == 0
    return bytes([0xFF] * (n // 8))
def bitmap_zerod(n: int) -> bytes:
    """Return an n-bit bitmap with every bit clear (n must be divisible by 8).

    BUG FIX: the return annotation said ``bytearray`` but the function
    returns immutable ``bytes``.
    """
    assert (n % 8) == 0
    return bytes([0x00] * (n // 8))
def f32_zerod(n: int) -> bytes:
    """Return a blob of n float32 zeros (4 zero bytes per element).

    BUG FIX: the return annotation said ``bytearray`` but the function
    returns immutable ``bytes``.
    """
    return bytes([0x00, 0x00, 0x00, 0x00] * n)

# Bits per byte; used when sizing validity bitmaps / rowid blobs below.
CHAR_BIT = 8
def _f32(list):
return struct.pack("%sf" % len(list), *list)
def _int8(list):
return struct.pack("%sb" % len(list), *list)
def connect(ext, path=":memory:"):
    """Open a SQLite database at ``path``, load the extension at ``ext``, and
    record in temp tables which functions/modules the extension registered
    (the delta against the pre-load pragma listings)."""
    conn = sqlite3.connect(path)
    # Snapshot the built-in function/module names before loading.
    for snapshot in (
        "create temp table base_functions as select name from pragma_function_list",
        "create temp table base_modules as select name from pragma_module_list",
    ):
        conn.execute(snapshot)
    conn.enable_load_extension(True)
    conn.load_extension(ext)
    # Everything new after loading belongs to the extension.
    conn.execute(
        "create temp table loaded_functions as select name from pragma_function_list where name not in (select name from base_functions) order by name"
    )
    conn.execute(
        "create temp table loaded_modules as select name from pragma_module_list where name not in (select name from base_modules) order by name"
    )
    conn.row_factory = sqlite3.Row
    return conn
# Module-level connection shared by all tests below; loads ./dist/vec0.
db = connect(EXT_PATH)
# db.load_extension(EXT_PATH, entrypoint="trace_debug")


def explain_query_plan(sql):
    """Return the single-row EXPLAIN QUERY PLAN 'detail' string for `sql`."""
    return db.execute("explain query plan " + sql).fetchone()["detail"]
def execute_all(cursor, sql, args=None):
    """Run `sql` with optional positional `args` and return all rows as plain
    dicts (column name -> value).

    Requires the connection/cursor to use sqlite3.Row as its row factory,
    which connect() configures.
    """
    if args is None:
        args = []
    rows = cursor.execute(sql, args).fetchall()
    # Comprehension replaces the non-idiomatic map(lambda x: dict(x), ...).
    return [dict(row) for row in rows]
def spread_args(args):
    """Return a '?,?,...,?' SQL placeholder string, one '?' per argument."""
    return ",".join("?" for _ in args)
# Every SQL function the extension is expected to register, sorted by name.
# test_funcs compares this list order- and duplicate-sensitively against
# pragma_function_list.
# NOTE(review): "vec_quantize_i8" appears twice — presumably registered at
# two arities so the pragma lists it twice; confirm against sqlite-vec.c.
FUNCTIONS = [
    "vec_add",
    "vec_bit",
    "vec_debug",
    "vec_distance_cosine",
    "vec_distance_hamming",
    "vec_distance_l2",
    "vec_f32",
    "vec_int8",
    "vec_length",
    "vec_normalize",
    "vec_quantize_binary",
    "vec_quantize_i8",
    "vec_quantize_i8",
    "vec_slice",
    "vec_sub",
    "vec_to_json",
    "vec_version",
]
# Virtual-table modules the extension registers.
MODULES = ["vec0", "vec_each", "vec_npy_each"]
def test_funcs():
    """The registered SQL functions must equal FUNCTIONS exactly
    (same order, duplicates included)."""
    rows = db.execute("select name from loaded_functions").fetchall()
    assert [row[0] for row in rows] == FUNCTIONS
def test_modules():
    """The registered virtual-table modules must equal MODULES exactly."""
    rows = db.execute("select name from loaded_modules").fetchall()
    assert [row[0] for row in rows] == MODULES
def test_vec_version():
    """vec_version() returns a version tag; only the leading 'v' is checked."""
    version = db.execute("select vec_version()").fetchone()[0]
    assert version[0] == "v"
def test_vec_debug():
    """vec_debug() output is exactly four newline-separated lines."""
    debug_text = db.execute("select vec_debug()").fetchone()[0]
    assert len(debug_text.split("\n")) == 4
def test_vec_bit():
    """vec_bit() round-trips blobs, tags subtype 224, and rejects empty or
    non-blob inputs."""

    def vec_bit(*args):
        return db.execute("select vec_bit(?)", args).fetchone()[0]

    assert vec_bit(b"\xff") == b"\xff"
    assert db.execute("select subtype(vec_bit(X'FF'))").fetchone()[0] == 224
    with pytest.raises(
        sqlite3.OperationalError, match="zero-length vectors are not supported."
    ):
        db.execute("select vec_bit(X'')").fetchone()
    for bad in (None, "text", 1, 1.999):
        with pytest.raises(
            sqlite3.OperationalError, match="Unknown type for bitvector."
        ):
            db.execute("select vec_bit(?)", [bad]).fetchone()
def test_vec_f32():
    """vec_f32() accepts a float32 blob or a JSON array of numbers and
    returns the canonical float32 blob (subtype 223); invalid inputs raise
    OperationalError with specific messages."""
    vec_f32 = lambda *args: db.execute("select vec_f32(?)", args).fetchone()[0]
    assert vec_f32(b"\x00\x00\x00\x00") == b"\x00\x00\x00\x00"
    assert vec_f32("[0.0000]") == b"\x00\x00\x00\x00"
    # fmt: off
    tests = [
        [0],
        [0, 0, 0, 0],
        [1, -1, 10, -10],
        [-0, 0, .0001, -.0001],
    ]
    # fmt: on
    # JSON text and the equivalent packed blob must round-trip identically.
    for test in tests:
        assert vec_f32(json.dumps(test)) == _f32(test)
    assert db.execute("select subtype(vec_f32(X'00000000'))").fetchone()[0] == 223
    with pytest.raises(
        sqlite3.OperationalError, match="zero-length vectors are not supported."
    ):
        vec_f32(b"")
    for invalid in [None, 1, 1.2]:
        with pytest.raises(
            sqlite3.OperationalError,
            match=re.escape(
                "Input must have type BLOB (compact format) or TEXT (JSON)",
            ),
        ):
            vec_f32(invalid)
    with pytest.raises(
        sqlite3.OperationalError,
        match="invalid float32 vector BLOB length. Must be divisible by 4, found 5",
    ):
        vec_f32(b"aaaaa")
    with pytest.raises(
        sqlite3.OperationalError,
        match=re.escape("JSON array parsing error: Input does not start with '['"),
    ):
        vec_f32("1]")
    # TODO more tests
    # TODO different error message
    with pytest.raises(
        sqlite3.OperationalError,
        match="zero-length vectors are not supported.",
    ):
        vec_f32("[")
    # vec_f32("[]")
def test_vec_int8():
    """vec_int8() round-trips int8 blobs and tags subtype 225."""

    def vec_int8(*args):
        return db.execute("select vec_int8(?)", args).fetchone()[0]

    assert vec_int8(b"\x00") == _int8([0])
    assert vec_int8(b"\x00\x0f") == _int8([0, 15])
    assert db.execute("select subtype(vec_int8(?))", [b"\x00"]).fetchone()[0] == 225
def npy_cosine(a, b):
    """Reference cosine *distance* (1 - cosine similarity) via numpy."""
    similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return 1 - similarity
def npy_l2(a, b):
    """Reference Euclidean (L2) distance between two vectors via numpy."""
    diff = a - b
    return np.linalg.norm(diff)
def test_vec_distance_cosine():
    """vec_distance_cosine matches the numpy reference for f32 and int8."""
    vec_distance_cosine = lambda *args, a="?", b="?": db.execute(
        f"select vec_distance_cosine({a}, {b})", args
    ).fetchone()[0]

    # Compare against npy_cosine on identical data; int8 inputs are wrapped
    # in vec_int8() so the extension sees the right element type.
    def check(a, b, dtype=np.float32):
        if dtype == np.float32:
            transform = "?"
        elif dtype == np.int8:
            transform = "vec_int8(?)"
        a = np.array(a, dtype=dtype)
        b = np.array(b, dtype=dtype)
        x = vec_distance_cosine(a, b, a=transform, b=transform)
        y = npy_cosine(a, b)
        assert isclose(x, y, abs_tol=1e-6)

    check([1.2, 0.1], [0.4, -0.4])
    check([-1.2, -0.1], [-0.4, 0.4])
    check([1, 2, 3], [-9, -8, -7], dtype=np.int8)
    # Pinned regression value for JSON text inputs.
    assert vec_distance_cosine("[1.1, 1.0]", "[1.2, 1.2]") == 0.001131898257881403
def test_vec_distance_hamming():
    """Hamming distance counts differing bits and is defined only for
    bitvectors; f32/int8 inputs are rejected."""

    def vec_distance_hamming(*args):
        return db.execute(
            "select vec_distance_hamming(vec_bit(?), vec_bit(?))", args
        ).fetchone()[0]

    assert vec_distance_hamming(b"\xff", b"\x00") == 8
    assert vec_distance_hamming(b"\xff", b"\x01") == 7
    assert vec_distance_hamming(b"\xab", b"\xab") == 0
    for sql, message in (
        (
            "select vec_distance_hamming(vec_f32('[1.0]'), vec_f32('[1.0]'))",
            "Cannot calculate hamming distance between two float32 vectors.",
        ),
        (
            "select vec_distance_hamming(vec_int8(X'FF'), vec_int8(X'FF'))",
            "Cannot calculate hamming distance between two int8 vectors.",
        ),
    ):
        with pytest.raises(sqlite3.OperationalError, match=message):
            db.execute(sql)
def test_vec_distance_l2():
    """vec_distance_l2 matches the numpy L2 reference for f32 and int8."""
    vec_distance_l2 = lambda *args, a="?", b="?": db.execute(
        f"select vec_distance_l2({a}, {b})", args
    ).fetchone()[0]

    # Compare against npy_l2 on identical data; int8 inputs are wrapped in
    # vec_int8() so the extension sees the right element type.
    def check(a, b, dtype=np.float32):
        if dtype == np.float32:
            transform = "?"
        elif dtype == np.int8:
            transform = "vec_int8(?)"
        a = np.array(a, dtype=dtype)
        b = np.array(b, dtype=dtype)
        x = vec_distance_l2(a, b, a=transform, b=transform)
        y = npy_l2(a, b)
        assert isclose(x, y, abs_tol=1e-6)

    check([1.2, 0.1], [0.4, -0.4])
    check([-1.2, -0.1], [-0.4, 0.4])
    check([1, 2, 3], [-9, -8, -7], dtype=np.int8)
def test_vec_length():
    """vec_length() counts elements per type: f32 = bytes/4, int8 = bytes,
    bit = bytes*8; empty vectors always raise."""

    def test_f32():
        vec_length = lambda *args: db.execute("select vec_length(?)", args).fetchone()[
            0
        ]
        assert vec_length(b"\xAA\xBB\xCC\xDD") == 1
        assert vec_length(b"\xAA\xBB\xCC\xDD\x01\x02\x03\x04") == 2
        assert vec_length(f32_zerod(1024)) == 1024
        with pytest.raises(
            sqlite3.OperationalError, match="zero-length vectors are not supported."
        ):
            assert vec_length(b"") == 0
        with pytest.raises(
            sqlite3.OperationalError, match="zero-length vectors are not supported."
        ):
            vec_length("[]")

    def test_int8():
        vec_length_int8 = lambda *args: db.execute(
            "select vec_length(vec_int8(?))", args
        ).fetchone()[0]
        assert vec_length_int8(b"\xAA") == 1
        assert vec_length_int8(b"\xAA\xBB\xCC\xDD") == 4
        assert vec_length_int8(b"\xAA\xBB\xCC\xDD\x01\x02\x03\x04") == 8
        with pytest.raises(
            sqlite3.OperationalError, match="zero-length vectors are not supported."
        ):
            assert vec_length_int8(b"") == 0

    def test_bit():
        vec_length_bit = lambda *args: db.execute(
            "select vec_length(vec_bit(?))", args
        ).fetchone()[0]
        assert vec_length_bit(b"\xAA") == 8
        assert vec_length_bit(b"\xAA\xBB\xCC\xDD") == 8 * 4
        assert vec_length_bit(b"\xAA\xBB\xCC\xDD\x01\x02\x03\x04") == 8 * 8
        with pytest.raises(
            sqlite3.OperationalError, match="zero-length vectors are not supported."
        ):
            assert vec_length_bit(b"") == 0

    # pytest collects only the outer function; run the nested cases explicitly.
    test_f32()
    test_int8()
    test_bit()
def test_vec_normalize():
    """vec_normalize() scales a float32 vector to unit L2 norm."""

    def vec_normalize(*args):
        return db.execute("select vec_normalize(?)", args).fetchone()[0]

    normalized = vec_normalize(_f32([1, 2, -1, -2]))
    expected = [
        0.3162277638912201,
        0.6324555277824402,
        -0.3162277638912201,
        -0.6324555277824402,
    ]
    assert list(struct.unpack_from("4f", normalized)) == expected
def test_vec_slice():
    """vec_slice(vector, start, end): element indices for f32/int8, bit
    indices (byte-aligned only) for bitvectors; bad ranges raise."""
    vec_slice = lambda *args, f="?": db.execute(
        f"select vec_slice({f}, ?, ?)", args
    ).fetchone()[0]
    assert vec_slice(_f32([1.1, 2.2, 3.3]), 0, 3) == _f32([1.1, 2.2, 3.3])
    assert vec_slice(_f32([1.1, 2.2, 3.3]), 0, 2) == _f32([1.1, 2.2])
    assert vec_slice(_f32([1.1, 2.2, 3.3]), 0, 1) == _f32([1.1])
    assert vec_slice(_int8([1, 2, 3]), 0, 3, f="vec_int8(?)") == _int8([1, 2, 3])
    assert vec_slice(_int8([1, 2, 3]), 0, 2, f="vec_int8(?)") == _int8([1, 2])
    assert vec_slice(_int8([1, 2, 3]), 0, 1, f="vec_int8(?)") == _int8([1])
    assert vec_slice(b"\xAA\xBB\xCC\xDD", 0, 8, f="vec_bit(?)") == b"\xAA"
    assert vec_slice(b"\xAA\xBB\xCC\xDD", 8, 16, f="vec_bit(?)") == b"\xBB"
    assert vec_slice(b"\xAA\xBB\xCC\xDD", 8, 24, f="vec_bit(?)") == b"\xBB\xCC"
    assert vec_slice(b"\xAA\xBB\xCC\xDD", 0, 32, f="vec_bit(?)") == b"\xAA\xBB\xCC\xDD"
    with pytest.raises(
        sqlite3.OperationalError, match="start index must be divisible by 8."
    ):
        vec_slice(b"\xAA\xBB\xCC\xDD", 2, 32, f="vec_bit(?)")
    with pytest.raises(
        sqlite3.OperationalError, match="end index must be divisible by 8."
    ):
        vec_slice(b"\xAA\xBB\xCC\xDD", 0, 31, f="vec_bit(?)")
    # NOTE(review): "postive" below mirrors the typo in the extension's error
    # text; fix both sides together or these matches break.
    with pytest.raises(
        sqlite3.OperationalError, match="slice 'start' index must be a postive number."
    ):
        vec_slice(b"\xab\xab\xab\xab", -1, 1)
    with pytest.raises(
        sqlite3.OperationalError, match="slice 'end' index must be a postive number."
    ):
        vec_slice(b"\xab\xab\xab\xab", 0, -3)
    with pytest.raises(
        sqlite3.OperationalError,
        match="slice 'start' index is greater than the number of dimensions",
    ):
        vec_slice(b"\xab\xab\xab\xab", 2, 3)
    with pytest.raises(
        sqlite3.OperationalError,
        match="slice 'end' index is greater than the number of dimensions",
    ):
        vec_slice(b"\xab\xab\xab\xab", 0, 2)
    with pytest.raises(
        sqlite3.OperationalError,
        match="slice 'start' index is greater than 'end' index",
    ):
        vec_slice(b"\xab\xab\xab\xab", 1, 0)
def test_vec_add():
    """vec_add: elementwise addition for f32 and int8; bitvectors and mixed
    element types are rejected."""
    vec_add = lambda *args, a="?", b="?": db.execute(
        f"select vec_add({a}, {b})", args
    ).fetchone()[0]
    assert vec_add("[1]", "[2]") == _f32([3])
    assert vec_add("[.1]", "[.2]") == _f32([0.3])
    assert vec_add(_int8([1]), _int8([2]), a="vec_int8(?)", b="vec_int8(?)") == _int8(
        [3]
    )
    with pytest.raises(
        sqlite3.OperationalError,
        match="Cannot add two bitvectors together.",
    ):
        vec_add(b"0xff", b"0xff", a="vec_bit(?)", b="vec_bit(?)")
    # NOTE(review): "mistmatch" mirrors the typo in the extension's error
    # text; fix both sides together or these matches break.
    with pytest.raises(
        sqlite3.OperationalError,
        match="Vector type mistmatch. First vector has type float32, while the second has type int8.",
    ):
        vec_add(_f32([1]), _int8([2]), b="vec_int8(?)")
    with pytest.raises(
        sqlite3.OperationalError,
        match="Vector type mistmatch. First vector has type int8, while the second has type float32.",
    ):
        vec_add(_int8([2]), _f32([1]), a="vec_int8(?)")
def test_vec_sub():
    """vec_sub: elementwise subtraction for f32 and int8; bitvectors and
    mixed element types are rejected."""
    vec_sub = lambda *args, a="?", b="?": db.execute(
        f"select vec_sub({a}, {b})", args
    ).fetchone()[0]
    assert vec_sub("[1]", "[2]") == _f32([-1])
    assert vec_sub("[.1]", "[.2]") == _f32([-0.1])
    assert vec_sub(_int8([11]), _int8([2]), a="vec_int8(?)", b="vec_int8(?)") == _int8(
        [9]
    )
    with pytest.raises(
        sqlite3.OperationalError,
        match="Cannot subtract two bitvectors together.",
    ):
        vec_sub(b"0xff", b"0xff", a="vec_bit(?)", b="vec_bit(?)")
    # NOTE(review): "mistmatch" mirrors the typo in the extension's error
    # text; fix both sides together or these matches break.
    with pytest.raises(
        sqlite3.OperationalError,
        match="Vector type mistmatch. First vector has type float32, while the second has type int8.",
    ):
        vec_sub(_f32([1]), _int8([2]), b="vec_int8(?)")
    with pytest.raises(
        sqlite3.OperationalError,
        match="Vector type mistmatch. First vector has type int8, while the second has type float32.",
    ):
        vec_sub(_int8([2]), _f32([1]), a="vec_int8(?)")
def test_vec_to_json():
    """vec_to_json renders f32 with six decimals, int8 as ints, and bit
    vectors as 0/1 per bit (LSB first)."""

    def vec_to_json(*args, input="?"):
        return db.execute(f"select vec_to_json({input})", args).fetchone()[0]

    assert vec_to_json("[1, 2, 3]") == "[1.000000,2.000000,3.000000]"
    assert vec_to_json(b"\x00\x00\x00\x00\x00\x00\x80\xbf") == "[0.000000,-1.000000]"
    assert vec_to_json(b"\x04", input="vec_int8(?)") == "[4]"
    assert vec_to_json(b"\x04\xff", input="vec_int8(?)") == "[4,-1]"
    assert vec_to_json(b"\xff", input="vec_bit(?)") == "[1,1,1,1,1,1,1,1]"
    assert vec_to_json(b"\x0f", input="vec_bit(?)") == "[1,1,1,1,0,0,0,0]"
@pytest.mark.skip(reason="TODO")
def test_vec_quantize_i8():
    """Placeholder until vec_quantize_i8() semantics are settled."""
    result = db.execute("select vec_quantize_i8()").fetchone()[0]
    assert result == 111
@pytest.mark.skip(reason="TODO")
def test_vec_quantize_binary():
    """Placeholder until vec_quantize_binary() semantics are settled."""
    result = db.execute("select vec_quantize_binary()").fetchone()[0]
    assert result == 111
@pytest.mark.skip(reason="TODO")
def test_vec0():
    # Placeholder; vec0 behavior is exercised by test_smoke, test_vec0_updates,
    # and the stress tests in this file.
    pass
def test_vec0_updates():
    """INSERT then UPDATE on a vec0 table with f32 + int8 + bit columns.

    Uses a fresh connection (shadows the module-level db) so tables created
    by other tests don't interfere."""
    db = connect(EXT_PATH)
    db.execute(
        """
        create virtual table t using vec0(
          aaa float[128],
          bbb int8[128],
          ccc bit[128]
        );
        """
    )
    db.execute(
        "insert into t values (?, ?, vec_int8(?), vec_bit(?))",
        [
            1,
            np.full((128,), 0.0001, dtype="float32"),
            np.full((128,), 4, dtype="int8"),
            bitmap_full(128),
        ],
    )
    assert execute_all(db, "select * from t") == [
        {
            "rowid": 1,
            "aaa": _f32([0.0001] * 128),
            "bbb": _int8([4] * 128),
            "ccc": bitmap_full(128),
        }
    ]
    # Updating one column must leave the other columns untouched.
    db.execute(
        "update t set aaa = ? where rowid = ?",
        [np.full((128,), 0.00011, dtype="float32"), 1],
    )
    assert execute_all(db, "select * from t") == [
        {
            "rowid": 1,
            "aaa": _f32([0.00011] * 128),
            "bbb": _int8([4] * 128),
            "ccc": bitmap_full(128),
        }
    ]
def test_vec_each():
    """vec_each iterates a float32 blob, yielding one (rowid, value) per
    element starting at rowid 0."""

    def vec_each_f32(*args):
        return execute_all(db, "select rowid, * from vec_each(vec_f32(?))", args)

    expected = [
        {"rowid": 0, "value": 1.0},
        {"rowid": 1, "value": 2.0},
        {"rowid": 2, "value": 3.0},
    ]
    assert vec_each_f32(_f32([1.0, 2.0, 3.0])) == expected
import io
def to_npy(arr):
    """Serialize a numpy array into .npy-format bytes, entirely in memory."""
    out = io.BytesIO()
    np.save(out, arr)
    return out.getvalue()
def test_vec_npy_each():
    """vec_npy_each iterates .npy blobs: a 1-D array yields one row, a 2-D
    array yields one row per matrix row."""
    vec_npy_each = lambda *args: execute_all(
        db, "select rowid, * from vec_npy_each(?)", args
    )
    assert vec_npy_each(to_npy(np.array([1.1, 2.2, 3.3], dtype=np.float32))) == [
        {
            "rowid": 0,
            "vector": _f32([1.1, 2.2, 3.3]),
        },
    ]
    assert vec_npy_each(to_npy(np.array([[1.1, 2.2, 3.3]], dtype=np.float32))) == [
        {
            "rowid": 0,
            "vector": _f32([1.1, 2.2, 3.3]),
        },
    ]
    assert vec_npy_each(
        to_npy(np.array([[1.1, 2.2, 3.3], [9.9, 8.8, 7.7]], dtype=np.float32))
    ) == [
        {
            "rowid": 0,
            "vector": _f32([1.1, 2.2, 3.3]),
        },
        {
            "rowid": 1,
            "vector": _f32([9.9, 8.8, 7.7]),
        },
    ]
def test_smoke():
    """End-to-end vec0 smoke test: shadow-table layout, query-plan shapes,
    and the exact chunk blob contents after three single-row inserts."""
    db.execute("create virtual table vec_xyz using vec0( a float[2] )")
    # vec0 creates three shadow tables alongside the virtual table itself.
    assert execute_all(
        db,
        "select name, ncol from pragma_table_list where name like 'vec_xyz%' order by name;",
    ) == [
        {
            "name": "vec_xyz",
            "ncol": 4,
        },
        {
            "name": "vec_xyz_chunks",
            "ncol": 4,
        },
        {
            "name": "vec_xyz_rowids",
            "ncol": 4,
        },
        {
            "name": "vec_xyz_vector_chunks00",
            "ncol": 2,
        },
    ]
    # A fresh chunk: zeroed 1024-slot validity bitmap, 1024 int64 rowid slots,
    # and a zeroed vector blob (1024 slots * 2 dims * 4 bytes).
    chunk = db.execute("select * from vec_xyz_chunks").fetchone()
    assert chunk["chunk_id"] == 1
    assert chunk["validity"] == bytearray(int(1024 / 8))
    assert chunk["rowids"] == bytearray(int(1024 * 8))
    vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone()
    assert vchunk["rowid"] == 1
    assert vchunk["vectors"] == bytearray(int(1024 * 4 * 2))
    # Query-plan shapes: KNN (match + order by distance + limit), full scan,
    # and rowid point lookup.
    assert (
        explain_query_plan(
            "select * from vec_xyz where a match X'' order by distance limit 10"
        )
        == "SCAN vec_xyz VIRTUAL TABLE INDEX 0:knn:"
    )
    assert (
        explain_query_plan("select * from vec_xyz")
        == "SCAN vec_xyz VIRTUAL TABLE INDEX 0:fullscan"
    )
    assert (
        explain_query_plan("select * from vec_xyz where rowid = 4")
        == "SCAN vec_xyz VIRTUAL TABLE INDEX 3:point"
    )
    # Insert rowid 1, vector [0.0, 1.0]: validity bit 0 set, slot 0 filled.
    db.execute("insert into vec_xyz(rowid, a) select 1, X'000000000000803f'")
    chunk = db.execute("select * from vec_xyz_chunks").fetchone()
    assert chunk["chunk_id"] == 1
    assert chunk["validity"] == b"\x01" + bytearray(int(1024 / 8) - 1)
    assert chunk["rowids"] == b"\x01\x00\x00\x00\x00\x00\x00\x00" + bytearray(
        int(1024 * 8) - 8
    )
    vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone()
    assert vchunk["rowid"] == 1
    assert vchunk["vectors"] == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + bytearray(
        int(1024 * 4 * 2) - (2 * 4)
    )
    # Insert rowid 2, vector [0.0, 2.0]: low two validity bits set.
    db.execute("insert into vec_xyz(rowid, a) select 2, X'0000000000000040'")
    chunk = db.execute("select * from vec_xyz_chunks").fetchone()
    assert chunk[
        "rowids"
    ] == b"\x01\x00\x00\x00\x00\x00\x00\x00" + b"\x02\x00\x00\x00\x00\x00\x00\x00" + bytearray(
        int(1024 * 8) - 8 * 2
    )
    assert chunk["chunk_id"] == 1
    assert chunk["validity"] == b"\x03" + bytearray(int(1024 / 8) - 1)
    vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone()
    assert vchunk["rowid"] == 1
    assert vchunk[
        "vectors"
    ] == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + b"\x00\x00\x00\x00\x00\x00\x00\x40" + bytearray(
        int(1024 * 4 * 2) - (2 * 4 * 2)
    )
    # Insert rowid 3, vector [0.0, -1.0]: low three validity bits set.
    db.execute("insert into vec_xyz(rowid, a) select 3, X'00000000000080bf'")
    chunk = db.execute("select * from vec_xyz_chunks").fetchone()
    assert chunk["chunk_id"] == 1
    assert chunk["validity"] == b"\x07" + bytearray(int(1024 / 8) - 1)
    assert chunk[
        "rowids"
    ] == b"\x01\x00\x00\x00\x00\x00\x00\x00" + b"\x02\x00\x00\x00\x00\x00\x00\x00" + b"\x03\x00\x00\x00\x00\x00\x00\x00" + bytearray(
        int(1024 * 8) - 8 * 3
    )
    vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone()
    assert vchunk["rowid"] == 1
    assert vchunk[
        "vectors"
    ] == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + b"\x00\x00\x00\x00\x00\x00\x00\x40" + b"\x00\x00\x00\x00\x00\x00\x80\xbf" + bytearray(
        int(1024 * 4 * 2) - (2 * 4 * 3)
    )
    # db.execute("select * from vec_xyz")
    # A full scan returns all three rows with their raw float32 blobs.
    assert execute_all(db, "select * from vec_xyz") == [
        {"rowid": 1, "a": b"\x00\x00\x00\x00\x00\x00\x80?"},
        {"rowid": 2, "a": b"\x00\x00\x00\x00\x00\x00\x00@"},
        {"rowid": 3, "a": b"\x00\x00\x00\x00\x00\x00\x80\xbf"},
    ]
def test_vec0_stress_small_chunks():
    """1000 inserts into a vec0 table with chunk_size=8: ordering, count,
    reverse scans, and a k=9 KNN query all behave across chunk boundaries."""
    # Row i holds the vector [(i+1)*0.1] * 8.
    data = np.zeros((1000, 8), dtype=np.float32)
    for i in range(1000):
        data[i] = np.array([(i + 1) * 0.1] * 8)
    db.execute("create virtual table vec_small using vec0(chunk_size=8, a float[8])")
    assert execute_all(db, "select rowid, * from vec_small") == []
    # Insert all 1000 rows inside a single transaction.
    with db:
        for row in data:
            db.execute("insert into vec_small(a) values (?) ", [row])
    assert execute_all(db, "select rowid, * from vec_small limit 8") == [
        {"rowid": 1, "a": _f32([0.1] * 8)},
        {"rowid": 2, "a": _f32([0.2] * 8)},
        {"rowid": 3, "a": _f32([0.3] * 8)},
        {"rowid": 4, "a": _f32([0.4] * 8)},
        {"rowid": 5, "a": _f32([0.5] * 8)},
        {"rowid": 6, "a": _f32([0.6] * 8)},
        {"rowid": 7, "a": _f32([0.7] * 8)},
        {"rowid": 8, "a": _f32([0.8] * 8)},
    ]
    assert db.execute("select count(*) from vec_small").fetchone()[0] == 1000
    assert execute_all(
        db, "select rowid, * from vec_small order by rowid desc limit 8"
    ) == [
        {"rowid": 1000, "a": _f32([100.0] * 8)},
        {"rowid": 999, "a": _f32([99.9] * 8)},
        {"rowid": 998, "a": _f32([99.8] * 8)},
        {"rowid": 997, "a": _f32([99.7] * 8)},
        {"rowid": 996, "a": _f32([99.6] * 8)},
        {"rowid": 995, "a": _f32([99.5] * 8)},
        {"rowid": 994, "a": _f32([99.4] * 8)},
        {"rowid": 993, "a": _f32([99.3] * 8)},
    ]
    # KNN around [50.0]*8: exact match is rowid 500, then symmetric pairs
    # outward (499/501, 498/502, ...) with matching distances.
    assert (
        execute_all(
            db,
            """
            select rowid, a, distance
            from vec_small
            where a match ?
            and k = 9
            order by distance
            """,
            [_f32([50.0] * 8)],
        )
        == [
            {
                "a": _f32([500 * 0.1] * 8),
                "distance": 0.0,
                "rowid": 500,
            },
            {
                "a": _f32([499 * 0.1] * 8),
                "distance": 0.2828384041786194,
                "rowid": 499,
            },
            {
                "a": _f32([501 * 0.1] * 8),
                "distance": 0.2828384041786194,
                "rowid": 501,
            },
            {
                "a": _f32([498 * 0.1] * 8),
                "distance": 0.5656875967979431,
                "rowid": 498,
            },
            {
                "a": _f32([502 * 0.1] * 8),
                "distance": 0.5656875967979431,
                "rowid": 502,
            },
            {
                "a": _f32([497 * 0.1] * 8),
                "distance": 0.8485260009765625,
                "rowid": 497,
            },
            {
                "a": _f32([503 * 0.1] * 8),
                "distance": 0.8485260009765625,
                "rowid": 503,
            },
            {
                "a": _f32([496 * 0.1] * 8),
                "distance": 1.1313751935958862,
                "rowid": 496,
            },
            {
                "a": _f32([504 * 0.1] * 8),
                "distance": 1.1313751935958862,
                "rowid": 504,
            },
        ]
    )
def rowids_value(buffer: bytearray) -> List[int]:
    """Decode a chunk 'rowids' blob into a list of little-endian int64s."""
    assert len(buffer) % 8 == 0
    count = len(buffer) // 8
    return list(struct.unpack_from(f"<{count}q", buffer))
import numpy.typing as npt
def cosine_similarity(
    vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True
) -> npt.NDArray[np.float32]:
    """Cosine similarity of `vec` against every row of `mat`.

    With do_norm=False the raw dot products are returned instead.
    """
    scores = vec @ mat.T
    if do_norm:
        scores = scores / (np.linalg.norm(vec) * np.linalg.norm(mat, axis=1))
    return scores
def topk(
    vec: npt.NDArray[np.float32],
    mat: npt.NDArray[np.float32],
    k: int = 5,
    do_norm: bool = True,
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]:
    """Return (row indices, similarities) of the k rows of `mat` most
    cosine-similar to `vec`, sorted by descending similarity."""
    sim = cosine_similarity(vec, mat, do_norm=do_norm)
    # Rather than sorting all similarities and taking the top K, it's faster to
    # argpartition and then just sort the top K.
    # The difference is O(N logN) vs O(N + k logk)
    indices = np.argpartition(-sim, kth=k)[:k]
    order = np.argsort(-sim[indices])
    top = indices[order]
    # BUG FIX: previously returned sim[top_indices], using positions *within
    # the k-element partition* to index the full sim array — the similarities
    # did not correspond to the returned indices (test_stress1 only checks
    # the ids, so this was latent). sim[top] lines them up correctly.
    return top, sim[top]
def test_stress1():
    """8000 random 128-d vectors with cosine metric: vec0's top-10 ids must
    match the numpy topk() reference for 100 random queries."""
    np.random.seed(1234)
    data = np.random.uniform(-1.0, 1.0, (8000, 128)).astype(np.float32)
    db.execute(
        "create virtual table vec_stress1 using vec0( a float[128] distance_metric=cosine)"
    )
    # Insert all rows in one transaction; rowid == numpy row index here.
    with db:
        for idx, row in enumerate(data):
            db.execute("insert into vec_stress1 values (?, ?)", [idx, row])
    queries = np.random.uniform(-1.0, 1.0, (100, 128)).astype(np.float32)
    for q in queries:
        ids, distances = topk(q, data, k=10)
        rows = db.execute(
            """
            select rowid, distance
            from vec_stress1
            where a match ? and k = ?
            order by distance
            """,
            [q, 10],
        ).fetchall()
        assert len(ids) == 10
        assert len(rows) == 10
        # Only the id ordering is compared, not the distance values.
        vec_ids = [row[0] for row in rows]
        assert ids.tolist() == vec_ids
@pytest.mark.skip(reason="slow")
def test_stress():
    """1025 inserts roll over the default 1024-slot chunk: the first chunk is
    full and a second chunk holds only rowid 1025."""
    db.execute("create virtual table vec_t1 using vec0( a float[1536])")

    def rand_vec(n):
        # n random float32s packed into a blob.
        return struct.pack("%sf" % n, *list(map(lambda x: random(), range(n))))

    for i in range(1025):
        db.execute("insert into vec_t1(a) values (?)", [rand_vec(1536)])
    rows = db.execute("select validity, rowids from vec_t1_chunks").fetchall()
    assert len(rows) == 2
    # Each rowid slot is 8 bytes (int64); CHAR_BIT happens to equal that 8.
    assert len(rows[0]["validity"]) == 1024 / CHAR_BIT
    assert len(rows[0]["rowids"]) == 1024 * CHAR_BIT
    assert rows[0]["validity"] == bitmap_full(1024)
    assert rowids_value(rows[0]["rowids"]) == [x + 1 for x in range(1024)]
    assert len(rows[1]["validity"]) == 1024 / CHAR_BIT
    assert len(rows[1]["rowids"]) == 1024 * CHAR_BIT
    # Second chunk: only its first validity bit is set, for rowid 1025.
    assert rows[1]["validity"] == bytes([0b0000_0001]) + bitmap_zerod(1024)[1:]
    assert rowids_value(rows[1]["rowids"])[0] == 1025
def test_coverage():
    """Every entry in FUNCTIONS and MODULES must have a test_<name> function
    defined in this module."""
    module = inspect.getmodule(inspect.currentframe())
    tested = {
        name.replace("test_", "")
        for name, _ in inspect.getmembers(module)
        if name.startswith("test_")
    }
    for func in [*FUNCTIONS, *MODULES]:
        assert func in tested, f"{func} is not tested"
if __name__ == "__main__":
    # NOTE(review): the tests above are pytest-style functions, not
    # unittest.TestCase classes, so unittest.main() will collect nothing —
    # presumably this file is meant to be run via `pytest`; confirm.
    unittest.main()

37
tests/unittest.rs Normal file
View file

@ -0,0 +1,37 @@
fn main() {
    // Smoke-run the FFI wrapper so `cargo run` exercises min_idx directly.
    println!("Hello, world!");
    let top_two = _min_idx(vec![3.0, 2.0, 1.0], 2);
    println!("{:?}", top_two);
}
/// Safe wrapper around the C `min_idx`: returns the indices of the `k`
/// smallest values in `distances` (ascending by value; see the tests below).
///
/// NOTE(review): assumes `k <= distances.len()` — the C side is handed a
/// `k`-element output buffer; confirm min_idx bounds-checks, otherwise an
/// oversized `k` is undefined behavior. The C return value is discarded.
fn _min_idx(distances: Vec<f32>, k: i32) -> Vec<i32> {
    let mut out: Vec<i32> = vec![0; k as usize];
    unsafe {
        min_idx(
            distances.as_ptr().cast(),
            distances.len() as i32,
            out.as_mut_ptr(),
            k,
        );
    }
    out
}
// FFI binding to min_idx from ../sqlite-vec.c, linked statically as
// "sqlite-vec-internal" by build.rs.
#[link(name = "sqlite-vec-internal")]
extern "C" {
    // Writes the indices of the k smallest distances into `out`; the i32
    // return value is ignored by the wrapper above.
    fn min_idx(distances: *const f32, n: i32, out: *mut i32, k: i32) -> i32;
}
#[cfg(test)]
mod tests {
    use super::*;

    // Indices come back ordered by ascending distance value, truncated to k.
    #[test]
    fn test_basic() {
        assert_eq!(_min_idx(vec![1.0, 2.0, 3.0], 3), vec![0, 1, 2]);
        assert_eq!(_min_idx(vec![3.0, 2.0, 1.0], 3), vec![2, 1, 0]);
        assert_eq!(_min_idx(vec![1.0, 2.0, 3.0], 2), vec![0, 1]);
        assert_eq!(_min_idx(vec![3.0, 2.0, 1.0], 2), vec![2, 1]);
    }
}

22
tests/utils.py Normal file
View file

@ -0,0 +1,22 @@
import numpy as np
from io import BytesIO
def to_npy(arr):
    """Serialize a numpy array into .npy-format bytes, entirely in memory."""
    out = BytesIO()
    np.save(out, arr)
    return out.getvalue()
# Ad-hoc smoke checks: the prints show the raw .npy byte layout for 2-D vs
# 1-D float32 arrays. The results of the two bare to_npy() calls are
# discarded — presumably leftover scratch; consider removing.
to_npy(np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]], dtype=np.float32))
print(to_npy(np.array([[1.0, 2.0]], dtype=np.float32)))
print(to_npy(np.array([1.0, 2.0], dtype=np.float32)))
to_npy(
    np.array(
        [np.zeros(10), np.zeros(10), np.zeros(10), np.zeros(10), np.zeros(10)],
        dtype=np.float32,
    )
)