mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 00:36:56 +02:00
Initial commit
This commit is contained in:
commit
4c8ad629e0
28 changed files with 6758 additions and 0 deletions
24
.gitignore
vendored
Normal file
24
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
/target
|
||||||
|
.vscode
|
||||||
|
sift/
|
||||||
|
*.tar.gz
|
||||||
|
*.db
|
||||||
|
*.bin
|
||||||
|
*.out
|
||||||
|
venv/
|
||||||
|
|
||||||
|
vendor/
|
||||||
|
dist/
|
||||||
|
|
||||||
|
*.pyc
|
||||||
|
*.db-journal
|
||||||
|
*.svg
|
||||||
|
|
||||||
|
alexandria/
|
||||||
|
openai/
|
||||||
|
examples/supabase-dbpedia
|
||||||
|
examples/ann-filtering
|
||||||
|
examples/dbpedia-openai
|
||||||
|
examples/imdb
|
||||||
|
|
||||||
|
sqlite-vec.h
|
||||||
296
Makefile
Normal file
296
Makefile
Normal file
|
|
@ -0,0 +1,296 @@
|
||||||
|
|
||||||
|
COMMIT=$(shell git rev-parse HEAD)
|
||||||
|
VERSION=$(shell cat VERSION)
|
||||||
|
DATE=$(shell date +'%FT%TZ%z')
|
||||||
|
|
||||||
|
|
||||||
|
ifeq ($(shell uname -s),Darwin)
|
||||||
|
CONFIG_DARWIN=y
|
||||||
|
else ifeq ($(OS),Windows_NT)
|
||||||
|
CONFIG_WINDOWS=y
|
||||||
|
else
|
||||||
|
CONFIG_LINUX=y
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifdef CONFIG_DARWIN
|
||||||
|
LOADABLE_EXTENSION=dylib
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifdef CONFIG_LINUX
|
||||||
|
LOADABLE_EXTENSION=so
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifdef CONFIG_WINDOWS
|
||||||
|
LOADABLE_EXTENSION=dll
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
ifdef python
|
||||||
|
PYTHON=$(python)
|
||||||
|
else
|
||||||
|
PYTHON=python3
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef OMIT_SIMD
|
||||||
|
ifeq ($(shell uname -sm),Darwin x86_64)
|
||||||
|
CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX
|
||||||
|
endif
|
||||||
|
ifeq ($(shell uname -sm),Darwin arm64)
|
||||||
|
CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifdef IS_MACOS_ARM
|
||||||
|
RENAME_WHEELS_ARGS=--is-macos-arm
|
||||||
|
else
|
||||||
|
RENAME_WHEELS_ARGS=
|
||||||
|
endif
|
||||||
|
|
||||||
|
prefix=dist
|
||||||
|
$(prefix):
|
||||||
|
mkdir -p $(prefix)
|
||||||
|
|
||||||
|
TARGET_LOADABLE=$(prefix)/vec0.$(LOADABLE_EXTENSION)
|
||||||
|
TARGET_STATIC=$(prefix)/libsqlite_vec0.a
|
||||||
|
TARGET_STATIC_H=$(prefix)/sqlite-vec.h
|
||||||
|
TARGET_CLI=$(prefix)/sqlite3
|
||||||
|
|
||||||
|
loadable: $(TARGET_LOADABLE)
|
||||||
|
static: $(TARGET_STATIC)
|
||||||
|
cli: $(TARGET_CLI)
|
||||||
|
|
||||||
|
all: loadable static cli
|
||||||
|
|
||||||
|
$(TARGET_LOADABLE): sqlite-vec.c sqlite-vec.h $(prefix)
|
||||||
|
gcc \
|
||||||
|
-fPIC -shared \
|
||||||
|
-Wall -Wextra \
|
||||||
|
-Ivendor/ \
|
||||||
|
-O3 \
|
||||||
|
$(CFLAGS) \
|
||||||
|
$< -o $@
|
||||||
|
|
||||||
|
$(TARGET_STATIC): sqlite-vec.c sqlite-vec.h $(prefix)
|
||||||
|
gcc -Ivendor/sqlite -Ivendor/vec $(CFLAGS) -DSQLITE_CORE \
|
||||||
|
-O3 -c $< -o $(prefix)/.objs/vec.o
|
||||||
|
ar rcs $@ $(prefix)/.objs/vec.o
|
||||||
|
|
||||||
|
$(TARGET_STATIC_H): sqlite-vec.h $(prefix)
|
||||||
|
cp $< $@
|
||||||
|
|
||||||
|
|
||||||
|
OBJS_DIR=$(prefix)/.objs
|
||||||
|
LIBS_DIR=$(prefix)/.libs
|
||||||
|
BUILD_DIR=$(prefix)/.build
|
||||||
|
|
||||||
|
$(OBJS_DIR): $(prefix)
|
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
$(LIBS_DIR): $(prefix)
|
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
$(BUILD_DIR): $(prefix)
|
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
$(OBJS_DIR)/sqlite3.o: vendor/sqlite3.c $(OBJS_DIR)
|
||||||
|
gcc -c -g3 -O3 -DSQLITE_EXTRA_INIT=core_init -DSQLITE_CORE -DSQLITE_ENABLE_STMT_SCANSTATUS -DSQLITE_ENABLE_BYTECODE_VTAB -DSQLITE_ENABLE_EXPLAIN_COMMENTS -I./vendor $< -o $@
|
||||||
|
|
||||||
|
$(LIBS_DIR)/sqlite3.a: $(OBJS_DIR)/sqlite3.o $(LIBS_DIR)
|
||||||
|
ar rcs $@ $<
|
||||||
|
|
||||||
|
$(BUILD_DIR)/shell-new.c: vendor/shell.c $(BUILD_DIR)
|
||||||
|
sed 's/\/\*extra-version-info\*\//EXTRA_TODO/g' $< > $@
|
||||||
|
|
||||||
|
$(OBJS_DIR)/shell.o: $(BUILD_DIR)/shell-new.c $(OBJS_DIR)
|
||||||
|
gcc -c -g3 -O3 \
|
||||||
|
-DHAVE_EDITLINE=1 -I./vendor \
|
||||||
|
-DSQLITE_ENABLE_STMT_SCANSTATUS -DSQLITE_ENABLE_BYTECODE_VTAB -DSQLITE_ENABLE_EXPLAIN_COMMENTS \
|
||||||
|
-DEXTRA_TODO="\"CUSTOM BUILD: sqlite-vec\n\"" \
|
||||||
|
$< -o $@
|
||||||
|
|
||||||
|
$(LIBS_DIR)/shell.a: $(OBJS_DIR)/shell.o $(LIBS_DIR)
|
||||||
|
ar rcs $@ $<
|
||||||
|
|
||||||
|
$(OBJS_DIR)/sqlite-vec.o: sqlite-vec.c $(OBJS_DIR)
|
||||||
|
gcc -c -g3 -I./vendor $(CFLAGS) $< -o $@
|
||||||
|
|
||||||
|
$(LIBS_DIR)/sqlite-vec.a: $(OBJS_DIR)/sqlite-vec.o $(LIBS_DIR)
|
||||||
|
ar rcs $@ $<
|
||||||
|
|
||||||
|
$(TARGET_CLI): $(LIBS_DIR)/sqlite-vec.a $(LIBS_DIR)/shell.a $(LIBS_DIR)/sqlite3.a examples/sqlite3-cli/core_init.c $(prefix)
|
||||||
|
gcc -g3 \
|
||||||
|
-Ivendor/sqlite -I./ \
|
||||||
|
-DSQLITE_CORE \
|
||||||
|
-DSQLITE_THREADSAFE=0 -DSQLITE_ENABLE_FTS4 \
|
||||||
|
-DSQLITE_ENABLE_STMT_SCANSTATUS -DSQLITE_ENABLE_BYTECODE_VTAB -DSQLITE_ENABLE_EXPLAIN_COMMENTS \
|
||||||
|
-DSQLITE_EXTRA_INIT=core_init \
|
||||||
|
$(CFLAGS) \
|
||||||
|
-lreadline -DHAVE_EDITLINE=1 \
|
||||||
|
-ldl -lm -lreadline \
|
||||||
|
$(LIBS_DIR)/shell.a $(LIBS_DIR)/sqlite3.a $(LIBS_DIR)/sqlite-vec.a examples/sqlite3-cli/core_init.c -o $@
|
||||||
|
|
||||||
|
|
||||||
|
sqlite-vec.h: sqlite-vec.h.tmpl VERSION
|
||||||
|
VERSION=$(shell cat VERSION) \
|
||||||
|
DATE=$(shell date -r VERSION +'%FT%TZ%z') \
|
||||||
|
SOURCE=$(shell git log -n 1 --pretty=format:%H -- VERSION) \
|
||||||
|
envsubst < $< > $@
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf dist
|
||||||
|
|
||||||
|
|
||||||
|
FORMAT_FILES=sqlite-vec.h sqlite-vec.c
|
||||||
|
format: $(FORMAT_FILES)
|
||||||
|
clang-format -i $(FORMAT_FILES)
|
||||||
|
black tests/test-loadable.py
|
||||||
|
|
||||||
|
lint: SHELL:=/bin/bash
|
||||||
|
lint:
|
||||||
|
diff -u <(cat $(FORMAT_FILES)) <(clang-format $(FORMAT_FILES))
|
||||||
|
|
||||||
|
test:
|
||||||
|
sqlite3 :memory: '.read test.sql'
|
||||||
|
|
||||||
|
.PHONY: version loadable static test clean gh-release \
|
||||||
|
ruby
|
||||||
|
|
||||||
|
publish-release:
|
||||||
|
./scripts/publish_release.sh
|
||||||
|
|
||||||
|
TARGET_WHEELS=$(prefix)/wheels
|
||||||
|
INTERMEDIATE_PYPACKAGE_EXTENSION=bindings/python/sqlite_vec/
|
||||||
|
|
||||||
|
$(TARGET_WHEELS): $(prefix)
|
||||||
|
mkdir -p $(TARGET_WHEELS)
|
||||||
|
|
||||||
|
bindings/ruby/lib/version.rb: bindings/ruby/lib/version.rb.tmpl VERSION
|
||||||
|
VERSION=$(VERSION) envsubst < $< > $@
|
||||||
|
|
||||||
|
bindings/python/sqlite_vec/version.py: bindings/python/sqlite_vec/version.py.tmpl VERSION
|
||||||
|
VERSION=$(VERSION) envsubst < $< > $@
|
||||||
|
echo "✅ generated $@"
|
||||||
|
|
||||||
|
bindings/datasette/datasette_sqlite_vec/version.py: bindings/datasette/datasette_sqlite_vec/version.py.tmpl VERSION
|
||||||
|
VERSION=$(VERSION) envsubst < $< > $@
|
||||||
|
echo "✅ generated $@"
|
||||||
|
|
||||||
|
python: $(TARGET_WHEELS) $(TARGET_LOADABLE) bindings/python/setup.py bindings/python/sqlite_vec/__init__.py scripts/rename-wheels.py
|
||||||
|
cp $(TARGET_LOADABLE) $(INTERMEDIATE_PYPACKAGE_EXTENSION)
|
||||||
|
rm $(TARGET_WHEELS)/*.wheel || true
|
||||||
|
pip3 wheel bindings/python/ -w $(TARGET_WHEELS)
|
||||||
|
python3 scripts/rename-wheels.py $(TARGET_WHEELS) $(RENAME_WHEELS_ARGS)
|
||||||
|
echo "✅ generated python wheel"
|
||||||
|
|
||||||
|
datasette: $(TARGET_WHEELS) bindings/datasette/setup.py bindings/datasette/datasette_sqlite_vec/__init__.py
|
||||||
|
rm $(TARGET_WHEELS)/datasette* || true
|
||||||
|
pip3 wheel bindings/datasette/ --no-deps -w $(TARGET_WHEELS)
|
||||||
|
|
||||||
|
bindings/sqlite-utils/pyproject.toml: bindings/sqlite-utils/pyproject.toml.tmpl VERSION
|
||||||
|
VERSION=$(VERSION) envsubst < $< > $@
|
||||||
|
echo "✅ generated $@"
|
||||||
|
|
||||||
|
bindings/sqlite-utils/sqlite_utils_sqlite_vec/version.py: bindings/sqlite-utils/sqlite_utils_sqlite_vec/version.py.tmpl VERSION
|
||||||
|
VERSION=$(VERSION) envsubst < $< > $@
|
||||||
|
echo "✅ generated $@"
|
||||||
|
|
||||||
|
sqlite-utils: $(TARGET_WHEELS) bindings/sqlite-utils/pyproject.toml bindings/sqlite-utils/sqlite_utils_sqlite_vec/version.py
|
||||||
|
python3 -m build bindings/sqlite-utils -w -o $(TARGET_WHEELS)
|
||||||
|
|
||||||
|
node: VERSION bindings/node/platform-package.README.md.tmpl bindings/node/platform-package.package.json.tmpl bindings/node/sqlite-vec/package.json.tmpl scripts/node_generate_platform_packages.sh
|
||||||
|
scripts/node_generate_platform_packages.sh
|
||||||
|
|
||||||
|
deno: VERSION bindings/deno/deno.json.tmpl
|
||||||
|
scripts/deno_generate_package.sh
|
||||||
|
|
||||||
|
|
||||||
|
version:
|
||||||
|
make bindings/ruby/lib/version.rb
|
||||||
|
make bindings/python/sqlite_vec/version.py
|
||||||
|
make bindings/datasette/datasette_sqlite_vec/version.py
|
||||||
|
make bindings/datasette/datasette_sqlite_vec/version.py
|
||||||
|
make bindings/sqlite-utils/pyproject.toml bindings/sqlite-utils/sqlite_utils_sqlite_vec/version.py
|
||||||
|
make node
|
||||||
|
make deno
|
||||||
|
|
||||||
|
test-loadable: loadable
|
||||||
|
$(PYTHON) -m pytest -vv -s tests/test-loadable.py
|
||||||
|
|
||||||
|
test-loadable-snapshot-update: loadable
|
||||||
|
$(PYTHON) -m pytest -vv tests/test-loadable.py --snapshot-update
|
||||||
|
|
||||||
|
test-loadable-watch:
|
||||||
|
watchexec -w sqlite-vec.c -w tests/test-loadable.py -w Makefile --clear -- make test-loadable
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ███████████████████████████████ WASM SECTION ███████████████████████████████
|
||||||
|
|
||||||
|
WASM_DIR=$(prefix)/.wasm
|
||||||
|
|
||||||
|
$(WASM_DIR): $(prefix)
|
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
SQLITE_WASM_VERSION=3450300
|
||||||
|
SQLITE_WASM_YEAR=2024
|
||||||
|
SQLITE_WASM_SRCZIP=$(BUILD_DIR)/sqlite-src.zip
|
||||||
|
SQLITE_WASM_COMPILED_SQLITE3C=$(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/sqlite3.c
|
||||||
|
SQLITE_WASM_COMPILED_MJS=$(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ext/wasm/jswasm/sqlite3.mjs
|
||||||
|
SQLITE_WASM_COMPILED_WASM=$(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ext/wasm/jswasm/sqlite3.wasm
|
||||||
|
|
||||||
|
TARGET_WASM_LIB=$(WASM_DIR)/libsqlite_vec.wasm.a
|
||||||
|
TARGET_WASM_MJS=$(WASM_DIR)/sqlite3.mjs
|
||||||
|
TARGET_WASM_WASM=$(WASM_DIR)/sqlite3.wasm
|
||||||
|
TARGET_WASM=$(TARGET_WASM_MJS) $(TARGET_WASM_WASM)
|
||||||
|
|
||||||
|
$(SQLITE_WASM_SRCZIP): $(BUILD_DIR)
|
||||||
|
curl -o $@ https://www.sqlite.org/$(SQLITE_WASM_YEAR)/sqlite-src-$(SQLITE_WASM_VERSION).zip
|
||||||
|
|
||||||
|
$(SQLITE_WASM_COMPILED_SQLITE3C): $(SQLITE_WASM_SRCZIP) $(BUILD_DIR)
|
||||||
|
unzip -q -o $< -d $(BUILD_DIR)
|
||||||
|
(cd $(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ && ./configure --enable-all && make sqlite3.c)
|
||||||
|
|
||||||
|
$(TARGET_WASM_LIB): examples/wasm/wasm.c sqlite-vec.c $(BUILD_DIR) $(WASM_DIR)
|
||||||
|
emcc -O3 -I./ -Ivendor -DSQLITE_CORE -c examples/wasm/wasm.c -o $(BUILD_DIR)/wasm.wasm.o
|
||||||
|
emcc -O3 -I./ -Ivendor -DSQLITE_CORE -c sqlite-vec.c -o $(BUILD_DIR)/sqlite-vec.wasm.o
|
||||||
|
emar rcs $@ $(BUILD_DIR)/wasm.wasm.o $(BUILD_DIR)/sqlite-vec.wasm.o
|
||||||
|
|
||||||
|
$(SQLITE_WASM_COMPILED_MJS) $(SQLITE_WASM_COMPILED_WASM): $(SQLITE_WASM_COMPILED_SQLITE3C) $(TARGET_WASM_LIB)
|
||||||
|
(cd $(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ext/wasm && \
|
||||||
|
make sqlite3_wasm_extra_init.c=../../../../.wasm/libsqlite_vec.wasm.a "emcc.flags=-s EXTRA_EXPORTED_RUNTIME_METHODS=['ENV'] -s FETCH")
|
||||||
|
|
||||||
|
$(TARGET_WASM_MJS): $(SQLITE_WASM_COMPILED_MJS)
|
||||||
|
cp $< $@
|
||||||
|
|
||||||
|
$(TARGET_WASM_WASM): $(SQLITE_WASM_COMPILED_WASM)
|
||||||
|
cp $< $@
|
||||||
|
|
||||||
|
wasm: $(TARGET_WASM)
|
||||||
|
|
||||||
|
# ███████████████████████████████ END WASM ███████████████████████████████
|
||||||
|
|
||||||
|
|
||||||
|
# ███████████████████████████████ SITE SECTION ███████████████████████████████
|
||||||
|
|
||||||
|
WASM_TOOLKIT_NPM_TARGZ=$(BUILD_DIR)/sqlite-wasm-toolkit-npm.tar.gz
|
||||||
|
|
||||||
|
SITE_DIR=$(prefix)/.site
|
||||||
|
TARGET_SITE=$(prefix)/.site/index.html
|
||||||
|
|
||||||
|
$(WASM_TOOLKIT_NPM_TARGZ):
|
||||||
|
curl -o $@ -q https://registry.npmjs.org/@alex.garcia/sqlite-wasm-toolkit/-/sqlite-wasm-toolkit-0.0.1-alpha.7.tgz
|
||||||
|
|
||||||
|
$(SITE_DIR)/slim.js $(SITE_DIR)/slim.css: $(WASM_TOOLKIT_NPM_TARGZ)
|
||||||
|
tar -xvzf $< -C $(SITE_DIR) --strip-components=2 package/dist/slim.js package/dist/slim.css
|
||||||
|
|
||||||
|
|
||||||
|
$(SITE_DIR):
|
||||||
|
mkdir -p $(SITE_DIR)
|
||||||
|
|
||||||
|
# $(TARGET_WASM_MJS) $(TARGET_WASM_WASM)
|
||||||
|
$(TARGET_SITE): site/index.html $(SITE_DIR)/slim.js $(SITE_DIR)/slim.css
|
||||||
|
cp $(TARGET_WASM_MJS) $(SITE_DIR)
|
||||||
|
cp $(TARGET_WASM_WASM) $(SITE_DIR)
|
||||||
|
cp $< $@
|
||||||
|
site: $(TARGET_SITE)
|
||||||
|
# ███████████████████████████████ END SITE ███████████████████████████████
|
||||||
73
README.md
Normal file
73
README.md
Normal file
|
|
@ -0,0 +1,73 @@
|
||||||
|
# `sqlite-vec`
|
||||||
|
|
||||||
|
An extremely small, "fast enough" vector search SQLite extension that runs
|
||||||
|
anywhere! A successor to [sqlite-vss](https://github.com/asg017/sqlite-vss)
|
||||||
|
|
||||||
|
> [!IMPORTANT]
|
||||||
|
> *`sqlite-vec` is a work-in-progress and not ready for general usage! I plan to launch a "beta" version in the next month or so. Watch this repo for updates.*
|
||||||
|
|
||||||
|
- Store and query float, int8, and binary vectors in `vec0` virtual tables
|
||||||
|
- Pre-filter vectors with `rowid IN (...)` subqueries
|
||||||
|
- Written in pure C, no dependencies,
|
||||||
|
runs anywhere SQLite runs (Linux/MacOS/Windows, in the browser with WASM,
|
||||||
|
Raspberry Pis, etc.)
|
||||||
|
|
||||||
|
## Sample usage
|
||||||
|
|
||||||
|
```sql
|
||||||
|
.load ./vec0
|
||||||
|
|
||||||
|
create virtual table vec_examples using vec0(
|
||||||
|
sample_embedding float[8]
|
||||||
|
);
|
||||||
|
|
||||||
|
-- vectors can be provided as JSON or in a compact binary format
|
||||||
|
insert into vec_examples(rowid, sample_embedding)
|
||||||
|
values
|
||||||
|
(1, '[-0.200, 0.250, 0.341, -0.211, 0.645, 0.935, -0.316, -0.924]'),
|
||||||
|
(2, '[0.443, -0.501, 0.355, -0.771, 0.707, -0.708, -0.185, 0.362]'),
|
||||||
|
(3, '[0.716, -0.927, 0.134, 0.052, -0.669, 0.793, -0.634, -0.162]'),
|
||||||
|
(4, '[-0.710, 0.330, 0.656, 0.041, -0.990, 0.726, 0.385, -0.958]');
|
||||||
|
|
||||||
|
|
||||||
|
-- KNN style query goes brrrr
|
||||||
|
select
|
||||||
|
rowid,
|
||||||
|
distance
|
||||||
|
from vec_examples
|
||||||
|
where sample_embedding match '[0.890, 0.544, 0.825, 0.961, 0.358, 0.0196, 0.521, 0.175]'
|
||||||
|
order by distance
|
||||||
|
limit 2;
|
||||||
|
/*
|
||||||
|
┌───────┬──────────────────┐
|
||||||
|
│ rowid │ distance │
|
||||||
|
├───────┼──────────────────┤
|
||||||
|
│ 2 │ 2.38687372207642 │
|
||||||
|
│ 1 │ 2.38978505134583 │
|
||||||
|
└───────┴──────────────────┘
|
||||||
|
*/
|
||||||
|
```
|
||||||
|
|
||||||
|
## Roadmap
|
||||||
|
|
||||||
|
Not currently implemented, but planned in the future (after initial beta version):
|
||||||
|
|
||||||
|
- Approximate nearest neighbors search (IVF and HNSW)
|
||||||
|
- Metadata filtering + custom internal partitioning
|
||||||
|
- More vector types (float16, int16, sparse, etc.) and distance functions
|
||||||
|
|
||||||
|
Additionally, there will be pre-compiled and pre-packaged packages of `sqlite-vec` for the following platforms:
|
||||||
|
|
||||||
|
- `pip` for Python
|
||||||
|
- `npm` for Node.js / Deno / Bun
|
||||||
|
- `gem` for Ruby
|
||||||
|
- `cargo` for Rust
|
||||||
|
- A single `.c` and `.h` amalgammation for C/C++
|
||||||
|
- Go module for Golang (requires CGO)
|
||||||
|
- Datasette and sqlite-utils plugins
|
||||||
|
- Pre-compiled loadable extensions on Github releases
|
||||||
|
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
Is your company interested in sponsoring `sqlite-vec` development? Send me an email to get more info: https://alexgarcia.xyz
|
||||||
1
VERSION
Normal file
1
VERSION
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
0.0.1-alpha.0
|
||||||
0
benchmarks/README.md
Normal file
0
benchmarks/README.md
Normal file
17
benchmarks/exhaustive-memory/README.md
Normal file
17
benchmarks/exhaustive-memory/README.md
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
```
|
||||||
|
python3 bench/bench.py \
|
||||||
|
-n "sift1m" \
|
||||||
|
-i sift/sift_base.fvecs \
|
||||||
|
-q sift/sift_query.fvecs \
|
||||||
|
--sample 10000 --qsample 100 \
|
||||||
|
-k 10
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
python3 bench/bench.py \
|
||||||
|
-n "sift1m" \
|
||||||
|
-i sift/sift_base.fvecs \
|
||||||
|
-q sift/sift_query.fvecs \
|
||||||
|
--sample 10000 --qsample 100 \
|
||||||
|
-k 10
|
||||||
|
```
|
||||||
403
benchmarks/exhaustive-memory/bench.py
Normal file
403
benchmarks/exhaustive-memory/bench.py
Normal file
|
|
@ -0,0 +1,403 @@
|
||||||
|
import numpy as np
|
||||||
|
import numpy.typing as npt
|
||||||
|
import time
|
||||||
|
import hnswlib
|
||||||
|
import sqlite3
|
||||||
|
import faiss
|
||||||
|
import lancedb
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# import chromadb
|
||||||
|
from usearch.index import Index, search, MetricKind
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class BenchResult:
|
||||||
|
tool: str
|
||||||
|
build_time_ms: float
|
||||||
|
query_times_ms: List[float]
|
||||||
|
|
||||||
|
|
||||||
|
def duration(seconds: float):
|
||||||
|
ms = seconds * 1000
|
||||||
|
return f"{int(ms)}ms"
|
||||||
|
|
||||||
|
|
||||||
|
def cosine_similarity(
|
||||||
|
vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True
|
||||||
|
) -> npt.NDArray[np.float32]:
|
||||||
|
sim = vec @ mat.T
|
||||||
|
if do_norm:
|
||||||
|
sim /= np.linalg.norm(vec) * np.linalg.norm(mat, axis=1)
|
||||||
|
return sim
|
||||||
|
|
||||||
|
|
||||||
|
def topk(
|
||||||
|
vec: npt.NDArray[np.float32],
|
||||||
|
mat: npt.NDArray[np.float32],
|
||||||
|
k: int = 5,
|
||||||
|
do_norm: bool = True,
|
||||||
|
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]:
|
||||||
|
sim = cosine_similarity(vec, mat, do_norm=do_norm)
|
||||||
|
# Rather than sorting all similarities and taking the top K, it's faster to
|
||||||
|
# argpartition and then just sort the top K.
|
||||||
|
# The difference is O(N logN) vs O(N + k logk)
|
||||||
|
indices = np.argpartition(-sim, kth=k)[:k]
|
||||||
|
top_indices = np.argsort(-sim[indices])
|
||||||
|
return indices[top_indices], sim[top_indices]
|
||||||
|
|
||||||
|
|
||||||
|
def ivecs_read(fname):
|
||||||
|
a = np.fromfile(fname, dtype="int32")
|
||||||
|
d = a[0]
|
||||||
|
return a.reshape(-1, d + 1)[:, 1:].copy()
|
||||||
|
|
||||||
|
|
||||||
|
def fvecs_read(fname):
|
||||||
|
return ivecs_read(fname).view("float32")
|
||||||
|
|
||||||
|
|
||||||
|
def bench_hnsw(base, query):
|
||||||
|
t0 = time.time()
|
||||||
|
p = hnswlib.Index(space="ip", dim=128) # possible options are l2, cosine or ip
|
||||||
|
|
||||||
|
# NOTE: Use default settings from the README.
|
||||||
|
print("buildings hnsw")
|
||||||
|
p.init_index(max_elements=base.shape[0], ef_construction=200, M=16)
|
||||||
|
ids = np.arange(base.shape[0])
|
||||||
|
p.add_items(base, ids)
|
||||||
|
p.set_ef(50)
|
||||||
|
|
||||||
|
print("build time", time.time() - t0)
|
||||||
|
|
||||||
|
results = []
|
||||||
|
times = []
|
||||||
|
t = time.time()
|
||||||
|
for idx, q in enumerate(query):
|
||||||
|
t0 = time.time()
|
||||||
|
result = p.knn_query(q, k=5)
|
||||||
|
if idx < 5:
|
||||||
|
print(result[0])
|
||||||
|
results.append(result)
|
||||||
|
times.append(time.time() - t0)
|
||||||
|
print(time.time() - t)
|
||||||
|
print("hnsw avg", np.mean(times))
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def bench_hnsw_bf(base, query, k) -> BenchResult:
|
||||||
|
print("hnswlib-bf")
|
||||||
|
dimensions = base.shape[1]
|
||||||
|
t0 = time.time()
|
||||||
|
p = hnswlib.BFIndex(space="l2", dim=dimensions)
|
||||||
|
|
||||||
|
p.init_index(max_elements=base.shape[0])
|
||||||
|
ids = np.arange(base.shape[0])
|
||||||
|
p.add_items(base, ids)
|
||||||
|
|
||||||
|
build_time = time.time() - t0
|
||||||
|
|
||||||
|
results = []
|
||||||
|
times = []
|
||||||
|
t = time.time()
|
||||||
|
for idx, q in enumerate(query):
|
||||||
|
t0 = time.time()
|
||||||
|
result = p.knn_query(q, k=k)
|
||||||
|
results.append(result)
|
||||||
|
times.append(time.time() - t0)
|
||||||
|
return BenchResult("hnswlib-bf", build_time, times)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_numpy(base, query, k) -> BenchResult:
|
||||||
|
print("numpy")
|
||||||
|
times = []
|
||||||
|
results = []
|
||||||
|
for idx, q in enumerate(query):
|
||||||
|
t0 = time.time()
|
||||||
|
result = topk(q, base, k=k)
|
||||||
|
results.append(result)
|
||||||
|
times.append(time.time() - t0)
|
||||||
|
return BenchResult("numpy", 0, times)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult:
|
||||||
|
dimensions = base.shape[1]
|
||||||
|
print(f"sqlite-vec {page_size} {chunk_size}")
|
||||||
|
|
||||||
|
db = sqlite3.connect(":memory:")
|
||||||
|
db.execute(f"PRAGMA page_size = {page_size}")
|
||||||
|
db.enable_load_extension(True)
|
||||||
|
db.load_extension("./dist/vec0")
|
||||||
|
db.execute(
|
||||||
|
f"""
|
||||||
|
create virtual table vec_sift1m using vec0(
|
||||||
|
chunk_size={chunk_size},
|
||||||
|
vector float[{dimensions}]
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
|
||||||
|
t = time.time()
|
||||||
|
with db:
|
||||||
|
db.executemany(
|
||||||
|
"insert into vec_sift1m(vector) values (?)",
|
||||||
|
list(map(lambda x: [x.tobytes()], base)),
|
||||||
|
)
|
||||||
|
build_time = time.time() - t
|
||||||
|
times = []
|
||||||
|
results = []
|
||||||
|
for (
|
||||||
|
idx,
|
||||||
|
q,
|
||||||
|
) in enumerate(query):
|
||||||
|
t0 = time.time()
|
||||||
|
result = db.execute(
|
||||||
|
"""
|
||||||
|
select
|
||||||
|
rowid,
|
||||||
|
distance
|
||||||
|
from vec_sift1m
|
||||||
|
where vector match ?
|
||||||
|
and k = ?
|
||||||
|
order by distance
|
||||||
|
""",
|
||||||
|
[q.tobytes(), k],
|
||||||
|
).fetchall()
|
||||||
|
times.append(time.time() - t0)
|
||||||
|
return BenchResult(f"sqlite-vec vec0 ({page_size}|{chunk_size})", build_time, times)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_sqlite_normal(base, query, page_size, k) -> BenchResult:
|
||||||
|
print(f"sqlite-normal")
|
||||||
|
|
||||||
|
db = sqlite3.connect(":memory:")
|
||||||
|
db.enable_load_extension(True)
|
||||||
|
db.load_extension("./dist/vec0")
|
||||||
|
db.execute(f"PRAGMA page_size={page_size}")
|
||||||
|
db.execute(f"create table sift1m(vector);")
|
||||||
|
|
||||||
|
t = time.time()
|
||||||
|
with db:
|
||||||
|
db.executemany(
|
||||||
|
"insert into sift1m(vector) values (?)",
|
||||||
|
list(map(lambda x: [x.tobytes()], base)),
|
||||||
|
)
|
||||||
|
build_time = time.time() - t
|
||||||
|
times = []
|
||||||
|
results = []
|
||||||
|
t = time.time()
|
||||||
|
for (
|
||||||
|
idx,
|
||||||
|
q,
|
||||||
|
) in enumerate(query):
|
||||||
|
t0 = time.time()
|
||||||
|
result = db.execute(
|
||||||
|
"""
|
||||||
|
select
|
||||||
|
rowid,
|
||||||
|
vec_distance_l2(?, vector) as distance
|
||||||
|
from sift1m
|
||||||
|
order by distance
|
||||||
|
limit ?
|
||||||
|
""",
|
||||||
|
[q.tobytes(), k],
|
||||||
|
).fetchall()
|
||||||
|
times.append(time.time() - t0)
|
||||||
|
return BenchResult(f"sqlite-vec normal ({page_size})", build_time, times)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_faiss(base, query, k) -> BenchResult:
|
||||||
|
dimensions = base.shape[1]
|
||||||
|
print("faiss")
|
||||||
|
t = time.time()
|
||||||
|
index = faiss.IndexFlatL2(dimensions)
|
||||||
|
index.add(base)
|
||||||
|
build_time = time.time() - t
|
||||||
|
times = []
|
||||||
|
results = []
|
||||||
|
t = time.time()
|
||||||
|
for idx, q in enumerate(query):
|
||||||
|
t0 = time.time()
|
||||||
|
distances, rowids = index.search(x=np.array([q]), k=k)
|
||||||
|
results.append(rowids)
|
||||||
|
times.append(time.time() - t0)
|
||||||
|
print("faiss avg", duration(np.mean(times)))
|
||||||
|
return BenchResult("faiss", build_time, times)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_lancedb(base, query, k) -> BenchResult:
|
||||||
|
dimensions = base.shape[1]
|
||||||
|
db = lancedb.connect("a")
|
||||||
|
data = [{"vector": row.reshape(1, -1)[0]} for row in base]
|
||||||
|
# Create a DataFrame where each row is a 1D array
|
||||||
|
df = pd.DataFrame(data=data, columns=["vector"])
|
||||||
|
t = time.time()
|
||||||
|
db.create_table("t", data=df)
|
||||||
|
build_time = time.time() - t
|
||||||
|
tbl = db.open_table("t")
|
||||||
|
times = []
|
||||||
|
for q in query:
|
||||||
|
t0 = time.time()
|
||||||
|
result = tbl.search(q).limit(k).to_arrow()
|
||||||
|
times.append(time.time() - t0)
|
||||||
|
return BenchResult("lancedb", build_time, times)
|
||||||
|
|
||||||
|
|
||||||
|
# def bench_chroma(base, query, k):
|
||||||
|
# chroma_client = chromadb.Client()
|
||||||
|
# collection = chroma_client.create_collection(name="my_collection")
|
||||||
|
#
|
||||||
|
# t = time.time()
|
||||||
|
# # chroma doesn't allow for more than 41666 vectors to be inserted at once (???)
|
||||||
|
# i = 0
|
||||||
|
# collection.add(embeddings=base, ids=[str(x) for x in range(len(base))])
|
||||||
|
# print("chroma build time: ", duration(time.time() - t))
|
||||||
|
# times = []
|
||||||
|
# for q in query:
|
||||||
|
# t0 = time.time()
|
||||||
|
# result = collection.query(
|
||||||
|
# query_embeddings=[q.tolist()],
|
||||||
|
# n_results=k,
|
||||||
|
# )
|
||||||
|
# print(result)
|
||||||
|
# times.append(time.time() - t0)
|
||||||
|
# print("chroma avg", duration(np.mean(times)))
|
||||||
|
|
||||||
|
|
||||||
|
def bench_usearch_npy(base, query, k) -> BenchResult:
|
||||||
|
times = []
|
||||||
|
for q in query:
|
||||||
|
t0 = time.time()
|
||||||
|
# result = index.search(q, exact=True)
|
||||||
|
result = search(base, q, k, MetricKind.L2sq, exact=True)
|
||||||
|
times.append(time.time() - t0)
|
||||||
|
return BenchResult("usearch numpy exact=True", 0, times)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_usearch_special(base, query, k) -> BenchResult:
|
||||||
|
dimensions = base.shape[1]
|
||||||
|
index = Index(ndim=dimensions)
|
||||||
|
t = time.time()
|
||||||
|
index.add(np.arange(len(base)), base)
|
||||||
|
build_time = time.time() - t
|
||||||
|
|
||||||
|
times = []
|
||||||
|
for q in query:
|
||||||
|
t0 = time.time()
|
||||||
|
result = index.search(q, exact=True)
|
||||||
|
times.append(time.time() - t0)
|
||||||
|
return BenchResult("usuearch index exact=True", build_time, times)
|
||||||
|
|
||||||
|
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
|
||||||
|
|
||||||
|
def suite(name, base, query, k):
|
||||||
|
print(f"Starting benchmark suite: {name} {base.shape}, k={k}")
|
||||||
|
results = []
|
||||||
|
# n = bench_chroma(base[:40000], query, k=k)
|
||||||
|
# n = bench_usearch_npy(base, query, k=k)
|
||||||
|
# n = bench_usearch_special(base, query, k=k)
|
||||||
|
results.append(bench_faiss(base, query, k=k))
|
||||||
|
results.append(bench_hnsw_bf(base, query, k=k))
|
||||||
|
# n = bench_sqlite_vec(base, query, 4096, 1024, k=k)
|
||||||
|
# n = bench_sqlite_vec(base, query, 32768, 1024, k=k)
|
||||||
|
results.append(bench_sqlite_vec(base, query, 32768, 256, k=k))
|
||||||
|
# n = bench_sqlite_vec(base, query, 16384, 64, k=k)
|
||||||
|
# n = bench_sqlite_vec(base, query, 16384, 32, k=k)
|
||||||
|
results.append(bench_sqlite_normal(base, query, 8192, k=k))
|
||||||
|
results.append(bench_lancedb(base, query, k=k))
|
||||||
|
results.append(bench_numpy(base, query, k=k))
|
||||||
|
# h = bench_hnsw(base, query)
|
||||||
|
|
||||||
|
table = Table(
|
||||||
|
title=f"{name}: {base.shape[0]:,} {base.shape[1]}-dimension vectors, k={k}"
|
||||||
|
)
|
||||||
|
|
||||||
|
table.add_column("Tool")
|
||||||
|
table.add_column("Build Time (ms)", justify="right")
|
||||||
|
table.add_column("Query time (ms)", justify="right")
|
||||||
|
for res in results:
|
||||||
|
table.add_row(
|
||||||
|
res.tool, duration(res.build_time_ms), duration(np.mean(res.query_times_ms))
|
||||||
|
)
|
||||||
|
|
||||||
|
console = Console()
|
||||||
|
console.print(table)
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
parser = argparse.ArgumentParser(description="Benchmark processing script.")
|
||||||
|
# Required arguments
|
||||||
|
parser.add_argument("-n", "--name", required=True, help="Name of the benchmark.")
|
||||||
|
parser.add_argument(
|
||||||
|
"-i", "--input", required=True, help="Path to input file (.npy)."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-k", type=int, required=True, help="Parameter k to use in benchmark."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Optional arguments
|
||||||
|
parser.add_argument(
|
||||||
|
"-q", "--query", required=False, help="Path to query file (.npy)."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--sample",
|
||||||
|
type=int,
|
||||||
|
required=False,
|
||||||
|
help="Number of entries in base to use. Defaults all",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--qsample",
|
||||||
|
type=int,
|
||||||
|
required=False,
|
||||||
|
help="Number of queries to use. Defaults all",
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
return args
|
||||||
|
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def cli_read_input(input):
|
||||||
|
input_path = Path(input)
|
||||||
|
if input_path.suffix == ".fvecs":
|
||||||
|
return fvecs_read(input_path)
|
||||||
|
if input_path.suffx == ".npy":
|
||||||
|
return np.fromfile(input_path, dtype="float32")
|
||||||
|
raise Exception("unknown filetype", input)
|
||||||
|
|
||||||
|
|
||||||
|
def cli_read_query(query, base):
|
||||||
|
if query is None:
|
||||||
|
return base[np.random.choice(base.shape[0], 100, replace=False), :]
|
||||||
|
return cli_read_input(query)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
args = parse_args()
|
||||||
|
base = cli_read_input(args.input)[: args.sample]
|
||||||
|
queries = cli_read_query(args.query, base)[: args.qsample]
|
||||||
|
suite(args.name, base, queries, args.k)
|
||||||
|
|
||||||
|
from sys import argv
|
||||||
|
|
||||||
|
# base = fvecs_read("sift/sift_base.fvecs") # [:100000]
|
||||||
|
# query = fvecs_read("sift/sift_query.fvecs")[:100]
|
||||||
|
# print(base.shape)
|
||||||
|
# k = int(argv[1]) if len(argv) > 1 else 5
|
||||||
|
# suite("sift1m", base, query, k)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
17
benchmarks/profiling/build-from-npy.sql
Normal file
17
benchmarks/profiling/build-from-npy.sql
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
.timer on
|
||||||
|
pragma page_size = 32768;
|
||||||
|
--pragma page_size = 16384;
|
||||||
|
--pragma page_size = 16384;
|
||||||
|
--pragma page_size = 4096;
|
||||||
|
|
||||||
|
create virtual table vec_items using vec0(
|
||||||
|
embedding float[1536]
|
||||||
|
);
|
||||||
|
|
||||||
|
-- 65s (limit 1e5), ~615MB on disk
|
||||||
|
insert into vec_items
|
||||||
|
select
|
||||||
|
rowid,
|
||||||
|
vector
|
||||||
|
from vec_npy_each(vec_npy_file('examples/dbpedia-openai/data/vectors.npy'))
|
||||||
|
limit 1e5;
|
||||||
31
benchmarks/profiling/query-k.sql
Normal file
31
benchmarks/profiling/query-k.sql
Normal file
|
|
@ -0,0 +1,31 @@
|
||||||
|
.timer on
|
||||||
|
|
||||||
|
select rowid, distance
|
||||||
|
from vec_items
|
||||||
|
where embedding match (select embedding from vec_items where rowid = 100)
|
||||||
|
and k = :k
|
||||||
|
order by distance;
|
||||||
|
|
||||||
|
select rowid, distance
|
||||||
|
from vec_items
|
||||||
|
where embedding match (select embedding from vec_items where rowid = 100)
|
||||||
|
and k = :k
|
||||||
|
order by distance;
|
||||||
|
|
||||||
|
select rowid, distance
|
||||||
|
from vec_items
|
||||||
|
where embedding match (select embedding from vec_items where rowid = 100)
|
||||||
|
and k = :k
|
||||||
|
order by distance;
|
||||||
|
|
||||||
|
select rowid, distance
|
||||||
|
from vec_items
|
||||||
|
where embedding match (select embedding from vec_items where rowid = 100)
|
||||||
|
and k = :k
|
||||||
|
order by distance;
|
||||||
|
|
||||||
|
select rowid, distance
|
||||||
|
from vec_items
|
||||||
|
where embedding match (select embedding from vec_items where rowid = 100)
|
||||||
|
and k = :k
|
||||||
|
order by distance;
|
||||||
85
benchmarks/self-params/build.py
Normal file
85
benchmarks/self-params/build.py
Normal file
|
|
@ -0,0 +1,85 @@
|
||||||
|
import sqlite3
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
def connect(path):
|
||||||
|
db = sqlite3.connect(path)
|
||||||
|
db.enable_load_extension(True)
|
||||||
|
db.load_extension("../dist/vec0")
|
||||||
|
db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
|
||||||
|
db.enable_load_extension(False)
|
||||||
|
return db
|
||||||
|
|
||||||
|
|
||||||
|
page_sizes = [ # 4096, 8192,
|
||||||
|
16384,
|
||||||
|
32768,
|
||||||
|
]
|
||||||
|
chunk_sizes = [128, 256, 1024, 2048]
|
||||||
|
types = ["f32", "int8", "bit"]
|
||||||
|
|
||||||
|
SRC = "../examples/dbpedia-openai/data/vectors.npy"
|
||||||
|
|
||||||
|
for page_size in page_sizes:
|
||||||
|
for chunk_size in chunk_sizes:
|
||||||
|
for t in types:
|
||||||
|
print(f"{t} page_size={page_size}, chunk_size={chunk_size}")
|
||||||
|
|
||||||
|
t0 = time.time()
|
||||||
|
db = connect(f"dbs/test.{page_size}.{chunk_size}.{t}.db")
|
||||||
|
db.execute(f"pragma page_size = {page_size}")
|
||||||
|
with db:
|
||||||
|
db.execute(
|
||||||
|
f"""
|
||||||
|
create virtual table vec_items using vec0(
|
||||||
|
embedding {t}[1536],
|
||||||
|
chunk_size={chunk_size}
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
func = "vector"
|
||||||
|
if t == "int8":
|
||||||
|
func = "vec_quantize_i8(vector, 'unit')"
|
||||||
|
if t == "bit":
|
||||||
|
func = "vec_quantize_binary(vector)"
|
||||||
|
db.execute(
|
||||||
|
f"""
|
||||||
|
insert into vec_items
|
||||||
|
select rowid, {func}
|
||||||
|
from vec_npy_each(vec_npy_file(?))
|
||||||
|
limit 100000
|
||||||
|
""",
|
||||||
|
[SRC],
|
||||||
|
)
|
||||||
|
elapsed = time.time() - t0
|
||||||
|
print(elapsed)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
# for 100_000
|
||||||
|
|
||||||
|
page_size=4096, chunk_size=256
|
||||||
|
3.5894200801849365
|
||||||
|
page_size=4096, chunk_size=1024
|
||||||
|
60.70046401023865
|
||||||
|
page_size=4096, chunk_size=2048
|
||||||
|
201.04426288604736
|
||||||
|
page_size=8192, chunk_size=256
|
||||||
|
7.034514904022217
|
||||||
|
page_size=8192, chunk_size=1024
|
||||||
|
9.983598947525024
|
||||||
|
page_size=8192, chunk_size=2048
|
||||||
|
12.318921089172363
|
||||||
|
page_size=16384, chunk_size=256
|
||||||
|
4.97080397605896
|
||||||
|
page_size=16384, chunk_size=1024
|
||||||
|
6.051195859909058
|
||||||
|
page_size=16384, chunk_size=2048
|
||||||
|
8.492683172225952
|
||||||
|
page_size=32768, chunk_size=256
|
||||||
|
5.906642198562622
|
||||||
|
page_size=32768, chunk_size=1024
|
||||||
|
5.876632213592529
|
||||||
|
page_size=32768, chunk_size=2048
|
||||||
|
5.420510292053223
|
||||||
|
"""
|
||||||
83
benchmarks/self-params/knn.py
Normal file
83
benchmarks/self-params/knn.py
Normal file
|
|
@ -0,0 +1,83 @@
|
||||||
|
import sqlite3
|
||||||
|
import time
|
||||||
|
from random import randrange
|
||||||
|
from statistics import mean
|
||||||
|
|
||||||
|
|
||||||
|
def connect(path):
|
||||||
|
print(path)
|
||||||
|
db = sqlite3.connect(path)
|
||||||
|
db.enable_load_extension(True)
|
||||||
|
db.load_extension("../dist/vec0")
|
||||||
|
db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
|
||||||
|
db.enable_load_extension(False)
|
||||||
|
return db
|
||||||
|
|
||||||
|
|
||||||
|
page_sizes = [ # 4096, 8192,
|
||||||
|
16384,
|
||||||
|
32768,
|
||||||
|
]
|
||||||
|
chunk_sizes = [128, 256, 1024, 2048]
|
||||||
|
types = ["f32", "int8", "bit"]
|
||||||
|
|
||||||
|
types.reverse()
|
||||||
|
|
||||||
|
for t in types:
|
||||||
|
for page_size in page_sizes:
|
||||||
|
for chunk_size in chunk_sizes:
|
||||||
|
print(f"page_size={page_size}, chunk_size={chunk_size}")
|
||||||
|
|
||||||
|
func = "embedding"
|
||||||
|
if t == "int8":
|
||||||
|
func = "vec_quantize_i8(embedding, 'unit')"
|
||||||
|
if t == "bit":
|
||||||
|
func = "vec_quantize_binary(embedding)"
|
||||||
|
|
||||||
|
times = []
|
||||||
|
trials = 20
|
||||||
|
db = connect(f"dbs/test.{page_size}.{chunk_size}.{t}.db")
|
||||||
|
|
||||||
|
for trial in range(trials):
|
||||||
|
t0 = time.time()
|
||||||
|
results = db.execute(
|
||||||
|
f"""
|
||||||
|
select rowid
|
||||||
|
from vec_items
|
||||||
|
where embedding match (select {func} from vec_items where rowid = ?)
|
||||||
|
and k = 10
|
||||||
|
order by distance
|
||||||
|
""",
|
||||||
|
[randrange(100000)],
|
||||||
|
).fetchall()
|
||||||
|
|
||||||
|
times.append(time.time() - t0)
|
||||||
|
print(mean(times))
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
page_size=4096, chunk_size=256
|
||||||
|
0.2635102152824402
|
||||||
|
page_size=4096, chunk_size=1024
|
||||||
|
0.2609449863433838
|
||||||
|
page_size=4096, chunk_size=2048
|
||||||
|
0.275589919090271
|
||||||
|
page_size=8192, chunk_size=256
|
||||||
|
0.18621582984924318
|
||||||
|
page_size=8192, chunk_size=1024
|
||||||
|
0.20939643383026124
|
||||||
|
page_size=8192, chunk_size=2048
|
||||||
|
0.22376316785812378
|
||||||
|
page_size=16384, chunk_size=256
|
||||||
|
0.16012665033340454
|
||||||
|
page_size=16384, chunk_size=1024
|
||||||
|
0.18346318006515502
|
||||||
|
page_size=16384, chunk_size=2048
|
||||||
|
0.18224761486053467
|
||||||
|
page_size=32768, chunk_size=256
|
||||||
|
0.14202518463134767
|
||||||
|
page_size=32768, chunk_size=1024
|
||||||
|
0.15340715646743774
|
||||||
|
page_size=32768, chunk_size=2048
|
||||||
|
0.18018823862075806
|
||||||
|
"""
|
||||||
24
benchmarks/self-params/test.py
Normal file
24
benchmarks/self-params/test.py
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
import sqlite3
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
def connect(path):
|
||||||
|
db = sqlite3.connect(path)
|
||||||
|
db.enable_load_extension(True)
|
||||||
|
db.load_extension("../dist/vec0")
|
||||||
|
db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
|
||||||
|
db.enable_load_extension(False)
|
||||||
|
return db
|
||||||
|
|
||||||
|
|
||||||
|
page_sizes = [4096, 8192, 16384, 32768]
|
||||||
|
chunk_sizes = [256, 1024, 2048]
|
||||||
|
|
||||||
|
for page_size in page_sizes:
|
||||||
|
for chunk_size in chunk_sizes:
|
||||||
|
print(f"page_size={page_size}, chunk_size={chunk_size}")
|
||||||
|
|
||||||
|
t0 = time.time()
|
||||||
|
db = connect(f"dbs/test.{page_size}.{chunk_size}.db")
|
||||||
|
print(db.execute("pragma page_size").fetchone()[0])
|
||||||
|
print(db.execute("select count(*) from vec_items_rowids").fetchone()[0])
|
||||||
5
examples/sqlite3-cli/README.md
Normal file
5
examples/sqlite3-cli/README.md
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
# `sqlite-vec` statically compiled in the SQLite CLI
|
||||||
|
|
||||||
|
You can compile your own version of the `sqlite3` CLI with `sqlite-vec` builtin. The process is not well documented, but the special `SQLITE_EXTRA_INIT` compile option can be used to "inject" code at initialization time. See the `Makefile` at the root of this project for some more info.
|
||||||
|
|
||||||
|
The `core_init.c` file here demonstrates auto-loading the `sqlite-vec` entrypoints at startup.
|
||||||
8
examples/sqlite3-cli/core_init.c
Normal file
8
examples/sqlite3-cli/core_init.c
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
#include "sqlite3.h"
|
||||||
|
#include "sqlite-vec.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
int core_init(const char *dummy) {
|
||||||
|
int rc = sqlite3_auto_extension((void *)sqlite3_vec_init);
|
||||||
|
if(rc != SQLITE_OK) return rc;
|
||||||
|
return sqlite3_auto_extension((void *)sqlite3_vec_fs_read_init);
|
||||||
|
}
|
||||||
5
examples/wasm/README.md
Normal file
5
examples/wasm/README.md
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
# `sqlite-vec` statically compiled into WASM builds
|
||||||
|
|
||||||
|
You can compile your own version of SQLite's WASM build with `sqlite-vec` builtin. Dynamically loading SQLite extensions is not supported in the official WASM build yet, but you can statically compile extensions in. It's not well documented, but the `sqlite3_wasm_extra_init` option in the SQLite `ext/wasm` Makefile allows you to inject your own code at initialization time. See the `Makefile` at the room of the project for more info.
|
||||||
|
|
||||||
|
The `wasm.c` file here demonstrates auto-loading the `sqlite-vec` entrypoints at startup.
|
||||||
6
examples/wasm/wasm.c
Normal file
6
examples/wasm/wasm.c
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
#include "sqlite3.h"
|
||||||
|
#include "sqlite-vec.h"
|
||||||
|
|
||||||
|
int sqlite3_wasm_extra_init(const char *) {
|
||||||
|
sqlite3_auto_extension((void (*)(void)) sqlite3_vec_init);
|
||||||
|
}
|
||||||
23
site/index.html
Normal file
23
site/index.html
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<h1>sqlite-vec</h1>
|
||||||
|
<div id="target">
|
||||||
|
|
||||||
|
</div>
|
||||||
|
<link rel="stylesheet" href="./slim.css"/>
|
||||||
|
<script type="module">
|
||||||
|
import {attach} from "./slim.js";
|
||||||
|
import {default as init} from "./sqlite3.mjs";
|
||||||
|
const sqlite3 = await init();
|
||||||
|
const v = new sqlite3.oo1.DB(":memory:").selectValue('select vec_version()');
|
||||||
|
document.querySelector('h1').textContent += `(${v})`;
|
||||||
|
|
||||||
|
attach(document.body.querySelector('#target'), sqlite3,
|
||||||
|
`select
|
||||||
|
sqlite_version(),
|
||||||
|
vec_version(),
|
||||||
|
vec_to_json(X'00000000000080bf');
|
||||||
|
`);
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
4607
sqlite-vec.c
Normal file
4607
sqlite-vec.c
Normal file
File diff suppressed because it is too large
Load diff
11
sqlite-vec.h.tmpl
Normal file
11
sqlite-vec.h.tmpl
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
#include "sqlite3ext.h"
|
||||||
|
|
||||||
|
#define SQLITE_VEC_VERSION "v${VERSION}"
|
||||||
|
#define SQLITE_VEC_DATE "${DATE}"
|
||||||
|
#define SQLITE_VEC_SOURCE "${SOURCE}"
|
||||||
|
|
||||||
|
|
||||||
|
int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg,
|
||||||
|
const sqlite3_api_routines *pApi);
|
||||||
|
int sqlite3_vec_fs_read_init(sqlite3 *db, char **pzErrMsg,
|
||||||
|
const sqlite3_api_routines *pApi);
|
||||||
1
tests/.gitignore
vendored
Normal file
1
tests/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
target/
|
||||||
16
tests/Cargo.lock
generated
Normal file
16
tests/Cargo.lock
generated
Normal file
|
|
@ -0,0 +1,16 @@
|
||||||
|
# This file is automatically @generated by Cargo.
|
||||||
|
# It is not intended for manual editing.
|
||||||
|
version = 3
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cc"
|
||||||
|
version = "1.0.90"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tests"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
]
|
||||||
15
tests/Cargo.toml
Normal file
15
tests/Cargo.toml
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
[package]
|
||||||
|
name = "tests"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
|
||||||
|
[build-dependencies]
|
||||||
|
cc = "1.0"
|
||||||
|
|
||||||
|
[[bin]]
|
||||||
|
name = "unittest"
|
||||||
|
path = "unittest.rs"
|
||||||
|
|
||||||
|
|
||||||
13
tests/build.rs
Normal file
13
tests/build.rs
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
use std::env;
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::process::Command;
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
cc::Build::new()
|
||||||
|
.file("../sqlite-vec.c")
|
||||||
|
.include(".")
|
||||||
|
.static_flag(true)
|
||||||
|
.compile("sqlite-vec-internal");
|
||||||
|
println!("cargo:rerun-if-changed=usleep.c");
|
||||||
|
println!("cargo:rerun-if-changed=build.rs");
|
||||||
|
}
|
||||||
12
tests/sqlite-vec-internal.h
Normal file
12
tests/sqlite-vec-internal.h
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
int min_idx(
|
||||||
|
// list of distances, size n
|
||||||
|
const float *distances,
|
||||||
|
// number of entries in distances
|
||||||
|
int32_t n,
|
||||||
|
// output array of size k, the indicies of the lowest k values in distances
|
||||||
|
int32_t *out,
|
||||||
|
// output number of elements
|
||||||
|
int32_t k
|
||||||
|
);
|
||||||
49
tests/test-correctness.py
Normal file
49
tests/test-correctness.py
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
import sqlite3
|
||||||
|
import json
|
||||||
|
|
||||||
|
db = sqlite3.connect("test2.db")
|
||||||
|
db.enable_load_extension(True)
|
||||||
|
db.load_extension("dist/vec0")
|
||||||
|
db.enable_load_extension(False)
|
||||||
|
db.row_factory = sqlite3.Row
|
||||||
|
db.execute('attach database "sift1m-base.db" as sift1m')
|
||||||
|
|
||||||
|
|
||||||
|
#def test_sift1m():
|
||||||
|
rows = db.execute(
|
||||||
|
'''
|
||||||
|
with q as (
|
||||||
|
select rowid, vector, k100 from sift1m.sift1m_query limit 10
|
||||||
|
),
|
||||||
|
results as (
|
||||||
|
select
|
||||||
|
q.rowid as query_rowid,
|
||||||
|
vec_sift1m.rowid as vec_rowid,
|
||||||
|
distance,
|
||||||
|
k100 as k100_groundtruth
|
||||||
|
from q
|
||||||
|
join vec_sift1m
|
||||||
|
where
|
||||||
|
vec_sift1m.vector match q.vector
|
||||||
|
and k = 100
|
||||||
|
order by distance
|
||||||
|
)
|
||||||
|
select
|
||||||
|
query_rowid,
|
||||||
|
json_group_array(vec_rowid order by distance) as topk,
|
||||||
|
k100_groundtruth,
|
||||||
|
json_group_array(vec_rowid order by distance) == k100_groundtruth
|
||||||
|
from results
|
||||||
|
group by 1;
|
||||||
|
''').fetchall()
|
||||||
|
|
||||||
|
results = []
|
||||||
|
for row in rows:
|
||||||
|
actual = json.loads(row["topk"])
|
||||||
|
expected = json.loads(row["k100_groundtruth"])
|
||||||
|
|
||||||
|
ncorrect = sum([x in expected for x in actual])
|
||||||
|
results.append(ncorrect / 100.0)
|
||||||
|
|
||||||
|
from statistics import mean
|
||||||
|
print(mean(results))
|
||||||
874
tests/test-loadable.py
Normal file
874
tests/test-loadable.py
Normal file
|
|
@ -0,0 +1,874 @@
|
||||||
|
# ruff: noqa: E731
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import List
|
||||||
|
import sqlite3
|
||||||
|
import unittest
|
||||||
|
from random import random
|
||||||
|
import struct
|
||||||
|
import inspect
|
||||||
|
import pytest
|
||||||
|
import json
|
||||||
|
import numpy as np
|
||||||
|
from math import isclose
|
||||||
|
|
||||||
|
EXT_PATH = "./dist/vec0"
|
||||||
|
|
||||||
|
|
||||||
|
def bitmap_full(n: int) -> bytearray:
|
||||||
|
assert (n % 8) == 0
|
||||||
|
return bytes([0xFF] * int(n / 8))
|
||||||
|
|
||||||
|
|
||||||
|
def bitmap_zerod(n: int) -> bytearray:
|
||||||
|
assert (n % 8) == 0
|
||||||
|
return bytes([0x00] * int(n / 8))
|
||||||
|
|
||||||
|
|
||||||
|
def f32_zerod(n: int) -> bytearray:
|
||||||
|
return bytes([0x00, 0x00, 0x00, 0x00] * int(n))
|
||||||
|
|
||||||
|
|
||||||
|
CHAR_BIT = 8
|
||||||
|
|
||||||
|
|
||||||
|
def _f32(list):
|
||||||
|
return struct.pack("%sf" % len(list), *list)
|
||||||
|
|
||||||
|
|
||||||
|
def _int8(list):
|
||||||
|
return struct.pack("%sb" % len(list), *list)
|
||||||
|
|
||||||
|
|
||||||
|
def connect(ext, path=":memory:"):
|
||||||
|
db = sqlite3.connect(path)
|
||||||
|
|
||||||
|
db.execute(
|
||||||
|
"create temp table base_functions as select name from pragma_function_list"
|
||||||
|
)
|
||||||
|
db.execute("create temp table base_modules as select name from pragma_module_list")
|
||||||
|
|
||||||
|
db.enable_load_extension(True)
|
||||||
|
db.load_extension(ext)
|
||||||
|
|
||||||
|
db.execute(
|
||||||
|
"create temp table loaded_functions as select name from pragma_function_list where name not in (select name from base_functions) order by name"
|
||||||
|
)
|
||||||
|
db.execute(
|
||||||
|
"create temp table loaded_modules as select name from pragma_module_list where name not in (select name from base_modules) order by name"
|
||||||
|
)
|
||||||
|
|
||||||
|
db.row_factory = sqlite3.Row
|
||||||
|
return db
|
||||||
|
|
||||||
|
|
||||||
|
db = connect(EXT_PATH)
|
||||||
|
|
||||||
|
# db.load_extension(EXT_PATH, entrypoint="trace_debug")
|
||||||
|
|
||||||
|
|
||||||
|
def explain_query_plan(sql):
|
||||||
|
return db.execute("explain query plan " + sql).fetchone()["detail"]
|
||||||
|
|
||||||
|
|
||||||
|
def execute_all(cursor, sql, args=None):
|
||||||
|
if args is None:
|
||||||
|
args = []
|
||||||
|
results = cursor.execute(sql, args).fetchall()
|
||||||
|
return list(map(lambda x: dict(x), results))
|
||||||
|
|
||||||
|
|
||||||
|
def spread_args(args):
|
||||||
|
return ",".join(["?"] * len(args))
|
||||||
|
|
||||||
|
|
||||||
|
FUNCTIONS = [
|
||||||
|
"vec_add",
|
||||||
|
"vec_bit",
|
||||||
|
"vec_debug",
|
||||||
|
"vec_distance_cosine",
|
||||||
|
"vec_distance_hamming",
|
||||||
|
"vec_distance_l2",
|
||||||
|
"vec_f32",
|
||||||
|
"vec_int8",
|
||||||
|
"vec_length",
|
||||||
|
"vec_normalize",
|
||||||
|
"vec_quantize_binary",
|
||||||
|
"vec_quantize_i8",
|
||||||
|
"vec_quantize_i8",
|
||||||
|
"vec_slice",
|
||||||
|
"vec_sub",
|
||||||
|
"vec_to_json",
|
||||||
|
"vec_version",
|
||||||
|
]
|
||||||
|
MODULES = ["vec0", "vec_each", "vec_npy_each"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_funcs():
|
||||||
|
funcs = list(
|
||||||
|
map(
|
||||||
|
lambda a: a[0],
|
||||||
|
db.execute("select name from loaded_functions").fetchall(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
assert funcs == FUNCTIONS
|
||||||
|
|
||||||
|
|
||||||
|
def test_modules():
|
||||||
|
modules = list(
|
||||||
|
map(lambda a: a[0], db.execute("select name from loaded_modules").fetchall())
|
||||||
|
)
|
||||||
|
assert modules == MODULES
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_version():
|
||||||
|
vec_version = lambda *args: db.execute("select vec_version()", args).fetchone()[0]
|
||||||
|
assert vec_version()[0] == "v"
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_debug():
|
||||||
|
vec_debug = lambda *args: db.execute("select vec_debug()", args).fetchone()[0]
|
||||||
|
d = vec_debug().split("\n")
|
||||||
|
assert len(d) == 4
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_bit():
|
||||||
|
vec_bit = lambda *args: db.execute("select vec_bit(?)", args).fetchone()[0]
|
||||||
|
assert vec_bit(b"\xff") == b"\xff"
|
||||||
|
|
||||||
|
assert db.execute("select subtype(vec_bit(X'FF'))").fetchone()[0] == 224
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError, match="zero-length vectors are not supported."
|
||||||
|
):
|
||||||
|
db.execute("select vec_bit(X'')").fetchone()
|
||||||
|
|
||||||
|
for x in [None, "text", 1, 1.999]:
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError, match="Unknown type for bitvector."
|
||||||
|
):
|
||||||
|
db.execute("select vec_bit(?)", [x]).fetchone()
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_f32():
|
||||||
|
vec_f32 = lambda *args: db.execute("select vec_f32(?)", args).fetchone()[0]
|
||||||
|
assert vec_f32(b"\x00\x00\x00\x00") == b"\x00\x00\x00\x00"
|
||||||
|
assert vec_f32("[0.0000]") == b"\x00\x00\x00\x00"
|
||||||
|
# fmt: off
|
||||||
|
tests = [
|
||||||
|
[0],
|
||||||
|
[0, 0, 0, 0],
|
||||||
|
[1, -1, 10, -10],
|
||||||
|
[-0, 0, .0001, -.0001],
|
||||||
|
]
|
||||||
|
# fmt: on
|
||||||
|
for test in tests:
|
||||||
|
assert vec_f32(json.dumps(test)) == _f32(test)
|
||||||
|
|
||||||
|
assert db.execute("select subtype(vec_f32(X'00000000'))").fetchone()[0] == 223
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError, match="zero-length vectors are not supported."
|
||||||
|
):
|
||||||
|
vec_f32(b"")
|
||||||
|
|
||||||
|
for invalid in [None, 1, 1.2]:
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match=re.escape(
|
||||||
|
"Input must have type BLOB (compact format) or TEXT (JSON)",
|
||||||
|
),
|
||||||
|
):
|
||||||
|
vec_f32(invalid)
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="invalid float32 vector BLOB length. Must be divisible by 4, found 5",
|
||||||
|
):
|
||||||
|
vec_f32(b"aaaaa")
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match=re.escape("JSON array parsing error: Input does not start with '['"),
|
||||||
|
):
|
||||||
|
vec_f32("1]")
|
||||||
|
# TODO mas tests
|
||||||
|
|
||||||
|
# TODO different error message
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="zero-length vectors are not supported.",
|
||||||
|
):
|
||||||
|
vec_f32("[")
|
||||||
|
|
||||||
|
# vec_f32("[]")
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_int8():
|
||||||
|
vec_int8 = lambda *args: db.execute("select vec_int8(?)", args).fetchone()[0]
|
||||||
|
assert vec_int8(b"\x00") == _int8([0])
|
||||||
|
assert vec_int8(b"\x00\x0f") == _int8([0, 15])
|
||||||
|
assert db.execute("select subtype(vec_int8(?))", [b"\x00"]).fetchone()[0] == 225
|
||||||
|
|
||||||
|
|
||||||
|
def npy_cosine(a, b):
|
||||||
|
return 1 - (np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
|
||||||
|
|
||||||
|
|
||||||
|
def npy_l2(a, b):
|
||||||
|
return np.linalg.norm(a - b)
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_distance_cosine():
|
||||||
|
vec_distance_cosine = lambda *args, a="?", b="?": db.execute(
|
||||||
|
f"select vec_distance_cosine({a}, {b})", args
|
||||||
|
).fetchone()[0]
|
||||||
|
|
||||||
|
def check(a, b, dtype=np.float32):
|
||||||
|
if dtype == np.float32:
|
||||||
|
transform = "?"
|
||||||
|
elif dtype == np.int8:
|
||||||
|
transform = "vec_int8(?)"
|
||||||
|
a = np.array(a, dtype=dtype)
|
||||||
|
b = np.array(b, dtype=dtype)
|
||||||
|
|
||||||
|
x = vec_distance_cosine(a, b, a=transform, b=transform)
|
||||||
|
y = npy_cosine(a, b)
|
||||||
|
assert isclose(x, y, abs_tol=1e-6)
|
||||||
|
|
||||||
|
check([1.2, 0.1], [0.4, -0.4])
|
||||||
|
check([-1.2, -0.1], [-0.4, 0.4])
|
||||||
|
check([1, 2, 3], [-9, -8, -7], dtype=np.int8)
|
||||||
|
assert vec_distance_cosine("[1.1, 1.0]", "[1.2, 1.2]") == 0.001131898257881403
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_distance_hamming():
|
||||||
|
vec_distance_hamming = lambda *args: db.execute(
|
||||||
|
"select vec_distance_hamming(vec_bit(?), vec_bit(?))", args
|
||||||
|
).fetchone()[0]
|
||||||
|
assert vec_distance_hamming(b"\xff", b"\x00") == 8
|
||||||
|
assert vec_distance_hamming(b"\xff", b"\x01") == 7
|
||||||
|
assert vec_distance_hamming(b"\xab", b"\xab") == 0
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="Cannot calculate hamming distance between two float32 vectors.",
|
||||||
|
):
|
||||||
|
db.execute("select vec_distance_hamming(vec_f32('[1.0]'), vec_f32('[1.0]'))")
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="Cannot calculate hamming distance between two int8 vectors.",
|
||||||
|
):
|
||||||
|
db.execute("select vec_distance_hamming(vec_int8(X'FF'), vec_int8(X'FF'))")
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_distance_l2():
|
||||||
|
vec_distance_l2 = lambda *args, a="?", b="?": db.execute(
|
||||||
|
f"select vec_distance_l2({a}, {b})", args
|
||||||
|
).fetchone()[0]
|
||||||
|
|
||||||
|
def check(a, b, dtype=np.float32):
|
||||||
|
if dtype == np.float32:
|
||||||
|
transform = "?"
|
||||||
|
elif dtype == np.int8:
|
||||||
|
transform = "vec_int8(?)"
|
||||||
|
a = np.array(a, dtype=dtype)
|
||||||
|
b = np.array(b, dtype=dtype)
|
||||||
|
|
||||||
|
x = vec_distance_l2(a, b, a=transform, b=transform)
|
||||||
|
y = npy_l2(a, b)
|
||||||
|
assert isclose(x, y, abs_tol=1e-6)
|
||||||
|
|
||||||
|
check([1.2, 0.1], [0.4, -0.4])
|
||||||
|
check([-1.2, -0.1], [-0.4, 0.4])
|
||||||
|
check([1, 2, 3], [-9, -8, -7], dtype=np.int8)
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_length():
|
||||||
|
def test_f32():
|
||||||
|
vec_length = lambda *args: db.execute("select vec_length(?)", args).fetchone()[
|
||||||
|
0
|
||||||
|
]
|
||||||
|
assert vec_length(b"\xAA\xBB\xCC\xDD") == 1
|
||||||
|
assert vec_length(b"\xAA\xBB\xCC\xDD\x01\x02\x03\x04") == 2
|
||||||
|
assert vec_length(f32_zerod(1024)) == 1024
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError, match="zero-length vectors are not supported."
|
||||||
|
):
|
||||||
|
assert vec_length(b"") == 0
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError, match="zero-length vectors are not supported."
|
||||||
|
):
|
||||||
|
vec_length("[]")
|
||||||
|
|
||||||
|
def test_int8():
|
||||||
|
vec_length_int8 = lambda *args: db.execute(
|
||||||
|
"select vec_length(vec_int8(?))", args
|
||||||
|
).fetchone()[0]
|
||||||
|
assert vec_length_int8(b"\xAA") == 1
|
||||||
|
assert vec_length_int8(b"\xAA\xBB\xCC\xDD") == 4
|
||||||
|
assert vec_length_int8(b"\xAA\xBB\xCC\xDD\x01\x02\x03\x04") == 8
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError, match="zero-length vectors are not supported."
|
||||||
|
):
|
||||||
|
assert vec_length_int8(b"") == 0
|
||||||
|
|
||||||
|
def test_bit():
|
||||||
|
vec_length_bit = lambda *args: db.execute(
|
||||||
|
"select vec_length(vec_bit(?))", args
|
||||||
|
).fetchone()[0]
|
||||||
|
assert vec_length_bit(b"\xAA") == 8
|
||||||
|
assert vec_length_bit(b"\xAA\xBB\xCC\xDD") == 8 * 4
|
||||||
|
assert vec_length_bit(b"\xAA\xBB\xCC\xDD\x01\x02\x03\x04") == 8 * 8
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError, match="zero-length vectors are not supported."
|
||||||
|
):
|
||||||
|
assert vec_length_bit(b"") == 0
|
||||||
|
|
||||||
|
test_f32()
|
||||||
|
test_int8()
|
||||||
|
test_bit()
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_normalize():
|
||||||
|
vec_normalize = lambda *args: db.execute(
|
||||||
|
"select vec_normalize(?)", args
|
||||||
|
).fetchone()[0]
|
||||||
|
assert list(struct.unpack_from("4f", vec_normalize(_f32([1, 2, -1, -2])))) == [
|
||||||
|
0.3162277638912201,
|
||||||
|
0.6324555277824402,
|
||||||
|
-0.3162277638912201,
|
||||||
|
-0.6324555277824402,
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_slice():
|
||||||
|
vec_slice = lambda *args, f="?": db.execute(
|
||||||
|
f"select vec_slice({f}, ?, ?)", args
|
||||||
|
).fetchone()[0]
|
||||||
|
assert vec_slice(_f32([1.1, 2.2, 3.3]), 0, 3) == _f32([1.1, 2.2, 3.3])
|
||||||
|
assert vec_slice(_f32([1.1, 2.2, 3.3]), 0, 2) == _f32([1.1, 2.2])
|
||||||
|
assert vec_slice(_f32([1.1, 2.2, 3.3]), 0, 1) == _f32([1.1])
|
||||||
|
assert vec_slice(_int8([1, 2, 3]), 0, 3, f="vec_int8(?)") == _int8([1, 2, 3])
|
||||||
|
assert vec_slice(_int8([1, 2, 3]), 0, 2, f="vec_int8(?)") == _int8([1, 2])
|
||||||
|
assert vec_slice(_int8([1, 2, 3]), 0, 1, f="vec_int8(?)") == _int8([1])
|
||||||
|
assert vec_slice(b"\xAA\xBB\xCC\xDD", 0, 8, f="vec_bit(?)") == b"\xAA"
|
||||||
|
assert vec_slice(b"\xAA\xBB\xCC\xDD", 8, 16, f="vec_bit(?)") == b"\xBB"
|
||||||
|
assert vec_slice(b"\xAA\xBB\xCC\xDD", 8, 24, f="vec_bit(?)") == b"\xBB\xCC"
|
||||||
|
assert vec_slice(b"\xAA\xBB\xCC\xDD", 0, 32, f="vec_bit(?)") == b"\xAA\xBB\xCC\xDD"
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError, match="start index must be divisible by 8."
|
||||||
|
):
|
||||||
|
vec_slice(b"\xAA\xBB\xCC\xDD", 2, 32, f="vec_bit(?)")
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError, match="end index must be divisible by 8."
|
||||||
|
):
|
||||||
|
vec_slice(b"\xAA\xBB\xCC\xDD", 0, 31, f="vec_bit(?)")
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError, match="slice 'start' index must be a postive number."
|
||||||
|
):
|
||||||
|
vec_slice(b"\xab\xab\xab\xab", -1, 1)
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError, match="slice 'end' index must be a postive number."
|
||||||
|
):
|
||||||
|
vec_slice(b"\xab\xab\xab\xab", 0, -3)
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="slice 'start' index is greater than the number of dimensions",
|
||||||
|
):
|
||||||
|
vec_slice(b"\xab\xab\xab\xab", 2, 3)
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="slice 'end' index is greater than the number of dimensions",
|
||||||
|
):
|
||||||
|
vec_slice(b"\xab\xab\xab\xab", 0, 2)
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="slice 'start' index is greater than 'end' index",
|
||||||
|
):
|
||||||
|
vec_slice(b"\xab\xab\xab\xab", 1, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_add():
|
||||||
|
vec_add = lambda *args, a="?", b="?": db.execute(
|
||||||
|
f"select vec_add({a}, {b})", args
|
||||||
|
).fetchone()[0]
|
||||||
|
assert vec_add("[1]", "[2]") == _f32([3])
|
||||||
|
assert vec_add("[.1]", "[.2]") == _f32([0.3])
|
||||||
|
assert vec_add(_int8([1]), _int8([2]), a="vec_int8(?)", b="vec_int8(?)") == _int8(
|
||||||
|
[3]
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="Cannot add two bitvectors together.",
|
||||||
|
):
|
||||||
|
vec_add(b"0xff", b"0xff", a="vec_bit(?)", b="vec_bit(?)")
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="Vector type mistmatch. First vector has type float32, while the second has type int8.",
|
||||||
|
):
|
||||||
|
vec_add(_f32([1]), _int8([2]), b="vec_int8(?)")
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="Vector type mistmatch. First vector has type int8, while the second has type float32.",
|
||||||
|
):
|
||||||
|
vec_add(_int8([2]), _f32([1]), a="vec_int8(?)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_sub():
|
||||||
|
vec_sub = lambda *args, a="?", b="?": db.execute(
|
||||||
|
f"select vec_sub({a}, {b})", args
|
||||||
|
).fetchone()[0]
|
||||||
|
assert vec_sub("[1]", "[2]") == _f32([-1])
|
||||||
|
assert vec_sub("[.1]", "[.2]") == _f32([-0.1])
|
||||||
|
assert vec_sub(_int8([11]), _int8([2]), a="vec_int8(?)", b="vec_int8(?)") == _int8(
|
||||||
|
[9]
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="Cannot subtract two bitvectors together.",
|
||||||
|
):
|
||||||
|
vec_sub(b"0xff", b"0xff", a="vec_bit(?)", b="vec_bit(?)")
|
||||||
|
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="Vector type mistmatch. First vector has type float32, while the second has type int8.",
|
||||||
|
):
|
||||||
|
vec_sub(_f32([1]), _int8([2]), b="vec_int8(?)")
|
||||||
|
with pytest.raises(
|
||||||
|
sqlite3.OperationalError,
|
||||||
|
match="Vector type mistmatch. First vector has type int8, while the second has type float32.",
|
||||||
|
):
|
||||||
|
vec_sub(_int8([2]), _f32([1]), a="vec_int8(?)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec_to_json():
    """vec_to_json() renders float32, int8, and bit vectors as JSON arrays."""

    def vec_to_json(*args, input="?"):
        return db.execute(f"select vec_to_json({input})", args).fetchone()[0]

    # Text input is parsed as float32 and printed with 6 decimal places.
    assert vec_to_json("[1, 2, 3]") == "[1.000000,2.000000,3.000000]"
    # Raw blobs are interpreted as little-endian float32 values.
    assert vec_to_json(b"\x00\x00\x00\x00\x00\x00\x80\xbf") == "[0.000000,-1.000000]"
    # int8 vectors print as plain signed integers.
    assert vec_to_json(b"\x04", input="vec_int8(?)") == "[4]"
    assert vec_to_json(b"\x04\xff", input="vec_int8(?)") == "[4,-1]"
    # bit vectors expand to one 0/1 element per bit (low bits first).
    assert vec_to_json(b"\xff", input="vec_bit(?)") == "[1,1,1,1,1,1,1,1]"
    assert vec_to_json(b"\x0f", input="vec_bit(?)") == "[1,1,1,1,0,0,0,0]"
@pytest.mark.skip(reason="TODO")
|
||||||
|
def test_vec_quantize_i8():
|
||||||
|
vec_quantize_i8 = lambda *args: db.execute(
|
||||||
|
"select vec_quantize_i8()", args
|
||||||
|
).fetchone()[0]
|
||||||
|
assert vec_quantize_i8() == 111
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="TODO")
|
||||||
|
def test_vec_quantize_binary():
|
||||||
|
vec_quantize_binary = lambda *args: db.execute(
|
||||||
|
"select vec_quantize_binary()", args
|
||||||
|
).fetchone()[0]
|
||||||
|
assert vec_quantize_binary() == 111
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="TODO")
|
||||||
|
def test_vec0():
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_vec0_updates():
    # UPDATE support on vec0 virtual tables: updating one vector column must
    # leave the other vector columns untouched.
    db = connect(EXT_PATH)  # fresh connection; intentionally shadows module-level db
    db.execute(
        """
        create virtual table t using vec0(
          aaa float[128],
          bbb int8[128],
          ccc bit[128]
        );
        """
    )

    # Seed one row containing all three vector types (numpy arrays bind as blobs).
    db.execute(
        "insert into t values (?, ?, vec_int8(?), vec_bit(?))",
        [
            1,
            np.full((128,), 0.0001, dtype="float32"),
            np.full((128,), 4, dtype="int8"),
            bitmap_full(128),
        ],
    )

    assert execute_all(db, "select * from t") == [
        {
            "rowid": 1,
            "aaa": _f32([0.0001] * 128),
            "bbb": _int8([4] * 128),
            "ccc": bitmap_full(128),
        }
    ]
    # Update only `aaa` ...
    db.execute(
        "update t set aaa = ? where rowid = ?",
        [np.full((128,), 0.00011, dtype="float32"), 1],
    )
    # ... and verify `bbb` and `ccc` are unchanged.
    assert execute_all(db, "select * from t") == [
        {
            "rowid": 1,
            "aaa": _f32([0.00011] * 128),
            "bbb": _int8([4] * 128),
            "ccc": bitmap_full(128),
        }
    ]
def test_vec_each():
    """vec_each() iterates a float32 vector as (rowid, value) rows from rowid 0."""

    def vec_each_f32(*args):
        return execute_all(db, "select rowid, * from vec_each(vec_f32(?))", args)

    expected = [
        {"rowid": 0, "value": 1.0},
        {"rowid": 1, "value": 2.0},
        {"rowid": 2, "value": 3.0},
    ]
    assert vec_each_f32(_f32([1.0, 2.0, 3.0])) == expected
import io
|
||||||
|
|
||||||
|
|
||||||
|
def to_npy(arr):
    """Serialize a numpy array to .npy-format bytes (input for vec_npy_each)."""
    with io.BytesIO() as buf:
        np.save(buf, arr)
        # getvalue() returns the full buffer regardless of cursor position.
        return buf.getvalue()
def test_vec_npy_each():
    """vec_npy_each() yields one row per vector, rowids starting at 0."""

    def vec_npy_each(*args):
        return execute_all(db, "select rowid, * from vec_npy_each(?)", args)

    single = [
        {
            "rowid": 0,
            "vector": _f32([1.1, 2.2, 3.3]),
        },
    ]
    # A 1-D array is treated as a single vector...
    assert vec_npy_each(to_npy(np.array([1.1, 2.2, 3.3], dtype=np.float32))) == single
    # ...and a (1, 3) matrix yields the same single row.
    assert vec_npy_each(to_npy(np.array([[1.1, 2.2, 3.3]], dtype=np.float32))) == single
    # A (2, 3) matrix yields two rows in order.
    assert vec_npy_each(
        to_npy(np.array([[1.1, 2.2, 3.3], [9.9, 8.8, 7.7]], dtype=np.float32))
    ) == [
        {
            "rowid": 0,
            "vector": _f32([1.1, 2.2, 3.3]),
        },
        {
            "rowid": 1,
            "vector": _f32([9.9, 8.8, 7.7]),
        },
    ]
def test_smoke():
    # End-to-end check of vec0 internals: shadow table layout, query plans,
    # and the exact byte layout of chunk blobs after each insert.
    db.execute("create virtual table vec_xyz using vec0( a float[2] )")
    # vec0 creates three shadow tables alongside the virtual table.
    assert execute_all(
        db,
        "select name, ncol from pragma_table_list where name like 'vec_xyz%' order by name;",
    ) == [
        {
            "name": "vec_xyz",
            "ncol": 4,
        },
        {
            "name": "vec_xyz_chunks",
            "ncol": 4,
        },
        {
            "name": "vec_xyz_rowids",
            "ncol": 4,
        },
        {
            "name": "vec_xyz_vector_chunks00",
            "ncol": 2,
        },
    ]
    # A fresh table pre-allocates one empty chunk: zeroed validity bitmap
    # (1024 slots / 8 bits per byte) and zeroed rowid/vector buffers.
    chunk = db.execute("select * from vec_xyz_chunks").fetchone()
    assert chunk["chunk_id"] == 1
    assert chunk["validity"] == bytearray(int(1024 / 8))
    assert chunk["rowids"] == bytearray(int(1024 * 8))
    vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone()
    assert vchunk["rowid"] == 1
    assert vchunk["vectors"] == bytearray(int(1024 * 4 * 2))

    # Query-plan selection: KNN for MATCH+ORDER BY distance+LIMIT,
    # full scan otherwise, point lookup for rowid equality.
    assert (
        explain_query_plan(
            "select * from vec_xyz where a match X'' order by distance limit 10"
        )
        == "SCAN vec_xyz VIRTUAL TABLE INDEX 0:knn:"
    )
    assert (
        explain_query_plan("select * from vec_xyz")
        == "SCAN vec_xyz VIRTUAL TABLE INDEX 0:fullscan"
    )
    assert (
        explain_query_plan("select * from vec_xyz where rowid = 4")
        == "SCAN vec_xyz VIRTUAL TABLE INDEX 3:point"
    )

    # First insert ([0.0, 1.0] as little-endian f32): validity bit 0 set,
    # rowid 1 and the vector bytes written at slot 0.
    db.execute("insert into vec_xyz(rowid, a) select 1, X'000000000000803f'")
    chunk = db.execute("select * from vec_xyz_chunks").fetchone()
    assert chunk["chunk_id"] == 1
    assert chunk["validity"] == b"\x01" + bytearray(int(1024 / 8) - 1)
    assert chunk["rowids"] == b"\x01\x00\x00\x00\x00\x00\x00\x00" + bytearray(
        int(1024 * 8) - 8
    )
    vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone()
    assert vchunk["rowid"] == 1
    assert vchunk["vectors"] == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + bytearray(
        int(1024 * 4 * 2) - (2 * 4)
    )

    # Second insert appends into slot 1 of the same chunk (validity 0b11).
    db.execute("insert into vec_xyz(rowid, a) select 2, X'0000000000000040'")
    chunk = db.execute("select * from vec_xyz_chunks").fetchone()
    assert chunk[
        "rowids"
    ] == b"\x01\x00\x00\x00\x00\x00\x00\x00" + b"\x02\x00\x00\x00\x00\x00\x00\x00" + bytearray(
        int(1024 * 8) - 8 * 2
    )
    assert chunk["chunk_id"] == 1
    assert chunk["validity"] == b"\x03" + bytearray(int(1024 / 8) - 1)
    vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone()
    assert vchunk["rowid"] == 1
    assert vchunk[
        "vectors"
    ] == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + b"\x00\x00\x00\x00\x00\x00\x00\x40" + bytearray(
        int(1024 * 4 * 2) - (2 * 4 * 2)
    )

    # Third insert fills slot 2 (validity 0b111).
    db.execute("insert into vec_xyz(rowid, a) select 3, X'00000000000080bf'")
    chunk = db.execute("select * from vec_xyz_chunks").fetchone()
    assert chunk["chunk_id"] == 1
    assert chunk["validity"] == b"\x07" + bytearray(int(1024 / 8) - 1)
    assert chunk[
        "rowids"
    ] == b"\x01\x00\x00\x00\x00\x00\x00\x00" + b"\x02\x00\x00\x00\x00\x00\x00\x00" + b"\x03\x00\x00\x00\x00\x00\x00\x00" + bytearray(
        int(1024 * 8) - 8 * 3
    )
    vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone()
    assert vchunk["rowid"] == 1
    assert vchunk[
        "vectors"
    ] == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + b"\x00\x00\x00\x00\x00\x00\x00\x40" + b"\x00\x00\x00\x00\x00\x00\x80\xbf" + bytearray(
        int(1024 * 4 * 2) - (2 * 4 * 3)
    )

    # Full scan returns the raw vector blobs in rowid order.
    assert execute_all(db, "select * from vec_xyz") == [
        {"rowid": 1, "a": b"\x00\x00\x00\x00\x00\x00\x80?"},
        {"rowid": 2, "a": b"\x00\x00\x00\x00\x00\x00\x00@"},
        {"rowid": 3, "a": b"\x00\x00\x00\x00\x00\x00\x80\xbf"},
    ]
def test_vec0_stress_small_chunks():
    # Insert 1000 rows into a vec0 table with chunk_size=8, forcing many
    # chunks, then verify forward scans, reverse scans, count, and a KNN
    # query whose results span chunk boundaries.
    data = np.zeros((1000, 8), dtype=np.float32)
    for i in range(1000):
        # Row i holds the constant vector [(i+1)*0.1] * 8.
        data[i] = np.array([(i + 1) * 0.1] * 8)
    db.execute("create virtual table vec_small using vec0(chunk_size=8, a float[8])")
    assert execute_all(db, "select rowid, * from vec_small") == []
    with db:
        for row in data:
            db.execute("insert into vec_small(a) values (?) ", [row])
    # Forward scan: auto-assigned rowids start at 1.
    assert execute_all(db, "select rowid, * from vec_small limit 8") == [
        {"rowid": 1, "a": _f32([0.1] * 8)},
        {"rowid": 2, "a": _f32([0.2] * 8)},
        {"rowid": 3, "a": _f32([0.3] * 8)},
        {"rowid": 4, "a": _f32([0.4] * 8)},
        {"rowid": 5, "a": _f32([0.5] * 8)},
        {"rowid": 6, "a": _f32([0.6] * 8)},
        {"rowid": 7, "a": _f32([0.7] * 8)},
        {"rowid": 8, "a": _f32([0.8] * 8)},
    ]
    assert db.execute("select count(*) from vec_small").fetchone()[0] == 1000
    # Reverse scan across the chunk boundary.
    assert execute_all(
        db, "select rowid, * from vec_small order by rowid desc limit 8"
    ) == [
        {"rowid": 1000, "a": _f32([100.0] * 8)},
        {"rowid": 999, "a": _f32([99.9] * 8)},
        {"rowid": 998, "a": _f32([99.8] * 8)},
        {"rowid": 997, "a": _f32([99.7] * 8)},
        {"rowid": 996, "a": _f32([99.6] * 8)},
        {"rowid": 995, "a": _f32([99.5] * 8)},
        {"rowid": 994, "a": _f32([99.4] * 8)},
        {"rowid": 993, "a": _f32([99.3] * 8)},
    ]
    # KNN on the exact vector of rowid 500: distance 0 for 500, then
    # symmetric neighbor pairs (499/501, 498/502, ...) at equal distances.
    assert (
        execute_all(
            db,
            """
            select rowid, a, distance
            from vec_small
            where a match ?
              and k = 9
            order by distance
            """,
            [_f32([50.0] * 8)],
        )
        == [
            {
                "a": _f32([500 * 0.1] * 8),
                "distance": 0.0,
                "rowid": 500,
            },
            {
                "a": _f32([499 * 0.1] * 8),
                "distance": 0.2828384041786194,
                "rowid": 499,
            },
            {
                "a": _f32([501 * 0.1] * 8),
                "distance": 0.2828384041786194,
                "rowid": 501,
            },
            {
                "a": _f32([498 * 0.1] * 8),
                "distance": 0.5656875967979431,
                "rowid": 498,
            },
            {
                "a": _f32([502 * 0.1] * 8),
                "distance": 0.5656875967979431,
                "rowid": 502,
            },
            {
                "a": _f32([497 * 0.1] * 8),
                "distance": 0.8485260009765625,
                "rowid": 497,
            },
            {
                "a": _f32([503 * 0.1] * 8),
                "distance": 0.8485260009765625,
                "rowid": 503,
            },
            {
                "a": _f32([496 * 0.1] * 8),
                "distance": 1.1313751935958862,
                "rowid": 496,
            },
            {
                "a": _f32([504 * 0.1] * 8),
                "distance": 1.1313751935958862,
                "rowid": 504,
            },
        ]
    )
def rowids_value(buffer: bytearray) -> List[int]:
    """Decode a rowids chunk blob (packed little-endian int64s) into a list."""
    assert (len(buffer) % 8) == 0
    count = len(buffer) // 8
    return list(struct.unpack_from(f"<{count}q", buffer))
import numpy.typing as npt
|
||||||
|
|
||||||
|
|
||||||
|
def cosine_similarity(
    vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True
) -> npt.NDArray[np.float32]:
    """Similarity of `vec` against every row of `mat`.

    Returns the raw dot products when do_norm is False, otherwise the
    dot products divided by the product of the L2 norms (cosine similarity).
    """
    scores = vec @ mat.T
    if do_norm:
        scores = scores / (np.linalg.norm(vec) * np.linalg.norm(mat, axis=1))
    return scores
def topk(
    vec: npt.NDArray[np.float32],
    mat: npt.NDArray[np.float32],
    k: int = 5,
    do_norm: bool = True,
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]:
    """Return (indices, similarities) of the `k` rows of `mat` most similar
    to `vec`, ordered by descending cosine similarity.

    Bug fix: the original returned `sim[top_indices]`, indexing the *full*
    similarity array with positions local to the k-element partition, so the
    returned similarity values belonged to the wrong rows. The correct values
    are `sim` at the reordered global indices.
    """
    sim = cosine_similarity(vec, mat, do_norm=do_norm)
    # Rather than sorting all similarities and taking the top K, it's faster to
    # argpartition and then just sort the top K.
    # The difference is O(N logN) vs O(N + k logk)
    # Clamp kth so k == len(sim) is also legal (argpartition requires kth < n).
    kth = min(k, sim.size - 1)
    candidates = np.argpartition(-sim, kth=kth)[:k]
    order = np.argsort(-sim[candidates])
    top = candidates[order]
    return top, sim[top]
def test_stress1():
    # Compare vec0 cosine-distance KNN against a numpy brute-force top-k
    # reference, over 8000 random 128-d vectors and 100 random queries.
    np.random.seed(1234)  # deterministic data/queries
    data = np.random.uniform(-1.0, 1.0, (8000, 128)).astype(np.float32)
    db.execute(
        "create virtual table vec_stress1 using vec0( a float[128] distance_metric=cosine)"
    )
    with db:
        for idx, row in enumerate(data):
            db.execute("insert into vec_stress1 values (?, ?)", [idx, row])
    queries = np.random.uniform(-1.0, 1.0, (100, 128)).astype(np.float32)
    for q in queries:
        # Reference: descending similarity == ascending cosine distance.
        ids, distances = topk(q, data, k=10)
        rows = db.execute(
            """
            select rowid, distance
            from vec_stress1
            where a match ? and k = ?
            order by distance
            """,
            [q, 10],
        ).fetchall()
        assert len(ids) == 10
        assert len(rows) == 10
        vec_ids = [row[0] for row in rows]
        # Same top-10 ids, in the same order, as the brute-force reference.
        assert ids.tolist() == vec_ids
@pytest.mark.skip(reason="slow")
|
||||||
|
def test_stress():
|
||||||
|
db.execute("create virtual table vec_t1 using vec0( a float[1536])")
|
||||||
|
|
||||||
|
def rand_vec(n):
|
||||||
|
return struct.pack("%sf" % n, *list(map(lambda x: random(), range(n))))
|
||||||
|
|
||||||
|
for i in range(1025):
|
||||||
|
db.execute("insert into vec_t1(a) values (?)", [rand_vec(1536)])
|
||||||
|
rows = db.execute("select validity, rowids from vec_t1_chunks").fetchall()
|
||||||
|
assert len(rows) == 2
|
||||||
|
|
||||||
|
assert len(rows[0]["validity"]) == 1024 / CHAR_BIT
|
||||||
|
assert len(rows[0]["rowids"]) == 1024 * CHAR_BIT
|
||||||
|
assert rows[0]["validity"] == bitmap_full(1024)
|
||||||
|
assert rowids_value(rows[0]["rowids"]) == [x + 1 for x in range(1024)]
|
||||||
|
|
||||||
|
assert len(rows[1]["validity"]) == 1024 / CHAR_BIT
|
||||||
|
assert len(rows[1]["rowids"]) == 1024 * CHAR_BIT
|
||||||
|
assert rows[1]["validity"] == bytes([0b0000_0001]) + bitmap_zerod(1024)[1:]
|
||||||
|
assert rowids_value(rows[1]["rowids"])[0] == 1025
|
||||||
|
|
||||||
|
|
||||||
|
def test_coverage():
    """Every entry in FUNCTIONS/MODULES must have a matching test_* function here."""
    current_module = inspect.getmodule(inspect.currentframe())
    test_methods = [
        name
        for name, _obj in inspect.getmembers(current_module)
        if name.startswith("test_")
    ]
    funcs_with_tests = {name.replace("test_", "") for name in test_methods}
    for func in [*FUNCTIONS, *MODULES]:
        assert func in funcs_with_tests, f"{func} is not tested"
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
37
tests/unittest.rs
Normal file
37
tests/unittest.rs
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
// Manual smoke entry point: exercises the C `min_idx` binding directly.
fn main() {
    println!("Hello, world!");
    // Indices of the 2 smallest distances; per test_basic below this is [2, 1].
    println!("{:?}", _min_idx(vec![3.0, 2.0, 1.0], 2));
}
// Safe wrapper over the C `min_idx`: fills and returns a Vec of `k` indices
// into `distances` (per test_basic, the k smallest values in ascending order).
fn _min_idx(distances: Vec<f32>, k: i32) -> Vec<i32> {
    let mut out: Vec<i32> = vec![0; k as usize];

    // SAFETY: `distances` and `out` are live for the duration of the call;
    // `out` has exactly `k` elements and the C side receives both lengths.
    // Assumes min_idx writes at most `k` i32s and reads at most `n` f32s —
    // TODO confirm against the C implementation. The i32 return value is
    // intentionally ignored here.
    unsafe {
        min_idx(
            distances.as_ptr().cast(),
            distances.len() as i32,
            out.as_mut_ptr(),
            k,
        );
    }
    out
}
#[link(name = "sqlite-vec-internal")]
|
||||||
|
extern "C" {
|
||||||
|
fn min_idx(distances: *const f32, n: i32, out: *mut i32, k: i32) -> i32;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic() {
        // k == n: a full argsort of the distances, ascending.
        assert_eq!(_min_idx(vec![1.0, 2.0, 3.0], 3), vec![0, 1, 2]);
        assert_eq!(_min_idx(vec![3.0, 2.0, 1.0], 3), vec![2, 1, 0]);

        // k < n: only the k smallest, still in ascending-distance order.
        assert_eq!(_min_idx(vec![1.0, 2.0, 3.0], 2), vec![0, 1]);
        assert_eq!(_min_idx(vec![3.0, 2.0, 1.0], 2), vec![2, 1]);
    }
}
22
tests/utils.py
Normal file
22
tests/utils.py
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
import numpy as np
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
|
||||||
|
def to_npy(arr):
    """Return `arr` serialized in the .npy binary format."""
    out = BytesIO()
    np.save(out, arr)
    # getvalue() yields the whole buffer; no need to seek back and read.
    return out.getvalue()
||||||
|
# NOTE(review): the statements below look like leftover debugging scratch —
# they execute (and print raw .npy bytes) every time this module is imported.
# Consider removing them or guarding under `if __name__ == "__main__":`.
to_npy(np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]], dtype=np.float32))

print(to_npy(np.array([[1.0, 2.0]], dtype=np.float32)))
print(to_npy(np.array([1.0, 2.0], dtype=np.float32)))

to_npy(
    np.array(
        [np.zeros(10), np.zeros(10), np.zeros(10), np.zeros(10), np.zeros(10)],
        dtype=np.float32,
    )
)
Loading…
Add table
Add a link
Reference in a new issue