diff --git a/benchmarks/exhaustive-memory/.gitignore b/benchmarks/exhaustive-memory/.gitignore new file mode 100644 index 0000000..8fce603 --- /dev/null +++ b/benchmarks/exhaustive-memory/.gitignore @@ -0,0 +1 @@ +data/ diff --git a/benchmarks/exhaustive-memory/Makefile b/benchmarks/exhaustive-memory/Makefile new file mode 100644 index 0000000..525571f --- /dev/null +++ b/benchmarks/exhaustive-memory/Makefile @@ -0,0 +1,15 @@ + + + +data/: + mkdir -p $@ + +data/sift: data/ + curl -o data/sift.tar.gz ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz + tar -xvzf data/sift.tar.gz -C data/ + rm data/sift.tar.gz + +data/gist: data/ + curl -o data/gist.tar.gz ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz + tar -xvzf data/gist.tar.gz -C data/ + rm data/gist.tar.gz diff --git a/benchmarks/exhaustive-memory/README.md b/benchmarks/exhaustive-memory/README.md index 374e4ee..e432ccf 100644 --- a/benchmarks/exhaustive-memory/README.md +++ b/benchmarks/exhaustive-memory/README.md @@ -1,35 +1,25 @@ -``` -python3 bench/bench.py \ - -n "sift1m" \ - -i sift/sift_base.fvecs \ - -q sift/sift_query.fvecs \ - --sample 10000 --qsample 100 \ - -k 10 -``` +# `sqlite-vec` In-memory benchmark comparisons -``` -python3 bench.py \ - -n "sift1m" \ - -i ../../sift/sift_base.fvecs \ - -q ../../sift/sift_query.fvecs \ - --qsample 100 \ - -k 20 -``` -``` -python3 bench.py \ - -n "sift1m" \ - -i ../../sift/sift_base.fvecs \ - -q ../../sift/sift_query.fvecs \ - --qsample 100 \ - -x faiss,vec-scalar.4096,vec-static,vec-vec0.4096.16,vec-vec0.8192.1024,usearch,duckdb,hnswlib,numpy \ - -k 20 -``` +This repo contains a benchmark that compares KNN queries of `sqlite-vec` to other in-process vector search tools using **brute force linear scans only**. 
These include: - -``` -python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --qsample 100 -k 20 --sample 500000 -x faiss,vec-static,vec-scalar.8192,vec-scalar.16384,vec-scalar.32768,vec-vec0.16384.64,vec-vec0.16384.128,vec-vec0.16384.256,vec-vec0.16384.512,vec-vec0.16384.1024,vec-vec0.16384.2048 -``` +- [Faiss IndexFlatL2](https://faiss.ai/) +- [usearch with `exact=True`](https://github.com/unum-cloud/usearch) +- [libsql vector search with `vector_distance_cos`](https://turso.tech/vector) +- [numpy](https://numpy.org/), using [this approach](https://github.com/EthanRosenthal/nn-vs-ann) +- [duckdb with `list_cosine_similarity`](https://duckdb.org/docs/sql/functions/nested.html#list_cosine_similaritylist1-list2) +- [`sentence_transformers.util.semantic_search`](https://sbert.net/docs/package_reference/util.html#sentence_transformers.util.semantic_search) +- [hnswlib BFIndex](https://github.com/nmslib/hnswlib/blob/c1b9b79af3d10c6ee7b5d0afa1ce851ae975254c/TESTING_RECALL.md?plain=1#L8) -python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --qsample 100 -k 20 --sample 500000 -x faiss,vec-static,sentence-transformers,numpy +Again **ONLY BRUTE FORCE LINEAR SCANS ARE TESTED**. This benchmark does **not** test approximate nearest neighbors (ANN) implementations. This benchmark is extremely narrow, testing only KNN searches using brute force. + +A few other caveats: + +- Only brute-force linear scans, no ANN +- Only CPU is used. The only tool that does offer GPU is Faiss anyway. +- Only in-memory datasets are used. Many of these tools do support serializing and reading from disk (including `sqlite-vec`) and possibly `mmap`'ing, but this only tests in-memory datasets. Mostly because of numpy. +- Queries are made one after the other, **not batched.** Some tools offer APIs to query multiple inputs at the same time, but this benchmark runs queries sequentially. 
This was done to emulate "server request"-style queries, where multiple users send queries at different times, making batching more difficult. To note, `sqlite-vec` does **not** support batched queries yet. + + +These tests are run in Python. Vectors are provided as an in-memory numpy array, and each test converts that numpy array into whatever makes sense for the given tool. For example, `sqlite-vec` tests will read those vectors into a SQLite table. DuckDB will read them into an Arrow array, then create a DuckDB table from that. diff --git a/benchmarks/exhaustive-memory/b.py b/benchmarks/exhaustive-memory/b.py deleted file mode 100644 index 9cad980..0000000 --- a/benchmarks/exhaustive-memory/b.py +++ /dev/null @@ -1,51 +0,0 @@ -import numpy as np -import numpy.typing as npt -import time - -def cosine_similarity( -    vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True -) -> npt.NDArray[np.float32]: -    sim = vec @ mat.T -    if do_norm: -        sim /= np.linalg.norm(vec) * np.linalg.norm(mat, axis=1) -    return sim - - -def topk( -    vec: npt.NDArray[np.float32], -    mat: npt.NDArray[np.float32], -    k: int = 5, -    do_norm: bool = True, -) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]: -    sim = cosine_similarity(vec, mat, do_norm=do_norm) -    # Rather than sorting all similarities and taking the top K, it's faster to -    # argpartition and then just sort the top K. 
- # The difference is O(N logN) vs O(N + k logk) - indices = np.argpartition(-sim, kth=k)[:k] - top_indices = np.argsort(-sim[indices]) - return indices[top_indices], sim[top_indices] - - -def ivecs_read(fname): - a = np.fromfile(fname, dtype="int32") - d = a[0] - return a.reshape(-1, d + 1)[:, 1:].copy() - - -def fvecs_read(fname): - return ivecs_read(fname).view("float32") - - - -base = fvecs_read("../../sift/sift_base.fvecs") -queries = fvecs_read("../../sift/sift_query.fvecs") -k = 20 -times = [] -results = [] -for idx, q in enumerate(queries[0:20]): - t0 = time.time() - result = topk(q, base, k=k) - results.append(result) - times.append(time.time() - t0) -print(np.__version__) -print(np.mean(times)) diff --git a/benchmarks/exhaustive-memory/bench.py b/benchmarks/exhaustive-memory/bench.py index 211864d..c9da831 100644 --- a/benchmarks/exhaustive-memory/bench.py +++ b/benchmarks/exhaustive-memory/bench.py @@ -1,22 +1,12 @@ import numpy as np import numpy.typing as npt import time -import hnswlib import sqlite3 -import faiss -import lancedb import pandas as pd - -# import chromadb -from usearch.index import Index, search, MetricKind - from dataclasses import dataclass - -from typing import List - -import duckdb -import pyarrow as pa -from sentence_transformers.util import semantic_search +from rich.console import Console +from rich.table import Table +from typing import List, Optional @dataclass @@ -66,6 +56,7 @@ def fvecs_read(fname, sample): def bench_hnsw(base, query): + import hnswlib t0 = time.time() p = hnswlib.Index(space="ip", dim=128) # possible options are l2, cosine or ip @@ -92,6 +83,7 @@ def bench_hnsw(base, query): def bench_hnsw_bf(base, query, k) -> BenchResult: + import hnswlib print("hnswlib-bf") dimensions = base.shape[1] t0 = time.time() @@ -115,7 +107,7 @@ def bench_hnsw_bf(base, query, k) -> BenchResult: def bench_numpy(base, query, k) -> BenchResult: - print("numpy") + print("numpy...") times = [] results = [] for idx, q in 
enumerate(query): @@ -128,7 +120,7 @@ def bench_numpy(base, query, k) -> BenchResult: def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult: dimensions = base.shape[1] - print(f"sqlite-vec {page_size} {chunk_size}") + print(f"sqlite-vec {page_size} {chunk_size}...") db = sqlite3.connect(":memory:") db.execute(f"PRAGMA page_size = {page_size}") @@ -169,12 +161,13 @@ def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult: """, [q.tobytes(), k], ).fetchall() + assert len(result) == k times.append(time.time() - t0) return BenchResult(f"sqlite-vec vec0 ({page_size}|{chunk_size})", build_time, times) def bench_sqlite_vec_scalar(base, query, page_size, k) -> BenchResult: - print(f"sqlite-vec-scalar") + print(f"sqlite-vec-scalar...") db = sqlite3.connect(":memory:") db.enable_load_extension(True) @@ -208,11 +201,12 @@ def bench_sqlite_vec_scalar(base, query, page_size, k) -> BenchResult: """, [q.tobytes(), k], ).fetchall() + assert len(result) == k times.append(time.time() - t0) return BenchResult(f"sqlite-vec-scalar ({page_size})", build_time, times) def bench_libsql(base, query, page_size, k) -> BenchResult: - print(f"libsql") + print(f"libsql ...") dimensions = base.shape[1] db = sqlite3.connect(":memory:") @@ -273,7 +267,7 @@ def register_np(db, array, name): ) def bench_sqlite_vec_static(base, query, k) -> BenchResult: - print(f"sqlite-vec static") + print(f"sqlite-vec static...") db = sqlite3.connect(":memory:") db.enable_load_extension(True) @@ -303,12 +297,14 @@ def bench_sqlite_vec_static(base, query, k) -> BenchResult: """, [q.tobytes(), k], ).fetchall() + assert len(result) == k times.append(time.time() - t0) return BenchResult(f"sqlite-vec static", build_time, times) def bench_faiss(base, query, k) -> BenchResult: + import faiss dimensions = base.shape[1] - print("faiss") + print("faiss...") t = time.time() index = faiss.IndexFlatL2(dimensions) index.add(base) @@ -321,11 +317,12 @@ def bench_faiss(base, query, k) -> 
BenchResult: distances, rowids = index.search(x=np.array([q]), k=k) results.append(rowids) times.append(time.time() - t0) - print("faiss avg", duration(np.mean(times))) return BenchResult("faiss", build_time, times) def bench_lancedb(base, query, k) -> BenchResult: + import lancedb + print('lancedb...') dimensions = base.shape[1] db = lancedb.connect("a") data = [{"vector": row.reshape(1, -1)[0]} for row in base] @@ -343,6 +340,9 @@ def bench_lancedb(base, query, k) -> BenchResult: return BenchResult("lancedb", build_time, times) def bench_duckdb(base, query, k) -> BenchResult: + import duckdb + import pyarrow as pa + print("duckdb...") dimensions = base.shape[1] db = duckdb.connect(":memory:") db.execute(f"CREATE TABLE t(vector float[{dimensions}])") @@ -368,6 +368,7 @@ def bench_duckdb(base, query, k) -> BenchResult: return BenchResult("duckdb", build_time, times) def bench_sentence_transformers(base, query, k) -> BenchResult: + from sentence_transformers.util import semantic_search print("sentence-transformers") dimensions = base.shape[1] t0 = time.time() @@ -382,28 +383,29 @@ def bench_sentence_transformers(base, query, k) -> BenchResult: return BenchResult("sentence-transformers", build_time, times) -# def bench_chroma(base, query, k): -# chroma_client = chromadb.Client() -# collection = chroma_client.create_collection(name="my_collection") -# -# t = time.time() -# # chroma doesn't allow for more than 41666 vectors to be inserted at once (???) 
-# i = 0 -# collection.add(embeddings=base, ids=[str(x) for x in range(len(base))]) -# print("chroma build time: ", duration(time.time() - t)) -# times = [] -# for q in query: -# t0 = time.time() -# result = collection.query( -# query_embeddings=[q.tolist()], -# n_results=k, -# ) -# print(result) -# times.append(time.time() - t0) -# print("chroma avg", duration(np.mean(times))) +def bench_chroma(base, query, k): + import chromadb + from chromadb.utils.batch_utils import create_batches + chroma_client = chromadb.EphemeralClient() + collection = chroma_client.create_collection(name="my_collection") + t = time.time() + for batch in create_batches(api=chroma_client, ids=[str(x) for x in range(len(base))], embeddings=base.tolist()): + collection.add(*batch) + build_time = time.time() - t + times = [] + for q in query: + t0 = time.time() + result = collection.query( + query_embeddings=[q.tolist()], + n_results=k, + ) + times.append(time.time() - t0) + #print("chroma avg", duration(np.mean(times))) + return BenchResult("chroma", build_time, times) def bench_usearch_npy(base, query, k) -> BenchResult: + from usearch.index import Index, search, MetricKind times = [] for q in query: t0 = time.time() @@ -414,6 +416,7 @@ def bench_usearch_npy(base, query, k) -> BenchResult: def bench_usearch_special(base, query, k) -> BenchResult: + from usearch.index import Index, search, MetricKind dimensions = base.shape[1] index = Index(ndim=dimensions) t = time.time() @@ -425,18 +428,14 @@ def bench_usearch_special(base, query, k) -> BenchResult: t0 = time.time() result = index.search(q, exact=True) times.append(time.time() - t0) - return BenchResult("usuearch index exact=True", build_time, times) - - -from rich.console import Console -from rich.table import Table + return BenchResult("usuearch index", build_time, times) def suite(name, base, query, k, benchmarks): print(f"Starting benchmark suite: {name} {base.shape}, k={k}") results = [] - for b in benchmarks.split(","): + for b in 
benchmarks: if b == "faiss": results.append(bench_faiss(base, query, k=k)) elif b == "vec-static": @@ -460,6 +459,8 @@ def suite(name, base, query, k, benchmarks): results.append(bench_duckdb(base, query, k=k)) elif b == "sentence-transformers": results.append(bench_sentence_transformers(base, query, k=k)) + elif b == "chroma": + results.append(bench_chroma(base, query, k=k)) else: raise Exception(f"unknown benchmark {b}") @@ -565,12 +566,58 @@ def cli_read_query(query, base): return cli_read_input(query, -1) -def main(): - args = parse_args() - print(args) - base = cli_read_input(args.input, args.sample) - queries = cli_read_query(args.query, base)[: args.qsample] - suite(args.name, base, queries, args.k, args.x) +@dataclass +class Config: + name: str + input: str + k: int + queries: str + qsample: int + tests: List[str] + sample: Optional[int] + +def parse_config_file(path:str) -> Config: + name = None + input = None + k = None + queries = None + qsample = None + sample = None + tests = [] + + for line in open(path, 'r'): + line = line.strip() + if not line or line.startswith('#'): + continue + elif line.startswith('@name='): + name = line.removeprefix('@name=') + elif line.startswith('@k='): + k = line.removeprefix('@k=') + elif line.startswith('@input='): + input = line.removeprefix('@input=') + elif line.startswith('@queries='): + queries = line.removeprefix('@queries=') + elif line.startswith('@qsample='): + qsample = line.removeprefix('@qsample=') + elif line.startswith('@sample='): + sample = line.removeprefix('@sample=') + elif line.startswith('@'): + raise Exception(f"unknown config line '{line}'") + else: + tests.append(line) + return Config(name, input, int(k), queries, int(qsample), tests, int(sample) if sample is not None else None) + + + +from sys import argv if __name__ == "__main__": - main() + config = parse_config_file(argv[1]) + print(config) + #args = parse_args() + #print(args) + base = cli_read_input(config.input, config.sample) + queries = 
cli_read_query(config.queries, base)[: config.qsample] + suite(config.name, base, queries, config.k, config.tests) + + #main() diff --git a/benchmarks/exhaustive-memory/gist.sh b/benchmarks/exhaustive-memory/gist.sh deleted file mode 100755 index 1522698..0000000 --- a/benchmarks/exhaustive-memory/gist.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --sample 750000 --qsample 200 -k 20 -x $1 diff --git a/benchmarks/exhaustive-memory/gist.suite b/benchmarks/exhaustive-memory/gist.suite new file mode 100644 index 0000000..f308748 --- /dev/null +++ b/benchmarks/exhaustive-memory/gist.suite @@ -0,0 +1,15 @@ +@name=gist +@input=data/gist/gist_base.fvecs +@queries=data/gist/gist_query.fvecs +@sample=500000 +@qsample=20 +@k=20 + +faiss +usearch +vec-static +#duckdb +#vec-vec0.8192.1024 +#vec-vec0.8192.2048 +#vec-scalar.8192 +#numpy diff --git a/benchmarks/exhaustive-memory/requirements.txt b/benchmarks/exhaustive-memory/requirements.txt new file mode 100644 index 0000000..e92cd19 --- /dev/null +++ b/benchmarks/exhaustive-memory/requirements.txt @@ -0,0 +1,120 @@ +annotated-types==0.7.0 +anyio==4.4.0 +asgiref==3.8.1 +attrs==23.2.0 +backoff==2.2.1 +bcrypt==4.2.0 +build==1.2.1 +cachetools==5.4.0 +certifi==2024.7.4 +charset-normalizer==3.3.2 +chroma-hnswlib==0.7.6 +chromadb==0.5.5 +click==8.1.7 +coloredlogs==15.0.1 +decorator==5.1.1 +deprecated==1.2.14 +deprecation==2.1.0 +dnspython==2.6.1 +duckdb==1.0.0 +email-validator==2.2.0 +faiss-cpu==1.8.0.post1 +fastapi==0.111.1 +fastapi-cli==0.0.4 +filelock==3.15.4 +flatbuffers==24.3.25 +fsspec==2024.6.1 +google-auth==2.32.0 +googleapis-common-protos==1.63.2 +grpcio==1.65.1 +h11==0.14.0 +hnswlib==0.8.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +huggingface-hub==0.24.1 +humanfriendly==10.0 +idna==3.7 +importlib-metadata==8.0.0 +importlib-resources==6.4.0 +jinja2==3.1.4 +joblib==1.4.2 +kubernetes==30.1.0 +lancedb==0.10.2 +markdown-it-py==3.0.0 
+markupsafe==2.1.5 +mdurl==0.1.2 +mmh3==4.1.0 +monotonic==1.6 +mpmath==1.3.0 +networkx==3.3 +numpy==1.26.4 +oauthlib==3.2.2 +onnxruntime==1.18.1 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-instrumentation==0.47b0 +opentelemetry-instrumentation-asgi==0.47b0 +opentelemetry-instrumentation-fastapi==0.47b0 +opentelemetry-proto==1.26.0 +opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-util-http==0.47b0 +orjson==3.10.6 +overrides==7.7.0 +packaging==24.1 +pandas==2.2.2 +pillow==10.4.0 +posthog==3.5.0 +protobuf==4.25.4 +py==1.11.0 +pyarrow==15.0.0 +pyasn1==0.6.0 +pyasn1-modules==0.4.0 +pydantic==2.8.2 +pydantic-core==2.20.1 +pygments==2.18.0 +pylance==0.14.1 +pypika==0.48.9 +pyproject-hooks==1.1.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-multipart==0.0.9 +pytz==2024.1 +pyyaml==6.0.1 +ratelimiter==1.2.0.post0 +regex==2024.5.15 +requests==2.32.3 +requests-oauthlib==2.0.0 +retry==0.9.2 +rich==13.7.1 +rsa==4.9 +safetensors==0.4.3 +scikit-learn==1.5.1 +scipy==1.14.0 +sentence-transformers==3.0.1 +setuptools==71.1.0 +shellingham==1.5.4 +six==1.16.0 +sniffio==1.3.1 +starlette==0.37.2 +sympy==1.13.1 +tenacity==8.5.0 +threadpoolctl==3.5.0 +tokenizers==0.19.1 +torch==2.3.1 +tqdm==4.66.4 +transformers==4.43.1 +typer==0.12.3 +typing-extensions==4.12.2 +tzdata==2024.1 +urllib3==2.2.2 +usearch==2.12.0 +uvicorn==0.30.3 +uvloop==0.19.0 +watchfiles==0.22.0 +websocket-client==1.8.0 +websockets==12.0 +wrapt==1.16.0 +zipp==3.19.2 diff --git a/benchmarks/exhaustive-memory/sift.sh b/benchmarks/exhaustive-memory/sift.sh deleted file mode 100755 index 30ea86e..0000000 --- a/benchmarks/exhaustive-memory/sift.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -python bench.py -n sift1m -i ../../sift/sift_base.fvecs -q ../../sift/sift_query.fvecs --qsample 100 -k 20 -x $1 diff --git a/benchmarks/exhaustive-memory/sift.suite 
b/benchmarks/exhaustive-memory/sift.suite index 96e0a65..979ff8d 100644 --- a/benchmarks/exhaustive-memory/sift.suite +++ b/benchmarks/exhaustive-memory/sift.suite @@ -1,18 +1,28 @@ @name=sift1m -@i=../../sift/sift_base.fvecs -@q=../../sift/sift_query.fvecs +@input=data/sift/sift_base.fvecs +@queries=data/sift/sift_query.fvecs @qsample=100 +@k=20 -libsql.4096 -libsql.8192 faiss -vec-scalar.4096 -vec-static -vec-vec0.4096.16 -vec-vec0.8192.1024 -vec-vec0.4096.2048 usearch duckdb -hnswlib +vec-static +vec-vec0.8192.1024 +vec-vec0.8192.2048 +vec-scalar.8192 numpy +# #libsql.4096 +# #libsql.8192 +# faiss +# vec-scalar.4096 +# vec-static +# vec-vec0.4096.16 +# vec-vec0.8192.1024 +# vec-vec0.4096.2048 +# usearch +# duckdb +# hnswlib +# numpy +# chroma