benchmark updates

This commit is contained in:
Alex Garcia 2024-07-28 11:08:12 -07:00
parent 156d6c1e3b
commit 4febdff11a
10 changed files with 290 additions and 149 deletions

View file

@ -0,0 +1 @@
data/

View file

@ -0,0 +1,15 @@
# Create the local data directory that all dataset targets unpack into.
data/:
	mkdir -p $@

# Download and unpack the SIFT1M benchmark vectors (TEXMEX corpus) into data/sift.
# The tarball is deleted after extraction to save disk space.
data/sift: data/
	curl -o data/sift.tar.gz ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
	tar -xvzf data/sift.tar.gz -C data/
	rm data/sift.tar.gz

# Download and unpack the GIST1M benchmark vectors (TEXMEX corpus) into data/gist.
data/gist: data/
	curl -o data/gist.tar.gz ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz
	tar -xvzf data/gist.tar.gz -C data/
	rm data/gist.tar.gz

View file

@ -1,35 +1,25 @@
```
python3 bench/bench.py \
-n "sift1m" \
-i sift/sift_base.fvecs \
-q sift/sift_query.fvecs \
--sample 10000 --qsample 100 \
-k 10
```
# `sqlite-vec` In-memory benchmark comparisons
```
python3 bench.py \
-n "sift1m" \
-i ../../sift/sift_base.fvecs \
-q ../../sift/sift_query.fvecs \
--qsample 100 \
-k 20
```
```
python3 bench.py \
-n "sift1m" \
-i ../../sift/sift_base.fvecs \
-q ../../sift/sift_query.fvecs \
--qsample 100 \
-x faiss,vec-scalar.4096,vec-static,vec-vec0.4096.16,vec-vec0.8192.1024,usearch,duckdb,hnswlib,numpy \
-k 20
```
This repo contains a benchmark that compares KNN queries of `sqlite-vec` to other in-process vector search tools using **brute force linear scans only**. These include:
```
python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --qsample 100 -k 20 --sample 500000 -x faiss,vec-static,vec-scalar.8192,vec-scalar.16384,vec-scalar.32768,vec-vec0.16384.64,vec-vec0.16384.128,vec-vec0.16384.256,vec-vec0.16384.512,vec-vec0.16384.1024,vec-vec0.16384.2048
```
- [Faiss IndexFlatL2](https://faiss.ai/)
- [usearch with `exact=True`](https://github.com/unum-cloud/usearch)
- [libsql vector search with `vector_distance_cos`](https://turso.tech/vector)
- [numpy](https://numpy.org/), using [this approach](https://github.com/EthanRosenthal/nn-vs-ann)
- [duckdb with `list_cosine_similarity`](https://duckdb.org/docs/sql/functions/nested.html#list_cosine_similaritylist1-list2)
- [`sentence_transformers.util.semantic_search`](https://sbert.net/docs/package_reference/util.html#sentence_transformers.util.semantic_search)
- [hnswlib BFIndex](https://github.com/nmslib/hnswlib/blob/c1b9b79af3d10c6ee7b5d0afa1ce851ae975254c/TESTING_RECALL.md?plain=1#L8)
python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --qsample 100 -k 20 --sample 500000 -x faiss,vec-static,sentence-transformers,numpy
Again **ONLY BRUTE FORCE LINEAR SCANS ARE TESTED**. This benchmark does **not** test approximate nearest neighbors (ANN) implementations. This benchmark is narrowly scoped to testing only KNN searches using brute force.
A few other caveats:
- Only brute-force linear scans, no ANN
- Only CPU is used. The only tool that does offer GPU is Faiss anyway.
- Only in-memory datasets are used. Many of these tools do support serializing and reading from disk (including `sqlite-vec`) and possibly `mmap`'ing, but this only tests in-memory datasets. Mostly because of numpy
- Queries are made one after the other, **not batched.** Some tools offer APIs to query multiple inputs at the same time, but this benchmark runs queries sequentially. This was done to emulate "server request"-style workloads, where multiple users send queries at different times, making batching more difficult. To note, `sqlite-vec` does **not** support batched queries yet.
These tests are run in Python. Vectors are provided as an in-memory numpy array, and each test converts that numpy array into whatever makes sense for the given tool. For example, `sqlite-vec` tests will read those vectors into a SQLite table. DuckDB will read them into an Arrow array and then create a DuckDB table from that.

View file

@ -1,51 +0,0 @@
import numpy as np
import numpy.typing as npt
import time
def cosine_similarity(
    vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True
) -> npt.NDArray[np.float32]:
    """Return the cosine similarity between ``vec`` and every row of ``mat``.

    With ``do_norm=False`` this is just the raw dot products, which is only
    a cosine similarity if the inputs are already unit-normalized.
    """
    sim = vec @ mat.T
    if do_norm:
        sim /= np.linalg.norm(vec) * np.linalg.norm(mat, axis=1)
    return sim


def topk(
    vec: npt.NDArray[np.float32],
    mat: npt.NDArray[np.float32],
    k: int = 5,
    do_norm: bool = True,
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]:
    """Return (row indices, similarities) of the ``k`` rows of ``mat`` most
    similar to ``vec``, ordered from most to least similar.
    """
    sim = cosine_similarity(vec, mat, do_norm=do_norm)
    # Rather than sorting all similarities and taking the top K, it's faster to
    # argpartition and then just sort the top K.
    # The difference is O(N logN) vs O(N + k logk)
    indices = np.argpartition(-sim, kth=k)[:k]
    top_indices = np.argsort(-sim[indices])
    # BUG FIX: the similarities must also be looked up through `indices`.
    # The old `sim[top_indices]` indexed the *full* similarity array with
    # positions 0..k-1, so it returned the wrong similarity values.
    ordered = indices[top_indices]
    return ordered, sim[ordered]
def ivecs_read(fname):
    """Read a TEXMEX ``.ivecs`` file into an (n, d) int32 array.

    Each record on disk is an int32 dimension ``d`` followed by ``d``
    int32 components; the dimension column is stripped from the result.
    """
    raw = np.fromfile(fname, dtype="int32")
    dim = raw[0]
    # Reshape into (n, d + 1) records, then drop the leading dimension column.
    return raw.reshape(-1, dim + 1)[:, 1:].copy()


def fvecs_read(fname):
    """Read a TEXMEX ``.fvecs`` file: same layout as ``.ivecs`` but the
    payload bytes are reinterpreted as float32."""
    return ivecs_read(fname).view("float32")
# Standalone numpy KNN sanity benchmark over SIFT1M: load base and query
# vectors, time brute-force top-k for the first 20 queries, then print the
# numpy version and the mean per-query latency.
base = fvecs_read("../../sift/sift_base.fvecs")
queries = fvecs_read("../../sift/sift_query.fvecs")

k = 20
times = []
results = []
for q in queries[0:20]:
    started = time.time()
    results.append(topk(q, base, k=k))
    times.append(time.time() - started)

print(np.__version__)
print(np.mean(times))

View file

@ -1,22 +1,12 @@
import numpy as np
import numpy.typing as npt
import time
import hnswlib
import sqlite3
import faiss
import lancedb
import pandas as pd
# import chromadb
from usearch.index import Index, search, MetricKind
from dataclasses import dataclass
from typing import List
import duckdb
import pyarrow as pa
from sentence_transformers.util import semantic_search
from rich.console import Console
from rich.table import Table
from typing import List, Optional
@dataclass
@ -66,6 +56,7 @@ def fvecs_read(fname, sample):
def bench_hnsw(base, query):
import hnswlib
t0 = time.time()
p = hnswlib.Index(space="ip", dim=128) # possible options are l2, cosine or ip
@ -92,6 +83,7 @@ def bench_hnsw(base, query):
def bench_hnsw_bf(base, query, k) -> BenchResult:
import hnswlib
print("hnswlib-bf")
dimensions = base.shape[1]
t0 = time.time()
@ -115,7 +107,7 @@ def bench_hnsw_bf(base, query, k) -> BenchResult:
def bench_numpy(base, query, k) -> BenchResult:
print("numpy")
print("numpy...")
times = []
results = []
for idx, q in enumerate(query):
@ -128,7 +120,7 @@ def bench_numpy(base, query, k) -> BenchResult:
def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult:
dimensions = base.shape[1]
print(f"sqlite-vec {page_size} {chunk_size}")
print(f"sqlite-vec {page_size} {chunk_size}...")
db = sqlite3.connect(":memory:")
db.execute(f"PRAGMA page_size = {page_size}")
@ -169,12 +161,13 @@ def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult:
""",
[q.tobytes(), k],
).fetchall()
assert len(result) == k
times.append(time.time() - t0)
return BenchResult(f"sqlite-vec vec0 ({page_size}|{chunk_size})", build_time, times)
def bench_sqlite_vec_scalar(base, query, page_size, k) -> BenchResult:
print(f"sqlite-vec-scalar")
print(f"sqlite-vec-scalar...")
db = sqlite3.connect(":memory:")
db.enable_load_extension(True)
@ -208,11 +201,12 @@ def bench_sqlite_vec_scalar(base, query, page_size, k) -> BenchResult:
""",
[q.tobytes(), k],
).fetchall()
assert len(result) == k
times.append(time.time() - t0)
return BenchResult(f"sqlite-vec-scalar ({page_size})", build_time, times)
def bench_libsql(base, query, page_size, k) -> BenchResult:
print(f"libsql")
print(f"libsql ...")
dimensions = base.shape[1]
db = sqlite3.connect(":memory:")
@ -273,7 +267,7 @@ def register_np(db, array, name):
)
def bench_sqlite_vec_static(base, query, k) -> BenchResult:
print(f"sqlite-vec static")
print(f"sqlite-vec static...")
db = sqlite3.connect(":memory:")
db.enable_load_extension(True)
@ -303,12 +297,14 @@ def bench_sqlite_vec_static(base, query, k) -> BenchResult:
""",
[q.tobytes(), k],
).fetchall()
assert len(result) == k
times.append(time.time() - t0)
return BenchResult(f"sqlite-vec static", build_time, times)
def bench_faiss(base, query, k) -> BenchResult:
import faiss
dimensions = base.shape[1]
print("faiss")
print("faiss...")
t = time.time()
index = faiss.IndexFlatL2(dimensions)
index.add(base)
@ -321,11 +317,12 @@ def bench_faiss(base, query, k) -> BenchResult:
distances, rowids = index.search(x=np.array([q]), k=k)
results.append(rowids)
times.append(time.time() - t0)
print("faiss avg", duration(np.mean(times)))
return BenchResult("faiss", build_time, times)
def bench_lancedb(base, query, k) -> BenchResult:
import lancedb
print('lancedb...')
dimensions = base.shape[1]
db = lancedb.connect("a")
data = [{"vector": row.reshape(1, -1)[0]} for row in base]
@ -343,6 +340,9 @@ def bench_lancedb(base, query, k) -> BenchResult:
return BenchResult("lancedb", build_time, times)
def bench_duckdb(base, query, k) -> BenchResult:
import duckdb
import pyarrow as pa
print("duckdb...")
dimensions = base.shape[1]
db = duckdb.connect(":memory:")
db.execute(f"CREATE TABLE t(vector float[{dimensions}])")
@ -368,6 +368,7 @@ def bench_duckdb(base, query, k) -> BenchResult:
return BenchResult("duckdb", build_time, times)
def bench_sentence_transformers(base, query, k) -> BenchResult:
from sentence_transformers.util import semantic_search
print("sentence-transformers")
dimensions = base.shape[1]
t0 = time.time()
@ -382,28 +383,29 @@ def bench_sentence_transformers(base, query, k) -> BenchResult:
return BenchResult("sentence-transformers", build_time, times)
# def bench_chroma(base, query, k):
# chroma_client = chromadb.Client()
# collection = chroma_client.create_collection(name="my_collection")
#
# t = time.time()
# # chroma doesn't allow for more than 41666 vectors to be inserted at once (???)
# i = 0
# collection.add(embeddings=base, ids=[str(x) for x in range(len(base))])
# print("chroma build time: ", duration(time.time() - t))
# times = []
# for q in query:
# t0 = time.time()
# result = collection.query(
# query_embeddings=[q.tolist()],
# n_results=k,
# )
# print(result)
# times.append(time.time() - t0)
# print("chroma avg", duration(np.mean(times)))
def bench_chroma(base, query, k):
    """Benchmark an in-memory (ephemeral) Chroma collection: time the bulk
    insert of ``base``, then time one k-NN query per row of ``query``."""
    import chromadb
    from chromadb.utils.batch_utils import create_batches

    client = chromadb.EphemeralClient()
    collection = client.create_collection(name="my_collection")

    build_start = time.time()
    # Chroma limits how many embeddings a single add() accepts, so the
    # insert is split into batches.
    all_ids = [str(i) for i in range(len(base))]
    for batch in create_batches(api=client, ids=all_ids, embeddings=base.tolist()):
        collection.add(*batch)
    build_time = time.time() - build_start

    query_times = []
    for q in query:
        q_start = time.time()
        collection.query(
            query_embeddings=[q.tolist()],
            n_results=k,
        )
        query_times.append(time.time() - q_start)

    return BenchResult("chroma", build_time, query_times)
def bench_usearch_npy(base, query, k) -> BenchResult:
from usearch.index import Index, search, MetricKind
times = []
for q in query:
t0 = time.time()
@ -414,6 +416,7 @@ def bench_usearch_npy(base, query, k) -> BenchResult:
def bench_usearch_special(base, query, k) -> BenchResult:
from usearch.index import Index, search, MetricKind
dimensions = base.shape[1]
index = Index(ndim=dimensions)
t = time.time()
@ -425,18 +428,14 @@ def bench_usearch_special(base, query, k) -> BenchResult:
t0 = time.time()
result = index.search(q, exact=True)
times.append(time.time() - t0)
return BenchResult("usuearch index exact=True", build_time, times)
from rich.console import Console
from rich.table import Table
return BenchResult("usuearch index", build_time, times)
def suite(name, base, query, k, benchmarks):
print(f"Starting benchmark suite: {name} {base.shape}, k={k}")
results = []
for b in benchmarks.split(","):
for b in benchmarks:
if b == "faiss":
results.append(bench_faiss(base, query, k=k))
elif b == "vec-static":
@ -460,6 +459,8 @@ def suite(name, base, query, k, benchmarks):
results.append(bench_duckdb(base, query, k=k))
elif b == "sentence-transformers":
results.append(bench_sentence_transformers(base, query, k=k))
elif b == "chroma":
results.append(bench_chroma(base, query, k=k))
else:
raise Exception(f"unknown benchmark {b}")
@ -565,12 +566,58 @@ def cli_read_query(query, base):
return cli_read_input(query, -1)
def main():
    """CLI entry point: parse arguments, load the base and query vectors,
    and run the selected benchmark suite."""
    cli_args = parse_args()
    print(cli_args)
    vectors = cli_read_input(cli_args.input, cli_args.sample)
    query_vectors = cli_read_query(cli_args.query, vectors)[: cli_args.qsample]
    suite(cli_args.name, vectors, query_vectors, cli_args.k, cli_args.x)
@dataclass
class Config:
    """Benchmark run configuration parsed from a ``.bench`` config file."""

    # Human-readable suite name (printed by the suite runner).
    name: str
    # Path to the base vectors file — config examples use .fvecs paths.
    input: str
    # Number of nearest neighbors to retrieve per query.
    k: int
    # Path to the query vectors file.
    queries: str
    # Number of query vectors to run (queries are truncated to this count).
    qsample: int
    # Benchmark identifiers to run, one per non-directive config line.
    tests: List[str]
    # Optional cap on the number of base vectors; None means use all of them.
    sample: Optional[int]
def parse_config_file(path: str) -> Config:
    """Parse a benchmark config file into a ``Config``.

    Format: ``@key=value`` lines set options (``@name``, ``@k``, ``@input``,
    ``@queries``, ``@qsample``, ``@sample``); blank lines and ``#`` comments
    are ignored; every other line names a benchmark to run.

    Raises:
        Exception: on an unknown ``@`` directive, or when a required
            option is missing from the file.
    """
    name = None
    input_path = None  # renamed so the builtin `input` is not shadowed
    k = None
    queries = None
    qsample = None
    sample = None
    tests: List[str] = []
    # `with` guarantees the file handle is closed (the old code leaked it).
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            elif line.startswith("@name="):
                name = line.removeprefix("@name=")
            elif line.startswith("@k="):
                k = line.removeprefix("@k=")
            elif line.startswith("@input="):
                input_path = line.removeprefix("@input=")
            elif line.startswith("@queries="):
                queries = line.removeprefix("@queries=")
            elif line.startswith("@qsample="):
                qsample = line.removeprefix("@qsample=")
            elif line.startswith("@sample="):
                sample = line.removeprefix("@sample=")
            elif line.startswith("@"):
                raise Exception(f"unknown config line '{line}'")
            else:
                tests.append(line)
    # Fail with a clear message instead of the opaque TypeError that
    # int(None) / open(None) would raise downstream.
    required = (("@input", input_path), ("@queries", queries), ("@k", k), ("@qsample", qsample))
    missing = [key for key, value in required if value is None]
    if missing:
        raise Exception(f"missing required config option(s): {', '.join(missing)}")
    return Config(
        name,
        input_path,
        int(k),
        queries,
        int(qsample),
        tests,
        int(sample) if sample is not None else None,
    )
from sys import argv
if __name__ == "__main__":
main()
config = parse_config_file(argv[1])
print(config)
#args = parse_args()
#print(args)
base = cli_read_input(config.input, config.sample)
queries = cli_read_query(config.queries, base)[: config.qsample]
suite(config.name, base, queries, config.k, config.tests)
#main()

View file

@ -1,3 +0,0 @@
#!/bin/bash
# Run the GIST benchmark suite; the first argument is the comma-separated
# benchmark list passed to -x. Quoted so it survives word splitting/globbing.
python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --sample 750000 --qsample 200 -k 20 -x "$1"

View file

@ -0,0 +1,15 @@
@name=gist
@input=data/gist/gist_base.fvecs
@queries=data/gist/gist_query.fvecs
@sample=500000
@qsample=20
@k=20
faiss
usearch
vec-static
#duckdb
#vec-vec0.8192.1024
#vec-vec0.8192.2048
#vec-scalar.8192
#numpy

View file

@ -0,0 +1,120 @@
annotated-types==0.7.0
anyio==4.4.0
asgiref==3.8.1
attrs==23.2.0
backoff==2.2.1
bcrypt==4.2.0
build==1.2.1
cachetools==5.4.0
certifi==2024.7.4
charset-normalizer==3.3.2
chroma-hnswlib==0.7.6
chromadb==0.5.5
click==8.1.7
coloredlogs==15.0.1
decorator==5.1.1
deprecated==1.2.14
deprecation==2.1.0
dnspython==2.6.1
duckdb==1.0.0
email-validator==2.2.0
faiss-cpu==1.8.0.post1
fastapi==0.111.1
fastapi-cli==0.0.4
filelock==3.15.4
flatbuffers==24.3.25
fsspec==2024.6.1
google-auth==2.32.0
googleapis-common-protos==1.63.2
grpcio==1.65.1
h11==0.14.0
hnswlib==0.8.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.24.1
humanfriendly==10.0
idna==3.7
importlib-metadata==8.0.0
importlib-resources==6.4.0
jinja2==3.1.4
joblib==1.4.2
kubernetes==30.1.0
lancedb==0.10.2
markdown-it-py==3.0.0
markupsafe==2.1.5
mdurl==0.1.2
mmh3==4.1.0
monotonic==1.6
mpmath==1.3.0
networkx==3.3
numpy==1.26.4
oauthlib==3.2.2
onnxruntime==1.18.1
opentelemetry-api==1.26.0
opentelemetry-exporter-otlp-proto-common==1.26.0
opentelemetry-exporter-otlp-proto-grpc==1.26.0
opentelemetry-instrumentation==0.47b0
opentelemetry-instrumentation-asgi==0.47b0
opentelemetry-instrumentation-fastapi==0.47b0
opentelemetry-proto==1.26.0
opentelemetry-sdk==1.26.0
opentelemetry-semantic-conventions==0.47b0
opentelemetry-util-http==0.47b0
orjson==3.10.6
overrides==7.7.0
packaging==24.1
pandas==2.2.2
pillow==10.4.0
posthog==3.5.0
protobuf==4.25.4
py==1.11.0
pyarrow==15.0.0
pyasn1==0.6.0
pyasn1-modules==0.4.0
pydantic==2.8.2
pydantic-core==2.20.1
pygments==2.18.0
pylance==0.14.1
pypika==0.48.9
pyproject-hooks==1.1.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.9
pytz==2024.1
pyyaml==6.0.1
ratelimiter==1.2.0.post0
regex==2024.5.15
requests==2.32.3
requests-oauthlib==2.0.0
retry==0.9.2
rich==13.7.1
rsa==4.9
safetensors==0.4.3
scikit-learn==1.5.1
scipy==1.14.0
sentence-transformers==3.0.1
setuptools==71.1.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
starlette==0.37.2
sympy==1.13.1
tenacity==8.5.0
threadpoolctl==3.5.0
tokenizers==0.19.1
torch==2.3.1
tqdm==4.66.4
transformers==4.43.1
typer==0.12.3
typing-extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
usearch==2.12.0
uvicorn==0.30.3
uvloop==0.19.0
watchfiles==0.22.0
websocket-client==1.8.0
websockets==12.0
wrapt==1.16.0
zipp==3.19.2

View file

@ -1,3 +0,0 @@
#!/bin/bash
# Run the SIFT1M benchmark suite; the first argument is the comma-separated
# benchmark list passed to -x. Quoted so it survives word splitting/globbing.
python bench.py -n sift1m -i ../../sift/sift_base.fvecs -q ../../sift/sift_query.fvecs --qsample 100 -k 20 -x "$1"

View file

@ -1,18 +1,28 @@
@name=sift1m
@i=../../sift/sift_base.fvecs
@q=../../sift/sift_query.fvecs
@input=data/sift/sift_base.fvecs
@queries=data/sift/sift_query.fvecs
@qsample=100
@k=20
libsql.4096
libsql.8192
faiss
vec-scalar.4096
vec-static
vec-vec0.4096.16
vec-vec0.8192.1024
vec-vec0.4096.2048
usearch
duckdb
hnswlib
vec-static
vec-vec0.8192.1024
vec-vec0.8192.2048
vec-scalar.8192
numpy
# #libsql.4096
# #libsql.8192
# faiss
# vec-scalar.4096
# vec-static
# vec-vec0.4096.16
# vec-vec0.8192.1024
# vec-vec0.4096.2048
# usearch
# duckdb
# hnswlib
# numpy
# chroma