benchmark updates

This commit is contained in:
Alex Garcia 2024-07-28 11:08:12 -07:00
parent 156d6c1e3b
commit 4febdff11a
10 changed files with 290 additions and 149 deletions

View file

@ -0,0 +1 @@
data/

View file

@ -0,0 +1,15 @@
data/:
mkdir -p $@
data/sift: data/
curl -o data/sift.tar.gz ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
tar -xvzf data/sift.tar.gz -C data/
rm data/sift.tar.gz
data/gist: data/
curl -o data/gist.tar.gz ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz
tar -xvzf data/gist.tar.gz -C data/
rm data/gist.tar.gz

View file

@ -1,35 +1,25 @@
``` # `sqlite-vec` In-memory benchmark comparisons
python3 bench/bench.py \
-n "sift1m" \
-i sift/sift_base.fvecs \
-q sift/sift_query.fvecs \
--sample 10000 --qsample 100 \
-k 10
```
``` This repo contains benchmarks that compare KNN queries of `sqlite-vec` to other in-process vector search tools using **brute force linear scans only**. These include:
python3 bench.py \
-n "sift1m" \
-i ../../sift/sift_base.fvecs \
-q ../../sift/sift_query.fvecs \
--qsample 100 \
-k 20
```
```
python3 bench.py \
-n "sift1m" \
-i ../../sift/sift_base.fvecs \
-q ../../sift/sift_query.fvecs \
--qsample 100 \
-x faiss,vec-scalar.4096,vec-static,vec-vec0.4096.16,vec-vec0.8192.1024,usearch,duckdb,hnswlib,numpy \
-k 20
```
- [Faiss IndexFlatL2](https://faiss.ai/)
``` - [usearch with `exact=True`](https://github.com/unum-cloud/usearch)
python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --qsample 100 -k 20 --sample 500000 -x faiss,vec-static,vec-scalar.8192,vec-scalar.16384,vec-scalar.32768,vec-vec0.16384.64,vec-vec0.16384.128,vec-vec0.16384.256,vec-vec0.16384.512,vec-vec0.16384.1024,vec-vec0.16384.2048 - [libsql vector search with `vector_distance_cos`](https://turso.tech/vector)
``` - [numpy](https://numpy.org/), using [this approach](https://github.com/EthanRosenthal/nn-vs-ann)
- [duckdb with `list_cosine_similarity`](https://duckdb.org/docs/sql/functions/nested.html#list_cosine_similaritylist1-list2)
- [`sentence_transformers.util.semantic_search`](https://sbert.net/docs/package_reference/util.html#sentence_transformers.util.semantic_search)
- [hnswlib BFIndex](https://github.com/nmslib/hnswlib/blob/c1b9b79af3d10c6ee7b5d0afa1ce851ae975254c/TESTING_RECALL.md?plain=1#L8)
python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --qsample 100 -k 20 --sample 500000 -x faiss,vec-static,sentence-transformers,numpy Again **ONLY BRUTE FORCE LINEAR SCANS ARE TESTED**. This benchmark does **not** test approximate nearest neighbors (ANN) implementations. This benchmark is extremely narrow, testing only KNN searches using brute force.
A few other caveats:
- Only brute-force linear scans, no ANN
- Only CPU is used. The only tool that does offer GPU is Faiss anyway.
- Only in-memory datasets are used. Many of these tools do support serializing and reading from disk (including `sqlite-vec`) and possibly `mmap`'ing, but this only tests in-memory datasets. Mostly because of numpy
- Queries are made one after the other, **not batched.** Some tools offer APIs to query multiple inputs at the same time, but this benchmark runs queries sequentially. This was done to emulate "server request"-style queries, but multiple users would send queries at different times, making batching more difficult. To note, `sqlite-vec` does **not** support batched queries yet.
These tests are run in Python. Vectors are provided as an in-memory numpy array, and each test converts that numpy array into whatever makes sense for the given tool. For example, `sqlite-vec` tests will read those vectors into a SQLite table. DuckDB will read them into an Arrow array and then create a DuckDB table from that.

View file

@ -1,51 +0,0 @@
import numpy as np
import numpy.typing as npt
import time
def cosine_similarity(
    vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True
) -> npt.NDArray[np.float32]:
    """Return the similarity of `vec` against every row of `mat`.

    With do_norm=True the dot products are divided by the vector norms,
    yielding cosine similarity; with do_norm=False raw inner products are
    returned (useful when the vectors are pre-normalized).
    """
    sim = vec @ mat.T
    if do_norm:
        sim /= np.linalg.norm(vec) * np.linalg.norm(mat, axis=1)
    return sim


def topk(
    vec: npt.NDArray[np.float32],
    mat: npt.NDArray[np.float32],
    k: int = 5,
    do_norm: bool = True,
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]:
    """Return (row indices, similarities) of the k rows of `mat` most
    similar to `vec`, ordered from most to least similar."""
    sim = cosine_similarity(vec, mat, do_norm=do_norm)
    # Rather than sorting all similarities and taking the top K, it's faster to
    # argpartition and then just sort the top K.
    # The difference is O(N logN) vs O(N + k logk)
    indices = np.argpartition(-sim, kth=k)[:k]
    top_indices = np.argsort(-sim[indices])
    # BUG FIX: the scores must be looked up through the original row indices.
    # The previous `sim[top_indices]` indexed the *full* similarity array with
    # partition-local positions (0..k-1), returning wrong similarity values.
    return indices[top_indices], sim[indices[top_indices]]
def ivecs_read(fname):
    # texmex .ivecs layout: every row is [d, v_0, ..., v_{d-1}], all int32,
    # where d (the dimension) repeats at the start of each row.
    raw = np.fromfile(fname, dtype="int32")
    dim = raw[0]
    rows = raw.reshape(-1, dim + 1)
    # Drop the leading dimension column; copy so the result is contiguous
    # and independent of the raw buffer.
    return rows[:, 1:].copy()
def fvecs_read(fname):
    # .fvecs shares the .ivecs layout; reinterpret the payload bits as float32.
    return ivecs_read(fname).view("float32")
# Standalone sanity benchmark for the pure-numpy brute-force KNN baseline.
# Loads the SIFT1M base and query vectors, runs topk() over the first 20
# queries one at a time, then prints the numpy version and mean latency.
base = fvecs_read("../../sift/sift_base.fvecs")
queries = fvecs_read("../../sift/sift_query.fvecs")
k = 20
times = []
results = []
for q in queries[:20]:
    start = time.time()
    results.append(topk(q, base, k=k))
    times.append(time.time() - start)
print(np.__version__)
print(np.mean(times))

View file

@ -1,22 +1,12 @@
import numpy as np import numpy as np
import numpy.typing as npt import numpy.typing as npt
import time import time
import hnswlib
import sqlite3 import sqlite3
import faiss
import lancedb
import pandas as pd import pandas as pd
# import chromadb
from usearch.index import Index, search, MetricKind
from dataclasses import dataclass from dataclasses import dataclass
from rich.console import Console
from typing import List from rich.table import Table
from typing import List, Optional
import duckdb
import pyarrow as pa
from sentence_transformers.util import semantic_search
@dataclass @dataclass
@ -66,6 +56,7 @@ def fvecs_read(fname, sample):
def bench_hnsw(base, query): def bench_hnsw(base, query):
import hnswlib
t0 = time.time() t0 = time.time()
p = hnswlib.Index(space="ip", dim=128) # possible options are l2, cosine or ip p = hnswlib.Index(space="ip", dim=128) # possible options are l2, cosine or ip
@ -92,6 +83,7 @@ def bench_hnsw(base, query):
def bench_hnsw_bf(base, query, k) -> BenchResult: def bench_hnsw_bf(base, query, k) -> BenchResult:
import hnswlib
print("hnswlib-bf") print("hnswlib-bf")
dimensions = base.shape[1] dimensions = base.shape[1]
t0 = time.time() t0 = time.time()
@ -115,7 +107,7 @@ def bench_hnsw_bf(base, query, k) -> BenchResult:
def bench_numpy(base, query, k) -> BenchResult: def bench_numpy(base, query, k) -> BenchResult:
print("numpy") print("numpy...")
times = [] times = []
results = [] results = []
for idx, q in enumerate(query): for idx, q in enumerate(query):
@ -128,7 +120,7 @@ def bench_numpy(base, query, k) -> BenchResult:
def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult: def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult:
dimensions = base.shape[1] dimensions = base.shape[1]
print(f"sqlite-vec {page_size} {chunk_size}") print(f"sqlite-vec {page_size} {chunk_size}...")
db = sqlite3.connect(":memory:") db = sqlite3.connect(":memory:")
db.execute(f"PRAGMA page_size = {page_size}") db.execute(f"PRAGMA page_size = {page_size}")
@ -169,12 +161,13 @@ def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult:
""", """,
[q.tobytes(), k], [q.tobytes(), k],
).fetchall() ).fetchall()
assert len(result) == k
times.append(time.time() - t0) times.append(time.time() - t0)
return BenchResult(f"sqlite-vec vec0 ({page_size}|{chunk_size})", build_time, times) return BenchResult(f"sqlite-vec vec0 ({page_size}|{chunk_size})", build_time, times)
def bench_sqlite_vec_scalar(base, query, page_size, k) -> BenchResult: def bench_sqlite_vec_scalar(base, query, page_size, k) -> BenchResult:
print(f"sqlite-vec-scalar") print(f"sqlite-vec-scalar...")
db = sqlite3.connect(":memory:") db = sqlite3.connect(":memory:")
db.enable_load_extension(True) db.enable_load_extension(True)
@ -208,11 +201,12 @@ def bench_sqlite_vec_scalar(base, query, page_size, k) -> BenchResult:
""", """,
[q.tobytes(), k], [q.tobytes(), k],
).fetchall() ).fetchall()
assert len(result) == k
times.append(time.time() - t0) times.append(time.time() - t0)
return BenchResult(f"sqlite-vec-scalar ({page_size})", build_time, times) return BenchResult(f"sqlite-vec-scalar ({page_size})", build_time, times)
def bench_libsql(base, query, page_size, k) -> BenchResult: def bench_libsql(base, query, page_size, k) -> BenchResult:
print(f"libsql") print(f"libsql ...")
dimensions = base.shape[1] dimensions = base.shape[1]
db = sqlite3.connect(":memory:") db = sqlite3.connect(":memory:")
@ -273,7 +267,7 @@ def register_np(db, array, name):
) )
def bench_sqlite_vec_static(base, query, k) -> BenchResult: def bench_sqlite_vec_static(base, query, k) -> BenchResult:
print(f"sqlite-vec static") print(f"sqlite-vec static...")
db = sqlite3.connect(":memory:") db = sqlite3.connect(":memory:")
db.enable_load_extension(True) db.enable_load_extension(True)
@ -303,12 +297,14 @@ def bench_sqlite_vec_static(base, query, k) -> BenchResult:
""", """,
[q.tobytes(), k], [q.tobytes(), k],
).fetchall() ).fetchall()
assert len(result) == k
times.append(time.time() - t0) times.append(time.time() - t0)
return BenchResult(f"sqlite-vec static", build_time, times) return BenchResult(f"sqlite-vec static", build_time, times)
def bench_faiss(base, query, k) -> BenchResult: def bench_faiss(base, query, k) -> BenchResult:
import faiss
dimensions = base.shape[1] dimensions = base.shape[1]
print("faiss") print("faiss...")
t = time.time() t = time.time()
index = faiss.IndexFlatL2(dimensions) index = faiss.IndexFlatL2(dimensions)
index.add(base) index.add(base)
@ -321,11 +317,12 @@ def bench_faiss(base, query, k) -> BenchResult:
distances, rowids = index.search(x=np.array([q]), k=k) distances, rowids = index.search(x=np.array([q]), k=k)
results.append(rowids) results.append(rowids)
times.append(time.time() - t0) times.append(time.time() - t0)
print("faiss avg", duration(np.mean(times)))
return BenchResult("faiss", build_time, times) return BenchResult("faiss", build_time, times)
def bench_lancedb(base, query, k) -> BenchResult: def bench_lancedb(base, query, k) -> BenchResult:
import lancedb
print('lancedb...')
dimensions = base.shape[1] dimensions = base.shape[1]
db = lancedb.connect("a") db = lancedb.connect("a")
data = [{"vector": row.reshape(1, -1)[0]} for row in base] data = [{"vector": row.reshape(1, -1)[0]} for row in base]
@ -343,6 +340,9 @@ def bench_lancedb(base, query, k) -> BenchResult:
return BenchResult("lancedb", build_time, times) return BenchResult("lancedb", build_time, times)
def bench_duckdb(base, query, k) -> BenchResult: def bench_duckdb(base, query, k) -> BenchResult:
import duckdb
import pyarrow as pa
print("duckdb...")
dimensions = base.shape[1] dimensions = base.shape[1]
db = duckdb.connect(":memory:") db = duckdb.connect(":memory:")
db.execute(f"CREATE TABLE t(vector float[{dimensions}])") db.execute(f"CREATE TABLE t(vector float[{dimensions}])")
@ -368,6 +368,7 @@ def bench_duckdb(base, query, k) -> BenchResult:
return BenchResult("duckdb", build_time, times) return BenchResult("duckdb", build_time, times)
def bench_sentence_transformers(base, query, k) -> BenchResult: def bench_sentence_transformers(base, query, k) -> BenchResult:
from sentence_transformers.util import semantic_search
print("sentence-transformers") print("sentence-transformers")
dimensions = base.shape[1] dimensions = base.shape[1]
t0 = time.time() t0 = time.time()
@ -382,28 +383,29 @@ def bench_sentence_transformers(base, query, k) -> BenchResult:
return BenchResult("sentence-transformers", build_time, times) return BenchResult("sentence-transformers", build_time, times)
# def bench_chroma(base, query, k): def bench_chroma(base, query, k):
# chroma_client = chromadb.Client() import chromadb
# collection = chroma_client.create_collection(name="my_collection") from chromadb.utils.batch_utils import create_batches
# chroma_client = chromadb.EphemeralClient()
# t = time.time() collection = chroma_client.create_collection(name="my_collection")
# # chroma doesn't allow for more than 41666 vectors to be inserted at once (???)
# i = 0
# collection.add(embeddings=base, ids=[str(x) for x in range(len(base))])
# print("chroma build time: ", duration(time.time() - t))
# times = []
# for q in query:
# t0 = time.time()
# result = collection.query(
# query_embeddings=[q.tolist()],
# n_results=k,
# )
# print(result)
# times.append(time.time() - t0)
# print("chroma avg", duration(np.mean(times)))
t = time.time()
for batch in create_batches(api=chroma_client, ids=[str(x) for x in range(len(base))], embeddings=base.tolist()):
collection.add(*batch)
build_time = time.time() - t
times = []
for q in query:
t0 = time.time()
result = collection.query(
query_embeddings=[q.tolist()],
n_results=k,
)
times.append(time.time() - t0)
#print("chroma avg", duration(np.mean(times)))
return BenchResult("chroma", build_time, times)
def bench_usearch_npy(base, query, k) -> BenchResult: def bench_usearch_npy(base, query, k) -> BenchResult:
from usearch.index import Index, search, MetricKind
times = [] times = []
for q in query: for q in query:
t0 = time.time() t0 = time.time()
@ -414,6 +416,7 @@ def bench_usearch_npy(base, query, k) -> BenchResult:
def bench_usearch_special(base, query, k) -> BenchResult: def bench_usearch_special(base, query, k) -> BenchResult:
from usearch.index import Index, search, MetricKind
dimensions = base.shape[1] dimensions = base.shape[1]
index = Index(ndim=dimensions) index = Index(ndim=dimensions)
t = time.time() t = time.time()
@ -425,18 +428,14 @@ def bench_usearch_special(base, query, k) -> BenchResult:
t0 = time.time() t0 = time.time()
result = index.search(q, exact=True) result = index.search(q, exact=True)
times.append(time.time() - t0) times.append(time.time() - t0)
return BenchResult("usuearch index exact=True", build_time, times) return BenchResult("usuearch index", build_time, times)
from rich.console import Console
from rich.table import Table
def suite(name, base, query, k, benchmarks): def suite(name, base, query, k, benchmarks):
print(f"Starting benchmark suite: {name} {base.shape}, k={k}") print(f"Starting benchmark suite: {name} {base.shape}, k={k}")
results = [] results = []
for b in benchmarks.split(","): for b in benchmarks:
if b == "faiss": if b == "faiss":
results.append(bench_faiss(base, query, k=k)) results.append(bench_faiss(base, query, k=k))
elif b == "vec-static": elif b == "vec-static":
@ -460,6 +459,8 @@ def suite(name, base, query, k, benchmarks):
results.append(bench_duckdb(base, query, k=k)) results.append(bench_duckdb(base, query, k=k))
elif b == "sentence-transformers": elif b == "sentence-transformers":
results.append(bench_sentence_transformers(base, query, k=k)) results.append(bench_sentence_transformers(base, query, k=k))
elif b == "chroma":
results.append(bench_chroma(base, query, k=k))
else: else:
raise Exception(f"unknown benchmark {b}") raise Exception(f"unknown benchmark {b}")
@ -565,12 +566,58 @@ def cli_read_query(query, base):
return cli_read_input(query, -1) return cli_read_input(query, -1)
def main():
args = parse_args()
print(args)
base = cli_read_input(args.input, args.sample)
queries = cli_read_query(args.query, base)[: args.qsample]
suite(args.name, base, queries, args.k, args.x)
@dataclass
class Config:
    """Parsed contents of a `.benchmark` config file."""

    name: str  # benchmark suite name (@name=)
    input: str  # path to the base-vectors .fvecs file (@input=)
    k: int  # number of nearest neighbors per query (@k=)
    queries: str  # path to the query-vectors .fvecs file (@queries=)
    qsample: int  # number of query vectors to run (@qsample=)
    tests: List[str]  # benchmark names to run, one per non-@ line
    sample: Optional[int]  # optional cap on base vectors read (@sample=)


def parse_config_file(path: str) -> Config:
    """Parse a benchmark config file into a Config.

    Lines of the form '@key=value' set options; blank lines and lines
    starting with '#' are skipped; every other line names a benchmark to
    run. Raises Exception on an unknown '@' directive or when a required
    option (@name/@input/@queries/@k/@qsample) is missing.
    """
    name = None
    input_path = None  # renamed from `input` to avoid shadowing the builtin
    k = None
    queries = None
    qsample = None
    sample = None
    tests: List[str] = []
    # `with` guarantees the file handle is closed even if parsing raises.
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            elif line.startswith("@name="):
                name = line.removeprefix("@name=")
            elif line.startswith("@k="):
                k = line.removeprefix("@k=")
            elif line.startswith("@input="):
                input_path = line.removeprefix("@input=")
            elif line.startswith("@queries="):
                queries = line.removeprefix("@queries=")
            elif line.startswith("@qsample="):
                qsample = line.removeprefix("@qsample=")
            elif line.startswith("@sample="):
                sample = line.removeprefix("@sample=")
            elif line.startswith("@"):
                raise Exception(f"unknown config line '{line}'")
            else:
                tests.append(line)
    # Fail with a clear message instead of the cryptic TypeError that
    # int(None) would raise when a required option is absent.
    required = (
        ("@name", name),
        ("@input", input_path),
        ("@queries", queries),
        ("@k", k),
        ("@qsample", qsample),
    )
    for key, value in required:
        if value is None:
            raise Exception(f"missing required config option '{key}='")
    return Config(
        name,
        input_path,
        int(k),
        queries,
        int(qsample),
        tests,
        int(sample) if sample is not None else None,
    )
from sys import argv
if __name__ == "__main__": if __name__ == "__main__":
main() config = parse_config_file(argv[1])
print(config)
#args = parse_args()
#print(args)
base = cli_read_input(config.input, config.sample)
queries = cli_read_query(config.queries, base)[: config.qsample]
suite(config.name, base, queries, config.k, config.tests)
#main()

View file

@ -1,3 +0,0 @@
#!/bin/bash
python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --sample 750000 --qsample 200 -k 20 -x $1

View file

@ -0,0 +1,15 @@
@name=gist
@input=data/gist/gist_base.fvecs
@queries=data/gist/gist_query.fvecs
@sample=500000
@qsample=20
@k=20
faiss
usearch
vec-static
#duckdb
#vec-vec0.8192.1024
#vec-vec0.8192.2048
#vec-scalar.8192
#numpy

View file

@ -0,0 +1,120 @@
annotated-types==0.7.0
anyio==4.4.0
asgiref==3.8.1
attrs==23.2.0
backoff==2.2.1
bcrypt==4.2.0
build==1.2.1
cachetools==5.4.0
certifi==2024.7.4
charset-normalizer==3.3.2
chroma-hnswlib==0.7.6
chromadb==0.5.5
click==8.1.7
coloredlogs==15.0.1
decorator==5.1.1
deprecated==1.2.14
deprecation==2.1.0
dnspython==2.6.1
duckdb==1.0.0
email-validator==2.2.0
faiss-cpu==1.8.0.post1
fastapi==0.111.1
fastapi-cli==0.0.4
filelock==3.15.4
flatbuffers==24.3.25
fsspec==2024.6.1
google-auth==2.32.0
googleapis-common-protos==1.63.2
grpcio==1.65.1
h11==0.14.0
hnswlib==0.8.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.24.1
humanfriendly==10.0
idna==3.7
importlib-metadata==8.0.0
importlib-resources==6.4.0
jinja2==3.1.4
joblib==1.4.2
kubernetes==30.1.0
lancedb==0.10.2
markdown-it-py==3.0.0
markupsafe==2.1.5
mdurl==0.1.2
mmh3==4.1.0
monotonic==1.6
mpmath==1.3.0
networkx==3.3
numpy==1.26.4
oauthlib==3.2.2
onnxruntime==1.18.1
opentelemetry-api==1.26.0
opentelemetry-exporter-otlp-proto-common==1.26.0
opentelemetry-exporter-otlp-proto-grpc==1.26.0
opentelemetry-instrumentation==0.47b0
opentelemetry-instrumentation-asgi==0.47b0
opentelemetry-instrumentation-fastapi==0.47b0
opentelemetry-proto==1.26.0
opentelemetry-sdk==1.26.0
opentelemetry-semantic-conventions==0.47b0
opentelemetry-util-http==0.47b0
orjson==3.10.6
overrides==7.7.0
packaging==24.1
pandas==2.2.2
pillow==10.4.0
posthog==3.5.0
protobuf==4.25.4
py==1.11.0
pyarrow==15.0.0
pyasn1==0.6.0
pyasn1-modules==0.4.0
pydantic==2.8.2
pydantic-core==2.20.1
pygments==2.18.0
pylance==0.14.1
pypika==0.48.9
pyproject-hooks==1.1.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.9
pytz==2024.1
pyyaml==6.0.1
ratelimiter==1.2.0.post0
regex==2024.5.15
requests==2.32.3
requests-oauthlib==2.0.0
retry==0.9.2
rich==13.7.1
rsa==4.9
safetensors==0.4.3
scikit-learn==1.5.1
scipy==1.14.0
sentence-transformers==3.0.1
setuptools==71.1.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
starlette==0.37.2
sympy==1.13.1
tenacity==8.5.0
threadpoolctl==3.5.0
tokenizers==0.19.1
torch==2.3.1
tqdm==4.66.4
transformers==4.43.1
typer==0.12.3
typing-extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
usearch==2.12.0
uvicorn==0.30.3
uvloop==0.19.0
watchfiles==0.22.0
websocket-client==1.8.0
websockets==12.0
wrapt==1.16.0
zipp==3.19.2

View file

@ -1,3 +0,0 @@
#!/bin/bash
python bench.py -n sift1m -i ../../sift/sift_base.fvecs -q ../../sift/sift_query.fvecs --qsample 100 -k 20 -x $1

View file

@ -1,18 +1,28 @@
@name=sift1m @name=sift1m
@i=../../sift/sift_base.fvecs @input=data/sift/sift_base.fvecs
@q=../../sift/sift_query.fvecs @queries=data/sift/sift_query.fvecs
@qsample=100 @qsample=100
@k=20
libsql.4096
libsql.8192
faiss faiss
vec-scalar.4096
vec-static
vec-vec0.4096.16
vec-vec0.8192.1024
vec-vec0.4096.2048
usearch usearch
duckdb duckdb
hnswlib vec-static
vec-vec0.8192.1024
vec-vec0.8192.2048
vec-scalar.8192
numpy numpy
# #libsql.4096
# #libsql.8192
# faiss
# vec-scalar.4096
# vec-static
# vec-vec0.4096.16
# vec-vec0.8192.1024
# vec-vec0.4096.2048
# usearch
# duckdb
# hnswlib
# numpy
# chroma