mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 16:56:27 +02:00
benchmark updates
This commit is contained in:
parent
156d6c1e3b
commit
4febdff11a
10 changed files with 290 additions and 149 deletions
1
benchmarks/exhaustive-memory/.gitignore
vendored
Normal file
1
benchmarks/exhaustive-memory/.gitignore
vendored
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
data/
|
||||||
15
benchmarks/exhaustive-memory/Makefile
Normal file
15
benchmarks/exhaustive-memory/Makefile
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
data/:
|
||||||
|
mkdir -p $@
|
||||||
|
|
||||||
|
data/sift: data/
|
||||||
|
curl -o data/sift.tar.gz ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
|
||||||
|
tar -xvzf data/sift.tar.gz -C data/
|
||||||
|
rm data/sift.tar.gz
|
||||||
|
|
||||||
|
data/gist: data/
|
||||||
|
curl -o data/gist.tar.gz ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz
|
||||||
|
tar -xvzf data/gist.tar.gz -C data/
|
||||||
|
rm data/gist.tar.gz
|
||||||
|
|
@ -1,35 +1,25 @@
|
||||||
```
|
# `sqlite-vec` In-memory benchmark comparisons
|
||||||
python3 bench/bench.py \
|
|
||||||
-n "sift1m" \
|
|
||||||
-i sift/sift_base.fvecs \
|
|
||||||
-q sift/sift_query.fvecs \
|
|
||||||
--sample 10000 --qsample 100 \
|
|
||||||
-k 10
|
|
||||||
```
|
|
||||||
|
|
||||||
```
|
This repo contains benchmarks that compare KNN queries of `sqlite-vec` to other in-process vector search tools using **brute force linear scans only**. These include:
|
||||||
python3 bench.py \
|
|
||||||
-n "sift1m" \
|
|
||||||
-i ../../sift/sift_base.fvecs \
|
|
||||||
-q ../../sift/sift_query.fvecs \
|
|
||||||
--qsample 100 \
|
|
||||||
-k 20
|
|
||||||
```
|
|
||||||
```
|
|
||||||
python3 bench.py \
|
|
||||||
-n "sift1m" \
|
|
||||||
-i ../../sift/sift_base.fvecs \
|
|
||||||
-q ../../sift/sift_query.fvecs \
|
|
||||||
--qsample 100 \
|
|
||||||
-x faiss,vec-scalar.4096,vec-static,vec-vec0.4096.16,vec-vec0.8192.1024,usearch,duckdb,hnswlib,numpy \
|
|
||||||
-k 20
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
|
- [Faiss IndexFlatL2](https://faiss.ai/)
|
||||||
```
|
- [usearch with `exact=True`](https://github.com/unum-cloud/usearch)
|
||||||
python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --qsample 100 -k 20 --sample 500000 -x faiss,vec-static,vec-scalar.8192,vec-scalar.16384,vec-scalar.32768,vec-vec0.16384.64,vec-vec0.16384.128,vec-vec0.16384.256,vec-vec0.16384.512,vec-vec0.16384.1024,vec-vec0.16384.2048
|
- [libsql vector search with `vector_distance_cos`](https://turso.tech/vector)
|
||||||
```
|
- [numpy](https://numpy.org/), using [this approach](https://github.com/EthanRosenthal/nn-vs-ann)
|
||||||
|
- [duckdb with `list_cosine_similarity`](https://duckdb.org/docs/sql/functions/nested.html#list_cosine_similaritylist1-list2)
|
||||||
|
- [`sentence_transformers.util.semantic_search`](https://sbert.net/docs/package_reference/util.html#sentence_transformers.util.semantic_search)
|
||||||
|
- [hnswlib BFIndex](https://github.com/nmslib/hnswlib/blob/c1b9b79af3d10c6ee7b5d0afa1ce851ae975254c/TESTING_RECALL.md?plain=1#L8)
|
||||||
|
|
||||||
|
|
||||||
python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --qsample 100 -k 20 --sample 500000 -x faiss,vec-static,sentence-transformers,numpy
|
Again, **ONLY BRUTE FORCE LINEAR SCANS ARE TESTED**. This benchmark does **not** test approximate nearest neighbors (ANN) implementations. It is deliberately narrow: it only tests KNN searches using brute force.
|
||||||
|
|
||||||
|
A few other caveats:
|
||||||
|
|
||||||
|
- Only brute-force linear scans, no ANN
|
||||||
|
- Only CPU is used. The only tool that does offer GPU is Faiss anyway.
|
||||||
|
- Only in-memory datasets are used. Many of these tools do support serializing and reading from disk (including `sqlite-vec`) and possibly `mmap`'ing, but this benchmark only tests in-memory datasets, mostly because of numpy.
|
||||||
|
- Queries are made one after the other, **not batched.** Some tools offer APIs to query multiple inputs at the same time, but this benchmark runs queries sequentially. This was done to emulate "server request"-style queries, where multiple users send queries at different times, which makes batching more difficult. Note that `sqlite-vec` does **not** support batched queries yet.
|
||||||
|
|
||||||
|
|
||||||
|
These tests are run in Python. Vectors are provided as an in-memory numpy array, and each test converts that numpy array into whatever makes sense for the given tool. For example, `sqlite-vec` tests will read those vectors into a SQLite table. DuckDB will read them into an Arrow array, then create a DuckDB table from that.
|
||||||
|
|
|
||||||
|
|
@ -1,51 +0,0 @@
|
||||||
import numpy as np
|
|
||||||
import numpy.typing as npt
|
|
||||||
import time
|
|
||||||
|
|
||||||
def cosine_similarity(
    vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True
) -> npt.NDArray[np.float32]:
    """Score ``vec`` against every row of ``mat``.

    Computes the dot product of ``vec`` with each row of ``mat``. When
    ``do_norm`` is true, each product is divided by the norm of ``vec`` times
    the norm of the matching row, giving cosine similarity. When it is false,
    the raw dot products are returned.
    """
    scores = np.matmul(vec, mat.T)
    if not do_norm:
        return scores
    # Divide in place so the result keeps the dtype of the dot products.
    scores /= np.linalg.norm(vec) * np.linalg.norm(mat, axis=1)
    return scores
|
|
||||||
|
|
||||||
|
|
||||||
def topk(
    vec: npt.NDArray[np.float32],
    mat: npt.NDArray[np.float32],
    k: int = 5,
    do_norm: bool = True,
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]:
    """Return the ``k`` rows of ``mat`` most similar to ``vec``.

    Args:
        vec: Query vector.
        mat: Matrix whose rows are the candidate vectors.
        k: Number of neighbours wanted. Clamped to the number of rows.
        do_norm: Forwarded to ``cosine_similarity``.

    Returns:
        ``(indices, scores)``: row indices into ``mat`` ordered from most to
        least similar, and the similarity score of each of those rows.
    """
    sim = cosine_similarity(vec, mat, do_norm=do_norm)
    n = sim.shape[0]
    k = min(k, n)
    if k <= 0:
        return np.empty(0, dtype=np.intp), sim[:0]

    # Rather than sorting all similarities and taking the top K, it's faster to
    # argpartition and then just sort the top K.
    # The difference is O(N logN) vs O(N + k logk)
    if k < n:
        candidates = np.argpartition(-sim, kth=k - 1)[:k]
    else:
        # argpartition rejects kth == n; every row is a candidate anyway.
        candidates = np.arange(n)

    order = np.argsort(-sim[candidates])
    best = candidates[order]
    # Look scores up by row index. ``order`` only holds positions within
    # ``candidates``, so indexing ``sim`` with it would return scores of
    # unrelated rows.
    return best, sim[best]
|
|
||||||
|
|
||||||
|
|
||||||
def ivecs_read(fname):
    """Load an ``.ivecs``-style file as a 2-D int32 array.

    The file is read as a flat run of int32 values. The first value is taken
    as the vector dimension ``d``. The data is then viewed as rows of
    ``d + 1`` values and the leading column of each row is dropped.
    """
    raw = np.fromfile(fname, dtype="int32")
    dim = raw[0]
    rows = raw.reshape(-1, dim + 1)
    # Copy so the result does not keep the full buffer (with the dropped
    # column) alive as a strided view.
    return rows[:, 1:].copy()
|
|
||||||
|
|
||||||
|
|
||||||
def fvecs_read(fname):
    """Load an ``.fvecs``-style file as a 2-D float32 array.

    Reads the file with ``ivecs_read`` and reinterprets the int32 payload
    bit-for-bit as float32.
    """
    as_ints = ivecs_read(fname)
    return as_ints.view("float32")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Time brute-force top-k search over the SIFT base vectors for the first
# 20 query vectors, then report the numpy version and the mean latency.
base = fvecs_read("../../sift/sift_base.fvecs")
queries = fvecs_read("../../sift/sift_query.fvecs")
k = 20

times = []
results = []
for idx, q in enumerate(queries[0:20]):
    t0 = time.time()
    result = topk(q, base, k=k)
    results.append(result)
    elapsed = time.time() - t0
    times.append(elapsed)

print(np.__version__)
print(np.mean(times))
|
|
||||||
|
|
@ -1,22 +1,12 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
import time
|
import time
|
||||||
import hnswlib
|
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import faiss
|
|
||||||
import lancedb
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
# import chromadb
|
|
||||||
from usearch.index import Index, search, MetricKind
|
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from rich.console import Console
|
||||||
from typing import List
|
from rich.table import Table
|
||||||
|
from typing import List, Optional
|
||||||
import duckdb
|
|
||||||
import pyarrow as pa
|
|
||||||
from sentence_transformers.util import semantic_search
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
@ -66,6 +56,7 @@ def fvecs_read(fname, sample):
|
||||||
|
|
||||||
|
|
||||||
def bench_hnsw(base, query):
|
def bench_hnsw(base, query):
|
||||||
|
import hnswlib
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
p = hnswlib.Index(space="ip", dim=128) # possible options are l2, cosine or ip
|
p = hnswlib.Index(space="ip", dim=128) # possible options are l2, cosine or ip
|
||||||
|
|
||||||
|
|
@ -92,6 +83,7 @@ def bench_hnsw(base, query):
|
||||||
|
|
||||||
|
|
||||||
def bench_hnsw_bf(base, query, k) -> BenchResult:
|
def bench_hnsw_bf(base, query, k) -> BenchResult:
|
||||||
|
import hnswlib
|
||||||
print("hnswlib-bf")
|
print("hnswlib-bf")
|
||||||
dimensions = base.shape[1]
|
dimensions = base.shape[1]
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
|
@ -115,7 +107,7 @@ def bench_hnsw_bf(base, query, k) -> BenchResult:
|
||||||
|
|
||||||
|
|
||||||
def bench_numpy(base, query, k) -> BenchResult:
|
def bench_numpy(base, query, k) -> BenchResult:
|
||||||
print("numpy")
|
print("numpy...")
|
||||||
times = []
|
times = []
|
||||||
results = []
|
results = []
|
||||||
for idx, q in enumerate(query):
|
for idx, q in enumerate(query):
|
||||||
|
|
@ -128,7 +120,7 @@ def bench_numpy(base, query, k) -> BenchResult:
|
||||||
|
|
||||||
def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult:
|
def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult:
|
||||||
dimensions = base.shape[1]
|
dimensions = base.shape[1]
|
||||||
print(f"sqlite-vec {page_size} {chunk_size}")
|
print(f"sqlite-vec {page_size} {chunk_size}...")
|
||||||
|
|
||||||
db = sqlite3.connect(":memory:")
|
db = sqlite3.connect(":memory:")
|
||||||
db.execute(f"PRAGMA page_size = {page_size}")
|
db.execute(f"PRAGMA page_size = {page_size}")
|
||||||
|
|
@ -169,12 +161,13 @@ def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult:
|
||||||
""",
|
""",
|
||||||
[q.tobytes(), k],
|
[q.tobytes(), k],
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
assert len(result) == k
|
||||||
times.append(time.time() - t0)
|
times.append(time.time() - t0)
|
||||||
return BenchResult(f"sqlite-vec vec0 ({page_size}|{chunk_size})", build_time, times)
|
return BenchResult(f"sqlite-vec vec0 ({page_size}|{chunk_size})", build_time, times)
|
||||||
|
|
||||||
|
|
||||||
def bench_sqlite_vec_scalar(base, query, page_size, k) -> BenchResult:
|
def bench_sqlite_vec_scalar(base, query, page_size, k) -> BenchResult:
|
||||||
print(f"sqlite-vec-scalar")
|
print(f"sqlite-vec-scalar...")
|
||||||
|
|
||||||
db = sqlite3.connect(":memory:")
|
db = sqlite3.connect(":memory:")
|
||||||
db.enable_load_extension(True)
|
db.enable_load_extension(True)
|
||||||
|
|
@ -208,11 +201,12 @@ def bench_sqlite_vec_scalar(base, query, page_size, k) -> BenchResult:
|
||||||
""",
|
""",
|
||||||
[q.tobytes(), k],
|
[q.tobytes(), k],
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
assert len(result) == k
|
||||||
times.append(time.time() - t0)
|
times.append(time.time() - t0)
|
||||||
return BenchResult(f"sqlite-vec-scalar ({page_size})", build_time, times)
|
return BenchResult(f"sqlite-vec-scalar ({page_size})", build_time, times)
|
||||||
|
|
||||||
def bench_libsql(base, query, page_size, k) -> BenchResult:
|
def bench_libsql(base, query, page_size, k) -> BenchResult:
|
||||||
print(f"libsql")
|
print(f"libsql ...")
|
||||||
dimensions = base.shape[1]
|
dimensions = base.shape[1]
|
||||||
|
|
||||||
db = sqlite3.connect(":memory:")
|
db = sqlite3.connect(":memory:")
|
||||||
|
|
@ -273,7 +267,7 @@ def register_np(db, array, name):
|
||||||
)
|
)
|
||||||
|
|
||||||
def bench_sqlite_vec_static(base, query, k) -> BenchResult:
|
def bench_sqlite_vec_static(base, query, k) -> BenchResult:
|
||||||
print(f"sqlite-vec static")
|
print(f"sqlite-vec static...")
|
||||||
|
|
||||||
db = sqlite3.connect(":memory:")
|
db = sqlite3.connect(":memory:")
|
||||||
db.enable_load_extension(True)
|
db.enable_load_extension(True)
|
||||||
|
|
@ -303,12 +297,14 @@ def bench_sqlite_vec_static(base, query, k) -> BenchResult:
|
||||||
""",
|
""",
|
||||||
[q.tobytes(), k],
|
[q.tobytes(), k],
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
assert len(result) == k
|
||||||
times.append(time.time() - t0)
|
times.append(time.time() - t0)
|
||||||
return BenchResult(f"sqlite-vec static", build_time, times)
|
return BenchResult(f"sqlite-vec static", build_time, times)
|
||||||
|
|
||||||
def bench_faiss(base, query, k) -> BenchResult:
|
def bench_faiss(base, query, k) -> BenchResult:
|
||||||
|
import faiss
|
||||||
dimensions = base.shape[1]
|
dimensions = base.shape[1]
|
||||||
print("faiss")
|
print("faiss...")
|
||||||
t = time.time()
|
t = time.time()
|
||||||
index = faiss.IndexFlatL2(dimensions)
|
index = faiss.IndexFlatL2(dimensions)
|
||||||
index.add(base)
|
index.add(base)
|
||||||
|
|
@ -321,11 +317,12 @@ def bench_faiss(base, query, k) -> BenchResult:
|
||||||
distances, rowids = index.search(x=np.array([q]), k=k)
|
distances, rowids = index.search(x=np.array([q]), k=k)
|
||||||
results.append(rowids)
|
results.append(rowids)
|
||||||
times.append(time.time() - t0)
|
times.append(time.time() - t0)
|
||||||
print("faiss avg", duration(np.mean(times)))
|
|
||||||
return BenchResult("faiss", build_time, times)
|
return BenchResult("faiss", build_time, times)
|
||||||
|
|
||||||
|
|
||||||
def bench_lancedb(base, query, k) -> BenchResult:
|
def bench_lancedb(base, query, k) -> BenchResult:
|
||||||
|
import lancedb
|
||||||
|
print('lancedb...')
|
||||||
dimensions = base.shape[1]
|
dimensions = base.shape[1]
|
||||||
db = lancedb.connect("a")
|
db = lancedb.connect("a")
|
||||||
data = [{"vector": row.reshape(1, -1)[0]} for row in base]
|
data = [{"vector": row.reshape(1, -1)[0]} for row in base]
|
||||||
|
|
@ -343,6 +340,9 @@ def bench_lancedb(base, query, k) -> BenchResult:
|
||||||
return BenchResult("lancedb", build_time, times)
|
return BenchResult("lancedb", build_time, times)
|
||||||
|
|
||||||
def bench_duckdb(base, query, k) -> BenchResult:
|
def bench_duckdb(base, query, k) -> BenchResult:
|
||||||
|
import duckdb
|
||||||
|
import pyarrow as pa
|
||||||
|
print("duckdb...")
|
||||||
dimensions = base.shape[1]
|
dimensions = base.shape[1]
|
||||||
db = duckdb.connect(":memory:")
|
db = duckdb.connect(":memory:")
|
||||||
db.execute(f"CREATE TABLE t(vector float[{dimensions}])")
|
db.execute(f"CREATE TABLE t(vector float[{dimensions}])")
|
||||||
|
|
@ -368,6 +368,7 @@ def bench_duckdb(base, query, k) -> BenchResult:
|
||||||
return BenchResult("duckdb", build_time, times)
|
return BenchResult("duckdb", build_time, times)
|
||||||
|
|
||||||
def bench_sentence_transformers(base, query, k) -> BenchResult:
|
def bench_sentence_transformers(base, query, k) -> BenchResult:
|
||||||
|
from sentence_transformers.util import semantic_search
|
||||||
print("sentence-transformers")
|
print("sentence-transformers")
|
||||||
dimensions = base.shape[1]
|
dimensions = base.shape[1]
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
|
@ -382,28 +383,29 @@ def bench_sentence_transformers(base, query, k) -> BenchResult:
|
||||||
return BenchResult("sentence-transformers", build_time, times)
|
return BenchResult("sentence-transformers", build_time, times)
|
||||||
|
|
||||||
|
|
||||||
def bench_chroma(base, query, k) -> BenchResult:
    """Benchmark nearest-neighbour queries against an ephemeral Chroma collection.

    Args:
        base: 2-D numpy array of vectors to insert.
        query: Iterable of query vectors; each is searched on its own.
        k: Number of results requested per query.

    Returns:
        A ``BenchResult`` named ``"chroma"`` holding the insert time and the
        per-query timings.
    """
    # Imported lazily so chromadb is only needed when this benchmark runs.
    import chromadb
    from chromadb.utils.batch_utils import create_batches

    print("chroma...")
    chroma_client = chromadb.EphemeralClient()
    collection = chroma_client.create_collection(name="my_collection")

    t = time.time()
    # Insert via create_batches instead of one add() call with every vector.
    for batch in create_batches(
        api=chroma_client,
        ids=[str(x) for x in range(len(base))],
        embeddings=base.tolist(),
    ):
        collection.add(*batch)
    build_time = time.time() - t

    times = []
    for q in query:
        t0 = time.time()
        collection.query(
            query_embeddings=[q.tolist()],
            n_results=k,
        )
        times.append(time.time() - t0)
    return BenchResult("chroma", build_time, times)
|
||||||
|
|
||||||
def bench_usearch_npy(base, query, k) -> BenchResult:
|
def bench_usearch_npy(base, query, k) -> BenchResult:
|
||||||
|
from usearch.index import Index, search, MetricKind
|
||||||
times = []
|
times = []
|
||||||
for q in query:
|
for q in query:
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
|
|
@ -414,6 +416,7 @@ def bench_usearch_npy(base, query, k) -> BenchResult:
|
||||||
|
|
||||||
|
|
||||||
def bench_usearch_special(base, query, k) -> BenchResult:
|
def bench_usearch_special(base, query, k) -> BenchResult:
|
||||||
|
from usearch.index import Index, search, MetricKind
|
||||||
dimensions = base.shape[1]
|
dimensions = base.shape[1]
|
||||||
index = Index(ndim=dimensions)
|
index = Index(ndim=dimensions)
|
||||||
t = time.time()
|
t = time.time()
|
||||||
|
|
@ -425,18 +428,14 @@ def bench_usearch_special(base, query, k) -> BenchResult:
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
result = index.search(q, exact=True)
|
result = index.search(q, exact=True)
|
||||||
times.append(time.time() - t0)
|
times.append(time.time() - t0)
|
||||||
return BenchResult("usuearch index exact=True", build_time, times)
|
return BenchResult("usuearch index", build_time, times)
|
||||||
|
|
||||||
|
|
||||||
from rich.console import Console
|
|
||||||
from rich.table import Table
|
|
||||||
|
|
||||||
|
|
||||||
def suite(name, base, query, k, benchmarks):
|
def suite(name, base, query, k, benchmarks):
|
||||||
print(f"Starting benchmark suite: {name} {base.shape}, k={k}")
|
print(f"Starting benchmark suite: {name} {base.shape}, k={k}")
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
for b in benchmarks.split(","):
|
for b in benchmarks:
|
||||||
if b == "faiss":
|
if b == "faiss":
|
||||||
results.append(bench_faiss(base, query, k=k))
|
results.append(bench_faiss(base, query, k=k))
|
||||||
elif b == "vec-static":
|
elif b == "vec-static":
|
||||||
|
|
@ -460,6 +459,8 @@ def suite(name, base, query, k, benchmarks):
|
||||||
results.append(bench_duckdb(base, query, k=k))
|
results.append(bench_duckdb(base, query, k=k))
|
||||||
elif b == "sentence-transformers":
|
elif b == "sentence-transformers":
|
||||||
results.append(bench_sentence_transformers(base, query, k=k))
|
results.append(bench_sentence_transformers(base, query, k=k))
|
||||||
|
elif b == "chroma":
|
||||||
|
results.append(bench_chroma(base, query, k=k))
|
||||||
else:
|
else:
|
||||||
raise Exception(f"unknown benchmark {b}")
|
raise Exception(f"unknown benchmark {b}")
|
||||||
|
|
||||||
|
|
@ -565,12 +566,58 @@ def cli_read_query(query, base):
|
||||||
return cli_read_input(query, -1)
|
return cli_read_input(query, -1)
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run one benchmark suite driven by command-line arguments."""
    args = parse_args()
    print(args)

    base = cli_read_input(args.input, args.sample)
    # Keep only the first ``qsample`` query vectors.
    all_queries = cli_read_query(args.query, base)
    queries = all_queries[: args.qsample]

    suite(args.name, base, queries, args.k, args.x)
|
|
||||||
|
|
||||||
|
@dataclass
class Config:
    """Settings for one benchmark suite, as loaded from a suite file."""

    # Suite name, passed to suite() as its name.
    name: str
    # Source of the base vectors, handed to cli_read_input().
    input: str
    # Number of neighbours requested per query.
    k: int
    # Source of the query vectors, handed to cli_read_query().
    queries: str
    # How many of the query vectors to keep (a leading slice).
    qsample: int
    # Benchmark names to run, in file order.
    tests: List[str]
    # Optional sample size forwarded to cli_read_input(); None when unset.
    sample: Optional[int]
|
||||||
|
|
||||||
|
def parse_config_file(path: str) -> Config:
    """Parse a benchmark suite file into a ``Config``.

    Blank lines and lines starting with ``#`` are skipped. A line of the form
    ``@key=value`` sets a directive (``name``, ``input``, ``k``, ``queries``,
    ``qsample`` or ``sample``); if a directive repeats, the last value wins.
    Every other line is taken as a benchmark name, kept in file order.

    Args:
        path: Path of the suite file to read.

    Returns:
        The parsed ``Config``. ``k``, ``qsample`` and ``sample`` are converted
        to ``int``; ``sample`` is ``None`` when the file does not set it.

    Raises:
        Exception: A line starts with ``@`` but is not a known directive.
        ValueError: ``@k=`` or ``@qsample=`` is missing, or an integer
            directive does not hold an integer.
    """
    directives = ("name", "input", "k", "queries", "qsample", "sample")
    values = dict.fromkeys(directives)
    tests = []

    # The context manager closes the file even if a bad line raises.
    with open(path, "r") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            if not line.startswith("@"):
                tests.append(line)
                continue
            key, sep, value = line[1:].partition("=")
            if not sep or key not in values:
                raise Exception(f"unknown config line '{line}'")
            values[key] = value

    def as_int(key, required):
        # Convert one directive to int, reporting which one is wrong.
        raw = values[key]
        if raw is None:
            if required:
                raise ValueError(f"{path}: missing required '@{key}=' line")
            return None
        try:
            return int(raw)
        except ValueError as err:
            raise ValueError(
                f"{path}: '@{key}=' expects an integer, got '{raw}'"
            ) from err

    return Config(
        values["name"],
        values["input"],
        as_int("k", True),
        values["queries"],
        as_int("qsample", True),
        tests,
        as_int("sample", False),
    )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
from sys import argv
|
||||||
if __name__ == "__main__":
    # The only argument is the path of a suite file (see parse_config_file).
    if len(argv) < 2:
        raise SystemExit(f"usage: {argv[0]} <suite-file>")

    config = parse_config_file(argv[1])
    print(config)

    base = cli_read_input(config.input, config.sample)
    # Keep only the first ``qsample`` query vectors.
    queries = cli_read_query(config.queries, base)[: config.qsample]
    suite(config.name, base, queries, config.k, config.tests)
|
||||||
|
|
|
||||||
|
|
@ -1,3 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --sample 750000 --qsample 200 -k 20 -x $1
|
|
||||||
15
benchmarks/exhaustive-memory/gist.suite
Normal file
15
benchmarks/exhaustive-memory/gist.suite
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
@name=gist
|
||||||
|
@input=data/gist/gist_base.fvecs
|
||||||
|
@queries=data/gist/gist_query.fvecs
|
||||||
|
@sample=500000
|
||||||
|
@qsample=20
|
||||||
|
@k=20
|
||||||
|
|
||||||
|
faiss
|
||||||
|
usearch
|
||||||
|
vec-static
|
||||||
|
#duckdb
|
||||||
|
#vec-vec0.8192.1024
|
||||||
|
#vec-vec0.8192.2048
|
||||||
|
#vec-scalar.8192
|
||||||
|
#numpy
|
||||||
120
benchmarks/exhaustive-memory/requirements.txt
Normal file
120
benchmarks/exhaustive-memory/requirements.txt
Normal file
|
|
@ -0,0 +1,120 @@
|
||||||
|
annotated-types==0.7.0
|
||||||
|
anyio==4.4.0
|
||||||
|
asgiref==3.8.1
|
||||||
|
attrs==23.2.0
|
||||||
|
backoff==2.2.1
|
||||||
|
bcrypt==4.2.0
|
||||||
|
build==1.2.1
|
||||||
|
cachetools==5.4.0
|
||||||
|
certifi==2024.7.4
|
||||||
|
charset-normalizer==3.3.2
|
||||||
|
chroma-hnswlib==0.7.6
|
||||||
|
chromadb==0.5.5
|
||||||
|
click==8.1.7
|
||||||
|
coloredlogs==15.0.1
|
||||||
|
decorator==5.1.1
|
||||||
|
deprecated==1.2.14
|
||||||
|
deprecation==2.1.0
|
||||||
|
dnspython==2.6.1
|
||||||
|
duckdb==1.0.0
|
||||||
|
email-validator==2.2.0
|
||||||
|
faiss-cpu==1.8.0.post1
|
||||||
|
fastapi==0.111.1
|
||||||
|
fastapi-cli==0.0.4
|
||||||
|
filelock==3.15.4
|
||||||
|
flatbuffers==24.3.25
|
||||||
|
fsspec==2024.6.1
|
||||||
|
google-auth==2.32.0
|
||||||
|
googleapis-common-protos==1.63.2
|
||||||
|
grpcio==1.65.1
|
||||||
|
h11==0.14.0
|
||||||
|
hnswlib==0.8.0
|
||||||
|
httpcore==1.0.5
|
||||||
|
httptools==0.6.1
|
||||||
|
httpx==0.27.0
|
||||||
|
huggingface-hub==0.24.1
|
||||||
|
humanfriendly==10.0
|
||||||
|
idna==3.7
|
||||||
|
importlib-metadata==8.0.0
|
||||||
|
importlib-resources==6.4.0
|
||||||
|
jinja2==3.1.4
|
||||||
|
joblib==1.4.2
|
||||||
|
kubernetes==30.1.0
|
||||||
|
lancedb==0.10.2
|
||||||
|
markdown-it-py==3.0.0
|
||||||
|
markupsafe==2.1.5
|
||||||
|
mdurl==0.1.2
|
||||||
|
mmh3==4.1.0
|
||||||
|
monotonic==1.6
|
||||||
|
mpmath==1.3.0
|
||||||
|
networkx==3.3
|
||||||
|
numpy==1.26.4
|
||||||
|
oauthlib==3.2.2
|
||||||
|
onnxruntime==1.18.1
|
||||||
|
opentelemetry-api==1.26.0
|
||||||
|
opentelemetry-exporter-otlp-proto-common==1.26.0
|
||||||
|
opentelemetry-exporter-otlp-proto-grpc==1.26.0
|
||||||
|
opentelemetry-instrumentation==0.47b0
|
||||||
|
opentelemetry-instrumentation-asgi==0.47b0
|
||||||
|
opentelemetry-instrumentation-fastapi==0.47b0
|
||||||
|
opentelemetry-proto==1.26.0
|
||||||
|
opentelemetry-sdk==1.26.0
|
||||||
|
opentelemetry-semantic-conventions==0.47b0
|
||||||
|
opentelemetry-util-http==0.47b0
|
||||||
|
orjson==3.10.6
|
||||||
|
overrides==7.7.0
|
||||||
|
packaging==24.1
|
||||||
|
pandas==2.2.2
|
||||||
|
pillow==10.4.0
|
||||||
|
posthog==3.5.0
|
||||||
|
protobuf==4.25.4
|
||||||
|
py==1.11.0
|
||||||
|
pyarrow==15.0.0
|
||||||
|
pyasn1==0.6.0
|
||||||
|
pyasn1-modules==0.4.0
|
||||||
|
pydantic==2.8.2
|
||||||
|
pydantic-core==2.20.1
|
||||||
|
pygments==2.18.0
|
||||||
|
pylance==0.14.1
|
||||||
|
pypika==0.48.9
|
||||||
|
pyproject-hooks==1.1.0
|
||||||
|
python-dateutil==2.9.0.post0
|
||||||
|
python-dotenv==1.0.1
|
||||||
|
python-multipart==0.0.9
|
||||||
|
pytz==2024.1
|
||||||
|
pyyaml==6.0.1
|
||||||
|
ratelimiter==1.2.0.post0
|
||||||
|
regex==2024.5.15
|
||||||
|
requests==2.32.3
|
||||||
|
requests-oauthlib==2.0.0
|
||||||
|
retry==0.9.2
|
||||||
|
rich==13.7.1
|
||||||
|
rsa==4.9
|
||||||
|
safetensors==0.4.3
|
||||||
|
scikit-learn==1.5.1
|
||||||
|
scipy==1.14.0
|
||||||
|
sentence-transformers==3.0.1
|
||||||
|
setuptools==71.1.0
|
||||||
|
shellingham==1.5.4
|
||||||
|
six==1.16.0
|
||||||
|
sniffio==1.3.1
|
||||||
|
starlette==0.37.2
|
||||||
|
sympy==1.13.1
|
||||||
|
tenacity==8.5.0
|
||||||
|
threadpoolctl==3.5.0
|
||||||
|
tokenizers==0.19.1
|
||||||
|
torch==2.3.1
|
||||||
|
tqdm==4.66.4
|
||||||
|
transformers==4.43.1
|
||||||
|
typer==0.12.3
|
||||||
|
typing-extensions==4.12.2
|
||||||
|
tzdata==2024.1
|
||||||
|
urllib3==2.2.2
|
||||||
|
usearch==2.12.0
|
||||||
|
uvicorn==0.30.3
|
||||||
|
uvloop==0.19.0
|
||||||
|
watchfiles==0.22.0
|
||||||
|
websocket-client==1.8.0
|
||||||
|
websockets==12.0
|
||||||
|
wrapt==1.16.0
|
||||||
|
zipp==3.19.2
|
||||||
|
|
@ -1,3 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
python bench.py -n sift1m -i ../../sift/sift_base.fvecs -q ../../sift/sift_query.fvecs --qsample 100 -k 20 -x $1
|
|
||||||
|
|
@ -1,18 +1,28 @@
|
||||||
@name=sift1m
|
@name=sift1m
|
||||||
@i=../../sift/sift_base.fvecs
|
@input=data/sift/sift_base.fvecs
|
||||||
@q=../../sift/sift_query.fvecs
|
@queries=data/sift/sift_query.fvecs
|
||||||
@qsample=100
|
@qsample=100
|
||||||
|
@k=20
|
||||||
|
|
||||||
libsql.4096
|
|
||||||
libsql.8192
|
|
||||||
faiss
|
faiss
|
||||||
vec-scalar.4096
|
|
||||||
vec-static
|
|
||||||
vec-vec0.4096.16
|
|
||||||
vec-vec0.8192.1024
|
|
||||||
vec-vec0.4096.2048
|
|
||||||
usearch
|
usearch
|
||||||
duckdb
|
duckdb
|
||||||
hnswlib
|
vec-static
|
||||||
|
vec-vec0.8192.1024
|
||||||
|
vec-vec0.8192.2048
|
||||||
|
vec-scalar.8192
|
||||||
numpy
|
numpy
|
||||||
|
|
||||||
|
# #libsql.4096
|
||||||
|
# #libsql.8192
|
||||||
|
# faiss
|
||||||
|
# vec-scalar.4096
|
||||||
|
# vec-static
|
||||||
|
# vec-vec0.4096.16
|
||||||
|
# vec-vec0.8192.1024
|
||||||
|
# vec-vec0.4096.2048
|
||||||
|
# usearch
|
||||||
|
# duckdb
|
||||||
|
# hnswlib
|
||||||
|
# numpy
|
||||||
|
# chroma
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue