Initial commit

Alex Garcia 2024-04-20 13:38:58 -07:00
commit 4c8ad629e0
28 changed files with 6758 additions and 0 deletions

benchmarks/README.md
@@ -0,0 +1,17 @@
```
python3 bench/bench.py \
-n "sift1m" \
-i sift/sift_base.fvecs \
-q sift/sift_query.fvecs \
--sample 10000 --qsample 100 \
-k 10
```
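
The script also accepts `.npy` inputs via `-i`; a sketch of such a run (the path below is the one the other scripts in this repo use, and omitting `-q` samples queries from the base vectors):

```
python3 bench/bench.py \
-n "dbpedia-openai" \
-i examples/dbpedia-openai/data/vectors.npy \
--sample 100000 --qsample 100 \
-k 10
```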

@@ -0,0 +1,403 @@
import numpy as np
import numpy.typing as npt
import time
import hnswlib
import sqlite3
import faiss
import lancedb
import pandas as pd
# import chromadb
from usearch.index import Index, search, MetricKind
from dataclasses import dataclass
from typing import List
@dataclass
class BenchResult:
    tool: str
    # NOTE: despite the "_ms" suffix, these fields hold raw time.time() deltas
    # in seconds; duration() below converts them to milliseconds for display.
    build_time_ms: float
    query_times_ms: List[float]
def duration(seconds: float):
ms = seconds * 1000
return f"{int(ms)}ms"
def cosine_similarity(
vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True
) -> npt.NDArray[np.float32]:
sim = vec @ mat.T
if do_norm:
sim /= np.linalg.norm(vec) * np.linalg.norm(mat, axis=1)
return sim
def topk(
vec: npt.NDArray[np.float32],
mat: npt.NDArray[np.float32],
k: int = 5,
do_norm: bool = True,
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]:
sim = cosine_similarity(vec, mat, do_norm=do_norm)
# Rather than sorting all similarities and taking the top K, it's faster to
# argpartition and then just sort the top K.
# The difference is O(N logN) vs O(N + k logk)
    indices = np.argpartition(-sim, kth=k)[:k]
    top_indices = np.argsort(-sim[indices])
    # index back through `indices` so the returned scores line up with the ids
    return indices[top_indices], sim[indices[top_indices]]
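# A commented-out sanity check for the argpartition shortcut above (a sketch,
# not part of the benchmark run):
#
#   rng = np.random.default_rng(0)
#   mat = rng.random((1_000, 128), dtype=np.float32)
#   vec = rng.random(128, dtype=np.float32)
#   ids, _ = topk(vec, mat, k=5)
#   assert np.array_equal(ids, np.argsort(-cosine_similarity(vec, mat))[:5])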
def ivecs_read(fname):
a = np.fromfile(fname, dtype="int32")
d = a[0]
return a.reshape(-1, d + 1)[:, 1:].copy()
def fvecs_read(fname):
return ivecs_read(fname).view("float32")
def bench_hnsw(base, query):
t0 = time.time()
p = hnswlib.Index(space="ip", dim=128) # possible options are l2, cosine or ip
# NOTE: Use default settings from the README.
print("buildings hnsw")
p.init_index(max_elements=base.shape[0], ef_construction=200, M=16)
ids = np.arange(base.shape[0])
p.add_items(base, ids)
p.set_ef(50)
print("build time", time.time() - t0)
results = []
times = []
t = time.time()
for idx, q in enumerate(query):
t0 = time.time()
result = p.knn_query(q, k=5)
if idx < 5:
print(result[0])
results.append(result)
times.append(time.time() - t0)
print(time.time() - t)
print("hnsw avg", np.mean(times))
return results
def bench_hnsw_bf(base, query, k) -> BenchResult:
print("hnswlib-bf")
dimensions = base.shape[1]
t0 = time.time()
p = hnswlib.BFIndex(space="l2", dim=dimensions)
p.init_index(max_elements=base.shape[0])
ids = np.arange(base.shape[0])
p.add_items(base, ids)
build_time = time.time() - t0
results = []
times = []
t = time.time()
for idx, q in enumerate(query):
t0 = time.time()
result = p.knn_query(q, k=k)
results.append(result)
times.append(time.time() - t0)
return BenchResult("hnswlib-bf", build_time, times)
def bench_numpy(base, query, k) -> BenchResult:
print("numpy")
times = []
results = []
for idx, q in enumerate(query):
t0 = time.time()
result = topk(q, base, k=k)
results.append(result)
times.append(time.time() - t0)
return BenchResult("numpy", 0, times)
def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult:
dimensions = base.shape[1]
print(f"sqlite-vec {page_size} {chunk_size}")
db = sqlite3.connect(":memory:")
db.execute(f"PRAGMA page_size = {page_size}")
db.enable_load_extension(True)
db.load_extension("./dist/vec0")
db.execute(
f"""
create virtual table vec_sift1m using vec0(
chunk_size={chunk_size},
vector float[{dimensions}]
)
"""
)
t = time.time()
with db:
db.executemany(
"insert into vec_sift1m(vector) values (?)",
list(map(lambda x: [x.tobytes()], base)),
)
build_time = time.time() - t
times = []
results = []
    for idx, q in enumerate(query):
t0 = time.time()
result = db.execute(
"""
select
rowid,
distance
from vec_sift1m
where vector match ?
and k = ?
order by distance
""",
[q.tobytes(), k],
).fetchall()
times.append(time.time() - t0)
return BenchResult(f"sqlite-vec vec0 ({page_size}|{chunk_size})", build_time, times)
def bench_sqlite_normal(base, query, page_size, k) -> BenchResult:
print(f"sqlite-normal")
db = sqlite3.connect(":memory:")
db.enable_load_extension(True)
db.load_extension("./dist/vec0")
db.execute(f"PRAGMA page_size={page_size}")
db.execute(f"create table sift1m(vector);")
t = time.time()
with db:
db.executemany(
"insert into sift1m(vector) values (?)",
list(map(lambda x: [x.tobytes()], base)),
)
build_time = time.time() - t
times = []
results = []
t = time.time()
    for idx, q in enumerate(query):
t0 = time.time()
result = db.execute(
"""
select
rowid,
vec_distance_l2(?, vector) as distance
from sift1m
order by distance
limit ?
""",
[q.tobytes(), k],
).fetchall()
times.append(time.time() - t0)
return BenchResult(f"sqlite-vec normal ({page_size})", build_time, times)
def bench_faiss(base, query, k) -> BenchResult:
dimensions = base.shape[1]
print("faiss")
t = time.time()
index = faiss.IndexFlatL2(dimensions)
index.add(base)
build_time = time.time() - t
times = []
results = []
t = time.time()
for idx, q in enumerate(query):
t0 = time.time()
distances, rowids = index.search(x=np.array([q]), k=k)
results.append(rowids)
times.append(time.time() - t0)
print("faiss avg", duration(np.mean(times)))
return BenchResult("faiss", build_time, times)
def bench_lancedb(base, query, k) -> BenchResult:
dimensions = base.shape[1]
db = lancedb.connect("a")
data = [{"vector": row.reshape(1, -1)[0]} for row in base]
# Create a DataFrame where each row is a 1D array
df = pd.DataFrame(data=data, columns=["vector"])
t = time.time()
db.create_table("t", data=df)
build_time = time.time() - t
tbl = db.open_table("t")
times = []
for q in query:
t0 = time.time()
result = tbl.search(q).limit(k).to_arrow()
times.append(time.time() - t0)
return BenchResult("lancedb", build_time, times)
# def bench_chroma(base, query, k):
# chroma_client = chromadb.Client()
# collection = chroma_client.create_collection(name="my_collection")
#
# t = time.time()
# # chroma doesn't allow for more than 41666 vectors to be inserted at once (???)
# i = 0
# collection.add(embeddings=base, ids=[str(x) for x in range(len(base))])
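#     # A sketch of a chunked insert if that batch-size limit bites
#     # (the 40_000 batch size is an assumption, not a measured cap):
#     # for start in range(0, len(base), 40_000):
#     #     chunk = base[start : start + 40_000]
#     #     collection.add(
#     #         embeddings=chunk,
#     #         ids=[str(i) for i in range(start, start + len(chunk))],
#     #     )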
# print("chroma build time: ", duration(time.time() - t))
# times = []
# for q in query:
# t0 = time.time()
# result = collection.query(
# query_embeddings=[q.tolist()],
# n_results=k,
# )
# print(result)
# times.append(time.time() - t0)
# print("chroma avg", duration(np.mean(times)))
def bench_usearch_npy(base, query, k) -> BenchResult:
times = []
for q in query:
t0 = time.time()
# result = index.search(q, exact=True)
result = search(base, q, k, MetricKind.L2sq, exact=True)
times.append(time.time() - t0)
return BenchResult("usearch numpy exact=True", 0, times)
def bench_usearch_special(base, query, k) -> BenchResult:
dimensions = base.shape[1]
index = Index(ndim=dimensions)
t = time.time()
index.add(np.arange(len(base)), base)
build_time = time.time() - t
times = []
for q in query:
t0 = time.time()
result = index.search(q, exact=True)
times.append(time.time() - t0)
return BenchResult("usuearch index exact=True", build_time, times)
from rich.console import Console
from rich.table import Table
def suite(name, base, query, k):
print(f"Starting benchmark suite: {name} {base.shape}, k={k}")
results = []
# n = bench_chroma(base[:40000], query, k=k)
# n = bench_usearch_npy(base, query, k=k)
# n = bench_usearch_special(base, query, k=k)
results.append(bench_faiss(base, query, k=k))
results.append(bench_hnsw_bf(base, query, k=k))
# n = bench_sqlite_vec(base, query, 4096, 1024, k=k)
# n = bench_sqlite_vec(base, query, 32768, 1024, k=k)
results.append(bench_sqlite_vec(base, query, 32768, 256, k=k))
# n = bench_sqlite_vec(base, query, 16384, 64, k=k)
# n = bench_sqlite_vec(base, query, 16384, 32, k=k)
results.append(bench_sqlite_normal(base, query, 8192, k=k))
results.append(bench_lancedb(base, query, k=k))
results.append(bench_numpy(base, query, k=k))
# h = bench_hnsw(base, query)
table = Table(
title=f"{name}: {base.shape[0]:,} {base.shape[1]}-dimension vectors, k={k}"
)
table.add_column("Tool")
table.add_column("Build Time (ms)", justify="right")
table.add_column("Query time (ms)", justify="right")
for res in results:
table.add_row(
res.tool, duration(res.build_time_ms), duration(np.mean(res.query_times_ms))
)
console = Console()
console.print(table)
import argparse
def parse_args():
parser = argparse.ArgumentParser(description="Benchmark processing script.")
# Required arguments
parser.add_argument("-n", "--name", required=True, help="Name of the benchmark.")
parser.add_argument(
"-i", "--input", required=True, help="Path to input file (.npy)."
)
parser.add_argument(
"-k", type=int, required=True, help="Parameter k to use in benchmark."
)
# Optional arguments
parser.add_argument(
"-q", "--query", required=False, help="Path to query file (.npy)."
)
parser.add_argument(
"--sample",
type=int,
required=False,
help="Number of entries in base to use. Defaults all",
)
parser.add_argument(
"--qsample",
type=int,
required=False,
help="Number of queries to use. Defaults all",
)
args = parser.parse_args()
return args
from pathlib import Path
def cli_read_input(input):
input_path = Path(input)
if input_path.suffix == ".fvecs":
return fvecs_read(input_path)
    if input_path.suffix == ".npy":
        # .npy files carry a shape/dtype header, so np.load is needed here;
        # np.fromfile would read the header bytes as float data.
        return np.load(input_path)
raise Exception("unknown filetype", input)
def cli_read_query(query, base):
if query is None:
return base[np.random.choice(base.shape[0], 100, replace=False), :]
return cli_read_input(query)
def main():
args = parse_args()
base = cli_read_input(args.input)[: args.sample]
queries = cli_read_query(args.query, base)[: args.qsample]
suite(args.name, base, queries, args.k)
from sys import argv
# base = fvecs_read("sift/sift_base.fvecs") # [:100000]
# query = fvecs_read("sift/sift_query.fvecs")[:100]
# print(base.shape)
# k = int(argv[1]) if len(argv) > 1 else 5
# suite("sift1m", base, query, k)
if __name__ == "__main__":
main()

@@ -0,0 +1,17 @@
.timer on
pragma page_size = 32768;
--pragma page_size = 16384;
--pragma page_size = 16384;
--pragma page_size = 4096;
create virtual table vec_items using vec0(
embedding float[1536]
);
-- 65s (limit 1e5), ~615MB on disk
insert into vec_items
select
rowid,
vector
from vec_npy_each(vec_npy_file('examples/dbpedia-openai/data/vectors.npy'))
limit 1e5;

@@ -0,0 +1,31 @@
.timer on
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
and k = :k
order by distance;
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
and k = :k
order by distance;
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
and k = :k
order by distance;
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
and k = :k
order by distance;
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
and k = :k
order by distance;

@@ -0,0 +1,85 @@
import sqlite3
import time
def connect(path):
db = sqlite3.connect(path)
db.enable_load_extension(True)
db.load_extension("../dist/vec0")
db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
db.enable_load_extension(False)
return db
page_sizes = [ # 4096, 8192,
16384,
32768,
]
chunk_sizes = [128, 256, 1024, 2048]
types = ["f32", "int8", "bit"]
SRC = "../examples/dbpedia-openai/data/vectors.npy"
for page_size in page_sizes:
for chunk_size in chunk_sizes:
for t in types:
print(f"{t} page_size={page_size}, chunk_size={chunk_size}")
t0 = time.time()
db = connect(f"dbs/test.{page_size}.{chunk_size}.{t}.db")
db.execute(f"pragma page_size = {page_size}")
with db:
db.execute(
f"""
create virtual table vec_items using vec0(
embedding {t}[1536],
chunk_size={chunk_size}
)
"""
)
func = "vector"
if t == "int8":
func = "vec_quantize_i8(vector, 'unit')"
if t == "bit":
func = "vec_quantize_binary(vector)"
db.execute(
f"""
insert into vec_items
select rowid, {func}
from vec_npy_each(vec_npy_file(?))
limit 100000
""",
[SRC],
)
elapsed = time.time() - t0
print(elapsed)
"""
Insert times for 100,000 vectors (seconds):

page_size | chunk_size=256 | chunk_size=1024 | chunk_size=2048
----------|----------------|-----------------|----------------
     4096 |           3.59 |           60.70 |          201.04
     8192 |           7.03 |            9.98 |           12.32
    16384 |           4.97 |            6.05 |            8.49
    32768 |           5.91 |            5.88 |            5.42
"""

@@ -0,0 +1,83 @@
import sqlite3
import time
from random import randrange
from statistics import mean
def connect(path):
print(path)
db = sqlite3.connect(path)
db.enable_load_extension(True)
db.load_extension("../dist/vec0")
db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
db.enable_load_extension(False)
return db
page_sizes = [ # 4096, 8192,
16384,
32768,
]
chunk_sizes = [128, 256, 1024, 2048]
types = ["f32", "int8", "bit"]
types.reverse()
for t in types:
for page_size in page_sizes:
for chunk_size in chunk_sizes:
print(f"page_size={page_size}, chunk_size={chunk_size}")
func = "embedding"
if t == "int8":
func = "vec_quantize_i8(embedding, 'unit')"
if t == "bit":
func = "vec_quantize_binary(embedding)"
times = []
trials = 20
db = connect(f"dbs/test.{page_size}.{chunk_size}.{t}.db")
for trial in range(trials):
t0 = time.time()
results = db.execute(
f"""
select rowid
from vec_items
where embedding match (select {func} from vec_items where rowid = ?)
and k = 10
order by distance
""",
[randrange(100000)],
).fetchall()
times.append(time.time() - t0)
print(mean(times))
"""
Average KNN query time over 20 trials (seconds), k=10:

page_size | chunk_size=256 | chunk_size=1024 | chunk_size=2048
----------|----------------|-----------------|----------------
     4096 |          0.264 |           0.261 |           0.276
     8192 |          0.186 |           0.209 |           0.224
    16384 |          0.160 |           0.183 |           0.182
    32768 |          0.142 |           0.153 |           0.180
"""

@@ -0,0 +1,24 @@
import sqlite3
import time
def connect(path):
db = sqlite3.connect(path)
db.enable_load_extension(True)
db.load_extension("../dist/vec0")
db.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
db.enable_load_extension(False)
return db
page_sizes = [4096, 8192, 16384, 32768]
chunk_sizes = [256, 1024, 2048]
for page_size in page_sizes:
for chunk_size in chunk_sizes:
print(f"page_size={page_size}, chunk_size={chunk_size}")
t0 = time.time()
db = connect(f"dbs/test.{page_size}.{chunk_size}.db")
print(db.execute("pragma page_size").fetchone()[0])
print(db.execute("select count(*) from vec_items_rowids").fetchone()[0])