mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 08:46:49 +02:00
benchmark work
This commit is contained in:
parent
a0c4e202f6
commit
ac01e330de
6 changed files with 312 additions and 46 deletions
51
benchmarks/exhaustive-memory/b.py
Normal file
51
benchmarks/exhaustive-memory/b.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
import time
|
||||
|
||||
def cosine_similarity(
    vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True
) -> npt.NDArray[np.float32]:
    """Return the similarity of ``vec`` against every row of ``mat``.

    With ``do_norm=True`` (the default) this is true cosine similarity;
    with ``do_norm=False`` it is just the raw dot products ``vec @ mat.T``
    (useful when the inputs are already L2-normalized).
    """
    sim = vec @ mat.T
    if do_norm:
        # Divide in place: query norm times the per-row norms of the matrix.
        sim /= np.linalg.norm(vec) * np.linalg.norm(mat, axis=1)
    return sim


def topk(
    vec: npt.NDArray[np.float32],
    mat: npt.NDArray[np.float32],
    k: int = 5,
    do_norm: bool = True,
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]:
    """Return ``(indices, similarities)`` of the ``k`` rows of ``mat`` most
    similar to ``vec``, sorted by descending similarity.

    Requires ``k < len(mat)`` (np.argpartition's ``kth`` constraint).
    """
    sim = cosine_similarity(vec, mat, do_norm=do_norm)
    # Rather than sorting all similarities and taking the top K, it's faster to
    # argpartition and then just sort the top K.
    # The difference is O(N logN) vs O(N + k logk)
    indices = np.argpartition(-sim, kth=k)[:k]
    top_indices = np.argsort(-sim[indices])
    # BUG FIX: the original returned `sim[top_indices]`, but `top_indices`
    # are positions *within the k-element partition* (values in 0..k-1), so
    # it returned the similarities of the first k rows of `mat` instead of
    # the similarities of the selected top-k rows.
    top = indices[top_indices]
    return top, sim[top]
|
||||
|
||||
|
||||
def ivecs_read(fname):
    """Read an .ivecs file into an (n, d) int32 array.

    Layout per vector: one int32 dimension ``d`` followed by ``d`` int32
    components; the file's first value is taken as ``d`` for every row.
    """
    raw = np.fromfile(fname, dtype="int32")
    dim = raw[0]
    # Rows are (d, v_0, ..., v_{d-1}); strip the leading dimension column.
    rows = raw.reshape(-1, dim + 1)
    return rows[:, 1:].copy()


def fvecs_read(fname):
    """Read an .fvecs file: identical layout to .ivecs, float32 payload."""
    # Reinterpret the int32 payload bytes as float32 without copying again.
    data = ivecs_read(fname)
    return data.view("float32")
|
||||
|
||||
|
||||
|
||||
# Benchmark driver: brute-force (exhaustive) top-k search, timed per query.
# NOTE(review): the paths suggest the standard SIFT .fvecs benchmark dataset
# checked out two directories up — confirm against the repo layout.
base = fvecs_read("../../sift/sift_base.fvecs")
queries = fvecs_read("../../sift/sift_query.fvecs")
k = 20
times = []  # per-query wall-clock seconds
results = []  # (indices, similarities) per query; kept so work isn't elided
# Time only the first 20 queries; each topk call scans all of `base`.
for idx, q in enumerate(queries[0:20]):
    t0 = time.time()
    result = topk(q, base, k=k)
    results.append(result)
    times.append(time.time() - t0)
print(np.__version__)
# Mean seconds per query over the timed batch.
print(np.mean(times))
||||
Loading…
Add table
Add a link
Reference in a new issue