From ac01e330de0ea34a2f92a8d8565f99c275f24182 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Thu, 25 Jul 2024 11:15:36 -0700 Subject: [PATCH 1/2] benchmark work --- benchmarks/exhaustive-memory/README.md | 28 ++- benchmarks/exhaustive-memory/b.py | 51 +++++ benchmarks/exhaustive-memory/bench.py | 255 ++++++++++++++++++++---- benchmarks/exhaustive-memory/gist.sh | 3 + benchmarks/exhaustive-memory/sift.sh | 3 + benchmarks/exhaustive-memory/sift.suite | 18 ++ 6 files changed, 312 insertions(+), 46 deletions(-) create mode 100644 benchmarks/exhaustive-memory/b.py create mode 100755 benchmarks/exhaustive-memory/gist.sh create mode 100755 benchmarks/exhaustive-memory/sift.sh create mode 100644 benchmarks/exhaustive-memory/sift.suite diff --git a/benchmarks/exhaustive-memory/README.md b/benchmarks/exhaustive-memory/README.md index 5336d30..374e4ee 100644 --- a/benchmarks/exhaustive-memory/README.md +++ b/benchmarks/exhaustive-memory/README.md @@ -8,10 +8,28 @@ python3 bench/bench.py \ ``` ``` -python3 bench/bench.py \ +python3 bench.py \ -n "sift1m" \ - -i sift/sift_base.fvecs \ - -q sift/sift_query.fvecs \ - --sample 10000 --qsample 100 \ - -k 10 + -i ../../sift/sift_base.fvecs \ + -q ../../sift/sift_query.fvecs \ + --qsample 100 \ + -k 20 ``` +``` +python3 bench.py \ + -n "sift1m" \ + -i ../../sift/sift_base.fvecs \ + -q ../../sift/sift_query.fvecs \ + --qsample 100 \ + -x faiss,vec-scalar.4096,vec-static,vec-vec0.4096.16,vec-vec0.8192.1024,usearch,duckdb,hnswlib,numpy \ + -k 20 +``` + + + +``` +python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --qsample 100 -k 20 --sample 500000 -x faiss,vec-static,vec-scalar.8192,vec-scalar.16384,vec-scalar.32768,vec-vec0.16384.64,vec-vec0.16384.128,vec-vec0.16384.256,vec-vec0.16384.512,vec-vec0.16384.1024,vec-vec0.16384.2048 +``` + + +python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --qsample 100 -k 20 --sample 500000 -x faiss,vec-static,sentence-transformers,numpy diff --git a/benchmarks/exhaustive-memory/b.py b/benchmarks/exhaustive-memory/b.py new file mode 100644 index 0000000..9cad980 --- /dev/null +++ b/benchmarks/exhaustive-memory/b.py @@ -0,0 +1,51 @@ +import numpy as np +import numpy.typing as npt +import time + +def cosine_similarity( + vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True +) -> npt.NDArray[np.float32]: + sim = vec @ mat.T + if do_norm: + sim /= np.linalg.norm(vec) * np.linalg.norm(mat, axis=1) + return sim + + +def topk( + vec: npt.NDArray[np.float32], + mat: npt.NDArray[np.float32], + k: int = 5, + do_norm: bool = True, +) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]: + sim = cosine_similarity(vec, mat, do_norm=do_norm) + # Rather than sorting all similarities and taking the top K, it's faster to + # argpartition and then just sort the top K. + # The difference is O(N logN) vs O(N + k logk) + indices = np.argpartition(-sim, kth=k)[:k] + top_indices = np.argsort(-sim[indices]) + return indices[top_indices], sim[top_indices] + + +def ivecs_read(fname): + a = np.fromfile(fname, dtype="int32") + d = a[0] + return a.reshape(-1, d + 1)[:, 1:].copy() + + +def fvecs_read(fname): + return ivecs_read(fname).view("float32") + + + +base = fvecs_read("../../sift/sift_base.fvecs") +queries = fvecs_read("../../sift/sift_query.fvecs") +k = 20 +times = [] +results = [] +for idx, q in enumerate(queries[0:20]): + t0 = time.time() + result = topk(q, base, k=k) + results.append(result) + times.append(time.time() - t0) +print(np.__version__) +print(np.mean(times)) diff --git a/benchmarks/exhaustive-memory/bench.py b/benchmarks/exhaustive-memory/bench.py index ffa3443..211864d 100644 --- a/benchmarks/exhaustive-memory/bench.py +++ b/benchmarks/exhaustive-memory/bench.py @@ -14,6 +14,10 @@ from dataclasses import dataclass from typing import List +import duckdb +import pyarrow as pa +from sentence_transformers.util import semantic_search + @dataclass class BenchResult: @@ -52,13 +56,13 @@ def topk( def ivecs_read(fname): - a = np.fromfile(fname, dtype="int32") + a = np.fromfile(fname, dtype="int32",) d = a[0] return a.reshape(-1, d + 1)[:, 1:].copy() -def fvecs_read(fname): - return ivecs_read(fname).view("float32") +def fvecs_read(fname, sample): + return ivecs_read(fname).view("float32")[:sample] def bench_hnsw(base, query): @@ -80,8 +84,6 @@ def bench_hnsw(base, query): for idx, q in enumerate(query): t0 = time.time() result = p.knn_query(q, k=5) - if idx < 5: - print(result[0]) results.append(result) times.append(time.time() - t0) print(time.time() - t) @@ -131,7 +133,7 @@ def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult: db = sqlite3.connect(":memory:") db.execute(f"PRAGMA page_size = {page_size}") db.enable_load_extension(True) - db.load_extension("./dist/vec0") + db.load_extension("../../dist/vec0") db.execute( f""" create virtual table vec_sift1m using vec0( @@ -171,12 +173,12 @@ def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult: return BenchResult(f"sqlite-vec vec0 ({page_size}|{chunk_size})", build_time, times) -def bench_sqlite_normal(base, query, page_size, k) -> BenchResult: - print(f"sqlite-normal") +def bench_sqlite_vec_scalar(base, query, page_size, k) -> BenchResult: + print(f"sqlite-vec-scalar") db = sqlite3.connect(":memory:") db.enable_load_extension(True) - db.load_extension("./dist/vec0") + db.load_extension("../../dist/vec0") db.execute(f"PRAGMA page_size={page_size}") db.execute(f"create table sift1m(vector);") @@ -207,8 +209,102 @@ def bench_sqlite_normal(base, query, page_size, k) -> BenchResult: [q.tobytes(), k], ).fetchall() times.append(time.time() - t0) - return BenchResult(f"sqlite-vec normal ({page_size})", build_time, times) + return BenchResult(f"sqlite-vec-scalar ({page_size})", build_time, times) +def bench_libsql(base, query, page_size, k) -> BenchResult: + print(f"libsql") + dimensions = base.shape[1] + + db = sqlite3.connect(":memory:") + db.enable_load_extension(True) + assert db.execute("select 'vector' in (select name from pragma_function_list)").fetchone()[0] == 1 + db.execute(f"PRAGMA page_size={page_size}") + db.execute(f"create table vectors(vector f32_blob({dimensions}));") + + # TODO: only does DiskANN? + #db.execute("CREATE INDEX vectors_idx ON vectors (libsql_vector_idx(vector, 'metric=cosine'))") + + t = time.time() + with db: + db.executemany( + "insert into vectors(vector) values (?)", + list(map(lambda x: [x.tobytes()], base)), + ) + build_time = time.time() - t + times = [] + results = [] + t = time.time() + for ( + idx, + q, + ) in enumerate(query): + t0 = time.time() + result = db.execute( + """ + select + rowid, + vector_distance_cos(?, vector) as distance + FROM vectors + order by 2 + limit ? + """, + [q.tobytes(), k], + ).fetchall() + times.append(time.time() - t0) + return BenchResult(f"libsql ({page_size})", build_time, times) + + +def register_np(db, array, name): + ptr = array.__array_interface__["data"][0] + nvectors, dimensions = array.__array_interface__["shape"] + element_type = array.__array_interface__["typestr"] + + assert element_type == " BenchResult: + print(f"sqlite-vec static") + + db = sqlite3.connect(":memory:") + db.enable_load_extension(True) + db.load_extension("../../dist/vec0") + + + + t = time.time() + register_np(db, base, "base") + build_time = time.time() - t + + times = [] + results = [] + for ( + idx, + q, + ) in enumerate(query): + t0 = time.time() + result = db.execute( + """ + select + rowid + from base + where vector match ? + and k = ? + order by distance + """, + [q.tobytes(), k], + ).fetchall() + times.append(time.time() - t0) + return BenchResult(f"sqlite-vec static", build_time, times) def bench_faiss(base, query, k) -> BenchResult: dimensions = base.shape[1] @@ -246,6 +342,45 @@ def bench_lancedb(base, query, k) -> BenchResult: times.append(time.time() - t0) return BenchResult("lancedb", build_time, times) +def bench_duckdb(base, query, k) -> BenchResult: + dimensions = base.shape[1] + db = duckdb.connect(":memory:") + db.execute(f"CREATE TABLE t(vector float[{dimensions}])") + + t0 = time.time() + pa_base = pa.Table.from_arrays([pa.array(list(base))], names=['vector']) + pa_base + db.execute(f"INSERT INTO t(vector) SELECT vector::float[{dimensions}] FROM pa_base") + build_time = time.time() - t0 + times = [] + for q in query: + t0 = time.time() + result = db.execute( + f""" + SELECT + rowid, + array_cosine_similarity(vector, ?::float[{dimensions}]) + FROM t + ORDER BY 2 DESC + LIMIT ? + """, [q, k]).fetchall() + times.append(time.time() - t0) + return BenchResult("duckdb", build_time, times) + +def bench_sentence_transformers(base, query, k) -> BenchResult: + print("sentence-transformers") + dimensions = base.shape[1] + t0 = time.time() + build_time = time.time() - t0 + + times = [] + for q in query: + t0 = time.time() + result = semantic_search(q, base, top_k=k) + times.append(time.time() - t0) + + return BenchResult("sentence-transformers", build_time, times) + # def bench_chroma(base, query, k): # chroma_client = chromadb.Client() @@ -297,23 +432,65 @@ from rich.console import Console from rich.table import Table -def suite(name, base, query, k): +def suite(name, base, query, k, benchmarks): print(f"Starting benchmark suite: {name} {base.shape}, k={k}") results = [] - # n = bench_chroma(base[:40000], query, k=k) - # n = bench_usearch_npy(base, query, k=k) - # n = bench_usearch_special(base, query, k=k) - results.append(bench_faiss(base, query, k=k)) - results.append(bench_hnsw_bf(base, query, k=k)) - # n = bench_sqlite_vec(base, query, 4096, 1024, k=k) - # n = bench_sqlite_vec(base, query, 32768, 1024, k=k) - results.append(bench_sqlite_vec(base, query, 32768, 256, k=k)) - # n = bench_sqlite_vec(base, query, 16384, 64, k=k) - # n = bench_sqlite_vec(base, query, 16384, 32, k=k) - results.append(bench_sqlite_normal(base, query, 8192, k=k)) - results.append(bench_lancedb(base, query, k=k)) - results.append(bench_numpy(base, query, k=k)) - # h = bench_hnsw(base, query) + + for b in benchmarks.split(","): + if b == "faiss": + results.append(bench_faiss(base, query, k=k)) + elif b == "vec-static": + results.append(bench_sqlite_vec_static(base, query, k=k)) + elif b.startswith("vec-scalar"): + _, page_size = b.split('.') + results.append(bench_sqlite_vec_scalar(base, query, page_size, k=k)) + elif b.startswith("libsql"): + _, page_size = b.split('.') + results.append(bench_libsql(base, query, page_size, k=k)) + elif b.startswith("vec-vec0"): + _, page_size, chunk_size = b.split('.') + results.append(bench_sqlite_vec(base, query, int(page_size), int(chunk_size), k=k)) + elif b == "usearch": + results.append(bench_usearch_npy(base, query, k=k)) + elif b == "hnswlib": + results.append(bench_hnsw_bf(base, query, k=k)) + elif b == "numpy": + results.append(bench_numpy(base, query, k=k)) + elif b == "duckdb": + results.append(bench_duckdb(base, query, k=k)) + elif b == "sentence-transformers": + results.append(bench_sentence_transformers(base, query, k=k)) + else: + raise Exception(f"unknown benchmark {b}") + + #results.append(bench_sqlite_vec(base, query, 32768, 512, k=k)) + #results.append(bench_sqlite_vec(base, query, 32768, 256, k=k)) + + + #results.append(bench_sqlite_vec_expo(base, query, k=k)) + + # n = bench_chroma(base[:40000], query, k=k) + + # n = bench_usearch_special(base, query, k=k) + + + + # n = bench_sqlite_vec(base, query, 4096, 1024, k=k) + # n = bench_sqlite_vec(base, query, 32768, 1024, k=k) + + + + # blessed + + ### #for pgsz in [4096, 8192, 16384, 32768, 65536]: + ### # for chunksz in [8, 32, 128, 512, 1024, 2048]: + ### # results.append(bench_sqlite_vec(base, query, pgsz, chunksz, k=k)) + ### # n = bench_sqlite_vec(base, query, 16384, 64, k=k) + ### # n = bench_sqlite_vec(base, query, 16384, 32, k=k) + ### results.append(bench_sqlite_normal(base, query, 8192, k=k)) + ### results.append(bench_lancedb(base, query, k=k)) + + ### #h = bench_hnsw(base, query) table = Table( title=f"{name}: {base.shape[0]:,} {base.shape[1]}-dimension vectors, k={k}" @@ -322,7 +499,7 @@ def suite(name, base, query, k): table.add_column("Tool") table.add_column("Build Time (ms)", justify="right") table.add_column("Query time (ms)", justify="right") - for res in results: + for res in sorted(results, key=lambda x: np.mean(x.query_times_ms)): table.add_row( res.tool, duration(res.build_time_ms), duration(np.mean(res.query_times_ms)) ) @@ -354,6 +531,7 @@ def parse_args(): type=int, required=False, help="Number of entries in base to use. Defaults all", + default=-1 ) parser.add_argument( "--qsample", @@ -361,6 +539,9 @@ def parse_args(): required=False, help="Number of queries to use. Defaults all", ) + parser.add_argument( + "-x", help="type of runs to make", default="faiss,vec-scalar.4096,vec-static,vec-vec0.4096.16,usearch,duckdb,hnswlib,numpy" + ) args = parser.parse_args() return args @@ -369,35 +550,27 @@ def parse_args(): from pathlib import Path -def cli_read_input(input): +def cli_read_input(input, sample): input_path = Path(input) if input_path.suffix == ".fvecs": - return fvecs_read(input_path) + return fvecs_read(input_path, sample) if input_path.suffx == ".npy": - return np.fromfile(input_path, dtype="float32") + return np.fromfile(input_path, dtype="float32", count=sample) raise Exception("unknown filetype", input) def cli_read_query(query, base): if query is None: return base[np.random.choice(base.shape[0], 100, replace=False), :] - return cli_read_input(query) + return cli_read_input(query, -1) def main(): args = parse_args() - base = cli_read_input(args.input)[: args.sample] + print(args) + base = cli_read_input(args.input, args.sample) queries = cli_read_query(args.query, base)[: args.qsample] - suite(args.name, base, queries, args.k) - - from sys import argv - - # base = fvecs_read("sift/sift_base.fvecs") # [:100000] - # query = fvecs_read("sift/sift_query.fvecs")[:100] - # print(base.shape) - # k = int(argv[1]) if len(argv) > 1 else 5 - # suite("sift1m", base, query, k) - + suite(args.name, base, queries, args.k, args.x) if __name__ == "__main__": main() diff --git a/benchmarks/exhaustive-memory/gist.sh b/benchmarks/exhaustive-memory/gist.sh new file mode 100755 index 0000000..1522698 --- /dev/null +++ b/benchmarks/exhaustive-memory/gist.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +python bench.py -n gist -i ../../gist/gist_base.fvecs -q ../../gist/gist_query.fvecs --sample 750000 --qsample 200 -k 20 -x $1 diff --git a/benchmarks/exhaustive-memory/sift.sh b/benchmarks/exhaustive-memory/sift.sh new file mode 100755 index 0000000..30ea86e --- /dev/null +++ b/benchmarks/exhaustive-memory/sift.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +python bench.py -n sift1m -i ../../sift/sift_base.fvecs -q ../../sift/sift_query.fvecs --qsample 100 -k 20 -x $1 diff --git a/benchmarks/exhaustive-memory/sift.suite b/benchmarks/exhaustive-memory/sift.suite new file mode 100644 index 0000000..96e0a65 --- /dev/null +++ b/benchmarks/exhaustive-memory/sift.suite @@ -0,0 +1,18 @@ +@name=sift1m +@i=../../sift/sift_base.fvecs +@q=../../sift/sift_query.fvecs +@qsample=100 + +libsql.4096 +libsql.8192 +faiss +vec-scalar.4096 +vec-static +vec-vec0.4096.16 +vec-vec0.8192.1024 +vec-vec0.4096.2048 +usearch +duckdb +hnswlib +numpy + From 65656cbadc993a14a432c14d7577ec199e0dad13 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Thu, 25 Jul 2024 11:16:06 -0700 Subject: [PATCH 2/2] fuzz work --- tests/fuzz/.gitignore | 2 ++ tests/fuzz/Makefile | 48 +++++++++++++++++++++++++++ tests/fuzz/README.md | 15 +++++++++ tests/fuzz/corpus/vec0-create/normal1 | 1 + tests/fuzz/corpus/vec0-create/normal2 | 1 + tests/fuzz/exec.c | 30 +++++++++++++++++ tests/fuzz/exec.dict | 21 ++++++++++++ tests/fuzz/json.c | 34 +++++++++++++++++++ tests/fuzz/numpy.c | 42 +++++++++++++++++++++++ tests/fuzz/numpy.dict | 7 ++++ tests/fuzz/vec0-create.c | 37 +++++++++++++++++++++ tests/fuzz/vec0-create.dict | 16 +++++++++ tests/leak-fixtures/vec0-create.sql | 7 ++++ 13 files changed, 261 insertions(+) create mode 100644 tests/fuzz/.gitignore create mode 100644 tests/fuzz/Makefile create mode 100644 tests/fuzz/README.md create mode 100644 tests/fuzz/corpus/vec0-create/normal1 create mode 100644 tests/fuzz/corpus/vec0-create/normal2 create mode 100644 tests/fuzz/exec.c create mode 100644 tests/fuzz/exec.dict create mode 100644 tests/fuzz/json.c create mode 100644 tests/fuzz/numpy.c create mode 100644 tests/fuzz/numpy.dict create mode 100644 tests/fuzz/vec0-create.c create mode 100644 tests/fuzz/vec0-create.dict create mode 100644 tests/leak-fixtures/vec0-create.sql diff --git a/tests/fuzz/.gitignore b/tests/fuzz/.gitignore new file mode 100644 index 0000000..757d1ac --- /dev/null +++ b/tests/fuzz/.gitignore @@ -0,0 +1,2 @@ +*.dSYM +targets/ diff --git a/tests/fuzz/Makefile b/tests/fuzz/Makefile new file mode 100644 index 0000000..7bd0e0a --- /dev/null +++ b/tests/fuzz/Makefile @@ -0,0 +1,48 @@ + +TARGET_DIR=./targets + +$(TARGET_DIR): + mkdir -p $@ + +# ASAN_OPTIONS=detect_leaks=1 ./fuzz_json -detect_leaks=1 '-trace_malloc=[12]' tmp +$(TARGET_DIR)/json: json.c $(TARGET_DIR) + /opt/homebrew/opt/llvm/bin/clang \ + -fsanitize=address,fuzzer \ + -I ../../ -I ../../vendor -DSQLITE_CORE -g \ + ../../vendor/sqlite3.c \ + ../../sqlite-vec.c \ + $< \ + -o $@ + + +$(TARGET_DIR)/vec0_create: vec0-create.c ../../sqlite-vec.c $(TARGET_DIR) + /opt/homebrew/opt/llvm/bin/clang \ + -fsanitize=address,fuzzer \ + -I ../../ -I ../../vendor -DSQLITE_CORE -g \ + ../../vendor/sqlite3.c \ + ../../sqlite-vec.c \ + $< \ + -o $@ + +$(TARGET_DIR)/numpy: numpy.c ../../sqlite-vec.c $(TARGET_DIR) + /opt/homebrew/opt/llvm/bin/clang \ + -fsanitize=address,fuzzer \ + -I ../../ -I ../../vendor -DSQLITE_CORE -g \ + ../../vendor/sqlite3.c \ + ../../sqlite-vec.c \ + $< \ + -o $@ + +$(TARGET_DIR)/exec: exec.c ../../sqlite-vec.c $(TARGET_DIR) + /opt/homebrew/opt/llvm/bin/clang \ + -fsanitize=address,fuzzer \ + -I ../../ -I ../../vendor -DSQLITE_CORE -g \ + ../../vendor/sqlite3.c \ + ../../sqlite-vec.c \ + $< \ + -o $@ + +all: $(TARGET_DIR)/json $(TARGET_DIR)/numpy $(TARGET_DIR)/json $(TARGET_DIR)/exec + +clean: + rm -rf $(TARGET_DIR)/* diff --git a/tests/fuzz/README.md b/tests/fuzz/README.md new file mode 100644 index 0000000..e28d917 --- /dev/null +++ b/tests/fuzz/README.md @@ -0,0 +1,15 @@ +``` +ASAN_OPTIONS=detect_leaks=1 ./targets/vec0_create \ + -dict=./vec0-create.dict -max_total_time=5 \ + ./corpus/vec0-create +``` + + +``` +export PATH="/opt/homebrew/opt/llvm/bin:$PATH" +export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" +export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + + +LDFLAGS="-L/opt/homebrew/opt/llvm/lib/c++ -Wl,-rpath,/opt/homebrew/opt/llvm/lib/c++" +``` diff --git a/tests/fuzz/corpus/vec0-create/normal1 b/tests/fuzz/corpus/vec0-create/normal1 new file mode 100644 index 0000000..669100f --- /dev/null +++ b/tests/fuzz/corpus/vec0-create/normal1 @@ -0,0 +1 @@ +aaa float[12] diff --git a/tests/fuzz/corpus/vec0-create/normal2 b/tests/fuzz/corpus/vec0-create/normal2 new file mode 100644 index 0000000..7f3d0d2 --- /dev/null +++ b/tests/fuzz/corpus/vec0-create/normal2 @@ -0,0 +1 @@ +aaa float[12], bbb int8[6] diff --git a/tests/fuzz/exec.c b/tests/fuzz/exec.c new file mode 100644 index 0000000..bcb407b --- /dev/null +++ b/tests/fuzz/exec.c @@ -0,0 +1,30 @@ +#include +#include + +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + int rc = SQLITE_OK; + sqlite3 *db; + sqlite3_stmt *stmt; + if(size < 1) return 0; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + const char * zSrc = sqlite3_mprintf("%.*s", size, data); + assert(zSrc); + + sqlite3_exec(db, zSrc, NULL, NULL, NULL); + sqlite3_free(zSrc); + + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/exec.dict b/tests/fuzz/exec.dict new file mode 100644 index 0000000..46a5daa --- /dev/null +++ b/tests/fuzz/exec.dict @@ -0,0 +1,21 @@ +select="select" +from="from" +cname1="aaa" +cname1="bbb" +cname1="ccc" +type1="float" +type2="int8" +type3="bit" +lparen="[" +rparen="]" +pk="primary key" +text="text" +distance_metric="distance_metric" +eq="=" +l1="l1" +l2="l2" +cosine="cosine" +hamming="hamming" +vec_distance_l2="vec_distance_l2" +vec_distance_l1="vec_distance_l1" +comma="," diff --git a/tests/fuzz/json.c b/tests/fuzz/json.c new file mode 100644 index 0000000..437a753 --- /dev/null +++ b/tests/fuzz/json.c @@ -0,0 +1,34 @@ +#include +#include + +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + int rc = SQLITE_OK; + sqlite3 *db; + sqlite3_stmt *stmt; + + //rc = sqlite3_auto_extension((void (*)())sqlite3_vec_init); + //assert(rc == SQLITE_OK); + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + rc = sqlite3_prepare_v2(db, "SELECT vec_f32(cast(? as text))", -1, &stmt, NULL); + assert(rc == SQLITE_OK); + + sqlite3_bind_blob(stmt, 1, data, size, SQLITE_STATIC); + sqlite3_step(stmt); + + sqlite3_finalize(stmt); + sqlite3_close(db); + return 0; + +} diff --git a/tests/fuzz/numpy.c b/tests/fuzz/numpy.c new file mode 100644 index 0000000..a2c8273 --- /dev/null +++ b/tests/fuzz/numpy.c @@ -0,0 +1,42 @@ +#include +#include + +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + int rc = SQLITE_OK; + sqlite3 *db; + sqlite3_stmt *stmt; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + + rc = sqlite3_prepare_v2(db, "select * from vec_npy_each(?)", -1, &stmt, NULL); + assert(rc == SQLITE_OK); + sqlite3_bind_blob(stmt, 1, data, size, SQLITE_STATIC); + rc = sqlite3_step(stmt); + if(rc != SQLITE_DONE || rc != SQLITE_ROW) { + sqlite3_finalize(stmt); + sqlite3_close(db); + return -1; + } + + while(1) { + if(rc == SQLITE_DONE) break; + if(rc == SQLITE_ROW) continue; + sqlite3_finalize(stmt); + sqlite3_close(db); + return 1; + } + sqlite3_finalize(stmt); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/numpy.dict b/tests/fuzz/numpy.dict new file mode 100644 index 0000000..f91b82f --- /dev/null +++ b/tests/fuzz/numpy.dict @@ -0,0 +1,7 @@ +magic="\x93NUMPY" +lparen="(" +rparen=")" +lbrace="{" +rbrace="}" +sq1="\"" +sq2="'" diff --git a/tests/fuzz/vec0-create.c b/tests/fuzz/vec0-create.c new file mode 100644 index 0000000..934191e --- /dev/null +++ b/tests/fuzz/vec0-create.c @@ -0,0 +1,37 @@ +#include +#include + +#include +#include +#include +#include "sqlite-vec.h" +#include "sqlite3.h" +#include + +int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + int rc = SQLITE_OK; + sqlite3 *db; + sqlite3_stmt *stmt; + + rc = sqlite3_open(":memory:", &db); + assert(rc == SQLITE_OK); + rc = sqlite3_vec_init(db, NULL, NULL); + assert(rc == SQLITE_OK); + + sqlite3_str * s = sqlite3_str_new(NULL); + assert(s); + sqlite3_str_appendall(s, "CREATE VIRTUAL TABLE v USING vec0("); + sqlite3_str_appendf(s, "%.*s", size, data); + sqlite3_str_appendall(s, ")"); + const char * zSql = sqlite3_str_finish(s); + assert(zSql); + + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if(rc == SQLITE_OK) { + sqlite3_step(stmt); + } + sqlite3_finalize(stmt); + sqlite3_close(db); + return 0; +} diff --git a/tests/fuzz/vec0-create.dict b/tests/fuzz/vec0-create.dict new file mode 100644 index 0000000..77c7772 --- /dev/null +++ b/tests/fuzz/vec0-create.dict @@ -0,0 +1,16 @@ +cname1="aaa" +cname1="bbb" +cname1="ccc" +type1="float" +type2="int8" +type3="bit" +lparen="[" +rparen="]" +pk="primary key" +text="text" +distance_metric="distance_metric" +eq="=" +l1="l1" +l2="l2" +cosine="cosine" +hamming="hamming" diff --git a/tests/leak-fixtures/vec0-create.sql b/tests/leak-fixtures/vec0-create.sql new file mode 100644 index 0000000..874070e --- /dev/null +++ b/tests/leak-fixtures/vec0-create.sql @@ -0,0 +1,7 @@ +.load dist/vec0 +.mode box +.header on +.eqp on +.echo on + +create virtual table v using vec0(y);