From 763aad5d6a86257e9617d5e79e6ad4ede3dac108 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Wed, 25 Sep 2024 23:07:17 -0700 Subject: [PATCH] Remove vec_npy_each from default entrypoint and move to sqlite3_vec_numpy_init entrypoint --- examples/sqlite3-cli/core_init.c | 4 +- reference.yaml | 55 ------------ site/versioning.md | 53 ++--------- sqlite-vec.c | 7 +- sqlite-vec.h.tmpl | 12 +-- tests/test-loadable.py | 148 ++++++++++++++++--------------- 6 files changed, 92 insertions(+), 187 deletions(-) diff --git a/examples/sqlite3-cli/core_init.c b/examples/sqlite3-cli/core_init.c index 4a5bcfd..dbd9fd7 100644 --- a/examples/sqlite3-cli/core_init.c +++ b/examples/sqlite3-cli/core_init.c @@ -2,7 +2,5 @@ #include "sqlite-vec.h" #include int core_init(const char *dummy) { - int rc = sqlite3_auto_extension((void *)sqlite3_vec_init); - if(rc != SQLITE_OK) return rc; - return sqlite3_auto_extension((void *)sqlite3_vec_fs_read_init); + return sqlite3_auto_extension((void *)sqlite3_vec_init); } diff --git a/reference.yaml b/reference.yaml index 54c1eaa..778f1ec 100644 --- a/reference.yaml +++ b/reference.yaml @@ -325,34 +325,6 @@ quantization: params: [vector, "[start]", "[end]"] desc: x example: select 'todo'; - -numpy: - vec_npy_each: - params: [npy_array] - desc: | - xxx - example: - - | - -- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone() - select - rowid, - vector, - vec_type(vector), - vec_to_json(vector) - from vec_npy_each( - X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040' - ) - - | - -- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone() - select - rowid, - vector, - vec_type(vector), - vec_to_json(vector) - from vec_npy_each( - 
X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040' - ) - vec0: vec0: params: [] @@ -367,30 +339,3 @@ vec0: values (1, '[1, 1, 1, 1]'), (2, '[2, 2, 2, 2]'), (3, '[3, 3, 3, 3]'); - -entrypoints: - {} - #sqlite3_vec_init: - # desc: | - # asdf - #sqlite3_vec_fs_read_init: - # desc: | - # asdf -#table_functions: -# vec_each: -# columns: [rowid, value] -# inputs: ["vector"] -# desc: -# example: -#virtual_tables: -# vec0: -# desc: -# example: -#entrypoints: -# sqlite3_vec_init: {} -# sqlite3_vec_fs_read_init: {} -#compile_options: -# - SQLITE_VEC_ENABLE_AVX -# - SQLITE_VEC_ENABLE_NEON -# - SQLITE_VEC_OMIT_FS -# diff --git a/site/versioning.md b/site/versioning.md index 7e4ca41..6928b42 100644 --- a/site/versioning.md +++ b/site/versioning.md @@ -4,51 +4,16 @@ [Semantic Versioning](https://semver.org/), so "minor" release like "0.2.0" or "0.3.0" may contain breaking changes. -But what exactly counts as a "breaking change" in a SQLite extension? The line -isn't so clear, unforetunately. Here are a all the surfaces that COULD count as -a "breaking change": +Only SQL functions, table functions, and virtual tables that are defined in the default `sqlite3_vec_init` entrypoint are considered as the `sqlite-vec` API for semantic versioning. This means that other entrypoints and other SQL functions should be considered unstable, untested, and possibly dangerous. -- SQL functions and columns on virtual tables -- The C API (extension entrypoints) -- "Bindings" like the official `pip` and `npm` packages -- Release assets like the pre-compile extensions +For the SQL API, a "breaking change" would include: -## What counts as a "breaking change"? 
+- Removing a function or module +- Changing the number or types of arguments for an SQL function +- Changing the required arguments, or their positions, of a table function +- Changing the `CREATE VIRTUAL TABLE` constructor of a virtual table in a backwards-incompatible way +- Removing columns from a virtual table or table function -### Changes to SQL functions -- Re-naming or removing an SQL function -- Changing the number of required SQL parameters - -### Changes to SQL virtual tables - -- The number of - -### Changes to the C API - -Currently there is no "official" C API for `sqlite-vec`. However, there are -entrypoints defined in C that C developers or developers using FFI can call. Any changes to these entrypoints would be a breaking change. - -### Compile-time options - -The removal of any compile time options - -## When is `v1.0` coming? - -In a few months! The main problems I want to solve before `v1.0` include: - -- Metadata columns -- Metadata filtering -- ANN indexing -- Quantization + pre-transformations - -Once those items are complete, I will likely create a `v1.0` release, along with -renaming the `vec0` virtual table modile to `vec1`. And if future major releases -are required, a `v2.0` major releases will be made with new `vec2` virtual -tables and so on. - -Ideally, only a `v1` major release would be required. But who knows what the -future has in store with vector search! - -In general, I will try my best to maximize stability and limit the number of -breaking changes for future `sqlite-vec` versions. +The official "bindings" to `sqlite-vec`, including the Python/Node.js/Ruby/Go/Rust packages, are subject to change and are not covered by semantic versioning. +I have no plans to change or break them, though, and would include notes in the changelogs if that ever needs to happen. 
diff --git a/sqlite-vec.c b/sqlite-vec.c index bee9973..211aaff 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -7038,7 +7038,6 @@ SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg, // clang-format off {"vec0", &vec0Module, NULL, NULL}, {"vec_each", &vec_eachModule, NULL, NULL}, - {"vec_npy_each", &vec_npy_eachModule, NULL, NULL}, // clang-format on }; @@ -7066,7 +7065,7 @@ SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg, } #ifndef SQLITE_VEC_OMIT_FS -SQLITE_VEC_API int sqlite3_vec_fs_read_init(sqlite3 *db, char **pzErrMsg, +SQLITE_VEC_API int sqlite3_vec_numpy_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi) { UNUSED_PARAMETER(pzErrMsg); #ifndef SQLITE_CORE @@ -7075,6 +7074,10 @@ SQLITE_VEC_API int sqlite3_vec_fs_read_init(sqlite3 *db, char **pzErrMsg, int rc = SQLITE_OK; rc = sqlite3_create_function_v2(db, "vec_npy_file", 1, SQLITE_RESULT_SUBTYPE, NULL, vec_npy_file, NULL, NULL, NULL); + if(rc != SQLITE_OK) { + return rc; + } + rc = sqlite3_create_module_v2(db, "vec_npy_each", &vec_npy_eachModule, NULL, NULL); return rc; } #endif diff --git a/sqlite-vec.h.tmpl b/sqlite-vec.h.tmpl index 923453e..770e20f 100644 --- a/sqlite-vec.h.tmpl +++ b/sqlite-vec.h.tmpl @@ -25,19 +25,9 @@ extern "C" { #endif -#ifdef _WIN32 -__declspec(dllexport) -#endif -int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg, +SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi); -#ifdef _WIN32 -__declspec(dllexport) -#endif -int sqlite3_vec_fs_read_init(sqlite3 *db, char **pzErrMsg, - const sqlite3_api_routines *pApi); - - #ifdef __cplusplus } /* end of the 'extern "C"' block */ #endif diff --git a/tests/test-loadable.py b/tests/test-loadable.py index ab50515..b1976cb 100644 --- a/tests/test-loadable.py +++ b/tests/test-loadable.py @@ -119,7 +119,6 @@ FUNCTIONS = [ MODULES = [ "vec0", "vec_each", - "vec_npy_each", # "vec_static_blob_entries", # "vec_static_blobs", ] @@ -1619,6 +1618,7 @@ def 
to_npy(arr): def test_vec_npy_each(): + db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init") vec_npy_each = lambda *args: execute_all( db, "select rowid, * from vec_npy_each(?)", args ) @@ -1651,6 +1651,7 @@ def test_vec_npy_each(): def test_vec_npy_each_errors(): + db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init") vec_npy_each = lambda *args: execute_all( db, "select rowid, * from vec_npy_each(?)", args ) @@ -1769,7 +1770,7 @@ import tempfile def test_vec_npy_each_errors_files(): - db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_fs_read_init") + db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init") def vec_npy_each(data): with tempfile.NamedTemporaryFile(delete_on_close=False) as f: @@ -2274,36 +2275,42 @@ def test_smoke(): db.execute("insert into vec_xyz(rowid, a) select 2, X'0000000000000040'") chunk = db.execute("select * from vec_xyz_chunks").fetchone() - assert chunk[ - "rowids" - ] == b"\x01\x00\x00\x00\x00\x00\x00\x00" + b"\x02\x00\x00\x00\x00\x00\x00\x00" + bytearray( - int(1024 * 8) - 8 * 2 + assert ( + chunk["rowids"] + == b"\x01\x00\x00\x00\x00\x00\x00\x00" + + b"\x02\x00\x00\x00\x00\x00\x00\x00" + + bytearray(int(1024 * 8) - 8 * 2) ) assert chunk["chunk_id"] == 1 assert chunk["validity"] == b"\x03" + bytearray(int(1024 / 8) - 1) vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone() assert vchunk["rowid"] == 1 - assert vchunk[ - "vectors" - ] == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + b"\x00\x00\x00\x00\x00\x00\x00\x40" + bytearray( - int(1024 * 4 * 2) - (2 * 4 * 2) + assert ( + vchunk["vectors"] + == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + + b"\x00\x00\x00\x00\x00\x00\x00\x40" + + bytearray(int(1024 * 4 * 2) - (2 * 4 * 2)) ) db.execute("insert into vec_xyz(rowid, a) select 3, X'00000000000080bf'") chunk = db.execute("select * from vec_xyz_chunks").fetchone() assert chunk["chunk_id"] == 1 assert chunk["validity"] == b"\x07" + bytearray(int(1024 / 8) - 1) - assert chunk[ - "rowids" - ] == 
b"\x01\x00\x00\x00\x00\x00\x00\x00" + b"\x02\x00\x00\x00\x00\x00\x00\x00" + b"\x03\x00\x00\x00\x00\x00\x00\x00" + bytearray( - int(1024 * 8) - 8 * 3 + assert ( + chunk["rowids"] + == b"\x01\x00\x00\x00\x00\x00\x00\x00" + + b"\x02\x00\x00\x00\x00\x00\x00\x00" + + b"\x03\x00\x00\x00\x00\x00\x00\x00" + + bytearray(int(1024 * 8) - 8 * 3) ) vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone() assert vchunk["rowid"] == 1 - assert vchunk[ - "vectors" - ] == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + b"\x00\x00\x00\x00\x00\x00\x00\x40" + b"\x00\x00\x00\x00\x00\x00\x80\xbf" + bytearray( - int(1024 * 4 * 2) - (2 * 4 * 3) + assert ( + vchunk["vectors"] + == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + + b"\x00\x00\x00\x00\x00\x00\x00\x40" + + b"\x00\x00\x00\x00\x00\x00\x80\xbf" + + bytearray(int(1024 * 4 * 2) - (2 * 4 * 3)) ) # db.execute("select * from vec_xyz") @@ -2346,66 +2353,63 @@ def test_vec0_stress_small_chunks(): {"rowid": 994, "a": _f32([99.4] * 8)}, {"rowid": 993, "a": _f32([99.3] * 8)}, ] - assert ( - execute_all( - db, - """ + assert execute_all( + db, + """ select rowid, a, distance from vec_small where a match ? 
and k = 9 order by distance """, - [_f32([50.0] * 8)], - ) - == [ - { - "a": _f32([500 * 0.1] * 8), - "distance": 0.0, - "rowid": 500, - }, - { - "a": _f32([501 * 0.1] * 8), - "distance": 0.2828384041786194, - "rowid": 501, - }, - { - "a": _f32([499 * 0.1] * 8), - "distance": 0.2828384041786194, - "rowid": 499, - }, - { - "a": _f32([502 * 0.1] * 8), - "distance": 0.5656875967979431, - "rowid": 502, - }, - { - "a": _f32([498 * 0.1] * 8), - "distance": 0.5656875967979431, - "rowid": 498, - }, - { - "a": _f32([503 * 0.1] * 8), - "distance": 0.8485260009765625, - "rowid": 503, - }, - { - "a": _f32([497 * 0.1] * 8), - "distance": 0.8485260009765625, - "rowid": 497, - }, - { - "a": _f32([496 * 0.1] * 8), - "distance": 1.1313751935958862, - "rowid": 496, - }, - { - "a": _f32([504 * 0.1] * 8), - "distance": 1.1313751935958862, - "rowid": 504, - }, - ] - ) + [_f32([50.0] * 8)], + ) == [ + { + "a": _f32([500 * 0.1] * 8), + "distance": 0.0, + "rowid": 500, + }, + { + "a": _f32([501 * 0.1] * 8), + "distance": 0.2828384041786194, + "rowid": 501, + }, + { + "a": _f32([499 * 0.1] * 8), + "distance": 0.2828384041786194, + "rowid": 499, + }, + { + "a": _f32([502 * 0.1] * 8), + "distance": 0.5656875967979431, + "rowid": 502, + }, + { + "a": _f32([498 * 0.1] * 8), + "distance": 0.5656875967979431, + "rowid": 498, + }, + { + "a": _f32([503 * 0.1] * 8), + "distance": 0.8485260009765625, + "rowid": 503, + }, + { + "a": _f32([497 * 0.1] * 8), + "distance": 0.8485260009765625, + "rowid": 497, + }, + { + "a": _f32([496 * 0.1] * 8), + "distance": 1.1313751935958862, + "rowid": 496, + }, + { + "a": _f32([504 * 0.1] * 8), + "distance": 1.1313751935958862, + "rowid": 504, + }, + ] def test_vec0_distance_metric():