Remove vec_npy_each from default entrypoint and move to sqlite3_vec_numpy_init entrypoint

This commit is contained in:
Alex Garcia 2024-09-25 23:07:17 -07:00
parent 70dce09747
commit 763aad5d6a
6 changed files with 92 additions and 187 deletions

View file

@ -2,7 +2,5 @@
#include "sqlite-vec.h" #include "sqlite-vec.h"
#include <stdio.h> #include <stdio.h>
int core_init(const char *dummy) { int core_init(const char *dummy) {
int rc = sqlite3_auto_extension((void *)sqlite3_vec_init); return sqlite3_auto_extension((void *)sqlite3_vec_init);
if(rc != SQLITE_OK) return rc;
return sqlite3_auto_extension((void *)sqlite3_vec_fs_read_init);
} }

View file

@ -325,34 +325,6 @@ quantization:
params: [vector, "[start]", "[end]"] params: [vector, "[start]", "[end]"]
desc: x desc: x
example: select 'todo'; example: select 'todo';
numpy:
vec_npy_each:
params: [npy_array]
desc: |
xxx
example:
- |
-- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone()
select
rowid,
vector,
vec_type(vector),
vec_to_json(vector)
from vec_npy_each(
X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040'
)
- |
-- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone()
select
rowid,
vector,
vec_type(vector),
vec_to_json(vector)
from vec_npy_each(
X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040'
)
vec0: vec0:
vec0: vec0:
params: [] params: []
@ -367,30 +339,3 @@ vec0:
values (1, '[1, 1, 1, 1]'), values (1, '[1, 1, 1, 1]'),
(2, '[2, 2, 2, 2]'), (2, '[2, 2, 2, 2]'),
(3, '[3, 3, 3, 3]'); (3, '[3, 3, 3, 3]');
entrypoints:
{}
#sqlite3_vec_init:
# desc: |
# asdf
#sqlite3_vec_fs_read_init:
# desc: |
# asdf
#table_functions:
# vec_each:
# columns: [rowid, value]
# inputs: ["vector"]
# desc:
# example:
#virtual_tables:
# vec0:
# desc:
# example:
#entrypoints:
# sqlite3_vec_init: {}
# sqlite3_vec_fs_read_init: {}
#compile_options:
# - SQLITE_VEC_ENABLE_AVX
# - SQLITE_VEC_ENABLE_NEON
# - SQLITE_VEC_OMIT_FS
#

View file

@ -4,51 +4,16 @@
[Semantic Versioning](https://semver.org/), so "minor" release like "0.2.0" or [Semantic Versioning](https://semver.org/), so "minor" release like "0.2.0" or
"0.3.0" may contain breaking changes. "0.3.0" may contain breaking changes.
But what exactly counts as a "breaking change" in a SQLite extension? The line Only SQL functions, table functions, and virtual tables that are defined in the default `sqlite3_vec_init` entrypoint are considered as the `sqlite-vec` API for semantic versioning. This means that other entrypoints and other SQL functions should be considered unstable, untested, and possibly dangerous.
isn't so clear, unfortunately. Here are all the surfaces that COULD count as
a "breaking change":
- SQL functions and columns on virtual tables For the SQL API, a "breaking change" would include:
- The C API (extension entrypoints)
- "Bindings" like the official `pip` and `npm` packages
- Release assets like the pre-compiled extensions
## What counts as a "breaking change"? - Removing a function or module
- Changing the number or types of arguments for an SQL function
- Changing the required arguments, or their positions, of a table function
- Changing the `CREATE VIRTUAL TABLE` constructor of a virtual table in a backwards-incompatible way
- Removing columns from a virtual table or table function
### Changes to SQL functions
- Re-naming or removing an SQL function The official "bindings" to `sqlite-vec`, including the Python/Node.js/Ruby/Go/Rust packages, are subject to change and are not covered by semantic versioning.
- Changing the number of required SQL parameters Though I have no plans to change or break them, and would include notes in changelogs if that ever needs to happen.
### Changes to SQL virtual tables
- The number of
### Changes to the C API
Currently there is no "official" C API for `sqlite-vec`. However, there are
entrypoints defined in C that C developers or developers using FFI can call. Any changes to these entrypoints would be a breaking change.
### Compile-time options
The removal of any compile time options
## When is `v1.0` coming?
In a few months! The main problems I want to solve before `v1.0` include:
- Metadata columns
- Metadata filtering
- ANN indexing
- Quantization + pre-transformations
Once those items are complete, I will likely create a `v1.0` release, along with
renaming the `vec0` virtual table module to `vec1`. And if future major releases
are required, a `v2.0` major release will be made with new `vec2` virtual
tables and so on.
Ideally, only a `v1` major release would be required. But who knows what the
future has in store with vector search!
In general, I will try my best to maximize stability and limit the number of
breaking changes for future `sqlite-vec` versions.

View file

@ -7038,7 +7038,6 @@ SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg,
// clang-format off // clang-format off
{"vec0", &vec0Module, NULL, NULL}, {"vec0", &vec0Module, NULL, NULL},
{"vec_each", &vec_eachModule, NULL, NULL}, {"vec_each", &vec_eachModule, NULL, NULL},
{"vec_npy_each", &vec_npy_eachModule, NULL, NULL},
// clang-format on // clang-format on
}; };
@ -7066,7 +7065,7 @@ SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg,
} }
#ifndef SQLITE_VEC_OMIT_FS #ifndef SQLITE_VEC_OMIT_FS
SQLITE_VEC_API int sqlite3_vec_fs_read_init(sqlite3 *db, char **pzErrMsg, SQLITE_VEC_API int sqlite3_vec_numpy_init(sqlite3 *db, char **pzErrMsg,
const sqlite3_api_routines *pApi) { const sqlite3_api_routines *pApi) {
UNUSED_PARAMETER(pzErrMsg); UNUSED_PARAMETER(pzErrMsg);
#ifndef SQLITE_CORE #ifndef SQLITE_CORE
@ -7075,6 +7074,10 @@ SQLITE_VEC_API int sqlite3_vec_fs_read_init(sqlite3 *db, char **pzErrMsg,
int rc = SQLITE_OK; int rc = SQLITE_OK;
rc = sqlite3_create_function_v2(db, "vec_npy_file", 1, SQLITE_RESULT_SUBTYPE, rc = sqlite3_create_function_v2(db, "vec_npy_file", 1, SQLITE_RESULT_SUBTYPE,
NULL, vec_npy_file, NULL, NULL, NULL); NULL, vec_npy_file, NULL, NULL, NULL);
if(rc != SQLITE_OK) {
return rc;
}
rc = sqlite3_create_module_v2(db, "vec_npy_each", &vec_npy_eachModule, NULL, NULL);
return rc; return rc;
} }
#endif #endif

View file

@ -25,19 +25,9 @@
extern "C" { extern "C" {
#endif #endif
#ifdef _WIN32 SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg,
__declspec(dllexport)
#endif
int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg,
const sqlite3_api_routines *pApi); const sqlite3_api_routines *pApi);
#ifdef _WIN32
__declspec(dllexport)
#endif
int sqlite3_vec_fs_read_init(sqlite3 *db, char **pzErrMsg,
const sqlite3_api_routines *pApi);
#ifdef __cplusplus #ifdef __cplusplus
} /* end of the 'extern "C"' block */ } /* end of the 'extern "C"' block */
#endif #endif

View file

@ -119,7 +119,6 @@ FUNCTIONS = [
MODULES = [ MODULES = [
"vec0", "vec0",
"vec_each", "vec_each",
"vec_npy_each",
# "vec_static_blob_entries", # "vec_static_blob_entries",
# "vec_static_blobs", # "vec_static_blobs",
] ]
@ -1619,6 +1618,7 @@ def to_npy(arr):
def test_vec_npy_each(): def test_vec_npy_each():
db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init")
vec_npy_each = lambda *args: execute_all( vec_npy_each = lambda *args: execute_all(
db, "select rowid, * from vec_npy_each(?)", args db, "select rowid, * from vec_npy_each(?)", args
) )
@ -1651,6 +1651,7 @@ def test_vec_npy_each():
def test_vec_npy_each_errors(): def test_vec_npy_each_errors():
db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init")
vec_npy_each = lambda *args: execute_all( vec_npy_each = lambda *args: execute_all(
db, "select rowid, * from vec_npy_each(?)", args db, "select rowid, * from vec_npy_each(?)", args
) )
@ -1769,7 +1770,7 @@ import tempfile
def test_vec_npy_each_errors_files(): def test_vec_npy_each_errors_files():
db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_fs_read_init") db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init")
def vec_npy_each(data): def vec_npy_each(data):
with tempfile.NamedTemporaryFile(delete_on_close=False) as f: with tempfile.NamedTemporaryFile(delete_on_close=False) as f:
@ -2274,36 +2275,42 @@ def test_smoke():
db.execute("insert into vec_xyz(rowid, a) select 2, X'0000000000000040'") db.execute("insert into vec_xyz(rowid, a) select 2, X'0000000000000040'")
chunk = db.execute("select * from vec_xyz_chunks").fetchone() chunk = db.execute("select * from vec_xyz_chunks").fetchone()
assert chunk[ assert (
"rowids" chunk["rowids"]
] == b"\x01\x00\x00\x00\x00\x00\x00\x00" + b"\x02\x00\x00\x00\x00\x00\x00\x00" + bytearray( == b"\x01\x00\x00\x00\x00\x00\x00\x00"
int(1024 * 8) - 8 * 2 + b"\x02\x00\x00\x00\x00\x00\x00\x00"
+ bytearray(int(1024 * 8) - 8 * 2)
) )
assert chunk["chunk_id"] == 1 assert chunk["chunk_id"] == 1
assert chunk["validity"] == b"\x03" + bytearray(int(1024 / 8) - 1) assert chunk["validity"] == b"\x03" + bytearray(int(1024 / 8) - 1)
vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone() vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone()
assert vchunk["rowid"] == 1 assert vchunk["rowid"] == 1
assert vchunk[ assert (
"vectors" vchunk["vectors"]
] == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + b"\x00\x00\x00\x00\x00\x00\x00\x40" + bytearray( == b"\x00\x00\x00\x00\x00\x00\x80\x3f"
int(1024 * 4 * 2) - (2 * 4 * 2) + b"\x00\x00\x00\x00\x00\x00\x00\x40"
+ bytearray(int(1024 * 4 * 2) - (2 * 4 * 2))
) )
db.execute("insert into vec_xyz(rowid, a) select 3, X'00000000000080bf'") db.execute("insert into vec_xyz(rowid, a) select 3, X'00000000000080bf'")
chunk = db.execute("select * from vec_xyz_chunks").fetchone() chunk = db.execute("select * from vec_xyz_chunks").fetchone()
assert chunk["chunk_id"] == 1 assert chunk["chunk_id"] == 1
assert chunk["validity"] == b"\x07" + bytearray(int(1024 / 8) - 1) assert chunk["validity"] == b"\x07" + bytearray(int(1024 / 8) - 1)
assert chunk[ assert (
"rowids" chunk["rowids"]
] == b"\x01\x00\x00\x00\x00\x00\x00\x00" + b"\x02\x00\x00\x00\x00\x00\x00\x00" + b"\x03\x00\x00\x00\x00\x00\x00\x00" + bytearray( == b"\x01\x00\x00\x00\x00\x00\x00\x00"
int(1024 * 8) - 8 * 3 + b"\x02\x00\x00\x00\x00\x00\x00\x00"
+ b"\x03\x00\x00\x00\x00\x00\x00\x00"
+ bytearray(int(1024 * 8) - 8 * 3)
) )
vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone() vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone()
assert vchunk["rowid"] == 1 assert vchunk["rowid"] == 1
assert vchunk[ assert (
"vectors" vchunk["vectors"]
] == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + b"\x00\x00\x00\x00\x00\x00\x00\x40" + b"\x00\x00\x00\x00\x00\x00\x80\xbf" + bytearray( == b"\x00\x00\x00\x00\x00\x00\x80\x3f"
int(1024 * 4 * 2) - (2 * 4 * 3) + b"\x00\x00\x00\x00\x00\x00\x00\x40"
+ b"\x00\x00\x00\x00\x00\x00\x80\xbf"
+ bytearray(int(1024 * 4 * 2) - (2 * 4 * 3))
) )
# db.execute("select * from vec_xyz") # db.execute("select * from vec_xyz")
@ -2346,8 +2353,7 @@ def test_vec0_stress_small_chunks():
{"rowid": 994, "a": _f32([99.4] * 8)}, {"rowid": 994, "a": _f32([99.4] * 8)},
{"rowid": 993, "a": _f32([99.3] * 8)}, {"rowid": 993, "a": _f32([99.3] * 8)},
] ]
assert ( assert execute_all(
execute_all(
db, db,
""" """
select rowid, a, distance select rowid, a, distance
@ -2357,8 +2363,7 @@ def test_vec0_stress_small_chunks():
order by distance order by distance
""", """,
[_f32([50.0] * 8)], [_f32([50.0] * 8)],
) ) == [
== [
{ {
"a": _f32([500 * 0.1] * 8), "a": _f32([500 * 0.1] * 8),
"distance": 0.0, "distance": 0.0,
@ -2405,7 +2410,6 @@ def test_vec0_stress_small_chunks():
"rowid": 504, "rowid": 504,
}, },
] ]
)
def test_vec0_distance_metric(): def test_vec0_distance_metric():