From e99e31feb78c3cabd46a2a5f7e115b72633cc65f Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 19 Nov 2024 22:03:31 -0800 Subject: [PATCH] add broken shadow table test --- ARCHITECTURE.md | 44 ++++++--- TODO | 38 ++++---- sqlite-vec.c | 2 +- tests/__snapshots__/test-general.ambr | 123 ++++++++++++++++++++++++++ tests/test-general.py | 55 ++++++++++++ 5 files changed, 226 insertions(+), 36 deletions(-) create mode 100644 tests/__snapshots__/test-general.ambr create mode 100644 tests/test-general.py diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 9bc40ab..f93e846 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,3 +1,13 @@ +# `sqlite-vec` Architecture + +Internal documentation for how `sqlite-vec` works under-the-hood. Not meant for +users of the `sqlite-vec` project, consult +[the official `sqlite-vec` documentation](https://alexgarcia.xyz/sqlite-vec) for +how-to-guides. Rather, this is for people interested in how `sqlite-vec` works +and some guidelines to any future contributors. + +Very much a WIP. + ## `vec0` ### Shadow Tables @@ -9,7 +19,6 @@ - `validity BLOB` - `rowids BLOB` - #### `xyz_rowids` - `rowid INTEGER` @@ -32,7 +41,6 @@ - `rowid INTEGER` - `data BLOB` - #### `xyz_metadata_text_data_00` - `rowid INTEGER` @@ -52,8 +60,11 @@ The "header" charcter denotes the type of query plan, as determined by the | `VEC0_QUERY_PLAN_POINT` | `'2'` | Perform a single-lookup point query for the provided rowid | | `VEC0_QUERY_PLAN_KNN` | `'3'` | Perform a KNN-style query on the provided query vector and parameters. | -Each 4-character "block" is associated with a corresponding value in `argv[]`. For example, the 1st block at byte offset `1-4` (inclusive) is the 1st block and is associated with `argv[1]`. The 2nd block at byte offset `5-8` (inclusive) is associated with `argv[2]` and so on. Each block describes what kind of value or filter the given `argv[i]` value is. - +Each 4-character "block" is associated with a corresponding value in `argv[]`. 
+For example, the 1st block at byte offset `1-4` (inclusive) is the 1st block and +is associated with `argv[1]`. The 2nd block at byte offset `5-8` (inclusive) is +associated with `argv[2]` and so on. Each block describes what kind of value or +filter the given `argv[i]` value is. #### `VEC0_IDXSTR_KIND_KNN_MATCH` (`'{'`) @@ -69,7 +80,8 @@ The remaining 3 characters of the block are `_` fillers. #### `VEC0_IDXSTR_KIND_KNN_ROWID_IN` (`'['`) -`argv[i]` is the optional `rowid in (...)` value, and must be handled with [`sqlite3_vtab_in_first()` / +`argv[i]` is the optional `rowid in (...)` value, and must be handled with +[`sqlite3_vtab_in_first()` / `sqlite3_vtab_in_next()`](https://www.sqlite.org/c3ref/vtab_in_first.html). The remaining 3 characters of the block are `_` fillers. @@ -78,13 +90,16 @@ The remaining 3 characters of the block are `_` fillers. `argv[i]` is a "constraint" on a specific partition key. -The second character of the block denotes which partition key to filter on, using `A` to denote the first partition key column, `B` for the second, etc. It is encoded with `'A' + partition_idx` and can be decoded with `c - 'A'`. +The second character of the block denotes which partition key to filter on, +using `A` to denote the first partition key column, `B` for the second, etc. It +is encoded with `'A' + partition_idx` and can be decoded with `c - 'A'`. -The third character of the block denotes which operator is used in the constraint. It will be one of the values of `enum vec0_partition_operator`, as only a subset of operations are supported on partition keys. +The third character of the block denotes which operator is used in the +constraint. It will be one of the values of `enum vec0_partition_operator`, as +only a subset of operations are supported on partition keys. The fourth character of the block is a `_` filler. - #### `VEC0_IDXSTR_KIND_POINT_ID` (`'!'`) `argv[i]` is the value of the rowid or id to match against for the point query. 
@@ -93,11 +108,16 @@ The remaining 3 characters of the block are `_` fillers. #### `VEC0_IDXSTR_KIND_METADATA_CONSTRAINT` (`'&'`) -`argv[i]` is the value of the `WHERE` constraint for a metdata column in a KNN query. +`argv[i]` is the value of the `WHERE` constraint for a metadata column in a KNN +query. -The second character of the block denotes which metadata column the constraint belongs to, using `A` to denote the first metadata column column, `B` for the second, etc. It is encoded with `'A' + metadata_idx` and can be decoded with `c - 'A'`. +The second character of the block denotes which metadata column the constraint +belongs to, using `A` to denote the first metadata column, `B` for the +second, etc. It is encoded with `'A' + metadata_idx` and can be decoded with +`c - 'A'`. -The third character of the block is the constraint operator. It will be one of `enum vec0_metadata_operator`, as only a subset of operators are supported on metadata column KNN filters. +The third character of the block is the constraint operator. It will be one of +`enum vec0_metadata_operator`, as only a subset of operators are supported on +metadata column KNN filters. The foruth character of the block is a `_` filler. - diff --git a/TODO b/TODO index 828d0f4..b3962b7 100644 --- a/TODO +++ b/TODO @@ -1,25 +1,17 @@ -# partition - -- [ ] UPDATE on partition key values - - remove previous row from chunk, insert into new one? -- [ ] properly sqlite3_vtab_nochange / sqlite3_value_nochange handling - - -# auxiliary columns - -- later: - - NOT NULL? - - perf: INSERT stmt should be cached on vec0_vtab - - perf: LEFT JOIN aux table to rowids query in vec0_cursor for rowid/point stmts, to avoid N lookup queries - -# metadata filtering -- `v in (...)` handling -- [ ] test accessing aux values when rowid is different than 1,2,3 etc. - [ ] add `xyz_info` shadow table with version etc. + - later - - null! 
- - date/datetime - - remaining TODO items - - skip invalid validity entries in knn filter? - - dictionary encoding? - - partition `x in (...)` handling + - [ ] partition: UPDATE support + - [ ] skip invalid validity entries in knn filter? + - [ ] nulls in metadata + - [ ] partition `x in (...)` handling + - [ ] blobs/date/datetime + - [ ] uuid/ulid perf + - [ ] Aux columns: `NOT NULL` constraint + - [ ] Metadata columns: `NOT NULL` constraint + - [ ] Partition key: `NOT NULL` constraint + - [ ] dictionary encoding? + - [ ] properly sqlite3_vtab_nochange / sqlite3_value_nochange handling + - [ ] perf + - [ ] aux: cache INSERT + - [ ] aux: LEFT JOIN on `_rowids` queries to avoid N lookup queries diff --git a/sqlite-vec.c b/sqlite-vec.c index f3a9bb5..55c9972 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -8759,7 +8759,7 @@ static int vec0Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, } static int vec0ShadowName(const char *zName) { - static const char *azName[] = {"rowids", "chunks", "auxiliary", "vector_chunks", "metadata_chunks"}; + static const char *azName[] = {"rowids", "chunks", "auxiliary"}; for (size_t i = 0; i < sizeof(azName) / sizeof(azName[0]); i++) { if (sqlite3_stricmp(zName, azName[i]) == 0) diff --git a/tests/__snapshots__/test-general.ambr b/tests/__snapshots__/test-general.ambr new file mode 100644 index 0000000..6497685 --- /dev/null +++ b/tests/__snapshots__/test-general.ambr @@ -0,0 +1,123 @@ +# serializer version: 1 +# name: test_shadow + OrderedDict({ + 'sql': 'select * from sqlite_master order by name', + 'rows': list([ + OrderedDict({ + 'type': 'index', + 'name': 'sqlite_autoindex_v_metadata_chunks00_1', + 'tbl_name': 'v_metadata_chunks00', + 'rootpage': 8, + 'sql': None, + }), + OrderedDict({ + 'type': 'index', + 'name': 'sqlite_autoindex_v_metadata_text_data_00_1', + 'tbl_name': 'v_metadata_text_data_00', + 'rootpage': 10, + 'sql': None, + }), + OrderedDict({ + 'type': 'index', + 'name': 'sqlite_autoindex_v_vector_chunks00_1', 
+ 'tbl_name': 'v_vector_chunks00', + 'rootpage': 6, + 'sql': None, + }), + OrderedDict({ + 'type': 'table', + 'name': 'sqlite_sequence', + 'tbl_name': 'sqlite_sequence', + 'rootpage': 3, + 'sql': 'CREATE TABLE sqlite_sequence(name,seq)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v', + 'tbl_name': 'v', + 'rootpage': 0, + 'sql': 'CREATE VIRTUAL TABLE v using vec0(a float[1], partition text partition key, metadata text, +name text, chunk_size=8)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v_auxiliary', + 'tbl_name': 'v_auxiliary', + 'rootpage': 11, + 'sql': 'CREATE TABLE "v_auxiliary"( rowid integer PRIMARY KEY , value00)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v_chunks', + 'tbl_name': 'v_chunks', + 'rootpage': 2, + 'sql': 'CREATE TABLE "v_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,sequence_id integer,partition00,validity BLOB NOT NULL, rowids BLOB NOT NULL)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v_metadata_chunks00', + 'tbl_name': 'v_metadata_chunks00', + 'rootpage': 7, + 'sql': 'CREATE TABLE "v_metadata_chunks00"(rowid PRIMARY KEY, data BLOB NOT NULL)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v_metadata_text_data_00', + 'tbl_name': 'v_metadata_text_data_00', + 'rootpage': 9, + 'sql': 'CREATE TABLE "v_metadata_text_data_00"(rowid PRIMARY KEY, data TEXT)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v_rowids', + 'tbl_name': 'v_rowids', + 'rootpage': 4, + 'sql': 'CREATE TABLE "v_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v_vector_chunks00', + 'tbl_name': 'v_vector_chunks00', + 'rootpage': 5, + 'sql': 'CREATE TABLE "v_vector_chunks00"(rowid PRIMARY KEY,vectors BLOB NOT NULL)', + }), + ]), + }) +# --- +# name: test_shadow.1 + OrderedDict({ + 'sql': "select * from pragma_table_list where type = 'shadow'", + 'rows': list([ + OrderedDict({ + 'schema': 'main', + 'name': 
"""General ``vec0`` tests plus helpers for snapshotting query results."""

import re
import sqlite3
from collections import OrderedDict

import pytest

# Restrict star-imports to the reusable helpers: pytest collects every
# ``test_*`` name present in a module namespace, so re-exporting the tests
# would make an importing module run them a second time.
__all__ = ["exec", "vec0_shadow_table_contents"]

# Names matching this pattern are interpolated into SQL unquoted, which keeps
# the query text recorded by ``exec`` unchanged for ordinary table names.
_PLAIN_IDENTIFIER = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")


@pytest.mark.skipif(
    # Compare the whole version tuple, not just the minor component.
    sqlite3.sqlite_version_info < (3, 37),
    reason="pragma_table_list was added in SQLite 3.37",
)
def test_shadow(db, snapshot):
    """Snapshot the schema a ``vec0`` table creates and its shadow tables.

    Three snapshots are taken, in this order: the full ``sqlite_master``
    listing, the tables SQLite reports as ``shadow`` while ``v`` exists, and
    the same query again after ``v`` has been dropped.
    """
    shadow_tables_sql = "select * from pragma_table_list where type = 'shadow'"

    db.execute(
        "create virtual table v using vec0(a float[1], partition text partition key, metadata text, +name text, chunk_size=8)"
    )
    assert exec(db, "select * from sqlite_master order by name") == snapshot()
    assert exec(db, shadow_tables_sql) == snapshot()

    db.execute("drop table v;")
    assert exec(db, shadow_tables_sql) == snapshot()


def exec(db, sql, parameters=()):  # noqa: A001 - name kept for existing callers
    """Run *sql* on *db* and describe the outcome in a snapshot-friendly form.

    Args:
        db: Connection whose rows support ``.keys()`` and lookup by column
            name (for example ``sqlite3.Row`` as the row factory).
        sql: Statement to execute.
        parameters: Values bound to the statement's placeholders. The default
            is an immutable empty tuple rather than a shared list.

    Returns:
        On success, an ``OrderedDict`` with ``"sql"`` (the statement text) and
        ``"rows"`` (a list with one ``OrderedDict`` per row, mapping column
        name to value). If SQLite raises ``sqlite3.DatabaseError`` or a
        subclass such as ``OperationalError``, a plain dict with ``"error"``
        (the exception class name) and ``"message"`` instead.
    """
    try:
        rows = db.execute(sql, parameters).fetchall()
    except sqlite3.DatabaseError as e:
        # OperationalError derives from DatabaseError, so one clause covers both.
        return {
            "error": e.__class__.__name__,
            "message": str(e),
        }
    result = OrderedDict()
    result["sql"] = sql
    result["rows"] = [
        OrderedDict((key, row[key]) for key in row.keys()) for row in rows
    ]
    return result


def _escape_like(text):
    """Escape LIKE wildcards in *text* for use with ``ESCAPE '\\'``."""
    return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def _table_ref(name):
    """Return *name* as a SQL table reference, double-quoted only if needed."""
    if _PLAIN_IDENTIFIER.fullmatch(name):
        return name
    return '"' + name.replace('"', '""') + '"'


def vec0_shadow_table_contents(db, v):
    """Dump every ``sqlite_master`` entry whose name starts with ``<v>_``.

    Args:
        db: Connection to inspect; see ``exec`` for the row requirements.
        v: Name prefix, i.e. the virtual table whose companions are wanted.

    Returns:
        A dict, ordered by name, mapping each matching name to the result of
        ``exec(db, "select * from <name>")``.
    """
    # ``_`` is a single-character wildcard in LIKE; escape it (and anything in
    # *v*) so that e.g. ``v`` does not also match a table called ``vx_other``.
    pattern = _escape_like(v) + "\\_%"
    names = [
        row[0]
        for row in db.execute(
            "select name from sqlite_master where name like ? escape '\\' order by 1",
            [pattern],
        ).fetchall()
    ]
    return {name: exec(db, f"select * from {_table_ref(name)}") for name in names}