add broken shadow table test

2026-06-17 15:35:22 +02:00 · 2024-11-19 22:03:31 -08:00 · 2024-11-19 22:03:31 -08:00 · e99e31feb7
commit e99e31feb7
parent a657b3a216
5 changed files with 226 additions and 36 deletions
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@ -1,3 +1,13 @@
 # `sqlite-vec` Architecture
 Internal documentation for how `sqlite-vec` works under-the-hood. Not meant for
 users of the `sqlite-vec` project, consult
 [the official `sqlite-vec` documentation](https://alexgarcia.xyz/sqlite-vec) for
 how-to-guides. Rather, this is for people interested in how `sqlite-vec` works
 and some guidelines to any future contributors.
 Very much a WIP.
 ## `vec0`
 ### Shadow Tables
@ -9,7 +19,6 @@
 - `validity BLOB`
 - `rowids BLOB`
 #### `xyz_rowids`
 - `rowid INTEGER`
@ -32,7 +41,6 @@
 - `rowid INTEGER`
 - `data BLOB`
 #### `xyz_metadata_text_data_00`
 - `rowid INTEGER`
@ -52,8 +60,11 @@ The "header" charcter denotes the type of query plan, as determined by the
 | `VEC0_QUERY_PLAN_POINT`    | `'2'` | Perform a single-lookup point query for the provided rowid             |
 | `VEC0_QUERY_PLAN_KNN`      | `'3'` | Perform a KNN-style query on the provided query vector and parameters. |
-Each 4-character "block" is associated with a corresponding value in `argv[]`. For example, the 1st block at byte offset `1-4` (inclusive) is the 1st block and is associated with `argv[1]`. The 2nd block at byte offset `5-8` (inclusive) is associated with `argv[2]` and so on. Each block describes what kind of value or filter the given `argv[i]` value is.
+Each 4-character "block" is associated with a corresponding value in `argv[]`.
-
+For example, the 1st block at byte offset `1-4` (inclusive)
 is associated with `argv[1]`. The 2nd block at byte offset `5-8` (inclusive) is
 associated with `argv[2]` and so on. Each block describes what kind of value or
 filter the given `argv[i]` value is.
 #### `VEC0_IDXSTR_KIND_KNN_MATCH` (`'{'`)
@ -69,7 +80,8 @@ The remaining 3 characters of the block are `_` fillers.
 #### `VEC0_IDXSTR_KIND_KNN_ROWID_IN` (`'['`)
-`argv[i]` is the optional `rowid in (...)` value, and must be handled with [`sqlite3_vtab_in_first()` /
+`argv[i]` is the optional `rowid in (...)` value, and must be handled with
 [`sqlite3_vtab_in_first()` /
 `sqlite3_vtab_in_next()`](https://www.sqlite.org/c3ref/vtab_in_first.html).
 The remaining 3 characters of the block are `_` fillers.
@ -78,13 +90,16 @@ The remaining 3 characters of the block are `_` fillers.
 `argv[i]` is a "constraint" on a specific partition key.
-The second character of the block denotes which partition key to filter on, using `A` to denote the first partition key column, `B` for the second, etc. It is encoded with `'A' + partition_idx` and can be decoded with `c - 'A'`.
+The second character of the block denotes which partition key to filter on,
 using `A` to denote the first partition key column, `B` for the second, etc. It
 is encoded with `'A' + partition_idx` and can be decoded with `c - 'A'`.
-The third character of the block denotes which operator is used in the constraint. It will be one of the values of `enum vec0_partition_operator`, as only a subset of operations are supported on partition keys.
+The third character of the block denotes which operator is used in the
 constraint. It will be one of the values of `enum vec0_partition_operator`, as
 only a subset of operations are supported on partition keys.
 The fourth character of the block is a `_` filler.
 #### `VEC0_IDXSTR_KIND_POINT_ID` (`'!'`)
 `argv[i]` is the value of the rowid or id to match against for the point query.
@ -93,11 +108,16 @@ The remaining 3 characters of the block are `_` fillers.
 #### `VEC0_IDXSTR_KIND_METADATA_CONSTRAINT` (`'&'`)
-`argv[i]` is the value of the `WHERE` constraint for a metdata column in a KNN query.
+`argv[i]` is the value of the `WHERE` constraint for a metadata column in a KNN
 query.
-The second character of the block denotes which metadata column the constraint belongs to, using `A` to denote the first metadata column column, `B` for the second, etc. It is encoded with `'A' + metadata_idx` and can be decoded with `c - 'A'`.
+The second character of the block denotes which metadata column the constraint
 belongs to, using `A` to denote the first metadata column, `B` for the
 second, etc. It is encoded with `'A' + metadata_idx` and can be decoded with
 `c - 'A'`.
-The third character of the block is the constraint operator. It will be one of `enum vec0_metadata_operator`, as only a subset of operators are supported on metadata column KNN filters.
+The third character of the block is the constraint operator. It will be one of
 `enum vec0_metadata_operator`, as only a subset of operators are supported on
 metadata column KNN filters.
 The fourth character of the block is a `_` filler.
--- a/38
+++ b/38
@ -1,25 +1,17 @@
 # partition
 - [ ] UPDATE on partition key values
  - remove previous row from chunk, insert into new one?
 - [ ] properly sqlite3_vtab_nochange / sqlite3_value_nochange handling
 # auxiliary columns
 - later:
  - NOT NULL?
  - perf: INSERT stmt should be cached on vec0_vtab
  - perf: LEFT JOIN aux table to rowids query in vec0_cursor for rowid/point stmts, to avoid N lookup queries
 # metadata filtering
 - `v in (...)` handling
 - [ ] test accessing aux values when rowid is different than 1,2,3 etc.
 - [ ] add `xyz_info` shadow table with version etc.
 - later
-  - null!
+  - [ ] partition: UPDATE support
-  - date/datetime
+  - [ ] skip invalid validity entries in knn filter?
-  - remaining TODO items
+  - [ ] nulls in metadata
-  - skip invalid validity entries in knn filter?
+  - [ ] partition `x in (...)` handling
-  - dictionary encoding?
+  - [ ] blobs/date/datetime
-  - partition `x in (...)` handling
+  - [ ] uuid/ulid perf
  - [ ] Aux columns: `NOT NULL` constraint
  - [ ] Metadata columns: `NOT NULL` constraint
   - [ ] Partition key: `NOT NULL` constraint
  - [ ] dictionary encoding?
  - [ ] properly sqlite3_vtab_nochange / sqlite3_value_nochange handling
  - [ ] perf
    - [ ] aux: cache INSERT
    - [ ] aux: LEFT JOIN on `_rowids` queries to avoid N lookup queries
--- a/sqlite-vec.c
+++ b/sqlite-vec.c
@ -8759,7 +8759,7 @@ static int vec0Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv,
 }
 static int vec0ShadowName(const char *zName) {
-  static const char *azName[] = {"rowids", "chunks", "auxiliary", "vector_chunks", "metadata_chunks"};
+  static const char *azName[] = {"rowids", "chunks", "auxiliary"};
  for (size_t i = 0; i < sizeof(azName) / sizeof(azName[0]); i++) {
    if (sqlite3_stricmp(zName, azName[i]) == 0)
--- a/tests/snapshots/test-general.ambr
+++ b/tests/snapshots/test-general.ambr
@ -0,0 +1,123 @@
 # serializer version: 1
 # name: test_shadow
  OrderedDict({
    'sql': 'select * from sqlite_master order by name',
    'rows': list([
      OrderedDict({
        'type': 'index',
        'name': 'sqlite_autoindex_v_metadata_chunks00_1',
        'tbl_name': 'v_metadata_chunks00',
        'rootpage': 8,
        'sql': None,
      }),
      OrderedDict({
        'type': 'index',
        'name': 'sqlite_autoindex_v_metadata_text_data_00_1',
        'tbl_name': 'v_metadata_text_data_00',
        'rootpage': 10,
        'sql': None,
      }),
      OrderedDict({
        'type': 'index',
        'name': 'sqlite_autoindex_v_vector_chunks00_1',
        'tbl_name': 'v_vector_chunks00',
        'rootpage': 6,
        'sql': None,
      }),
      OrderedDict({
        'type': 'table',
        'name': 'sqlite_sequence',
        'tbl_name': 'sqlite_sequence',
        'rootpage': 3,
        'sql': 'CREATE TABLE sqlite_sequence(name,seq)',
      }),
      OrderedDict({
        'type': 'table',
        'name': 'v',
        'tbl_name': 'v',
        'rootpage': 0,
        'sql': 'CREATE VIRTUAL TABLE v using vec0(a float[1], partition text partition key, metadata text, +name text, chunk_size=8)',
      }),
      OrderedDict({
        'type': 'table',
        'name': 'v_auxiliary',
        'tbl_name': 'v_auxiliary',
        'rootpage': 11,
        'sql': 'CREATE TABLE "v_auxiliary"( rowid integer PRIMARY KEY , value00)',
      }),
      OrderedDict({
        'type': 'table',
        'name': 'v_chunks',
        'tbl_name': 'v_chunks',
        'rootpage': 2,
        'sql': 'CREATE TABLE "v_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,sequence_id integer,partition00,validity BLOB NOT NULL, rowids BLOB NOT NULL)',
      }),
      OrderedDict({
        'type': 'table',
        'name': 'v_metadata_chunks00',
        'tbl_name': 'v_metadata_chunks00',
        'rootpage': 7,
        'sql': 'CREATE TABLE "v_metadata_chunks00"(rowid PRIMARY KEY, data BLOB NOT NULL)',
      }),
      OrderedDict({
        'type': 'table',
        'name': 'v_metadata_text_data_00',
        'tbl_name': 'v_metadata_text_data_00',
        'rootpage': 9,
        'sql': 'CREATE TABLE "v_metadata_text_data_00"(rowid PRIMARY KEY, data TEXT)',
      }),
      OrderedDict({
        'type': 'table',
        'name': 'v_rowids',
        'tbl_name': 'v_rowids',
        'rootpage': 4,
        'sql': 'CREATE TABLE "v_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)',
      }),
      OrderedDict({
        'type': 'table',
        'name': 'v_vector_chunks00',
        'tbl_name': 'v_vector_chunks00',
        'rootpage': 5,
        'sql': 'CREATE TABLE "v_vector_chunks00"(rowid PRIMARY KEY,vectors BLOB NOT NULL)',
      }),
    ]),
  })
 # ---
 # name: test_shadow.1
  OrderedDict({
    'sql': "select * from pragma_table_list where type = 'shadow'",
    'rows': list([
      OrderedDict({
        'schema': 'main',
        'name': 'v_auxiliary',
        'type': 'shadow',
        'ncol': 2,
        'wr': 0,
        'strict': 0,
      }),
      OrderedDict({
        'schema': 'main',
        'name': 'v_rowids',
        'type': 'shadow',
        'ncol': 4,
        'wr': 0,
        'strict': 0,
      }),
      OrderedDict({
        'schema': 'main',
        'name': 'v_chunks',
        'type': 'shadow',
        'ncol': 6,
        'wr': 0,
        'strict': 0,
      }),
    ]),
  })
 # ---
 # name: test_shadow.2
  OrderedDict({
    'sql': "select * from pragma_table_list where type = 'shadow'",
    'rows': list([
    ]),
  })
 # ---
--- a/tests/test-general.py
+++ b/tests/test-general.py
@ -0,0 +1,55 @@
 import sqlite3
 from collections import OrderedDict
 import pytest
@pytest.mark.skipif(
    # Compare (major, minor) as a tuple: checking the minor component alone
    # would give the wrong answer for any release whose major version is not 3.
    sqlite3.sqlite_version_info < (3, 37),
    reason="pragma_table_list was added in SQLite 3.37",
)
def test_shadow(db, snapshot):
    """Snapshot the schema of a vec0 table and its reported shadow tables.

    Three snapshots are recorded, in this order:
      1. every row of ``sqlite_master`` after creating the virtual table ``v``;
      2. the ``pragma_table_list`` rows whose type is ``'shadow'``;
      3. the same pragma query after ``v`` has been dropped.

    ``db`` and ``snapshot`` are pytest fixtures defined outside this block.
    """
    db.execute(
        "create virtual table v using vec0(a float[1], partition text partition key, metadata text, +name text, chunk_size=8)"
    )
    assert exec(db, "select * from sqlite_master order by name") == snapshot()

    # The same query runs before and after the drop; the snapshot order
    # (and therefore the snapshot names) must not change.
    shadow_tables_sql = "select * from pragma_table_list where type = 'shadow'"
    assert exec(db, shadow_tables_sql) == snapshot()

    db.execute("drop table v;")
    assert exec(db, shadow_tables_sql) == snapshot()
 def exec(db, sql, parameters=()):
    """Run ``sql`` on ``db`` and return a snapshot-friendly description.

    Note: this helper intentionally keeps its existing name even though it
    shadows the ``exec`` builtin inside this module.

    Args:
        db: Connection to execute on. Rows must support ``keys()`` and
            lookup by column name (presumably ``sqlite3.Row`` -- the
            fixture that configures this is not visible here).
        sql: The SQL statement to execute.
        parameters: Bound parameters forwarded to ``db.execute``. Defaults
            to an immutable empty tuple rather than a shared mutable list.

    Returns:
        On success, an ``OrderedDict`` with keys ``"sql"`` (the statement)
        and ``"rows"`` (a list with one ``OrderedDict`` per row, mapping
        column name to value). If the statement raises
        ``sqlite3.DatabaseError``, a plain dict with ``"error"`` (the
        exception class name) and ``"message"`` (``str`` of the exception).
    """
    try:
        rows = db.execute(sql, parameters).fetchall()
    except sqlite3.DatabaseError as e:
        # OperationalError is a subclass of DatabaseError, so this single
        # clause covers both of the originally listed exception types.
        return {
            "error": e.__class__.__name__,
            "message": str(e),
        }
    result = OrderedDict()
    result["sql"] = sql
    result["rows"] = [
        OrderedDict((k, row[k]) for k in row.keys()) for row in rows
    ]
    return result
 def vec0_shadow_table_contents(db, v):
    """Dump every schema object whose name starts with ``{v}_``.

    Args:
        db: Connection to query.
        v: Name prefix to look for, i.e. the name of the virtual table
            whose ``{v}_...`` companion tables should be dumped.

    Returns:
        A dict mapping each matching ``sqlite_master`` name (in name order)
        to the result of ``exec(db, "select * from <name>")``.
    """
    # `_` and `%` are wildcards in LIKE. Escape them (and the escape
    # character itself) so that only names with the literal prefix `{v}_`
    # match, rather than any name starting with `v` plus one more character.
    literal_prefix = (
        v.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )
    shadow_tables = [
        row[0]
        for row in db.execute(
            "select name from sqlite_master where name like ? escape '\\' order by 1",
            [f"{literal_prefix}\\_%"],
        ).fetchall()
    ]
    # The table name is interpolated unquoted on purpose: the statement text
    # is passed to exec() as-is, so changing it would change what is dumped.
    return {
        shadow_table: exec(db, f"select * from {shadow_table}")
        for shadow_table in shadow_tables
    }