From e99e31feb78c3cabd46a2a5f7e115b72633cc65f Mon Sep 17 00:00:00 2001
From: Alex Garcia <alexsebastian.garcia@gmail.com>
Date: Tue, 19 Nov 2024 22:03:31 -0800
Subject: [PATCH] add broken shadow table test

---
 ARCHITECTURE.md                       |  44 ++++++---
 TODO                                  |  38 ++++----
 sqlite-vec.c                          |   2 +-
 tests/__snapshots__/test-general.ambr | 123 ++++++++++++++++++++++++++
 tests/test-general.py                 |  55 ++++++++++++
 5 files changed, 226 insertions(+), 36 deletions(-)
 create mode 100644 tests/__snapshots__/test-general.ambr
 create mode 100644 tests/test-general.py

diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 9bc40ab..f93e846 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -1,3 +1,13 @@
+# `sqlite-vec` Architecture
+
+Internal documentation for how `sqlite-vec` works under the hood. It is not meant
+for users of the `sqlite-vec` project; consult
+[the official `sqlite-vec` documentation](https://alexgarcia.xyz/sqlite-vec) for
+how-to guides. Rather, this is for people interested in how `sqlite-vec` works,
+with some guidelines for any future contributors.
+
+Very much a WIP.
+
 ## `vec0`
 
 ### Shadow Tables
@@ -9,7 +19,6 @@
 - `validity BLOB`
 - `rowids BLOB`
 
-
 #### `xyz_rowids`
 
 - `rowid INTEGER`
@@ -32,7 +41,6 @@
 - `rowid INTEGER`
 - `data BLOB`
 
-
 #### `xyz_metadata_text_data_00`
 
 - `rowid INTEGER`
@@ -52,8 +60,11 @@ The "header" charcter denotes the type of query plan, as determined by the
 | `VEC0_QUERY_PLAN_POINT`    | `'2'` | Perform a single-lookup point query for the provided rowid             |
 | `VEC0_QUERY_PLAN_KNN`      | `'3'` | Perform a KNN-style query on the provided query vector and parameters. |
 
-Each 4-character "block" is associated with a corresponding value in `argv[]`. For example, the 1st block at byte offset `1-4` (inclusive) is the 1st block and is associated with `argv[1]`. The 2nd block at byte offset `5-8` (inclusive) is associated with `argv[2]` and so on. Each block describes what kind of value or filter the given `argv[i]` value is.
-
+Each 4-character "block" is associated with a corresponding value in `argv[]`.
+For example, the 1st block at byte offset `1-4` (inclusive) is the 1st block and
+is associated with `argv[1]`. The 2nd block at byte offset `5-8` (inclusive) is
+associated with `argv[2]` and so on. Each block describes what kind of value or
+filter the given `argv[i]` value is.
 
 #### `VEC0_IDXSTR_KIND_KNN_MATCH` (`'{'`)
 
@@ -69,7 +80,8 @@ The remaining 3 characters of the block are `_` fillers.
 
 #### `VEC0_IDXSTR_KIND_KNN_ROWID_IN` (`'['`)
 
-`argv[i]` is the optional `rowid in (...)` value, and must be handled with [`sqlite3_vtab_in_first()` /
+`argv[i]` is the optional `rowid in (...)` value, and must be handled with
+[`sqlite3_vtab_in_first()` /
 `sqlite3_vtab_in_next()`](https://www.sqlite.org/c3ref/vtab_in_first.html).
 
 The remaining 3 characters of the block are `_` fillers.
@@ -78,13 +90,16 @@ The remaining 3 characters of the block are `_` fillers.
 
 `argv[i]` is a "constraint" on a specific partition key.
 
-The second character of the block denotes which partition key to filter on, using `A` to denote the first partition key column, `B` for the second, etc. It is encoded with `'A' + partition_idx` and can be decoded with `c - 'A'`.
+The second character of the block denotes which partition key to filter on,
+using `A` to denote the first partition key column, `B` for the second, etc. It
+is encoded with `'A' + partition_idx` and can be decoded with `c - 'A'`.
 
-The third character of the block denotes which operator is used in the constraint. It will be one of the values of `enum vec0_partition_operator`, as only a subset of operations are supported on partition keys.
+The third character of the block denotes which operator is used in the
+constraint. It will be one of the values of `enum vec0_partition_operator`, as
+only a subset of operations are supported on partition keys.
 
 The fourth character of the block is a `_` filler.
 
-
 #### `VEC0_IDXSTR_KIND_POINT_ID` (`'!'`)
 
 `argv[i]` is the value of the rowid or id to match against for the point query.
@@ -93,11 +108,16 @@ The remaining 3 characters of the block are `_` fillers.
 
 #### `VEC0_IDXSTR_KIND_METADATA_CONSTRAINT` (`'&'`)
 
-`argv[i]` is the value of the `WHERE` constraint for a metdata column in a KNN query.
+`argv[i]` is the value of the `WHERE` constraint for a metadata column in a KNN
+query.
 
-The second character of the block denotes which metadata column the constraint belongs to, using `A` to denote the first metadata column column, `B` for the second, etc. It is encoded with `'A' + metadata_idx` and can be decoded with `c - 'A'`.
+The second character of the block denotes which metadata column the constraint
+belongs to, using `A` to denote the first metadata column, `B` for the
+second, etc. It is encoded with `'A' + metadata_idx` and can be decoded with
+`c - 'A'`.
 
-The third character of the block is the constraint operator. It will be one of `enum vec0_metadata_operator`, as only a subset of operators are supported on metadata column KNN filters.
+The third character of the block is the constraint operator. It will be one of
+`enum vec0_metadata_operator`, as only a subset of operators are supported on
+metadata column KNN filters.
 
 The foruth character of the block is a `_` filler.
-
diff --git a/TODO b/TODO
index 828d0f4..b3962b7 100644
--- a/TODO
+++ b/TODO
@@ -1,25 +1,17 @@
-# partition
-
-- [ ] UPDATE on partition key values
-  - remove previous row from chunk, insert into new one?
-- [ ] properly sqlite3_vtab_nochange / sqlite3_value_nochange handling
-
-
-# auxiliary columns
-
-- later:
-  - NOT NULL?
-  - perf: INSERT stmt should be cached on vec0_vtab
-  - perf: LEFT JOIN aux table to rowids query in vec0_cursor for rowid/point stmts, to avoid N lookup queries
-
-# metadata filtering
-- `v in (...)` handling
-- [ ] test accessing aux values when rowid is different than 1,2,3 etc.
 - [ ] add `xyz_info` shadow table with version etc.
+
 - later
-  - null!
-  - date/datetime
-  - remaining TODO items
-  - skip invalid validity entries in knn filter?
-  - dictionary encoding?
-  - partition `x in (...)` handling
+  - [ ] partition: UPDATE support
+  - [ ] skip invalid validity entries in knn filter?
+  - [ ] nulls in metadata
+  - [ ] partition `x in (...)` handling
+  - [ ] blobs/date/datetime
+  - [ ] uuid/ulid perf
+  - [ ] Aux columns: `NOT NULL` constraint
+  - [ ] Metadata columns: `NOT NULL` constraint
+  - [ ] Partition key: `NOT NULL` constraint
+  - [ ] dictionary encoding?
+  - [ ] properly sqlite3_vtab_nochange / sqlite3_value_nochange handling
+  - [ ] perf
+    - [ ] aux: cache INSERT
+    - [ ] aux: LEFT JOIN on `_rowids` queries to avoid N lookup queries
diff --git a/sqlite-vec.c b/sqlite-vec.c
index f3a9bb5..55c9972 100644
--- a/sqlite-vec.c
+++ b/sqlite-vec.c
@@ -8759,7 +8759,7 @@ static int vec0Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv,
 }
 
 static int vec0ShadowName(const char *zName) {
-  static const char *azName[] = {"rowids", "chunks", "auxiliary", "vector_chunks", "metadata_chunks"};
+  static const char *azName[] = {"rowids", "chunks", "auxiliary"};
 
   for (size_t i = 0; i < sizeof(azName) / sizeof(azName[0]); i++) {
     if (sqlite3_stricmp(zName, azName[i]) == 0)
diff --git a/tests/__snapshots__/test-general.ambr b/tests/__snapshots__/test-general.ambr
new file mode 100644
index 0000000..6497685
--- /dev/null
+++ b/tests/__snapshots__/test-general.ambr
@@ -0,0 +1,123 @@
+# serializer version: 1
+# name: test_shadow
+  OrderedDict({
+    'sql': 'select * from sqlite_master order by name',
+    'rows': list([
+      OrderedDict({
+        'type': 'index',
+        'name': 'sqlite_autoindex_v_metadata_chunks00_1',
+        'tbl_name': 'v_metadata_chunks00',
+        'rootpage': 8,
+        'sql': None,
+      }),
+      OrderedDict({
+        'type': 'index',
+        'name': 'sqlite_autoindex_v_metadata_text_data_00_1',
+        'tbl_name': 'v_metadata_text_data_00',
+        'rootpage': 10,
+        'sql': None,
+      }),
+      OrderedDict({
+        'type': 'index',
+        'name': 'sqlite_autoindex_v_vector_chunks00_1',
+        'tbl_name': 'v_vector_chunks00',
+        'rootpage': 6,
+        'sql': None,
+      }),
+      OrderedDict({
+        'type': 'table',
+        'name': 'sqlite_sequence',
+        'tbl_name': 'sqlite_sequence',
+        'rootpage': 3,
+        'sql': 'CREATE TABLE sqlite_sequence(name,seq)',
+      }),
+      OrderedDict({
+        'type': 'table',
+        'name': 'v',
+        'tbl_name': 'v',
+        'rootpage': 0,
+        'sql': 'CREATE VIRTUAL TABLE v using vec0(a float[1], partition text partition key, metadata text, +name text, chunk_size=8)',
+      }),
+      OrderedDict({
+        'type': 'table',
+        'name': 'v_auxiliary',
+        'tbl_name': 'v_auxiliary',
+        'rootpage': 11,
+        'sql': 'CREATE TABLE "v_auxiliary"( rowid integer PRIMARY KEY , value00)',
+      }),
+      OrderedDict({
+        'type': 'table',
+        'name': 'v_chunks',
+        'tbl_name': 'v_chunks',
+        'rootpage': 2,
+        'sql': 'CREATE TABLE "v_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,sequence_id integer,partition00,validity BLOB NOT NULL, rowids BLOB NOT NULL)',
+      }),
+      OrderedDict({
+        'type': 'table',
+        'name': 'v_metadata_chunks00',
+        'tbl_name': 'v_metadata_chunks00',
+        'rootpage': 7,
+        'sql': 'CREATE TABLE "v_metadata_chunks00"(rowid PRIMARY KEY, data BLOB NOT NULL)',
+      }),
+      OrderedDict({
+        'type': 'table',
+        'name': 'v_metadata_text_data_00',
+        'tbl_name': 'v_metadata_text_data_00',
+        'rootpage': 9,
+        'sql': 'CREATE TABLE "v_metadata_text_data_00"(rowid PRIMARY KEY, data TEXT)',
+      }),
+      OrderedDict({
+        'type': 'table',
+        'name': 'v_rowids',
+        'tbl_name': 'v_rowids',
+        'rootpage': 4,
+        'sql': 'CREATE TABLE "v_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)',
+      }),
+      OrderedDict({
+        'type': 'table',
+        'name': 'v_vector_chunks00',
+        'tbl_name': 'v_vector_chunks00',
+        'rootpage': 5,
+        'sql': 'CREATE TABLE "v_vector_chunks00"(rowid PRIMARY KEY,vectors BLOB NOT NULL)',
+      }),
+    ]),
+  })
+# ---
+# name: test_shadow.1
+  OrderedDict({
+    'sql': "select * from pragma_table_list where type = 'shadow'",
+    'rows': list([
+      OrderedDict({
+        'schema': 'main',
+        'name': 'v_auxiliary',
+        'type': 'shadow',
+        'ncol': 2,
+        'wr': 0,
+        'strict': 0,
+      }),
+      OrderedDict({
+        'schema': 'main',
+        'name': 'v_rowids',
+        'type': 'shadow',
+        'ncol': 4,
+        'wr': 0,
+        'strict': 0,
+      }),
+      OrderedDict({
+        'schema': 'main',
+        'name': 'v_chunks',
+        'type': 'shadow',
+        'ncol': 6,
+        'wr': 0,
+        'strict': 0,
+      }),
+    ]),
+  })
+# ---
+# name: test_shadow.2
+  OrderedDict({
+    'sql': "select * from pragma_table_list where type = 'shadow'",
+    'rows': list([
+    ]),
+  })
+# ---
diff --git a/tests/test-general.py b/tests/test-general.py
new file mode 100644
index 0000000..294164b
--- /dev/null
+++ b/tests/test-general.py
@@ -0,0 +1,55 @@
+import sqlite3
+from collections import OrderedDict
+import pytest
+
+
+@pytest.mark.skipif(
+    sqlite3.sqlite_version_info < (3, 37, 0),
+    reason="pragma_table_list was added in SQLite 3.37",
+)
+def test_shadow(db, snapshot):
+    """Snapshot sqlite_master and the shadow-table list around a vec0 table's life."""
+    shadow_sql = "select * from pragma_table_list where type = 'shadow'"
+    db.execute(
+        "create virtual table v using vec0(a float[1], partition text partition key, metadata text, +name text, chunk_size=8)"
+    )
+    assert exec(db, "select * from sqlite_master order by name") == snapshot()
+    assert exec(db, shadow_sql) == snapshot()
+
+    # Dropping the virtual table should leave no shadow tables behind; the
+    # final snapshot records whatever pragma_table_list still reports.
+    db.execute("drop table v;")
+    assert exec(db, shadow_sql) == snapshot()
+
+
+def exec(db, sql, parameters=()):
+    """Run `sql` on `db` and return a snapshot-friendly OrderedDict.
+
+    Returns {"sql": sql, "rows": [OrderedDict per row, ...]} on success, or
+    {"error": <exception class name>, "message": str(e)} if the statement
+    raises sqlite3.DatabaseError. Rows must support .keys() (sqlite3.Row).
+    NOTE: the name shadows the builtin exec() within this module.
+    """
+    # Immutable default: a shared `[]` default is a mutable-default hazard.
+    try:
+        rows = db.execute(sql, parameters).fetchall()
+    except sqlite3.DatabaseError as e:  # OperationalError is a subclass
+        return {"error": e.__class__.__name__, "message": str(e)}
+
+    result = OrderedDict()
+    result["sql"] = sql
+    result["rows"] = [OrderedDict((k, row[k]) for k in row.keys()) for row in rows]
+    return result
+
+
+def vec0_shadow_table_contents(db, v):
+    """Dump every sqlite_master entry named `<v>_...` as {name: exec(db, select *)}.
+
+    `_`, `%` and `\\` in `v` are escaped so LIKE matches the prefix literally.
+    """
+    prefix = v.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
+    names = db.execute(
+        "select name from sqlite_master where name like ? escape '\\' order by 1",
+        [prefix + "\\_%"],
+    ).fetchall()
+    return {row[0]: exec(db, f"select * from {row[0]}") for row in names}