From 8132f26f3b7daa8f688aa41fe6e7ddea6960825e Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Wed, 13 Nov 2024 23:36:46 -0800 Subject: [PATCH] long text support --- sqlite-vec.c | 89 +++++++++++--- tests/__snapshots__/test-metadata.ambr | 155 +++++++++++++++++++++++++ tests/test-metadata.py | 15 +++ 3 files changed, 244 insertions(+), 15 deletions(-) diff --git a/sqlite-vec.c b/sqlite-vec.c index 5615a96..f49d442 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -3419,6 +3419,7 @@ static sqlite3_module vec_npy_eachModule = { #define VEC0_SHADOW_AUXILIARY_NAME "\"%w\".\"%w_auxiliary\"" #define VEC0_SHADOW_METADATA_N_NAME "\"%w\".\"%w_metadata_chunks%02d\"" +#define VEC0_SHADOW_METADATA_TEXT_DATA_NAME "\"%w\".\"%w_metadata_text_data_%02d\"" #define VEC_INTERAL_ERROR "Internal sqlite-vec error: " #define REPORT_URL "https://github.com/asg017/sqlite-vec/issues/new" @@ -4094,8 +4095,23 @@ int vec0_result_metadata_value_for_rowid(vec0_vtab *p, i64 rowid, int metadata_i sqlite3_result_text(context, (const char*) (view + 4), length, SQLITE_TRANSIENT); } else { - fprintf(stderr, "TODO: handle longer strings in result_metadata_value\n"); - abort(); + sqlite3_stmt * stmt; + const char * zSql = sqlite3_mprintf("SELECT data FROM " VEC0_SHADOW_METADATA_TEXT_DATA_NAME " WHERE rowid = ?", p->schemaName, p->tableName, metadata_idx); + if(!zSql) { + abort(); + } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + if(rc != SQLITE_OK) { + abort(); + } + sqlite3_bind_int64(stmt, 1, rowid); + rc = sqlite3_step(stmt); + if(rc != SQLITE_ROW) { + abort(); + } + sqlite3_result_value(context, sqlite3_column_value(stmt, 0)); + sqlite3_finalize(stmt); + rc = SQLITE_OK; } break; } @@ -5032,6 +5048,25 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, goto error; } sqlite3_finalize(stmt); + + if(pNew->metadata_columns[i].kind == VEC0_METADATA_COLUMN_KIND_TEXT) { + char *zSql = sqlite3_mprintf("CREATE TABLE " VEC0_SHADOW_METADATA_TEXT_DATA_NAME "(rowid PRIMARY KEY, data TEXT);", + pNew->schemaName, pNew->tableName, i); + if (!zSql) { + goto error; + } + rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + sqlite3_finalize(stmt); + *pzErr = sqlite3_mprintf( + "Could not create '_metadata_text_data_%02d' shadow table: %s", i, + sqlite3_errmsg(db)); + goto error; + } + sqlite3_finalize(stmt); + + } } if(pNew->numAuxiliaryColumns > 0) { @@ -5149,6 +5184,17 @@ static int vec0Destroy(sqlite3_vtab *pVtab) { goto done; } sqlite3_finalize(stmt); + + if(p->metadata_columns[i].kind == VEC0_METADATA_COLUMN_KIND_TEXT) { + zSql = sqlite3_mprintf("DROP TABLE " VEC0_SHADOW_METADATA_TEXT_DATA_NAME, p->schemaName,p->tableName, i); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + goto done; + } + sqlite3_finalize(stmt); + } } stmt = NULL; @@ -7305,7 +7351,7 @@ cleanup: * @param chunk_offset offset the row/metadata value is assigned to * @return int */ -int vec0_insert_metadata_values(vec0_vtab *p, int argc, sqlite3_value ** argv, i64 chunk_id, i64 chunk_offset) { +int vec0_insert_metadata_values(vec0_vtab *p, int argc, sqlite3_value ** argv, i64 chunk_id, i64 chunk_offset, i64 rowid) { int rc; for(int i = 0; i < vec0_num_defined_user_columns(p); i++) { if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_METADATA) { @@ -7389,21 +7435,34 @@ int vec0_insert_metadata_values(vec0_vtab *p, int argc, sqlite3_value ** argv, i case VEC0_METADATA_COLUMN_KIND_TEXT: { char * s = sqlite3_value_text(v); int n = sqlite3_value_bytes(v); - if(n <= 12) { - u8 view[VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH]; - memset(view, 0, VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH); - memcpy(view, &n, sizeof(int)); - memcpy(view+4, s, n); + u8 view[VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH]; + memset(view, 0, VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH); + memcpy(view, &n, sizeof(int)); + memcpy(view+4, s, min(n, VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH-4)); - rc = sqlite3_blob_write(blobValue, &view, VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH, chunk_offset * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH); - } - else { - fprintf(stderr, "TODO handle longer strings"); - abort(); + rc = sqlite3_blob_write(blobValue, &view, VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH, chunk_offset * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH); + if(n > 12) { + const char * zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_METADATA_TEXT_DATA_NAME " (rowid, data) VALUES (?, ?)", p->schemaName, p->tableName, metadata_idx); + if(!zSql) { + abort(); + } + sqlite3_stmt * stmt; + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + if(rc != SQLITE_OK) { + abort(); + } + sqlite3_bind_int64(stmt, 1, rowid); + sqlite3_bind_text(stmt, 2, s, n, SQLITE_STATIC); + rc = sqlite3_step(stmt); + if(rc != SQLITE_DONE) { + abort(); + } + sqlite3_finalize(stmt); } break; } } + printf("rc=%d\n", rc); if(rc != SQLITE_OK) { } @@ -7631,7 +7690,7 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, goto cleanup; } - rc = vec0_insert_metadata_values(p, argc, argv, chunk_rowid, chunk_offset); + rc = vec0_insert_metadata_values(p, argc, argv, chunk_rowid, chunk_offset, rowid); if(rc != SQLITE_OK) { goto cleanup; } @@ -7993,7 +8052,7 @@ int vec0Update_Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv) { } } - // TODO handle metadata + // TODO handle metadata updates // 4) iterate over all new vectors, update the vectors for (int i = 0; i < vec0_num_defined_user_columns(p); i++) { diff --git a/tests/__snapshots__/test-metadata.ambr b/tests/__snapshots__/test-metadata.ambr index 973c1e3..87dcc60 100644 --- a/tests/__snapshots__/test-metadata.ambr +++ b/tests/__snapshots__/test-metadata.ambr @@ -251,6 +251,13 @@ 'rootpage': 7, 'sql': 'CREATE TABLE "v_metadata_chunks00"(rowid PRIMARY KEY, data BLOB NOT NULL)', }), + OrderedDict({ + 'type': 'table', + 'name': 'v_metadata_text_data_00', + 'tbl_name': 'v_metadata_text_data_00', + 'rootpage': 9, + 'sql': 'CREATE TABLE "v_metadata_text_data_00"(rowid PRIMARY KEY, data TEXT)', + }), OrderedDict({ 'type': 'table', 'name': 'v_rowids', @@ -268,6 +275,111 @@ ]), }) # --- +# name: test_long_text + dict({ + 'v_chunks': OrderedDict({ + 'sql': 'select * from v_chunks', + 'rows': list([ + ]), + }), + 'v_metadata_chunks00': OrderedDict({ + 'sql': 'select * from v_metadata_chunks00', + 'rows': list([ + ]), + }), + 'v_metadata_text_data_00': OrderedDict({ + 'sql': 'select * from v_metadata_text_data_00', + 'rows': list([ + ]), + }), + 'v_rowids': OrderedDict({ + 'sql': 'select * from v_rowids', + 'rows': list([ + ]), + }), + 'v_vector_chunks00': OrderedDict({ + 'sql': 'select * from v_vector_chunks00', + 'rows': list([ + ]), + }), + }) +# --- +# name: test_long_text.1 + OrderedDict({ + 'sql': 'select * from v', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vector': b'\x11\x11\x11\x11', + 'name': '123456789a12', + }), + OrderedDict({ + 'rowid': 2, + 'vector': b'\x11\x11\x11\x11', + 'name': '123456789a123', + }), + ]), + }) +# --- +# name: test_long_text.2 + dict({ + 'v_chunks': OrderedDict({ + 'sql': 'select * from v_chunks', + 'rows': list([ + OrderedDict({ + 'chunk_id': 1, + 'size': 8, + 'validity': b'\x03', + 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 'v_metadata_chunks00': OrderedDict({ + 'sql': 'select * from v_metadata_chunks00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'data': b'\x0c\x00\x00\x00123456789a12\r\x00\x00\x00123456789a12\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 'v_metadata_text_data_00': OrderedDict({ + 'sql': 'select * from v_metadata_text_data_00', + 'rows': list([ + OrderedDict({ + 'rowid': 2, + 'data': '123456789a123', + }), + ]), + }), + 'v_rowids': OrderedDict({ + 'sql': 'select * from v_rowids', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 0, + }), + OrderedDict({ + 'rowid': 2, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 1, + }), + ]), + }), + 'v_vector_chunks00': OrderedDict({ + 'sql': 'select * from v_vector_chunks00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vectors': b'\x11\x11\x11\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + }) +# --- # name: test_normal.1 dict({ 'v_chunks': OrderedDict({ @@ -295,6 +407,11 @@ 'rows': list([ ]), }), + 'v_metadata_text_data_03': OrderedDict({ + 'sql': 'select * from v_metadata_text_data_03', + 'rows': list([ + ]), + }), 'v_rowids': OrderedDict({ 'sql': 'select * from v_rowids', 'rows': list([ @@ -408,6 +525,11 @@ }), ]), }), + 'v_metadata_text_data_03': OrderedDict({ + 'sql': 'select * from v_metadata_text_data_03', + 'rows': list([ + ]), + }), 'v_rowids': OrderedDict({ 'sql': 'select * from v_rowids', 'rows': list([ @@ -442,6 +564,27 @@ }), }) # --- +# name: test_normal.7 + OrderedDict({ + 'sql': 'drop table v', + 'rows': list([ + ]), + }) +# --- +# name: test_normal.8 + OrderedDict({ + 'sql': 'select * from sqlite_master', + 'rows': list([ + OrderedDict({ + 'type': 'table', + 'name': 'sqlite_sequence', + 'tbl_name': 'sqlite_sequence', + 'rootpage': 3, + 'sql': 'CREATE TABLE sqlite_sequence(name,seq)', + }), + ]), + }) +# --- # name: test_normal[sqlite_master] OrderedDict({ 'sql': "select * from sqlite_master where type = 'table' order by name", @@ -495,6 +638,13 @@ 'rootpage': 13, 'sql': 'CREATE TABLE "v_metadata_chunks03"(rowid PRIMARY KEY, data BLOB NOT NULL)', }), + OrderedDict({ + 'type': 'table', + 'name': 'v_metadata_text_data_03', + 'tbl_name': 'v_metadata_text_data_03', + 'rootpage': 15, + 'sql': 'CREATE TABLE "v_metadata_text_data_03"(rowid PRIMARY KEY, data TEXT)', + }), OrderedDict({ 'type': 'table', 'name': 'v_rowids', @@ -732,6 +882,11 @@ }), ]), }), + 'vec_movies_metadata_text_data_01': OrderedDict({ + 'sql': 'select * from vec_movies_metadata_text_data_01', + 'rows': list([ + ]), + }), 'vec_movies_rowids': OrderedDict({ 'sql': 'select * from vec_movies_rowids', 'rows': list([ diff --git a/tests/test-metadata.py b/tests/test-metadata.py index b60c4d9..a9a4a05 100644 --- a/tests/test-metadata.py +++ b/tests/test-metadata.py @@ -33,6 +33,9 @@ def test_normal(db, snapshot): assert exec(db, "select * from v") == snapshot() assert vec0_shadow_table_contents(db, "v") == snapshot() + assert exec(db, "drop table v") == snapshot() + assert exec(db, "select * from sqlite_master") == snapshot() + # # assert exec(db, "select * from v") == snapshot() @@ -44,6 +47,18 @@ def test_normal(db, snapshot): # ) +def test_long_text(db, snapshot): + db.execute( + "create virtual table v using vec0(vector float[1], name text, chunk_size=8)" + ) + assert vec0_shadow_table_contents(db, "v") == snapshot() + INSERT = "insert into v(vector, name) values (?, ?)" + assert exec(db, INSERT, [b"\x11\x11\x11\x11", "123456789a12"]) + assert exec(db, INSERT, [b"\x11\x11\x11\x11", "123456789a123"]) + assert exec(db, "select * from v") == snapshot() + assert vec0_shadow_table_contents(db, "v") == snapshot() + + def test_types(db, snapshot): db.execute( "create virtual table v using vec0(vector float[1], b boolean, n int, f float, t text, chunk_size=8)"