From 6e2c4c6bab0edb2217120d96eb3050a4aa56a6ef Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 22:39:18 -0700 Subject: [PATCH] Add FTS5-style command column and runtime oversample for rescore Replace the old INSERT INTO t(rowid) VALUES('command') hack with a proper hidden command column named after the table (FTS5 pattern): INSERT INTO t(t) VALUES ('oversample=16') The command column is the first hidden column (before distance and k) to reserve ability for future table-valued function argument use. Schema: CREATE TABLE x(rowid, <cols>, "<table>" hidden, distance hidden, k hidden) For backwards compat, pre-v0.1.10 tables (detected via _info shadow table version) skip the command column to avoid name conflicts with user columns that may share the table's name. Verified with legacy fixture DB generated by sqlite-vec v0.1.6. Changes: - Add hidden command column to sqlite3_declare_vtab for new tables - Version-gate via _info shadow table for existing tables - Validate at CREATE time that no column name matches table name - Add rescore_handle_command() with oversample=N support - rescore_knn() prefers runtime oversample_search over CREATE default - Remove old rowid-based command dispatch - Migrate all DiskANN/IVF/fuzz tests and benchmarks to new syntax - Add legacy DB fixture (v0.1.6) and 9 backwards-compat tests Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks-ann/bench-delete/bench_delete.py | 2 +- benchmarks-ann/bench.py | 10 +- sqlite-vec-rescore.c | 25 +++- sqlite-vec.c | 141 ++++++++++++++++---- tests/fixtures/legacy-v0.1.6.db | Bin 0 -> 106496 bytes tests/fuzz/diskann-command-inject.c | 8 +- tests/fuzz/ivf-cell-overflow.c | 12 +- tests/fuzz/ivf-kmeans.c | 8 +- tests/fuzz/ivf-knn-deep.c | 6 +- tests/fuzz/ivf-operations.c | 8 +- tests/fuzz/ivf-quantize.c | 4 +- tests/fuzz/ivf-rescore.c | 6 +- tests/fuzz/ivf-shadow-corrupt.c | 10 +- tests/generate_legacy_db.py | 81 +++++++++++ tests/test-diskann.py | 10 +- tests/test-general.py | 12 ++ 
tests/test-ivf-mutations.py | 26 ++-- tests/test-ivf-quantization.py | 14 +- tests/test-ivf.py | 26 ++-- tests/test-legacy-compat.py | 138 +++++++++++++++++++ tests/test-rescore.py | 70 ++++++++++ 21 files changed, 512 insertions(+), 105 deletions(-) create mode 100644 tests/fixtures/legacy-v0.1.6.db create mode 100644 tests/generate_legacy_db.py create mode 100644 tests/test-legacy-compat.py diff --git a/benchmarks-ann/bench-delete/bench_delete.py b/benchmarks-ann/bench-delete/bench_delete.py index 802f0a4..0ebd2ec 100644 --- a/benchmarks-ann/bench-delete/bench_delete.py +++ b/benchmarks-ann/bench-delete/bench_delete.py @@ -159,7 +159,7 @@ INDEX_REGISTRY = { def _ivf_train(conn): """Trigger built-in k-means training for IVF.""" t0 = now_ns() - conn.execute("INSERT INTO vec_items(id) VALUES ('compute-centroids')") + conn.execute("INSERT INTO vec_items(vec_items) VALUES ('compute-centroids')") conn.commit() return ns_to_s(now_ns() - t0) diff --git a/benchmarks-ann/bench.py b/benchmarks-ann/bench.py index a4cbbe4..966c458 100644 --- a/benchmarks-ann/bench.py +++ b/benchmarks-ann/bench.py @@ -456,7 +456,7 @@ def _ivf_create_table_sql(params): def _ivf_post_insert_hook(conn, params): print(" Training k-means centroids (built-in)...", flush=True) t0 = time.perf_counter() - conn.execute("INSERT INTO vec_items(id) VALUES ('compute-centroids')") + conn.execute("INSERT INTO vec_items(vec_items) VALUES ('compute-centroids')") conn.commit() elapsed = time.perf_counter() - t0 print(f" Training done in {elapsed:.1f}s", flush=True) @@ -514,7 +514,7 @@ def _ivf_faiss_kmeans_hook(conn, params): for cid, blob in centroids: conn.execute( - "INSERT INTO vec_items(id, embedding) VALUES (?, ?)", + "INSERT INTO vec_items(vec_items, embedding) VALUES (?, ?)", (f"set-centroid:{cid}", blob), ) conn.commit() @@ -540,7 +540,7 @@ def _ivf_pre_query_hook(conn, params): nprobe = params.get("nprobe") if nprobe: conn.execute( - "INSERT INTO vec_items(id) VALUES (?)", + "INSERT INTO 
vec_items(vec_items) VALUES (?)", (f"nprobe={nprobe}",), ) conn.commit() @@ -572,7 +572,7 @@ INDEX_REGISTRY["ivf"] = { "insert_sql": None, "post_insert_hook": _ivf_post_insert_hook, "pre_query_hook": _ivf_pre_query_hook, - "train_sql": lambda _: "INSERT INTO vec_items(id) VALUES ('compute-centroids')", + "train_sql": lambda _: "INSERT INTO vec_items(vec_items) VALUES ('compute-centroids')", "run_query": None, "query_sql": None, "describe": _ivf_describe, @@ -616,7 +616,7 @@ def _diskann_pre_query_hook(conn, params): L_search = params.get("L_search", 0) if L_search: conn.execute( - "INSERT INTO vec_items(id) VALUES (?)", + "INSERT INTO vec_items(vec_items) VALUES (?)", (f"search_list_size_search={L_search}",), ) conn.commit() diff --git a/sqlite-vec-rescore.c b/sqlite-vec-rescore.c index 5432612..6a47214 100644 --- a/sqlite-vec-rescore.c +++ b/sqlite-vec-rescore.c @@ -351,7 +351,9 @@ static int rescore_knn(vec0_vtab *p, vec0_cursor *pCur, (void)pCur; (void)aMetadataIn; int rc = SQLITE_OK; - int oversample = vector_column->rescore.oversample; + int oversample = vector_column->rescore.oversample_search > 0 + ? vector_column->rescore.oversample_search + : vector_column->rescore.oversample; i64 k_oversample = k * oversample; if (k_oversample > 4096) k_oversample = 4096; @@ -640,6 +642,27 @@ cleanup: return rc; } +/** + * Handle FTS5-style command dispatch for rescore parameters. + * Returns SQLITE_OK if handled, SQLITE_EMPTY if not a rescore command. 
+ */ +static int rescore_handle_command(vec0_vtab *p, const char *command) { + if (strncmp(command, "oversample=", 11) == 0) { + int val = atoi(command + 11); + if (val < 1) { + vtab_set_error(&p->base, "oversample must be >= 1"); + return SQLITE_ERROR; + } + for (int i = 0; i < p->numVectorColumns; i++) { + if (p->vector_columns[i].index_type == VEC0_INDEX_TYPE_RESCORE) { + p->vector_columns[i].rescore.oversample_search = val; + } + } + return SQLITE_OK; + } + return SQLITE_EMPTY; +} + #ifdef SQLITE_VEC_TEST void _test_rescore_quantize_float_to_bit(const float *src, uint8_t *dst, size_t dim) { rescore_quantize_float_to_bit(src, dst, dim); diff --git a/sqlite-vec.c b/sqlite-vec.c index 16c3b4d..40fe0bf 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -2588,7 +2588,8 @@ enum Vec0RescoreQuantizerType { struct Vec0RescoreConfig { enum Vec0RescoreQuantizerType quantizer_type; - int oversample; + int oversample; // CREATE-time default + int oversample_search; // runtime override (0 = use default) }; #endif @@ -3399,8 +3400,9 @@ static sqlite3_module vec_eachModule = { #define VEC0_COLUMN_ID 0 #define VEC0_COLUMN_USERN_START 1 -#define VEC0_COLUMN_OFFSET_DISTANCE 1 -#define VEC0_COLUMN_OFFSET_K 2 +#define VEC0_COLUMN_OFFSET_COMMAND 1 +#define VEC0_COLUMN_OFFSET_DISTANCE 2 +#define VEC0_COLUMN_OFFSET_K 3 #define VEC0_SHADOW_INFO_NAME "\"%w\".\"%w_info\"" @@ -3498,6 +3500,10 @@ struct vec0_vtab { // Will change the schema of the _rowids table, and insert/query logic. int pkIsText; + // True if the hidden command column (named after the table) exists. + // Tables created before v0.1.10 or without _info table don't have it. + int hasCommandColumn; + // number of defined vector columns. 
int numVectorColumns; @@ -3777,20 +3783,19 @@ int vec0_num_defined_user_columns(vec0_vtab *p) { * @param p vec0 table * @return int */ -int vec0_column_distance_idx(vec0_vtab *p) { - return VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1) + - VEC0_COLUMN_OFFSET_DISTANCE; +int vec0_column_command_idx(vec0_vtab *p) { + // Command column is the first hidden column (right after user columns) + return VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(p); +} + +int vec0_column_distance_idx(vec0_vtab *p) { + int base = VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(p); + return base + (p->hasCommandColumn ? 1 : 0); } -/** - * @brief Returns the index of the k hidden column for the given vec0 table. - * - * @param p vec0 table - * @return int k column index - */ int vec0_column_k_idx(vec0_vtab *p) { - return VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1) + - VEC0_COLUMN_OFFSET_K; + int base = VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(p); + return base + (p->hasCommandColumn ? 2 : 1); } /** @@ -5205,6 +5210,74 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } } + // Determine whether to add the FTS5-style hidden command column. + // New tables (isCreate) always get it; existing tables only if created + // with v0.1.10+ (which validated no column name == table name). 
+ int hasCommandColumn = 0; + if (isCreate) { + // Validate no user column name conflicts with the table name + const char *tblName = argv[2]; + int tblNameLen = (int)strlen(tblName); + for (int i = 0; i < numVectorColumns; i++) { + if (pNew->vector_columns[i].name_length == tblNameLen && + sqlite3_strnicmp(pNew->vector_columns[i].name, tblName, tblNameLen) == 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "column name '%s' conflicts with table name (reserved for command column)", + tblName); + goto error; + } + } + for (int i = 0; i < numPartitionColumns; i++) { + if (pNew->paritition_columns[i].name_length == tblNameLen && + sqlite3_strnicmp(pNew->paritition_columns[i].name, tblName, tblNameLen) == 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "column name '%s' conflicts with table name (reserved for command column)", + tblName); + goto error; + } + } + for (int i = 0; i < numAuxiliaryColumns; i++) { + if (pNew->auxiliary_columns[i].name_length == tblNameLen && + sqlite3_strnicmp(pNew->auxiliary_columns[i].name, tblName, tblNameLen) == 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "column name '%s' conflicts with table name (reserved for command column)", + tblName); + goto error; + } + } + for (int i = 0; i < numMetadataColumns; i++) { + if (pNew->metadata_columns[i].name_length == tblNameLen && + sqlite3_strnicmp(pNew->metadata_columns[i].name, tblName, tblNameLen) == 0) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "column name '%s' conflicts with table name (reserved for command column)", + tblName); + goto error; + } + } + hasCommandColumn = 1; + } else { + // xConnect: check _info shadow table for version + sqlite3_stmt *stmtInfo = NULL; + char *zInfoSql = sqlite3_mprintf( + "SELECT value FROM " VEC0_SHADOW_INFO_NAME " WHERE key = 'CREATE_VERSION_PATCH'", + argv[1], argv[2]); + if (zInfoSql) { + int infoRc = sqlite3_prepare_v2(db, zInfoSql, -1, &stmtInfo, NULL); + sqlite3_free(zInfoSql); + if (infoRc == 
SQLITE_OK && sqlite3_step(stmtInfo) == SQLITE_ROW) { + int patch = sqlite3_column_int(stmtInfo, 0); + hasCommandColumn = (patch >= 10); // v0.1.10+ + } + // If _info doesn't exist or has no version, assume old table + sqlite3_finalize(stmtInfo); + } + } + pNew->hasCommandColumn = hasCommandColumn; + sqlite3_str *createStr = sqlite3_str_new(NULL); sqlite3_str_appendall(createStr, "CREATE TABLE x("); if (pkColumnName) { @@ -5246,7 +5319,11 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } } - sqlite3_str_appendall(createStr, " distance hidden, k hidden) "); + if (hasCommandColumn) { + sqlite3_str_appendf(createStr, " \"%w\" hidden, distance hidden, k hidden) ", argv[2]); + } else { + sqlite3_str_appendall(createStr, " distance hidden, k hidden) "); + } if (pkColumnName) { sqlite3_str_appendall(createStr, "without rowid "); } @@ -10161,25 +10238,31 @@ static int vec0Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, } // INSERT operation else if (argc > 1 && sqlite3_value_type(argv[0]) == SQLITE_NULL) { -#if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE || SQLITE_VEC_ENABLE_DISKANN - // Check for command inserts: INSERT INTO t(rowid) VALUES ('command-string') - // The id column holds the command string. 
- sqlite3_value *idVal = argv[2 + VEC0_COLUMN_ID]; - if (sqlite3_value_type(idVal) == SQLITE_TEXT) { - const char *cmd = (const char *)sqlite3_value_text(idVal); - vec0_vtab *p = (vec0_vtab *)pVTab; - int cmdRc = SQLITE_EMPTY; + vec0_vtab *p = (vec0_vtab *)pVTab; + // FTS5-style command dispatch via hidden column named after table + if (p->hasCommandColumn) { + sqlite3_value *cmdVal = argv[2 + vec0_column_command_idx(p)]; + if (sqlite3_value_type(cmdVal) == SQLITE_TEXT) { + const char *cmd = (const char *)sqlite3_value_text(cmdVal); + int cmdRc = SQLITE_EMPTY; +#if SQLITE_VEC_ENABLE_RESCORE + cmdRc = rescore_handle_command(p, cmd); +#endif #if SQLITE_VEC_EXPERIMENTAL_IVF_ENABLE - cmdRc = ivf_handle_command(p, cmd, argc, argv); + if (cmdRc == SQLITE_EMPTY) + cmdRc = ivf_handle_command(p, cmd, argc, argv); #endif #if SQLITE_VEC_ENABLE_DISKANN - if (cmdRc == SQLITE_EMPTY) - cmdRc = diskann_handle_command(p, cmd); + if (cmdRc == SQLITE_EMPTY) + cmdRc = diskann_handle_command(p, cmd); #endif - if (cmdRc != SQLITE_EMPTY) return cmdRc; // handled (or error) - // SQLITE_EMPTY means not a recognized command — fall through to normal insert + if (cmdRc == SQLITE_EMPTY) { + vtab_set_error(pVTab, "unknown vec0 command: '%s'", cmd); + return SQLITE_ERROR; + } + return cmdRc; + } } -#endif return vec0Update_Insert(pVTab, argc, argv, pRowid); } // UPDATE operation diff --git a/tests/fixtures/legacy-v0.1.6.db b/tests/fixtures/legacy-v0.1.6.db new file mode 100644 index 0000000000000000000000000000000000000000..58bd89d250a0907420336b40abbf14615b1dc244 GIT binary patch literal 106496 zcmeI*Piz$D9mesQo!xhL*6Vp~46F?p#^#?FjIkjOq^cCJ=_)KSHZ=x`MwM$|Y2ARG z*oMR$(p{gb9IA*@MJ|XOQsva5Mv71lz^Ou2R3SyGI3W&A#UY9~HB}@!^Uk}d`?9!I zm14wxUoCp(H}81neLvo{+1c6k(I1aYFO>WwTRhwO1SSI6E-+L#T6z1|cjwRg1i{lf=@+1207OeM=9W9Q& z<_CAE{+_Trb9Q!W;fPa79eUopV=v#EteyGt8&~f^ z|M~9zMoMI7i%(-&{wR|J%lD@$yF@a(Z=YRRtT+AWyKn3wS}B&D?EmdiawH+LJ9pYA z)M)i%f4=ng`O?g(l2ImVMrd^Q%uJmt^#)&=s}$o(U}>P9K=k-B1frGkn*Ddz-R2oT 
zFyDzqKbhCmn@aeQF$Ni~iY+^haEt%qp<8g4`;lDds88H6!vi_NFSk_q3+}X?%;s{b zN}|p@YImjwU;B8k-Q=kc{pR3EarvPozmi|bPlE*-0tg_000IagfB*srAbN6+mij~B*|7RN>>kK`u~928l{8FY=F!^P3D@odZ)Oqm_|LqUh_e z^u4TJ)R!Mw@@x5}{5)8oA%Fk^2q1s}0tg_000IagfWS%!v_IuEK)g`>j=*R z#9E!4ao!+&{onRltl&mN009ILKmY**5I_I{1Q2K-ftzVNWyO{aB7J3A z^&RFhefa7~s_#Ni==DNpQm49pK3H~j9lppNwx%eojLz`MZ`iyjEUB~lT)0)&t8}*M zT&;7B&b2zTI@@)2=&Zhre!Z?Y=+I2))440V3Ft( z7dmWbpI!Ure*Hl$d)i~pKG*1qz9<-WI_mKpr^^oa7JWKJIy?ok$D6a{xALKUAn(h& z@{YVMZ^`TOn*2mwm6zou`Mz9|7v-G%r93HrBFE(+IV^u5_sAWxTdtF=xZByF`~tU-&9FiA^duij68eMW@OQVuQ-{V!g_BVx7tk(V?jWr7R@R%BBQcNG^tFBv`SaF zDpMk*GAWWOg%BzeBB3%a;wl~CsEj3IVY&%Bj8t^}zqGvSwGRz+Ph_eHAbKjJ=C!_w0QUBGb?-}*qjrtp-{zq6h zg>Q|p<#$$aqalC*0tg_000IagfB*srAb`LNDBw75_z4MVYnPr}pU&Mn`*jZJ+@q5~ z009ILKmY**5I_I{1Q0*~0R#|0009ILKmY**5I_I{1Q0*~f#*%2`qvPu;~K6mL>V{g zlu>7ly2Yqh8+E%;Z!l`#sJ9q(k5Ts;b-z)+WYqhO`iDk6WYj-0>VrmIt3`zX0tg_0 z00IagfB*srAb`MgDp36^h|lTp7#0EuAbg0^`2I1@fw%1|>HyQ#6Ab