fmt and SQLITE_VEC_OMIT_FS fixes

This commit is contained in:
Alex Garcia 2024-08-10 23:33:28 -07:00
parent abf59c418e
commit 7ea402931e
2 changed files with 174 additions and 148 deletions

View file

@ -1,4 +1,5 @@
#include "sqlite-vec.h" #include "sqlite-vec.h"
#include <assert.h> #include <assert.h>
#include <errno.h> #include <errno.h>
#include <float.h> #include <float.h>
@ -7,10 +8,13 @@
#include <math.h> #include <math.h>
#include <stdbool.h> #include <stdbool.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#ifndef SQLITE_VEC_OMIT_FS
#include <stdio.h>
#endif
#include "sqlite3ext.h" #include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1 SQLITE_EXTENSION_INIT1
@ -79,7 +83,8 @@ typedef size_t usize;
#define UNUSED_PARAMETER(X) (void)(X) #define UNUSED_PARAMETER(X) (void)(X)
#endif #endif
// sqlite3_vtab_in() was added in SQLite version 3.38 (2022-02-22) https://www.sqlite.org/changes.html#version_3_38_0 // sqlite3_vtab_in() was added in SQLite version 3.38 (2022-02-22)
// https://www.sqlite.org/changes.html#version_3_38_0
#if SQLITE_VERSION_NUMBER >= 3038000 #if SQLITE_VERSION_NUMBER >= 3038000
#define COMPILER_SUPPORTS_VTAB_IN 1 #define COMPILER_SUPPORTS_VTAB_IN 1
#endif #endif
@ -505,9 +510,10 @@ static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) {
} }
#ifdef _MSC_VER #ifdef _MSC_VER
#if !defined(__clang__) && \ #if !defined(__clang__) && (defined(_M_ARM) || defined(_M_ARM64))
(defined(_M_ARM) || defined(_M_ARM64)) // From
// From https://github.com/ngtcp2/ngtcp2/blob/b64f1e77b5e0d880b93d31f474147fae4a1d17cc/lib/ngtcp2_ringbuf.c, line 34-43 // https://github.com/ngtcp2/ngtcp2/blob/b64f1e77b5e0d880b93d31f474147fae4a1d17cc/lib/ngtcp2_ringbuf.c,
// line 34-43
static unsigned int __builtin_popcountl(unsigned int x) { static unsigned int __builtin_popcountl(unsigned int x) {
unsigned int c = 0; unsigned int c = 0;
for (; x; ++c) { for (; x; ++c) {
@ -1037,6 +1043,7 @@ struct VecNpyFile {
}; };
#define SQLITE_VEC_NPY_FILE_NAME "vec0-npy-file" #define SQLITE_VEC_NPY_FILE_NAME "vec0-npy-file"
#ifndef SQLITE_VEC_OMIT_FS
static void vec_npy_file(sqlite3_context *context, int argc, static void vec_npy_file(sqlite3_context *context, int argc,
sqlite3_value **argv) { sqlite3_value **argv) {
assert(argc == 1); assert(argc == 1);
@ -1055,6 +1062,7 @@ static void vec_npy_file(sqlite3_context *context, int argc,
f->pathLength = pathLength; f->pathLength = pathLength;
sqlite3_result_pointer(context, f, SQLITE_VEC_NPY_FILE_NAME, sqlite3_free); sqlite3_result_pointer(context, f, SQLITE_VEC_NPY_FILE_NAME, sqlite3_free);
} }
#endif
#pragma region scalar functions #pragma region scalar functions
static void vec_f32(sqlite3_context *context, int argc, sqlite3_value **argv) { static void vec_f32(sqlite3_context *context, int argc, sqlite3_value **argv) {
@ -2648,7 +2656,9 @@ struct vec_npy_each_cursor {
// Opened npy file, when reading from a file. // Opened npy file, when reading from a file.
// fclose() when complete. // fclose() when complete.
#ifndef SQLITE_VEC_OMIT_FS
FILE *file; FILE *file;
#endif
// an in-memory buffer containing a portion of the npy array. // an in-memory buffer containing a portion of the npy array.
// Used for faster reading, instead of calling fread a lot. // Used for faster reading, instead of calling fread a lot.
@ -2856,12 +2866,12 @@ static int vec_npy_eachOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor) {
static int vec_npy_eachClose(sqlite3_vtab_cursor *cur) { static int vec_npy_eachClose(sqlite3_vtab_cursor *cur) {
vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur; vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)cur;
if (pCur->file) {
#ifndef SQLITE_VEC_OMIT_FS #ifndef SQLITE_VEC_OMIT_FS
if (pCur->file) {
fclose(pCur->file); fclose(pCur->file);
#endif
pCur->file = NULL; pCur->file = NULL;
} }
#endif
if (pCur->chunksBuffer) { if (pCur->chunksBuffer) {
sqlite3_free(pCur->chunksBuffer); sqlite3_free(pCur->chunksBuffer);
pCur->chunksBuffer = NULL; pCur->chunksBuffer = NULL;
@ -2912,12 +2922,12 @@ static int vec_npy_eachFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum,
vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)pVtabCursor; vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)pVtabCursor;
if (pCur->file) {
#ifndef SQLITE_VEC_OMIT_FS #ifndef SQLITE_VEC_OMIT_FS
if (pCur->file) {
fclose(pCur->file); fclose(pCur->file);
#endif
pCur->file = NULL; pCur->file = NULL;
} }
#endif
if (pCur->chunksBuffer) { if (pCur->chunksBuffer) {
sqlite3_free(pCur->chunksBuffer); sqlite3_free(pCur->chunksBuffer);
pCur->chunksBuffer = NULL; pCur->chunksBuffer = NULL;
@ -2926,9 +2936,8 @@ static int vec_npy_eachFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum,
pCur->vector = NULL; pCur->vector = NULL;
} }
struct VecNpyFile *f = NULL;
#ifndef SQLITE_VEC_OMIT_FS #ifndef SQLITE_VEC_OMIT_FS
struct VecNpyFile *f = NULL;
if ((f = sqlite3_value_pointer(argv[0], SQLITE_VEC_NPY_FILE_NAME))) { if ((f = sqlite3_value_pointer(argv[0], SQLITE_VEC_NPY_FILE_NAME))) {
FILE *file = fopen(f->path, "r"); FILE *file = fopen(f->path, "r");
if (!file) { if (!file) {
@ -3293,12 +3302,26 @@ void vec0_free(vec0_vtab *p) {
} }
} }
int vec0_column_distance_idx(vec0_vtab *pVtab) { /**
return VEC0_COLUMN_VECTORN_START + (pVtab->numVectorColumns - 1) + * @brief Returns the index of the distance hidden column for the given vec0
* table.
*
* @param p vec0 table
* @return int
*/
int vec0_column_distance_idx(vec0_vtab *p) {
return VEC0_COLUMN_VECTORN_START + (p->numVectorColumns - 1) +
VEC0_COLUMN_OFFSET_DISTANCE; VEC0_COLUMN_OFFSET_DISTANCE;
} }
int vec0_column_k_idx(vec0_vtab *pVtab) {
return VEC0_COLUMN_VECTORN_START + (pVtab->numVectorColumns - 1) + /**
* @brief Returns the index of the k hidden column for the given vec0 table.
*
* @param p vec0 table
* @return int k column index
*/
int vec0_column_k_idx(vec0_vtab *p) {
return VEC0_COLUMN_VECTORN_START + (p->numVectorColumns - 1) +
VEC0_COLUMN_OFFSET_K; VEC0_COLUMN_OFFSET_K;
} }
@ -3975,10 +3998,10 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv,
#define SQLITE_VEC_VEC0_MAX_DIMENSIONS 8192 #define SQLITE_VEC_VEC0_MAX_DIMENSIONS 8192
if (c.dimensions > SQLITE_VEC_VEC0_MAX_DIMENSIONS) { if (c.dimensions > SQLITE_VEC_VEC0_MAX_DIMENSIONS) {
sqlite3_free(c.name); sqlite3_free(c.name);
*pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR *pzErr = sqlite3_mprintf(
VEC_CONSTRUCTOR_ERROR
"Dimension on vector column too large, provided %lld, maximum %lld", "Dimension on vector column too large, provided %lld, maximum %lld",
(i64) c.dimensions, (i64)c.dimensions, SQLITE_VEC_VEC0_MAX_DIMENSIONS);
SQLITE_VEC_VEC0_MAX_DIMENSIONS);
goto error; goto error;
} }
memcpy(&pNew->vector_columns[numVectorColumns], &c, sizeof(c)); memcpy(&pNew->vector_columns[numVectorColumns], &c, sizeof(c));
@ -4034,8 +4057,8 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv,
} }
#define SQLITE_VEC_CHUNK_SIZE_MAX 4096 #define SQLITE_VEC_CHUNK_SIZE_MAX 4096
if (chunk_size > SQLITE_VEC_CHUNK_SIZE_MAX) { if (chunk_size > SQLITE_VEC_CHUNK_SIZE_MAX) {
*pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR *pzErr =
"chunk_size too large"); sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR "chunk_size too large");
goto error; goto error;
} }
} else { } else {
@ -4964,7 +4987,9 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum,
#define SQLITE_VEC_VEC0_K_MAX 4096 #define SQLITE_VEC_VEC0_K_MAX 4096
if (k > SQLITE_VEC_VEC0_K_MAX) { if (k > SQLITE_VEC_VEC0_K_MAX) {
vtab_set_error( vtab_set_error(
&p->base, "k value in knn query too large, provided %lld and the limit is %lld", k, SQLITE_VEC_VEC0_K_MAX); &p->base,
"k value in knn query too large, provided %lld and the limit is %lld",
k, SQLITE_VEC_VEC0_K_MAX);
rc = SQLITE_ERROR; rc = SQLITE_ERROR;
goto cleanup; goto cleanup;
} }
@ -6031,9 +6056,11 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value * idValue) {
return rc; return rc;
} }
// 3. zero out rowid in chunks.rowids https://github.com/asg017/sqlite-vec/issues/54 // 3. zero out rowid in chunks.rowids
// https://github.com/asg017/sqlite-vec/issues/54
// 4. zero out any data in vector chunks tables https://github.com/asg017/sqlite-vec/issues/54 // 4. zero out any data in vector chunks tables
// https://github.com/asg017/sqlite-vec/issues/54
// 5. delete from _rowids table // 5. delete from _rowids table
rc = vec0Update_Delete_DeleteRowids(p, rowid); rc = vec0Update_Delete_DeleteRowids(p, rowid);
@ -6122,8 +6149,7 @@ cleanup:
return SQLITE_OK; return SQLITE_OK;
} }
int vec0Update_Update(sqlite3_vtab *pVTab, int argc, int vec0Update_Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv) {
sqlite3_value **argv) {
UNUSED_PARAMETER(argc); UNUSED_PARAMETER(argc);
vec0_vtab *p = (vec0_vtab *)pVTab; vec0_vtab *p = (vec0_vtab *)pVTab;
int rc; int rc;
@ -6135,11 +6161,10 @@ int vec0Update_Update(sqlite3_vtab *pVTab, int argc,
const char *a = (const char *)sqlite3_value_text(argv[0]); const char *a = (const char *)sqlite3_value_text(argv[0]);
const char *b = (const char *)sqlite3_value_text(argv[1]); const char *b = (const char *)sqlite3_value_text(argv[1]);
// IMP: V08886_25725 // IMP: V08886_25725
if( if ((sqlite3_value_bytes(argv[0]) != sqlite3_value_bytes(argv[1])) ||
(sqlite3_value_bytes(argv[0]) != sqlite3_value_bytes(argv[1])) strncmp(a, b, sqlite3_value_bytes(argv[0])) != 0) {
|| strncmp(a, b, sqlite3_value_bytes(argv[0])) != 0 vtab_set_error(pVTab,
) { "UPDATEs on vec0 primary key values are not allowed.");
vtab_set_error(pVTab, "UPDATEs on vec0 primary key values are not allowed.");
return SQLITE_ERROR; return SQLITE_ERROR;
} }
rc = vec0_rowid_from_id(p, argv[0], &rowid); rc = vec0_rowid_from_id(p, argv[0], &rowid);
@ -6561,7 +6586,6 @@ void sbe_query_knn_data_clear(struct sbe_query_knn_data *knn_data) {
} }
} }
typedef struct vec_static_blob_entries_cursor vec_static_blob_entries_cursor; typedef struct vec_static_blob_entries_cursor vec_static_blob_entries_cursor;
struct vec_static_blob_entries_cursor { struct vec_static_blob_entries_cursor {
sqlite3_vtab_cursor base; sqlite3_vtab_cursor base;
@ -6768,7 +6792,6 @@ static int vec_static_blob_entriesFilter(sqlite3_vtab_cursor *pVtabCursor,
size_t bsize = (p->blob->nvectors + 7) & ~7; size_t bsize = (p->blob->nvectors + 7) & ~7;
i32 *topk_rowids = sqlite3_malloc(k * sizeof(i32)); i32 *topk_rowids = sqlite3_malloc(k * sizeof(i32));
if (!topk_rowids) { if (!topk_rowids) {
// HANDLE https://github.com/asg017/sqlite-vec/issues/55 // HANDLE https://github.com/asg017/sqlite-vec/issues/55
@ -6827,7 +6850,6 @@ static int vec_static_blob_entriesRowid(sqlite3_vtab_cursor *cur,
} }
} }
return SQLITE_ERROR; return SQLITE_ERROR;
} }
static int vec_static_blob_entriesNext(sqlite3_vtab_cursor *cur) { static int vec_static_blob_entriesNext(sqlite3_vtab_cursor *cur) {
@ -6887,7 +6909,8 @@ static int vec_static_blob_entriesColumn(sqlite3_vtab_cursor *cur,
sqlite3_result_blob(context, sqlite3_result_blob(context,
((unsigned char *)p->blob->p) + ((unsigned char *)p->blob->p) +
(rowid * p->blob->dimensions * sizeof(float)), (rowid * p->blob->dimensions * sizeof(float)),
p->blob->dimensions * sizeof(float), SQLITE_TRANSIENT); p->blob->dimensions * sizeof(float),
SQLITE_TRANSIENT);
sqlite3_result_subtype(context, p->blob->element_type); sqlite3_result_subtype(context, p->blob->element_type);
break; break;
} }
@ -6900,7 +6923,9 @@ static int vec_static_blob_entriesColumn(sqlite3_vtab_cursor *cur,
static sqlite3_module vec_static_blob_entriesModule = { static sqlite3_module vec_static_blob_entriesModule = {
/* iVersion */ 3, /* iVersion */ 3,
/* xCreate */ vec_static_blob_entriesCreate, // handle rm? https://github.com/asg017/sqlite-vec/issues/55 /* xCreate */
vec_static_blob_entriesCreate, // handle rm?
// https://github.com/asg017/sqlite-vec/issues/55
/* xConnect */ vec_static_blob_entriesConnect, /* xConnect */ vec_static_blob_entriesConnect,
/* xBestIndex */ vec_static_blob_entriesBestIndex, /* xBestIndex */ vec_static_blob_entriesBestIndex,
/* xDisconnect */ vec_static_blob_entriesDisconnect, /* xDisconnect */ vec_static_blob_entriesDisconnect,
@ -6949,7 +6974,6 @@ static sqlite3_module vec_static_blob_entriesModule = {
"Commit: " SQLITE_VEC_SOURCE "\n" \ "Commit: " SQLITE_VEC_SOURCE "\n" \
"Build flags: " SQLITE_VEC_DEBUG_BUILD "Build flags: " SQLITE_VEC_DEBUG_BUILD
SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg, SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg,
const sqlite3_api_routines *pApi) { const sqlite3_api_routines *pApi) {
SQLITE_EXTENSION_INIT2(pApi); SQLITE_EXTENSION_INIT2(pApi);
@ -7045,8 +7069,8 @@ SQLITE_VEC_API int sqlite3_vec_fs_read_init(sqlite3 *db, char **pzErrMsg,
} }
#endif #endif
SQLITE_VEC_API int
SQLITE_VEC_API int sqlite3_vec_static_blobs_init(sqlite3 *db, char **pzErrMsg, sqlite3_vec_static_blobs_init(sqlite3 *db, char **pzErrMsg,
const sqlite3_api_routines *pApi) { const sqlite3_api_routines *pApi) {
UNUSED_PARAMETER(pzErrMsg); UNUSED_PARAMETER(pzErrMsg);
SQLITE_EXTENSION_INIT2(pApi); SQLITE_EXTENSION_INIT2(pApi);
@ -7059,16 +7083,21 @@ SQLITE_VEC_API int sqlite3_vec_static_blobs_init(sqlite3 *db, char **pzErrMsg,
} }
memset(static_blob_data, 0, sizeof(*static_blob_data)); memset(static_blob_data, 0, sizeof(*static_blob_data));
rc = sqlite3_create_function_v2(db, "vec_static_blob_from_raw", 4, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, rc = sqlite3_create_function_v2(
NULL, vec_static_blob_from_raw, NULL, NULL, NULL); db, "vec_static_blob_from_raw", 4,
if(rc != SQLITE_OK) return rc; DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, NULL,
vec_static_blob_from_raw, NULL, NULL, NULL);
if (rc != SQLITE_OK)
return rc;
rc = sqlite3_create_module_v2(db, "vec_static_blobs", &vec_static_blobsModule, rc = sqlite3_create_module_v2(db, "vec_static_blobs", &vec_static_blobsModule,
static_blob_data, sqlite3_free); static_blob_data, sqlite3_free);
if(rc != SQLITE_OK) return rc; if (rc != SQLITE_OK)
return rc;
rc = sqlite3_create_module_v2(db, "vec_static_blob_entries", rc = sqlite3_create_module_v2(db, "vec_static_blob_entries",
&vec_static_blob_entriesModule, &vec_static_blob_entriesModule,
static_blob_data, NULL); static_blob_data, NULL);
if(rc != SQLITE_OK) return rc; if (rc != SQLITE_OK)
return rc;
return rc; return rc;
} }

View file

@ -2274,42 +2274,36 @@ def test_smoke():
db.execute("insert into vec_xyz(rowid, a) select 2, X'0000000000000040'") db.execute("insert into vec_xyz(rowid, a) select 2, X'0000000000000040'")
chunk = db.execute("select * from vec_xyz_chunks").fetchone() chunk = db.execute("select * from vec_xyz_chunks").fetchone()
assert ( assert chunk[
chunk["rowids"] "rowids"
== b"\x01\x00\x00\x00\x00\x00\x00\x00" ] == b"\x01\x00\x00\x00\x00\x00\x00\x00" + b"\x02\x00\x00\x00\x00\x00\x00\x00" + bytearray(
+ b"\x02\x00\x00\x00\x00\x00\x00\x00" int(1024 * 8) - 8 * 2
+ bytearray(int(1024 * 8) - 8 * 2)
) )
assert chunk["chunk_id"] == 1 assert chunk["chunk_id"] == 1
assert chunk["validity"] == b"\x03" + bytearray(int(1024 / 8) - 1) assert chunk["validity"] == b"\x03" + bytearray(int(1024 / 8) - 1)
vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone() vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone()
assert vchunk["rowid"] == 1 assert vchunk["rowid"] == 1
assert ( assert vchunk[
vchunk["vectors"] "vectors"
== b"\x00\x00\x00\x00\x00\x00\x80\x3f" ] == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + b"\x00\x00\x00\x00\x00\x00\x00\x40" + bytearray(
+ b"\x00\x00\x00\x00\x00\x00\x00\x40" int(1024 * 4 * 2) - (2 * 4 * 2)
+ bytearray(int(1024 * 4 * 2) - (2 * 4 * 2))
) )
db.execute("insert into vec_xyz(rowid, a) select 3, X'00000000000080bf'") db.execute("insert into vec_xyz(rowid, a) select 3, X'00000000000080bf'")
chunk = db.execute("select * from vec_xyz_chunks").fetchone() chunk = db.execute("select * from vec_xyz_chunks").fetchone()
assert chunk["chunk_id"] == 1 assert chunk["chunk_id"] == 1
assert chunk["validity"] == b"\x07" + bytearray(int(1024 / 8) - 1) assert chunk["validity"] == b"\x07" + bytearray(int(1024 / 8) - 1)
assert ( assert chunk[
chunk["rowids"] "rowids"
== b"\x01\x00\x00\x00\x00\x00\x00\x00" ] == b"\x01\x00\x00\x00\x00\x00\x00\x00" + b"\x02\x00\x00\x00\x00\x00\x00\x00" + b"\x03\x00\x00\x00\x00\x00\x00\x00" + bytearray(
+ b"\x02\x00\x00\x00\x00\x00\x00\x00" int(1024 * 8) - 8 * 3
+ b"\x03\x00\x00\x00\x00\x00\x00\x00"
+ bytearray(int(1024 * 8) - 8 * 3)
) )
vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone() vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone()
assert vchunk["rowid"] == 1 assert vchunk["rowid"] == 1
assert ( assert vchunk[
vchunk["vectors"] "vectors"
== b"\x00\x00\x00\x00\x00\x00\x80\x3f" ] == b"\x00\x00\x00\x00\x00\x00\x80\x3f" + b"\x00\x00\x00\x00\x00\x00\x00\x40" + b"\x00\x00\x00\x00\x00\x00\x80\xbf" + bytearray(
+ b"\x00\x00\x00\x00\x00\x00\x00\x40" int(1024 * 4 * 2) - (2 * 4 * 3)
+ b"\x00\x00\x00\x00\x00\x00\x80\xbf"
+ bytearray(int(1024 * 4 * 2) - (2 * 4 * 3))
) )
# db.execute("select * from vec_xyz") # db.execute("select * from vec_xyz")
@ -2352,7 +2346,8 @@ def test_vec0_stress_small_chunks():
{"rowid": 994, "a": _f32([99.4] * 8)}, {"rowid": 994, "a": _f32([99.4] * 8)},
{"rowid": 993, "a": _f32([99.3] * 8)}, {"rowid": 993, "a": _f32([99.3] * 8)},
] ]
assert execute_all( assert (
execute_all(
db, db,
""" """
select rowid, a, distance select rowid, a, distance
@ -2362,7 +2357,8 @@ def test_vec0_stress_small_chunks():
order by distance order by distance
""", """,
[_f32([50.0] * 8)], [_f32([50.0] * 8)],
) == [ )
== [
{ {
"a": _f32([500 * 0.1] * 8), "a": _f32([500 * 0.1] * 8),
"distance": 0.0, "distance": 0.0,
@ -2409,6 +2405,7 @@ def test_vec0_stress_small_chunks():
"rowid": 504, "rowid": 504,
}, },
] ]
)
def test_vec0_distance_metric(): def test_vec0_distance_metric():