mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-26 09:16:22 +02:00
updates
This commit is contained in:
parent
7f47409e60
commit
da29ace630
5 changed files with 158 additions and 18 deletions
|
|
@ -1,5 +1,43 @@
|
||||||
## `vec0`
|
## `vec0`
|
||||||
|
|
||||||
|
### Shadow Tables
|
||||||
|
|
||||||
|
#### `xyz_chunks`
|
||||||
|
|
||||||
|
- `chunk_id INTEGER`
|
||||||
|
- `size INTEGER`
|
||||||
|
- `validity BLOB`
|
||||||
|
- `rowids BLOB`
|
||||||
|
|
||||||
|
|
||||||
|
#### `xyz_rowids`
|
||||||
|
|
||||||
|
- `rowid INTEGER`
|
||||||
|
- `id`
|
||||||
|
- `chunk_id INTEGER`
|
||||||
|
- `chunk_offset INTEGER`
|
||||||
|
|
||||||
|
#### `xyz_vector_chunksNN`
|
||||||
|
|
||||||
|
- `rowid INTEGER`
|
||||||
|
- `vector BLOB`
|
||||||
|
|
||||||
|
#### `xyz_auxiliary`
|
||||||
|
|
||||||
|
- `rowid INTEGER`
|
||||||
|
- `valueNN [type]`
|
||||||
|
|
||||||
|
#### `xyz_metadata_chunksNN`
|
||||||
|
|
||||||
|
- `rowid INTEGER`
|
||||||
|
- `data BLOB`
|
||||||
|
|
||||||
|
|
||||||
|
#### `xyz_metadata_text_data_00`
|
||||||
|
|
||||||
|
- `rowid INTEGER`
|
||||||
|
- `data TEXT`
|
||||||
|
|
||||||
### idxStr
|
### idxStr
|
||||||
|
|
||||||
The `vec0` idxStr is a string composed of a single "header" character and 0 or
|
The `vec0` idxStr is a string composed of a single "header" character and 0 or
|
||||||
|
|
|
||||||
99
sqlite-vec.c
99
sqlite-vec.c
|
|
@ -5719,6 +5719,41 @@ int min_idx(const f32 *distances, i32 n, u8 *candidates, i32 *out, i32 k,
|
||||||
return SQLITE_OK;
|
return SQLITE_OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int vec0_get_metadata_text_long_value(
|
||||||
|
vec0_vtab * p,
|
||||||
|
sqlite3_stmt ** stmt,
|
||||||
|
int metadata_idx,
|
||||||
|
i64 rowid,
|
||||||
|
int *n,
|
||||||
|
char ** s) {
|
||||||
|
int rc;
|
||||||
|
if(!(*stmt)) {
|
||||||
|
const char * zSql = sqlite3_mprintf("select data from " VEC0_SHADOW_METADATA_TEXT_DATA_NAME " where rowid = ?", p->schemaName, p->tableName, metadata_idx);
|
||||||
|
if(!zSql) {
|
||||||
|
rc = SQLITE_NOMEM;
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
rc = sqlite3_prepare_v2(p->db, zSql, -1, stmt, NULL);
|
||||||
|
sqlite3_free( (void *) zSql);
|
||||||
|
if(rc != SQLITE_OK) {
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sqlite3_reset(*stmt);
|
||||||
|
sqlite3_bind_int64(*stmt, 1, rowid);
|
||||||
|
rc = sqlite3_step(*stmt);
|
||||||
|
if(rc != SQLITE_ROW) {
|
||||||
|
rc = SQLITE_ERROR;
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
*s = (char *) sqlite3_column_text(*stmt, 0);
|
||||||
|
*n = sqlite3_column_bytes(*stmt, 0);
|
||||||
|
rc = SQLITE_OK;
|
||||||
|
done:
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Create an "iterator" (sqlite3_stmt) of chunks with the given constraints
|
* @brief Create an "iterator" (sqlite3_stmt) of chunks with the given constraints
|
||||||
*
|
*
|
||||||
|
|
@ -5845,6 +5880,28 @@ int vec0_set_metadata_filter_bitmap(
|
||||||
if(rc != SQLITE_OK) {
|
if(rc != SQLITE_OK) {
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
// TODO: only on text columns
|
||||||
|
sqlite3_blob * rowidsBlob;
|
||||||
|
rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "rowids", chunk_rowid, 0, &rowidsBlob);
|
||||||
|
if(rc != SQLITE_OK) {
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
assert(sqlite3_blob_bytes(rowidsBlob) % sizeof(i64) == 0);
|
||||||
|
assert((sqlite3_blob_bytes(rowidsBlob) / sizeof(i64)) == size);
|
||||||
|
i64 * rowids;
|
||||||
|
rowids = sqlite3_malloc(sqlite3_blob_bytes(rowidsBlob));
|
||||||
|
if(!rowids) {
|
||||||
|
sqlite3_blob_close(rowidsBlob);
|
||||||
|
return SQLITE_NOMEM;
|
||||||
|
}
|
||||||
|
|
||||||
|
rc = sqlite3_blob_read(rowidsBlob, rowids, sqlite3_blob_bytes(rowidsBlob), 0);
|
||||||
|
if(rc != SQLITE_OK) {
|
||||||
|
sqlite3_blob_close(rowidsBlob);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
sqlite3_blob_close(rowidsBlob);
|
||||||
|
|
||||||
vec0_metadata_column_kind kind = p->metadata_columns[metadata_idx].kind;
|
vec0_metadata_column_kind kind = p->metadata_columns[metadata_idx].kind;
|
||||||
int szMatch = 0;
|
int szMatch = 0;
|
||||||
int blobSize = sqlite3_blob_bytes(blob);
|
int blobSize = sqlite3_blob_bytes(blob);
|
||||||
|
|
@ -5951,25 +6008,41 @@ int vec0_set_metadata_filter_bitmap(
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case VEC0_METADATA_COLUMN_KIND_TEXT: {
|
case VEC0_METADATA_COLUMN_KIND_TEXT: {
|
||||||
// TODO: handle large strings. For now just raise a generic error
|
|
||||||
for(int i = 0; i < size; i++) {
|
|
||||||
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
|
||||||
int n = ((int*) view)[0];
|
|
||||||
if(n > 12) {
|
|
||||||
rc = SQLITE_ERROR;
|
|
||||||
goto done;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const char * target = (const char *) sqlite3_value_text(value);
|
const char * target = (const char *) sqlite3_value_text(value);
|
||||||
|
int targetn = sqlite3_value_bytes(value);
|
||||||
|
|
||||||
switch(op) {
|
switch(op) {
|
||||||
case VEC0_METADATA_OPERATOR_EQ: {
|
case VEC0_METADATA_OPERATOR_EQ: {
|
||||||
|
sqlite3_stmt * stmt = NULL;
|
||||||
for(int i = 0; i < size; i++) {
|
for(int i = 0; i < size; i++) {
|
||||||
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
||||||
int n = ((int*) view)[0];
|
int n = ((int*) view)[0];
|
||||||
char * s = (char *) &view[4];
|
char * s = (char *) &view[4];
|
||||||
bitmap_set(b, i, strncmp(s, target, n) == 0);
|
if(n != targetn) {
|
||||||
|
bitmap_set(b, i, 0);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
int prefix_cmp = strncmp(s, target, min(n, 12));
|
||||||
|
if(n <= 12) {
|
||||||
|
bitmap_set(b, i, prefix_cmp == 0);
|
||||||
|
}
|
||||||
|
// if the prefix doesnt match, the rest of the string wont match
|
||||||
|
else if(prefix_cmp) {
|
||||||
|
bitmap_set(b, i, 0);
|
||||||
|
}
|
||||||
|
// need to consult
|
||||||
|
else {
|
||||||
|
char *slong;
|
||||||
|
int slongn;
|
||||||
|
rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &slongn, &slong);
|
||||||
|
if(rc != SQLITE_OK) {
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
assert(n == slongn);
|
||||||
|
bitmap_set(b, i, strncmp(slong, target, n) == 0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
sqlite3_finalize(stmt);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case VEC0_METADATA_OPERATOR_NE: {
|
case VEC0_METADATA_OPERATOR_NE: {
|
||||||
|
|
@ -5977,6 +6050,7 @@ int vec0_set_metadata_filter_bitmap(
|
||||||
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
||||||
int n = ((int*) view)[0];
|
int n = ((int*) view)[0];
|
||||||
char * s = (char *) &view[4];
|
char * s = (char *) &view[4];
|
||||||
|
if(n > 12) {rc = SQLITE_ERROR;goto done;} /* TODO */
|
||||||
bitmap_set(b, i, strncmp(s, target, n) != 0);
|
bitmap_set(b, i, strncmp(s, target, n) != 0);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
@ -5986,6 +6060,7 @@ int vec0_set_metadata_filter_bitmap(
|
||||||
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
||||||
int n = ((int*) view)[0];
|
int n = ((int*) view)[0];
|
||||||
char * s = (char *) &view[4];
|
char * s = (char *) &view[4];
|
||||||
|
if(n > 12) {rc = SQLITE_ERROR;goto done;} /* TODO */
|
||||||
bitmap_set(b, i, strncmp(s, target, n) > 0);
|
bitmap_set(b, i, strncmp(s, target, n) > 0);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
@ -5995,6 +6070,7 @@ int vec0_set_metadata_filter_bitmap(
|
||||||
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
||||||
int n = ((int*) view)[0];
|
int n = ((int*) view)[0];
|
||||||
char * s = (char *) &view[4];
|
char * s = (char *) &view[4];
|
||||||
|
if(n > 12) {rc = SQLITE_ERROR;goto done;} /* TODO */
|
||||||
bitmap_set(b, i, strncmp(s, target, n) >= 0);
|
bitmap_set(b, i, strncmp(s, target, n) >= 0);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
@ -6004,6 +6080,7 @@ int vec0_set_metadata_filter_bitmap(
|
||||||
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
||||||
int n = ((int*) view)[0];
|
int n = ((int*) view)[0];
|
||||||
char * s = (char *) &view[4];
|
char * s = (char *) &view[4];
|
||||||
|
if(n > 12) {rc = SQLITE_ERROR;goto done;} /* TODO */
|
||||||
bitmap_set(b, i, strncmp(s, target, n) <= 0);
|
bitmap_set(b, i, strncmp(s, target, n) <= 0);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
@ -6013,6 +6090,7 @@ int vec0_set_metadata_filter_bitmap(
|
||||||
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
|
||||||
int n = ((int*) view)[0];
|
int n = ((int*) view)[0];
|
||||||
char * s = (char *) &view[4];
|
char * s = (char *) &view[4];
|
||||||
|
if(n > 12) {rc = SQLITE_ERROR;goto done;} /* TODO */
|
||||||
bitmap_set(b, i, strncmp(s, target, n) < 0);
|
bitmap_set(b, i, strncmp(s, target, n) < 0);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
@ -6024,6 +6102,7 @@ int vec0_set_metadata_filter_bitmap(
|
||||||
}
|
}
|
||||||
done:
|
done:
|
||||||
sqlite3_free(buffer);
|
sqlite3_free(buffer);
|
||||||
|
sqlite3_free(rowids);
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -617,10 +617,28 @@
|
||||||
]),
|
]),
|
||||||
})
|
})
|
||||||
# ---
|
# ---
|
||||||
|
# name: test_long_text_knn[knn-eq-short]
|
||||||
|
OrderedDict({
|
||||||
|
'sql': "select * from v where vector match X'11111111' and k = 5 and name = ?",
|
||||||
|
'rows': list([
|
||||||
|
OrderedDict({
|
||||||
|
'rowid': 1,
|
||||||
|
'vector': b'\x11\x11\x11\x11',
|
||||||
|
'name': 'aaaaaaaaaaaa',
|
||||||
|
}),
|
||||||
|
]),
|
||||||
|
})
|
||||||
|
# ---
|
||||||
# name: test_long_text_knn[knn-eq-true]
|
# name: test_long_text_knn[knn-eq-true]
|
||||||
dict({
|
OrderedDict({
|
||||||
'error': 'OperationalError',
|
'sql': "select * from v where vector match X'11111111' and k = 5 and name = ?",
|
||||||
'message': 'Could not filter metadata fields',
|
'rows': list([
|
||||||
|
OrderedDict({
|
||||||
|
'rowid': 3,
|
||||||
|
'vector': b'\x11\x11\x11\x11',
|
||||||
|
'name': 'aaaaaaaaaaaa_aaa',
|
||||||
|
}),
|
||||||
|
]),
|
||||||
})
|
})
|
||||||
# ---
|
# ---
|
||||||
# name: test_long_text_updates
|
# name: test_long_text_updates
|
||||||
|
|
@ -1416,7 +1434,6 @@
|
||||||
# name: test_stress.1
|
# name: test_stress.1
|
||||||
OrderedDict({
|
OrderedDict({
|
||||||
'sql': '''
|
'sql': '''
|
||||||
|
|
||||||
select
|
select
|
||||||
movie_id,
|
movie_id,
|
||||||
title,
|
title,
|
||||||
|
|
@ -1431,7 +1448,6 @@
|
||||||
and num_reviews between 100 and 500
|
and num_reviews between 100 and 500
|
||||||
and mean_rating > 3.5
|
and mean_rating > 3.5
|
||||||
and k = 5;
|
and k = 5;
|
||||||
|
|
||||||
''',
|
''',
|
||||||
'rows': list([
|
'rows': list([
|
||||||
OrderedDict({
|
OrderedDict({
|
||||||
|
|
|
||||||
|
|
@ -119,7 +119,7 @@ def tests_command(file_path):
|
||||||
|
|
||||||
tests = [
|
tests = [
|
||||||
json.loads(row["data"])
|
json.loads(row["data"])
|
||||||
for row in db.execute("select data from tests limit 2000").fetchall()
|
for row in db.execute("select data from tests").fetchall()
|
||||||
]
|
]
|
||||||
|
|
||||||
num_or_skips = 0
|
num_or_skips = 0
|
||||||
|
|
@ -179,8 +179,8 @@ def tests_command(file_path):
|
||||||
== diff["values_changed"][bkey]["new_value"]
|
== diff["values_changed"][bkey]["new_value"]
|
||||||
)
|
)
|
||||||
elif len(keys_changed) == 1:
|
elif len(keys_changed) == 1:
|
||||||
v = int(akey.lstrip("root[").rstrip("]"))
|
v = int(keys_changed[0].lstrip("root[").rstrip("]"))
|
||||||
assert v == len(expected_closest_ids)
|
assert (v + 1) == len(expected_closest_ids)
|
||||||
else:
|
else:
|
||||||
raise Exception("fuck")
|
raise Exception("fuck")
|
||||||
num_1off_errors += 1
|
num_1off_errors += 1
|
||||||
|
|
|
||||||
|
|
@ -148,10 +148,17 @@ def test_long_text_knn(db, snapshot):
|
||||||
"create virtual table v using vec0(vector float[1], name text, chunk_size=8)"
|
"create virtual table v using vec0(vector float[1], name text, chunk_size=8)"
|
||||||
)
|
)
|
||||||
INSERT = "insert into v(vector, name) values (?, ?)"
|
INSERT = "insert into v(vector, name) values (?, ?)"
|
||||||
|
exec(db, INSERT, [b"\x11\x11\x11\x11", "aaaaaaaaaaaa"])
|
||||||
|
exec(db, INSERT, [b"\x11\x11\x11\x11", "bbbbbbbbbbbb"])
|
||||||
exec(db, INSERT, [b"\x11\x11\x11\x11", "aaaaaaaaaaaa_aaa"])
|
exec(db, INSERT, [b"\x11\x11\x11\x11", "aaaaaaaaaaaa_aaa"])
|
||||||
exec(db, INSERT, [b"\x11\x11\x11\x11", "aaaaaaaaaaaa_bbb"])
|
exec(db, INSERT, [b"\x11\x11\x11\x11", "aaaaaaaaaaaa_bbb"])
|
||||||
exec(db, INSERT, [b"\x11\x11\x11\x11", "aaaaaaaaaaaa_ccc"])
|
exec(db, INSERT, [b"\x11\x11\x11\x11", "aaaaaaaaaaaa_ccc"])
|
||||||
|
|
||||||
|
assert exec(
|
||||||
|
db,
|
||||||
|
"select * from v where vector match X'11111111' and k = 5 and name = ?",
|
||||||
|
["aaaaaaaaaaaa"],
|
||||||
|
) == snapshot(name="knn-eq-short")
|
||||||
assert exec(
|
assert exec(
|
||||||
db,
|
db,
|
||||||
"select * from v where vector match X'11111111' and k = 5 and name = ?",
|
"select * from v where vector match X'11111111' and k = 5 and name = ?",
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue