text knn LT/LE

This commit is contained in:
Alex Garcia 2024-11-18 14:09:07 -08:00
parent df29e31ddc
commit 018e9789de
3 changed files with 338 additions and 53 deletions

1
TODO
View file

@ -13,7 +13,6 @@
- perf: LEFT JOIN aux table to rowids query in vec0_cursor for rowid/point stmts, to avoid N lookup queries - perf: LEFT JOIN aux table to rowids query in vec0_cursor for rowid/point stmts, to avoid N lookup queries
# metadata filtering # metadata filtering
- text comparisons (long)
- `v in (...)` handling - `v in (...)` handling
- [ ] test accessing aux values when rowid is different than 1,2,3 etc. - [ ] test accessing aux values when rowid is different than 1,2,3 etc.
- [ ] add `xyz_info` shadow table with version etc. - [ ] add `xyz_info` shadow table with version etc.

View file

@ -6027,21 +6027,63 @@ int vec0_metadata_filter_text(vec0_vtab * p, sqlite3_value * value, const void *
} }
case VEC0_METADATA_OPERATOR_LE: { case VEC0_METADATA_OPERATOR_LE: {
for(int i = 0; i < size; i++) { for(int i = 0; i < size; i++) {
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH]; view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
int n = ((int*) view)[0]; nPrefix = ((int*) view)[0];
char * s = (char *) &view[4]; sPrefix = (char *) &view[4];
if(n > VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {rc = SQLITE_ERROR;goto done;} /* TODO */ int cmpPrefix = strncmp(sPrefix, sTarget, min(min(nPrefix, VEC0_METADATA_TEXT_VIEW_DATA_LENGTH), nTarget));
bitmap_set(b, i, strncmp(s, sTarget, n) <= 0);
if(nPrefix < VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
// if the compared prefixes match, the shorter string sorts first
if(cmpPrefix == 0) {
bitmap_set(b, i, nPrefix <= nTarget);
}
else {
bitmap_set(b, i, cmpPrefix <= 0);
}
continue;
}
// TODO(perf): when cmpPrefix != 0 the prefix alone already decides the result, so the full-text lookup could be skipped
rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull);
if(rc != SQLITE_OK) {
goto done;
}
if(nPrefix != nFull) {
rc = SQLITE_ERROR;
goto done;
}
bitmap_set(b, i, strncmp(sFull, sTarget, nFull) <= 0);
} }
break; break;
} }
case VEC0_METADATA_OPERATOR_LT: { case VEC0_METADATA_OPERATOR_LT: {
for(int i = 0; i < size; i++) { for(int i = 0; i < size; i++) {
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH]; view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
int n = ((int*) view)[0]; nPrefix = ((int*) view)[0];
char * s = (char *) &view[4]; sPrefix = (char *) &view[4];
if(n > VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {rc = SQLITE_ERROR;goto done;} /* TODO */ int cmpPrefix = strncmp(sPrefix, sTarget, min(min(nPrefix, VEC0_METADATA_TEXT_VIEW_DATA_LENGTH), nTarget));
bitmap_set(b, i, strncmp(s, sTarget, n) < 0);
if(nPrefix < VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
// if the compared prefixes match, the shorter string sorts first
if(cmpPrefix == 0) {
bitmap_set(b, i, nPrefix < nTarget);
}
else {
bitmap_set(b, i, cmpPrefix < 0);
}
continue;
}
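// TODO(perf): when cmpPrefix != 0 the prefix alone already decides the result, so the full-text lookup could be skipped
// NOTE(review): strncmp over nFull bytes yields 0 when sFull is a proper prefix of a longer sTarget, so LT reports false for that row — confirm, and compare nFull < nTarget on a tie as the short-text path does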
rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull);
if(rc != SQLITE_OK) {
goto done;
}
if(nPrefix != nFull) {
rc = SQLITE_ERROR;
goto done;
}
bitmap_set(b, i, strncmp(sFull, sTarget, nFull) < 0);
} }
break; break;
} }

View file

@ -955,87 +955,331 @@
}) })
# --- # ---
# name: test_long_text_knn[le-bb] # name: test_long_text_knn[le-bb]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[le-bbbb] # name: test_long_text_knn[le-bbbb]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[le-bbbbbb] # name: test_long_text_knn[le-bbbbbb]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[le-bbbbbbbbbbbb_aaa] # name: test_long_text_knn[le-bbbbbbbbbbbb_aaa]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[le-bbbbbbbbbbbb_bbb] # name: test_long_text_knn[le-bbbbbbbbbbbb_bbb]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[le-bbbbbbbbbbbb_ccc] # name: test_long_text_knn[le-bbbbbbbbbbbb_ccc]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[le-longlonglonglonglonglonglong] # name: test_long_text_knn[le-longlonglonglonglonglonglong]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[lt-bb] # name: test_long_text_knn[lt-bb]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[lt-bbbb] # name: test_long_text_knn[lt-bbbb]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[lt-bbbbbb] # name: test_long_text_knn[lt-bbbbbb]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[lt-bbbbbbbbbbbb_aaa] # name: test_long_text_knn[lt-bbbbbbbbbbbb_aaa]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[lt-bbbbbbbbbbbb_bbb] # name: test_long_text_knn[lt-bbbbbbbbbbbb_bbb]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[lt-bbbbbbbbbbbb_ccc] # name: test_long_text_knn[lt-bbbbbbbbbbbb_ccc]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[lt-longlonglonglonglonglonglong] # name: test_long_text_knn[lt-longlonglonglonglonglonglong]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'message': 'Could not filter metadata fields', 'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[ne-bb] # name: test_long_text_knn[ne-bb]