text knn NE

This commit is contained in:
Alex Garcia 2024-11-18 12:15:25 -08:00
parent 31622209eb
commit 4ba167c315
3 changed files with 185 additions and 24 deletions

View file

@ -5885,13 +5885,16 @@ int vec0_metadata_filter_text(vec0_vtab * p, sqlite3_value * value, const void *
sqlite3_blob_close(rowidsBlob); sqlite3_blob_close(rowidsBlob);
switch(op) { switch(op) {
int nPrefix;
char * sPrefix;
char *sFull; char *sFull;
int nFull; int nFull;
u8 * view;
case VEC0_METADATA_OPERATOR_EQ: { case VEC0_METADATA_OPERATOR_EQ: {
for(int i = 0; i < size; i++) { for(int i = 0; i < size; i++) {
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH]; view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
int nPrefix = ((int*) view)[0]; nPrefix = ((int*) view)[0];
char * sPrefix = (char *) &view[4]; sPrefix = (char *) &view[4];
// for EQ the text lengths must match // for EQ the text lengths must match
if(nPrefix != nTarget) { if(nPrefix != nTarget) {
@ -5925,11 +5928,39 @@ int vec0_metadata_filter_text(vec0_vtab * p, sqlite3_value * value, const void *
} }
case VEC0_METADATA_OPERATOR_NE: { case VEC0_METADATA_OPERATOR_NE: {
for(int i = 0; i < size; i++) { for(int i = 0; i < size; i++) {
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH]; view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
int n = ((int*) view)[0]; nPrefix = ((int*) view)[0];
char * s = (char *) &view[4]; sPrefix = (char *) &view[4];
if(n > VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {rc = SQLITE_ERROR;goto done;} /* TODO */
bitmap_set(b, i, strncmp(s, sTarget, n) != 0); // for NE, if the text lengths don't match, the strings can never be equal
if(nPrefix != nTarget) {
bitmap_set(b, i, 1);
continue;
}
int cmpPrefix = strncmp(sPrefix, sTarget, min(nPrefix, VEC0_METADATA_TEXT_VIEW_DATA_LENGTH));
// for short strings, use the prefix comparison directly
if(nPrefix <= VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
bitmap_set(b, i, cmpPrefix != 0);
continue;
}
// for NE on long strings, if the prefixes don't match, then the full strings won't either
if(cmpPrefix) {
bitmap_set(b, i, 1);
continue;
}
// consult the full string
rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull);
if(rc != SQLITE_OK) {
goto done;
}
if(nPrefix != nFull) {
rc = SQLITE_ERROR;
goto done;
}
bitmap_set(b, i, strncmp(sFull, sTarget, nFull) != 0);
} }
break; break;
} }

View file

@ -783,33 +783,163 @@
}) })
# --- # ---
# name: test_long_text_knn[ne-bbbb] # name: test_long_text_knn[ne-bbbb]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select * from v where vector match X'11111111' and k = 5 and name != ?",
'message': 'unrecognized token: "!"', 'rows': list([
OrderedDict({
'rowid': 6,
'vector': b'\x11\x11\x11\x11',
'name': 'cccccccccccc_ccc',
}),
OrderedDict({
'rowid': 5,
'vector': b'\x11\x11\x11\x11',
'name': 'cccc',
}),
OrderedDict({
'rowid': 4,
'vector': b'\x11\x11\x11\x11',
'name': 'bbbbbbbbbbbb_bbb',
}),
OrderedDict({
'rowid': 2,
'vector': b'\x11\x11\x11\x11',
'name': 'aaaaaaaaaaaa_aaa',
}),
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'name': 'aaaa',
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[ne-bbbbbbbbbbbb_aaa] # name: test_long_text_knn[ne-bbbbbbbbbbbb_aaa]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select * from v where vector match X'11111111' and k = 5 and name != ?",
'message': 'unrecognized token: "!"', 'rows': list([
OrderedDict({
'rowid': 6,
'vector': b'\x11\x11\x11\x11',
'name': 'cccccccccccc_ccc',
}),
OrderedDict({
'rowid': 5,
'vector': b'\x11\x11\x11\x11',
'name': 'cccc',
}),
OrderedDict({
'rowid': 4,
'vector': b'\x11\x11\x11\x11',
'name': 'bbbbbbbbbbbb_bbb',
}),
OrderedDict({
'rowid': 3,
'vector': b'\x11\x11\x11\x11',
'name': 'bbbb',
}),
OrderedDict({
'rowid': 2,
'vector': b'\x11\x11\x11\x11',
'name': 'aaaaaaaaaaaa_aaa',
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[ne-bbbbbbbbbbbb_bbb] # name: test_long_text_knn[ne-bbbbbbbbbbbb_bbb]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select * from v where vector match X'11111111' and k = 5 and name != ?",
'message': 'unrecognized token: "!"', 'rows': list([
OrderedDict({
'rowid': 6,
'vector': b'\x11\x11\x11\x11',
'name': 'cccccccccccc_ccc',
}),
OrderedDict({
'rowid': 5,
'vector': b'\x11\x11\x11\x11',
'name': 'cccc',
}),
OrderedDict({
'rowid': 3,
'vector': b'\x11\x11\x11\x11',
'name': 'bbbb',
}),
OrderedDict({
'rowid': 2,
'vector': b'\x11\x11\x11\x11',
'name': 'aaaaaaaaaaaa_aaa',
}),
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'name': 'aaaa',
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[ne-bbbbbbbbbbbb_ccc] # name: test_long_text_knn[ne-bbbbbbbbbbbb_ccc]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select * from v where vector match X'11111111' and k = 5 and name != ?",
'message': 'unrecognized token: "!"', 'rows': list([
OrderedDict({
'rowid': 6,
'vector': b'\x11\x11\x11\x11',
'name': 'cccccccccccc_ccc',
}),
OrderedDict({
'rowid': 5,
'vector': b'\x11\x11\x11\x11',
'name': 'cccc',
}),
OrderedDict({
'rowid': 4,
'vector': b'\x11\x11\x11\x11',
'name': 'bbbbbbbbbbbb_bbb',
}),
OrderedDict({
'rowid': 3,
'vector': b'\x11\x11\x11\x11',
'name': 'bbbb',
}),
OrderedDict({
'rowid': 2,
'vector': b'\x11\x11\x11\x11',
'name': 'aaaaaaaaaaaa_aaa',
}),
]),
}) })
# --- # ---
# name: test_long_text_knn[ne-longlonglonglonglonglonglong] # name: test_long_text_knn[ne-longlonglonglonglonglonglong]
dict({ OrderedDict({
'error': 'OperationalError', 'sql': "select * from v where vector match X'11111111' and k = 5 and name != ?",
'message': 'unrecognized token: "!"', 'rows': list([
OrderedDict({
'rowid': 6,
'vector': b'\x11\x11\x11\x11',
'name': 'cccccccccccc_ccc',
}),
OrderedDict({
'rowid': 5,
'vector': b'\x11\x11\x11\x11',
'name': 'cccc',
}),
OrderedDict({
'rowid': 4,
'vector': b'\x11\x11\x11\x11',
'name': 'bbbbbbbbbbbb_bbb',
}),
OrderedDict({
'rowid': 3,
'vector': b'\x11\x11\x11\x11',
'name': 'bbbb',
}),
OrderedDict({
'rowid': 2,
'vector': b'\x11\x11\x11\x11',
'name': 'aaaaaaaaaaaa_aaa',
}),
]),
}) })
# --- # ---
# name: test_long_text_updates # name: test_long_text_updates

View file

@ -152,7 +152,7 @@ def test_long_text_knn(db, snapshot):
"bbbbbbbbbbbb_ccc", "bbbbbbbbbbbb_ccc",
"longlonglonglonglonglonglong", "longlonglonglonglonglonglong",
] ]
ops = ["=", "!-", "<", "<=", ">", ">="] ops = ["=", "!=", "<", "<=", ">", ">="]
op_names = ["eq", "ne", "lt", "le", "gt", "ge"] op_names = ["eq", "ne", "lt", "le", "gt", "ge"]
for test in tests: for test in tests: