boolean comparison handling

This commit is contained in:
Alex Garcia 2024-11-14 16:36:53 -08:00
parent 3965029726
commit a1a64427fc
4 changed files with 189 additions and 25 deletions

1
TODO
View file

@ -13,7 +13,6 @@
- perf: LEFT JOIN aux table to rowids query in vec0_cursor for rowid/point stmts, to avoid N lookup queries - perf: LEFT JOIN aux table to rowids query in vec0_cursor for rowid/point stmts, to avoid N lookup queries
# metadata filtering # metadata filtering
- boolean comparisons
- text comparisons (long) - text comparisons (long)
- skip invalid validity entries in knn filter? - skip invalid validity entries in knn filter?
- null! - null!

View file

@ -2093,7 +2093,7 @@ typedef enum {
VEC0_METADATA_COLUMN_KIND_INTEGER, VEC0_METADATA_COLUMN_KIND_INTEGER,
VEC0_METADATA_COLUMN_KIND_FLOAT, VEC0_METADATA_COLUMN_KIND_FLOAT,
VEC0_METADATA_COLUMN_KIND_TEXT, VEC0_METADATA_COLUMN_KIND_TEXT,
// TODO: blob, date, datetime // future: blob, date, datetime
} vec0_metadata_column_kind; } vec0_metadata_column_kind;
/** /**
@ -5480,7 +5480,6 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) {
} }
// TODO: when aux branch is merged, move this loop logic to above loop
for (int i = 0; i < pIdxInfo->nConstraint; i++) { for (int i = 0; i < pIdxInfo->nConstraint; i++) {
if (!pIdxInfo->aConstraint[i].usable) if (!pIdxInfo->aConstraint[i].usable)
continue; continue;
@ -5533,14 +5532,21 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) {
} }
} }
if(value) { if(p->metadata_columns[metadata_idx].kind == VEC0_METADATA_COLUMN_KIND_BOOLEAN) {
if(!(value == VEC0_METADATA_OPERATOR_EQ || value == VEC0_METADATA_OPERATOR_NE)) {
// IMP: V10145_26984
rc = SQLITE_ERROR;
vtab_set_error(pVTab, "ONLY EQUALS (=) or NOT_EQUALS (!=) operators are allowed on boolean metadata columns.");
goto done;
}
}
pIdxInfo->aConstraintUsage[i].argvIndex = argvIndex++; pIdxInfo->aConstraintUsage[i].argvIndex = argvIndex++;
pIdxInfo->aConstraintUsage[i].omit = 1; pIdxInfo->aConstraintUsage[i].omit = 1;
sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_METADATA_CONSTRAINT); sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_METADATA_CONSTRAINT);
sqlite3_str_appendchar(idxStr, 1, 'A' + metadata_idx); sqlite3_str_appendchar(idxStr, 1, 'A' + metadata_idx);
sqlite3_str_appendchar(idxStr, 1, value); sqlite3_str_appendchar(idxStr, 1, value);
sqlite3_str_appendchar(idxStr, 1, '_'); sqlite3_str_appendchar(idxStr, 1, '_');
}
} }
@ -5867,11 +5873,18 @@ int vec0_set_metadata_filter_bitmap(
if(!buffer) { if(!buffer) {
return SQLITE_NOMEM; return SQLITE_NOMEM;
} }
sqlite3_blob_read(blob, buffer, blobSize, 0); rc = sqlite3_blob_read(blob, buffer, blobSize, 0);
if(rc != SQLITE_OK) {
goto done;
}
switch(kind) { switch(kind) {
case VEC0_METADATA_COLUMN_KIND_BOOLEAN: { case VEC0_METADATA_COLUMN_KIND_BOOLEAN: {
for(int i = 0; i < size; i++) { int target = sqlite3_value_int(value);
// TODO boolean comparisions if( (target && op == VEC0_METADATA_OPERATOR_EQ) || (!target && op == VEC0_METADATA_OPERATOR_NE)) {
for(int i = 0; i < size; i++) { bitmap_set(b, i, bitmap_get((u8*) buffer, i)); }
}
else {
for(int i = 0; i < size; i++) { bitmap_set(b, i, !bitmap_get((u8*) buffer, i)); }
} }
break; break;
} }
@ -5938,8 +5951,17 @@ int vec0_set_metadata_filter_bitmap(
break; break;
} }
case VEC0_METADATA_COLUMN_KIND_TEXT: { case VEC0_METADATA_COLUMN_KIND_TEXT: {
// TODO check for and handle large strings // TODO: handle large strings. For now just raise a generic error
for(int i = 0; i < size; i++) {
u8 * view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
int n = ((int*) view)[0];
if(n > 12) {
rc = SQLITE_ERROR;
goto done;
}
}
const char * target = (const char *) sqlite3_value_text(value); const char * target = (const char *) sqlite3_value_text(value);
switch(op) { switch(op) {
case VEC0_METADATA_OPERATOR_EQ: { case VEC0_METADATA_OPERATOR_EQ: {
for(int i = 0; i < size; i++) { for(int i = 0; i < size; i++) {
@ -5999,8 +6021,9 @@ int vec0_set_metadata_filter_bitmap(
break; break;
} }
} }
done:
sqlite3_free(buffer); sqlite3_free(buffer);
return SQLITE_OK; return rc;
} }
int vec0Filter_knn_chunks_iter(vec0_vtab *p, sqlite3_stmt *stmtChunks, int vec0Filter_knn_chunks_iter(vec0_vtab *p, sqlite3_stmt *stmtChunks,

View file

@ -1638,13 +1638,140 @@
]), ]),
}) })
# --- # ---
# name: test_stress.8 # name: test_stress[bool-eq-false]
OrderedDict({ OrderedDict({
'sql': "select movie_id, mean_rating, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited = TRUE", 'sql': "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited = FALSE",
'rows': list([ 'rows': list([
OrderedDict({
'movie_id': 16,
'is_favorited': 0,
'distance': 84.0,
}),
OrderedDict({
'movie_id': 14,
'is_favorited': 0,
'distance': 86.0,
}),
OrderedDict({
'movie_id': 12,
'is_favorited': 0,
'distance': 88.0,
}),
OrderedDict({
'movie_id': 10,
'is_favorited': 0,
'distance': 90.0,
}),
OrderedDict({
'movie_id': 8,
'is_favorited': 0,
'distance': 92.0,
}),
]), ]),
}) })
# --- # ---
# name: test_stress[bool-eq-true]
OrderedDict({
'sql': "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited = TRUE",
'rows': list([
OrderedDict({
'movie_id': 25,
'is_favorited': 1,
'distance': 75.0,
}),
OrderedDict({
'movie_id': 24,
'is_favorited': 1,
'distance': 76.0,
}),
OrderedDict({
'movie_id': 23,
'is_favorited': 1,
'distance': 77.0,
}),
OrderedDict({
'movie_id': 22,
'is_favorited': 1,
'distance': 78.0,
}),
OrderedDict({
'movie_id': 21,
'is_favorited': 1,
'distance': 79.0,
}),
]),
})
# ---
# name: test_stress[bool-ne-false]
OrderedDict({
'sql': "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited != FALSE",
'rows': list([
OrderedDict({
'movie_id': 25,
'is_favorited': 1,
'distance': 75.0,
}),
OrderedDict({
'movie_id': 24,
'is_favorited': 1,
'distance': 76.0,
}),
OrderedDict({
'movie_id': 23,
'is_favorited': 1,
'distance': 77.0,
}),
OrderedDict({
'movie_id': 22,
'is_favorited': 1,
'distance': 78.0,
}),
OrderedDict({
'movie_id': 21,
'is_favorited': 1,
'distance': 79.0,
}),
]),
})
# ---
# name: test_stress[bool-ne-true]
OrderedDict({
'sql': "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited != TRUE",
'rows': list([
OrderedDict({
'movie_id': 16,
'is_favorited': 0,
'distance': 84.0,
}),
OrderedDict({
'movie_id': 14,
'is_favorited': 0,
'distance': 86.0,
}),
OrderedDict({
'movie_id': 12,
'is_favorited': 0,
'distance': 88.0,
}),
OrderedDict({
'movie_id': 10,
'is_favorited': 0,
'distance': 90.0,
}),
OrderedDict({
'movie_id': 8,
'is_favorited': 0,
'distance': 92.0,
}),
]),
})
# ---
# name: test_stress[bool-other-op]
dict({
'error': 'OperationalError',
'message': 'ONLY EQUALS (=) or NOT_EQUALS (!=) operators are allowed on boolean metadata columns.',
})
# ---
# name: test_types[illegal-boolean] # name: test_types[illegal-boolean]
dict({ dict({
'error': 'OperationalError', 'error': 'OperationalError',

View file

@ -355,13 +355,28 @@ def test_stress(db, snapshot):
== snapshot() == snapshot()
) )
assert ( assert exec(
exec(
db, db,
"select movie_id, mean_rating, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited = TRUE", "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited = TRUE",
) ) == snapshot(name="bool-eq-true")
== snapshot() assert exec(
) db,
"select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited != TRUE",
) == snapshot(name="bool-ne-true")
assert exec(
db,
"select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited = FALSE",
) == snapshot(name="bool-eq-false")
assert exec(
db,
"select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited != FALSE",
) == snapshot(name="bool-ne-false")
# EVIDENCE-OF: V10145_26984
assert exec(
db,
"select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited >= 999",
) == snapshot(name="bool-other-op")
def exec(db, sql, parameters=[]): def exec(db, sql, parameters=[]):