mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 08:46:49 +02:00
Add UBSAN findings TODO and improve vec-mismatch fuzzer
Document three classes of undefined behavior found by UBSAN: function pointer type mismatches, misaligned f32 reads, and float-to-integer overflow in vec_quantize_int8. Improve vec-mismatch fuzzer to cover all error-path cleanup patterns: type mismatches, dimension mismatches, single-arg functions, and both text and blob inputs. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
4ce1ef3c6f
commit
b669801d31
9 changed files with 190 additions and 90 deletions
|
|
@ -8,119 +8,170 @@
|
|||
#include <assert.h>
|
||||
|
||||
/*
|
||||
* Fuzz target for two-argument vector functions (vec_distance_*, vec_add,
|
||||
* vec_sub) where the first argument is always a valid vector and the second
|
||||
* is fuzz-derived. This exercises the ensure_vector_match() error paths
|
||||
* where the first vector parses successfully but the second does not.
|
||||
* Fuzz target that exercises error-path cleanup in vector functions.
|
||||
*
|
||||
* Critical coverage: when arg1 is TEXT (JSON-parsed), the cleanup function
|
||||
* is sqlite3_free rather than a no-op, so cleanup bugs become observable.
|
||||
* The key insight: when a vector is parsed from JSON TEXT, the cleanup
|
||||
* function is sqlite3_free (heap allocator). When parsed from BLOB,
|
||||
* cleanup is a no-op. Bugs in cleanup code (wrong pointer, missing
|
||||
* cleanup, double-free) are only observable with the sqlite3_free path.
|
||||
*
|
||||
* The first byte selects the function. The remaining bytes form arg 2.
|
||||
* This fuzzer systematically covers:
|
||||
* 1. Valid JSON arg1 + invalid fuzz arg2 (parse failure → cleanup arg1)
|
||||
* 2. Valid JSON arg1 + valid JSON arg2 with different dimensions
|
||||
* (dimension mismatch → cleanup both)
|
||||
* 3. Valid JSON arg1 + int8/bit blob arg2 with mismatched type
|
||||
* (type mismatch → cleanup both)
|
||||
* 4. Fuzz arg1 + valid JSON arg2 (parse failure of arg1, no cleanup)
|
||||
* 5. Single-arg functions with JSON text (normal cleanup path)
|
||||
* 6. Single-arg functions with fuzz text (parse failure path)
|
||||
*/
|
||||
|
||||
/* Helper: bind a valid vector to a statement parameter.
|
||||
* mode selects the vector type and format. */
|
||||
static void bind_valid_vector(sqlite3_stmt *stmt, int param, int mode) {
|
||||
/* JSON text vectors — cleanup = sqlite3_free */
|
||||
static const char *json_f32_4d = "[1.0, 0.0, 0.0, 0.0]";
|
||||
static const char *json_f32_2d = "[1.0, 2.0]";
|
||||
static const char *json_f32_1d = "[1.0]";
|
||||
|
||||
/* Blob vectors — cleanup = noop */
|
||||
static const float blob_f32_4d[] = {1.0f, 0.0f, 0.0f, 0.0f};
|
||||
static const float blob_f32_2d[] = {1.0f, 2.0f};
|
||||
|
||||
/* int8 blob — 4 bytes = 4 dimensions */
|
||||
static const int8_t blob_int8_4d[] = {10, 20, 30, 40};
|
||||
|
||||
/* bit blob — 1 byte = 8 bits */
|
||||
static const uint8_t blob_bit_1b[] = {0xAA};
|
||||
|
||||
switch (mode % 7) {
|
||||
case 0: sqlite3_bind_text(stmt, param, json_f32_4d, -1, SQLITE_STATIC); break;
|
||||
case 1: sqlite3_bind_text(stmt, param, json_f32_2d, -1, SQLITE_STATIC); break;
|
||||
case 2: sqlite3_bind_text(stmt, param, json_f32_1d, -1, SQLITE_STATIC); break;
|
||||
case 3: sqlite3_bind_blob(stmt, param, blob_f32_4d, sizeof(blob_f32_4d), SQLITE_STATIC); break;
|
||||
case 4: sqlite3_bind_blob(stmt, param, blob_f32_2d, sizeof(blob_f32_2d), SQLITE_STATIC); break;
|
||||
case 5: /* int8 — must set subtype */
|
||||
sqlite3_bind_blob(stmt, param, blob_int8_4d, sizeof(blob_int8_4d), SQLITE_STATIC);
|
||||
break;
|
||||
case 6: /* bit — must set subtype */
|
||||
sqlite3_bind_blob(stmt, param, blob_bit_1b, sizeof(blob_bit_1b), SQLITE_STATIC);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Prepare `sql`, bind every parameter, step the statement once, and
 * finalize it.
 *
 * Binding rules, per 1-based parameter index:
 *   - index == fuzz_arg: bind the `fuzz` buffer (`fuzz_len` bytes), as text
 *     when `fuzz_as_text` is non-zero and as a blob otherwise;
 *   - index == 1: bind a fixed vector chosen by `arg1_mode`;
 *   - any other index: bind a fixed vector chosen by `arg2_mode`.
 *
 * Passing fuzz_arg = 0 therefore binds fixed vectors everywhere. If the
 * statement fails to prepare the function returns without doing anything
 * else. The results of the bind and step calls are not inspected. */
static void run_query(sqlite3 *db, const char *sql,
                      int arg1_mode, int arg2_mode,
                      const uint8_t *fuzz, int fuzz_len,
                      int fuzz_arg, int fuzz_as_text) {
  sqlite3_stmt *prepared = NULL;
  if (sqlite3_prepare_v2(db, sql, -1, &prepared, NULL) != SQLITE_OK) {
    return;
  }

  const int param_count = sqlite3_bind_parameter_count(prepared);
  int idx = 1;
  while (idx <= param_count) {
    if (idx != fuzz_arg) {
      /* Fixed vector: the first parameter uses arg1_mode, the rest arg2_mode. */
      bind_valid_vector(prepared, idx, idx == 1 ? arg1_mode : arg2_mode);
    } else if (fuzz_as_text) {
      sqlite3_bind_text(prepared, idx, (const char *)fuzz, fuzz_len,
                        SQLITE_STATIC);
    } else {
      sqlite3_bind_blob(prepared, idx, fuzz, fuzz_len, SQLITE_STATIC);
    }
    idx++;
  }

  sqlite3_step(prepared);
  sqlite3_finalize(prepared);
}
|
||||
|
||||
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
|
||||
if (size < 2) return 0;
|
||||
if (size < 3) return 0;
|
||||
|
||||
int rc;
|
||||
sqlite3 *db;
|
||||
sqlite3_stmt *stmt = NULL;
|
||||
|
||||
rc = sqlite3_open(":memory:", &db);
|
||||
assert(rc == SQLITE_OK);
|
||||
rc = sqlite3_vec_init(db, NULL, NULL);
|
||||
assert(rc == SQLITE_OK);
|
||||
|
||||
/* --- Decode fuzz control bytes --- */
|
||||
uint8_t b0 = data[0];
|
||||
uint8_t b1 = data[1];
|
||||
uint8_t b2 = data[2];
|
||||
const uint8_t *payload = data + 3;
|
||||
int payload_size = (int)(size - 3);
|
||||
|
||||
/* Two-argument vector functions */
|
||||
static const char *queries[] = {
|
||||
"SELECT vec_distance_l2(?, ?)", /* 0 */
|
||||
"SELECT vec_distance_cosine(?, ?)", /* 1 */
|
||||
"SELECT vec_distance_l1(?, ?)", /* 2 */
|
||||
"SELECT vec_distance_hamming(?, ?)", /* 3 */
|
||||
"SELECT vec_add(?, ?)", /* 4 */
|
||||
"SELECT vec_sub(?, ?)", /* 5 */
|
||||
static const char *two_arg[] = {
|
||||
"SELECT vec_distance_l2(?, ?)",
|
||||
"SELECT vec_distance_cosine(?, ?)",
|
||||
"SELECT vec_distance_l1(?, ?)",
|
||||
"SELECT vec_distance_hamming(?, ?)",
|
||||
"SELECT vec_add(?, ?)",
|
||||
"SELECT vec_sub(?, ?)",
|
||||
};
|
||||
static const int nQueries = sizeof(queries) / sizeof(queries[0]);
|
||||
|
||||
/* Valid JSON vectors (TEXT) — parsed via fvec_from_value text path,
|
||||
* which sets cleanup = sqlite3_free */
|
||||
static const char *json_vecs[] = {
|
||||
"[1.0, 0.0, 0.0, 0.0]", /* 4d */
|
||||
"[1.0, 2.0]", /* 2d */
|
||||
"[1.0]", /* 1d */
|
||||
/* Single-argument vector functions that call cleanup */
|
||||
static const char *one_arg[] = {
|
||||
"SELECT vec_f32(?)",
|
||||
"SELECT vec_int8(?)",
|
||||
"SELECT vec_bit(?)",
|
||||
"SELECT vec_length(?)",
|
||||
"SELECT vec_type(?)",
|
||||
"SELECT vec_to_json(?)",
|
||||
"SELECT vec_normalize(?)",
|
||||
"SELECT vec_quantize_binary(?)",
|
||||
};
|
||||
static const int nJsonVecs = sizeof(json_vecs) / sizeof(json_vecs[0]);
|
||||
|
||||
/* Valid blob vectors (BLOB) — parsed via fvec_from_value blob path,
|
||||
* which sets cleanup = fvec_cleanup_noop */
|
||||
static const float blob_vec[] = {1.0f, 0.0f, 0.0f, 0.0f};
|
||||
int qIdx2 = b0 % 6;
|
||||
int qIdx1 = b0 % 8;
|
||||
int arg1_mode = b1 % 7;
|
||||
int arg2_mode = b2 % 7;
|
||||
|
||||
uint8_t selector = data[0];
|
||||
int qIdx = selector % nQueries;
|
||||
/* Bits 3-4: select which valid vector and format for arg1 */
|
||||
int arg1_mode = (selector / nQueries) % 4;
|
||||
/*
|
||||
* Phase 1: Two-arg functions — fuzz arg2, valid arg1
|
||||
* Exercises: parse-failure cleanup of arg1 (the fixed bug),
|
||||
* type mismatch cleanup, dimension mismatch cleanup.
|
||||
*/
|
||||
/* arg2 as fuzz blob */
|
||||
run_query(db, two_arg[qIdx2], arg1_mode, 0,
|
||||
payload, payload_size, /*fuzz_arg=*/2, /*as_text=*/0);
|
||||
/* arg2 as fuzz text */
|
||||
run_query(db, two_arg[qIdx2], arg1_mode, 0,
|
||||
payload, payload_size, /*fuzz_arg=*/2, /*as_text=*/1);
|
||||
|
||||
const uint8_t *payload = data + 1;
|
||||
int payload_size = (int)(size - 1);
|
||||
/*
|
||||
* Phase 2: Two-arg functions — fuzz arg1, valid arg2
|
||||
* Exercises: parse-failure of arg1 (no cleanup needed), and
|
||||
* type/dimension mismatch when arg1 parses to unexpected type.
|
||||
*/
|
||||
run_query(db, two_arg[qIdx2], 0, arg2_mode,
|
||||
payload, payload_size, /*fuzz_arg=*/1, /*as_text=*/0);
|
||||
run_query(db, two_arg[qIdx2], 0, arg2_mode,
|
||||
payload, payload_size, /*fuzz_arg=*/1, /*as_text=*/1);
|
||||
|
||||
/* --- Test 1: valid arg1, fuzz arg2 --- */
|
||||
rc = sqlite3_prepare_v2(db, queries[qIdx], -1, &stmt, NULL);
|
||||
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
|
||||
/*
|
||||
* Phase 3: Two-arg — both valid but deliberately mismatched types/dims.
|
||||
* arg1_mode and arg2_mode often produce different types or dimensions.
|
||||
* Exercises: type mismatch (lines 1035-1042) and dimension mismatch
|
||||
* (lines 1044-1051) with sqlite3_free cleanup on both sides.
|
||||
*/
|
||||
run_query(db, two_arg[qIdx2], arg1_mode, arg2_mode,
|
||||
NULL, 0, /*fuzz_arg=*/0, /*as_text=*/0);
|
||||
|
||||
/* Bind arg1 as either JSON text or blob */
|
||||
switch (arg1_mode) {
|
||||
case 0: /* JSON text — triggers sqlite3_free cleanup */
|
||||
sqlite3_bind_text(stmt, 1, json_vecs[0], -1, SQLITE_STATIC);
|
||||
break;
|
||||
case 1:
|
||||
sqlite3_bind_text(stmt, 1, json_vecs[1], -1, SQLITE_STATIC);
|
||||
break;
|
||||
case 2:
|
||||
sqlite3_bind_text(stmt, 1, json_vecs[2], -1, SQLITE_STATIC);
|
||||
break;
|
||||
case 3: /* blob — triggers noop cleanup */
|
||||
sqlite3_bind_blob(stmt, 1, blob_vec, sizeof(blob_vec), SQLITE_STATIC);
|
||||
break;
|
||||
}
|
||||
|
||||
/* Bind arg2 as fuzz blob (most likely to fail parsing for non-4-aligned sizes) */
|
||||
sqlite3_bind_blob(stmt, 2, payload, payload_size, SQLITE_STATIC);
|
||||
sqlite3_step(stmt);
|
||||
sqlite3_finalize(stmt);
|
||||
stmt = NULL;
|
||||
|
||||
/* --- Test 2: same but arg2 as fuzz text --- */
|
||||
rc = sqlite3_prepare_v2(db, queries[qIdx], -1, &stmt, NULL);
|
||||
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
|
||||
|
||||
switch (arg1_mode) {
|
||||
case 0:
|
||||
sqlite3_bind_text(stmt, 1, json_vecs[0], -1, SQLITE_STATIC);
|
||||
break;
|
||||
case 1:
|
||||
sqlite3_bind_text(stmt, 1, json_vecs[1], -1, SQLITE_STATIC);
|
||||
break;
|
||||
case 2:
|
||||
sqlite3_bind_text(stmt, 1, json_vecs[2], -1, SQLITE_STATIC);
|
||||
break;
|
||||
case 3:
|
||||
sqlite3_bind_blob(stmt, 1, blob_vec, sizeof(blob_vec), SQLITE_STATIC);
|
||||
break;
|
||||
}
|
||||
|
||||
sqlite3_bind_text(stmt, 2, (const char *)payload, payload_size, SQLITE_STATIC);
|
||||
sqlite3_step(stmt);
|
||||
sqlite3_finalize(stmt);
|
||||
stmt = NULL;
|
||||
|
||||
/* --- Test 3: fuzz arg1, valid arg2 --- */
|
||||
rc = sqlite3_prepare_v2(db, queries[qIdx], -1, &stmt, NULL);
|
||||
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
|
||||
|
||||
sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 2, json_vecs[0], -1, SQLITE_STATIC);
|
||||
sqlite3_step(stmt);
|
||||
sqlite3_finalize(stmt);
|
||||
/*
|
||||
* Phase 4: Single-arg functions — fuzz as blob and text.
|
||||
* Exercises: parse failure paths in vec_f32, vec_int8, vec_bit, etc.
|
||||
* Also exercises normal cleanup when fuzz data happens to be valid.
|
||||
*/
|
||||
run_query(db, one_arg[qIdx1], 0, 0,
|
||||
payload, payload_size, /*fuzz_arg=*/1, /*as_text=*/0);
|
||||
run_query(db, one_arg[qIdx1], 0, 0,
|
||||
payload, payload_size, /*fuzz_arg=*/1, /*as_text=*/1);
|
||||
|
||||
sqlite3_close(db);
|
||||
return 0;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue