diff --git a/tests/fuzz/TODO.md b/tests/fuzz/TODO.md new file mode 100644 index 0000000..1f33419 --- /dev/null +++ b/tests/fuzz/TODO.md @@ -0,0 +1,49 @@ +# Fuzz Testing TODO: Undefined Behavior Findings + +UBSAN findings from fuzz targets. None are crash-level bugs, but all are +formally undefined behavior per the C standard. + +## Class 1: Function pointer type mismatch (~20 sites) + +`fvec_cleanup_noop` is defined as `void (f32 *)` but called through +`vector_cleanup` which is `void (*)(void *)`. Two cleanup typedefs exist +with incompatible signatures: + +```c +typedef void (*vector_cleanup)(void *p); // line 597 +typedef void (*fvec_cleanup)(f32 *vector); // line 695 +``` + +Affected lines: 1031, 1049, 1050, 1160, 1200, 1201, 1241, 1242, 1282, +1283, 1324, 1325, 1356, 1424, 1524, 1525, 1582, 1583, 1699, 1749, 1798, +2520, 7236, 8501, and sqlite3.c:82930 (via sqlite3_result_blob destructor). + +Low practical severity — calling conventions on all real platforms pass +`f32 *` and `void *` identically — but it is flagged on every UBSAN run. + +Fix: change `fvec_cleanup_noop` to take `void *`, or unify the typedefs. + +## Class 2: Misaligned f32 reads (~10 sites) + +`f32` (4-byte alignment required) is read from potentially unaligned addresses. +Happens when a blob from SQLite's internal storage is cast to `f32 *` and +dereferenced. The blob pointer may not be 4-byte aligned. + +Affected lines: 369, 446, 473-475, 1401, 1461, 1501, 1559, 1653, 1726, +1789, 1793. + +Medium severity — silent on x86/ARM64 (hardware supports unaligned float +access) but can trap on strict-alignment architectures. + +Fix: use `memcpy` to load floats from potentially-unaligned memory, or +ensure blob pointers are aligned before use. + +## Class 3: Float-to-integer overflow (1 site) + +`vec_quantize_int8` at line 1461 — when `srcVector[i]` is a large float, +the expression `((srcVector[i] - (-1.0)) / step) - 128` overflows the +`signed char` range. Assigning this to `i8 out[i]` is UB. 
+ +Low-medium severity — silent truncation in practice. + +Fix: clamp the result before cast. diff --git a/tests/fuzz/corpus/vec-mismatch/dim_mismatch_4d_2d b/tests/fuzz/corpus/vec-mismatch/dim_mismatch_4d_2d new file mode 100644 index 0000000..fa37ca9 Binary files /dev/null and b/tests/fuzz/corpus/vec-mismatch/dim_mismatch_4d_2d differ diff --git a/tests/fuzz/corpus/vec-mismatch/json2d_invalid_blob b/tests/fuzz/corpus/vec-mismatch/json2d_invalid_blob new file mode 100644 index 0000000..b48fcfb Binary files /dev/null and b/tests/fuzz/corpus/vec-mismatch/json2d_invalid_blob differ diff --git a/tests/fuzz/corpus/vec-mismatch/json4d_invalid_blob b/tests/fuzz/corpus/vec-mismatch/json4d_invalid_blob new file mode 100644 index 0000000..c53cb2f Binary files /dev/null and b/tests/fuzz/corpus/vec-mismatch/json4d_invalid_blob differ diff --git a/tests/fuzz/corpus/vec-mismatch/single_f32_bad_text b/tests/fuzz/corpus/vec-mismatch/single_f32_bad_text new file mode 100644 index 0000000..1ec5a05 Binary files /dev/null and b/tests/fuzz/corpus/vec-mismatch/single_f32_bad_text differ diff --git a/tests/fuzz/corpus/vec-mismatch/single_normalize_json b/tests/fuzz/corpus/vec-mismatch/single_normalize_json new file mode 100644 index 0000000..35712f7 Binary files /dev/null and b/tests/fuzz/corpus/vec-mismatch/single_normalize_json differ diff --git a/tests/fuzz/corpus/vec-mismatch/type_mismatch_f32_bit b/tests/fuzz/corpus/vec-mismatch/type_mismatch_f32_bit new file mode 100644 index 0000000..4de94c8 Binary files /dev/null and b/tests/fuzz/corpus/vec-mismatch/type_mismatch_f32_bit differ diff --git a/tests/fuzz/corpus/vec-mismatch/type_mismatch_f32_int8 b/tests/fuzz/corpus/vec-mismatch/type_mismatch_f32_int8 new file mode 100644 index 0000000..f32d149 Binary files /dev/null and b/tests/fuzz/corpus/vec-mismatch/type_mismatch_f32_int8 differ diff --git a/tests/fuzz/vec-mismatch.c b/tests/fuzz/vec-mismatch.c index 921782e..47be140 100644 --- a/tests/fuzz/vec-mismatch.c +++ 
b/tests/fuzz/vec-mismatch.c @@ -8,119 +8,170 @@ #include /* - * Fuzz target for two-argument vector functions (vec_distance_*, vec_add, - * vec_sub) where the first argument is always a valid vector and the second - * is fuzz-derived. This exercises the ensure_vector_match() error paths - * where the first vector parses successfully but the second does not. + * Fuzz target that exercises error-path cleanup in vector functions. * - * Critical coverage: when arg1 is TEXT (JSON-parsed), the cleanup function - * is sqlite3_free rather than a no-op, so cleanup bugs become observable. + * The key insight: when a vector is parsed from JSON TEXT, the cleanup + * function is sqlite3_free (heap allocator). When parsed from BLOB, + * cleanup is a no-op. Bugs in cleanup code (wrong pointer, missing + * cleanup, double-free) are only observable with the sqlite3_free path. * - * The first byte selects the function. The remaining bytes form arg 2. + * This fuzzer systematically covers: + * 1. Valid JSON arg1 + invalid fuzz arg2 (parse failure → cleanup arg1) + * 2. Valid JSON arg1 + valid JSON arg2 with different dimensions + * (dimension mismatch → cleanup both) + * 3. Valid JSON arg1 + int8/bit blob arg2 with mismatched type + * (type mismatch → cleanup both) + * 4. Fuzz arg1 + valid JSON arg2 (parse failure of arg1, no cleanup) + * 5. Single-arg functions with JSON text (normal cleanup path) + * 6. Single-arg functions with fuzz text (parse failure path) */ +/* Helper: bind a valid vector to a statement parameter. + * mode selects the vector type and format. 
*/ +static void bind_valid_vector(sqlite3_stmt *stmt, int param, int mode) { + /* JSON text vectors — cleanup = sqlite3_free */ + static const char *json_f32_4d = "[1.0, 0.0, 0.0, 0.0]"; + static const char *json_f32_2d = "[1.0, 2.0]"; + static const char *json_f32_1d = "[1.0]"; + + /* Blob vectors — cleanup = noop */ + static const float blob_f32_4d[] = {1.0f, 0.0f, 0.0f, 0.0f}; + static const float blob_f32_2d[] = {1.0f, 2.0f}; + + /* int8 payload — 4 bytes, i.e. 4 dimensions if parsed as int8 */ + static const int8_t blob_int8_4d[] = {10, 20, 30, 40}; + + /* bit payload — 1 byte, i.e. 8 bits if parsed as a bit vector */ + static const uint8_t blob_bit_1b[] = {0xAA}; + + switch (mode % 7) { + case 0: sqlite3_bind_text(stmt, param, json_f32_4d, -1, SQLITE_STATIC); break; + case 1: sqlite3_bind_text(stmt, param, json_f32_2d, -1, SQLITE_STATIC); break; + case 2: sqlite3_bind_text(stmt, param, json_f32_1d, -1, SQLITE_STATIC); break; + case 3: sqlite3_bind_blob(stmt, param, blob_f32_4d, sizeof(blob_f32_4d), SQLITE_STATIC); break; + case 4: sqlite3_bind_blob(stmt, param, blob_f32_2d, sizeof(blob_f32_2d), SQLITE_STATIC); break; + case 5: /* int8 bytes — NOTE(review): plain blob bind, no subtype is set here; confirm it is parsed as int8 */ + sqlite3_bind_blob(stmt, param, blob_int8_4d, sizeof(blob_int8_4d), SQLITE_STATIC); + break; + case 6: /* bit bytes — NOTE(review): plain blob bind, no subtype is set here; confirm it is parsed as a bit vector */ + sqlite3_bind_blob(stmt, param, blob_bit_1b, sizeof(blob_bit_1b), SQLITE_STATIC); + break; + } +} + +static void run_query(sqlite3 *db, const char *sql, + int arg1_mode, int arg2_mode, + const uint8_t *fuzz, int fuzz_len, + int fuzz_arg, int fuzz_as_text) { + sqlite3_stmt *stmt = NULL; + int rc = sqlite3_prepare_v2(db, sql, -1, &stmt, NULL); + if (rc != SQLITE_OK) return; + + int nParams = sqlite3_bind_parameter_count(stmt); + + for (int p = 1; p <= nParams; p++) { + if (p == fuzz_arg) { + /* Fuzz-controlled slot: bind the raw payload as TEXT or BLOB per fuzz_as_text */ + if (fuzz_as_text) + sqlite3_bind_text(stmt, p, (const char *)fuzz, fuzz_len, SQLITE_STATIC); + else + sqlite3_bind_blob(stmt, p, fuzz, fuzz_len, SQLITE_STATIC); + } else if (p == 1) { + bind_valid_vector(stmt, p, arg1_mode); 
+ } else { + bind_valid_vector(stmt, p, arg2_mode); + } + } + + sqlite3_step(stmt); + sqlite3_finalize(stmt); +} + int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { - if (size < 2) return 0; + if (size < 3) return 0; int rc; sqlite3 *db; - sqlite3_stmt *stmt = NULL; rc = sqlite3_open(":memory:", &db); assert(rc == SQLITE_OK); rc = sqlite3_vec_init(db, NULL, NULL); assert(rc == SQLITE_OK); + /* --- Decode fuzz control bytes --- */ + uint8_t b0 = data[0]; + uint8_t b1 = data[1]; + uint8_t b2 = data[2]; + const uint8_t *payload = data + 3; + int payload_size = (int)(size - 3); + /* Two-argument vector functions */ - static const char *queries[] = { - "SELECT vec_distance_l2(?, ?)", /* 0 */ - "SELECT vec_distance_cosine(?, ?)", /* 1 */ - "SELECT vec_distance_l1(?, ?)", /* 2 */ - "SELECT vec_distance_hamming(?, ?)", /* 3 */ - "SELECT vec_add(?, ?)", /* 4 */ - "SELECT vec_sub(?, ?)", /* 5 */ + static const char *two_arg[] = { + "SELECT vec_distance_l2(?, ?)", + "SELECT vec_distance_cosine(?, ?)", + "SELECT vec_distance_l1(?, ?)", + "SELECT vec_distance_hamming(?, ?)", + "SELECT vec_add(?, ?)", + "SELECT vec_sub(?, ?)", }; - static const int nQueries = sizeof(queries) / sizeof(queries[0]); - /* Valid JSON vectors (TEXT) — parsed via fvec_from_value text path, - * which sets cleanup = sqlite3_free */ - static const char *json_vecs[] = { - "[1.0, 0.0, 0.0, 0.0]", /* 4d */ - "[1.0, 2.0]", /* 2d */ - "[1.0]", /* 1d */ + /* Single-argument vector functions that call cleanup */ + static const char *one_arg[] = { + "SELECT vec_f32(?)", + "SELECT vec_int8(?)", + "SELECT vec_bit(?)", + "SELECT vec_length(?)", + "SELECT vec_type(?)", + "SELECT vec_to_json(?)", + "SELECT vec_normalize(?)", + "SELECT vec_quantize_binary(?)", }; - static const int nJsonVecs = sizeof(json_vecs) / sizeof(json_vecs[0]); - /* Valid blob vectors (BLOB) — parsed via fvec_from_value blob path, - * which sets cleanup = fvec_cleanup_noop */ - static const float blob_vec[] = {1.0f, 0.0f, 0.0f, 
0.0f}; + int qIdx2 = b0 % 6; + int qIdx1 = b0 % 8; + int arg1_mode = b1 % 7; + int arg2_mode = b2 % 7; - uint8_t selector = data[0]; - int qIdx = selector % nQueries; - /* Bits 3-4: select which valid vector and format for arg1 */ - int arg1_mode = (selector / nQueries) % 4; + /* + * Phase 1: Two-arg functions — fuzz arg2, valid arg1 + * Exercises: parse-failure cleanup of arg1 (the fixed bug), + * type mismatch cleanup, dimension mismatch cleanup. + */ + /* arg2 as fuzz blob */ + run_query(db, two_arg[qIdx2], arg1_mode, 0, + payload, payload_size, /*fuzz_arg=*/2, /*as_text=*/0); + /* arg2 as fuzz text */ + run_query(db, two_arg[qIdx2], arg1_mode, 0, + payload, payload_size, /*fuzz_arg=*/2, /*as_text=*/1); - const uint8_t *payload = data + 1; - int payload_size = (int)(size - 1); + /* + * Phase 2: Two-arg functions — fuzz arg1, valid arg2 + * Exercises: parse-failure of arg1 (no cleanup needed), and + * type/dimension mismatch when arg1 parses to unexpected type. + */ + run_query(db, two_arg[qIdx2], 0, arg2_mode, + payload, payload_size, /*fuzz_arg=*/1, /*as_text=*/0); + run_query(db, two_arg[qIdx2], 0, arg2_mode, + payload, payload_size, /*fuzz_arg=*/1, /*as_text=*/1); - /* --- Test 1: valid arg1, fuzz arg2 --- */ - rc = sqlite3_prepare_v2(db, queries[qIdx], -1, &stmt, NULL); - if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } + /* + * Phase 3: Two-arg — both valid but deliberately mismatched types/dims. + * arg1_mode and arg2_mode often produce different types or dimensions. + * Exercises: type mismatch (lines 1035-1042) and dimension mismatch + * (lines 1044-1051) with sqlite3_free cleanup on both sides. 
+ */ + run_query(db, two_arg[qIdx2], arg1_mode, arg2_mode, + NULL, 0, /*fuzz_arg=*/0, /*as_text=*/0); - /* Bind arg1 as either JSON text or blob */ - switch (arg1_mode) { - case 0: /* JSON text — triggers sqlite3_free cleanup */ - sqlite3_bind_text(stmt, 1, json_vecs[0], -1, SQLITE_STATIC); - break; - case 1: - sqlite3_bind_text(stmt, 1, json_vecs[1], -1, SQLITE_STATIC); - break; - case 2: - sqlite3_bind_text(stmt, 1, json_vecs[2], -1, SQLITE_STATIC); - break; - case 3: /* blob — triggers noop cleanup */ - sqlite3_bind_blob(stmt, 1, blob_vec, sizeof(blob_vec), SQLITE_STATIC); - break; - } - - /* Bind arg2 as fuzz blob (most likely to fail parsing for non-4-aligned sizes) */ - sqlite3_bind_blob(stmt, 2, payload, payload_size, SQLITE_STATIC); - sqlite3_step(stmt); - sqlite3_finalize(stmt); - stmt = NULL; - - /* --- Test 2: same but arg2 as fuzz text --- */ - rc = sqlite3_prepare_v2(db, queries[qIdx], -1, &stmt, NULL); - if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } - - switch (arg1_mode) { - case 0: - sqlite3_bind_text(stmt, 1, json_vecs[0], -1, SQLITE_STATIC); - break; - case 1: - sqlite3_bind_text(stmt, 1, json_vecs[1], -1, SQLITE_STATIC); - break; - case 2: - sqlite3_bind_text(stmt, 1, json_vecs[2], -1, SQLITE_STATIC); - break; - case 3: - sqlite3_bind_blob(stmt, 1, blob_vec, sizeof(blob_vec), SQLITE_STATIC); - break; - } - - sqlite3_bind_text(stmt, 2, (const char *)payload, payload_size, SQLITE_STATIC); - sqlite3_step(stmt); - sqlite3_finalize(stmt); - stmt = NULL; - - /* --- Test 3: fuzz arg1, valid arg2 --- */ - rc = sqlite3_prepare_v2(db, queries[qIdx], -1, &stmt, NULL); - if (rc != SQLITE_OK) { sqlite3_close(db); return 0; } - - sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC); - sqlite3_bind_text(stmt, 2, json_vecs[0], -1, SQLITE_STATIC); - sqlite3_step(stmt); - sqlite3_finalize(stmt); + /* + * Phase 4: Single-arg functions — fuzz as blob and text. + * Exercises: parse failure paths in vec_f32, vec_int8, vec_bit, etc. 
+ * Also exercises normal cleanup when fuzz data happens to be valid. + */ + run_query(db, one_arg[qIdx1], 0, 0, + payload, payload_size, /*fuzz_arg=*/1, /*as_text=*/0); + run_query(db, one_arg[qIdx1], 0, 0, + payload, payload_size, /*fuzz_arg=*/1, /*as_text=*/1); sqlite3_close(db); return 0;