Add NULL checks after sqlite3_column_blob in rescore and DiskANN

sqlite3_column_blob() returns NULL for zero-length blobs or on OOM.
Several call sites in rescore KNN and DiskANN node/vector read passed
the result directly to memcpy without checking, risking NULL deref on
corrupt or empty databases. IVF already had proper NULL checks.

Adds corruption regression tests that truncate shadow table blobs and
verify the query errors cleanly instead of crashing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alex Garcia 2026-03-31 14:31:49 -07:00
parent 9df59b4c03
commit 82f4eb08bf
4 changed files with 76 additions and 4 deletions

View file

@ -410,9 +410,18 @@ static int diskann_node_read(vec0_vtab *p, int vec_col_idx, i64 rowid,
return SQLITE_NOMEM; return SQLITE_NOMEM;
} }
memcpy(v, sqlite3_column_blob(stmt, 0), vs); const void *blobV = sqlite3_column_blob(stmt, 0);
memcpy(ids, sqlite3_column_blob(stmt, 1), is); const void *blobIds = sqlite3_column_blob(stmt, 1);
memcpy(qv, sqlite3_column_blob(stmt, 2), qs); const void *blobQv = sqlite3_column_blob(stmt, 2);
if (!blobV || !blobIds || !blobQv) {
sqlite3_free(v);
sqlite3_free(ids);
sqlite3_free(qv);
return SQLITE_ERROR;
}
memcpy(v, blobV, vs);
memcpy(ids, blobIds, is);
memcpy(qv, blobQv, qs);
*outValidity = v; *outValiditySize = vs; *outValidity = v; *outValiditySize = vs;
*outNeighborIds = ids; *outNeighborIdsSize = is; *outNeighborIds = ids; *outNeighborIdsSize = is;
@ -480,9 +489,11 @@ static int diskann_vector_read(vec0_vtab *p, int vec_col_idx, i64 rowid,
} }
int sz = sqlite3_column_bytes(stmt, 0); int sz = sqlite3_column_bytes(stmt, 0);
const void *blob = sqlite3_column_blob(stmt, 0);
if (!blob || sz == 0) return SQLITE_ERROR;
void *vec = sqlite3_malloc(sz); void *vec = sqlite3_malloc(sz);
if (!vec) return SQLITE_NOMEM; if (!vec) return SQLITE_NOMEM;
memcpy(vec, sqlite3_column_blob(stmt, 0), sz); memcpy(vec, blob, sz);
*outVector = vec; *outVector = vec;
*outVectorSize = sz; *outVectorSize = sz;
@ -1325,6 +1336,7 @@ static int diskann_flush_buffer(vec0_vtab *p, int vec_col_idx) {
while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) { while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) {
i64 rowid = sqlite3_column_int64(stmt, 0); i64 rowid = sqlite3_column_int64(stmt, 0);
const void *vector = sqlite3_column_blob(stmt, 1); const void *vector = sqlite3_column_blob(stmt, 1);
if (!vector) continue;
// Note: vector is already written to _vectors table, so // Note: vector is already written to _vectors table, so
// diskann_insert_graph will skip re-writing it (vector already exists). // diskann_insert_graph will skip re-writing it (vector already exists).
// We call the graph-only insert path. // We call the graph-only insert path.

View file

@ -426,6 +426,10 @@ static int rescore_knn(vec0_vtab *p, vec0_cursor *pCur,
unsigned char *chunkValidity = unsigned char *chunkValidity =
(unsigned char *)sqlite3_column_blob(stmtChunks, 1); (unsigned char *)sqlite3_column_blob(stmtChunks, 1);
i64 *chunkRowids = (i64 *)sqlite3_column_blob(stmtChunks, 2); i64 *chunkRowids = (i64 *)sqlite3_column_blob(stmtChunks, 2);
if (!chunkValidity || !chunkRowids) {
rc = SQLITE_ERROR;
goto cleanup;
}
memset(chunk_distances, 0, p->chunk_size * sizeof(f32)); memset(chunk_distances, 0, p->chunk_size * sizeof(f32));
memset(chunk_topk_idxs, 0, k_oversample * sizeof(i32)); memset(chunk_topk_idxs, 0, k_oversample * sizeof(i32));

View file

@ -1149,3 +1149,30 @@ def test_diskann_large_batch_insert_500(db):
distances = [r[1] for r in rows] distances = [r[1] for r in rows]
for i in range(len(distances) - 1): for i in range(len(distances) - 1):
assert distances[i] <= distances[i + 1] assert distances[i] <= distances[i + 1]
def test_corrupt_truncated_node_blob(db):
    """KNN should error (not crash) when a DiskANN node blob is truncated.

    Regression test for the NULL-deref fix in diskann_node_read():
    sqlite3_column_blob() returns NULL for zero-length blobs or on OOM, and
    the result was previously passed straight to memcpy. A truncated
    neighbor_ids blob must now produce a clean SQLite error (or, at worst,
    wrong results) — never a crash.
    """
    db.execute("""
        CREATE VIRTUAL TABLE t USING vec0(
          emb float[8] INDEXED BY diskann(neighbor_quantizer=binary, n_neighbors=8)
        )
    """)
    # Insert a handful of distinct unit vectors so the graph has real nodes.
    for i in range(5):
        vec = [0.0] * 8
        vec[i % 8] = 1.0
        db.execute("INSERT INTO t(rowid, emb) VALUES (?, ?)", [i + 1, _f32(vec)])
    # Corrupt a DiskANN node: truncate neighbor_ids to 1 byte (wrong size)
    db.execute(
        "UPDATE t_diskann_nodes00 SET neighbor_ids = x'00' WHERE rowid = 1"
    )
    # Should not crash — may return wrong results or error
    try:
        db.execute(
            "SELECT rowid FROM t WHERE emb MATCH ? AND k=3",
            [_f32([1, 0, 0, 0, 0, 0, 0, 0])],
        ).fetchall()
    except sqlite3.DatabaseError:
        # Any clean SQLite error is acceptable — crash is not. DatabaseError
        # is the base class, so this also covers OperationalError while
        # accepting other corruption-class errors ("malformed image", etc.).
        pass

View file

@ -566,3 +566,32 @@ def test_multiple_vector_columns(db):
[float_vec([1.0] * 8)], [float_vec([1.0] * 8)],
).fetchall() ).fetchall()
assert rows[0]["rowid"] == 2 assert rows[0]["rowid"] == 2
def test_corrupt_zeroblob_validity(db):
    """KNN should error (not crash) when rescore chunk rowids blob is zeroed out.

    Regression test for the NULL checks added in rescore_knn(): the chunk
    validity/rowids blobs returned by sqlite3_column_blob() were previously
    used without a NULL check. A truncated rowids blob must now produce a
    clean SQLite error (or, at worst, wrong results) — never a crash.
    """
    db.execute(
        "CREATE VIRTUAL TABLE t USING vec0("
        " embedding float[8] indexed by rescore(quantizer=bit)"
        ")"
    )
    db.execute(
        "INSERT INTO t(rowid, embedding) VALUES (1, ?)",
        [float_vec([1, 0, 0, 0, 0, 0, 0, 0])],
    )
    db.execute(
        "INSERT INTO t(rowid, embedding) VALUES (2, ?)",
        [float_vec([0, 1, 0, 0, 0, 0, 0, 0])],
    )
    # Corrupt: replace rowids with a truncated blob (wrong size)
    db.execute("UPDATE t_chunks SET rowids = x'00'")
    # Should not crash — may return wrong results or error
    try:
        # Result intentionally discarded; only crash-freedom is under test.
        db.execute(
            "SELECT rowid FROM t WHERE embedding MATCH ? ORDER BY distance LIMIT 1",
            [float_vec([1, 0, 0, 0, 0, 0, 0, 0])],
        ).fetchall()
    except sqlite3.DatabaseError:
        # Any clean SQLite error is acceptable — crash is not. DatabaseError
        # is the base class of OperationalError, so this is strictly more
        # permissive about *which* clean error the engine reports.
        pass