mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-07-03 02:51:04 +02:00
iss-merge-rowid-overlap-corrupts-filtered-reads / lance#7444: an
update-style merge_insert over a merge-written fragment legally reuses the
updated rows' stable row ids (row-id-lineage spec: updates preserve
_rowid) while the superseded fragment keeps its full sequence plus a
deletion vector. A later delete leaves the overlapping id range sparsely
tiled, and lance-table 7.0.0's RowIdIndex::new asserted dense tiling —
failing every filtered read that builds the id→address map ("Wrong range"
debug assert; "all columns in a record batch must have the same length"
or a silently-wrong batch in release).
The upstream fix (lance#7480, merged 2026-07-01) landed hours AFTER
v8.0.0 was cut, so no release ≤ 8.0.0 carries it. Consume it now as a
vendored pin: vendor/lance-table is the pristine published 7.0.0 source
plus ONLY the #7480 rowids/index.rs hunk (drop the false tiling assert;
hard-error on the true invariant — one live id claimed by two fragments)
and upstream's regression unit test, wired via [patch.crates-io]. The fix
is read-side only, so already-written graphs become readable as-is — no
data repair.
Removal condition (see vendor/lance-table/README.omnigraph.md): drop the
vendor dir + patch entry at the first Lance bump whose lance-table ships
lance#7480 (9.0.0, or a backported 8.0.1). The surface guard
filtered_scan_tolerates_merge_update_row_id_overlap keeps that honest in
both directions.
Turns the previous commit's red tests green. Full workspace gate passes
(cargo test --workspace --locked --no-fail-fast, 68 suites).
55 lines
1.7 KiB
Protocol Buffer
55 lines
1.7 KiB
Protocol Buffer
// SPDX-License-Identifier: Apache-2.0
|
|
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
|
|
|
syntax = "proto3";
|
|
|
|
package lance.pb;
|
|
|
|
import "table_identifier.proto";
|
|
import "table.proto";
|
|
import "index.proto";
|
|
|
|
// Serialized vector query parameters.
//
// Fields declared `optional` carry explicit presence, so a decoder can tell
// "not set" apart from a zero value. `k` and `use_index` have implicit
// presence and read back as 0 / false when omitted by the writer.
message VectorQueryProto {
  // Query vector as Arrow IPC bytes (supports Float16, Float32, Float64,
  // UInt8, etc.)
  bytes query_vector_arrow_ipc = 1;

  // NOTE(review): presumably the name of the vector column being searched —
  // confirm against the encoder.
  string column = 2;

  // NOTE(review): presumably the number of nearest neighbours requested —
  // confirm against the encoder.
  uint32 k = 3;

  // NOTE(review): lower_bound / upper_bound look like an optional distance
  // range for results; inclusivity and units are not visible here — confirm.
  optional float lower_bound = 4;
  optional float upper_bound = 5;

  // NOTE(review): presumably the minimum / maximum number of IVF partitions
  // to probe — confirm against the search implementation.
  optional uint32 minimum_nprobes = 6;
  optional uint32 maximum_nprobes = 7;

  // NOTE(review): looks like a graph-search candidate-list size (HNSW-style
  // "ef") — confirm.
  optional uint32 ef = 8;

  // NOTE(review): presumably a multiplier controlling how many extra
  // candidates are fetched for re-ranking — confirm.
  optional uint32 refine_factor = 9;

  // Distance metric type. Absent means None (use the index's default metric).
  optional lance.index.pb.VectorMetricType metric_type = 10;

  // NOTE(review): presumably false forces a scan that bypasses the vector
  // index. Because this field has implicit presence, an omitted value decodes
  // as false — confirm writers always set it.
  bool use_index = 11;

  // NOTE(review): meaning is not evident from this file; the name suggests a
  // query-to-centroid distance — confirm.
  optional float dist_q_c = 12;

  // NOTE(review): signed on the wire; whether zero or negative values have a
  // special meaning is not visible here — confirm.
  optional int32 query_parallelism = 13;
}
|
|
|
|
// Serializable form of ANNIvfSubIndexExec — the IVF sub-index search node.
//
// The prefilter child ExecutionPlan is serialized by DataFusion's codec
// automatically via children() / with_new_children(). The prefilter_type
// field tells the decoder which PreFilterSource variant to use when
// reconstructing from the deserialized child inputs.
message ANNIvfSubIndexExecProto {
  // Selects the PreFilterSource variant the decoder rebuilds.
  //
  // NOTE(review): the zero value NONE is also what an unset prefilter_type
  // decodes to, so "no prefilter" and "field omitted" are indistinguishable.
  // The value names are unprefixed; renaming them would break generated code,
  // so they are left as-is.
  enum PreFilterType {
    // NOTE(review): presumably "no prefilter child" — confirm.
    NONE = 0;
    // NOTE(review): presumably the child plan yields the allowed row ids —
    // confirm.
    FILTERED_ROW_IDS = 1;
    // NOTE(review): presumably the child plan is a scalar-index query —
    // confirm.
    SCALAR_INDEX_QUERY = 2;
  }

  // Vector query parameters for this node.
  VectorQueryProto query = 1;

  // Identifier of the table the node operates on.
  // NOTE(review): type is expected to come from table_identifier.proto —
  // confirm.
  lance.datafusion.TableIdentifier table = 2;

  // Index metadata entries for the node.
  // NOTE(review): type is expected to come from table.proto; whether order
  // matters is not visible here — confirm.
  repeated lance.table.IndexMetadata indices = 3;

  // Which PreFilterSource variant to reconstruct from the child inputs.
  PreFilterType prefilter_type = 4;
}
|
|
|
|
// Serializable form of ANNIvfPartitionExec — the IVF centroid routing node.
message ANNIvfPartitionExecProto {
  // Vector query parameters for this node.
  VectorQueryProto query = 1;

  // Identifier of the table the node operates on.
  // NOTE(review): type is expected to come from table_identifier.proto —
  // confirm.
  lance.datafusion.TableIdentifier table = 2;

  // UUIDs of the indices involved, carried as strings.
  // NOTE(review): the expected string format (e.g. hyphenated UUID) is not
  // visible here — confirm against the encoder.
  repeated string index_uuids = 3;
}
|