mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-07-03 02:51:04 +02:00
iss-merge-rowid-overlap-corrupts-filtered-reads / lance#7444: an
update-style merge_insert over a merge-written fragment legally reuses the
updated rows' stable row ids (row-id-lineage spec: updates preserve
_rowid) while the superseded fragment keeps its full sequence plus a
deletion vector. A later delete leaves the overlapping id range sparsely
tiled, and lance-table 7.0.0's RowIdIndex::new asserted dense tiling —
failing every filtered read that builds the id→address map ("Wrong range"
debug assert; "all columns in a record batch must have the same length"
or a silently-wrong batch in release).
The upstream fix (lance#7480, merged 2026-07-01) landed hours AFTER
v8.0.0 was cut, so no release ≤ 8.0.0 carries it. Consume it now as a
vendored pin: vendor/lance-table is the pristine published 7.0.0 source
plus ONLY the #7480 rowids/index.rs hunk (drop the false tiling assert;
hard-error on the true invariant — one live id claimed by two fragments)
and upstream's regression unit test, wired via [patch.crates-io]. The fix
is read-side only, so already-written graphs become readable as-is — no
data repair.
Removal condition (see vendor/lance-table/README.omnigraph.md): drop the
vendor dir + patch entry at the first Lance bump whose lance-table ships
lance#7480 (9.0.0, or a backported 8.0.1). The surface guard
filtered_scan_tolerates_merge_update_row_id_overlap keeps that honest in
both directions.
Turns the previous commit's red tests green. Full workspace gate passes
(cargo test --workspace --locked --no-fail-fast, 68 suites).
113 lines
4 KiB
Protocol Buffer
113 lines
4 KiB
Protocol Buffer
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

syntax = "proto3";

package lance.table;

// TODO: what would it take to store this in a LanceV2 file?
// Or would flatbuffers be better for this?
|
|
|
|
/// A sequence of row IDs. This is split up into one or more segments,
/// each of which can be encoded in different ways. The encodings are optimized
/// for values that are sorted, which will often be the case with row ids.
/// They also have optimized forms depending on how sparse the values are.
message RowIdSequence {
  /// The segments that together make up the sequence. Each segment picks its
  /// own encoding (see U64Segment).
  repeated U64Segment segments = 1;
}
|
|
|
|
/// Different ways to encode a sequence of u64 values.
///
/// Exactly one member of the `segment` oneof is expected to be set; the
/// nested messages below exist only as payloads for that oneof.
message U64Segment {
  /// A range of u64 values.
  message Range {
    /// The start of the range, inclusive.
    uint64 start = 1;
    /// The end of the range, exclusive.
    uint64 end = 2;
  }

  /// A range of u64 values with holes.
  message RangeWithHoles {
    /// The start of the range, inclusive.
    uint64 start = 1;
    /// The end of the range, exclusive.
    uint64 end = 2;
    /// The holes in the range, as a sorted array of values;
    /// Binary search can be used to check whether a value is a hole and should
    /// be skipped. This can also be used to count the number of holes before a
    /// given value, if you need to find the logical offset of a value in the
    /// segment.
    EncodedU64Array holes = 3;
  }

  /// A range of u64 values with a bitmap.
  message RangeWithBitmap {
    /// The start of the range, inclusive.
    uint64 start = 1;
    /// The end of the range, exclusive.
    uint64 end = 2;
    /// A bitmap of the values in the range. The bitmap is a sequence of bytes,
    /// where each byte represents 8 values. The first byte represents values
    /// start to start + 7, the second byte represents values start + 8 to
    /// start + 15, and so on. The most significant bit of each byte represents
    /// the first value in the range, and the least significant bit represents
    /// the last value in the range. If the bit is set, the value is in the
    /// range; if it is not set, the value is not in the range.
    bytes bitmap = 3;
  }

  oneof segment {
    /// When the values are sorted and contiguous.
    Range range = 1;
    /// When the values are sorted but have a few gaps.
    RangeWithHoles range_with_holes = 2;
    /// When the values are sorted but have many gaps.
    RangeWithBitmap range_with_bitmap = 3;
    /// When the values are sorted but are sparse.
    EncodedU64Array sorted_array = 4;
    /// A general array of values, which is not sorted.
    EncodedU64Array array = 5;
  }
} // U64Segment
|
|
|
|
/// A basic bitpacked array of u64 values.
///
/// The `array` oneof selects the narrowest element width used to store the
/// values; the nested messages below are the payloads for that oneof.
message EncodedU64Array {
  message U16Array {
    /// Base value the stored deltas are relative to.
    /// NOTE(review): presumably each decoded value is base + delta; confirm
    /// against the decoder.
    uint64 base = 1;
    /// The deltas are stored as 16-bit unsigned integers.
    /// (protobuf doesn't support 16-bit integers, so we use bytes instead)
    /// NOTE(review): the byte order of the packed integers is not stated
    /// here; confirm against the encoder.
    bytes offsets = 2;
  }

  message U32Array {
    /// Base value the stored deltas are relative to.
    /// NOTE(review): presumably each decoded value is base + delta; confirm
    /// against the decoder.
    uint64 base = 1;
    /// The deltas are stored as 32-bit unsigned integers.
    /// (we use bytes instead of uint32 to avoid overhead of varint encoding)
    bytes offsets = 2;
  }

  message U64Array {
    /// (We use bytes instead of uint64 to avoid overhead of varint encoding)
    /// This field is numbered 2 and no field 1 is declared in this message;
    /// the number is part of the wire format and must not be changed.
    bytes values = 2;
  }

  oneof array {
    U16Array u16_array = 1;
    U32Array u32_array = 2;
    U64Array u64_array = 3;
  }
}
|
|
|
|
/// A sequence of dataset versions. Similar to RowIdSequence but tracks
/// version runs. It uses RLE (Run-Length Encoding) to efficiently
/// represent consecutive rows with the same version.
message RowDatasetVersionSequence {
  /// The runs making up the sequence; each run covers consecutive rows that
  /// share one version (see RowDatasetVersionRun).
  repeated RowDatasetVersionRun runs = 1;
}
|
|
|
|
/// A run of rows with the same version.
message RowDatasetVersionRun {
  /// The number of consecutive rows with the same version.
  /// NOTE(review): the field type is a U64Segment rather than a plain count;
  /// presumably the run length is the number of values the segment encodes.
  /// Confirm against the reader before relying on this.
  U64Segment span = 1;

  /// The dataset version shared by every row in this run.
  uint64 version = 2;
}
|