mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-07-03 02:51:04 +02:00
iss-merge-rowid-overlap-corrupts-filtered-reads / lance#7444: an
update-style merge_insert over a merge-written fragment legally reuses the
updated rows' stable row ids (row-id-lineage spec: updates preserve
_rowid) while the superseded fragment keeps its full sequence plus a
deletion vector. A later delete leaves the overlapping id range sparsely
tiled, and lance-table 7.0.0's RowIdIndex::new asserted dense tiling —
failing every filtered read that builds the id→address map ("Wrong range"
debug assert; "all columns in a record batch must have the same length"
or a silently-wrong batch in release).
The upstream fix (lance#7480, merged 2026-07-01) landed hours AFTER
v8.0.0 was cut, so no release ≤ 8.0.0 carries it. Consume it now as a
vendored pin: vendor/lance-table is the pristine published 7.0.0 source
plus ONLY the #7480 rowids/index.rs hunk (drop the false tiling assert;
hard-error on the true invariant — one live id claimed by two fragments)
and upstream's regression unit test, wired via [patch.crates-io]. The fix
is read-side only, so already-written graphs become readable as-is — no
data repair.
Removal condition (see vendor/lance-table/README.omnigraph.md): drop the
vendor dir + patch entry at the first Lance bump whose lance-table ships
lance#7480 (9.0.0, or a backported 8.0.1). The surface guard
filtered_scan_tolerates_merge_update_row_id_overlap keeps that honest in
both directions.
Turns the previous commit's red tests green. Full workspace gate passes
(cargo test --workspace --locked --no-fail-fast, 68 suites).
354 lines
12 KiB
Protocol Buffer
354 lines
12 KiB
Protocol Buffer
// SPDX-License-Identifier: Apache-2.0
|
|
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
|
|
|
syntax = "proto3";
|
|
|
|
import "file.proto";
|
|
import "table.proto";
|
|
import "google/protobuf/any.proto";
|
|
|
|
package lance.table;
|
|
|
|
// A transaction represents the changes to a dataset.
|
|
//
|
|
// This has two purposes:
|
|
// 1. When retrying a commit, the transaction can be used to re-build an updated
|
|
// manifest.
|
|
// 2. When there's a conflict, this can be used to determine whether the other
|
|
// transaction is compatible with this one.
|
|
message Transaction {
|
|
// The version of the dataset this transaction was built from.
|
|
//
|
|
// For example, for a delete transaction this means the version of the dataset
|
|
// that was read from while evaluating the deletion predicate.
|
|
uint64 read_version = 1;
|
|
|
|
  // The UUID that uniquely identifies a transaction.
|
|
string uuid = 2;
|
|
|
|
// Optional version tag.
|
|
string tag = 3;
|
|
|
|
// Optional properties for the transaction
|
|
// __lance_commit_message is a reserved key
|
|
map<string, string> transaction_properties = 4;
|
|
|
|
// Add new rows to the dataset.
|
|
message Append {
|
|
// The new fragments to append.
|
|
//
|
|
// Fragment IDs are not yet assigned.
|
|
repeated DataFragment fragments = 1;
|
|
}
|
|
|
|
// Mark rows as deleted.
|
|
message Delete {
|
|
// The fragments to update
|
|
//
|
|
// The fragment IDs will match existing fragments in the dataset.
|
|
repeated DataFragment updated_fragments = 1;
|
|
// The fragments to delete entirely.
|
|
repeated uint64 deleted_fragment_ids = 2;
|
|
// The predicate that was evaluated
|
|
//
|
|
// This may be used to determine whether the delete would have affected
|
|
// files written by a concurrent transaction.
|
|
string predicate = 3;
|
|
}
|
|
|
|
// Create or overwrite the entire dataset.
|
|
message Overwrite {
|
|
// The new fragments
|
|
//
|
|
// Fragment IDs are not yet assigned.
|
|
repeated DataFragment fragments = 1;
|
|
// The new schema
|
|
repeated lance.file.Field schema = 2;
|
|
// Schema metadata.
|
|
map<string, bytes> schema_metadata = 3;
|
|
// Key-value pairs to merge with existing config.
|
|
map<string, string> config_upsert_values = 4;
|
|
// The base paths to be added for the initial dataset creation
|
|
repeated BasePath initial_bases = 5;
|
|
}
|
|
|
|
// Add or replace a new secondary index.
|
|
//
|
|
// This is also used to remove an index (we are replacing it with nothing)
|
|
//
|
|
// - new_indices: the modified indices, empty if dropping indices only
|
|
// - removed_indices: the indices that are being replaced
|
|
message CreateIndex {
|
|
repeated IndexMetadata new_indices = 1;
|
|
repeated IndexMetadata removed_indices = 2;
|
|
}
|
|
|
|
// An operation that rewrites but does not change the data in the table. These
|
|
// kinds of operations just rearrange data.
|
|
message Rewrite {
|
|
// The old fragments that are being replaced
|
|
//
|
|
// DEPRECATED: use groups instead.
|
|
//
|
|
// These should all have existing fragment IDs.
|
|
repeated DataFragment old_fragments = 1;
|
|
// The new fragments
|
|
//
|
|
// DEPRECATED: use groups instead.
|
|
//
|
|
// These fragments IDs are not yet assigned.
|
|
repeated DataFragment new_fragments = 2;
|
|
|
|
// During a rewrite an index may be rewritten. We only serialize the UUID
|
|
// since a rewrite should not change the other index parameters.
|
|
message RewrittenIndex {
|
|
// The id of the index that will be replaced
|
|
UUID old_id = 1;
|
|
// the id of the new index
|
|
UUID new_id = 2;
|
|
// the new index details
|
|
google.protobuf.Any new_index_details = 3;
|
|
// the version of the new index
|
|
uint32 new_index_version = 4;
|
|
// Files in the new index with their sizes.
|
|
// Empty if file sizes are not available (e.g. older writers).
|
|
repeated IndexFile new_index_files = 5;
|
|
}
|
|
|
|
// A group of rewrite files that are all part of the same rewrite.
|
|
message RewriteGroup {
|
|
// The old fragment that is being replaced
|
|
//
|
|
// This should have an existing fragment ID.
|
|
repeated DataFragment old_fragments = 1;
|
|
// The new fragment
|
|
//
|
|
// The ID should have been reserved by an earlier
|
|
// reserve operation
|
|
repeated DataFragment new_fragments = 2;
|
|
}
|
|
|
|
// Groups of files that have been rewritten
|
|
repeated RewriteGroup groups = 3;
|
|
// Indices that have been rewritten
|
|
repeated RewrittenIndex rewritten_indices = 4;
|
|
}
|
|
|
|
// An operation that merges in a new column, altering the schema.
|
|
message Merge {
|
|
// The updated fragments
|
|
//
|
|
// These should all have existing fragment IDs.
|
|
repeated DataFragment fragments = 1;
|
|
// The new schema
|
|
repeated lance.file.Field schema = 2;
|
|
// Schema metadata.
|
|
map<string, bytes> schema_metadata = 3;
|
|
}
|
|
|
|
// An operation that projects a subset of columns, altering the schema.
|
|
message Project {
|
|
// The new schema
|
|
repeated lance.file.Field schema = 1;
|
|
}
|
|
|
|
// An operation that restores a dataset to a previous version.
|
|
message Restore {
|
|
// The version to restore to
|
|
uint64 version = 1;
|
|
}
|
|
|
|
// An operation that reserves fragment ids for future use in
|
|
// a rewrite operation.
|
|
message ReserveFragments {
|
|
uint32 num_fragments = 1;
|
|
}
|
|
|
|
// An operation that clones a dataset.
|
|
message Clone {
|
|
// - true: Performs a metadata-only clone (copies manifest without data files).
|
|
// The cloned dataset references original data through `base_paths`,
|
|
// suitable for experimental scenarios or rapid metadata migration.
|
|
// - false: Performs a full deep clone using the underlying object storage's native
|
|
// copy API (e.g., S3 CopyObject, GCS rewrite). This leverages server-side
|
|
// bulk copy operations to bypass download/upload bottlenecks, achieving
|
|
// near-linear speedup for large datasets (typically 3-10x faster than
|
|
// manual file transfers). The operation maintains atomicity and data
|
|
// integrity guarantees provided by the storage backend.
|
|
bool is_shallow = 1;
|
|
// the reference name in the source dataset
|
|
// in most cases it should be the branch or tag name in the source dataset
|
|
optional string ref_name = 2;
|
|
// the version of the source dataset for cloning
|
|
uint64 ref_version = 3;
|
|
// the absolute base path of the source dataset for cloning
|
|
string ref_path = 4;
|
|
// if the target dataset is a branch, this is the branch name of the target dataset
|
|
optional string branch_name = 5;
|
|
}
|
|
|
|
// Exact set of key hashes for conflict detection.
|
|
// Used when the number of inserted rows is small.
|
|
message ExactKeySetFilter {
|
|
// 64-bit hashes of the inserted row keys.
|
|
repeated uint64 key_hashes = 1;
|
|
}
|
|
|
|
// Bloom filter for key existence tests.
|
|
// Used when the number of rows is large.
|
|
message BloomFilter {
|
|
// Bitset backing the bloom filter (SBBF format).
|
|
bytes bitmap = 1;
|
|
// Number of bits in the bitmap.
|
|
uint32 num_bits = 2;
|
|
// Number of items the filter was sized for.
|
|
// Used for intersection validation (filters with different sizes cannot be compared).
|
|
// Default: 8192
|
|
uint64 number_of_items = 3;
|
|
// False positive probability the filter was sized for.
|
|
// Used for intersection validation (filters with different parameters cannot be compared).
|
|
// Default: 0.00057
|
|
double probability = 4;
|
|
}
|
|
|
|
// A filter for checking key existence in set of rows inserted by a merge insert operation.
|
|
// Only created when the merge insert's ON columns match the schema's unenforced primary key.
|
|
// The presence of this filter indicates strict primary key conflict detection should be used.
|
|
// Can use either an exact set (for small row counts) or a Bloom filter (for large row counts).
|
|
message KeyExistenceFilter {
|
|
// Field IDs of columns participating in the key (must match unenforced primary key).
|
|
repeated int32 field_ids = 1;
|
|
// The underlying data structure storing the key hashes.
|
|
oneof data {
|
|
// Exact set of key hashes (used for small number of rows).
|
|
ExactKeySetFilter exact = 2;
|
|
// Bloom filter (used for large number of rows).
|
|
BloomFilter bloom = 3;
|
|
}
|
|
}
|
|
|
|
// Serialized as sorted distinct local physical row offsets within the fragment (0-based).
|
|
message UInt32List {
|
|
repeated uint32 values = 1;
|
|
}
|
|
|
|
// An operation that updates rows but does not add or remove rows.
|
|
message Update {
|
|
// The fragments that have been removed. These are fragments where all rows
|
|
// have been updated and moved to a new fragment.
|
|
repeated uint64 removed_fragment_ids = 1;
|
|
// The fragments that have been updated.
|
|
repeated DataFragment updated_fragments = 2;
|
|
// The new fragments where updated rows have been moved to.
|
|
repeated DataFragment new_fragments = 3;
|
|
// The ids of the fields that have been modified.
|
|
repeated uint32 fields_modified = 4;
|
|
/// List of MemWAL shard generations to mark as merged after this transaction
|
|
repeated MergedGeneration merged_generations = 5;
|
|
/// The fields that used to judge whether to preserve the new frag's id into
|
|
/// the frag bitmap of the specified indices.
|
|
repeated uint32 fields_for_preserving_frag_bitmap = 6;
|
|
// The mode of update
|
|
UpdateMode update_mode = 7;
|
|
// Filter for checking existence of keys in newly inserted rows, used for conflict detection.
|
|
// Only tracks keys from INSERT operations during merge insert, not updates.
|
|
optional KeyExistenceFilter inserted_rows = 8;
|
|
// Per-fragment physical row offsets that matched an update_columns hash join (RewriteColumns).
|
|
map<uint64, UInt32List> updated_fragment_offsets = 9;
|
|
}
|
|
|
|
// The mode of update operation
|
|
enum UpdateMode {
|
|
|
|
/// rows are deleted in current fragments and rewritten in new fragments.
|
|
/// This is most optimal when the majority of columns are being rewritten
|
|
/// or only a few rows are being updated.
|
|
REWRITE_ROWS = 0;
|
|
|
|
/// within each fragment, columns are fully rewritten and inserted as new data files.
|
|
/// Old versions of columns are tombstoned. This is most optimal when most rows are affected
|
|
/// but a small subset of columns are affected.
|
|
REWRITE_COLUMNS = 1;
|
|
}
|
|
|
|
// An entry for a map update. If value is not set, the key will be removed from the map.
|
|
message UpdateMapEntry {
|
|
// The key of the map entry to update.
|
|
string key = 1;
|
|
// The value to set for the key.
|
|
optional string value = 2;
|
|
}
|
|
|
|
message UpdateMap {
|
|
repeated UpdateMapEntry update_entries = 1;
|
|
// If true, the map will be replaced entirely with the new entries.
|
|
// If false, the new entries will be merged with the existing map.
|
|
bool replace = 2;
|
|
}
|
|
|
|
// An operation that updates the table config, table metadata, schema metadata,
|
|
// or field metadata.
|
|
message UpdateConfig {
|
|
UpdateMap config_updates = 6;
|
|
UpdateMap table_metadata_updates = 7;
|
|
UpdateMap schema_metadata_updates = 8;
|
|
map<int32, UpdateMap> field_metadata_updates = 9;
|
|
|
|
// Deprecated -------------------------------
|
|
map<string, string> upsert_values = 1;
|
|
repeated string delete_keys = 2;
|
|
map<string, string> schema_metadata = 3;
|
|
map<uint32, FieldMetadataUpdate> field_metadata = 4;
|
|
|
|
message FieldMetadataUpdate {
|
|
map<string, string> metadata = 5;
|
|
}
|
|
}
|
|
|
|
message DataReplacementGroup {
|
|
uint64 fragment_id = 1;
|
|
DataFile new_file = 2;
|
|
}
|
|
|
|
// An operation that replaces the data in a region of the table with new data.
|
|
message DataReplacement {
|
|
repeated DataReplacementGroup replacements = 1;
|
|
}
|
|
|
|
// Update the merged generations in MemWAL index.
|
|
// This operation is used during merge-insert to atomically record which
|
|
// generations have been merged to the base table.
|
|
message UpdateMemWalState {
|
|
// Shards and generations being marked as merged.
|
|
repeated MergedGeneration merged_generations = 1;
|
|
}
|
|
|
|
// An operation that updates base paths in the dataset.
|
|
message UpdateBases {
|
|
// The new base paths to add to the manifest.
|
|
repeated BasePath new_bases = 1;
|
|
}
|
|
|
|
// The operation of this transaction.
|
|
oneof operation {
|
|
Append append = 100;
|
|
Delete delete = 101;
|
|
Overwrite overwrite = 102;
|
|
CreateIndex create_index = 103;
|
|
Rewrite rewrite = 104;
|
|
Merge merge = 105;
|
|
Restore restore = 106;
|
|
ReserveFragments reserve_fragments = 107;
|
|
Update update = 108;
|
|
Project project = 109;
|
|
UpdateConfig update_config = 110;
|
|
DataReplacement data_replacement = 111;
|
|
UpdateMemWalState update_mem_wal_state = 112;
|
|
Clone clone = 113;
|
|
UpdateBases update_bases = 114;
|
|
}
|
|
|
|
// Fields 200/202 (`blob_append` / `blob_overwrite`) previously represented blob dataset ops.
|
|
reserved 200, 202;
|
|
reserved "blob_append", "blob_overwrite";
|
|
}
|