fix(deps): vendor lance-table 7.0.0 + lance#7480 so merge-updated tables survive filtered reads after deletes

iss-merge-rowid-overlap-corrupts-filtered-reads / lance#7444: an
update-style merge_insert over a merge-written fragment legally reuses the
updated rows' stable row ids (row-id-lineage spec: updates preserve
_rowid) while the superseded fragment keeps its full sequence plus a
deletion vector. A later delete leaves the overlapping id range sparsely
tiled, and lance-table 7.0.0's RowIdIndex::new asserted dense tiling —
failing every filtered read that builds the id→address map ("Wrong range"
debug assert; "all columns in a record batch must have the same length"
or a silently-wrong batch in release).

The upstream fix (lance#7480, merged 2026-07-01) landed hours AFTER
v8.0.0 was cut, so no release ≤ 8.0.0 carries it. Consume it now as a
vendored pin: vendor/lance-table is the pristine published 7.0.0 source
plus ONLY the #7480 rowids/index.rs hunk (drop the false tiling assert;
hard-error on the true invariant — one live id claimed by two fragments)
and upstream's regression unit test, wired via [patch.crates-io]. The fix
is read-side only, so already-written graphs become readable as-is — no
data repair.

Removal condition (see vendor/lance-table/README.omnigraph.md): drop the
vendor dir + patch entry at the first Lance bump whose lance-table ships
lance#7480 (9.0.0, or a backported 8.0.1). The surface guard
filtered_scan_tolerates_merge_update_row_id_overlap keeps that honest in
both directions.

Turns the previous commit's red tests green. Full workspace gate passes
(cargo test --workspace --locked --no-fail-fast, 68 suites).
This commit is contained in:
aaltshuler 2026-07-02 02:17:25 +03:00 committed by Andrew Altshuler
parent 3b564534a2
commit b5c0c6238b
48 changed files with 22203 additions and 2 deletions

2
Cargo.lock generated
View file

@ -4202,8 +4202,6 @@ dependencies = [
[[package]]
name = "lance-table"
version = "7.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b16f1355904aea4ebb04ffc70c58c97901e10bde44452b4b021de4a1f329250d"
dependencies = [
"arrow",
"arrow-array",

View file

@ -1,5 +1,8 @@
[workspace]
resolver = "2"
# The vendored patched crate is a [patch.crates-io] path source, not a
# workspace member (see the patch section at the bottom of this file).
exclude = ["vendor/lance-table"]
members = [
"crates/omnigraph-compiler",
"crates/omnigraph",
@ -86,3 +89,14 @@ opt-level = 2
lto = "thin"
codegen-units = 16
strip = true
# Vendored lance-table 7.0.0 carrying ONLY the lance#7480 hunk (rowids/index.rs):
# tolerate sparse overlapping stable-row-id chunks so filtered reads survive an
# update-style merge_insert followed by a delete (lance#7444;
# iss-merge-rowid-overlap-corrupts-filtered-reads). Pinned by
# lance_surface_guards.rs::filtered_scan_tolerates_merge_update_row_id_overlap.
# REMOVE vendor/lance-table + this patch at the first Lance bump whose
# lance-table ships lance#7480 (9.0.0, or a backported 8.0.1). Details:
# vendor/lance-table/README.omnigraph.md and docs/dev/lance.md.
[patch.crates-io]
lance-table = { path = "vendor/lance-table" }

View file

@ -0,0 +1,6 @@
{
"git": {
"sha1": "a15ae30939b9242d74b00aed1fb83abf7d15bf7f"
},
"path_in_vcs": "rust/lance-table"
}

5741
vendor/lance-table/Cargo.lock generated vendored Normal file

File diff suppressed because it is too large Load diff

263
vendor/lance-table/Cargo.toml vendored Normal file
View file

@ -0,0 +1,263 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2024"
rust-version = "1.91.0"
name = "lance-table"
version = "7.0.0"
authors = ["Lance Devs <dev@lance.org>"]
build = "build.rs"
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "Utilities for the Lance table format"
readme = "README.md"
keywords = [
"data-format",
"data-science",
"machine-learning",
"apache-arrow",
"data-analytics",
]
categories = [
"database-implementations",
"data-structures",
"development-tools",
"science",
]
license = "Apache-2.0"
repository = "https://github.com/lance-format/lance"
[package.metadata.docs.rs]
features = ["protoc"]
[features]
dynamodb = [
"dep:aws-sdk-dynamodb",
"dep:aws-credential-types",
"lance-io/aws",
]
protoc = ["dep:protobuf-src"]
[lib]
name = "lance_table"
path = "src/lib.rs"
[[bench]]
name = "manifest_intern"
path = "benches/manifest_intern.rs"
harness = false
[[bench]]
name = "row_id_index"
path = "benches/row_id_index.rs"
harness = false
[dependencies.arrow]
version = "58.0.0"
features = ["prettyprint"]
[dependencies.arrow-array]
version = "58.0.0"
[dependencies.arrow-buffer]
version = "58.0.0"
[dependencies.arrow-ipc]
version = "58.0.0"
features = ["zstd"]
[dependencies.arrow-schema]
version = "58.0.0"
[dependencies.async-trait]
version = "0.1"
[dependencies.aws-credential-types]
version = "1.2.0"
optional = true
[dependencies.aws-sdk-dynamodb]
version = "1.38.0"
features = [
"default-https-client",
"rt-tokio",
]
optional = true
default-features = false
[dependencies.byteorder]
version = "1.5"
[dependencies.bytes]
version = "1.11.1"
[dependencies.chrono]
version = "0.4.41"
features = [
"std",
"now",
"serde",
]
default-features = false
[dependencies.deepsize]
version = "0.2.0"
[dependencies.futures]
version = "0.3"
[dependencies.lance-arrow]
version = "=7.0.0"
[dependencies.lance-core]
version = "=7.0.0"
[dependencies.lance-file]
version = "=7.0.0"
[dependencies.lance-io]
version = "=7.0.0"
default-features = false
[dependencies.log]
version = "0.4"
[dependencies.object_store]
version = "0.13.2"
[dependencies.prost]
version = "0.14.1"
[dependencies.prost-types]
version = "0.14.1"
[dependencies.rand]
version = "0.9.1"
features = ["small_rng"]
[dependencies.rangemap]
version = "1.0"
[dependencies.roaring]
version = "0.11"
[dependencies.semver]
version = "1.0"
[dependencies.serde]
version = "^1"
[dependencies.serde_json]
version = "1"
[dependencies.snafu]
version = "0.9"
[dependencies.tokio]
version = "1.23"
features = [
"rt-multi-thread",
"macros",
"fs",
"sync",
]
[dependencies.tracing]
version = "0.1"
[dependencies.url]
version = "2.5.7"
[dependencies.uuid]
version = "1.2"
features = [
"v4",
"serde",
]
[dev-dependencies.arrow-schema]
version = "58.0.0"
[dev-dependencies.criterion]
version = "0.5"
features = [
"async",
"async_tokio",
"html_reports",
]
[dev-dependencies.lance-datagen]
version = "=7.0.0"
[dev-dependencies.pretty_assertions]
version = "1.4.0"
[dev-dependencies.proptest]
version = "1.3.1"
[dev-dependencies.rstest]
version = "0.23.0"
[build-dependencies.prost-build]
version = "0.14.1"
[build-dependencies.protobuf-src]
version = "2.1"
optional = true
[target.'cfg(target_os = "linux")'.dev-dependencies.pprof]
version = "0.14.0"
features = [
"flamegraph",
"criterion",
]
[lints.clippy]
dbg_macro = "deny"
disallowed_macros = "deny"
fallible_impl_from = "deny"
large_futures = "deny"
manual_let_else = "deny"
multiple-crate-versions = "allow"
print_stderr = "deny"
print_stdout = "deny"
redundant_clone = "deny"
redundant_pub_crate = "deny"
single_range_in_vec_init = "allow"
string_add = "deny"
string_add_assign = "deny"
string_lit_as_bytes = "deny"
trait_duplication_in_bounds = "deny"
use_self = "deny"
[lints.clippy.all]
level = "deny"
priority = -1
[lints.clippy.cargo]
level = "deny"
priority = -1
[lints.clippy.style]
level = "deny"
priority = -1
[lints.rust]
unsafe_op_in_unsafe_fn = "allow"
[lints.rust.unexpected_cfgs]
level = "warn"
priority = 0
check-cfg = ["cfg(coverage,coverage_nightly)"]

80
vendor/lance-table/Cargo.toml.orig generated vendored Normal file
View file

@ -0,0 +1,80 @@
[package]
name = "lance-table"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
repository.workspace = true
readme = "README.md"
description = "Utilities for the Lance table format"
keywords.workspace = true
categories.workspace = true
rust-version.workspace = true
[dependencies]
lance-arrow.workspace = true
lance-core.workspace = true
lance-file.workspace = true
lance-io.workspace = true
arrow.workspace = true
arrow-array.workspace = true
arrow-buffer.workspace = true
arrow-ipc.workspace = true
arrow-schema.workspace = true
async-trait.workspace = true
aws-credential-types = { workspace = true, optional = true }
aws-sdk-dynamodb = { workspace = true, optional = true, default-features = false, features = ["default-https-client", "rt-tokio"] }
byteorder.workspace = true
bytes.workspace = true
chrono.workspace = true
deepsize.workspace = true
futures.workspace = true
log.workspace = true
object_store.workspace = true
prost.workspace = true
prost-types.workspace = true
rand.workspace = true
rangemap.workspace = true
roaring.workspace = true
serde.workspace = true
serde_json.workspace = true
semver.workspace = true
snafu.workspace = true
tokio.workspace = true
tracing.workspace = true
url.workspace = true
uuid.workspace = true
[dev-dependencies]
lance-datagen.workspace = true
arrow-schema.workspace = true
criterion.workspace = true
pretty_assertions.workspace = true
proptest.workspace = true
rstest.workspace = true
[target.'cfg(target_os = "linux")'.dev-dependencies]
pprof = { workspace = true }
[build-dependencies]
prost-build.workspace = true
protobuf-src = { version = "2.1", optional = true }
[features]
dynamodb = ["dep:aws-sdk-dynamodb", "dep:aws-credential-types", "lance-io/aws"]
protoc = ["dep:protobuf-src"]
[package.metadata.docs.rs]
# docs.rs uses an older version of Ubuntu that does not have the necessary protoc version
features = ["protoc"]
[[bench]]
name = "row_id_index"
harness = false
[[bench]]
name = "manifest_intern"
harness = false
[lints]
workspace = true

6
vendor/lance-table/README.md vendored Normal file
View file

@ -0,0 +1,6 @@
# lance-table
`lance-table` is an internal sub-crate for the
[Lance table format](https://lance.org/format/table/).
**Important Note**: This crate is **not intended for external usage**.

42
vendor/lance-table/README.omnigraph.md vendored Normal file
View file

@ -0,0 +1,42 @@
# Vendored `lance-table` 7.0.0 + lance#7480 (omnigraph patch pin)
This directory is the **pristine `lance-table` 7.0.0 crates.io source** (unpacked
from the published `.crate`) carrying exactly one upstream fix, cherry-picked
from [lance-format/lance#7480](https://github.com/lance-format/lance/pull/7480)
(merged to Lance main 2026-07-01; not included in any release ≤ 8.0.0):
- `src/rowids/index.rs` — `RowIdIndex::new` no longer asserts that overlapping
row-id chunks densely tile their range (an update-style `merge_insert`
legally reuses the updated rows' stable ids in new fragments while the
superseded fragment keeps its full sequence + a deletion vector; a later
delete leaves the union short of the span). The real invariant — the same
live id claimed by two fragments — is now a hard error in
`merge_overlapping_chunks` instead. Upstream's regression unit test is
included.
Without the fix, any filtered read that builds the row-id index on such a
table fails. In debug builds the "Wrong range" debug assert at
`rowids/index.rs:50` fires; in release builds the read either errors with
"all columns in a record batch must have the same length" or returns a
silently-wrong batch. Bug: [lance#7444](https://github.com/lance-format/lance/issues/7444),
tracked as `iss-merge-rowid-overlap-corrupts-filtered-reads` /
`blk-lance-7444` on the dev graph.
Wired up via `[patch.crates-io] lance-table = { path = "vendor/lance-table" }`
in the workspace root `Cargo.toml`.
## Removal condition
Delete this directory and the `[patch.crates-io]` entry at the **first Lance
bump whose `lance-table` ships lance#7480** — 9.0.0, or a backported 8.0.1 if
upstream cuts one. The runtime guard
`crates/omnigraph/tests/lance_surface_guards.rs::filtered_scan_tolerates_merge_update_row_id_overlap`
pins the fixed behavior: it goes red if the patch is dropped too early or a
future bump regresses the fix.
## Verifying the delta
```bash
# The full diff vs the published crate should be ONLY the #7480 hunk + this README:
tar -xzf ~/.cargo/registry/cache/index.crates.io-*/lance-table-7.0.0.crate -C /tmp
diff -ru /tmp/lance-table-7.0.0 vendor/lance-table
```

View file

@ -0,0 +1,261 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
// Benchmarks use eprintln! to report memory stats alongside criterion output.
#![allow(clippy::print_stderr)]
//! Benchmark for manifest fragment interning.
//!
//! Measures memory savings and deserialization throughput when interning
//! `DataFile.fields`, `DataFile.column_indices`, and
//! `RowDatasetVersionMeta::Inline` bytes across many fragments.
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use deepsize::DeepSizeOf;
use prost::Message;
use lance_table::format::pb;
use lance_table::format::{DataFileFieldInterner, Fragment};
/// Number of fragments each benchmark input should contain.
///
/// Reads the `BENCH_NUM_FRAGMENTS` environment variable and falls back to
/// `100_000` when the variable cannot be read.
///
/// # Panics
///
/// Panics, naming the variable and the offending value, when the variable is
/// set but does not parse as a `u64`.
fn num_fragments() -> u64 {
    match std::env::var("BENCH_NUM_FRAGMENTS") {
        Ok(raw) => raw.parse::<u64>().unwrap_or_else(|err| {
            // Explicit format arguments keep the message formatted in every edition.
            panic!(
                "BENCH_NUM_FRAGMENTS must be an unsigned integer, got {:?}: {}",
                raw, err
            )
        }),
        Err(_) => 100_000,
    }
}
/// Produce `n` protobuf `DataFragment` messages modelling a homogeneous,
/// post-compaction table.
///
/// Every fragment carries the same field ids (`0..num_fields`), the same
/// column indices and the same inline created-at / last-updated-at version
/// payload; only the fragment id and the data file path differ.
fn make_uniform_pb_fragments(n: u64, num_fields: usize) -> Vec<pb::DataFragment> {
    let field_ids: Vec<i32> = (0..num_fields as i32).collect();
    let column_ids: Vec<i32> = field_ids.clone();

    // One small protobuf-encoded version payload, shared by all fragments
    // (identical across fragments post-compaction).
    let shared_versions: Vec<u8> = pb::RowDatasetVersionSequence {
        runs: vec![pb::RowDatasetVersionRun {
            span: Some(pb::U64Segment {
                segment: Some(pb::u64_segment::Segment::Range(pb::u64_segment::Range {
                    start: 0,
                    end: 1000,
                })),
            }),
            version: 42,
        }],
    }
    .encode_to_vec();

    let mut fragments = Vec::with_capacity(n as usize);
    for id in 0..n {
        let data_file = pb::DataFile {
            path: format!("data/{id}.lance"),
            fields: field_ids.clone(),
            column_indices: column_ids.clone(),
            file_major_version: 2,
            file_minor_version: 0,
            file_size_bytes: 0,
            base_id: None,
        };
        fragments.push(pb::DataFragment {
            id,
            files: vec![data_file],
            deletion_file: None,
            row_id_sequence: None,
            physical_rows: 1000,
            last_updated_at_version_sequence: Some(
                pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(
                    shared_versions.clone(),
                ),
            ),
            created_at_version_sequence: Some(
                pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(
                    shared_versions.clone(),
                ),
            ),
        });
    }
    fragments
}
/// Baseline conversion: turn each protobuf fragment into a `Fragment` with
/// `Fragment::try_from`, without any interning.
///
/// Panics if any conversion fails.
fn deserialize_without_interning(protos: &[pb::DataFragment]) -> Vec<Fragment> {
    let mut fragments = Vec::with_capacity(protos.len());
    for proto in protos {
        fragments.push(Fragment::try_from(proto.clone()).unwrap());
    }
    fragments
}
/// Interning conversion: route every protobuf fragment through a single
/// `DataFileFieldInterner` created for this call.
///
/// Panics if interning any fragment fails.
fn deserialize_with_interning(protos: &[pb::DataFragment]) -> Vec<Fragment> {
    let mut interner = DataFileFieldInterner::default();
    let mut fragments = Vec::with_capacity(protos.len());
    for proto in protos {
        fragments.push(interner.intern_fragment(proto.clone()).unwrap());
    }
    fragments
}
/// Build fragments where each group shares the same version metadata,
/// simulating many small appends without compaction.
///
/// `n` fragments are split into `unique_versions` consecutive groups; all
/// fragments of a group carry the same inline version payload, and any
/// remainder fragments fall into the last group.
///
/// `unique_versions == 0` is treated as 1, and a fragment count smaller than
/// `unique_versions` simply yields one fragment per group. Both cases
/// previously panicked (division by zero / underflow).
fn make_diverse_pb_fragments(
    n: u64,
    num_fields: usize,
    unique_versions: u64,
) -> Vec<pb::DataFragment> {
    let fields: Vec<i32> = (0..num_fields as i32).collect();
    let column_indices: Vec<i32> = (0..num_fields as i32).collect();

    // Clamp both divisors to at least 1 so small `BENCH_NUM_FRAGMENTS` values
    // (n < unique_versions) cannot trigger a division by zero below.
    let unique_versions = unique_versions.max(1);
    let group_size = (n / unique_versions).max(1);

    let version_payloads: Vec<Vec<u8>> = (0..unique_versions)
        .map(|v| {
            let seq = pb::RowDatasetVersionSequence {
                runs: vec![pb::RowDatasetVersionRun {
                    span: Some(pb::U64Segment {
                        segment: Some(pb::u64_segment::Segment::Range(pb::u64_segment::Range {
                            start: 0,
                            end: 1000,
                        })),
                    }),
                    version: v,
                }],
            };
            seq.encode_to_vec()
        })
        .collect();

    (0..n)
        .map(|i| {
            // Remainder fragments are clamped into the last group.
            let version_idx = (i / group_size).min(unique_versions - 1) as usize;
            pb::DataFragment {
                id: i,
                files: vec![pb::DataFile {
                    path: format!("data/{i}.lance"),
                    fields: fields.clone(),
                    column_indices: column_indices.clone(),
                    file_major_version: 2,
                    file_minor_version: 0,
                    file_size_bytes: 0,
                    base_id: None,
                }],
                deletion_file: None,
                row_id_sequence: None,
                physical_rows: 1000,
                last_updated_at_version_sequence: Some(
                    pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(
                        version_payloads[version_idx].clone(),
                    ),
                ),
                created_at_version_sequence: Some(
                    pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(
                        version_payloads[version_idx].clone(),
                    ),
                ),
            }
        })
        .collect()
}
/// Criterion group `manifest_intern`: times fragment deserialization with and
/// without interning, first on uniform fragments (10 and 50 fields) and then
/// on fragments with 10, 100 and 500 distinct version payloads.
fn bench_deserialization(c: &mut Criterion) {
    let mut group = c.benchmark_group("manifest_intern");
    let fragment_count = num_fragments();

    // Uniform fragments: vary only the number of fields per data file.
    for num_fields in [10, 50] {
        let encoded = make_uniform_pb_fragments(fragment_count, num_fields);

        let baseline_id = BenchmarkId::new("deserialize_no_intern", num_fields);
        group.bench_with_input(baseline_id, &num_fields, |bencher, _| {
            bencher.iter(|| deserialize_without_interning(&encoded));
        });

        let interned_id = BenchmarkId::new("deserialize_with_intern", num_fields);
        group.bench_with_input(interned_id, &num_fields, |bencher, _| {
            bencher.iter(|| deserialize_with_interning(&encoded));
        });
    }

    // Benchmark with many unique version payloads
    for unique_versions in [10, 100, 500] {
        let encoded = make_diverse_pb_fragments(fragment_count, 10, unique_versions);

        let baseline_id = BenchmarkId::new("deserialize_no_intern_diverse", unique_versions);
        group.bench_with_input(baseline_id, &unique_versions, |bencher, _| {
            bencher.iter(|| deserialize_without_interning(&encoded));
        });

        let interned_id = BenchmarkId::new("deserialize_with_intern_diverse", unique_versions);
        group.bench_with_input(interned_id, &unique_versions, |bencher, _| {
            bencher.iter(|| deserialize_with_interning(&encoded));
        });
    }

    group.finish();
}
/// Criterion group `manifest_memory`: reports the deep size of the
/// deserialized fragments with and without interning on stderr, then times
/// the `deep_size_of` measurement itself on the interned fragments.
///
/// The savings are computed in floating point so that a run where interning
/// does not shrink the data prints a negative saving instead of underflowing
/// an unsigned subtraction.
fn bench_memory(c: &mut Criterion) {
    const BYTES_PER_MB: f64 = 1_048_576.0;

    let mut group = c.benchmark_group("manifest_memory");
    let n = num_fragments();
    for num_fields in [10, 50] {
        let protos = make_uniform_pb_fragments(n, num_fields);
        let no_intern = deserialize_without_interning(&protos);
        let with_intern = deserialize_with_interning(&protos);
        let size_no_intern = no_intern.deep_size_of();
        let size_with_intern = with_intern.deep_size_of();

        // Signed difference: `usize` subtraction would panic (or wrap) if the
        // interned representation were ever the larger one.
        let saved_mb = (size_no_intern as f64 - size_with_intern as f64) / BYTES_PER_MB;
        // Avoid NaN/inf when there is nothing to measure (e.g. zero fragments).
        let saved_percent = if size_no_intern == 0 {
            0.0
        } else {
            (1.0 - size_with_intern as f64 / size_no_intern as f64) * 100.0
        };

        eprintln!(
            "\n[{} fragments, {} fields] Memory without interning: {:.2} MB",
            n,
            num_fields,
            size_no_intern as f64 / BYTES_PER_MB
        );
        eprintln!(
            "[{} fragments, {} fields] Memory with interning: {:.2} MB",
            n,
            num_fields,
            size_with_intern as f64 / BYTES_PER_MB
        );
        eprintln!(
            "[{} fragments, {} fields] Savings: {:.2} MB ({:.1}%)",
            n, num_fields, saved_mb, saved_percent
        );

        // Benchmark deep_size_of measurement itself (sanity check)
        group.bench_with_input(
            BenchmarkId::new("deep_size_of_interned", num_fields),
            &num_fields,
            |b, _| {
                b.iter(|| with_intern.deep_size_of());
            },
        );
        drop(no_intern);
        drop(with_intern);
    }
    group.finish();
}
// Linux builds: register both benchmark functions under a Criterion
// configuration that installs a pprof profiler (constructed with `100` and
// flamegraph output).
#[cfg(target_os = "linux")]
criterion_group!(
name = benches;
config = Criterion::default().with_profiler(pprof::criterion::PProfProfiler::new(100, pprof::criterion::Output::Flamegraph(None)));
targets = bench_deserialization, bench_memory
);
// Every other target: same two benchmark functions with the default
// Criterion configuration.
#[cfg(not(target_os = "linux"))]
criterion_group!(benches, bench_deserialization, bench_memory);
// Generates the benchmark entry point for the `benches` group.
criterion_main!(benches);

View file

@ -0,0 +1,323 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
// TODO:
// - [x] Create base cases with HashMap
// - [x] Create on-disk size measurement
// - [x] Create different cases for the index. Ideal, 25% deletions, 80% deletions + compaction.
// - [ ] Create a benchmark for the get method
// - [x] Average over all valid values
// - [ ] Time to get a value that is not in the index
// - [ ] Create a benchmark for the new method (building the in-memory index)
// Optional:
// - [ ] Create in-memory size measurement (if possible)
// Questions:
// How can I write out the file? Where should I put it?
// How can I take an argument to set the size of the index?
use std::{collections::HashMap, io::Write, ops::Range, sync::Arc};
use arrow_array::{RecordBatch, UInt64Array};
use arrow_schema::{DataType, Field, Schema};
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use lance_core::utils::address::RowAddress;
use lance_core::utils::deletion::DeletionVector;
use lance_io::ReadBatchParams;
use lance_table::rowids::FragmentRowIdIndex;
use lance_table::{
rowids::{RowIdIndex, RowIdSequence, write_row_ids},
utils::stream::{RowIdAndDeletesConfig, apply_row_id_and_deletes},
};
/// Build a `RowIdSequence` covering `row_id_range`, then delete ids from it:
/// every other id starting with the first, capped at `deletions` ids.
fn make_sequence(row_id_range: Range<u64>, deletions: usize) -> RowIdSequence {
    let mut seq = RowIdSequence::from(row_id_range);
    // Positions 0, 2, 4, ... of the sequence, at most `deletions` of them.
    let to_delete: Vec<_> = seq.iter().step_by(2).take(deletions).collect();
    seq.delete(to_delete);
    seq
}
/// Split `num_rows` row ids into `num_frags` consecutive, equally sized
/// ranges and build one row-id sequence per fragment.
///
/// Each fragment gets `num_rows / num_frags` ids (integer division) and has
/// `rows_per_frag * percent_deletion` of them, truncated to an integer,
/// deleted via `make_sequence`. Returns `(fragment id, sequence)` pairs in
/// fragment order.
fn make_frag_sequences(
    num_rows: u64,
    num_frags: u64,
    percent_deletion: f32,
) -> Vec<(u32, Arc<RowIdSequence>)> {
    let rows_per_frag = num_rows / num_frags;
    let deletions_per_frag = (rows_per_frag as f32 * percent_deletion) as usize;

    let mut fragments = Vec::new();
    let mut next_start = 0;
    for frag in 0..num_frags {
        let end = next_start + rows_per_frag;
        let sequence = make_sequence(next_start..end, deletions_per_frag);
        fragments.push((frag as u32, Arc::new(sequence)));
        next_start = end;
    }
    fragments
}
// For range of values
// https://bheisler.github.io/criterion.rs/book/user_guide/benchmarking_with_inputs.html
/// Total number of rows the benchmarks operate on.
///
/// Reads the `BENCH_NUM_ROWS` environment variable and falls back to
/// `1_000_000` when the variable cannot be read.
///
/// # Panics
///
/// Panics, naming the variable and the offending value, when the variable is
/// set but does not parse as a `u64`.
fn num_rows() -> u64 {
    match std::env::var("BENCH_NUM_ROWS") {
        Ok(raw) => raw.parse::<u64>().unwrap_or_else(|err| {
            // Explicit format arguments keep the message formatted in every edition.
            panic!(
                "BENCH_NUM_ROWS must be an unsigned integer, got {:?}: {}",
                raw, err
            )
        }),
        Err(_) => 1_000_000,
    }
}
/// One row of the size report: which structure was measured, at which
/// deletion fraction, and the size recorded for it.
struct SizeStats {
    structure: String,
    percent_deletions: f32,
    size: u64,
}

/// Optional CSV sink for `SizeStats` rows.
///
/// `file` is `Some` only when `BENCH_SIZE_STATS_FILE` could be read when the
/// value was constructed; otherwise rows are discarded.
struct SizeStatsFile {
    file: Option<std::fs::File>,
}

impl SizeStatsFile {
    /// Create the sink. When `BENCH_SIZE_STATS_FILE` is set, the file at that
    /// path is created (truncating any existing one) and the CSV header is
    /// written. Panics if the file cannot be created or written.
    fn new() -> Self {
        let file = match std::env::var("BENCH_SIZE_STATS_FILE") {
            Ok(path) => {
                let mut created = std::fs::File::create(path).unwrap();
                // Header row
                writeln!(created, "structure,percent_deletions,size").unwrap();
                Some(created)
            }
            Err(_) => None,
        };
        Self { file }
    }

    /// Append one CSV row (structure name quoted); a no-op when no file is
    /// configured. Panics if the write fails.
    fn write_row(&mut self, stats: SizeStats) {
        let Some(out) = self.file.as_mut() else {
            return;
        };
        let SizeStats {
            structure,
            percent_deletions,
            size,
        } = stats;
        writeln!(out, "\"{}\",{},{}", structure, percent_deletions, size).unwrap();
    }
}
/// Criterion group `row_id_index_creation`: for deletion fractions 0.0, 0.25
/// and 0.5 (100 fragments each), times building a `RowIdIndex` against
/// building a plain `HashMap` of row id to row address, and records the
/// serialized / flat sizes through `SizeStatsFile`.
fn bench_creation(c: &mut Criterion) {
    let mut group = c.benchmark_group("row_id_index_creation");
    let mut stats_file = SizeStatsFile::new();

    for percent_deletions in [0.0, 0.25, 0.5] {
        let sequences = make_frag_sequences(num_rows(), 100, percent_deletions);

        // Every fragment gets an empty (default) deletion vector.
        let mut fragment_indices: Vec<FragmentRowIdIndex> = Vec::with_capacity(sequences.len());
        for (frag_id, sequence) in &sequences {
            fragment_indices.push(FragmentRowIdIndex {
                fragment_id: *frag_id,
                row_id_sequence: sequence.clone(),
                deletion_vector: Arc::new(DeletionVector::default()),
            });
        }

        group.bench_with_input(
            BenchmarkId::new("BuildIndex", percent_deletions),
            &percent_deletions,
            |bencher, _| {
                bencher.iter(|| {
                    let _index = RowIdIndex::new(&fragment_indices).unwrap();
                });
            },
        );

        // Measure size of index: total length of the serialized sequences.
        let serialized_size = sequences
            .iter()
            .map(|(_frag_id, sequence)| write_row_ids(sequence).len() as u64)
            .sum::<u64>();
        stats_file.write_row(SizeStats {
            structure: "RowIdIndex".to_string(),
            percent_deletions,
            size: serialized_size,
        });

        // TODO: we should compare tombstoned vs compacted. We don't mind the
        // regression in the tombstoned case, but we want to see the improvement
        // in the compacted case.
        // TODO: collect size of sequences when serialized
        // TODO: also show building a BTreeMap and HashMap
        let mut flat_data = Vec::with_capacity(sequences.len());
        for (frag_id, sequence) in &sequences {
            let row_ids = sequence.iter().collect::<Vec<_>>();
            let row_addresses = (0..sequence.len())
                .map(|offset| RowAddress::new_from_parts(*frag_id, offset as u32))
                .map(u64::from)
                .collect::<Vec<_>>();
            flat_data.push((row_ids, row_addresses));
        }

        // Size of flat data is just 16 bytes per row
        let flat_bytes: usize = flat_data.iter().map(|(ids, _addresses)| ids.len() * 16).sum();
        stats_file.write_row(SizeStats {
            structure: "FlatData".to_string(),
            percent_deletions,
            size: flat_bytes as u64,
        });

        group.bench_with_input(
            BenchmarkId::new("BuildHashMap", percent_deletions),
            &percent_deletions,
            |bencher, _| {
                bencher.iter(|| {
                    let mut lookup = HashMap::new();
                    lookup.extend(flat_data.iter().flat_map(|(ids, addresses)| {
                        ids.iter().copied().zip(addresses.iter().copied())
                    }));
                });
            },
        );
    }

    group.finish();
}
/// Criterion group `row_id_index_get_single`: times ONE lookup per iteration
/// in a `RowIdIndex` (`GetIndex`) and in a plain `HashMap` built from the
/// same data (`GetHashMap`), for several deletion fractions.
///
/// Both benchmarks draw their keys from the same strided id generator, so the
/// two reported timings measure the same amount of work and can be compared
/// directly. (Previously `GetHashMap` performed `num_rows()` lookups per
/// iteration while `GetIndex` performed one.) Lookup results are passed
/// through `std::hint::black_box` so the optimizer cannot discard them.
fn bench_get_single(c: &mut Criterion) {
    let mut group = c.benchmark_group("row_id_index_get_single");
    for percent_deletions in [0.0, 0.02, 0.25, 0.5, 0.8] {
        let sequences = make_frag_sequences(num_rows(), 100, percent_deletions);
        let fragment_indices: Vec<FragmentRowIdIndex> = sequences
            .iter()
            .map(|(frag_id, sequence)| FragmentRowIdIndex {
                fragment_id: *frag_id,
                row_id_sequence: sequence.clone(),
                deletion_vector: Arc::new(DeletionVector::default()),
            })
            .collect();
        let index = RowIdIndex::new(&fragment_indices).unwrap();

        // Walk the id space with a fixed stride (wrapping at `total_rows`) so
        // consecutive lookups land on different ids, deleted ones included.
        let mut i = 0;
        let total_rows: u64 = num_rows();
        let mut next_id = || {
            let id = i;
            i += 241861;
            i %= total_rows;
            id
        };

        group.bench_with_input(
            BenchmarkId::new("GetIndex", percent_deletions),
            &percent_deletions,
            |b, _| {
                b.iter(|| {
                    let _ = std::hint::black_box(index.get(next_id()));
                });
            },
        );

        let flat_data = sequences
            .iter()
            .map(|(frag_id, sequence)| {
                let row_ids = sequence.iter().collect::<Vec<_>>();
                let row_addresses = (0..sequence.len())
                    .map(|i| RowAddress::new_from_parts(*frag_id, i as u32))
                    .map(u64::from)
                    .collect::<Vec<_>>();
                (row_ids, row_addresses)
            })
            .collect::<Vec<_>>();
        let index = {
            let mut index = HashMap::new();
            index.extend(flat_data.iter().flat_map(|(ids, addresses)| {
                ids.iter().copied().zip(addresses.iter().copied())
            }));
            index
        };

        group.bench_with_input(
            BenchmarkId::new("GetHashMap", percent_deletions),
            &percent_deletions,
            |b, _| {
                // One lookup per iteration, mirroring `GetIndex` above.
                b.iter(|| {
                    let id = next_id();
                    let _ = std::hint::black_box(index.get(&id));
                });
            },
        );
    }
    group.finish();
}
/// Criterion group `apply_row_id`: times `apply_row_id_and_deletes` on a
/// single-column `UInt64` batch of `num_rows()` values, with only
/// `with_row_id` enabled and no deletion vector or sequences supplied.
fn bench_apply_row_id(c: &mut Criterion) {
    let mut group = c.benchmark_group("apply_row_id");

    // One non-nullable UInt64 column named "value" holding 0..num_rows().
    let schema = Arc::new(Schema::new(vec![Field::new(
        "value",
        DataType::UInt64,
        false,
    )]));
    let values = (0..num_rows()).collect::<Vec<_>>();
    let batch =
        RecordBatch::try_new(schema, vec![Arc::new(UInt64Array::from(values))]).unwrap();

    let config = RowIdAndDeletesConfig {
        params: ReadBatchParams::default(),
        with_row_id: true,
        with_row_addr: false,
        with_row_last_updated_at_version: false,
        with_row_created_at_version: false,
        deletion_vector: None,
        row_id_sequence: None,
        last_updated_at_sequence: None,
        created_at_sequence: None,
        make_deletions_null: false,
        total_num_rows: num_rows() as u32,
    };

    group.bench_function("ApplyRowId", |bencher| {
        let input = batch.clone();
        bencher.iter(|| {
            // The result is deliberately discarded; only the call is timed.
            let _ = apply_row_id_and_deletes(input.clone(), 0, 0, &config);
        });
    });

    group.finish();
}
// Linux builds: register the three benchmark functions under a Criterion
// configuration that installs a pprof profiler (constructed with `100` and
// flamegraph output).
#[cfg(target_os = "linux")]
criterion_group!(
name = benches;
config=Criterion::default().with_profiler(pprof::criterion::PProfProfiler::new(100, pprof::criterion::Output::Flamegraph(None)));
targets=bench_creation, bench_get_single, bench_apply_row_id);
// Every other target: same three benchmark functions with the default
// Criterion configuration.
#[cfg(not(target_os = "linux"))]
criterion_group!(
benches,
bench_creation,
bench_get_single,
bench_apply_row_id
);
// Generates the benchmark entry point for the `benches` group.
criterion_main!(benches);

29
vendor/lance-table/build.rs vendored Normal file
View file

@ -0,0 +1,29 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
use std::io::Result;
/// Build script: compiles the table, transaction and row-id protobuf
/// definitions under `./protos` with `prost-build`.
///
/// References to the `.lance.file` proto package are mapped to
/// `::lance_file::format::pb` instead of being generated again. With the
/// `protoc` feature enabled, `PROTOC` is pointed at the compiler provided by
/// `protobuf-src` before compilation.
fn main() -> Result<()> {
    println!("cargo:rerun-if-changed=protos");

    // Use vendored protobuf compiler if requested.
    // NOTE(review): `set_var` is unsafe here; this presumably relies on the
    // build script not having spawned other threads yet — confirm.
    #[cfg(feature = "protoc")]
    unsafe {
        std::env::set_var("PROTOC", protobuf_src::protoc());
    }

    let proto_files = [
        "./protos/table.proto",
        "./protos/transaction.proto",
        "./protos/rowids.proto",
    ];
    let include_dirs = ["./protos"];

    let mut config = prost_build::Config::new();
    config.extern_path(".lance.file", "::lance_file::format::pb");
    config.protoc_arg("--experimental_allow_proto3_optional");
    config.enable_type_names();
    config.compile_protos(&proto_files, &include_dirs)
}

18
vendor/lance-table/protos/AGENTS.md vendored Normal file
View file

@ -0,0 +1,18 @@
# Protobuf Guidelines
Also see [root AGENTS.md](../AGENTS.md) for cross-language standards.
## Compatibility
- All changes must be backwards compatible. Never re-use or change field numbers of existing fields.
## Schema Design
- Use `optional` when you need to distinguish "not set" from "zero value" — `optional` enables presence tracking (`has_*` methods) and maps to `Option<T>` in Rust. Bare proto3 fields have no presence semantics: they always hold a value (defaulting to zero), so you cannot tell if the sender explicitly set them.
- Use structured message types (e.g., `BasePath`) instead of plain scalars, and scope fields to operation-specific messages (e.g., `InsertTransaction`) rather than generic top-level ones.
- Don't duplicate data across messages — store each fact once and derive relationships. Prefer parallel sequences over maps when keys already exist in another field.
## Documentation
- Document the semantic meaning of both present and absent states for `optional` fields — explain when each case applies.
- Use precise domain terminology in field descriptions — avoid ambiguous abbreviations or terms that collide with domain concepts.

18
vendor/lance-table/protos/CLAUDE.md vendored Normal file
View file

@ -0,0 +1,18 @@
# Protobuf Guidelines
Also see [root AGENTS.md](../AGENTS.md) for cross-language standards.
## Compatibility
- All changes must be backwards compatible. Never re-use or change field numbers of existing fields.
## Schema Design
- Use `optional` when you need to distinguish "not set" from "zero value" — `optional` enables presence tracking (`has_*` methods) and maps to `Option<T>` in Rust. Bare proto3 fields have no presence semantics: they always hold a value (defaulting to zero), so you cannot tell if the sender explicitly set them.
- Use structured message types (e.g., `BasePath`) instead of plain scalars, and scope fields to operation-specific messages (e.g., `InsertTransaction`) rather than generic top-level ones.
- Don't duplicate data across messages — store each fact once and derive relationships. Prefer parallel sequences over maps when keys already exist in another field.
## Documentation
- Document the semantic meaning of both present and absent states for `optional` fields — explain when each case applies.
- Use precise domain terminology in field descriptions — avoid ambiguous abbreviations or terms that collide with domain concepts.

55
vendor/lance-table/protos/ann.proto vendored Normal file
View file

@ -0,0 +1,55 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.pb;
import "table_identifier.proto";
import "table.proto";
import "index.proto";
// Serialized vector query parameters.
//
// Fields declared `optional` track presence, so an unset value can be told
// apart from an explicit zero; the bare fields always hold a value.
message VectorQueryProto {
// Query vector as Arrow IPC bytes (supports Float16, Float32, Float64, UInt8, etc.)
bytes query_vector_arrow_ipc = 1;
// NOTE(review): presumably the name of the column being searched — confirm.
string column = 2;
// NOTE(review): presumably the number of results requested — confirm.
uint32 k = 3;
optional float lower_bound = 4;
optional float upper_bound = 5;
optional uint32 minimum_nprobes = 6;
optional uint32 maximum_nprobes = 7;
optional uint32 ef = 8;
optional uint32 refine_factor = 9;
// Distance metric type. Absent means None (use the index's default metric).
optional lance.index.pb.VectorMetricType metric_type = 10;
bool use_index = 11;
optional float dist_q_c = 12;
optional int32 query_parallelism = 13;
}
// Serializable form of ANNIvfSubIndexExec, the IVF sub-index search node.
//
// The prefilter child ExecutionPlan is serialized by DataFusion's codec
// automatically via children() / with_new_children(). The prefilter_type
// field tells the decoder which PreFilterSource variant to use when
// reconstructing from the deserialized child inputs.
message ANNIvfSubIndexExecProto {
// Selects the PreFilterSource variant the decoder rebuilds from the
// deserialized child inputs.
enum PreFilterType {
NONE = 0;
FILTERED_ROW_IDS = 1;
SCALAR_INDEX_QUERY = 2;
}
// The vector query parameters for this node.
VectorQueryProto query = 1;
lance.datafusion.TableIdentifier table = 2;
repeated lance.table.IndexMetadata indices = 3;
PreFilterType prefilter_type = 4;
}
// Serializable form of ANNIvfPartitionExec, the IVF centroid routing node.
message ANNIvfPartitionExecProto {
// The vector query parameters for this node.
VectorQueryProto query = 1;
lance.datafusion.TableIdentifier table = 2;
// NOTE(review): presumably the UUIDs of the indices to route through — confirm.
repeated string index_uuids = 3;
}

View file

@ -0,0 +1,347 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.encodings;
import "google/protobuf/empty.proto";
// This file contains a specification for encodings that can be used
// to store and load Arrow data into a Lance file for the 2.0 format. It
// has been superseded by encodings21.proto which is used for the 2.1 format.
//
// # Types
//
// This file assumes the user wants to load data into Arrow arrays and
// explains how to map Arrow arrays into Lance files. Encodings are divided
// into "array encoding" (which maps to an Arrow array and may contain multiple
// buffers) and "buffer encoding" (which encodes a single buffer of data).
//
// # Encoding Tree
//
// Most encodings are layered on top of each other. These form a tree of
// encodings with a single root node. To encode an array you will typically
// start with the root node and then take the output from that root encoding
// and feed it into child encodings. The decoding process works in reverse.
//
// # Multi-column Encodings
//
// Some Arrow arrays will map to more than one column of Lance data. For
// example, struct arrays and list arrays. This file only contains encodings
// for a single column. However, it does describe how multi-column arrays can
// be encoded.
// A pointer to a buffer in a Lance file
//
// A writer can place a buffer in three different locations. The buffer
// can go in the data page, in the column metadata, or in the file metadata.
// The writer is free to choose whatever is most appropriate (for example, a dictionary
// that is shared across all pages in a column will probably go in the column
// metadata). This specification does not dictate where the buffer should go.
message Buffer {
// The index of the buffer in the collection of buffers
uint32 buffer_index = 1;
// The collection holding the buffer
//
// `page` is the zero value, so it is what an unset `buffer_type` decodes to.
enum BufferType {
// The buffer is stored in the data page itself
page = 0;
// The buffer is stored in the column metadata
column = 1;
// The buffer is stored in the file metadata
file = 2;
};
// Selects which of the three collections `buffer_index` indexes into.
BufferType buffer_type = 2;
}
// An encoding that adds nullability to another array encoding
//
// This can wrap any array encoding and add nullability information
message Nullable {
// Payload for the no-nulls case: only a values encoding is carried.
message NoNull {
ArrayEncoding values = 1;
}
// Payload for the all-null case: deliberately empty, nothing is stored.
message AllNull {}
// Payload for the mixed case: a validity encoding plus a values encoding.
message SomeNull {
ArrayEncoding validity = 1;
ArrayEncoding values = 2;
}
// At most one of the three cases is set.
oneof nullability {
// The array has no nulls and there is a single buffer needed
NoNull no_nulls = 1;
// The array may have nulls and we need two buffers
SomeNull some_nulls = 2;
// All values are null (no buffers needed)
AllNull all_nulls = 3;
}
}
// An array encoding for variable-length list fields
message List {
// An array containing the offsets into an items array.
//
// This array will have num_rows items and will never
// have nulls.
//
// If the list at index i is not null then offsets[i] will
// contain `base + len(list)` where `base` is defined as:
// i == 0: 0
// i > 0: (offsets[i-1] % null_offset_adjustment)
//
// To help understand we can consider the following example list:
// [ [A, B], null, [], [C, D, E] ]
//
// The offsets will be [2, ?, 2, 5]
//
// If the incoming list at index i IS null then offsets[i] will
// contain `base + len(list) + null_offset_adjustment` where `base`
// is defined the same as above.
//
// To complete the above example let's assume that `null_offset_adjustment`
// is 7. Then the offsets will be [2, 9, 2, 5]
//
// If there are no nulls then the offsets we write here are exactly the
// same as the offsets in an Arrow list array (except we omit the leading
// 0 which is redundant)
//
// The reason we do this is so that reading a single list at index i only
// requires us to load the indices at i and i-1.
//
// If the offset at index i is greater than or equal to `null_offset_adjustment`
// then the list at index i is null. (Equality matters: by the formula above, a
// null list at index 0 stores exactly `null_offset_adjustment`.)
//
// Otherwise the length of the list is `offsets[i] - base` where
// base is defined the same as above.
//
// Let's consider our example offsets: [2, 9, 2, 5]
//
// We can take any range of lists and determine how many list items are
// referenced by the sublist.
//
// 0..3: [_, 5] -> items 0..5 (base = 0* and end is 5)
// 0..2: [_, 2] -> items 0..2 (base = 0* and end is 2)
// 0..1: [_, 9] -> items 0..2 (base = 0* and end is 9 % 7)
// 1..3: [2, 5] -> items 2..5 (base = 2 and end is 5)
// 1..2: [2, 2] -> items 2..2 (base = 2 and end is 2)
// 2..3: [9, 5] -> items 2..5 (base = 9 % 7 and end is 5)
//
// * When the start of our range is the 0th item the base is always 0 and we only
// need to load a single index from disk to determine the range.
//
// The data type of the offsets array is flexible and does not need
// to match the data type of the destination array. Please note that the offsets
// array is very likely to be efficiently encoded by bit packing deltas.
ArrayEncoding offsets = 1;
// If a list is null then we add this value to the offset
//
// This value must be greater than the length of the items so that
// (offset + null_offset_adjustment) is never used by a non-null list.
//
// Note that this value cannot be equal to the length of the items
// because then a page with a single list would store [ X ] and we
// couldn't know if that is a null list or a list with X items.
//
// Therefore, the best choice for this value is 1 + # of items.
// Choosing this will maximize the bit packing that we can apply to the offsets.
uint64 null_offset_adjustment = 2;
// How many items are referenced by these offsets. This is needed in
// order to determine which items pages map to this offsets page.
uint64 num_items = 3;
}
// An array encoding for fixed-size list fields
message FixedSizeList {
// The number of items in each list
uint32 dimension = 1;
// True if the list is nullable
//
// Declared before `items` but numbered after it (tag 3 vs. tag 2); the tag,
// not the declaration order, is what identifies a field on the wire.
bool has_validity = 3;
// The items in the list
ArrayEncoding items = 2;
}
// A general-purpose compression setting, referenced by Flat.compression and
// GeneralMiniBlock.compression.
message Compression {
// Name of the compression scheme (Flat's documentation gives zstd as an example).
string scheme = 1;
// Explicit-presence compression level. NOTE(review): presumably a
// scheme-specific default applies when this is unset - confirm in the decoder.
optional int32 level = 2;
}
// Fixed width items placed contiguously in a buffer
message Flat {
// the number of bits per value, must be greater than 0, does
// not need to be a multiple of 8
uint64 bits_per_value = 1;
// the buffer of values
Buffer buffer = 2;
// The Compression message can specify the compression scheme (e.g. zstd) and any
// other information that is needed for decompression.
//
// If this array is compressed then the bits_per_value refers to the uncompressed
// data.
//
// NOTE(review): presumably left unset when the buffer is stored uncompressed - confirm.
Compression compression = 3;
}
// Compression algorithm where all values have a constant value
message Constant {
// The value (TODO: define encoding for literals?)
//
// Raw bytes only: this message carries no type or width information, so the
// reader must already know how to interpret them.
bytes value = 1;
}
// Items are bitpacked in a buffer
message Bitpacked {
// the number of bits used for a value in the buffer
uint64 compressed_bits_per_value = 1;
// the number of bits of the uncompressed value. e.g. for a u32, this will be 32
uint64 uncompressed_bits_per_value = 2;
// The buffer holding the bitpacked values
Buffer buffer = 3;
// Whether or not a sign bit is included in the bitpacked value
bool signed = 4;
}
// Items are bitpacked in a buffer
message BitpackedForNonNeg {
// Same three fields (and tags) as Bitpacked, without Bitpacked's `signed` flag.
//
// the number of bits used for a value in the buffer
uint64 compressed_bits_per_value = 1;
// the number of bits of the uncompressed value. e.g. for a u32, this will be 32
uint64 uncompressed_bits_per_value = 2;
// The buffer holding the bitpacked values
Buffer buffer = 3;
}
// Opaque bitpacking variant where the bits per value are stored inline in the chunks themselves
message InlineBitpacking {
// the number of bits of the uncompressed value. e.g. for a u32, this will be 32
//
// This is the only field and it uses tag 2; no field with tag 1 is declared.
// NOTE(review): presumably tag 1 was skipped on purpose - confirm before assigning it.
uint64 uncompressed_bits_per_value = 2;
}
// Transparent bitpacking variant where the number of bits per value is fixed through the whole buffer
message OutOfLineBitpacking {
// Fields use tags 2 and 3; no field with tag 1 is declared.
// NOTE(review): presumably tag 1 was skipped on purpose - confirm before assigning it.
//
// the number of bits of the uncompressed value. e.g. for a u32, this will be 32
uint64 uncompressed_bits_per_value = 2;
// The number of compressed bits per value, fixed across the entire buffer
uint64 compressed_bits_per_value = 3;
}
// An array encoding for shredded structs that will never be null
//
// There is no actual data in this column.
//
// TODO: Struct validity bitmaps will be placed here.
// Selected through the `struct` member (tag 5) of the ArrayEncoding oneof.
message SimpleStruct {}
// An array encoding for binary fields
message Binary {
// NOTE(review): presumably the encoding of the per-value offsets - confirm.
ArrayEncoding indices = 1;
// NOTE(review): presumably the encoding of the concatenated value bytes - confirm.
ArrayEncoding bytes = 2;
// NOTE(review): presumably plays the same role for `indices` that
// List.null_offset_adjustment plays for list offsets - confirm.
uint64 null_adjustment = 3;
}
// Selected through the `variable` member (tag 16) of the ArrayEncoding oneof.
message Variable {
// NOTE(review): presumably the bit width of each stored offset - confirm.
uint32 bits_per_offset = 1;
}
// Selected through the `fsst` member (tag 8) of the ArrayEncoding oneof.
message Fsst {
// Encoding of the wrapped binary data.
// NOTE(review): presumably the FSST-compressed output - confirm.
ArrayEncoding binary = 1;
// The symbol table, stored inline in the encoding description.
bytes symbol_table = 2;
}
// An array encoding for dictionary-encoded fields
message Dictionary {
// Encoding of the per-value indices.
// NOTE(review): presumably indices into `items` - confirm.
ArrayEncoding indices = 1;
// Encoding of the dictionary items themselves.
ArrayEncoding items = 2;
// Number of entries in the dictionary.
uint32 num_dictionary_items = 3;
}
// Selected through the `packed_struct` member (tag 9) of the ArrayEncoding oneof.
message PackedStruct {
// NOTE(review): presumably one encoding per struct child, in field order - confirm.
repeated ArrayEncoding inner = 1;
// Pointer (see Buffer) to where the packed data is stored.
Buffer buffer = 2;
}
// Selected through the `packed_struct_fixed_width_mini_block` member (tag 17)
// of the ArrayEncoding oneof.
message PackedStructFixedWidthMiniBlock {
// `Flat` here is a field NAME (of type ArrayEncoding), not a use of the Flat
// message type. The capitalised spelling is part of the schema; renaming it
// would change the generated accessor names.
ArrayEncoding Flat = 1;
// NOTE(review): presumably the bit width of each struct child, in field order - confirm.
repeated uint32 bits_per_values = 2;
}
// Selected through the `fixed_size_binary` member (tag 11) of the ArrayEncoding oneof.
message FixedSizeBinary {
// Encoding of the underlying bytes.
ArrayEncoding bytes = 1;
// NOTE(review): presumably the fixed width, in bytes, of every value - confirm.
uint32 byte_width = 2;
}
// Selected through the `block` member (tag 18) of the ArrayEncoding oneof.
message Block {
// NOTE(review): presumably a compression scheme name, as in Compression.scheme - confirm.
string scheme = 1;
}
// Run-Length Encoding for miniblock format
message Rle {
// Number of bits per value (8, 16, 32, 64, or 128)
//
// This is the only field: no nested encoding or buffer is described here.
uint64 bits_per_value = 1;
}
// Byte Stream Split encoding for floating point values
message ByteStreamSplit {
// Number of bits per value (32 for float, 64 for double)
//
// This is the only field: no nested encoding or buffer is described here.
uint64 bits_per_value = 1;
}
// General miniblock encoding - wraps another miniblock encoding with compression
message GeneralMiniBlock {
// The inner miniblock encoding (e.g., Rle, Bitpacked, etc.)
//
// Typed as ArrayEncoding, so any member of that oneof can be nested here.
ArrayEncoding inner = 1;
// The compression scheme to apply to the miniblock buffers
Compression compression = 2;
}
// Encodings that decode into an Arrow array
message ArrayEncoding {
// At most one member is set. Tags 1 through 21 are all in use.
// Note that the `struct` member (tag 5) has message type SimpleStruct.
oneof array_encoding {
Flat flat = 1;
Nullable nullable = 2;
FixedSizeList fixed_size_list = 3;
List list = 4;
SimpleStruct struct = 5;
Binary binary = 6;
Dictionary dictionary = 7;
Fsst fsst = 8;
PackedStruct packed_struct = 9;
Bitpacked bitpacked = 10;
FixedSizeBinary fixed_size_binary = 11;
BitpackedForNonNeg bitpacked_for_non_neg = 12;
Constant constant = 13;
InlineBitpacking inline_bitpacking = 14;
OutOfLineBitpacking out_of_line_bitpacking = 15;
Variable variable = 16;
PackedStructFixedWidthMiniBlock packed_struct_fixed_width_mini_block = 17;
Block block = 18;
Rle rle = 19;
GeneralMiniBlock general_mini_block = 20;
ByteStreamSplit byte_stream_split = 21;
}
}
// Wraps a column with a zone map index that can be used
// to apply pushdown filters
message ZoneIndex {
// NOTE(review): presumably the number of rows summarised by each zone - confirm.
uint32 rows_per_zone = 1;
// Pointer (see Buffer) to where the zone map is stored.
Buffer zone_map_buffer = 2;
// The encoding of the column being wrapped.
ColumnEncoding inner = 3;
}
// Marks a column as blob data. It will contain a packed struct
// with fields position and size (u64)
message Blob {
// The encoding of the column being marked as blob data.
ColumnEncoding inner = 1;
}
// Encodings that describe a column of values
message ColumnEncoding {
// At most one member is set. ZoneIndex and Blob each wrap another
// ColumnEncoding, so these can nest.
oneof column_encoding {
// No special encoding, just column values
google.protobuf.Empty values = 1;
// The column is wrapped with a zone map index
ZoneIndex zone_index = 2;
// The column is marked as blob data
Blob blob = 3;
}
}

View file

@ -0,0 +1,511 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.encodings21;
// This file contains a specification for encodings that can be used
// to store and load Arrow data into a Lance file for the 2.1 format.
//
// # Types
//
// This file assumes the user wants to load data into Arrow arrays and
// explains how to map Arrow arrays into Lance files. Encodings are divided
// into "structural encodings" (which are used to encode the structure of the
// data such as any list or struct layers) and "compressive encodings" (which
// are used to compress the actual data values).
//
// # Standardized Interpretation of Counting Terms
//
// When working with 2.1 encodings we have a number of different "counting terms" and it can be
// difficult to understand what we mean when we are talking about a "number of values". Here is
// a standard interpretation of these terms:
//
// To understand these definitions consider a data type FIXED_SIZE_LIST<LIST<INT32>>.
//
// A "value" is an abstract term when we aren't being specific.
//
// - num_rows: This is the highest level counting term. A single row includes everything in the
// fixed size list. This is what the user asks for when they ask for a range of rows.
// - num_elements: The number of elements is the number of rows multiplied by the dimension of any
// fixed size list wrappers. This is what you get when you flatten the FSL layer and
// is the starting point for structural encoding. Note that an element can be a list
// value or a single primitive value.
// - num_items: The number of items is the number of values in the repetition and definition vectors
// after everything has been flattened.
// - num_visible_items: The number of visible items is the number of items after invisible items
// have been removed. Invisible items are rep/def levels that don't correspond to an
// actual value.
// # Structural Encodings
//
// The following messages are used to describe the structural encoding of the
// data. In this document, we refer to these structural encodings as layouts.
// Repetition and definition levels are described in more detail elsewhere. As we peel through
// the structure of an array we will encounter layers of struct and list. Each of these layers
// potentially adds a new level to the repetition and definition levels. This message describes
// the meaning of each layer.
enum RepDefLayer {
// Should never be used, included for debugging purposes and general protobuf best practice
REPDEF_UNSPECIFIED = 0;
// All values are valid (can be primitive or struct)
REPDEF_ALL_VALID_ITEM = 1;
// All list values are valid
REPDEF_ALL_VALID_LIST = 2;
// There are one or more null items (can be primitive or struct)
REPDEF_NULLABLE_ITEM = 3;
// A list layer with null lists but no empty lists
REPDEF_NULLABLE_LIST = 4;
// A list layer with empty lists but no null lists
REPDEF_EMPTYABLE_LIST = 5;
// A list layer with both empty lists and null lists
REPDEF_NULL_AND_EMPTY_LIST = 6;
}
// A layout used for pages where the data is small
//
// In this case we can fit many values into a single disk sector and transposing buffers is
// expensive. As a result, we do not transpose the buffers but compress the data into small
// chunks (called mini blocks) which are roughly the size of a disk sector.
//
// The end result is a small amount of read amplification (since we must read an entire page
// at a time) but we have more flexibility in compression and do less work per value when
// compressing and decompressing in bulk.
message MiniBlockLayout {
// Description of the compression of repetition levels (e.g. how many bits per rep)
//
// Optional, if there is no repetition then this field is not present
CompressiveEncoding rep_compression = 1;
// Description of the compression of definition levels (e.g. how many bits per def)
//
// Optional, if there is no definition then this field is not present
CompressiveEncoding def_compression = 2;
// Description of the compression of values
CompressiveEncoding value_compression = 3;
// Description of the compression of the dictionary data
//
// Optional, if there is no dictionary then this field is not present
CompressiveEncoding dictionary = 4;
// Number of items in the dictionary
uint64 num_dictionary_items = 5;
// The meaning of each repdef layer, used to interpret repdef buffers correctly
repeated RepDefLayer layers = 6;
// The number of buffers in each mini-block, this is determined by the compression and does
// NOT include the repetition or definition buffers (the presence of these buffers can be determined
// by looking at the rep_compression and def_compression fields)
uint64 num_buffers = 7;
// The depth of the repetition index.
//
// If there is repetition then the depth must be at least 1. If there are many layers
// of repetition then deeper repetition indices will support deeper nested random access. For
// example, given 5 layers of repetition then the repetition index depth must be at least
// 3 to support access like `rows[50][17][3]`.
//
// We require `repetition_index_depth + 1` u64 values per mini-block to store the repetition
// index if the `repetition_index_depth` is greater than 0. The +1 is because we need to store
// the number of "leftover items" at the end of the chunk. Otherwise, we wouldn't have any way
// to know if the final item in a chunk is valid or not.
uint32 repetition_index_depth = 8;
// The page already records how many rows are in the page. For mini-block we also need to know how
// many "items" are in the page. A row and an item are the same thing unless the page has lists.
uint64 num_items = 9;
// Since Lance 2.2, miniblocks have larger chunk sizes (>= 64KB)
//
// Plain proto3 bool: a layout that omits this field decodes it as false.
bool has_large_chunk = 10;
}
// A layout used for pages where the data is large
//
// In this case the cost of transposing the data is relatively small (compared to the cost of writing the data)
// and so we just zip the buffers together
message FullZipLayout {
// The number of bits of repetition info (0 if there is no repetition)
uint32 bits_rep = 1;
// The number of bits of definition info (0 if there is no definition)
uint32 bits_def = 2;
// The number of bits of value info
//
// Note: we use bits here (and not bytes) for consistency with other encodings. However, in practice,
// there is never a reason to use a bits per value that is not a multiple of 8. The complexity is not
// worth the small savings in space since this encoding is typically used with large values already.
oneof details {
// If this is a fixed width block then we need to have a fixed number of bits per value
uint32 bits_per_value = 3;
// If this is a variable width block then we need to have a fixed number of bits per offset
uint32 bits_per_offset = 4;
}
// The number of items in the page
//
// Note: uint32 here, whereas MiniBlockLayout.num_items is uint64.
uint32 num_items = 5;
// The number of visible items in the page
uint32 num_visible_items = 6;
// Description of the compression of values
CompressiveEncoding value_compression = 7;
// The meaning of each repdef layer, used to interpret repdef buffers correctly
repeated RepDefLayer layers = 8;
}
// A layout used for pages where all (visible) values are the same scalar value.
//
// This generalizes the prior AllNullLayout semantics for file_version >= 2.2.
//
// There may be buffers of repetition and definition information if required in order
// to interpret what kind of nulls are present / which items are visible.
message ConstantLayout {
// Field tags in this message start at 5; tags 1-4 are not declared here.
// NOTE(review): presumably they belonged to the earlier all-null definition
// this layout generalizes - confirm before reusing any of them.
//
// The meaning of each repdef layer, used to interpret repdef buffers correctly
repeated RepDefLayer layers = 5;
// Inline fixed-width scalar value bytes.
//
// This MUST only be used for types where a single non-null element is represented by a single
// fixed-width Arrow value buffer (i.e. no offsets buffer, no child data).
//
// Constraints:
// - MUST be absent for an all-null page
// - MUST be <= 32 bytes if present
optional bytes inline_value = 6;
// Optional compression algorithm used for the repetition buffer.
// If absent, repetition levels are stored as raw u16 values.
CompressiveEncoding rep_compression = 7;
// Optional compression algorithm used for the definition buffer.
// If absent, definition levels are stored as raw u16 values.
CompressiveEncoding def_compression = 8;
// Number of values in repetition buffer after decompression.
uint64 num_rep_values = 9;
// Number of values in definition buffer after decompression.
uint64 num_def_values = 10;
}
// A layout where large binary data is encoded externally and only
// the descriptions (position + size) are placed in the page
//
// Repdef information is stored in the descriptions. A description with a size of
// 0 and a position of 0 is an empty value. A description with a size of 0 and a
// non-zero position is a null value and the position is the repdef value.
message BlobLayout {
// The inner layout used to store the descriptions
PageLayout inner_layout = 1;
// The meaning of each repdef layer, used to interpret repdef buffers correctly
//
// The inner layout's own repdef layers will always be a single all-valid-item layer
repeated RepDefLayer layers = 2;
}
// Describes the structural encoding of a page
message PageLayout {
// At most one layout is set. BlobLayout embeds another PageLayout
// (its `inner_layout`), so layouts can nest.
oneof layout {
// A layout used for pages where the data is small
MiniBlockLayout mini_block_layout = 1;
// A layout used for pages where all (visible) values are the same scalar value or null.
ConstantLayout constant_layout = 2;
// A layout used for pages where the data is large
FullZipLayout full_zip_layout = 3;
// A layout where large binary data is encoded externally
// and only the descriptions are put in the page
BlobLayout blob_layout = 4;
}
}
// # Compressive Encodings
//
// These encodings describe how an array is compressed. An encoding may split an
// array into multiple buffers. The buffers can then be compressed further (and split
// into yet more buffers). The entire process forms a tree of encodings with the root
// of the tree being the initial array and the leaves being the final compressed buffers.
//
// # Data blocks and buffers
//
// Data blocks are a simplified version of arrays and represent a collection of buffers grouped
// with some kind of interpretation. Data blocks are the input and output of compressive encodings.
// There are different kinds of data blocks:
// - Fixed width data blocks (e.g. u8, u16, ...)
// - Variable width data blocks (e.g. strings, binary)
// - Struct data blocks (note: this is for packed structs, normal structs are encoded in the structural encoding)
//
// In addition, leaf encodings may output "buffers". These are fully compressed buffers of data that
// are stored in the page and no longer compressed.
// General-purpose compression algorithms selectable in BufferCompression.scheme.
//
// Note the value names use a COMPRESSION_ALGORITHM_ prefix even though the
// enum itself is named CompressionScheme.
enum CompressionScheme {
// Zero value: what an unset `scheme` field decodes to.
COMPRESSION_ALGORITHM_UNSPECIFIED = 0;
COMPRESSION_ALGORITHM_LZ4 = 1;
COMPRESSION_ALGORITHM_ZSTD = 2;
}
// Compression applied to a single buffer of data
//
// A buffer is the leaf of the compression tree. Unlike data blocks, which can
// be further compressed with a variety of techniques, a buffer cannot be understood
// in any particular way.
//
// A general compression scheme may be applied to a buffer. This is something like
// zstd, lz4, etc. The entire buffer is compressed as a single unit. If this happens
// then any parent encoding becomes opaque, even if it would normally be transparent.
//
// This is a leaf, no further compression is applied to the data.
message BufferCompression {
// A general compression scheme to apply to the buffer
//
// An unset value decodes as COMPRESSION_ALGORITHM_UNSPECIFIED (the enum's zero value).
CompressionScheme scheme = 1;
// The compression level
//
// Optional, if not present a scheme-specific default value will be used.
//
// Interpretation of this value depends on the compression scheme. Generally, larger
// values indicate more compression at the expense of more CPU time.
optional int32 level = 2;
}
// Fixed width items placed contiguously in a single buffer
//
// This is a leaf encoding, there is no compression applied to the data.
//
// This is a transparent encoding by definition.
//
// The input is a fixed-width data block.
// The output is a single buffer.
message Flat {
// the number of bits per value, must be greater than 0, does
// not need to be a multiple of 8
uint64 bits_per_value = 1;
// The compression applied to the data
//
// NOTE(review): the message-level documentation says no compression is
// applied, so presumably this is unset in the plain case and set only when
// general buffer compression is layered on top - confirm.
optional BufferCompression data = 2;
}
// Variable width items have the values stored in one buffer and the
// offsets are output as a data block that may be further compressed.
//
// This is a partial leaf encoding. Values are not compressed but
// the offsets may be further compressed.
//
// This is a transparent encoding by definition.
//
// The input is a variable-width data block.
// The output is a single fixed-width data block (the offsets) and
// a single buffer (the values)
message Variable {
// Describes how the offsets data block is compressed
CompressiveEncoding offsets = 1;
// The compression applied to the values
//
// NOTE(review): the message-level documentation says values are not
// compressed, so presumably this is unset unless general buffer compression
// is applied to the values buffer - confirm.
optional BufferCompression values = 2;
}
// Compression algorithm where all values have a constant value (encoded in the description)
//
// This is a leaf encoding, there is no compression applied to the data.
//
// The input can be any kind of data block.
// There is no output.
message Constant {
// The value (TODO: define encoding for literals?)
//
// Declared `optional`, so an absent value is distinguishable from an empty
// byte string.
optional bytes value = 1;
}
// A compression scheme in which a single fixed-width block is "packed" into
// a smaller fixed-width block values where each value has fewer bits.
//
// This is typically done by throwing away the most significant bits of each value when
// those bits are all the same.
//
// In this scheme the number of bits per value is fixed across the entire buffer and stored
// in this message.
//
// This is a transparent encoding.
//
// The input is a fixed-width data block.
// The output is a single fixed-width data block.
message OutOfLineBitpacking {
// Fields use tags 1 and 3; no field with tag 2 is declared.
// NOTE(review): this message has no compressed-width field; presumably the
// compressed bits per value are described by the `values` encoding - confirm.
//
// the number of bits of the uncompressed value. e.g. for a u32, this will be 32
uint64 uncompressed_bits_per_value = 1;
// The compression used to store the bitpacked values data block
CompressiveEncoding values = 3;
}
// Bitpacking variant where the bits per value are stored inline in the chunks themselves
//
// This variation of bitpacking allows for the number of bits per value to change throughout the
// buffer, which makes the compression more robust to outliers.
//
// This is an opaque encoding.
//
// The input is a fixed-width data block.
// The output is a single buffer.
message InlineBitpacking {
// the number of bits of the uncompressed value. e.g. for a u32, this will be 32
uint64 uncompressed_bits_per_value = 1;
// The compression applied to the values
//
// Declared `optional`, so its absence is detectable by the reader.
optional BufferCompression values = 2;
}
// A compression scheme for variable-width data
//
// A small dictionary (referred to as a "symbol table") is used to compress the values.
// In this scheme there is a single symbol table for the entire page and it is stored in the
// encoding description itself.
//
// This is a transparent encoding.
//
// The input is a variable-width data block.
// The output is a single variable-width data block.
message Fsst {
// The FSST symbol table
//
// Stored inline in this encoding description; there is one per page.
bytes symbol_table = 1;
// The compression used to store the compressed values data block
CompressiveEncoding values = 2;
}
// A compression scheme where common values are stored in a dictionary and the values are
// encoded as indices into the dictionary.
//
// This is an opaque encoding unless the dictionary is considered metadata.
//
// The input is any kind of data block.
// There are two outputs:
// - A data block of the same kind as the input (the dictionary)
// - A fixed-width data block containing the indices into the dictionary.
message Dictionary {
// The compression used to store the indices data block
CompressiveEncoding indices = 1;
// The compression used to store the dictionary items data block
CompressiveEncoding items = 2;
// The number of items in the dictionary
//
// Note: uint32 here, whereas MiniBlockLayout.num_dictionary_items is uint64.
uint32 num_dictionary_items = 3;
}
// A compression scheme where runs of common values are encoded as a single value and a count
//
// This is an opaque encoding unless the run lengths are considered metadata.
//
// The input is a single data block of any kind.
// There are two outputs:
// - A data block of the same kind as the input (the run values)
// - A fixed-width data block containing the lengths of the runs
message Rle {
// Both outputs (run values and run lengths) are described by their own nested
// CompressiveEncoding.
//
// The compression used to store the run values data block
CompressiveEncoding values = 1;
// The compression used to store the run lengths data block
CompressiveEncoding run_lengths = 2;
}
// Converts a fixed-size-list of values into a flattened list of values
//
// This encoding does not actually compress the data, it just flattens out the FSL layers.
//
// This is a transparent encoding.
//
// The input is a single block of fixed-width data (with a wide width and few items)
// The output is a single block of fixed-width data (with a narrow width and many items)
message FixedSizeList {
// The number of items in this layer of FSL
uint64 items_per_value = 1;
// Whether or not there is a validity buffer
//
// Declared before `values` but numbered after it (tag 3 vs. tag 2); the tag,
// not the declaration order, is what identifies a field on the wire.
bool has_validity = 3;
// The compression used to store the flattened values data block
CompressiveEncoding values = 2;
}
// Packs a struct containing only fixed-width children into a single fixed-width data block
//
// The children are concatenated row by row and stored as a single fixed-width buffer. This is
// the legacy packed struct representation and remains available for backwards compatibility.
message PackedStruct {
// The number of bits contributed by each child field in the packed row
//
// One entry per child. NOTE(review): presumably listed in struct field order - confirm.
repeated uint64 bits_per_value = 1;
// The compression used to store the packed fixed-width values
CompressiveEncoding values = 2;
}
// Variable-width packed struct encoding (2.2 extension)
//
// Each child value is compressed independently before being transposed into
// a row-major layout. This preserves per-field compression boundaries at the
// cost of disabling mini-block compression. Readers must prefer this field
// when present and fall back to the legacy encoding otherwise.
message VariablePackedStruct {
// Per-field encoding metadata in struct order
repeated FieldEncoding fields = 1;
// Encoding description for a single child field
//
// (Declared after its first use above; protobuf allows a nested type to be
// referenced before its declaration.)
message FieldEncoding {
// Compression applied to individual field values before transposition
CompressiveEncoding value = 1;
// At most one of the two widths is set, depending on whether the
// compressed values are fixed or variable width.
oneof layout {
// Bit width of each compressed value (when fixed width)
uint64 bits_per_value = 2;
// Bit width of the length prefix for variable-width compressed values
uint64 bits_per_length = 3;
}
}
}
// A compression scheme that wraps the underlying data with general compression
//
// Note: The application of wrapped compression will depend on the layout of the data.
// If we apply it to mini-block data then we compress entire mini-blocks. If we apply
// it to full-zip data then we compress each value individually.
//
// Note: Wrapped compression is somewhat unique at the moment as it is applied to the
// output of the inner encoding and not the input like all other compressive encodings.
//
// Note: General compression can usually be applied in two spots. We can apply
// it to individual buffers or we can apply it here, to the entire array.
//
// For example, let's say we are storing mini-blocks of strings and we are using
// FSST and bitpacking the offsets. We have something like this...
//
// WRAPPED(3) -> FSST -> VARIABLE -(offsets)-> INLINE_BITPACKING -(data)-> FLAT -> BUFFER (1)
// -(data)-> BUFFER (2)
//
// General compression can be applied at 1, 2, or 3 (or any combination of these).
//
// If we apply it at 1 then we apply it just to the bitpacked offsets
// If we apply it at 2 then we apply it just to the FSST compressed data
// If we apply it at 3 then we apply it to the entire mini-block (both offsets and data)
//
// The input is a single data block of any kind.
// The output is a single data block of the same kind as the input.
message General {
// Fields use tags 1 and 3; no field with tag 2 is declared.
// NOTE(review): presumably tag 2 was skipped on purpose - confirm before assigning it.
//
// The compression to apply to the values
BufferCompression compression = 1;
// The compression used to store the output data block
CompressiveEncoding values = 3;
}
// A compression scheme where fixed-width values are transposed into a series of byte streams
//
// This is commonly used for floating point values where the upper bits (the sign and exponent) have a
// significantly different meaning than the lower bits (the mantissa). By splitting the values into byte streams
// we group the mantissa bits together and the exponent bits together. The end result is typically
// more compressible.
//
// Note that this encoding is mostly useful when combined with other encodings. It does not do any
// compression on its own.
//
// This is an opaque encoding.
//
// The input is a fixed-width data block
// The output is a single fixed-width data block
message ByteStreamSplit {
// The compression used to store the values
//
// This is the only field: the value width is not recorded in this message.
CompressiveEncoding values = 1;
}
// An encoding that compresses a data block into buffers
message CompressiveEncoding {
// At most one member is set. Tags 1 through 13 are all in use.
// Several members embed further CompressiveEncoding messages, which is what
// forms the tree of encodings.
oneof compression {
Flat flat = 1;
Variable variable = 2;
Constant constant = 3;
OutOfLineBitpacking out_of_line_bitpacking = 4;
InlineBitpacking inline_bitpacking = 5;
Fsst fsst = 6;
Dictionary dictionary = 7;
Rle rle = 8;
ByteStreamSplit byte_stream_split = 9;
General general = 10;
FixedSizeList fixed_size_list = 11;
PackedStruct packed_struct = 12;
VariablePackedStruct variable_packed_struct = 13;
}
}

207
vendor/lance-table/protos/file.proto vendored Normal file
View file

@ -0,0 +1,207 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.file;
// A file descriptor that describes the contents of a Lance file.
message FileDescriptor {
// The schema of the file (the `Schema` message declared in this package).
Schema schema = 1;
// The number of rows in the file.
uint64 length = 2;
}
// A schema that describes the data type of each column.
message Schema {
// All fields in this file, including the nested fields.
repeated lance.file.Field fields = 1;
// Schema metadata: string keys mapped to raw byte values.
map<string, bytes> metadata = 5;
}
// Metadata of one Lance file.
message Metadata {
// 4 was used for StatisticsMetadata in the past, but has been moved to
// prevent a bug in older readers.
reserved 4;
// Position of the manifest in the file. If it is zero, the manifest is stored
// externally.
uint64 manifest_position = 1;
// Logical offsets of each chunk group, i.e., the number of rows in each
// chunk.
repeated int32 batch_offsets = 2;
// The file position where the page table is stored.
//
// A page table is a matrix of N x M x 2, where N = num_fields, and M =
// num_batches. Each cell in the table is a pair of <position:int64,
// length:int64> of the page. Both position and length are int64 values. The
// <position, length> of all the pages in the same column are then
// contiguously stored.
//
// Every field that is a part of the file will have a run in the page table.
// This includes struct columns, which will have a run of length 0 since
// they don't store any actual data.
//
// For example, for the column 5 and batch 4, we have:
// ```text
// position = page_table[5][4][0];
// length = page_table[5][4][1];
// ```
uint64 page_table_position = 3;
// Describes where the statistics for this file are stored.
//
// NOTE(review): the comments below mention `stats_schema` and `stats_fields`;
// these presumably refer to the `schema` and `fields` fields of this
// message — confirm.
message StatisticsMetadata {
// The schema of the statistics.
//
// This might be empty, meaning there are no statistics. It also might not
// contain statistics for every field.
repeated Field schema = 1;
// The field ids of the statistics leaf fields.
//
// This plays a similar role to the `fields` field in the DataFile message.
// Each of these field ids corresponds to a field in the stats_schema. There
// is one per column in the stats page table.
repeated int32 fields = 2;
// The file position of the statistics page table
//
// The page table is a matrix of N x 2, where N = length of stats_fields.
// This is the same layout as the main page table, except there is always
// only one batch.
//
// For example, to get the stats column 5, we have:
// ```text
// position = stats_page_table[5][0];
// length = stats_page_table[5][1];
// ```
uint64 page_table_position = 3;
}
// The statistics metadata. Field number 5 is used because 4 is reserved
// (see the top of this message).
StatisticsMetadata statistics = 5;
} // Metadata
// Supported encodings.
//
// Referenced by `Field.encoding`, which is documented as deprecated and only
// used by the V1 file format.
enum Encoding {
// Invalid encoding.
NONE = 0;
// Plain encoding.
PLAIN = 1;
// Var-length binary encoding.
VAR_BINARY = 2;
// Dictionary encoding.
DICTIONARY = 3;
// Run-length encoding.
RLE = 4;
}
// Dictionary field metadata
message Dictionary {
/// The file offset for storing the dictionary value.
/// It is only valid if encoding is DICTIONARY.
///
/// The logical type represents the value type of the column, i.e., string value.
int64 offset = 1;
/// The length of dictionary values.
int64 length = 2;
}
// Field metadata for a column.
message Field {
// The structural role of the field; the `logical_type` comment below lists
// which logical types each role may carry.
enum Type {
PARENT = 0;
REPEATED = 1;
LEAF = 2;
}
Type type = 1;
// Fully qualified name.
string name = 2;
/// Field Id.
///
/// See the comment in `DataFile.fields` for how field ids are assigned.
int32 id = 3;
/// Parent Field ID. If not set, this is a top-level column.
int32 parent_id = 4;
// Logical type; supports parameterized Arrow types.
//
// PARENT types will always have logical type "struct".
//
// REPEATED types may have logical types:
// * "list"
// * "large_list"
// * "list.struct"
// * "large_list.struct"
// The final two are used if the list values are structs, and therefore the
// field is both implicitly REPEATED and PARENT.
//
// LEAF types may have logical types:
// * "null"
// * "bool"
// * "int8" / "uint8"
// * "int16" / "uint16"
// * "int32" / "uint32"
// * "int64" / "uint64"
// * "halffloat" / "float" / "double"
// * "string" / "large_string"
// * "binary" / "large_binary"
// * "date32:day"
// * "date64:ms"
// * "decimal:128:{precision}:{scale}" / "decimal:256:{precision}:{scale}"
// * "time:{unit}" / "timestamp:{unit}" / "duration:{unit}", where unit is
// "s", "ms", "us", "ns"
// * "dict:{value_type}:{index_type}:false"
string logical_type = 5;
// If this field is nullable.
bool nullable = 6;
// optional field metadata (e.g. extension type name/parameters)
map<string, bytes> metadata = 10;
// NOTE(review): undocumented upstream; presumably marks this field as part of
// the (unenforced) primary key — confirm.
bool unenforced_primary_key = 12;
// Position of this field in the primary key (1-based).
// 0 means the field is part of the primary key but uses schema field id for ordering.
// When set to a positive value, primary key fields are ordered by this position.
uint32 unenforced_primary_key_position = 13;
// Reserved for future use. Use unenforced_clustering_key_position instead.
bool unenforced_clustering_key = 14;
// Position of this field in the clustering key (1-based).
// 0 means the field is not part of the clustering key.
uint32 unenforced_clustering_key_position = 15;
// DEPRECATED ----------------------------------------------------------------
// Deprecated: Only used in V1 file format. V2 uses variable encodings defined
// per page.
//
// The global encoding to use for this field.
Encoding encoding = 7;
// Deprecated: Only used in V1 file format. V2 dynamically chooses when to
// do dictionary encoding and keeps the dictionary in the data files.
//
// The file offset for storing the dictionary value.
// It is only valid if encoding is DICTIONARY.
//
// The logical type represents the value type of the column, i.e., string value.
Dictionary dictionary = 8;
// Deprecated: optional extension type name, use metadata field
// ARROW:extension:name
string extension_name = 9;
// Field number 11 was previously `string storage_class`.
// Keep it reserved so older manifests remain compatible while new writers
// avoid reusing the slot.
reserved 11;
reserved "storage_class";
}

210
vendor/lance-table/protos/file2.proto vendored Normal file
View file

@ -0,0 +1,210 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.file.v2;
import "google/protobuf/any.proto";
import "google/protobuf/empty.proto";
// # Lance v2.X File Format
//
// The Lance file format is a barebones format for serializing columnar data
// into a file.
//
// * Each Lance file contains between 0 and 4Gi columns
// * Each column contains between 0 and 4Gi pages
// * Each page contains between 0 and 2^64 items
// * Different pages within a column can have different item counts
// * Columns may have up to 2^64 items
// * Different columns within a file can have different item counts
//
// The Lance file format does not have any notion of a type system or schemas.
// From the perspective of the file format all data is arbitrary buffers of
// bytes with an extensible metadata block to describe the data. It is up to
// the user to interpret these bytes meaningfully.
//
// Data buffers are written to the file first. These data buffers can be
// referenced from three different places in the file:
//
// * Page encodings can reference data buffers. This is the most common way
// that actual data is stored.
// * Column encodings can reference data buffers. For example, a column encoding
// may reference data buffer(s) containing statistics or dictionaries.
// * Finally, the global buffer offset table can reference data buffers. This
// is useful for storing data that is shared across multiple columns.
// This is also useful for global file metadata (e.g. a schema that describes
// the file)
//
// ## File Layout
//
// Note: the number of buffers (BN) is independent of the number of columns (CN)
// and pages.
//
// Buffers often need to be aligned. 64-byte alignment is common when
// working with SIMD operations. 4096-byte alignment is common when
// working with direct I/O. In order to ensure these buffers are aligned
// writers may need to insert padding before the buffers.
//
// If direct I/O is required then most (but not all) fields described
// below must be sector aligned. We have marked these fields with an
// asterisk for clarity. Readers should assume there will be optional
// padding inserted before these fields.
//
// All footer fields are unsigned integers written with little endian
// byte order.
//
//
// | Data Pages |
// | Data Buffer 0* |
// | ... |
// | Data Buffer BN* |
//
// | Column Metadatas |
// | |A| Column 0 Metadata* |
// | Column 1 Metadata* |
// | ... |
// | Column CN Metadata* |
//
// | Column Metadata Offset Table |
// | |B| Column 0 Metadata Position* |
// | Column 0 Metadata Size |
// | ... |
// | Column CN Metadata Position |
// | Column CN Metadata Size |
//
// | Global Buffers Offset Table |
// | |C| Global Buffer 0 Position* |
// | Global Buffer 0 Size |
// | ... |
// | Global Buffer GN Position |
// | Global Buffer GN Size |
//
// | Footer |
// | A u64: Offset to column meta 0 |
// | B u64: Offset to CMO table |
// | C u64: Offset to GBO table |
// | u32: Number of global bufs |
// | u32: Number of columns |
// | u16: Major version |
// | u16: Minor version |
// | "LANC" |
//
//
// File Layout-End
//
// ## Data Pages
//
// A lot of flexibility is provided in how data is stored. A page's buffers do
// not strictly need to be contiguous on the disk. However, it is recommended
// that buffers within a page be grouped together for best performance.
//
// Data pages should be large. The only time a page should be written to disk
// is when the writer needs to flush the page to disk because it has accumulated
// too much data. Pages are not read in sequential order and if pages are too
// small then the seek overhead (or request overhead) will be problematic. We
// generally advise that pages be at least 8MB or larger.
//
// ## Encodings
//
// Specific encodings are not part of this minimal format. They are provided
// by extensions. Readers and writers should be designed so that encodings can
// be easily added and removed. Ideally, they should allow for this without
// requiring recompilation through some kind of plugin system.
// The deferred encoding is used to place the encoding itself in a different
// part of the file. This is most commonly used to allow encodings to be shared
// across different columns. For example, when writing a file with thousands of
// columns, where many pages have the exact same encoding, it can be useful
// to cut down on the size of the metadata by using a deferred encoding.
message DeferredEncoding {
// Location of the buffer containing the encoding.
//
// * If sharing encodings across columns then this will be in a global buffer
// * If sharing encodings across pages within a column this could be in a
// column metadata buffer.
// * This could also be a page buffer if the encoding is not shared, needs
// to be written before the file ends, and the encoding is too large to load
// unless we first determine the page needs to be read. This combination
// seems unusual.
uint64 buffer_location = 1;
// Length of the buffer containing the encoding.
// NOTE(review): presumably in bytes, like the `buffer_sizes` fields in this
// file — confirm.
uint64 buffer_length = 2;
}
// The encoding is placed directly in the metadata section
message DirectEncoding {
// The bytes that make up the encoding embedded directly in the metadata
//
// This is the most common approach.
//
// NOTE(review): how these bytes are interpreted is not specified in this
// file; presumably a serialized encoding description supplied by an
// extension — confirm.
bytes encoding = 1;
}
// An encoding stores the information needed to decode a column or page
//
// For example, it could describe if the page is using bit packing, and how many bits
// there are in each individual value.
//
// At the column level it can be used to wrap columns with dictionaries or statistics.
message Encoding {
// Where the encoding description is stored. As a proto3 `oneof`, at most one
// of these variants is set.
oneof location {
// The encoding is stored elsewhere and not part of this protobuf message
DeferredEncoding indirect = 1;
// The encoding is stored within this protobuf message
DirectEncoding direct = 2;
// There is no encoding information
google.protobuf.Empty none = 3;
}
}
// ## Metadata
// Each column has a metadata block that is placed at the end of the file.
// These may be read individually to allow for column projection.
message ColumnMetadata {
// This describes a page of column data.
message Page {
// The file offsets for each of the page buffers
//
// The number of buffers is variable and depends on the encoding. There
// may be zero buffers (e.g. constant encoded data) in which case this
// could be empty.
repeated uint64 buffer_offsets = 1;
// The size (in bytes) of each of the page buffers
//
// This field will have the same length as `buffer_offsets` and
// may be empty.
// NOTE(review): presumably entry i describes the same buffer as
// `buffer_offsets[i]` — confirm.
repeated uint64 buffer_sizes = 2;
// Logical length (e.g. # rows) of the page
uint64 length = 3;
// The encoding used to encode the page
Encoding encoding = 4;
// The priority of the page
//
// For tabular data this will be the top-level row number of the first row
// in the page (and top-level rows should not split across pages).
uint64 priority = 5;
}
// Encoding information about the column itself. This typically describes
// how to interpret the column metadata buffers. For example, it could
// describe how statistics or dictionaries are stored in the column metadata.
Encoding encoding = 1;
// The pages in the column
repeated Page pages = 2;
// The file offsets of each of the column metadata buffers
//
// There may be zero buffers.
repeated uint64 buffer_offsets = 3;
// The size (in bytes) of each of the column metadata buffers
//
// This field will have the same length as `buffer_offsets` and
// may be empty.
// NOTE(review): presumably entry i describes the same buffer as
// `buffer_offsets[i]` — confirm.
repeated uint64 buffer_sizes = 4;
} // Metadata-End
// ## Where is the rest?
//
// This file format is extremely minimal. It is a building block for
// creating more useful readers and writers and not terribly useful by itself.
// Other protobuf files will describe how this can be extended.

View file

@ -0,0 +1,99 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.datafusion;
import "table_identifier.proto";
// A range of u64 values described by `start` and `end`.
// NOTE(review): the inclusivity of the bounds is not stated here; presumably
// half-open [start, end) — confirm against the Rust counterpart.
message U64Range {
uint64 start = 1;
uint64 end = 2;
}
// Serializable description of a projection.
// NOTE(review): the field meanings below are inferred from their names —
// confirm against the Rust type this message mirrors.
message ProjectionProto {
// Ids of the fields to read.
repeated int32 field_ids = 1;
// Presumably: whether to also emit the row id / row address columns.
bool with_row_id = 2;
bool with_row_addr = 3;
// Presumably: whether to also emit the per-row version columns.
bool with_row_last_updated_at_version = 4;
bool with_row_created_at_version = 5;
// How blob / binary columns are read (see `BlobHandlingProto`).
BlobHandlingProto blob_handling = 6;
}
// Selects how blob and binary columns are returned by a read.
message BlobHandlingProto {
// The handling mode. As a proto3 `oneof`, at most one of these is set.
// NOTE(review): for the `bool` variants it is presumably the presence of the
// variant, not its value, that selects the mode — confirm.
oneof mode {
// All blobs read as binary
bool all_binary = 1;
// Blobs as descriptions, other binary as binary (default)
bool blobs_descriptions = 2;
// All binary columns as descriptions
bool all_descriptions = 3;
// Specific blobs read as binary, rest as descriptions (non-blob binary stays binary)
FieldIdSet some_blobs_binary = 4;
// Specific columns as binary, all other binary as descriptions
FieldIdSet some_binary = 5;
}
}
// A set of field ids; used by the `some_blobs_binary` and `some_binary`
// variants of `BlobHandlingProto`.
message FieldIdSet {
repeated uint32 field_ids = 1;
}
// The threading mode of a filtered read.
// NOTE(review): the uint64 payloads are presumably the thread count and the
// partition count respectively — confirm against the Rust counterpart.
message FilteredReadThreadingModeProto {
// As a proto3 `oneof`, at most one of these is set.
oneof mode {
uint64 one_partition_multiple_threads = 1;
uint64 multiple_partitions = 2;
}
}
// Serializable form of FilteredReadOptions.
message FilteredReadOptionsProto {
optional U64Range scan_range_before_filter = 1;
optional U64Range scan_range_after_filter = 2;
bool with_deleted_rows = 3;
optional uint32 batch_size = 4;
optional uint64 fragment_readahead = 5;
repeated uint64 fragment_ids = 6;
ProjectionProto projection = 7;
// Substrait-encoded filters; `filter_schema_ipc` below carries the schema used
// to decode them.
// NOTE(review): the difference between the "refine" and "full" filter is not
// documented here — see the Rust FilteredReadOptions.
optional bytes refine_filter_substrait = 8;
optional bytes full_filter_substrait = 9;
FilteredReadThreadingModeProto threading_mode = 10;
optional uint64 io_buffer_size_bytes = 11;
// Arrow IPC schema for decoding Substrait filters (may be wider than projection).
optional bytes filter_schema_ipc = 12;
}
// Serializable form of FilteredReadPlan (planned/distributed mode).
// RowAddrTreeMap serialized via its built-in serialize_into/deserialize_from.
// Per-fragment filters are Substrait-encoded and deduplicated.
message FilteredReadPlanProto {
// The serialized RowAddrTreeMap (see the message comment for the encoding).
bytes row_addr_tree_map = 1;
// NOTE(review): presumably the row range to apply after filtering, as in
// FilteredReadOptionsProto — confirm.
optional U64Range scan_range_after_filter = 2;
// Arrow IPC schema for decoding Substrait filters (matches the schema used at encode time).
optional bytes filter_schema_ipc = 3;
// Per-fragment filter mapping. Key is fragment id, value is a list index into
// filter_expressions. Multiple fragments can share the same list index when
// they have the same filter, avoiding duplicate Substrait encoding.
map<uint32, uint32> fragment_filter_ids = 4;
// Deduplicated Substrait-encoded filter expressions. Each entry is referenced
// by one or more values in fragment_filter_ids.
repeated bytes filter_expressions = 5;
}
// Top-level wrapper for FilteredReadExec serialization.
message FilteredReadExecProto {
// The table to read; `TableIdentifier` comes from table_identifier.proto.
TableIdentifier table = 1;
// The read options.
FilteredReadOptionsProto options = 2;
// FilteredRead has two modes
// Plan-then-execute (distributed): The planner creates a FilteredReadPlan and sends it to a remote executor.
// Plan-and-execute (local): The executor creates the plan itself at execution time.
// NOTE(review): `plan` is presumably set only in plan-then-execute mode — confirm.
optional FilteredReadPlanProto plan = 3;
// Note: FilteredReadExec.index_input (child ExecutionPlan) is NOT serialized here.
// DataFusion's PhysicalExtensionCodec handles child plans automatically: it walks
// the plan tree via children() / with_new_children(), serializes each node, and
// passes deserialized children back as the `inputs` parameter in try_decode.
// This means any ExecutionPlan in the tree (including index_input) must also
// implement try_encode/try_decode in the PhysicalExtensionCodec.
// TODO: implement serialize/deserialize for lance-specific index input ExecutionPlans.
}

249
vendor/lance-table/protos/index.proto vendored Normal file
View file

@ -0,0 +1,249 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.index.pb;
import "google/protobuf/any.proto";
// The type of an index.
// Only vector indexes are enumerated here.
enum IndexType {
// Vector index
VECTOR = 0;
}
// Metadata describing an index: its name, the indexed columns, the dataset
// version it was built from, its type and its implementation details.
message Index {
// The unique index name in the dataset.
string name = 1;
// Columns to be used to build the index.
repeated string columns = 2;
// The version of the dataset this index was built from.
uint64 dataset_version = 3;
// The [`IndexType`] of the index.
IndexType index_type = 4;
// Index implementation details. As a proto3 `oneof`, at most one is set;
// only a vector index is defined here.
oneof implementation {
VectorIndex vector_index = 5;
}
}
// A tensor: an element type, a shape and a raw data buffer.
message Tensor {
// The element types a tensor can hold.
enum DataType {
BFLOAT16 = 0;
FLOAT16 = 1;
FLOAT32 = 2;
FLOAT64 = 3;
UINT8 = 4;
UINT16 = 5;
UINT32 = 6;
UINT64 = 7;
}
DataType data_type = 1;
// Data shape, [dim1, dim2, ...]
repeated uint32 shape = 2;
// Data buffer
// NOTE(review): byte order and element layout are not specified in this
// file — confirm against the reader/writer.
bytes data = 3;
}
// IVF (Inverted File) index metadata.
message IVF {
// Centroids of partitions. `dimension * num_partitions` of float32s.
//
// Deprecated, use centroids_tensor instead.
repeated float centroids = 1; // [deprecated = true];
// File offset of each partition.
repeated uint64 offsets = 2;
// Number of records in each partition.
repeated uint32 lengths = 3;
// Tensor of centroids. `num_partitions * dimension` of float32s.
Tensor centroids_tensor = 4;
// KMeans loss.
optional double loss = 5;
}
// Product Quantization.
message PQ {
// The number of bits used to represent a centroid.
uint32 num_bits = 1;
// Number of sub vectors.
uint32 num_sub_vectors = 2;
// Vector dimension
uint32 dimension = 3;
// Codebook. `dimension * 2 ^ num_bits` of float32s.
repeated float codebook = 4;
// Tensor of codebook. `2 ^ num_bits * dimension` of floats.
Tensor codebook_tensor = 5;
}
// Transform type
enum TransformType {
// NOTE(review): presumably "Optimized Product Quantization" — confirm.
OPQ = 0;
}
// A transform matrix to apply to a vector or vectors.
message Transform {
// The file offset where the matrix is stored.
uint64 position = 1;
// Data shape of the matrix, [rows, cols].
repeated uint32 shape = 2;
// Transform type.
TransformType type = 3;
}
// Flat Index (an empty message: it carries no parameters).
message Flat {}
// DiskAnn Index
//
// NOTE(review): `r`, `alpha` and `L` are presumably the usual DiskANN build
// parameters (maximum graph degree, pruning factor and search list size) —
// confirm.
message DiskAnn {
// Graph spec version
uint32 spec = 1;
// Graph file
string filename = 2;
// r parameter
uint32 r = 3;
// alpha parameter
float alpha = 4;
// L parameter
uint32 L = 5;
/// Entry points to the graph
repeated uint64 entries = 6;
}
// One stage in the vector index pipeline.
message VectorIndexStage {
// The kind of stage. As a proto3 `oneof`, at most one of these is set.
oneof stage {
// Flat index
Flat flat = 1;
// `IVF` - Inverted File
IVF ivf = 2;
// Product Quantization
PQ pq = 3;
// Transformer
Transform transform = 4;
// DiskANN
DiskAnn diskann = 5;
}
}
// Metric Type for Vector Index
//
// `L2` has the value 0, so it is what a reader sees when the field is left
// unset (proto3 enum default).
enum VectorMetricType {
// L2 (Euclidean) Distance
L2 = 0;
// Cosine Distance
Cosine = 1;
// Dot Product
Dot = 2;
// Hamming Distance
Hamming = 3;
}
// Vector Index Metadata
message VectorIndex {
// Index specification version.
uint32 spec_version = 1;
// Vector dimension.
uint32 dimension = 2;
// Composed vector index stages.
//
// For example, `IVF_PQ` index type can be expressed as:
//
// ```text
// let stages = vec![Ivf{}, PQ{num_bits: 8, num_sub_vectors: 16}]
// ```
repeated VectorIndexStage stages = 3;
// Vector distance metrics type
VectorMetricType metric_type = 4;
}
// Details for vector indexes, stored in the manifest's index_details field.
message VectorIndexDetails {
// The distance metric of the index.
VectorMetricType metric_type = 1;
// The target number of vectors per partition.
// 0 means unset.
uint64 target_partition_size = 2;
// Optional HNSW index configuration. If set, the index has an HNSW layer.
optional HnswParameters hnsw_index_config = 3;
// Parameters of product quantization.
message ProductQuantization {
uint32 num_bits = 1;
uint32 num_sub_vectors = 2;
}
// Parameters of scalar quantization.
message ScalarQuantization {
uint32 num_bits = 1;
}
// Parameters of "Rabit" quantization.
// NOTE(review): presumably RaBitQ — confirm.
message RabitQuantization {
enum RotationType {
FAST = 0;
MATRIX = 1;
}
uint32 num_bits = 1;
RotationType rotation_type = 2;
}
// No quantization; vectors are stored as-is.
message FlatCompression {}
// The compression (quantization) applied to the vectors. As a proto3 `oneof`,
// at most one of these is set.
// NOTE(review): field number 7 is skipped between `rq = 6` and `flat = 8`; if
// it was retired, an explicit `reserved 7;` would prevent reuse — confirm
// against upstream history.
oneof compression {
ProductQuantization pq = 4;
ScalarQuantization sq = 5;
RabitQuantization rq = 6;
FlatCompression flat = 8;
}
// Runtime hints: optional build preferences that don't affect index structure.
// Keys use reverse-DNS namespacing (e.g., "lance.ivf.max_iters", "lancedb.accelerator").
// Unrecognized keys must be silently ignored by all runtimes.
map<string, string> runtime_hints = 9;
}
// Hierarchical Navigable Small World (HNSW) parameters, used as an optional configuration for IVF indexes.
message HnswParameters {
// The maximum number of outgoing edges per node in the HNSW graph. Higher values
// mean more connections, better recall, but more memory and slower builds.
// Referred to as "M" in the HNSW literature.
uint32 max_connections = 1;
// "construction exploration factor": The size of the dynamic list used during
// index construction.
uint32 construction_ef = 2;
// The maximum number of levels in the HNSW graph.
uint32 max_level = 3;
}
// Details for JSON indexes.
// NOTE(review): undocumented upstream. `path` is presumably the JSON path being
// indexed and `target_details` the details message of the index built over the
// extracted values — confirm.
message JsonIndexDetails {
string path = 1;
google.protobuf.Any target_details = 2;
}
// Details for bloom filter indexes (currently an empty message).
message BloomFilterIndexDetails {}
// Details for R-tree indexes (currently an empty message).
message RTreeIndexDetails {}

View file

@ -0,0 +1,42 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.table;
// NOTE: Do *NOT* add new index details here. Add them to the index.proto file instead.
// This file is in the lance.table package namespace while the index.proto file is in the
// lance.index.pb package namespace.
//
// These are only here for forward compatibility. Older versions of Lance expect btree indexes
// to have lance.table in the package namespace.
//
// If you need to modify these messages (e.g. to add new fields to btree or bitmap) then
// it is ok to modify them here.
// Currently many of these are empty messages because all needed details are either hard-coded (e.g.
// filenames) or stored in the index itself. However, we may want to add more details in the
// future, in particular we can add details that may be useful for planning queries (e.g. don't
// force us to load the index until we know we can make use of it)
// Details for B-tree indexes (currently empty).
message BTreeIndexDetails {}
// Details for bitmap indexes (currently empty).
message BitmapIndexDetails {}
// Details for label-list indexes (currently empty).
message LabelListIndexDetails {}
// Details for n-gram indexes (currently empty).
message NGramIndexDetails {}
// Details for zone-map indexes (currently empty).
message ZoneMapIndexDetails {}
// Details for inverted indexes.
// NOTE(review): the fields appear to be tokenizer settings (inferred from
// their names); their exact semantics are not documented here — confirm
// against the tokenizer configuration in the index crate.
message InvertedIndexDetails {
// Marking this field as optional as old versions of the index store blank details and we
// need to make sure we have a proper optional field to detect this.
optional string base_tokenizer = 1;
string language = 2;
bool with_position = 3;
optional uint32 max_token_length = 4;
bool lower_case = 5;
bool stem = 6;
bool remove_stop_words = 7;
bool ascii_folding = 8;
uint32 min_ngram_length = 9;
uint32 max_ngram_length = 10;
bool prefix_only = 11;
}

View file

@ -0,0 +1,2 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

113
vendor/lance-table/protos/rowids.proto vendored Normal file
View file

@ -0,0 +1,113 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.table;
// TODO: what would it take to store this in a LanceV2 file?
// Or would flatbuffers be better for this?
/// A sequence of row IDs. This is split up into one or more segments,
/// each of which can be encoded in different ways. The encodings are optimized
/// for values that are sorted, which will often be the case with row ids.
/// They also have optimized forms depending on how sparse the values are.
message RowIdSequence {
/// The segments of the sequence.
/// NOTE(review): presumably concatenated in order to form the full
/// sequence — confirm.
repeated U64Segment segments = 1;
}
/// Different ways to encode a sequence of u64 values.
message U64Segment {
/// A range of u64 values.
message Range {
/// The start of the range, inclusive.
uint64 start = 1;
/// The end of the range, exclusive.
uint64 end = 2;
}
/// A range of u64 values with holes.
message RangeWithHoles {
/// The start of the range, inclusive.
uint64 start = 1;
/// The end of the range, exclusive.
uint64 end = 2;
/// The holes in the range, as a sorted array of values;
/// Binary search can be used to check whether a value is a hole and should
/// be skipped. This can also be used to count the number of holes before a
/// given value, if you need to find the logical offset of a value in the
/// segment.
EncodedU64Array holes = 3;
}
/// A range of u64 values with a bitmap.
message RangeWithBitmap {
/// The start of the range, inclusive.
uint64 start = 1;
/// The end of the range, exclusive.
uint64 end = 2;
/// A bitmap of the values in the range. The bitmap is a sequence of bytes,
/// where each byte represents 8 values. The first byte represents values
/// start to start + 7, the second byte represents values start + 8 to
/// start + 15, and so on. The most significant bit of each byte represents
/// the first value in the range, and the least significant bit represents
/// the last value in the range. If the bit is set, the value is in the
/// range; if it is not set, the value is not in the range.
bytes bitmap = 3;
}
/// The encoding of this segment. As a proto3 `oneof`, at most one is set.
oneof segment {
/// When the values are sorted and contiguous.
Range range = 1;
/// When the values are sorted but have a few gaps.
RangeWithHoles range_with_holes = 2;
/// When the values are sorted but have many gaps.
RangeWithBitmap range_with_bitmap = 3;
/// When the values are sorted but are sparse.
EncodedU64Array sorted_array = 4;
/// A general array of values, which is not sorted.
EncodedU64Array array = 5;
}
} // U64Segment
/// A basic bitpacked array of u64 values.
///
/// NOTE(review): for the 16- and 32-bit forms each value is presumably
/// reconstructed as `base + offset`; confirm against the decoder.
message EncodedU64Array {
message U16Array {
uint64 base = 1;
/// The deltas are stored as 16-bit unsigned integers.
/// (protobuf doesn't support 16-bit integers, so we use bytes instead)
bytes offsets = 2;
}
message U32Array {
uint64 base = 1;
/// The deltas are stored as 32-bit unsigned integers.
/// (we use bytes instead of uint32 to avoid overhead of varint encoding)
bytes offsets = 2;
}
message U64Array {
/// (We use bytes instead of uint64 to avoid overhead of varint encoding)
/// Note that this field uses number 2; this message declares no field 1.
bytes values = 2;
}
/// The width used to store the values. As a proto3 `oneof`, at most one is set.
oneof array {
U16Array u16_array = 1;
U32Array u32_array = 2;
U64Array u64_array = 3;
}
}
/// A sequence of dataset versions. Similar to RowIdSequence but tracks
/// version runs. It uses RLE (Run-Length Encoding) to efficiently
/// represent consecutive rows with the same version.
message RowDatasetVersionSequence {
/// The version runs that make up the sequence.
repeated RowDatasetVersionRun runs = 1;
}
/// A run of rows with the same version.
message RowDatasetVersionRun {
/// The number of consecutive rows with the same version.
/// NOTE(review): `span` is a `U64Segment`, not a plain count; presumably the
/// run length is derived from the segment — confirm against the decoder.
U64Segment span = 1;
/// The dataset version shared by the rows in this run.
uint64 version = 2;
}

717
vendor/lance-table/protos/table.proto vendored Normal file
View file

@ -0,0 +1,717 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.table;
import "google/protobuf/any.proto";
import "google/protobuf/timestamp.proto";
import "file.proto";
/*
Format:
+----------------------------------------+
| Encoded Column 0, Chunk 0 |
...
| Encoded Column M, Chunk N - 1 |
| Encoded Column M, Chunk N |
| Indices ... |
| Chunk Position (M x N x 8) |
| Manifest (Optional) |
| Metadata |
| i64: metadata position |
| MAJOR_VERSION | MINOR_VERSION | "LANC" |
+----------------------------------------+
*/
// UUID type, encoded as 16 bytes.
message UUID {
bytes uuid = 1;
}
// Manifest is a global section shared between all the files.
message Manifest {
// All fields of the dataset, including the nested fields.
repeated lance.file.Field fields = 1;
// Schema metadata.
map<string, bytes> schema_metadata = 5;
// Fragments of the dataset.
repeated DataFragment fragments = 2;
// Snapshot version number.
uint64 version = 3;
// The file position of the version auxiliary data.
// * It is not inheritable between versions.
// * It is not loaded by default during query.
uint64 version_aux_data = 4;
message WriterVersion {
// The name of the library that created this file.
string library = 1;
// The version of the library that created this file. Because we cannot assume
// that the library is semantically versioned, this is a string. However, if it
// is semantically versioned, it should be a valid semver string without any 'v'
// prefix. For example: `2.0.0`, `2.0.0-rc.1`.
//
// For forward compatibility with older readers, when writing new manifests this
// field should contain only the core version (major.minor.patch) without any
// prerelease or build metadata. The prerelease/build info should be stored in
// the separate prerelease and build_metadata fields instead.
string version = 2;
// Optional semver prerelease identifier.
//
// This field stores the prerelease portion of a semantic version separately
// from the core version number. For example, if the full version is "2.0.0-rc.1",
// the version field would contain "2.0.0" and prerelease would contain "rc.1".
//
// This separation ensures forward compatibility: older readers can parse the
// clean version field without errors, while newer readers can reconstruct the
// full semantic version by combining version, prerelease, and build_metadata.
//
// If absent, the version field is used as-is.
optional string prerelease = 3;
// Optional semver build metadata.
//
// This field stores the build metadata portion of a semantic version separately
// from the core version number. For example, if the full version is
// "2.0.0-rc.1+build.123", the version field would contain "2.0.0", prerelease
// would contain "rc.1", and build_metadata would contain "build.123".
//
// If absent, no build metadata is present.
optional string build_metadata = 4;
}
// The version of the writer that created this file.
//
// This information may be used to detect whether the file may have known bugs
// associated with that writer.
WriterVersion writer_version = 13;
// If present, the file position of the index metadata.
optional uint64 index_section = 6;
// Version creation Timestamp, UTC timezone
google.protobuf.Timestamp timestamp = 7;
// Optional version tag
string tag = 8;
// Feature flags for readers.
//
// A bitmap of flags that indicate which features are required to be able to
// read the table. If a reader does not recognize a flag that is set, it
// should not attempt to read the dataset.
//
// Known flags:
// * 1: deletion files are present
// * 2: row ids are stable and stored as part of the fragment metadata.
// * 4: use v2 format (deprecated)
// * 8: table config is present
uint64 reader_feature_flags = 9;
// Feature flags for writers.
//
// A bitmap of flags that indicate which features must be used when writing to the
// dataset. If a writer does not recognize a flag that is set, it should not attempt to
// write to the dataset.
//
// The flag identities are the same as for reader_feature_flags, but the values of
// reader_feature_flags and writer_feature_flags are not required to be identical.
uint64 writer_feature_flags = 10;
// The highest fragment ID that has been used so far.
//
// This ID is not guaranteed to be present in the current version, but it may
// have been used in previous versions.
//
// For a single fragment, will be zero. For no fragments, will be absent.
optional uint32 max_fragment_id = 11;
// Path to the transaction file, relative to `{root}/_transactions`. The file at that
// location contains a wire-format serialized Transaction message representing the
// transaction that created this version.
//
// This string field "transaction_file" may be empty if no transaction file was written.
//
// The path format is "{read_version}-{uuid}.txn" where {read_version} is the version of
// the table the transaction read from (serialized to decimal with no padding digits),
// and {uuid} is a hyphen-separated UUID.
string transaction_file = 12;
// The file position of the transaction content. None if the transaction is empty.
// The transaction content begins with the transaction content length as a u32.
// If the transaction proto message has a length of `len`, the message ends at `len` + 4.
optional uint64 transaction_section = 21;
// The next unused row id. If zero, then the table does not have any rows.
//
// This is only used if the "stable_row_ids" feature flag is set.
uint64 next_row_id = 14;
// Describes the storage format of the dataset's data files: the format name
// plus the format version shared by the files in this dataset version.
message DataStorageFormat {
// The format of the data files (e.g. "lance").
string file_format = 1;
// The max format version of the data files. The syntax of the version string can
// vary by file_format and is not required to follow semver.
//
// Every file in this version of the dataset has the same file_format version.
string version = 2;
}
// The data storage format
//
// This specifies what format is used to store the data files.
DataStorageFormat data_format = 15;
// Table config.
//
// Keys with the prefix "lance." are reserved for the Lance library. Other
// libraries may wish to similarly prefix their configuration keys
// appropriately.
map<string, string> config = 16;
// Metadata associated with the table.
//
// This is a key-value map that can be used to store arbitrary metadata
// associated with the table.
//
// This is different than configuration, which is used to tell libraries how
// to read, write, or manage the table.
//
// This is different than schema metadata, which is used to describe the
// data itself and is attached to the output schema of scans.
map<string, string> table_metadata = 19;
// Field number 17 (`blob_dataset_version`) was used for a secondary blob dataset.
reserved 17;
reserved "blob_dataset_version";
// The base paths of data files.
//
// This is used to determine the base path of a data file. In common cases data file paths are under current dataset base path.
// But for shallow cloning, importing file and other multi-tier storage cases, the actual data files could be outside of the current dataset.
// This field is used with the `base_id` in `lance.file.File` and `lance.file.DeletionFile`.
//
// For example, if we have a dataset with base path `s3://bucket/dataset`, we have a DataFile with base_id 0, we get the actual data file path by:
// base_paths[id = 0] + /data/ + file.path
// the key(a.k.a index) starts from 0, increased by 1 for each new base path.
repeated BasePath base_paths = 18;
// The branch of the dataset. None means main branch.
optional string branch = 20;
} // Manifest
// A base path under which files of a dataset may be stored. It can point outside
// the current dataset's own root, e.g. for shallow clones or imported files.
//
// Entries live in `Manifest.base_paths` and are referenced through `base_id` fields.
message BasePath {
// Identifier of this base path; this is the value that `base_id` fields refer to.
uint32 id = 1;
// An optional alias name for the base path.
// When a shallow clone targets a tagged version, the tag name is stored here.
optional string name = 2;
// Flag indicating whether this path is a dataset root path or a file directory:
// - true: Path is a dataset root (actual files live under subdirectories such as
//   `data` and `_deletions`)
// - false: Path is a direct file directory (e.g. when importing files)
bool is_dataset_root = 3;
// The absolute path. Note: this value is passed directly to Path::parse().
string path = 4;
}
// Auxiliary data attached to a dataset version.
// It is only loaded on demand.
message VersionAuxData {
// Key-value metadata; keys are strings and values are raw bytes.
map<string, bytes> metadata = 3;
}
// Metadata describing an index.
message IndexMetadata {
// Unique ID of an index. It is unique across all the dataset versions.
UUID uuid = 1;
// The columns the index is built on. These refer to file.Field.id.
repeated int32 fields = 2;
// Index name. Must be unique within one dataset version.
string name = 3;
// The version of the dataset this index was built from.
uint64 dataset_version = 4;
// A bitmap of the included fragment ids.
//
// This may be used to determine how much of the dataset is covered by the
// index. This information can be retrieved from the dataset by looking at
// the dataset at `dataset_version`. However, since the old version may be
// deleted while the index is still in use, this information is also stored
// in the index.
//
// The bitmap is stored as a 32-bit Roaring bitmap.
bytes fragment_bitmap = 5;
// Details, specific to the index type, which are needed to load / interpret the index.
//
// Indices should avoid putting large amounts of information in this field, as it will
// bloat the manifest.
//
// Indexes are plugins, and so the format of the details message is flexible and not fully
// defined by the table format. However, there are some conventions that should be followed:
//
// - When Lance APIs refer to indexes they will use the type URL of the index details as the
//   identifier for the index type. If a user provides a simple string identifier like
//   "btree" then it will be converted to "/lance.table.BTreeIndexDetails".
// - Type URL comparisons are case-insensitive. Therefore an index must have a unique type
//   URL ignoring case.
google.protobuf.Any index_details = 6;
// The minimum lance version that this index is compatible with.
optional int32 index_version = 7;
// Timestamp when the index was created (UTC timestamp in milliseconds since epoch).
//
// This field is optional for backward compatibility. For existing indices created before
// this field was added, this will be None/null.
optional uint64 created_at = 8;
// The base path index of the data file. Used when the file is imported or referred from another dataset.
// Lance uses it as the key into the base_paths field in Manifest to determine the actual base path of the data file.
// NOTE(review): this wording matches DataFile.base_id; presumably it refers to the
// index's own files here -- confirm.
optional uint32 base_id = 9;
// List of files and their sizes for this index segment.
// This enables skipping HEAD calls when opening indices and allows reporting
// of index sizes without extra IO.
// If this is empty, the index file sizes are unknown.
repeated IndexFile files = 10;
}
// Metadata about a single file within an index segment.
// Entries of this type are listed, for example, in `IndexMetadata.files`.
message IndexFile {
// Path of the file, relative to the index directory (e.g., "index.idx", "auxiliary.idx").
string path = 1;
// Size of the file, in bytes.
uint64 size_bytes = 2;
}
// Index section, containing the list of index metadata for one dataset version.
message IndexSection {
// The index metadata entries of that dataset version.
repeated IndexMetadata indices = 1;
}
// A DataFragment is a set of files which represent the different columns of the same
// rows. If a column exists in the schema of a dataset, but the file for that column does
// not exist within a DataFragment of that dataset, that column consists entirely of
// nulls.
message DataFragment {
// The ID of a DataFragment is unique within a dataset.
uint64 id = 1;
// The data files that together hold the columns of this fragment.
repeated DataFile files = 2;
// File that indicates which rows, if any, should be considered deleted.
DeletionFile deletion_file = 3;
// TODO: What's the simplest way we can allow an inline tombstone bitmap?
//
// A serialized RowIdSequence message (see rowids.proto).
//
// These are the row ids for the fragment, in order of the rows as they appear.
// That is, if a fragment has 3 rows, and the row ids are [1, 42, 3], then the
// first row is row 1, the second row is row 42, and the third row is row 3.
oneof row_id_sequence {
// If small (< 200KB), the row ids are stored inline.
bytes inline_row_ids = 5;
// Otherwise, stored as part of a file.
ExternalFile external_row_ids = 6;
} // row_id_sequence
// The versions at which the fragment's rows were last updated.
// NOTE(review): the serialized encoding is not described in this file -- confirm
// against the writer before relying on it.
oneof last_updated_at_version_sequence {
// If small (< 200KB), the rows' last-updated versions are stored inline.
bytes inline_last_updated_at_versions = 7;
// Otherwise, stored as part of a file.
ExternalFile external_last_updated_at_versions = 8;
} // last_updated_at_version_sequence
// The versions at which the fragment's rows were created.
// NOTE(review): the serialized encoding is not described in this file -- confirm
// against the writer before relying on it.
oneof created_at_version_sequence {
// If small (< 200KB), the rows' created-at versions are stored inline.
bytes inline_created_at_versions = 9;
// Otherwise, stored as part of a file.
ExternalFile external_created_at_versions = 10;
} // created_at_version_sequence
// Number of original rows in the fragment. This includes rows that are now marked with
// deletion tombstones. To compute the current number of rows, subtract
// `deletion_file.num_deleted_rows` from this value.
uint64 physical_rows = 4;
}
// A single data file of a DataFragment, together with the field ids it stores.
message DataFile {
// Path to the root relative to the dataset's URI.
string path = 1;
// The ids of the fields/columns in this file.
//
// When a DataFile object is created in memory, every value in fields is assigned -1 by
// default. An object with a value in fields of -1 must not be stored to disk. -2 is
// used for "tombstoned", meaning a field that is no longer in use. This is often
// because the original field id was reassigned to a different data file.
//
// In Lance v1 IDs are assigned based on position in the file, offset by the max
// existing field id in the table (if any already). So when a fragment is first created
// with one file of N columns, the field ids will be 1, 2, ..., N. If a second fragment
// is created with M columns, the field ids will be N+1, N+2, ..., N+M.
//
// In Lance v1 there is one field for each field in the input schema; this includes
// nested fields (both struct and list). Fixed size list fields have only a single
// field id (these are not considered nested fields in Lance v1).
//
// This allows column indices to be calculated from field IDs and the input schema.
//
// In Lance v2 the field IDs generally follow the same pattern but there is no
// way to calculate the column index from the field ID. This is because a given
// field could be encoded in many different ways, some of which occupy a different
// number of columns. For example, a struct field could be encoded into N + 1 columns
// or it could be encoded into a single packed column. To determine column indices
// the column_indices property should be used instead.
//
// In Lance v1 these ids must be sorted but might not always be contiguous.
repeated int32 fields = 2;
// The top-level column indices for each field in the file.
//
// If the data file is version 1 then this property will be empty.
//
// Otherwise there must be one entry for each field in `fields`.
//
// Some fields may not correspond to a top-level column in the file. In these cases
// the index will be -1.
//
// For example, consider the schema (field ids in parentheses; entries without a
// name are the item type of the list above them):
//
// - dimension: packed-struct (0):
//   - x: u32 (1)
//   - y: u32 (2)
// - path: `list<u32>` (3)
// - embedding: `fsl<768>` (4)
//   - fp64
// - borders: `fsl<4>` (5)
//   - simple-struct (6)
//     - margin: fp64 (7)
//     - padding: fp64 (8)
//
// One possible column indices array could be:
// [0, -1, -1, 1, 3, 4, 5, 6, 7]
//
// This reflects quite a few phenomena:
// - The packed struct is encoded into a single column and there is no top-level column
//   for the x or y fields
// - The variable sized list is encoded into two columns
// - The embedding is encoded into a single column (common for FSL of primitive) and there
//   is no "FSL column"
// - The borders field actually does have an "FSL column"
//
// The column indices table may not have duplicates (other than -1).
repeated int32 column_indices = 3;
// The major file version used to create the file.
uint32 file_major_version = 4;
// The minor file version used to create the file.
//
// If both `file_major_version` and `file_minor_version` are set to 0,
// then this is a version 0.1 or version 0.2 file.
uint32 file_minor_version = 5;
// The known size of the file on disk in bytes.
//
// This is used to quickly find the footer of the file.
//
// When this is zero, it should be interpreted as "unknown".
uint64 file_size_bytes = 6;
// The base path index of the data file. Used when the file is imported or referred from another dataset.
// Lance uses it as the key into the base_paths field in Manifest to determine the actual base path of the data file.
optional uint32 base_id = 7;
} // DataFile
// Deletion file: records which rows of a fragment are marked as deleted.
//
// The path of the deletion file is constructed as:
// {root}/_deletions/{fragment_id}-{read_version}-{id}.{extension}
// where {extension} depends on DeletionFileType.
message DeletionFile {
// Type of deletion file, intended as a way to increase efficiency of the storage of deleted row
// offsets. If there are sparsely deleted rows, then ARROW_ARRAY is the most efficient. If there
// are densely deleted rows, then BITMAP is the most efficient.
enum DeletionFileType {
// A single Int32Array of deleted row offsets, stored as an Arrow IPC file with one batch and
// one column. Has a .arrow extension.
ARROW_ARRAY = 0;
// A Roaring Bitmap of deleted row offsets. Has a .bin extension.
BITMAP = 1;
}
// Type of deletion file.
DeletionFileType file_type = 1;
// The version of the dataset this deletion file was built from.
// This is the {read_version} component of the file path.
uint64 read_version = 2;
// An opaque id used to differentiate this file from others written by concurrent
// writers. This is the {id} component of the file path.
uint64 id = 3;
// The number of rows that are marked as deleted.
uint64 num_deleted_rows = 4;
// The base path index of the deletion file. Used when the file is imported or referred from another
// dataset. Lance uses it as the key into the base_paths field in Manifest to determine the actual base
// path of the deletion file.
optional uint32 base_id = 7;
} // DeletionFile
// A reference to a range of bytes inside a file. It is used where content is
// stored in a file rather than inline in the message that refers to it.
message ExternalFile {
// Path to the file, relative to the root of the table.
string path = 1;
// The byte offset in the file where the data starts.
uint64 offset = 2;
// The size of the data in the file, in bytes.
uint64 size = 3;
}
// VectorIndexDetails and HnswParameters (formerly HnswIndexDetails) moved to index.proto
// Details of a fragment reuse index: a list of versions, each holding summaries
// of fragment rewrite groups. The content is stored either inline or in an
// external file.
message FragmentReuseIndexDetails {
oneof content {
// If < 200KB, store the content inline; otherwise store the InlineContent bytes in an external file.
InlineContent inline = 1;
ExternalFile external = 2;
}
// The actual content: one entry per recorded version.
message InlineContent {
repeated Version versions = 1;
}
// A compact description of a fragment: its id and row counts.
message FragmentDigest {
uint64 id = 1;
uint64 physical_rows = 2;
uint64 num_deleted_rows = 3;
}
// A summarized version of the RewriteGroup information in a Rewrite transaction.
message Group {
// A roaring treemap of the changed row addresses.
// When combined with the old fragment IDs and new fragment IDs,
// it can recover the full mapping of old row addresses to either new row addresses or deleted.
// This mapping can then be used to remap indexes or satisfy index queries for the new unindexed fragments.
bytes changed_row_addrs = 1;
// Digests of the fragments that were replaced.
repeated FragmentDigest old_fragments = 2;
// Digests of the fragments that replaced them.
repeated FragmentDigest new_fragments = 3;
}
// The rewrite groups recorded for one dataset version.
message Version {
// The dataset_version at the time the index adds this version entry.
uint64 dataset_version = 1;
repeated Group groups = 3;
}
}
// ============================================================================
// MemWAL Index Types
// ============================================================================
// Shard manifest containing epoch-based fencing and WAL state.
// Each shard has exactly one active writer at any time.
message ShardManifest {
// Shard identifier (UUID v4).
UUID shard_id = 11;
// Manifest version number.
// Matches the version encoded in the filename.
uint64 version = 1;
// Shard spec ID this shard was created with.
// Set at shard creation and immutable thereafter.
// A value of 0 indicates a manually-created shard not governed by any spec.
uint32 shard_spec_id = 10;
// Computed shard field values as raw Arrow scalar bytes, keyed by shard
// field id. The byte encoding follows Arrow's little-endian convention:
// int32 is 4 LE bytes, utf8 is raw UTF-8 bytes, etc. The receiver looks
// up the result_type from the ShardingSpec to interpret each value.
repeated ShardFieldEntry shard_field_entries = 14;
// Writer fencing token - monotonically increasing.
// A writer must increment this when claiming the shard.
uint64 writer_epoch = 2;
// The most recent WAL entry position that has been flushed to a MemTable.
// During recovery, replay starts from replay_after_wal_entry_position + 1.
// WAL positions are 1-based, so the default value 0 unambiguously means
// "no flush has ever stamped this shard" and recovery replays from 1.
uint64 replay_after_wal_entry_position = 3;
// The most recent WAL entry position observed at the time the manifest was
// updated. WAL positions are 1-based; default 0 means no entry has been
// written yet. This is a hint, not authoritative - recovery must list
// files to find actual state.
uint64 wal_entry_position_last_seen = 4;
// Next generation ID to create (incremented after each MemTable flush).
uint64 current_generation = 6;
// NOTE: Field 7 removed: merged_generation moved to MemWalIndexDetails.merged_generations,
// which is the authoritative source for merge progress. (This note is about the
// removed field, not about flushed_generations below.)
//
// List of flushed MemTable generations and their directory paths.
repeated FlushedGeneration flushed_generations = 8;
}
// A shard field value stored as raw Arrow scalar bytes.
// Entries of this type are listed in ShardManifest.shard_field_entries.
message ShardFieldEntry {
// Shard field id (matches ShardingField.field_id in the ShardingSpec).
string field_id = 1;
// Raw Arrow scalar value bytes in little-endian encoding.
// The data type is determined by the result_type of the matching ShardingField.
bytes value = 2;
}
// A flushed MemTable generation and its storage location.
// Entries of this type are listed in ShardManifest.flushed_generations.
message FlushedGeneration {
// Generation number.
uint64 generation = 1;
// Directory name, relative to the shard directory.
string path = 2;
}
// A shard's merged generation, used in MemWalIndexDetails.
// The same (shard, generation) pair is also used for
// IndexCatchupProgress.caught_up_generations.
message MergedGeneration {
// Shard identifier (UUID v4).
UUID shard_id = 1;
// Last generation merged to base table for this shard.
uint64 generation = 2;
}
// Tracks which merged generation a base table index has been rebuilt to cover.
// Used to determine whether to read from flushed MemTable indexes or base table.
// Entries of this type are listed in MemWalIndexDetails.index_catchup.
message IndexCatchupProgress {
// Name of the base table index (must match an entry in maintained_indexes).
string index_name = 1;
// Per-shard progress: the generation up to which this index covers.
// If a shard is not present, the index is assumed to be fully caught up
// (i.e., caught_up_generation >= merged_generation for that shard).
repeated MergedGeneration caught_up_generations = 2;
}
// Index details for MemWAL Index, stored in IndexMetadata.index_details.
// This is the centralized structure for all MemWAL metadata:
// - Configuration (sharding specs, indexes to maintain)
// - Merge progress (merged generations per shard)
// - Shard state snapshots
//
// Writers read this index to get configuration before writing.
// Readers may use shard snapshots in this index as a point-in-time
// optimization. Readers that need the latest shard set should list shard
// directories in storage and read each shard's latest manifest.
// A background process updates the index periodically to keep shard snapshots current.
//
// Shard snapshots are stored as a Lance file with one row per shard.
// The schema records shard discovery fields. Full mutable shard state remains
// authoritative in the shard manifest files.
// Snapshot schema columns:
// - shard_id: utf8
// - shard_spec_id: uint32
// - shard_field_{field_id}: typed per the matching ShardingField.result_type
message MemWalIndexDetails {
// Snapshot timestamp (Unix timestamp in milliseconds).
int64 snapshot_ts_millis = 1;
// Number of shards in the snapshot.
// Used to determine storage format without reading the snapshot data.
uint32 num_shards = 2;
// Inline shard snapshots for small shard counts.
// When num_shards <= threshold (implementation-defined, e.g., 100),
// snapshots are stored inline as serialized bytes.
// Format: Lance file bytes with the shard snapshot schema.
// NOTE(review): where snapshots are stored when num_shards exceeds the threshold
// is not described in this message -- confirm against the implementation.
optional bytes inline_snapshots = 3;
// Sharding specs defining how to derive shard identifiers.
// This configuration determines how rows are partitioned into shards.
repeated ShardingSpec sharding_specs = 7;
// Indexes from the base table to maintain in MemTables.
// These are index names referencing indexes defined on the base table.
// The primary key btree index is always maintained implicitly and
// should not be listed here.
//
// For vector indexes, MemTables inherit quantization parameters (PQ codebook,
// SQ params) from the base table index to ensure distance comparability.
repeated string maintained_indexes = 8;
// Last generation merged to base table for each shard.
// This is updated atomically with merge-insert data commits, enabling
// conflict resolution when multiple mergers operate concurrently.
//
// Note: This is separate from shard snapshots because:
// 1. merged_generations is updated by mergers (atomic with data commit)
// 2. shard snapshots are updated by the background index builder
repeated MergedGeneration merged_generations = 9;
// Per-index catchup progress tracking.
// When data is merged to the base table, base table indexes are rebuilt
// asynchronously. This field tracks which generation each index covers.
//
// For indexed queries, if an index's caught_up_generation < merged_generation,
// readers should use flushed MemTable indexes for the gap instead of
// scanning unindexed data in the base table.
//
// If an index is not present in this list, it is assumed to be fully caught up.
repeated IndexCatchupProgress index_catchup = 10;
// Default ShardWriter configuration values for this MemWAL index.
//
// A free-form string map persisted so that every writer across
// processes and restarts starts from the same default writer
// configuration. These are defaults only: an individual writer may
// still override any value at runtime in its own ShardWriterConfig
// (which is not persisted).
map<string, string> writer_config_defaults = 11;
}
// Sharding spec definition.
// Specs are listed in MemWalIndexDetails.sharding_specs; a shard records the spec
// it was created with in ShardManifest.shard_spec_id.
message ShardingSpec {
// Unique identifier for this spec within the index.
// IDs are never reused.
uint32 spec_id = 1;
// Sharding field definitions that determine how to compute shard identifiers.
repeated ShardingField fields = 2;
}
// Sharding field definition: how one shard field value is derived from source
// columns, using either a well-known transform or a custom expression.
message ShardingField {
// Unique string identifier for this shard field.
// ShardFieldEntry.field_id refers to this value.
string field_id = 1;
// Field IDs referencing source columns in the schema.
repeated int32 source_ids = 2;
// Well-known shard transform name (e.g., "identity", "year", "bucket").
// Mutually exclusive with expression.
optional string transform = 3;
// DataFusion SQL expression for custom logic.
// Mutually exclusive with transform.
optional string expression = 4;
// Output type of the shard value (Arrow type name).
// It determines how the bytes in ShardFieldEntry.value are interpreted.
string result_type = 5;
// Transform parameters (e.g., num_buckets for bucket transform).
map<string, string> parameters = 6;
}

View file

@ -0,0 +1,19 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
package lance.datafusion;
// Identifies a Lance dataset for remote reconstruction.
//
// Two modes:
// 1. uri + serialized_manifest (fast): remote executor skips manifest read.
// 2. uri + version + etag (lightweight): remote executor loads manifest from storage.
message TableIdentifier {
// URI of the dataset; used in both modes.
string uri = 1;
// Dataset version; used in the lightweight mode.
uint64 version = 2;
// ETag of the manifest; used in the lightweight mode.
optional string manifest_etag = 3;
// Serialized manifest; when present the remote executor can skip the manifest read.
optional bytes serialized_manifest = 4;
// NOTE(review): presumably the storage options the remote executor should use
// when opening `uri` -- confirm against the consumer.
map<string, string> storage_options = 5;
}

View file

@ -0,0 +1,354 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
syntax = "proto3";
import "file.proto";
import "table.proto";
import "google/protobuf/any.proto";
package lance.table;
// A transaction represents the changes to a dataset.
//
// This has two purposes:
// 1. When retrying a commit, the transaction can be used to re-build an updated
// manifest.
// 2. When there's a conflict, this can be used to determine whether the other
// transaction is compatible with this one.
message Transaction {
// The version of the dataset this transaction was built from.
//
// For example, for a delete transaction this means the version of the dataset
// that was read from while evaluating the deletion predicate.
uint64 read_version = 1;
// The UUID that uniquely identifies a transaction.
string uuid = 2;
// Optional version tag.
string tag = 3;
// Optional properties for the transaction
// __lance_commit_message is a reserved key
map<string, string> transaction_properties = 4;
// An operation that adds new rows to the dataset.
message Append {
// The new fragments to append.
//
// Fragment IDs are not yet assigned.
repeated DataFragment fragments = 1;
}
// An operation that marks rows as deleted.
message Delete {
// The fragments to update.
//
// The fragment IDs will match existing fragments in the dataset.
repeated DataFragment updated_fragments = 1;
// The IDs of the fragments to delete entirely.
repeated uint64 deleted_fragment_ids = 2;
// The predicate that was evaluated.
//
// This may be used to determine whether the delete would have affected
// files written by a concurrent transaction.
string predicate = 3;
}
// An operation that creates or overwrites the entire dataset.
message Overwrite {
// The new fragments.
//
// Fragment IDs are not yet assigned.
repeated DataFragment fragments = 1;
// The new schema.
repeated lance.file.Field schema = 2;
// Schema metadata.
map<string, bytes> schema_metadata = 3;
// Key-value pairs to merge with existing config.
map<string, string> config_upsert_values = 4;
// The base paths to be added for the initial dataset creation.
repeated BasePath initial_bases = 5;
}
// An operation that adds or replaces a secondary index.
//
// This is also used to remove an index (it is replaced with nothing).
//
// - new_indices: the modified indices; empty if only dropping indices
// - removed_indices: the indices that are being replaced
message CreateIndex {
repeated IndexMetadata new_indices = 1;
repeated IndexMetadata removed_indices = 2;
}
// An operation that rewrites but does not change the data in the table. These
// kinds of operations just rearrange data.
message Rewrite {
// The old fragments that are being replaced.
//
// DEPRECATED: use groups instead.
//
// These should all have existing fragment IDs.
repeated DataFragment old_fragments = 1;
// The new fragments.
//
// DEPRECATED: use groups instead.
//
// These fragment IDs are not yet assigned.
repeated DataFragment new_fragments = 2;
// During a rewrite an index may be rewritten. We only serialize the UUID
// since a rewrite should not change the other index parameters.
message RewrittenIndex {
// The id of the index that will be replaced.
UUID old_id = 1;
// The id of the new index.
UUID new_id = 2;
// The new index details.
google.protobuf.Any new_index_details = 3;
// The version of the new index.
uint32 new_index_version = 4;
// Files in the new index with their sizes.
// Empty if file sizes are not available (e.g. older writers).
repeated IndexFile new_index_files = 5;
}
// A group of rewrite files that are all part of the same rewrite.
message RewriteGroup {
// The old fragments that are being replaced.
//
// These should have existing fragment IDs.
repeated DataFragment old_fragments = 1;
// The new fragments.
//
// Their IDs should have been reserved by an earlier
// reserve operation (see ReserveFragments).
repeated DataFragment new_fragments = 2;
}
// Groups of files that have been rewritten.
repeated RewriteGroup groups = 3;
// Indices that have been rewritten.
repeated RewrittenIndex rewritten_indices = 4;
}
// An operation that merges in a new column, altering the schema.
message Merge {
// The updated fragments.
//
// These should all have existing fragment IDs.
repeated DataFragment fragments = 1;
// The new schema.
repeated lance.file.Field schema = 2;
// Schema metadata.
map<string, bytes> schema_metadata = 3;
}
// An operation that projects a subset of columns, altering the schema.
message Project {
// The new schema, i.e. the schema after the projection.
repeated lance.file.Field schema = 1;
}
// An operation that restores a dataset to a previous version.
message Restore {
// The dataset version to restore to.
uint64 version = 1;
}
// An operation that reserves fragment ids for future use in
// a rewrite operation.
message ReserveFragments {
// The number of fragment ids to reserve.
uint32 num_fragments = 1;
}
// An operation that clones a dataset.
message Clone {
// - true: Performs a metadata-only clone (copies manifest without data files).
//   The cloned dataset references original data through `base_paths`,
//   suitable for experimental scenarios or rapid metadata migration.
// - false: Performs a full deep clone using the underlying object storage's native
//   copy API (e.g., S3 CopyObject, GCS rewrite). This leverages server-side
//   bulk copy operations to bypass download/upload bottlenecks, achieving
//   near-linear speedup for large datasets (typically 3-10x faster than
//   manual file transfers). The operation maintains atomicity and data
//   integrity guarantees provided by the storage backend.
bool is_shallow = 1;
// The reference name in the source dataset.
// In most cases it should be the branch or tag name in the source dataset.
optional string ref_name = 2;
// The version of the source dataset for cloning.
uint64 ref_version = 3;
// The absolute base path of the source dataset for cloning.
string ref_path = 4;
// If the target dataset is a branch, this is the branch name of the target dataset.
optional string branch_name = 5;
}
// Exact set of key hashes for conflict detection.
// Used when the number of inserted rows is small.
// This is one of the two variants of KeyExistenceFilter.data.
message ExactKeySetFilter {
// 64-bit hashes of the inserted row keys.
repeated uint64 key_hashes = 1;
}
// Bloom filter for key existence tests.
// Used when the number of rows is large.
// This is one of the two variants of KeyExistenceFilter.data.
message BloomFilter {
// Bitset backing the bloom filter (SBBF format).
bytes bitmap = 1;
// Number of bits in the bitmap.
uint32 num_bits = 2;
// Number of items the filter was sized for.
// Used for intersection validation (filters with different sizes cannot be compared).
// Default: 8192
uint64 number_of_items = 3;
// False positive probability the filter was sized for.
// Used for intersection validation (filters with different parameters cannot be compared).
// Default: 0.00057
double probability = 4;
}
// A filter for checking key existence in the set of rows inserted by a merge insert operation.
// Only created when the merge insert's ON columns match the schema's unenforced primary key.
// The presence of this filter indicates strict primary key conflict detection should be used.
// Can use either an exact set (for small row counts) or a Bloom filter (for large row counts).
message KeyExistenceFilter {
// Field IDs of columns participating in the key (must match unenforced primary key).
repeated int32 field_ids = 1;
// The underlying data structure storing the key hashes.
oneof data {
// Exact set of key hashes (used for a small number of rows).
ExactKeySetFilter exact = 2;
// Bloom filter (used for a large number of rows).
BloomFilter bloom = 3;
}
}
// A list of uint32 values, used as the map value type of
// Update.updated_fragment_offsets.
// Serialized as sorted distinct local physical row offsets within the fragment (0-based).
message UInt32List {
repeated uint32 values = 1;
}
// An operation that updates rows but does not add or remove rows.
message Update {
// The fragments that have been removed. These are fragments where all rows
// have been updated and moved to a new fragment.
repeated uint64 removed_fragment_ids = 1;
// The fragments that have been updated.
repeated DataFragment updated_fragments = 2;
// The new fragments where updated rows have been moved to.
repeated DataFragment new_fragments = 3;
// The ids of the fields that have been modified.
repeated uint32 fields_modified = 4;
// List of MemWAL shard generations to mark as merged after this transaction.
repeated MergedGeneration merged_generations = 5;
// The fields used to decide whether to preserve the new fragment's id in
// the fragment bitmap of the specified indices.
repeated uint32 fields_for_preserving_frag_bitmap = 6;
// The mode of update.
UpdateMode update_mode = 7;
// Filter for checking existence of keys in newly inserted rows, used for conflict detection.
// Only tracks keys from INSERT operations during merge insert, not updates.
optional KeyExistenceFilter inserted_rows = 8;
// Per-fragment physical row offsets that matched an update_columns hash join (RewriteColumns).
// The map key is presumably the fragment id -- confirm against the writer.
map<uint64, UInt32List> updated_fragment_offsets = 9;
}
// The mode of update operation.
// REWRITE_ROWS has value 0, so it is the value read when the field is left unset.
enum UpdateMode {
// Rows are deleted in current fragments and rewritten in new fragments.
// This is most optimal when the majority of columns are being rewritten
// or only a few rows are being updated.
REWRITE_ROWS = 0;
// Within each fragment, columns are fully rewritten and inserted as new data files.
// Old versions of columns are tombstoned. This is most optimal when most rows are affected
// but a small subset of columns are affected.
REWRITE_COLUMNS = 1;
}
// An entry for a map update. If value is not set, the key will be removed from the map.
message UpdateMapEntry {
// The key of the map entry to update.
string key = 1;
// The value to set for the key.
// Declared `optional` so that "not set" (remove the key) can be told apart
// from an explicitly set empty string.
optional string value = 2;
}
// A batch of changes to apply to a string-keyed map.
message UpdateMap {
// The entries to set or remove; see UpdateMapEntry for the per-entry semantics.
repeated UpdateMapEntry update_entries = 1;
// If true, the map will be replaced entirely with the new entries.
// If false, the new entries will be merged with the existing map.
bool replace = 2;
}
// An operation that updates the table config, table metadata, schema metadata,
// or field metadata.
message UpdateConfig {
// Changes to the table config.
UpdateMap config_updates = 6;
// Changes to the table metadata.
UpdateMap table_metadata_updates = 7;
// Changes to the schema metadata.
UpdateMap schema_metadata_updates = 8;
// Changes to field metadata, one UpdateMap per field.
// NOTE(review): the int32 map key is presumably the field id -- confirm.
map<int32, UpdateMap> field_metadata_updates = 9;
// Deprecated -------------------------------
// The fields below (numbers 1-4) and the nested FieldMetadataUpdate message are
// the older form of this operation; fields 6-9 above are the current form.
map<string, string> upsert_values = 1;
repeated string delete_keys = 2;
map<string, string> schema_metadata = 3;
map<uint32, FieldMetadataUpdate> field_metadata = 4;
message FieldMetadataUpdate {
map<string, string> metadata = 5;
}
}
// One entry of a DataReplacement operation: a fragment id paired with a data file.
// NOTE(review): `new_file` is presumably the file that replaces data within
// the fragment identified by `fragment_id` -- confirm against the writer.
message DataReplacementGroup {
uint64 fragment_id = 1;
DataFile new_file = 2;
}
// An operation that replaces the data in a region of the table with new data.
message DataReplacement {
// The replacements to apply, each pairing a fragment id with a new data file.
repeated DataReplacementGroup replacements = 1;
}
// Update the merged generations in the MemWAL index.
// This operation is used during merge-insert to atomically record which
// generations have been merged to the base table.
message UpdateMemWalState {
// Shards and generations being marked as merged.
repeated MergedGeneration merged_generations = 1;
}
// An operation that updates base paths in the dataset.
// Only additions are expressed here: the message carries new base paths and
// has no field for removing existing ones.
message UpdateBases {
// The new base paths to add to the manifest.
repeated BasePath new_bases = 1;
}
// The operation of this transaction.
// As a protobuf `oneof`, at most one of the operations below is set.
// Operation field numbers start at 100.
oneof operation {
Append append = 100;
Delete delete = 101;
Overwrite overwrite = 102;
CreateIndex create_index = 103;
Rewrite rewrite = 104;
Merge merge = 105;
Restore restore = 106;
ReserveFragments reserve_fragments = 107;
Update update = 108;
Project project = 109;
UpdateConfig update_config = 110;
DataReplacement data_replacement = 111;
UpdateMemWalState update_mem_wal_state = 112;
Clone clone = 113;
UpdateBases update_bases = 114;
}
// Fields 200/202 (`blob_append` / `blob_overwrite`) previously represented blob dataset ops.
reserved 200, 202;
reserved "blob_append", "blob_overwrite";
}

184
vendor/lance-table/src/feature_flags.rs vendored Normal file
View file

@ -0,0 +1,184 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
//! Feature flags
use crate::format::Manifest;
use lance_core::{Error, Result};
// Feature flags are single bits in a `u64`; each constant below is written as
// a hex literal so the bit position is easy to read off.

/// Fragments may contain deletion files, which record the tombstones of
/// soft-deleted rows.
pub const FLAG_DELETION_FILES: u64 = 0x01;

/// Row ids are stable for both moves and updates. Fragments contain an index
/// mapping row ids to row addresses.
pub const FLAG_STABLE_ROW_IDS: u64 = 0x02;

/// Files are written with the new v2 format (this flag is no longer used).
pub const FLAG_USE_V2_FORMAT_DEPRECATED: u64 = 0x04;

/// Table config is present.
pub const FLAG_TABLE_CONFIG: u64 = 0x08;

/// Dataset uses multiple base paths (for shallow clones or multi-base datasets).
pub const FLAG_BASE_PATHS: u64 = 0x10;

/// Disable writing the transaction file under `_transaction/`. This flag is set
/// when only the inline transaction in the manifest should be written.
pub const FLAG_DISABLE_TRANSACTION_FILE: u64 = 0x20;

/// The first bit that is unknown as a feature flag.
pub const FLAG_UNKNOWN: u64 = 0x40;
/// Set the reader and writer feature flags in the manifest based on the contents of the manifest.
///
/// Both flag words are first reset to zero and then rebuilt from what the
/// manifest contains, plus the two caller-supplied switches.
///
/// # Errors
///
/// Returns an invalid-input error when stable row ids apply (either
/// `enable_stable_row_id` is true or some fragment has row id metadata) but at
/// least one fragment has no row id metadata. The flags have already been reset
/// at that point.
pub fn apply_feature_flags(
    manifest: &mut Manifest,
    enable_stable_row_id: bool,
    disable_transaction_file: bool,
) -> Result<()> {
    // Start from a clean slate; everything below is derived.
    manifest.reader_feature_flags = 0;
    manifest.writer_feature_flags = 0;

    // Both readers and writers need to be able to read deletion files.
    if manifest
        .fragments
        .iter()
        .any(|fragment| fragment.deletion_file.is_some())
    {
        manifest.reader_feature_flags |= FLAG_DELETION_FILES;
        manifest.writer_feature_flags |= FLAG_DELETION_FILES;
    }

    // If any fragment has row ids, they must all have row ids.
    let uses_row_ids = enable_stable_row_id
        || manifest
            .fragments
            .iter()
            .any(|fragment| fragment.row_id_meta.is_some());
    if uses_row_ids {
        let some_fragment_lacks_row_ids = manifest
            .fragments
            .iter()
            .any(|fragment| fragment.row_id_meta.is_none());
        if some_fragment_lacks_row_ids {
            return Err(Error::invalid_input("All fragments must have row ids"));
        }
        manifest.reader_feature_flags |= FLAG_STABLE_ROW_IDS;
        manifest.writer_feature_flags |= FLAG_STABLE_ROW_IDS;
    }

    // A non-empty table config only affects writers.
    if !manifest.config.is_empty() {
        manifest.writer_feature_flags |= FLAG_TABLE_CONFIG;
    }

    // Multiple base paths (shallow clones or multi-base datasets) affect both sides.
    if !manifest.base_paths.is_empty() {
        manifest.reader_feature_flags |= FLAG_BASE_PATHS;
        manifest.writer_feature_flags |= FLAG_BASE_PATHS;
    }

    // Suppressing the transaction file only affects writers.
    if disable_transaction_file {
        manifest.writer_feature_flags |= FLAG_DISABLE_TRANSACTION_FILE;
    }

    Ok(())
}
/// Returns true when `reader_flags` contains no bit at or above
/// [`FLAG_UNKNOWN`], i.e. its numeric value is below `FLAG_UNKNOWN`.
pub fn can_read_dataset(reader_flags: u64) -> bool {
    let highest_known_value = FLAG_UNKNOWN - 1;
    reader_flags <= highest_known_value
}
/// Returns true when `writer_flags` contains no bit at or above
/// [`FLAG_UNKNOWN`], i.e. its numeric value is below `FLAG_UNKNOWN`.
pub fn can_write_dataset(writer_flags: u64) -> bool {
    let highest_known_value = FLAG_UNKNOWN - 1;
    writer_flags <= highest_known_value
}
/// Returns true when the [`FLAG_USE_V2_FORMAT_DEPRECATED`] bit is set in
/// `writer_flags`.
pub fn has_deprecated_v2_feature_flag(writer_flags: u64) -> bool {
    let deprecated_bit = writer_flags & FLAG_USE_V2_FORMAT_DEPRECATED;
    deprecated_bit != 0
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::format::BasePath;

    /// Every individually known flag, in bit order.
    const KNOWN_FLAGS: [u64; 6] = [
        FLAG_DELETION_FILES,
        FLAG_STABLE_ROW_IDS,
        FLAG_USE_V2_FORMAT_DEPRECATED,
        FLAG_TABLE_CONFIG,
        FLAG_BASE_PATHS,
        FLAG_DISABLE_TRANSACTION_FILE,
    ];

    /// No flags, each known flag, and a combination are readable; the first
    /// unknown bit is not.
    #[test]
    fn test_read_check() {
        assert!(can_read_dataset(0));
        for flag in KNOWN_FLAGS {
            assert!(can_read_dataset(flag));
        }
        let combined =
            FLAG_DELETION_FILES | FLAG_STABLE_ROW_IDS | FLAG_USE_V2_FORMAT_DEPRECATED;
        assert!(can_read_dataset(combined));
        assert!(!can_read_dataset(FLAG_UNKNOWN));
    }

    /// No flags, each known flag, and a combination are writable; the first
    /// unknown bit is not.
    #[test]
    fn test_write_check() {
        assert!(can_write_dataset(0));
        for flag in KNOWN_FLAGS {
            assert!(can_write_dataset(flag));
        }
        let combined = FLAG_DELETION_FILES
            | FLAG_STABLE_ROW_IDS
            | FLAG_USE_V2_FORMAT_DEPRECATED
            | FLAG_TABLE_CONFIG
            | FLAG_BASE_PATHS;
        assert!(can_write_dataset(combined));
        assert!(!can_write_dataset(FLAG_UNKNOWN));
    }

    /// `FLAG_BASE_PATHS` is set on both flag words exactly when the manifest
    /// has base paths.
    #[test]
    fn test_base_paths_feature_flags() {
        use crate::format::{DataStorageFormat, Manifest};
        use arrow_schema::{Field as ArrowField, Schema as ArrowSchema};
        use lance_core::datatypes::Schema;
        use std::collections::HashMap;
        use std::sync::Arc;

        // A minimal single-column schema is enough for flag computation.
        let test_field = ArrowField::new("test_field", arrow_schema::DataType::Int64, false);
        let arrow_schema = ArrowSchema::new(vec![test_field]);
        let schema = Schema::try_from(&arrow_schema).unwrap();

        // Case 1: no base paths, so the flag must be absent on both sides.
        let mut normal_manifest = Manifest::new(
            schema.clone(),
            Arc::new(vec![]),
            DataStorageFormat::default(),
            HashMap::new(),
        );
        apply_feature_flags(&mut normal_manifest, false, false).unwrap();
        assert_eq!(normal_manifest.reader_feature_flags & FLAG_BASE_PATHS, 0);
        assert_eq!(normal_manifest.writer_feature_flags & FLAG_BASE_PATHS, 0);

        // Case 2: one base path (shallow clone or multi-base), so the flag
        // must be present on both sides.
        let original_base = BasePath::new(
            1,
            "file:///path/to/original".to_string(),
            Some("test_ref".to_string()),
            true,
        );
        let mut base_paths: HashMap<u32, BasePath> = HashMap::new();
        base_paths.insert(1, original_base);

        let mut multi_base_manifest = Manifest::new(
            schema,
            Arc::new(vec![]),
            DataStorageFormat::default(),
            base_paths,
        );
        apply_feature_flags(&mut multi_base_manifest, false, false).unwrap();
        assert_ne!(
            multi_base_manifest.reader_feature_flags & FLAG_BASE_PATHS,
            0
        );
        assert_ne!(
            multi_base_manifest.writer_feature_flags & FLAG_BASE_PATHS,
            0
        );
    }
}

70
vendor/lance-table/src/format.rs vendored Normal file
View file

@ -0,0 +1,70 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
use arrow_buffer::ToByteSlice;
use uuid::Uuid;
mod fragment;
mod index;
mod manifest;
mod transaction;
pub use crate::rowids::version::{
RowDatasetVersionMeta, RowDatasetVersionRun, RowDatasetVersionSequence,
};
pub use fragment::*;
pub use index::{IndexFile, IndexMetadata, index_metadata_codec, list_index_files_with_sizes};
pub use manifest::{
BasePath, DETACHED_VERSION_MASK, DataStorageFormat, Manifest, SelfDescribingFileReader,
WriterVersion, is_detached_version,
};
pub use transaction::Transaction;
use lance_core::{Error, Result};
// In 0.36.1 we renamed Index to IndexMetadata because Index conflicted too much with the
// Index trait. This is left in for backward compatibility.
/// Deprecated alias for [`IndexMetadata`], kept so that code written against
/// the old `Index` name keeps compiling.
#[deprecated(since = "0.36.1", note = "Use IndexMetadata instead")]
pub type Index = IndexMetadata;
/// Protobuf definitions for Lance Format
///
/// The module body is the file `lance.table.rs`, included from the build's
/// `OUT_DIR` rather than written by hand in this source tree.
pub mod pb {
// The lints below are silenced for the whole module because its contents are
// included from the build output directory, not maintained in this file.
#![allow(clippy::all)]
#![allow(non_upper_case_globals)]
#![allow(non_camel_case_types)]
#![allow(non_snake_case)]
#![allow(unused)]
#![allow(improper_ctypes)]
#![allow(clippy::upper_case_acronyms)]
#![allow(clippy::use_self)]
include!(concat!(env!("OUT_DIR"), "/lance.table.rs"));
}
// The version/magic values below are written at the end of manifest files
// (e.g. versions/1.version).
/// Major version value written at the end of manifest files.
pub const MAJOR_VERSION: i16 = 0;
/// Minor version value written at the end of manifest files.
pub const MINOR_VERSION: i16 = 1;
/// Four-byte magic value written at the end of manifest files.
pub const MAGIC: &[u8; 4] = b"LANC";
impl TryFrom<&pb::Uuid> for Uuid {
type Error = Error;
fn try_from(p: &pb::Uuid) -> Result<Self> {
if p.uuid.len() != 16 {
return Err(Error::invalid_input(
"Protobuf UUID is malformed".to_string(),
));
}
let mut buf: [u8; 16] = [0; 16];
buf.copy_from_slice(p.uuid.to_byte_slice());
Ok(Self::from_bytes(buf))
}
}
impl From<&Uuid> for pb::Uuid {
    /// Copy the 16 bytes of `value` into a protobuf `Uuid` message.
    fn from(value: &Uuid) -> Self {
        let bytes: [u8; 16] = value.into_bytes();
        Self {
            uuid: bytes.to_vec(),
        }
    }
}

View file

@ -0,0 +1,841 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
use std::collections::HashMap;
use std::num::NonZero;
use std::sync::Arc;
use deepsize::DeepSizeOf;
use lance_core::Error;
use lance_file::format::{MAJOR_VERSION, MINOR_VERSION};
use lance_file::version::LanceFileVersion;
use lance_io::utils::CachedFileSize;
use object_store::path::Path;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use crate::format::pb;
use crate::rowids::version::{
RowDatasetVersionMeta, created_at_version_meta_to_pb, last_updated_at_version_meta_to_pb,
};
use lance_core::datatypes::Schema;
use lance_core::error::Result;
/// Lance Data File
///
/// A data file is a single file storing the data of the fields listed in
/// `fields`.
#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
pub struct DataFile {
/// Relative path of the data file to dataset root.
pub path: String,
/// The ids of fields in this file.
///
/// When identical across many fragments (common case), multiple `DataFile`
/// instances share a single heap allocation via `Arc`, significantly
/// reducing manifest memory for large tables.
pub fields: Arc<[i32]>,
/// The offsets of the fields listed in `fields`, empty in v1 files
///
/// Note that -1 is a possibility and it indicates that the field has
/// no top-level column in the file.
///
/// Columns that lack a field id may still exist as extra entries in
/// `column_indices`; such columns are ignored by field-id-based projection.
/// For example, some fields, such as blob fields, occupy multiple
/// columns in the file but only have a single field id.
pub column_indices: Arc<[i32]>,
/// The major version of the file format used to write this file.
pub file_major_version: u32,
/// The minor version of the file format used to write this file.
pub file_minor_version: u32,
/// The size of the file in bytes, if known.
pub file_size_bytes: CachedFileSize,
/// The base path of the datafile, when the datafile is outside the dataset.
pub base_id: Option<u32>,
}
// Custom Serialize: the two `Arc<[i32]>` members are written as plain slices
// so the JSON output shows ordinary arrays.
impl Serialize for DataFile {
    fn serialize<S: Serializer>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> {
        use serde::ser::SerializeStruct;

        // Number of `serialize_field` calls below; serde asks for it up front.
        const FIELD_COUNT: usize = 7;

        let field_ids: &[i32] = &self.fields;
        let column_indices: &[i32] = &self.column_indices;

        let mut state = serializer.serialize_struct("DataFile", FIELD_COUNT)?;
        state.serialize_field("path", &self.path)?;
        state.serialize_field("fields", field_ids)?;
        state.serialize_field("column_indices", column_indices)?;
        state.serialize_field("file_major_version", &self.file_major_version)?;
        state.serialize_field("file_minor_version", &self.file_minor_version)?;
        state.serialize_field("file_size_bytes", &self.file_size_bytes)?;
        state.serialize_field("base_id", &self.base_id)?;
        state.end()
    }
}
// Custom Deserialize: read Vec<i32> and convert to Arc<[i32]>
impl<'de> Deserialize<'de> for DataFile {
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<Self, D::Error> {
#[derive(Deserialize)]
struct DataFileHelper {
path: String,
fields: Vec<i32>,
#[serde(default)]
column_indices: Vec<i32>,
#[serde(default)]
file_major_version: u32,
#[serde(default)]
file_minor_version: u32,
file_size_bytes: CachedFileSize,
base_id: Option<u32>,
}
let helper = DataFileHelper::deserialize(deserializer)?;
Ok(Self {
path: helper.path,
fields: Arc::from(helper.fields),
column_indices: Arc::from(helper.column_indices),
file_major_version: helper.file_major_version,
file_minor_version: helper.file_minor_version,
file_size_bytes: helper.file_size_bytes,
base_id: helper.base_id,
})
}
}
impl DataFile {
pub fn new(
path: impl Into<String>,
fields: Vec<i32>,
column_indices: Vec<i32>,
file_major_version: u32,
file_minor_version: u32,
file_size_bytes: Option<NonZero<u64>>,
base_id: Option<u32>,
) -> Self {
Self {
path: path.into(),
fields: Arc::from(fields),
column_indices: Arc::from(column_indices),
file_major_version,
file_minor_version,
file_size_bytes: file_size_bytes.into(),
base_id,
}
}
/// Create a new `DataFile` with the expectation that fields and column_indices will be set later
pub fn new_unstarted(
path: impl Into<String>,
file_major_version: u32,
file_minor_version: u32,
) -> Self {
Self {
path: path.into(),
fields: Arc::from([]),
column_indices: Arc::from([]),
file_major_version,
file_minor_version,
file_size_bytes: Default::default(),
base_id: None,
}
}
pub fn new_legacy_from_fields(
path: impl Into<String>,
fields: Vec<i32>,
base_id: Option<u32>,
) -> Self {
Self::new(
path,
fields,
vec![],
MAJOR_VERSION as u32,
MINOR_VERSION as u32,
None,
base_id,
)
}
pub fn new_legacy(
path: impl Into<String>,
schema: &Schema,
file_size_bytes: Option<NonZero<u64>>,
base_id: Option<u32>,
) -> Self {
let mut field_ids = schema.field_ids();
field_ids.sort();
Self::new(
path,
field_ids,
vec![],
MAJOR_VERSION as u32,
MINOR_VERSION as u32,
file_size_bytes,
base_id,
)
}
pub fn schema(&self, full_schema: &Schema) -> Schema {
full_schema.project_by_ids(&self.fields, false)
}
pub fn is_legacy_file(&self) -> bool {
self.file_major_version == 0 && self.file_minor_version < 3
}
pub fn validate(&self, base_path: &Path) -> Result<()> {
if self.is_legacy_file() {
if !self.fields.windows(2).all(|w| w[0] < w[1]) {
return Err(Error::corrupt_file(
base_path.clone().join(self.path.clone()),
"contained unsorted or duplicate field ids",
));
}
} else if self.column_indices.len() < self.fields.len() {
// Every recorded field id must have a column index, but not every column needs
// to be associated with a field id (extra columns are allowed).
return Err(Error::corrupt_file(
base_path.clone().join(self.path.clone()),
"contained fewer column_indices than fields",
));
}
Ok(())
}
}
impl From<&DataFile> for pb::DataFile {
    /// Copy a [`DataFile`] into its protobuf form.
    fn from(df: &DataFile) -> Self {
        // A size that is not available is written as 0.
        let file_size_bytes = match df.file_size_bytes.get() {
            Some(size) => size.get(),
            None => 0,
        };
        Self {
            path: df.path.clone(),
            fields: df.fields.iter().copied().collect(),
            column_indices: df.column_indices.iter().copied().collect(),
            file_major_version: df.file_major_version,
            file_minor_version: df.file_minor_version,
            file_size_bytes,
            base_id: df.base_id,
        }
    }
}
impl TryFrom<pb::DataFile> for DataFile {
type Error = Error;
fn try_from(proto: pb::DataFile) -> Result<Self> {
Ok(Self {
path: proto.path,
fields: Arc::from(proto.fields),
column_indices: Arc::from(proto.column_indices),
file_major_version: proto.file_major_version,
file_minor_version: proto.file_minor_version,
file_size_bytes: CachedFileSize::new(proto.file_size_bytes),
base_id: proto.base_id,
})
}
}
/// Interns repeated data so that fragments with identical content share a
/// single heap allocation via `Arc`.
///
/// At 20M fragments the deduplication typically saves multiple GB of heap
/// because every fragment in a homogeneous table carries the same field list,
/// and post-compaction fragments share identical version metadata bytes.
///
/// Uses a `Vec`-based linear scan when the cache is small (<=16 entries)
/// and upgrades to `HashMap` for larger caches. In the common homogeneous
/// case (1-3 unique values), linear scan avoids per-fragment hashing overhead.
#[derive(Default)]
pub struct DataFileFieldInterner {
// Cache for `DataFile::fields` lists.
fields: InternCache<i32>,
// Cache for `DataFile::column_indices` lists.
column_indices: InternCache<i32>,
// Cache for inline version-metadata byte payloads (shared by the
// created-at and last-updated-at sequences).
inline_bytes: InternCache<u8>,
}
/// A cache that uses linear scan for small sizes and HashMap for large.
/// The threshold is chosen so that scan + compare is cheaper than hash for
/// typical payload sizes (20-200 bytes).
enum InternCache<T: Eq + std::hash::Hash + Clone> {
    /// Distinct entries in insertion order, searched linearly.
    Small(Vec<Arc<[T]>>),
    /// Distinct entries stored as map keys; the unit values carry nothing.
    Large(HashMap<Arc<[T]>, ()>),
}

/// Largest number of entries the `Small` representation may hold; pushing one
/// more converts the cache to `Large`.
const INTERN_CACHE_UPGRADE_THRESHOLD: usize = 16;

impl<T: Eq + std::hash::Hash + Clone> Default for InternCache<T> {
    /// An empty cache in the `Small` representation.
    fn default() -> Self {
        Self::Small(Vec::new())
    }
}

impl<T: Eq + std::hash::Hash + Clone> InternCache<T> {
    /// Return the shared `Arc` for the contents of `v`.
    ///
    /// If an equal slice is already cached, a clone of that `Arc` is returned
    /// and `v` is dropped; otherwise `v` becomes a new cached entry.
    fn intern(&mut self, v: Vec<T>) -> Arc<[T]> {
        match self {
            Self::Small(entries) => {
                if let Some(hit) = entries.iter().find(|existing| existing[..] == v[..]) {
                    return Arc::clone(hit);
                }

                let interned: Arc<[T]> = Arc::from(v);
                entries.push(Arc::clone(&interned));

                // Past the threshold, move every entry into the hashed form.
                if entries.len() > INTERN_CACHE_UPGRADE_THRESHOLD {
                    let promoted: HashMap<Arc<[T]>, ()> =
                        entries.drain(..).map(|entry| (entry, ())).collect();
                    *self = Self::Large(promoted);
                }
                interned
            }
            Self::Large(map) => {
                if let Some((hit, _)) = map.get_key_value(v.as_slice()) {
                    return Arc::clone(hit);
                }

                let interned: Arc<[T]> = Arc::from(v);
                map.insert(Arc::clone(&interned), ());
                interned
            }
        }
    }
}
impl DataFileFieldInterner {
/// Intern a `RowDatasetVersionMeta`, deduplicating inline byte payloads.
/// Accepts the protobuf oneof value directly to avoid an intermediate
/// `Arc<[u8]>` allocation that would need to be `.to_vec()`'d for the key lookup.
fn intern_last_updated_version_meta(
cache: &mut InternCache<u8>,
pb: pb::data_fragment::LastUpdatedAtVersionSequence,
) -> Result<RowDatasetVersionMeta> {
match pb {
pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(data) => {
Ok(RowDatasetVersionMeta::Inline(cache.intern(data)))
}
pb::data_fragment::LastUpdatedAtVersionSequence::ExternalLastUpdatedAtVersions(
file,
) => Ok(RowDatasetVersionMeta::External(ExternalFile {
path: file.path,
offset: file.offset,
size: file.size,
})),
}
}
/// Intern a `RowDatasetVersionMeta`, deduplicating inline byte payloads.
fn intern_created_version_meta(
cache: &mut InternCache<u8>,
pb: pb::data_fragment::CreatedAtVersionSequence,
) -> Result<RowDatasetVersionMeta> {
match pb {
pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(data) => {
Ok(RowDatasetVersionMeta::Inline(cache.intern(data)))
}
pb::data_fragment::CreatedAtVersionSequence::ExternalCreatedAtVersions(file) => {
Ok(RowDatasetVersionMeta::External(ExternalFile {
path: file.path,
offset: file.offset,
size: file.size,
}))
}
}
}
/// Convert a protobuf `DataFile`, interning `fields` and `column_indices`.
pub fn intern_data_file(&mut self, proto: pb::DataFile) -> Result<DataFile> {
Ok(DataFile {
path: proto.path,
fields: self.fields.intern(proto.fields),
column_indices: self.column_indices.intern(proto.column_indices),
file_major_version: proto.file_major_version,
file_minor_version: proto.file_minor_version,
file_size_bytes: CachedFileSize::new(proto.file_size_bytes),
base_id: proto.base_id,
})
}
/// Convert a protobuf `DataFragment`, interning fields and version metadata.
pub fn intern_fragment(&mut self, p: pb::DataFragment) -> Result<Fragment> {
let physical_rows = if p.physical_rows > 0 {
Some(p.physical_rows as usize)
} else {
None
};
let last_updated_at_version_meta = p
.last_updated_at_version_sequence
.map(|pb| Self::intern_last_updated_version_meta(&mut self.inline_bytes, pb))
.transpose()?;
let created_at_version_meta = p
.created_at_version_sequence
.map(|pb| Self::intern_created_version_meta(&mut self.inline_bytes, pb))
.transpose()?;
Ok(Fragment {
id: p.id,
files: p
.files
.into_iter()
.map(|f| self.intern_data_file(f))
.collect::<Result<_>>()?,
deletion_file: p.deletion_file.map(DeletionFile::try_from).transpose()?,
row_id_meta: p.row_id_sequence.map(RowIdMeta::try_from).transpose()?,
physical_rows,
last_updated_at_version_meta,
created_at_version_meta,
})
}
}
/// The storage format of a deletion file.
///
/// With serde the variants are written in lowercase (`"array"`, `"bitmap"`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
#[serde(rename_all = "lowercase")]
pub enum DeletionFileType {
/// Deletion file with the `arrow` suffix.
Array,
/// Deletion file with the `bin` suffix.
Bitmap,
}
impl DeletionFileType {
    /// The suffix associated with this deletion file type: `"arrow"` for
    /// [`Self::Array`] and `"bin"` for [`Self::Bitmap`].
    // TODO: pub(crate)
    pub fn suffix(&self) -> &str {
        match *self {
            Self::Bitmap => "bin",
            Self::Array => "arrow",
        }
    }
}
/// Metadata describing the deletion file attached to a fragment.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
pub struct DeletionFile {
// NOTE(review): presumably the dataset version the deletion file was
// written against -- confirm with the writer.
pub read_version: u64,
// Identifier of the deletion file.
pub id: u64,
// Storage format of the deletion file.
pub file_type: DeletionFileType,
/// Number of deleted rows in this file. If None, this is unknown.
pub num_deleted_rows: Option<usize>,
// NOTE(review): presumably the id of the base path holding the file when
// it lives outside the dataset, mirroring `DataFile::base_id` -- confirm.
pub base_id: Option<u32>,
}
impl TryFrom<pb::DeletionFile> for DeletionFile {
    type Error = Error;

    /// Convert a protobuf `DeletionFile` into a [`DeletionFile`].
    ///
    /// A `num_deleted_rows` of 0 in the message is read as "unknown" (`None`).
    ///
    /// # Errors
    ///
    /// Returns a not-supported error when the message's `file_type` is neither
    /// 0 (array) nor 1 (bitmap).
    fn try_from(value: pb::DeletionFile) -> Result<Self> {
        let pb::DeletionFile {
            read_version,
            id,
            file_type,
            num_deleted_rows,
            base_id,
        } = value;

        let file_type = match file_type {
            0 => DeletionFileType::Array,
            1 => DeletionFileType::Bitmap,
            _ => {
                return Err(Error::not_supported_source(
                    "Unknown deletion file type".into(),
                ));
            }
        };

        Ok(Self {
            read_version,
            id,
            file_type,
            num_deleted_rows: (num_deleted_rows != 0).then_some(num_deleted_rows as usize),
            base_id,
        })
    }
}
/// A reference to a part of a file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
pub struct ExternalFile {
// Path of the referenced file.
pub path: String,
// Start of the referenced part within the file.
// NOTE(review): presumably a byte offset -- confirm.
pub offset: u64,
// Length of the referenced part.
// NOTE(review): presumably in bytes -- confirm.
pub size: u64,
}
/// Metadata about the location of the row id sequence.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
pub enum RowIdMeta {
/// The row id sequence bytes are carried inline.
Inline(Vec<u8>),
/// The row id sequence is stored in (part of) another file.
External(ExternalFile),
}
impl TryFrom<pb::data_fragment::RowIdSequence> for RowIdMeta {
    type Error = Error;

    /// Convert the protobuf row id sequence into a [`RowIdMeta`].
    ///
    /// Inline bytes are moved as-is; an external reference is copied field by
    /// field. This conversion has no failing path.
    fn try_from(value: pb::data_fragment::RowIdSequence) -> Result<Self> {
        match value {
            pb::data_fragment::RowIdSequence::InlineRowIds(data) => Ok(Self::Inline(data)),
            pb::data_fragment::RowIdSequence::ExternalRowIds(file) => {
                Ok(Self::External(ExternalFile {
                    // `file` is owned here, so the path is moved rather than cloned.
                    path: file.path,
                    offset: file.offset,
                    size: file.size,
                }))
            }
        }
    }
}
/// Data fragment.
///
/// A fragment is a set of files which represent the different columns of the same rows.
/// If a column exists in the schema, but the related file does not exist, treat this column as `nulls`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
pub struct Fragment {
/// Fragment ID
pub id: u64,
/// Files within the fragment.
pub files: Vec<DataFile>,
/// Optional file with deleted local row offsets.
#[serde(skip_serializing_if = "Option::is_none")]
pub deletion_file: Option<DeletionFile>,
/// Location of this fragment's row id sequence (inline bytes or an
/// external file), if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub row_id_meta: Option<RowIdMeta>,
/// Original number of rows in the fragment. If this is None, then it is
/// unknown. This is only optional for legacy reasons. All new tables should
/// have this set.
pub physical_rows: Option<usize>,
/// Last updated at version metadata
#[serde(skip_serializing_if = "Option::is_none")]
pub last_updated_at_version_meta: Option<RowDatasetVersionMeta>,
/// Created at version metadata
#[serde(skip_serializing_if = "Option::is_none")]
pub created_at_version_meta: Option<RowDatasetVersionMeta>,
}
impl Fragment {
    /// Create an empty fragment with the given id: no files, no deletion
    /// file, no row id metadata, unknown row count and no version metadata.
    pub fn new(id: u64) -> Self {
        Self {
            id,
            files: vec![],
            deletion_file: None,
            row_id_meta: None,
            physical_rows: None,
            last_updated_at_version_meta: None,
            created_at_version_meta: None,
        }
    }

    /// Number of rows in the fragment after deletions, when it can be derived
    /// from the metadata alone.
    ///
    /// Returns `None` when the physical row count is unknown, when a deletion
    /// file exists but its deleted-row count is unknown, or when the recorded
    /// deleted-row count exceeds the physical row count (inconsistent
    /// metadata).
    pub fn num_rows(&self) -> Option<usize> {
        match (self.physical_rows, &self.deletion_file) {
            // Known fragment length, no deletion file.
            (Some(len), None) => Some(len),
            // Known fragment length and known number of deleted rows.
            (
                Some(len),
                Some(DeletionFile {
                    num_deleted_rows: Some(num_deleted_rows),
                    ..
                }),
            ) => {
                // Checked so that inconsistent counts yield "unknown" instead
                // of panicking (debug) or wrapping around (release).
                len.checked_sub(*num_deleted_rows)
            }
            // Unknown fragment length, or a deletion file of unknown size.
            _ => None,
        }
    }

    /// Parse a fragment from its JSON representation.
    ///
    /// # Errors
    ///
    /// Returns the converted `serde_json` error when `json` is not a valid
    /// serialized fragment.
    pub fn from_json(json: &str) -> Result<Self> {
        let fragment: Self = serde_json::from_str(json)?;
        Ok(fragment)
    }

    /// Create a `Fragment` with one DataFile
    ///
    /// The data file is built with [`DataFile::new_legacy`] from `schema`,
    /// with no recorded size and no base id.
    pub fn with_file_legacy(
        id: u64,
        path: &str,
        schema: &Schema,
        physical_rows: Option<usize>,
    ) -> Self {
        Self {
            id,
            files: vec![DataFile::new_legacy(path, schema, None, None)],
            deletion_file: None,
            physical_rows,
            row_id_meta: None,
            last_updated_at_version_meta: None,
            created_at_version_meta: None,
        }
    }

    /// Builder-style variant of [`Self::add_file`]: appends the data file and
    /// returns the fragment.
    pub fn with_file(
        mut self,
        path: impl Into<String>,
        field_ids: Vec<i32>,
        column_indices: Vec<i32>,
        version: &LanceFileVersion,
        file_size_bytes: Option<NonZero<u64>>,
    ) -> Self {
        self.add_file(path, field_ids, column_indices, version, file_size_bytes);
        self
    }

    /// Builder-style setter for the physical row count.
    pub fn with_physical_rows(mut self, physical_rows: usize) -> Self {
        self.physical_rows = Some(physical_rows);
        self
    }

    /// Append a data file written with file format `version`. The file gets no
    /// base id.
    pub fn add_file(
        &mut self,
        path: impl Into<String>,
        field_ids: Vec<i32>,
        column_indices: Vec<i32>,
        version: &LanceFileVersion,
        file_size_bytes: Option<NonZero<u64>>,
    ) {
        let (major, minor) = version.to_numbers();
        self.files.push(DataFile::new(
            path,
            field_ids,
            column_indices,
            major,
            minor,
            file_size_bytes,
            None,
        ));
    }

    /// Add a new [`DataFile`] to this fragment.
    ///
    /// The data file is built with [`DataFile::new_legacy`] from `schema`.
    pub fn add_file_legacy(&mut self, path: &str, schema: &Schema) {
        self.files
            .push(DataFile::new_legacy(path, schema, None, None));
    }

    /// True if this fragment is made up of legacy v1 files, false otherwise.
    ///
    /// A fragment without any files has no legacy files, so this returns
    /// false for it.
    pub fn has_legacy_files(&self) -> bool {
        // If any file in a fragment is legacy then all files in the fragment must be,
        // so inspecting the first file (when there is one) is sufficient.
        self.files.first().is_some_and(DataFile::is_legacy_file)
    }

    /// Helper method to infer the Lance version from a set of fragments
    ///
    /// Returns None if there are no data files
    /// Returns an error if the data files have different versions
    pub fn try_infer_version(fragments: &[Self]) -> Result<Option<LanceFileVersion>> {
        // Use the first data file found as the reference version.
        let Some(sample_file) = fragments.iter().find_map(|f| f.files.first()) else {
            return Ok(None);
        };
        let file_version = LanceFileVersion::try_from_major_minor(
            sample_file.file_major_version,
            sample_file.file_minor_version,
        )?;
        // Ensure all files match
        for frag in fragments {
            for file in &frag.files {
                let this_file_version = LanceFileVersion::try_from_major_minor(
                    file.file_major_version,
                    file.file_minor_version,
                )?;
                if file_version != this_file_version {
                    return Err(Error::invalid_input(format!(
                        "All data files must have the same version.  Detected both {} and {}",
                        file_version, this_file_version
                    )));
                }
            }
        }
        Ok(Some(file_version))
    }
}
impl TryFrom<pb::DataFragment> for Fragment {
    type Error = Error;

    /// Convert a protobuf `DataFragment` into a [`Fragment`].
    ///
    /// A `physical_rows` value of 0 in the message is read as "not recorded".
    ///
    /// # Errors
    ///
    /// Propagates the first error raised while converting the data files, the
    /// deletion file, the row id sequence or the version metadata.
    fn try_from(p: pb::DataFragment) -> Result<Self> {
        let physical_rows = (p.physical_rows > 0).then_some(p.physical_rows as usize);

        let files = p
            .files
            .into_iter()
            .map(DataFile::try_from)
            .collect::<Result<Vec<_>>>()?;
        let deletion_file = p.deletion_file.map(DeletionFile::try_from).transpose()?;
        let row_id_meta = p.row_id_sequence.map(RowIdMeta::try_from).transpose()?;
        let last_updated_at_version_meta = p
            .last_updated_at_version_sequence
            .map(RowDatasetVersionMeta::try_from)
            .transpose()?;
        let created_at_version_meta = p
            .created_at_version_sequence
            .map(RowDatasetVersionMeta::try_from)
            .transpose()?;

        Ok(Self {
            id: p.id,
            files,
            deletion_file,
            row_id_meta,
            physical_rows,
            last_updated_at_version_meta,
            created_at_version_meta,
        })
    }
}
impl From<&Fragment> for pb::DataFragment {
    /// Copy a [`Fragment`] into its protobuf form.
    ///
    /// Counts that are not known (`physical_rows`, a deletion file's
    /// `num_deleted_rows`) are written as 0.
    fn from(f: &Fragment) -> Self {
        let deletion_file = f.deletion_file.as_ref().map(|deletion| {
            let wire_type = match deletion.file_type {
                DeletionFileType::Array => pb::deletion_file::DeletionFileType::ArrowArray,
                DeletionFileType::Bitmap => pb::deletion_file::DeletionFileType::Bitmap,
            };
            pb::DeletionFile {
                read_version: deletion.read_version,
                id: deletion.id,
                file_type: wire_type.into(),
                num_deleted_rows: deletion.num_deleted_rows.unwrap_or(0) as u64,
                base_id: deletion.base_id,
            }
        });

        let row_id_sequence = match &f.row_id_meta {
            None => None,
            Some(RowIdMeta::Inline(bytes)) => Some(
                pb::data_fragment::RowIdSequence::InlineRowIds(bytes.clone()),
            ),
            Some(RowIdMeta::External(file)) => Some(
                pb::data_fragment::RowIdSequence::ExternalRowIds(pb::ExternalFile {
                    path: file.path.clone(),
                    offset: file.offset,
                    size: file.size,
                }),
            ),
        };

        Self {
            id: f.id,
            files: f.files.iter().map(pb::DataFile::from).collect(),
            deletion_file,
            row_id_sequence,
            physical_rows: f.physical_rows.unwrap_or(0) as u64,
            last_updated_at_version_sequence: last_updated_at_version_meta_to_pb(
                &f.last_updated_at_version_meta,
            ),
            created_at_version_sequence: created_at_version_meta_to_pb(
                &f.created_at_version_meta,
            ),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::{
        DataType, Field as ArrowField, Fields as ArrowFields, Schema as ArrowSchema,
    };
    use object_store::path::Path;
    use serde_json::{Value, json};

    /// Fragment 123 with one legacy data file (`foobar.lance`, a single
    /// Float16 column `x`) and an array-type deletion file recording 10
    /// deleted rows.
    fn fragment_with_deletion_file() -> Fragment {
        let arrow_schema = ArrowSchema::new(vec![ArrowField::new("x", DataType::Float16, true)]);
        let schema = Schema::try_from(&arrow_schema).unwrap();

        let mut fragment = Fragment::new(123);
        fragment.add_file_legacy("foobar.lance", &schema);
        fragment.deletion_file = Some(DeletionFile {
            read_version: 123,
            id: 456,
            file_type: DeletionFileType::Array,
            num_deleted_rows: Some(10),
            base_id: None,
        });
        fragment
    }

    /// A legacy fragment lists every field id of the schema, nested ones
    /// included.
    #[test]
    fn test_new_fragment() {
        let path = "foobar.lance";

        let struct_children = ArrowFields::from(vec![
            ArrowField::new("si", DataType::Int32, false),
            ArrowField::new("sb", DataType::Binary, true),
        ]);
        let arrow_schema = ArrowSchema::new(vec![
            ArrowField::new("s", DataType::Struct(struct_children), true),
            ArrowField::new("bool", DataType::Boolean, true),
        ]);
        let schema = Schema::try_from(&arrow_schema).unwrap();

        let fragment = Fragment::with_file_legacy(123, path, &schema, Some(10));

        assert_eq!(123, fragment.id);
        let expected_files = vec![DataFile::new_legacy_from_fields(
            path.to_string(),
            vec![0, 1, 2, 3],
            None,
        )];
        assert_eq!(fragment.files, expected_files);
    }

    /// Fragment -> protobuf -> Fragment is lossless, with and without a
    /// deletion file.
    #[test]
    fn test_roundtrip_fragment() {
        let mut fragment = fragment_with_deletion_file();

        let proto = pb::DataFragment::from(&fragment);
        let restored = Fragment::try_from(proto).unwrap();
        assert_eq!(fragment, restored);

        fragment.deletion_file = None;
        let proto = pb::DataFragment::from(&fragment);
        let restored = Fragment::try_from(proto).unwrap();
        assert_eq!(fragment, restored);
    }

    /// The JSON form has the expected shape and parses back to an equal
    /// fragment.
    #[test]
    fn test_to_json() {
        let fragment = fragment_with_deletion_file();

        let json = serde_json::to_string(&fragment).unwrap();
        let value: Value = serde_json::from_str(&json).unwrap();
        let expected = json!({
            "id": 123,
            "files":[
                {"path": "foobar.lance", "fields": [0], "column_indices": [],
                 "file_major_version": MAJOR_VERSION, "file_minor_version": MINOR_VERSION,
                 "file_size_bytes": null, "base_id": null }
            ],
            "deletion_file": {"read_version": 123, "id": 456, "file_type": "array",
                              "num_deleted_rows": 10, "base_id": null},
            "physical_rows": None::<usize>});
        assert_eq!(value, expected);

        let reparsed = Fragment::from_json(&json).unwrap();
        assert_eq!(fragment, reparsed);
    }

    /// More column indices than field ids is accepted by validation.
    #[test]
    fn data_file_validate_allows_extra_columns() {
        let data_file = DataFile {
            path: "foo.lance".to_string(),
            fields: Arc::from([1, 2]),
            // One extra column without a field id mapping
            column_indices: Arc::from([0, 1, 2]),
            file_major_version: MAJOR_VERSION as u32,
            file_minor_version: MINOR_VERSION as u32,
            file_size_bytes: Default::default(),
            base_id: None,
        };

        let base_path = Path::from("base");
        data_file
            .validate(&base_path)
            .expect("validation should allow extra columns without field ids");
    }
}

368
vendor/lance-table/src/format/index.rs vendored Normal file
View file

@ -0,0 +1,368 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
//! Metadata for index
use std::collections::HashMap;
use std::sync::Arc;
use chrono::{DateTime, Utc};
use deepsize::DeepSizeOf;
use futures::StreamExt;
use lance_io::object_store::ObjectStore;
use object_store::path::Path;
use roaring::RoaringBitmap;
use uuid::Uuid;
use super::pb;
use lance_core::{Error, Result};
/// Metadata about a single file within an index segment.
///
/// Stored in [`IndexMetadata::files`] and converted field-for-field to and
/// from `pb::IndexFile`.
#[derive(Debug, Clone, PartialEq, DeepSizeOf)]
pub struct IndexFile {
/// Path relative to the index directory (e.g., "index.idx", "auxiliary.idx")
pub path: String,
/// Size of the file in bytes
pub size_bytes: u64,
}
/// Index metadata
///
/// In-memory form of `pb::IndexMetadata`; the `TryFrom`/`From` impls in this
/// module convert between the two.
#[derive(Debug, Clone, PartialEq)]
pub struct IndexMetadata {
/// Unique ID across all dataset versions.
pub uuid: Uuid,
/// Fields to build the index.
pub fields: Vec<i32>,
/// Human readable index name
pub name: String,
/// The version of the dataset this index was last updated on
///
/// This is set when the index is created (based on the version used to train the index)
/// This is updated when the index is updated or remapped
pub dataset_version: u64,
/// The fragment ids this index covers.
///
/// This may contain fragment ids that no longer exist in the dataset.
///
/// If this is None, then this is unknown.
pub fragment_bitmap: Option<RoaringBitmap>,
/// Metadata specific to the index type
///
/// This is an Option because older versions of Lance may not have this defined. However, it should always
/// be present in newer versions.
pub index_details: Option<Arc<prost_types::Any>>,
/// The index version.
///
/// Decoded as `0` when the protobuf message does not carry a value.
pub index_version: i32,
/// Timestamp when the index was created
///
/// This field is optional for backward compatibility. For existing indices created before
/// this field was added, this will be None.
pub created_at: Option<DateTime<Utc>>,
/// The base path index of the index files. Used when the index is imported or referred from another dataset.
/// Lance uses it as key of the base_paths field in Manifest to determine the actual base path of the index files.
pub base_id: Option<u32>,
/// List of files and their sizes for this index segment.
/// This enables skipping HEAD calls when opening indices and provides
/// visibility into index storage size via describe_indices().
/// This is None if the file sizes are unknown. This happens for indices created
/// before this field was added.
pub files: Option<Vec<IndexFile>>,
}
impl IndexMetadata {
    /// Returns the covered fragments that still exist in the dataset.
    ///
    /// This is the intersection of `fragment_bitmap` with `existing_fragments`,
    /// or `None` when the covered fragments are unknown.
    pub fn effective_fragment_bitmap(
        &self,
        existing_fragments: &RoaringBitmap,
    ) -> Option<RoaringBitmap> {
        self.fragment_bitmap
            .as_ref()
            .map(|covered| covered & existing_fragments)
    }

    /// Returns a map of relative file paths to their sizes.
    /// Returns an empty map if file information is not available.
    pub fn file_size_map(&self) -> HashMap<String, u64> {
        let Some(files) = self.files.as_ref() else {
            return HashMap::new();
        };
        files
            .iter()
            .map(|file| (file.path.clone(), file.size_bytes))
            .collect()
    }

    /// Returns the total size of all files in this index segment in bytes.
    /// Returns None if file information is not available.
    pub fn total_size_bytes(&self) -> Option<u64> {
        let files = self.files.as_ref()?;
        Some(files.iter().map(|file| file.size_bytes).sum())
    }

    /// Returns the set of fragments which are part of the fragment bitmap
    /// but no longer in the dataset.
    ///
    /// Returns `None` when the covered fragments are unknown.
    pub fn deleted_fragment_bitmap(
        &self,
        existing_fragments: &RoaringBitmap,
    ) -> Option<RoaringBitmap> {
        self.fragment_bitmap
            .as_ref()
            .map(|covered| covered - existing_fragments)
    }
}
impl DeepSizeOf for IndexMetadata {
    /// Sums the heap footprint of the uuid, fields, name, dataset version and
    /// file list, plus the serialized size of the fragment bitmap when present.
    ///
    /// `index_details`, `index_version`, `created_at` and `base_id` are not
    /// part of the sum.
    fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize {
        let uuid_size = self.uuid.as_bytes().deep_size_of_children(context);
        let fields_size = self.fields.deep_size_of_children(context);
        let name_size = self.name.deep_size_of_children(context);
        let version_size = self.dataset_version.deep_size_of_children(context);
        // The bitmap is measured by its serialized size rather than through `DeepSizeOf`.
        let bitmap_size = match &self.fragment_bitmap {
            Some(bitmap) => bitmap.serialized_size(),
            None => 0,
        };
        let files_size = self.files.deep_size_of_children(context);
        uuid_size + fields_size + name_size + version_size + bitmap_size + files_size
    }
}
impl TryFrom<pb::IndexMetadata> for IndexMetadata {
type Error = Error;
fn try_from(proto: pb::IndexMetadata) -> Result<Self> {
let fragment_bitmap = if proto.fragment_bitmap.is_empty() {
None
} else {
Some(RoaringBitmap::deserialize_from(
&mut proto.fragment_bitmap.as_slice(),
)?)
};
let files = if proto.files.is_empty() {
None
} else {
Some(
proto
.files
.into_iter()
.map(|f| IndexFile {
path: f.path,
size_bytes: f.size_bytes,
})
.collect(),
)
};
Ok(Self {
uuid: proto.uuid.as_ref().map(Uuid::try_from).ok_or_else(|| {
Error::invalid_input("uuid field does not exist in Index metadata".to_string())
})??,
name: proto.name,
fields: proto.fields,
dataset_version: proto.dataset_version,
fragment_bitmap,
index_details: proto.index_details.map(Arc::new),
index_version: proto.index_version.unwrap_or_default(),
created_at: proto.created_at.map(|ts| {
DateTime::from_timestamp_millis(ts as i64)
.expect("Invalid timestamp in index metadata")
}),
base_id: proto.base_id,
files,
})
}
}
impl From<&IndexMetadata> for pb::IndexMetadata {
    /// Encodes index metadata as its protobuf message.
    ///
    /// Unknown (`None`) fragment bitmaps and file lists are written as empty
    /// fields.
    fn from(idx: &IndexMetadata) -> Self {
        let fragment_bitmap = match idx.fragment_bitmap.as_ref() {
            None => Vec::new(),
            Some(bitmap) => {
                let mut encoded = Vec::new();
                if let Err(e) = bitmap.serialize_into(&mut encoded) {
                    // In theory, this should never error. But if we do, just
                    // recover gracefully.
                    log::error!("Failed to serialize fragment bitmap: {}", e);
                    encoded.clear();
                }
                encoded
            }
        };
        let files = idx
            .files
            .iter()
            .flatten()
            .map(|file| pb::IndexFile {
                path: file.path.clone(),
                size_bytes: file.size_bytes,
            })
            .collect();
        let index_details = idx.index_details.as_deref().cloned();
        let created_at = idx.created_at.map(|dt| dt.timestamp_millis() as u64);
        Self {
            uuid: Some((&idx.uuid).into()),
            name: idx.name.clone(),
            fields: idx.fields.clone(),
            dataset_version: idx.dataset_version,
            fragment_bitmap,
            index_details,
            index_version: Some(idx.index_version),
            created_at,
            base_id: idx.base_id,
            files,
        }
    }
}
/// Type-erased, shareable value exchanged with the codec callbacks below.
type ArcAny = Arc<dyn std::any::Any + Send + Sync>;
/// Writes a cached `Vec<IndexMetadata>` to `writer` as an encoded
/// `pb::IndexSection`.
///
/// # Panics
///
/// Panics if `any` does not hold a `Vec<IndexMetadata>`.
fn serialize_index_metadata(
any: &ArcAny,
writer: &mut dyn std::io::Write,
) -> lance_core::Result<()> {
use prost::Message;
let vec = any
.downcast_ref::<Vec<IndexMetadata>>()
.expect("index_metadata_codec: wrong type (this is a bug in the cache layer)");
let section = pb::IndexSection {
indices: vec.iter().map(pb::IndexMetadata::from).collect(),
};
writer.write_all(&section.encode_to_vec())?;
Ok(())
}
/// Decodes bytes produced by `serialize_index_metadata` back into a
/// `Vec<IndexMetadata>`, failing if the bytes are not a valid
/// `pb::IndexSection` or any entry fails conversion.
fn deserialize_index_metadata(data: &bytes::Bytes) -> lance_core::Result<ArcAny> {
use prost::Message;
let section = pb::IndexSection::decode(data.as_ref())?;
let indices: Vec<IndexMetadata> = section
.indices
.into_iter()
.map(IndexMetadata::try_from)
.collect::<lance_core::Result<_>>()?;
Ok(Arc::new(indices))
}
/// Returns a [`CacheCodec`](lance_core::cache::CacheCodec) for `Vec<IndexMetadata>`.
///
/// Uses `pb::IndexSection` (which wraps `repeated IndexMetadata`) as the wire
/// format, reusing the existing `TryFrom`/`From` conversions.
///
/// Uses [`CacheCodec::new`](lance_core::cache::CacheCodec::new) because the
/// orphan rule prevents `impl CacheCodecImpl for Vec<IndexMetadata>`.
pub fn index_metadata_codec() -> lance_core::cache::CacheCodec {
lance_core::cache::CacheCodec::new(serialize_index_metadata, deserialize_index_metadata)
}
/// Lists every file under an index directory together with its size.
///
/// Each returned [`IndexFile`] carries a path relative to `index_dir`. When a
/// listed location does not start with `index_dir`, its bare file name (or an
/// empty string) is used instead. Listing errors are propagated.
pub async fn list_index_files_with_sizes(
    object_store: &ObjectStore,
    index_dir: &Path,
) -> Result<Vec<IndexFile>> {
    let dir_prefix: &str = index_dir.as_ref();
    let mut index_files = Vec::new();
    let mut entries = object_store.read_dir_all(index_dir, None);
    while let Some(entry) = entries.next().await {
        let entry = entry?;
        let location: &str = entry.location.as_ref();
        // Get relative path by stripping the index_dir prefix
        let path = match location.strip_prefix(dir_prefix) {
            Some(rest) => rest.trim_start_matches('/').to_string(),
            None => entry.location.filename().unwrap_or("").to_string(),
        };
        index_files.push(IndexFile {
            path,
            size_bytes: entry.size,
        });
    }
    Ok(index_files)
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
/// Demonstrates the pattern a disk-backed cache backend would use:
/// serialize entries to bytes, store in a key-value map, then
/// deserialize on retrieval.
///
/// Covers one entry with a bitmap and file list and one with neither, so
/// both the populated and the `None` encodings are exercised.
#[test]
fn test_index_metadata_codec_roundtrip() {
let codec = index_metadata_codec();
let original = vec![
IndexMetadata {
uuid: Uuid::new_v4(),
name: "my_index".to_string(),
fields: vec![0, 1],
dataset_version: 42,
fragment_bitmap: Some(RoaringBitmap::from_iter([1, 2, 3])),
index_details: None,
index_version: 1,
created_at: None,
base_id: None,
files: Some(vec![IndexFile {
path: "index.idx".to_string(),
size_bytes: 1024,
}]),
},
IndexMetadata {
uuid: Uuid::new_v4(),
name: "second_index".to_string(),
fields: vec![2],
dataset_version: 43,
fragment_bitmap: None,
index_details: None,
index_version: 2,
created_at: None,
base_id: Some(7),
files: None,
},
];
// Simulate a disk-backed store: HashMap<String, Vec<u8>>
let mut store: HashMap<String, Vec<u8>> = HashMap::new();
// Serialize into the store
let key = "dataset/v42/Vec<IndexMetadata>".to_string();
let mut buf = Vec::new();
let entry: Arc<dyn std::any::Any + Send + Sync> = Arc::new(original.clone());
codec.serialize(&entry, &mut buf).unwrap();
store.insert(key.clone(), buf);
// Deserialize from the store
let bytes = store.get(&key).unwrap();
let recovered = codec
.deserialize(&bytes::Bytes::copy_from_slice(bytes))
.unwrap();
let recovered = recovered
.downcast::<Vec<IndexMetadata>>()
.expect("downcast should succeed");
assert_eq!(original.len(), recovered.len());
// Compared field by field; `index_details` and `created_at` are `None`
// in both fixtures and are not checked here.
for (orig, rec) in original.iter().zip(recovered.iter()) {
assert_eq!(orig.uuid, rec.uuid);
assert_eq!(orig.name, rec.name);
assert_eq!(orig.fields, rec.fields);
assert_eq!(orig.dataset_version, rec.dataset_version);
assert_eq!(orig.fragment_bitmap, rec.fragment_bitmap);
assert_eq!(orig.index_version, rec.index_version);
assert_eq!(orig.base_id, rec.base_id);
assert_eq!(orig.files, rec.files);
}
}
}

1490
vendor/lance-table/src/format/manifest.rs vendored Normal file

File diff suppressed because it is too large Load diff

42
vendor/lance-table/src/format/transaction.rs vendored Executable file
View file

@ -0,0 +1,42 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
//! Transaction struct for lance-table format layer.
//!
//! This struct is introduced to provide a Struct-first API for passing transaction
//! information within the lance-table crate. It mirrors the protobuf Transaction
//! message at a semantic level while remaining crate-local, so lance-table does
//! not depend on higher layers (e.g., lance crate).
//!
//! Conversion to protobuf occurs at the write boundary. See the `From<Transaction>`
//! implementation below.
use crate::format::pb;
/// Crate-local wrapper around the protobuf `Transaction` message.
#[derive(Clone, Debug, PartialEq)]
pub struct Transaction {
/// Crate-local representation backing: protobuf Transaction.
/// Keeping this simple avoids circular dependencies while still enabling
/// Struct-first parameter passing in lance-table.
pub inner: pb::Transaction,
}
impl Transaction {
    /// Borrows the wrapped protobuf message, e.g. for tests or internal
    /// inspection.
    pub fn as_pb(&self) -> &pb::Transaction {
        let Self { inner } = self;
        inner
    }
}
/// Write-boundary conversion: serialize using protobuf at the last step.
impl From<Transaction> for pb::Transaction {
fn from(tx: Transaction) -> Self {
tx.inner
}
}
/// Wraps a protobuf transaction message in the crate-local struct.
impl From<pb::Transaction> for Transaction {
    fn from(pb_tx: pb::Transaction) -> Self {
        let inner = pb_tx;
        Self { inner }
    }
}

6
vendor/lance-table/src/io.rs vendored Normal file
View file

@ -0,0 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
pub mod commit;
pub mod deletion;
pub mod manifest;

1898
vendor/lance-table/src/io/commit.rs vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,495 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
//! DynamoDB based external manifest store
//!
use std::collections::HashSet;
use std::sync::{Arc, LazyLock};
use async_trait::async_trait;
use aws_sdk_dynamodb::Client;
use aws_sdk_dynamodb::error::SdkError;
use aws_sdk_dynamodb::operation::RequestId;
use aws_sdk_dynamodb::operation::delete_item::builders::DeleteItemFluentBuilder;
use aws_sdk_dynamodb::operation::{
get_item::builders::GetItemFluentBuilder, put_item::builders::PutItemFluentBuilder,
query::builders::QueryFluentBuilder,
};
use aws_sdk_dynamodb::types::{AttributeValue, KeyType};
use object_store::path::Path;
use snafu::OptionExt;
use tokio::sync::RwLock;
use tracing::warn;
use crate::io::commit::external_manifest::ExternalManifestStore;
use lance_core::error::NotFoundSnafu;
use lance_core::error::box_error;
use lance_core::{Error, Result};
use super::ManifestLocation;
use super::external_manifest::detect_naming_scheme_from_path;
/// Newtype around the AWS `SdkError` so it can implement `Display` and
/// `std::error::Error` locally and be converted into the crate's [`Error`].
#[derive(Debug)]
struct WrappedSdkError<E>(SdkError<E>);
/// Converts a wrapped SDK error into an I/O-sourced [`Error`].
impl<E> From<WrappedSdkError<E>> for Error
where
    E: std::error::Error + Send + Sync + 'static,
{
    fn from(e: WrappedSdkError<E>) -> Self {
        let source = box_error(e);
        Self::io_source(source)
    }
}
/// Renders the request id (or `unknown`) followed by the raw response when
/// the SDK error carries one.
impl<E> std::fmt::Display for WrappedSdkError<E>
where
    E: std::error::Error + Send + Sync + 'static,
{
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "WrappedSdkError: request_id: {}",
            self.0.request_id().unwrap_or("unknown")
        )?;
        match self.0.raw_response() {
            Some(raw) => write!(f, ", service_error: {:?}", raw),
            None => write!(f, ", no service error"),
        }
    }
}
/// Exposes the wrapped `SdkError` as the error source so callers can walk the
/// full cause chain.
impl<E> std::error::Error for WrappedSdkError<E>
where
    E: std::error::Error + Send + Sync + 'static,
{
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        let inner: &(dyn std::error::Error + 'static) = &self.0;
        Some(inner)
    }
}
/// Extension trait for turning an AWS SDK result into the crate's [`Result`].
trait SdkResultExt<T> {
/// Converts the SDK error, if any, into the crate's [`Error`].
fn wrap_err(self) -> Result<T>;
}
impl<T, E> SdkResultExt<T> for std::result::Result<T, SdkError<E>>
where
E: std::error::Error + Send + Sync + 'static,
{
fn wrap_err(self) -> Result<T> {
self.map_err(|err| {
warn!(
target: "lance::dynamodb",
request_id = err.request_id().unwrap_or("unknown"),
"DynamoDB SDK error: {err:?}",
);
Error::from(WrappedSdkError(err))
})
}
}
/// An external manifest store backed by DynamoDB
///
/// When calling DynamoDBExternalManifestStore::new_external_store()
/// the key schema, (PK, SK), is checked. If the table does not exist,
/// or the key schema is not as expected, an error is returned.
///
/// The table schema is expected as follows:
/// PK: base_uri -- string
/// SK: version -- number
/// path -- string
/// committer -- string
///
/// Consistency: This store is expected to have read-after-write consistency;
/// consistent_read should always be set to true.
///
/// Transaction Safety: This store uses DynamoDB conditional write to ensure
/// only one writer can win per version.
#[derive(Debug)]
pub struct DynamoDBExternalManifestStore {
/// Shared DynamoDB client used for every request.
client: Arc<Client>,
/// Name of the DynamoDB table holding the manifest entries.
table_name: String,
/// Value written to the `committer` attribute of each item this store puts.
committer_name: String,
}
// The attribute names are macros rather than constants because they are used
// in a match statement: each macro expands to a string literal, which is
// valid in pattern position.
/// Name of the partition-key attribute (the dataset base URI).
macro_rules! base_uri {
() => {
"base_uri"
};
}
/// Name of the sort-key attribute (the manifest version).
macro_rules! version {
() => {
"version"
};
}
/// Name of the attribute holding the manifest path.
macro_rules! path {
() => {
"path"
};
}
/// Name of the attribute holding the committer name.
macro_rules! committer {
() => {
"committer"
};
}
impl DynamoDBExternalManifestStore {
/// Creates a store for `table_name`, validating the table's key schema the
/// first time a given table name is seen.
///
/// The table must have a HASH key named `base_uri` and a RANGE key named
/// `version`. Table names that pass the check are remembered in a static
/// cache, so later calls for the same name skip the `DescribeTable` request.
///
/// # Errors
///
/// Returns an error if the `DescribeTable` call fails, if the table or its
/// key schema is missing, or if the key schema is not as described above.
pub async fn new_external_store(
client: Arc<Client>,
table_name: &str,
committer_name: &str,
) -> Result<Arc<dyn ExternalManifestStore>> {
// Names of tables whose key schema has already been validated.
static SANITY_CHECK_CACHE: LazyLock<RwLock<HashSet<String>>> =
LazyLock::new(|| RwLock::new(HashSet::new()));
let store = Arc::new(Self {
client: client.clone(),
table_name: table_name.to_string(),
committer_name: committer_name.to_string(),
});
// already checked this table before, skip
// this is to avoid checking the table schema every time
// because it's expensive to call DescribeTable
if SANITY_CHECK_CACHE.read().await.contains(table_name) {
return Ok(store);
}
// Check if the table schema is correct
let describe_result = client
.describe_table()
.table_name(table_name)
.send()
.await
.wrap_err()?;
let table = describe_result
.table
.ok_or_else(|| Error::io(format!("dynamodb table: {table_name} does not exist")))?;
let mut schema = table.key_schema.ok_or_else(|| {
Error::io(format!(
"dynamodb table: {table_name} does not have a key schema"
))
})?;
let mut has_hash_key = false;
let mut has_range_key = false;
// there should be two keys, HASH(base_uri) and RANGE(version)
// Exactly two entries are popped; a schema with fewer entries fails here.
for _ in 0..2 {
let key = schema.pop().ok_or_else(|| {
Error::io(format!(
"dynamodb table: {table_name} must have HASH and RANGE keys"
))
})?;
match (key.key_type, key.attribute_name.as_str()) {
(KeyType::Hash, base_uri!()) => {
has_hash_key = true;
}
(KeyType::Range, version!()) => {
has_range_key = true;
}
_ => {
return Err(Error::io(format!(
"dynamodb table: {} unknown key type encountered name:{}",
table_name, key.attribute_name
)));
}
}
}
// Both keys must be present
if !(has_hash_key && has_range_key) {
return Err(Error::io(format!(
"dynamodb table: {} must have HASH and RANGE keys, named `{}` and `{}` respectively",
table_name,
base_uri!(),
version!()
)));
}
// Remember the table only after it has passed validation.
SANITY_CHECK_CACHE
.write()
.await
.insert(table_name.to_string());
Ok(store)
}
/// Starts a `PutItem` request against this store's table.
fn ddb_put(&self) -> PutItemFluentBuilder {
self.client.put_item().table_name(&self.table_name)
}
/// Starts a strongly consistent `GetItem` request against this store's table.
fn ddb_get(&self) -> GetItemFluentBuilder {
self.client
.get_item()
.table_name(&self.table_name)
.consistent_read(true)
}
/// Starts a strongly consistent `Query` request against this store's table.
fn ddb_query(&self) -> QueryFluentBuilder {
self.client
.query()
.table_name(&self.table_name)
.consistent_read(true)
}
/// Starts a `DeleteItem` request against this store's table.
fn ddb_delete(&self) -> DeleteItemFluentBuilder {
self.client.delete_item().table_name(&self.table_name)
}
}
#[async_trait]
impl ExternalManifestStore for DynamoDBExternalManifestStore {
/// Get the manifest path for a given base_uri and version
async fn get(&self, base_uri: &str, version: u64) -> Result<String> {
let get_item_result = self
.ddb_get()
.key(base_uri!(), AttributeValue::S(base_uri.into()))
.key(version!(), AttributeValue::N(version.to_string()))
.send()
.await
.wrap_err()?;
let item = get_item_result.item.context(NotFoundSnafu {
uri: format!(
"dynamodb not found: base_uri: {}; version: {}",
base_uri, version
),
})?;
let path = item
.get(path!())
.ok_or_else(|| Error::not_found(format!("key {} is not present", path!())))?;
match path {
AttributeValue::S(path) => Ok(path.clone()),
_ => Err(Error::invalid_input(format!(
"key {} is not a string",
path!()
))),
}
}
async fn get_manifest_location(
&self,
base_uri: &str,
version: u64,
) -> Result<ManifestLocation> {
let get_item_result = self
.ddb_get()
.key(base_uri!(), AttributeValue::S(base_uri.into()))
.key(version!(), AttributeValue::N(version.to_string()))
.send()
.await
.wrap_err()?;
let item = get_item_result.item.context(NotFoundSnafu {
uri: format!(
"dynamodb not found: base_uri: {}; version: {}",
base_uri, version
),
})?;
let path = item
.get(path!())
.ok_or_else(|| Error::not_found(format!("key {} is not present", path!())))?
.as_s()
.map_err(|_| Error::invalid_input(format!("key {} is not a string", path!())))?
.as_str();
let path = Path::from(path);
let size = item
.get("size")
.and_then(|attr| attr.as_n().ok().and_then(|v| v.parse().ok()));
let e_tag = item.get("e_tag").and_then(|attr| attr.as_s().ok().cloned());
let naming_scheme = detect_naming_scheme_from_path(&path)?;
Ok(ManifestLocation {
version,
path,
size,
naming_scheme,
e_tag,
})
}
/// Get the latest version of a dataset at the base_uri
///
/// Delegates to `get_latest_manifest_location` and keeps only the version and
/// the path rendered as a string.
async fn get_latest_version(&self, base_uri: &str) -> Result<Option<(u64, String)>> {
    let latest = self.get_latest_manifest_location(base_uri).await?;
    Ok(latest.map(|loc| (loc.version, loc.path.to_string())))
}
/// Returns the location of the highest-versioned manifest recorded for
/// `base_uri`, or `None` when the query returns no items.
///
/// The query reads the partition in descending sort-key order and is limited
/// to one item. `size` and `e_tag` are optional attributes and become `None`
/// when missing or of an unexpected type.
async fn get_latest_manifest_location(
&self,
base_uri: &str,
) -> Result<Option<ManifestLocation>> {
let query_result = self
.ddb_query()
.key_condition_expression(format!("{} = :{}", base_uri!(), base_uri!()))
.expression_attribute_values(
format!(":{}", base_uri!()),
AttributeValue::S(base_uri.into()),
)
.scan_index_forward(false)
.limit(1)
.send()
.await
.wrap_err()?;
match query_result.items {
Some(mut items) => {
if items.is_empty() {
return Ok(None);
}
// The query asked for at most one item, so more than one is unexpected.
if items.len() > 1 {
return Err(Error::invalid_input(format!(
"dynamodb table: {} returned unexpected number of items",
self.table_name
)));
}
let item = items.pop().expect("length checked");
let version_attribute = item
.get(version!())
.ok_or_else(|| Error::not_found(
format!("dynamodb error: found entries for {} but the returned data does not contain {} column", base_uri, version!())
))?;
let path_attribute = item
.get(path!())
.ok_or_else(|| Error::not_found(
format!("dynamodb error: found entries for {} but the returned data does not contain {} column", base_uri, path!())
))?;
let size = item.get("size").and_then(|attr| match attr {
AttributeValue::N(size) => size.parse().ok(),
_ => None,
});
let e_tag = item.get("e_tag").and_then(|attr| attr.as_s().ok().cloned());
match (version_attribute, path_attribute) {
(AttributeValue::N(version), AttributeValue::S(path)) => {
let version = version.parse().map_err(|e| Error::invalid_input(format!("dynamodb error: could not parse the version number returned {}, error: {}", version, e)))?;
let path = Path::from(path.as_str());
let naming_scheme = detect_naming_scheme_from_path(&path)?;
let location = ManifestLocation {
version,
path,
size,
naming_scheme,
e_tag,
};
Ok(Some(location))
}
// Reached when the version is not a number or the path is not a string.
_ => Err(Error::invalid_input(format!(
"dynamodb error: found entries for {base_uri} but the returned data is not number type"
))),
}
}
_ => Ok(None),
}
}
/// Put the manifest path for a given base_uri and version, should fail if the version already exists
///
/// The item also records the committer name and size, plus the e_tag when
/// one is supplied. The write is conditional on neither key attribute
/// existing yet.
async fn put_if_not_exists(
    &self,
    base_uri: &str,
    version: u64,
    path: &str,
    size: u64,
    e_tag: Option<String>,
) -> Result<()> {
    let condition = format!(
        "attribute_not_exists({}) AND attribute_not_exists({})",
        base_uri!(),
        version!(),
    );
    let request = self
        .ddb_put()
        .item(base_uri!(), AttributeValue::S(base_uri.into()))
        .item(version!(), AttributeValue::N(version.to_string()))
        .item(path!(), AttributeValue::S(path.to_string()))
        .item(committer!(), AttributeValue::S(self.committer_name.clone()))
        .item("size", AttributeValue::N(size.to_string()));
    let request = match e_tag {
        Some(tag) => request.item("e_tag", AttributeValue::S(tag)),
        None => request,
    };
    request
        .condition_expression(condition)
        .send()
        .await
        .wrap_err()?;
    Ok(())
}
/// Put the manifest path for a given base_uri and version, should fail if the version **does not** already exist
///
/// The item also records the committer name and size, plus the e_tag when
/// one is supplied. The write is conditional on both key attributes already
/// existing.
async fn put_if_exists(
    &self,
    base_uri: &str,
    version: u64,
    path: &str,
    size: u64,
    e_tag: Option<String>,
) -> Result<()> {
    let condition = format!(
        "attribute_exists({}) AND attribute_exists({})",
        base_uri!(),
        version!(),
    );
    let request = self
        .ddb_put()
        .item(base_uri!(), AttributeValue::S(base_uri.into()))
        .item(version!(), AttributeValue::N(version.to_string()))
        .item(path!(), AttributeValue::S(path.to_string()))
        .item(committer!(), AttributeValue::S(self.committer_name.clone()))
        .item("size", AttributeValue::N(size.to_string()));
    let request = match e_tag {
        Some(tag) => request.item("e_tag", AttributeValue::S(tag)),
        None => request,
    };
    request
        .condition_expression(condition)
        .send()
        .await
        .wrap_err()?;
    Ok(())
}
/// Delete the manifest information for the given base_uri in dynamodb
///
/// Every item stored under `base_uri` is removed. A single Query response can
/// be truncated, so the query is repeated from `last_evaluated_key` until
/// DynamoDB reports that no further pages remain. Items without a numeric
/// `version` attribute are skipped.
///
/// # Errors
///
/// Returns the first query or delete failure. Items deleted before the
/// failure stay deleted.
async fn delete(&self, base_uri: &str) -> Result<()> {
    let mut exclusive_start_key = None;
    loop {
        let query_result = self
            .ddb_query()
            .key_condition_expression(format!("{} = :{}", base_uri!(), base_uri!()))
            .expression_attribute_values(
                format!(":{}", base_uri!()),
                AttributeValue::S(base_uri.into()),
            )
            .set_exclusive_start_key(exclusive_start_key)
            .send()
            .await
            .wrap_err()?;
        for item in query_result.items.unwrap_or_default() {
            if let Some(AttributeValue::N(version)) = item.get(version!()) {
                self.ddb_delete()
                    .key(base_uri!(), AttributeValue::S(base_uri.to_string()))
                    .key(version!(), AttributeValue::N(version.clone()))
                    .send()
                    .await
                    .wrap_err()?;
            }
        }
        // A missing (or empty) key means the last page has been processed.
        exclusive_start_key = query_result
            .last_evaluated_key
            .filter(|key| !key.is_empty());
        if exclusive_start_key.is_none() {
            return Ok(());
        }
    }
}
}

View file

@ -0,0 +1,515 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
//! Trait for external manifest handler.
//!
//! This trait abstracts an external storage with put_if_not_exists semantics.
use std::sync::Arc;
use async_trait::async_trait;
use lance_core::utils::tracing::{
AUDIT_MODE_CREATE, AUDIT_MODE_DELETE, AUDIT_TYPE_MANIFEST, TRACE_FILE_AUDIT,
};
use lance_core::{Error, Result};
use lance_io::object_store::ObjectStore;
use log::warn;
use object_store::ObjectMeta;
use object_store::ObjectStoreExt;
use object_store::{Error as ObjectStoreError, ObjectStore as OSObjectStore, path::Path};
use tracing::info;
use super::{
MANIFEST_EXTENSION, ManifestLocation, ManifestNamingScheme, current_manifest_path,
default_resolve_version, make_staging_manifest_path, write_version_hint,
};
use crate::format::{IndexMetadata, Manifest, Transaction};
use crate::io::commit::{CommitError, CommitHandler};
/// External manifest store
///
/// This trait abstracts an external storage for source of truth for manifests.
/// The storage is expected to remember (uri, version) -> manifest_path
/// and able to run transactions on the manifest_path.
///
/// This trait is called an **External** manifest store because the store is
/// expected to work in tandem with the object store. We are only leveraging
/// the external store for concurrent commit. Any manifest committed thru this
/// trait should ultimately be materialized in the object store.
/// For a visual explanation of the commit loop see
/// <https://github.com/lance-format/lance/assets/12615154/b0822312-0826-432a-b554-3965f8d48d04>
#[async_trait]
pub trait ExternalManifestStore: std::fmt::Debug + Send + Sync {
/// Get the manifest path for a given base_uri and version
async fn get(&self, base_uri: &str, version: u64) -> Result<String>;
async fn get_manifest_location(
&self,
base_uri: &str,
version: u64,
) -> Result<ManifestLocation> {
let path = self.get(base_uri, version).await?;
let path = Path::parse(&path).map_err(|e| Error::invalid_input(e.to_string()))?;
let naming_scheme = detect_naming_scheme_from_path(&path)?;
Ok(ManifestLocation {
version,
path,
size: None,
naming_scheme,
e_tag: None,
})
}
/// Get the latest version of a dataset at the base_uri, and the path to the manifest.
/// The path is provided as an optimization. The path is deterministic based on
/// the version and the store should not customize it.
async fn get_latest_version(&self, base_uri: &str) -> Result<Option<(u64, String)>>;
/// Get the latest manifest location for a given base_uri.
///
/// By default, this calls get_latest_version and leaves `size` and `e_tag`
/// as `None`. Impls should override this method if they store both the
/// location and size of the latest manifest.
async fn get_latest_manifest_location(
    &self,
    base_uri: &str,
) -> Result<Option<ManifestLocation>> {
    let Some((version, uri)) = self.get_latest_version(base_uri).await? else {
        return Ok(None);
    };
    let path = Path::parse(&uri).map_err(|e| Error::invalid_input(e.to_string()))?;
    let naming_scheme = detect_naming_scheme_from_path(&path)?;
    Ok(Some(ManifestLocation {
        version,
        path,
        size: None,
        naming_scheme,
        e_tag: None,
    }))
}
/// Put the manifest to the external store.
///
/// The staging manifest has been written to `staging_path` on the object store.
/// This method should atomically claim the version and return the final manifest location.
///
/// The default implementation uses put_if_not_exists and put_if_exists to
/// implement a staging-based workflow. Implementations that can write directly
/// (e.g., namespace-backed stores) should override this method.
///
/// # Errors
///
/// Returns an error if the version cannot be claimed, or if the copy, head,
/// external-store update or staging delete fails for a reason other than the
/// object not being found.
#[allow(clippy::too_many_arguments)]
async fn put(
&self,
base_path: &Path,
version: u64,
staging_path: &Path,
size: u64,
e_tag: Option<String>,
object_store: &dyn OSObjectStore,
naming_scheme: ManifestNamingScheme,
) -> Result<ManifestLocation> {
// Default implementation: staging-based workflow
// Step 1: Record staging path atomically
self.put_if_not_exists(
base_path.as_ref(),
version,
staging_path.as_ref(),
size,
e_tag.clone(),
)
.await?;
// Step 2: Copy staging to final path
let final_path = naming_scheme.manifest_path(base_path, version);
// `copied` is false when the staging file was not found
// (presumably another process already finalized it; confirm against callers).
let copied = match object_store.copy(staging_path, &final_path).await {
Ok(_) => true,
Err(ObjectStoreError::NotFound { .. }) => false,
Err(e) => return Err(e.into()),
};
if copied {
info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_CREATE, r#type=AUDIT_TYPE_MANIFEST, path = final_path.as_ref());
}
// Get final e_tag (may change after copy for large files)
// Below 5 MiB the caller's e_tag is reused; otherwise, or when nothing was
// copied, it is re-read with a HEAD on the final path.
let e_tag = if copied && size < 5 * 1024 * 1024 {
e_tag
} else {
let meta = object_store.head(&final_path).await?;
meta.e_tag
};
let location = ManifestLocation {
version,
path: final_path.clone(),
size: Some(size),
naming_scheme,
e_tag: e_tag.clone(),
};
// Nothing was copied, so steps 3 and 4 are skipped.
if !copied {
return Ok(location);
}
// Step 3: Update external store to final path
self.put_if_exists(
base_path.as_ref(),
version,
final_path.as_ref(),
size,
e_tag,
)
.await?;
// Step 4: Delete staging manifest
match object_store.delete(staging_path).await {
Ok(_) => {}
Err(ObjectStoreError::NotFound { .. }) => {}
Err(e) => return Err(e.into()),
}
info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = staging_path.as_ref());
Ok(location)
}
/// Put the manifest path for a given base_uri and version, should fail if the version already exists
async fn put_if_not_exists(
&self,
base_uri: &str,
version: u64,
path: &str,
size: u64,
e_tag: Option<String>,
) -> Result<()>;
/// Put the manifest path for a given base_uri and version, should fail if the version **does not** already exist
async fn put_if_exists(
&self,
base_uri: &str,
version: u64,
path: &str,
size: u64,
e_tag: Option<String>,
) -> Result<()>;
/// Delete the manifest information for given base_uri from the store
///
/// The default implementation does nothing and reports success.
async fn delete(&self, _base_uri: &str) -> Result<()> {
Ok(())
}
}
pub(crate) fn detect_naming_scheme_from_path(path: &Path) -> Result<ManifestNamingScheme> {
path.filename()
.and_then(|name| {
ManifestNamingScheme::detect_scheme(name)
.or_else(|| Some(ManifestNamingScheme::detect_scheme_staging(name)))
})
.ok_or_else(|| {
Error::corrupt_file(
path.clone(),
"Path does not follow known manifest naming convention.",
)
})
}
/// External manifest commit handler
/// This handler is used to commit a manifest to an external store
/// for detailed design, see <https://github.com/lance-format/lance/issues/1183>
#[derive(Debug)]
pub struct ExternalManifestCommitHandler {
/// The external store that manifests are committed to.
pub external_manifest_store: Arc<dyn ExternalManifestStore>,
}
impl ExternalManifestCommitHandler {
/// Materialize a staging manifest into its final path and update the external store.
///
/// The manifest is considered committed once the staging manifest is written
/// to object store and that path is committed to the external store.
///
/// However, to fully complete this, the staging manifest should be materialized
/// into the final path, the final path should be committed to the external store
/// and the staging manifest should be deleted. These steps may be completed
/// by any number of readers or writers, so care should be taken to ensure
/// that the manifest is not lost nor any errors occur due to duplicate
/// operations.
///
/// Returns the location of the final manifest. If the staging manifest is
/// already gone (the copy reports `NotFound`), the final manifest is checked
/// with `head` and its location is returned without touching the external
/// store or the staging file.
#[allow(clippy::too_many_arguments)]
async fn finalize_manifest(
&self,
base_path: &Path,
staging_manifest_path: &Path,
version: u64,
size: u64,
e_tag: Option<String>,
store: &dyn OSObjectStore,
naming_scheme: ManifestNamingScheme,
) -> std::result::Result<ManifestLocation, Error> {
// step 1: copy the manifest to the final location
let final_manifest_path = naming_scheme.manifest_path(base_path, version);
let copied = match store
.copy(staging_manifest_path, &final_manifest_path)
.await
{
Ok(_) => true,
Err(ObjectStoreError::NotFound { .. }) => false, // Another writer beat us to it.
Err(e) => return Err(e.into()),
};
// Only the caller that actually performed the copy emits the audit event.
if copied {
info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_CREATE, r#type=AUDIT_TYPE_MANIFEST, path = final_manifest_path.as_ref());
}
// On S3, the etag can change if originally was MultipartUpload and later was Copy
// https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html#AmazonS3-Type-Object-ETag
// We only do MultipartUpload for > 5MB files, so we can skip this check
// if size < 5MB. However, we need to double check the final_manifest_path
// exists before we change the external store, otherwise we may point to a
// non-existing manifest.
let e_tag = if copied && size < 5 * 1024 * 1024 {
e_tag
} else {
let meta = store.head(&final_manifest_path).await?;
meta.e_tag
};
let location = ManifestLocation {
version,
path: final_manifest_path,
size: Some(size),
naming_scheme,
e_tag,
};
// We did not perform the copy ourselves, so the remaining steps are skipped here.
if !copied {
return Ok(location);
}
// step 2: flip the external store to point to the final location
self.external_manifest_store
.put_if_exists(
base_path.as_ref(),
version,
location.path.as_ref(),
size,
location.e_tag.clone(),
)
.await?;
// step 3: delete the staging manifest
// A missing staging file is not an error: someone else may have removed it already.
match store.delete(staging_manifest_path).await {
Ok(_) => {}
Err(ObjectStoreError::NotFound { .. }) => {}
Err(e) => return Err(e.into()),
}
info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = staging_manifest_path.as_ref());
Ok(location)
}
}
#[async_trait]
impl CommitHandler for ExternalManifestCommitHandler {
/// Resolve the location of the latest manifest for the dataset at `base_path`.
///
/// The external store is asked first. A location that already carries the
/// manifest extension is returned unchanged; any other location is treated
/// as a staging manifest and is finalized before being returned. If the
/// external store has no entry, the object store is searched instead.
async fn resolve_latest_location(
&self,
base_path: &Path,
object_store: &ObjectStore,
) -> std::result::Result<ManifestLocation, Error> {
let location = self
.external_manifest_store
.get_latest_manifest_location(base_path.as_ref())
.await?;
match location {
Some(ManifestLocation {
version,
path,
size,
naming_scheme,
e_tag,
}) => {
// The path is finalized, no need to check object store
if path.extension() == Some(MANIFEST_EXTENSION) {
return Ok(ManifestLocation {
version,
path,
size,
naming_scheme,
e_tag,
});
}
// Staging manifest: we need its size (and e_tag) before finalizing.
let (size, e_tag) = if let Some(size) = size {
(size, e_tag)
} else {
match object_store.inner.head(&path).await {
Ok(meta) => (meta.size, meta.e_tag),
Err(ObjectStoreError::NotFound { .. }) => {
// there may be other threads that have finished executing finalize_manifest.
let new_location = self
.external_manifest_store
.get_manifest_location(base_path.as_ref(), version)
.await?;
return Ok(new_location);
}
Err(e) => return Err(e.into()),
}
};
let final_location = self
.finalize_manifest(
base_path,
&path,
version,
size,
e_tag.clone(),
&object_store.inner,
naming_scheme,
)
.await?;
Ok(final_location)
}
// Dataset not found in the external store, this could be because the dataset did not
// use external store for commit before. In this case, we search for the latest manifest
None => current_manifest_path(object_store, base_path).await,
}
}
/// Resolve the manifest location for a specific `version`.
///
/// If the external store does not know the version, the manifest is looked
/// up in the object store and, when found, registered in the external store
/// on a best-effort basis (a failure to register is only logged). Staging
/// locations returned by the external store are finalized before returning.
async fn resolve_version_location(
&self,
base_path: &Path,
version: u64,
object_store: &dyn OSObjectStore,
) -> std::result::Result<ManifestLocation, Error> {
let location_res = self
.external_manifest_store
.get_manifest_location(base_path.as_ref(), version)
.await;
let location = match location_res {
Ok(p) => p,
// Version is not registered in the external manifest store yet; go directly to the object store
Err(Error::NotFound { .. }) => {
let path = default_resolve_version(base_path, version, object_store)
.await
.map_err(|_| Error::not_found(format!("{}@{}", base_path, version)))?
.path;
match object_store.head(&path).await {
Ok(ObjectMeta { size, e_tag, .. }) => {
let res = self
.external_manifest_store
.put_if_not_exists(
base_path.as_ref(),
version,
path.as_ref(),
size,
e_tag.clone(),
)
.await;
// Registration is best-effort: the manifest itself was found, so only warn.
if let Err(e) = res {
warn!(
"could not update external manifest store during load, with error: {}",
e
);
}
let naming_scheme =
ManifestNamingScheme::detect_scheme_staging(path.filename().unwrap());
return Ok(ManifestLocation {
version,
path,
size: Some(size),
naming_scheme,
e_tag,
});
}
Err(ObjectStoreError::NotFound { .. }) => {
return Err(Error::not_found(path.to_string()));
}
Err(e) => return Err(e.into()),
}
}
Err(e) => return Err(e),
};
// finalized path, just return
if location.path.extension() == Some(MANIFEST_EXTENSION) {
return Ok(location);
}
let naming_scheme =
ManifestNamingScheme::detect_scheme_staging(location.path.filename().unwrap());
// Use the recorded size when available; otherwise ask the object store.
let (size, e_tag) = if let Some(size) = location.size {
(size, location.e_tag.clone())
} else {
let meta = object_store.head(&location.path).await?;
(meta.size as u64, meta.e_tag)
};
self.finalize_manifest(
base_path,
&location.path,
version,
size,
e_tag,
object_store,
naming_scheme,
)
.await
}
/// Commit `manifest` through the external store.
///
/// The manifest is first written to a staging path, then handed to the
/// external store's `put`. On success a version hint is written. On any
/// failure of `put` the staging manifest is deleted and
/// `CommitError::CommitConflict` is returned; the underlying error is not
/// propagated.
async fn commit(
&self,
manifest: &mut Manifest,
indices: Option<Vec<IndexMetadata>>,
base_path: &Path,
object_store: &ObjectStore,
manifest_writer: super::ManifestWriter,
naming_scheme: ManifestNamingScheme,
transaction: Option<Transaction>,
) -> std::result::Result<ManifestLocation, CommitError> {
// path we get here is the path to the manifest we want to write
// use object_store.base_path.as_ref() for getting the root of the dataset
// step 1: Write the manifest we want to commit to object store with a temporary name
let path = naming_scheme.manifest_path(base_path, manifest.version);
let staging_path = make_staging_manifest_path(&path)?;
let write_res =
manifest_writer(object_store, manifest, indices, &staging_path, transaction).await?;
// step 2 & 3: Put the manifest to external store
let result = self
.external_manifest_store
.put(
base_path,
manifest.version,
&staging_path,
write_res.size as u64,
write_res.e_tag,
&object_store.inner,
naming_scheme,
)
.await;
match result {
Ok(location) => {
write_version_hint(object_store, base_path, manifest.version).await;
Ok(location)
}
Err(_) => {
// delete the staging manifest
match object_store.inner.delete(&staging_path).await {
Ok(_) => {}
Err(ObjectStoreError::NotFound { .. }) => {}
Err(e) => return Err(CommitError::OtherError(e.into())),
}
info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = staging_path.as_ref());
Err(CommitError::CommitConflict {})
}
}
}
/// Delete the external store's manifest information for `base_path`.
async fn delete(&self, base_path: &Path) -> Result<()> {
self.external_manifest_store
.delete(base_path.as_ref())
.await
}
}

370
vendor/lance-table/src/io/deletion.rs vendored Normal file
View file

@ -0,0 +1,370 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
use std::{collections::HashSet, sync::Arc};
use arrow_array::{RecordBatch, UInt32Array};
use arrow_ipc::CompressionType;
use arrow_ipc::reader::FileReader as ArrowFileReader;
use arrow_ipc::writer::{FileWriter as ArrowFileWriter, IpcWriteOptions};
use arrow_schema::{ArrowError, DataType, Field, Schema};
use bytes::Buf;
use lance_core::error::{CorruptFileSnafu, box_error};
use lance_core::utils::deletion::DeletionVector;
use lance_core::utils::tracing::{AUDIT_MODE_CREATE, AUDIT_TYPE_DELETION, TRACE_FILE_AUDIT};
use lance_core::{Error, Result};
use lance_io::object_store::ObjectStore;
use object_store::path::Path;
use rand::Rng;
use roaring::bitmap::RoaringBitmap;
use snafu::ResultExt;
use tracing::{info, instrument};
use crate::format::{DeletionFile, DeletionFileType};
/// Name of the directory, relative to the dataset root, that holds deletion files.
pub const DELETIONS_DIR: &str = "_deletions";
/// Arrow schema of an Arrow-format deletion file: a single non-nullable
/// `UInt32` column named `row_id`.
fn deletion_arrow_schema() -> Arc<Schema> {
    let row_id = Field::new("row_id", DataType::UInt32, false);
    Arc::new(Schema::new(vec![row_id]))
}
/// Build the path of a fragment's deletion file underneath `base`.
///
/// The file lives in the deletions directory and is named
/// `{fragment_id}-{read_version}-{id}.{suffix}`, where the suffix comes from
/// the deletion file type.
pub fn deletion_file_path(base: &Path, fragment_id: u64, deletion_file: &DeletionFile) -> Path {
    let file_name = format!(
        "{}-{}-{}.{}",
        fragment_id,
        deletion_file.read_version,
        deletion_file.id,
        deletion_file.file_type.suffix()
    );
    let deletions_root = base.clone().join(DELETIONS_DIR);
    deletions_root.join(file_name)
}
/// Build the deletion file path as a string relative to the dataset root:
/// `{DELETIONS_DIR}/{fragment_id}-{read_version}-{id}.{suffix}`.
pub fn relative_deletion_file_path(fragment_id: u64, deletion_file: &DeletionFile) -> String {
    format!(
        "{}/{}-{}-{}.{}",
        DELETIONS_DIR,
        fragment_id,
        deletion_file.read_version,
        deletion_file.id,
        deletion_file.file_type.suffix()
    )
}
/// Persist the deletion vector of a fragment as a deletion file.
///
/// A `Set` vector is written as a ZSTD-compressed Arrow IPC file with a single
/// `row_id` column; a `Bitmap` vector is written in the Roaring bitmap's own
/// serialized form. Each written file gets a random `id`.
///
/// Returns the descriptor of the written file, or `Ok(None)` when the vector
/// is `NoDeletions` and nothing was written.
pub async fn write_deletion_file(
    base: &Path,
    fragment_id: u64,
    read_version: u64,
    removed_rows: &DeletionVector,
    object_store: &ObjectStore,
) -> Result<Option<DeletionFile>> {
    // Serialize the vector; the variant decides the on-disk format.
    let (file_type, num_deleted_rows, payload) = match removed_rows {
        DeletionVector::NoDeletions => return Ok(None),
        DeletionVector::Set(set) => {
            let schema = deletion_arrow_schema();
            let row_ids = Arc::new(UInt32Array::from_iter(set.iter().copied()));
            let batch = RecordBatch::try_new(schema.clone(), vec![row_ids])?;
            let ipc_options =
                IpcWriteOptions::default().try_with_compression(Some(CompressionType::ZSTD))?;

            let mut payload: Vec<u8> = Vec::new();
            {
                // Scoped so the mutable borrow of `payload` ends before it is returned.
                let mut ipc_writer = ArrowFileWriter::try_new_with_options(
                    &mut payload,
                    schema.as_ref(),
                    ipc_options,
                )?;
                ipc_writer.write(&batch)?;
                ipc_writer.finish()?;
            }
            (DeletionFileType::Array, set.len(), payload)
        }
        DeletionVector::Bitmap(bitmap) => {
            let mut payload: Vec<u8> = Vec::new();
            bitmap.serialize_into(&mut payload)?;
            (DeletionFileType::Bitmap, bitmap.len() as usize, payload)
        }
    };

    // Describe the file, derive its path, and upload the serialized bytes.
    let deletion_file = DeletionFile {
        read_version,
        id: rand::rng().random::<u64>(),
        file_type,
        num_deleted_rows: Some(num_deleted_rows),
        base_id: None,
    };
    let path = deletion_file_path(base, fragment_id, &deletion_file);
    object_store.put(&path, &payload).await?;
    info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_CREATE, r#type=AUDIT_TYPE_DELETION, path = path.to_string());
    Ok(Some(deletion_file))
}
#[instrument(
level = "debug",
skip(base, object_store),
fields(
base = base.as_ref(),
bytes_read = tracing::field::Empty
)
)]
pub async fn read_deletion_file(
fragment_id: u64,
deletion_file: &DeletionFile,
base: &Path,
object_store: &ObjectStore,
) -> Result<DeletionVector> {
let span = tracing::Span::current();
match deletion_file.file_type {
DeletionFileType::Array => {
let path = deletion_file_path(base, fragment_id, deletion_file);
let data = object_store.read_one_all(&path).await?;
span.record("bytes_read", data.len());
let data = std::io::Cursor::new(data);
let mut batches: Vec<RecordBatch> = ArrowFileReader::try_new(data, None)?
.collect::<std::result::Result<_, ArrowError>>()
.map_err(box_error)
.context(CorruptFileSnafu { path: path.clone() })?;
if batches.len() != 1 {
return Err(Error::corrupt_file(
path,
format!(
"Expected exactly one batch in deletion file, got {}",
batches.len()
),
));
}
let batch = batches.pop().unwrap();
if batch.schema() != deletion_arrow_schema() {
return Err(Error::corrupt_file(
path,
format!(
"Expected schema {:?} in deletion file, got {:?}",
deletion_arrow_schema(),
batch.schema()
),
));
}
let array = batch.columns()[0]
.as_any()
.downcast_ref::<UInt32Array>()
.unwrap();
let mut set = HashSet::with_capacity(array.len());
for val in array.iter() {
if let Some(val) = val {
set.insert(val);
} else {
return Err(Error::corrupt_file(
path,
"Null values are not allowed in deletion files",
));
}
}
Ok(DeletionVector::Set(set))
}
DeletionFileType::Bitmap => {
let path = deletion_file_path(base, fragment_id, deletion_file);
let data = object_store.read_one_all(&path).await?;
span.record("bytes_read", data.len());
let reader = data.reader();
let bitmap = RoaringBitmap::deserialize_from(reader)
.map_err(box_error)
.context(CorruptFileSnafu { path })?;
Ok(DeletionVector::Bitmap(bitmap))
}
}
}
// Tests for writing deletion files and reading them back, using in-memory object stores.
#[cfg(test)]
mod test {
use super::*;
use object_store::ObjectStoreExt;
// A `NoDeletions` vector must not produce a deletion file.
#[tokio::test]
async fn test_write_no_deletions() {
let dv = DeletionVector::NoDeletions;
let (object_store, path) = ObjectStore::from_uri("memory:///no_deletion")
.await
.unwrap();
let file = write_deletion_file(&path, 0, 0, &dv, &object_store)
.await
.unwrap();
assert!(file.is_none());
}
// A `Set` vector is written as an Arrow file; the raw file is decoded here
// by hand to check its path, schema and contents.
#[tokio::test]
async fn test_write_array() {
let dv = DeletionVector::Set(HashSet::from_iter(0..100));
let fragment_id = 21;
let read_version = 12;
let object_store = ObjectStore::memory();
let path = Path::from("/write");
let file = write_deletion_file(&path, fragment_id, read_version, &dv, &object_store)
.await
.unwrap();
assert!(matches!(
file,
Some(DeletionFile {
file_type: DeletionFileType::Array,
..
})
));
let file = file.unwrap();
assert_eq!(file.read_version, read_version);
let path = deletion_file_path(&path, fragment_id, &file);
assert_eq!(
path,
Path::from(format!("/write/_deletions/21-12-{}.arrow", file.id))
);
let data = object_store
.inner
.get(&path)
.await
.unwrap()
.bytes()
.await
.unwrap();
let data = std::io::Cursor::new(data);
let mut batches: Vec<RecordBatch> = ArrowFileReader::try_new(data, None)
.unwrap()
.collect::<std::result::Result<_, ArrowError>>()
.unwrap();
assert_eq!(batches.len(), 1);
let batch = batches.pop().unwrap();
assert_eq!(batch.schema(), deletion_arrow_schema());
let array = batch["row_id"]
.as_any()
.downcast_ref::<UInt32Array>()
.unwrap();
let read_dv = DeletionVector::from_iter(array.iter().map(|v| v.unwrap()));
assert_eq!(read_dv, dv);
}
// A `Bitmap` vector is written as a serialized Roaring bitmap; the raw file
// is deserialized here by hand to check its path and contents.
#[tokio::test]
async fn test_write_bitmap() {
let dv = DeletionVector::Bitmap(RoaringBitmap::from_iter(0..100));
let fragment_id = 21;
let read_version = 12;
let object_store = ObjectStore::memory();
let path = Path::from("/bitmap");
let file = write_deletion_file(&path, fragment_id, read_version, &dv, &object_store)
.await
.unwrap();
assert!(matches!(
file,
Some(DeletionFile {
file_type: DeletionFileType::Bitmap,
..
})
));
let file = file.unwrap();
assert_eq!(file.read_version, read_version);
let path = deletion_file_path(&path, fragment_id, &file);
assert_eq!(
path,
Path::from(format!("/bitmap/_deletions/21-12-{}.bin", file.id))
);
let data = object_store
.inner
.get(&path)
.await
.unwrap()
.bytes()
.await
.unwrap();
let reader = data.reader();
let read_bitmap = RoaringBitmap::deserialize_from(reader).unwrap();
assert_eq!(read_bitmap, dv.into_iter().collect::<RoaringBitmap>());
}
// Write then read a `Set` vector through the public API.
#[tokio::test]
async fn test_roundtrip_array() {
let dv = DeletionVector::Set(HashSet::from_iter(0..100));
let fragment_id = 21;
let read_version = 12;
let object_store = ObjectStore::memory();
let path = Path::from("/roundtrip");
let file = write_deletion_file(&path, fragment_id, read_version, &dv, &object_store)
.await
.unwrap();
let read_dv = read_deletion_file(fragment_id, &file.unwrap(), &path, &object_store)
.await
.unwrap();
assert_eq!(read_dv, dv);
}
// Write then read a `Bitmap` vector through the public API.
#[tokio::test]
async fn test_roundtrip_bitmap() {
let dv = DeletionVector::Bitmap(RoaringBitmap::from_iter(0..100));
let fragment_id = 21;
let read_version = 12;
let object_store = ObjectStore::memory();
let path = Path::from("/bitmap");
let file = write_deletion_file(&path, fragment_id, read_version, &dv, &object_store)
.await
.unwrap();
let read_dv = read_deletion_file(fragment_id, &file.unwrap(), &path, &object_store)
.await
.unwrap();
assert_eq!(read_dv, dv);
}
}

344
vendor/lance-table/src/io/manifest.rs vendored Normal file
View file

@ -0,0 +1,344 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
use async_trait::async_trait;
use byteorder::{ByteOrder, LittleEndian};
use bytes::{Bytes, BytesMut};
use lance_arrow::DataTypeExt;
use lance_file::{
previous::writer::ManifestProvider as PreviousManifestProvider, version::LanceFileVersion,
};
use object_store::ObjectStoreExt;
use object_store::path::Path;
use prost::Message;
use std::collections::HashMap;
use std::{ops::Range, sync::Arc};
use tracing::instrument;
use lance_core::{Error, Result, datatypes::Schema};
use lance_io::{
encodings::{Encoder, binary::BinaryEncoder, plain::PlainEncoder},
object_store::ObjectStore,
traits::{WriteExt, Writer},
utils::read_message,
};
use crate::format::{DataStorageFormat, IndexMetadata, MAGIC, Manifest, Transaction, pb};
use super::commit::ManifestLocation;
/// Read Manifest on URI.
///
/// This only reads manifest files. It does not read data files.
#[instrument(level = "debug", skip(object_store))]
pub async fn read_manifest(
object_store: &ObjectStore,
path: &Path,
known_size: Option<u64>,
) -> Result<Manifest> {
let file_size = if let Some(known_size) = known_size {
known_size
} else {
object_store.inner.head(path).await?.size
};
const PREFETCH_SIZE: u64 = 64 * 1024;
let initial_start = file_size.saturating_sub(PREFETCH_SIZE);
let range = Range {
start: initial_start,
end: file_size,
};
let buf = object_store.inner.get_range(path, range).await?;
// In case of corruption, the known_size might be wrong. We can retry without
// the size to be more robust.
if (buf.len() < 16 || !buf.ends_with(MAGIC)) && known_size.is_some() {
return Box::pin(read_manifest(object_store, path, None)).await;
}
if buf.len() < 16 {
return Err(Error::corrupt_file(
path.clone(),
"Invalid format: file size is smaller than 16 bytes".to_string(),
));
}
if !buf.ends_with(MAGIC) {
return Err(Error::corrupt_file(
path.clone(),
"Invalid format: magic number does not match".to_string(),
));
}
let manifest_pos = LittleEndian::read_i64(&buf[buf.len() - 16..buf.len() - 8]) as usize;
let manifest_len = file_size as usize - manifest_pos;
let buf: Bytes = if manifest_len <= buf.len() {
// The prefetch captured the entire manifest. We just need to trim the buffer.
buf.slice(buf.len() - manifest_len..buf.len())
} else {
// The prefetch only captured part of the manifest. We need to make an
// additional range request to read the remainder.
let mut buf2: BytesMut = object_store
.inner
.get_range(
path,
Range {
start: manifest_pos as u64,
end: file_size - PREFETCH_SIZE,
},
)
.await?
.into_iter()
.collect();
buf2.extend_from_slice(&buf);
buf2.freeze()
};
let recorded_length = LittleEndian::read_u32(&buf[0..4]) as usize;
// Need to trim the magic number at end and message length at beginning
let buf = buf.slice(4..buf.len() - 16);
if buf.len() != recorded_length {
return Err(Error::invalid_input(format!(
"Invalid format: manifest length does not match. Expected {}, got {}",
recorded_length,
buf.len()
)));
}
let proto = pb::Manifest::decode(buf)?;
Manifest::try_from(proto)
}
/// Read the index metadata section referenced by `manifest`.
///
/// Returns an empty list when the manifest has no index section. The
/// manifest file is opened with the size from `location` when it is known.
#[instrument(level = "debug", skip(object_store, manifest))]
pub async fn read_manifest_indexes(
    object_store: &ObjectStore,
    location: &ManifestLocation,
    manifest: &Manifest,
) -> Result<Vec<IndexMetadata>> {
    let Some(section_pos) = manifest.index_section.as_ref() else {
        return Ok(vec![]);
    };
    let reader = match location.size {
        Some(size) => {
            object_store
                .open_with_size(&location.path, size as usize)
                .await?
        }
        None => object_store.open(&location.path).await?,
    };
    let section: pb::IndexSection = read_message(reader.as_ref(), *section_pos).await?;
    section
        .indices
        .into_iter()
        .map(IndexMetadata::try_from)
        .collect::<Result<Vec<_>>>()
}
/// Write the optional index section, the optional inline transaction, and
/// finally the manifest itself, in that order.
///
/// The positions of the two optional sections are recorded on `manifest`
/// before it is written. Returns the result of writing the manifest struct.
async fn do_write_manifest(
    writer: &mut dyn Writer,
    manifest: &mut Manifest,
    indices: Option<Vec<IndexMetadata>>,
    transaction: Option<Transaction>,
) -> Result<usize> {
    // Index section, when indices were supplied.
    if let Some(index_list) = &indices {
        let section = pb::IndexSection {
            indices: index_list.iter().map(|index| index.into()).collect(),
        };
        let section_pos = writer.write_protobuf(&section).await?;
        manifest.index_section = Some(section_pos);
    }
    // Inline transaction, converted to protobuf at the write boundary.
    if let Some(tx) = transaction {
        let pb_tx: pb::Transaction = tx.into();
        let tx_pos = writer.write_protobuf(&pb_tx).await?;
        manifest.transaction_section = Some(tx_pos);
    }
    writer.write_struct(manifest).await
}
/// Write manifest to an open file.
///
/// When the manifest uses the legacy storage format, the value array of every
/// dictionary-typed field is encoded into `writer` first and its offset and
/// length are recorded in the field's dictionary info. The index section,
/// inline transaction and manifest are then written by `do_write_manifest`.
///
/// # Errors
///
/// Returns an I/O error when a dictionary field lacks its dictionary info or
/// value array, and a schema error when the dictionary value type is neither
/// numeric nor binary-like.
pub async fn write_manifest(
writer: &mut dyn Writer,
manifest: &mut Manifest,
indices: Option<Vec<IndexMetadata>>,
transaction: Option<Transaction>,
) -> Result<usize> {
// Write dictionary values.
let max_field_id = manifest.schema.max_field_id().unwrap_or(-1);
let is_legacy_storage = manifest.should_use_legacy_format();
// Walk every possible field id; ids without a field are skipped by the `if let`.
for field_id in 0..max_field_id + 1 {
if let Some(field) = manifest.schema.mut_field_by_id(field_id)
&& field.data_type().is_dictionary()
&& is_legacy_storage
{
let dict_info = field.dictionary.as_mut().ok_or_else(|| {
Error::io(format!("Lance field {} misses dictionary info", field.name))
})?;
let value_arr = dict_info.values.as_ref().ok_or_else(|| {
Error::io(format!(
"Lance field {} is dictionary type, but misses the dictionary value array",
field.name
))
})?;
let data_type = value_arr.data_type();
// Pick the encoder by value type; `pos` is the position returned by the encoder.
let pos = match data_type {
dt if dt.is_numeric() => {
let mut encoder = PlainEncoder::new(writer, dt);
encoder.encode(&[value_arr]).await?
}
dt if dt.is_binary_like() => {
let mut encoder = BinaryEncoder::new(writer);
encoder.encode(&[value_arr]).await?
}
_ => {
return Err(Error::schema(format!(
"Does not support {} as dictionary value type",
value_arr.data_type()
)));
}
};
dict_info.offset = pos;
dict_info.length = value_arr.len();
}
}
do_write_manifest(writer, manifest, indices, transaction).await
}
/// Implementation of ManifestProvider that describes a Lance file by writing
/// a manifest that contains nothing but default fields and the schema.
pub struct ManifestDescribing {}
#[async_trait]
impl PreviousManifestProvider for ManifestDescribing {
    /// Write a manifest holding only `schema` (no fragments, legacy storage
    /// format, empty extra map) and return the position it was written at.
    async fn store_schema(
        object_writer: &mut dyn Writer,
        schema: &Schema,
    ) -> Result<Option<usize>> {
        let no_fragments = Arc::new(vec![]);
        let storage_format = DataStorageFormat::new(LanceFileVersion::Legacy);
        let mut manifest = Manifest::new(
            schema.clone(),
            no_fragments,
            storage_format,
            HashMap::new(),
        );
        do_write_manifest(object_writer, &mut manifest, None, None)
            .await
            .map(Some)
    }
}
// Tests for manifest round-tripping and schema metadata, using in-memory object stores.
#[cfg(test)]
mod test {
use arrow_array::{Int32Array, RecordBatch};
use std::collections::HashMap;
use crate::format::SelfDescribingFileReader;
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
use lance_file::format::{MAGIC, MAJOR_VERSION, MINOR_VERSION};
use lance_file::previous::{
reader::FileReader as PreviousFileReader, writer::FileWriter as PreviousFileWriter,
};
use rand::{Rng, distr::Alphanumeric};
use tokio::io::AsyncWriteExt;
use super::*;
// Writes `prefix_size` random bytes followed by a manifest whose single field
// name is `manifest_min_size` characters long, then checks that
// `read_manifest` returns an equal manifest.
async fn test_roundtrip_manifest(prefix_size: usize, manifest_min_size: usize) {
let store = ObjectStore::memory();
let path = Path::from("/read_large_manifest");
let mut writer = store.create(&path).await.unwrap();
// Write prefix we should ignore
let prefix: Vec<u8> = rand::rng()
.sample_iter(&Alphanumeric)
.take(prefix_size)
.collect();
writer.write_all(&prefix).await.unwrap();
let long_name: String = rand::rng()
.sample_iter(&Alphanumeric)
.take(manifest_min_size)
.map(char::from)
.collect();
let arrow_schema =
ArrowSchema::new(vec![ArrowField::new(long_name, DataType::Int64, false)]);
let schema = Schema::try_from(&arrow_schema).unwrap();
// NOTE(review): `config` is populated but never passed to `Manifest::new` below,
// which receives a fresh empty map instead.
let mut config = HashMap::new();
config.insert("key".to_string(), "value".to_string());
let mut manifest = Manifest::new(
schema,
Arc::new(vec![]),
DataStorageFormat::default(),
HashMap::new(),
);
let pos = write_manifest(writer.as_mut(), &mut manifest, None, None)
.await
.unwrap();
writer
.write_magics(pos, MAJOR_VERSION, MINOR_VERSION, MAGIC)
.await
.unwrap();
Writer::shutdown(writer.as_mut()).await.unwrap();
let roundtripped_manifest = read_manifest(&store, &path, None).await.unwrap();
assert_eq!(manifest, roundtripped_manifest);
store.inner.delete(&path).await.unwrap();
}
// Covers manifests larger and smaller than the 64 KiB prefetch, with and without a prefix.
#[tokio::test]
async fn test_read_large_manifest() {
test_roundtrip_manifest(0, 100_000).await;
test_roundtrip_manifest(1000, 100_000).await;
test_roundtrip_manifest(1000, 1000).await;
}
// Metadata passed to `finish_with_metadata` must be visible in the schema
// of a self-described file when it is read back.
#[tokio::test]
async fn test_update_schema_metadata() {
let store = ObjectStore::memory();
let path = Path::from("/update_schema_metadata");
let arrow_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
"i",
DataType::Int32,
false,
)]));
let schema = Schema::try_from(arrow_schema.as_ref()).unwrap();
let mut file_writer = PreviousFileWriter::<ManifestDescribing>::try_new(
&store,
&path,
schema.clone(),
&Default::default(),
)
.await
.unwrap();
let array = Int32Array::from_iter_values(0..10);
let batch = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(array)]).unwrap();
file_writer
.write(std::slice::from_ref(&batch))
.await
.unwrap();
let mut metadata = HashMap::new();
metadata.insert(String::from("lance:extra"), String::from("for_test"));
file_writer.finish_with_metadata(&metadata).await.unwrap();
let reader = store.open(&path).await.unwrap();
let reader = PreviousFileReader::try_new_self_described_from_reader(reader.into(), None)
.await
.unwrap();
let schema = ArrowSchema::from(reader.schema());
assert_eq!(schema.metadata().get("lance:extra").unwrap(), "for_test");
}
}

8
vendor/lance-table/src/lib.rs vendored Normal file
View file

@ -0,0 +1,8 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
pub mod feature_flags;
pub mod format;
pub mod io;
pub mod rowids;
pub mod utils;

1364
vendor/lance-table/src/rowids.rs vendored Normal file

File diff suppressed because it is too large Load diff

314
vendor/lance-table/src/rowids/bitmap.rs vendored Normal file
View file

@ -0,0 +1,314 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
use deepsize::DeepSizeOf;
/// A fixed-length sequence of bits packed into bytes.
///
/// Bit `i` lives in byte `i / 8` at bit position `i % 8`, counting from the
/// least significant bit.
#[derive(PartialEq, Eq, Clone, DeepSizeOf)]
pub struct Bitmap {
/// Packed bits, eight per byte.
pub data: Vec<u8>,
/// Number of bits in the bitmap.
pub len: usize,
}
impl std::fmt::Debug for Bitmap {
    /// Formats the bitmap as `Bitmap { data: <bits>, len: <len> }`, where
    /// `<bits>` is one `1` or `0` character per bit, starting at bit 0.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let bits: String = (0..self.len)
            .map(|i| if self.get(i) { '1' } else { '0' })
            .collect();
        write!(f, "Bitmap {{ data: {}, len: {} }}", bits, self.len)
    }
}
impl Bitmap {
    /// Creates a bitmap of `len` bits with every bit cleared.
    pub fn new_empty(len: usize) -> Self {
        Self {
            data: vec![0; len.div_ceil(8)],
            len,
        }
    }

    /// Creates a bitmap of `len` bits with every bit set.
    ///
    /// Unused bits in the last byte are left cleared.
    pub fn new_full(len: usize) -> Self {
        let mut data = vec![0xff; len.div_ceil(8)];
        let used_bits = len % 8;
        if used_bits != 0 {
            // Keep only the low `used_bits` bits of the final byte.
            if let Some(tail) = data.last_mut() {
                *tail &= 0xffu8 >> (8 - used_bits);
            }
        }
        Self { data, len }
    }

    /// Sets bit `i`.
    pub fn set(&mut self, i: usize) {
        let (byte, bit) = (i / 8, i % 8);
        self.data[byte] |= 1u8 << bit;
    }

    /// Clears bit `i`.
    pub fn clear(&mut self, i: usize) {
        let (byte, bit) = (i / 8, i % 8);
        self.data[byte] &= !(1u8 << bit);
    }

    /// Returns whether bit `i` is set.
    pub fn get(&self, i: usize) -> bool {
        let (byte, bit) = (i / 8, i % 8);
        (self.data[byte] >> bit) & 1 == 1
    }

    /// Number of bits in the bitmap.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns a view over `len` bits starting at bit `start`.
    pub fn slice(&self, start: usize, len: usize) -> BitmapSlice<'_> {
        let bitmap = self;
        BitmapSlice { bitmap, start, len }
    }

    /// Counts the set bits across all bytes of `data`.
    pub fn count_ones(&self) -> usize {
        self.data
            .iter()
            .fold(0usize, |total, byte| total + byte.count_ones() as usize)
    }

    /// Counts the cleared bits: `len` minus the number of set bits.
    pub fn count_zeros(&self) -> usize {
        self.len - self.count_ones()
    }

    /// Iterates over the first `len` bits, starting at bit 0.
    pub fn iter(&self) -> impl Iterator<Item = bool> + '_ {
        let all_bits = self
            .data
            .iter()
            .copied()
            .flat_map(|byte| (0..8u32).map(move |bit| (byte >> bit) & 1 == 1));
        all_bits.take(self.len)
    }
}
impl From<&[bool]> for Bitmap {
    /// Builds a bitmap with one bit per element of `slice`, set where the
    /// element is `true`.
    fn from(slice: &[bool]) -> Self {
        let mut bitmap = Self::new_empty(slice.len());
        slice
            .iter()
            .enumerate()
            .filter(|(_, flag)| **flag)
            .for_each(|(index, _)| bitmap.set(index));
        bitmap
    }
}
/// A borrowed view over `len` bits of a [`Bitmap`], starting at bit `start`.
pub struct BitmapSlice<'a> {
// The bitmap being viewed.
bitmap: &'a Bitmap,
// Index of the first bit covered by the view.
start: usize,
// Number of bits covered by the view.
len: usize,
}
impl BitmapSlice<'_> {
    /// Counts the set bits inside the view.
    pub fn count_ones(&self) -> usize {
        if self.len == 0 {
            return 0;
        }
        let last_bit = self.start + self.len - 1;
        let (first_byte, last_byte) = (self.start / 8, last_bit / 8);
        // Bits `start % 8 ..= 7` of the first byte belong to the view.
        let head_mask = 0xffu8 << (self.start % 8);
        // Bits `0 ..= last_bit % 8` of the last byte belong to the view.
        let tail_mask = 0xffu8 >> (7 - last_bit % 8);
        let bytes = &self.bitmap.data;

        if first_byte == last_byte {
            // The view lies within one byte: both masks apply to it.
            return (bytes[first_byte] & head_mask & tail_mask).count_ones() as usize;
        }

        let head = (bytes[first_byte] & head_mask).count_ones() as usize;
        let tail = (bytes[last_byte] & tail_mask).count_ones() as usize;
        // Bytes strictly between the first and last are fully covered.
        let middle: usize = bytes[first_byte + 1..last_byte]
            .iter()
            .map(|byte| byte.count_ones() as usize)
            .sum();
        head + tail + middle
    }

    /// Counts the cleared bits inside the view.
    pub fn count_zeros(&self) -> usize {
        self.len - self.count_ones()
    }
}
impl From<BitmapSlice<'_>> for Bitmap {
    /// Copies the bits covered by `slice` into a new bitmap that starts at bit 0.
    fn from(slice: BitmapSlice) -> Self {
        let mut bitmap = Self::new_empty(slice.len);
        (0..slice.len)
            .filter(|&offset| slice.bitmap.get(slice.start + offset))
            .for_each(|offset| bitmap.set(offset));
        bitmap
    }
}
// Unit and property tests for `Bitmap` and `BitmapSlice`.
#[cfg(test)]
mod tests {
use super::*;
use proptest::prop_assert_eq;
// Basic set/clear/count behaviour, the Debug rendering, and a slice count.
#[test]
fn test_bitmap() {
let mut bitmap = Bitmap::new_empty(10);
assert_eq!(bitmap.len(), 10);
assert_eq!(bitmap.count_ones(), 0);
bitmap.set(0);
bitmap.set(1);
bitmap.set(4);
bitmap.set(5);
bitmap.set(9);
assert_eq!(bitmap.count_ones(), 5);
assert_eq!(
format!("{:?}", bitmap),
"Bitmap { data: 1100110001, len: 10 }"
);
bitmap.clear(1);
bitmap.clear(4);
assert_eq!(bitmap.count_ones(), 3);
assert_eq!(
format!("{:?}", bitmap),
"Bitmap { data: 1000010001, len: 10 }"
);
let bitmap_slice = bitmap.slice(5, 5);
assert_eq!(bitmap_slice.count_ones(), 2);
}
// The same bit pattern built from an empty and from a full bitmap must
// compare equal, including for lengths that are not a multiple of 8.
#[test]
fn test_equality() {
for len in 48..56 {
let mut bitmap1 = Bitmap::new_empty(len);
for i in 0..len {
if i % 2 == 0 {
bitmap1.set(i);
}
}
let mut bitmap2 = Bitmap::new_full(len);
for i in 0..len {
if i % 2 == 1 {
bitmap2.clear(i);
}
}
assert_eq!(bitmap1, bitmap2);
}
}
// Property: a slice's count of set bits equals the count over the matching
// range of the source booleans. `start` and `len` are clamped to the input.
proptest::proptest! {
#[test]
fn test_bitmap_slice(
values in proptest::collection::vec(proptest::bool::ANY, 0..100),
mut start in 0..100usize,
mut len in 0..100usize,
) {
if start > values.len() {
start = values.len();
}
if len > values.len() - start {
len = values.len() - start;
}
let bitmap = Bitmap::from(values.as_slice());
let slice = bitmap.slice(start, len);
let values_slice = values[start..(start + len)].to_vec();
prop_assert_eq!(slice.count_ones(), values_slice.iter().filter(|&&x| x).count());
}
}
// Iterating an empty bitmap yields `len` false values.
#[test]
fn test_bitmap_iter_empty() {
let bitmap = Bitmap::new_empty(10);
let values: Vec<bool> = bitmap.iter().collect();
assert_eq!(values, vec![false; 10]);
}
// Iterating a full bitmap yields `len` true values.
#[test]
fn test_bitmap_iter_full() {
let bitmap = Bitmap::new_full(10);
let values: Vec<bool> = bitmap.iter().collect();
assert_eq!(values, vec![true; 10]);
}
// Iteration reports exactly the bits that were set, in index order.
#[test]
fn test_bitmap_iter_partial() {
let mut bitmap = Bitmap::new_empty(10);
bitmap.set(0);
bitmap.set(3);
bitmap.set(7);
bitmap.set(9);
let values: Vec<bool> = bitmap.iter().collect();
let expected = vec![
true, // 0
false, // 1
false, // 2
true, // 3
false, // 4
false, // 5
false, // 6
true, // 7
false, // 8
true, // 9
];
assert_eq!(values, expected);
}
#[test]
fn test_bitmap_iter_edge_cases() {
// Test with length that's not a multiple of 8
let mut bitmap = Bitmap::new_empty(15);
bitmap.set(0);
bitmap.set(7);
bitmap.set(14);
let values: Vec<bool> = bitmap.iter().collect();
let expected = vec![
true, // 0
false, // 1
false, // 2
false, // 3
false, // 4
false, // 5
false, // 6
true, // 7
false, // 8
false, // 9
false, // 10
false, // 11
false, // 12
false, // 13
true, // 14
];
assert_eq!(values, expected);
}
// Property: converting booleans to a bitmap and iterating returns the same booleans.
proptest::proptest! {
#[test]
fn test_bitmap_iter_property(
values in proptest::collection::vec(proptest::bool::ANY, 0..100)
) {
let bitmap = Bitmap::from(values.as_slice());
let iter_values: Vec<bool> = bitmap.iter().collect();
assert_eq!(iter_values, values);
}
}
}

View file

@ -0,0 +1,400 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
use std::ops::Range;
use deepsize::DeepSizeOf;
/// Encoded array of u64 values.
///
/// This is an internal data type used as part of row id indices.
///
/// The `U16` and `U32` variants store each value as `base + offset`; the
/// conversions from `Vec<u64>` in this file always use the minimum value as
/// `base`.
#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
pub enum EncodedU64Array {
/// u64 values represented as u16 offset from a base value.
///
/// Useful when the min and max value are within u16 range (0..65535).
/// Only space saving when there are more than 2 values.
U16 { base: u64, offsets: Vec<u16> },
/// u64 values represented as u32 offset from a base value.
///
/// Useful when the min and max value are within u32 range (0..~4 billion).
U32 { base: u64, offsets: Vec<u32> },
/// Just a plain vector of u64 values.
///
/// For when the values cover a wide range.
U64(Vec<u64>),
}
impl EncodedU64Array {
    /// Number of values stored in the array.
    pub fn len(&self) -> usize {
        match self {
            Self::U16 { offsets, .. } => offsets.len(),
            Self::U32 { offsets, .. } => offsets.len(),
            Self::U64(values) => values.len(),
        }
    }

    /// Iterate over the decoded values in storage order.
    pub fn iter(&self) -> Box<dyn DoubleEndedIterator<Item = u64> + '_> {
        match self {
            Self::U16 { base, offsets } => {
                let base = *base;
                Box::new(offsets.iter().map(move |&delta| base + u64::from(delta)))
            }
            Self::U32 { base, offsets } => {
                let base = *base;
                Box::new(offsets.iter().map(move |&delta| base + u64::from(delta)))
            }
            Self::U64(values) => Box::new(values.iter().copied()),
        }
    }

    /// Decoded value at position `i`, or `None` when `i` is out of bounds.
    pub fn get(&self, i: usize) -> Option<u64> {
        match self {
            Self::U16 { base, offsets } => offsets.get(i).map(|&delta| *base + u64::from(delta)),
            Self::U32 { base, offsets } => offsets.get(i).map(|&delta| *base + u64::from(delta)),
            Self::U64(values) => values.get(i).copied(),
        }
    }

    /// Smallest value, or `None` when empty.
    ///
    /// For the offset encodings this returns `base` without scanning the
    /// offsets.
    pub fn min(&self) -> Option<u64> {
        match self {
            Self::U16 { base, offsets } => offsets.first().map(|_| *base),
            Self::U32 { base, offsets } => offsets.first().map(|_| *base),
            Self::U64(values) => values.iter().copied().min(),
        }
    }

    /// Largest value, or `None` when empty.
    pub fn max(&self) -> Option<u64> {
        match self {
            Self::U16 { base, offsets } => offsets
                .iter()
                .copied()
                .max()
                .map(|delta| *base + u64::from(delta)),
            Self::U32 { base, offsets } => offsets
                .iter()
                .copied()
                .max()
                .map(|delta| *base + u64::from(delta)),
            Self::U64(values) => values.iter().copied().max(),
        }
    }

    /// Value at the first position, or `None` when empty.
    pub fn first(&self) -> Option<u64> {
        match self {
            Self::U16 { base, offsets } => offsets.first().map(|&delta| *base + u64::from(delta)),
            Self::U32 { base, offsets } => offsets.first().map(|&delta| *base + u64::from(delta)),
            Self::U64(values) => values.first().copied(),
        }
    }

    /// Value at the last position, or `None` when empty.
    pub fn last(&self) -> Option<u64> {
        match self {
            Self::U16 { base, offsets } => offsets.last().map(|&delta| *base + u64::from(delta)),
            Self::U32 { base, offsets } => offsets.last().map(|&delta| *base + u64::from(delta)),
            Self::U64(values) => values.last().copied(),
        }
    }

    /// Binary search for `val`, with the same `Ok(position)` /
    /// `Err(insertion point)` convention as the slice `binary_search` it
    /// delegates to.
    ///
    /// Values below `base` report insertion point 0; values too far above
    /// `base` to be expressed as an offset report the array length.
    pub fn binary_search(&self, val: u64) -> std::result::Result<usize, usize> {
        match self {
            Self::U16 { base, offsets } => {
                let Some(delta) = val.checked_sub(*base) else {
                    return Err(0);
                };
                if delta > u64::from(u16::MAX) {
                    Err(offsets.len())
                } else {
                    offsets.binary_search(&(delta as u16))
                }
            }
            Self::U32 { base, offsets } => {
                let Some(delta) = val.checked_sub(*base) else {
                    return Err(0);
                };
                if delta > u64::from(u32::MAX) {
                    Err(offsets.len())
                } else {
                    offsets.binary_search(&(delta as u32))
                }
            }
            Self::U64(values) => values.binary_search(&val),
        }
    }

    /// Copy `len` values starting at position `offset` into a new array.
    ///
    /// Offset-encoded inputs are decoded and collected again, so the encoding
    /// of the result is chosen from the sliced values. Panics if the window
    /// is out of bounds.
    pub fn slice(&self, offset: usize, len: usize) -> Self {
        let window = offset..(offset + len);
        match self {
            Self::U16 { base, offsets } => offsets[window]
                .iter()
                .map(|&delta| *base + u64::from(delta))
                .collect(),
            Self::U32 { base, offsets } => offsets[window]
                .iter()
                .map(|&delta| *base + u64::from(delta))
                .collect(),
            Self::U64(values) => Self::U64(values[window].to_vec()),
        }
    }
}
impl From<Vec<u64>> for EncodedU64Array {
    /// Pick the narrowest encoding whose offset type can hold `max - min`.
    ///
    /// An empty vector becomes an empty `U64`.
    fn from(values: Vec<u64>) -> Self {
        let (Some(&lowest), Some(&highest)) = (values.iter().min(), values.iter().max()) else {
            return Self::U64(Vec::new());
        };
        let span = highest - lowest;
        if span <= u64::from(u16::MAX) {
            Self::U16 {
                base: lowest,
                offsets: values.iter().map(|value| (value - lowest) as u16).collect(),
            }
        } else if span <= u64::from(u32::MAX) {
            Self::U32 {
                base: lowest,
                offsets: values.iter().map(|value| (value - lowest) as u32).collect(),
            }
        } else {
            Self::U64(values)
        }
    }
}
impl From<Range<u64>> for EncodedU64Array {
    /// Encode every value of a half-open range, using `range.start` as the
    /// base of the offset encodings.
    ///
    /// A range whose length is strictly below `u16::MAX` becomes `U16`, one
    /// strictly below `u32::MAX` becomes `U32`, and anything longer is
    /// materialized as `U64`. Empty ranges, including inverted ones such as
    /// `10..5`, produce an empty `U16` array.
    fn from(range: Range<u64>) -> Self {
        let base = range.start;
        // `Range` treats `start >= end` as empty, so an inverted range must map
        // to zero elements. A plain `end - start` would underflow here: a panic
        // in debug builds and a wrapped, enormous length in release builds.
        let count = range.end.saturating_sub(range.start);
        if count < u16::MAX as u64 {
            // The guard above guarantees the cast is lossless.
            let offsets = (0..count as u16).collect();
            Self::U16 { base, offsets }
        } else if count < u32::MAX as u64 {
            // The guard above guarantees the cast is lossless.
            let offsets = (0..count as u32).collect();
            Self::U32 { base, offsets }
        } else {
            Self::U64(range.collect())
        }
    }
}
impl FromIterator<u64> for EncodedU64Array {
    /// Buffer the iterator into a vector, then encode it via `From<Vec<u64>>`.
    fn from_iter<I: IntoIterator<Item = u64>>(iter: I) -> Self {
        let mut buffered: Vec<u64> = Vec::new();
        buffered.extend(iter);
        Self::from(buffered)
    }
}
impl IntoIterator for EncodedU64Array {
    type Item = u64;
    type IntoIter = Box<dyn DoubleEndedIterator<Item = u64>>;

    /// Consume the array and yield its decoded values in storage order.
    fn into_iter(self) -> Self::IntoIter {
        match self {
            Self::U64(values) => Box::new(values.into_iter()),
            Self::U32 { base, offsets } => {
                let decode = move |delta: u32| base + u64::from(delta);
                Box::new(offsets.into_iter().map(decode))
            }
            Self::U16 { base, offsets } => {
                let decode = move |delta: u16| base + u64::from(delta);
                Box::new(offsets.into_iter().map(decode))
            }
        }
    }
}
#[cfg(test)]
mod test {
    use super::*;

    /// Encodes `values` and checks both the chosen representation and every
    /// accessor against the plain vector.
    fn assert_roundtrip(values: Vec<u64>, expected: &EncodedU64Array) {
        let encoded = EncodedU64Array::from(values.clone());
        assert_eq!(&encoded, expected);
        assert_eq!(values.len(), encoded.len());
        assert_eq!(values.first().copied(), encoded.first());
        assert_eq!(values.last().copied(), encoded.last());
        assert_eq!(values.iter().copied().min(), encoded.min());
        assert_eq!(values.iter().copied().max(), encoded.max());
        assert_eq!(values, encoded.iter().collect::<Vec<_>>());
        for (position, value) in values.iter().copied().enumerate() {
            assert_eq!(Some(value), encoded.get(position));
        }
        // Collecting from an iterator must pick the same representation.
        let collected: EncodedU64Array = values.into_iter().collect();
        assert_eq!(&collected, expected);
    }

    #[test]
    fn test_encoded_array_from_vec() {
        // Empty
        assert_roundtrip(Vec::new(), &EncodedU64Array::U64(Vec::new()));

        // Single value
        assert_roundtrip(
            vec![42],
            &EncodedU64Array::U16 {
                base: 42,
                offsets: vec![0],
            },
        );

        // u16 version: the values may start beyond the u16 range as long as
        // their spread fits in a u16.
        let base = 2 * u64::from(u16::MAX);
        let deltas = [42, 0, 43, u64::from(u16::MAX), 99];
        assert_roundtrip(
            deltas.iter().map(|delta| base + delta).collect(),
            &EncodedU64Array::U16 {
                base,
                offsets: deltas.iter().map(|&delta| delta as u16).collect(),
            },
        );

        // u32 version
        let base = 2 * u64::from(u32::MAX);
        let deltas = [42, 0, 43, u64::from(u32::MAX), 99];
        assert_roundtrip(
            deltas.iter().map(|delta| base + delta).collect(),
            &EncodedU64Array::U32 {
                base,
                offsets: deltas.iter().map(|&delta| delta as u32).collect(),
            },
        );

        // u64 version
        let values = vec![42, 0, 43, u64::MAX, 99];
        let expected = EncodedU64Array::U64(values.clone());
        assert_roundtrip(values, &expected);
    }

    #[test]
    fn test_double_ended_iter() {
        let arrays = vec![
            EncodedU64Array::U16 {
                base: 42,
                offsets: vec![0, 1, 2, 3, 4],
            },
            EncodedU64Array::U32 {
                base: 42,
                offsets: vec![0, 1, 2, 3, 4],
            },
            EncodedU64Array::U64(vec![42, 43, 44, 45, 46]),
        ];
        for array in arrays {
            // Reversing a reverse iteration must reproduce forward iteration.
            let forwards: Vec<u64> = array.iter().collect();
            let mut reversed_back: Vec<u64> = array.iter().rev().collect();
            reversed_back.reverse();
            assert_eq!(forwards, reversed_back);

            // Pull alternately from the front and the back of one iterator and
            // compare against positional lookups.
            let total = array.len();
            let mut expected = Vec::with_capacity(total);
            let mut actual = Vec::with_capacity(total);
            let mut iter = array.iter();
            for step in 0..total {
                let (pulled, position) = if step % 2 == 0 {
                    (iter.next(), step / 2)
                } else {
                    (iter.next_back(), total - 1 - step / 2)
                };
                actual.push(pulled.unwrap());
                expected.push(array.get(position).unwrap());
            }
            assert_eq!(expected, actual);
        }
    }

    #[test]
    fn test_encoded_array_from_range() {
        // u16 version
        let expected_base = 2 * u64::from(u16::MAX);
        let range = expected_base..(expected_base + 40);
        let encoded = EncodedU64Array::from(range.clone());
        assert!(
            matches!(
                encoded,
                EncodedU64Array::U16 { base, .. } if base == expected_base
            ),
            "{:?}",
            encoded
        );
        let decoded: Vec<u64> = encoded.into_iter().collect();
        assert_eq!(range.collect::<Vec<_>>(), decoded);

        // u32 version
        let expected_base = 2 * u64::from(u32::MAX);
        let range = expected_base..(u64::from(u16::MAX) + 10 + expected_base);
        let encoded = EncodedU64Array::from(range.clone());
        assert!(matches!(
            encoded,
            EncodedU64Array::U32 { base, .. } if base == expected_base
        ));
        let decoded: Vec<u64> = encoded.into_iter().collect();
        assert_eq!(range.collect::<Vec<_>>(), decoded);

        // We'll skip u64 since it would take a lot of memory.

        // Empty one
        let encoded = EncodedU64Array::from(42..42);
        assert_eq!(encoded.len(), 0);
    }
}

822
vendor/lance-table/src/rowids/index.rs vendored Normal file
View file

@ -0,0 +1,822 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
use std::ops::RangeInclusive;
use std::sync::Arc;
use super::{RowIdSequence, U64Segment};
use deepsize::DeepSizeOf;
use lance_core::{Error, Result};
use lance_core::utils::address::RowAddress;
use lance_core::utils::deletion::DeletionVector;
use rangemap::RangeInclusiveMap;
/// An index of row ids
///
/// This index is used to map row ids to their corresponding addresses. These
/// addresses correspond to physical positions in the dataset. See [RowAddress].
///
/// This structure only contains rows that physically exist. However, it may
/// map to addresses that have been tombstoned. A separate tombstone index is
/// used to track tombstoned rows.
// (Implementation)
// Disjoint inclusive ranges of row ids are stored as the keys of the map. The
// values are a pair of segments. The first segment is the row ids, and the
// second segment is the addresses. The two segments are aligned by position:
// a lookup finds the position of the row id in the first segment and reads the
// address at that same position in the second.
#[derive(Debug)]
pub struct RowIdIndex(RangeInclusiveMap<u64, (U64Segment, U64Segment)>);
/// Per-fragment input used to build a [`RowIdIndex`].
pub struct FragmentRowIdIndex {
/// Id of the fragment. Addresses for this fragment's rows are generated
/// starting from the fragment's first row address.
pub fragment_id: u32,
/// Row ids of the fragment's rows; the i-th id is paired with the i-th
/// address of the fragment.
pub row_id_sequence: Arc<RowIdSequence>,
/// Row offsets within the fragment that are deleted. Rows at these offsets
/// are left out of the index.
pub deletion_vector: Arc<DeletionVector>,
}
impl RowIdIndex {
    /// Create a new index from a list of fragment ids and their corresponding row id sequences.
    ///
    /// Returns an error when chunks with overlapping row-id ranges cannot be
    /// merged, i.e. the same row id is live in more than one fragment.
    pub fn new(fragment_indices: &[FragmentRowIdIndex]) -> Result<Self> {
        let raw_chunks: Vec<_> = fragment_indices
            .iter()
            .flat_map(decompose_sequence)
            .collect();
        let resolved_chunks = prep_index_chunks(raw_chunks)
            .map(|group| match group {
                RawIndexChunk::NonOverlapping(chunk) => Ok(chunk),
                // Intersecting row-id ranges don't imply intersecting id sets;
                // sparse ids and deletion holes leave the union short of the span.
                // The real invariant (no id in two fragments) is checked in the merge.
                RawIndexChunk::Overlapping(_span, members) => merge_overlapping_chunks(members),
            })
            .collect::<Result<Vec<_>>>()?;
        Ok(Self(RangeInclusiveMap::from_iter(resolved_chunks)))
    }

    /// Get the address for a given row id.
    ///
    /// Will return None if the row id does not exist in the index.
    pub fn get(&self, row_id: u64) -> Option<RowAddress> {
        let (id_segment, addr_segment) = self.0.get(&row_id)?;
        id_segment
            .position(row_id)
            .and_then(|pos| addr_segment.get(pos))
            .map(RowAddress::from)
    }

    /// Get addresses for many row ids in one pass over the index.
    ///
    /// Returns one entry per input id, in input order (`None` for missing).
    /// A sorted working copy of the input is walked alongside the chunk
    /// iterator, so the chunk iterator only ever moves forward.
    pub fn get_many(&self, row_ids: &[u64]) -> Vec<Option<RowAddress>> {
        let mut resolved = vec![None; row_ids.len()];
        if row_ids.is_empty() {
            return resolved;
        }

        // Remember each id's original slot so results land in input order.
        let mut by_id: Vec<(u64, usize)> = row_ids
            .iter()
            .copied()
            .enumerate()
            .map(|(slot, id)| (id, slot))
            .collect();
        by_id.sort_unstable_by_key(|&(id, _)| id);

        let mut chunk_cursor = self.0.iter().peekable();
        for (id, slot) in by_id {
            // Advance past chunks that end before this id.
            while matches!(chunk_cursor.peek(), Some((range, _)) if *range.end() < id) {
                chunk_cursor.next();
            }
            let Some((range, (id_segment, addr_segment))) = chunk_cursor.peek() else {
                // No chunks left: every remaining id is missing.
                break;
            };
            if id < *range.start() {
                // Falls in a gap between chunks.
                continue;
            }
            let address = id_segment
                .position(id)
                .and_then(|pos| addr_segment.get(pos));
            if let Some(address) = address {
                resolved[slot] = Some(RowAddress::from(address));
            }
        }
        resolved
    }
}
impl DeepSizeOf for RowIdIndex {
    /// Sum, over every map entry, the fixed per-entry size (two `u64` range
    /// bounds plus the segment pair) and the heap size of both segments.
    fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize {
        let per_entry =
            2 * std::mem::size_of::<u64>() + std::mem::size_of::<(U64Segment, U64Segment)>();
        let mut total = 0;
        for (_, (id_segment, addr_segment)) in self.0.iter() {
            total += per_entry
                + id_segment.deep_size_of_children(context)
                + addr_segment.deep_size_of_children(context);
        }
        total
    }
}
/// Split one fragment's row id sequence into index chunks, one per non-empty
/// segment, pairing each live row id with its row address.
///
/// Addresses start at the fragment's first row and advance by the full length
/// of every segment, whether or not rows in it are deleted.
fn decompose_sequence(
    frag_index: &FragmentRowIdIndex,
) -> Vec<(RangeInclusive<u64>, (U64Segment, U64Segment))> {
    let deletions = &frag_index.deletion_vector;
    let has_deletions = !deletions.is_empty();
    let mut next_address: u64 = RowAddress::first_row(frag_index.fragment_id).into();
    let mut rows_seen = 0u32;
    let mut chunks = Vec::new();
    for segment in frag_index.row_id_sequence.0.iter() {
        let chunk = if has_deletions {
            decompose_segment_with_deletions(segment, next_address, rows_seen, deletions)
        } else {
            decompose_segment_no_deletions(segment, next_address)
        };
        // Segments that produce no live rows contribute no chunk.
        chunks.extend(chunk);
        let segment_len = segment.len();
        rows_seen += segment_len as u32;
        next_address += segment_len as u64;
    }
    chunks
}
/// Build an IndexChunk from a list of (row_id, address) pairs.
///
/// Returns `None` when there are no pairs. The chunk's coverage is the range
/// reported by the row id segment.
fn build_chunk_from_pairs(pairs: Vec<(u64, u64)>) -> Option<IndexChunk> {
    if pairs.is_empty() {
        return None;
    }
    let mut row_ids = Vec::with_capacity(pairs.len());
    let mut addresses = Vec::with_capacity(pairs.len());
    for (row_id, address) in pairs {
        row_ids.push(row_id);
        addresses.push(address);
    }
    let id_segment = U64Segment::from_iter(row_ids);
    let addr_segment = U64Segment::from_iter(addresses);
    let coverage = id_segment.range()?;
    Some((coverage, (id_segment, addr_segment)))
}
/// Fast path: no deletions. O(1) for Range segments.
///
/// Returns `None` for an empty segment.
fn decompose_segment_no_deletions(segment: &U64Segment, start_address: u64) -> Option<IndexChunk> {
    if let U64Segment::Range(ids) = segment {
        if !ids.is_empty() {
            // A contiguous id range maps onto a contiguous address range of
            // the same length, so nothing needs to be enumerated.
            let row_count = ids.end - ids.start;
            let addresses = U64Segment::Range(start_address..start_address + row_count);
            let coverage = ids.start..=ids.end - 1;
            return Some((coverage, (U64Segment::Range(ids.clone()), addresses)));
        }
    }
    if segment.is_empty() {
        return None;
    }
    // Non-Range segments: must iterate to build address mapping.
    let mut pairs: Vec<(u64, u64)> = Vec::new();
    for (position, row_id) in segment.iter().enumerate() {
        pairs.push((row_id, start_address + position as u64));
    }
    build_chunk_from_pairs(pairs)
}
/// Slow path: has deletions, must check each row.
///
/// `current_offset` is the row offset of the segment's first row within its
/// fragment. Deleted rows are skipped, but surviving rows keep the address
/// that matches their original position.
fn decompose_segment_with_deletions(
    segment: &U64Segment,
    start_address: u64,
    current_offset: u32,
    deletion_vector: &DeletionVector,
) -> Option<IndexChunk> {
    let mut live_pairs: Vec<(u64, u64)> = Vec::new();
    for (position, row_id) in segment.iter().enumerate() {
        let row_offset = current_offset + position as u32;
        if deletion_vector.contains(row_offset) {
            continue;
        }
        live_pairs.push((row_id, start_address + position as u64));
    }
    build_chunk_from_pairs(live_pairs)
}
/// One entry of the index: the inclusive row id range it covers, paired with
/// its (row ids, addresses) segments.
type IndexChunk = (RangeInclusive<u64>, (U64Segment, U64Segment));
/// Output of `prep_index_chunks`: either a chunk whose range overlaps no other
/// chunk, or a group of chunks with overlapping ranges that still has to be merged.
#[derive(Debug)]
enum RawIndexChunk {
/// A chunk that can be inserted into the index as-is.
NonOverlapping(IndexChunk),
/// The combined range of the group, followed by the chunks in the group.
Overlapping(RangeInclusive<u64>, Vec<IndexChunk>),
}
impl RawIndexChunk {
fn range_end(&self) -> u64 {
match self {
Self::NonOverlapping((range, _)) => *range.end(),
Self::Overlapping(range, _) => *range.end(),
}
}
}
/// Given a vector of index chunks, sort them and return an iterator of index chunks.
///
/// The iterator will yield chunks that are non-overlapping or a set of chunks
/// that are overlapping.
///
/// Chunks are processed in ascending order of their range start. Consecutive
/// chunks whose ranges intersect are gathered into a single
/// `RawIndexChunk::Overlapping` group together with the group's combined range.
fn prep_index_chunks(mut chunks: Vec<IndexChunk>) -> impl Iterator<Item = RawIndexChunk> {
// Sort in descending order of range start so that `pop()` below yields the
// chunks in ascending order.
chunks.sort_by_key(|(range, _)| u64::MAX - *range.start());
let mut output = Vec::new();
// Start assuming non-overlapping in first chunk.
if let Some(first_chunk) = chunks.pop() {
output.push(RawIndexChunk::NonOverlapping(first_chunk));
} else {
// Early return for empty.
return output.into_iter();
}
// State of the overlap group currently being built. `current_range` is the
// `0..=0` placeholder whenever `current_overlap` is empty.
let mut current_range = 0..=0;
let mut current_overlap = Vec::new();
while let Some(chunk) = chunks.pop() {
// Invariant: `current_range` spans exactly the chunks in `current_overlap`
// (or is the `0..=0` placeholder when the group is empty).
debug_assert_eq!(
current_overlap
.iter()
.map(|(range, _): &IndexChunk| *range.start())
.min()
.unwrap_or_default(),
*current_range.start(),
);
debug_assert_eq!(
current_overlap
.iter()
.map(|(range, _): &IndexChunk| *range.end())
.max()
.unwrap_or_default(),
*current_range.end(),
);
if current_overlap.is_empty() {
// We haven't found overlap yet.
let last_chunk_end = output.last().unwrap().range_end();
if *chunk.0.start() <= last_chunk_end {
// We have found overlap.
// Pull the previous chunk back out of `output`; it becomes the first
// member of the new group.
match output.pop().unwrap() {
RawIndexChunk::NonOverlapping(chunk) => {
current_overlap.push(chunk);
}
// With no open group, the last entry pushed to `output` is always a
// `NonOverlapping` chunk: a finished group is pushed together with
// the non-overlapping chunk that ended it.
_ => unreachable!(),
}
current_overlap.push(chunk);
// The group holds exactly two chunks here; its range runs from the
// first chunk's start to the larger of the two ends.
let range_start = *current_overlap.first().unwrap().0.start();
let range_end = *current_overlap
.last()
.unwrap()
.0
.end()
.max(current_overlap.first().unwrap().0.end());
current_range = range_start..=range_end;
} else {
// We are still in non-overlapping space.
output.push(RawIndexChunk::NonOverlapping(chunk));
}
} else {
// We are making an overlap chunk
if chunk.0.start() <= current_range.end() {
// We are still in overlap.
// Extend the group's range if this chunk reaches further.
let range_end = *chunk.0.end().max(current_range.end());
current_range = *current_range.start()..=range_end;
current_overlap.push(chunk);
} else {
// We have exited overlap.
// Emit the finished group and reset the state to "no open group".
output.push(RawIndexChunk::Overlapping(
std::mem::replace(&mut current_range, 0..=0),
std::mem::take(&mut current_overlap),
));
output.push(RawIndexChunk::NonOverlapping(chunk));
}
}
}
// Re-check the invariant once more after the last chunk has been consumed.
debug_assert_eq!(
current_overlap
.iter()
.map(|(range, _): &IndexChunk| *range.start())
.min()
.unwrap_or_default(),
*current_range.start(),
);
debug_assert_eq!(
current_overlap
.iter()
.map(|(range, _): &IndexChunk| *range.end())
.max()
.unwrap_or_default(),
*current_range.end(),
);
// Flush a group that was still open when the input ran out.
if !current_overlap.is_empty() {
output.push(RawIndexChunk::Overlapping(
current_range.clone(),
current_overlap,
));
}
output.into_iter()
}
fn merge_overlapping_chunks(overlapping_chunks: Vec<IndexChunk>) -> Result<IndexChunk> {
let total_capacity = overlapping_chunks
.iter()
.map(|(_, (row_ids, _))| row_ids.len())
.sum();
let mut values = Vec::with_capacity(total_capacity);
for (_, (row_ids, row_addrs)) in overlapping_chunks.iter() {
values.extend(row_ids.iter().zip(row_addrs.iter()));
}
values.sort_by_key(|(row_id, _)| *row_id);
// A duplicate row id here means two fragments claim the same live id: a
// corrupt index, not a resolvable sparse-coverage case.
if let Some(w) = values.windows(2).find(|w| w[0].0 == w[1].0) {
return Err(Error::internal(format!(
"row id index corrupt: stable row id {} is live in multiple fragments",
w[0].0
)));
}
let row_id_segment = U64Segment::from_iter(values.iter().map(|(row_id, _)| *row_id));
let address_segment = U64Segment::from_iter(values.iter().map(|(_, row_addr)| *row_addr));
let range = row_id_segment.range().unwrap();
Ok((range, (row_id_segment, address_segment)))
}
#[cfg(test)]
mod tests {
use super::*;
use proptest::{prelude::Strategy, prop_assert_eq};
#[test]
fn test_new_index() {
    // Two fragments covering every segment flavour, with no deletions.
    let first_fragment = FragmentRowIdIndex {
        fragment_id: 10,
        row_id_sequence: Arc::new(RowIdSequence(vec![
            U64Segment::Range(0..10),
            U64Segment::RangeWithHoles {
                range: 10..17,
                holes: vec![12, 15].into(),
            },
            U64Segment::SortedArray(vec![20, 25, 30].into()),
        ])),
        deletion_vector: Arc::new(DeletionVector::default()),
    };
    let second_fragment = FragmentRowIdIndex {
        fragment_id: 20,
        row_id_sequence: Arc::new(RowIdSequence(vec![
            U64Segment::RangeWithBitmap {
                range: 17..20,
                bitmap: [true, false, true].as_slice().into(),
            },
            U64Segment::Array(vec![40, 50, 60].into()),
        ])),
        deletion_vector: Arc::new(DeletionVector::default()),
    };
    let index = RowIdIndex::new(&[first_fragment, second_fragment]).unwrap();

    // (row id, expected (fragment, offset) or None when the id is absent)
    let cases: [(u64, Option<(u32, u32)>); 8] = [
        (0, Some((10, 0))),
        (15, None),
        (16, Some((10, 14))),
        (17, Some((20, 0))),
        (25, Some((10, 16))),
        (40, Some((20, 2))),
        (60, Some((20, 4))),
        (61, None),
    ];
    for (row_id, location) in cases {
        let expected = location.map(|(frag, offset)| RowAddress::new_from_parts(frag, offset));
        assert_eq!(index.get(row_id), expected);
    }
}
#[test]
fn test_new_index_overlap() {
    // Three fragments whose sorted id arrays interleave across 1..=9.
    let fragment = |fragment_id: u32, ids: Vec<u64>| FragmentRowIdIndex {
        fragment_id,
        row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::SortedArray(ids.into())])),
        deletion_vector: Arc::new(DeletionVector::default()),
    };
    let fragment_indices = [
        fragment(23, vec![3, 6, 9]),
        fragment(42, vec![2, 5, 8]),
        fragment(10, vec![1, 4, 7]),
    ];
    let index = RowIdIndex::new(&fragment_indices).unwrap();

    // (row id, fragment, offset)
    let cases = [
        (1, 10, 0),
        (2, 42, 0),
        (3, 23, 0),
        (4, 10, 1),
        (5, 42, 1),
        (6, 23, 1),
        (7, 10, 2),
        (8, 42, 2),
        (9, 23, 2),
    ];
    for (row_id, frag, offset) in cases {
        assert_eq!(
            index.get(row_id),
            Some(RowAddress::new_from_parts(frag, offset))
        );
    }
}
#[test]
fn test_new_index_unsorted_row_ids() {
    // Test case with unsorted row ids within fragments
    let fragment = |fragment_id: u32, unsorted_ids: Vec<u64>| FragmentRowIdIndex {
        fragment_id,
        row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Array(unsorted_ids.into())])),
        deletion_vector: Arc::new(DeletionVector::default()),
    };
    let fragment_indices = [
        fragment(10, vec![9, 3, 6]),
        fragment(20, vec![8, 2, 5]),
        fragment(30, vec![7, 1, 4]),
    ];
    let index = RowIdIndex::new(&fragment_indices).unwrap();

    // Check that all row ids can be found regardless of their order in the segments.
    // (row id, fragment, offset)
    let cases = [
        (1, 30, 1),
        (2, 20, 1),
        (3, 10, 1),
        (4, 30, 2),
        (5, 20, 2),
        (6, 10, 2),
        (7, 30, 0),
        (8, 20, 0),
        (9, 10, 0),
    ];
    for (row_id, frag, offset) in cases {
        assert_eq!(
            index.get(row_id),
            Some(RowAddress::new_from_parts(frag, offset))
        );
    }

    // Check that non-existent row ids return None
    for missing in [0, 10] {
        assert_eq!(index.get(missing), None);
    }
}
#[test]
fn test_new_index_partial_overlap() {
    // Fragment 0 covers 0..100 except id 50, which lives alone in fragment 1.
    let with_hole = FragmentRowIdIndex {
        fragment_id: 0,
        row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::RangeWithHoles {
            range: 0..100,
            holes: vec![50].into(),
        }])),
        deletion_vector: Arc::new(DeletionVector::default()),
    };
    let fills_hole = FragmentRowIdIndex {
        fragment_id: 1,
        row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(50..51)])),
        deletion_vector: Arc::new(DeletionVector::default()),
    };
    let index = RowIdIndex::new(&[with_hole, fills_hole]).unwrap();

    // (row id, fragment, offset)
    let cases = [(0, 0, 0), (49, 0, 49), (50, 1, 0), (51, 0, 50), (99, 0, 98)];
    for (row_id, frag, offset) in cases {
        assert_eq!(
            index.get(row_id),
            Some(RowAddress::new_from_parts(frag, offset))
        );
    }
}
#[test]
fn test_overlapping_chunks_sparse_with_deletions() {
    // Interleaved (overlapping) id ranges plus a deletion that leaves a hole,
    // so the union doesn't tile the span. Every live id must still resolve.
    let odd_ids = FragmentRowIdIndex {
        fragment_id: 10,
        row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::SortedArray(
            vec![1, 3, 5, 7, 9].into(),
        )])),
        deletion_vector: Arc::new(DeletionVector::default()),
    };
    let even_ids = FragmentRowIdIndex {
        fragment_id: 20,
        row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::SortedArray(
            vec![0, 2, 4, 6, 8].into(),
        )])),
        // Delete offset 2 (id 4) -> a hole in the span.
        deletion_vector: Arc::new(DeletionVector::from_iter(vec![2])),
    };
    let index = RowIdIndex::new(&[odd_ids, even_ids]).unwrap();

    // Surviving ids keep their original offsets (the hole is not compacted).
    // (row id, expected (fragment, offset) or None for the deleted id)
    let cases: [(u64, Option<(u32, u32)>); 8] = [
        (0, Some((20, 0))),
        (1, Some((10, 0))),
        (2, Some((20, 1))),
        (3, Some((10, 1))),
        (4, None),
        (6, Some((20, 3))),
        (8, Some((20, 4))),
        (9, Some((10, 4))),
    ];
    for (row_id, location) in cases {
        let expected = location.map(|(frag, offset)| RowAddress::new_from_parts(frag, offset));
        assert_eq!(index.get(row_id), expected);
    }
}
#[test]
fn test_index_with_deletion_vector() {
    // A single contiguous fragment with offsets 2 and 3 deleted.
    let fragment = FragmentRowIdIndex {
        fragment_id: 10,
        row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(0..6)])),
        deletion_vector: Arc::new(DeletionVector::from_iter(vec![2, 3])),
    };
    let index = RowIdIndex::new(&[fragment]).unwrap();

    // Live rows keep the address matching their original offset.
    for offset in [0u32, 1, 4, 5] {
        assert_eq!(
            index.get(u64::from(offset)),
            Some(RowAddress::new_from_parts(10, offset))
        );
    }
    // Deleted rows are absent from the index.
    for deleted in [2, 3] {
        assert_eq!(index.get(deleted), None);
    }
}
#[test]
fn test_empty_fragment_sequences() {
    // A fragment with no segments must not disturb the other fragment's entries.
    let empty = FragmentRowIdIndex {
        fragment_id: 10,
        row_id_sequence: Arc::new(RowIdSequence(Vec::new())),
        deletion_vector: Arc::new(DeletionVector::default()),
    };
    let populated = FragmentRowIdIndex {
        fragment_id: 20,
        row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(5..8)])),
        deletion_vector: Arc::new(DeletionVector::default()),
    };
    let index = RowIdIndex::new(&[empty, populated]).unwrap();

    for (row_id, offset) in [(5, 0), (7, 2)] {
        assert_eq!(
            index.get(row_id),
            Some(RowAddress::new_from_parts(20, offset))
        );
    }
    assert_eq!(index.get(4), None);
}
#[test]
fn test_completely_empty_index() {
    // An index built from no fragments resolves nothing.
    let index = RowIdIndex::new(&[]).unwrap();
    for row_id in [0, 100] {
        assert_eq!(index.get(row_id), None);
    }
}
#[test]
fn test_non_overlapping_ranges() {
    // Three back-to-back ranges: 0..5, 5..10 and 10..15.
    let fragment = |fragment_id: u32, ids: std::ops::Range<u64>| FragmentRowIdIndex {
        fragment_id,
        row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(ids)])),
        deletion_vector: Arc::new(DeletionVector::default()),
    };
    let fragment_indices = [
        fragment(10, 0..5),
        fragment(20, 5..10),
        fragment(30, 10..15),
    ];
    let index = RowIdIndex::new(&fragment_indices).unwrap();

    // First and last id of every fragment: (row id, fragment, offset)
    let cases = [
        (0, 10, 0),
        (4, 10, 4),
        (5, 20, 0),
        (9, 20, 4),
        (10, 30, 0),
        (14, 30, 4),
    ];
    for (row_id, frag, offset) in cases {
        assert_eq!(
            index.get(row_id),
            Some(RowAddress::new_from_parts(frag, offset))
        );
    }
}
/// Proptest strategy producing `(fragment id, row id sequence)` pairs.
///
/// The row ids `0..num_rows` are shuffled and then cut into consecutive
/// slices, one per generated fragment size. Each fragment's id is the start
/// position of its slice, cast to `u32`.
fn arbitrary_row_ids(
num_fragments_range: std::ops::Range<usize>,
frag_size_range: std::ops::Range<usize>,
) -> impl Strategy<Value = Vec<(u32, Arc<RowIdSequence>)>> {
// First pick how many fragments there are and how many rows each one holds.
let fragment_sizes = proptest::collection::vec(frag_size_range, num_fragments_range);
fragment_sizes.prop_flat_map(|fragment_sizes| {
let num_rows = fragment_sizes.iter().sum::<usize>() as u64;
let row_ids = 0..num_rows;
let row_ids = row_ids.collect::<Vec<_>>();
// Every id appears exactly once; only the order is randomized.
let row_ids_shuffled = proptest::strategy::Just(row_ids).prop_shuffle();
row_ids_shuffled.prop_map(move |row_ids| {
let mut sequences = Vec::with_capacity(fragment_sizes.len());
// `i` is the start of the current fragment's slice in the shuffled ids.
let mut i = 0;
for size in &fragment_sizes {
let end = i + size;
let sequence =
RowIdSequence(vec![U64Segment::from_slice(row_ids[i..end].into())]);
sequences.push((i as u32, Arc::new(sequence)));
i = end;
}
sequences
})
})
}
#[test]
fn test_large_range_segments_no_deletions() {
    // Simulates a real-world scenario: many fragments with large Range segments
    // and no deletions. Before optimization, this would iterate over all rows
    // (O(total_rows)). After optimization, it's O(num_fragments).
    let rows_per_fragment = 250_000u64;
    let num_fragments = 100u32;
    let total_rows = num_fragments as u64 * rows_per_fragment;

    // Fragment `n` owns the contiguous ids starting at `n * rows_per_fragment`.
    let mut fragment_indices: Vec<FragmentRowIdIndex> = Vec::new();
    for frag_id in 0..num_fragments {
        let first_id = frag_id as u64 * rows_per_fragment;
        fragment_indices.push(FragmentRowIdIndex {
            fragment_id: frag_id,
            row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(
                first_id..first_id + rows_per_fragment,
            )])),
            deletion_vector: Arc::new(DeletionVector::default()),
        });
    }

    let timer = std::time::Instant::now();
    let index = RowIdIndex::new(&fragment_indices).unwrap();
    let elapsed = timer.elapsed();

    // Verify correctness at boundaries
    let last_offset = rows_per_fragment as u32 - 1;
    let last_row = total_rows - 1;
    assert_eq!(index.get(0), Some(RowAddress::new_from_parts(0, 0)));
    assert_eq!(
        index.get(rows_per_fragment - 1),
        Some(RowAddress::new_from_parts(0, last_offset))
    );
    assert_eq!(
        index.get(rows_per_fragment),
        Some(RowAddress::new_from_parts(1, 0))
    );
    assert_eq!(
        index.get(last_row),
        Some(RowAddress::new_from_parts(num_fragments - 1, last_offset))
    );
    assert_eq!(index.get(last_row + 1), None);

    // With the optimization, building an index for 25M rows across 100 fragments
    // should complete in well under 1 second (typically < 1ms).
    assert!(
        elapsed.as_secs() < 1,
        "Index build took {:?} for {} fragments x {} rows = {} total rows. \
         This suggests the O(rows) -> O(fragments) optimization is not working.",
        elapsed,
        num_fragments,
        rows_per_fragment,
        total_rows,
    );
}
#[test]
fn test_large_range_segments_with_deletions() {
    let rows_per_fragment = 1_000u64;
    let num_fragments = 10u32;

    // Fragment `n` owns the contiguous ids starting at `n * rows_per_fragment`
    // and has every 3rd row (offsets 0, 3, 6, ...) deleted.
    let mut fragment_indices: Vec<FragmentRowIdIndex> = Vec::new();
    for frag_id in 0..num_fragments {
        let first_id = frag_id as u64 * rows_per_fragment;
        let mut deleted = roaring::RoaringBitmap::new();
        for row_offset in (0..rows_per_fragment as u32).step_by(3) {
            deleted.insert(row_offset);
        }
        fragment_indices.push(FragmentRowIdIndex {
            fragment_id: frag_id,
            row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(
                first_id..first_id + rows_per_fragment,
            )])),
            deletion_vector: Arc::new(DeletionVector::Bitmap(deleted)),
        });
    }
    let index = RowIdIndex::new(&fragment_indices).unwrap();

    // Fragment 0: offsets 0 and 3 are deleted.
    for deleted_id in [0, 3] {
        assert_eq!(index.get(deleted_id), None);
    }
    // Fragment 0: offsets 1, 2 and 4 survive and keep their offsets.
    for live_offset in [1u32, 2, 4] {
        assert_eq!(
            index.get(u64::from(live_offset)),
            Some(RowAddress::new_from_parts(0, live_offset))
        );
    }

    // Second fragment: row ids start at 1000, so id 1000 is offset 0 (deleted)
    // and id 1001 is offset 1 (live).
    assert_eq!(index.get(rows_per_fragment), None);
    assert_eq!(
        index.get(rows_per_fragment + 1),
        Some(RowAddress::new_from_parts(1, 1))
    );

    // Last fragment: id 9999 is offset 999 (999 % 3 == 0 -> deleted), while
    // id 9998 is offset 998 (998 % 3 == 2 -> live).
    let last_row = num_fragments as u64 * rows_per_fragment - 1;
    assert_eq!(index.get(last_row), None);
    assert_eq!(
        index.get(last_row - 1),
        Some(RowAddress::new_from_parts(num_fragments - 1, 998))
    );

    // Out of range.
    assert_eq!(index.get(last_row + 1), None);
}
proptest::proptest! {
    // Property: with no deletions, every row id of every generated sequence
    // must resolve to (its fragment id, its position within that sequence).
    #[test]
    fn test_new_index_robustness(row_ids in arbitrary_row_ids(0..5, 0..32)) {
        // One index entry per generated (fragment id, sequence) pair, each with
        // an empty (default) deletion vector.
        let fragment_indices: Vec<FragmentRowIdIndex> = row_ids
            .iter()
            .map(|(frag_id, sequence)| FragmentRowIdIndex {
                fragment_id: *frag_id,
                row_id_sequence: sequence.clone(),
                deletion_vector: Arc::new(DeletionVector::default()),
            })
            .collect();
        let index = RowIdIndex::new(&fragment_indices).unwrap();
        for (frag_id, sequence) in row_ids.iter() {
            // `local_offset` is the position of the id within its own sequence.
            for (local_offset, row_id) in sequence.iter().enumerate() {
                prop_assert_eq!(
                    index.get(row_id),
                    Some(RowAddress::new_from_parts(*frag_id, local_offset as u32)),
                    "Row id {} in sequence {:?} not found in index {:?}",
                    row_id,
                    sequence,
                    index
                );
            }
        }
    }
}
}

1141
vendor/lance-table/src/rowids/segment.rs vendored Normal file

File diff suppressed because it is too large Load diff

239
vendor/lance-table/src/rowids/serde.rs vendored Normal file
View file

@ -0,0 +1,239 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
use crate::{format::pb, rowids::bitmap::Bitmap};
use lance_core::{Error, Result};
use super::{RowIdSequence, U64Segment, encoded_array::EncodedU64Array};
use prost::Message;
impl TryFrom<pb::RowIdSequence> for RowIdSequence {
    type Error = Error;

    /// Decodes the protobuf segments in order, stopping at the first segment
    /// that fails to convert.
    fn try_from(pb: pb::RowIdSequence) -> Result<Self> {
        let mut segments = Vec::new();
        for segment in pb.segments {
            segments.push(U64Segment::try_from(segment)?);
        }
        Ok(Self(segments))
    }
}
impl TryFrom<pb::U64Segment> for U64Segment {
type Error = Error;
fn try_from(pb: pb::U64Segment) -> Result<Self> {
use pb::u64_segment as pb_seg;
use pb::u64_segment::Segment::*;
match pb.segment {
Some(Range(pb_seg::Range { start, end })) => Ok(Self::Range(start..end)),
Some(RangeWithHoles(pb_seg::RangeWithHoles { start, end, holes })) => {
let holes = holes
.ok_or_else(|| Error::invalid_input("missing hole"))?
.try_into()?;
Ok(Self::RangeWithHoles {
range: start..end,
holes,
})
}
Some(RangeWithBitmap(pb_seg::RangeWithBitmap { start, end, bitmap })) => {
Ok(Self::RangeWithBitmap {
range: start..end,
bitmap: Bitmap {
data: bitmap,
len: (end - start) as usize,
},
})
}
Some(SortedArray(array)) => Ok(Self::SortedArray(EncodedU64Array::try_from(array)?)),
Some(Array(array)) => Ok(Self::Array(EncodedU64Array::try_from(array)?)),
// TODO: why non-exhaustive?
// Some(_) => Err(Error::invalid_input("unknown segment type")),
None => Err(Error::invalid_input("missing segment type")),
}
}
}
impl TryFrom<pb::EncodedU64Array> for EncodedU64Array {
type Error = Error;
fn try_from(pb: pb::EncodedU64Array) -> Result<Self> {
use pb::encoded_u64_array as pb_arr;
use pb::encoded_u64_array::Array::*;
match pb.array {
Some(U16Array(pb_arr::U16Array { base, offsets })) => {
assert!(
offsets.len() % 2 == 0,
"Must have even number of bytes to store u16 array"
);
let offsets = offsets
.chunks_exact(2)
.map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
.collect();
Ok(Self::U16 { base, offsets })
}
Some(U32Array(pb_arr::U32Array { base, offsets })) => {
assert!(
offsets.len() % 4 == 0,
"Must have even number of bytes to store u32 array"
);
let offsets = offsets
.chunks_exact(4)
.map(|chunk| u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
.collect();
Ok(Self::U32 { base, offsets })
}
Some(U64Array(pb_arr::U64Array { values })) => {
assert!(
values.len() % 8 == 0,
"Must have even number of bytes to store u64 array"
);
let values = values
.chunks_exact(8)
.map(|chunk| {
u64::from_le_bytes([
chunk[0], chunk[1], chunk[2], chunk[3], chunk[4], chunk[5], chunk[6],
chunk[7],
])
})
.collect();
Ok(Self::U64(values))
}
// TODO: shouldn't this enum be non-exhaustive?
// Some(_) => Err(Error::invalid_input("unknown array type")),
None => Err(Error::invalid_input("missing array type")),
}
}
}
impl From<RowIdSequence> for pb::RowIdSequence {
    /// Converts each segment to its protobuf form, preserving order.
    fn from(sequence: RowIdSequence) -> Self {
        let mut segments = Vec::new();
        for segment in sequence.0 {
            segments.push(pb::U64Segment::from(segment));
        }
        Self { segments }
    }
}
impl From<U64Segment> for pb::U64Segment {
fn from(segment: U64Segment) -> Self {
match segment {
U64Segment::Range(range) => Self {
segment: Some(pb::u64_segment::Segment::Range(pb::u64_segment::Range {
start: range.start,
end: range.end,
})),
},
U64Segment::RangeWithHoles { range, holes } => Self {
segment: Some(pb::u64_segment::Segment::RangeWithHoles(
pb::u64_segment::RangeWithHoles {
start: range.start,
end: range.end,
holes: Some(holes.into()),
},
)),
},
U64Segment::RangeWithBitmap { range, bitmap } => Self {
segment: Some(pb::u64_segment::Segment::RangeWithBitmap(
pb::u64_segment::RangeWithBitmap {
start: range.start,
end: range.end,
bitmap: bitmap.data,
},
)),
},
U64Segment::SortedArray(array) => Self {
segment: Some(pb::u64_segment::Segment::SortedArray(array.into())),
},
U64Segment::Array(array) => Self {
segment: Some(pb::u64_segment::Segment::Array(array.into())),
},
}
}
}
impl From<EncodedU64Array> for pb::EncodedU64Array {
    /// Encodes the array as a flat buffer of little-endian bytes.
    ///
    /// The fixed-size array returned by `to_le_bytes` is iterated by value, so
    /// no temporary `Vec` is allocated per element.
    fn from(array: EncodedU64Array) -> Self {
        match array {
            EncodedU64Array::U16 { base, offsets } => Self {
                array: Some(pb::encoded_u64_array::Array::U16Array(
                    pb::encoded_u64_array::U16Array {
                        base,
                        offsets: offsets
                            .iter()
                            .flat_map(|&offset| offset.to_le_bytes())
                            .collect(),
                    },
                )),
            },
            EncodedU64Array::U32 { base, offsets } => Self {
                array: Some(pb::encoded_u64_array::Array::U32Array(
                    pb::encoded_u64_array::U32Array {
                        base,
                        offsets: offsets
                            .iter()
                            .flat_map(|&offset| offset.to_le_bytes())
                            .collect(),
                    },
                )),
            },
            EncodedU64Array::U64(values) => Self {
                array: Some(pb::encoded_u64_array::Array::U64Array(
                    pb::encoded_u64_array::U64Array {
                        values: values
                            .iter()
                            .flat_map(|&value| value.to_le_bytes())
                            .collect(),
                    },
                )),
            },
        }
    }
}
/// Encode a row id sequence as protobuf bytes.
pub fn write_row_ids(sequence: &RowIdSequence) -> Vec<u8> {
    // The protobuf conversion consumes its input, so it works on a copy.
    let owned = sequence.clone();
    pb::RowIdSequence::from(owned).encode_to_vec()
}
/// Decode a row id sequence from protobuf bytes.
pub fn read_row_ids(reader: &[u8]) -> Result<RowIdSequence> {
    let decoded = pb::RowIdSequence::decode(reader)?;
    decoded.try_into()
}
#[cfg(test)]
mod test {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Round-trips a sequence containing one segment of every variant.
    #[test]
    fn test_write_read_row_ids() {
        let mut sequence = RowIdSequence::from(0..20);
        sequence.0.extend([
            U64Segment::Range(30..100),
            U64Segment::RangeWithHoles {
                range: 100..200,
                holes: EncodedU64Array::U64(vec![104, 108, 150]),
            },
            U64Segment::RangeWithBitmap {
                range: 200..300,
                bitmap: Bitmap::new_empty(100),
            },
            U64Segment::SortedArray(EncodedU64Array::U16 {
                base: 200,
                offsets: vec![1, 2, 3],
            }),
            U64Segment::Array(EncodedU64Array::U64(vec![1, 2, 3])),
        ]);

        let round_tripped = read_row_ids(&write_row_ids(&sequence)).unwrap();
        assert_eq!(sequence.0, round_tripped.0);
    }
}

713
vendor/lance-table/src/rowids/version.rs vendored Normal file
View file

@ -0,0 +1,713 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
//! Row version tracking for cross-version diff functionality
//!
//! This module provides data structures and functionality to track the latest
//! update version for each row in a Lance dataset, enabling efficient
//! cross-version diff operations.
use std::sync::Arc;
use deepsize::DeepSizeOf;
use lance_core::Error;
use lance_core::Result;
use prost::Message;
use serde::de::Deserializer;
use serde::ser::Serializer;
use serde::{Deserialize, Serialize};
use crate::format::{ExternalFile, Fragment, pb};
use crate::rowids::segment::U64Segment;
use crate::rowids::{RowIdSequence, read_row_ids};
/// A run of identical versions over a contiguous span of row positions.
///
/// Span is expressed as a U64Segment over row offsets (0..N within a fragment),
/// not over row IDs. This keeps the encoding aligned with RowIdSequence order
/// and enables zipped iteration without building a map.
#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
pub struct RowDatasetVersionRun {
    /// Row positions covered by this run; its length is the run length.
    pub span: U64Segment,
    /// Version shared by every row in the run.
    pub version: u64,
}
impl RowDatasetVersionRun {
    /// How many row positions this run spans.
    pub fn len(&self) -> usize {
        let Self { span, .. } = self;
        span.len()
    }

    /// True when the run's span contains no row positions.
    pub fn is_empty(&self) -> bool {
        let Self { span, .. } = self;
        span.is_empty()
    }

    /// The version shared by every row of this run.
    pub fn version(&self) -> u64 {
        let Self { version, .. } = self;
        *version
    }
}
/// Sequence of dataset versions
///
/// Stores version runs aligned to the positional order of RowIdSequence.
/// Provides sequential iterators and optional lightweight indexing for
/// efficient random access.
#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf, Default)]
pub struct RowDatasetVersionSequence {
    /// Runs in positional order; the global position of a row is the sum of
    /// the lengths of the preceding runs plus its position inside its own run.
    pub runs: Vec<RowDatasetVersionRun>,
}
impl RowDatasetVersionSequence {
    /// Build a sequence that tracks no rows.
    pub fn new() -> Self {
        Self { runs: vec![] }
    }

    /// Build a sequence in which all `row_count` rows carry `version`.
    ///
    /// A zero `row_count` produces a sequence with no runs.
    pub fn from_uniform_row_count(row_count: u64, version: u64) -> Self {
        if row_count > 0 {
            Self {
                runs: vec![RowDatasetVersionRun {
                    span: U64Segment::Range(0..row_count),
                    version,
                }],
            }
        } else {
            Self::new()
        }
    }

    /// Total number of rows covered by all runs.
    pub fn len(&self) -> u64 {
        self.runs
            .iter()
            .fold(0u64, |total, run| total + run.len() as u64)
    }

    /// True when no run covers any row (including when there are no runs).
    pub fn is_empty(&self) -> bool {
        // `all` is vacuously true for an empty run list.
        self.runs.iter().all(|run| run.is_empty())
    }

    /// Lazily iterate the per-row versions in positional order.
    pub fn versions(&self) -> VersionsIter<'_> {
        VersionsIter::new(self.runs.as_slice())
    }

    /// Version of the row at global position `index`, or `None` when `index`
    /// is past the last run.
    pub fn version_at(&self, index: usize) -> Option<u64> {
        // `covered` is the exclusive end position of the run being examined.
        let mut covered = 0usize;
        self.runs.iter().find_map(|run| {
            covered += run.len();
            (index < covered).then(|| run.version())
        })
    }

    /// Version of the row identified by `row_id`.
    ///
    /// The row's position is recovered by walking the segments of `row_ids`;
    /// that position is then resolved with `version_at`.
    pub fn get_version_for_row_id(&self, row_ids: &RowIdSequence, row_id: u64) -> Option<u64> {
        let mut preceding = 0usize;
        for segment in &row_ids.0 {
            let within_bounds = segment.range().is_some_and(|r| r.contains(&row_id));
            if within_bounds {
                if let Some(local) = segment.position(row_id) {
                    return self.version_at(preceding + local);
                }
            }
            preceding += segment.len();
        }
        None
    }

    /// Row ids whose version is strictly greater than `threshold`.
    pub fn rows_with_version_greater_than(
        &self,
        row_ids: &RowIdSequence,
        threshold: u64,
    ) -> Vec<u64> {
        row_ids
            .iter()
            .zip(self.versions())
            .filter(|(_, version)| *version > threshold)
            .map(|(row_id, _)| row_id)
            .collect()
    }

    /// Remove the rows at the given positional offsets (e.g., from a deletion vector).
    ///
    /// Positions are consumed in iteration order against the runs in order;
    /// runs left empty are dropped.
    pub fn mask(&mut self, positions: impl IntoIterator<Item = u32>) -> Result<()> {
        let mut pending = positions.into_iter().peekable();
        // Positions falling in the current run, relative to the run's start.
        let mut in_run: Vec<u32> = Vec::new();
        let mut run_start: usize = 0;
        for run in self.runs.iter_mut() {
            // Computed before masking, so it reflects the pre-mask run length.
            let run_end = run_start + run.span.len();
            while let Some(position) = pending.next_if(|p| (*p as usize) < run_end) {
                in_run.push(position - run_start as u32);
            }
            if !in_run.is_empty() {
                run.span.mask(in_run.as_slice());
                in_run.clear();
            }
            run_start = run_end;
        }
        self.runs.retain(|run| !run.span.is_empty());
        Ok(())
    }
}
/// Iterator over versions expanding runs lazily.
pub struct VersionsIter<'a> {
    /// All runs being iterated.
    runs: &'a [RowDatasetVersionRun],
    /// Index into `runs` of the run currently being expanded.
    run_idx: usize,
    /// Items still to be yielded from the current run.
    remaining_in_run: usize,
    /// Version yielded for each item of the current run.
    current_version: u64,
}
impl<'a> VersionsIter<'a> {
    /// Start iterating at the first run, if there is one.
    fn new(runs: &'a [RowDatasetVersionRun]) -> Self {
        let (remaining_in_run, current_version) = runs
            .first()
            .map_or((0, 0), |run| (run.len(), run.version()));
        Self {
            runs,
            run_idx: 0,
            remaining_in_run,
            current_version,
        }
    }

    /// Load length and version of the run at `run_idx`; past the end, only the
    /// remaining count is reset to zero.
    fn advance_run(&mut self) {
        match self.runs.get(self.run_idx) {
            Some(run) => {
                self.remaining_in_run = run.len();
                self.current_version = run.version();
            }
            None => self.remaining_in_run = 0,
        }
    }
}
impl<'a> Iterator for VersionsIter<'a> {
    type Item = u64;

    /// Yields the version of the next row position, or `None` once every run
    /// has been expanded.
    fn next(&mut self) -> Option<Self::Item> {
        // Skip forward until a run with rows left is found. This must loop:
        // a zero-length run in the middle of the sequence contributes no rows
        // and must not yield a version of its own.
        while self.remaining_in_run == 0 {
            let next_idx = self.run_idx + 1;
            if next_idx >= self.runs.len() {
                return None;
            }
            self.run_idx = next_idx;
            self.advance_run();
        }
        self.remaining_in_run -= 1;
        Some(self.current_version)
    }
}
/// Metadata about the location of dataset version sequence data.
/// Following the same pattern as RowIdMeta.
///
/// When stored inline, identical byte sequences are shared across fragments
/// via `Arc<[u8]>` to reduce manifest memory for large tables.
#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
pub enum RowDatasetVersionMeta {
    /// Small sequences stored inline in the fragment metadata
    /// (the bytes are a protobuf-encoded version sequence).
    Inline(Arc<[u8]>),
    /// Large sequences stored in external files
    External(ExternalFile),
}
// Custom Serialize: convert Arc<[u8]> to slice for transparent JSON output
impl Serialize for RowDatasetVersionMeta {
    /// Serializes through an untagged borrowed mirror so the output is a plain
    /// `{ inline: .. }` or `{ external: .. }` object.
    fn serialize<S: Serializer>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> {
        #[derive(Serialize)]
        #[serde(untagged)]
        enum Wire<'a> {
            Inline { inline: &'a [u8] },
            External { external: &'a ExternalFile },
        }

        let wire = match self {
            Self::Inline(bytes) => Wire::Inline {
                inline: &bytes[..],
            },
            Self::External(file) => Wire::External { external: file },
        };
        wire.serialize(serializer)
    }
}
// Custom Deserialize: read Vec<u8> and convert to Arc<[u8]>
impl<'de> Deserialize<'de> for RowDatasetVersionMeta {
    /// Deserializes through an untagged owned mirror, then moves inline bytes
    /// into an `Arc<[u8]>`.
    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<Self, D::Error> {
        #[derive(Deserialize)]
        #[serde(untagged)]
        enum Wire {
            Inline { inline: Vec<u8> },
            External { external: ExternalFile },
        }

        let meta = match Wire::deserialize(deserializer)? {
            Wire::Inline { inline } => Self::Inline(inline.into()),
            Wire::External { external } => Self::External(external),
        };
        Ok(meta)
    }
}
impl RowDatasetVersionMeta {
    /// Encode `sequence` and wrap the bytes as inline metadata.
    pub fn from_sequence(sequence: &RowDatasetVersionSequence) -> lance_core::Result<Self> {
        let encoded: Arc<[u8]> = write_dataset_versions(sequence).into();
        Ok(Self::Inline(encoded))
    }

    /// Build metadata that points at a byte range of an external file.
    pub fn from_external_file(path: String, offset: u64, size: u64) -> Self {
        let file = ExternalFile { path, offset, size };
        Self::External(file)
    }

    /// Decode the version sequence described by this metadata.
    ///
    /// # Panics
    ///
    /// Panics for `External` metadata: loading from an external file is not
    /// implemented.
    pub fn load_sequence(&self) -> lance_core::Result<RowDatasetVersionSequence> {
        if let Self::Inline(data) = self {
            return read_dataset_versions(data);
        }
        todo!("External file loading not yet implemented")
    }
}
/// Convert optional version metadata into the protobuf `last_updated_at`
/// representation; `None` maps to `None`.
pub fn last_updated_at_version_meta_to_pb(
    meta: &Option<RowDatasetVersionMeta>,
) -> Option<pb::data_fragment::LastUpdatedAtVersionSequence> {
    use pb::data_fragment::LastUpdatedAtVersionSequence as PbSequence;

    let encoded = match meta.as_ref()? {
        RowDatasetVersionMeta::Inline(data) => {
            PbSequence::InlineLastUpdatedAtVersions(data.to_vec())
        }
        RowDatasetVersionMeta::External(file) => {
            PbSequence::ExternalLastUpdatedAtVersions(pb::ExternalFile {
                path: file.path.clone(),
                offset: file.offset,
                size: file.size,
            })
        }
    };
    Some(encoded)
}
/// Convert optional version metadata into the protobuf `created_at`
/// representation; `None` maps to `None`.
pub fn created_at_version_meta_to_pb(
    meta: &Option<RowDatasetVersionMeta>,
) -> Option<pb::data_fragment::CreatedAtVersionSequence> {
    use pb::data_fragment::CreatedAtVersionSequence as PbSequence;

    let encoded = match meta.as_ref()? {
        RowDatasetVersionMeta::Inline(data) => PbSequence::InlineCreatedAtVersions(data.to_vec()),
        RowDatasetVersionMeta::External(file) => {
            PbSequence::ExternalCreatedAtVersions(pb::ExternalFile {
                path: file.path.clone(),
                offset: file.offset,
                size: file.size,
            })
        }
    };
    Some(encoded)
}
/// Encode a dataset version sequence as protobuf bytes (mirrors the
/// RowIdSequence serialization).
pub fn write_dataset_versions(sequence: &RowDatasetVersionSequence) -> Vec<u8> {
    let mut runs = Vec::with_capacity(sequence.runs.len());
    for run in &sequence.runs {
        // The segment conversion consumes its input, hence the clone.
        runs.push(pb::RowDatasetVersionRun {
            span: Some(pb::U64Segment::from(run.span.clone())),
            version: run.version,
        });
    }
    pb::RowDatasetVersionSequence { runs }.encode_to_vec()
}
/// Deserialize a dataset version sequence from bytes (following RowIdSequence pattern)
pub fn read_dataset_versions(data: &[u8]) -> lance_core::Result<RowDatasetVersionSequence> {
let pb_sequence = pb::RowDatasetVersionSequence::decode(data).map_err(|e| {
Error::internal(format!("Failed to decode RowDatasetVersionSequence: {}", e))
})?;
let segments = pb_sequence
.runs
.into_iter()
.map(|pb_run| {
let positions_pb = pb_run.span.ok_or_else(|| {
Error::internal("Missing positions in RowDatasetVersionRun".to_string())
})?;
let segment = U64Segment::try_from(positions_pb)?;
Ok(RowDatasetVersionRun {
span: segment,
version: pb_run.version,
})
})
.collect::<Result<Vec<_>>>()?;
Ok(RowDatasetVersionSequence { runs: segments })
}
/// Re-chunk a sequence of dataset version runs into new chunk sizes (aligned with RowIdSequence rechunking)
///
/// The runs of all input `sequences` are treated as one flat stream and cut
/// into consecutive output sequences of `chunk_sizes[i]` rows each. A run that
/// straddles a chunk boundary is sliced; both pieces keep the run's version.
///
/// # Errors
///
/// * Invalid input if the runs end before a chunk is filled and
///   `allow_incomplete` is false.
/// * Invalid input if runs are left over once every chunk has been processed.
pub fn rechunk_version_sequences(
    sequences: impl IntoIterator<Item = RowDatasetVersionSequence>,
    chunk_sizes: impl IntoIterator<Item = u64>,
    allow_incomplete: bool,
) -> Result<Vec<RowDatasetVersionSequence>> {
    let chunk_sizes_vec: Vec<u64> = chunk_sizes.into_iter().collect();
    let total_chunks = chunk_sizes_vec.len();
    let mut chunked_sequences: Vec<RowDatasetVersionSequence> = Vec::with_capacity(total_chunks);
    // Flatten all input sequences into one peekable stream of runs.
    let mut run_iter = sequences
        .into_iter()
        .flat_map(|sequence| sequence.runs.into_iter())
        .peekable();
    let too_few_segments_error = |chunk_index: usize, expected_chunk_size: u64, remaining: u64| {
        Error::invalid_input(format!(
            "Got too few version runs for chunk {}. Expected chunk size: {}, remaining needed: {}",
            chunk_index, expected_chunk_size, remaining
        ))
    };
    let too_many_segments_error = |processed_chunks: usize, total_chunk_sizes: usize| {
        Error::invalid_input(format!(
            "Got too many version runs for the provided chunk lengths. Processed {} chunks out of {} expected",
            processed_chunks, total_chunk_sizes
        ))
    };
    // Rows of the currently peeked run that earlier chunks already consumed.
    let mut segment_offset = 0_u64;
    for (chunk_index, chunk_size) in chunk_sizes_vec.iter().enumerate() {
        let chunk_size = *chunk_size;
        let mut out_seq = RowDatasetVersionSequence::new();
        // Rows still needed to fill the current chunk.
        let mut remaining = chunk_size;
        while remaining > 0 {
            // Unconsumed rows in the peeked run; 0 also when no run is left.
            let remaining_in_segment = run_iter
                .peek()
                .map_or(0, |run| run.span.len() as u64 - segment_offset);
            if remaining_in_segment == 0 {
                if run_iter.next().is_some() {
                    // The peeked run had nothing left: drop it and retry with the next one.
                    segment_offset = 0;
                    continue;
                } else if allow_incomplete {
                    // Out of runs: emit the partially filled chunk.
                    break;
                } else {
                    return Err(too_few_segments_error(chunk_index, chunk_size, remaining));
                }
            }
            match remaining_in_segment.cmp(&remaining) {
                std::cmp::Ordering::Greater => {
                    // The run outlasts the chunk: take a slice and keep the run
                    // peeked for the next chunk.
                    let run = run_iter.peek().unwrap();
                    let seg = run.span.slice(segment_offset as usize, remaining as usize);
                    out_seq.runs.push(RowDatasetVersionRun {
                        span: seg,
                        version: run.version,
                    });
                    segment_offset += remaining;
                    remaining = 0;
                }
                std::cmp::Ordering::Equal | std::cmp::Ordering::Less => {
                    // The rest of the run fits in the chunk: consume the run.
                    let run = run_iter.next().ok_or_else(|| {
                        too_few_segments_error(chunk_index, chunk_size, remaining)
                    })?;
                    let seg = run
                        .span
                        .slice(segment_offset as usize, remaining_in_segment as usize);
                    out_seq.runs.push(RowDatasetVersionRun {
                        span: seg,
                        version: run.version,
                    });
                    segment_offset = 0;
                    remaining -= remaining_in_segment;
                }
            }
        }
        chunked_sequences.push(out_seq);
    }
    // Any run still in the stream was not requested by `chunk_sizes`.
    if run_iter.peek().is_some() {
        return Err(too_many_segments_error(
            chunked_sequences.len(),
            total_chunks,
        ));
    }
    Ok(chunked_sequences)
}
/// Build uniform version metadata for a fragment that has physical rows.
///
/// Returns `None` when the fragment's physical row count is unknown or zero.
///
/// # Panics
///
/// Panics when the fragment has rows but no row id metadata.
pub fn build_version_meta(
    fragment: &Fragment,
    current_version: u64,
) -> Option<RowDatasetVersionMeta> {
    let physical_rows = fragment.physical_rows.filter(|&rows| rows > 0)?;

    // Sanity check for stable row IDs: row id metadata must be present.
    if fragment.row_id_meta.is_none() {
        panic!("Can not find row id meta, please make sure you have enabled stable row id.")
    }

    // `physical_rows` is the authoritative row count; this holds even for
    // compacted fragments where row_id_meta might have been partially copied.
    let uniform = RowDatasetVersionSequence::from_uniform_row_count(
        physical_rows as u64,
        current_version,
    );
    Some(RowDatasetVersionMeta::from_sequence(&uniform).unwrap())
}
/// Refresh row-level latest update version metadata for a full fragment rewrite-column update.
///
/// This sets a uniform version sequence for all rows in the fragment to `current_version`.
///
/// The row count is taken from `physical_rows` when present, otherwise from
/// the inline row id sequence. When no row count can be determined (including
/// external row id metadata) or it is zero, the fragment is left untouched.
///
/// # Errors
///
/// Returns an error if the inline row id metadata cannot be decoded.
pub fn refresh_row_latest_update_meta_for_full_frag_rewrite_cols(
    fragment: &mut Fragment,
    current_version: u64,
) -> Result<()> {
    let row_count = if let Some(pr) = fragment.physical_rows {
        pr as u64
    } else if let Some(row_id_meta) = fragment.row_id_meta.as_ref() {
        match row_id_meta {
            // Corrupt inline metadata is reported to the caller, not unwrapped.
            crate::format::RowIdMeta::Inline(data) => read_row_ids(data)?.len(),
            // Follow existing behavior: external sequence not yet supported here
            crate::format::RowIdMeta::External(_file) => 0,
        }
    } else {
        0
    };
    if row_count > 0 {
        let version_seq =
            RowDatasetVersionSequence::from_uniform_row_count(row_count, current_version);
        let version_meta = RowDatasetVersionMeta::from_sequence(&version_seq)?;
        fragment.last_updated_at_version_meta = Some(version_meta);
    }
    Ok(())
}
/// Refresh row-level latest update version metadata for a partial fragment rewrite-column update.
///
/// `updated_offsets` are local row offsets (within the fragment) that have been updated.
/// Existing version metadata is preserved and only the updated positions are set to `current_version`.
/// If no existing metadata is present, positions default to `prev_version`.
///
/// Offsets at or beyond the fragment's row count are ignored. A fragment whose
/// row count is zero or unknown is left untouched.
///
/// # Errors
///
/// Returns an error if the inline row id metadata cannot be decoded.
///
/// # Panics
///
/// Panics if the row count has to come from external row id metadata, which
/// is not implemented.
pub fn refresh_row_latest_update_meta_for_partial_frag_rewrite_cols(
    fragment: &mut Fragment,
    updated_offsets: &[usize],
    current_version: u64,
    prev_version: u64,
) -> Result<()> {
    // Determine row count for fragment
    let row_count_u64: u64 = if let Some(pr) = fragment.physical_rows {
        pr as u64
    } else if let Some(row_id_meta) = fragment.row_id_meta.as_ref() {
        match row_id_meta {
            // Corrupt inline metadata is reported to the caller, not unwrapped.
            crate::format::RowIdMeta::Inline(data) => read_row_ids(data)?.len(),
            crate::format::RowIdMeta::External(_file) => {
                // Preserve original behavior for external sequences
                todo!("External file loading not yet implemented")
            }
        }
    } else {
        0
    };
    if row_count_u64 > 0 {
        let row_count = row_count_u64 as usize;

        // Build base version vector from existing meta or previous dataset version.
        // Metadata that fails to load is treated like missing metadata.
        let existing = fragment
            .last_updated_at_version_meta
            .as_ref()
            .and_then(|meta| meta.load_sequence().ok());
        let mut base_versions: Vec<u64> = match existing {
            Some(base_seq) => (0..row_count)
                .map(|pos| base_seq.version_at(pos).unwrap_or(prev_version))
                .collect(),
            None => vec![prev_version; row_count],
        };

        // Apply updates to updated positions
        for &pos in updated_offsets {
            if let Some(slot) = base_versions.get_mut(pos) {
                *slot = current_version;
            }
        }

        // Compress into runs: each maximal stretch of equal versions becomes
        // one run spanning `start..idx` in row positions.
        let mut runs: Vec<RowDatasetVersionRun> = Vec::new();
        if !base_versions.is_empty() {
            let mut start = 0usize;
            let mut curr_ver = base_versions[0];
            for (idx, &ver) in base_versions.iter().enumerate().skip(1) {
                if ver != curr_ver {
                    runs.push(RowDatasetVersionRun {
                        span: U64Segment::Range(start as u64..idx as u64),
                        version: curr_ver,
                    });
                    start = idx;
                    curr_ver = ver;
                }
            }
            // Close the final stretch.
            runs.push(RowDatasetVersionRun {
                span: U64Segment::Range(start as u64..base_versions.len() as u64),
                version: curr_ver,
            });
        }
        let new_seq = RowDatasetVersionSequence { runs };
        let new_meta = RowDatasetVersionMeta::from_sequence(&new_seq)?;
        fragment.last_updated_at_version_meta = Some(new_meta);
    }
    Ok(())
}
// Protobuf conversion implementations
impl TryFrom<pb::data_fragment::LastUpdatedAtVersionSequence> for RowDatasetVersionMeta {
    type Error = Error;

    /// Maps the protobuf `last_updated_at` oneof onto the matching variant.
    fn try_from(value: pb::data_fragment::LastUpdatedAtVersionSequence) -> Result<Self> {
        use pb::data_fragment::LastUpdatedAtVersionSequence as PbSequence;

        let meta = match value {
            PbSequence::InlineLastUpdatedAtVersions(data) => Self::Inline(data.into()),
            PbSequence::ExternalLastUpdatedAtVersions(file) => {
                let pb::ExternalFile { path, offset, size } = file;
                Self::External(ExternalFile { path, offset, size })
            }
        };
        Ok(meta)
    }
}
impl TryFrom<pb::data_fragment::CreatedAtVersionSequence> for RowDatasetVersionMeta {
    type Error = Error;

    /// Maps the protobuf `created_at` oneof onto the matching variant.
    fn try_from(value: pb::data_fragment::CreatedAtVersionSequence) -> Result<Self> {
        use pb::data_fragment::CreatedAtVersionSequence as PbSequence;

        let meta = match value {
            PbSequence::InlineCreatedAtVersions(data) => Self::Inline(data.into()),
            PbSequence::ExternalCreatedAtVersions(file) => {
                let pb::ExternalFile { path, offset, size } = file;
                Self::External(ExternalFile { path, offset, size })
            }
        };
        Ok(meta)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a sequence from `(row_count, version)` pairs; every run's span
    /// is `0..row_count`.
    fn sequence_of(runs: &[(u64, u64)]) -> RowDatasetVersionSequence {
        RowDatasetVersionSequence {
            runs: runs
                .iter()
                .map(|&(rows, version)| RowDatasetVersionRun {
                    span: U64Segment::Range(0..rows),
                    version,
                })
                .collect(),
        }
    }

    #[test]
    fn test_version_random_access() {
        let seq = sequence_of(&[(3, 1), (2, 2), (1, 3)]);
        let expected = [
            (0, Some(1)),
            (2, Some(1)),
            (3, Some(2)),
            (4, Some(2)),
            (5, Some(3)),
            (6, None),
        ];
        for (position, version) in expected {
            assert_eq!(seq.version_at(position), version);
        }
    }

    #[test]
    fn test_serialization_round_trip() {
        let seq = sequence_of(&[(4, 42), (3, 99)]);
        let decoded = read_dataset_versions(&write_dataset_versions(&seq)).unwrap();
        assert_eq!(decoded.runs.len(), 2);
        assert_eq!(decoded.len(), 7);
        assert_eq!(decoded.version_at(0), Some(42));
        assert_eq!(decoded.version_at(5), Some(99));
    }

    #[test]
    fn test_get_version_for_row_id() {
        let seq = sequence_of(&[(2, 8), (2, 9)]);
        // Row ids 10, 11, 12, 13 occupy positions 0..4.
        let rows = RowIdSequence::from(10..14);
        let expected = [
            (10, Some(8)),
            (11, Some(8)),
            (12, Some(9)),
            (13, Some(9)),
            (99, None),
        ];
        for (row_id, version) in expected {
            assert_eq!(seq.get_version_for_row_id(&rows, row_id), version);
        }
    }
}

47
vendor/lance-table/src/utils.rs vendored Normal file
View file

@ -0,0 +1,47 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
pub mod stream;
/// Extension methods available on every iterator.
pub trait LanceIteratorExtension {
    /// Tag this iterator with the number of items it is known to yield, so
    /// that `size_hint` reports that number exactly.
    fn exact_size(self, size: usize) -> ExactSize<Self>
    where
        Self: Sized;
}

impl<I: Iterator> LanceIteratorExtension for I {
    fn exact_size(self, size: usize) -> ExactSize<Self>
    where
        Self: Sized,
    {
        ExactSize { inner: self, size }
    }
}

/// A iterator that is tagged with a known size. This is useful when we are
/// able to pre-compute the size of the iterator but the iterator implementation
/// isn't able to itself. A common example is when using `flatten()`.
///
/// This is inspired by discussion in <https://github.com/rust-lang/rust/issues/68995>
pub struct ExactSize<I> {
    /// The wrapped iterator.
    inner: I,
    /// Number of items still expected from `inner`.
    size: usize,
}

impl<I: Iterator> Iterator for ExactSize<I> {
    type Item = I::Item;

    fn next(&mut self) -> Option<Self::Item> {
        let item = self.inner.next()?;
        // Saturate rather than underflow if the caller-supplied size was too
        // small: an underflow would panic in debug builds and wrap in release
        // builds, making `size_hint` report `usize::MAX`.
        self.size = self.size.saturating_sub(1);
        Some(item)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.size, Some(self.size))
    }
}

806
vendor/lance-table/src/utils/stream.rs vendored Normal file
View file

@ -0,0 +1,806 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
use std::sync::Arc;
use arrow_array::{BooleanArray, RecordBatch, RecordBatchOptions, UInt64Array, make_array};
use arrow_buffer::NullBuffer;
use futures::{
FutureExt, Stream, StreamExt,
future::BoxFuture,
stream::{BoxStream, FuturesOrdered},
};
use lance_arrow::RecordBatchExt;
use lance_core::{
ROW_ADDR, ROW_ADDR_FIELD, ROW_CREATED_AT_VERSION_FIELD, ROW_ID, ROW_ID_FIELD,
ROW_LAST_UPDATED_AT_VERSION_FIELD, Result,
utils::{address::RowAddress, deletion::DeletionVector},
};
use lance_io::ReadBatchParams;
use tracing::instrument;
use crate::rowids::RowIdSequence;
/// A boxed future that resolves to one record batch (or an error).
pub type ReadBatchFut = BoxFuture<'static, Result<RecordBatch>>;
/// A task, emitted by a file reader, that will produce a batch (of the
/// given size)
pub struct ReadBatchTask {
    /// Future that produces the batch.
    pub task: ReadBatchFut,
    /// Number of rows the produced batch is expected to have.
    pub num_rows: u32,
}
/// A stream of batch tasks, i.e. futures paired with their row counts.
pub type ReadBatchTaskStream = BoxStream<'static, ReadBatchTask>;
/// A stream of bare batch futures.
pub type ReadBatchFutStream = BoxStream<'static, ReadBatchFut>;
/// Stream state for merging one task from each input stream into a single task.
struct MergeStream {
    /// The input streams, polled in order.
    streams: Vec<ReadBatchTaskStream>,
    /// Batch futures collected so far for the merged task being assembled.
    next_batch: FuturesOrdered<ReadBatchFut>,
    /// Row count of the task being assembled, taken from the first stream.
    next_num_rows: u32,
    /// Index of the input stream to poll next.
    index: usize,
}
impl MergeStream {
fn emit(&mut self) -> ReadBatchTask {
let mut iter = std::mem::take(&mut self.next_batch);
let task = async move {
let mut batch = iter.next().await.unwrap()?;
while let Some(next) = iter.next().await {
let next = next?;
batch = batch.merge(&next)?;
}
Ok(batch)
}
.boxed();
let num_rows = self.next_num_rows;
self.next_num_rows = 0;
ReadBatchTask { task, num_rows }
}
}
impl Stream for MergeStream {
    type Item = ReadBatchTask;

    /// Polls the input streams in order, collecting one task from each, and
    /// emits a merged task once every stream has contributed.
    ///
    /// The stream ends as soon as any input stream ends, and immediately when
    /// there are no input streams at all.
    fn poll_next(
        mut self: std::pin::Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
    ) -> std::task::Poll<Option<Self::Item>> {
        // Nothing to merge: end the stream rather than indexing an empty Vec.
        if self.streams.is_empty() {
            return std::task::Poll::Ready(None);
        }
        loop {
            let index = self.index;
            match self.streams[index].poll_next_unpin(cx) {
                std::task::Poll::Ready(Some(batch_task)) => {
                    if self.index == 0 {
                        // The first stream fixes the row count for this round.
                        self.next_num_rows = batch_task.num_rows;
                    } else {
                        debug_assert_eq!(self.next_num_rows, batch_task.num_rows);
                    }
                    self.next_batch.push_back(batch_task.task);
                    self.index += 1;
                    if self.index == self.streams.len() {
                        // Every stream contributed: emit and start a new round.
                        self.index = 0;
                        let next_batch = self.emit();
                        return std::task::Poll::Ready(Some(next_batch));
                    }
                }
                std::task::Poll::Ready(None) => {
                    return std::task::Poll::Ready(None);
                }
                std::task::Poll::Pending => {
                    // `index` is kept, so the next poll resumes at this stream.
                    return std::task::Poll::Pending;
                }
            }
        }
    }
}
/// Given multiple streams of batch tasks, merge them into a single stream
///
/// This pulls one batch from each stream and then combines the columns from
/// all of the batches into a single batch. The order of the batches in the
/// streams is maintained and the merged batch columns will be in order from
/// first to last stream.
///
/// This stream ends as soon as any of the input streams ends (we do not
/// verify that the other input streams are finished as well)
///
/// Every input stream is expected to return batches with the same number of
/// rows as the first stream. This is only checked with a debug assertion, so
/// a mismatch panics in debug builds and goes unchecked in release builds.
pub fn merge_streams(streams: Vec<ReadBatchTaskStream>) -> ReadBatchTaskStream {
    MergeStream {
        streams,
        next_batch: FuturesOrdered::new(),
        next_num_rows: 0,
        index: 0,
    }
    .boxed()
}
/// Apply a mask to the batch, where rows are "deleted" by the _rowid column null.
///
/// This is used partly as a performance optimization (cheaper to null than to filter)
/// but also because there are cases where we want to load the physical rows. For example,
/// we may be replacing a column based on some UDF and we want to provide a value for the
/// deleted rows to ensure the fragments are aligned.
fn apply_deletions_as_nulls(batch: RecordBatch, mask: &BooleanArray) -> Result<RecordBatch> {
// Transform mask into null buffer. Null means deleted, though note that
// null buffers are actually validity buffers, so True means not null
// and thus not deleted.
let mask_buffer = NullBuffer::new(mask.values().clone());
if mask_buffer.null_count() == 0 {
// No rows are deleted
return Ok(batch);
}
// For each column convert to data
let new_columns = batch
.schema()
.fields()
.iter()
.zip(batch.columns())
.map(|(field, col)| {
if field.name() == ROW_ID || field.name() == ROW_ADDR {
let col_data = col.to_data();
// If it already has a validity bitmap, then AND it with the mask.
// Otherwise, use the boolean buffer as the mask.
let null_buffer = NullBuffer::union(col_data.nulls(), Some(&mask_buffer));
Ok(col_data
.into_builder()
.null_bit_buffer(null_buffer.map(|b| b.buffer().clone()))
.build()
.map(make_array)?)
} else {
Ok(col.clone())
}
})
.collect::<Result<Vec<_>>>()?;
Ok(RecordBatch::try_new_with_options(
batch.schema(),
new_columns,
&RecordBatchOptions::new().with_row_count(Some(batch.num_rows())),
)?)
}
/// Extract version values for a batch selection by searching over
/// precomputed RLE run offsets. Single-run fragments (the common case)
/// take the O(1) fast path.
///
/// `params` is the selection for the whole read; it is sliced with
/// `batch_offset` / `num_rows` to obtain the positions covered by this
/// batch. One version is returned per selected position, in selection order.
///
/// # Errors
///
/// Returns an error if the selection cannot be sliced or converted to
/// ranges, or if a selected position lies beyond the rows covered by
/// `sequence`.
fn version_values_for_selection(
    sequence: &crate::rowids::version::RowDatasetVersionSequence,
    params: &ReadBatchParams,
    batch_offset: u32,
    num_rows: u32,
) -> Result<Vec<u64>> {
    // Propagate selection errors instead of panicking: the caller already
    // handles a `Result`.
    let selection = params
        .slice(batch_offset as usize, num_rows as usize)?
        .to_ranges()?;

    if sequence.runs.len() == 1 {
        return Ok(vec![sequence.runs[0].version(); num_rows as usize]);
    }

    // `run_starts[i]` is the position of the first row covered by run `i`;
    // `total_len` ends up as the number of rows covered by all runs.
    let mut total_len = 0usize;
    let run_starts: Vec<usize> = sequence
        .runs
        .iter()
        .map(|run| {
            let start = total_len;
            total_len += run.len();
            start
        })
        .collect();

    let mut versions = Vec::with_capacity(num_rows as usize);
    for range in &selection {
        for pos in range.start..range.end {
            let pos = pos as usize;
            if pos >= total_len {
                return Err(lance_core::Error::internal(format!(
                    "version column position {} out of range (total_len={})",
                    pos, total_len
                )));
            }
            // Pick the LAST run whose start is <= pos. Unlike `binary_search`,
            // this is deterministic when zero-length runs produce duplicate
            // start offsets, and never selects an empty run for `pos`.
            // `pos < total_len` guarantees at least one start (0) qualifies.
            let run_idx = run_starts.partition_point(|&start| start <= pos) - 1;
            versions.push(sequence.runs[run_idx].version());
        }
    }
    Ok(versions)
}
/// Configuration needed to apply row ids and deletions to a batch
///
/// One config describes a whole read of a fragment; it is applied to each
/// batch of that read together with the batch's offset into the selection.
#[derive(Debug)]
pub struct RowIdAndDeletesConfig {
/// The rows that were requested
///
/// This selection is sliced per batch (by batch offset and row count) to
/// recover the row offsets covered by that batch.
pub params: ReadBatchParams,
/// Whether to include the row id column in the final batch
pub with_row_id: bool,
/// Whether to include the row address column in the final batch
pub with_row_addr: bool,
/// Whether to include the last updated at version column in the final batch
pub with_row_last_updated_at_version: bool,
/// Whether to include the created at version column in the final batch
pub with_row_created_at_version: bool,
/// An optional deletion vector to apply to the batch
///
/// `Some(DeletionVector::NoDeletions)` is treated the same as `None`.
pub deletion_vector: Option<Arc<DeletionVector>>,
/// An optional row id sequence to use for the row id column.
///
/// When this is `None` the row ids are the row addresses.
pub row_id_sequence: Option<Arc<RowIdSequence>>,
/// The last_updated_at version sequence
///
/// When this is `None` and the column is requested, every row gets version 1.
pub last_updated_at_sequence: Option<Arc<crate::rowids::version::RowDatasetVersionSequence>>,
/// The created_at version sequence
///
/// When this is `None` and the column is requested, every row gets version 1.
pub created_at_sequence: Option<Arc<crate::rowids::version::RowDatasetVersionSequence>>,
/// Whether to make deleted rows null instead of filtering them out
///
/// Only the row id and row address columns are nulled; other columns keep
/// their values.
pub make_deletions_null: bool,
/// The total number of rows that will be loaded
///
/// This is needed to convert ReadBatchParams::RangeTo into a valid range
pub total_num_rows: u32,
}
impl RowIdAndDeletesConfig {
    /// Returns true when at least one of the generated columns (row id, row
    /// address, last-updated-at version, created-at version) was requested.
    fn has_system_cols(&self) -> bool {
        [
            self.with_row_id,
            self.with_row_addr,
            self.with_row_last_updated_at_version,
            self.with_row_created_at_version,
        ]
        .contains(&true)
    }
}
#[instrument(level = "debug", skip_all)]
pub fn apply_row_id_and_deletes(
batch: RecordBatch,
batch_offset: u32,
fragment_id: u32,
config: &RowIdAndDeletesConfig,
) -> Result<RecordBatch> {
let mut deletion_vector = config.deletion_vector.as_ref();
// Convert Some(NoDeletions) into None to simplify logic below
if let Some(deletion_vector_inner) = deletion_vector
&& matches!(deletion_vector_inner.as_ref(), DeletionVector::NoDeletions)
{
deletion_vector = None;
}
let has_deletions = deletion_vector.is_some();
debug_assert!(batch.num_columns() > 0 || config.has_system_cols() || has_deletions);
// If row id sequence is None, then row id IS row address.
let should_fetch_row_addr = config.with_row_addr
|| (config.with_row_id && config.row_id_sequence.is_none())
|| has_deletions;
let num_rows = batch.num_rows() as u32;
let row_addrs =
if should_fetch_row_addr {
let _rowaddrs = tracing::span!(tracing::Level::DEBUG, "fetch_row_addrs").entered();
let mut row_addrs = Vec::with_capacity(num_rows as usize);
for offset_range in config
.params
.slice(batch_offset as usize, num_rows as usize)
.unwrap()
.iter_offset_ranges()?
{
row_addrs.extend(offset_range.map(|row_offset| {
u64::from(RowAddress::new_from_parts(fragment_id, row_offset))
}));
}
Some(Arc::new(UInt64Array::from(row_addrs)))
} else {
None
};
let row_ids = if config.with_row_id {
let _rowids = tracing::span!(tracing::Level::DEBUG, "fetch_row_ids").entered();
if let Some(row_id_sequence) = &config.row_id_sequence {
let selection = config
.params
.slice(batch_offset as usize, num_rows as usize)
.unwrap()
.to_ranges()
.unwrap();
let row_ids = row_id_sequence
.select(
selection
.iter()
.flat_map(|r| r.start as usize..r.end as usize),
)
.collect::<UInt64Array>();
Some(Arc::new(row_ids))
} else {
// If we don't have a row id sequence, can assume the row ids are
// the same as the row addresses.
row_addrs.clone()
}
} else {
None
};
let span = tracing::span!(tracing::Level::DEBUG, "apply_deletions");
let _enter = span.enter();
let deletion_mask = deletion_vector.and_then(|v| {
let row_addrs: &[u64] = row_addrs.as_ref().unwrap().values();
v.build_predicate(row_addrs.iter())
});
let batch = if config.with_row_id {
let row_id_arr = row_ids.unwrap();
batch.try_with_column(ROW_ID_FIELD.clone(), row_id_arr)?
} else {
batch
};
let batch = if config.with_row_addr {
let row_addr_arr = row_addrs.unwrap();
batch.try_with_column(ROW_ADDR_FIELD.clone(), row_addr_arr)?
} else {
batch
};
// Add version columns if requested
let batch = if config.with_row_last_updated_at_version || config.with_row_created_at_version {
let mut batch = batch;
if config.with_row_last_updated_at_version {
let version_arr = if let Some(sequence) = &config.last_updated_at_sequence {
Arc::new(UInt64Array::from(version_values_for_selection(
sequence,
&config.params,
batch_offset,
num_rows,
)?))
} else {
// Default to version 1 if sequence not provided
Arc::new(UInt64Array::from(vec![1u64; num_rows as usize]))
};
batch =
batch.try_with_column(ROW_LAST_UPDATED_AT_VERSION_FIELD.clone(), version_arr)?;
}
if config.with_row_created_at_version {
let version_arr = if let Some(sequence) = &config.created_at_sequence {
Arc::new(UInt64Array::from(version_values_for_selection(
sequence,
&config.params,
batch_offset,
num_rows,
)?))
} else {
// Default to version 1 if sequence not provided
Arc::new(UInt64Array::from(vec![1u64; num_rows as usize]))
};
batch = batch.try_with_column(ROW_CREATED_AT_VERSION_FIELD.clone(), version_arr)?;
}
batch
} else {
batch
};
match (deletion_mask, config.make_deletions_null) {
(None, _) => Ok(batch),
(Some(mask), false) => Ok(arrow::compute::filter_record_batch(&batch, &mask)?),
(Some(mask), true) => Ok(apply_deletions_as_nulls(batch, &mask)?),
}
}
/// Given a stream of batch tasks this function will add a row ids column (if requested)
/// and also apply a deletions vector to the batch.
///
/// This converts from BatchTaskStream to BatchFutStream because, if we are applying a
/// deletion vector, it is impossible to know how many output rows we will have.
pub fn wrap_with_row_id_and_delete(
stream: ReadBatchTaskStream,
fragment_id: u32,
config: RowIdAndDeletesConfig,
) -> ReadBatchFutStream {
let config = Arc::new(config);
let mut offset = 0;
stream
.map(move |batch_task| {
let config = config.clone();
let this_offset = offset;
let num_rows = batch_task.num_rows;
offset += num_rows;
batch_task
.task
.map(move |batch| {
apply_row_id_and_deletes(batch?, this_offset, fragment_id, config.as_ref())
})
.boxed()
})
.boxed()
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow::{array::AsArray, datatypes::UInt64Type};
use arrow_array::{RecordBatch, UInt32Array, types::Int32Type};
use arrow_schema::ArrowError;
use futures::{FutureExt, StreamExt, TryStreamExt, stream::BoxStream};
use lance_core::{
ROW_ID,
utils::{address::RowAddress, deletion::DeletionVector},
};
use lance_datagen::{BatchCount, RowCount};
use lance_io::{ReadBatchParams, stream::arrow_stream_to_lance_stream};
use roaring::RoaringBitmap;
use crate::utils::stream::ReadBatchTask;
use super::RowIdAndDeletesConfig;
/// Adapt a datagen record batch stream into a stream of already-resolved
/// batch tasks, each announcing the row count of its batch.
///
/// Panics if the source stream yields an error.
fn batch_task_stream(
    datagen_stream: BoxStream<'static, std::result::Result<RecordBatch, ArrowError>>,
) -> super::ReadBatchTaskStream {
    let lance_stream = arrow_stream_to_lance_stream(datagen_stream);
    lance_stream
        .map(|batch| {
            let num_rows = batch.as_ref().unwrap().num_rows() as u32;
            let task = std::future::ready(batch).boxed();
            ReadBatchTask { task, num_rows }
        })
        .boxed()
}
#[tokio::test]
async fn test_basic_zip() {
let left = batch_task_stream(
lance_datagen::gen_batch()
.col("x", lance_datagen::array::step::<Int32Type>())
.into_reader_stream(RowCount::from(100), BatchCount::from(10))
.0,
);
let right = batch_task_stream(
lance_datagen::gen_batch()
.col("y", lance_datagen::array::step::<Int32Type>())
.into_reader_stream(RowCount::from(100), BatchCount::from(10))
.0,
);
let merged = super::merge_streams(vec![left, right])
.map(|batch_task| batch_task.task)
.buffered(1)
.try_collect::<Vec<_>>()
.await
.unwrap();
let expected = lance_datagen::gen_batch()
.col("x", lance_datagen::array::step::<Int32Type>())
.col("y", lance_datagen::array::step::<Int32Type>())
.into_reader_rows(RowCount::from(100), BatchCount::from(10))
.collect::<Result<Vec<_>, ArrowError>>()
.unwrap();
assert_eq!(merged, expected);
}
/// Reads 10 batches of 10 rows with `params` as the selection and checks
/// that the generated row ids are the row addresses built from the
/// `expected` row offsets, with and without a data column and for two
/// fragment ids.
async fn check_row_id(params: ReadBatchParams, expected: impl IntoIterator<Item = u32>) {
    let expected: Vec<u32> = expected.into_iter().collect();
    for has_columns in [false, true] {
        for fragment_id in [0, 10] {
            // 10 batches of 10 rows each (100 rows in total)
            let generator = if has_columns {
                lance_datagen::gen_batch().col("x", lance_datagen::array::rand::<Int32Type>())
            } else {
                lance_datagen::gen_batch()
            };
            let data = batch_task_stream(
                generator
                    .into_reader_stream(RowCount::from(10), BatchCount::from(10))
                    .0,
            );

            // No row id sequence: row ids are expected to equal row addresses.
            let config = RowIdAndDeletesConfig {
                params: params.clone(),
                with_row_id: true,
                with_row_addr: false,
                with_row_last_updated_at_version: false,
                with_row_created_at_version: false,
                deletion_vector: None,
                row_id_sequence: None,
                last_updated_at_sequence: None,
                created_at_sequence: None,
                make_deletions_null: false,
                total_num_rows: 100,
            };

            let batches = super::wrap_with_row_id_and_delete(data, fragment_id, config)
                .buffered(1)
                .try_collect::<Vec<_>>()
                .await
                .unwrap();

            let mut start = 0;
            for batch in batches {
                let actual_row_ids = batch[ROW_ID].as_primitive::<UInt64Type>().values().to_vec();
                let expected_row_ids: Vec<u64> = expected[start..start + 10]
                    .iter()
                    .map(|&row_offset| {
                        u64::from(RowAddress::new_from_parts(fragment_id, row_offset))
                    })
                    .collect();
                assert_eq!(actual_row_ids, expected_row_ids);
                start += batch.num_rows();
            }
        }
    }
}
/// Row ids must follow the selection for every kind of `ReadBatchParams`.
#[tokio::test]
async fn test_row_id() {
    // Full range: offsets 0..100 in order.
    check_row_id(ReadBatchParams::RangeFull, 0..100).await;

    // Explicit indices, here in descending order.
    let descending: Vec<u32> = (0..100).rev().collect();
    let descending_arr = UInt32Array::from(descending.clone());
    check_row_id(ReadBatchParams::Indices(descending_arr), descending).await;

    // Bounded and half-open ranges.
    check_row_id(ReadBatchParams::Range(1000..1100), 1000..1100).await;
    check_row_id(ReadBatchParams::RangeFrom(1000..), 1000..1100).await;
    check_row_id(ReadBatchParams::RangeTo(..1000), 0..100).await;
}
// Exercises `wrap_with_row_id_and_delete` over every combination of:
// deletion vector kind, data column present or not, row id requested or not,
// deleted rows filtered vs. nulled, and two fragment ids.
#[tokio::test]
async fn test_deletes() {
let no_deletes: Option<Arc<DeletionVector>> = None;
let no_deletes_2 = Some(Arc::new(DeletionVector::NoDeletions));
// Both deleting variants remove row offsets 0..35.
let delete_some_bitmap = Some(Arc::new(DeletionVector::Bitmap(RoaringBitmap::from_iter(
0..35,
))));
let delete_some_set = Some(Arc::new(DeletionVector::Set((0..35).collect())));
for deletion_vector in [
no_deletes,
no_deletes_2,
delete_some_bitmap,
delete_some_set,
] {
for has_columns in [false, true] {
for with_row_id in [false, true] {
for make_deletions_null in [false, true] {
for frag_id in [0, 1] {
// `Some(NoDeletions)` counts as having no deletions.
let has_deletions = if let Some(dv) = &deletion_vector {
!matches!(dv.as_ref(), DeletionVector::NoDeletions)
} else {
false
};
if !has_columns && !has_deletions && !with_row_id {
// This is an invalid case and should be prevented upstream,
// no meaningful work is being done!
continue;
}
if make_deletions_null && !with_row_id {
// This is an invalid case and should be prevented upstream
// we cannot make the row_id column null if it isn't present
continue;
}
let mut datagen = lance_datagen::gen_batch();
if has_columns {
datagen =
datagen.col("x", lance_datagen::array::rand::<Int32Type>());
}
// 100 rows across 10 batches of 10 rows
let data = batch_task_stream(
datagen
.into_reader_stream(RowCount::from(10), BatchCount::from(10))
.0,
);
let config = RowIdAndDeletesConfig {
params: ReadBatchParams::RangeFull,
with_row_id,
with_row_addr: false,
with_row_last_updated_at_version: false,
with_row_created_at_version: false,
deletion_vector: deletion_vector.clone(),
row_id_sequence: None,
last_updated_at_sequence: None,
created_at_sequence: None,
make_deletions_null,
total_num_rows: 100,
};
let stream = super::wrap_with_row_id_and_delete(data, frag_id, config);
// Collect the output, dropping zero-row batches while still
// surfacing any error (which fails the test via `unwrap`).
let batches = stream
.buffered(1)
.filter_map(|batch| {
std::future::ready(
batch
.map(|batch| {
if batch.num_rows() == 0 {
None
} else {
Some(batch)
}
})
.transpose(),
)
})
.try_collect::<Vec<_>>()
.await
.unwrap();
let total_num_rows =
batches.iter().map(|b| b.num_rows()).sum::<usize>();
let total_num_nulls = if make_deletions_null {
batches
.iter()
.map(|b| b[ROW_ID].null_count())
.sum::<usize>()
} else {
0
};
// A deleted row shows up either as a null row id (when deletions
// are nulled) or as a row missing from the 100 input rows (when
// deletions are filtered out).
let total_actually_deleted = total_num_nulls + (100 - total_num_rows);
let expected_deletions = match &deletion_vector {
None => 0,
Some(deletion_vector) => match deletion_vector.as_ref() {
DeletionVector::NoDeletions => 0,
DeletionVector::Bitmap(b) => b.len() as usize,
DeletionVector::Set(s) => s.len(),
},
};
assert_eq!(total_actually_deleted, expected_deletions);
if expected_deletions > 0 && with_row_id {
if make_deletions_null {
// If we make deletions null we get 3 batches of all-null and then
// a batch of half-null
assert_eq!(
batches[3][ROW_ID].as_primitive::<UInt64Type>().value(0),
u64::from(RowAddress::new_from_parts(frag_id, 30))
);
assert_eq!(batches[3][ROW_ID].null_count(), 5);
} else {
// If we materialize deletions the first row will be 35
assert_eq!(
batches[0][ROW_ID].as_primitive::<UInt64Type>().value(0),
u64::from(RowAddress::new_from_parts(frag_id, 35))
);
}
}
// The row id column must not be added unless it was requested.
if !with_row_id {
assert!(batches[0].column_by_name(ROW_ID).is_none());
}
}
}
}
}
}
}
#[tokio::test]
async fn test_version_column_with_deletions() {
use crate::rowids::segment::U64Segment;
use crate::rowids::version::{RowDatasetVersionRun, RowDatasetVersionSequence};
let seq = Arc::new(RowDatasetVersionSequence {
runs: vec![RowDatasetVersionRun {
span: U64Segment::Range(0..100),
version: 42,
}],
});
let data = batch_task_stream(
lance_datagen::gen_batch()
.col("x", lance_datagen::array::rand::<Int32Type>())
.into_reader_stream(RowCount::from(10), BatchCount::from(10))
.0,
);
let config = RowIdAndDeletesConfig {
params: ReadBatchParams::RangeFull,
with_row_id: true,
with_row_addr: false,
with_row_last_updated_at_version: false,
with_row_created_at_version: true,
deletion_vector: Some(Arc::new(DeletionVector::Bitmap(RoaringBitmap::from_iter(
0..35,
)))),
row_id_sequence: None,
last_updated_at_sequence: None,
created_at_sequence: Some(seq),
make_deletions_null: false,
total_num_rows: 100,
};
let stream = super::wrap_with_row_id_and_delete(data, 0, config);
let batches: Vec<_> = stream
.buffered(1)
.try_filter(|b| std::future::ready(b.num_rows() > 0))
.try_collect()
.await
.unwrap();
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
assert_eq!(total_rows, 65);
for batch in &batches {
let versions = batch
.column_by_name("_row_created_at_version")
.unwrap()
.as_primitive::<UInt64Type>()
.values();
assert!(versions.iter().all(|&v| v == 42));
}
}
#[tokio::test]
async fn test_version_column_multi_run() {
use crate::rowids::segment::U64Segment;
use crate::rowids::version::{RowDatasetVersionRun, RowDatasetVersionSequence};
// 3 runs: 0..40 v1, 40..70 v2, 70..100 v3
let seq = Arc::new(RowDatasetVersionSequence {
runs: vec![
RowDatasetVersionRun {
span: U64Segment::Range(0..40),
version: 1,
},
RowDatasetVersionRun {
span: U64Segment::Range(40..70),
version: 2,
},
RowDatasetVersionRun {
span: U64Segment::Range(70..100),
version: 3,
},
],
});
// Delete 0..20 and 60..80 (spans run boundary).
// Survivors: 20..40 (v1), 40..60 (v2), 80..100 (v3) = 60 rows
let mut deletions = RoaringBitmap::from_iter(0..20);
deletions.extend(60..80);
let data = batch_task_stream(
lance_datagen::gen_batch()
.col("x", lance_datagen::array::rand::<Int32Type>())
.into_reader_stream(RowCount::from(10), BatchCount::from(10))
.0,
);
let config = RowIdAndDeletesConfig {
params: ReadBatchParams::RangeFull,
with_row_id: true,
with_row_addr: false,
with_row_last_updated_at_version: false,
with_row_created_at_version: true,
deletion_vector: Some(Arc::new(DeletionVector::Bitmap(deletions))),
row_id_sequence: None,
last_updated_at_sequence: None,
created_at_sequence: Some(seq),
make_deletions_null: false,
total_num_rows: 100,
};
let stream = super::wrap_with_row_id_and_delete(data, 0, config);
let batches: Vec<_> = stream
.buffered(1)
.try_filter(|b| std::future::ready(b.num_rows() > 0))
.try_collect()
.await
.unwrap();
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
assert_eq!(total_rows, 60);
let all_versions: Vec<u64> = batches
.iter()
.flat_map(|b| {
b.column_by_name("_row_created_at_version")
.unwrap()
.as_primitive::<UInt64Type>()
.values()
.to_vec()
})
.collect();
assert!(all_versions[..20].iter().all(|&v| v == 1));
assert!(all_versions[20..40].iter().all(|&v| v == 2));
assert!(all_versions[40..60].iter().all(|&v| v == 3));
}
}