mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-07-03 02:51:04 +02:00
fix(deps): vendor lance-table 7.0.0 + lance#7480 so merge-updated tables survive filtered reads after deletes
iss-merge-rowid-overlap-corrupts-filtered-reads / lance#7444: an
update-style merge_insert over a merge-written fragment legally reuses the
updated rows' stable row ids (row-id-lineage spec: updates preserve
_rowid) while the superseded fragment keeps its full sequence plus a
deletion vector. A later delete leaves the overlapping id range sparsely
tiled, and lance-table 7.0.0's RowIdIndex::new asserted dense tiling —
failing every filtered read that builds the id→address map ("Wrong range"
debug assert; "all columns in a record batch must have the same length"
or a silently-wrong batch in release).
The upstream fix (lance#7480, merged 2026-07-01) landed hours AFTER
v8.0.0 was cut, so no release ≤ 8.0.0 carries it. Consume it now as a
vendored pin: vendor/lance-table is the pristine published 7.0.0 source
plus ONLY the #7480 rowids/index.rs hunk (drop the false tiling assert;
hard-error on the true invariant — one live id claimed by two fragments)
and upstream's regression unit test, wired via [patch.crates-io]. The fix
is read-side only, so already-written graphs become readable as-is — no
data repair.
Removal condition (see vendor/lance-table/README.omnigraph.md): drop the
vendor dir + patch entry at the first Lance bump whose lance-table ships
lance#7480 (9.0.0, or a backported 8.0.1). The surface guard
filtered_scan_tolerates_merge_update_row_id_overlap keeps that honest in
both directions.
Turns the previous commit's red tests green. Full workspace gate passes
(cargo test --workspace --locked --no-fail-fast, 68 suites).
This commit is contained in:
parent
3b564534a2
commit
b5c0c6238b
48 changed files with 22203 additions and 2 deletions
2
Cargo.lock
generated
2
Cargo.lock
generated
|
|
@ -4202,8 +4202,6 @@ dependencies = [
|
|||
[[package]]
|
||||
name = "lance-table"
|
||||
version = "7.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b16f1355904aea4ebb04ffc70c58c97901e10bde44452b4b021de4a1f329250d"
|
||||
dependencies = [
|
||||
"arrow",
|
||||
"arrow-array",
|
||||
|
|
|
|||
14
Cargo.toml
14
Cargo.toml
|
|
@ -1,5 +1,8 @@
|
|||
[workspace]
|
||||
resolver = "2"
|
||||
# The vendored patched crate is a [patch.crates-io] path source, not a
|
||||
# workspace member (see the patch section at the bottom of this file).
|
||||
exclude = ["vendor/lance-table"]
|
||||
members = [
|
||||
"crates/omnigraph-compiler",
|
||||
"crates/omnigraph",
|
||||
|
|
@ -86,3 +89,14 @@ opt-level = 2
|
|||
lto = "thin"
|
||||
codegen-units = 16
|
||||
strip = true
|
||||
|
||||
# Vendored lance-table 7.0.0 carrying ONLY the lance#7480 hunk (rowids/index.rs):
|
||||
# tolerate sparse overlapping stable-row-id chunks so filtered reads survive an
|
||||
# update-style merge_insert followed by a delete (lance#7444;
|
||||
# iss-merge-rowid-overlap-corrupts-filtered-reads). Pinned by
|
||||
# lance_surface_guards.rs::filtered_scan_tolerates_merge_update_row_id_overlap.
|
||||
# REMOVE vendor/lance-table + this patch at the first Lance bump whose
|
||||
# lance-table ships lance#7480 (9.0.0, or a backported 8.0.1). Details:
|
||||
# vendor/lance-table/README.omnigraph.md and docs/dev/lance.md.
|
||||
[patch.crates-io]
|
||||
lance-table = { path = "vendor/lance-table" }
|
||||
|
|
|
|||
6
vendor/lance-table/.cargo_vcs_info.json
vendored
Normal file
6
vendor/lance-table/.cargo_vcs_info.json
vendored
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"git": {
|
||||
"sha1": "a15ae30939b9242d74b00aed1fb83abf7d15bf7f"
|
||||
},
|
||||
"path_in_vcs": "rust/lance-table"
|
||||
}
|
||||
5741
vendor/lance-table/Cargo.lock
generated
vendored
Normal file
5741
vendor/lance-table/Cargo.lock
generated
vendored
Normal file
File diff suppressed because it is too large
Load diff
263
vendor/lance-table/Cargo.toml
vendored
Normal file
263
vendor/lance-table/Cargo.toml
vendored
Normal file
|
|
@ -0,0 +1,263 @@
|
|||
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
|
||||
#
|
||||
# When uploading crates to the registry Cargo will automatically
|
||||
# "normalize" Cargo.toml files for maximal compatibility
|
||||
# with all versions of Cargo and also rewrite `path` dependencies
|
||||
# to registry (e.g., crates.io) dependencies.
|
||||
#
|
||||
# If you are reading this file be aware that the original Cargo.toml
|
||||
# will likely look very different (and much more reasonable).
|
||||
# See Cargo.toml.orig for the original contents.
|
||||
|
||||
[package]
|
||||
edition = "2024"
|
||||
rust-version = "1.91.0"
|
||||
name = "lance-table"
|
||||
version = "7.0.0"
|
||||
authors = ["Lance Devs <dev@lance.org>"]
|
||||
build = "build.rs"
|
||||
autolib = false
|
||||
autobins = false
|
||||
autoexamples = false
|
||||
autotests = false
|
||||
autobenches = false
|
||||
description = "Utilities for the Lance table format"
|
||||
readme = "README.md"
|
||||
keywords = [
|
||||
"data-format",
|
||||
"data-science",
|
||||
"machine-learning",
|
||||
"apache-arrow",
|
||||
"data-analytics",
|
||||
]
|
||||
categories = [
|
||||
"database-implementations",
|
||||
"data-structures",
|
||||
"development-tools",
|
||||
"science",
|
||||
]
|
||||
license = "Apache-2.0"
|
||||
repository = "https://github.com/lance-format/lance"
|
||||
|
||||
[package.metadata.docs.rs]
|
||||
features = ["protoc"]
|
||||
|
||||
[features]
|
||||
dynamodb = [
|
||||
"dep:aws-sdk-dynamodb",
|
||||
"dep:aws-credential-types",
|
||||
"lance-io/aws",
|
||||
]
|
||||
protoc = ["dep:protobuf-src"]
|
||||
|
||||
[lib]
|
||||
name = "lance_table"
|
||||
path = "src/lib.rs"
|
||||
|
||||
[[bench]]
|
||||
name = "manifest_intern"
|
||||
path = "benches/manifest_intern.rs"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "row_id_index"
|
||||
path = "benches/row_id_index.rs"
|
||||
harness = false
|
||||
|
||||
[dependencies.arrow]
|
||||
version = "58.0.0"
|
||||
features = ["prettyprint"]
|
||||
|
||||
[dependencies.arrow-array]
|
||||
version = "58.0.0"
|
||||
|
||||
[dependencies.arrow-buffer]
|
||||
version = "58.0.0"
|
||||
|
||||
[dependencies.arrow-ipc]
|
||||
version = "58.0.0"
|
||||
features = ["zstd"]
|
||||
|
||||
[dependencies.arrow-schema]
|
||||
version = "58.0.0"
|
||||
|
||||
[dependencies.async-trait]
|
||||
version = "0.1"
|
||||
|
||||
[dependencies.aws-credential-types]
|
||||
version = "1.2.0"
|
||||
optional = true
|
||||
|
||||
[dependencies.aws-sdk-dynamodb]
|
||||
version = "1.38.0"
|
||||
features = [
|
||||
"default-https-client",
|
||||
"rt-tokio",
|
||||
]
|
||||
optional = true
|
||||
default-features = false
|
||||
|
||||
[dependencies.byteorder]
|
||||
version = "1.5"
|
||||
|
||||
[dependencies.bytes]
|
||||
version = "1.11.1"
|
||||
|
||||
[dependencies.chrono]
|
||||
version = "0.4.41"
|
||||
features = [
|
||||
"std",
|
||||
"now",
|
||||
"serde",
|
||||
]
|
||||
default-features = false
|
||||
|
||||
[dependencies.deepsize]
|
||||
version = "0.2.0"
|
||||
|
||||
[dependencies.futures]
|
||||
version = "0.3"
|
||||
|
||||
[dependencies.lance-arrow]
|
||||
version = "=7.0.0"
|
||||
|
||||
[dependencies.lance-core]
|
||||
version = "=7.0.0"
|
||||
|
||||
[dependencies.lance-file]
|
||||
version = "=7.0.0"
|
||||
|
||||
[dependencies.lance-io]
|
||||
version = "=7.0.0"
|
||||
default-features = false
|
||||
|
||||
[dependencies.log]
|
||||
version = "0.4"
|
||||
|
||||
[dependencies.object_store]
|
||||
version = "0.13.2"
|
||||
|
||||
[dependencies.prost]
|
||||
version = "0.14.1"
|
||||
|
||||
[dependencies.prost-types]
|
||||
version = "0.14.1"
|
||||
|
||||
[dependencies.rand]
|
||||
version = "0.9.1"
|
||||
features = ["small_rng"]
|
||||
|
||||
[dependencies.rangemap]
|
||||
version = "1.0"
|
||||
|
||||
[dependencies.roaring]
|
||||
version = "0.11"
|
||||
|
||||
[dependencies.semver]
|
||||
version = "1.0"
|
||||
|
||||
[dependencies.serde]
|
||||
version = "^1"
|
||||
|
||||
[dependencies.serde_json]
|
||||
version = "1"
|
||||
|
||||
[dependencies.snafu]
|
||||
version = "0.9"
|
||||
|
||||
[dependencies.tokio]
|
||||
version = "1.23"
|
||||
features = [
|
||||
"rt-multi-thread",
|
||||
"macros",
|
||||
"fs",
|
||||
"sync",
|
||||
]
|
||||
|
||||
[dependencies.tracing]
|
||||
version = "0.1"
|
||||
|
||||
[dependencies.url]
|
||||
version = "2.5.7"
|
||||
|
||||
[dependencies.uuid]
|
||||
version = "1.2"
|
||||
features = [
|
||||
"v4",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[dev-dependencies.arrow-schema]
|
||||
version = "58.0.0"
|
||||
|
||||
[dev-dependencies.criterion]
|
||||
version = "0.5"
|
||||
features = [
|
||||
"async",
|
||||
"async_tokio",
|
||||
"html_reports",
|
||||
]
|
||||
|
||||
[dev-dependencies.lance-datagen]
|
||||
version = "=7.0.0"
|
||||
|
||||
[dev-dependencies.pretty_assertions]
|
||||
version = "1.4.0"
|
||||
|
||||
[dev-dependencies.proptest]
|
||||
version = "1.3.1"
|
||||
|
||||
[dev-dependencies.rstest]
|
||||
version = "0.23.0"
|
||||
|
||||
[build-dependencies.prost-build]
|
||||
version = "0.14.1"
|
||||
|
||||
[build-dependencies.protobuf-src]
|
||||
version = "2.1"
|
||||
optional = true
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dev-dependencies.pprof]
|
||||
version = "0.14.0"
|
||||
features = [
|
||||
"flamegraph",
|
||||
"criterion",
|
||||
]
|
||||
|
||||
[lints.clippy]
|
||||
dbg_macro = "deny"
|
||||
disallowed_macros = "deny"
|
||||
fallible_impl_from = "deny"
|
||||
large_futures = "deny"
|
||||
manual_let_else = "deny"
|
||||
multiple-crate-versions = "allow"
|
||||
print_stderr = "deny"
|
||||
print_stdout = "deny"
|
||||
redundant_clone = "deny"
|
||||
redundant_pub_crate = "deny"
|
||||
single_range_in_vec_init = "allow"
|
||||
string_add = "deny"
|
||||
string_add_assign = "deny"
|
||||
string_lit_as_bytes = "deny"
|
||||
trait_duplication_in_bounds = "deny"
|
||||
use_self = "deny"
|
||||
|
||||
[lints.clippy.all]
|
||||
level = "deny"
|
||||
priority = -1
|
||||
|
||||
[lints.clippy.cargo]
|
||||
level = "deny"
|
||||
priority = -1
|
||||
|
||||
[lints.clippy.style]
|
||||
level = "deny"
|
||||
priority = -1
|
||||
|
||||
[lints.rust]
|
||||
unsafe_op_in_unsafe_fn = "allow"
|
||||
|
||||
[lints.rust.unexpected_cfgs]
|
||||
level = "warn"
|
||||
priority = 0
|
||||
check-cfg = ["cfg(coverage,coverage_nightly)"]
|
||||
80
vendor/lance-table/Cargo.toml.orig
generated
vendored
Normal file
80
vendor/lance-table/Cargo.toml.orig
generated
vendored
Normal file
|
|
@ -0,0 +1,80 @@
|
|||
[package]
|
||||
name = "lance-table"
|
||||
version.workspace = true
|
||||
edition.workspace = true
|
||||
authors.workspace = true
|
||||
license.workspace = true
|
||||
repository.workspace = true
|
||||
readme = "README.md"
|
||||
description = "Utilities for the Lance table format"
|
||||
keywords.workspace = true
|
||||
categories.workspace = true
|
||||
rust-version.workspace = true
|
||||
|
||||
[dependencies]
|
||||
lance-arrow.workspace = true
|
||||
lance-core.workspace = true
|
||||
lance-file.workspace = true
|
||||
lance-io.workspace = true
|
||||
arrow.workspace = true
|
||||
arrow-array.workspace = true
|
||||
arrow-buffer.workspace = true
|
||||
arrow-ipc.workspace = true
|
||||
arrow-schema.workspace = true
|
||||
async-trait.workspace = true
|
||||
aws-credential-types = { workspace = true, optional = true }
|
||||
aws-sdk-dynamodb = { workspace = true, optional = true, default-features = false, features = ["default-https-client", "rt-tokio"] }
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
chrono.workspace = true
|
||||
deepsize.workspace = true
|
||||
futures.workspace = true
|
||||
log.workspace = true
|
||||
object_store.workspace = true
|
||||
prost.workspace = true
|
||||
prost-types.workspace = true
|
||||
rand.workspace = true
|
||||
rangemap.workspace = true
|
||||
roaring.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
semver.workspace = true
|
||||
snafu.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
lance-datagen.workspace = true
|
||||
arrow-schema.workspace = true
|
||||
criterion.workspace = true
|
||||
pretty_assertions.workspace = true
|
||||
proptest.workspace = true
|
||||
rstest.workspace = true
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dev-dependencies]
|
||||
pprof = { workspace = true }
|
||||
|
||||
[build-dependencies]
|
||||
prost-build.workspace = true
|
||||
protobuf-src = { version = "2.1", optional = true }
|
||||
|
||||
[features]
|
||||
dynamodb = ["dep:aws-sdk-dynamodb", "dep:aws-credential-types", "lance-io/aws"]
|
||||
protoc = ["dep:protobuf-src"]
|
||||
|
||||
[package.metadata.docs.rs]
|
||||
# docs.rs uses an older version of Ubuntu that does not have the necessary protoc version
|
||||
features = ["protoc"]
|
||||
|
||||
[[bench]]
|
||||
name = "row_id_index"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "manifest_intern"
|
||||
harness = false
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
6
vendor/lance-table/README.md
vendored
Normal file
6
vendor/lance-table/README.md
vendored
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
# lance-table
|
||||
|
||||
`lance-table` is an internal sub-crate for the
|
||||
[Lance table format](https://lance.org/format/table/).
|
||||
|
||||
**Important Note**: This crate is **not intended for external usage**.
|
||||
42
vendor/lance-table/README.omnigraph.md
vendored
Normal file
42
vendor/lance-table/README.omnigraph.md
vendored
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
# Vendored `lance-table` 7.0.0 + lance#7480 (omnigraph patch pin)
|
||||
|
||||
This directory is the **pristine `lance-table` 7.0.0 crates.io source** (unpacked
|
||||
from the published `.crate`) carrying exactly one upstream fix, cherry-picked
|
||||
from [lance-format/lance#7480](https://github.com/lance-format/lance/pull/7480)
|
||||
(merged to Lance main 2026-07-01; not included in any release ≤ 8.0.0):
|
||||
|
||||
- `src/rowids/index.rs` — `RowIdIndex::new` no longer asserts that overlapping
|
||||
row-id chunks densely tile their range (an update-style `merge_insert`
|
||||
legally reuses the updated rows' stable ids in new fragments while the
|
||||
superseded fragment keeps its full sequence + a deletion vector; a later
|
||||
delete leaves the union short of the span). The real invariant — the same
|
||||
live id claimed by two fragments — is now a hard error in
|
||||
`merge_overlapping_chunks` instead. Upstream's regression unit test is
|
||||
included.
|
||||
|
||||
Without the fix, any filtered read that builds the row-id index on such a
|
||||
table fails: `rowids/index.rs:50` "Wrong range" debug assert; "all columns in
|
||||
a record batch must have the same length" (or a silently-wrong batch) in
|
||||
release. Bug: [lance#7444](https://github.com/lance-format/lance/issues/7444),
|
||||
tracked as `iss-merge-rowid-overlap-corrupts-filtered-reads` /
|
||||
`blk-lance-7444` on the dev graph.
|
||||
|
||||
Wired up via `[patch.crates-io] lance-table = { path = "vendor/lance-table" }`
|
||||
in the workspace root `Cargo.toml`.
|
||||
|
||||
## Removal condition
|
||||
|
||||
Delete this directory and the `[patch.crates-io]` entry at the **first Lance
|
||||
bump whose `lance-table` ships lance#7480** — 9.0.0, or a backported 8.0.1 if
|
||||
upstream cuts one. The runtime guard
|
||||
`crates/omnigraph/tests/lance_surface_guards.rs::filtered_scan_tolerates_merge_update_row_id_overlap`
|
||||
pins the fixed behavior: it goes red if the patch is dropped too early or a
|
||||
future bump regresses the fix.
|
||||
|
||||
## Verifying the delta
|
||||
|
||||
```bash
|
||||
# The full diff vs the published crate should be ONLY the #7480 hunk + this README:
|
||||
tar -xzf ~/.cargo/registry/cache/index.crates.io-*/lance-table-7.0.0.crate -C /tmp
|
||||
diff -ru /tmp/lance-table-7.0.0 vendor/lance-table
|
||||
```
|
||||
261
vendor/lance-table/benches/manifest_intern.rs
vendored
Normal file
261
vendor/lance-table/benches/manifest_intern.rs
vendored
Normal file
|
|
@ -0,0 +1,261 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
// Benchmarks use eprintln! to report memory stats alongside criterion output.
|
||||
#![allow(clippy::print_stderr)]
|
||||
|
||||
//! Benchmark for manifest fragment interning.
|
||||
//!
|
||||
//! Measures memory savings and deserialization throughput when interning
|
||||
//! `DataFile.fields`, `DataFile.column_indices`, and
|
||||
//! `RowDatasetVersionMeta::Inline` bytes across many fragments.
|
||||
|
||||
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
|
||||
use deepsize::DeepSizeOf;
|
||||
use prost::Message;
|
||||
|
||||
use lance_table::format::pb;
|
||||
use lance_table::format::{DataFileFieldInterner, Fragment};
|
||||
|
||||
/// Number of fragments each benchmark simulates.
///
/// Taken from the `BENCH_NUM_FRAGMENTS` environment variable; falls back to
/// 100,000 when the variable cannot be read (unset or not valid Unicode).
///
/// # Panics
///
/// Panics with a descriptive message when the variable is set but does not
/// parse as a `u64`.
fn num_fragments() -> u64 {
    match std::env::var("BENCH_NUM_FRAGMENTS") {
        // Trim so that values such as " 500 " coming from shell scripts are accepted.
        Ok(raw) => raw.trim().parse::<u64>().unwrap_or_else(|err| {
            panic!("BENCH_NUM_FRAGMENTS must be an unsigned integer, got {raw:?}: {err}")
        }),
        Err(_) => 100_000,
    }
}
|
||||
|
||||
/// Build a vector of protobuf DataFragment messages that simulate a
|
||||
/// homogeneous, post-compaction table: every fragment has the same field
|
||||
/// list, column indices, and version metadata bytes.
|
||||
fn make_uniform_pb_fragments(n: u64, num_fields: usize) -> Vec<pb::DataFragment> {
|
||||
let fields: Vec<i32> = (0..num_fields as i32).collect();
|
||||
let column_indices: Vec<i32> = (0..num_fields as i32).collect();
|
||||
|
||||
// Simulate version metadata: a small protobuf-encoded payload
|
||||
// (identical across all fragments post-compaction)
|
||||
let version_bytes: Vec<u8> = {
|
||||
let seq = pb::RowDatasetVersionSequence {
|
||||
runs: vec![pb::RowDatasetVersionRun {
|
||||
span: Some(pb::U64Segment {
|
||||
segment: Some(pb::u64_segment::Segment::Range(pb::u64_segment::Range {
|
||||
start: 0,
|
||||
end: 1000,
|
||||
})),
|
||||
}),
|
||||
version: 42,
|
||||
}],
|
||||
};
|
||||
seq.encode_to_vec()
|
||||
};
|
||||
|
||||
(0..n)
|
||||
.map(|i| pb::DataFragment {
|
||||
id: i,
|
||||
files: vec![pb::DataFile {
|
||||
path: format!("data/{i}.lance"),
|
||||
fields: fields.clone(),
|
||||
column_indices: column_indices.clone(),
|
||||
file_major_version: 2,
|
||||
file_minor_version: 0,
|
||||
file_size_bytes: 0,
|
||||
base_id: None,
|
||||
}],
|
||||
deletion_file: None,
|
||||
row_id_sequence: None,
|
||||
physical_rows: 1000,
|
||||
last_updated_at_version_sequence: Some(
|
||||
pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(
|
||||
version_bytes.clone(),
|
||||
),
|
||||
),
|
||||
created_at_version_sequence: Some(
|
||||
pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(
|
||||
version_bytes.clone(),
|
||||
),
|
||||
),
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Deserialize protobuf fragments WITHOUT interning (baseline).
|
||||
fn deserialize_without_interning(protos: &[pb::DataFragment]) -> Vec<Fragment> {
|
||||
protos
|
||||
.iter()
|
||||
.map(|p| Fragment::try_from(p.clone()).unwrap())
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Deserialize protobuf fragments WITH interning.
|
||||
fn deserialize_with_interning(protos: &[pb::DataFragment]) -> Vec<Fragment> {
|
||||
let mut interner = DataFileFieldInterner::default();
|
||||
protos
|
||||
.iter()
|
||||
.map(|p| interner.intern_fragment(p.clone()).unwrap())
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Build fragments where each group shares the same version metadata,
|
||||
/// simulating many small appends without compaction.
|
||||
fn make_diverse_pb_fragments(
|
||||
n: u64,
|
||||
num_fields: usize,
|
||||
unique_versions: u64,
|
||||
) -> Vec<pb::DataFragment> {
|
||||
let fields: Vec<i32> = (0..num_fields as i32).collect();
|
||||
let column_indices: Vec<i32> = (0..num_fields as i32).collect();
|
||||
let group_size = n / unique_versions;
|
||||
|
||||
let version_payloads: Vec<Vec<u8>> = (0..unique_versions)
|
||||
.map(|v| {
|
||||
let seq = pb::RowDatasetVersionSequence {
|
||||
runs: vec![pb::RowDatasetVersionRun {
|
||||
span: Some(pb::U64Segment {
|
||||
segment: Some(pb::u64_segment::Segment::Range(pb::u64_segment::Range {
|
||||
start: 0,
|
||||
end: 1000,
|
||||
})),
|
||||
}),
|
||||
version: v,
|
||||
}],
|
||||
};
|
||||
seq.encode_to_vec()
|
||||
})
|
||||
.collect();
|
||||
|
||||
(0..n)
|
||||
.map(|i| {
|
||||
let version_idx = (i / group_size).min(unique_versions - 1) as usize;
|
||||
pb::DataFragment {
|
||||
id: i,
|
||||
files: vec![pb::DataFile {
|
||||
path: format!("data/{i}.lance"),
|
||||
fields: fields.clone(),
|
||||
column_indices: column_indices.clone(),
|
||||
file_major_version: 2,
|
||||
file_minor_version: 0,
|
||||
file_size_bytes: 0,
|
||||
base_id: None,
|
||||
}],
|
||||
deletion_file: None,
|
||||
row_id_sequence: None,
|
||||
physical_rows: 1000,
|
||||
last_updated_at_version_sequence: Some(
|
||||
pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(
|
||||
version_payloads[version_idx].clone(),
|
||||
),
|
||||
),
|
||||
created_at_version_sequence: Some(
|
||||
pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(
|
||||
version_payloads[version_idx].clone(),
|
||||
),
|
||||
),
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn bench_deserialization(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("manifest_intern");
|
||||
let n = num_fragments();
|
||||
|
||||
for num_fields in [10, 50] {
|
||||
let protos = make_uniform_pb_fragments(n, num_fields);
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("deserialize_no_intern", num_fields),
|
||||
&num_fields,
|
||||
|b, _| {
|
||||
b.iter(|| deserialize_without_interning(&protos));
|
||||
},
|
||||
);
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("deserialize_with_intern", num_fields),
|
||||
&num_fields,
|
||||
|b, _| {
|
||||
b.iter(|| deserialize_with_interning(&protos));
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Benchmark with many unique version payloads
|
||||
for unique_versions in [10, 100, 500] {
|
||||
let protos = make_diverse_pb_fragments(n, 10, unique_versions);
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("deserialize_no_intern_diverse", unique_versions),
|
||||
&unique_versions,
|
||||
|b, _| {
|
||||
b.iter(|| deserialize_without_interning(&protos));
|
||||
},
|
||||
);
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("deserialize_with_intern_diverse", unique_versions),
|
||||
&unique_versions,
|
||||
|b, _| {
|
||||
b.iter(|| deserialize_with_interning(&protos));
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_memory(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("manifest_memory");
|
||||
let n = num_fragments();
|
||||
|
||||
for num_fields in [10, 50] {
|
||||
let protos = make_uniform_pb_fragments(n, num_fields);
|
||||
|
||||
let no_intern = deserialize_without_interning(&protos);
|
||||
let with_intern = deserialize_with_interning(&protos);
|
||||
|
||||
let size_no_intern = no_intern.deep_size_of();
|
||||
let size_with_intern = with_intern.deep_size_of();
|
||||
|
||||
eprintln!(
|
||||
"\n[{} fragments, {} fields] Memory without interning: {:.2} MB",
|
||||
n,
|
||||
num_fields,
|
||||
size_no_intern as f64 / 1_048_576.0
|
||||
);
|
||||
eprintln!(
|
||||
"[{} fragments, {} fields] Memory with interning: {:.2} MB",
|
||||
n,
|
||||
num_fields,
|
||||
size_with_intern as f64 / 1_048_576.0
|
||||
);
|
||||
eprintln!(
|
||||
"[{} fragments, {} fields] Savings: {:.2} MB ({:.1}%)",
|
||||
n,
|
||||
num_fields,
|
||||
(size_no_intern - size_with_intern) as f64 / 1_048_576.0,
|
||||
(1.0 - size_with_intern as f64 / size_no_intern as f64) * 100.0
|
||||
);
|
||||
|
||||
// Benchmark deep_size_of measurement itself (sanity check)
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("deep_size_of_interned", num_fields),
|
||||
&num_fields,
|
||||
|b, _| {
|
||||
b.iter(|| with_intern.deep_size_of());
|
||||
},
|
||||
);
|
||||
|
||||
drop(no_intern);
|
||||
drop(with_intern);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
criterion_group!(
|
||||
name = benches;
|
||||
config = Criterion::default().with_profiler(pprof::criterion::PProfProfiler::new(100, pprof::criterion::Output::Flamegraph(None)));
|
||||
targets = bench_deserialization, bench_memory
|
||||
);
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
criterion_group!(benches, bench_deserialization, bench_memory);
|
||||
criterion_main!(benches);
|
||||
323
vendor/lance-table/benches/row_id_index.rs
vendored
Normal file
323
vendor/lance-table/benches/row_id_index.rs
vendored
Normal file
|
|
@ -0,0 +1,323 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
// TODO:
|
||||
// - [x] Create base cases with HashMap
|
||||
// - [x] Create on-disk size measurement
|
||||
// - [x] Create different cases for the index. Ideal, 25% deletions, 80% deletions + compaction.
|
||||
// - [ ] Create a benchmark for the get method
|
||||
// - [x] Average over all valid values
|
||||
// - [ ] Time to get a value that is not in the index
|
||||
// - [ ] Create a benchmark for the new method (building the in-memory index)
|
||||
// Optional:
|
||||
// - [ ] Create in-memory size measurement (if possible)
|
||||
|
||||
// Questions:
|
||||
// How can I write out the file? Where should I put it?
|
||||
// How can I take an argument to set the size of the index?
|
||||
|
||||
use std::{collections::HashMap, io::Write, ops::Range, sync::Arc};
|
||||
|
||||
use arrow_array::{RecordBatch, UInt64Array};
|
||||
use arrow_schema::{DataType, Field, Schema};
|
||||
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
|
||||
|
||||
use lance_core::utils::address::RowAddress;
|
||||
use lance_core::utils::deletion::DeletionVector;
|
||||
use lance_io::ReadBatchParams;
|
||||
use lance_table::rowids::FragmentRowIdIndex;
|
||||
use lance_table::{
|
||||
rowids::{RowIdIndex, RowIdSequence, write_row_ids},
|
||||
utils::stream::{RowIdAndDeletesConfig, apply_row_id_and_deletes},
|
||||
};
|
||||
|
||||
fn make_sequence(row_id_range: Range<u64>, deletions: usize) -> RowIdSequence {
|
||||
let mut sequence = RowIdSequence::from(row_id_range);
|
||||
|
||||
// Delete every other row
|
||||
let delete_ids = sequence
|
||||
.iter()
|
||||
.step_by(2)
|
||||
.take(deletions)
|
||||
.collect::<Vec<_>>();
|
||||
sequence.delete(delete_ids);
|
||||
|
||||
sequence
|
||||
}
|
||||
|
||||
fn make_frag_sequences(
|
||||
num_rows: u64,
|
||||
num_frags: u64,
|
||||
percent_deletion: f32,
|
||||
) -> Vec<(u32, Arc<RowIdSequence>)> {
|
||||
let rows_per_frag = num_rows / num_frags;
|
||||
let mut start = 0;
|
||||
(0..num_frags)
|
||||
.map(|i| {
|
||||
let sequence = make_sequence(
|
||||
start..(start + rows_per_frag),
|
||||
(rows_per_frag as f32 * percent_deletion) as usize,
|
||||
);
|
||||
start += rows_per_frag;
|
||||
(i as u32, Arc::new(sequence))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
// For range of values
|
||||
// https://bheisler.github.io/criterion.rs/book/user_guide/benchmarking_with_inputs.html
|
||||
|
||||
/// Total number of rows the benchmarks operate on.
///
/// Taken from the `BENCH_NUM_ROWS` environment variable; falls back to
/// 1,000,000 when the variable cannot be read (unset or not valid Unicode).
///
/// # Panics
///
/// Panics with a descriptive message when the variable is set but does not
/// parse as a `u64`.
fn num_rows() -> u64 {
    match std::env::var("BENCH_NUM_ROWS") {
        // Trim so that values such as " 500 " coming from shell scripts are accepted.
        Ok(raw) => raw.trim().parse::<u64>().unwrap_or_else(|err| {
            panic!("BENCH_NUM_ROWS must be an unsigned integer, got {raw:?}: {err}")
        }),
        Err(_) => 1_000_000,
    }
}
|
||||
|
||||
/// One row of the size report: which structure was measured, at what deletion
/// ratio, and the size that was recorded for it.
struct SizeStats {
    structure: String,
    percent_deletions: f32,
    size: u64,
}

/// Optional CSV sink for [`SizeStats`] rows.
///
/// Reporting is enabled only when `BENCH_SIZE_STATS_FILE` names a path;
/// otherwise `file` is `None` and every write is a no-op.
struct SizeStatsFile {
    file: Option<std::fs::File>,
}

impl SizeStatsFile {
    /// Create (truncating) the file named by `BENCH_SIZE_STATS_FILE` and write
    /// the CSV header, or return a disabled sink when the variable cannot be read.
    ///
    /// # Panics
    ///
    /// Panics, naming the offending path, if the file cannot be created or the
    /// header cannot be written.
    fn new() -> Self {
        let Ok(path) = std::env::var("BENCH_SIZE_STATS_FILE") else {
            return Self { file: None };
        };

        let mut file = std::fs::File::create(&path).unwrap_or_else(|err| {
            panic!("failed to create BENCH_SIZE_STATS_FILE {path:?}: {err}")
        });
        // Header row
        writeln!(file, "structure,percent_deletions,size").unwrap_or_else(|err| {
            panic!("failed to write header to BENCH_SIZE_STATS_FILE {path:?}: {err}")
        });
        Self { file: Some(file) }
    }

    /// Append one CSV row; does nothing when reporting is disabled.
    ///
    /// # Panics
    ///
    /// Panics if the row cannot be written.
    fn write_row(&mut self, stats: SizeStats) {
        let Some(file) = &mut self.file else {
            return;
        };
        writeln!(
            file,
            "\"{}\",{},{}",
            stats.structure, stats.percent_deletions, stats.size
        )
        .expect("failed to write a row to the size stats file");
    }
}
|
||||
|
||||
fn bench_creation(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("row_id_index_creation");
|
||||
let mut stats_file = SizeStatsFile::new();
|
||||
|
||||
for percent_deletions in [0.0, 0.25, 0.5] {
|
||||
let sequences = make_frag_sequences(num_rows(), 100, percent_deletions);
|
||||
|
||||
let fragment_indices: Vec<FragmentRowIdIndex> = sequences
|
||||
.iter()
|
||||
.map(|(frag_id, sequence)| FragmentRowIdIndex {
|
||||
fragment_id: *frag_id,
|
||||
row_id_sequence: sequence.clone(),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
})
|
||||
.collect();
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("BuildIndex", percent_deletions),
|
||||
&percent_deletions,
|
||||
|b, _| {
|
||||
b.iter(|| {
|
||||
let _index = RowIdIndex::new(&fragment_indices).unwrap();
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
// Measure size of index
|
||||
{
|
||||
let mut size = 0;
|
||||
for (_frag_id, sequence) in &sequences {
|
||||
size += write_row_ids(sequence).len() as u64;
|
||||
}
|
||||
let stats = SizeStats {
|
||||
structure: "RowIdIndex".to_string(),
|
||||
percent_deletions,
|
||||
size,
|
||||
};
|
||||
stats_file.write_row(stats);
|
||||
}
|
||||
|
||||
// TODO: we should compare tombstoned vs compacted. We don't mind the
|
||||
// regression in the tombstoned case, but we want to see the improvement
|
||||
// in the compacted case.
|
||||
|
||||
// TODO: collect size of sequences when serialized
|
||||
|
||||
// TODO: also show building a BTreeMap and HashMap
|
||||
|
||||
let flat_data = sequences
|
||||
.iter()
|
||||
.map(|(frag_id, sequence)| {
|
||||
let row_ids = sequence.iter().collect::<Vec<_>>();
|
||||
let row_addresses = (0..sequence.len())
|
||||
.map(|i| RowAddress::new_from_parts(*frag_id, i as u32))
|
||||
.map(u64::from)
|
||||
.collect::<Vec<_>>();
|
||||
(row_ids, row_addresses)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// Size of flat data is just 16 bytes per row
|
||||
let size = flat_data
|
||||
.iter()
|
||||
.map(|(ids, _addresses)| ids.len() * 16)
|
||||
.sum::<usize>() as u64;
|
||||
let stats = SizeStats {
|
||||
structure: "FlatData".to_string(),
|
||||
percent_deletions,
|
||||
size,
|
||||
};
|
||||
stats_file.write_row(stats);
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("BuildHashMap", percent_deletions),
|
||||
&percent_deletions,
|
||||
|b, _| {
|
||||
b.iter(|| {
|
||||
let mut index = HashMap::new();
|
||||
index.extend(flat_data.iter().flat_map(|(ids, addresses)| {
|
||||
ids.iter().copied().zip(addresses.iter().copied())
|
||||
}));
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_get_single(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("row_id_index_get_single");
|
||||
|
||||
for percent_deletions in [0.0, 0.02, 0.25, 0.5, 0.8] {
|
||||
let sequences = make_frag_sequences(num_rows(), 100, percent_deletions);
|
||||
|
||||
let fragment_indices: Vec<FragmentRowIdIndex> = sequences
|
||||
.iter()
|
||||
.map(|(frag_id, sequence)| FragmentRowIdIndex {
|
||||
fragment_id: *frag_id,
|
||||
row_id_sequence: sequence.clone(),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
})
|
||||
.collect();
|
||||
|
||||
let index = RowIdIndex::new(&fragment_indices).unwrap();
|
||||
|
||||
let mut i = 0;
|
||||
let total_rows: u64 = num_rows();
|
||||
let mut next_id = || {
|
||||
let id = i;
|
||||
i += 241861;
|
||||
i %= total_rows;
|
||||
id
|
||||
};
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("GetIndex", percent_deletions),
|
||||
&percent_deletions,
|
||||
|b, _| {
|
||||
b.iter(|| {
|
||||
let _ = index.get(next_id());
|
||||
});
|
||||
},
|
||||
);
|
||||
|
||||
let flat_data = sequences
|
||||
.iter()
|
||||
.map(|(frag_id, sequence)| {
|
||||
let row_ids = sequence.iter().collect::<Vec<_>>();
|
||||
let row_addresses = (0..sequence.len())
|
||||
.map(|i| RowAddress::new_from_parts(*frag_id, i as u32))
|
||||
.map(u64::from)
|
||||
.collect::<Vec<_>>();
|
||||
(row_ids, row_addresses)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let index =
|
||||
{
|
||||
let mut index = HashMap::new();
|
||||
index.extend(flat_data.iter().flat_map(|(ids, addresses)| {
|
||||
ids.iter().copied().zip(addresses.iter().copied())
|
||||
}));
|
||||
index
|
||||
};
|
||||
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("GetHashMap", percent_deletions),
|
||||
&percent_deletions,
|
||||
|b, _| {
|
||||
b.iter(|| {
|
||||
for i in 0..num_rows() {
|
||||
let _ = index.get(&i);
|
||||
}
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
fn bench_apply_row_id(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("apply_row_id");
|
||||
|
||||
let batch = RecordBatch::try_new(
|
||||
Arc::new(Schema::new(vec![Field::new(
|
||||
"value",
|
||||
DataType::UInt64,
|
||||
false,
|
||||
)])),
|
||||
vec![Arc::new(UInt64Array::from(
|
||||
(0..num_rows()).collect::<Vec<_>>(),
|
||||
))],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let config = RowIdAndDeletesConfig {
|
||||
params: ReadBatchParams::default(),
|
||||
with_row_id: true,
|
||||
with_row_addr: false,
|
||||
with_row_last_updated_at_version: false,
|
||||
with_row_created_at_version: false,
|
||||
deletion_vector: None,
|
||||
row_id_sequence: None,
|
||||
last_updated_at_sequence: None,
|
||||
created_at_sequence: None,
|
||||
make_deletions_null: false,
|
||||
total_num_rows: num_rows() as u32,
|
||||
};
|
||||
|
||||
group.bench_function("ApplyRowId", |b| {
|
||||
let batch = batch.clone();
|
||||
b.iter(|| {
|
||||
let _ = apply_row_id_and_deletes(batch.clone(), 0, 0, &config);
|
||||
});
|
||||
});
|
||||
|
||||
group.finish();
|
||||
}
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
criterion_group!(
|
||||
name = benches;
|
||||
config=Criterion::default().with_profiler(pprof::criterion::PProfProfiler::new(100, pprof::criterion::Output::Flamegraph(None)));
|
||||
targets=bench_creation, bench_get_single, bench_apply_row_id);
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
criterion_group!(
|
||||
benches,
|
||||
bench_creation,
|
||||
bench_get_single,
|
||||
bench_apply_row_id
|
||||
);
|
||||
criterion_main!(benches);
|
||||
29
vendor/lance-table/build.rs
vendored
Normal file
29
vendor/lance-table/build.rs
vendored
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
use std::io::Result;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
println!("cargo:rerun-if-changed=protos");
|
||||
|
||||
#[cfg(feature = "protoc")]
|
||||
// Use vendored protobuf compiler if requested.
|
||||
unsafe {
|
||||
std::env::set_var("PROTOC", protobuf_src::protoc());
|
||||
}
|
||||
|
||||
let mut prost_build = prost_build::Config::new();
|
||||
prost_build.extern_path(".lance.file", "::lance_file::format::pb");
|
||||
prost_build.protoc_arg("--experimental_allow_proto3_optional");
|
||||
prost_build.enable_type_names();
|
||||
prost_build.compile_protos(
|
||||
&[
|
||||
"./protos/table.proto",
|
||||
"./protos/transaction.proto",
|
||||
"./protos/rowids.proto",
|
||||
],
|
||||
&["./protos"],
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
18
vendor/lance-table/protos/AGENTS.md
vendored
Normal file
18
vendor/lance-table/protos/AGENTS.md
vendored
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
# Protobuf Guidelines
|
||||
|
||||
Also see [root AGENTS.md](../AGENTS.md) for cross-language standards.
|
||||
|
||||
## Compatibility
|
||||
|
||||
- All changes must be backwards compatible. Never re-use or change field numbers of existing fields.
|
||||
|
||||
## Schema Design
|
||||
|
||||
- Use `optional` when you need to distinguish "not set" from "zero value" — `optional` enables presence tracking (`has_*` methods) and maps to `Option<T>` in Rust. Bare proto3 fields have no presence semantics: they always hold a value (defaulting to zero), so you cannot tell if the sender explicitly set them.
|
||||
- Use structured message types (e.g., `BasePath`) instead of plain scalars, and scope fields to operation-specific messages (e.g., `InsertTransaction`) rather than generic top-level ones.
|
||||
- Don't duplicate data across messages — store each fact once and derive relationships. Prefer parallel sequences over maps when keys already exist in another field.
|
||||
|
||||
## Documentation
|
||||
|
||||
- Document the semantic meaning of both present and absent states for `optional` fields — explain when each case applies.
|
||||
- Use precise domain terminology in field descriptions — avoid ambiguous abbreviations or terms that collide with domain concepts.
|
||||
18
vendor/lance-table/protos/CLAUDE.md
vendored
Normal file
18
vendor/lance-table/protos/CLAUDE.md
vendored
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
# Protobuf Guidelines
|
||||
|
||||
Also see [root AGENTS.md](../AGENTS.md) for cross-language standards.
|
||||
|
||||
## Compatibility
|
||||
|
||||
- All changes must be backwards compatible. Never re-use or change field numbers of existing fields.
|
||||
|
||||
## Schema Design
|
||||
|
||||
- Use `optional` when you need to distinguish "not set" from "zero value" — `optional` enables presence tracking (`has_*` methods) and maps to `Option<T>` in Rust. Bare proto3 fields have no presence semantics: they always hold a value (defaulting to zero), so you cannot tell if the sender explicitly set them.
|
||||
- Use structured message types (e.g., `BasePath`) instead of plain scalars, and scope fields to operation-specific messages (e.g., `InsertTransaction`) rather than generic top-level ones.
|
||||
- Don't duplicate data across messages — store each fact once and derive relationships. Prefer parallel sequences over maps when keys already exist in another field.
|
||||
|
||||
## Documentation
|
||||
|
||||
- Document the semantic meaning of both present and absent states for `optional` fields — explain when each case applies.
|
||||
- Use precise domain terminology in field descriptions — avoid ambiguous abbreviations or terms that collide with domain concepts.
|
||||
55
vendor/lance-table/protos/ann.proto
vendored
Normal file
55
vendor/lance-table/protos/ann.proto
vendored
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package lance.pb;
|
||||
|
||||
import "table_identifier.proto";
|
||||
import "table.proto";
|
||||
import "index.proto";
|
||||
|
||||
// Serialized vector query parameters.
|
||||
message VectorQueryProto {
|
||||
// Query vector as Arrow IPC bytes (supports Float16, Float32, Float64, UInt8, etc.)
|
||||
bytes query_vector_arrow_ipc = 1;
|
||||
string column = 2;
|
||||
uint32 k = 3;
|
||||
optional float lower_bound = 4;
|
||||
optional float upper_bound = 5;
|
||||
optional uint32 minimum_nprobes = 6;
|
||||
optional uint32 maximum_nprobes = 7;
|
||||
optional uint32 ef = 8;
|
||||
optional uint32 refine_factor = 9;
|
||||
// Distance metric type. Absent means None (use the index's default metric).
|
||||
optional lance.index.pb.VectorMetricType metric_type = 10;
|
||||
bool use_index = 11;
|
||||
optional float dist_q_c = 12;
|
||||
optional int32 query_parallelism = 13;
|
||||
}
|
||||
|
||||
// Serializable form of ANNIvfSubIndexExec — the IVF sub-index search node.
|
||||
//
|
||||
// The prefilter child ExecutionPlan is serialized by DataFusion's codec
|
||||
// automatically via children() / with_new_children(). The prefilter_type
|
||||
// field tells the decoder which PreFilterSource variant to use when
|
||||
// reconstructing from the deserialized child inputs.
|
||||
message ANNIvfSubIndexExecProto {
|
||||
enum PreFilterType {
|
||||
NONE = 0;
|
||||
FILTERED_ROW_IDS = 1;
|
||||
SCALAR_INDEX_QUERY = 2;
|
||||
}
|
||||
|
||||
VectorQueryProto query = 1;
|
||||
lance.datafusion.TableIdentifier table = 2;
|
||||
repeated lance.table.IndexMetadata indices = 3;
|
||||
PreFilterType prefilter_type = 4;
|
||||
}
|
||||
|
||||
// Serializable form of ANNIvfPartitionExec — the IVF centroid routing node.
|
||||
message ANNIvfPartitionExecProto {
|
||||
VectorQueryProto query = 1;
|
||||
lance.datafusion.TableIdentifier table = 2;
|
||||
repeated string index_uuids = 3;
|
||||
}
|
||||
347
vendor/lance-table/protos/encodings_v2_0.proto
vendored
Normal file
347
vendor/lance-table/protos/encodings_v2_0.proto
vendored
Normal file
|
|
@ -0,0 +1,347 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package lance.encodings;
|
||||
|
||||
import "google/protobuf/empty.proto";
|
||||
|
||||
// This file contains a specification for encodings that can be used
|
||||
// to store and load Arrow data into a Lance file for the 2.0 format. It
|
||||
// has been superseded by encodings_v2_1.proto which is used for the 2.1 format.
|
||||
//
|
||||
// # Types
|
||||
//
|
||||
// This file assumes the user wants to load data into Arrow arrays and
|
||||
// explains how to map Arrow arrays into Lance files. Encodings are divided
|
||||
// into "array encoding" (which maps to an Arrow array and may contain multiple
|
||||
// buffers) and "buffer encoding" (which encodes a single buffer of data).
|
||||
//
|
||||
// # Encoding Tree
|
||||
//
|
||||
// Most encodings are layered on top of each other. These form a tree of
|
||||
// encodings with a single root node. To encode an array you will typically
|
||||
// start with the root node and then take the output from that root encoding
|
||||
// and feed it into child encodings. The decoding process works in reverse.
|
||||
//
|
||||
// # Multi-column Encodings
|
||||
//
|
||||
// Some Arrow arrays will map to more than one column of Lance data. For
|
||||
// example, struct arrays and list arrays. This file only contains encodings
|
||||
// for a single column. However, it does describe how multi-column arrays can
|
||||
// be encoded.
|
||||
|
||||
// A pointer to a buffer in a Lance file
|
||||
//
|
||||
// A writer can place a buffer in three different locations. The buffer
|
||||
// can go in the data page, in the column metadata, or in the file metadata.
|
||||
// The writer is free to choose whatever is most appropriate (for example, a dictionary
|
||||
// that is shared across all pages in a column will probably go in the column
|
||||
// metadata). This specification does not dictate where the buffer should go.
|
||||
message Buffer {
|
||||
// The index of the buffer in the collection of buffers
|
||||
uint32 buffer_index = 1;
|
||||
// The collection holding the buffer
|
||||
enum BufferType {
|
||||
// The buffer is stored in the data page itself
|
||||
page = 0;
|
||||
// The buffer is stored in the column metadata
|
||||
column = 1;
|
||||
// The buffer is stored in the file metadata
|
||||
file = 2;
|
||||
};
|
||||
BufferType buffer_type = 2;
|
||||
}
|
||||
|
||||
// An encoding that adds nullability to another array encoding
|
||||
//
|
||||
// This can wrap any array encoding and add nullability information
|
||||
message Nullable {
|
||||
message NoNull {
|
||||
ArrayEncoding values = 1;
|
||||
}
|
||||
message AllNull {}
|
||||
message SomeNull {
|
||||
ArrayEncoding validity = 1;
|
||||
ArrayEncoding values = 2;
|
||||
}
|
||||
oneof nullability {
|
||||
// The array has no nulls and there is a single buffer needed
|
||||
NoNull no_nulls = 1;
|
||||
// The array may have nulls and we need two buffers
|
||||
SomeNull some_nulls = 2;
|
||||
// All values are null (no buffers needed)
|
||||
AllNull all_nulls = 3;
|
||||
}
|
||||
}
|
||||
|
||||
// An array encoding for variable-length list fields
|
||||
message List {
|
||||
// An array containing the offsets into an items array.
|
||||
//
|
||||
// This array will have num_rows items and will never
|
||||
// have nulls.
|
||||
//
|
||||
// If the list at index i is not null then offsets[i] will
|
||||
// contain `base + len(list)` where `base` is defined as:
|
||||
// i == 0: 0
|
||||
// i > 0: (offsets[i-1] % null_offset_adjustment)
|
||||
//
|
||||
// To help understand we can consider the following example list:
|
||||
// [ [A, B], null, [], [C, D, E] ]
|
||||
//
|
||||
// The offsets will be [2, ?, 2, 5]
|
||||
//
|
||||
// If the incoming list at index i IS null then offsets[i] will
|
||||
// contain `base + len(list) + null_offset_adjustment` where `base`
|
||||
// is defined the same as above.
|
||||
//
|
||||
// To complete the above example let's assume that `null_offset_adjustment`
|
||||
// is 7. Then the offsets will be [2, 9, 2, 5]
|
||||
//
|
||||
// If there are no nulls then the offsets we write here are exactly the
|
||||
// same as the offsets in an Arrow list array (except we omit the leading
|
||||
// 0 which is redundant)
|
||||
//
|
||||
// The reason we do this is so that reading a single list at index i only
|
||||
// requires us to load the indices at i and i-1.
|
||||
//
|
||||
// If the offset at index i is greater than or equal to `null_offset_adjustment`
|
||||
// then the list at index i is null.
|
||||
//
|
||||
// Otherwise the length of the list is `offsets[i] - base` where
|
||||
// base is defined the same as above.
|
||||
//
|
||||
// Let's consider our example offsets: [2, 9, 2, 5]
|
||||
//
|
||||
// We can take any range of lists and determine how many list items are
|
||||
// referenced by the sublist.
|
||||
//
|
||||
// 0..3: [_, 5] -> items 0..5 (base = 0* and end is 5)
|
||||
// 0..2: [_, 2] -> items 0..2 (base = 0* and end is 2)
|
||||
// 0..1: [_, 9] -> items 0..2 (base = 0* and end is 9 % 7)
|
||||
// 1..3: [2, 5] -> items 2..5 (base = 2 and end is 5)
|
||||
// 1..2: [2, 2] -> items 2..2 (base = 2 and end is 2)
|
||||
// 2..3: [9, 5] -> items 2..5 (base = 9 % 7 and end is 5)
|
||||
//
|
||||
// * When the start of our range is the 0th item the base is always 0 and we only
|
||||
// need to load a single index from disk to determine the range.
|
||||
//
|
||||
// The data type of the offsets array is flexible and does not need
|
||||
// to match the data type of the destination array. Please note that the offsets
|
||||
// array is very likely to be efficiently encoded by bit packing deltas.
|
||||
ArrayEncoding offsets = 1;
|
||||
// If a list is null then we add this value to the offset
|
||||
//
|
||||
// This value must be greater than the length of the items so that
|
||||
// (offset + null_offset_adjustment) is never used by a non-null list.
|
||||
//
|
||||
// Note that this value cannot be equal to the length of the items
|
||||
// because then a page with a single list would store [ X ] and we
|
||||
// couldn't know if that is a null list or a list with X items.
|
||||
//
|
||||
// Therefore, the best choice for this value is 1 + # of items.
|
||||
// Choosing this will maximize the bit packing that we can apply to the offsets.
|
||||
uint64 null_offset_adjustment = 2;
|
||||
// How many items are referenced by these offsets. This is needed in
|
||||
// order to determine which items pages map to this offsets page.
|
||||
uint64 num_items = 3;
|
||||
}
|
||||
|
||||
// An array encoding for fixed-size list fields
|
||||
message FixedSizeList {
|
||||
/// The number of items in each list
|
||||
uint32 dimension = 1;
|
||||
/// True if the list is nullable
|
||||
bool has_validity = 3;
|
||||
/// The items in the list
|
||||
ArrayEncoding items = 2;
|
||||
}
|
||||
|
||||
message Compression {
|
||||
string scheme = 1;
|
||||
optional int32 level = 2;
|
||||
}
|
||||
|
||||
// Fixed width items placed contiguously in a buffer
|
||||
message Flat {
|
||||
// the number of bits per value, must be greater than 0, does
|
||||
// not need to be a multiple of 8
|
||||
uint64 bits_per_value = 1;
|
||||
// the buffer of values
|
||||
Buffer buffer = 2;
|
||||
// The Compression message can specify the compression scheme (e.g. zstd) and any
|
||||
// other information that is needed for decompression.
|
||||
//
|
||||
// If this array is compressed then the bits_per_value refers to the uncompressed
|
||||
// data.
|
||||
Compression compression = 3;
|
||||
}
|
||||
|
||||
// Compression algorithm where all values have a constant value
|
||||
message Constant {
|
||||
// The value (TODO: define encoding for literals?)
|
||||
bytes value = 1;
|
||||
}
|
||||
|
||||
// Items are bitpacked in a buffer
|
||||
message Bitpacked {
|
||||
// the number of bits used for a value in the buffer
|
||||
uint64 compressed_bits_per_value = 1;
|
||||
|
||||
// the number of bits of the uncompressed value. e.g. for a u32, this will be 32
|
||||
uint64 uncompressed_bits_per_value = 2;
|
||||
|
||||
// The items in the list
|
||||
Buffer buffer = 3;
|
||||
|
||||
// Whether or not a sign bit is included in the bitpacked value
|
||||
bool signed = 4;
|
||||
}
|
||||
|
||||
// Items are bitpacked in a buffer
|
||||
message BitpackedForNonNeg {
|
||||
// the number of bits used for a value in the buffer
|
||||
uint64 compressed_bits_per_value = 1;
|
||||
|
||||
// the number of bits of the uncompressed value. e.g. for a u32, this will be 32
|
||||
uint64 uncompressed_bits_per_value = 2;
|
||||
|
||||
// The items in the list
|
||||
Buffer buffer = 3;
|
||||
}
|
||||
|
||||
// Opaque bitpacking variant where the bits per value are stored inline in the chunks themselves
|
||||
message InlineBitpacking {
|
||||
// the number of bits of the uncompressed value. e.g. for a u32, this will be 32
|
||||
uint64 uncompressed_bits_per_value = 2;
|
||||
}
|
||||
|
||||
// Transparent bitpacking variant where the number of bits per value is fixed through the whole buffer
|
||||
message OutOfLineBitpacking {
|
||||
// the number of bits of the uncompressed value. e.g. for a u32, this will be 32
|
||||
uint64 uncompressed_bits_per_value = 2;
|
||||
// The number of compressed bits per value, fixed across the entire buffer
|
||||
uint64 compressed_bits_per_value = 3;
|
||||
}
|
||||
|
||||
// An array encoding for shredded structs that will never be null
|
||||
//
|
||||
// There is no actual data in this column.
|
||||
//
|
||||
// TODO: Struct validity bitmaps will be placed here.
|
||||
message SimpleStruct {}
|
||||
|
||||
// An array encoding for binary fields
|
||||
message Binary {
|
||||
ArrayEncoding indices = 1;
|
||||
ArrayEncoding bytes = 2;
|
||||
uint64 null_adjustment = 3;
|
||||
}
|
||||
|
||||
message Variable {
|
||||
uint32 bits_per_offset = 1;
|
||||
}
|
||||
|
||||
message Fsst {
|
||||
ArrayEncoding binary = 1;
|
||||
bytes symbol_table = 2;
|
||||
}
|
||||
|
||||
// An array encoding for dictionary-encoded fields
|
||||
message Dictionary {
|
||||
ArrayEncoding indices = 1;
|
||||
ArrayEncoding items = 2;
|
||||
uint32 num_dictionary_items = 3;
|
||||
}
|
||||
|
||||
message PackedStruct {
|
||||
repeated ArrayEncoding inner = 1;
|
||||
Buffer buffer = 2;
|
||||
}
|
||||
|
||||
message PackedStructFixedWidthMiniBlock {
|
||||
ArrayEncoding Flat = 1;
|
||||
repeated uint32 bits_per_values = 2;
|
||||
}
|
||||
|
||||
message FixedSizeBinary {
|
||||
ArrayEncoding bytes = 1;
|
||||
uint32 byte_width = 2;
|
||||
}
|
||||
|
||||
message Block {
|
||||
string scheme = 1;
|
||||
}
|
||||
|
||||
// Run-Length Encoding for miniblock format
|
||||
message Rle {
|
||||
// Number of bits per value (8, 16, 32, 64, or 128)
|
||||
uint64 bits_per_value = 1;
|
||||
}
|
||||
|
||||
// Byte Stream Split encoding for floating point values
|
||||
message ByteStreamSplit {
|
||||
// Number of bits per value (32 for float, 64 for double)
|
||||
uint64 bits_per_value = 1;
|
||||
}
|
||||
|
||||
// General miniblock encoding - wraps another miniblock encoding with compression
|
||||
message GeneralMiniBlock {
|
||||
// The inner miniblock encoding (e.g., Rle, Bitpacked, etc.)
|
||||
ArrayEncoding inner = 1;
|
||||
// The compression scheme to apply to the miniblock buffers
|
||||
Compression compression = 2;
|
||||
}
|
||||
|
||||
// Encodings that decode into an Arrow array
|
||||
message ArrayEncoding {
|
||||
oneof array_encoding {
|
||||
Flat flat = 1;
|
||||
Nullable nullable = 2;
|
||||
FixedSizeList fixed_size_list = 3;
|
||||
List list = 4;
|
||||
SimpleStruct struct = 5;
|
||||
Binary binary = 6;
|
||||
Dictionary dictionary = 7;
|
||||
Fsst fsst = 8;
|
||||
PackedStruct packed_struct = 9;
|
||||
Bitpacked bitpacked = 10;
|
||||
FixedSizeBinary fixed_size_binary = 11;
|
||||
BitpackedForNonNeg bitpacked_for_non_neg = 12;
|
||||
Constant constant = 13;
|
||||
InlineBitpacking inline_bitpacking = 14;
|
||||
OutOfLineBitpacking out_of_line_bitpacking = 15;
|
||||
Variable variable = 16;
|
||||
PackedStructFixedWidthMiniBlock packed_struct_fixed_width_mini_block = 17;
|
||||
Block block = 18;
|
||||
Rle rle = 19;
|
||||
GeneralMiniBlock general_mini_block = 20;
|
||||
ByteStreamSplit byte_stream_split = 21;
|
||||
}
|
||||
}
|
||||
|
||||
// Wraps a column with a zone map index that can be used
|
||||
// to apply pushdown filters
|
||||
message ZoneIndex {
|
||||
uint32 rows_per_zone = 1;
|
||||
Buffer zone_map_buffer = 2;
|
||||
ColumnEncoding inner = 3;
|
||||
}
|
||||
|
||||
// Marks a column as blob data. It will contain a packed struct
|
||||
// with fields position and size (u64)
|
||||
message Blob {
|
||||
ColumnEncoding inner = 1;
|
||||
}
|
||||
|
||||
// Encodings that describe a column of values
|
||||
message ColumnEncoding {
|
||||
oneof column_encoding {
|
||||
// No special encoding, just column values
|
||||
google.protobuf.Empty values = 1;
|
||||
ZoneIndex zone_index = 2;
|
||||
Blob blob = 3;
|
||||
}
|
||||
}
|
||||
511
vendor/lance-table/protos/encodings_v2_1.proto
vendored
Normal file
511
vendor/lance-table/protos/encodings_v2_1.proto
vendored
Normal file
|
|
@ -0,0 +1,511 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package lance.encodings21;
|
||||
|
||||
// This file contains a specification for encodings that can be used
|
||||
// to store and load Arrow data into a Lance file for the 2.1 format.
|
||||
//
|
||||
// # Types
|
||||
//
|
||||
// This file assumes the user wants to load data into Arrow arrays and
|
||||
// explains how to map Arrow arrays into Lance files. Encodings are divided
|
||||
// into "structural encodings" (which are used to encode the structure of the
|
||||
// data such as any list or struct layers) and "compressive encodings" (which
|
||||
// are used to compress the actual data values).
|
||||
//
|
||||
// # Standardized Interpretation of Counting Terms
|
||||
//
|
||||
// When working with 2.1 encodings we have a number of different "counting terms" and it can be
|
||||
// difficult to understand what we mean when we are talking about a "number of values". Here is
|
||||
// a standard interpretation of these terms:
|
||||
//
|
||||
// To understand these definitions consider a data type FIXED_SIZE_LIST<LIST<INT32>>.
|
||||
//
|
||||
// A "value" is an abstract term when we aren't being specific.
|
||||
//
|
||||
// - num_rows: This is the highest level counting term. A single row includes everything in the
|
||||
// fixed size list. This is what the user asks for when they ask for a range of rows.
|
||||
// - num_elements: The number of elements is the number of rows multiplied by the dimension of any
|
||||
// fixed size list wrappers. This is what you get when you flatten the FSL layer and
|
||||
// is the starting point for structural encoding. Note that an element can be a list
|
||||
// value or a single primitive value.
|
||||
// - num_items: The number of items is the number of values in the repetition and definition vectors
|
||||
// after everything has been flattened.
|
||||
// - num_visible_items: The number of visible items is the number of items after invisible items
|
||||
// have been removed. Invisible items are rep/def levels that don't correspond to an
|
||||
// actual value.
|
||||
|
||||
|
||||
// # Structural Encodings
|
||||
//
|
||||
// The following messages are used to describe the structural encoding of the
|
||||
// data. In this document, we refer to these structural encodings as layouts.
|
||||
|
||||
// Repetition and definition levels are described in more detail elsewhere. As we peel through
|
||||
// the structure of an array we will encounter layers of struct and list. Each of these layers
|
||||
// potentially adds a new level to the repetition and definition levels. This message describes
|
||||
// the meaning of each layer.
|
||||
enum RepDefLayer {
|
||||
// Should never be used, included for debugging purposes and general protobuf best practice
|
||||
REPDEF_UNSPECIFIED = 0;
|
||||
// All values are valid (can be primitive or struct)
|
||||
REPDEF_ALL_VALID_ITEM = 1;
|
||||
// All list values are valid
|
||||
REPDEF_ALL_VALID_LIST = 2;
|
||||
// There are one or more null items (can be primitive or struct)
|
||||
REPDEF_NULLABLE_ITEM = 3;
|
||||
// A list layer with null lists but no empty lists
|
||||
REPDEF_NULLABLE_LIST = 4;
|
||||
// A list layer with empty lists but no null lists
|
||||
REPDEF_EMPTYABLE_LIST = 5;
|
||||
// A list layer with both empty lists and null lists
|
||||
REPDEF_NULL_AND_EMPTY_LIST = 6;
|
||||
}
|
||||
|
||||
// A layout used for pages where the data is small
|
||||
//
|
||||
// In this case we can fit many values into a single disk sector and transposing buffers is
|
||||
// expensive. As a result, we do not transpose the buffers but compress the data into small
|
||||
// chunks (called mini blocks) which are roughly the size of a disk sector.
|
||||
//
|
||||
// The end result is a small amount of read amplification (since we must read an entire page
|
||||
// at a time) but we have more flexibility in compression and do less work per value when
|
||||
// compressing and decompressing in bulk.
|
||||
message MiniBlockLayout {
|
||||
// Description of the compression of repetition levels (e.g. how many bits per rep)
|
||||
//
|
||||
// Optional, if there is no repetition then this field is not present
|
||||
CompressiveEncoding rep_compression = 1;
|
||||
// Description of the compression of definition levels (e.g. how many bits per def)
|
||||
//
|
||||
// Optional, if there is no definition then this field is not present
|
||||
CompressiveEncoding def_compression = 2;
|
||||
// Description of the compression of values
|
||||
CompressiveEncoding value_compression = 3;
|
||||
// Description of the compression of the dictionary data
|
||||
//
|
||||
// Optional, if there is no dictionary then this field is not present
|
||||
CompressiveEncoding dictionary = 4;
|
||||
// Number of items in the dictionary
|
||||
uint64 num_dictionary_items = 5;
|
||||
// The meaning of each repdef layer, used to interpret repdef buffers correctly
|
||||
repeated RepDefLayer layers = 6;
|
||||
// The number of buffers in each mini-block, this is determined by the compression and does
|
||||
// NOT include the repetition or definition buffers (the presence of these buffers can be determined
|
||||
// by looking at the rep_compression and def_compression fields)
|
||||
uint64 num_buffers = 7;
|
||||
// The depth of the repetition index.
|
||||
//
|
||||
// If there is repetition then the depth must be at least 1. If there are many layers
|
||||
// of repetition then deeper repetition indices will support deeper nested random access. For
|
||||
// example, given 5 layers of repetition then the repetition index depth must be at least
|
||||
// 3 to support access like `rows[50][17][3]`.
|
||||
//
|
||||
// We require `repetition_index_depth + 1` u64 values per mini-block to store the repetition
|
||||
// index if the `repetition_index_depth` is greater than 0. The +1 is because we need to store
|
||||
// the number of "leftover items" at the end of the chunk. Otherwise, we wouldn't have any way
|
||||
// to know if the final item in a chunk is valid or not.
|
||||
uint32 repetition_index_depth = 8;
|
||||
// The page already records how many rows are in the page. For mini-block we also need to know how
|
||||
// many "items" are in the page. A row and an item are the same thing unless the page has lists.
|
||||
uint64 num_items = 9;
|
||||
|
||||
// Since Lance 2.2, miniblocks have larger chunk sizes (>= 64KB)
|
||||
bool has_large_chunk = 10;
|
||||
}
|
||||
|
||||
// A layout used for pages where the data is large
|
||||
//
|
||||
// In this case the cost of transposing the data is relatively small (compared to the cost of writing the data)
|
||||
// and so we just zip the buffers together
|
||||
message FullZipLayout {
|
||||
// The number of bits of repetition info (0 if there is no repetition)
|
||||
uint32 bits_rep = 1;
|
||||
// The number of bits of definition info (0 if there is no definition)
|
||||
uint32 bits_def = 2;
|
||||
// The number of bits of value info
|
||||
//
|
||||
// Note: we use bits here (and not bytes) for consistency with other encodings. However, in practice,
|
||||
// there is never a reason to use a bits per value that is not a multiple of 8. The complexity is not
|
||||
// worth the small savings in space since this encoding is typically used with large values already.
|
||||
oneof details {
|
||||
// If this is a fixed width block then we need to have a fixed number of bits per value
|
||||
uint32 bits_per_value = 3;
|
||||
// If this is a variable width block then we need to have a fixed number of bits per offset
|
||||
uint32 bits_per_offset = 4;
|
||||
}
|
||||
// The number of items in the page
|
||||
uint32 num_items = 5;
|
||||
// The number of visible items in the page
|
||||
uint32 num_visible_items = 6;
|
||||
// Description of the compression of values
|
||||
CompressiveEncoding value_compression = 7;
|
||||
// The meaning of each repdef layer, used to interpret repdef buffers correctly
|
||||
repeated RepDefLayer layers = 8;
|
||||
}
|
||||
|
||||
// A layout used for pages where all (visible) values are the same scalar value.
|
||||
//
|
||||
// This generalizes the prior AllNullLayout semantics for file_version >= 2.2.
|
||||
//
|
||||
// There may be buffers of repetition and definition information if required in order
|
||||
// to interpret what kind of nulls are present / which items are visible.
|
||||
message ConstantLayout {
|
||||
// The meaning of each repdef layer, used to interpret repdef buffers correctly
|
||||
repeated RepDefLayer layers = 5;
|
||||
|
||||
// Inline fixed-width scalar value bytes.
|
||||
//
|
||||
// This MUST only be used for types where a single non-null element is represented by a single
|
||||
// fixed-width Arrow value buffer (i.e. no offsets buffer, no child data).
|
||||
//
|
||||
// Constraints:
|
||||
// - MUST be absent for an all-null page
|
||||
// - MUST be <= 32 bytes if present
|
||||
optional bytes inline_value = 6;
|
||||
|
||||
// Optional compression algorithm used for the repetition buffer.
|
||||
// If absent, repetition levels are stored as raw u16 values.
|
||||
CompressiveEncoding rep_compression = 7;
|
||||
// Optional compression algorithm used for the definition buffer.
|
||||
// If absent, definition levels are stored as raw u16 values.
|
||||
CompressiveEncoding def_compression = 8;
|
||||
// Number of values in repetition buffer after decompression.
|
||||
uint64 num_rep_values = 9;
|
||||
// Number of values in definition buffer after decompression.
|
||||
uint64 num_def_values = 10;
|
||||
}
|
||||
|
||||
// A layout where large binary data is encoded externally and only
|
||||
// the descriptions (position + size) are placed in the page
|
||||
//
|
||||
// Repdef information is stored in the descriptions. A description with a size of
|
||||
// 0 and a position of 0 is an empty value. A description with a size of 0 and a
|
||||
// non-zero position is a null value and the position is the repdef value.
|
||||
message BlobLayout {
|
||||
// The inner layout used to store the descriptions
|
||||
PageLayout inner_layout = 1;
|
||||
// The meaning of each repdef layer, used to interpret repdef buffers correctly
|
||||
//
|
||||
// The inner layout's repdef layers will always be a single all-valid item layer
|
||||
repeated RepDefLayer layers = 2;
|
||||
}
|
||||
|
||||
// Describes the structural encoding of a page
|
||||
message PageLayout {
|
||||
oneof layout {
|
||||
// A layout used for pages where the data is small
|
||||
MiniBlockLayout mini_block_layout = 1;
|
||||
// A layout used for pages where all (visible) values are the same scalar value or null.
|
||||
ConstantLayout constant_layout = 2;
|
||||
// A layout used for pages where the data is large
|
||||
FullZipLayout full_zip_layout = 3;
|
||||
// A layout where large binary data is encoded externally
|
||||
// and only the descriptions are put in the page
|
||||
BlobLayout blob_layout = 4;
|
||||
}
|
||||
}
|
||||
|
||||
// # Compressive Encodings
|
||||
//
|
||||
// These encodings describe how an array is compressed. An encoding may split an
|
||||
// array into multiple buffers. The buffers can then be compressed further (and split
|
||||
// into yet more buffers). The entire process forms a tree of encodings with the root
|
||||
// of the tree being the initial array and the leaves being the final compressed buffers.
|
||||
//
|
||||
// # Data blocks and buffers
|
||||
//
|
||||
// Data blocks are a simplified version of arrays and represent a collection of buffers grouped
|
||||
// with some kind of interpretation. Data blocks are the input and output of compressive encodings.
|
||||
// There are different kinds of data blocks:
|
||||
// - Fixed width data blocks (e.g. u8, u16, ...)
|
||||
// - Variable width data blocks (e.g. strings, binary)
|
||||
// - Struct data blocks (note: this is for packed structs, normal structs are encoded in the structural encoding)
|
||||
//
|
||||
// In addition, leaf encodings may output "buffers". These are fully compressed buffers of data that
|
||||
// are stored in the page and are not compressed any further.
|
||||
|
||||
// The general-purpose compression algorithms that can be selected through
// `BufferCompression.scheme`.
//
// Note that the value names carry the `COMPRESSION_ALGORITHM_` prefix even though
// the enum itself is named `CompressionScheme`.
enum CompressionScheme {
|
||||
// Zero value of the enum; no algorithm has been named.
// NOTE(review): how readers treat an unspecified scheme is not visible here -- confirm.
COMPRESSION_ALGORITHM_UNSPECIFIED = 0;
|
||||
// LZ4 compression
COMPRESSION_ALGORITHM_LZ4 = 1;
|
||||
// Zstandard (zstd) compression
COMPRESSION_ALGORITHM_ZSTD = 2;
|
||||
}
|
||||
|
||||
// Compression applied to a single buffer of data
|
||||
//
|
||||
// A buffer is the leaf of the compression tree. Unlike data blocks, which can
|
||||
// be further compressed with a variety of techniques, a buffer cannot be understood
|
||||
// in any particular way.
|
||||
//
|
||||
// A general compression scheme may be applied to a buffer. This is something like
|
||||
// zstd, lz4, etc. The entire buffer is compressed as a single unit. If this happens
|
||||
// then any parent encoding becomes opaque, even if it would normally be transparent.
|
||||
//
|
||||
// This is a leaf, no further compression is applied to the data.
|
||||
message BufferCompression {
|
||||
// A general compression scheme to apply to the buffer
|
||||
CompressionScheme scheme = 1;
|
||||
// The compression level
|
||||
//
|
||||
// Optional, if not present a scheme-specific default value will be used.
|
||||
//
|
||||
// Interpretation of this value depends on the compression scheme. Generally, larger
|
||||
// values indicate more compression at the expense of more CPU time.
|
||||
optional int32 level = 2;
|
||||
}
|
||||
|
||||
// Fixed width items placed contiguously in a single buffer
|
||||
//
|
||||
// This is a leaf encoding, there is no compression applied to the data.
|
||||
//
|
||||
// This is a transparent encoding by definition.
|
||||
//
|
||||
// The input is a fixed-width data block.
|
||||
// The output is a single buffer.
|
||||
message Flat {
|
||||
// the number of bits per value, must be greater than 0, does
|
||||
// not need to be a multiple of 8
|
||||
uint64 bits_per_value = 1;
|
||||
// The compression applied to the data
|
||||
optional BufferCompression data = 2;
|
||||
}
|
||||
|
||||
// Variable width items have the values stored in one buffer and the
|
||||
// offsets are output as a data block that may be further compressed.
|
||||
//
|
||||
// This is a partial leaf encoding. Values are not compressed but
|
||||
// the offsets may be further compressed.
|
||||
//
|
||||
// This is a transparent encoding by definition.
|
||||
//
|
||||
// The input is a variable-width data block.
|
||||
// The output is a single fixed-width data block (the offsets) and
|
||||
// a single buffer (the values)
|
||||
message Variable {
|
||||
// Describes how the offsets data block is compressed
|
||||
CompressiveEncoding offsets = 1;
|
||||
// The compression applied to the values
|
||||
optional BufferCompression values = 2;
|
||||
}
|
||||
|
||||
// Compression algorithm where all values have a constant value (encoded in the description)
|
||||
//
|
||||
// This is a leaf encoding, there is no compression applied to the data.
|
||||
//
|
||||
// The input can be any kind of data block.
|
||||
// There is no output.
|
||||
message Constant {
|
||||
// The value (TODO: define encoding for literals?)
|
||||
optional bytes value = 1;
|
||||
}
|
||||
|
||||
// A compression scheme in which a single fixed-width block is "packed" into
|
||||
// a smaller fixed-width block of values where each value has fewer bits.
|
||||
//
|
||||
// This is typically done by throwing away the most significant bits of each value when
|
||||
// those bits are all the same.
|
||||
//
|
||||
// In this scheme the number of bits per value is fixed across the entire buffer and stored
|
||||
// in this message.
|
||||
//
|
||||
// This is a transparent encoding.
|
||||
//
|
||||
// The input is a fixed-width data block.
|
||||
// The output is a single fixed-width data block.
|
||||
message OutOfLineBitpacking {
|
||||
// the number of bits of the uncompressed value. e.g. for a u32, this will be 32
|
||||
uint64 uncompressed_bits_per_value = 1;
|
||||
// The compression used to store the bitpacked values data block
|
||||
CompressiveEncoding values = 3;
|
||||
}
|
||||
|
||||
// Bitpacking variant where the bits per value are stored inline in the chunks themselves
|
||||
//
|
||||
// This variation of bitpacking allows for the number of bits per value to change throughout the
|
||||
// buffer, which makes the compression more robust to outliers.
|
||||
//
|
||||
// This is an opaque encoding.
|
||||
//
|
||||
// The input is a fixed-width data block.
|
||||
// The output is a single buffer.
|
||||
message InlineBitpacking {
|
||||
// the number of bits of the uncompressed value. e.g. for a u32, this will be 32
|
||||
uint64 uncompressed_bits_per_value = 1;
|
||||
// The compression applied to the values
|
||||
optional BufferCompression values = 2;
|
||||
}
|
||||
|
||||
// A compression scheme for variable-width data
|
||||
//
|
||||
// A small dictionary (referred to as a "symbol table") is used to compress the values.
|
||||
// In this scheme there is a single symbol table for the entire page and it is stored in the
|
||||
// encoding description itself.
|
||||
//
|
||||
// This is a transparent encoding.
|
||||
//
|
||||
// The input is a variable-width data block.
|
||||
// The output is a single variable-width data block.
|
||||
message Fsst {
|
||||
// The FSST symbol table
|
||||
bytes symbol_table = 1;
|
||||
// The compression used to store the compressed values data block
|
||||
CompressiveEncoding values = 2;
|
||||
}
|
||||
|
||||
// A compression scheme where common values are stored in a dictionary and the values are
|
||||
// encoded as indices into the dictionary.
|
||||
//
|
||||
// This is an opaque encoding unless the dictionary is considered metadata.
|
||||
//
|
||||
// The input is any kind of data block.
|
||||
// There are two outputs:
|
||||
// - A data block of the same kind as the input (the dictionary)
|
||||
// - A fixed-width data block containing the indices into the dictionary.
|
||||
message Dictionary {
|
||||
// The compression used to store the indices data block
|
||||
CompressiveEncoding indices = 1;
|
||||
// The compression used to store the dictionary items data block
|
||||
CompressiveEncoding items = 2;
|
||||
// The number of items in the dictionary
|
||||
uint32 num_dictionary_items = 3;
|
||||
}
|
||||
|
||||
// A compression scheme where runs of common values are encoded as a single value and a count
|
||||
//
|
||||
// This is an opaque encoding unless the run lengths are considered metadata.
|
||||
//
|
||||
// The input is a single data block of any kind.
|
||||
// There are two outputs:
|
||||
// - A data block of the same kind as the input (the run values)
|
||||
// - A fixed-width data block containing the lengths of the runs
|
||||
message Rle {
|
||||
// The compression used to store the run values data block
|
||||
CompressiveEncoding values = 1;
|
||||
// The compression used to store the run lengths data block
|
||||
CompressiveEncoding run_lengths = 2;
|
||||
}
|
||||
|
||||
// Converts a fixed-size-list of values into a flattened list of values
|
||||
//
|
||||
// This encoding does not actually compress the data, it just flattens out the FSL layers.
|
||||
//
|
||||
// This is a transparent encoding.
|
||||
//
|
||||
// The input is a single block of fixed-width data (with a wide width and few items)
|
||||
// The output is a single block of fixed-width data (with a narrow width and many items)
|
||||
message FixedSizeList {
|
||||
// The number of items in this layer of FSL
|
||||
uint64 items_per_value = 1;
|
||||
// Whether or not there is a validity buffer
|
||||
bool has_validity = 3;
|
||||
// The compression used to store the flattened values data block
|
||||
CompressiveEncoding values = 2;
|
||||
}
|
||||
|
||||
// Packs a struct containing only fixed-width children into a single fixed-width data block
|
||||
//
|
||||
// The children are concatenated row by row and stored as a single fixed-width buffer. This is
|
||||
// the legacy packed struct representation and remains available for backwards compatibility.
|
||||
message PackedStruct {
|
||||
// The number of bits contributed by each child field in the packed row
|
||||
repeated uint64 bits_per_value = 1;
|
||||
// The compression used to store the packed fixed-width values
|
||||
CompressiveEncoding values = 2;
|
||||
}
|
||||
|
||||
// Variable-width packed struct encoding (2.2 extension)
|
||||
//
|
||||
// Each child value is compressed independently before being transposed into
|
||||
// a row-major layout. This preserves per-field compression boundaries at the
|
||||
// cost of disabling mini-block compression. Readers must prefer this field
|
||||
// when present and fall back to the legacy encoding otherwise.
|
||||
message VariablePackedStruct {
|
||||
// Per-field encoding metadata in struct order
|
||||
repeated FieldEncoding fields = 1;
|
||||
|
||||
// Encoding description for a single child field
|
||||
message FieldEncoding {
|
||||
// Compression applied to individual field values before transposition
|
||||
CompressiveEncoding value = 1;
|
||||
oneof layout {
|
||||
// Bit width of each compressed value (when fixed width)
|
||||
uint64 bits_per_value = 2;
|
||||
// Bit width of the length prefix for variable-width compressed values
|
||||
uint64 bits_per_length = 3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// A compression scheme that wraps the underlying data with general compression
|
||||
//
|
||||
// Note: The application of wrapped compression will depend on the layout of the data.
|
||||
// If we apply it to mini-block data then we compress entire mini-blocks. If we apply
|
||||
// it to full-zip data then we compress each value individually.
|
||||
//
|
||||
// Note: Wrapped compression is somewhat unique at the moment as it is applied to the
|
||||
// output of the inner encoding and not the input like all other compressive encodings.
|
||||
//
|
||||
// Note: General compression can usually be applied in two spots. We can apply
|
||||
// it to individual buffers or we can apply it here, to the entire array.
|
||||
//
|
||||
// For example, let's say we are storing mini-blocks of strings and we are using
|
||||
// FSST and bitpacking the offsets. We have something like this...
|
||||
//
|
||||
// WRAPPED(†3) -> FSST -> VARIABLE -(offsets)-> INLINE_BITPACKING -(data)-> FLAT -> BUFFER (†1)
|
||||
// -(data)-> BUFFER (†2)
|
||||
//
|
||||
// General compression can be applied at †1, †2, or †3 (or any combination of these).
|
||||
//
|
||||
// If we apply it at †1 then we apply it just to the bitpacked offsets
|
||||
// If we apply it at †2 then we apply it just to the FSST compressed data
|
||||
// If we apply it at †3 then we apply it to the entire mini-block (both offsets and data)
|
||||
//
|
||||
// The input is a single data block of any kind.
|
||||
// The output is a single data block of the same kind as the input.
|
||||
message General {
|
||||
// The compression to apply to the values
|
||||
BufferCompression compression = 1;
|
||||
// The compression used to store the output data block
|
||||
CompressiveEncoding values = 3;
|
||||
}
|
||||
|
||||
// A compression scheme where fixed-width values are transposed into a series of byte streams
|
||||
//
|
||||
// This is commonly used for floating point values where the upper bits (the sign and exponent) have a
|
||||
// significantly different meaning than the lower bits. By splitting the values into byte streams
|
||||
// we group the mantissa bits together and the exponent bits together. The end result is typically
|
||||
// more compressible.
|
||||
//
|
||||
// Note that this encoding is mostly useful when combined with other encodings. It does not do any
|
||||
// compression on its own.
|
||||
//
|
||||
// This is an opaque encoding.
|
||||
//
|
||||
// The input is a fixed-width data block
|
||||
// The output is a single fixed-width data block
|
||||
message ByteStreamSplit {
|
||||
// The compression used to store the values
|
||||
CompressiveEncoding values = 1;
|
||||
}
|
||||
|
||||
// An encoding that compresses a data block into buffers
|
||||
message CompressiveEncoding {
|
||||
// The concrete encoding at this node of the encoding tree. At most one variant is set.
// Several variants themselves contain a `CompressiveEncoding`, which is how the tree
// of encodings is formed.
oneof compression {
|
||||
// Fixed width items placed contiguously in a single buffer
Flat flat = 1;
|
||||
// Variable width items: values in one buffer, offsets emitted as a data block
Variable variable = 2;
|
||||
// All values have a constant value (encoded in the description)
Constant constant = 3;
|
||||
// Bitpacking where the bits per value is fixed across the entire buffer
OutOfLineBitpacking out_of_line_bitpacking = 4;
|
||||
// Bitpacking where the bits per value are stored inline in the chunks
InlineBitpacking inline_bitpacking = 5;
|
||||
// Symbol-table (FSST) compression for variable-width data
Fsst fsst = 6;
|
||||
// Common values stored in a dictionary, values encoded as indices into it
Dictionary dictionary = 7;
|
||||
// Runs of common values encoded as a single value and a count
Rle rle = 8;
|
||||
// Fixed-width values transposed into a series of byte streams
ByteStreamSplit byte_stream_split = 9;
|
||||
// Wraps the underlying data with general compression
General general = 10;
|
||||
// Flattens a fixed-size-list of values into a flat list of values
FixedSizeList fixed_size_list = 11;
|
||||
// Legacy packed struct: fixed-width children packed into one fixed-width data block
PackedStruct packed_struct = 12;
|
||||
// Variable-width packed struct encoding (2.2 extension)
VariablePackedStruct variable_packed_struct = 13;
|
||||
}
|
||||
}
|
||||
207
vendor/lance-table/protos/file.proto
vendored
Normal file
207
vendor/lance-table/protos/file.proto
vendored
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package lance.file;
|
||||
|
||||
// A file descriptor that describes the contents of a Lance file
|
||||
message FileDescriptor {
|
||||
// The schema of the file
|
||||
Schema schema = 1;
|
||||
// The number of rows in the file
|
||||
uint64 length = 2;
|
||||
}
|
||||
|
||||
// A schema which describes the data type of each of the columns
|
||||
message Schema {
|
||||
// All fields in this file, including the nested fields.
|
||||
repeated lance.file.Field fields = 1;
|
||||
// Schema metadata.
|
||||
map<string, bytes> metadata = 5;
|
||||
}
|
||||
|
||||
// Metadata of one Lance file.
|
||||
message Metadata {
|
||||
// 4 was used for StatisticsMetadata in the past, but has been moved to
|
||||
// prevent a bug in older readers.
|
||||
reserved 4;
|
||||
|
||||
// Position of the manifest in the file. If it is zero, the manifest is stored
|
||||
// externally.
|
||||
uint64 manifest_position = 1;
|
||||
|
||||
// Logical offsets of each chunk group, i.e., number of the rows in each
|
||||
// chunk.
|
||||
repeated int32 batch_offsets = 2;
|
||||
|
||||
// The file position at which the page table is stored.
|
||||
//
|
||||
// A page table is a matrix of N x M x 2, where N = num_fields, and M =
|
||||
// num_batches. Each cell in the table is a pair of <position:int64,
|
||||
// length:int64> of the page. Both position and length are int64 values. The
|
||||
// <position, length> of all the pages in the same column are then
|
||||
// contiguously stored.
|
||||
//
|
||||
// Every field that is a part of the file will have a run in the page table.
|
||||
// This includes struct columns, which will have a run of length 0 since
|
||||
// they don't store any actual data.
|
||||
//
|
||||
// For example, for column 5 and batch 4, we have:
|
||||
// ```text
|
||||
// position = page_table[5][4][0];
|
||||
// length = page_table[5][4][1];
|
||||
// ```
|
||||
uint64 page_table_position = 3;
|
||||
|
||||
message StatisticsMetadata {
|
||||
// The schema of the statistics.
|
||||
//
|
||||
// This might be empty, meaning there are no statistics. It also might not
|
||||
// contain statistics for every field.
|
||||
repeated Field schema = 1;
|
||||
|
||||
// The field ids of the statistics leaf fields.
|
||||
//
|
||||
// This plays a similar role to the `fields` field in the DataFile message.
|
||||
// Each of these field ids corresponds to a field in the stats_schema. There
|
||||
// is one per column in the stats page table.
|
||||
repeated int32 fields = 2;
|
||||
|
||||
// The file position of the statistics page table
|
||||
//
|
||||
// The page table is a matrix of N x 2, where N = length of stats_fields.
|
||||
// This is the same layout as the main page table, except there is always
|
||||
// only one batch.
|
||||
//
|
||||
// For example, to get the stats column 5, we have:
|
||||
// ```text
|
||||
// position = stats_page_table[5][0];
|
||||
// length = stats_page_table[5][1];
|
||||
// ```
|
||||
uint64 page_table_position = 3;
|
||||
}
|
||||
|
||||
StatisticsMetadata statistics = 5;
|
||||
} // Metadata
|
||||
|
||||
// Supported encodings.
|
||||
enum Encoding {
|
||||
// Invalid encoding.
|
||||
NONE = 0;
|
||||
// Plain encoding.
|
||||
PLAIN = 1;
|
||||
// Var-length binary encoding.
|
||||
VAR_BINARY = 2;
|
||||
// Dictionary encoding.
|
||||
DICTIONARY = 3;
|
||||
// Run-length encoding.
|
||||
RLE = 4;
|
||||
}
|
||||
|
||||
// Dictionary field metadata
|
||||
message Dictionary {
|
||||
/// The file offset for storing the dictionary value.
|
||||
/// It is only valid if encoding is DICTIONARY.
|
||||
///
|
||||
/// The logical type represents the value type of the column, i.e., a string value.
|
||||
int64 offset = 1;
|
||||
|
||||
/// The length of dictionary values.
|
||||
int64 length = 2;
|
||||
}
|
||||
|
||||
// Field metadata for a column.
|
||||
message Field {
|
||||
// The structural kind of a field. The kind determines which logical types the
// field may carry (see the comment on `logical_type`).
enum Type {
|
||||
// A parent field; always has the logical type "struct".
PARENT = 0;
|
||||
// A repeated field; logical type is one of "list", "large_list",
// "list.struct" or "large_list.struct".
REPEATED = 1;
|
||||
// A leaf field; logical type is one of the leaf types such as "bool",
// "int32", "string" or "binary".
LEAF = 2;
|
||||
}
|
||||
Type type = 1;
|
||||
|
||||
// Fully qualified name.
|
||||
string name = 2;
|
||||
/// Field Id.
|
||||
///
|
||||
/// See the comment in `DataFile.fields` for how field ids are assigned.
|
||||
int32 id = 3;
|
||||
/// Parent Field ID. If not set, this is a top-level column.
|
||||
int32 parent_id = 4;
|
||||
|
||||
// Logical type; supports parameterized Arrow types.
|
||||
//
|
||||
// PARENT types will always have logical type "struct".
|
||||
//
|
||||
// REPEATED types may have logical types:
|
||||
// * "list"
|
||||
// * "large_list"
|
||||
// * "list.struct"
|
||||
// * "large_list.struct"
|
||||
// The final two are used if the list values are structs, and therefore the
|
||||
// field is both implicitly REPEATED and PARENT.
|
||||
//
|
||||
// LEAF types may have logical types:
|
||||
// * "null"
|
||||
// * "bool"
|
||||
// * "int8" / "uint8"
|
||||
// * "int16" / "uint16"
|
||||
// * "int32" / "uint32"
|
||||
// * "int64" / "uint64"
|
||||
// * "halffloat" / "float" / "double"
|
||||
// * "string" / "large_string"
|
||||
// * "binary" / "large_binary"
|
||||
// * "date32:day"
|
||||
// * "date64:ms"
|
||||
// * "decimal:128:{precision}:{scale}" / "decimal:256:{precision}:{scale}"
|
||||
// * "time:{unit}" / "timestamp:{unit}" / "duration:{unit}", where unit is
|
||||
// "s", "ms", "us", "ns"
|
||||
// * "dict:{value_type}:{index_type}:false"
|
||||
string logical_type = 5;
|
||||
// If this field is nullable.
|
||||
bool nullable = 6;
|
||||
|
||||
// optional field metadata (e.g. extension type name/parameters)
|
||||
map<string, bytes> metadata = 10;
|
||||
|
||||
bool unenforced_primary_key = 12;
|
||||
|
||||
// Position of this field in the primary key (1-based).
|
||||
// 0 means the field is part of the primary key but uses schema field id for ordering.
|
||||
// When set to a positive value, primary key fields are ordered by this position.
|
||||
uint32 unenforced_primary_key_position = 13;
|
||||
|
||||
// Reserved for future use. Use unenforced_clustering_key_position instead.
|
||||
bool unenforced_clustering_key = 14;
|
||||
|
||||
// Position of this field in the clustering key (1-based).
|
||||
// 0 means the field is not part of the clustering key.
|
||||
uint32 unenforced_clustering_key_position = 15;
|
||||
|
||||
// DEPRECATED ----------------------------------------------------------------
|
||||
|
||||
// Deprecated: Only used in V1 file format. V2 uses variable encodings defined
|
||||
// per page.
|
||||
//
|
||||
// The global encoding to use for this field.
|
||||
Encoding encoding = 7;
|
||||
|
||||
// Deprecated: Only used in V1 file format. V2 dynamically chooses when to
|
||||
// do dictionary encoding and keeps the dictionary in the data files.
|
||||
//
|
||||
// The file offset for storing the dictionary value.
|
||||
// It is only valid if encoding is DICTIONARY.
|
||||
//
|
||||
// The logical type represents the value type of the column, i.e., a string value.
|
||||
Dictionary dictionary = 8;
|
||||
|
||||
// Deprecated: optional extension type name, use metadata field
|
||||
// ARROW:extension:name
|
||||
string extension_name = 9;
|
||||
|
||||
// Field number 11 was previously `string storage_class`.
|
||||
// Keep it reserved so older manifests remain compatible while new writers
|
||||
// avoid reusing the slot.
|
||||
reserved 11;
|
||||
reserved "storage_class";
|
||||
}
|
||||
210
vendor/lance-table/protos/file2.proto
vendored
Normal file
210
vendor/lance-table/protos/file2.proto
vendored
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package lance.file.v2;
|
||||
|
||||
import "google/protobuf/any.proto";
|
||||
import "google/protobuf/empty.proto";
|
||||
|
||||
// # Lance v2.X File Format
|
||||
//
|
||||
// The Lance file format is a barebones format for serializing columnar data
|
||||
// into a file.
|
||||
//
|
||||
// * Each Lance file contains between 0 and 4Gi columns
|
||||
// * Each column contains between 0 and 4Gi pages
|
||||
// * Each page contains between 0 and 2^64 items
|
||||
// * Different pages within a column can have different item counts
|
||||
// * Columns may have up to 2^64 items
|
||||
// * Different columns within a file can have different item counts
|
||||
//
|
||||
// The Lance file format does not have any notion of a type system or schemas.
|
||||
// From the perspective of the file format all data is arbitrary buffers of
|
||||
// bytes with an extensible metadata block to describe the data. It is up to
|
||||
// the user to interpret these bytes meaningfully.
|
||||
//
|
||||
// Data buffers are written to the file first. These data buffers can be
|
||||
// referenced from three different places in the file:
|
||||
//
|
||||
// * Page encodings can reference data buffers. This is the most common way
|
||||
// that actual data is stored.
|
||||
// * Column encodings can reference data buffers. For example, a column encoding
|
||||
// may reference data buffer(s) containing statistics or dictionaries.
|
||||
// * Finally, the global buffer offset table can reference data buffers. This
|
||||
// is useful for storing data that is shared across multiple columns.
|
||||
// This is also useful for global file metadata (e.g. a schema that describes
|
||||
// the file)
|
||||
//
|
||||
// ## File Layout
|
||||
//
|
||||
// Note: the number of buffers (BN) is independent of the number of columns (CN)
|
||||
// and pages.
|
||||
//
|
||||
// Buffers often need to be aligned. 64-byte alignment is common when
|
||||
// working with SIMD operations. 4096-byte alignment is common when
|
||||
// working with direct I/O. In order to ensure these buffers are aligned
|
||||
// writers may need to insert padding before the buffers.
|
||||
//
|
||||
// If direct I/O is required then most (but not all) fields described
|
||||
// below must be sector aligned. We have marked these fields with an
|
||||
// asterisk for clarity. Readers should assume there will be optional
|
||||
// padding inserted before these fields.
|
||||
//
|
||||
// All footer fields are unsigned integers written with little endian
|
||||
// byte order.
|
||||
//
|
||||
// ├──────────────────────────────────┤
|
||||
// | Data Pages |
|
||||
// | Data Buffer 0* |
|
||||
// | ... |
|
||||
// | Data Buffer BN* |
|
||||
// ├──────────────────────────────────┤
|
||||
// | Column Metadatas |
|
||||
// | |A| Column 0 Metadata* |
|
||||
// | Column 1 Metadata* |
|
||||
// | ... |
|
||||
// | Column CN Metadata* |
|
||||
// ├──────────────────────────────────┤
|
||||
// | Column Metadata Offset Table |
|
||||
// | |B| Column 0 Metadata Position* |
|
||||
// | Column 0 Metadata Size |
|
||||
// | ... |
|
||||
// | Column CN Metadata Position |
|
||||
// | Column CN Metadata Size |
|
||||
// ├──────────────────────────────────┤
|
||||
// | Global Buffers Offset Table |
|
||||
// | |C| Global Buffer 0 Position* |
|
||||
// | Global Buffer 0 Size |
|
||||
// | ... |
|
||||
// | Global Buffer GN Position |
|
||||
// | Global Buffer GN Size |
|
||||
// ├──────────────────────────────────┤
|
||||
// | Footer |
|
||||
// | A u64: Offset to column meta 0 |
|
||||
// | B u64: Offset to CMO table |
|
||||
// | C u64: Offset to GBO table |
|
||||
// | u32: Number of global bufs |
|
||||
// | u32: Number of columns |
|
||||
// | u16: Major version |
|
||||
// | u16: Minor version |
|
||||
// | "LANC" |
|
||||
// ├──────────────────────────────────┤
|
||||
//
|
||||
// File Layout-End
|
||||
//
|
||||
// ## Data Pages
|
||||
//
|
||||
// A lot of flexibility is provided in how data is stored. A page's buffers do
|
||||
// not strictly need to be contiguous on the disk. However, it is recommended
|
||||
// that buffers within a page be grouped together for best performance.
|
||||
//
|
||||
// Data pages should be large. The only time a page should be written to disk
|
||||
// is when the writer needs to flush the page to disk because it has accumulated
|
||||
// too much data. Pages are not read in sequential order and if pages are too
|
||||
// small then the seek overhead (or request overhead) will be problematic. We
|
||||
// generally advise that pages be at least 8MB.
|
||||
//
|
||||
// ## Encodings
|
||||
//
|
||||
// Specific encodings are not part of this minimal format. They are provided
|
||||
// by extensions. Readers and writers should be designed so that encodings can
|
||||
// be easily added and removed. Ideally, they should allow for this without
|
||||
// requiring recompilation through some kind of plugin system.
|
||||
|
||||
// The deferred encoding is used to place the encoding itself in a different
|
||||
// part of the file. This is most commonly used to allow encodings to be shared
|
||||
// across different columns. For example, when writing a file with thousands of
|
||||
// columns, where many pages have the exact same encoding, it can be useful
|
||||
// to cut down on the size of the metadata by using a deferred encoding.
|
||||
message DeferredEncoding {
|
||||
// Location of the buffer containing the encoding.
|
||||
//
|
||||
// * If sharing encodings across columns then this will be in a global buffer
|
||||
// * If sharing encodings across pages within a column this could be in a
|
||||
// column metadata buffer.
|
||||
// * This could also be a page buffer if the encoding is not shared, needs
|
||||
// to be written before the file ends, and the encoding is too large to load
|
||||
// unless we first determine the page needs to be read. This combination
|
||||
// seems unusual.
|
||||
uint64 buffer_location = 1;
|
||||
// Length of the buffer containing the encoding.
// NOTE(review): presumably a size in bytes, like the `buffer_sizes` fields of
// ColumnMetadata -- confirm.
uint64 buffer_length = 2;
|
||||
}
|
||||
|
||||
// The encoding is placed directly in the metadata section
|
||||
message DirectEncoding {
|
||||
// The bytes that make up the encoding embedded directly in the metadata
|
||||
//
|
||||
// This is the most common approach.
|
||||
bytes encoding = 1;
|
||||
}
|
||||
|
||||
// An encoding stores the information needed to decode a column or page
|
||||
//
|
||||
// For example, it could describe if the page is using bit packing, and how many bits
|
||||
// there are in each individual value.
|
||||
//
|
||||
// At the column level it can be used to wrap columns with dictionaries or statistics.
|
||||
message Encoding {
|
||||
oneof location {
|
||||
// The encoding is stored elsewhere and not part of this protobuf message
|
||||
DeferredEncoding indirect = 1;
|
||||
// The encoding is stored within this protobuf message
|
||||
DirectEncoding direct = 2;
|
||||
// There is no encoding information
|
||||
google.protobuf.Empty none = 3;
|
||||
}
|
||||
}
|
||||
|
||||
// ## Metadata
|
||||
|
||||
// Each column has a metadata block that is placed at the end of the file.
|
||||
// These may be read individually to allow for column projection.
|
||||
message ColumnMetadata {
|
||||
|
||||
// This describes a page of column data.
|
||||
message Page {
|
||||
// The file offsets for each of the page buffers
|
||||
//
|
||||
// The number of buffers is variable and depends on the encoding. There
|
||||
// may be zero buffers (e.g. constant encoded data) in which case this
|
||||
// could be empty.
|
||||
repeated uint64 buffer_offsets = 1;
|
||||
// The size (in bytes) of each of the page buffers
|
||||
//
|
||||
// This field will have the same length as `buffer_offsets` and
|
||||
// may be empty.
|
||||
repeated uint64 buffer_sizes = 2;
|
||||
// Logical length (e.g. # rows) of the page
|
||||
uint64 length = 3;
|
||||
// The encoding used to encode the page
|
||||
Encoding encoding = 4;
|
||||
// The priority of the page
|
||||
//
|
||||
// For tabular data this will be the top-level row number of the first row
|
||||
// in the page (and top-level rows should not split across pages).
|
||||
uint64 priority = 5;
|
||||
}
|
||||
// Encoding information about the column itself. This typically describes
|
||||
// how to interpret the column metadata buffers. For example, it could
|
||||
// describe how statistics or dictionaries are stored in the column metadata.
|
||||
Encoding encoding = 1;
|
||||
// The pages in the column
|
||||
repeated Page pages = 2;
|
||||
// The file offsets of each of the column metadata buffers
|
||||
//
|
||||
// There may be zero buffers.
|
||||
repeated uint64 buffer_offsets = 3;
|
||||
// The size (in bytes) of each of the column metadata buffers
|
||||
//
|
||||
// This field will have the same length as `buffer_offsets` and
|
||||
// may be empty.
|
||||
repeated uint64 buffer_sizes = 4;
|
||||
} // Metadata-End
|
||||
|
||||
// ## Where is the rest?
|
||||
//
|
||||
// This file format is extremely minimal. It is a building block for
|
||||
// creating more useful readers and writers and not terribly useful by itself.
|
||||
// Other protobuf files will describe how this can be extended.
|
||||
99
vendor/lance-table/protos/filtered_read.proto
vendored
Normal file
99
vendor/lance-table/protos/filtered_read.proto
vendored
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package lance.datafusion;
|
||||
|
||||
import "table_identifier.proto";
|
||||
|
||||
message U64Range {
|
||||
uint64 start = 1;
|
||||
uint64 end = 2;
|
||||
}
|
||||
|
||||
message ProjectionProto {
|
||||
repeated int32 field_ids = 1;
|
||||
bool with_row_id = 2;
|
||||
bool with_row_addr = 3;
|
||||
bool with_row_last_updated_at_version = 4;
|
||||
bool with_row_created_at_version = 5;
|
||||
BlobHandlingProto blob_handling = 6;
|
||||
}
|
||||
|
||||
message BlobHandlingProto {
|
||||
oneof mode {
|
||||
// All blobs read as binary
|
||||
bool all_binary = 1;
|
||||
// Blobs as descriptions, other binary as binary (default)
|
||||
bool blobs_descriptions = 2;
|
||||
// All binary columns as descriptions
|
||||
bool all_descriptions = 3;
|
||||
// Specific blobs read as binary, rest as descriptions (non-blob binary stays binary)
|
||||
FieldIdSet some_blobs_binary = 4;
|
||||
// Specific columns as binary, all other binary as descriptions
|
||||
FieldIdSet some_binary = 5;
|
||||
}
|
||||
}
|
||||
|
||||
message FieldIdSet {
|
||||
repeated uint32 field_ids = 1;
|
||||
}
|
||||
|
||||
message FilteredReadThreadingModeProto {
|
||||
oneof mode {
|
||||
uint64 one_partition_multiple_threads = 1;
|
||||
uint64 multiple_partitions = 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Serializable form of FilteredReadOptions.
|
||||
message FilteredReadOptionsProto {
|
||||
optional U64Range scan_range_before_filter = 1;
|
||||
optional U64Range scan_range_after_filter = 2;
|
||||
bool with_deleted_rows = 3;
|
||||
optional uint32 batch_size = 4;
|
||||
optional uint64 fragment_readahead = 5;
|
||||
repeated uint64 fragment_ids = 6;
|
||||
ProjectionProto projection = 7;
|
||||
optional bytes refine_filter_substrait = 8;
|
||||
optional bytes full_filter_substrait = 9;
|
||||
FilteredReadThreadingModeProto threading_mode = 10;
|
||||
optional uint64 io_buffer_size_bytes = 11;
|
||||
// Arrow IPC schema for decoding Substrait filters (may be wider than projection).
|
||||
optional bytes filter_schema_ipc = 12;
|
||||
}
|
||||
|
||||
// Serializable form of FilteredReadPlan (planned/distributed mode).
|
||||
// RowAddrTreeMap serialized via its built-in serialize_into/deserialize_from.
|
||||
// Per-fragment filters are Substrait-encoded and deduplicated.
|
||||
message FilteredReadPlanProto {
|
||||
bytes row_addr_tree_map = 1;
|
||||
optional U64Range scan_range_after_filter = 2;
|
||||
// Arrow IPC schema for decoding Substrait filters (matches the schema used at encode time).
|
||||
optional bytes filter_schema_ipc = 3;
|
||||
// Per-fragment filter mapping. Key is fragment id, value is a list index into
|
||||
// filter_expressions. Multiple fragments can share the same list index when
|
||||
// they have the same filter, avoiding duplicate Substrait encoding.
|
||||
map<uint32, uint32> fragment_filter_ids = 4;
|
||||
// Deduplicated Substrait-encoded filter expressions. Each entry is referenced
|
||||
// by one or more values in fragment_filter_ids.
|
||||
repeated bytes filter_expressions = 5;
|
||||
}
|
||||
|
||||
// Top-level wrapper for FilteredReadExec serialization.
|
||||
message FilteredReadExecProto {
|
||||
TableIdentifier table = 1;
|
||||
FilteredReadOptionsProto options = 2;
|
||||
// FilteredRead has two modes
|
||||
// Plan-then-execute (distributed): The planner creates a FilteredReadPlan and sends it to a remote executor.
|
||||
// Plan-and-execute (local): The executor creates the plan itself at execution time.
|
||||
optional FilteredReadPlanProto plan = 3;
|
||||
// Note: FilteredReadExec.index_input (child ExecutionPlan) is NOT serialized here.
|
||||
// DataFusion's PhysicalExtensionCodec handles child plans automatically: it walks
|
||||
// the plan tree via children() / with_new_children(), serializes each node, and
|
||||
// passes deserialized children back as the `inputs` parameter in try_decode.
|
||||
// This means any ExecutionPlan in the tree (including index_input) must also
|
||||
// implement try_encode/try_decode in the PhysicalExtensionCodec.
|
||||
// TODO: implement serialize/deserialize for lance-specific index input ExecutionPlans.
|
||||
}
|
||||
249
vendor/lance-table/protos/index.proto
vendored
Normal file
249
vendor/lance-table/protos/index.proto
vendored
Normal file
|
|
@ -0,0 +1,249 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package lance.index.pb;
|
||||
|
||||
import "google/protobuf/any.proto";
|
||||
|
||||
// The type of an index.
|
||||
enum IndexType {
|
||||
// Vector index
|
||||
VECTOR = 0;
|
||||
}
|
||||
|
||||
message Index {
|
||||
// The unique index name in the dataset.
|
||||
string name = 1;
|
||||
|
||||
// Columns to be used to build the index.
|
||||
repeated string columns = 2;
|
||||
|
||||
// The version of the dataset this index was built from.
|
||||
uint64 dataset_version = 3;
|
||||
|
||||
// The [`IndexType`] of the index.
|
||||
IndexType index_type = 4;
|
||||
|
||||
/// Index implementation details.
|
||||
oneof implementation {
|
||||
VectorIndex vector_index = 5;
|
||||
}
|
||||
}
|
||||
|
||||
message Tensor {
|
||||
enum DataType {
|
||||
BFLOAT16 = 0;
|
||||
FLOAT16 = 1;
|
||||
FLOAT32 = 2;
|
||||
FLOAT64 = 3;
|
||||
UINT8 = 4;
|
||||
UINT16 = 5;
|
||||
UINT32 = 6;
|
||||
UINT64 = 7;
|
||||
}
|
||||
|
||||
DataType data_type = 1;
|
||||
|
||||
// Data shape, [dim1, dim2, ...]
|
||||
repeated uint32 shape = 2;
|
||||
|
||||
// Data buffer
|
||||
bytes data = 3;
|
||||
}
|
||||
|
||||
// Inverted Index File Metadata.
|
||||
message IVF {
|
||||
// Centroids of partitions. `dimension * num_partitions` of float32s.
|
||||
//
|
||||
// Deprecated, use centroids_tensor instead.
|
||||
repeated float centroids = 1; // [deprecated = true];
|
||||
|
||||
// File offset of each partition.
|
||||
repeated uint64 offsets = 2;
|
||||
|
||||
// Number of records in the partition.
|
||||
repeated uint32 lengths = 3;
|
||||
|
||||
// Tensor of centroids. `num_partitions * dimension` of float32s.
|
||||
Tensor centroids_tensor = 4;
|
||||
|
||||
// KMeans loss.
|
||||
optional double loss = 5;
|
||||
}
|
||||
|
||||
// Product Quantization.
|
||||
message PQ {
|
||||
// The number of bits used to represent a centroid.
|
||||
uint32 num_bits = 1;
|
||||
|
||||
// Number of sub vectors.
|
||||
uint32 num_sub_vectors = 2;
|
||||
|
||||
// Vector dimension
|
||||
uint32 dimension = 3;
|
||||
|
||||
// Codebook. `dimension * 2 ^ num_bits` of float32s.
|
||||
repeated float codebook = 4;
|
||||
|
||||
// Tensor of codebook. `2 ^ num_bits * dimension` of floats.
|
||||
Tensor codebook_tensor = 5;
|
||||
}
|
||||
|
||||
// Transform type
|
||||
enum TransformType {
|
||||
OPQ = 0;
|
||||
}
|
||||
|
||||
// A transform matrix to apply to a vector or vectors.
|
||||
message Transform {
|
||||
// The file offset the matrix is stored
|
||||
uint64 position = 1;
|
||||
|
||||
// Data shape of the matrix, [rows, cols].
|
||||
repeated uint32 shape = 2;
|
||||
|
||||
// Transform type.
|
||||
TransformType type = 3;
|
||||
}
|
||||
|
||||
// Flat Index
|
||||
message Flat {}
|
||||
|
||||
// DiskAnn Index
|
||||
message DiskAnn {
|
||||
// Graph spec version
|
||||
uint32 spec = 1;
|
||||
|
||||
// Graph file
|
||||
string filename = 2;
|
||||
|
||||
// r parameter
|
||||
uint32 r = 3;
|
||||
|
||||
// alpha parameter
|
||||
float alpha = 4;
|
||||
|
||||
// L parameter
|
||||
uint32 L = 5;
|
||||
|
||||
/// Entry points to the graph
|
||||
repeated uint64 entries = 6;
|
||||
}
|
||||
|
||||
// One stage in the vector index pipeline.
|
||||
message VectorIndexStage {
|
||||
oneof stage {
|
||||
// Flat index
|
||||
Flat flat = 1;
|
||||
// `IVF` - Inverted File
|
||||
IVF ivf = 2;
|
||||
// Product Quantization
|
||||
PQ pq = 3;
|
||||
// Transformer
|
||||
Transform transform = 4;
|
||||
// DiskANN
|
||||
DiskAnn diskann = 5;
|
||||
}
|
||||
}
|
||||
|
||||
// Metric Type for Vector Index
|
||||
enum VectorMetricType {
|
||||
// L2 (Euclidean) Distance
|
||||
L2 = 0;
|
||||
|
||||
// Cosine Distance
|
||||
Cosine = 1;
|
||||
|
||||
// Dot Product
|
||||
Dot = 2;
|
||||
|
||||
// Hamming Distance
|
||||
Hamming = 3;
|
||||
}
|
||||
|
||||
// Vector Index Metadata
|
||||
message VectorIndex {
|
||||
// Index specification version.
|
||||
uint32 spec_version = 1;
|
||||
|
||||
// Vector dimension.
|
||||
uint32 dimension = 2;
|
||||
|
||||
// Composed vector index stages.
|
||||
//
|
||||
// For example, `IVF_PQ` index type can be expressed as:
|
||||
//
|
||||
// ```text
|
||||
// let stages = vec![Ivf{}, PQ{num_bits: 8, num_sub_vectors: 16}]
|
||||
// ```
|
||||
repeated VectorIndexStage stages = 3;
|
||||
|
||||
// Vector distance metrics type
|
||||
VectorMetricType metric_type = 4;
|
||||
}
|
||||
|
||||
// Details for vector indexes, stored in the manifest's index_details field.
|
||||
message VectorIndexDetails {
|
||||
VectorMetricType metric_type = 1;
|
||||
|
||||
// The target number of vectors per partition.
|
||||
// 0 means unset.
|
||||
uint64 target_partition_size = 2;
|
||||
|
||||
// Optional HNSW index configuration. If set, the index has an HNSW layer.
|
||||
optional HnswParameters hnsw_index_config = 3;
|
||||
|
||||
message ProductQuantization {
|
||||
uint32 num_bits = 1;
|
||||
uint32 num_sub_vectors = 2;
|
||||
}
|
||||
message ScalarQuantization {
|
||||
uint32 num_bits = 1;
|
||||
}
|
||||
message RabitQuantization {
|
||||
enum RotationType {
|
||||
FAST = 0;
|
||||
MATRIX = 1;
|
||||
}
|
||||
uint32 num_bits = 1;
|
||||
RotationType rotation_type = 2;
|
||||
}
|
||||
|
||||
// No quantization; vectors are stored as-is.
|
||||
message FlatCompression {}
|
||||
|
||||
oneof compression {
|
||||
ProductQuantization pq = 4;
|
||||
ScalarQuantization sq = 5;
|
||||
RabitQuantization rq = 6;
|
||||
FlatCompression flat = 8;
|
||||
}
|
||||
|
||||
// Runtime hints: optional build preferences that don't affect index structure.
|
||||
// Keys use reverse-DNS namespacing (e.g., "lance.ivf.max_iters", "lancedb.accelerator").
|
||||
// Unrecognized keys must be silently ignored by all runtimes.
|
||||
map<string, string> runtime_hints = 9;
|
||||
}
|
||||
|
||||
// Hierarchical Navigable Small World (HNSW) parameters, used as an optional configuration for IVF indexes.
|
||||
message HnswParameters {
|
||||
// The maximum number of outgoing edges per node in the HNSW graph. Higher values
|
||||
// mean more connections and better recall, but more memory and slower builds.
|
||||
// Referred to as "M" in the HNSW literature.
|
||||
uint32 max_connections = 1;
|
||||
// "construction exploration factor": The size of the dynamic list used during
|
||||
// index construction.
|
||||
uint32 construction_ef = 2;
|
||||
// The maximum number of levels in the HNSW graph.
|
||||
uint32 max_level = 3;
|
||||
}
|
||||
|
||||
message JsonIndexDetails {
|
||||
string path = 1;
|
||||
google.protobuf.Any target_details = 2;
|
||||
}
|
||||
message BloomFilterIndexDetails {}
|
||||
|
||||
message RTreeIndexDetails {}
|
||||
42
vendor/lance-table/protos/index_old.proto
vendored
Normal file
42
vendor/lance-table/protos/index_old.proto
vendored
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package lance.table;
|
||||
|
||||
// NOTE: Do *NOT* add new index details here. Add them to the index.proto file instead.
|
||||
// This file is in the lance.table package namespace while the index.proto file is in the
|
||||
// lance.index.pb package namespace.
|
||||
//
|
||||
// These are only here for forward compatibility. Older versions of Lance expect btree indexes
|
||||
// to have lance.table in the package namespace.
|
||||
//
|
||||
// If you need to modify these messages (e.g. to add new fields to btree or bitmap) then
|
||||
// it is ok to modify them here.
|
||||
|
||||
// Currently many of these are empty messages because all needed details are either hard-coded (e.g.
|
||||
// filenames) or stored in the index itself. However, we may want to add more details in the
|
||||
// future, in particular we can add details that may be useful for planning queries (e.g. don't
|
||||
// force us to load the index until we know we can make use of it)
|
||||
|
||||
message BTreeIndexDetails {}
|
||||
message BitmapIndexDetails {}
|
||||
message LabelListIndexDetails {}
|
||||
message NGramIndexDetails {}
|
||||
message ZoneMapIndexDetails {}
|
||||
message InvertedIndexDetails {
|
||||
// Marking this field as optional as old versions of the index store blank details and we
|
||||
// need to make sure we have a proper optional field to detect this.
|
||||
optional string base_tokenizer = 1;
|
||||
string language = 2;
|
||||
bool with_position = 3;
|
||||
optional uint32 max_token_length = 4;
|
||||
bool lower_case = 5;
|
||||
bool stem = 6;
|
||||
bool remove_stop_words = 7;
|
||||
bool ascii_folding = 8;
|
||||
uint32 min_ngram_length = 9;
|
||||
uint32 max_ngram_length = 10;
|
||||
bool prefix_only = 11;
|
||||
}
|
||||
2
vendor/lance-table/protos/license_header.txt
vendored
Normal file
2
vendor/lance-table/protos/license_header.txt
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
113
vendor/lance-table/protos/rowids.proto
vendored
Normal file
113
vendor/lance-table/protos/rowids.proto
vendored
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package lance.table;
|
||||
// TODO: what would it take to store this in a LanceV2 file?
|
||||
// Or would flatbuffers be better for this?
|
||||
|
||||
/// A sequence of row IDs. This is split up into one or more segments,
|
||||
/// each of which can be encoded in different ways. The encodings are optimized
|
||||
/// for values that are sorted, which will often be the case with row ids.
|
||||
/// They also have optimized forms depending on how sparse the values are.
|
||||
message RowIdSequence {
|
||||
repeated U64Segment segments = 1;
|
||||
}
|
||||
|
||||
/// Different ways to encode a sequence of u64 values.
|
||||
message U64Segment {
|
||||
/// A range of u64 values.
|
||||
message Range {
|
||||
/// The start of the range, inclusive.
|
||||
uint64 start = 1;
|
||||
/// The end of the range, exclusive.
|
||||
uint64 end = 2;
|
||||
}
|
||||
|
||||
/// A range of u64 values with holes.
|
||||
message RangeWithHoles {
|
||||
/// The start of the range, inclusive.
|
||||
uint64 start = 1;
|
||||
/// The end of the range, exclusive.
|
||||
uint64 end = 2;
|
||||
/// The holes in the range, as a sorted array of values.
|
||||
/// Binary search can be used to check whether a value is a hole and should
|
||||
/// be skipped. This can also be used to count the number of holes before a
|
||||
/// given value, if you need to find the logical offset of a value in the
|
||||
/// segment.
|
||||
EncodedU64Array holes = 3;
|
||||
}
|
||||
|
||||
/// A range of u64 values with a bitmap.
|
||||
message RangeWithBitmap {
|
||||
/// The start of the range, inclusive.
|
||||
uint64 start = 1;
|
||||
/// The end of the range, exclusive.
|
||||
uint64 end = 2;
|
||||
/// A bitmap of the values in the range. The bitmap is a sequence of bytes,
|
||||
/// where each byte represents 8 values. The first byte represents values
|
||||
/// start to start + 7, the second byte represents values start + 8 to
|
||||
/// start + 15, and so on. The most significant bit of each byte represents
|
||||
/// the first value in the range, and the least significant bit represents
|
||||
/// the last value in the range. If the bit is set, the value is in the
|
||||
/// range; if it is not set, the value is not in the range.
|
||||
bytes bitmap = 3;
|
||||
}
|
||||
|
||||
oneof segment {
|
||||
/// When the values are sorted and contiguous.
|
||||
Range range = 1;
|
||||
/// When the values are sorted but have a few gaps.
|
||||
RangeWithHoles range_with_holes = 2;
|
||||
/// When the values are sorted but have many gaps.
|
||||
RangeWithBitmap range_with_bitmap = 3;
|
||||
/// When the values are sorted but are sparse.
|
||||
EncodedU64Array sorted_array = 4;
|
||||
/// A general array of values, which is not sorted.
|
||||
EncodedU64Array array = 5;
|
||||
}
|
||||
} // RowIdSegment
|
||||
|
||||
/// A basic bitpacked array of u64 values.
|
||||
message EncodedU64Array {
|
||||
message U16Array {
|
||||
uint64 base = 1;
|
||||
/// The deltas are stored as 16-bit unsigned integers.
|
||||
/// (protobuf doesn't support 16-bit integers, so we use bytes instead)
|
||||
bytes offsets = 2;
|
||||
}
|
||||
|
||||
message U32Array {
|
||||
uint64 base = 1;
|
||||
/// The deltas are stored as 32-bit unsigned integers.
|
||||
/// (we use bytes instead of uint32 to avoid overhead of varint encoding)
|
||||
bytes offsets = 2;
|
||||
}
|
||||
|
||||
message U64Array {
|
||||
/// (We use bytes instead of uint64 to avoid overhead of varint encoding)
|
||||
bytes values = 2;
|
||||
}
|
||||
|
||||
oneof array {
|
||||
U16Array u16_array = 1;
|
||||
U32Array u32_array = 2;
|
||||
U64Array u64_array = 3;
|
||||
}
|
||||
}
|
||||
|
||||
/// A sequence of dataset versions. Similar to RowIdSequence but tracks
|
||||
/// version runs. It uses RLE (Run-Length Encoding) to efficiently
|
||||
/// represent consecutive rows with the same version.
|
||||
message RowDatasetVersionSequence {
|
||||
repeated RowDatasetVersionRun runs = 1;
|
||||
}
|
||||
|
||||
/// A run of rows with the same version.
|
||||
message RowDatasetVersionRun {
|
||||
/// The number of consecutive rows with the same version.
|
||||
U64Segment span = 1;
|
||||
|
||||
uint64 version = 2;
|
||||
}
|
||||
717
vendor/lance-table/protos/table.proto
vendored
Normal file
717
vendor/lance-table/protos/table.proto
vendored
Normal file
|
|
@ -0,0 +1,717 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package lance.table;
|
||||
|
||||
import "google/protobuf/any.proto";
|
||||
import "google/protobuf/timestamp.proto";
|
||||
import "file.proto";
|
||||
|
||||
/*
|
||||
|
||||
Format:
|
||||
|
||||
+----------------------------------------+
|
||||
| Encoded Column 0, Chunk 0 |
|
||||
...
|
||||
| Encoded Column M, Chunk N - 1 |
|
||||
| Encoded Column M, Chunk N |
|
||||
| Indices ... |
|
||||
| Chunk Position (M x N x 8) |
|
||||
| Manifest (Optional) |
|
||||
| Metadata |
|
||||
| i64: metadata position |
|
||||
| MAJOR_VERSION | MINOR_VERSION | "LANC" |
|
||||
+----------------------------------------+
|
||||
*/
|
||||
|
||||
// UUID type. encoded as 16 bytes.
|
||||
message UUID {
|
||||
bytes uuid = 1;
|
||||
}
|
||||
|
||||
// Manifest is a global section shared between all the files.
|
||||
message Manifest {
|
||||
// All fields of the dataset, including the nested fields.
|
||||
repeated lance.file.Field fields = 1;
|
||||
|
||||
// Schema metadata.
|
||||
map<string, bytes> schema_metadata = 5;
|
||||
|
||||
// Fragments of the dataset.
|
||||
repeated DataFragment fragments = 2;
|
||||
|
||||
// Snapshot version number.
|
||||
uint64 version = 3;
|
||||
|
||||
// The file position of the version auxiliary data.
|
||||
// * It is not inheritable between versions.
|
||||
// * It is not loaded by default during query.
|
||||
uint64 version_aux_data = 4;
|
||||
|
||||
message WriterVersion {
|
||||
// The name of the library that created this file.
|
||||
string library = 1;
|
||||
// The version of the library that created this file. Because we cannot assume
|
||||
// that the library is semantically versioned, this is a string. However, if it
|
||||
// is semantically versioned, it should be a valid semver string without any 'v'
|
||||
// prefix. For example: `2.0.0`, `2.0.0-rc.1`.
|
||||
//
|
||||
// For forward compatibility with older readers, when writing new manifests this
|
||||
// field should contain only the core version (major.minor.patch) without any
|
||||
// prerelease or build metadata. The prerelease/build info should be stored in
|
||||
// the separate prerelease and build_metadata fields instead.
|
||||
string version = 2;
|
||||
// Optional semver prerelease identifier.
|
||||
//
|
||||
// This field stores the prerelease portion of a semantic version separately
|
||||
// from the core version number. For example, if the full version is "2.0.0-rc.1",
|
||||
// the version field would contain "2.0.0" and prerelease would contain "rc.1".
|
||||
//
|
||||
// This separation ensures forward compatibility: older readers can parse the
|
||||
// clean version field without errors, while newer readers can reconstruct the
|
||||
// full semantic version by combining version, prerelease, and build_metadata.
|
||||
//
|
||||
// If absent, the version field is used as-is.
|
||||
optional string prerelease = 3;
|
||||
// Optional semver build metadata.
|
||||
//
|
||||
// This field stores the build metadata portion of a semantic version separately
|
||||
// from the core version number. For example, if the full version is
|
||||
// "2.0.0-rc.1+build.123", the version field would contain "2.0.0", prerelease
|
||||
// would contain "rc.1", and build_metadata would contain "build.123".
|
||||
//
|
||||
// If absent, no build metadata is present.
|
||||
optional string build_metadata = 4;
|
||||
}
|
||||
|
||||
// The version of the writer that created this file.
|
||||
//
|
||||
// This information may be used to detect whether the file may have known bugs
|
||||
// associated with that writer.
|
||||
WriterVersion writer_version = 13;
|
||||
|
||||
// If present, the file position of the index metadata.
|
||||
optional uint64 index_section = 6;
|
||||
|
||||
// Version creation Timestamp, UTC timezone
|
||||
google.protobuf.Timestamp timestamp = 7;
|
||||
|
||||
// Optional version tag
|
||||
string tag = 8;
|
||||
|
||||
// Feature flags for readers.
|
||||
//
|
||||
// A bitmap of flags that indicate which features are required to be able to
|
||||
// read the table. If a reader does not recognize a flag that is set, it
|
||||
// should not attempt to read the dataset.
|
||||
//
|
||||
// Known flags:
|
||||
// * 1: deletion files are present
|
||||
// * 2: row ids are stable and stored as part of the fragment metadata.
|
||||
// * 4: use v2 format (deprecated)
|
||||
// * 8: table config is present
|
||||
uint64 reader_feature_flags = 9;
|
||||
|
||||
// Feature flags for writers.
|
||||
//
|
||||
// A bitmap of flags that indicate which features must be used when writing to the
|
||||
// dataset. If a writer does not recognize a flag that is set, it should not attempt to
|
||||
// write to the dataset.
|
||||
//
|
||||
// The flag identities are the same as for reader_feature_flags, but the values of
|
||||
// reader_feature_flags and writer_feature_flags are not required to be identical.
|
||||
uint64 writer_feature_flags = 10;
|
||||
|
||||
// The highest fragment ID that has been used so far.
|
||||
//
|
||||
// This ID is not guaranteed to be present in the current version, but it may
|
||||
// have been used in previous versions.
|
||||
//
|
||||
// For a single fragment, will be zero. For no fragments, will be absent.
|
||||
optional uint32 max_fragment_id = 11;
|
||||
|
||||
// Path to the transaction file, relative to `{root}/_transactions`. The file at that
|
||||
// location contains a wire-format serialized Transaction message representing the
|
||||
// transaction that created this version.
|
||||
//
|
||||
// This string field "transaction_file" may be empty if no transaction file was written.
|
||||
//
|
||||
// The path format is "{read_version}-{uuid}.txn" where {read_version} is the version of
|
||||
// the table the transaction read from (serialized to decimal with no padding digits),
|
||||
// and {uuid} is a hyphen-separated UUID.
|
||||
string transaction_file = 12;
|
||||
|
||||
// The file position of the transaction content. None if transaction is empty
|
||||
// This transaction content begins with the transaction content length as u32
|
||||
// If the transaction proto message has a length of `len`, the message ends at `len` + 4
|
||||
optional uint64 transaction_section = 21;
|
||||
|
||||
// The next unused row id. If zero, then the table does not have any rows.
|
||||
//
|
||||
// This is only used if the "stable_row_ids" feature flag is set.
|
||||
uint64 next_row_id = 14;
|
||||
|
||||
message DataStorageFormat {
|
||||
// The format of the data files (e.g. "lance")
|
||||
string file_format = 1;
|
||||
// The max format version of the data files. The format of the version can vary by
|
||||
// file_format and is not required to follow semver.
|
||||
//
|
||||
// Every file in this version of the dataset has the same file_format version.
|
||||
string version = 2;
|
||||
}
|
||||
|
||||
// The data storage format
|
||||
//
|
||||
// This specifies what format is used to store the data files.
|
||||
DataStorageFormat data_format = 15;
|
||||
|
||||
// Table config.
|
||||
//
|
||||
// Keys with the prefix "lance." are reserved for the Lance library. Other
|
||||
// libraries may wish to similarly prefix their configuration keys
|
||||
// appropriately.
|
||||
map<string, string> config = 16;
|
||||
|
||||
// Metadata associated with the table.
|
||||
//
|
||||
// This is a key-value map that can be used to store arbitrary metadata
|
||||
// associated with the table.
|
||||
//
|
||||
// This is different than configuration, which is used to tell libraries how
|
||||
// to read, write, or manage the table.
|
||||
//
|
||||
// This is different than schema metadata, which is used to describe the
|
||||
// data itself and is attached to the output schema of scans.
|
||||
map<string, string> table_metadata = 19;
|
||||
|
||||
// Field number 17 (`blob_dataset_version`) was used for a secondary blob dataset.
|
||||
reserved 17;
|
||||
reserved "blob_dataset_version";
|
||||
|
||||
// The base paths of data files.
|
||||
//
|
||||
// This is used to determine the base path of a data file. In common cases data file paths are under current dataset base path.
|
||||
// But for shallow cloning, importing files, and other multi-tier storage cases, the actual data files could be outside of the current dataset.
|
||||
// This field is used with the `base_id` in `lance.file.File` and `lance.file.DeletionFile`.
|
||||
//
|
||||
// For example, given a dataset with base path `s3://bucket/dataset` and a DataFile with base_id 0, the actual data file path is:
|
||||
// base_paths[id = 0] + /data/ + file.path
|
||||
// The key (a.k.a. index) starts from 0 and increases by 1 for each new base path.
|
||||
repeated BasePath base_paths = 18;
|
||||
|
||||
// The branch of the dataset. None means main branch.
|
||||
optional string branch = 20;
|
||||
} // Manifest
|
||||
|
||||
// external dataset base path
|
||||
message BasePath {
|
||||
uint32 id = 1;
|
||||
// This is an alias name of the base path, it is optional.
|
||||
// When we use shallow clone and the target version is a tag, the tag name will be set here.
|
||||
optional string name = 2;
|
||||
// Flag indicating whether this path is a dataset root path or file directory:
|
||||
// - true: Path is a dataset root (actual files under subdirectories like `data`, '_deletions')
|
||||
// - false: Path is a direct file directory (scenario like importing files)
|
||||
bool is_dataset_root = 3;
|
||||
// Note: This absolute path will be used directly by Path::parse().
|
||||
string path = 4;
|
||||
}
|
||||
|
||||
// Auxiliary Data attached to a version.
|
||||
// Only load on-demand.
|
||||
message VersionAuxData {
|
||||
// key-value metadata.
|
||||
map<string, bytes> metadata = 3;
|
||||
}
|
||||
|
||||
// Metadata describing an index.
|
||||
message IndexMetadata {
|
||||
// Unique ID of an index. It is unique across all the dataset versions.
|
||||
UUID uuid = 1;
|
||||
|
||||
// The columns to build the index. These refer to file.Field.id.
|
||||
repeated int32 fields = 2;
|
||||
|
||||
// Index name. Must be unique within one dataset version.
|
||||
string name = 3;
|
||||
|
||||
// The version of the dataset this index was built from.
|
||||
uint64 dataset_version = 4;
|
||||
|
||||
// A bitmap of the included fragment ids.
|
||||
//
|
||||
// This may be used to determine how much of the dataset is covered by the
|
||||
// index. This information can be retrieved from the dataset by looking at
|
||||
// the dataset at `dataset_version`. However, since the old version may be
|
||||
// deleted while the index is still in use, this information is also stored
|
||||
// in the index.
|
||||
//
|
||||
// The bitmap is stored as a 32-bit Roaring bitmap.
|
||||
bytes fragment_bitmap = 5;
|
||||
|
||||
// Details, specific to the index type, which are needed to load / interpret the index
|
||||
//
|
||||
// Indices should avoid putting large amounts of information in this field, as it will
|
||||
// bloat the manifest.
|
||||
//
|
||||
// Indexes are plugins, and so the format of the details message is flexible and not fully
|
||||
// defined by the table format. However, there are some conventions that should be followed:
|
||||
//
|
||||
// - When Lance APIs refer to indexes they will use the type URL of the index details as the
|
||||
// identifier for the index type. If a user provides a simple string identifier like
|
||||
// "btree" then it will be converted to "/lance.table.BTreeIndexDetails"
|
||||
// - Type URL comparisons are case-insensitive. Therefore an index must have a unique type
|
||||
// URL ignoring case.
|
||||
google.protobuf.Any index_details = 6;
|
||||
|
||||
// The minimum lance version that this index is compatible with.
|
||||
optional int32 index_version = 7;
|
||||
|
||||
// Timestamp when the index was created (UTC timestamp in milliseconds since epoch)
|
||||
//
|
||||
// This field is optional for backward compatibility. For existing indices created before
|
||||
// this field was added, this will be None/null.
|
||||
optional uint64 created_at = 8;
|
||||
|
||||
// The base path index of the data file. Used when the file is imported or referred from another dataset.
|
||||
// Lance uses it as the key into the base_paths field in Manifest to determine the actual base path of the data file.
|
||||
optional uint32 base_id = 9;
|
||||
|
||||
// List of files and their sizes for this index segment.
|
||||
// This enables skipping HEAD calls when opening indices and allows reporting
|
||||
// of index sizes without extra IO.
|
||||
// If this is empty, the index files sizes are unknown.
|
||||
repeated IndexFile files = 10;
|
||||
}
|
||||
|
||||
// Metadata about a single file within an index segment.
|
||||
message IndexFile {
|
||||
// Path relative to the index directory (e.g., "index.idx", "auxiliary.idx")
|
||||
string path = 1;
|
||||
// Size of the file in bytes
|
||||
uint64 size_bytes = 2;
|
||||
}
|
||||
|
||||
// Index Section, containing a list of index metadata for one dataset version.
|
||||
message IndexSection {
|
||||
repeated IndexMetadata indices = 1;
|
||||
}
|
||||
|
||||
// A DataFragment is a set of files which represent the different columns of the same
|
||||
// rows. If a column exists in the schema of a dataset, but the file for that column does
|
||||
// not exist within a DataFragment of that dataset, that column consists entirely of
|
||||
// nulls.
|
||||
message DataFragment {
|
||||
// The ID of a DataFragment is unique within a dataset.
|
||||
uint64 id = 1;
|
||||
|
||||
repeated DataFile files = 2;
|
||||
|
||||
// File that indicates which rows, if any, should be considered deleted.
|
||||
DeletionFile deletion_file = 3;
|
||||
|
||||
// TODO: What's the simplest way we can allow an inline tombstone bitmap?
|
||||
|
||||
// A serialized RowIdSequence message (see rowids.proto).
|
||||
//
|
||||
// These are the row ids for the fragment, in order of the rows as they appear.
|
||||
// That is, if a fragment has 3 rows, and the row ids are [1, 42, 3], then the
|
||||
// first row is row 1, the second row is row 42, and the third row is row 3.
|
||||
oneof row_id_sequence {
|
||||
// If small (< 200KB), the row ids are stored inline.
|
||||
bytes inline_row_ids = 5;
|
||||
// Otherwise, stored as part of a file.
|
||||
ExternalFile external_row_ids = 6;
|
||||
} // row_id_sequence
|
||||
|
||||
oneof last_updated_at_version_sequence {
|
||||
// If small (< 200KB), the row latest updated versions are stored inline.
|
||||
bytes inline_last_updated_at_versions = 7;
|
||||
// Otherwise, stored as part of a file.
|
||||
ExternalFile external_last_updated_at_versions = 8;
|
||||
} // last_updated_at_version_sequence
|
||||
|
||||
oneof created_at_version_sequence {
|
||||
// If small (< 200KB), the row created at versions are stored inline.
|
||||
bytes inline_created_at_versions = 9;
|
||||
// Otherwise, stored as part of a file.
|
||||
ExternalFile external_created_at_versions = 10;
|
||||
} // created_at_version_sequence
|
||||
|
||||
// Number of original rows in the fragment, this includes rows that are now marked with
|
||||
// deletion tombstones. To compute the current number of rows, subtract
|
||||
// `deletion_file.num_deleted_rows` from this value.
|
||||
uint64 physical_rows = 4;
|
||||
}
|
||||
|
||||
message DataFile {
|
||||
// Path to the root relative to the dataset's URI.
|
||||
string path = 1;
|
||||
// The ids of the fields/columns in this file.
|
||||
//
|
||||
// When a DataFile object is created in memory, every value in fields is assigned -1 by
|
||||
// default. An object with a value in fields of -1 must not be stored to disk. -2 is
|
||||
// used for "tombstoned", meaning a field that is no longer in use. This is often
|
||||
// because the original field id was reassigned to a different data file.
|
||||
//
|
||||
// In Lance v1 IDs are assigned based on position in the file, offset by the max
|
||||
// existing field id in the table (if any already). So when a fragment is first created
|
||||
// with one file of N columns, the field ids will be 1, 2, ..., N. If a second fragment
|
||||
// is created with M columns, the field ids will be N+1, N+2, ..., N+M.
|
||||
//
|
||||
// In Lance v1 there is one field for each field in the input schema, this includes
|
||||
// nested fields (both struct and list). Fixed size list fields have only a single
|
||||
// field id (these are not considered nested fields in Lance v1).
|
||||
//
|
||||
// This allows column indices to be calculated from field IDs and the input schema.
|
||||
//
|
||||
// In Lance v2 the field IDs generally follow the same pattern but there is no
|
||||
// way to calculate the column index from the field ID. This is because a given
|
||||
// field could be encoded in many different ways, some of which occupy a different
|
||||
// number of columns. For example, a struct field could be encoded into N + 1 columns
|
||||
// or it could be encoded into a single packed column. To determine column indices
|
||||
// the column_indices property should be used instead.
|
||||
//
|
||||
// In Lance v1 these ids must be sorted but might not always be contiguous.
|
||||
repeated int32 fields = 2;
|
||||
// The top-level column indices for each field in the file.
|
||||
//
|
||||
// If the data file is version 1 then this property will be empty
|
||||
//
|
||||
// Otherwise there must be one entry for each field in `fields`.
|
||||
//
|
||||
// Some fields may not correspond to a top-level column in the file. In these cases
|
||||
// the index will be -1.
|
||||
//
|
||||
// For example, consider the schema:
|
||||
//
|
||||
// - dimension: packed-struct (0):
|
||||
// - x: u32 (1)
|
||||
// - y: u32 (2)
|
||||
// - path: `list<u32>` (3)
|
||||
// - embedding: `fsl<768>` (4)
|
||||
// - fp64
|
||||
// - borders: `fsl<4>` (5)
|
||||
// - simple-struct (6)
|
||||
// - margin: fp64 (7)
|
||||
// - padding: fp64 (8)
|
||||
//
|
||||
// One possible column indices array could be:
|
||||
// [0, -1, -1, 1, 3, 4, 5, 6, 7]
|
||||
//
|
||||
// This reflects quite a few phenomena:
|
||||
// - The packed struct is encoded into a single column and there is no top-level column
|
||||
// for the x or y fields
|
||||
// - The variable sized list is encoded into two columns
|
||||
// - The embedding is encoded into a single column (common for FSL of primitive) and there
|
||||
// is no "FSL column"
|
||||
// - The borders field actually does have an "FSL column"
|
||||
//
|
||||
// The column indices table may not have duplicates (other than -1)
|
||||
repeated int32 column_indices = 3;
|
||||
// The major file version used to create the file
|
||||
uint32 file_major_version = 4;
|
||||
// The minor file version used to create the file
|
||||
//
|
||||
// If both `file_major_version` and `file_minor_version` are set to 0,
|
||||
// then this is a version 0.1 or version 0.2 file.
|
||||
uint32 file_minor_version = 5;
|
||||
|
||||
// The known size of the file on disk in bytes.
|
||||
//
|
||||
// This is used to quickly find the footer of the file.
|
||||
//
|
||||
// When this is zero, it should be interpreted as "unknown".
|
||||
uint64 file_size_bytes = 6;
|
||||
|
||||
// The base path index of the data file. Used when the file is imported or referred from another dataset.
|
||||
// Lance uses it as the key of the base_paths field in Manifest to determine the actual base path of the data file.
|
||||
optional uint32 base_id = 7;
|
||||
} // DataFile
|
||||
|
||||
// Deletion File
|
||||
//
|
||||
// The path of the deletion file is constructed as:
|
||||
// {root}/_deletions/{fragment_id}-{read_version}-{id}.{extension}
|
||||
// where {extension} depends on DeletionFileType.
|
||||
message DeletionFile {
|
||||
// Type of deletion file, intended as a way to increase efficiency of the storage of deleted row
|
||||
// offsets. If there are sparsely deleted rows, then ARROW_ARRAY is the most efficient. If there
|
||||
// are densely deleted rows, then BITMAP is the most efficient.
|
||||
enum DeletionFileType {
|
||||
// A single Int32Array of deleted row offsets, stored as an Arrow IPC file with one batch and
|
||||
// one column. Has a .arrow extension.
|
||||
ARROW_ARRAY = 0;
|
||||
// A Roaring Bitmap of deleted row offsets. Has a .bin extension.
|
||||
BITMAP = 1;
|
||||
}
|
||||
|
||||
// Type of deletion file.
|
||||
DeletionFileType file_type = 1;
|
||||
// The version of the dataset this deletion file was built from.
|
||||
uint64 read_version = 2;
|
||||
// An opaque id used to differentiate this file from others written by concurrent
|
||||
// writers.
|
||||
uint64 id = 3;
|
||||
// The number of rows that are marked as deleted.
|
||||
uint64 num_deleted_rows = 4;
|
||||
// The base path index of the deletion file. Used when the file is imported or referred from another
|
||||
// dataset. Lance uses it as key of the base_paths field in Manifest to determine the actual base
|
||||
// path of the deletion file.
|
||||
optional uint32 base_id = 7;
|
||||
} // DeletionFile
|
||||
|
||||
// A reference to a byte range inside a file: the file's path plus the offset
// and size of the data within it. Used by fields that can hold their payload
// either inline or externally (e.g. `external_row_ids` in DataFragment).
message ExternalFile {
|
||||
// Path to the file, relative to the root of the table.
|
||||
string path = 1;
|
||||
// The byte offset in the file where the data starts.
|
||||
uint64 offset = 2;
|
||||
// The size of the data in the file, in bytes.
|
||||
uint64 size = 3;
|
||||
}
|
||||
|
||||
// VectorIndexDetails and HnswParameters (formerly HnswIndexDetails) moved to index.proto
|
||||
|
||||
message FragmentReuseIndexDetails {
|
||||
|
||||
oneof content {
|
||||
// If < 200KB, store the content inline; otherwise store the InlineContent bytes in an external file.
|
||||
InlineContent inline = 1;
|
||||
ExternalFile external = 2;
|
||||
}
|
||||
|
||||
message InlineContent {
|
||||
repeated Version versions = 1;
|
||||
}
|
||||
|
||||
message FragmentDigest {
|
||||
uint64 id = 1;
|
||||
|
||||
uint64 physical_rows = 2;
|
||||
|
||||
uint64 num_deleted_rows = 3;
|
||||
}
|
||||
|
||||
// A summarized version of the RewriteGroup information in a Rewrite transaction
|
||||
message Group {
|
||||
// A roaring treemap of the changed row addresses.
|
||||
// When combined with the old fragment IDs and new fragment IDs,
|
||||
// it can recover the full mapping of old row addresses to either new row addresses or deleted.
|
||||
// This mapping can then be used to remap indexes or satisfy index queries for the new unindexed fragments.
|
||||
bytes changed_row_addrs = 1;
|
||||
|
||||
repeated FragmentDigest old_fragments = 2;
|
||||
|
||||
repeated FragmentDigest new_fragments = 3;
|
||||
}
|
||||
|
||||
message Version {
|
||||
// The dataset_version at the time the index adds this version entry
|
||||
uint64 dataset_version = 1;
|
||||
|
||||
repeated Group groups = 3;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// MemWAL Index Types
|
||||
// ============================================================================
|
||||
|
||||
// Shard manifest containing epoch-based fencing and WAL state.
|
||||
// Each shard has exactly one active writer at any time.
|
||||
message ShardManifest {
|
||||
// Shard identifier (UUID v4).
|
||||
UUID shard_id = 11;
|
||||
|
||||
// Manifest version number.
|
||||
// Matches the version encoded in the filename.
|
||||
uint64 version = 1;
|
||||
|
||||
// Shard spec ID this shard was created with.
|
||||
// Set at shard creation and immutable thereafter.
|
||||
// A value of 0 indicates a manually-created shard not governed by any spec.
|
||||
uint32 shard_spec_id = 10;
|
||||
|
||||
// Computed shard field values as raw Arrow scalar bytes, keyed by shard
|
||||
// field id. The byte encoding follows Arrow's little-endian convention:
|
||||
// int32 is 4 LE bytes, utf8 is raw UTF-8 bytes, etc. The receiver looks
|
||||
// up the result_type from the ShardingSpec to interpret each value.
|
||||
repeated ShardFieldEntry shard_field_entries = 14;
|
||||
|
||||
// Writer fencing token - monotonically increasing.
|
||||
// A writer must increment this when claiming the shard.
|
||||
uint64 writer_epoch = 2;
|
||||
|
||||
// The most recent WAL entry position that has been flushed to a MemTable.
|
||||
// During recovery, replay starts from replay_after_wal_entry_position + 1.
|
||||
// WAL positions are 1-based, so the default value 0 unambiguously means
|
||||
// "no flush has ever stamped this shard" and recovery replays from 1.
|
||||
uint64 replay_after_wal_entry_position = 3;
|
||||
|
||||
// The most recent WAL entry position observed at the time the manifest was
|
||||
// updated. WAL positions are 1-based; default 0 means no entry has been
|
||||
// written yet. This is a hint, not authoritative - recovery must list
|
||||
// files to find actual state.
|
||||
uint64 wal_entry_position_last_seen = 4;
|
||||
|
||||
// Next generation ID to create (incremented after each MemTable flush).
|
||||
uint64 current_generation = 6;
|
||||
|
||||
// Field 7 removed: merged_generation moved to MemWalIndexDetails.merged_generations
|
||||
// which is the authoritative source for merge progress.
|
||||
|
||||
// List of flushed MemTable generations and their directory paths.
|
||||
repeated FlushedGeneration flushed_generations = 8;
|
||||
}
|
||||
|
||||
// A shard field value stored as raw Arrow scalar bytes.
|
||||
message ShardFieldEntry {
|
||||
// Shard field id (matches ShardingField.field_id in the ShardingSpec).
|
||||
string field_id = 1;
|
||||
|
||||
// Raw Arrow scalar value bytes in little-endian encoding.
|
||||
// The data type is determined by the result_type of the matching ShardingField.
|
||||
bytes value = 2;
|
||||
}
|
||||
|
||||
// A flushed MemTable generation and its storage location.
|
||||
message FlushedGeneration {
|
||||
// Generation number.
|
||||
uint64 generation = 1;
|
||||
|
||||
// Directory name relative to the shard directory.
|
||||
string path = 2;
|
||||
}
|
||||
|
||||
// A shard's merged generation, used in MemWalIndexDetails.
|
||||
message MergedGeneration {
|
||||
// Shard identifier (UUID v4).
|
||||
UUID shard_id = 1;
|
||||
|
||||
// Last generation merged to base table for this shard.
|
||||
uint64 generation = 2;
|
||||
}
|
||||
|
||||
// Tracks which merged generation a base table index has been rebuilt to cover.
|
||||
// Used to determine whether to read from flushed MemTable indexes or base table.
|
||||
message IndexCatchupProgress {
|
||||
// Name of the base table index (must match an entry in maintained_indexes).
|
||||
string index_name = 1;
|
||||
|
||||
// Per-shard progress: the generation up to which this index covers.
|
||||
// If a shard is not present, the index is assumed to be fully caught up
|
||||
// (i.e., caught_up_generation >= merged_generation for that shard).
|
||||
repeated MergedGeneration caught_up_generations = 2;
|
||||
}
|
||||
|
||||
// Index details for MemWAL Index, stored in IndexMetadata.index_details.
|
||||
// This is the centralized structure for all MemWAL metadata:
|
||||
// - Configuration (sharding specs, indexes to maintain)
|
||||
// - Merge progress (merged generations per shard)
|
||||
// - Shard state snapshots
|
||||
//
|
||||
// Writers read this index to get configuration before writing.
|
||||
// Readers may use shard snapshots in this index as a point-in-time
|
||||
// optimization. Readers that need the latest shard set should list shard
|
||||
// directories in storage and read each shard's latest manifest.
|
||||
// A background process updates the index periodically to keep shard snapshots current.
|
||||
//
|
||||
// Shard snapshots are stored as a Lance file with one row per shard.
|
||||
// The schema records shard discovery fields. Full mutable shard state remains
|
||||
// authoritative in the shard manifest files.
|
||||
// shard_id: utf8
|
||||
// shard_spec_id: uint32
|
||||
// shard_field_{field_id}: typed per the matching ShardingField.result_type
|
||||
message MemWalIndexDetails {
|
||||
// Snapshot timestamp (Unix timestamp in milliseconds).
|
||||
int64 snapshot_ts_millis = 1;
|
||||
|
||||
// Number of shards in the snapshot.
|
||||
// Used to determine storage format without reading the snapshot data.
|
||||
uint32 num_shards = 2;
|
||||
|
||||
// Inline shard snapshots for small shard counts.
|
||||
// When num_shards <= threshold (implementation-defined, e.g., 100),
|
||||
// snapshots are stored inline as serialized bytes.
|
||||
// Format: Lance file bytes with the shard snapshot schema.
|
||||
optional bytes inline_snapshots = 3;
|
||||
|
||||
// Sharding specs defining how to derive shard identifiers.
|
||||
// This configuration determines how rows are partitioned into shards.
|
||||
repeated ShardingSpec sharding_specs = 7;
|
||||
|
||||
// Indexes from the base table to maintain in MemTables.
|
||||
// These are index names referencing indexes defined on the base table.
|
||||
// The primary key btree index is always maintained implicitly and
|
||||
// should not be listed here.
|
||||
//
|
||||
// For vector indexes, MemTables inherit quantization parameters (PQ codebook,
|
||||
// SQ params) from the base table index to ensure distance comparability.
|
||||
repeated string maintained_indexes = 8;
|
||||
|
||||
// Last generation merged to base table for each shard.
|
||||
// This is updated atomically with merge-insert data commits, enabling
|
||||
// conflict resolution when multiple mergers operate concurrently.
|
||||
//
|
||||
// Note: This is separate from shard snapshots because:
|
||||
// 1. merged_generations is updated by mergers (atomic with data commit)
|
||||
// 2. shard snapshots are updated by background index builder
|
||||
repeated MergedGeneration merged_generations = 9;
|
||||
|
||||
// Per-index catchup progress tracking.
|
||||
// When data is merged to the base table, base table indexes are rebuilt
|
||||
// asynchronously. This field tracks which generation each index covers.
|
||||
//
|
||||
// For indexed queries, if an index's caught_up_generation < merged_generation,
|
||||
// readers should use flushed MemTable indexes for the gap instead of
|
||||
// scanning unindexed data in the base table.
|
||||
//
|
||||
// If an index is not present in this list, it is assumed to be fully caught up.
|
||||
repeated IndexCatchupProgress index_catchup = 10;
|
||||
|
||||
// Default ShardWriter configuration values for this MemWAL index.
|
||||
//
|
||||
// A free-form string map persisted so that every writer — across
|
||||
// processes and restarts — starts from the same default writer
|
||||
// configuration. These are defaults only: an individual writer may
|
||||
// still override any value at runtime in its own ShardWriterConfig
|
||||
// (which is not persisted).
|
||||
map<string, string> writer_config_defaults = 11;
|
||||
}
|
||||
|
||||
// Sharding spec definition.
|
||||
message ShardingSpec {
|
||||
// Unique identifier for this spec within the index.
|
||||
// IDs are never reused.
|
||||
uint32 spec_id = 1;
|
||||
|
||||
// Sharding field definitions that determine how to compute shard identifiers.
|
||||
repeated ShardingField fields = 2;
|
||||
}
|
||||
|
||||
// Sharding field definition.
|
||||
message ShardingField {
|
||||
// Unique string identifier for this shard field.
|
||||
string field_id = 1;
|
||||
|
||||
// Field IDs referencing source columns in the schema.
|
||||
repeated int32 source_ids = 2;
|
||||
|
||||
// Well-known shard transform name (e.g., "identity", "year", "bucket").
|
||||
// Mutually exclusive with expression.
|
||||
optional string transform = 3;
|
||||
|
||||
// DataFusion SQL expression for custom logic.
|
||||
// Mutually exclusive with transform.
|
||||
optional string expression = 4;
|
||||
|
||||
// Output type of the shard value (Arrow type name).
|
||||
string result_type = 5;
|
||||
|
||||
// Transform parameters (e.g., num_buckets for bucket transform).
|
||||
map<string, string> parameters = 6;
|
||||
}
|
||||
19
vendor/lance-table/protos/table_identifier.proto
vendored
Normal file
19
vendor/lance-table/protos/table_identifier.proto
vendored
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package lance.datafusion;
|
||||
|
||||
// Identifies a Lance dataset for remote reconstruction.
|
||||
//
|
||||
// Two modes:
|
||||
// 1. uri + serialized_manifest (fast): remote executor skips manifest read.
|
||||
// 2. uri + version + etag (lightweight): remote executor loads manifest from storage.
|
||||
message TableIdentifier {
|
||||
string uri = 1;
|
||||
uint64 version = 2;
|
||||
optional string manifest_etag = 3;
|
||||
optional bytes serialized_manifest = 4;
|
||||
map<string, string> storage_options = 5;
|
||||
}
|
||||
354
vendor/lance-table/protos/transaction.proto
vendored
Normal file
354
vendor/lance-table/protos/transaction.proto
vendored
Normal file
|
|
@ -0,0 +1,354 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
import "file.proto";
|
||||
import "table.proto";
|
||||
import "google/protobuf/any.proto";
|
||||
|
||||
package lance.table;
|
||||
|
||||
// A transaction represents the changes to a dataset.
|
||||
//
|
||||
// This has two purposes:
|
||||
// 1. When retrying a commit, the transaction can be used to re-build an updated
|
||||
// manifest.
|
||||
// 2. When there's a conflict, this can be used to determine whether the other
|
||||
// transaction is compatible with this one.
|
||||
message Transaction {
|
||||
// The version of the dataset this transaction was built from.
|
||||
//
|
||||
// For example, for a delete transaction this means the version of the dataset
|
||||
// that was read from while evaluating the deletion predicate.
|
||||
uint64 read_version = 1;
|
||||
|
||||
// The UUID that uniquely identifies a transaction.
|
||||
string uuid = 2;
|
||||
|
||||
// Optional version tag.
|
||||
string tag = 3;
|
||||
|
||||
// Optional properties for the transaction
|
||||
// __lance_commit_message is a reserved key
|
||||
map<string, string> transaction_properties = 4;
|
||||
|
||||
// Add new rows to the dataset.
|
||||
message Append {
|
||||
// The new fragments to append.
|
||||
//
|
||||
// Fragment IDs are not yet assigned.
|
||||
repeated DataFragment fragments = 1;
|
||||
}
|
||||
|
||||
// Mark rows as deleted.
|
||||
message Delete {
|
||||
// The fragments to update
|
||||
//
|
||||
// The fragment IDs will match existing fragments in the dataset.
|
||||
repeated DataFragment updated_fragments = 1;
|
||||
// The fragments to delete entirely.
|
||||
repeated uint64 deleted_fragment_ids = 2;
|
||||
// The predicate that was evaluated
|
||||
//
|
||||
// This may be used to determine whether the delete would have affected
|
||||
// files written by a concurrent transaction.
|
||||
string predicate = 3;
|
||||
}
|
||||
|
||||
// Create or overwrite the entire dataset.
|
||||
message Overwrite {
|
||||
// The new fragments
|
||||
//
|
||||
// Fragment IDs are not yet assigned.
|
||||
repeated DataFragment fragments = 1;
|
||||
// The new schema
|
||||
repeated lance.file.Field schema = 2;
|
||||
// Schema metadata.
|
||||
map<string, bytes> schema_metadata = 3;
|
||||
// Key-value pairs to merge with existing config.
|
||||
map<string, string> config_upsert_values = 4;
|
||||
// The base paths to be added for the initial dataset creation
|
||||
repeated BasePath initial_bases = 5;
|
||||
}
|
||||
|
||||
// Add or replace a new secondary index.
|
||||
//
|
||||
// This is also used to remove an index (we are replacing it with nothing)
|
||||
//
|
||||
// - new_indices: the modified indices, empty if dropping indices only
|
||||
// - removed_indices: the indices that are being replaced
|
||||
message CreateIndex {
|
||||
repeated IndexMetadata new_indices = 1;
|
||||
repeated IndexMetadata removed_indices = 2;
|
||||
}
|
||||
|
||||
// An operation that rewrites but does not change the data in the table. These
|
||||
// kinds of operations just rearrange data.
|
||||
message Rewrite {
|
||||
// The old fragments that are being replaced
|
||||
//
|
||||
// DEPRECATED: use groups instead.
|
||||
//
|
||||
// These should all have existing fragment IDs.
|
||||
repeated DataFragment old_fragments = 1;
|
||||
// The new fragments
|
||||
//
|
||||
// DEPRECATED: use groups instead.
|
||||
//
|
||||
// These fragment IDs are not yet assigned.
|
||||
repeated DataFragment new_fragments = 2;
|
||||
|
||||
// During a rewrite an index may be rewritten. We only serialize the UUID
|
||||
// since a rewrite should not change the other index parameters.
|
||||
message RewrittenIndex {
|
||||
// The id of the index that will be replaced
|
||||
UUID old_id = 1;
|
||||
// the id of the new index
|
||||
UUID new_id = 2;
|
||||
// the new index details
|
||||
google.protobuf.Any new_index_details = 3;
|
||||
// the version of the new index
|
||||
uint32 new_index_version = 4;
|
||||
// Files in the new index with their sizes.
|
||||
// Empty if file sizes are not available (e.g. older writers).
|
||||
repeated IndexFile new_index_files = 5;
|
||||
}
|
||||
|
||||
// A group of rewrite files that are all part of the same rewrite.
|
||||
message RewriteGroup {
|
||||
// The old fragments that are being replaced
|
||||
//
|
||||
// These should all have existing fragment IDs.
|
||||
repeated DataFragment old_fragments = 1;
|
||||
// The new fragments
|
||||
//
|
||||
// The IDs should have been reserved by an earlier
|
||||
// reserve operation
|
||||
repeated DataFragment new_fragments = 2;
|
||||
}
|
||||
|
||||
// Groups of files that have been rewritten
|
||||
repeated RewriteGroup groups = 3;
|
||||
// Indices that have been rewritten
|
||||
repeated RewrittenIndex rewritten_indices = 4;
|
||||
}
|
||||
|
||||
// An operation that merges in a new column, altering the schema.
|
||||
message Merge {
|
||||
// The updated fragments
|
||||
//
|
||||
// These should all have existing fragment IDs.
|
||||
repeated DataFragment fragments = 1;
|
||||
// The new schema
|
||||
repeated lance.file.Field schema = 2;
|
||||
// Schema metadata.
|
||||
map<string, bytes> schema_metadata = 3;
|
||||
}
|
||||
|
||||
// An operation that projects a subset of columns, altering the schema.
|
||||
message Project {
|
||||
// The new schema
|
||||
repeated lance.file.Field schema = 1;
|
||||
}
|
||||
|
||||
// An operation that restores a dataset to a previous version.
|
||||
message Restore {
|
||||
// The version to restore to
|
||||
uint64 version = 1;
|
||||
}
|
||||
|
||||
// An operation that reserves fragment ids for future use in
|
||||
// a rewrite operation.
|
||||
message ReserveFragments {
|
||||
uint32 num_fragments = 1;
|
||||
}
|
||||
|
||||
// An operation that clones a dataset.
|
||||
message Clone {
|
||||
// - true: Performs a metadata-only clone (copies manifest without data files).
|
||||
// The cloned dataset references original data through `base_paths`,
|
||||
// suitable for experimental scenarios or rapid metadata migration.
|
||||
// - false: Performs a full deep clone using the underlying object storage's native
|
||||
// copy API (e.g., S3 CopyObject, GCS rewrite). This leverages server-side
|
||||
// bulk copy operations to bypass download/upload bottlenecks, achieving
|
||||
// near-linear speedup for large datasets (typically 3-10x faster than
|
||||
// manual file transfers). The operation maintains atomicity and data
|
||||
// integrity guarantees provided by the storage backend.
|
||||
bool is_shallow = 1;
|
||||
// the reference name in the source dataset
|
||||
// in most cases it should be the branch or tag name in the source dataset
|
||||
optional string ref_name = 2;
|
||||
// the version of the source dataset for cloning
|
||||
uint64 ref_version = 3;
|
||||
// the absolute base path of the source dataset for cloning
|
||||
string ref_path = 4;
|
||||
// if the target dataset is a branch, this is the branch name of the target dataset
|
||||
optional string branch_name = 5;
|
||||
}
|
||||
|
||||
// Exact set of key hashes for conflict detection.
|
||||
// Used when the number of inserted rows is small.
|
||||
message ExactKeySetFilter {
|
||||
// 64-bit hashes of the inserted row keys.
|
||||
repeated uint64 key_hashes = 1;
|
||||
}
|
||||
|
||||
// Bloom filter for key existence tests.
|
||||
// Used when the number of rows is large.
|
||||
message BloomFilter {
|
||||
// Bitset backing the bloom filter (SBBF format).
|
||||
bytes bitmap = 1;
|
||||
// Number of bits in the bitmap.
|
||||
uint32 num_bits = 2;
|
||||
// Number of items the filter was sized for.
|
||||
// Used for intersection validation (filters with different sizes cannot be compared).
|
||||
// Default: 8192
|
||||
uint64 number_of_items = 3;
|
||||
// False positive probability the filter was sized for.
|
||||
// Used for intersection validation (filters with different parameters cannot be compared).
|
||||
// Default: 0.00057
|
||||
double probability = 4;
|
||||
}
|
||||
|
||||
// A filter for checking key existence in the set of rows inserted by a merge insert operation.
|
||||
// Only created when the merge insert's ON columns match the schema's unenforced primary key.
|
||||
// The presence of this filter indicates strict primary key conflict detection should be used.
|
||||
// Can use either an exact set (for small row counts) or a Bloom filter (for large row counts).
|
||||
message KeyExistenceFilter {
|
||||
// Field IDs of columns participating in the key (must match unenforced primary key).
|
||||
repeated int32 field_ids = 1;
|
||||
// The underlying data structure storing the key hashes.
|
||||
oneof data {
|
||||
// Exact set of key hashes (used for small number of rows).
|
||||
ExactKeySetFilter exact = 2;
|
||||
// Bloom filter (used for large number of rows).
|
||||
BloomFilter bloom = 3;
|
||||
}
|
||||
}
|
||||
|
||||
// Serialized as sorted distinct local physical row offsets within the fragment (0-based).
|
||||
message UInt32List {
|
||||
repeated uint32 values = 1;
|
||||
}
|
||||
|
||||
// An operation that updates rows but does not add or remove rows.
|
||||
message Update {
|
||||
// The fragments that have been removed. These are fragments where all rows
|
||||
// have been updated and moved to a new fragment.
|
||||
repeated uint64 removed_fragment_ids = 1;
|
||||
// The fragments that have been updated.
|
||||
repeated DataFragment updated_fragments = 2;
|
||||
// The new fragments where updated rows have been moved to.
|
||||
repeated DataFragment new_fragments = 3;
|
||||
// The ids of the fields that have been modified.
|
||||
repeated uint32 fields_modified = 4;
|
||||
/// List of MemWAL shard generations to mark as merged after this transaction
|
||||
repeated MergedGeneration merged_generations = 5;
|
||||
/// The fields that are used to judge whether to preserve the new frag's id in
|
||||
/// the frag bitmap of the specified indices.
|
||||
repeated uint32 fields_for_preserving_frag_bitmap = 6;
|
||||
// The mode of update
|
||||
UpdateMode update_mode = 7;
|
||||
// Filter for checking existence of keys in newly inserted rows, used for conflict detection.
|
||||
// Only tracks keys from INSERT operations during merge insert, not updates.
|
||||
optional KeyExistenceFilter inserted_rows = 8;
|
||||
// Per-fragment physical row offsets that matched an update_columns hash join (RewriteColumns).
|
||||
map<uint64, UInt32List> updated_fragment_offsets = 9;
|
||||
}
|
||||
|
||||
// The mode of update operation
|
||||
enum UpdateMode {
|
||||
|
||||
/// rows are deleted in current fragments and rewritten in new fragments.
|
||||
/// This is most optimal when the majority of columns are being rewritten
|
||||
/// or only a few rows are being updated.
|
||||
REWRITE_ROWS = 0;
|
||||
|
||||
/// within each fragment, columns are fully rewritten and inserted as new data files.
|
||||
/// Old versions of columns are tombstoned. This is most optimal when most rows are affected
|
||||
/// but a small subset of columns are affected.
|
||||
REWRITE_COLUMNS = 1;
|
||||
}
|
||||
|
||||
// An entry for a map update. If value is not set, the key will be removed from the map.
|
||||
message UpdateMapEntry {
|
||||
// The key of the map entry to update.
|
||||
string key = 1;
|
||||
// The value to set for the key.
|
||||
optional string value = 2;
|
||||
}
|
||||
|
||||
// A batch of entry updates for a string-keyed map. Depending on `replace`,
// the entries are either merged into the existing map or used to replace it
// entirely.
message UpdateMap {
|
||||
// The entries to apply. Per UpdateMapEntry, an entry whose value is not set
// removes its key from the map.
repeated UpdateMapEntry update_entries = 1;
|
||||
// If true, the map will be replaced entirely with the new entries.
|
||||
// If false, the new entries will be merged with the existing map.
|
||||
bool replace = 2;
|
||||
}
|
||||
|
||||
// An operation that updates the table config, table metadata, schema metadata,
// or field metadata.
message UpdateConfig {
  // Current fields: one UpdateMap per kind of metadata named above.
  UpdateMap config_updates = 6;
  UpdateMap table_metadata_updates = 7;
  UpdateMap schema_metadata_updates = 8;
  // NOTE(review): the int32 key is presumably a field id -- confirm against the writer.
  map<int32, UpdateMap> field_metadata_updates = 9;

  // Deprecated -------------------------------
  // Everything below this marker is the older representation (field numbers 1-5).
  map<string, string> upsert_values = 1;
  repeated string delete_keys = 2;
  map<string, string> schema_metadata = 3;
  map<uint32, FieldMetadataUpdate> field_metadata = 4;

  // Value type of the deprecated `field_metadata` map.
  message FieldMetadataUpdate {
    map<string, string> metadata = 5;
  }
}
|
||||
|
||||
// One entry of a DataReplacement operation: a fragment id paired with a data file.
// NOTE(review): presumably `new_file` replaces a data file of the fragment
// identified by `fragment_id` -- confirm against the writer.
message DataReplacementGroup {
  uint64 fragment_id = 1;
  DataFile new_file = 2;
}
|
||||
|
||||
// An operation that replaces the data in a region of the table with new data.
message DataReplacement {
  // The per-fragment replacements that make up this operation.
  repeated DataReplacementGroup replacements = 1;
}
|
||||
|
||||
// Update the merged generations in MemWAL index.
// This operation is used during merge-insert to atomically record which
// generations have been merged to the base table.
message UpdateMemWalState {
  // Shards and generations being marked as merged.
  repeated MergedGeneration merged_generations = 1;
}
|
||||
|
||||
// An operation that updates base paths in the dataset.
message UpdateBases {
  // The new base paths to add to the manifest.
  repeated BasePath new_bases = 1;
}
|
||||
|
||||
// The operation of this transaction.
// Exactly one of the operation messages is carried; field numbers start at 100.
oneof operation {
  Append append = 100;
  Delete delete = 101;
  Overwrite overwrite = 102;
  CreateIndex create_index = 103;
  Rewrite rewrite = 104;
  Merge merge = 105;
  Restore restore = 106;
  ReserveFragments reserve_fragments = 107;
  Update update = 108;
  Project project = 109;
  UpdateConfig update_config = 110;
  DataReplacement data_replacement = 111;
  UpdateMemWalState update_mem_wal_state = 112;
  Clone clone = 113;
  UpdateBases update_bases = 114;
}
|
||||
|
||||
// Fields 200/202 (`blob_append` / `blob_overwrite`) previously represented blob dataset ops.
|
||||
reserved 200, 202;
|
||||
reserved "blob_append", "blob_overwrite";
|
||||
}
|
||||
184
vendor/lance-table/src/feature_flags.rs
vendored
Normal file
184
vendor/lance-table/src/feature_flags.rs
vendored
Normal file
|
|
@ -0,0 +1,184 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
//! Feature flags
|
||||
|
||||
use crate::format::Manifest;
|
||||
use lance_core::{Error, Result};
|
||||
|
||||
// Each flag below occupies its own bit, so flags can be combined with `|`.

/// Fragments may contain deletion files, which record the tombstones of
/// soft-deleted rows.
pub const FLAG_DELETION_FILES: u64 = 1;
/// Row ids are stable for both moves and updates. Fragments contain an index
/// mapping row ids to row addresses.
pub const FLAG_STABLE_ROW_IDS: u64 = 2;
/// Files are written with the new v2 format (this flag is no longer used)
pub const FLAG_USE_V2_FORMAT_DEPRECATED: u64 = 4;
/// Table config is present
pub const FLAG_TABLE_CONFIG: u64 = 8;
/// Dataset uses multiple base paths (for shallow clones or multi-base datasets)
pub const FLAG_BASE_PATHS: u64 = 16;
/// Disable writing transaction file under _transaction/, this flag is set when we only want to write inline transaction in manifest
pub const FLAG_DISABLE_TRANSACTION_FILE: u64 = 32;
/// The first bit that is unknown as a feature flag.
///
/// Any flag value greater than or equal to this is treated as unsupported by
/// `can_read_dataset` and `can_write_dataset`.
pub const FLAG_UNKNOWN: u64 = 64;
|
||||
|
||||
/// Set the reader and writer feature flags in the manifest based on the contents of the manifest.
|
||||
pub fn apply_feature_flags(
|
||||
manifest: &mut Manifest,
|
||||
enable_stable_row_id: bool,
|
||||
disable_transaction_file: bool,
|
||||
) -> Result<()> {
|
||||
// Reset flags
|
||||
manifest.reader_feature_flags = 0;
|
||||
manifest.writer_feature_flags = 0;
|
||||
|
||||
let has_deletion_files = manifest
|
||||
.fragments
|
||||
.iter()
|
||||
.any(|frag| frag.deletion_file.is_some());
|
||||
if has_deletion_files {
|
||||
// Both readers and writers need to be able to read deletion files
|
||||
manifest.reader_feature_flags |= FLAG_DELETION_FILES;
|
||||
manifest.writer_feature_flags |= FLAG_DELETION_FILES;
|
||||
}
|
||||
|
||||
// If any fragment has row ids, they must all have row ids.
|
||||
let has_row_ids = manifest
|
||||
.fragments
|
||||
.iter()
|
||||
.any(|frag| frag.row_id_meta.is_some());
|
||||
if has_row_ids || enable_stable_row_id {
|
||||
if !manifest
|
||||
.fragments
|
||||
.iter()
|
||||
.all(|frag| frag.row_id_meta.is_some())
|
||||
{
|
||||
return Err(Error::invalid_input("All fragments must have row ids"));
|
||||
}
|
||||
manifest.reader_feature_flags |= FLAG_STABLE_ROW_IDS;
|
||||
manifest.writer_feature_flags |= FLAG_STABLE_ROW_IDS;
|
||||
}
|
||||
|
||||
// Test whether any table metadata has been set
|
||||
if !manifest.config.is_empty() {
|
||||
manifest.writer_feature_flags |= FLAG_TABLE_CONFIG;
|
||||
}
|
||||
|
||||
// Check if this dataset uses multiple base paths (for shallow clones or multi-base datasets)
|
||||
if !manifest.base_paths.is_empty() {
|
||||
manifest.reader_feature_flags |= FLAG_BASE_PATHS;
|
||||
manifest.writer_feature_flags |= FLAG_BASE_PATHS;
|
||||
}
|
||||
|
||||
if disable_transaction_file {
|
||||
manifest.writer_feature_flags |= FLAG_DISABLE_TRANSACTION_FILE;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn can_read_dataset(reader_flags: u64) -> bool {
|
||||
reader_flags < FLAG_UNKNOWN
|
||||
}
|
||||
|
||||
pub fn can_write_dataset(writer_flags: u64) -> bool {
|
||||
writer_flags < FLAG_UNKNOWN
|
||||
}
|
||||
|
||||
pub fn has_deprecated_v2_feature_flag(writer_flags: u64) -> bool {
|
||||
writer_flags & FLAG_USE_V2_FORMAT_DEPRECATED != 0
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::format::BasePath;

    /// Every known flag (alone or combined) is readable; `FLAG_UNKNOWN` is not.
    #[test]
    fn test_read_check() {
        assert!(can_read_dataset(0));
        assert!(can_read_dataset(super::FLAG_DELETION_FILES));
        assert!(can_read_dataset(super::FLAG_STABLE_ROW_IDS));
        assert!(can_read_dataset(super::FLAG_USE_V2_FORMAT_DEPRECATED));
        assert!(can_read_dataset(super::FLAG_TABLE_CONFIG));
        assert!(can_read_dataset(super::FLAG_BASE_PATHS));
        assert!(can_read_dataset(super::FLAG_DISABLE_TRANSACTION_FILE));
        assert!(can_read_dataset(
            super::FLAG_DELETION_FILES
                | super::FLAG_STABLE_ROW_IDS
                | super::FLAG_USE_V2_FORMAT_DEPRECATED
        ));
        assert!(!can_read_dataset(super::FLAG_UNKNOWN));
    }

    /// Every known flag (alone or combined) is writable; `FLAG_UNKNOWN` is not.
    #[test]
    fn test_write_check() {
        assert!(can_write_dataset(0));
        assert!(can_write_dataset(super::FLAG_DELETION_FILES));
        assert!(can_write_dataset(super::FLAG_STABLE_ROW_IDS));
        assert!(can_write_dataset(super::FLAG_USE_V2_FORMAT_DEPRECATED));
        assert!(can_write_dataset(super::FLAG_TABLE_CONFIG));
        assert!(can_write_dataset(super::FLAG_BASE_PATHS));
        assert!(can_write_dataset(super::FLAG_DISABLE_TRANSACTION_FILE));
        assert!(can_write_dataset(
            super::FLAG_DELETION_FILES
                | super::FLAG_STABLE_ROW_IDS
                | super::FLAG_USE_V2_FORMAT_DEPRECATED
                | super::FLAG_TABLE_CONFIG
                | super::FLAG_BASE_PATHS
        ));
        assert!(!can_write_dataset(super::FLAG_UNKNOWN));
    }

    /// `apply_feature_flags` sets `FLAG_BASE_PATHS` on both flag fields exactly
    /// when the manifest carries base paths.
    #[test]
    fn test_base_paths_feature_flags() {
        use crate::format::{DataStorageFormat, Manifest};
        use arrow_schema::{Field as ArrowField, Schema as ArrowSchema};
        use lance_core::datatypes::Schema;
        use std::collections::HashMap;
        use std::sync::Arc;
        // Create a basic schema for testing
        let arrow_schema = ArrowSchema::new(vec![ArrowField::new(
            "test_field",
            arrow_schema::DataType::Int64,
            false,
        )]);
        let schema = Schema::try_from(&arrow_schema).unwrap();
        // Test 1: Normal dataset (no base_paths) should not have FLAG_BASE_PATHS
        let mut normal_manifest = Manifest::new(
            schema.clone(),
            Arc::new(vec![]),
            DataStorageFormat::default(),
            HashMap::new(), // Empty base_paths
        );
        apply_feature_flags(&mut normal_manifest, false, false).unwrap();
        assert_eq!(normal_manifest.reader_feature_flags & FLAG_BASE_PATHS, 0);
        assert_eq!(normal_manifest.writer_feature_flags & FLAG_BASE_PATHS, 0);
        // Test 2: Dataset with base_paths (shallow clone or multi-base) should have FLAG_BASE_PATHS
        let mut base_paths: HashMap<u32, BasePath> = HashMap::new();
        base_paths.insert(
            1,
            BasePath::new(
                1,
                "file:///path/to/original".to_string(),
                Some("test_ref".to_string()),
                true,
            ),
        );
        let mut multi_base_manifest = Manifest::new(
            schema,
            Arc::new(vec![]),
            DataStorageFormat::default(),
            base_paths,
        );
        apply_feature_flags(&mut multi_base_manifest, false, false).unwrap();
        assert_ne!(
            multi_base_manifest.reader_feature_flags & FLAG_BASE_PATHS,
            0
        );
        assert_ne!(
            multi_base_manifest.writer_feature_flags & FLAG_BASE_PATHS,
            0
        );
    }
}
|
||||
70
vendor/lance-table/src/format.rs
vendored
Normal file
70
vendor/lance-table/src/format.rs
vendored
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
use arrow_buffer::ToByteSlice;
|
||||
use uuid::Uuid;
|
||||
|
||||
mod fragment;
|
||||
mod index;
|
||||
mod manifest;
|
||||
mod transaction;
|
||||
|
||||
pub use crate::rowids::version::{
|
||||
RowDatasetVersionMeta, RowDatasetVersionRun, RowDatasetVersionSequence,
|
||||
};
|
||||
pub use fragment::*;
|
||||
pub use index::{IndexFile, IndexMetadata, index_metadata_codec, list_index_files_with_sizes};
|
||||
|
||||
pub use manifest::{
|
||||
BasePath, DETACHED_VERSION_MASK, DataStorageFormat, Manifest, SelfDescribingFileReader,
|
||||
WriterVersion, is_detached_version,
|
||||
};
|
||||
pub use transaction::Transaction;
|
||||
|
||||
use lance_core::{Error, Result};
|
||||
|
||||
/// Backward-compatible alias for [`IndexMetadata`].
///
/// In 0.36.1 we renamed Index to IndexMetadata because Index conflicted too much with the
/// Index trait. This is left in for backward compatibility.
#[deprecated(since = "0.36.1", note = "Use IndexMetadata instead")]
pub type Index = IndexMetadata;
|
||||
|
||||
/// Protobuf definitions for Lance Format
///
/// The module body is not written by hand: it is included from the
/// `lance.table.rs` file in the build output directory (`OUT_DIR`).
pub mod pb {
    // The included code is generated, so lints are silenced for the whole module.
    #![allow(clippy::all)]
    #![allow(non_upper_case_globals)]
    #![allow(non_camel_case_types)]
    #![allow(non_snake_case)]
    #![allow(unused)]
    #![allow(improper_ctypes)]
    #![allow(clippy::upper_case_acronyms)]
    #![allow(clippy::use_self)]
    include!(concat!(env!("OUT_DIR"), "/lance.table.rs"));
}
|
||||
|
||||
/// These version/magic values are written at the end of manifest files (e.g. versions/1.version)
///
/// Major component of that version.
pub const MAJOR_VERSION: i16 = 0;
/// Minor component of the version written at the end of manifest files.
pub const MINOR_VERSION: i16 = 1;
/// Four-byte magic value (`LANC`) written at the end of manifest files.
pub const MAGIC: &[u8; 4] = b"LANC";
|
||||
|
||||
impl TryFrom<&pb::Uuid> for Uuid {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(p: &pb::Uuid) -> Result<Self> {
|
||||
if p.uuid.len() != 16 {
|
||||
return Err(Error::invalid_input(
|
||||
"Protobuf UUID is malformed".to_string(),
|
||||
));
|
||||
}
|
||||
let mut buf: [u8; 16] = [0; 16];
|
||||
buf.copy_from_slice(p.uuid.to_byte_slice());
|
||||
Ok(Self::from_bytes(buf))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&Uuid> for pb::Uuid {
    /// Copies the 16 bytes of `value` into a new protobuf UUID message.
    fn from(value: &Uuid) -> Self {
        let uuid = value.as_bytes().to_vec();
        Self { uuid }
    }
}
|
||||
841
vendor/lance-table/src/format/fragment.rs
vendored
Normal file
841
vendor/lance-table/src/format/fragment.rs
vendored
Normal file
|
|
@ -0,0 +1,841 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZero;
|
||||
use std::sync::Arc;
|
||||
|
||||
use deepsize::DeepSizeOf;
|
||||
use lance_core::Error;
|
||||
use lance_file::format::{MAJOR_VERSION, MINOR_VERSION};
|
||||
use lance_file::version::LanceFileVersion;
|
||||
use lance_io::utils::CachedFileSize;
|
||||
use object_store::path::Path;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
|
||||
use crate::format::pb;
|
||||
|
||||
use crate::rowids::version::{
|
||||
RowDatasetVersionMeta, created_at_version_meta_to_pb, last_updated_at_version_meta_to_pb,
|
||||
};
|
||||
use lance_core::datatypes::Schema;
|
||||
use lance_core::error::Result;
|
||||
|
||||
/// Lance Data File
///
/// A data file is one piece of file storing data.
#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
pub struct DataFile {
    /// Relative path of the data file to dataset root.
    pub path: String,
    /// The ids of fields in this file.
    ///
    /// When identical across many fragments (common case), multiple `DataFile`
    /// instances share a single heap allocation via `Arc`, significantly
    /// reducing manifest memory for large tables.
    pub fields: Arc<[i32]>,
    /// The offsets of the fields listed in `fields`, empty in v1 files
    ///
    /// Note that -1 is a possibility and it indicates that the field has
    /// no top-level column in the file.
    ///
    /// Columns that lack a field id may still exist as extra entries in
    /// `column_indices`; such columns are ignored by field-id–based projection.
    /// For example, some fields, such as blob fields, occupy multiple
    /// columns in the file but only have a single field id.
    pub column_indices: Arc<[i32]>,
    /// The major version of the file format used to write this file.
    pub file_major_version: u32,
    /// The minor version of the file format used to write this file.
    pub file_minor_version: u32,

    /// The size of the file in bytes, if known.
    pub file_size_bytes: CachedFileSize,

    /// The base path of the datafile, when the datafile is outside the dataset.
    pub base_id: Option<u32>,
}
|
||||
|
||||
// Custom Serialize: convert Arc<[i32]> to slice for transparent JSON output
|
||||
impl Serialize for DataFile {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> {
|
||||
use serde::ser::SerializeStruct;
|
||||
let mut s = serializer.serialize_struct("DataFile", 7)?;
|
||||
s.serialize_field("path", &self.path)?;
|
||||
s.serialize_field("fields", self.fields.as_ref())?;
|
||||
s.serialize_field("column_indices", self.column_indices.as_ref())?;
|
||||
s.serialize_field("file_major_version", &self.file_major_version)?;
|
||||
s.serialize_field("file_minor_version", &self.file_minor_version)?;
|
||||
s.serialize_field("file_size_bytes", &self.file_size_bytes)?;
|
||||
s.serialize_field("base_id", &self.base_id)?;
|
||||
s.end()
|
||||
}
|
||||
}
|
||||
|
||||
// Custom Deserialize: read Vec<i32> and convert to Arc<[i32]>
|
||||
impl<'de> Deserialize<'de> for DataFile {
|
||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<Self, D::Error> {
|
||||
#[derive(Deserialize)]
|
||||
struct DataFileHelper {
|
||||
path: String,
|
||||
fields: Vec<i32>,
|
||||
#[serde(default)]
|
||||
column_indices: Vec<i32>,
|
||||
#[serde(default)]
|
||||
file_major_version: u32,
|
||||
#[serde(default)]
|
||||
file_minor_version: u32,
|
||||
file_size_bytes: CachedFileSize,
|
||||
base_id: Option<u32>,
|
||||
}
|
||||
|
||||
let helper = DataFileHelper::deserialize(deserializer)?;
|
||||
Ok(Self {
|
||||
path: helper.path,
|
||||
fields: Arc::from(helper.fields),
|
||||
column_indices: Arc::from(helper.column_indices),
|
||||
file_major_version: helper.file_major_version,
|
||||
file_minor_version: helper.file_minor_version,
|
||||
file_size_bytes: helper.file_size_bytes,
|
||||
base_id: helper.base_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl DataFile {
|
||||
pub fn new(
|
||||
path: impl Into<String>,
|
||||
fields: Vec<i32>,
|
||||
column_indices: Vec<i32>,
|
||||
file_major_version: u32,
|
||||
file_minor_version: u32,
|
||||
file_size_bytes: Option<NonZero<u64>>,
|
||||
base_id: Option<u32>,
|
||||
) -> Self {
|
||||
Self {
|
||||
path: path.into(),
|
||||
fields: Arc::from(fields),
|
||||
column_indices: Arc::from(column_indices),
|
||||
file_major_version,
|
||||
file_minor_version,
|
||||
file_size_bytes: file_size_bytes.into(),
|
||||
base_id,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `DataFile` with the expectation that fields and column_indices will be set later
|
||||
pub fn new_unstarted(
|
||||
path: impl Into<String>,
|
||||
file_major_version: u32,
|
||||
file_minor_version: u32,
|
||||
) -> Self {
|
||||
Self {
|
||||
path: path.into(),
|
||||
fields: Arc::from([]),
|
||||
column_indices: Arc::from([]),
|
||||
file_major_version,
|
||||
file_minor_version,
|
||||
file_size_bytes: Default::default(),
|
||||
base_id: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_legacy_from_fields(
|
||||
path: impl Into<String>,
|
||||
fields: Vec<i32>,
|
||||
base_id: Option<u32>,
|
||||
) -> Self {
|
||||
Self::new(
|
||||
path,
|
||||
fields,
|
||||
vec![],
|
||||
MAJOR_VERSION as u32,
|
||||
MINOR_VERSION as u32,
|
||||
None,
|
||||
base_id,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn new_legacy(
|
||||
path: impl Into<String>,
|
||||
schema: &Schema,
|
||||
file_size_bytes: Option<NonZero<u64>>,
|
||||
base_id: Option<u32>,
|
||||
) -> Self {
|
||||
let mut field_ids = schema.field_ids();
|
||||
field_ids.sort();
|
||||
Self::new(
|
||||
path,
|
||||
field_ids,
|
||||
vec![],
|
||||
MAJOR_VERSION as u32,
|
||||
MINOR_VERSION as u32,
|
||||
file_size_bytes,
|
||||
base_id,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn schema(&self, full_schema: &Schema) -> Schema {
|
||||
full_schema.project_by_ids(&self.fields, false)
|
||||
}
|
||||
|
||||
pub fn is_legacy_file(&self) -> bool {
|
||||
self.file_major_version == 0 && self.file_minor_version < 3
|
||||
}
|
||||
|
||||
pub fn validate(&self, base_path: &Path) -> Result<()> {
|
||||
if self.is_legacy_file() {
|
||||
if !self.fields.windows(2).all(|w| w[0] < w[1]) {
|
||||
return Err(Error::corrupt_file(
|
||||
base_path.clone().join(self.path.clone()),
|
||||
"contained unsorted or duplicate field ids",
|
||||
));
|
||||
}
|
||||
} else if self.column_indices.len() < self.fields.len() {
|
||||
// Every recorded field id must have a column index, but not every column needs
|
||||
// to be associated with a field id (extra columns are allowed).
|
||||
return Err(Error::corrupt_file(
|
||||
base_path.clone().join(self.path.clone()),
|
||||
"contained fewer column_indices than fields",
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&DataFile> for pb::DataFile {
    /// Copies a [`DataFile`] into its protobuf form.
    ///
    /// An unknown file size is encoded as `0`.
    fn from(df: &DataFile) -> Self {
        let file_size_bytes = df
            .file_size_bytes
            .get()
            .map(|size| size.get())
            .unwrap_or(0);

        Self {
            path: df.path.clone(),
            fields: df.fields.to_vec(),
            column_indices: df.column_indices.to_vec(),
            file_major_version: df.file_major_version,
            file_minor_version: df.file_minor_version,
            file_size_bytes,
            base_id: df.base_id,
        }
    }
}
|
||||
|
||||
impl TryFrom<pb::DataFile> for DataFile {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(proto: pb::DataFile) -> Result<Self> {
|
||||
Ok(Self {
|
||||
path: proto.path,
|
||||
fields: Arc::from(proto.fields),
|
||||
column_indices: Arc::from(proto.column_indices),
|
||||
file_major_version: proto.file_major_version,
|
||||
file_minor_version: proto.file_minor_version,
|
||||
file_size_bytes: CachedFileSize::new(proto.file_size_bytes),
|
||||
base_id: proto.base_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Interns repeated data so that fragments with identical content share a
/// single heap allocation via `Arc`.
///
/// At 20M fragments the deduplication typically saves multiple GB of heap
/// because every fragment in a homogeneous table carries the same field list,
/// and post-compaction fragments share identical version metadata bytes.
///
/// Uses a `Vec`-based linear scan when the cache is small (<=16 entries)
/// and upgrades to `HashMap` for larger caches. In the common homogeneous
/// case (1-3 unique values), linear scan avoids per-fragment hashing overhead.
#[derive(Default)]
pub struct DataFileFieldInterner {
    /// Cache for `DataFile::fields` lists.
    fields: InternCache<i32>,
    /// Cache for `DataFile::column_indices` lists.
    column_indices: InternCache<i32>,
    /// Cache for inline version-metadata byte payloads (shared by the
    /// created-at and last-updated-at sequences).
    inline_bytes: InternCache<u8>,
}
|
||||
|
||||
/// A cache that uses linear scan for small sizes and HashMap for large.
/// The threshold is chosen so that scan + compare is cheaper than hash for
/// typical payload sizes (20-200 bytes).
enum InternCache<T: Eq + std::hash::Hash + Clone> {
    /// Few distinct values: looked up by scanning the vector.
    Small(Vec<Arc<[T]>>),
    /// Many distinct values: looked up by hashing; the map value is unused.
    Large(HashMap<Arc<[T]>, ()>),
}

/// Largest number of entries kept in the `Small` representation; one more
/// entry promotes the cache to `Large`.
const INTERN_CACHE_UPGRADE_THRESHOLD: usize = 16;

impl<T: Eq + std::hash::Hash + Clone> Default for InternCache<T> {
    /// Starts out empty, in the linear-scan representation.
    fn default() -> Self {
        Self::Small(Vec::new())
    }
}

impl<T: Eq + std::hash::Hash + Clone> InternCache<T> {
    /// Returns a shared `Arc<[T]>` equal to `v`.
    ///
    /// If an equal slice was interned before, the existing allocation is
    /// returned; otherwise `v` is converted into a new `Arc`, remembered and
    /// returned.
    fn intern(&mut self, v: Vec<T>) -> Arc<[T]> {
        match self {
            Self::Large(map) => match map.get_key_value(v.as_slice()) {
                Some((shared, _)) => Arc::clone(shared),
                None => {
                    let fresh: Arc<[T]> = Arc::from(v);
                    map.insert(Arc::clone(&fresh), ());
                    fresh
                }
            },
            Self::Small(entries) => {
                if let Some(shared) = entries.iter().find(|known| known[..] == v[..]) {
                    return Arc::clone(shared);
                }

                let fresh: Arc<[T]> = Arc::from(v);
                entries.push(Arc::clone(&fresh));

                // Past the threshold, hashing beats scanning: move everything into a map.
                if entries.len() > INTERN_CACHE_UPGRADE_THRESHOLD {
                    let promoted: HashMap<Arc<[T]>, ()> =
                        entries.drain(..).map(|entry| (entry, ())).collect();
                    *self = Self::Large(promoted);
                }
                fresh
            }
        }
    }
}
|
||||
|
||||
impl DataFileFieldInterner {
|
||||
/// Intern a `RowDatasetVersionMeta`, deduplicating inline byte payloads.
|
||||
/// Accepts the protobuf oneof value directly to avoid an intermediate
|
||||
/// `Arc<[u8]>` allocation that would need to be `.to_vec()`'d for the key lookup.
|
||||
fn intern_last_updated_version_meta(
|
||||
cache: &mut InternCache<u8>,
|
||||
pb: pb::data_fragment::LastUpdatedAtVersionSequence,
|
||||
) -> Result<RowDatasetVersionMeta> {
|
||||
match pb {
|
||||
pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(data) => {
|
||||
Ok(RowDatasetVersionMeta::Inline(cache.intern(data)))
|
||||
}
|
||||
pb::data_fragment::LastUpdatedAtVersionSequence::ExternalLastUpdatedAtVersions(
|
||||
file,
|
||||
) => Ok(RowDatasetVersionMeta::External(ExternalFile {
|
||||
path: file.path,
|
||||
offset: file.offset,
|
||||
size: file.size,
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
/// Intern a `RowDatasetVersionMeta`, deduplicating inline byte payloads.
|
||||
fn intern_created_version_meta(
|
||||
cache: &mut InternCache<u8>,
|
||||
pb: pb::data_fragment::CreatedAtVersionSequence,
|
||||
) -> Result<RowDatasetVersionMeta> {
|
||||
match pb {
|
||||
pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(data) => {
|
||||
Ok(RowDatasetVersionMeta::Inline(cache.intern(data)))
|
||||
}
|
||||
pb::data_fragment::CreatedAtVersionSequence::ExternalCreatedAtVersions(file) => {
|
||||
Ok(RowDatasetVersionMeta::External(ExternalFile {
|
||||
path: file.path,
|
||||
offset: file.offset,
|
||||
size: file.size,
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a protobuf `DataFile`, interning `fields` and `column_indices`.
|
||||
pub fn intern_data_file(&mut self, proto: pb::DataFile) -> Result<DataFile> {
|
||||
Ok(DataFile {
|
||||
path: proto.path,
|
||||
fields: self.fields.intern(proto.fields),
|
||||
column_indices: self.column_indices.intern(proto.column_indices),
|
||||
file_major_version: proto.file_major_version,
|
||||
file_minor_version: proto.file_minor_version,
|
||||
file_size_bytes: CachedFileSize::new(proto.file_size_bytes),
|
||||
base_id: proto.base_id,
|
||||
})
|
||||
}
|
||||
|
||||
/// Convert a protobuf `DataFragment`, interning fields and version metadata.
|
||||
pub fn intern_fragment(&mut self, p: pb::DataFragment) -> Result<Fragment> {
|
||||
let physical_rows = if p.physical_rows > 0 {
|
||||
Some(p.physical_rows as usize)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let last_updated_at_version_meta = p
|
||||
.last_updated_at_version_sequence
|
||||
.map(|pb| Self::intern_last_updated_version_meta(&mut self.inline_bytes, pb))
|
||||
.transpose()?;
|
||||
let created_at_version_meta = p
|
||||
.created_at_version_sequence
|
||||
.map(|pb| Self::intern_created_version_meta(&mut self.inline_bytes, pb))
|
||||
.transpose()?;
|
||||
Ok(Fragment {
|
||||
id: p.id,
|
||||
files: p
|
||||
.files
|
||||
.into_iter()
|
||||
.map(|f| self.intern_data_file(f))
|
||||
.collect::<Result<_>>()?,
|
||||
deletion_file: p.deletion_file.map(DeletionFile::try_from).transpose()?,
|
||||
row_id_meta: p.row_id_sequence.map(RowIdMeta::try_from).transpose()?,
|
||||
physical_rows,
|
||||
last_updated_at_version_meta,
|
||||
created_at_version_meta,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum DeletionFileType {
|
||||
Array,
|
||||
Bitmap,
|
||||
}
|
||||
|
||||
impl DeletionFileType {
|
||||
// TODO: pub(crate)
|
||||
pub fn suffix(&self) -> &str {
|
||||
match self {
|
||||
Self::Array => "arrow",
|
||||
Self::Bitmap => "bin",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata describing the deletion file attached to a fragment.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
pub struct DeletionFile {
    // NOTE(review): presumably the dataset version the deletions were computed
    // against -- confirm with the writer.
    pub read_version: u64,
    /// Identifier of this deletion file.
    pub id: u64,
    /// Whether the file is of the array or the bitmap kind.
    pub file_type: DeletionFileType,
    /// Number of deleted rows in this file. If None, this is unknown.
    pub num_deleted_rows: Option<usize>,
    // NOTE(review): presumably identifies the base path holding the file when
    // it lives outside the dataset, like `DataFile::base_id` -- confirm.
    pub base_id: Option<u32>,
}
|
||||
|
||||
impl TryFrom<pb::DeletionFile> for DeletionFile {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(value: pb::DeletionFile) -> Result<Self> {
|
||||
let file_type = match value.file_type {
|
||||
0 => DeletionFileType::Array,
|
||||
1 => DeletionFileType::Bitmap,
|
||||
_ => {
|
||||
return Err(Error::not_supported_source(
|
||||
"Unknown deletion file type".into(),
|
||||
));
|
||||
}
|
||||
};
|
||||
let num_deleted_rows = if value.num_deleted_rows == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(value.num_deleted_rows as usize)
|
||||
};
|
||||
Ok(Self {
|
||||
read_version: value.read_version,
|
||||
id: value.id,
|
||||
file_type,
|
||||
num_deleted_rows,
|
||||
base_id: value.base_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// A reference to a part of a file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
pub struct ExternalFile {
    /// Path of the referenced file.
    pub path: String,
    // NOTE(review): presumably the byte offset where the referenced part starts -- confirm.
    pub offset: u64,
    // NOTE(review): presumably the length of the referenced part in bytes -- confirm.
    pub size: u64,
}
|
||||
|
||||
/// Metadata about location of the row id sequence.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
pub enum RowIdMeta {
    /// The sequence is carried inline as raw bytes.
    Inline(Vec<u8>),
    /// The sequence is stored in (part of) a separate file.
    External(ExternalFile),
}
|
||||
|
||||
impl TryFrom<pb::data_fragment::RowIdSequence> for RowIdMeta {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(value: pb::data_fragment::RowIdSequence) -> Result<Self> {
|
||||
match value {
|
||||
pb::data_fragment::RowIdSequence::InlineRowIds(data) => Ok(Self::Inline(data)),
|
||||
pb::data_fragment::RowIdSequence::ExternalRowIds(file) => {
|
||||
Ok(Self::External(ExternalFile {
|
||||
path: file.path.clone(),
|
||||
offset: file.offset,
|
||||
size: file.size,
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Data fragment.
///
/// A fragment is a set of files which represent the different columns of the same rows.
/// If column exists in the schema, but the related file does not exist, treat this column as `nulls`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
pub struct Fragment {
    /// Fragment ID
    pub id: u64,

    /// Files within the fragment.
    pub files: Vec<DataFile>,

    /// Optional file with deleted local row offsets.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub deletion_file: Option<DeletionFile>,

    /// Location of this fragment's row id sequence (inline bytes or an
    /// external file), if it has one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub row_id_meta: Option<RowIdMeta>,

    /// Original number of rows in the fragment. If this is None, then it is
    /// unknown. This is only optional for legacy reasons. All new tables should
    /// have this set.
    pub physical_rows: Option<usize>,

    /// Last updated at version metadata
    #[serde(skip_serializing_if = "Option::is_none")]
    pub last_updated_at_version_meta: Option<RowDatasetVersionMeta>,

    /// Created at version metadata
    #[serde(skip_serializing_if = "Option::is_none")]
    pub created_at_version_meta: Option<RowDatasetVersionMeta>,
}
|
||||
|
||||
impl Fragment {
|
||||
pub fn new(id: u64) -> Self {
|
||||
Self {
|
||||
id,
|
||||
files: vec![],
|
||||
deletion_file: None,
|
||||
row_id_meta: None,
|
||||
physical_rows: None,
|
||||
last_updated_at_version_meta: None,
|
||||
created_at_version_meta: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn num_rows(&self) -> Option<usize> {
|
||||
match (self.physical_rows, &self.deletion_file) {
|
||||
// Known fragment length, no deletion file.
|
||||
(Some(len), None) => Some(len),
|
||||
// Known fragment length, but don't know deletion file size.
|
||||
(
|
||||
Some(len),
|
||||
Some(DeletionFile {
|
||||
num_deleted_rows: Some(num_deleted_rows),
|
||||
..
|
||||
}),
|
||||
) => Some(len - num_deleted_rows),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_json(json: &str) -> Result<Self> {
|
||||
let fragment: Self = serde_json::from_str(json)?;
|
||||
Ok(fragment)
|
||||
}
|
||||
|
||||
/// Create a `Fragment` with one DataFile
|
||||
pub fn with_file_legacy(
|
||||
id: u64,
|
||||
path: &str,
|
||||
schema: &Schema,
|
||||
physical_rows: Option<usize>,
|
||||
) -> Self {
|
||||
Self {
|
||||
id,
|
||||
files: vec![DataFile::new_legacy(path, schema, None, None)],
|
||||
deletion_file: None,
|
||||
physical_rows,
|
||||
row_id_meta: None,
|
||||
last_updated_at_version_meta: None,
|
||||
created_at_version_meta: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_file(
|
||||
mut self,
|
||||
path: impl Into<String>,
|
||||
field_ids: Vec<i32>,
|
||||
column_indices: Vec<i32>,
|
||||
version: &LanceFileVersion,
|
||||
file_size_bytes: Option<NonZero<u64>>,
|
||||
) -> Self {
|
||||
let (major, minor) = version.to_numbers();
|
||||
let data_file = DataFile::new(
|
||||
path,
|
||||
field_ids,
|
||||
column_indices,
|
||||
major,
|
||||
minor,
|
||||
file_size_bytes,
|
||||
None,
|
||||
);
|
||||
self.files.push(data_file);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_physical_rows(mut self, physical_rows: usize) -> Self {
|
||||
self.physical_rows = Some(physical_rows);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn add_file(
|
||||
&mut self,
|
||||
path: impl Into<String>,
|
||||
field_ids: Vec<i32>,
|
||||
column_indices: Vec<i32>,
|
||||
version: &LanceFileVersion,
|
||||
file_size_bytes: Option<NonZero<u64>>,
|
||||
) {
|
||||
let (major, minor) = version.to_numbers();
|
||||
self.files.push(DataFile::new(
|
||||
path,
|
||||
field_ids,
|
||||
column_indices,
|
||||
major,
|
||||
minor,
|
||||
file_size_bytes,
|
||||
None,
|
||||
));
|
||||
}
|
||||
|
||||
/// Add a new [`DataFile`] to this fragment.
|
||||
pub fn add_file_legacy(&mut self, path: &str, schema: &Schema) {
|
||||
self.files
|
||||
.push(DataFile::new_legacy(path, schema, None, None));
|
||||
}
|
||||
|
||||
// True if this fragment is made up of legacy v1 files, false otherwise
|
||||
pub fn has_legacy_files(&self) -> bool {
|
||||
// If any file in a fragment is legacy then all files in the fragment must be
|
||||
self.files[0].is_legacy_file()
|
||||
}
|
||||
|
||||
// Helper method to infer the Lance version from a set of fragments
|
||||
//
|
||||
// Returns None if there are no data files
|
||||
// Returns an error if the data files have different versions
|
||||
pub fn try_infer_version(fragments: &[Self]) -> Result<Option<LanceFileVersion>> {
|
||||
// Otherwise we need to check the actual file versions
|
||||
// Determine version from first file
|
||||
let Some(sample_file) = fragments
|
||||
.iter()
|
||||
.find(|f| !f.files.is_empty())
|
||||
.map(|f| &f.files[0])
|
||||
else {
|
||||
return Ok(None);
|
||||
};
|
||||
let file_version = LanceFileVersion::try_from_major_minor(
|
||||
sample_file.file_major_version,
|
||||
sample_file.file_minor_version,
|
||||
)?;
|
||||
// Ensure all files match
|
||||
for frag in fragments {
|
||||
for file in &frag.files {
|
||||
let this_file_version = LanceFileVersion::try_from_major_minor(
|
||||
file.file_major_version,
|
||||
file.file_minor_version,
|
||||
)?;
|
||||
if file_version != this_file_version {
|
||||
return Err(Error::invalid_input(format!(
|
||||
"All data files must have the same version. Detected both {} and {}",
|
||||
file_version, this_file_version
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Some(file_version))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<pb::DataFragment> for Fragment {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(p: pb::DataFragment) -> Result<Self> {
|
||||
let physical_rows = if p.physical_rows > 0 {
|
||||
Some(p.physical_rows as usize)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
Ok(Self {
|
||||
id: p.id,
|
||||
files: p
|
||||
.files
|
||||
.into_iter()
|
||||
.map(DataFile::try_from)
|
||||
.collect::<Result<_>>()?,
|
||||
deletion_file: p.deletion_file.map(DeletionFile::try_from).transpose()?,
|
||||
row_id_meta: p.row_id_sequence.map(RowIdMeta::try_from).transpose()?,
|
||||
physical_rows,
|
||||
last_updated_at_version_meta: p
|
||||
.last_updated_at_version_sequence
|
||||
.map(RowDatasetVersionMeta::try_from)
|
||||
.transpose()?,
|
||||
created_at_version_meta: p
|
||||
.created_at_version_sequence
|
||||
.map(RowDatasetVersionMeta::try_from)
|
||||
.transpose()?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&Fragment> for pb::DataFragment {
|
||||
fn from(f: &Fragment) -> Self {
|
||||
let deletion_file = f.deletion_file.as_ref().map(|f| {
|
||||
let file_type = match f.file_type {
|
||||
DeletionFileType::Array => pb::deletion_file::DeletionFileType::ArrowArray,
|
||||
DeletionFileType::Bitmap => pb::deletion_file::DeletionFileType::Bitmap,
|
||||
};
|
||||
pb::DeletionFile {
|
||||
read_version: f.read_version,
|
||||
id: f.id,
|
||||
file_type: file_type.into(),
|
||||
num_deleted_rows: f.num_deleted_rows.unwrap_or_default() as u64,
|
||||
base_id: f.base_id,
|
||||
}
|
||||
});
|
||||
|
||||
let row_id_sequence = f.row_id_meta.as_ref().map(|m| match m {
|
||||
RowIdMeta::Inline(data) => pb::data_fragment::RowIdSequence::InlineRowIds(data.clone()),
|
||||
RowIdMeta::External(file) => {
|
||||
pb::data_fragment::RowIdSequence::ExternalRowIds(pb::ExternalFile {
|
||||
path: file.path.clone(),
|
||||
offset: file.offset,
|
||||
size: file.size,
|
||||
})
|
||||
}
|
||||
});
|
||||
let last_updated_at_version_sequence =
|
||||
last_updated_at_version_meta_to_pb(&f.last_updated_at_version_meta);
|
||||
let created_at_version_sequence = created_at_version_meta_to_pb(&f.created_at_version_meta);
|
||||
Self {
|
||||
id: f.id,
|
||||
files: f.files.iter().map(pb::DataFile::from).collect(),
|
||||
deletion_file,
|
||||
row_id_sequence,
|
||||
physical_rows: f.physical_rows.unwrap_or_default() as u64,
|
||||
last_updated_at_version_sequence,
|
||||
created_at_version_sequence,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::{
        DataType, Field as ArrowField, Fields as ArrowFields, Schema as ArrowSchema,
    };
    use object_store::path::Path;
    use serde_json::{Value, json};

    /// Fragment 123 with one legacy file (`foobar.lance`, a single Float16
    /// column `x`) and an array-type deletion file; shared by the
    /// serialization tests below.
    fn fragment_with_deletion_file() -> Fragment {
        let arrow_schema = ArrowSchema::new(vec![ArrowField::new("x", DataType::Float16, true)]);
        let schema = Schema::try_from(&arrow_schema).unwrap();

        let mut fragment = Fragment::new(123);
        fragment.add_file_legacy("foobar.lance", &schema);
        fragment.deletion_file = Some(DeletionFile {
            read_version: 123,
            id: 456,
            file_type: DeletionFileType::Array,
            num_deleted_rows: Some(10),
            base_id: None,
        });
        fragment
    }

    #[test]
    fn test_new_fragment() {
        let path = "foobar.lance";

        let struct_fields = ArrowFields::from(vec![
            ArrowField::new("si", DataType::Int32, false),
            ArrowField::new("sb", DataType::Binary, true),
        ]);
        let arrow_schema = ArrowSchema::new(vec![
            ArrowField::new("s", DataType::Struct(struct_fields), true),
            ArrowField::new("bool", DataType::Boolean, true),
        ]);
        let schema = Schema::try_from(&arrow_schema).unwrap();
        let fragment = Fragment::with_file_legacy(123, path, &schema, Some(10));

        assert_eq!(123, fragment.id);

        // The struct, its two children and the boolean column get field ids 0..=3.
        let expected_files = vec![DataFile::new_legacy_from_fields(
            path.to_string(),
            vec![0, 1, 2, 3],
            None,
        )];
        assert_eq!(fragment.files, expected_files);
    }

    #[test]
    fn test_roundtrip_fragment() {
        let mut fragment = fragment_with_deletion_file();

        // Round trip with a deletion file attached.
        let proto = pb::DataFragment::from(&fragment);
        let restored = Fragment::try_from(proto).unwrap();
        assert_eq!(fragment, restored);

        // Round trip again once the deletion file has been removed.
        fragment.deletion_file = None;
        let proto = pb::DataFragment::from(&fragment);
        let restored = Fragment::try_from(proto).unwrap();
        assert_eq!(fragment, restored);
    }

    #[test]
    fn test_to_json() {
        let fragment = fragment_with_deletion_file();

        let json = serde_json::to_string(&fragment).unwrap();

        let value: Value = serde_json::from_str(&json).unwrap();
        assert_eq!(
            value,
            json!({
                "id": 123,
                "files":[
                    {"path": "foobar.lance", "fields": [0], "column_indices": [],
                     "file_major_version": MAJOR_VERSION, "file_minor_version": MINOR_VERSION,
                     "file_size_bytes": null, "base_id": null }
                ],
                "deletion_file": {"read_version": 123, "id": 456, "file_type": "array",
                                  "num_deleted_rows": 10, "base_id": null},
                "physical_rows": None::<usize>}),
        );

        let parsed = Fragment::from_json(&json).unwrap();
        assert_eq!(fragment, parsed);
    }

    #[test]
    fn data_file_validate_allows_extra_columns() {
        let data_file = DataFile {
            path: "foo.lance".to_string(),
            fields: Arc::from([1, 2]),
            // One extra column without a field id mapping
            column_indices: Arc::from([0, 1, 2]),
            file_major_version: MAJOR_VERSION as u32,
            file_minor_version: MINOR_VERSION as u32,
            file_size_bytes: Default::default(),
            base_id: None,
        };

        let base_path = Path::from("base");
        let outcome = data_file.validate(&base_path);
        outcome.expect("validation should allow extra columns without field ids");
    }
}
|
||||
368
vendor/lance-table/src/format/index.rs
vendored
Normal file
368
vendor/lance-table/src/format/index.rs
vendored
Normal file
|
|
@ -0,0 +1,368 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
//! Metadata for index
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use deepsize::DeepSizeOf;
|
||||
use futures::StreamExt;
|
||||
use lance_io::object_store::ObjectStore;
|
||||
use object_store::path::Path;
|
||||
use roaring::RoaringBitmap;
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::pb;
|
||||
use lance_core::{Error, Result};
|
||||
|
||||
/// Metadata about a single file within an index segment.
|
||||
#[derive(Debug, Clone, PartialEq, DeepSizeOf)]
|
||||
pub struct IndexFile {
|
||||
/// Path relative to the index directory (e.g., "index.idx", "auxiliary.idx")
|
||||
pub path: String,
|
||||
/// Size of the file in bytes
|
||||
pub size_bytes: u64,
|
||||
}
|
||||
|
||||
/// Index metadata
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct IndexMetadata {
|
||||
/// Unique ID across all dataset versions.
|
||||
pub uuid: Uuid,
|
||||
|
||||
/// Fields to build the index.
|
||||
pub fields: Vec<i32>,
|
||||
|
||||
/// Human readable index name
|
||||
pub name: String,
|
||||
|
||||
/// The version of the dataset this index was last updated on
|
||||
///
|
||||
/// This is set when the index is created (based on the version used to train the index)
|
||||
/// This is updated when the index is updated or remapped
|
||||
pub dataset_version: u64,
|
||||
|
||||
/// The fragment ids this index covers.
|
||||
///
|
||||
/// This may contain fragment ids that no longer exist in the dataset.
|
||||
///
|
||||
/// If this is None, then this is unknown.
|
||||
pub fragment_bitmap: Option<RoaringBitmap>,
|
||||
|
||||
/// Metadata specific to the index type
|
||||
///
|
||||
/// This is an Option because older versions of Lance may not have this defined. However, it should always
|
||||
/// be present in newer versions.
|
||||
pub index_details: Option<Arc<prost_types::Any>>,
|
||||
|
||||
/// The index version.
|
||||
pub index_version: i32,
|
||||
|
||||
/// Timestamp when the index was created
|
||||
///
|
||||
/// This field is optional for backward compatibility. For existing indices created before
|
||||
/// this field was added, this will be None.
|
||||
pub created_at: Option<DateTime<Utc>>,
|
||||
|
||||
/// The base path index of the index files. Used when the index is imported or referred from another dataset.
|
||||
/// Lance uses it as key of the base_paths field in Manifest to determine the actual base path of the index files.
|
||||
pub base_id: Option<u32>,
|
||||
|
||||
/// List of files and their sizes for this index segment.
|
||||
/// This enables skipping HEAD calls when opening indices and provides
|
||||
/// visibility into index storage size via describe_indices().
|
||||
/// This is None if the file sizes are unknown. This happens for indices created
|
||||
/// before this field was added.
|
||||
pub files: Option<Vec<IndexFile>>,
|
||||
}
|
||||
|
||||
impl IndexMetadata {
|
||||
pub fn effective_fragment_bitmap(
|
||||
&self,
|
||||
existing_fragments: &RoaringBitmap,
|
||||
) -> Option<RoaringBitmap> {
|
||||
let fragment_bitmap = self.fragment_bitmap.as_ref()?;
|
||||
Some(fragment_bitmap & existing_fragments)
|
||||
}
|
||||
|
||||
/// Returns a map of relative file paths to their sizes.
|
||||
/// Returns an empty map if file information is not available.
|
||||
pub fn file_size_map(&self) -> HashMap<String, u64> {
|
||||
self.files
|
||||
.as_ref()
|
||||
.map(|files| {
|
||||
files
|
||||
.iter()
|
||||
.map(|f| (f.path.clone(), f.size_bytes))
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Returns the total size of all files in this index segment in bytes.
|
||||
/// Returns None if file information is not available.
|
||||
pub fn total_size_bytes(&self) -> Option<u64> {
|
||||
self.files
|
||||
.as_ref()
|
||||
.map(|files| files.iter().map(|f| f.size_bytes).sum())
|
||||
}
|
||||
|
||||
/// Returns the set of fragments which are part of the fragment bitmap
|
||||
/// but no longer in the dataset.
|
||||
pub fn deleted_fragment_bitmap(
|
||||
&self,
|
||||
existing_fragments: &RoaringBitmap,
|
||||
) -> Option<RoaringBitmap> {
|
||||
let fragment_bitmap = self.fragment_bitmap.as_ref()?;
|
||||
Some(fragment_bitmap - existing_fragments)
|
||||
}
|
||||
}
|
||||
|
||||
impl DeepSizeOf for IndexMetadata {
|
||||
fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize {
|
||||
self.uuid.as_bytes().deep_size_of_children(context)
|
||||
+ self.fields.deep_size_of_children(context)
|
||||
+ self.name.deep_size_of_children(context)
|
||||
+ self.dataset_version.deep_size_of_children(context)
|
||||
+ self
|
||||
.fragment_bitmap
|
||||
.as_ref()
|
||||
.map(|fragment_bitmap| fragment_bitmap.serialized_size())
|
||||
.unwrap_or(0)
|
||||
+ self.files.deep_size_of_children(context)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<pb::IndexMetadata> for IndexMetadata {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(proto: pb::IndexMetadata) -> Result<Self> {
|
||||
let fragment_bitmap = if proto.fragment_bitmap.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(RoaringBitmap::deserialize_from(
|
||||
&mut proto.fragment_bitmap.as_slice(),
|
||||
)?)
|
||||
};
|
||||
|
||||
let files = if proto.files.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(
|
||||
proto
|
||||
.files
|
||||
.into_iter()
|
||||
.map(|f| IndexFile {
|
||||
path: f.path,
|
||||
size_bytes: f.size_bytes,
|
||||
})
|
||||
.collect(),
|
||||
)
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
uuid: proto.uuid.as_ref().map(Uuid::try_from).ok_or_else(|| {
|
||||
Error::invalid_input("uuid field does not exist in Index metadata".to_string())
|
||||
})??,
|
||||
name: proto.name,
|
||||
fields: proto.fields,
|
||||
dataset_version: proto.dataset_version,
|
||||
fragment_bitmap,
|
||||
index_details: proto.index_details.map(Arc::new),
|
||||
index_version: proto.index_version.unwrap_or_default(),
|
||||
created_at: proto.created_at.map(|ts| {
|
||||
DateTime::from_timestamp_millis(ts as i64)
|
||||
.expect("Invalid timestamp in index metadata")
|
||||
}),
|
||||
base_id: proto.base_id,
|
||||
files,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&IndexMetadata> for pb::IndexMetadata {
|
||||
fn from(idx: &IndexMetadata) -> Self {
|
||||
let mut fragment_bitmap = Vec::new();
|
||||
if let Some(bitmap) = &idx.fragment_bitmap
|
||||
&& let Err(e) = bitmap.serialize_into(&mut fragment_bitmap)
|
||||
{
|
||||
// In theory, this should never error. But if we do, just
|
||||
// recover gracefully.
|
||||
log::error!("Failed to serialize fragment bitmap: {}", e);
|
||||
fragment_bitmap.clear();
|
||||
}
|
||||
|
||||
let files = idx
|
||||
.files
|
||||
.as_ref()
|
||||
.map(|files| {
|
||||
files
|
||||
.iter()
|
||||
.map(|f| pb::IndexFile {
|
||||
path: f.path.clone(),
|
||||
size_bytes: f.size_bytes,
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
Self {
|
||||
uuid: Some((&idx.uuid).into()),
|
||||
name: idx.name.clone(),
|
||||
fields: idx.fields.clone(),
|
||||
dataset_version: idx.dataset_version,
|
||||
fragment_bitmap,
|
||||
index_details: idx
|
||||
.index_details
|
||||
.as_ref()
|
||||
.map(|details| details.as_ref().clone()),
|
||||
index_version: Some(idx.index_version),
|
||||
created_at: idx.created_at.map(|dt| dt.timestamp_millis() as u64),
|
||||
base_id: idx.base_id,
|
||||
files,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a [`CacheCodec`](lance_core::cache::CacheCodec) for `Vec<IndexMetadata>`.
|
||||
///
|
||||
/// Uses `pb::IndexSection` (which wraps `repeated IndexMetadata`) as the wire
|
||||
/// format, reusing the existing `TryFrom`/`From` conversions.
|
||||
///
|
||||
/// Uses [`CacheCodec::new`](lance_core::cache::CacheCodec::new) because the
|
||||
/// orphan rule prevents `impl CacheCodecImpl for Vec<IndexMetadata>`.
|
||||
type ArcAny = Arc<dyn std::any::Any + Send + Sync>;
|
||||
|
||||
fn serialize_index_metadata(
|
||||
any: &ArcAny,
|
||||
writer: &mut dyn std::io::Write,
|
||||
) -> lance_core::Result<()> {
|
||||
use prost::Message;
|
||||
let vec = any
|
||||
.downcast_ref::<Vec<IndexMetadata>>()
|
||||
.expect("index_metadata_codec: wrong type (this is a bug in the cache layer)");
|
||||
let section = pb::IndexSection {
|
||||
indices: vec.iter().map(pb::IndexMetadata::from).collect(),
|
||||
};
|
||||
writer.write_all(§ion.encode_to_vec())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize_index_metadata(data: &bytes::Bytes) -> lance_core::Result<ArcAny> {
|
||||
use prost::Message;
|
||||
let section = pb::IndexSection::decode(data.as_ref())?;
|
||||
let indices: Vec<IndexMetadata> = section
|
||||
.indices
|
||||
.into_iter()
|
||||
.map(IndexMetadata::try_from)
|
||||
.collect::<lance_core::Result<_>>()?;
|
||||
Ok(Arc::new(indices))
|
||||
}
|
||||
|
||||
pub fn index_metadata_codec() -> lance_core::cache::CacheCodec {
|
||||
lance_core::cache::CacheCodec::new(serialize_index_metadata, deserialize_index_metadata)
|
||||
}
|
||||
|
||||
/// List all files in an index directory with their sizes.
|
||||
///
|
||||
/// Returns a list of `IndexFile` structs containing relative paths and sizes.
|
||||
/// This is used to capture file metadata after index creation/modification.
|
||||
pub async fn list_index_files_with_sizes(
|
||||
object_store: &ObjectStore,
|
||||
index_dir: &Path,
|
||||
) -> Result<Vec<IndexFile>> {
|
||||
let mut files = Vec::new();
|
||||
let mut stream = object_store.read_dir_all(index_dir, None);
|
||||
while let Some(meta) = stream.next().await {
|
||||
let meta = meta?;
|
||||
// Get relative path by stripping the index_dir prefix
|
||||
let relative_path = meta
|
||||
.location
|
||||
.as_ref()
|
||||
.strip_prefix(index_dir.as_ref())
|
||||
.map(|s| s.trim_start_matches('/').to_string())
|
||||
.unwrap_or_else(|| meta.location.filename().unwrap_or("").to_string());
|
||||
files.push(IndexFile {
|
||||
path: relative_path,
|
||||
size_bytes: meta.size,
|
||||
});
|
||||
}
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// Shows how a disk-backed cache backend would use the codec: entries are
    /// serialized to bytes, kept in a key-value map and deserialized again on
    /// retrieval.
    #[test]
    fn test_index_metadata_codec_roundtrip() {
        let codec = index_metadata_codec();

        let with_files = IndexMetadata {
            uuid: Uuid::new_v4(),
            name: "my_index".to_string(),
            fields: vec![0, 1],
            dataset_version: 42,
            fragment_bitmap: Some(RoaringBitmap::from_iter([1, 2, 3])),
            index_details: None,
            index_version: 1,
            created_at: None,
            base_id: None,
            files: Some(vec![IndexFile {
                path: "index.idx".to_string(),
                size_bytes: 1024,
            }]),
        };
        let without_files = IndexMetadata {
            uuid: Uuid::new_v4(),
            name: "second_index".to_string(),
            fields: vec![2],
            dataset_version: 43,
            fragment_bitmap: None,
            index_details: None,
            index_version: 2,
            created_at: None,
            base_id: Some(7),
            files: None,
        };
        let original = vec![with_files, without_files];

        // Stand-in for a disk-backed store.
        let mut store: HashMap<String, Vec<u8>> = HashMap::new();
        let key = "dataset/v42/Vec<IndexMetadata>".to_string();

        // Write path: serialize the entry and put the bytes into the store.
        let entry: Arc<dyn std::any::Any + Send + Sync> = Arc::new(original.clone());
        let mut buf = Vec::new();
        codec.serialize(&entry, &mut buf).unwrap();
        store.insert(key.clone(), buf);

        // Read path: fetch the bytes and turn them back into a typed value.
        let stored = store.get(&key).unwrap();
        let decoded = codec
            .deserialize(&bytes::Bytes::copy_from_slice(stored))
            .unwrap();
        let recovered = decoded
            .downcast::<Vec<IndexMetadata>>()
            .expect("downcast should succeed");

        assert_eq!(original.len(), recovered.len());
        for (orig, rec) in original.iter().zip(recovered.iter()) {
            assert_eq!(orig.uuid, rec.uuid);
            assert_eq!(orig.name, rec.name);
            assert_eq!(orig.fields, rec.fields);
            assert_eq!(orig.dataset_version, rec.dataset_version);
            assert_eq!(orig.fragment_bitmap, rec.fragment_bitmap);
            assert_eq!(orig.index_version, rec.index_version);
            assert_eq!(orig.base_id, rec.base_id);
            assert_eq!(orig.files, rec.files);
        }
    }
}
|
||||
1490
vendor/lance-table/src/format/manifest.rs
vendored
Normal file
1490
vendor/lance-table/src/format/manifest.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
42
vendor/lance-table/src/format/transaction.rs
vendored
Executable file
42
vendor/lance-table/src/format/transaction.rs
vendored
Executable file
|
|
@ -0,0 +1,42 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
//! Transaction struct for lance-table format layer.
|
||||
//!
|
||||
//! This struct is introduced to provide a Struct-first API for passing transaction
|
||||
//! information within the lance-table crate. It mirrors the protobuf Transaction
|
||||
//! message at a semantic level while remaining crate-local, so lance-table does
|
||||
//! not depend on higher layers (e.g., lance crate).
|
||||
//!
|
||||
//! Conversion to protobuf occurs at the write boundary. See the `From<Transaction>`
|
||||
//! implementation below.
|
||||
|
||||
use crate::format::pb;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub struct Transaction {
|
||||
/// Crate-local representation backing: protobuf Transaction.
|
||||
/// Keeping this simple avoids ring dependencies while still enabling
|
||||
/// Struct-first parameter passing in lance-table.
|
||||
pub inner: pb::Transaction,
|
||||
}
|
||||
|
||||
impl Transaction {
|
||||
/// Accessor for testing or internal inspection if needed.
|
||||
pub fn as_pb(&self) -> &pb::Transaction {
|
||||
&self.inner
|
||||
}
|
||||
}
|
||||
|
||||
/// Write-boundary conversion: serialize using protobuf at the last step.
|
||||
impl From<Transaction> for pb::Transaction {
|
||||
fn from(tx: Transaction) -> Self {
|
||||
tx.inner
|
||||
}
|
||||
}
|
||||
|
||||
/// Wraps a protobuf `Transaction` message in the crate-local struct.
impl From<pb::Transaction> for Transaction {
    fn from(inner: pb::Transaction) -> Self {
        Self { inner }
    }
}
|
||||
6
vendor/lance-table/src/io.rs
vendored
Normal file
6
vendor/lance-table/src/io.rs
vendored
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
pub mod commit;
|
||||
pub mod deletion;
|
||||
pub mod manifest;
|
||||
1898
vendor/lance-table/src/io/commit.rs
vendored
Normal file
1898
vendor/lance-table/src/io/commit.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
495
vendor/lance-table/src/io/commit/dynamodb.rs
vendored
Normal file
495
vendor/lance-table/src/io/commit/dynamodb.rs
vendored
Normal file
|
|
@ -0,0 +1,495 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
//! DynamoDB based external manifest store
|
||||
//!
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::sync::{Arc, LazyLock};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use aws_sdk_dynamodb::Client;
|
||||
use aws_sdk_dynamodb::error::SdkError;
|
||||
use aws_sdk_dynamodb::operation::RequestId;
|
||||
use aws_sdk_dynamodb::operation::delete_item::builders::DeleteItemFluentBuilder;
|
||||
use aws_sdk_dynamodb::operation::{
|
||||
get_item::builders::GetItemFluentBuilder, put_item::builders::PutItemFluentBuilder,
|
||||
query::builders::QueryFluentBuilder,
|
||||
};
|
||||
use aws_sdk_dynamodb::types::{AttributeValue, KeyType};
|
||||
use object_store::path::Path;
|
||||
use snafu::OptionExt;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::warn;
|
||||
|
||||
use crate::io::commit::external_manifest::ExternalManifestStore;
|
||||
use lance_core::error::NotFoundSnafu;
|
||||
use lance_core::error::box_error;
|
||||
use lance_core::{Error, Result};
|
||||
|
||||
use super::ManifestLocation;
|
||||
use super::external_manifest::detect_naming_scheme_from_path;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct WrappedSdkError<E>(SdkError<E>);
|
||||
|
||||
impl<E> From<WrappedSdkError<E>> for Error
|
||||
where
|
||||
E: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
fn from(e: WrappedSdkError<E>) -> Self {
|
||||
Self::io_source(box_error(e))
|
||||
}
|
||||
}
|
||||
|
||||
impl<E> std::fmt::Display for WrappedSdkError<E>
|
||||
where
|
||||
E: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let request_id = self.0.request_id().unwrap_or("unknown");
|
||||
let service_err = &self.0.raw_response();
|
||||
write!(f, "WrappedSdkError: request_id: {}", request_id)?;
|
||||
if let Some(err) = service_err {
|
||||
write!(f, ", service_error: {:?}", err)
|
||||
} else {
|
||||
write!(f, ", no service error")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<E> std::error::Error for WrappedSdkError<E>
|
||||
where
|
||||
E: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
// Implement the necessary methods for the Error trait here.
|
||||
// For example, you can delegate to the inner SdkError:
|
||||
|
||||
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
|
||||
Some(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
trait SdkResultExt<T> {
|
||||
fn wrap_err(self) -> Result<T>;
|
||||
}
|
||||
|
||||
impl<T, E> SdkResultExt<T> for std::result::Result<T, SdkError<E>>
|
||||
where
|
||||
E: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
fn wrap_err(self) -> Result<T> {
|
||||
self.map_err(|err| {
|
||||
warn!(
|
||||
target: "lance::dynamodb",
|
||||
request_id = err.request_id().unwrap_or("unknown"),
|
||||
"DynamoDB SDK error: {err:?}",
|
||||
);
|
||||
Error::from(WrappedSdkError(err))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// An external manifest store backed by DynamoDB
|
||||
///
|
||||
/// When calling DynamoDBExternalManifestStore::new_external_store()
|
||||
/// the key schema, (PK, SK), is checked. If the table does not exist,
|
||||
/// or the key schema is not as expected, an error is returned.
|
||||
///
|
||||
/// The table schema is expected as follows:
|
||||
/// PK: base_uri -- string
|
||||
/// SK: version -- number
|
||||
/// path -- string
|
||||
/// committer -- string
|
||||
///
|
||||
/// Consistency: This store is expected to have read-after-write consistency
|
||||
/// consistent_read should always be set to true
|
||||
///
|
||||
/// Transaction Safety: This store uses DynamoDB conditional write to ensure
|
||||
/// only one writer can win per version.
|
||||
#[derive(Debug)]
|
||||
pub struct DynamoDBExternalManifestStore {
|
||||
client: Arc<Client>,
|
||||
table_name: String,
|
||||
committer_name: String,
|
||||
}
|
||||
|
||||
// Attribute names used in the DynamoDB table. They are macros rather than
// constants so that the literals can also be used as patterns in a `match`.
macro_rules! base_uri {
    () => ("base_uri");
}
macro_rules! version {
    () => ("version");
}
macro_rules! path {
    () => ("path");
}
macro_rules! committer {
    () => ("committer");
}
|
||||
|
||||
impl DynamoDBExternalManifestStore {
|
||||
pub async fn new_external_store(
|
||||
client: Arc<Client>,
|
||||
table_name: &str,
|
||||
committer_name: &str,
|
||||
) -> Result<Arc<dyn ExternalManifestStore>> {
|
||||
static SANITY_CHECK_CACHE: LazyLock<RwLock<HashSet<String>>> =
|
||||
LazyLock::new(|| RwLock::new(HashSet::new()));
|
||||
|
||||
let store = Arc::new(Self {
|
||||
client: client.clone(),
|
||||
table_name: table_name.to_string(),
|
||||
committer_name: committer_name.to_string(),
|
||||
});
|
||||
|
||||
// already checked this table before, skip
|
||||
// this is to avoid checking the table schema every time
|
||||
// because it's expensive to call DescribeTable
|
||||
if SANITY_CHECK_CACHE.read().await.contains(table_name) {
|
||||
return Ok(store);
|
||||
}
|
||||
|
||||
// Check if the table schema is correct
|
||||
let describe_result = client
|
||||
.describe_table()
|
||||
.table_name(table_name)
|
||||
.send()
|
||||
.await
|
||||
.wrap_err()?;
|
||||
let table = describe_result
|
||||
.table
|
||||
.ok_or_else(|| Error::io(format!("dynamodb table: {table_name} does not exist")))?;
|
||||
let mut schema = table.key_schema.ok_or_else(|| {
|
||||
Error::io(format!(
|
||||
"dynamodb table: {table_name} does not have a key schema"
|
||||
))
|
||||
})?;
|
||||
|
||||
let mut has_hash_key = false;
|
||||
let mut has_range_key = false;
|
||||
|
||||
// there should be two keys, HASH(base_uri) and RANGE(version)
|
||||
for _ in 0..2 {
|
||||
let key = schema.pop().ok_or_else(|| {
|
||||
Error::io(format!(
|
||||
"dynamodb table: {table_name} must have HASH and RANGE keys"
|
||||
))
|
||||
})?;
|
||||
match (key.key_type, key.attribute_name.as_str()) {
|
||||
(KeyType::Hash, base_uri!()) => {
|
||||
has_hash_key = true;
|
||||
}
|
||||
(KeyType::Range, version!()) => {
|
||||
has_range_key = true;
|
||||
}
|
||||
_ => {
|
||||
return Err(Error::io(format!(
|
||||
"dynamodb table: {} unknown key type encountered name:{}",
|
||||
table_name, key.attribute_name
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Both keys must be present
|
||||
if !(has_hash_key && has_range_key) {
|
||||
return Err(Error::io(format!(
|
||||
"dynamodb table: {} must have HASH and RANGE keys, named `{}` and `{}` respectively",
|
||||
table_name,
|
||||
base_uri!(),
|
||||
version!()
|
||||
)));
|
||||
}
|
||||
|
||||
SANITY_CHECK_CACHE
|
||||
.write()
|
||||
.await
|
||||
.insert(table_name.to_string());
|
||||
|
||||
Ok(store)
|
||||
}
|
||||
|
||||
fn ddb_put(&self) -> PutItemFluentBuilder {
|
||||
self.client.put_item().table_name(&self.table_name)
|
||||
}
|
||||
|
||||
fn ddb_get(&self) -> GetItemFluentBuilder {
|
||||
self.client
|
||||
.get_item()
|
||||
.table_name(&self.table_name)
|
||||
.consistent_read(true)
|
||||
}
|
||||
|
||||
fn ddb_query(&self) -> QueryFluentBuilder {
|
||||
self.client
|
||||
.query()
|
||||
.table_name(&self.table_name)
|
||||
.consistent_read(true)
|
||||
}
|
||||
|
||||
fn ddb_delete(&self) -> DeleteItemFluentBuilder {
|
||||
self.client.delete_item().table_name(&self.table_name)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ExternalManifestStore for DynamoDBExternalManifestStore {
|
||||
/// Get the manifest path for a given base_uri and version
|
||||
async fn get(&self, base_uri: &str, version: u64) -> Result<String> {
|
||||
let get_item_result = self
|
||||
.ddb_get()
|
||||
.key(base_uri!(), AttributeValue::S(base_uri.into()))
|
||||
.key(version!(), AttributeValue::N(version.to_string()))
|
||||
.send()
|
||||
.await
|
||||
.wrap_err()?;
|
||||
|
||||
let item = get_item_result.item.context(NotFoundSnafu {
|
||||
uri: format!(
|
||||
"dynamodb not found: base_uri: {}; version: {}",
|
||||
base_uri, version
|
||||
),
|
||||
})?;
|
||||
|
||||
let path = item
|
||||
.get(path!())
|
||||
.ok_or_else(|| Error::not_found(format!("key {} is not present", path!())))?;
|
||||
|
||||
match path {
|
||||
AttributeValue::S(path) => Ok(path.clone()),
|
||||
_ => Err(Error::invalid_input(format!(
|
||||
"key {} is not a string",
|
||||
path!()
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_manifest_location(
|
||||
&self,
|
||||
base_uri: &str,
|
||||
version: u64,
|
||||
) -> Result<ManifestLocation> {
|
||||
let get_item_result = self
|
||||
.ddb_get()
|
||||
.key(base_uri!(), AttributeValue::S(base_uri.into()))
|
||||
.key(version!(), AttributeValue::N(version.to_string()))
|
||||
.send()
|
||||
.await
|
||||
.wrap_err()?;
|
||||
|
||||
let item = get_item_result.item.context(NotFoundSnafu {
|
||||
uri: format!(
|
||||
"dynamodb not found: base_uri: {}; version: {}",
|
||||
base_uri, version
|
||||
),
|
||||
})?;
|
||||
|
||||
let path = item
|
||||
.get(path!())
|
||||
.ok_or_else(|| Error::not_found(format!("key {} is not present", path!())))?
|
||||
.as_s()
|
||||
.map_err(|_| Error::invalid_input(format!("key {} is not a string", path!())))?
|
||||
.as_str();
|
||||
let path = Path::from(path);
|
||||
|
||||
let size = item
|
||||
.get("size")
|
||||
.and_then(|attr| attr.as_n().ok().and_then(|v| v.parse().ok()));
|
||||
|
||||
let e_tag = item.get("e_tag").and_then(|attr| attr.as_s().ok().cloned());
|
||||
|
||||
let naming_scheme = detect_naming_scheme_from_path(&path)?;
|
||||
|
||||
Ok(ManifestLocation {
|
||||
version,
|
||||
path,
|
||||
size,
|
||||
naming_scheme,
|
||||
e_tag,
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the latest version of a dataset at the base_uri
|
||||
async fn get_latest_version(&self, base_uri: &str) -> Result<Option<(u64, String)>> {
|
||||
self.get_latest_manifest_location(base_uri)
|
||||
.await
|
||||
.map(|location| location.map(|loc| (loc.version, loc.path.to_string())))
|
||||
}
|
||||
|
||||
async fn get_latest_manifest_location(
|
||||
&self,
|
||||
base_uri: &str,
|
||||
) -> Result<Option<ManifestLocation>> {
|
||||
let query_result = self
|
||||
.ddb_query()
|
||||
.key_condition_expression(format!("{} = :{}", base_uri!(), base_uri!()))
|
||||
.expression_attribute_values(
|
||||
format!(":{}", base_uri!()),
|
||||
AttributeValue::S(base_uri.into()),
|
||||
)
|
||||
.scan_index_forward(false)
|
||||
.limit(1)
|
||||
.send()
|
||||
.await
|
||||
.wrap_err()?;
|
||||
|
||||
match query_result.items {
|
||||
Some(mut items) => {
|
||||
if items.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
if items.len() > 1 {
|
||||
return Err(Error::invalid_input(format!(
|
||||
"dynamodb table: {} returned unexpected number of items",
|
||||
self.table_name
|
||||
)));
|
||||
}
|
||||
|
||||
let item = items.pop().expect("length checked");
|
||||
let version_attribute = item
|
||||
.get(version!())
|
||||
.ok_or_else(|| Error::not_found(
|
||||
format!("dynamodb error: found entries for {} but the returned data does not contain {} column", base_uri, version!())
|
||||
))?;
|
||||
|
||||
let path_attribute = item
|
||||
.get(path!())
|
||||
.ok_or_else(|| Error::not_found(
|
||||
format!("dynamodb error: found entries for {} but the returned data does not contain {} column", base_uri, path!())
|
||||
))?;
|
||||
|
||||
let size = item.get("size").and_then(|attr| match attr {
|
||||
AttributeValue::N(size) => size.parse().ok(),
|
||||
_ => None,
|
||||
});
|
||||
|
||||
let e_tag = item.get("e_tag").and_then(|attr| attr.as_s().ok().cloned());
|
||||
|
||||
match (version_attribute, path_attribute) {
|
||||
(AttributeValue::N(version), AttributeValue::S(path)) => {
|
||||
let version = version.parse().map_err(|e| Error::invalid_input(format!("dynamodb error: could not parse the version number returned {}, error: {}", version, e)))?;
|
||||
let path = Path::from(path.as_str());
|
||||
let naming_scheme = detect_naming_scheme_from_path(&path)?;
|
||||
let location = ManifestLocation {
|
||||
version,
|
||||
path,
|
||||
size,
|
||||
naming_scheme,
|
||||
e_tag,
|
||||
};
|
||||
Ok(Some(location))
|
||||
}
|
||||
_ => Err(Error::invalid_input(format!(
|
||||
"dynamodb error: found entries for {base_uri} but the returned data is not number type"
|
||||
))),
|
||||
}
|
||||
}
|
||||
_ => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Put the manifest path for a given base_uri and version, should fail if the version already exists
|
||||
async fn put_if_not_exists(
|
||||
&self,
|
||||
base_uri: &str,
|
||||
version: u64,
|
||||
path: &str,
|
||||
size: u64,
|
||||
e_tag: Option<String>,
|
||||
) -> Result<()> {
|
||||
let mut put_item = self
|
||||
.ddb_put()
|
||||
.item(base_uri!(), AttributeValue::S(base_uri.into()))
|
||||
.item(version!(), AttributeValue::N(version.to_string()))
|
||||
.item(path!(), AttributeValue::S(path.to_string()))
|
||||
.item(committer!(), AttributeValue::S(self.committer_name.clone()))
|
||||
.item("size", AttributeValue::N(size.to_string()));
|
||||
|
||||
if let Some(e_tag) = e_tag {
|
||||
put_item = put_item.item("e_tag", AttributeValue::S(e_tag));
|
||||
}
|
||||
|
||||
put_item
|
||||
.condition_expression(format!(
|
||||
"attribute_not_exists({}) AND attribute_not_exists({})",
|
||||
base_uri!(),
|
||||
version!(),
|
||||
))
|
||||
.send()
|
||||
.await
|
||||
.wrap_err()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Put the manifest path for a given base_uri and version, should fail if the version **does not** already exist
|
||||
async fn put_if_exists(
|
||||
&self,
|
||||
base_uri: &str,
|
||||
version: u64,
|
||||
path: &str,
|
||||
size: u64,
|
||||
e_tag: Option<String>,
|
||||
) -> Result<()> {
|
||||
let mut put_item = self
|
||||
.ddb_put()
|
||||
.item(base_uri!(), AttributeValue::S(base_uri.into()))
|
||||
.item(version!(), AttributeValue::N(version.to_string()))
|
||||
.item(path!(), AttributeValue::S(path.to_string()))
|
||||
.item(committer!(), AttributeValue::S(self.committer_name.clone()))
|
||||
.item("size", AttributeValue::N(size.to_string()));
|
||||
|
||||
if let Some(e_tag) = e_tag {
|
||||
put_item = put_item.item("e_tag", AttributeValue::S(e_tag));
|
||||
}
|
||||
|
||||
put_item
|
||||
.condition_expression(format!(
|
||||
"attribute_exists({}) AND attribute_exists({})",
|
||||
base_uri!(),
|
||||
version!(),
|
||||
))
|
||||
.send()
|
||||
.await
|
||||
.wrap_err()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete the manifest information for the given base_uri in dynamodb
|
||||
async fn delete(&self, base_uri: &str) -> Result<()> {
|
||||
let query_result = self
|
||||
.ddb_query()
|
||||
.key_condition_expression(format!("{} = :{}", base_uri!(), base_uri!()))
|
||||
.expression_attribute_values(
|
||||
format!(":{}", base_uri!()),
|
||||
AttributeValue::S(base_uri.into()),
|
||||
)
|
||||
.send()
|
||||
.await
|
||||
.wrap_err()?;
|
||||
|
||||
if let Some(items) = query_result.items {
|
||||
for item in items {
|
||||
if let Some(AttributeValue::N(version)) = item.get("version") {
|
||||
self.ddb_delete()
|
||||
.key(base_uri!(), AttributeValue::S(base_uri.to_string()))
|
||||
.key(version!(), AttributeValue::N(version.clone()))
|
||||
.send()
|
||||
.await
|
||||
.wrap_err()?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
515
vendor/lance-table/src/io/commit/external_manifest.rs
vendored
Normal file
515
vendor/lance-table/src/io/commit/external_manifest.rs
vendored
Normal file
|
|
@ -0,0 +1,515 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
//! Trait for external manifest handler.
|
||||
//!
|
||||
//! This trait abstracts an external storage with put_if_not_exists semantics.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use lance_core::utils::tracing::{
|
||||
AUDIT_MODE_CREATE, AUDIT_MODE_DELETE, AUDIT_TYPE_MANIFEST, TRACE_FILE_AUDIT,
|
||||
};
|
||||
use lance_core::{Error, Result};
|
||||
use lance_io::object_store::ObjectStore;
|
||||
use log::warn;
|
||||
use object_store::ObjectMeta;
|
||||
use object_store::ObjectStoreExt;
|
||||
use object_store::{Error as ObjectStoreError, ObjectStore as OSObjectStore, path::Path};
|
||||
use tracing::info;
|
||||
|
||||
use super::{
|
||||
MANIFEST_EXTENSION, ManifestLocation, ManifestNamingScheme, current_manifest_path,
|
||||
default_resolve_version, make_staging_manifest_path, write_version_hint,
|
||||
};
|
||||
use crate::format::{IndexMetadata, Manifest, Transaction};
|
||||
use crate::io::commit::{CommitError, CommitHandler};
|
||||
|
||||
/// External manifest store
|
||||
///
|
||||
/// This trait abstracts an external storage for source of truth for manifests.
|
||||
/// The storage is expected to remember (uri, version) -> manifest_path
|
||||
/// and able to run transactions on the manifest_path.
|
||||
///
|
||||
/// This trait is called an **External** manifest store because the store is
|
||||
/// expected to work in tandem with the object store. We are only leveraging
|
||||
/// the external store for concurrent commit. Any manifest committed through this
|
||||
/// trait should ultimately be materialized in the object store.
|
||||
/// For a visual explanation of the commit loop see
|
||||
/// <https://github.com/lance-format/lance/assets/12615154/b0822312-0826-432a-b554-3965f8d48d04>
|
||||
#[async_trait]
|
||||
pub trait ExternalManifestStore: std::fmt::Debug + Send + Sync {
|
||||
/// Get the manifest path for a given base_uri and version
|
||||
async fn get(&self, base_uri: &str, version: u64) -> Result<String>;
|
||||
|
||||
async fn get_manifest_location(
|
||||
&self,
|
||||
base_uri: &str,
|
||||
version: u64,
|
||||
) -> Result<ManifestLocation> {
|
||||
let path = self.get(base_uri, version).await?;
|
||||
let path = Path::parse(&path).map_err(|e| Error::invalid_input(e.to_string()))?;
|
||||
let naming_scheme = detect_naming_scheme_from_path(&path)?;
|
||||
Ok(ManifestLocation {
|
||||
version,
|
||||
path,
|
||||
size: None,
|
||||
naming_scheme,
|
||||
e_tag: None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Get the latest version of a dataset at the base_uri, and the path to the manifest.
|
||||
/// The path is provided as an optimization. The path is deterministic based on
|
||||
/// the version and the store should not customize it.
|
||||
async fn get_latest_version(&self, base_uri: &str) -> Result<Option<(u64, String)>>;
|
||||
|
||||
/// Get the latest manifest location for a given base_uri.
|
||||
///
|
||||
/// By default, this calls get_latest_version. Impls should
|
||||
/// override this method if they store both the location and size
|
||||
/// of the latest manifest.
|
||||
async fn get_latest_manifest_location(
|
||||
&self,
|
||||
base_uri: &str,
|
||||
) -> Result<Option<ManifestLocation>> {
|
||||
self.get_latest_version(base_uri).await.and_then(|res| {
|
||||
res.map(|(version, uri)| {
|
||||
let path = Path::parse(&uri).map_err(|e| Error::invalid_input(e.to_string()))?;
|
||||
let naming_scheme = detect_naming_scheme_from_path(&path)?;
|
||||
Ok(ManifestLocation {
|
||||
version,
|
||||
path,
|
||||
size: None,
|
||||
naming_scheme,
|
||||
e_tag: None,
|
||||
})
|
||||
})
|
||||
.transpose()
|
||||
})
|
||||
}
|
||||
|
||||
/// Put the manifest to the external store.
|
||||
///
|
||||
/// The staging manifest has been written to `staging_path` on the object store.
|
||||
/// This method should atomically claim the version and return the final manifest location.
|
||||
///
|
||||
/// The default implementation uses put_if_not_exists and put_if_exists to
|
||||
/// implement a staging-based workflow. Implementations that can write directly
|
||||
/// (e.g., namespace-backed stores) should override this method.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn put(
|
||||
&self,
|
||||
base_path: &Path,
|
||||
version: u64,
|
||||
staging_path: &Path,
|
||||
size: u64,
|
||||
e_tag: Option<String>,
|
||||
object_store: &dyn OSObjectStore,
|
||||
naming_scheme: ManifestNamingScheme,
|
||||
) -> Result<ManifestLocation> {
|
||||
// Default implementation: staging-based workflow
|
||||
|
||||
// Step 1: Record staging path atomically
|
||||
self.put_if_not_exists(
|
||||
base_path.as_ref(),
|
||||
version,
|
||||
staging_path.as_ref(),
|
||||
size,
|
||||
e_tag.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Step 2: Copy staging to final path
|
||||
let final_path = naming_scheme.manifest_path(base_path, version);
|
||||
let copied = match object_store.copy(staging_path, &final_path).await {
|
||||
Ok(_) => true,
|
||||
Err(ObjectStoreError::NotFound { .. }) => false,
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
if copied {
|
||||
info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_CREATE, r#type=AUDIT_TYPE_MANIFEST, path = final_path.as_ref());
|
||||
}
|
||||
|
||||
// Get final e_tag (may change after copy for large files)
|
||||
let e_tag = if copied && size < 5 * 1024 * 1024 {
|
||||
e_tag
|
||||
} else {
|
||||
let meta = object_store.head(&final_path).await?;
|
||||
meta.e_tag
|
||||
};
|
||||
|
||||
let location = ManifestLocation {
|
||||
version,
|
||||
path: final_path.clone(),
|
||||
size: Some(size),
|
||||
naming_scheme,
|
||||
e_tag: e_tag.clone(),
|
||||
};
|
||||
|
||||
if !copied {
|
||||
return Ok(location);
|
||||
}
|
||||
|
||||
// Step 3: Update external store to final path
|
||||
self.put_if_exists(
|
||||
base_path.as_ref(),
|
||||
version,
|
||||
final_path.as_ref(),
|
||||
size,
|
||||
e_tag,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Step 4: Delete staging manifest
|
||||
match object_store.delete(staging_path).await {
|
||||
Ok(_) => {}
|
||||
Err(ObjectStoreError::NotFound { .. }) => {}
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = staging_path.as_ref());
|
||||
|
||||
Ok(location)
|
||||
}
|
||||
|
||||
/// Put the manifest path for a given base_uri and version, should fail if the version already exists
|
||||
async fn put_if_not_exists(
|
||||
&self,
|
||||
base_uri: &str,
|
||||
version: u64,
|
||||
path: &str,
|
||||
size: u64,
|
||||
e_tag: Option<String>,
|
||||
) -> Result<()>;
|
||||
|
||||
/// Put the manifest path for a given base_uri and version, should fail if the version **does not** already exist
|
||||
async fn put_if_exists(
|
||||
&self,
|
||||
base_uri: &str,
|
||||
version: u64,
|
||||
path: &str,
|
||||
size: u64,
|
||||
e_tag: Option<String>,
|
||||
) -> Result<()>;
|
||||
|
||||
/// Delete the manifest information for the given base_uri from the store.
///
/// The default implementation does nothing and returns `Ok(())`.
|
||||
async fn delete(&self, _base_uri: &str) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn detect_naming_scheme_from_path(path: &Path) -> Result<ManifestNamingScheme> {
|
||||
path.filename()
|
||||
.and_then(|name| {
|
||||
ManifestNamingScheme::detect_scheme(name)
|
||||
.or_else(|| Some(ManifestNamingScheme::detect_scheme_staging(name)))
|
||||
})
|
||||
.ok_or_else(|| {
|
||||
Error::corrupt_file(
|
||||
path.clone(),
|
||||
"Path does not follow known manifest naming convention.",
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// External manifest commit handler.
|
||||
/// This handler is used to commit a manifest to an external store.
|
||||
/// For the detailed design, see <https://github.com/lance-format/lance/issues/1183>
|
||||
#[derive(Debug)]
|
||||
pub struct ExternalManifestCommitHandler {
|
||||
/// The external store that manifest lookups and commits are delegated to.
pub external_manifest_store: Arc<dyn ExternalManifestStore>,
|
||||
}
|
||||
|
||||
impl ExternalManifestCommitHandler {
|
||||
/// The manifest is considered committed once the staging manifest is written
|
||||
/// to object store and that path is committed to the external store.
|
||||
///
|
||||
/// However, to fully complete this, the staging manifest should be materialized
|
||||
/// into the final path, the final path should be committed to the external store
|
||||
/// and the staging manifest should be deleted. These steps may be completed
|
||||
/// by any number of readers or writers, so care should be taken to ensure
|
||||
/// that the manifest is not lost nor any errors occur due to duplicate
|
||||
/// operations.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn finalize_manifest(
|
||||
&self,
|
||||
base_path: &Path,
|
||||
staging_manifest_path: &Path,
|
||||
version: u64,
|
||||
size: u64,
|
||||
e_tag: Option<String>,
|
||||
store: &dyn OSObjectStore,
|
||||
naming_scheme: ManifestNamingScheme,
|
||||
) -> std::result::Result<ManifestLocation, Error> {
|
||||
// step 1: copy the manifest to the final location
|
||||
let final_manifest_path = naming_scheme.manifest_path(base_path, version);
|
||||
|
||||
let copied = match store
|
||||
.copy(staging_manifest_path, &final_manifest_path)
|
||||
.await
|
||||
{
|
||||
Ok(_) => true,
|
||||
Err(ObjectStoreError::NotFound { .. }) => false, // Another writer beat us to it.
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
if copied {
|
||||
info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_CREATE, r#type=AUDIT_TYPE_MANIFEST, path = final_manifest_path.as_ref());
|
||||
}
|
||||
|
||||
// On S3, the etag can change if originally was MultipartUpload and later was Copy
|
||||
// https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html#AmazonS3-Type-Object-ETag
|
||||
// We only do MultipartUpload for > 5MB files, so we can skip this check
|
||||
// if size < 5MB. However, we need to double check the final_manifest_path
|
||||
// exists before we change the external store, otherwise we may point to a
|
||||
// non-existing manifest.
|
||||
let e_tag = if copied && size < 5 * 1024 * 1024 {
|
||||
e_tag
|
||||
} else {
|
||||
let meta = store.head(&final_manifest_path).await?;
|
||||
meta.e_tag
|
||||
};
|
||||
|
||||
let location = ManifestLocation {
|
||||
version,
|
||||
path: final_manifest_path,
|
||||
size: Some(size),
|
||||
naming_scheme,
|
||||
e_tag,
|
||||
};
|
||||
|
||||
if !copied {
|
||||
return Ok(location);
|
||||
}
|
||||
|
||||
// step 2: flip the external store to point to the final location
|
||||
self.external_manifest_store
|
||||
.put_if_exists(
|
||||
base_path.as_ref(),
|
||||
version,
|
||||
location.path.as_ref(),
|
||||
size,
|
||||
location.e_tag.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// step 3: delete the staging manifest
|
||||
match store.delete(staging_manifest_path).await {
|
||||
Ok(_) => {}
|
||||
Err(ObjectStoreError::NotFound { .. }) => {}
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = staging_manifest_path.as_ref());
|
||||
|
||||
Ok(location)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl CommitHandler for ExternalManifestCommitHandler {
|
||||
async fn resolve_latest_location(
|
||||
&self,
|
||||
base_path: &Path,
|
||||
object_store: &ObjectStore,
|
||||
) -> std::result::Result<ManifestLocation, Error> {
|
||||
let location = self
|
||||
.external_manifest_store
|
||||
.get_latest_manifest_location(base_path.as_ref())
|
||||
.await?;
|
||||
|
||||
match location {
|
||||
Some(ManifestLocation {
|
||||
version,
|
||||
path,
|
||||
size,
|
||||
naming_scheme,
|
||||
e_tag,
|
||||
}) => {
|
||||
// The path is finalized, no need to check object store
|
||||
if path.extension() == Some(MANIFEST_EXTENSION) {
|
||||
return Ok(ManifestLocation {
|
||||
version,
|
||||
path,
|
||||
size,
|
||||
naming_scheme,
|
||||
e_tag,
|
||||
});
|
||||
}
|
||||
|
||||
let (size, e_tag) = if let Some(size) = size {
|
||||
(size, e_tag)
|
||||
} else {
|
||||
match object_store.inner.head(&path).await {
|
||||
Ok(meta) => (meta.size, meta.e_tag),
|
||||
Err(ObjectStoreError::NotFound { .. }) => {
|
||||
// there may be other threads that have finished executing finalize_manifest.
|
||||
let new_location = self
|
||||
.external_manifest_store
|
||||
.get_manifest_location(base_path.as_ref(), version)
|
||||
.await?;
|
||||
return Ok(new_location);
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
};
|
||||
|
||||
let final_location = self
|
||||
.finalize_manifest(
|
||||
base_path,
|
||||
&path,
|
||||
version,
|
||||
size,
|
||||
e_tag.clone(),
|
||||
&object_store.inner,
|
||||
naming_scheme,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(final_location)
|
||||
}
|
||||
// Dataset not found in the external store, this could be because the dataset did not
|
||||
// use external store for commit before. In this case, we search for the latest manifest
|
||||
None => current_manifest_path(object_store, base_path).await,
|
||||
}
|
||||
}
|
||||
|
||||
async fn resolve_version_location(
|
||||
&self,
|
||||
base_path: &Path,
|
||||
version: u64,
|
||||
object_store: &dyn OSObjectStore,
|
||||
) -> std::result::Result<ManifestLocation, Error> {
|
||||
let location_res = self
|
||||
.external_manifest_store
|
||||
.get_manifest_location(base_path.as_ref(), version)
|
||||
.await;
|
||||
|
||||
let location = match location_res {
|
||||
Ok(p) => p,
|
||||
// not board external manifest yet, direct to object store
|
||||
Err(Error::NotFound { .. }) => {
|
||||
let path = default_resolve_version(base_path, version, object_store)
|
||||
.await
|
||||
.map_err(|_| Error::not_found(format!("{}@{}", base_path, version)))?
|
||||
.path;
|
||||
match object_store.head(&path).await {
|
||||
Ok(ObjectMeta { size, e_tag, .. }) => {
|
||||
let res = self
|
||||
.external_manifest_store
|
||||
.put_if_not_exists(
|
||||
base_path.as_ref(),
|
||||
version,
|
||||
path.as_ref(),
|
||||
size,
|
||||
e_tag.clone(),
|
||||
)
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
warn!(
|
||||
"could not update external manifest store during load, with error: {}",
|
||||
e
|
||||
);
|
||||
}
|
||||
let naming_scheme =
|
||||
ManifestNamingScheme::detect_scheme_staging(path.filename().unwrap());
|
||||
return Ok(ManifestLocation {
|
||||
version,
|
||||
path,
|
||||
size: Some(size),
|
||||
naming_scheme,
|
||||
e_tag,
|
||||
});
|
||||
}
|
||||
Err(ObjectStoreError::NotFound { .. }) => {
|
||||
return Err(Error::not_found(path.to_string()));
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
|
||||
// finalized path, just return
|
||||
if location.path.extension() == Some(MANIFEST_EXTENSION) {
|
||||
return Ok(location);
|
||||
}
|
||||
|
||||
let naming_scheme =
|
||||
ManifestNamingScheme::detect_scheme_staging(location.path.filename().unwrap());
|
||||
|
||||
let (size, e_tag) = if let Some(size) = location.size {
|
||||
(size, location.e_tag.clone())
|
||||
} else {
|
||||
let meta = object_store.head(&location.path).await?;
|
||||
(meta.size as u64, meta.e_tag)
|
||||
};
|
||||
|
||||
self.finalize_manifest(
|
||||
base_path,
|
||||
&location.path,
|
||||
version,
|
||||
size,
|
||||
e_tag,
|
||||
object_store,
|
||||
naming_scheme,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn commit(
|
||||
&self,
|
||||
manifest: &mut Manifest,
|
||||
indices: Option<Vec<IndexMetadata>>,
|
||||
base_path: &Path,
|
||||
object_store: &ObjectStore,
|
||||
manifest_writer: super::ManifestWriter,
|
||||
naming_scheme: ManifestNamingScheme,
|
||||
transaction: Option<Transaction>,
|
||||
) -> std::result::Result<ManifestLocation, CommitError> {
|
||||
// path we get here is the path to the manifest we want to write
|
||||
// use object_store.base_path.as_ref() for getting the root of the dataset
|
||||
|
||||
// step 1: Write the manifest we want to commit to object store with a temporary name
|
||||
let path = naming_scheme.manifest_path(base_path, manifest.version);
|
||||
let staging_path = make_staging_manifest_path(&path)?;
|
||||
let write_res =
|
||||
manifest_writer(object_store, manifest, indices, &staging_path, transaction).await?;
|
||||
|
||||
// step 2 & 3: Put the manifest to external store
|
||||
let result = self
|
||||
.external_manifest_store
|
||||
.put(
|
||||
base_path,
|
||||
manifest.version,
|
||||
&staging_path,
|
||||
write_res.size as u64,
|
||||
write_res.e_tag,
|
||||
&object_store.inner,
|
||||
naming_scheme,
|
||||
)
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(location) => {
|
||||
write_version_hint(object_store, base_path, manifest.version).await;
|
||||
Ok(location)
|
||||
}
|
||||
Err(_) => {
|
||||
// delete the staging manifest
|
||||
match object_store.inner.delete(&staging_path).await {
|
||||
Ok(_) => {}
|
||||
Err(ObjectStoreError::NotFound { .. }) => {}
|
||||
Err(e) => return Err(CommitError::OtherError(e.into())),
|
||||
}
|
||||
info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = staging_path.as_ref());
|
||||
Err(CommitError::CommitConflict {})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Delete the external store's manifest information for `base_path` by
/// delegating to the store's `delete`.
async fn delete(&self, base_path: &Path) -> Result<()> {
    let base_uri: &str = base_path.as_ref();
    self.external_manifest_store.delete(base_uri).await
}
|
||||
}
|
||||
370
vendor/lance-table/src/io/deletion.rs
vendored
Normal file
370
vendor/lance-table/src/io/deletion.rs
vendored
Normal file
|
|
@ -0,0 +1,370 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
use std::{collections::HashSet, sync::Arc};
|
||||
|
||||
use arrow_array::{RecordBatch, UInt32Array};
|
||||
use arrow_ipc::CompressionType;
|
||||
use arrow_ipc::reader::FileReader as ArrowFileReader;
|
||||
use arrow_ipc::writer::{FileWriter as ArrowFileWriter, IpcWriteOptions};
|
||||
use arrow_schema::{ArrowError, DataType, Field, Schema};
|
||||
use bytes::Buf;
|
||||
use lance_core::error::{CorruptFileSnafu, box_error};
|
||||
use lance_core::utils::deletion::DeletionVector;
|
||||
use lance_core::utils::tracing::{AUDIT_MODE_CREATE, AUDIT_TYPE_DELETION, TRACE_FILE_AUDIT};
|
||||
use lance_core::{Error, Result};
|
||||
use lance_io::object_store::ObjectStore;
|
||||
use object_store::path::Path;
|
||||
use rand::Rng;
|
||||
use roaring::bitmap::RoaringBitmap;
|
||||
use snafu::ResultExt;
|
||||
use tracing::{info, instrument};
|
||||
|
||||
use crate::format::{DeletionFile, DeletionFileType};
|
||||
|
||||
pub const DELETIONS_DIR: &str = "_deletions";
|
||||
|
||||
/// Get the Arrow schema for an Arrow deletion file.
|
||||
fn deletion_arrow_schema() -> Arc<Schema> {
|
||||
Arc::new(Schema::new(vec![Field::new(
|
||||
"row_id",
|
||||
DataType::UInt32,
|
||||
false,
|
||||
)]))
|
||||
}
|
||||
|
||||
/// Get the file path for a deletion file. This is relative to the dataset root.
|
||||
pub fn deletion_file_path(base: &Path, fragment_id: u64, deletion_file: &DeletionFile) -> Path {
|
||||
let DeletionFile {
|
||||
read_version,
|
||||
id,
|
||||
file_type,
|
||||
..
|
||||
} = deletion_file;
|
||||
let suffix = file_type.suffix();
|
||||
base.clone()
|
||||
.join(DELETIONS_DIR)
|
||||
.join(format!("{fragment_id}-{read_version}-{id}.{suffix}"))
|
||||
}
|
||||
|
||||
pub fn relative_deletion_file_path(fragment_id: u64, deletion_file: &DeletionFile) -> String {
|
||||
let DeletionFile {
|
||||
read_version,
|
||||
id,
|
||||
file_type,
|
||||
..
|
||||
} = deletion_file;
|
||||
let suffix = file_type.suffix();
|
||||
format!("{DELETIONS_DIR}/{fragment_id}-{read_version}-{id}.{suffix}")
|
||||
}
|
||||
|
||||
/// Write a deletion file for a fragment for a given deletion vector.
|
||||
///
|
||||
/// Returns the deletion file if one was written. If no deletions were present,
|
||||
/// returns `Ok(None)`.
|
||||
pub async fn write_deletion_file(
|
||||
base: &Path,
|
||||
fragment_id: u64,
|
||||
read_version: u64,
|
||||
removed_rows: &DeletionVector,
|
||||
object_store: &ObjectStore,
|
||||
) -> Result<Option<DeletionFile>> {
|
||||
let deletion_file = match removed_rows {
|
||||
DeletionVector::NoDeletions => None,
|
||||
DeletionVector::Set(set) => {
|
||||
let id = rand::rng().random::<u64>();
|
||||
let deletion_file = DeletionFile {
|
||||
read_version,
|
||||
id,
|
||||
file_type: DeletionFileType::Array,
|
||||
num_deleted_rows: Some(set.len()),
|
||||
base_id: None,
|
||||
};
|
||||
let path = deletion_file_path(base, fragment_id, &deletion_file);
|
||||
|
||||
let array = UInt32Array::from_iter(set.iter().copied());
|
||||
let array = Arc::new(array);
|
||||
|
||||
let schema = deletion_arrow_schema();
|
||||
let batch = RecordBatch::try_new(schema.clone(), vec![array])?;
|
||||
|
||||
let mut out: Vec<u8> = Vec::new();
|
||||
let write_options =
|
||||
IpcWriteOptions::default().try_with_compression(Some(CompressionType::ZSTD))?;
|
||||
{
|
||||
let mut writer = ArrowFileWriter::try_new_with_options(
|
||||
&mut out,
|
||||
schema.as_ref(),
|
||||
write_options,
|
||||
)?;
|
||||
writer.write(&batch)?;
|
||||
writer.finish()?;
|
||||
// Drop writer so out is no longer borrowed.
|
||||
}
|
||||
|
||||
object_store.put(&path, &out).await?;
|
||||
|
||||
info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_CREATE, r#type=AUDIT_TYPE_DELETION, path = path.to_string());
|
||||
|
||||
Some(deletion_file)
|
||||
}
|
||||
DeletionVector::Bitmap(bitmap) => {
|
||||
let id = rand::rng().random::<u64>();
|
||||
let deletion_file = DeletionFile {
|
||||
read_version,
|
||||
id,
|
||||
file_type: DeletionFileType::Bitmap,
|
||||
num_deleted_rows: Some(bitmap.len() as usize),
|
||||
base_id: None,
|
||||
};
|
||||
let path = deletion_file_path(base, fragment_id, &deletion_file);
|
||||
|
||||
let mut out: Vec<u8> = Vec::new();
|
||||
bitmap.serialize_into(&mut out)?;
|
||||
|
||||
object_store.put(&path, &out).await?;
|
||||
|
||||
info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_CREATE, r#type=AUDIT_TYPE_DELETION, path = path.to_string());
|
||||
|
||||
Some(deletion_file)
|
||||
}
|
||||
};
|
||||
Ok(deletion_file)
|
||||
}
|
||||
|
||||
#[instrument(
|
||||
level = "debug",
|
||||
skip(base, object_store),
|
||||
fields(
|
||||
base = base.as_ref(),
|
||||
bytes_read = tracing::field::Empty
|
||||
)
|
||||
)]
|
||||
pub async fn read_deletion_file(
|
||||
fragment_id: u64,
|
||||
deletion_file: &DeletionFile,
|
||||
base: &Path,
|
||||
object_store: &ObjectStore,
|
||||
) -> Result<DeletionVector> {
|
||||
let span = tracing::Span::current();
|
||||
match deletion_file.file_type {
|
||||
DeletionFileType::Array => {
|
||||
let path = deletion_file_path(base, fragment_id, deletion_file);
|
||||
|
||||
let data = object_store.read_one_all(&path).await?;
|
||||
span.record("bytes_read", data.len());
|
||||
let data = std::io::Cursor::new(data);
|
||||
let mut batches: Vec<RecordBatch> = ArrowFileReader::try_new(data, None)?
|
||||
.collect::<std::result::Result<_, ArrowError>>()
|
||||
.map_err(box_error)
|
||||
.context(CorruptFileSnafu { path: path.clone() })?;
|
||||
|
||||
if batches.len() != 1 {
|
||||
return Err(Error::corrupt_file(
|
||||
path,
|
||||
format!(
|
||||
"Expected exactly one batch in deletion file, got {}",
|
||||
batches.len()
|
||||
),
|
||||
));
|
||||
}
|
||||
|
||||
let batch = batches.pop().unwrap();
|
||||
if batch.schema() != deletion_arrow_schema() {
|
||||
return Err(Error::corrupt_file(
|
||||
path,
|
||||
format!(
|
||||
"Expected schema {:?} in deletion file, got {:?}",
|
||||
deletion_arrow_schema(),
|
||||
batch.schema()
|
||||
),
|
||||
));
|
||||
}
|
||||
|
||||
let array = batch.columns()[0]
|
||||
.as_any()
|
||||
.downcast_ref::<UInt32Array>()
|
||||
.unwrap();
|
||||
|
||||
let mut set = HashSet::with_capacity(array.len());
|
||||
for val in array.iter() {
|
||||
if let Some(val) = val {
|
||||
set.insert(val);
|
||||
} else {
|
||||
return Err(Error::corrupt_file(
|
||||
path,
|
||||
"Null values are not allowed in deletion files",
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(DeletionVector::Set(set))
|
||||
}
|
||||
DeletionFileType::Bitmap => {
|
||||
let path = deletion_file_path(base, fragment_id, deletion_file);
|
||||
|
||||
let data = object_store.read_one_all(&path).await?;
|
||||
span.record("bytes_read", data.len());
|
||||
let reader = data.reader();
|
||||
let bitmap = RoaringBitmap::deserialize_from(reader)
|
||||
.map_err(box_error)
|
||||
.context(CorruptFileSnafu { path })?;
|
||||
|
||||
Ok(DeletionVector::Bitmap(bitmap))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {

    use super::*;
    use object_store::ObjectStoreExt;

    /// Writing a `DeletionVector::NoDeletions` must not produce a deletion file.
    #[tokio::test]
    async fn test_write_no_deletions() {
        let dv = DeletionVector::NoDeletions;

        let (object_store, path) = ObjectStore::from_uri("memory:///no_deletion")
            .await
            .unwrap();
        let file = write_deletion_file(&path, 0, 0, &dv, &object_store)
            .await
            .unwrap();
        assert!(file.is_none());
    }

    /// A `Set` deletion vector is written as an `Array` deletion file; the raw
    /// Arrow file is read back by hand and compared with the input.
    #[tokio::test]
    async fn test_write_array() {
        let dv = DeletionVector::Set(HashSet::from_iter(0..100));

        let fragment_id = 21;
        let read_version = 12;

        let object_store = ObjectStore::memory();
        let path = Path::from("/write");
        let file = write_deletion_file(&path, fragment_id, read_version, &dv, &object_store)
            .await
            .unwrap();

        assert!(matches!(
            file,
            Some(DeletionFile {
                file_type: DeletionFileType::Array,
                ..
            })
        ));

        // The file name embeds the fragment id, the read version and the file id.
        let file = file.unwrap();
        assert_eq!(file.read_version, read_version);
        let path = deletion_file_path(&path, fragment_id, &file);
        assert_eq!(
            path,
            Path::from(format!("/write/_deletions/21-12-{}.arrow", file.id))
        );

        // Bypass `read_deletion_file` and decode the stored bytes directly.
        let data = object_store
            .inner
            .get(&path)
            .await
            .unwrap()
            .bytes()
            .await
            .unwrap();
        let data = std::io::Cursor::new(data);
        let mut batches: Vec<RecordBatch> = ArrowFileReader::try_new(data, None)
            .unwrap()
            .collect::<std::result::Result<_, ArrowError>>()
            .unwrap();

        assert_eq!(batches.len(), 1);
        let batch = batches.pop().unwrap();
        assert_eq!(batch.schema(), deletion_arrow_schema());
        let array = batch["row_id"]
            .as_any()
            .downcast_ref::<UInt32Array>()
            .unwrap();
        let read_dv = DeletionVector::from_iter(array.iter().map(|v| v.unwrap()));
        assert_eq!(read_dv, dv);
    }

    /// A `Bitmap` deletion vector is written as a `Bitmap` deletion file whose
    /// bytes deserialize back into the same `RoaringBitmap`.
    #[tokio::test]
    async fn test_write_bitmap() {
        let dv = DeletionVector::Bitmap(RoaringBitmap::from_iter(0..100));

        let fragment_id = 21;
        let read_version = 12;

        let object_store = ObjectStore::memory();
        let path = Path::from("/bitmap");
        let file = write_deletion_file(&path, fragment_id, read_version, &dv, &object_store)
            .await
            .unwrap();

        assert!(matches!(
            file,
            Some(DeletionFile {
                file_type: DeletionFileType::Bitmap,
                ..
            })
        ));

        // Bitmap files use the `.bin` extension instead of `.arrow`.
        let file = file.unwrap();
        assert_eq!(file.read_version, read_version);
        let path = deletion_file_path(&path, fragment_id, &file);
        assert_eq!(
            path,
            Path::from(format!("/bitmap/_deletions/21-12-{}.bin", file.id))
        );

        let data = object_store
            .inner
            .get(&path)
            .await
            .unwrap()
            .bytes()
            .await
            .unwrap();
        let reader = data.reader();
        let read_bitmap = RoaringBitmap::deserialize_from(reader).unwrap();
        assert_eq!(read_bitmap, dv.into_iter().collect::<RoaringBitmap>());
    }

    /// Write then read a `Set` deletion vector through the public read/write pair.
    #[tokio::test]
    async fn test_roundtrip_array() {
        let dv = DeletionVector::Set(HashSet::from_iter(0..100));

        let fragment_id = 21;
        let read_version = 12;

        let object_store = ObjectStore::memory();
        let path = Path::from("/roundtrip");
        let file = write_deletion_file(&path, fragment_id, read_version, &dv, &object_store)
            .await
            .unwrap();

        let read_dv = read_deletion_file(fragment_id, &file.unwrap(), &path, &object_store)
            .await
            .unwrap();
        assert_eq!(read_dv, dv);
    }

    /// Write then read a `Bitmap` deletion vector through the public read/write pair.
    #[tokio::test]
    async fn test_roundtrip_bitmap() {
        let dv = DeletionVector::Bitmap(RoaringBitmap::from_iter(0..100));

        let fragment_id = 21;
        let read_version = 12;

        let object_store = ObjectStore::memory();
        let path = Path::from("/bitmap");
        let file = write_deletion_file(&path, fragment_id, read_version, &dv, &object_store)
            .await
            .unwrap();

        let read_dv = read_deletion_file(fragment_id, &file.unwrap(), &path, &object_store)
            .await
            .unwrap();
        assert_eq!(read_dv, dv);
    }
}
|
||||
344
vendor/lance-table/src/io/manifest.rs
vendored
Normal file
344
vendor/lance-table/src/io/manifest.rs
vendored
Normal file
|
|
@ -0,0 +1,344 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
use async_trait::async_trait;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use lance_arrow::DataTypeExt;
|
||||
use lance_file::{
|
||||
previous::writer::ManifestProvider as PreviousManifestProvider, version::LanceFileVersion,
|
||||
};
|
||||
use object_store::ObjectStoreExt;
|
||||
use object_store::path::Path;
|
||||
use prost::Message;
|
||||
use std::collections::HashMap;
|
||||
use std::{ops::Range, sync::Arc};
|
||||
use tracing::instrument;
|
||||
|
||||
use lance_core::{Error, Result, datatypes::Schema};
|
||||
use lance_io::{
|
||||
encodings::{Encoder, binary::BinaryEncoder, plain::PlainEncoder},
|
||||
object_store::ObjectStore,
|
||||
traits::{WriteExt, Writer},
|
||||
utils::read_message,
|
||||
};
|
||||
|
||||
use crate::format::{DataStorageFormat, IndexMetadata, MAGIC, Manifest, Transaction, pb};
|
||||
|
||||
use super::commit::ManifestLocation;
|
||||
|
||||
/// Read Manifest on URI.
|
||||
///
|
||||
/// This only reads manifest files. It does not read data files.
|
||||
#[instrument(level = "debug", skip(object_store))]
|
||||
pub async fn read_manifest(
|
||||
object_store: &ObjectStore,
|
||||
path: &Path,
|
||||
known_size: Option<u64>,
|
||||
) -> Result<Manifest> {
|
||||
let file_size = if let Some(known_size) = known_size {
|
||||
known_size
|
||||
} else {
|
||||
object_store.inner.head(path).await?.size
|
||||
};
|
||||
const PREFETCH_SIZE: u64 = 64 * 1024;
|
||||
let initial_start = file_size.saturating_sub(PREFETCH_SIZE);
|
||||
let range = Range {
|
||||
start: initial_start,
|
||||
end: file_size,
|
||||
};
|
||||
let buf = object_store.inner.get_range(path, range).await?;
|
||||
|
||||
// In case of corruption, the known_size might be wrong. We can retry without
|
||||
// the size to be more robust.
|
||||
if (buf.len() < 16 || !buf.ends_with(MAGIC)) && known_size.is_some() {
|
||||
return Box::pin(read_manifest(object_store, path, None)).await;
|
||||
}
|
||||
|
||||
if buf.len() < 16 {
|
||||
return Err(Error::corrupt_file(
|
||||
path.clone(),
|
||||
"Invalid format: file size is smaller than 16 bytes".to_string(),
|
||||
));
|
||||
}
|
||||
if !buf.ends_with(MAGIC) {
|
||||
return Err(Error::corrupt_file(
|
||||
path.clone(),
|
||||
"Invalid format: magic number does not match".to_string(),
|
||||
));
|
||||
}
|
||||
let manifest_pos = LittleEndian::read_i64(&buf[buf.len() - 16..buf.len() - 8]) as usize;
|
||||
let manifest_len = file_size as usize - manifest_pos;
|
||||
|
||||
let buf: Bytes = if manifest_len <= buf.len() {
|
||||
// The prefetch captured the entire manifest. We just need to trim the buffer.
|
||||
buf.slice(buf.len() - manifest_len..buf.len())
|
||||
} else {
|
||||
// The prefetch only captured part of the manifest. We need to make an
|
||||
// additional range request to read the remainder.
|
||||
let mut buf2: BytesMut = object_store
|
||||
.inner
|
||||
.get_range(
|
||||
path,
|
||||
Range {
|
||||
start: manifest_pos as u64,
|
||||
end: file_size - PREFETCH_SIZE,
|
||||
},
|
||||
)
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
buf2.extend_from_slice(&buf);
|
||||
buf2.freeze()
|
||||
};
|
||||
|
||||
let recorded_length = LittleEndian::read_u32(&buf[0..4]) as usize;
|
||||
// Need to trim the magic number at end and message length at beginning
|
||||
let buf = buf.slice(4..buf.len() - 16);
|
||||
|
||||
if buf.len() != recorded_length {
|
||||
return Err(Error::invalid_input(format!(
|
||||
"Invalid format: manifest length does not match. Expected {}, got {}",
|
||||
recorded_length,
|
||||
buf.len()
|
||||
)));
|
||||
}
|
||||
|
||||
let proto = pb::Manifest::decode(buf)?;
|
||||
Manifest::try_from(proto)
|
||||
}
|
||||
|
||||
#[instrument(level = "debug", skip(object_store, manifest))]
|
||||
pub async fn read_manifest_indexes(
|
||||
object_store: &ObjectStore,
|
||||
location: &ManifestLocation,
|
||||
manifest: &Manifest,
|
||||
) -> Result<Vec<IndexMetadata>> {
|
||||
if let Some(pos) = manifest.index_section.as_ref() {
|
||||
let reader = if let Some(size) = location.size {
|
||||
object_store
|
||||
.open_with_size(&location.path, size as usize)
|
||||
.await?
|
||||
} else {
|
||||
object_store.open(&location.path).await?
|
||||
};
|
||||
let section: pb::IndexSection = read_message(reader.as_ref(), *pos).await?;
|
||||
|
||||
let indices = section
|
||||
.indices
|
||||
.into_iter()
|
||||
.map(IndexMetadata::try_from)
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
Ok(indices)
|
||||
} else {
|
||||
Ok(vec![])
|
||||
}
|
||||
}
|
||||
|
||||
async fn do_write_manifest(
|
||||
writer: &mut dyn Writer,
|
||||
manifest: &mut Manifest,
|
||||
indices: Option<Vec<IndexMetadata>>,
|
||||
mut transaction: Option<Transaction>,
|
||||
) -> Result<usize> {
|
||||
// Write indices if presented.
|
||||
if let Some(indices) = indices.as_ref() {
|
||||
let section = pb::IndexSection {
|
||||
indices: indices.iter().map(|i| i.into()).collect(),
|
||||
};
|
||||
let pos = writer.write_protobuf(§ion).await?;
|
||||
manifest.index_section = Some(pos);
|
||||
}
|
||||
|
||||
// Write inline transaction if presented.
|
||||
if let Some(tx) = transaction.take() {
|
||||
// Convert to protobuf at the write boundary to persist inline
|
||||
let pb_tx: pb::Transaction = tx.into();
|
||||
let pos = writer.write_protobuf(&pb_tx).await?;
|
||||
manifest.transaction_section = Some(pos);
|
||||
}
|
||||
|
||||
writer.write_struct(manifest).await
|
||||
}
|
||||
|
||||
/// Write manifest to an open file.
///
/// When the manifest reports that the legacy storage format should be used,
/// the dictionary value array of every dictionary-typed field is encoded into
/// `writer` first, and the resulting position and value count are recorded in
/// the field's dictionary info. The index section, inline transaction and
/// manifest are then written by `do_write_manifest`, whose result is returned.
///
/// # Errors
///
/// Returns an I/O error when a dictionary field has no dictionary info or no
/// value array, and a schema error when the value array is neither numeric nor
/// binary-like. Encoder and writer errors are propagated.
pub async fn write_manifest(
    writer: &mut dyn Writer,
    manifest: &mut Manifest,
    indices: Option<Vec<IndexMetadata>>,
    transaction: Option<Transaction>,
) -> Result<usize> {
    // Write dictionary values.
    // With no fields, `max_field_id` is -1 and the range below is empty.
    let max_field_id = manifest.schema.max_field_id().unwrap_or(-1);
    let is_legacy_storage = manifest.should_use_legacy_format();
    for field_id in 0..max_field_id + 1 {
        if let Some(field) = manifest.schema.mut_field_by_id(field_id)
            && field.data_type().is_dictionary()
            && is_legacy_storage
        {
            let dict_info = field.dictionary.as_mut().ok_or_else(|| {
                Error::io(format!("Lance field {} misses dictionary info", field.name))
            })?;

            let value_arr = dict_info.values.as_ref().ok_or_else(|| {
                Error::io(format!(
                    "Lance field {} is dictionary type, but misses the dictionary value array",
                    field.name
                ))
            })?;

            // Pick the encoder from the type of the dictionary values.
            let data_type = value_arr.data_type();
            let pos = match data_type {
                dt if dt.is_numeric() => {
                    let mut encoder = PlainEncoder::new(writer, dt);
                    encoder.encode(&[value_arr]).await?
                }
                dt if dt.is_binary_like() => {
                    let mut encoder = BinaryEncoder::new(writer);
                    encoder.encode(&[value_arr]).await?
                }
                _ => {
                    return Err(Error::schema(format!(
                        "Does not support {} as dictionary value type",
                        value_arr.data_type()
                    )));
                }
            };
            // Record where the values were written and how many there are.
            dict_info.offset = pos;
            dict_info.length = value_arr.len();
        }
    }

    do_write_manifest(writer, manifest, indices, transaction).await
}
|
||||
|
||||
/// Implementation of ManifestProvider that describes a Lance file by writing
/// a manifest that contains nothing but default fields and the schema.
///
/// The type carries no state; it is used only through its trait implementation.
pub struct ManifestDescribing {}
|
||||
|
||||
#[async_trait]
|
||||
impl PreviousManifestProvider for ManifestDescribing {
|
||||
async fn store_schema(
|
||||
object_writer: &mut dyn Writer,
|
||||
schema: &Schema,
|
||||
) -> Result<Option<usize>> {
|
||||
let mut manifest = Manifest::new(
|
||||
schema.clone(),
|
||||
Arc::new(vec![]),
|
||||
DataStorageFormat::new(LanceFileVersion::Legacy),
|
||||
HashMap::new(),
|
||||
);
|
||||
let pos = do_write_manifest(object_writer, &mut manifest, None, None).await?;
|
||||
Ok(Some(pos))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use arrow_array::{Int32Array, RecordBatch};
    use std::collections::HashMap;

    use crate::format::SelfDescribingFileReader;
    use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
    use lance_file::format::{MAGIC, MAJOR_VERSION, MINOR_VERSION};
    use lance_file::previous::{
        reader::FileReader as PreviousFileReader, writer::FileWriter as PreviousFileWriter,
    };
    use rand::{Rng, distr::Alphanumeric};
    use tokio::io::AsyncWriteExt;

    use super::*;

    /// Write a manifest after `prefix_size` bytes of random filler and check
    /// that `read_manifest` returns an equal manifest.
    ///
    /// `manifest_min_size` is the length of the single field name, which sets a
    /// lower bound on the size of the serialized manifest.
    async fn test_roundtrip_manifest(prefix_size: usize, manifest_min_size: usize) {
        let store = ObjectStore::memory();
        let path = Path::from("/read_large_manifest");

        let mut writer = store.create(&path).await.unwrap();

        // Write prefix we should ignore
        let prefix: Vec<u8> = rand::rng()
            .sample_iter(&Alphanumeric)
            .take(prefix_size)
            .collect();
        writer.write_all(&prefix).await.unwrap();

        let long_name: String = rand::rng()
            .sample_iter(&Alphanumeric)
            .take(manifest_min_size)
            .map(char::from)
            .collect();

        let arrow_schema =
            ArrowSchema::new(vec![ArrowField::new(long_name, DataType::Int64, false)]);
        let schema = Schema::try_from(&arrow_schema).unwrap();

        // NOTE(review): `config` is populated but never read in this function;
        // the manifest below is built with a fresh `HashMap::new()` instead.
        let mut config = HashMap::new();
        config.insert("key".to_string(), "value".to_string());

        let mut manifest = Manifest::new(
            schema,
            Arc::new(vec![]),
            DataStorageFormat::default(),
            HashMap::new(),
        );
        let pos = write_manifest(writer.as_mut(), &mut manifest, None, None)
            .await
            .unwrap();
        // Append the footer that `read_manifest` uses to locate the manifest.
        writer
            .write_magics(pos, MAJOR_VERSION, MINOR_VERSION, MAGIC)
            .await
            .unwrap();
        Writer::shutdown(writer.as_mut()).await.unwrap();

        let roundtripped_manifest = read_manifest(&store, &path, None).await.unwrap();

        assert_eq!(manifest, roundtripped_manifest);

        store.inner.delete(&path).await.unwrap();
    }

    /// Covers manifests larger than the 64 KiB prefetch (with and without a
    /// prefix) as well as a small manifest behind a prefix.
    #[tokio::test]
    async fn test_read_large_manifest() {
        test_roundtrip_manifest(0, 100_000).await;
        test_roundtrip_manifest(1000, 100_000).await;
        test_roundtrip_manifest(1000, 1000).await;
    }

    /// Metadata passed to `finish_with_metadata` must be visible in the schema
    /// of a self-describing file when it is read back.
    #[tokio::test]
    async fn test_update_schema_metadata() {
        let store = ObjectStore::memory();
        let path = Path::from("/update_schema_metadata");

        let arrow_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
            "i",
            DataType::Int32,
            false,
        )]));
        let schema = Schema::try_from(arrow_schema.as_ref()).unwrap();
        let mut file_writer = PreviousFileWriter::<ManifestDescribing>::try_new(
            &store,
            &path,
            schema.clone(),
            &Default::default(),
        )
        .await
        .unwrap();

        let array = Int32Array::from_iter_values(0..10);
        let batch = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(array)]).unwrap();
        file_writer
            .write(std::slice::from_ref(&batch))
            .await
            .unwrap();
        let mut metadata = HashMap::new();
        metadata.insert(String::from("lance:extra"), String::from("for_test"));
        file_writer.finish_with_metadata(&metadata).await.unwrap();

        let reader = store.open(&path).await.unwrap();
        let reader = PreviousFileReader::try_new_self_described_from_reader(reader.into(), None)
            .await
            .unwrap();
        let schema = ArrowSchema::from(reader.schema());
        assert_eq!(schema.metadata().get("lance:extra").unwrap(), "for_test");
    }
}
|
||||
8
vendor/lance-table/src/lib.rs
vendored
Normal file
8
vendor/lance-table/src/lib.rs
vendored
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
pub mod feature_flags;
|
||||
pub mod format;
|
||||
pub mod io;
|
||||
pub mod rowids;
|
||||
pub mod utils;
|
||||
1364
vendor/lance-table/src/rowids.rs
vendored
Normal file
1364
vendor/lance-table/src/rowids.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
314
vendor/lance-table/src/rowids/bitmap.rs
vendored
Normal file
314
vendor/lance-table/src/rowids/bitmap.rs
vendored
Normal file
|
|
@ -0,0 +1,314 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
use deepsize::DeepSizeOf;
|
||||
|
||||
/// A fixed-length sequence of bits packed into bytes.
///
/// Bit `i` lives in byte `i / 8` at bit position `i % 8`, counted from the
/// least-significant bit.
#[derive(PartialEq, Eq, Clone, DeepSizeOf)]
pub struct Bitmap {
    /// Packed bit storage; the constructors allocate `len.div_ceil(8)` bytes.
    pub data: Vec<u8>,
    /// Number of bits in the bitmap.
    pub len: usize,
}
|
||||
|
||||
impl std::fmt::Debug for Bitmap {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "Bitmap {{ data: ")?;
|
||||
for i in 0..self.len {
|
||||
write!(f, "{}", if self.get(i) { "1" } else { "0" })?;
|
||||
}
|
||||
write!(f, ", len: {} }}", self.len)
|
||||
}
|
||||
}
|
||||
|
||||
impl Bitmap {
|
||||
pub fn new_empty(len: usize) -> Self {
|
||||
let data = vec![0; len.div_ceil(8)];
|
||||
Self { data, len }
|
||||
}
|
||||
|
||||
pub fn new_full(len: usize) -> Self {
|
||||
let mut data = vec![0xff; len.div_ceil(8)];
|
||||
// Zero past the end of len
|
||||
let remainder = len % 8;
|
||||
if remainder != 0 {
|
||||
let last_byte = data.last_mut().unwrap();
|
||||
let bits_to_clear = 8 - remainder;
|
||||
for offset_from_end in 0..bits_to_clear {
|
||||
let i = 7 - offset_from_end;
|
||||
*last_byte &= !(1 << i);
|
||||
}
|
||||
}
|
||||
Self { data, len }
|
||||
}
|
||||
|
||||
pub fn set(&mut self, i: usize) {
|
||||
self.data[i / 8] |= 1 << (i % 8);
|
||||
}
|
||||
|
||||
pub fn clear(&mut self, i: usize) {
|
||||
self.data[i / 8] &= !(1 << (i % 8));
|
||||
}
|
||||
|
||||
pub fn get(&self, i: usize) -> bool {
|
||||
self.data[i / 8] & (1 << (i % 8)) != 0
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
|
||||
pub fn slice(&self, start: usize, len: usize) -> BitmapSlice<'_> {
|
||||
BitmapSlice {
|
||||
bitmap: self,
|
||||
start,
|
||||
len,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn count_ones(&self) -> usize {
|
||||
self.data.iter().map(|&x| x.count_ones() as usize).sum()
|
||||
}
|
||||
|
||||
pub fn count_zeros(&self) -> usize {
|
||||
self.len - self.count_ones()
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> impl Iterator<Item = bool> + '_ {
|
||||
self.data
|
||||
.iter()
|
||||
.flat_map(|&x| (0..8).map(move |i| x & (1 << i) != 0))
|
||||
.take(self.len)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&[bool]> for Bitmap {
|
||||
fn from(slice: &[bool]) -> Self {
|
||||
let mut bitmap = Self::new_empty(slice.len());
|
||||
for (i, &b) in slice.iter().enumerate() {
|
||||
if b {
|
||||
bitmap.set(i);
|
||||
}
|
||||
}
|
||||
bitmap
|
||||
}
|
||||
}
|
||||
|
||||
/// A borrowed view over a contiguous run of bits in a [`Bitmap`].
pub struct BitmapSlice<'a> {
    /// The bitmap being viewed.
    bitmap: &'a Bitmap,
    /// Index of the first bit of the view within `bitmap`.
    start: usize,
    /// Number of bits in the view.
    len: usize,
}
|
||||
|
||||
impl BitmapSlice<'_> {
|
||||
pub fn count_ones(&self) -> usize {
|
||||
if self.len == 0 {
|
||||
return 0;
|
||||
}
|
||||
let first_byte = self.start / 8;
|
||||
let last_byte = (self.start + self.len - 1) / 8;
|
||||
if first_byte == last_byte {
|
||||
let byte = self.bitmap.data[first_byte];
|
||||
let mut count = 0;
|
||||
for i in self.start % 8..((self.start + self.len - 1) % 8 + 1) {
|
||||
if byte & (1 << i) != 0 {
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
count
|
||||
} else {
|
||||
let mut count = 0;
|
||||
// Handle first byte
|
||||
for i in self.start % 8..8 {
|
||||
if self.bitmap.data[first_byte] & (1 << i) != 0 {
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle last bytes
|
||||
for i in 0..((self.start + self.len - 1) % 8 + 1) {
|
||||
if self.bitmap.data[last_byte] & (1 << i) != 0 {
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Middle bytes can just use count_ones
|
||||
count += self.bitmap.data[first_byte + 1..last_byte]
|
||||
.iter()
|
||||
.map(|&x| x.count_ones() as usize)
|
||||
.sum::<usize>();
|
||||
count
|
||||
}
|
||||
}
|
||||
|
||||
pub fn count_zeros(&self) -> usize {
|
||||
self.len - self.count_ones()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<BitmapSlice<'_>> for Bitmap {
    /// Copies the bits of the view into a new bitmap whose bit 0 is the first
    /// bit of the view.
    fn from(slice: BitmapSlice) -> Self {
        let mut copied = Self::new_empty(slice.len);
        (0..slice.len)
            .filter(|&offset| slice.bitmap.get(slice.start + offset))
            .for_each(|offset| copied.set(offset));
        copied
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prop_assert_eq;

    /// Exercises set/clear/count_ones, the Debug rendering and a slice count.
    #[test]
    fn test_bitmap() {
        let mut bitmap = Bitmap::new_empty(10);
        assert_eq!(bitmap.len(), 10);
        assert_eq!(bitmap.count_ones(), 0);

        bitmap.set(0);
        bitmap.set(1);
        bitmap.set(4);
        bitmap.set(5);
        bitmap.set(9);
        assert_eq!(bitmap.count_ones(), 5);
        assert_eq!(
            format!("{:?}", bitmap),
            "Bitmap { data: 1100110001, len: 10 }"
        );

        bitmap.clear(1);
        bitmap.clear(4);
        assert_eq!(bitmap.count_ones(), 3);
        assert_eq!(
            format!("{:?}", bitmap),
            "Bitmap { data: 1000010001, len: 10 }"
        );

        // Bits 5..10 contain the set bits 5 and 9.
        let bitmap_slice = bitmap.slice(5, 5);
        assert_eq!(bitmap_slice.count_ones(), 2);
    }

    /// A bitmap built by setting even bits must equal one built by clearing
    /// odd bits from a full bitmap, for lengths around a byte boundary.
    #[test]
    fn test_equality() {
        for len in 48..56 {
            let mut bitmap1 = Bitmap::new_empty(len);
            for i in 0..len {
                if i % 2 == 0 {
                    bitmap1.set(i);
                }
            }

            let mut bitmap2 = Bitmap::new_full(len);
            for i in 0..len {
                if i % 2 == 1 {
                    bitmap2.clear(i);
                }
            }

            assert_eq!(bitmap1, bitmap2);
        }
    }

    proptest::proptest! {
        /// `BitmapSlice::count_ones` must agree with counting `true` values in
        /// the corresponding sub-slice of the source booleans.
        #[test]
        fn test_bitmap_slice(
            values in proptest::collection::vec(proptest::bool::ANY, 0..100),
            mut start in 0..100usize,
            mut len in 0..100usize,
        ) {
            // Clamp the generated window so it stays inside `values`.
            if start > values.len() {
                start = values.len();
            }
            if len > values.len() - start {
                len = values.len() - start;
            }

            let bitmap = Bitmap::from(values.as_slice());
            let slice = bitmap.slice(start, len);
            let values_slice = values[start..(start + len)].to_vec();

            prop_assert_eq!(slice.count_ones(), values_slice.iter().filter(|&&x| x).count());
        }
    }

    /// Iterating an empty bitmap yields `len` false values.
    #[test]
    fn test_bitmap_iter_empty() {
        let bitmap = Bitmap::new_empty(10);
        let values: Vec<bool> = bitmap.iter().collect();
        assert_eq!(values, vec![false; 10]);
    }

    /// Iterating a full bitmap yields `len` true values.
    #[test]
    fn test_bitmap_iter_full() {
        let bitmap = Bitmap::new_full(10);
        let values: Vec<bool> = bitmap.iter().collect();
        assert_eq!(values, vec![true; 10]);
    }

    /// Iteration reports exactly the bits that were set, in index order.
    #[test]
    fn test_bitmap_iter_partial() {
        let mut bitmap = Bitmap::new_empty(10);
        bitmap.set(0);
        bitmap.set(3);
        bitmap.set(7);
        bitmap.set(9);

        let values: Vec<bool> = bitmap.iter().collect();
        let expected = vec![
            true,  // 0
            false, // 1
            false, // 2
            true,  // 3
            false, // 4
            false, // 5
            false, // 6
            true,  // 7
            false, // 8
            true,  // 9
        ];
        assert_eq!(values, expected);
    }

    #[test]
    fn test_bitmap_iter_edge_cases() {
        // Test with length that's not a multiple of 8
        let mut bitmap = Bitmap::new_empty(15);
        bitmap.set(0);
        bitmap.set(7);
        bitmap.set(14);

        let values: Vec<bool> = bitmap.iter().collect();
        let expected = vec![
            true,  // 0
            false, // 1
            false, // 2
            false, // 3
            false, // 4
            false, // 5
            false, // 6
            true,  // 7
            false, // 8
            false, // 9
            false, // 10
            false, // 11
            false, // 12
            false, // 13
            true,  // 14
        ];
        assert_eq!(values, expected);
    }

    proptest::proptest! {
        /// Converting booleans to a bitmap and iterating must return the input.
        #[test]
        fn test_bitmap_iter_property(
            values in proptest::collection::vec(proptest::bool::ANY, 0..100)
        ) {
            let bitmap = Bitmap::from(values.as_slice());
            let iter_values: Vec<bool> = bitmap.iter().collect();
            assert_eq!(iter_values, values);
        }
    }
}
|
||||
400
vendor/lance-table/src/rowids/encoded_array.rs
vendored
Normal file
400
vendor/lance-table/src/rowids/encoded_array.rs
vendored
Normal file
|
|
@ -0,0 +1,400 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
use std::ops::Range;
|
||||
|
||||
use deepsize::DeepSizeOf;
|
||||
|
||||
/// Encoded array of u64 values.
///
/// This is an internal data type used as part of row id indices.
#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
pub enum EncodedU64Array {
    /// u64 values represented as u16 offsets from a base value.
    ///
    /// Useful when the difference between the min and max value fits in a
    /// u16 (0..=65535).
    /// Only space saving when there are more than 2 values.
    U16 { base: u64, offsets: Vec<u16> },
    /// u64 values represented as u32 offsets from a base value.
    ///
    /// Useful when the difference between the min and max value fits in a
    /// u32 (0..~4 billion).
    U32 { base: u64, offsets: Vec<u32> },
    /// Just a plain vector of u64 values.
    ///
    /// For when the values cover a wide range.
    U64(Vec<u64>),
}
|
||||
|
||||
impl EncodedU64Array {
|
||||
pub fn len(&self) -> usize {
|
||||
match self {
|
||||
Self::U16 { offsets, .. } => offsets.len(),
|
||||
Self::U32 { offsets, .. } => offsets.len(),
|
||||
Self::U64(values) => values.len(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Box<dyn DoubleEndedIterator<Item = u64> + '_> {
|
||||
match self {
|
||||
Self::U16 { base, offsets } => {
|
||||
Box::new(offsets.iter().cloned().map(move |o| base + o as u64))
|
||||
}
|
||||
Self::U32 { base, offsets } => {
|
||||
Box::new(offsets.iter().cloned().map(move |o| base + o as u64))
|
||||
}
|
||||
Self::U64(values) => Box::new(values.iter().cloned()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, i: usize) -> Option<u64> {
|
||||
match self {
|
||||
Self::U16 { base, offsets } => {
|
||||
if i < offsets.len() {
|
||||
Some(*base + offsets[i] as u64)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
Self::U32 { base, offsets } => {
|
||||
if i < offsets.len() {
|
||||
Some(*base + offsets[i] as u64)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
Self::U64(values) => values.get(i).copied(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn min(&self) -> Option<u64> {
|
||||
match self {
|
||||
Self::U16 { base, offsets } => {
|
||||
if offsets.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(*base)
|
||||
}
|
||||
}
|
||||
Self::U32 { base, offsets } => {
|
||||
if offsets.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(*base)
|
||||
}
|
||||
}
|
||||
Self::U64(values) => values.iter().copied().min(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max(&self) -> Option<u64> {
|
||||
match self {
|
||||
Self::U16 { base, offsets } => {
|
||||
if offsets.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(*base + offsets.iter().copied().max().unwrap() as u64)
|
||||
}
|
||||
}
|
||||
Self::U32 { base, offsets } => {
|
||||
if offsets.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(*base + offsets.iter().copied().max().unwrap() as u64)
|
||||
}
|
||||
}
|
||||
Self::U64(values) => values.iter().copied().max(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn first(&self) -> Option<u64> {
|
||||
match self {
|
||||
Self::U16 { base, offsets } => {
|
||||
if offsets.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(*base + *offsets.first().unwrap() as u64)
|
||||
}
|
||||
}
|
||||
Self::U32 { base, offsets } => {
|
||||
if offsets.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(*base + *offsets.first().unwrap() as u64)
|
||||
}
|
||||
}
|
||||
Self::U64(values) => values.first().copied(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn last(&self) -> Option<u64> {
|
||||
match self {
|
||||
Self::U16 { base, offsets } => {
|
||||
if offsets.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(*base + *offsets.last().unwrap() as u64)
|
||||
}
|
||||
}
|
||||
Self::U32 { base, offsets } => {
|
||||
if offsets.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(*base + *offsets.last().unwrap() as u64)
|
||||
}
|
||||
}
|
||||
Self::U64(values) => values.last().copied(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn binary_search(&self, val: u64) -> std::result::Result<usize, usize> {
|
||||
match self {
|
||||
Self::U16 { base, offsets } => match val.checked_sub(*base) {
|
||||
None => Err(0),
|
||||
Some(val) => {
|
||||
if val > u16::MAX as u64 {
|
||||
return Err(offsets.len());
|
||||
}
|
||||
let u16 = val as u16;
|
||||
offsets.binary_search(&u16)
|
||||
}
|
||||
},
|
||||
Self::U32 { base, offsets } => match val.checked_sub(*base) {
|
||||
None => Err(0),
|
||||
Some(val) => {
|
||||
if val > u32::MAX as u64 {
|
||||
return Err(offsets.len());
|
||||
}
|
||||
let u32 = val as u32;
|
||||
offsets.binary_search(&u32)
|
||||
}
|
||||
},
|
||||
Self::U64(values) => values.binary_search(&val),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn slice(&self, offset: usize, len: usize) -> Self {
|
||||
match self {
|
||||
Self::U16 { base, offsets } => offsets[offset..(offset + len)]
|
||||
.iter()
|
||||
.map(|o| *base + *o as u64)
|
||||
.collect(),
|
||||
Self::U32 { base, offsets } => offsets[offset..(offset + len)]
|
||||
.iter()
|
||||
.map(|o| *base + *o as u64)
|
||||
.collect(),
|
||||
Self::U64(values) => {
|
||||
let values = values[offset..(offset + len)].to_vec();
|
||||
Self::U64(values)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<u64>> for EncodedU64Array {
|
||||
fn from(values: Vec<u64>) -> Self {
|
||||
let min = values.iter().copied().min().unwrap_or(0);
|
||||
let max = values.iter().copied().max().unwrap_or(0);
|
||||
let range = max - min;
|
||||
if values.is_empty() {
|
||||
Self::U64(Vec::new())
|
||||
} else if range <= u16::MAX as u64 {
|
||||
let base = min;
|
||||
let offsets = values.iter().map(|v| (*v - base) as u16).collect();
|
||||
Self::U16 { base, offsets }
|
||||
} else if range <= u32::MAX as u64 {
|
||||
let base = min;
|
||||
let offsets = values.iter().map(|v| (*v - base) as u32).collect();
|
||||
Self::U32 { base, offsets }
|
||||
} else {
|
||||
Self::U64(values)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Range<u64>> for EncodedU64Array {
|
||||
fn from(range: Range<u64>) -> Self {
|
||||
let min = range.start;
|
||||
let max = range.end;
|
||||
let range = max - min;
|
||||
if range < u16::MAX as u64 {
|
||||
let base = min;
|
||||
let offsets = (0..range as u16).collect();
|
||||
Self::U16 { base, offsets }
|
||||
} else if range < u32::MAX as u64 {
|
||||
let base = min;
|
||||
let offsets = (0..range as u32).collect();
|
||||
Self::U32 { base, offsets }
|
||||
} else {
|
||||
Self::U64((min..max).collect())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromIterator<u64> for EncodedU64Array {
|
||||
fn from_iter<I: IntoIterator<Item = u64>>(iter: I) -> Self {
|
||||
let values: Vec<u64> = iter.into_iter().collect();
|
||||
Self::from(values)
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoIterator for EncodedU64Array {
|
||||
type Item = u64;
|
||||
type IntoIter = Box<dyn DoubleEndedIterator<Item = u64>>;
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
match self {
|
||||
Self::U16 { base, offsets } => {
|
||||
Box::new(offsets.into_iter().map(move |o| base + o as u64))
|
||||
}
|
||||
Self::U32 { base, offsets } => {
|
||||
Box::new(offsets.into_iter().map(move |o| base + o as u64))
|
||||
}
|
||||
Self::U64(values) => Box::new(values.into_iter()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_encoded_array_from_vec() {
        /// Encodes `values`, checks the chosen representation against
        /// `expected`, and compares every accessor with the plain vector.
        fn check_encoding(values: Vec<u64>, expected: &EncodedU64Array) {
            let encoded = EncodedU64Array::from(values.clone());
            assert_eq!(&encoded, expected);

            assert_eq!(values.len(), encoded.len());
            assert_eq!(values.first(), encoded.first().as_ref());
            assert_eq!(values.last(), encoded.last().as_ref());
            assert_eq!(values.iter().min(), encoded.min().as_ref());
            assert_eq!(values.iter().max(), encoded.max().as_ref());

            let decoded = encoded.iter().collect::<Vec<_>>();
            assert_eq!(values, decoded);

            for (idx, value) in values.iter().enumerate() {
                assert_eq!(Some(*value), encoded.get(idx));
            }

            // Collecting from an iterator must pick the same representation.
            let collected = values.into_iter().collect::<EncodedU64Array>();
            assert_eq!(&collected, expected);
        }

        // Empty
        check_encoding(vec![], &EncodedU64Array::U64(vec![]));

        // Single value
        let single = EncodedU64Array::U16 {
            base: 42,
            offsets: vec![0],
        };
        check_encoding(vec![42], &single);

        // u16 version: the base may lie beyond the u16 range as long as the
        // offsets relative to it fit in u16.
        let u16_base = 2 * u16::MAX as u64;
        let u16_offsets = [42, 0, 43, u16::MAX as u64, 99];
        let u16_values: Vec<u64> = u16_offsets.iter().map(|off| *off + u16_base).collect();
        let u16_expected = EncodedU64Array::U16 {
            base: u16_base,
            offsets: u16_offsets.iter().map(|off| *off as u16).collect(),
        };
        check_encoding(u16_values, &u16_expected);

        // u32 version
        let u32_base = 2 * u32::MAX as u64;
        let u32_offsets = [42, 0, 43, u32::MAX as u64, 99];
        let u32_values: Vec<u64> = u32_offsets.iter().map(|off| *off + u32_base).collect();
        let u32_expected = EncodedU64Array::U32 {
            base: u32_base,
            offsets: u32_offsets.iter().map(|off| *off as u32).collect(),
        };
        check_encoding(u32_values, &u32_expected);

        // u64 version
        let u64_values = vec![42, 0, 43, u64::MAX, 99];
        let u64_expected = EncodedU64Array::U64(u64_values.clone());
        check_encoding(u64_values, &u64_expected);
    }

    #[test]
    fn test_double_ended_iter() {
        let arrays = vec![
            EncodedU64Array::U16 {
                base: 42,
                offsets: vec![0, 1, 2, 3, 4],
            },
            EncodedU64Array::U32 {
                base: 42,
                offsets: vec![0, 1, 2, 3, 4],
            },
            EncodedU64Array::U64(vec![42, 43, 44, 45, 46]),
        ];

        for array in arrays {
            // Walking backwards and reversing must equal walking forwards.
            let forwards = array.iter().collect::<Vec<_>>();
            let mut backwards = array.iter().rev().collect::<Vec<_>>();
            backwards.reverse();
            assert_eq!(forwards, backwards);

            // Pull alternately from the front and the back of one iterator and
            // compare each pulled value with the matching indexed lookup.
            let len = array.len();
            let mut expected = Vec::with_capacity(len);
            let mut actual = Vec::with_capacity(len);
            let mut cursor = array.iter();
            for step in 0..len {
                let (pulled, index) = if step % 2 == 0 {
                    (cursor.next(), step / 2)
                } else {
                    (cursor.next_back(), len - 1 - step / 2)
                };
                actual.push(pulled.unwrap());
                expected.push(array.get(index).unwrap());
            }
            assert_eq!(expected, actual);
        }
    }

    #[test]
    fn test_encoded_array_from_range() {
        // u16 version
        let u16_base = 2 * u16::MAX as u64;
        let u16_range = u16_base..(40 + u16_base);
        let encoded = EncodedU64Array::from(u16_range.clone());
        assert!(
            matches!(
                encoded,
                EncodedU64Array::U16 { base, .. } if base == u16_base
            ),
            "{:?}",
            encoded
        );
        let decoded = encoded.into_iter().collect::<Vec<_>>();
        assert_eq!(u16_range.collect::<Vec<_>>(), decoded);

        // u32 version
        let u32_base = 2 * u32::MAX as u64;
        let u32_range = u32_base..(u16::MAX as u64 + 10 + u32_base);
        let encoded = EncodedU64Array::from(u32_range.clone());
        assert!(matches!(
            encoded,
            EncodedU64Array::U32 { base, .. } if base == u32_base
        ));
        let decoded = encoded.into_iter().collect::<Vec<_>>();
        assert_eq!(u32_range.collect::<Vec<_>>(), decoded);

        // The u64 case is skipped: materializing it would take a lot of memory.

        // Empty one
        let encoded = EncodedU64Array::from(42..42);
        assert_eq!(encoded.len(), 0);
    }
}
|
||||
822
vendor/lance-table/src/rowids/index.rs
vendored
Normal file
822
vendor/lance-table/src/rowids/index.rs
vendored
Normal file
|
|
@ -0,0 +1,822 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
use std::ops::RangeInclusive;
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::{RowIdSequence, U64Segment};
|
||||
use deepsize::DeepSizeOf;
|
||||
use lance_core::{Error, Result};
|
||||
use lance_core::utils::address::RowAddress;
|
||||
use lance_core::utils::deletion::DeletionVector;
|
||||
use rangemap::RangeInclusiveMap;
|
||||
|
||||
/// An index of row ids
///
/// This index is used to map row ids to their corresponding addresses. These
/// addresses correspond to physical positions in the dataset. See [RowAddress].
///
/// This structure only contains rows that physically exist. However, it may
/// map to addresses that have been tombstoned. A separate tombstone index is
/// used to track tombstoned rows.
// (Implementation)
// Disjoint ranges of row ids are stored as the keys of the map. The values are
// a pair of segments. The first segment is the row ids, and the second segment
// is the addresses. The two segments are parallel: a lookup finds the position
// of a row id in the first segment and reads the address at that same position
// in the second.
#[derive(Debug)]
pub struct RowIdIndex(RangeInclusiveMap<u64, (U64Segment, U64Segment)>);
|
||||
|
||||
/// Per-fragment input used to build a [`RowIdIndex`].
pub struct FragmentRowIdIndex {
    /// Id of the fragment; addresses of its rows start at the fragment's first
    /// row address and advance by one per entry of `row_id_sequence`.
    pub fragment_id: u32,
    /// Row ids stored in the fragment, in physical order.
    pub row_id_sequence: Arc<RowIdSequence>,
    /// Offsets (positions within `row_id_sequence`) of rows that are deleted
    /// and must not appear in the index.
    pub deletion_vector: Arc<DeletionVector>,
}
|
||||
|
||||
impl RowIdIndex {
|
||||
/// Create a new index from a list of fragment ids and their corresponding row id sequences.
|
||||
pub fn new(fragment_indices: &[FragmentRowIdIndex]) -> Result<Self> {
|
||||
let chunks = fragment_indices
|
||||
.iter()
|
||||
.flat_map(decompose_sequence)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut final_chunks = Vec::new();
|
||||
for processed_chunk in prep_index_chunks(chunks) {
|
||||
match processed_chunk {
|
||||
RawIndexChunk::NonOverlapping(chunk) => {
|
||||
final_chunks.push(chunk);
|
||||
}
|
||||
RawIndexChunk::Overlapping(_range, overlapping_chunks) => {
|
||||
// Intersecting row-id ranges don't imply intersecting id sets;
|
||||
// sparse ids and deletion holes leave the union short of the span.
|
||||
// The real invariant (no id in two fragments) is checked in the merge.
|
||||
let merged_chunk = merge_overlapping_chunks(overlapping_chunks)?;
|
||||
final_chunks.push(merged_chunk);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self(RangeInclusiveMap::from_iter(final_chunks)))
|
||||
}
|
||||
|
||||
/// Get the address for a given row id.
|
||||
///
|
||||
/// Will return None if the row id does not exist in the index.
|
||||
pub fn get(&self, row_id: u64) -> Option<RowAddress> {
|
||||
let (row_id_segment, address_segment) = self.0.get(&row_id)?;
|
||||
let pos = row_id_segment.position(row_id)?;
|
||||
let address = address_segment.get(pos)?;
|
||||
Some(RowAddress::from(address))
|
||||
}
|
||||
|
||||
/// Get addresses for many row ids in one pass over the index.
|
||||
///
|
||||
/// Returns one entry per input id, in input order (`None` for missing).
|
||||
/// Sorts a working copy of the input internally so the chunk iterator
|
||||
/// is advanced at most once per chunk, amortizing the per-id tree walk
|
||||
/// from O(N · log F) to O(F + N).
|
||||
pub fn get_many(&self, row_ids: &[u64]) -> Vec<Option<RowAddress>> {
|
||||
let n = row_ids.len();
|
||||
let mut out = vec![None; n];
|
||||
if n == 0 {
|
||||
return out;
|
||||
}
|
||||
|
||||
let mut sorted: Vec<(u64, usize)> = row_ids.iter().copied().zip(0..n).collect();
|
||||
sorted.sort_unstable_by_key(|&(id, _)| id);
|
||||
|
||||
let mut chunks = self.0.iter().peekable();
|
||||
for (id, orig_idx) in sorted {
|
||||
// Advance past chunks that end before this id.
|
||||
while let Some((range, _)) = chunks.peek() {
|
||||
if *range.end() < id {
|
||||
chunks.next();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let Some((range, (row_id_seg, addr_seg))) = chunks.peek() else {
|
||||
break;
|
||||
};
|
||||
if id < *range.start() {
|
||||
continue; // falls in a gap between chunks
|
||||
}
|
||||
if let Some(pos) = row_id_seg.position(id)
|
||||
&& let Some(addr) = addr_seg.get(pos)
|
||||
{
|
||||
out[orig_idx] = Some(RowAddress::from(addr));
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
}
|
||||
|
||||
impl DeepSizeOf for RowIdIndex {
|
||||
fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize {
|
||||
self.0
|
||||
.iter()
|
||||
.map(|(_, (row_id_segment, address_segment))| {
|
||||
(2 * std::mem::size_of::<u64>())
|
||||
+ std::mem::size_of::<(U64Segment, U64Segment)>()
|
||||
+ row_id_segment.deep_size_of_children(context)
|
||||
+ address_segment.deep_size_of_children(context)
|
||||
})
|
||||
.sum()
|
||||
}
|
||||
}
|
||||
|
||||
fn decompose_sequence(
|
||||
frag_index: &FragmentRowIdIndex,
|
||||
) -> Vec<(RangeInclusive<u64>, (U64Segment, U64Segment))> {
|
||||
let mut start_address: u64 = RowAddress::first_row(frag_index.fragment_id).into();
|
||||
let mut current_offset = 0u32;
|
||||
let no_deletions = frag_index.deletion_vector.is_empty();
|
||||
|
||||
frag_index
|
||||
.row_id_sequence
|
||||
.0
|
||||
.iter()
|
||||
.filter_map(|segment| {
|
||||
let segment_len = segment.len();
|
||||
|
||||
let result = if no_deletions {
|
||||
decompose_segment_no_deletions(segment, start_address)
|
||||
} else {
|
||||
decompose_segment_with_deletions(
|
||||
segment,
|
||||
start_address,
|
||||
current_offset,
|
||||
&frag_index.deletion_vector,
|
||||
)
|
||||
};
|
||||
|
||||
current_offset += segment_len as u32;
|
||||
start_address += segment_len as u64;
|
||||
|
||||
result
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Build an IndexChunk from a list of (row_id, address) pairs.
|
||||
fn build_chunk_from_pairs(pairs: Vec<(u64, u64)>) -> Option<IndexChunk> {
|
||||
if pairs.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let (row_ids, addresses): (Vec<u64>, Vec<u64>) = pairs.into_iter().unzip();
|
||||
let row_id_segment = U64Segment::from_iter(row_ids);
|
||||
let address_segment = U64Segment::from_iter(addresses);
|
||||
let coverage = row_id_segment.range()?;
|
||||
Some((coverage, (row_id_segment, address_segment)))
|
||||
}
|
||||
|
||||
/// Fast path: no deletions. O(1) for Range segments.
|
||||
fn decompose_segment_no_deletions(segment: &U64Segment, start_address: u64) -> Option<IndexChunk> {
|
||||
match segment {
|
||||
U64Segment::Range(range) if !range.is_empty() => {
|
||||
let len = range.end - range.start;
|
||||
let row_id_segment = U64Segment::Range(range.clone());
|
||||
let address_segment = U64Segment::Range(start_address..start_address + len);
|
||||
let coverage = range.start..=range.end - 1;
|
||||
Some((coverage, (row_id_segment, address_segment)))
|
||||
}
|
||||
_ if segment.is_empty() => None,
|
||||
_ => {
|
||||
// Non-Range segments: must iterate to build address mapping.
|
||||
let pairs: Vec<(u64, u64)> = segment
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, row_id)| (row_id, start_address + i as u64))
|
||||
.collect();
|
||||
build_chunk_from_pairs(pairs)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Slow path: has deletions, must check each row.
|
||||
fn decompose_segment_with_deletions(
|
||||
segment: &U64Segment,
|
||||
start_address: u64,
|
||||
current_offset: u32,
|
||||
deletion_vector: &DeletionVector,
|
||||
) -> Option<IndexChunk> {
|
||||
let pairs: Vec<(u64, u64)> = segment
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(i, row_id)| {
|
||||
let row_offset = current_offset + i as u32;
|
||||
if !deletion_vector.contains(row_offset) {
|
||||
Some((row_id, start_address + i as u64))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
build_chunk_from_pairs(pairs)
|
||||
}
|
||||
|
||||
/// One entry of the index: the inclusive row id range it covers, paired with
/// the row id segment and the parallel address segment.
type IndexChunk = (RangeInclusive<u64>, (U64Segment, U64Segment));
|
||||
|
||||
/// Output of `prep_index_chunks`: chunks grouped by whether their row id
/// ranges intersect.
#[derive(Debug)]
enum RawIndexChunk {
    /// A chunk whose range intersects no other chunk; usable as-is.
    NonOverlapping(IndexChunk),
    /// A group of chunks with intersecting ranges, plus the range spanning
    /// the whole group. The group still has to be merged into one chunk.
    Overlapping(RangeInclusive<u64>, Vec<IndexChunk>),
}
|
||||
|
||||
impl RawIndexChunk {
|
||||
fn range_end(&self) -> u64 {
|
||||
match self {
|
||||
Self::NonOverlapping((range, _)) => *range.end(),
|
||||
Self::Overlapping(range, _) => *range.end(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a vector of index chunks, sort them and return an iterator of index chunks.
|
||||
///
|
||||
/// The iterator will yield chunks that are non-overlapping or a set of chunks
|
||||
/// that are overlapping.
|
||||
fn prep_index_chunks(mut chunks: Vec<IndexChunk>) -> impl Iterator<Item = RawIndexChunk> {
|
||||
chunks.sort_by_key(|(range, _)| u64::MAX - *range.start());
|
||||
|
||||
let mut output = Vec::new();
|
||||
|
||||
// Start assuming non-overlapping in first chunk.
|
||||
if let Some(first_chunk) = chunks.pop() {
|
||||
output.push(RawIndexChunk::NonOverlapping(first_chunk));
|
||||
} else {
|
||||
// Early return for empty.
|
||||
return output.into_iter();
|
||||
}
|
||||
|
||||
let mut current_range = 0..=0;
|
||||
let mut current_overlap = Vec::new();
|
||||
while let Some(chunk) = chunks.pop() {
|
||||
debug_assert_eq!(
|
||||
current_overlap
|
||||
.iter()
|
||||
.map(|(range, _): &IndexChunk| *range.start())
|
||||
.min()
|
||||
.unwrap_or_default(),
|
||||
*current_range.start(),
|
||||
);
|
||||
debug_assert_eq!(
|
||||
current_overlap
|
||||
.iter()
|
||||
.map(|(range, _): &IndexChunk| *range.end())
|
||||
.max()
|
||||
.unwrap_or_default(),
|
||||
*current_range.end(),
|
||||
);
|
||||
|
||||
if current_overlap.is_empty() {
|
||||
// We haven't found overlap yet.
|
||||
let last_chunk_end = output.last().unwrap().range_end();
|
||||
if *chunk.0.start() <= last_chunk_end {
|
||||
// We have found overlap.
|
||||
match output.pop().unwrap() {
|
||||
RawIndexChunk::NonOverlapping(chunk) => {
|
||||
current_overlap.push(chunk);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
current_overlap.push(chunk);
|
||||
|
||||
let range_start = *current_overlap.first().unwrap().0.start();
|
||||
let range_end = *current_overlap
|
||||
.last()
|
||||
.unwrap()
|
||||
.0
|
||||
.end()
|
||||
.max(current_overlap.first().unwrap().0.end());
|
||||
current_range = range_start..=range_end;
|
||||
} else {
|
||||
// We are still in non-overlapping space.
|
||||
output.push(RawIndexChunk::NonOverlapping(chunk));
|
||||
}
|
||||
} else {
|
||||
// We are making an overlap chunk
|
||||
if chunk.0.start() <= current_range.end() {
|
||||
// We are still in overlap.
|
||||
let range_end = *chunk.0.end().max(current_range.end());
|
||||
current_range = *current_range.start()..=range_end;
|
||||
|
||||
current_overlap.push(chunk);
|
||||
} else {
|
||||
// We have exited overlap.
|
||||
output.push(RawIndexChunk::Overlapping(
|
||||
std::mem::replace(&mut current_range, 0..=0),
|
||||
std::mem::take(&mut current_overlap),
|
||||
));
|
||||
output.push(RawIndexChunk::NonOverlapping(chunk));
|
||||
}
|
||||
}
|
||||
}
|
||||
debug_assert_eq!(
|
||||
current_overlap
|
||||
.iter()
|
||||
.map(|(range, _): &IndexChunk| *range.start())
|
||||
.min()
|
||||
.unwrap_or_default(),
|
||||
*current_range.start(),
|
||||
);
|
||||
debug_assert_eq!(
|
||||
current_overlap
|
||||
.iter()
|
||||
.map(|(range, _): &IndexChunk| *range.end())
|
||||
.max()
|
||||
.unwrap_or_default(),
|
||||
*current_range.end(),
|
||||
);
|
||||
|
||||
if !current_overlap.is_empty() {
|
||||
output.push(RawIndexChunk::Overlapping(
|
||||
current_range.clone(),
|
||||
current_overlap,
|
||||
));
|
||||
}
|
||||
|
||||
output.into_iter()
|
||||
}
|
||||
|
||||
fn merge_overlapping_chunks(overlapping_chunks: Vec<IndexChunk>) -> Result<IndexChunk> {
|
||||
let total_capacity = overlapping_chunks
|
||||
.iter()
|
||||
.map(|(_, (row_ids, _))| row_ids.len())
|
||||
.sum();
|
||||
let mut values = Vec::with_capacity(total_capacity);
|
||||
for (_, (row_ids, row_addrs)) in overlapping_chunks.iter() {
|
||||
values.extend(row_ids.iter().zip(row_addrs.iter()));
|
||||
}
|
||||
values.sort_by_key(|(row_id, _)| *row_id);
|
||||
// A duplicate row id here means two fragments claim the same live id: a
|
||||
// corrupt index, not a resolvable sparse-coverage case.
|
||||
if let Some(w) = values.windows(2).find(|w| w[0].0 == w[1].0) {
|
||||
return Err(Error::internal(format!(
|
||||
"row id index corrupt: stable row id {} is live in multiple fragments",
|
||||
w[0].0
|
||||
)));
|
||||
}
|
||||
let row_id_segment = U64Segment::from_iter(values.iter().map(|(row_id, _)| *row_id));
|
||||
let address_segment = U64Segment::from_iter(values.iter().map(|(_, row_addr)| *row_addr));
|
||||
|
||||
let range = row_id_segment.range().unwrap();
|
||||
|
||||
Ok((range, (row_id_segment, address_segment)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use proptest::{prelude::Strategy, prop_assert_eq};
|
||||
|
||||
#[test]
|
||||
fn test_new_index() {
|
||||
let fragment_indices = vec![
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 10,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![
|
||||
U64Segment::Range(0..10),
|
||||
U64Segment::RangeWithHoles {
|
||||
range: 10..17,
|
||||
holes: vec![12, 15].into(),
|
||||
},
|
||||
U64Segment::SortedArray(vec![20, 25, 30].into()),
|
||||
])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 20,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![
|
||||
U64Segment::RangeWithBitmap {
|
||||
range: 17..20,
|
||||
bitmap: [true, false, true].as_slice().into(),
|
||||
},
|
||||
U64Segment::Array(vec![40, 50, 60].into()),
|
||||
])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
];
|
||||
|
||||
let index = RowIdIndex::new(&fragment_indices).unwrap();
|
||||
|
||||
// Check various queries.
|
||||
assert_eq!(index.get(0), Some(RowAddress::new_from_parts(10, 0)));
|
||||
assert_eq!(index.get(15), None);
|
||||
assert_eq!(index.get(16), Some(RowAddress::new_from_parts(10, 14)));
|
||||
assert_eq!(index.get(17), Some(RowAddress::new_from_parts(20, 0)));
|
||||
assert_eq!(index.get(25), Some(RowAddress::new_from_parts(10, 16)));
|
||||
assert_eq!(index.get(40), Some(RowAddress::new_from_parts(20, 2)));
|
||||
assert_eq!(index.get(60), Some(RowAddress::new_from_parts(20, 4)));
|
||||
assert_eq!(index.get(61), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_new_index_overlap() {
|
||||
let fragment_indices = vec![
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 23,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::SortedArray(
|
||||
vec![3, 6, 9].into(),
|
||||
)])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 42,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::SortedArray(
|
||||
vec![2, 5, 8].into(),
|
||||
)])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 10,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::SortedArray(
|
||||
vec![1, 4, 7].into(),
|
||||
)])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
];
|
||||
|
||||
let index = RowIdIndex::new(&fragment_indices).unwrap();
|
||||
|
||||
// Check various queries.
|
||||
assert_eq!(index.get(1), Some(RowAddress::new_from_parts(10, 0)));
|
||||
assert_eq!(index.get(2), Some(RowAddress::new_from_parts(42, 0)));
|
||||
assert_eq!(index.get(3), Some(RowAddress::new_from_parts(23, 0)));
|
||||
assert_eq!(index.get(4), Some(RowAddress::new_from_parts(10, 1)));
|
||||
assert_eq!(index.get(5), Some(RowAddress::new_from_parts(42, 1)));
|
||||
assert_eq!(index.get(6), Some(RowAddress::new_from_parts(23, 1)));
|
||||
assert_eq!(index.get(7), Some(RowAddress::new_from_parts(10, 2)));
|
||||
assert_eq!(index.get(8), Some(RowAddress::new_from_parts(42, 2)));
|
||||
assert_eq!(index.get(9), Some(RowAddress::new_from_parts(23, 2)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_new_index_unsorted_row_ids() {
|
||||
// Test case with unsorted row ids within fragments
|
||||
let fragment_indices = vec![
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 10,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Array(
|
||||
vec![9, 3, 6].into(), // Unsorted array
|
||||
)])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 20,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Array(
|
||||
vec![8, 2, 5].into(), // Unsorted array
|
||||
)])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 30,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Array(
|
||||
vec![7, 1, 4].into(), // Unsorted array
|
||||
)])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
];
|
||||
|
||||
let index = RowIdIndex::new(&fragment_indices).unwrap();
|
||||
|
||||
// Check that all row ids can be found regardless of their order in the segments
|
||||
assert_eq!(index.get(1), Some(RowAddress::new_from_parts(30, 1)));
|
||||
assert_eq!(index.get(2), Some(RowAddress::new_from_parts(20, 1)));
|
||||
assert_eq!(index.get(3), Some(RowAddress::new_from_parts(10, 1)));
|
||||
assert_eq!(index.get(4), Some(RowAddress::new_from_parts(30, 2)));
|
||||
assert_eq!(index.get(5), Some(RowAddress::new_from_parts(20, 2)));
|
||||
assert_eq!(index.get(6), Some(RowAddress::new_from_parts(10, 2)));
|
||||
assert_eq!(index.get(7), Some(RowAddress::new_from_parts(30, 0)));
|
||||
assert_eq!(index.get(8), Some(RowAddress::new_from_parts(20, 0)));
|
||||
assert_eq!(index.get(9), Some(RowAddress::new_from_parts(10, 0)));
|
||||
|
||||
// Check that non-existent row ids return None
|
||||
assert_eq!(index.get(0), None);
|
||||
assert_eq!(index.get(10), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_new_index_partial_overlap() {
|
||||
let fragment_indices = vec![
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 0,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::RangeWithHoles {
|
||||
range: 0..100,
|
||||
holes: vec![50].into(),
|
||||
}])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 1,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(50..51)])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
];
|
||||
|
||||
let index = RowIdIndex::new(&fragment_indices).unwrap();
|
||||
|
||||
// Check various queries.
|
||||
assert_eq!(index.get(0), Some(RowAddress::new_from_parts(0, 0)));
|
||||
assert_eq!(index.get(49), Some(RowAddress::new_from_parts(0, 49)));
|
||||
assert_eq!(index.get(50), Some(RowAddress::new_from_parts(1, 0)));
|
||||
assert_eq!(index.get(51), Some(RowAddress::new_from_parts(0, 50)));
|
||||
assert_eq!(index.get(99), Some(RowAddress::new_from_parts(0, 98)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_overlapping_chunks_sparse_with_deletions() {
|
||||
// Interleaved (overlapping) id ranges plus a deletion that leaves a hole,
|
||||
// so the union doesn't tile the span. Every live id must still resolve.
|
||||
let fragment_indices = vec![
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 10,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::SortedArray(
|
||||
vec![1, 3, 5, 7, 9].into(),
|
||||
)])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 20,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::SortedArray(
|
||||
vec![0, 2, 4, 6, 8].into(),
|
||||
)])),
|
||||
// Delete offset 2 (id 4) -> a hole in the span.
|
||||
deletion_vector: Arc::new(DeletionVector::from_iter(vec![2])),
|
||||
},
|
||||
];
|
||||
|
||||
let index = RowIdIndex::new(&fragment_indices).unwrap();
|
||||
|
||||
assert_eq!(index.get(0), Some(RowAddress::new_from_parts(20, 0)));
|
||||
assert_eq!(index.get(1), Some(RowAddress::new_from_parts(10, 0)));
|
||||
assert_eq!(index.get(2), Some(RowAddress::new_from_parts(20, 1)));
|
||||
assert_eq!(index.get(3), Some(RowAddress::new_from_parts(10, 1)));
|
||||
assert_eq!(index.get(4), None);
|
||||
// Surviving ids keep their original offsets (the hole is not compacted).
|
||||
assert_eq!(index.get(6), Some(RowAddress::new_from_parts(20, 3)));
|
||||
assert_eq!(index.get(8), Some(RowAddress::new_from_parts(20, 4)));
|
||||
assert_eq!(index.get(9), Some(RowAddress::new_from_parts(10, 4)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_with_deletion_vector() {
|
||||
let deletion_vector = DeletionVector::from_iter(vec![2, 3]);
|
||||
|
||||
let fragment_indices = vec![FragmentRowIdIndex {
|
||||
fragment_id: 10,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(0..6)])),
|
||||
deletion_vector: Arc::new(deletion_vector),
|
||||
}];
|
||||
|
||||
let index = RowIdIndex::new(&fragment_indices).unwrap();
|
||||
|
||||
assert_eq!(index.get(0), Some(RowAddress::new_from_parts(10, 0)));
|
||||
assert_eq!(index.get(1), Some(RowAddress::new_from_parts(10, 1)));
|
||||
assert_eq!(index.get(4), Some(RowAddress::new_from_parts(10, 4)));
|
||||
assert_eq!(index.get(5), Some(RowAddress::new_from_parts(10, 5)));
|
||||
|
||||
assert_eq!(index.get(2), None);
|
||||
assert_eq!(index.get(3), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_fragment_sequences() {
|
||||
let fragment_indices = vec![
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 10,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 20,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(5..8)])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
];
|
||||
|
||||
let index = RowIdIndex::new(&fragment_indices).unwrap();
|
||||
|
||||
assert_eq!(index.get(5), Some(RowAddress::new_from_parts(20, 0)));
|
||||
assert_eq!(index.get(7), Some(RowAddress::new_from_parts(20, 2)));
|
||||
assert_eq!(index.get(4), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_completely_empty_index() {
|
||||
let fragment_indices = vec![];
|
||||
let index = RowIdIndex::new(&fragment_indices).unwrap();
|
||||
|
||||
assert_eq!(index.get(0), None);
|
||||
assert_eq!(index.get(100), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_non_overlapping_ranges() {
|
||||
let fragment_indices = vec![
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 10,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(0..5)])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 20,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(5..10)])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
FragmentRowIdIndex {
|
||||
fragment_id: 30,
|
||||
row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(10..15)])),
|
||||
deletion_vector: Arc::new(DeletionVector::default()),
|
||||
},
|
||||
];
|
||||
|
||||
let index = RowIdIndex::new(&fragment_indices).unwrap();
|
||||
|
||||
assert_eq!(index.get(0), Some(RowAddress::new_from_parts(10, 0)));
|
||||
assert_eq!(index.get(4), Some(RowAddress::new_from_parts(10, 4)));
|
||||
assert_eq!(index.get(5), Some(RowAddress::new_from_parts(20, 0)));
|
||||
assert_eq!(index.get(9), Some(RowAddress::new_from_parts(20, 4)));
|
||||
assert_eq!(index.get(10), Some(RowAddress::new_from_parts(30, 0)));
|
||||
assert_eq!(index.get(14), Some(RowAddress::new_from_parts(30, 4)));
|
||||
}
|
||||
|
||||
fn arbitrary_row_ids(
|
||||
num_fragments_range: std::ops::Range<usize>,
|
||||
frag_size_range: std::ops::Range<usize>,
|
||||
) -> impl Strategy<Value = Vec<(u32, Arc<RowIdSequence>)>> {
|
||||
let fragment_sizes = proptest::collection::vec(frag_size_range, num_fragments_range);
|
||||
fragment_sizes.prop_flat_map(|fragment_sizes| {
|
||||
let num_rows = fragment_sizes.iter().sum::<usize>() as u64;
|
||||
let row_ids = 0..num_rows;
|
||||
let row_ids = row_ids.collect::<Vec<_>>();
|
||||
let row_ids_shuffled = proptest::strategy::Just(row_ids).prop_shuffle();
|
||||
row_ids_shuffled.prop_map(move |row_ids| {
|
||||
let mut sequences = Vec::with_capacity(fragment_sizes.len());
|
||||
let mut i = 0;
|
||||
for size in &fragment_sizes {
|
||||
let end = i + size;
|
||||
let sequence =
|
||||
RowIdSequence(vec![U64Segment::from_slice(row_ids[i..end].into())]);
|
||||
sequences.push((i as u32, Arc::new(sequence)));
|
||||
i = end;
|
||||
}
|
||||
sequences
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
#[test]
fn test_large_range_segments_no_deletions() {
    // Many fragments, each described by one large `Range` segment and no
    // deletions. The timing assertion at the end guards the expectation that
    // building the index costs O(num_fragments) rather than O(total_rows).
    let num_fragments = 100u32;
    let rows_per_fragment = 250_000u64;
    let total_rows = num_fragments as u64 * rows_per_fragment;

    // Fragment `f` owns the contiguous row ids
    // `f * rows_per_fragment .. (f + 1) * rows_per_fragment`.
    let mut fragment_indices: Vec<FragmentRowIdIndex> = Vec::new();
    for frag_id in 0..num_fragments {
        let first_id = frag_id as u64 * rows_per_fragment;
        let ids = first_id..first_id + rows_per_fragment;
        fragment_indices.push(FragmentRowIdIndex {
            fragment_id: frag_id,
            row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(ids)])),
            deletion_vector: Arc::new(DeletionVector::default()),
        });
    }

    let timer = std::time::Instant::now();
    let index = RowIdIndex::new(&fragment_indices).unwrap();
    let elapsed = timer.elapsed();

    // Spot-check lookups at the fragment boundaries.
    let last_offset = rows_per_fragment as u32 - 1;
    let last_row = total_rows - 1;
    assert_eq!(index.get(0), Some(RowAddress::new_from_parts(0, 0)));
    assert_eq!(
        index.get(rows_per_fragment - 1),
        Some(RowAddress::new_from_parts(0, last_offset))
    );
    assert_eq!(
        index.get(rows_per_fragment),
        Some(RowAddress::new_from_parts(1, 0))
    );
    assert_eq!(
        index.get(last_row),
        Some(RowAddress::new_from_parts(num_fragments - 1, last_offset))
    );
    assert_eq!(index.get(last_row + 1), None);

    // Building the index for 25M rows spread over 100 fragments must finish
    // in under one second.
    assert!(
        elapsed.as_secs() < 1,
        "Index build took {:?} for {} fragments x {} rows = {} total rows. \
         This suggests the O(rows) -> O(fragments) optimization is not working.",
        elapsed,
        num_fragments,
        rows_per_fragment,
        total_rows,
    );
}
|
||||
|
||||
#[test]
fn test_large_range_segments_with_deletions() {
    let num_fragments = 10u32;
    let rows_per_fragment = 1_000u64;

    let mut fragment_indices: Vec<FragmentRowIdIndex> = Vec::new();
    for frag_id in 0..num_fragments {
        let first_id = frag_id as u64 * rows_per_fragment;

        // Every third local offset (0, 3, 6, ...) of each fragment is deleted.
        let mut deleted = roaring::RoaringBitmap::new();
        for local_offset in (0..rows_per_fragment as u32).step_by(3) {
            deleted.insert(local_offset);
        }

        fragment_indices.push(FragmentRowIdIndex {
            fragment_id: frag_id,
            row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(
                first_id..first_id + rows_per_fragment,
            )])),
            deletion_vector: Arc::new(DeletionVector::Bitmap(deleted)),
        });
    }

    let index = RowIdIndex::new(&fragment_indices).unwrap();

    // Fragment 0: row ids 0 and 3 sit on deleted offsets and must not resolve.
    assert_eq!(index.get(0), None);
    assert_eq!(index.get(3), None);

    // Fragment 0: row ids 1, 2 and 4 are live and map to their own offsets.
    assert_eq!(index.get(1), Some(RowAddress::new_from_parts(0, 1)));
    assert_eq!(index.get(2), Some(RowAddress::new_from_parts(0, 2)));
    assert_eq!(index.get(4), Some(RowAddress::new_from_parts(0, 4)));

    // Fragment 1 starts at row id 1000: offset 0 is deleted, offset 1 is live.
    assert_eq!(index.get(rows_per_fragment), None);
    assert_eq!(
        index.get(rows_per_fragment + 1),
        Some(RowAddress::new_from_parts(1, 1))
    );

    // Last fragment: row id 9999 is offset 999 (999 % 3 == 0, deleted), while
    // row id 9998 is offset 998 (998 % 3 == 2, live).
    let last_row = num_fragments as u64 * rows_per_fragment - 1;
    assert_eq!(index.get(last_row), None);
    assert_eq!(
        index.get(last_row - 1),
        Some(RowAddress::new_from_parts(num_fragments - 1, 998))
    );

    // One past the final row id is out of range.
    assert_eq!(index.get(last_row + 1), None);
}
|
||||
|
||||
proptest::proptest! {
    #[test]
    fn test_new_index_robustness(row_ids in arbitrary_row_ids(0..5, 0..32)) {
        // Pair every generated (fragment id, sequence) with an empty deletion vector.
        let mut fragment_indices: Vec<FragmentRowIdIndex> = Vec::new();
        for (frag_id, sequence) in row_ids.iter() {
            fragment_indices.push(FragmentRowIdIndex {
                fragment_id: *frag_id,
                row_id_sequence: sequence.clone(),
                deletion_vector: Arc::new(DeletionVector::default()),
            });
        }

        let index = RowIdIndex::new(&fragment_indices).unwrap();

        // Every row id must resolve to (its fragment, its position in the sequence).
        for (frag_id, sequence) in row_ids.iter() {
            for (local_offset, row_id) in sequence.iter().enumerate() {
                let expected = Some(RowAddress::new_from_parts(*frag_id, local_offset as u32));
                prop_assert_eq!(
                    index.get(row_id),
                    expected,
                    "Row id {} in sequence {:?} not found in index {:?}",
                    row_id,
                    sequence,
                    index
                );
            }
        }
    }
}
|
||||
}
|
||||
1141
vendor/lance-table/src/rowids/segment.rs
vendored
Normal file
1141
vendor/lance-table/src/rowids/segment.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
239
vendor/lance-table/src/rowids/serde.rs
vendored
Normal file
239
vendor/lance-table/src/rowids/serde.rs
vendored
Normal file
|
|
@ -0,0 +1,239 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
use crate::{format::pb, rowids::bitmap::Bitmap};
|
||||
use lance_core::{Error, Result};
|
||||
|
||||
use super::{RowIdSequence, U64Segment, encoded_array::EncodedU64Array};
|
||||
use prost::Message;
|
||||
|
||||
impl TryFrom<pb::RowIdSequence> for RowIdSequence {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(pb: pb::RowIdSequence) -> Result<Self> {
|
||||
Ok(Self(
|
||||
pb.segments
|
||||
.into_iter()
|
||||
.map(U64Segment::try_from)
|
||||
.collect::<Result<Vec<_>>>()?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<pb::U64Segment> for U64Segment {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(pb: pb::U64Segment) -> Result<Self> {
|
||||
use pb::u64_segment as pb_seg;
|
||||
use pb::u64_segment::Segment::*;
|
||||
match pb.segment {
|
||||
Some(Range(pb_seg::Range { start, end })) => Ok(Self::Range(start..end)),
|
||||
Some(RangeWithHoles(pb_seg::RangeWithHoles { start, end, holes })) => {
|
||||
let holes = holes
|
||||
.ok_or_else(|| Error::invalid_input("missing hole"))?
|
||||
.try_into()?;
|
||||
Ok(Self::RangeWithHoles {
|
||||
range: start..end,
|
||||
holes,
|
||||
})
|
||||
}
|
||||
Some(RangeWithBitmap(pb_seg::RangeWithBitmap { start, end, bitmap })) => {
|
||||
Ok(Self::RangeWithBitmap {
|
||||
range: start..end,
|
||||
bitmap: Bitmap {
|
||||
data: bitmap,
|
||||
len: (end - start) as usize,
|
||||
},
|
||||
})
|
||||
}
|
||||
Some(SortedArray(array)) => Ok(Self::SortedArray(EncodedU64Array::try_from(array)?)),
|
||||
Some(Array(array)) => Ok(Self::Array(EncodedU64Array::try_from(array)?)),
|
||||
// TODO: why non-exhaustive?
|
||||
// Some(_) => Err(Error::invalid_input("unknown segment type")),
|
||||
None => Err(Error::invalid_input("missing segment type")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<pb::EncodedU64Array> for EncodedU64Array {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(pb: pb::EncodedU64Array) -> Result<Self> {
|
||||
use pb::encoded_u64_array as pb_arr;
|
||||
use pb::encoded_u64_array::Array::*;
|
||||
match pb.array {
|
||||
Some(U16Array(pb_arr::U16Array { base, offsets })) => {
|
||||
assert!(
|
||||
offsets.len() % 2 == 0,
|
||||
"Must have even number of bytes to store u16 array"
|
||||
);
|
||||
let offsets = offsets
|
||||
.chunks_exact(2)
|
||||
.map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
|
||||
.collect();
|
||||
Ok(Self::U16 { base, offsets })
|
||||
}
|
||||
Some(U32Array(pb_arr::U32Array { base, offsets })) => {
|
||||
assert!(
|
||||
offsets.len() % 4 == 0,
|
||||
"Must have even number of bytes to store u32 array"
|
||||
);
|
||||
let offsets = offsets
|
||||
.chunks_exact(4)
|
||||
.map(|chunk| u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
|
||||
.collect();
|
||||
Ok(Self::U32 { base, offsets })
|
||||
}
|
||||
Some(U64Array(pb_arr::U64Array { values })) => {
|
||||
assert!(
|
||||
values.len() % 8 == 0,
|
||||
"Must have even number of bytes to store u64 array"
|
||||
);
|
||||
let values = values
|
||||
.chunks_exact(8)
|
||||
.map(|chunk| {
|
||||
u64::from_le_bytes([
|
||||
chunk[0], chunk[1], chunk[2], chunk[3], chunk[4], chunk[5], chunk[6],
|
||||
chunk[7],
|
||||
])
|
||||
})
|
||||
.collect();
|
||||
Ok(Self::U64(values))
|
||||
}
|
||||
// TODO: shouldn't this enum be non-exhaustive?
|
||||
// Some(_) => Err(Error::invalid_input("unknown array type")),
|
||||
None => Err(Error::invalid_input("missing array type")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<RowIdSequence> for pb::RowIdSequence {
    /// Converts each in-memory segment to its protobuf form, preserving order.
    fn from(sequence: RowIdSequence) -> Self {
        let mut segments = Vec::new();
        for segment in sequence.0 {
            segments.push(pb::U64Segment::from(segment));
        }
        Self { segments }
    }
}
|
||||
|
||||
impl From<U64Segment> for pb::U64Segment {
|
||||
fn from(segment: U64Segment) -> Self {
|
||||
match segment {
|
||||
U64Segment::Range(range) => Self {
|
||||
segment: Some(pb::u64_segment::Segment::Range(pb::u64_segment::Range {
|
||||
start: range.start,
|
||||
end: range.end,
|
||||
})),
|
||||
},
|
||||
U64Segment::RangeWithHoles { range, holes } => Self {
|
||||
segment: Some(pb::u64_segment::Segment::RangeWithHoles(
|
||||
pb::u64_segment::RangeWithHoles {
|
||||
start: range.start,
|
||||
end: range.end,
|
||||
holes: Some(holes.into()),
|
||||
},
|
||||
)),
|
||||
},
|
||||
U64Segment::RangeWithBitmap { range, bitmap } => Self {
|
||||
segment: Some(pb::u64_segment::Segment::RangeWithBitmap(
|
||||
pb::u64_segment::RangeWithBitmap {
|
||||
start: range.start,
|
||||
end: range.end,
|
||||
bitmap: bitmap.data,
|
||||
},
|
||||
)),
|
||||
},
|
||||
U64Segment::SortedArray(array) => Self {
|
||||
segment: Some(pb::u64_segment::Segment::SortedArray(array.into())),
|
||||
},
|
||||
U64Segment::Array(array) => Self {
|
||||
segment: Some(pb::u64_segment::Segment::Array(array.into())),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<EncodedU64Array> for pb::EncodedU64Array {
|
||||
fn from(array: EncodedU64Array) -> Self {
|
||||
match array {
|
||||
EncodedU64Array::U16 { base, offsets } => Self {
|
||||
array: Some(pb::encoded_u64_array::Array::U16Array(
|
||||
pb::encoded_u64_array::U16Array {
|
||||
base,
|
||||
offsets: offsets
|
||||
.iter()
|
||||
.flat_map(|&offset| offset.to_le_bytes().to_vec())
|
||||
.collect(),
|
||||
},
|
||||
)),
|
||||
},
|
||||
EncodedU64Array::U32 { base, offsets } => Self {
|
||||
array: Some(pb::encoded_u64_array::Array::U32Array(
|
||||
pb::encoded_u64_array::U32Array {
|
||||
base,
|
||||
offsets: offsets
|
||||
.iter()
|
||||
.flat_map(|&offset| offset.to_le_bytes().to_vec())
|
||||
.collect(),
|
||||
},
|
||||
)),
|
||||
},
|
||||
EncodedU64Array::U64(values) => Self {
|
||||
array: Some(pb::encoded_u64_array::Array::U64Array(
|
||||
pb::encoded_u64_array::U64Array {
|
||||
values: values
|
||||
.iter()
|
||||
.flat_map(|&value| value.to_le_bytes().to_vec())
|
||||
.collect(),
|
||||
},
|
||||
)),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Serialize a rowid sequence to a buffer.
|
||||
pub fn write_row_ids(sequence: &RowIdSequence) -> Vec<u8> {
|
||||
let pb_sequence = pb::RowIdSequence::from(sequence.clone());
|
||||
pb_sequence.encode_to_vec()
|
||||
}
|
||||
|
||||
/// Deserialize a rowid sequence from some bytes.
|
||||
pub fn read_row_ids(reader: &[u8]) -> Result<RowIdSequence> {
|
||||
let pb_sequence = pb::RowIdSequence::decode(reader)?;
|
||||
RowIdSequence::try_from(pb_sequence)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Round-trips a sequence containing one segment of every variant through
    /// `write_row_ids` / `read_row_ids` and checks the segments are unchanged.
    #[test]
    fn test_write_read_row_ids() {
        let mut sequence = RowIdSequence::from(0..20);
        sequence.0.extend([
            U64Segment::Range(30..100),
            U64Segment::RangeWithHoles {
                range: 100..200,
                holes: EncodedU64Array::U64(vec![104, 108, 150]),
            },
            U64Segment::RangeWithBitmap {
                range: 200..300,
                bitmap: Bitmap::new_empty(100),
            },
            U64Segment::SortedArray(EncodedU64Array::U16 {
                base: 200,
                offsets: vec![1, 2, 3],
            }),
            U64Segment::Array(EncodedU64Array::U64(vec![1, 2, 3])),
        ]);

        let serialized = write_row_ids(&sequence);
        let sequence2 = read_row_ids(&serialized).unwrap();

        assert_eq!(sequence.0, sequence2.0);
    }
}
|
||||
713
vendor/lance-table/src/rowids/version.rs
vendored
Normal file
713
vendor/lance-table/src/rowids/version.rs
vendored
Normal file
|
|
@ -0,0 +1,713 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
//! Row version tracking for cross-version diff functionality
|
||||
//!
|
||||
//! This module provides data structures and functionality to track the latest
|
||||
//! update version for each row in a Lance dataset, enabling efficient
|
||||
//! cross-version diff operations.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use deepsize::DeepSizeOf;
|
||||
use lance_core::Error;
|
||||
use lance_core::Result;
|
||||
use prost::Message;
|
||||
use serde::de::Deserializer;
|
||||
use serde::ser::Serializer;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::format::{ExternalFile, Fragment, pb};
|
||||
use crate::rowids::segment::U64Segment;
|
||||
use crate::rowids::{RowIdSequence, read_row_ids};
|
||||
|
||||
/// A run of identical versions over a contiguous span of row positions.
|
||||
///
|
||||
/// Span is expressed as a U64Segment over row offsets (0..N within a fragment),
|
||||
/// not over row IDs. This keeps the encoding aligned with RowIdSequence order
|
||||
/// and enables zipped iteration without building a map.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
|
||||
pub struct RowDatasetVersionRun {
|
||||
pub span: U64Segment,
|
||||
pub version: u64,
|
||||
}
|
||||
|
||||
impl RowDatasetVersionRun {
|
||||
/// Number of rows covered by this run.
|
||||
pub fn len(&self) -> usize {
|
||||
self.span.len()
|
||||
}
|
||||
|
||||
/// Whether this run covers no rows.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.span.is_empty()
|
||||
}
|
||||
|
||||
/// The version value of this run.
|
||||
pub fn version(&self) -> u64 {
|
||||
self.version
|
||||
}
|
||||
}
|
||||
|
||||
/// Sequence of dataset versions
|
||||
///
|
||||
/// Stores version runs aligned to the positional order of RowIdSequence.
|
||||
/// Provides sequential iterators and optional lightweight indexing for
|
||||
/// efficient random access.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf, Default)]
|
||||
pub struct RowDatasetVersionSequence {
|
||||
pub runs: Vec<RowDatasetVersionRun>,
|
||||
}
|
||||
|
||||
impl RowDatasetVersionSequence {
|
||||
/// Create a new empty version sequence
|
||||
pub fn new() -> Self {
|
||||
Self { runs: Vec::new() }
|
||||
}
|
||||
|
||||
/// Create a version sequence with a single uniform run of `row_count` rows.
|
||||
pub fn from_uniform_row_count(row_count: u64, version: u64) -> Self {
|
||||
if row_count == 0 {
|
||||
return Self::new();
|
||||
}
|
||||
let run = RowDatasetVersionRun {
|
||||
span: U64Segment::Range(0..row_count),
|
||||
version,
|
||||
};
|
||||
Self { runs: vec![run] }
|
||||
}
|
||||
|
||||
/// Number of rows tracked by this sequence (sum of run lengths).
|
||||
pub fn len(&self) -> u64 {
|
||||
self.runs.iter().map(|s| s.len() as u64).sum()
|
||||
}
|
||||
|
||||
/// Empty if there are no runs or all runs are empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.runs.is_empty() || self.runs.iter().all(|s| s.is_empty())
|
||||
}
|
||||
|
||||
/// Returns a forward iterator over versions, expanding runs lazily.
|
||||
pub fn versions(&self) -> VersionsIter<'_> {
|
||||
VersionsIter::new(&self.runs)
|
||||
}
|
||||
|
||||
/// Random access: get the version at global row position `index`.
|
||||
pub fn version_at(&self, index: usize) -> Option<u64> {
|
||||
let mut offset = 0usize;
|
||||
for run in &self.runs {
|
||||
let len = run.len();
|
||||
if index < offset + len {
|
||||
return Some(run.version());
|
||||
}
|
||||
offset += len;
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Get the version associated with a specific row id.
|
||||
/// This reconstructs the positional offset from RowIdSequence and then
|
||||
/// performs `version_at` lookup.
|
||||
pub fn get_version_for_row_id(&self, row_ids: &RowIdSequence, row_id: u64) -> Option<u64> {
|
||||
let mut offset = 0usize;
|
||||
for seg in &row_ids.0 {
|
||||
if seg.range().is_some_and(|r| r.contains(&row_id))
|
||||
&& let Some(local) = seg.position(row_id)
|
||||
{
|
||||
return self.version_at(offset + local);
|
||||
}
|
||||
offset += seg.len();
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Convenience: collect row IDs with version strictly greater than `threshold`.
|
||||
pub fn rows_with_version_greater_than(
|
||||
&self,
|
||||
row_ids: &RowIdSequence,
|
||||
threshold: u64,
|
||||
) -> Vec<u64> {
|
||||
row_ids
|
||||
.iter()
|
||||
.zip(self.versions())
|
||||
.filter_map(|(rid, v)| if v > threshold { Some(rid) } else { None })
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Delete rows by positional offsets (e.g., from a deletion vector)
|
||||
pub fn mask(&mut self, positions: impl IntoIterator<Item = u32>) -> Result<()> {
|
||||
let mut local_positions: Vec<u32> = Vec::new();
|
||||
let mut positions_iter = positions.into_iter();
|
||||
let mut curr_position = positions_iter.next();
|
||||
let mut offset: usize = 0;
|
||||
let mut cutoff: usize = 0;
|
||||
|
||||
for run in self.runs.iter_mut() {
|
||||
cutoff += run.span.len();
|
||||
while let Some(position) = curr_position {
|
||||
if position as usize >= cutoff {
|
||||
break;
|
||||
}
|
||||
local_positions.push(position - offset as u32);
|
||||
curr_position = positions_iter.next();
|
||||
}
|
||||
|
||||
if !local_positions.is_empty() {
|
||||
run.span.mask(local_positions.as_slice());
|
||||
local_positions.clear();
|
||||
}
|
||||
offset = cutoff;
|
||||
}
|
||||
|
||||
self.runs.retain(|r| !r.span.is_empty());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator over versions expanding runs lazily.
|
||||
pub struct VersionsIter<'a> {
|
||||
runs: &'a [RowDatasetVersionRun],
|
||||
run_idx: usize,
|
||||
remaining_in_run: usize,
|
||||
current_version: u64,
|
||||
}
|
||||
|
||||
impl<'a> VersionsIter<'a> {
|
||||
fn new(runs: &'a [RowDatasetVersionRun]) -> Self {
|
||||
let mut it = Self {
|
||||
runs,
|
||||
run_idx: 0,
|
||||
remaining_in_run: 0,
|
||||
current_version: 0,
|
||||
};
|
||||
it.advance_run();
|
||||
it
|
||||
}
|
||||
|
||||
fn advance_run(&mut self) {
|
||||
if self.run_idx < self.runs.len() {
|
||||
let run = &self.runs[self.run_idx];
|
||||
self.remaining_in_run = run.len();
|
||||
self.current_version = run.version();
|
||||
} else {
|
||||
self.remaining_in_run = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for VersionsIter<'a> {
|
||||
type Item = u64;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.remaining_in_run == 0 {
|
||||
// Move to next run
|
||||
self.run_idx += 1;
|
||||
if self.run_idx >= self.runs.len() {
|
||||
return None;
|
||||
}
|
||||
self.advance_run();
|
||||
}
|
||||
self.remaining_in_run = self.remaining_in_run.saturating_sub(1);
|
||||
Some(self.current_version)
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata about the location of dataset version sequence data
|
||||
/// Following the same pattern as RowIdMeta
|
||||
///
|
||||
/// When stored inline, identical byte sequences are shared across fragments
|
||||
/// via `Arc<[u8]>` to reduce manifest memory for large tables.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
|
||||
pub enum RowDatasetVersionMeta {
|
||||
/// Small sequences stored inline in the fragment metadata
|
||||
Inline(Arc<[u8]>),
|
||||
/// Large sequences stored in external files
|
||||
External(ExternalFile),
|
||||
}
|
||||
|
||||
// Custom Serialize: convert Arc<[u8]> to slice for transparent JSON output
|
||||
impl Serialize for RowDatasetVersionMeta {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> {
|
||||
#[derive(Serialize)]
|
||||
#[serde(untagged)]
|
||||
enum Helper<'a> {
|
||||
Inline { inline: &'a [u8] },
|
||||
External { external: &'a ExternalFile },
|
||||
}
|
||||
|
||||
match self {
|
||||
Self::Inline(data) => Helper::Inline {
|
||||
inline: data.as_ref(),
|
||||
}
|
||||
.serialize(serializer),
|
||||
Self::External(file) => Helper::External { external: file }.serialize(serializer),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Custom Deserialize: read Vec<u8> and convert to Arc<[u8]>
|
||||
impl<'de> Deserialize<'de> for RowDatasetVersionMeta {
|
||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<Self, D::Error> {
|
||||
#[derive(Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum Helper {
|
||||
Inline { inline: Vec<u8> },
|
||||
External { external: ExternalFile },
|
||||
}
|
||||
|
||||
match Helper::deserialize(deserializer)? {
|
||||
Helper::Inline { inline } => Ok(Self::Inline(Arc::from(inline))),
|
||||
Helper::External { external } => Ok(Self::External(external)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl RowDatasetVersionMeta {
|
||||
/// Create inline metadata from a version sequence
|
||||
pub fn from_sequence(sequence: &RowDatasetVersionSequence) -> lance_core::Result<Self> {
|
||||
let bytes = write_dataset_versions(sequence);
|
||||
Ok(Self::Inline(Arc::from(bytes)))
|
||||
}
|
||||
|
||||
/// Create external metadata reference
|
||||
pub fn from_external_file(path: String, offset: u64, size: u64) -> Self {
|
||||
Self::External(ExternalFile { path, offset, size })
|
||||
}
|
||||
|
||||
/// Load the version sequence from this metadata
|
||||
pub fn load_sequence(&self) -> lance_core::Result<RowDatasetVersionSequence> {
|
||||
match self {
|
||||
Self::Inline(data) => read_dataset_versions(data),
|
||||
Self::External(_file) => {
|
||||
todo!("External file loading not yet implemented")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to convert RowDatasetVersionMeta to protobuf format for last_updated_at
|
||||
pub fn last_updated_at_version_meta_to_pb(
|
||||
meta: &Option<RowDatasetVersionMeta>,
|
||||
) -> Option<pb::data_fragment::LastUpdatedAtVersionSequence> {
|
||||
meta.as_ref().map(|m| match m {
|
||||
RowDatasetVersionMeta::Inline(data) => {
|
||||
pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(
|
||||
data.to_vec(),
|
||||
)
|
||||
}
|
||||
RowDatasetVersionMeta::External(file) => {
|
||||
pb::data_fragment::LastUpdatedAtVersionSequence::ExternalLastUpdatedAtVersions(
|
||||
pb::ExternalFile {
|
||||
path: file.path.clone(),
|
||||
offset: file.offset,
|
||||
size: file.size,
|
||||
},
|
||||
)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Helper function to convert RowDatasetVersionMeta to protobuf format for created_at
|
||||
pub fn created_at_version_meta_to_pb(
|
||||
meta: &Option<RowDatasetVersionMeta>,
|
||||
) -> Option<pb::data_fragment::CreatedAtVersionSequence> {
|
||||
meta.as_ref().map(|m| match m {
|
||||
RowDatasetVersionMeta::Inline(data) => {
|
||||
pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(data.to_vec())
|
||||
}
|
||||
RowDatasetVersionMeta::External(file) => {
|
||||
pb::data_fragment::CreatedAtVersionSequence::ExternalCreatedAtVersions(
|
||||
pb::ExternalFile {
|
||||
path: file.path.clone(),
|
||||
offset: file.offset,
|
||||
size: file.size,
|
||||
},
|
||||
)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Serialize a dataset version sequence to a buffer (following RowIdSequence pattern)
|
||||
pub fn write_dataset_versions(sequence: &RowDatasetVersionSequence) -> Vec<u8> {
|
||||
// Convert to protobuf sequence
|
||||
let pb_sequence = pb::RowDatasetVersionSequence {
|
||||
runs: sequence
|
||||
.runs
|
||||
.iter()
|
||||
.map(|run| pb::RowDatasetVersionRun {
|
||||
span: Some(pb::U64Segment::from(run.span.clone())),
|
||||
version: run.version,
|
||||
})
|
||||
.collect(),
|
||||
};
|
||||
|
||||
pb_sequence.encode_to_vec()
|
||||
}
|
||||
|
||||
/// Deserialize a dataset version sequence from bytes (following RowIdSequence pattern)
|
||||
pub fn read_dataset_versions(data: &[u8]) -> lance_core::Result<RowDatasetVersionSequence> {
|
||||
let pb_sequence = pb::RowDatasetVersionSequence::decode(data).map_err(|e| {
|
||||
Error::internal(format!("Failed to decode RowDatasetVersionSequence: {}", e))
|
||||
})?;
|
||||
|
||||
let segments = pb_sequence
|
||||
.runs
|
||||
.into_iter()
|
||||
.map(|pb_run| {
|
||||
let positions_pb = pb_run.span.ok_or_else(|| {
|
||||
Error::internal("Missing positions in RowDatasetVersionRun".to_string())
|
||||
})?;
|
||||
let segment = U64Segment::try_from(positions_pb)?;
|
||||
Ok(RowDatasetVersionRun {
|
||||
span: segment,
|
||||
version: pb_run.version,
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
Ok(RowDatasetVersionSequence { runs: segments })
|
||||
}
|
||||
|
||||
/// Re-chunk a sequence of dataset version runs into new chunk sizes (aligned with RowIdSequence rechunking)
|
||||
pub fn rechunk_version_sequences(
|
||||
sequences: impl IntoIterator<Item = RowDatasetVersionSequence>,
|
||||
chunk_sizes: impl IntoIterator<Item = u64>,
|
||||
allow_incomplete: bool,
|
||||
) -> Result<Vec<RowDatasetVersionSequence>> {
|
||||
let chunk_sizes_vec: Vec<u64> = chunk_sizes.into_iter().collect();
|
||||
let total_chunks = chunk_sizes_vec.len();
|
||||
let mut chunked_sequences: Vec<RowDatasetVersionSequence> = Vec::with_capacity(total_chunks);
|
||||
|
||||
let mut run_iter = sequences
|
||||
.into_iter()
|
||||
.flat_map(|sequence| sequence.runs.into_iter())
|
||||
.peekable();
|
||||
|
||||
let too_few_segments_error = |chunk_index: usize, expected_chunk_size: u64, remaining: u64| {
|
||||
Error::invalid_input(format!(
|
||||
"Got too few version runs for chunk {}. Expected chunk size: {}, remaining needed: {}",
|
||||
chunk_index, expected_chunk_size, remaining
|
||||
))
|
||||
};
|
||||
|
||||
let too_many_segments_error = |processed_chunks: usize, total_chunk_sizes: usize| {
|
||||
Error::invalid_input(format!(
|
||||
"Got too many version runs for the provided chunk lengths. Processed {} chunks out of {} expected",
|
||||
processed_chunks, total_chunk_sizes
|
||||
))
|
||||
};
|
||||
|
||||
let mut segment_offset = 0_u64;
|
||||
|
||||
for (chunk_index, chunk_size) in chunk_sizes_vec.iter().enumerate() {
|
||||
let chunk_size = *chunk_size;
|
||||
let mut out_seq = RowDatasetVersionSequence::new();
|
||||
let mut remaining = chunk_size;
|
||||
|
||||
while remaining > 0 {
|
||||
let remaining_in_segment = run_iter
|
||||
.peek()
|
||||
.map_or(0, |run| run.span.len() as u64 - segment_offset);
|
||||
|
||||
if remaining_in_segment == 0 {
|
||||
if run_iter.next().is_some() {
|
||||
segment_offset = 0;
|
||||
continue;
|
||||
} else if allow_incomplete {
|
||||
break;
|
||||
} else {
|
||||
return Err(too_few_segments_error(chunk_index, chunk_size, remaining));
|
||||
}
|
||||
}
|
||||
|
||||
match remaining_in_segment.cmp(&remaining) {
|
||||
std::cmp::Ordering::Greater => {
|
||||
let run = run_iter.peek().unwrap();
|
||||
let seg = run.span.slice(segment_offset as usize, remaining as usize);
|
||||
out_seq.runs.push(RowDatasetVersionRun {
|
||||
span: seg,
|
||||
version: run.version,
|
||||
});
|
||||
segment_offset += remaining;
|
||||
remaining = 0;
|
||||
}
|
||||
std::cmp::Ordering::Equal | std::cmp::Ordering::Less => {
|
||||
let run = run_iter.next().ok_or_else(|| {
|
||||
too_few_segments_error(chunk_index, chunk_size, remaining)
|
||||
})?;
|
||||
let seg = run
|
||||
.span
|
||||
.slice(segment_offset as usize, remaining_in_segment as usize);
|
||||
out_seq.runs.push(RowDatasetVersionRun {
|
||||
span: seg,
|
||||
version: run.version,
|
||||
});
|
||||
segment_offset = 0;
|
||||
remaining -= remaining_in_segment;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
chunked_sequences.push(out_seq);
|
||||
}
|
||||
|
||||
if run_iter.peek().is_some() {
|
||||
return Err(too_many_segments_error(
|
||||
chunked_sequences.len(),
|
||||
total_chunks,
|
||||
));
|
||||
}
|
||||
|
||||
Ok(chunked_sequences)
|
||||
}
|
||||
|
||||
/// Build version metadata for a fragment if it has physical rows and no existing metadata.
|
||||
pub fn build_version_meta(
|
||||
fragment: &Fragment,
|
||||
current_version: u64,
|
||||
) -> Option<RowDatasetVersionMeta> {
|
||||
if let Some(physical_rows) = fragment.physical_rows
|
||||
&& physical_rows > 0
|
||||
{
|
||||
// Verify row_id_meta exists (sanity check for stable row IDs)
|
||||
if fragment.row_id_meta.is_none() {
|
||||
panic!("Can not find row id meta, please make sure you have enabled stable row id.")
|
||||
}
|
||||
|
||||
// Use physical_rows directly as the authoritative row count
|
||||
// This is correct even for compacted fragments where row_id_meta might
|
||||
// have been partially copied
|
||||
let version_sequence = RowDatasetVersionSequence::from_uniform_row_count(
|
||||
physical_rows as u64,
|
||||
current_version,
|
||||
);
|
||||
|
||||
return Some(RowDatasetVersionMeta::from_sequence(&version_sequence).unwrap());
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Refresh row-level latest update version metadata for a full fragment rewrite-column update.
|
||||
///
|
||||
/// This sets a uniform version sequence for all rows in the fragment to `current_version`.
|
||||
pub fn refresh_row_latest_update_meta_for_full_frag_rewrite_cols(
|
||||
fragment: &mut Fragment,
|
||||
current_version: u64,
|
||||
) -> Result<()> {
|
||||
let row_count = if let Some(pr) = fragment.physical_rows {
|
||||
pr as u64
|
||||
} else if let Some(row_id_meta) = fragment.row_id_meta.as_ref() {
|
||||
match row_id_meta {
|
||||
crate::format::RowIdMeta::Inline(data) => {
|
||||
let sequence = read_row_ids(data).unwrap();
|
||||
sequence.len()
|
||||
}
|
||||
// Follow existing behavior: external sequence not yet supported here
|
||||
crate::format::RowIdMeta::External(_file) => 0,
|
||||
}
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
if row_count > 0 {
|
||||
let version_seq =
|
||||
RowDatasetVersionSequence::from_uniform_row_count(row_count, current_version);
|
||||
let version_meta = RowDatasetVersionMeta::from_sequence(&version_seq)?;
|
||||
fragment.last_updated_at_version_meta = Some(version_meta);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Refresh row-level latest update version metadata for a partial fragment rewrite-column update.
|
||||
///
|
||||
/// `updated_offsets` are local row offsets (within the fragment) that have been updated.
|
||||
/// Existing version metadata is preserved and only the updated positions are set to `current_version`.
|
||||
/// If no existing metadata is present, positions default to `prev_version`.
|
||||
pub fn refresh_row_latest_update_meta_for_partial_frag_rewrite_cols(
|
||||
fragment: &mut Fragment,
|
||||
updated_offsets: &[usize],
|
||||
current_version: u64,
|
||||
prev_version: u64,
|
||||
) -> Result<()> {
|
||||
// Determine row count for fragment
|
||||
let row_count_u64: u64 = if let Some(pr) = fragment.physical_rows {
|
||||
pr as u64
|
||||
} else if let Some(row_id_meta) = fragment.row_id_meta.as_ref() {
|
||||
match row_id_meta {
|
||||
crate::format::RowIdMeta::Inline(data) => {
|
||||
let sequence = read_row_ids(data).unwrap();
|
||||
sequence.len()
|
||||
}
|
||||
crate::format::RowIdMeta::External(_file) => {
|
||||
// Preserve original behavior for external sequences
|
||||
todo!("External file loading not yet implemented")
|
||||
}
|
||||
}
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
if row_count_u64 > 0 {
|
||||
// Build base version vector from existing meta or previous dataset version
|
||||
let mut base_versions: Vec<u64> = Vec::with_capacity(row_count_u64 as usize);
|
||||
if let Some(meta) = fragment.last_updated_at_version_meta.as_ref() {
|
||||
if let Ok(base_seq) = meta.load_sequence() {
|
||||
for pos in 0..(row_count_u64 as usize) {
|
||||
base_versions.push(base_seq.version_at(pos).unwrap_or(prev_version));
|
||||
}
|
||||
} else {
|
||||
base_versions.resize(row_count_u64 as usize, prev_version);
|
||||
}
|
||||
} else {
|
||||
base_versions.resize(row_count_u64 as usize, prev_version);
|
||||
}
|
||||
|
||||
// Apply updates to updated positions
|
||||
for &pos in updated_offsets {
|
||||
if pos < base_versions.len() {
|
||||
base_versions[pos] = current_version;
|
||||
}
|
||||
}
|
||||
|
||||
// Compress into runs
|
||||
let mut runs: Vec<RowDatasetVersionRun> = Vec::new();
|
||||
if !base_versions.is_empty() {
|
||||
let mut start = 0usize;
|
||||
let mut curr_ver = base_versions[0];
|
||||
for (idx, &ver) in base_versions.iter().enumerate().skip(1) {
|
||||
if ver != curr_ver {
|
||||
runs.push(RowDatasetVersionRun {
|
||||
span: U64Segment::Range(start as u64..idx as u64),
|
||||
version: curr_ver,
|
||||
});
|
||||
start = idx;
|
||||
curr_ver = ver;
|
||||
}
|
||||
}
|
||||
runs.push(RowDatasetVersionRun {
|
||||
span: U64Segment::Range(start as u64..base_versions.len() as u64),
|
||||
version: curr_ver,
|
||||
});
|
||||
}
|
||||
let new_seq = RowDatasetVersionSequence { runs };
|
||||
let new_meta = RowDatasetVersionMeta::from_sequence(&new_seq)?;
|
||||
fragment.last_updated_at_version_meta = Some(new_meta);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Protobuf conversion implementations
|
||||
impl TryFrom<pb::data_fragment::LastUpdatedAtVersionSequence> for RowDatasetVersionMeta {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(value: pb::data_fragment::LastUpdatedAtVersionSequence) -> Result<Self> {
|
||||
match value {
|
||||
pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(data) => {
|
||||
Ok(Self::Inline(Arc::from(data)))
|
||||
}
|
||||
pb::data_fragment::LastUpdatedAtVersionSequence::ExternalLastUpdatedAtVersions(
|
||||
file,
|
||||
) => Ok(Self::External(ExternalFile {
|
||||
path: file.path,
|
||||
offset: file.offset,
|
||||
size: file.size,
|
||||
})),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<pb::data_fragment::CreatedAtVersionSequence> for RowDatasetVersionMeta {
|
||||
type Error = Error;
|
||||
|
||||
fn try_from(value: pb::data_fragment::CreatedAtVersionSequence) -> Result<Self> {
|
||||
match value {
|
||||
pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(data) => {
|
||||
Ok(Self::Inline(Arc::from(data)))
|
||||
}
|
||||
pb::data_fragment::CreatedAtVersionSequence::ExternalCreatedAtVersions(file) => {
|
||||
Ok(Self::External(ExternalFile {
|
||||
path: file.path,
|
||||
offset: file.offset,
|
||||
size: file.size,
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
fn test_version_random_access() {
    // Three runs holding 3, 2 and 1 rows; lookups are by cumulative position.
    let runs = [(0..3, 1), (0..2, 2), (0..1, 3)]
        .into_iter()
        .map(|(span, version)| RowDatasetVersionRun {
            span: U64Segment::Range(span),
            version,
        })
        .collect();
    let seq = RowDatasetVersionSequence { runs };

    // Position 6 lies one past the last row.
    let cases = [
        (0, Some(1)),
        (2, Some(1)),
        (3, Some(2)),
        (4, Some(2)),
        (5, Some(3)),
        (6, None),
    ];
    for (pos, want) in cases {
        assert_eq!(seq.version_at(pos), want);
    }
}
|
||||
|
||||
#[test]
fn test_serialization_round_trip() {
    // Two runs: 4 rows at version 42 followed by 3 rows at version 99.
    let first = RowDatasetVersionRun {
        span: U64Segment::Range(0..4),
        version: 42,
    };
    let second = RowDatasetVersionRun {
        span: U64Segment::Range(0..3),
        version: 99,
    };
    let original = RowDatasetVersionSequence {
        runs: vec![first, second],
    };

    let encoded = write_dataset_versions(&original);
    let decoded = read_dataset_versions(&encoded).unwrap();

    assert_eq!(decoded.runs.len(), 2);
    assert_eq!(decoded.len(), 7);
    assert_eq!(decoded.version_at(0), Some(42));
    assert_eq!(decoded.version_at(5), Some(99));
}
|
||||
|
||||
#[test]
fn test_get_version_for_row_id() {
    // Two runs of two rows each, at versions 8 and 9.
    let runs = [8, 9]
        .into_iter()
        .map(|version| RowDatasetVersionRun {
            span: U64Segment::Range(0..2),
            version,
        })
        .collect();
    let seq = RowDatasetVersionSequence { runs };

    // row ids: 10,11,12,13
    let rows = RowIdSequence::from(10..14);

    // Row id 99 is not part of `rows`.
    let cases = [
        (10, Some(8)),
        (11, Some(8)),
        (12, Some(9)),
        (13, Some(9)),
        (99, None),
    ];
    for (row_id, want) in cases {
        assert_eq!(seq.get_version_for_row_id(&rows, row_id), want);
    }
}
|
||||
}
|
||||
47
vendor/lance-table/src/utils.rs
vendored
Normal file
47
vendor/lance-table/src/utils.rs
vendored
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
pub mod stream;
|
||||
|
||||
pub trait LanceIteratorExtension {
|
||||
fn exact_size(self, size: usize) -> ExactSize<Self>
|
||||
where
|
||||
Self: Sized;
|
||||
}
|
||||
|
||||
impl<I: Iterator> LanceIteratorExtension for I {
|
||||
fn exact_size(self, size: usize) -> ExactSize<Self>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
ExactSize { inner: self, size }
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator that is tagged with a known size. This is useful when we are
/// able to pre-compute the size of the iterator but the iterator implementation
/// isn't able to itself. A common example is when using `flatten()`.
///
/// The size is supplied by the caller and is not verified against the wrapped
/// iterator. If the wrapped iterator yields more items than promised, the
/// reported size stays at zero rather than underflowing.
///
/// This is inspired by discussion in <https://github.com/rust-lang/rust/issues/68995>
pub struct ExactSize<I> {
    /// The wrapped iterator that actually produces the items.
    inner: I,
    /// Number of items the caller promised are still to come.
    size: usize,
}

impl<I: Iterator> Iterator for ExactSize<I> {
    type Item = I::Item;

    /// Yields the next item of the wrapped iterator and counts it off the
    /// remaining size.
    fn next(&mut self) -> Option<Self::Item> {
        let item = self.inner.next()?;
        // Saturate: an under-estimated size must not panic (debug builds) or
        // wrap to `usize::MAX` (release builds) and poison `size_hint`.
        self.size = self.size.saturating_sub(1);
        Some(item)
    }

    /// Reports the tagged size as both the lower and the upper bound.
    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.size, Some(self.size))
    }
}
|
||||
806
vendor/lance-table/src/utils/stream.rs
vendored
Normal file
806
vendor/lance-table/src/utils/stream.rs
vendored
Normal file
|
|
@ -0,0 +1,806 @@
|
|||
// SPDX-License-Identifier: Apache-2.0
|
||||
// SPDX-FileCopyrightText: Copyright The Lance Authors
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_array::{BooleanArray, RecordBatch, RecordBatchOptions, UInt64Array, make_array};
|
||||
use arrow_buffer::NullBuffer;
|
||||
use futures::{
|
||||
FutureExt, Stream, StreamExt,
|
||||
future::BoxFuture,
|
||||
stream::{BoxStream, FuturesOrdered},
|
||||
};
|
||||
use lance_arrow::RecordBatchExt;
|
||||
use lance_core::{
|
||||
ROW_ADDR, ROW_ADDR_FIELD, ROW_CREATED_AT_VERSION_FIELD, ROW_ID, ROW_ID_FIELD,
|
||||
ROW_LAST_UPDATED_AT_VERSION_FIELD, Result,
|
||||
utils::{address::RowAddress, deletion::DeletionVector},
|
||||
};
|
||||
use lance_io::ReadBatchParams;
|
||||
use tracing::instrument;
|
||||
|
||||
use crate::rowids::RowIdSequence;
|
||||
|
||||
pub type ReadBatchFut = BoxFuture<'static, Result<RecordBatch>>;
|
||||
/// A task, emitted by a file reader, that will produce a batch (of the
|
||||
/// given size)
|
||||
pub struct ReadBatchTask {
|
||||
pub task: ReadBatchFut,
|
||||
pub num_rows: u32,
|
||||
}
|
||||
pub type ReadBatchTaskStream = BoxStream<'static, ReadBatchTask>;
|
||||
pub type ReadBatchFutStream = BoxStream<'static, ReadBatchFut>;
|
||||
|
||||
struct MergeStream {
|
||||
streams: Vec<ReadBatchTaskStream>,
|
||||
next_batch: FuturesOrdered<ReadBatchFut>,
|
||||
next_num_rows: u32,
|
||||
index: usize,
|
||||
}
|
||||
|
||||
impl MergeStream {
|
||||
fn emit(&mut self) -> ReadBatchTask {
|
||||
let mut iter = std::mem::take(&mut self.next_batch);
|
||||
let task = async move {
|
||||
let mut batch = iter.next().await.unwrap()?;
|
||||
while let Some(next) = iter.next().await {
|
||||
let next = next?;
|
||||
batch = batch.merge(&next)?;
|
||||
}
|
||||
Ok(batch)
|
||||
}
|
||||
.boxed();
|
||||
let num_rows = self.next_num_rows;
|
||||
self.next_num_rows = 0;
|
||||
ReadBatchTask { task, num_rows }
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for MergeStream {
|
||||
type Item = ReadBatchTask;
|
||||
|
||||
fn poll_next(
|
||||
mut self: std::pin::Pin<&mut Self>,
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> std::task::Poll<Option<Self::Item>> {
|
||||
loop {
|
||||
let index = self.index;
|
||||
match self.streams[index].poll_next_unpin(cx) {
|
||||
std::task::Poll::Ready(Some(batch_task)) => {
|
||||
if self.index == 0 {
|
||||
self.next_num_rows = batch_task.num_rows;
|
||||
} else {
|
||||
debug_assert_eq!(self.next_num_rows, batch_task.num_rows);
|
||||
}
|
||||
self.next_batch.push_back(batch_task.task);
|
||||
self.index += 1;
|
||||
if self.index == self.streams.len() {
|
||||
self.index = 0;
|
||||
let next_batch = self.emit();
|
||||
return std::task::Poll::Ready(Some(next_batch));
|
||||
}
|
||||
}
|
||||
std::task::Poll::Ready(None) => {
|
||||
return std::task::Poll::Ready(None);
|
||||
}
|
||||
std::task::Poll::Pending => {
|
||||
return std::task::Poll::Pending;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Given multiple streams of batch tasks, merge them into a single stream
|
||||
///
|
||||
/// This pulls one batch from each stream and then combines the columns from
|
||||
/// all of the batches into a single batch. The order of the batches in the
|
||||
/// streams is maintained and the merged batch columns will be in order from
|
||||
/// first to last stream.
|
||||
///
|
||||
/// This stream ends as soon as any of the input streams ends (we do not
|
||||
/// verify that the other input streams are finished as well)
|
||||
///
|
||||
/// This will panic if any of the input streams return a batch with a different
|
||||
/// number of rows than the first stream.
|
||||
pub fn merge_streams(streams: Vec<ReadBatchTaskStream>) -> ReadBatchTaskStream {
|
||||
MergeStream {
|
||||
streams,
|
||||
next_batch: FuturesOrdered::new(),
|
||||
next_num_rows: 0,
|
||||
index: 0,
|
||||
}
|
||||
.boxed()
|
||||
}
|
||||
|
||||
/// Apply a mask to the batch, where rows are "deleted" by the _rowid column null.
|
||||
///
|
||||
/// This is used partly as a performance optimization (cheaper to null than to filter)
|
||||
/// but also because there are cases where we want to load the physical rows. For example,
|
||||
/// we may be replacing a column based on some UDF and we want to provide a value for the
|
||||
/// deleted rows to ensure the fragments are aligned.
|
||||
fn apply_deletions_as_nulls(batch: RecordBatch, mask: &BooleanArray) -> Result<RecordBatch> {
|
||||
// Transform mask into null buffer. Null means deleted, though note that
|
||||
// null buffers are actually validity buffers, so True means not null
|
||||
// and thus not deleted.
|
||||
let mask_buffer = NullBuffer::new(mask.values().clone());
|
||||
|
||||
if mask_buffer.null_count() == 0 {
|
||||
// No rows are deleted
|
||||
return Ok(batch);
|
||||
}
|
||||
|
||||
// For each column convert to data
|
||||
let new_columns = batch
|
||||
.schema()
|
||||
.fields()
|
||||
.iter()
|
||||
.zip(batch.columns())
|
||||
.map(|(field, col)| {
|
||||
if field.name() == ROW_ID || field.name() == ROW_ADDR {
|
||||
let col_data = col.to_data();
|
||||
// If it already has a validity bitmap, then AND it with the mask.
|
||||
// Otherwise, use the boolean buffer as the mask.
|
||||
let null_buffer = NullBuffer::union(col_data.nulls(), Some(&mask_buffer));
|
||||
|
||||
Ok(col_data
|
||||
.into_builder()
|
||||
.null_bit_buffer(null_buffer.map(|b| b.buffer().clone()))
|
||||
.build()
|
||||
.map(make_array)?)
|
||||
} else {
|
||||
Ok(col.clone())
|
||||
}
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
Ok(RecordBatch::try_new_with_options(
|
||||
batch.schema(),
|
||||
new_columns,
|
||||
&RecordBatchOptions::new().with_row_count(Some(batch.num_rows())),
|
||||
)?)
|
||||
}
|
||||
|
||||
/// Extract version values for a batch selection by binary-searching over
|
||||
/// precomputed RLE run offsets. Single-run fragments (the common case)
|
||||
/// take the O(1) fast path.
|
||||
fn version_values_for_selection(
|
||||
sequence: &crate::rowids::version::RowDatasetVersionSequence,
|
||||
params: &ReadBatchParams,
|
||||
batch_offset: u32,
|
||||
num_rows: u32,
|
||||
) -> Result<Vec<u64>> {
|
||||
let selection = params
|
||||
.slice(batch_offset as usize, num_rows as usize)
|
||||
.unwrap()
|
||||
.to_ranges()
|
||||
.unwrap();
|
||||
|
||||
if sequence.runs.len() == 1 {
|
||||
return Ok(vec![sequence.runs[0].version(); num_rows as usize]);
|
||||
}
|
||||
|
||||
let mut versions = Vec::with_capacity(num_rows as usize);
|
||||
let run_offsets: Vec<usize> = sequence
|
||||
.runs
|
||||
.iter()
|
||||
.scan(0usize, |acc, run| {
|
||||
let start = *acc;
|
||||
*acc += run.len();
|
||||
Some(start)
|
||||
})
|
||||
.collect();
|
||||
let total_len: usize = sequence.runs.iter().map(|r| r.len()).sum();
|
||||
|
||||
for r in &selection {
|
||||
for pos in r.start..r.end {
|
||||
let pos = pos as usize;
|
||||
if pos >= total_len {
|
||||
return Err(lance_core::Error::internal(format!(
|
||||
"version column position {} out of range (total_len={})",
|
||||
pos, total_len
|
||||
)));
|
||||
}
|
||||
let run_idx = match run_offsets.binary_search(&pos) {
|
||||
Ok(idx) => idx,
|
||||
Err(idx) => idx - 1,
|
||||
};
|
||||
versions.push(sequence.runs[run_idx].version());
|
||||
}
|
||||
}
|
||||
Ok(versions)
|
||||
}
|
||||
|
||||
/// Configuration needed to apply row ids and deletions to a batch
|
||||
#[derive(Debug)]
|
||||
pub struct RowIdAndDeletesConfig {
|
||||
/// The row ids that were requested
|
||||
pub params: ReadBatchParams,
|
||||
/// Whether to include the row id column in the final batch
|
||||
pub with_row_id: bool,
|
||||
/// Whether to include the row address column in the final batch
|
||||
pub with_row_addr: bool,
|
||||
/// Whether to include the last updated at version column in the final batch
|
||||
pub with_row_last_updated_at_version: bool,
|
||||
/// Whether to include the created at version column in the final batch
|
||||
pub with_row_created_at_version: bool,
|
||||
/// An optional deletion vector to apply to the batch
|
||||
pub deletion_vector: Option<Arc<DeletionVector>>,
|
||||
/// An optional row id sequence to use for the row id column.
|
||||
pub row_id_sequence: Option<Arc<RowIdSequence>>,
|
||||
/// The last_updated_at version sequence
|
||||
pub last_updated_at_sequence: Option<Arc<crate::rowids::version::RowDatasetVersionSequence>>,
|
||||
/// The created_at version sequence
|
||||
pub created_at_sequence: Option<Arc<crate::rowids::version::RowDatasetVersionSequence>>,
|
||||
/// Whether to make deleted rows null instead of filtering them out
|
||||
pub make_deletions_null: bool,
|
||||
/// The total number of rows that will be loaded
|
||||
///
|
||||
/// This is needed to convert ReadbatchParams::RangeTo into a valid range
|
||||
pub total_num_rows: u32,
|
||||
}
|
||||
|
||||
impl RowIdAndDeletesConfig {
|
||||
fn has_system_cols(&self) -> bool {
|
||||
self.with_row_id
|
||||
|| self.with_row_addr
|
||||
|| self.with_row_last_updated_at_version
|
||||
|| self.with_row_created_at_version
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(level = "debug", skip_all)]
|
||||
pub fn apply_row_id_and_deletes(
|
||||
batch: RecordBatch,
|
||||
batch_offset: u32,
|
||||
fragment_id: u32,
|
||||
config: &RowIdAndDeletesConfig,
|
||||
) -> Result<RecordBatch> {
|
||||
let mut deletion_vector = config.deletion_vector.as_ref();
|
||||
// Convert Some(NoDeletions) into None to simplify logic below
|
||||
if let Some(deletion_vector_inner) = deletion_vector
|
||||
&& matches!(deletion_vector_inner.as_ref(), DeletionVector::NoDeletions)
|
||||
{
|
||||
deletion_vector = None;
|
||||
}
|
||||
let has_deletions = deletion_vector.is_some();
|
||||
debug_assert!(batch.num_columns() > 0 || config.has_system_cols() || has_deletions);
|
||||
|
||||
// If row id sequence is None, then row id IS row address.
|
||||
let should_fetch_row_addr = config.with_row_addr
|
||||
|| (config.with_row_id && config.row_id_sequence.is_none())
|
||||
|| has_deletions;
|
||||
|
||||
let num_rows = batch.num_rows() as u32;
|
||||
|
||||
let row_addrs =
|
||||
if should_fetch_row_addr {
|
||||
let _rowaddrs = tracing::span!(tracing::Level::DEBUG, "fetch_row_addrs").entered();
|
||||
let mut row_addrs = Vec::with_capacity(num_rows as usize);
|
||||
for offset_range in config
|
||||
.params
|
||||
.slice(batch_offset as usize, num_rows as usize)
|
||||
.unwrap()
|
||||
.iter_offset_ranges()?
|
||||
{
|
||||
row_addrs.extend(offset_range.map(|row_offset| {
|
||||
u64::from(RowAddress::new_from_parts(fragment_id, row_offset))
|
||||
}));
|
||||
}
|
||||
|
||||
Some(Arc::new(UInt64Array::from(row_addrs)))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let row_ids = if config.with_row_id {
|
||||
let _rowids = tracing::span!(tracing::Level::DEBUG, "fetch_row_ids").entered();
|
||||
if let Some(row_id_sequence) = &config.row_id_sequence {
|
||||
let selection = config
|
||||
.params
|
||||
.slice(batch_offset as usize, num_rows as usize)
|
||||
.unwrap()
|
||||
.to_ranges()
|
||||
.unwrap();
|
||||
let row_ids = row_id_sequence
|
||||
.select(
|
||||
selection
|
||||
.iter()
|
||||
.flat_map(|r| r.start as usize..r.end as usize),
|
||||
)
|
||||
.collect::<UInt64Array>();
|
||||
Some(Arc::new(row_ids))
|
||||
} else {
|
||||
// If we don't have a row id sequence, can assume the row ids are
|
||||
// the same as the row addresses.
|
||||
row_addrs.clone()
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let span = tracing::span!(tracing::Level::DEBUG, "apply_deletions");
|
||||
let _enter = span.enter();
|
||||
let deletion_mask = deletion_vector.and_then(|v| {
|
||||
let row_addrs: &[u64] = row_addrs.as_ref().unwrap().values();
|
||||
v.build_predicate(row_addrs.iter())
|
||||
});
|
||||
|
||||
let batch = if config.with_row_id {
|
||||
let row_id_arr = row_ids.unwrap();
|
||||
batch.try_with_column(ROW_ID_FIELD.clone(), row_id_arr)?
|
||||
} else {
|
||||
batch
|
||||
};
|
||||
|
||||
let batch = if config.with_row_addr {
|
||||
let row_addr_arr = row_addrs.unwrap();
|
||||
batch.try_with_column(ROW_ADDR_FIELD.clone(), row_addr_arr)?
|
||||
} else {
|
||||
batch
|
||||
};
|
||||
|
||||
// Add version columns if requested
|
||||
let batch = if config.with_row_last_updated_at_version || config.with_row_created_at_version {
|
||||
let mut batch = batch;
|
||||
|
||||
if config.with_row_last_updated_at_version {
|
||||
let version_arr = if let Some(sequence) = &config.last_updated_at_sequence {
|
||||
Arc::new(UInt64Array::from(version_values_for_selection(
|
||||
sequence,
|
||||
&config.params,
|
||||
batch_offset,
|
||||
num_rows,
|
||||
)?))
|
||||
} else {
|
||||
// Default to version 1 if sequence not provided
|
||||
Arc::new(UInt64Array::from(vec![1u64; num_rows as usize]))
|
||||
};
|
||||
batch =
|
||||
batch.try_with_column(ROW_LAST_UPDATED_AT_VERSION_FIELD.clone(), version_arr)?;
|
||||
}
|
||||
|
||||
if config.with_row_created_at_version {
|
||||
let version_arr = if let Some(sequence) = &config.created_at_sequence {
|
||||
Arc::new(UInt64Array::from(version_values_for_selection(
|
||||
sequence,
|
||||
&config.params,
|
||||
batch_offset,
|
||||
num_rows,
|
||||
)?))
|
||||
} else {
|
||||
// Default to version 1 if sequence not provided
|
||||
Arc::new(UInt64Array::from(vec![1u64; num_rows as usize]))
|
||||
};
|
||||
batch = batch.try_with_column(ROW_CREATED_AT_VERSION_FIELD.clone(), version_arr)?;
|
||||
}
|
||||
|
||||
batch
|
||||
} else {
|
||||
batch
|
||||
};
|
||||
|
||||
match (deletion_mask, config.make_deletions_null) {
|
||||
(None, _) => Ok(batch),
|
||||
(Some(mask), false) => Ok(arrow::compute::filter_record_batch(&batch, &mask)?),
|
||||
(Some(mask), true) => Ok(apply_deletions_as_nulls(batch, &mask)?),
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a stream of batch tasks this function will add a row ids column (if requested)
|
||||
/// and also apply a deletions vector to the batch.
|
||||
///
|
||||
/// This converts from BatchTaskStream to BatchFutStream because, if we are applying a
|
||||
/// deletion vector, it is impossible to know how many output rows we will have.
|
||||
pub fn wrap_with_row_id_and_delete(
|
||||
stream: ReadBatchTaskStream,
|
||||
fragment_id: u32,
|
||||
config: RowIdAndDeletesConfig,
|
||||
) -> ReadBatchFutStream {
|
||||
let config = Arc::new(config);
|
||||
let mut offset = 0;
|
||||
stream
|
||||
.map(move |batch_task| {
|
||||
let config = config.clone();
|
||||
let this_offset = offset;
|
||||
let num_rows = batch_task.num_rows;
|
||||
offset += num_rows;
|
||||
batch_task
|
||||
.task
|
||||
.map(move |batch| {
|
||||
apply_row_id_and_deletes(batch?, this_offset, fragment_id, config.as_ref())
|
||||
})
|
||||
.boxed()
|
||||
})
|
||||
.boxed()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow::{array::AsArray, datatypes::UInt64Type};
|
||||
use arrow_array::{RecordBatch, UInt32Array, types::Int32Type};
|
||||
use arrow_schema::ArrowError;
|
||||
use futures::{FutureExt, StreamExt, TryStreamExt, stream::BoxStream};
|
||||
use lance_core::{
|
||||
ROW_ID,
|
||||
utils::{address::RowAddress, deletion::DeletionVector},
|
||||
};
|
||||
use lance_datagen::{BatchCount, RowCount};
|
||||
use lance_io::{ReadBatchParams, stream::arrow_stream_to_lance_stream};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::utils::stream::ReadBatchTask;
|
||||
|
||||
use super::RowIdAndDeletesConfig;
|
||||
|
||||
fn batch_task_stream(
|
||||
datagen_stream: BoxStream<'static, std::result::Result<RecordBatch, ArrowError>>,
|
||||
) -> super::ReadBatchTaskStream {
|
||||
arrow_stream_to_lance_stream(datagen_stream)
|
||||
.map(|batch| ReadBatchTask {
|
||||
num_rows: batch.as_ref().unwrap().num_rows() as u32,
|
||||
task: std::future::ready(batch).boxed(),
|
||||
})
|
||||
.boxed()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_basic_zip() {
|
||||
let left = batch_task_stream(
|
||||
lance_datagen::gen_batch()
|
||||
.col("x", lance_datagen::array::step::<Int32Type>())
|
||||
.into_reader_stream(RowCount::from(100), BatchCount::from(10))
|
||||
.0,
|
||||
);
|
||||
let right = batch_task_stream(
|
||||
lance_datagen::gen_batch()
|
||||
.col("y", lance_datagen::array::step::<Int32Type>())
|
||||
.into_reader_stream(RowCount::from(100), BatchCount::from(10))
|
||||
.0,
|
||||
);
|
||||
|
||||
let merged = super::merge_streams(vec![left, right])
|
||||
.map(|batch_task| batch_task.task)
|
||||
.buffered(1)
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let expected = lance_datagen::gen_batch()
|
||||
.col("x", lance_datagen::array::step::<Int32Type>())
|
||||
.col("y", lance_datagen::array::step::<Int32Type>())
|
||||
.into_reader_rows(RowCount::from(100), BatchCount::from(10))
|
||||
.collect::<Result<Vec<_>, ArrowError>>()
|
||||
.unwrap();
|
||||
assert_eq!(merged, expected);
|
||||
}
|
||||
|
||||
async fn check_row_id(params: ReadBatchParams, expected: impl IntoIterator<Item = u32>) {
|
||||
let expected = Vec::from_iter(expected);
|
||||
|
||||
for has_columns in [false, true] {
|
||||
for fragment_id in [0, 10] {
|
||||
// 100 rows across 10 batches of 10 rows
|
||||
let mut datagen = lance_datagen::gen_batch();
|
||||
if has_columns {
|
||||
datagen = datagen.col("x", lance_datagen::array::rand::<Int32Type>());
|
||||
}
|
||||
let data = batch_task_stream(
|
||||
datagen
|
||||
.into_reader_stream(RowCount::from(10), BatchCount::from(10))
|
||||
.0,
|
||||
);
|
||||
|
||||
let config = RowIdAndDeletesConfig {
|
||||
params: params.clone(),
|
||||
with_row_id: true,
|
||||
with_row_addr: false,
|
||||
with_row_last_updated_at_version: false,
|
||||
with_row_created_at_version: false,
|
||||
deletion_vector: None,
|
||||
row_id_sequence: None,
|
||||
last_updated_at_sequence: None,
|
||||
created_at_sequence: None,
|
||||
make_deletions_null: false,
|
||||
total_num_rows: 100,
|
||||
};
|
||||
let stream = super::wrap_with_row_id_and_delete(data, fragment_id, config);
|
||||
let batches = stream.buffered(1).try_collect::<Vec<_>>().await.unwrap();
|
||||
|
||||
let mut offset = 0;
|
||||
let expected = expected.clone();
|
||||
for batch in batches {
|
||||
let actual_row_ids =
|
||||
batch[ROW_ID].as_primitive::<UInt64Type>().values().to_vec();
|
||||
let expected_row_ids = expected[offset..offset + 10]
|
||||
.iter()
|
||||
.map(|row_offset| {
|
||||
RowAddress::new_from_parts(fragment_id, *row_offset).into()
|
||||
})
|
||||
.collect::<Vec<u64>>();
|
||||
assert_eq!(actual_row_ids, expected_row_ids);
|
||||
offset += batch.num_rows();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_row_id() {
|
||||
let some_indices = (0..100).rev().collect::<Vec<u32>>();
|
||||
let some_indices_arr = UInt32Array::from(some_indices.clone());
|
||||
check_row_id(ReadBatchParams::RangeFull, 0..100).await;
|
||||
check_row_id(ReadBatchParams::Indices(some_indices_arr), some_indices).await;
|
||||
check_row_id(ReadBatchParams::Range(1000..1100), 1000..1100).await;
|
||||
check_row_id(
|
||||
ReadBatchParams::RangeFrom(std::ops::RangeFrom { start: 1000 }),
|
||||
1000..1100,
|
||||
)
|
||||
.await;
|
||||
check_row_id(
|
||||
ReadBatchParams::RangeTo(std::ops::RangeTo { end: 1000 }),
|
||||
0..100,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_deletes() {
|
||||
let no_deletes: Option<Arc<DeletionVector>> = None;
|
||||
let no_deletes_2 = Some(Arc::new(DeletionVector::NoDeletions));
|
||||
let delete_some_bitmap = Some(Arc::new(DeletionVector::Bitmap(RoaringBitmap::from_iter(
|
||||
0..35,
|
||||
))));
|
||||
let delete_some_set = Some(Arc::new(DeletionVector::Set((0..35).collect())));
|
||||
|
||||
for deletion_vector in [
|
||||
no_deletes,
|
||||
no_deletes_2,
|
||||
delete_some_bitmap,
|
||||
delete_some_set,
|
||||
] {
|
||||
for has_columns in [false, true] {
|
||||
for with_row_id in [false, true] {
|
||||
for make_deletions_null in [false, true] {
|
||||
for frag_id in [0, 1] {
|
||||
let has_deletions = if let Some(dv) = &deletion_vector {
|
||||
!matches!(dv.as_ref(), DeletionVector::NoDeletions)
|
||||
} else {
|
||||
false
|
||||
};
|
||||
if !has_columns && !has_deletions && !with_row_id {
|
||||
// This is an invalid case and should be prevented upstream,
|
||||
// no meaningful work is being done!
|
||||
continue;
|
||||
}
|
||||
if make_deletions_null && !with_row_id {
|
||||
// This is an invalid case and should be prevented upstream
|
||||
// we cannot make the row_id column null if it isn't present
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut datagen = lance_datagen::gen_batch();
|
||||
if has_columns {
|
||||
datagen =
|
||||
datagen.col("x", lance_datagen::array::rand::<Int32Type>());
|
||||
}
|
||||
// 100 rows across 10 batches of 10 rows
|
||||
let data = batch_task_stream(
|
||||
datagen
|
||||
.into_reader_stream(RowCount::from(10), BatchCount::from(10))
|
||||
.0,
|
||||
);
|
||||
|
||||
let config = RowIdAndDeletesConfig {
|
||||
params: ReadBatchParams::RangeFull,
|
||||
with_row_id,
|
||||
with_row_addr: false,
|
||||
with_row_last_updated_at_version: false,
|
||||
with_row_created_at_version: false,
|
||||
deletion_vector: deletion_vector.clone(),
|
||||
row_id_sequence: None,
|
||||
last_updated_at_sequence: None,
|
||||
created_at_sequence: None,
|
||||
make_deletions_null,
|
||||
total_num_rows: 100,
|
||||
};
|
||||
let stream = super::wrap_with_row_id_and_delete(data, frag_id, config);
|
||||
let batches = stream
|
||||
.buffered(1)
|
||||
.filter_map(|batch| {
|
||||
std::future::ready(
|
||||
batch
|
||||
.map(|batch| {
|
||||
if batch.num_rows() == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(batch)
|
||||
}
|
||||
})
|
||||
.transpose(),
|
||||
)
|
||||
})
|
||||
.try_collect::<Vec<_>>()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let total_num_rows =
|
||||
batches.iter().map(|b| b.num_rows()).sum::<usize>();
|
||||
let total_num_nulls = if make_deletions_null {
|
||||
batches
|
||||
.iter()
|
||||
.map(|b| b[ROW_ID].null_count())
|
||||
.sum::<usize>()
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let total_actually_deleted = total_num_nulls + (100 - total_num_rows);
|
||||
|
||||
let expected_deletions = match &deletion_vector {
|
||||
None => 0,
|
||||
Some(deletion_vector) => match deletion_vector.as_ref() {
|
||||
DeletionVector::NoDeletions => 0,
|
||||
DeletionVector::Bitmap(b) => b.len() as usize,
|
||||
DeletionVector::Set(s) => s.len(),
|
||||
},
|
||||
};
|
||||
assert_eq!(total_actually_deleted, expected_deletions);
|
||||
if expected_deletions > 0 && with_row_id {
|
||||
if make_deletions_null {
|
||||
// If we make deletions null we get 3 batches of all-null and then
|
||||
// a batch of half-null
|
||||
assert_eq!(
|
||||
batches[3][ROW_ID].as_primitive::<UInt64Type>().value(0),
|
||||
u64::from(RowAddress::new_from_parts(frag_id, 30))
|
||||
);
|
||||
assert_eq!(batches[3][ROW_ID].null_count(), 5);
|
||||
} else {
|
||||
// If we materialize deletions the first row will be 35
|
||||
assert_eq!(
|
||||
batches[0][ROW_ID].as_primitive::<UInt64Type>().value(0),
|
||||
u64::from(RowAddress::new_from_parts(frag_id, 35))
|
||||
);
|
||||
}
|
||||
}
|
||||
if !with_row_id {
|
||||
assert!(batches[0].column_by_name(ROW_ID).is_none());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_version_column_with_deletions() {
|
||||
use crate::rowids::segment::U64Segment;
|
||||
use crate::rowids::version::{RowDatasetVersionRun, RowDatasetVersionSequence};
|
||||
|
||||
let seq = Arc::new(RowDatasetVersionSequence {
|
||||
runs: vec![RowDatasetVersionRun {
|
||||
span: U64Segment::Range(0..100),
|
||||
version: 42,
|
||||
}],
|
||||
});
|
||||
|
||||
let data = batch_task_stream(
|
||||
lance_datagen::gen_batch()
|
||||
.col("x", lance_datagen::array::rand::<Int32Type>())
|
||||
.into_reader_stream(RowCount::from(10), BatchCount::from(10))
|
||||
.0,
|
||||
);
|
||||
|
||||
let config = RowIdAndDeletesConfig {
|
||||
params: ReadBatchParams::RangeFull,
|
||||
with_row_id: true,
|
||||
with_row_addr: false,
|
||||
with_row_last_updated_at_version: false,
|
||||
with_row_created_at_version: true,
|
||||
deletion_vector: Some(Arc::new(DeletionVector::Bitmap(RoaringBitmap::from_iter(
|
||||
0..35,
|
||||
)))),
|
||||
row_id_sequence: None,
|
||||
last_updated_at_sequence: None,
|
||||
created_at_sequence: Some(seq),
|
||||
make_deletions_null: false,
|
||||
total_num_rows: 100,
|
||||
};
|
||||
let stream = super::wrap_with_row_id_and_delete(data, 0, config);
|
||||
let batches: Vec<_> = stream
|
||||
.buffered(1)
|
||||
.try_filter(|b| std::future::ready(b.num_rows() > 0))
|
||||
.try_collect()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
|
||||
assert_eq!(total_rows, 65);
|
||||
|
||||
for batch in &batches {
|
||||
let versions = batch
|
||||
.column_by_name("_row_created_at_version")
|
||||
.unwrap()
|
||||
.as_primitive::<UInt64Type>()
|
||||
.values();
|
||||
assert!(versions.iter().all(|&v| v == 42));
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_version_column_multi_run() {
|
||||
use crate::rowids::segment::U64Segment;
|
||||
use crate::rowids::version::{RowDatasetVersionRun, RowDatasetVersionSequence};
|
||||
|
||||
// 3 runs: 0..40 v1, 40..70 v2, 70..100 v3
|
||||
let seq = Arc::new(RowDatasetVersionSequence {
|
||||
runs: vec![
|
||||
RowDatasetVersionRun {
|
||||
span: U64Segment::Range(0..40),
|
||||
version: 1,
|
||||
},
|
||||
RowDatasetVersionRun {
|
||||
span: U64Segment::Range(40..70),
|
||||
version: 2,
|
||||
},
|
||||
RowDatasetVersionRun {
|
||||
span: U64Segment::Range(70..100),
|
||||
version: 3,
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
// Delete 0..20 and 60..80 (spans run boundary).
|
||||
// Survivors: 20..40 (v1), 40..60 (v2), 80..100 (v3) = 60 rows
|
||||
let mut deletions = RoaringBitmap::from_iter(0..20);
|
||||
deletions.extend(60..80);
|
||||
|
||||
let data = batch_task_stream(
|
||||
lance_datagen::gen_batch()
|
||||
.col("x", lance_datagen::array::rand::<Int32Type>())
|
||||
.into_reader_stream(RowCount::from(10), BatchCount::from(10))
|
||||
.0,
|
||||
);
|
||||
|
||||
let config = RowIdAndDeletesConfig {
|
||||
params: ReadBatchParams::RangeFull,
|
||||
with_row_id: true,
|
||||
with_row_addr: false,
|
||||
with_row_last_updated_at_version: false,
|
||||
with_row_created_at_version: true,
|
||||
deletion_vector: Some(Arc::new(DeletionVector::Bitmap(deletions))),
|
||||
row_id_sequence: None,
|
||||
last_updated_at_sequence: None,
|
||||
created_at_sequence: Some(seq),
|
||||
make_deletions_null: false,
|
||||
total_num_rows: 100,
|
||||
};
|
||||
let stream = super::wrap_with_row_id_and_delete(data, 0, config);
|
||||
let batches: Vec<_> = stream
|
||||
.buffered(1)
|
||||
.try_filter(|b| std::future::ready(b.num_rows() > 0))
|
||||
.try_collect()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
|
||||
assert_eq!(total_rows, 60);
|
||||
|
||||
let all_versions: Vec<u64> = batches
|
||||
.iter()
|
||||
.flat_map(|b| {
|
||||
b.column_by_name("_row_created_at_version")
|
||||
.unwrap()
|
||||
.as_primitive::<UInt64Type>()
|
||||
.values()
|
||||
.to_vec()
|
||||
})
|
||||
.collect();
|
||||
|
||||
assert!(all_versions[..20].iter().all(|&v| v == 1));
|
||||
assert!(all_versions[20..40].iter().all(|&v| v == 2));
|
||||
assert!(all_versions[40..60].iter().all(|&v| v == 3));
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue