fix(deps): vendor lance-table 7.0.0 + lance#7480 so merge-updated tables survive filtered reads after deletes

iss-merge-rowid-overlap-corrupts-filtered-reads / lance#7444: an update-style merge_insert over a merge-written fragment legally reuses the updated rows' stable row ids (row-id-lineage spec: updates preserve _rowid) while the superseded fragment keeps its full sequence plus a deletion vector. A later delete leaves the overlapping id range sparsely tiled, and lance-table 7.0.0's RowIdIndex::new asserted dense tiling — failing every filtered read that builds the id→address map ("Wrong range" debug assert; "all columns in a record batch must have the same length" or a silently-wrong batch in release). The upstream fix (lance#7480, merged 2026-07-01) landed hours AFTER v8.0.0 was cut, so no release ≤ 8.0.0 carries it. Consume it now as a vendored pin: vendor/lance-table is the pristine published 7.0.0 source plus ONLY the #7480 rowids/index.rs hunk (drop the false tiling assert; hard-error on the true invariant — one live id claimed by two fragments) and upstream's regression unit test, wired via [patch.crates-io]. The fix is read-side only, so already-written graphs become readable as-is — no data repair. Removal condition (see vendor/lance-table/README.omnigraph.md): drop the vendor dir + patch entry at the first Lance bump whose lance-table ships lance#7480 (9.0.0, or a backported 8.0.1). The surface guard filtered_scan_tolerates_merge_update_row_id_overlap keeps that honest in both directions. Turns the previous commit's red tests green. Full workspace gate passes (cargo test --workspace --locked --no-fail-fast, 68 suites).
2026-07-03 02:51:04 +02:00 · 2026-07-02 02:17:25 +03:00 · 2026-07-02 02:17:25 +03:00 · b5c0c6238b
commit b5c0c6238b
parent 3b564534a2
48 changed files with 22203 additions and 2 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -4202,8 +4202,6 @@ dependencies = [
 [[package]]
 name = "lance-table"
 version = "7.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b16f1355904aea4ebb04ffc70c58c97901e10bde44452b4b021de4a1f329250d"
 dependencies = [
 "arrow",
 "arrow-array",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,5 +1,8 @@
 [workspace]
 resolver = "2"
+# The vendored patched crate is a [patch.crates-io] path source, not a
+# workspace member (see the patch section at the bottom of this file).
+exclude = ["vendor/lance-table"]
 members = [
    "crates/omnigraph-compiler",
    "crates/omnigraph",
@ -86,3 +89,14 @@ opt-level = 2
 lto = "thin"
 codegen-units = 16
 strip = true
+
+# Vendored lance-table 7.0.0 carrying ONLY the lance#7480 hunk (rowids/index.rs):
+# tolerate sparse overlapping stable-row-id chunks so filtered reads survive an
+# update-style merge_insert followed by a delete (lance#7444;
+# iss-merge-rowid-overlap-corrupts-filtered-reads). Pinned by
+# lance_surface_guards.rs::filtered_scan_tolerates_merge_update_row_id_overlap.
+# REMOVE vendor/lance-table + this patch at the first Lance bump whose
+# lance-table ships lance#7480 (9.0.0, or a backported 8.0.1). Details:
+# vendor/lance-table/README.omnigraph.md and docs/dev/lance.md.
+[patch.crates-io]
+lance-table = { path = "vendor/lance-table" }
--- a/vendor/lance-table/.cargo_vcs_info.json
+++ b/vendor/lance-table/.cargo_vcs_info.json
@ -0,0 +1,6 @@
+{
+  "git": {
+    "sha1": "a15ae30939b9242d74b00aed1fb83abf7d15bf7f"
+  },
+  "path_in_vcs": "rust/lance-table"
+}
--- a/vendor/lance-table/Cargo.lock
+++ b/vendor/lance-table/Cargo.lock
--- a/vendor/lance-table/Cargo.toml
+++ b/vendor/lance-table/Cargo.toml
@ -0,0 +1,263 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2024"
+rust-version = "1.91.0"
+name = "lance-table"
+version = "7.0.0"
+authors = ["Lance Devs <dev@lance.org>"]
+build = "build.rs"
+autolib = false
+autobins = false
+autoexamples = false
+autotests = false
+autobenches = false
+description = "Utilities for the Lance table format"
+readme = "README.md"
+keywords = [
+    "data-format",
+    "data-science",
+    "machine-learning",
+    "apache-arrow",
+    "data-analytics",
+]
+categories = [
+    "database-implementations",
+    "data-structures",
+    "development-tools",
+    "science",
+]
+license = "Apache-2.0"
+repository = "https://github.com/lance-format/lance"
+
+[package.metadata.docs.rs]
+features = ["protoc"]
+
+[features]
+dynamodb = [
+    "dep:aws-sdk-dynamodb",
+    "dep:aws-credential-types",
+    "lance-io/aws",
+]
+protoc = ["dep:protobuf-src"]
+
+[lib]
+name = "lance_table"
+path = "src/lib.rs"
+
+[[bench]]
+name = "manifest_intern"
+path = "benches/manifest_intern.rs"
+harness = false
+
+[[bench]]
+name = "row_id_index"
+path = "benches/row_id_index.rs"
+harness = false
+
+[dependencies.arrow]
+version = "58.0.0"
+features = ["prettyprint"]
+
+[dependencies.arrow-array]
+version = "58.0.0"
+
+[dependencies.arrow-buffer]
+version = "58.0.0"
+
+[dependencies.arrow-ipc]
+version = "58.0.0"
+features = ["zstd"]
+
+[dependencies.arrow-schema]
+version = "58.0.0"
+
+[dependencies.async-trait]
+version = "0.1"
+
+[dependencies.aws-credential-types]
+version = "1.2.0"
+optional = true
+
+[dependencies.aws-sdk-dynamodb]
+version = "1.38.0"
+features = [
+    "default-https-client",
+    "rt-tokio",
+]
+optional = true
+default-features = false
+
+[dependencies.byteorder]
+version = "1.5"
+
+[dependencies.bytes]
+version = "1.11.1"
+
+[dependencies.chrono]
+version = "0.4.41"
+features = [
+    "std",
+    "now",
+    "serde",
+]
+default-features = false
+
+[dependencies.deepsize]
+version = "0.2.0"
+
+[dependencies.futures]
+version = "0.3"
+
+[dependencies.lance-arrow]
+version = "=7.0.0"
+
+[dependencies.lance-core]
+version = "=7.0.0"
+
+[dependencies.lance-file]
+version = "=7.0.0"
+
+[dependencies.lance-io]
+version = "=7.0.0"
+default-features = false
+
+[dependencies.log]
+version = "0.4"
+
+[dependencies.object_store]
+version = "0.13.2"
+
+[dependencies.prost]
+version = "0.14.1"
+
+[dependencies.prost-types]
+version = "0.14.1"
+
+[dependencies.rand]
+version = "0.9.1"
+features = ["small_rng"]
+
+[dependencies.rangemap]
+version = "1.0"
+
+[dependencies.roaring]
+version = "0.11"
+
+[dependencies.semver]
+version = "1.0"
+
+[dependencies.serde]
+version = "^1"
+
+[dependencies.serde_json]
+version = "1"
+
+[dependencies.snafu]
+version = "0.9"
+
+[dependencies.tokio]
+version = "1.23"
+features = [
+    "rt-multi-thread",
+    "macros",
+    "fs",
+    "sync",
+]
+
+[dependencies.tracing]
+version = "0.1"
+
+[dependencies.url]
+version = "2.5.7"
+
+[dependencies.uuid]
+version = "1.2"
+features = [
+    "v4",
+    "serde",
+]
+
+[dev-dependencies.arrow-schema]
+version = "58.0.0"
+
+[dev-dependencies.criterion]
+version = "0.5"
+features = [
+    "async",
+    "async_tokio",
+    "html_reports",
+]
+
+[dev-dependencies.lance-datagen]
+version = "=7.0.0"
+
+[dev-dependencies.pretty_assertions]
+version = "1.4.0"
+
+[dev-dependencies.proptest]
+version = "1.3.1"
+
+[dev-dependencies.rstest]
+version = "0.23.0"
+
+[build-dependencies.prost-build]
+version = "0.14.1"
+
+[build-dependencies.protobuf-src]
+version = "2.1"
+optional = true
+
+[target.'cfg(target_os = "linux")'.dev-dependencies.pprof]
+version = "0.14.0"
+features = [
+    "flamegraph",
+    "criterion",
+]
+
+[lints.clippy]
+dbg_macro = "deny"
+disallowed_macros = "deny"
+fallible_impl_from = "deny"
+large_futures = "deny"
+manual_let_else = "deny"
+multiple-crate-versions = "allow"
+print_stderr = "deny"
+print_stdout = "deny"
+redundant_clone = "deny"
+redundant_pub_crate = "deny"
+single_range_in_vec_init = "allow"
+string_add = "deny"
+string_add_assign = "deny"
+string_lit_as_bytes = "deny"
+trait_duplication_in_bounds = "deny"
+use_self = "deny"
+
+[lints.clippy.all]
+level = "deny"
+priority = -1
+
+[lints.clippy.cargo]
+level = "deny"
+priority = -1
+
+[lints.clippy.style]
+level = "deny"
+priority = -1
+
+[lints.rust]
+unsafe_op_in_unsafe_fn = "allow"
+
+[lints.rust.unexpected_cfgs]
+level = "warn"
+priority = 0
+check-cfg = ["cfg(coverage,coverage_nightly)"]
--- a/vendor/lance-table/Cargo.toml.orig
+++ b/vendor/lance-table/Cargo.toml.orig
@ -0,0 +1,80 @@
+[package]
+name = "lance-table"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+license.workspace = true
+repository.workspace = true
+readme = "README.md"
+description = "Utilities for the Lance table format"
+keywords.workspace = true
+categories.workspace = true
+rust-version.workspace = true
+
+[dependencies]
+lance-arrow.workspace = true
+lance-core.workspace = true
+lance-file.workspace = true
+lance-io.workspace = true
+arrow.workspace = true
+arrow-array.workspace = true
+arrow-buffer.workspace = true
+arrow-ipc.workspace = true
+arrow-schema.workspace = true
+async-trait.workspace = true
+aws-credential-types = { workspace = true, optional = true }
+aws-sdk-dynamodb = { workspace = true, optional = true, default-features = false, features = ["default-https-client", "rt-tokio"] }
+byteorder.workspace = true
+bytes.workspace = true
+chrono.workspace = true
+deepsize.workspace = true
+futures.workspace = true
+log.workspace = true
+object_store.workspace = true
+prost.workspace = true
+prost-types.workspace = true
+rand.workspace = true
+rangemap.workspace = true
+roaring.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+semver.workspace = true
+snafu.workspace = true
+tokio.workspace = true
+tracing.workspace = true
+url.workspace = true
+uuid.workspace = true
+
+[dev-dependencies]
+lance-datagen.workspace = true
+arrow-schema.workspace = true
+criterion.workspace = true
+pretty_assertions.workspace = true
+proptest.workspace = true
+rstest.workspace = true
+
+[target.'cfg(target_os = "linux")'.dev-dependencies]
+pprof = { workspace = true }
+
+[build-dependencies]
+prost-build.workspace = true
+protobuf-src = { version = "2.1", optional = true }
+
+[features]
+dynamodb = ["dep:aws-sdk-dynamodb", "dep:aws-credential-types", "lance-io/aws"]
+protoc = ["dep:protobuf-src"]
+
+[package.metadata.docs.rs]
+# docs.rs uses an older version of Ubuntu that does not have the necessary protoc version
+features = ["protoc"]
+
+[[bench]]
+name = "row_id_index"
+harness = false
+
+[[bench]]
+name = "manifest_intern"
+harness = false
+
+[lints]
+workspace = true
--- a/vendor/lance-table/README.md
+++ b/vendor/lance-table/README.md
@ -0,0 +1,6 @@
+# lance-table
+
+`lance-table` is an internal sub-crate for the
+[Lance table format](https://lance.org/format/table/).
+
+**Important Note**: This crate is **not intended for external usage**.
--- a/vendor/lance-table/README.omnigraph.md
+++ b/vendor/lance-table/README.omnigraph.md
@ -0,0 +1,42 @@
+# Vendored `lance-table` 7.0.0 + lance#7480 (omnigraph patch pin)
+
+This directory is the **pristine `lance-table` 7.0.0 crates.io source** (unpacked
+from the published `.crate`) carrying exactly one upstream fix, cherry-picked
+from [lance-format/lance#7480](https://github.com/lance-format/lance/pull/7480)
+(merged to Lance main 2026-07-01; not included in any release ≤ 8.0.0):
+
+- `src/rowids/index.rs` — `RowIdIndex::new` no longer asserts that overlapping
+  row-id chunks densely tile their range (an update-style `merge_insert`
+  legally reuses the updated rows' stable ids in new fragments while the
+  superseded fragment keeps its full sequence + a deletion vector; a later
+  delete leaves the union short of the span). The real invariant — the same
+  live id claimed by two fragments — is now a hard error in
+  `merge_overlapping_chunks` instead. Upstream's regression unit test is
+  included.
+
+Without the fix, any filtered read that builds the row-id index on such a
+table fails. Debug builds trip the "Wrong range" debug assert at
+`rowids/index.rs:50`; release builds fail with "all columns in a record batch
+must have the same length" or return a silently-wrong batch. Bug:
+[lance#7444](https://github.com/lance-format/lance/issues/7444),
+tracked as `iss-merge-rowid-overlap-corrupts-filtered-reads` /
+`blk-lance-7444` on the dev graph.
+
+Wired up via `[patch.crates-io] lance-table = { path = "vendor/lance-table" }`
+in the workspace root `Cargo.toml`.
+
+## Removal condition
+
+Delete this directory and the `[patch.crates-io]` entry at the **first Lance
+bump whose `lance-table` ships lance#7480** — 9.0.0, or a backported 8.0.1 if
+upstream cuts one. The runtime guard
+`crates/omnigraph/tests/lance_surface_guards.rs::filtered_scan_tolerates_merge_update_row_id_overlap`
+pins the fixed behavior: it goes red if the patch is dropped too early or a
+future bump regresses the fix.
+
+## Verifying the delta
+
+```bash
+# The full diff vs the published crate should be ONLY the #7480 hunk + this README:
+tar -xzf ~/.cargo/registry/cache/index.crates.io-*/lance-table-7.0.0.crate -C /tmp
+diff -ru /tmp/lance-table-7.0.0 vendor/lance-table
+```
--- a/vendor/lance-table/benches/manifest_intern.rs
+++ b/vendor/lance-table/benches/manifest_intern.rs
@ -0,0 +1,261 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+// Benchmarks use eprintln! to report memory stats alongside criterion output.
+#![allow(clippy::print_stderr)]
+
+//! Benchmark for manifest fragment interning.
+//!
+//! Measures memory savings and deserialization throughput when interning
+//! `DataFile.fields`, `DataFile.column_indices`, and
+//! `RowDatasetVersionMeta::Inline` bytes across many fragments.
+
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use deepsize::DeepSizeOf;
+use prost::Message;
+
+use lance_table::format::pb;
+use lance_table::format::{DataFileFieldInterner, Fragment};
+
+/// Number of fragments to generate for the benchmarks.
+///
+/// Read from the `BENCH_NUM_FRAGMENTS` environment variable; falls back to
+/// 100,000 when the variable cannot be read. Panics if the variable is set but
+/// does not parse as a `u64`.
+fn num_fragments() -> u64 {
+    match std::env::var("BENCH_NUM_FRAGMENTS") {
+        Ok(raw) => raw.parse().unwrap(),
+        Err(_) => 100_000,
+    }
+}
+
+/// Produce `n` protobuf `DataFragment` messages modelling a homogeneous,
+/// post-compaction table.
+///
+/// Every fragment carries a single data file with the same `num_fields` field
+/// ids and column indices and the same inline created-at / last-updated-at
+/// version payload; only the fragment id and the file path differ.
+fn make_uniform_pb_fragments(n: u64, num_fields: usize) -> Vec<pb::DataFragment> {
+    let field_ids: Vec<i32> = (0..num_fields as i32).collect();
+    let col_indices: Vec<i32> = field_ids.clone();
+
+    // One small encoded version sequence, shared by every fragment: a single
+    // run whose span is the range 0..1000, at version 42.
+    let span = pb::U64Segment {
+        segment: Some(pb::u64_segment::Segment::Range(pb::u64_segment::Range {
+            start: 0,
+            end: 1000,
+        })),
+    };
+    let sequence = pb::RowDatasetVersionSequence {
+        runs: vec![pb::RowDatasetVersionRun {
+            span: Some(span),
+            version: 42,
+        }],
+    };
+    let version_bytes: Vec<u8> = sequence.encode_to_vec();
+
+    let mut fragments = Vec::with_capacity(n as usize);
+    for frag_id in 0..n {
+        let data_file = pb::DataFile {
+            path: format!("data/{frag_id}.lance"),
+            fields: field_ids.clone(),
+            column_indices: col_indices.clone(),
+            file_major_version: 2,
+            file_minor_version: 0,
+            file_size_bytes: 0,
+            base_id: None,
+        };
+        fragments.push(pb::DataFragment {
+            id: frag_id,
+            files: vec![data_file],
+            deletion_file: None,
+            row_id_sequence: None,
+            physical_rows: 1000,
+            last_updated_at_version_sequence: Some(
+                pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(
+                    version_bytes.clone(),
+                ),
+            ),
+            created_at_version_sequence: Some(
+                pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(
+                    version_bytes.clone(),
+                ),
+            ),
+        });
+    }
+    fragments
+}
+
+/// Baseline conversion: turn each protobuf fragment into a `Fragment` via
+/// `Fragment::try_from`, with no interning. Panics if any conversion fails.
+fn deserialize_without_interning(protos: &[pb::DataFragment]) -> Vec<Fragment> {
+    let mut fragments = Vec::with_capacity(protos.len());
+    for proto in protos {
+        fragments.push(Fragment::try_from(proto.clone()).unwrap());
+    }
+    fragments
+}
+
+/// Interned conversion: run every protobuf fragment through one shared
+/// `DataFileFieldInterner`. Panics if any conversion fails.
+fn deserialize_with_interning(protos: &[pb::DataFragment]) -> Vec<Fragment> {
+    let mut interner = DataFileFieldInterner::default();
+    let mut fragments = Vec::with_capacity(protos.len());
+    for proto in protos {
+        fragments.push(interner.intern_fragment(proto.clone()).unwrap());
+    }
+    fragments
+}
+
+/// Build `n` protobuf fragments split into `unique_versions` contiguous
+/// groups, each group sharing one version-metadata payload, simulating many
+/// small appends without compaction.
+///
+/// The group size is `n / unique_versions`, clamped to at least 1 so that
+/// `n < unique_versions` (e.g. a small `BENCH_NUM_FRAGMENTS`) cannot cause a
+/// division by zero. Fragments past the last full group are folded into the
+/// final group.
+///
+/// # Panics
+///
+/// Panics if `unique_versions` is zero.
+fn make_diverse_pb_fragments(
+    n: u64,
+    num_fields: usize,
+    unique_versions: u64,
+) -> Vec<pb::DataFragment> {
+    assert!(unique_versions > 0, "unique_versions must be at least 1");
+
+    let fields: Vec<i32> = (0..num_fields as i32).collect();
+    let column_indices: Vec<i32> = (0..num_fields as i32).collect();
+    // Never let the divisor used below reach zero: with fewer fragments than
+    // versions, each fragment simply gets its own group.
+    let group_size = (n / unique_versions).max(1);
+
+    // One encoded payload per distinct version; identical except for `version`.
+    let version_payloads: Vec<Vec<u8>> = (0..unique_versions)
+        .map(|v| {
+            let seq = pb::RowDatasetVersionSequence {
+                runs: vec![pb::RowDatasetVersionRun {
+                    span: Some(pb::U64Segment {
+                        segment: Some(pb::u64_segment::Segment::Range(pb::u64_segment::Range {
+                            start: 0,
+                            end: 1000,
+                        })),
+                    }),
+                    version: v,
+                }],
+            };
+            seq.encode_to_vec()
+        })
+        .collect();
+
+    (0..n)
+        .map(|i| {
+            // Clamp so the remainder fragments reuse the last payload.
+            let version_idx = (i / group_size).min(unique_versions - 1) as usize;
+            pb::DataFragment {
+                id: i,
+                files: vec![pb::DataFile {
+                    path: format!("data/{i}.lance"),
+                    fields: fields.clone(),
+                    column_indices: column_indices.clone(),
+                    file_major_version: 2,
+                    file_minor_version: 0,
+                    file_size_bytes: 0,
+                    base_id: None,
+                }],
+                deletion_file: None,
+                row_id_sequence: None,
+                physical_rows: 1000,
+                last_updated_at_version_sequence: Some(
+                    pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(
+                        version_payloads[version_idx].clone(),
+                    ),
+                ),
+                created_at_version_sequence: Some(
+                    pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(
+                        version_payloads[version_idx].clone(),
+                    ),
+                ),
+            }
+        })
+        .collect()
+}
+
+/// Benchmark fragment deserialization with and without interning: first on
+/// uniform fragments (10 and 50 fields), then on 10-field fragments whose
+/// version payloads take 10, 100 and 500 distinct values.
+fn bench_deserialization(c: &mut Criterion) {
+    let mut group = c.benchmark_group("manifest_intern");
+    let n = num_fragments();
+
+    // Uniform fragments: every fragment shares the same metadata.
+    for num_fields in [10, 50] {
+        let protos = make_uniform_pb_fragments(n, num_fields);
+
+        let baseline_id = BenchmarkId::new("deserialize_no_intern", num_fields);
+        group.bench_with_input(baseline_id, &num_fields, |bencher, _| {
+            bencher.iter(|| deserialize_without_interning(&protos))
+        });
+
+        let interned_id = BenchmarkId::new("deserialize_with_intern", num_fields);
+        group.bench_with_input(interned_id, &num_fields, |bencher, _| {
+            bencher.iter(|| deserialize_with_interning(&protos))
+        });
+    }
+
+    // Diverse fragments: many distinct version payloads.
+    for unique_versions in [10, 100, 500] {
+        let protos = make_diverse_pb_fragments(n, 10, unique_versions);
+
+        let baseline_id = BenchmarkId::new("deserialize_no_intern_diverse", unique_versions);
+        group.bench_with_input(baseline_id, &unique_versions, |bencher, _| {
+            bencher.iter(|| deserialize_without_interning(&protos))
+        });
+
+        let interned_id = BenchmarkId::new("deserialize_with_intern_diverse", unique_versions);
+        group.bench_with_input(interned_id, &unique_versions, |bencher, _| {
+            bencher.iter(|| deserialize_with_interning(&protos))
+        });
+    }
+
+    group.finish();
+}
+
+/// Report the deep in-memory size of deserialized fragments with and without
+/// interning (printed to stderr), then benchmark the `deep_size_of`
+/// measurement itself on the interned set.
+fn bench_memory(c: &mut Criterion) {
+    const BYTES_PER_MB: f64 = 1_048_576.0;
+
+    let mut group = c.benchmark_group("manifest_memory");
+    let n = num_fragments();
+
+    for num_fields in [10, 50] {
+        let protos = make_uniform_pb_fragments(n, num_fields);
+
+        let no_intern = deserialize_without_interning(&protos);
+        let with_intern = deserialize_with_interning(&protos);
+
+        let size_no_intern = no_intern.deep_size_of();
+        let size_with_intern = with_intern.deep_size_of();
+
+        // Do the comparison in floating point: subtracting the unsigned sizes
+        // would underflow if the interned set ever measured larger than the
+        // baseline, and the ratio is undefined for an empty baseline.
+        let mb_no_intern = size_no_intern as f64 / BYTES_PER_MB;
+        let mb_with_intern = size_with_intern as f64 / BYTES_PER_MB;
+        let savings_mb = mb_no_intern - mb_with_intern;
+        let savings_pct = if size_no_intern == 0 {
+            0.0
+        } else {
+            (1.0 - size_with_intern as f64 / size_no_intern as f64) * 100.0
+        };
+
+        eprintln!(
+            "\n[{} fragments, {} fields] Memory without interning: {:.2} MB",
+            n, num_fields, mb_no_intern
+        );
+        eprintln!(
+            "[{} fragments, {} fields] Memory with interning:    {:.2} MB",
+            n, num_fields, mb_with_intern
+        );
+        eprintln!(
+            "[{} fragments, {} fields] Savings:                  {:.2} MB ({:.1}%)",
+            n, num_fields, savings_mb, savings_pct
+        );
+
+        // Benchmark deep_size_of measurement itself (sanity check)
+        group.bench_with_input(
+            BenchmarkId::new("deep_size_of_interned", num_fields),
+            &num_fields,
+            |b, _| {
+                b.iter(|| with_intern.deep_size_of());
+            },
+        );
+
+        drop(no_intern);
+        drop(with_intern);
+    }
+
+    group.finish();
+}
+
+// Linux: register the benchmarks with a pprof flamegraph profiler attached.
+#[cfg(target_os = "linux")]
+criterion_group!(
+    name = benches;
+    config = Criterion::default().with_profiler(
+        pprof::criterion::PProfProfiler::new(
+            100,
+            pprof::criterion::Output::Flamegraph(None),
+        ),
+    );
+    targets = bench_deserialization, bench_memory
+);
+// Other platforms: default criterion configuration, same targets.
+#[cfg(not(target_os = "linux"))]
+criterion_group!(benches, bench_deserialization, bench_memory);
+criterion_main!(benches);
--- a/vendor/lance-table/benches/row_id_index.rs
+++ b/vendor/lance-table/benches/row_id_index.rs
@ -0,0 +1,323 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+// TODO:
+// - [x] Create base cases with HashMap
+// - [x] Create on-disk size measurement
+// - [x] Create different cases for the index. Ideal, 25% deletions, 80% deletions + compaction.
+// - [ ] Create a benchmark for the get method
+//   - [x] Average over all valid values
+//   - [ ] Time to get a value that is not in the index
+// - [ ] Create a benchmark for the new method (building the in-memory index)
+// Optional:
+// - [ ] Create in-memory size measurement (if possible)
+
+// Questions:
+// How can I write out the file? Where should I put it?
+// How can I take an argument to set the size of the index?
+
+use std::{collections::HashMap, io::Write, ops::Range, sync::Arc};
+
+use arrow_array::{RecordBatch, UInt64Array};
+use arrow_schema::{DataType, Field, Schema};
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+
+use lance_core::utils::address::RowAddress;
+use lance_core::utils::deletion::DeletionVector;
+use lance_io::ReadBatchParams;
+use lance_table::rowids::FragmentRowIdIndex;
+use lance_table::{
+    rowids::{RowIdIndex, RowIdSequence, write_row_ids},
+    utils::stream::{RowIdAndDeletesConfig, apply_row_id_and_deletes},
+};
+
+/// Build a row-id sequence over `row_id_range`, then delete up to `deletions`
+/// ids taken from every other position (0th, 2nd, 4th, ...).
+fn make_sequence(row_id_range: Range<u64>, deletions: usize) -> RowIdSequence {
+    let mut seq = RowIdSequence::from(row_id_range);
+
+    // Collect first so the borrow of `seq` ends before it is mutated.
+    let doomed: Vec<_> = seq.iter().step_by(2).take(deletions).collect();
+    seq.delete(doomed);
+
+    seq
+}
+
+/// Split `num_rows` row ids evenly across `num_frags` fragments and return one
+/// `(fragment id, sequence)` pair per fragment.
+///
+/// Each fragment covers `num_rows / num_frags` consecutive ids and has
+/// `percent_deletion` of that count deleted via `make_sequence`.
+fn make_frag_sequences(
+    num_rows: u64,
+    num_frags: u64,
+    percent_deletion: f32,
+) -> Vec<(u32, Arc<RowIdSequence>)> {
+    let rows_per_frag = num_rows / num_frags;
+    (0..num_frags)
+        .map(|frag| {
+            // Fragment `frag` starts right after the previous fragment's range.
+            let first_id = frag * rows_per_frag;
+            let deleted = (rows_per_frag as f32 * percent_deletion) as usize;
+            let seq = make_sequence(first_id..(first_id + rows_per_frag), deleted);
+            (frag as u32, Arc::new(seq))
+        })
+        .collect()
+}
+
+// For range of values
+// https://bheisler.github.io/criterion.rs/book/user_guide/benchmarking_with_inputs.html
+
+/// Total number of rows used by the benchmarks.
+///
+/// Read from the `BENCH_NUM_ROWS` environment variable; falls back to
+/// 1,000,000 when the variable cannot be read. Panics if the variable is set
+/// but does not parse as a `u64`.
+fn num_rows() -> u64 {
+    match std::env::var("BENCH_NUM_ROWS") {
+        Ok(raw) => raw.parse().unwrap(),
+        Err(_) => 1_000_000,
+    }
+}
+
+/// One row of the size-statistics CSV.
+struct SizeStats {
+    /// Name of the measured data structure.
+    structure: String,
+    /// Deletion fraction the structure was built with.
+    percent_deletions: f32,
+    /// Measured size in bytes.
+    size: u64,
+}
+
+/// Optional CSV sink for `SizeStats` rows; a no-op when no file is configured.
+struct SizeStatsFile {
+    file: Option<std::fs::File>,
+}
+
+impl SizeStatsFile {
+    /// Create the sink. If `BENCH_SIZE_STATS_FILE` is set, the file at that
+    /// path is created and the CSV header written; otherwise nothing is
+    /// opened. Panics if the file cannot be created or written.
+    fn new() -> Self {
+        let file = std::env::var("BENCH_SIZE_STATS_FILE").ok().map(|path| {
+            let mut out = std::fs::File::create(path).unwrap();
+            // Header row
+            writeln!(out, "structure,percent_deletions,size").unwrap();
+            out
+        });
+        Self { file }
+    }
+
+    /// Append one CSV row when a file is configured. Panics on write failure.
+    fn write_row(&mut self, stats: SizeStats) {
+        let Some(out) = self.file.as_mut() else {
+            return;
+        };
+        let SizeStats {
+            structure,
+            percent_deletions,
+            size,
+        } = stats;
+        writeln!(out, "\"{}\",{},{}", structure, percent_deletions, size).unwrap();
+    }
+}
+
+/// Benchmark building a `RowIdIndex` against building a plain `HashMap` from
+/// flattened (row id, row address) pairs, at 0%, 25% and 50% deletions, and
+/// record the size of each representation through `SizeStatsFile`.
+fn bench_creation(c: &mut Criterion) {
+    let mut group = c.benchmark_group("row_id_index_creation");
+    let mut stats_file = SizeStatsFile::new();
+
+    for percent_deletions in [0.0, 0.25, 0.5] {
+        let sequences = make_frag_sequences(num_rows(), 100, percent_deletions);
+
+        // One index input per fragment, each with an empty deletion vector.
+        let mut fragment_indices: Vec<FragmentRowIdIndex> = Vec::with_capacity(sequences.len());
+        for (frag_id, sequence) in &sequences {
+            fragment_indices.push(FragmentRowIdIndex {
+                fragment_id: *frag_id,
+                row_id_sequence: Arc::clone(sequence),
+                deletion_vector: Arc::new(DeletionVector::default()),
+            });
+        }
+
+        group.bench_with_input(
+            BenchmarkId::new("BuildIndex", percent_deletions),
+            &percent_deletions,
+            |bencher, _| {
+                bencher.iter(|| {
+                    let _index = RowIdIndex::new(&fragment_indices).unwrap();
+                })
+            },
+        );
+
+        // Measure size of index: total bytes of the serialized sequences.
+        let serialized_size: u64 = sequences
+            .iter()
+            .map(|(_frag_id, sequence)| write_row_ids(sequence).len() as u64)
+            .sum();
+        stats_file.write_row(SizeStats {
+            structure: "RowIdIndex".to_string(),
+            percent_deletions,
+            size: serialized_size,
+        });
+
+        // TODO: we should compare tombstoned vs compacted. We don't mind the
+        // regression in the tombstoned case, but we want to see the improvement
+        // in the compacted case.
+
+        // TODO: collect size of sequences when serialized
+
+        // TODO: also show building a BTreeMap and HashMap
+
+        // Flatten each fragment into parallel (row ids, row addresses) vectors.
+        let mut flat_data = Vec::with_capacity(sequences.len());
+        for (frag_id, sequence) in &sequences {
+            let row_ids = sequence.iter().collect::<Vec<_>>();
+            let row_addresses = (0..sequence.len())
+                .map(|offset| u64::from(RowAddress::new_from_parts(*frag_id, offset as u32)))
+                .collect::<Vec<_>>();
+            flat_data.push((row_ids, row_addresses));
+        }
+
+        // Size of flat data is just 16 bytes per row
+        let flat_size = flat_data
+            .iter()
+            .map(|(ids, _addresses)| ids.len() * 16)
+            .sum::<usize>() as u64;
+        stats_file.write_row(SizeStats {
+            structure: "FlatData".to_string(),
+            percent_deletions,
+            size: flat_size,
+        });
+
+        group.bench_with_input(
+            BenchmarkId::new("BuildHashMap", percent_deletions),
+            &percent_deletions,
+            |bencher, _| {
+                bencher.iter(|| {
+                    let mut index = HashMap::new();
+                    index.extend(flat_data.iter().flat_map(|(ids, addresses)| {
+                        ids.iter().copied().zip(addresses.iter().copied())
+                    }));
+                })
+            },
+        );
+    }
+
+    group.finish();
+}
+
+/// Benchmark a single id lookup in a `RowIdIndex` against the same lookup in
+/// a `HashMap` built from flattened (row id, row address) pairs, at several
+/// deletion ratios.
+///
+/// Both benchmarks perform exactly one lookup per iteration and draw ids from
+/// the same fixed-stride generator, so their timings are directly comparable.
+/// Lookup results go through `std::hint::black_box` so the lookup cannot be
+/// optimised away.
+fn bench_get_single(c: &mut Criterion) {
+    let mut group = c.benchmark_group("row_id_index_get_single");
+
+    for percent_deletions in [0.0, 0.02, 0.25, 0.5, 0.8] {
+        let sequences = make_frag_sequences(num_rows(), 100, percent_deletions);
+
+        let fragment_indices: Vec<FragmentRowIdIndex> = sequences
+            .iter()
+            .map(|(frag_id, sequence)| FragmentRowIdIndex {
+                fragment_id: *frag_id,
+                row_id_sequence: sequence.clone(),
+                deletion_vector: Arc::new(DeletionVector::default()),
+            })
+            .collect();
+
+        let index = RowIdIndex::new(&fragment_indices).unwrap();
+
+        // Walk the id space with a fixed stride, wrapping at `total_rows`, so
+        // successive lookups are spread across the id range.
+        let mut i = 0;
+        let total_rows: u64 = num_rows();
+        let mut next_id = || {
+            let id = i;
+            i += 241861;
+            i %= total_rows;
+            id
+        };
+
+        group.bench_with_input(
+            BenchmarkId::new("GetIndex", percent_deletions),
+            &percent_deletions,
+            |b, _| {
+                b.iter(|| {
+                    std::hint::black_box(index.get(next_id()));
+                });
+            },
+        );
+
+        let flat_data = sequences
+            .iter()
+            .map(|(frag_id, sequence)| {
+                let row_ids = sequence.iter().collect::<Vec<_>>();
+                let row_addresses = (0..sequence.len())
+                    .map(|i| RowAddress::new_from_parts(*frag_id, i as u32))
+                    .map(u64::from)
+                    .collect::<Vec<_>>();
+                (row_ids, row_addresses)
+            })
+            .collect::<Vec<_>>();
+
+        let hash_index = {
+            let mut map = HashMap::new();
+            map.extend(flat_data.iter().flat_map(|(ids, addresses)| {
+                ids.iter().copied().zip(addresses.iter().copied())
+            }));
+            map
+        };
+
+        // One lookup per iteration, mirroring GetIndex above; the id generator
+        // simply continues its stride.
+        group.bench_with_input(
+            BenchmarkId::new("GetHashMap", percent_deletions),
+            &percent_deletions,
+            |b, _| {
+                b.iter(|| {
+                    std::hint::black_box(hash_index.get(&next_id()));
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+/// Benchmark `apply_row_id_and_deletes` on a single-column `UInt64` batch of
+/// `num_rows()` values, requesting row ids only (no deletion vector and no
+/// explicit row-id sequence).
+fn bench_apply_row_id(c: &mut Criterion) {
+    let mut group = c.benchmark_group("apply_row_id");
+
+    let schema = Schema::new(vec![Field::new("value", DataType::UInt64, false)]);
+    let values = UInt64Array::from((0..num_rows()).collect::<Vec<_>>());
+    let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(values)]).unwrap();
+
+    let config = RowIdAndDeletesConfig {
+        params: ReadBatchParams::default(),
+        with_row_id: true,
+        with_row_addr: false,
+        with_row_last_updated_at_version: false,
+        with_row_created_at_version: false,
+        deletion_vector: None,
+        row_id_sequence: None,
+        last_updated_at_sequence: None,
+        created_at_sequence: None,
+        make_deletions_null: false,
+        total_num_rows: num_rows() as u32,
+    };
+
+    group.bench_function("ApplyRowId", |bencher| {
+        let input = batch.clone();
+        bencher.iter(|| {
+            // The function takes the batch by value, so each iteration passes
+            // a fresh clone.
+            let _ = apply_row_id_and_deletes(input.clone(), 0, 0, &config);
+        });
+    });
+
+    group.finish();
+}
+
+// Linux: register the benchmarks with a pprof flamegraph profiler attached.
+#[cfg(target_os = "linux")]
+criterion_group!(
+    name = benches;
+    config = Criterion::default().with_profiler(
+        pprof::criterion::PProfProfiler::new(
+            100,
+            pprof::criterion::Output::Flamegraph(None),
+        ),
+    );
+    targets = bench_creation, bench_get_single, bench_apply_row_id
+);
+// Other platforms: default criterion configuration, same targets.
+#[cfg(not(target_os = "linux"))]
+criterion_group!(benches, bench_creation, bench_get_single, bench_apply_row_id);
+criterion_main!(benches);
--- a/vendor/lance-table/build.rs
+++ b/vendor/lance-table/build.rs
@ -0,0 +1,29 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use std::io::Result;
+
+fn main() -> Result<()> {
+    println!("cargo:rerun-if-changed=protos");
+
+    #[cfg(feature = "protoc")]
+    // Use vendored protobuf compiler if requested.
+    unsafe {
+        std::env::set_var("PROTOC", protobuf_src::protoc());
+    }
+
+    let mut prost_build = prost_build::Config::new();
+    prost_build.extern_path(".lance.file", "::lance_file::format::pb");
+    prost_build.protoc_arg("--experimental_allow_proto3_optional");
+    prost_build.enable_type_names();
+    prost_build.compile_protos(
+        &[
+            "./protos/table.proto",
+            "./protos/transaction.proto",
+            "./protos/rowids.proto",
+        ],
+        &["./protos"],
+    )?;
+
+    Ok(())
+}
--- a/vendor/lance-table/protos/AGENTS.md
+++ b/vendor/lance-table/protos/AGENTS.md
@ -0,0 +1,18 @@
+# Protobuf Guidelines
+
+Also see [root AGENTS.md](../AGENTS.md) for cross-language standards.
+
+## Compatibility
+
+- All changes must be backwards compatible. Never re-use or change field numbers of existing fields.
+
+## Schema Design
+
+- Use `optional` when you need to distinguish "not set" from "zero value" — `optional` enables presence tracking (`has_*` methods) and maps to `Option<T>` in Rust. Bare proto3 fields have no presence semantics: they always hold a value (defaulting to zero), so you cannot tell if the sender explicitly set them.
+- Use structured message types (e.g., `BasePath`) instead of plain scalars, and scope fields to operation-specific messages (e.g., `InsertTransaction`) rather than generic top-level ones.
+- Don't duplicate data across messages — store each fact once and derive relationships. Prefer parallel sequences over maps when keys already exist in another field.
+
+## Documentation
+
+- Document the semantic meaning of both present and absent states for `optional` fields — explain when each case applies.
+- Use precise domain terminology in field descriptions — avoid ambiguous abbreviations or terms that collide with domain concepts.
--- a/vendor/lance-table/protos/CLAUDE.md
+++ b/vendor/lance-table/protos/CLAUDE.md
@ -0,0 +1,18 @@
+# Protobuf Guidelines
+
+Also see [root AGENTS.md](../AGENTS.md) for cross-language standards.
+
+## Compatibility
+
+- All changes must be backwards compatible. Never re-use or change field numbers of existing fields.
+
+## Schema Design
+
+- Use `optional` when you need to distinguish "not set" from "zero value" — `optional` enables presence tracking (`has_*` methods) and maps to `Option<T>` in Rust. Bare proto3 fields have no presence semantics: they always hold a value (defaulting to zero), so you cannot tell if the sender explicitly set them.
+- Use structured message types (e.g., `BasePath`) instead of plain scalars, and scope fields to operation-specific messages (e.g., `InsertTransaction`) rather than generic top-level ones.
+- Don't duplicate data across messages — store each fact once and derive relationships. Prefer parallel sequences over maps when keys already exist in another field.
+
+## Documentation
+
+- Document the semantic meaning of both present and absent states for `optional` fields — explain when each case applies.
+- Use precise domain terminology in field descriptions — avoid ambiguous abbreviations or terms that collide with domain concepts.
--- a/vendor/lance-table/protos/ann.proto
+++ b/vendor/lance-table/protos/ann.proto
@ -0,0 +1,55 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+package lance.pb;
+
+import "table_identifier.proto";
+import "table.proto";
+import "index.proto";
+
+// Serialized vector query parameters.
+message VectorQueryProto {
+  // Query vector as Arrow IPC bytes (supports Float16, Float32, Float64, UInt8, etc.)
+  bytes query_vector_arrow_ipc = 1;
+  string column = 2;
+  uint32 k = 3;
+  optional float lower_bound = 4;
+  optional float upper_bound = 5;
+  optional uint32 minimum_nprobes = 6;
+  optional uint32 maximum_nprobes = 7;
+  optional uint32 ef = 8;
+  optional uint32 refine_factor = 9;
+  // Distance metric type. Absent means None (use the index's default metric).
+  optional lance.index.pb.VectorMetricType metric_type = 10;
+  bool use_index = 11;
+  optional float dist_q_c = 12;
+  optional int32 query_parallelism = 13;
+}
+
+// Serializable form of ANNIvfSubIndexExec — the IVF sub-index search node.
+//
+// The prefilter child ExecutionPlan is serialized by DataFusion's codec
+// automatically via children() / with_new_children(). The prefilter_type
+// field tells the decoder which PreFilterSource variant to use when
+// reconstructing from the deserialized child inputs.
+message ANNIvfSubIndexExecProto {
+  enum PreFilterType {
+    NONE = 0;
+    FILTERED_ROW_IDS = 1;
+    SCALAR_INDEX_QUERY = 2;
+  }
+
+  VectorQueryProto query = 1;
+  lance.datafusion.TableIdentifier table = 2;
+  repeated lance.table.IndexMetadata indices = 3;
+  PreFilterType prefilter_type = 4;
+}
+
+// Serializable form of ANNIvfPartitionExec — the IVF centroid routing node.
+message ANNIvfPartitionExecProto {
+  VectorQueryProto query = 1;
+  lance.datafusion.TableIdentifier table = 2;
+  repeated string index_uuids = 3;
+}
--- a/vendor/lance-table/protos/encodings_v2_0.proto
+++ b/vendor/lance-table/protos/encodings_v2_0.proto
@ -0,0 +1,347 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+package lance.encodings;
+
+import "google/protobuf/empty.proto";
+
+// This file contains a specification for encodings that can be used
+// to store and load Arrow data into a Lance file for the 2.0 format.  It
+// has been superseded by encodings_v2_1.proto which is used for the 2.1 format.
+//
+// # Types
+//
+// This file assumes the user wants to load data into Arrow arrays and
+// explains how to map Arrow arrays into Lance files.  Encodings are divided
+// into "array encoding" (which maps to an Arrow array and may contain multiple
+// buffers) and "buffer encoding" (which encodes a single buffer of data).
+//
+// # Encoding Tree
+//
+// Most encodings are layered on top of each other.  These form a tree of
+// encodings with a single root node.  To encode an array you will typically
+// start with the root node and then take the output from that root encoding
+// and feed it into child encodings.  The decoding process works in reverse.
+//
+// # Multi-column Encodings
+//
+// Some Arrow arrays will map to more than one column of Lance data.  For
+// example, struct arrays and list arrays.  This file only contains encodings
+// for a single column.  However, it does describe how multi-column arrays can
+// be encoded.
+
+// A pointer to a buffer in a Lance file
+//
+// A writer can place a buffer in three different locations.  The buffer
+// can go in the data page, in the column metadata, or in the file metadata.
+// The writer is free to choose whatever is most appropriate (for example, a dictionary
+// that is shared across all pages in a column will probably go in the column
+// metadata).  This specification does not dictate where the buffer should go.
+message Buffer {
+    // The index of the buffer in the collection of buffers
+    uint32 buffer_index = 1;
+    // The collection holding the buffer
+    enum BufferType {
+      // The buffer is stored in the data page itself
+      page = 0;
+      // The buffer is stored in the column metadata
+      column = 1;
+      // The buffer is stored in the file metadata
+      file = 2;
+    };
+    BufferType buffer_type = 2;
+}
+
+// An encoding that adds nullability to another array encoding
+//
+// This can wrap any array encoding and add nullability information
+message Nullable {
+  message NoNull {
+    ArrayEncoding values = 1;
+  }
+  message AllNull {}
+  message SomeNull {
+    ArrayEncoding validity = 1;
+    ArrayEncoding values = 2;
+  }
+  oneof nullability {
+    // The array has no nulls and there is a single buffer needed
+    NoNull no_nulls = 1;
+    // The array may have nulls and we need two buffers
+    SomeNull some_nulls = 2;
+    // All values are null (no buffers needed)
+    AllNull all_nulls = 3;
+  }
+}
+
+// An array encoding for variable-length list fields
+message List {
+    // An array containing the offsets into an items array.
+    //
+    // This array will have num_rows items and will never
+    // have nulls.
+    //
+    // If the list at index i is not null then offsets[i] will
+    // contain `base + len(list)` where `base` is defined as:
+    //   i == 0: 0
+    //   i >  0: (offsets[i-1] % null_offset_adjustment)
+    //
+    // To help understand we can consider the following example list:
+    // [ [A, B], null, [], [C, D, E] ]
+    //
+    // The offsets will be [2, ?, 2, 5]
+    //
+    // If the incoming list at index i IS null then offsets[i] will
+    // contain `base + len(list) + null_offset_adjustment` where `base`
+    // is defined the same as above.
+    //
+    // To complete the above example let's assume that `null_offset_adjustment`
+    // is 7.  Then the offsets will be [2, 9, 2, 5]
+    //
+    // If there are no nulls then the offsets we write here are exactly the
+    // same as the offsets in an Arrow list array (except we omit the leading
+    // 0 which is redundant)
+    //
+    // The reason we do this is so that reading a single list at index i only
+    // requires us to load the indices at i and i-1.
+    //
+    // If the offset at index i is greater than or equal to `null_offset_adjustment`
+    // then the list at index i is null.
+    //
+    // Otherwise the length of the list is `offsets[i] - base` where
+    // base is defined the same as above.
+    //
+    // Let's consider our example offsets: [2, 9, 2, 5]
+    //
+    // We can take any range of lists and determine how many list items are
+    // referenced by the sublist.
+    //
+    // 0..3: [_, 5] -> items 0..5 (base = 0* and end is 5)
+    // 0..2: [_, 2] -> items 0..2 (base = 0* and end is 2)
+    // 0..1: [_, 9] -> items 0..2 (base = 0* and end is 9 % 7)
+    // 1..3: [2, 5] -> items 2..5 (base = 2 and end is 5)
+    // 1..2: [2, 2] -> items 2..2 (base = 2 and end is 2)
+    // 2..3: [9, 5] -> items 2..5 (base = 9 % 7 and end is 5)
+    //
+    // * When the start of our range is the 0th item the base is always 0 and we only
+    //   need to load a single index from disk to determine the range.
+    //
+    // The data type of the offsets array is flexible and does not need
+    // to match the data type of the destination array.  Please note that the offsets
+    // array is very likely to be efficiently encoded by bit packing deltas.
+    ArrayEncoding offsets = 1;
+    // If a list is null then we add this value to the offset
+    //
+    // This value must be greater than the length of the items so that
+    // (offset + null_offset_adjustment) is never used by a non-null list.
+    //
+    // Note that this value cannot be equal to the length of the items
+    // because then a page with a single list would store [ X ] and we
+    // couldn't know if that is a null list or a list with X items.
+    //
+    // Therefore, the best choice for this value is 1 + # of items.
+    // Choosing this will maximize the bit packing that we can apply to the offsets.
+    uint64 null_offset_adjustment = 2;
+    // How many items are referenced by these offsets.  This is needed in
+    // order to determine which items pages map to this offsets page.
+    uint64 num_items = 3;
+}
+
+// An array encoding for fixed-size list fields
+message FixedSizeList {
+  /// The number of items in each list
+  uint32 dimension = 1;
+  /// True if the list is nullable
+  bool has_validity = 3;
+  /// The items in the list
+  ArrayEncoding items = 2;
+}
+
+message Compression {
+  string scheme = 1;
+  optional int32 level = 2;
+}
+
+// Fixed width items placed contiguously in a buffer
+message Flat {
+  // the number of bits per value, must be greater than 0, does
+  // not need to be a multiple of 8
+  uint64 bits_per_value = 1;
+  // the buffer of values
+  Buffer buffer = 2;
+  // The Compression message can specify the compression scheme (e.g. zstd) and any
+  // other information that is needed for decompression.
+  //
+  // If this array is compressed then the bits_per_value refers to the uncompressed
+  // data.
+  Compression compression = 3;
+}
+
+// Compression algorithm where all values have a constant value
+message Constant {
+  // The value (TODO: define encoding for literals?)
+  bytes value = 1;
+}
+
+// Items are bitpacked in a buffer
+message Bitpacked {
+  // the number of bits used for a value in the buffer
+  uint64 compressed_bits_per_value = 1;
+
+  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
+  uint64 uncompressed_bits_per_value = 2;
+
+  // The buffer containing the bitpacked values
+  Buffer buffer = 3;
+
+  // Whether or not a sign bit is included in the bitpacked value
+  bool signed = 4;
+}
+
+// Items are bitpacked in a buffer
+message BitpackedForNonNeg {
+  // the number of bits used for a value in the buffer
+  uint64 compressed_bits_per_value = 1;
+
+  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
+  uint64 uncompressed_bits_per_value = 2;
+
+  // The buffer containing the bitpacked values
+  Buffer buffer = 3;
+}
+
+// Opaque bitpacking variant where the bits per value are stored inline in the chunks themselves
+message InlineBitpacking {
+  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
+  uint64 uncompressed_bits_per_value = 2;
+}
+
+// Transparent bitpacking variant where the number of bits per value is fixed through the whole buffer
+message OutOfLineBitpacking {
+  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
+  uint64 uncompressed_bits_per_value = 2;
+  // The number of compressed bits per value, fixed across the entire buffer
+  uint64 compressed_bits_per_value = 3;
+}
+
+// An array encoding for shredded structs that will never be null
+//
+// There is no actual data in this column.
+//
+// TODO: Struct validity bitmaps will be placed here.
+message SimpleStruct {}
+
+// An array encoding for binary fields
+message Binary {
+  ArrayEncoding indices = 1;
+  ArrayEncoding bytes = 2;
+  uint64 null_adjustment = 3;
+}
+
+message Variable {
+  uint32 bits_per_offset = 1;
+}
+
+message Fsst {
+  ArrayEncoding binary = 1;
+  bytes symbol_table = 2;
+}
+
+// An array encoding for dictionary-encoded fields
+message Dictionary {
+  ArrayEncoding indices = 1;
+  ArrayEncoding items = 2;
+  uint32 num_dictionary_items = 3;
+}
+
+message PackedStruct {
+  repeated ArrayEncoding inner = 1;
+  Buffer buffer = 2;
+}
+
+message PackedStructFixedWidthMiniBlock {
+  ArrayEncoding Flat = 1;
+  repeated uint32 bits_per_values = 2;
+}
+
+message FixedSizeBinary {
+  ArrayEncoding bytes = 1;
+  uint32 byte_width = 2;
+}
+
+message Block {
+  string scheme = 1;
+}
+
+// Run-Length Encoding for miniblock format
+message Rle {
+  // Number of bits per value (8, 16, 32, 64, or 128)
+  uint64 bits_per_value = 1;
+}
+
+// Byte Stream Split encoding for floating point values
+message ByteStreamSplit {
+  // Number of bits per value (32 for float, 64 for double)
+  uint64 bits_per_value = 1;
+}
+
+// General miniblock encoding - wraps another miniblock encoding with compression
+message GeneralMiniBlock {
+  // The inner miniblock encoding (e.g., Rle, Bitpacked, etc.)
+  ArrayEncoding inner = 1;
+  // The compression scheme to apply to the miniblock buffers
+  Compression compression = 2;
+}
+
+// Encodings that decode into an Arrow array
+message ArrayEncoding {
+    oneof array_encoding {
+        Flat flat = 1;
+        Nullable nullable = 2;
+        FixedSizeList fixed_size_list = 3;
+        List list = 4;
+        SimpleStruct struct = 5;
+        Binary binary = 6;
+        Dictionary dictionary = 7;
+        Fsst fsst = 8;
+        PackedStruct packed_struct = 9;
+        Bitpacked bitpacked = 10;
+        FixedSizeBinary fixed_size_binary = 11;
+        BitpackedForNonNeg bitpacked_for_non_neg = 12;
+        Constant constant = 13;
+        InlineBitpacking inline_bitpacking = 14;
+        OutOfLineBitpacking out_of_line_bitpacking = 15;
+        Variable variable = 16;
+        PackedStructFixedWidthMiniBlock packed_struct_fixed_width_mini_block = 17;
+        Block block = 18;
+        Rle rle = 19;
+        GeneralMiniBlock general_mini_block = 20;
+        ByteStreamSplit byte_stream_split = 21;
+    }
+}
+
+// Wraps a column with a zone map index that can be used
+// to apply pushdown filters
+message ZoneIndex {
+  uint32 rows_per_zone = 1;
+  Buffer zone_map_buffer = 2;
+  ColumnEncoding inner = 3;
+}
+
+// Marks a column as blob data.  It will contain a packed struct
+// with fields position and size (u64)
+message Blob {
+  ColumnEncoding inner = 1;
+}
+
+// Encodings that describe a column of values
+message ColumnEncoding {
+  oneof column_encoding {
+    // No special encoding, just column values
+    google.protobuf.Empty values = 1;
+    ZoneIndex zone_index = 2;
+    Blob blob = 3;
+  }
+}
--- a/vendor/lance-table/protos/encodings_v2_1.proto
+++ b/vendor/lance-table/protos/encodings_v2_1.proto
@ -0,0 +1,511 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+package lance.encodings21;
+
+// This file contains a specification for encodings that can be used
+// to store and load Arrow data into a Lance file for the 2.1 format.
+//
+// # Types
+//
+// This file assumes the user wants to load data into Arrow arrays and
+// explains how to map Arrow arrays into Lance files.  Encodings are divided
+// into "structural encodings" (which are used to encode the structure of the
+// data such as any list or struct layers) and "compressive encodings" (which
+// are used to compress the actual data values).
+//
+// # Standardized Interpretation of Counting Terms
+//
+// When working with 2.1 encodings we have a number of different "counting terms" and it can be
+// difficult to understand what we mean when we are talking about a "number of values".  Here is
+// a standard interpretation of these terms:
+//
+// To understand these definitions consider a data type FIXED_SIZE_LIST<LIST<INT32>>.
+//
+// A "value" is an abstract term when we aren't being specific.
+//
+// - num_rows: This is the highest level counting term.  A single row includes everything in the
+//             fixed size list.  This is what the user asks for when they ask for a range of rows.
+// - num_elements: The number of elements is the number of rows multiplied by the dimension of any
+//             fixed size list wrappers.  This is what you get when you flatten the FSL layer and
+//             is the starting point for structural encoding.  Note that an element can be a list
+//             value or a single primitive value.
+// - num_items: The number of items is the number of values in the repetition and definition vectors
+//             after everything has been flattened.
+// - num_visible_items: The number of visible items is the number of items after invisible items
+//             have been removed.  Invisible items are rep/def levels that don't correspond to an
+//             actual value.
+
+
+// # Structural Encodings
+//
+// The following messages are used to describe the structural encoding of the
+// data.  In this document, we refer to these structural encodings as layouts.
+
+// Repetition and definition levels are described in more detail elsewhere.  As we peel through
+// the structure of an array we will encounter layers of struct and list.  Each of these layers
+// potentially adds a new level to the repetition and definition levels.  This message describes
+// the meaning of each layer.
+enum RepDefLayer {
+  // Should never be used, included for debugging purposes and general protobuf best practice
+  REPDEF_UNSPECIFIED = 0;
+  // All values are valid (can be primitive or struct)
+  REPDEF_ALL_VALID_ITEM = 1;
+  // All list values are valid
+  REPDEF_ALL_VALID_LIST = 2;
+  // There are one or more null items (can be primitive or struct)
+  REPDEF_NULLABLE_ITEM = 3;
+  // A list layer with null lists but no empty lists
+  REPDEF_NULLABLE_LIST = 4;
+  // A list layer with empty lists but no null lists
+  REPDEF_EMPTYABLE_LIST = 5;
+  // A list layer with both empty lists and null lists
+  REPDEF_NULL_AND_EMPTY_LIST = 6;
+}
+
+// A layout used for pages where the data is small
+//
+// In this case we can fit many values into a single disk sector and transposing buffers is
+// expensive.  As a result, we do not transpose the buffers but compress the data into small
+// chunks (called mini blocks) which are roughly the size of a disk sector.
+//
+// The end result is a small amount of read amplification (since we must read an entire page
+// at a time) but we have more flexibility in compression and do less work per value when
+// compressing and decompressing in bulk.
+message MiniBlockLayout {
+  // Description of the compression of repetition levels (e.g. how many bits per rep)
+  //
+  // Optional, if there is no repetition then this field is not present
+  CompressiveEncoding rep_compression = 1;
+  // Description of the compression of definition levels (e.g. how many bits per def)
+  //
+  // Optional, if there is no definition then this field is not present
+  CompressiveEncoding def_compression = 2;
+  // Description of the compression of values
+  CompressiveEncoding value_compression = 3;
+  // Description of the compression of the dictionary data
+  //
+  // Optional, if there is no dictionary then this field is not present
+  CompressiveEncoding dictionary = 4;
+  // Number of items in the dictionary
+  uint64 num_dictionary_items = 5;
+  // The meaning of each repdef layer, used to interpret repdef buffers correctly
+  repeated RepDefLayer layers = 6;
+  // The number of buffers in each mini-block, this is determined by the compression and does
+  // NOT include the repetition or definition buffers (the presence of these buffers can be determined
+  // by looking at the rep_compression and def_compression fields)
+  uint64 num_buffers = 7;
+  // The depth of the repetition index.
+  //
+  // If there is repetition then the depth must be at least 1.  If there are many layers
+  // of repetition then deeper repetition indices will support deeper nested random access.  For
+  // example, given 5 layers of repetition then the repetition index depth must be at least
+  // 3 to support access like `rows[50][17][3]`.
+  //
+  // We require `repetition_index_depth + 1` u64 values per mini-block to store the repetition
+  // index if the `repetition_index_depth` is greater than 0.  The +1 is because we need to store
+  // the number of "leftover items" at the end of the chunk.  Otherwise, we wouldn't have any way
+  // to know if the final item in a chunk is valid or not.
+  uint32 repetition_index_depth = 8;
+  // The page already records how many rows are in the page.  For mini-block we also need to know how
+  // many "items" are in the page.  A row and an item are the same thing unless the page has lists.
+  uint64 num_items = 9;
+
+  // Since Lance 2.2, miniblocks have larger chunk sizes (>= 64KB)
+  bool has_large_chunk = 10;
+}
+
+// A layout used for pages where the data is large
+//
+// In this case the cost of transposing the data is relatively small (compared to the cost of writing the data)
+// and so we just zip the buffers together
+message FullZipLayout {
+  // The number of bits of repetition info (0 if there is no repetition)
+  uint32 bits_rep = 1;
+  // The number of bits of definition info (0 if there is no definition)
+  uint32 bits_def = 2;
+  // The number of bits of value info
+  //
+  // Note: we use bits here (and not bytes) for consistency with other encodings.  However, in practice,
+  // there is never a reason to use a bits per value that is not a multiple of 8.  The complexity is not
+  // worth the small savings in space since this encoding is typically used with large values already.
+  oneof details {
+    // If this is a fixed width block then we need to have a fixed number of bits per value
+    uint32 bits_per_value = 3;
+    // If this is a variable width block then we need to have a fixed number of bits per offset
+    uint32 bits_per_offset = 4;
+  }
+  // The number of items in the page
+  uint32 num_items = 5;
+  // The number of visible items in the page
+  uint32 num_visible_items = 6;
+  // Description of the compression of values
+  CompressiveEncoding value_compression = 7;
+  // The meaning of each repdef layer, used to interpret repdef buffers correctly
+  repeated RepDefLayer layers = 8;
+}
+
+// A layout used for pages where all (visible) values are the same scalar value.
+//
+// This generalizes the prior AllNullLayout semantics for file_version >= 2.2.
+//
+// There may be buffers of repetition and definition information if required in order
+// to interpret what kind of nulls are present / which items are visible.
+message ConstantLayout {
+  // The meaning of each repdef layer, used to interpret repdef buffers correctly
+  repeated RepDefLayer layers = 5;
+
+  // Inline fixed-width scalar value bytes.
+  //
+  // This MUST only be used for types where a single non-null element is represented by a single
+  // fixed-width Arrow value buffer (i.e. no offsets buffer, no child data).
+  //
+  // Constraints:
+  // - MUST be absent for an all-null page
+  // - MUST be <= 32 bytes if present
+  optional bytes inline_value = 6;
+
+  // Optional compression algorithm used for the repetition buffer.
+  // If absent, repetition levels are stored as raw u16 values.
+  CompressiveEncoding rep_compression = 7;
+  // Optional compression algorithm used for the definition buffer.
+  // If absent, definition levels are stored as raw u16 values.
+  CompressiveEncoding def_compression = 8;
+  // Number of values in repetition buffer after decompression.
+  uint64 num_rep_values = 9;
+  // Number of values in definition buffer after decompression.
+  uint64 num_def_values = 10;
+}
+
+// A layout where large binary data is encoded externally and only
+// the descriptions (position + size) are placed in the page
+//
+// Repdef information is stored in the descriptions.  A description with a size of
+// 0 and a position of 0 is an empty value.  A description with a size of 0 and a
+// non-zero position is a null value and the position is the repdef value.
+message BlobLayout {
+  // The inner layout used to store the descriptions
+  PageLayout inner_layout = 1;
+  // The meaning of each repdef layer, used to interpret repdef buffers correctly
+  //
+  // The inner layout's repdef layers will always be a single all-valid item layer
+  repeated RepDefLayer layers = 2;
+}
+
+// Describes the structural encoding of a page
+message PageLayout {
+  oneof layout {
+    // A layout used for pages where the data is small
+    MiniBlockLayout mini_block_layout = 1;
+    // A layout used for pages where all (visible) values are the same scalar value or null.
+    ConstantLayout constant_layout = 2;
+    // A layout used for pages where the data is large
+    FullZipLayout full_zip_layout = 3;
+    // A layout where large binary data is encoded externally
+    // and only the descriptions are put in the page
+    BlobLayout blob_layout = 4;
+  }
+}
+
+// # Compressive Encodings
+//
+// These encodings describe how an array is compressed.  An encoding may split an
+// array into multiple buffers.  The buffers can then be compressed further (and split
+// into yet more buffers).  The entire process forms a tree of encodings with the root
+// of the tree being the initial array and the leaves being the final compressed buffers.
+//
+// # Data blocks and buffers
+//
+// Data blocks are a simplified version of arrays and represent a collection of buffers grouped
+// with some kind of interpretation.  Data blocks are the input and output of compressive encodings.
+// There are different kinds of data blocks:
+// - Fixed width data blocks (e.g. u8, u16, ...)
+// - Variable width data blocks (e.g. strings, binary)
+// - Struct data blocks (note: this is for packed structs, normal structs are encoded in the structural encoding)
+//
+// In addition, leaf encodings may output "buffers".  These are fully compressed buffers of data that
+// are stored in the page and no longer compressed.
+
+enum CompressionScheme {
+  COMPRESSION_ALGORITHM_UNSPECIFIED = 0;
+  COMPRESSION_ALGORITHM_LZ4 = 1;
+  COMPRESSION_ALGORITHM_ZSTD = 2;
+}
+
+// Compression applied to a single buffer of data
+//
+// A buffer is the leaf of the compression tree.  Unlike data blocks, which can
+// be further compressed with a variety of techniques, a buffer cannot be understood
+// in any particular way.
+//
+// A general compression scheme may be applied to a buffer.  This is something like
+// zstd, lz4, etc.  The entire buffer is compressed as a single unit.  If this happens
+// then any parent encoding becomes opaque, even if it would normally be transparent.
+//
+// This is a leaf, no further compression is applied to the data.
+message BufferCompression {
+  // A general compression scheme to apply to the buffer
+  CompressionScheme scheme = 1;
+  // The compression level
+  //
+  // Optional, if not present a scheme-specific default value will be used.
+  //
+  // Interpretation of this value depends on the compression scheme.  Generally, larger
+  // values indicate more compression at the expense of more CPU time.
+  optional int32 level = 2;
+}
+
+// Fixed width items placed contiguously in a single buffer
+//
+// This is a leaf encoding, there is no compression applied to the data.
+//
+// This is a transparent encoding by definition.
+//
+// The input is a fixed-width data block.
+// The output is a single buffer.
+message Flat {
+  // the number of bits per value, must be greater than 0, does
+  // not need to be a multiple of 8
+  uint64 bits_per_value = 1;
+  // The compression applied to the data
+  optional BufferCompression data = 2;
+}
+
+// Variable width items have the values stored in one buffer and the
+// offsets are output as a data block that may be further compressed.
+//
+// This is a partial leaf encoding.  Values are not compressed but
+// the offsets may be further compressed.
+//
+// This is a transparent encoding by definition.
+//
+// The input is a variable-width data block.
+// The output is a single fixed-width data block (the offsets) and
+// a single buffer (the values)
+message Variable {
+  // Describes how the offsets data block is compressed
+  CompressiveEncoding offsets = 1;
+  // The compression applied to the values
+  optional BufferCompression values = 2;
+}
+
+// Compression algorithm where all values have a constant value (encoded in the description)
+//
+// This is a leaf encoding, there is no compression applied to the data.
+//
+// The input can be any kind of data block.
+// There is no output.
+message Constant {
+  // The value (TODO: define encoding for literals?)
+  optional bytes value = 1;
+}
+
+// A compression scheme in which a single fixed-width block is "packed" into
+// a smaller fixed-width block where each value has fewer bits.
+//
+// This is typically done by throwing away the most significant bits of each value when
+// those bits are all the same.
+//
+// In this scheme the number of bits per value is fixed across the entire buffer and stored
+// in this message.
+//
+// This is a transparent encoding.
+//
+// The input is a fixed-width data block.
+// The output is a single fixed-width data block.
+message OutOfLineBitpacking {
+  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
+  uint64 uncompressed_bits_per_value = 1;
+  // The compression used to store the bitpacked values data block
+  CompressiveEncoding values = 3;
+}
+
+// Bitpacking variant where the bits per value are stored inline in the chunks themselves
+//
+// This variation of bitpacking allows for the number of bits per value to change throughout the
+// buffer, which makes the compression more robust to outliers.
+//
+// This is an opaque encoding.
+//
+// The input is a fixed-width data block.
+// The output is a single buffer.
+message InlineBitpacking {
+  // the number of bits of the uncompressed value. e.g. for a u32, this will be 32
+  uint64 uncompressed_bits_per_value = 1;
+  // The compression applied to the values
+  optional BufferCompression values = 2;
+}
+
+// A compression scheme for variable-width data
+//
+// A small dictionary (referred to as a "symbol table") is used to compress the values.
+// In this scheme there is a single symbol table for the entire page and it is stored in the
+// encoding description itself.
+//
+// This is a transparent encoding.
+//
+// The input is a variable-width data block.
+// The output is a single variable-width data block.
+message Fsst {
+  // The FSST symbol table
+  bytes symbol_table = 1;
+  // The compression used to store the compressed values data block
+  CompressiveEncoding values = 2;
+}
+
+// A compression scheme where common values are stored in a dictionary and the values are
+// encoded as indices into the dictionary.
+//
+// This is an opaque encoding unless the dictionary is considered metadata.
+//
+// The input is any kind of data block.
+// There are two outputs:
+// - A data block of the same kind as the input (the dictionary)
+// - A fixed-width data block containing the indices into the dictionary.
+message Dictionary {
+  // The compression used to store the indices data block
+  CompressiveEncoding indices = 1;
+  // The compression used to store the dictionary items data block
+  CompressiveEncoding items = 2;
+  // The number of items in the dictionary
+  uint32 num_dictionary_items = 3;
+}
+
+// A compression scheme where runs of common values are encoded as a single value and a count
+//
+// This is an opaque encoding unless the run lengths are considered metadata.
+//
+// The input is a single data block of any kind.
+// There are two outputs:
+// - A data block of the same kind as the input (the run values)
+// - A fixed-width data block containing the lengths of the runs
+message Rle {
+  // The compression used to store the run values data block
+  CompressiveEncoding values = 1;
+  // The compression used to store the run lengths data block
+  CompressiveEncoding run_lengths = 2;
+}
+
+// Converts a fixed-size-list of values into a flattened list of values
+//
+// This encoding does not actually compress the data, it just flattens out the FSL layers.
+//
+// This is a transparent encoding.
+//
+// The input is a single block of fixed-width data (with a wide width and few items)
+// The output is a single block of fixed-width data (with a narrow width and many items)
+message FixedSizeList {
+  // The number of items in this layer of FSL
+  uint64 items_per_value = 1;
+  // Whether or not there is a validity buffer
+  bool has_validity = 3;
+  // The compression used to store the flattened values data block
+  CompressiveEncoding values = 2;
+}
+
+// Packs a struct containing only fixed-width children into a single fixed-width data block
+//
+// The children are concatenated row by row and stored as a single fixed-width buffer. This is
+// the legacy packed struct representation and remains available for backwards compatibility.
+message PackedStruct {
+  // The number of bits contributed by each child field in the packed row
+  repeated uint64 bits_per_value = 1;
+  // The compression used to store the packed fixed-width values
+  CompressiveEncoding values = 2;
+}
+
+// Variable-width packed struct encoding (2.2 extension)
+//
+// Each child value is compressed independently before being transposed into
+// a row-major layout. This preserves per-field compression boundaries at the
+// cost of disabling mini-block compression. Readers must prefer this field
+// when present and fall back to the legacy encoding otherwise.
+message VariablePackedStruct {
+  // Per-field encoding metadata in struct order
+  repeated FieldEncoding fields = 1;
+
+  // Encoding description for a single child field
+  message FieldEncoding {
+    // Compression applied to individual field values before transposition
+    CompressiveEncoding value = 1;
+    oneof layout {
+      // Bit width of each compressed value (when fixed width)
+      uint64 bits_per_value = 2;
+      // Bit width of the length prefix for variable-width compressed values
+      uint64 bits_per_length = 3;
+    }
+  }
+}
+
+// A compression scheme that wraps the underlying data with general compression
+//
+// Note: The application of wrapped compression will depend on the layout of the data.
+// If we apply it to mini-block data then we compress entire mini-blocks.  If we apply
+// it to full-zip data then we compress each value individually.
+//
+// Note: Wrapped compression is somewhat unique at the moment as it is applied to the
+// output of the inner encoding and not the input like all other compressive encodings.
+//
+// Note: General compression can usually be applied in two spots.  We can apply
+// it to individual buffers or we can apply it here, to the entire array.
+//
+// For example, let's say we are storing mini-blocks of strings and we are using
+// FSST and bitpacking the offsets.  We have something like this...
+//
+// WRAPPED(†3) -> FSST -> VARIABLE -(offsets)-> INLINE_BITPACKING -(data)-> FLAT -> BUFFER (†1)
+//                                 -(data)-> BUFFER (†2)
+//
+// General compression can be applied at †1, †2, or †3 (or any combination of these).
+//
+// If we apply it at †1 then we apply it just to the bitpacked offsets
+// If we apply it at †2 then we apply it just to the FSST compressed data
+// If we apply it at †3 then we apply it to the entire mini-block (both offsets and data)
+//
+// The input is a single data block of any kind.
+// The output is a single data block of the same kind as the input.
+message General {
+  // The compression to apply to the values
+  BufferCompression compression = 1;
+  // The compression used to store the output data block
+  CompressiveEncoding values = 3;
+}
+
+// A compression scheme where fixed-width values are transposed into a series of byte streams
+//
+// This is commonly used for floating point values where the upper bits (the sign and exponent) have a
+// significantly different meaning than the lower bits.  By splitting the values into byte streams
+// we group the mantissa bits together and the exponent bits together.  The end result is typically
+// more compressible.
+//
+// Note that this encoding is mostly useful when combined with other encodings.  It does not do any
+// compression on its own.
+//
+// This is an opaque encoding.
+//
+// The input is a fixed-width data block
+// The output is a single fixed-width data block
+message ByteStreamSplit {
+  // The compression used to store the values
+  CompressiveEncoding values = 1;
+}
+
+// An encoding that compresses a data block into buffers
+message CompressiveEncoding {
+    oneof compression {
+        Flat flat = 1;
+        Variable variable = 2;
+        Constant constant = 3;
+        OutOfLineBitpacking out_of_line_bitpacking = 4;
+        InlineBitpacking inline_bitpacking = 5;
+        Fsst fsst = 6;
+        Dictionary dictionary = 7;
+        Rle rle = 8;
+        ByteStreamSplit byte_stream_split = 9;
+        General general = 10;
+        FixedSizeList fixed_size_list = 11;
+        PackedStruct packed_struct = 12;
+        VariablePackedStruct variable_packed_struct = 13;
+    }
+}
--- a/vendor/lance-table/protos/file.proto
+++ b/vendor/lance-table/protos/file.proto
@ -0,0 +1,207 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+package lance.file;
+
+// A file descriptor that describes the contents of a Lance file
+message FileDescriptor {
+  // The schema of the file
+  Schema schema = 1;
+  // The number of rows in the file
+  uint64 length = 2;
+}
+
+// A schema which describes the data type of each of the columns
+message Schema {
+  // All fields in this file, including the nested fields.
+  repeated lance.file.Field fields = 1;
+  // Schema metadata.
+  map<string, bytes> metadata = 5;
+}
+
+// Metadata of one Lance file.
+message Metadata {
+  // 4 was used for StatisticsMetadata in the past, but has been moved to
+  // prevent a bug in older readers.
+  reserved 4;
+
+  // Position of the manifest in the file. If it is zero, the manifest is stored
+  // externally.
+  uint64 manifest_position = 1;
+
+  // Logical offsets of each chunk group, i.e., number of the rows in each
+  // chunk.
+  repeated int32 batch_offsets = 2;
+
+  // The file position where the page table is stored.
+  //
+  // A page table is a matrix of N x M x 2, where N = num_fields, and M =
+  // num_batches. Each cell in the table is a pair of <position:int64,
+  // length:int64> of the page. Both position and length are int64 values. The
+  // <position, length> of all the pages in the same column are then
+  // contiguously stored.
+  //
+  // Every field that is a part of the file will have a run in the page table.
+  // This includes struct columns, which will have a run of length 0 since
+  // they don't store any actual data.
+  //
+  // For example, for the column 5 and batch 4, we have:
+  // ```text
+  //   position = page_table[5][4][0];
+  //   length = page_table[5][4][1];
+  // ```
+  uint64 page_table_position = 3;
+
+  message StatisticsMetadata {
+    // The schema of the statistics.
+    //
+    // This might be empty, meaning there are no statistics. It also might not
+    // contain statistics for every field.
+    repeated Field schema = 1;
+
+    // The field ids of the statistics leaf fields.
+    //
+    // This plays a similar role to the `fields` field in the DataFile message.
+    // Each of these field ids corresponds to a field in the stats_schema. There
+    // is one per column in the stats page table.
+    repeated int32 fields = 2;
+
+    // The file position of the statistics page table
+    //
+    // The page table is a matrix of N x 2, where N = length of stats_fields.
+    // This is the same layout as the main page table, except there is always
+    // only one batch.
+    //
+    // For example, to get the stats column 5, we have:
+    // ```text
+    //   position = stats_page_table[5][0];
+    //   length = stats_page_table[5][1];
+    // ```
+    uint64 page_table_position = 3;
+  }
+
+  StatisticsMetadata statistics = 5;
+}  // Metadata
+
+// Supported encodings.
+enum Encoding {
+  // Invalid encoding.
+  NONE = 0;
+  // Plain encoding.
+  PLAIN = 1;
+  // Var-length binary encoding.
+  VAR_BINARY = 2;
+  // Dictionary encoding.
+  DICTIONARY = 3;
+  // Run-length encoding.
+  RLE = 4;
+}
+
+// Dictionary field metadata
+message Dictionary {
+  /// The file offset for storing the dictionary value.
+  /// It is only valid if encoding is DICTIONARY.
+  ///
+  /// The logical type represents the value type of the column, i.e., string value.
+  int64 offset = 1;
+
+  /// The length of dictionary values.
+  int64 length = 2;
+}
+
+// Field metadata for a column.
+message Field {
+  enum Type {
+    PARENT = 0;
+    REPEATED = 1;
+    LEAF = 2;
+  }
+  Type type = 1;
+
+  // Fully qualified name.
+  string name = 2;
+  /// Field Id.
+  ///
+  /// See the comment in `DataFile.fields` for how field ids are assigned.
+  int32 id = 3;
+  /// Parent Field ID. If not set, this is a top-level column.
+  int32 parent_id = 4;
+
+  // Logical types, support parameterized Arrow Type.
+  //
+  // PARENT types will always have logical type "struct".
+  //
+  // REPEATED types may have logical types:
+  // * "list"
+  // * "large_list"
+  // * "list.struct"
+  // * "large_list.struct"
+  // The final two are used if the list values are structs, and therefore the
+  // field is both implicitly REPEATED and PARENT.
+  //
+  // LEAF types may have logical types:
+  // * "null"
+  // * "bool"
+  // * "int8" / "uint8"
+  // * "int16" / "uint16"
+  // * "int32" / "uint32"
+  // * "int64" / "uint64"
+  // * "halffloat" / "float" / "double"
+  // * "string" / "large_string"
+  // * "binary" / "large_binary"
+  // * "date32:day"
+  // * "date64:ms"
+  // * "decimal:128:{precision}:{scale}" / "decimal:256:{precision}:{scale}"
+  // * "time:{unit}" / "timestamp:{unit}" / "duration:{unit}", where unit is
+  // "s", "ms", "us", "ns"
+  // * "dict:{value_type}:{index_type}:false"
+  string logical_type = 5;
+  // If this field is nullable.
+  bool nullable = 6;
+
+  // optional field metadata (e.g. extension type name/parameters)
+  map<string, bytes> metadata = 10;  
+
+  bool unenforced_primary_key = 12;
+
+  // Position of this field in the primary key (1-based).
+  // 0 means the field is part of the primary key but uses schema field id for ordering.
+  // When set to a positive value, primary key fields are ordered by this position.
+  uint32 unenforced_primary_key_position = 13;
+
+  // Reserved for future use. Use unenforced_clustering_key_position instead.
+  bool unenforced_clustering_key = 14;
+
+  // Position of this field in the clustering key (1-based).
+  // 0 means the field is not part of the clustering key.
+  uint32 unenforced_clustering_key_position = 15;
+
+  // DEPRECATED ----------------------------------------------------------------
+
+  // Deprecated: Only used in V1 file format. V2 uses variable encodings defined
+  // per page.
+  //
+  // The global encoding to use for this field.
+  Encoding encoding = 7;
+
+  // Deprecated: Only used in V1 file format. V2 dynamically chooses when to
+  // do dictionary encoding and keeps the dictionary in the data files.
+  //
+  // The file offset for storing the dictionary value.
+  // It is only valid if encoding is DICTIONARY.
+  //
+  // The logical type represents the value type of the column, i.e., string value.
+  Dictionary dictionary = 8;
+
+  // Deprecated: optional extension type name, use metadata field
+  // ARROW:extension:name
+  string extension_name = 9;
+
+  // Field number 11 was previously `string storage_class`.
+  // Keep it reserved so older manifests remain compatible while new writers
+  // avoid reusing the slot.
+  reserved 11;
+  reserved "storage_class";
+}
--- a/vendor/lance-table/protos/file2.proto
+++ b/vendor/lance-table/protos/file2.proto
@ -0,0 +1,210 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+ 
+syntax = "proto3";
+
+package lance.file.v2;
+
+import "google/protobuf/any.proto";
+import "google/protobuf/empty.proto";
+
+// # Lance v2.X File Format
+//
+// The Lance file format is a barebones format for serializing columnar data
+// into a file.
+//
+// * Each Lance file contains between 0 and 4Gi columns
+// * Each column contains between 0 and 4Gi pages
+// * Each page contains between 0 and 2^64 items
+// * Different pages within a column can have different items counts
+// * Columns may have up to 2^64 items
+// * Different columns within a file can have different item counts
+//
+// The Lance file format does not have any notion of a type system or schemas.
+// From the perspective of the file format all data is arbitrary buffers of
+// bytes with an extensible metadata block to describe the data.  It is up to
+// the user to interpret these bytes meaningfully.
+//
+// Data buffers are written to the file first.  These data buffers can be
+// referenced from three different places in the file:
+//
+// * Page encodings can reference data buffers.  This is the most common way
+//   that actual data is stored.
+// * Column encodings can reference data buffers.  For example, a column encoding
+//   may reference data buffer(s) containing statistics or dictionaries.
+// * Finally, the global buffer offset table can reference data buffers.  This
+//   is useful for storing data that is shared across multiple columns.
+//   This is also useful for global file metadata (e.g. a schema that describes
+//   the file)
+//
+// ## File Layout
+//
+// Note: the number of buffers (BN) is independent of the number of columns (CN)
+//       and pages.
+//
+//       Buffers often need to be aligned.  64-byte alignment is common when
+//       working with SIMD operations.  4096-byte alignment is common when
+//       working with direct I/O.  In order to ensure these buffers are aligned
+//       writers may need to insert padding before the buffers.
+//       
+//       If direct I/O is required then most (but not all) fields described
+//       below must be sector aligned.  We have marked these fields with an
+//       asterisk for clarity.  Readers should assume there will be optional
+//       padding inserted before these fields.
+//
+//       All footer fields are unsigned integers written with little endian
+//       byte order.
+//
+// ├──────────────────────────────────┤
+// | Data Pages                       |
+// |   Data Buffer 0*                 |
+// |   ...                            |
+// |   Data Buffer BN*                |
+// ├──────────────────────────────────┤
+// | Column Metadatas                 |
+// | |A| Column 0 Metadata*           |
+// |     Column 1 Metadata*           |
+// |     ...                          |
+// |     Column CN Metadata*          |
+// ├──────────────────────────────────┤
+// | Column Metadata Offset Table     |
+// | |B| Column 0 Metadata Position*  |
+// |     Column 0 Metadata Size       |
+// |     ...                          |
+// |     Column CN Metadata Position  |
+// |     Column CN Metadata Size      |
+// ├──────────────────────────────────┤
+// | Global Buffers Offset Table      |
+// | |C| Global Buffer 0 Position*    |
+// |     Global Buffer 0 Size         |
+// |     ...                          |
+// |     Global Buffer GN Position    |
+// |     Global Buffer GN Size        |
+// ├──────────────────────────────────┤
+// | Footer                           |
+// | A u64: Offset to column meta 0   |
+// | B u64: Offset to CMO table       |
+// | C u64: Offset to GBO table       |
+// |   u32: Number of global bufs     |
+// |   u32: Number of columns         |
+// |   u16: Major version             |
+// |   u16: Minor version             |
+// |   "LANC"                         |
+// ├──────────────────────────────────┤
+//
+// File Layout-End
+//
+// ## Data Pages
+//
+// A lot of flexibility is provided in how data is stored.  A page's buffers do
+// not strictly need to be contiguous on the disk.  However, it is recommended
+// that buffers within a page be grouped together for best performance.
+//
+// Data pages should be large.  The only time a page should be written to disk
+// is when the writer needs to flush the page to disk because it has accumulated
+// too much data.  Pages are not read in sequential order and if pages are too
+// small then the seek overhead (or request overhead) will be problematic.  We
+// generally advise that pages be at least 8MB.
+//
+// ## Encodings
+//
+// Specific encodings are not part of this minimal format.  They are provided
+// by extensions. Readers and writers should be designed so that encodings can
+// be easily added and removed. Ideally, they should allow for this without
+// requiring recompilation through some kind of plugin system.
+
+// The deferred encoding is used to place the encoding itself in a different
+// part of the file.  This is most commonly used to allow encodings to be shared
+// across different columns.  For example, when writing a file with thousands of
+// columns, where many pages have the exact same encoding, it can be useful
+// to cut down on the size of the metadata by using a deferred encoding.
+message DeferredEncoding {
+   // Location of the buffer containing the encoding.
+   //
+   // * If sharing encodings across columns then this will be in a global buffer
+   // * If sharing encodings across pages within a column this could be in a
+   //   column metadata buffer.
+   // * This could also be a page buffer if the encoding is not shared, needs
+   //   to be written before the file ends, and the encoding is too large to load
+   //   unless we first determine the page needs to be read.  This combination
+   //   seems unusual.
+   uint64 buffer_location = 1;
+   uint64 buffer_length = 2;
+}
+
+// The encoding is placed directly in the metadata section
+message DirectEncoding {
+    // The bytes that make up the encoding embedded directly in the metadata
+    //
+    // This is the most common approach.
+    bytes encoding = 1;
+}
+
+// An encoding stores the information needed to decode a column or page
+//
+// For example, it could describe if the page is using bit packing, and how many bits
+// there are in each individual value.
+//
+// At the column level it can be used to wrap columns with dictionaries or statistics.
+message Encoding {
+    oneof location {
+        // The encoding is stored elsewhere and not part of this protobuf message
+        DeferredEncoding indirect = 1;
+        // The encoding is stored within this protobuf message
+        DirectEncoding direct = 2;
+        // There is no encoding information
+        google.protobuf.Empty none = 3;
+    }
+}
+
+// ## Metadata
+
+// Each column has a metadata block that is placed at the end of the file.
+// These may be read individually to allow for column projection.
+message ColumnMetadata {
+
+  // This describes a page of column data.
+  message Page {
+    // The file offsets for each of the page buffers
+    //
+    // The number of buffers is variable and depends on the encoding.  There
+    // may be zero buffers (e.g. constant encoded data) in which case this
+    // could be empty.
+    repeated uint64 buffer_offsets = 1;
+    // The size (in bytes) of each of the page buffers
+    //
+    // This field will have the same length as `buffer_offsets` and
+    // may be empty.
+    repeated uint64 buffer_sizes = 2;
+    // Logical length (e.g. # rows) of the page
+    uint64 length = 3;
+    // The encoding used to encode the page
+    Encoding encoding = 4;
+    // The priority of the page
+    //
+    // For tabular data this will be the top-level row number of the first row
+    // in the page (and top-level rows should not split across pages).
+    uint64 priority = 5;
+  }
+  // Encoding information about the column itself.  This typically describes
+  // how to interpret the column metadata buffers.  For example, it could
+  // describe how statistics or dictionaries are stored in the column metadata.
+  Encoding encoding = 1;
+  // The pages in the column
+  repeated Page pages = 2;   
+  // The file offsets of each of the column metadata buffers
+  //
+  // There may be zero buffers.
+  repeated uint64 buffer_offsets = 3;
+  // The size (in bytes) of each of the column metadata buffers
+  //
+  // This field will have the same length as `buffer_offsets` and
+  // may be empty.
+  repeated uint64 buffer_sizes = 4;
+} // Metadata-End
+
+// ## Where is the rest?
+//
+// This file format is extremely minimal.  It is a building block for
+// creating more useful readers and writers and not terribly useful by itself.
+// Other protobuf files will describe how this can be extended.
--- a/vendor/lance-table/protos/filtered_read.proto
+++ b/vendor/lance-table/protos/filtered_read.proto
@ -0,0 +1,99 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+package lance.datafusion;
+
+import "table_identifier.proto";
+
+message U64Range {
+  uint64 start = 1;
+  uint64 end = 2;
+}
+
+message ProjectionProto {
+  repeated int32 field_ids = 1;
+  bool with_row_id = 2;
+  bool with_row_addr = 3;
+  bool with_row_last_updated_at_version = 4;
+  bool with_row_created_at_version = 5;
+  BlobHandlingProto blob_handling = 6;
+}
+
+message BlobHandlingProto {
+  oneof mode {
+    // All blobs read as binary
+    bool all_binary = 1;
+    // Blobs as descriptions, other binary as binary (default)
+    bool blobs_descriptions = 2;
+    // All binary columns as descriptions
+    bool all_descriptions = 3;
+    // Specific blobs read as binary, rest as descriptions (non-blob binary stays binary)
+    FieldIdSet some_blobs_binary = 4;
+    // Specific columns as binary, all other binary as descriptions
+    FieldIdSet some_binary = 5;
+  }
+}
+
+message FieldIdSet {
+  repeated uint32 field_ids = 1;
+}
+
+message FilteredReadThreadingModeProto {
+  oneof mode {
+    uint64 one_partition_multiple_threads = 1;
+    uint64 multiple_partitions = 2;
+  }
+}
+
+// Serializable form of FilteredReadOptions.
+message FilteredReadOptionsProto {
+  optional U64Range scan_range_before_filter = 1;
+  optional U64Range scan_range_after_filter = 2;
+  bool with_deleted_rows = 3;
+  optional uint32 batch_size = 4;
+  optional uint64 fragment_readahead = 5;
+  repeated uint64 fragment_ids = 6;
+  ProjectionProto projection = 7;
+  optional bytes refine_filter_substrait = 8;
+  optional bytes full_filter_substrait = 9;
+  FilteredReadThreadingModeProto threading_mode = 10;
+  optional uint64 io_buffer_size_bytes = 11;
+  // Arrow IPC schema for decoding Substrait filters (may be wider than projection).
+  optional bytes filter_schema_ipc = 12;
+}
+
+// Serializable form of FilteredReadPlan (planned/distributed mode).
+// RowAddrTreeMap serialized via its built-in serialize_into/deserialize_from.
+// Per-fragment filters are Substrait-encoded and deduplicated.
+message FilteredReadPlanProto {
+  bytes row_addr_tree_map = 1;
+  optional U64Range scan_range_after_filter = 2;
+  // Arrow IPC schema for decoding Substrait filters (matches the schema used at encode time).
+  optional bytes filter_schema_ipc = 3;
+  // Per-fragment filter mapping. Key is fragment id, value is a list index into
+  // filter_expressions. Multiple fragments can share the same list index when
+  // they have the same filter, avoiding duplicate Substrait encoding.
+  map<uint32, uint32> fragment_filter_ids = 4;
+  // Deduplicated Substrait-encoded filter expressions. Each entry is referenced
+  // by one or more values in fragment_filter_ids.
+  repeated bytes filter_expressions = 5;
+}
+
+// Top-level wrapper for FilteredReadExec serialization.
+message FilteredReadExecProto {
+  TableIdentifier table = 1;
+  FilteredReadOptionsProto options = 2;
+  // FilteredRead has two modes:
+  // - Plan-then-execute (distributed): The planner creates a FilteredReadPlan and sends it to a remote executor.
+  // - Plan-and-execute (local): The executor creates the plan itself at execution time.
+  optional FilteredReadPlanProto plan = 3;
+  // Note: FilteredReadExec.index_input (child ExecutionPlan) is NOT serialized here.
+  // DataFusion's PhysicalExtensionCodec handles child plans automatically: it walks
+  // the plan tree via children() / with_new_children(), serializes each node, and
+  // passes deserialized children back as the `inputs` parameter in try_decode.
+  // This means any ExecutionPlan in the tree (including index_input) must also
+  // implement try_encode/try_decode in the PhysicalExtensionCodec.
+  // TODO: implement serialize/deserialize for lance-specific index input ExecutionPlans.
+}
--- a/vendor/lance-table/protos/index.proto
+++ b/vendor/lance-table/protos/index.proto
@ -0,0 +1,249 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+package lance.index.pb;
+
+import "google/protobuf/any.proto";
+
+// The type of an index.
+enum IndexType {
+  // Vector index
+  VECTOR = 0;
+}
+
+message Index {
+  // The unique index name in the dataset.
+  string name = 1;
+
+  // Columns to be used to build the index.
+  repeated string columns = 2;
+
+  // The version of the dataset this index was built from.
+  uint64 dataset_version = 3;
+
+  // The [`IndexType`] of the index.
+  IndexType index_type = 4;
+
+  /// Index implementation details.
+  oneof implementation {
+    VectorIndex vector_index = 5;
+  }
+}
+
+message Tensor {
+  enum DataType {
+    BFLOAT16 = 0;
+    FLOAT16 = 1;
+    FLOAT32 = 2;
+    FLOAT64 = 3;
+    UINT8 = 4;
+    UINT16 = 5;
+    UINT32 = 6;
+    UINT64 = 7;
+  }
+
+  DataType data_type = 1;
+
+  // Data shape, [dim1, dim2, ...]
+  repeated uint32 shape = 2;
+
+  // Data buffer
+  bytes data = 3;
+}
+
+// Inverted Index File Metadata.
+message IVF {
+  // Centroids of partitions. `dimension * num_partitions` of float32s.
+  //
+  // Deprecated, use centroids_tensor instead.
+  repeated float centroids = 1;  // [deprecated = true];
+
+  // File offset of each partition.
+  repeated uint64 offsets = 2;
+
+  // Number of records in the partition.
+  repeated uint32 lengths = 3;
+
+  // Tensor of centroids. `num_partitions * dimension` of float32s.
+  Tensor centroids_tensor = 4;
+
+  // KMeans loss.
+  optional double loss = 5;
+}
+
+// Product Quantization.
+message PQ {
+  // The number of bits used to represent a centroid.
+  uint32 num_bits = 1;
+
+  // Number of sub vectors.
+  uint32 num_sub_vectors = 2;
+
+  // Vector dimension
+  uint32 dimension = 3;
+
+  // Codebook. `dimension * 2 ^ num_bits` of float32s.
+  repeated float codebook = 4;
+
+  // Tensor of codebook. `2 ^ num_bits * dimension` of floats.
+  Tensor codebook_tensor = 5;
+}
+
+// Transform type
+enum TransformType {
+  OPQ = 0;
+}
+
+// A transform matrix to apply to a vector or vectors.
+message Transform {
+  // The file offset where the matrix is stored
+  uint64 position = 1;
+
+  // Data shape of the matrix, [rows, cols].
+  repeated uint32 shape = 2;
+
+  // Transform type.
+  TransformType type = 3;
+}
+
+// Flat Index
+message Flat {}
+
+// DiskAnn Index
+message DiskAnn {
+  // Graph spec version
+  uint32 spec = 1;
+
+  // Graph file
+  string filename = 2;
+
+  // r parameter
+  uint32 r = 3;
+
+  // alpha parameter
+  float alpha = 4;
+
+  // L parameter
+  uint32 L = 5;
+
+  /// Entry points to the graph
+  repeated uint64 entries = 6;
+}
+
+// One stage in the vector index pipeline.
+message VectorIndexStage {
+  oneof stage {
+    // Flat index
+    Flat flat = 1;
+    // `IVF` - Inverted File
+    IVF ivf = 2;
+    // Product Quantization
+    PQ pq = 3;
+    // Transformer
+    Transform transform = 4;
+    // DiskANN
+    DiskAnn diskann = 5;
+  }
+}
+
+// Metric Type for Vector Index
+enum VectorMetricType {
+  // L2 (Euclidean) Distance
+  L2 = 0;
+
+  // Cosine Distance
+  Cosine = 1;
+
+  // Dot Product
+  Dot = 2;
+
+  // Hamming Distance
+  Hamming = 3;
+}
+
+// Vector Index Metadata
+message VectorIndex {
+  // Index specification version.
+  uint32 spec_version = 1;
+
+  // Vector dimension
+  uint32 dimension = 2;
+
+  // Composed vector index stages.
+  //
+  // For example, `IVF_PQ` index type can be expressed as:
+  //
+  // ```text
+  // let stages = vec![Ivf{}, PQ{num_bits: 8, num_sub_vectors: 16}]
+  // ```
+  repeated VectorIndexStage stages = 3;
+
+  // Vector distance metric type
+  VectorMetricType metric_type = 4;
+}
+
+// Details for vector indexes, stored in the manifest's index_details field.
+message VectorIndexDetails {
+  VectorMetricType metric_type = 1;
+
+  // The target number of vectors per partition.
+  // 0 means unset.
+  uint64 target_partition_size = 2;
+
+  // Optional HNSW index configuration. If set, the index has an HNSW layer.
+  optional HnswParameters hnsw_index_config = 3;
+
+  message ProductQuantization {
+    uint32 num_bits = 1;
+    uint32 num_sub_vectors = 2;
+  }
+  message ScalarQuantization {
+    uint32 num_bits = 1;
+  }
+  message RabitQuantization {
+    enum RotationType {
+      FAST = 0;
+      MATRIX = 1;
+    }
+    uint32 num_bits = 1;
+    RotationType rotation_type = 2;
+  }
+
+  // No quantization; vectors are stored as-is.
+  message FlatCompression {}
+
+  oneof compression {
+    ProductQuantization pq = 4;
+    ScalarQuantization sq = 5;
+    RabitQuantization rq = 6;
+    FlatCompression flat = 8;
+  }
+
+  // Runtime hints: optional build preferences that don't affect index structure.
+  // Keys use reverse-DNS namespacing (e.g., "lance.ivf.max_iters", "lancedb.accelerator").
+  // Unrecognized keys must be silently ignored by all runtimes.
+  map<string, string> runtime_hints = 9;
+}
+
+// Hierarchical Navigable Small World (HNSW) parameters, used as an optional configuration for IVF indexes.
+message HnswParameters {
+  // The maximum number of outgoing edges per node in the HNSW graph. Higher values
+  // mean more connections, better recall, but more memory and slower builds.
+  // Referred to as "M" in the HNSW literature.
+  uint32 max_connections = 1;
+  // "construction exploration factor": The size of the dynamic list used during
+  // index construction.
+  uint32 construction_ef = 2;
+  // The maximum number of levels in the HNSW graph.
+  uint32 max_level = 3;
+}
+
+message JsonIndexDetails {
+  string path = 1;
+  google.protobuf.Any target_details = 2;
+}
+message BloomFilterIndexDetails {}
+
+message RTreeIndexDetails {}
--- a/vendor/lance-table/protos/index_old.proto
+++ b/vendor/lance-table/protos/index_old.proto
@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+package lance.table;
+
+// NOTE: Do *NOT* add new index details here.  Add them to the index.proto file instead.
+// This file is in the lance.table package namespace while the index.proto file is in the
+// lance.index package namespace.
+//
+// These are only here for forward compatibility.  Older versions of Lance expect btree indexes
+// to have lance.table in the package namespace.
+//
+// If you need to modify these messages (e.g. to add new fields to btree or bitmap) then
+// it is ok to modify them here.
+
+// Currently many of these are empty messages because all needed details are either hard-coded (e.g.
+// filenames) or stored in the index itself.  However, we may want to add more details in the
+// future, in particular we can add details that may be useful for planning queries (e.g. don't
+// force us to load the index until we know we can make use of it)
+
+message BTreeIndexDetails {}
+message BitmapIndexDetails {}
+message LabelListIndexDetails {}
+message NGramIndexDetails {}
+message ZoneMapIndexDetails {}
+message InvertedIndexDetails {
+  // Marking this field as optional as old versions of the index store blank details and we
+  // need to make sure we have a proper optional field to detect this.
+  optional string base_tokenizer = 1;
+  string language = 2;
+  bool with_position = 3;
+  optional uint32 max_token_length = 4;
+  bool lower_case = 5;
+  bool stem = 6;
+  bool remove_stop_words = 7;
+  bool ascii_folding = 8;
+  uint32 min_ngram_length = 9;
+  uint32 max_ngram_length = 10;
+  bool prefix_only = 11;
+}
--- a/vendor/lance-table/protos/license_header.txt
+++ b/vendor/lance-table/protos/license_header.txt
@ -0,0 +1,2 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
--- a/vendor/lance-table/protos/rowids.proto
+++ b/vendor/lance-table/protos/rowids.proto
@ -0,0 +1,113 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+package lance.table;
+// TODO: what would it take to store this in a LanceV2 file?
+// Or would flatbuffers be better for this?
+
+/// A sequence of row IDs. This is split up into one or more segments,
+/// each of which can be encoded in different ways. The encodings are optimized
+/// for values that are sorted, which will often be the case with row ids.
+/// They also have optimized forms depending on how sparse the values are.
+message RowIdSequence {
+    repeated U64Segment segments = 1;
+}
+
+/// Different ways to encode a sequence of u64 values.
+message U64Segment {
+    /// A range of u64 values.
+    message Range {
+        /// The start of the range, inclusive.
+        uint64 start = 1;
+        /// The end of the range, exclusive.
+        uint64 end = 2;
+    }
+
+    /// A range of u64 values with holes.
+    message RangeWithHoles {
+        /// The start of the range, inclusive.
+        uint64 start = 1;
+        /// The end of the range, exclusive.
+        uint64 end = 2;
+        /// The holes in the range, as a sorted array of values.
+        /// Binary search can be used to check whether a value is a hole and should
+        /// be skipped. This can also be used to count the number of holes before a
+        /// given value, if you need to find the logical offset of a value in the
+        /// segment.
+        EncodedU64Array holes = 3;
+    }
+
+    /// A range of u64 values with a bitmap.
+    message RangeWithBitmap {
+        /// The start of the range, inclusive.
+        uint64 start = 1;
+        /// The end of the range, exclusive.
+        uint64 end = 2;
+        /// A bitmap of the values in the range. The bitmap is a sequence of bytes,
+        /// where each byte represents 8 values. The first byte represents values
+        /// start to start + 7, the second byte represents values start + 8 to
+        /// start + 15, and so on. The most significant bit of each byte represents
+        /// the first value in the range, and the least significant bit represents
+        /// the last value in the range. If the bit is set, the value is in the
+        /// range; if it is not set, the value is not in the range.
+        bytes bitmap = 3;
+    }
+
+    oneof segment {
+        /// When the values are sorted and contiguous.
+        Range range = 1;
+        /// When the values are sorted but have a few gaps.
+        RangeWithHoles range_with_holes = 2;
+        /// When the values are sorted but have many gaps.
+        RangeWithBitmap range_with_bitmap = 3;
+        /// When the values are sorted but are sparse.
+        EncodedU64Array sorted_array = 4;
+        /// A general array of values, which is not sorted.
+        EncodedU64Array array = 5;
+    }
+} // U64Segment
+
+/// A basic bitpacked array of u64 values.
+message EncodedU64Array {
+    message U16Array {
+        uint64 base = 1;
+        /// The deltas are stored as 16-bit unsigned integers.
+        /// (protobuf doesn't support 16-bit integers, so we use bytes instead)
+        bytes offsets = 2;
+    }
+
+    message U32Array {
+        uint64 base = 1;
+        /// The deltas are stored as 32-bit unsigned integers.
+        /// (we use bytes instead of uint32 to avoid overhead of varint encoding)
+        bytes offsets = 2;
+    }
+
+    message U64Array {
+        /// (We use bytes instead of uint64 to avoid overhead of varint encoding)
+        bytes values = 2;
+    }
+
+    oneof array {
+        U16Array u16_array = 1;
+        U32Array u32_array = 2;
+        U64Array u64_array = 3;
+    }
+}
+
+/// A sequence of dataset versions. Similar to RowIdSequence but tracks
+/// version runs. It uses RLE (Run-Length Encoding) to efficiently
+/// represent consecutive rows with the same version.
+message RowDatasetVersionSequence {
+    repeated RowDatasetVersionRun runs = 1;
+}
+
+/// A run of rows with the same version.
+message RowDatasetVersionRun {
+    /// The consecutive rows sharing `version`, encoded as a U64Segment; the run length is presumably the segment's value count (NOTE(review): inferred from the field type — confirm).
+    U64Segment span = 1;
+
+    uint64 version = 2;
+}
--- a/vendor/lance-table/protos/table.proto
+++ b/vendor/lance-table/protos/table.proto
@ -0,0 +1,717 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+package lance.table;
+
+import "google/protobuf/any.proto";
+import "google/protobuf/timestamp.proto";
+import "file.proto";
+
+/*
+
+Format:
+
+----------------------------------------+
+|       Encoded Column 0, Chunk 0        |
+           ...
+|       Encoded Column M, Chunk N - 1    |
+|       Encoded Column M, Chunk N        |
+|       Indices ...                      |
+|       Chunk Position (M x N x 8)       |
+|         Manifest (Optional)            |
+|         Metadata                       |
+| i64: metadata position                 |
+| MAJOR_VERSION | MINOR_VERSION | "LANC" |
+----------------------------------------+
+ */
+
+// UUID type. encoded as 16 bytes.
+message UUID {
+  bytes uuid = 1;
+}
+
+// Manifest is a global section shared between all the files.
+message Manifest {
+  // All fields of the dataset, including the nested fields.
+  repeated lance.file.Field fields = 1;
+
+  // Schema metadata.
+  map<string, bytes> schema_metadata = 5;
+
+  // Fragments of the dataset.
+  repeated DataFragment fragments = 2;
+
+  // Snapshot version number.
+  uint64 version = 3;
+
+  // The file position of the version auxiliary data.
+  //  * It is not inheritable between versions.
+  //  * It is not loaded by default during query.
+  uint64 version_aux_data = 4;
+
+  message WriterVersion {
+    // The name of the library that created this file.
+    string library = 1;
+    // The version of the library that created this file. Because we cannot assume
+    // that the library is semantically versioned, this is a string. However, if it
+    // is semantically versioned, it should be a valid semver string without any 'v'
+    // prefix. For example: `2.0.0`, `2.0.0-rc.1`.
+    //
+    // For forward compatibility with older readers, when writing new manifests this
+    // field should contain only the core version (major.minor.patch) without any
+    // prerelease or build metadata. The prerelease/build info should be stored in
+    // the separate prerelease and build_metadata fields instead.
+    string version = 2;
+    // Optional semver prerelease identifier.
+    //
+    // This field stores the prerelease portion of a semantic version separately
+    // from the core version number. For example, if the full version is "2.0.0-rc.1",
+    // the version field would contain "2.0.0" and prerelease would contain "rc.1".
+    //
+    // This separation ensures forward compatibility: older readers can parse the
+    // clean version field without errors, while newer readers can reconstruct the
+    // full semantic version by combining version, prerelease, and build_metadata.
+    //
+    // If absent, the version field is used as-is.
+    optional string prerelease = 3;
+    // Optional semver build metadata.
+    //
+    // This field stores the build metadata portion of a semantic version separately
+    // from the core version number. For example, if the full version is
+    // "2.0.0-rc.1+build.123", the version field would contain "2.0.0", prerelease
+    // would contain "rc.1", and build_metadata would contain "build.123".
+    //
+    // If absent, no build metadata is present.
+    optional string build_metadata = 4;
+  }
+
+  // The version of the writer that created this file.
+  //
+  // This information may be used to detect whether the file may have known bugs
+  // associated with that writer.
+  WriterVersion writer_version = 13;
+
+  // If present, the file position of the index metadata.
+  optional uint64 index_section = 6;
+
+  // Version creation Timestamp, UTC timezone
+  google.protobuf.Timestamp timestamp = 7;
+
+  // Optional version tag
+  string tag = 8;
+
+  // Feature flags for readers.
+  //
+  // A bitmap of flags that indicate which features are required to be able to
+  // read the table. If a reader does not recognize a flag that is set, it
+  // should not attempt to read the dataset.
+  //
+  // Known flags:
+  // * 1: deletion files are present
+  // * 2: row ids are stable and stored as part of the fragment metadata.
+  // * 4: use v2 format (deprecated)
+  // * 8: table config is present
+  uint64 reader_feature_flags = 9;
+
+  // Feature flags for writers.
+  //
+  // A bitmap of flags that indicate which features must be used when writing to the
+  // dataset. If a writer does not recognize a flag that is set, it should not attempt to
+  // write to the dataset.
+  //
+  // The flag identities are the same as for reader_feature_flags, but the values of
+  // reader_feature_flags and writer_feature_flags are not required to be identical.
+  uint64 writer_feature_flags = 10;
+
+  // The highest fragment ID that has been used so far.
+  //
+  // This ID is not guaranteed to be present in the current version, but it may
+  // have been used in previous versions.
+  //
+  // For a single fragment, will be zero. For no fragments, will be absent.
+  optional uint32 max_fragment_id = 11;
+
+  // Path to the transaction file, relative to `{root}/_transactions`. The file at that
+  // location contains a wire-format serialized Transaction message representing the
+  // transaction that created this version.
+  //
+  // This string field "transaction_file" may be empty if no transaction file was written.
+  //
+  // The path format is "{read_version}-{uuid}.txn" where {read_version} is the version of
+  // the table the transaction read from (serialized to decimal with no padding digits),
+  // and {uuid} is a hyphen-separated UUID.
+  string transaction_file = 12;
+
+  // The file position of the transaction content. None if transaction is empty
+  // This transaction content begins with the transaction content length as u32
+  // If the transaction proto message has a length of `len`, the message ends at `len` + 4
+  optional uint64 transaction_section = 21;
+
+  // The next unused row id. If zero, then the table does not have any rows.
+  //
+  // This is only used if the "stable_row_ids" feature flag is set.
+  uint64 next_row_id = 14;
+
+  message DataStorageFormat {
+    // The format of the data files (e.g. "lance")
+    string file_format = 1;
+    // The max format version of the data files. The format of the version can vary by
+    // file_format and is not required to follow semver.
+    //
+    // Every file in this version of the dataset has the same file_format version.
+    string version = 2;
+  }
+
+  // The data storage format
+  //
+  // This specifies what format is used to store the data files.
+  DataStorageFormat data_format = 15;
+
+  // Table config.
+  //
+  // Keys with the prefix "lance." are reserved for the Lance library. Other
+  // libraries may wish to similarly prefix their configuration keys
+  // appropriately.
+  map<string, string> config = 16;
+
+  // Metadata associated with the table.
+  //
+  // This is a key-value map that can be used to store arbitrary metadata
+  // associated with the table.
+  //
+  // This is different than configuration, which is used to tell libraries how
+  // to read, write, or manage the table.
+  //
+  // This is different than schema metadata, which is used to describe the
+  // data itself and is attached to the output schema of scans.
+  map<string, string> table_metadata = 19;
+
+  // Field number 17 (`blob_dataset_version`) was used for a secondary blob dataset.
+  reserved 17;
+  reserved "blob_dataset_version";
+
+  // The base paths of data files.
+  //
+  // This is used to determine the base path of a data file. In common cases data file paths are under current dataset base path.
+  // But for shallow cloning, importing file and other multi-tier storage cases, the actual data files could be outside of the current dataset.
+  // This field is used with the `base_id` in `lance.file.File` and `lance.file.DeletionFile`.
+  //
+  // For example, given a dataset with base path `s3://bucket/dataset` and a DataFile with base_id 0, the actual data file path is:
+  // base_paths[id = 0] + /data/ + file.path
+  // The key (a.k.a. index) starts from 0 and increases by 1 for each new base path.
+  repeated BasePath base_paths = 18;
+
+  // The branch of the dataset. None means main branch.
+  optional string branch = 20;
+} // Manifest
+
+// external dataset base path
+message BasePath {
+  uint32 id = 1;
+  // This is an alias name of the base path, it is optional.
+  // When we use shallow clone and the target version is a tag, the tag name will be set here.
+  optional string name = 2;
+  // Flag indicating whether this path is a dataset root path or file directory:
+  // - true:  Path is a dataset root (actual files under subdirectories like `data`, `_deletions`)
+  // - false: Path is a direct file directory (scenario like importing files)
+  bool is_dataset_root = 3;
+  // Note: This absolute path will be directly used by Path::parse().
+  string path = 4;
+}
+
+// Auxiliary Data attached to a version.
+// Only load on-demand.
+message VersionAuxData {
+  // key-value metadata.
+  map<string, bytes> metadata = 3;
+}
+
+// Metadata describing an index.
+message IndexMetadata {
+  // Unique ID of an index. It is unique across all the dataset versions.
+  UUID uuid = 1;
+
+  // The columns to build the index. These refer to file.Field.id.
+  repeated int32 fields = 2;
+
+  // Index name. Must be unique within one dataset version.
+  string name = 3;
+
+  // The version of the dataset this index was built from.
+  uint64 dataset_version = 4;
+
+  // A bitmap of the included fragment ids.
+  //
+  // This may be used to determine how much of the dataset is covered by the
+  // index. This information can be retrieved from the dataset by looking at
+  // the dataset at `dataset_version`. However, since the old version may be
+  // deleted while the index is still in use, this information is also stored
+  // in the index.
+  //
+  // The bitmap is stored as a 32-bit Roaring bitmap.
+  bytes fragment_bitmap = 5;
+
+  // Details, specific to the index type, which are needed to load / interpret the index
+  //
+  // Indices should avoid putting large amounts of information in this field, as it will
+  // bloat the manifest.
+  //
+  // Indexes are plugins, and so the format of the details message is flexible and not fully
+  // defined by the table format.  However, there are some conventions that should be followed:
+  //
+  // - When Lance APIs refer to indexes they will use the type URL of the index details as the
+  //   identifier for the index type.  If a user provides a simple string identifier like
+  //   "btree" then it will be converted to "/lance.table.BTreeIndexDetails"
+  // - Type URL comparisons are case-insensitive.  Therefore, an index must have a unique type
+  //   URL ignoring case.
+  google.protobuf.Any index_details = 6;
+
+  // The minimum lance version that this index is compatible with.
+  optional int32 index_version = 7;
+
+  // Timestamp when the index was created (UTC timestamp in milliseconds since epoch)
+  //
+  // This field is optional for backward compatibility. For existing indices created before
+  // this field was added, this will be None/null.
+  optional uint64 created_at = 8;
+
+  // The base path index of the data file. Used when the file is imported or referred from another dataset.
+  // Lance uses it as the key of the base_paths field in Manifest to determine the actual base path of the data file.
+  optional uint32 base_id = 9;
+
+  // List of files and their sizes for this index segment.
+  // This enables skipping HEAD calls when opening indices and allows reporting
+  // of index sizes without extra IO.
+  // If this is empty, the index files sizes are unknown.
+  repeated IndexFile files = 10;
+}
+
+// Metadata about a single file within an index segment.
+message IndexFile {
+  // Path relative to the index directory (e.g., "index.idx", "auxiliary.idx")
+  string path = 1;
+  // Size of the file in bytes
+  uint64 size_bytes = 2;
+}
+
+// Index Section, containing a list of index metadata for one dataset version.
+message IndexSection {
+  repeated IndexMetadata indices = 1;
+}
+
+// A DataFragment is a set of files which represent the different columns of the same
+// rows. If a column exists in the schema of a dataset, but the file for that column does
+// not exist within a DataFragment of that dataset, that column consists entirely of
+// nulls.
+message DataFragment {
+  // The ID of a DataFragment is unique within a dataset.
+  uint64 id = 1;
+
+  repeated DataFile files = 2;
+
+  // File that indicates which rows, if any, should be considered deleted.
+  DeletionFile deletion_file = 3;
+
+  // TODO: What's the simplest way we can allow an inline tombstone bitmap?
+
+  // A serialized RowIdSequence message (see rowids.proto).
+  //
+  // These are the row ids for the fragment, in order of the rows as they appear.
+  // That is, if a fragment has 3 rows, and the row ids are [1, 42, 3], then the
+  // first row is row 1, the second row is row 42, and the third row is row 3.
+  oneof row_id_sequence {
+    // If small (< 200KB), the row ids are stored inline.
+    bytes inline_row_ids = 5;
+    // Otherwise, stored as part of a file.
+    ExternalFile external_row_ids = 6;
+  } // row_id_sequence
+
+  oneof last_updated_at_version_sequence {
+    // If small (< 200KB), the row latest updated versions are stored inline.
+    bytes inline_last_updated_at_versions = 7;
+    // Otherwise, stored as part of a file.
+    ExternalFile external_last_updated_at_versions = 8;
+  } // last_updated_at_version_sequence
+
+  oneof created_at_version_sequence {
+    // If small (< 200KB), the row created at versions are stored inline.
+    bytes inline_created_at_versions = 9;
+    // Otherwise, stored as part of a file.
+    ExternalFile external_created_at_versions = 10;
+  } // created_at_version_sequence
+
+  // Number of original rows in the fragment, this includes rows that are now marked with
+  // deletion tombstones. To compute the current number of rows, subtract
+  // `deletion_file.num_deleted_rows` from this value.
+  uint64 physical_rows = 4;
+}
+
+message DataFile {
+  // Path to the root relative to the dataset's URI.
+  string path = 1;
+  // The ids of the fields/columns in this file.
+  //
+  // When a DataFile object is created in memory, every value in fields is assigned -1 by
+  // default. An object with a value in fields of -1 must not be stored to disk. -2 is
+  // used for "tombstoned", meaning a field that is no longer in use. This is often
+  // because the original field id was reassigned to a different data file.
+  //
+  // In Lance v1 IDs are assigned based on position in the file, offset by the max
+  // existing field id in the table (if any already). So when a fragment is first created
+  // with one file of N columns, the field ids will be 1, 2, ..., N. If a second fragment
+  // is created with M columns, the field ids will be N+1, N+2, ..., N+M.
+  //
+  // In Lance v1 there is one field for each field in the input schema, this includes
+  // nested fields (both struct and list).  Fixed size list fields have only a single
+  // field id (these are not considered nested fields in Lance v1).
+  //
+  // This allows column indices to be calculated from field IDs and the input schema.
+  //
+  // In Lance v2 the field IDs generally follow the same pattern but there is no
+  // way to calculate the column index from the field ID.  This is because a given
+  // field could be encoded in many different ways, some of which occupy a different
+  // number of columns.  For example, a struct field could be encoded into N + 1 columns
+  // or it could be encoded into a single packed column.  To determine column indices
+  // the column_indices property should be used instead.
+  //
+  // In Lance v1 these ids must be sorted but might not always be contiguous.
+  repeated int32 fields = 2;
+  // The top-level column indices for each field in the file.
+  //
+  // If the data file is version 1 then this property will be empty
+  //
+  // Otherwise there must be one entry for each field in `fields`.
+  //
+  // Some fields may not correspond to a top-level column in the file.  In these cases
+  // the index will be -1.
+  //
+  // For example, consider the schema:
+  //
+  // - dimension: packed-struct (0):
+  //   - x: u32 (1)
+  //   - y: u32 (2)
+  // - path: `list<u32>` (3)
+  // - embedding: `fsl<768>` (4)
+  //   - fp64
+  // - borders: `fsl<4>` (5)
+  //   - simple-struct (6)
+  //     - margin: fp64 (7)
+  //     - padding: fp64 (8)
+  //
+  // One possible column indices array could be:
+  // [0, -1, -1, 1, 3, 4, 5, 6, 7]
+  //
+  // This reflects quite a few phenomena:
+  // - The packed struct is encoded into a single column and there is no top-level column
+  //   for the x or y fields
+  // - The variable sized list is encoded into two columns
+  // - The embedding is encoded into a single column (common for FSL of primitive) and there
+  //   is no "FSL column"
+  // - The borders field actually does have an "FSL column"
+  //
+  // The column indices table may not have duplicates (other than -1)
+  repeated int32 column_indices = 3;
+  // The major file version used to create the file
+  uint32 file_major_version = 4;
+  // The minor file version used to create the file
+  //
+  // If both `file_major_version` and `file_minor_version` are set to 0,
+  // then this is a version 0.1 or version 0.2 file.
+  uint32 file_minor_version = 5;
+
+  // The known size of the file on disk in bytes.
+  //
+  // This is used to quickly find the footer of the file.
+  //
+  // When this is zero, it should be interpreted as "unknown".
+  uint64 file_size_bytes = 6;
+
+  // The base path index of the data file. Used when the file is imported or referred from another dataset.
+  // Lance uses it as the key of the base_paths field in Manifest to determine the actual base path of the data file.
+  optional uint32 base_id = 7;
+} // DataFile
+
+// Deletion File
+//
+// The path of the deletion file is constructed as:
+//   {root}/_deletions/{fragment_id}-{read_version}-{id}.{extension}
+// where {extension} depends on DeletionFileType.
+message DeletionFile {
+  // Type of deletion file, intended as a way to increase efficiency of the storage of deleted row
+  // offsets. If there are sparsely deleted rows, then ARROW_ARRAY is the most efficient. If there
+  // are densely deleted rows, then BITMAP is the most efficient.
+  enum DeletionFileType {
+    // A single Int32Array of deleted row offsets, stored as an Arrow IPC file with one batch and
+    // one column. Has a .arrow extension.
+    ARROW_ARRAY = 0;
+    // A Roaring Bitmap of deleted row offsets. Has a .bin extension.
+    BITMAP = 1;
+  }
+
+  // Type of deletion file.
+  DeletionFileType file_type = 1;
+  // The version of the dataset this deletion file was built from.
+  uint64 read_version = 2;
+  // An opaque id used to differentiate this file from others written by concurrent
+  // writers.
+  uint64 id = 3;
+  // The number of rows that are marked as deleted.
+  uint64 num_deleted_rows = 4;
+  // The base path index of the deletion file. Used when the file is imported or referred from another
+  // dataset. Lance uses it as key of the base_paths field in Manifest to determine the actual base
+  // path of the deletion file.
+  optional uint32 base_id = 7;
+} // DeletionFile
+
+message ExternalFile {
+  // Path to the file, relative to the root of the table.
+  string path = 1;
+  // The byte offset in the file where the data starts.
+  uint64 offset = 2;
+  // The size of the data in the file, in bytes.
+  uint64 size = 3;
+}
+
+// VectorIndexDetails and HnswParameters (formerly HnswIndexDetails) moved to index.proto
+
+message FragmentReuseIndexDetails {
+
+  oneof content {
+    // if < 200KB, store the content inline, otherwise store the InlineContent bytes in external file
+    InlineContent inline = 1;
+    ExternalFile external = 2;
+  }
+
+  message InlineContent {
+    repeated Version versions = 1;
+  }
+
+  message FragmentDigest {
+    uint64 id = 1;
+
+    uint64 physical_rows = 2;
+
+    uint64 num_deleted_rows = 3;
+  }
+
+  // A summarized version of the RewriteGroup information in a Rewrite transaction
+  message Group {
+    // A roaring treemap of the changed row addresses.
+    // When combined with the old fragment IDs and new fragment IDs,
+    // it can recover the full mapping of old row addresses to either new row addresses or deleted.
+    // this mapping can then be used to remap indexes or satisfy index queries for the new unindexed fragments.
+    bytes changed_row_addrs = 1;
+
+    repeated FragmentDigest old_fragments = 2;
+
+    repeated FragmentDigest new_fragments = 3;
+  }
+
+  message Version {
+    // The dataset_version at the time the index adds this version entry
+    uint64 dataset_version = 1;
+
+    repeated Group groups = 3;
+  }
+}
+
+// ============================================================================
+// MemWAL Index Types
+// ============================================================================
+
+// Shard manifest containing epoch-based fencing and WAL state.
+// Each shard has exactly one active writer at any time.
+message ShardManifest {
+  // Shard identifier (UUID v4).
+  UUID shard_id = 11;
+
+  // Manifest version number.
+  // Matches the version encoded in the filename.
+  uint64 version = 1;
+
+  // Shard spec ID this shard was created with.
+  // Set at shard creation and immutable thereafter.
+  // A value of 0 indicates a manually-created shard not governed by any spec.
+  uint32 shard_spec_id = 10;
+
+  // Computed shard field values as raw Arrow scalar bytes, keyed by shard
+  // field id. The byte encoding follows Arrow's little-endian convention:
+  // int32 is 4 LE bytes, utf8 is raw UTF-8 bytes, etc. The receiver looks
+  // up the result_type from the ShardingSpec to interpret each value.
+  repeated ShardFieldEntry shard_field_entries = 14;
+
+  // Writer fencing token - monotonically increasing.
+  // A writer must increment this when claiming the shard.
+  uint64 writer_epoch = 2;
+
+  // The most recent WAL entry position that has been flushed to a MemTable.
+  // During recovery, replay starts from replay_after_wal_entry_position + 1.
+  // WAL positions are 1-based, so the default value 0 unambiguously means
+  // "no flush has ever stamped this shard" and recovery replays from 1.
+  uint64 replay_after_wal_entry_position = 3;
+
+  // The most recent WAL entry position observed at the time the manifest was
+  // updated. WAL positions are 1-based; default 0 means no entry has been
+  // written yet. This is a hint, not authoritative - recovery must list
+  // files to find actual state.
+  uint64 wal_entry_position_last_seen = 4;
+
+  // Next generation ID to create (incremented after each MemTable flush).
+  uint64 current_generation = 6;
+
+  // Field 7 removed: merged_generation moved to MemWalIndexDetails.merged_generations
+  // which is the authoritative source for merge progress.
+
+  // List of flushed MemTable generations and their directory paths.
+  repeated FlushedGeneration flushed_generations = 8;
+}
+
+// A shard field value stored as raw Arrow scalar bytes.
+message ShardFieldEntry {
+  // Shard field id (matches ShardingField.field_id in the ShardingSpec).
+  string field_id = 1;
+
+  // Raw Arrow scalar value bytes in little-endian encoding.
+  // The data type is determined by the result_type of the matching ShardingField.
+  bytes value = 2;
+}
+
+// A flushed MemTable generation and its storage location.
+message FlushedGeneration {
+  // Generation number.
+  uint64 generation = 1;
+
+  // Directory name relative to the shard directory.
+  string path = 2;
+}
+
+// A shard's merged generation, used in MemWalIndexDetails.
+message MergedGeneration {
+  // Shard identifier (UUID v4).
+  UUID shard_id = 1;
+
+  // Last generation merged to base table for this shard.
+  uint64 generation = 2;
+}
+
+// Tracks which merged generation a base table index has been rebuilt to cover.
+// Used to determine whether to read from flushed MemTable indexes or base table.
+message IndexCatchupProgress {
+  // Name of the base table index (must match an entry in maintained_indexes).
+  string index_name = 1;
+
+  // Per-shard progress: the generation up to which this index covers.
+  // If a shard is not present, the index is assumed to be fully caught up
+  // (i.e., caught_up_generation >= merged_generation for that shard).
+  repeated MergedGeneration caught_up_generations = 2;
+}
+
+// Index details for MemWAL Index, stored in IndexMetadata.index_details.
+// This is the centralized structure for all MemWAL metadata:
+// - Configuration (sharding specs, indexes to maintain)
+// - Merge progress (merged generations per shard)
+// - Shard state snapshots
+//
+// Writers read this index to get configuration before writing.
+// Readers may use shard snapshots in this index as a point-in-time
+// optimization. Readers that need the latest shard set should list shard
+// directories in storage and read each shard's latest manifest.
+// A background process updates the index periodically to keep shard snapshots current.
+//
+// Shard snapshots are stored as a Lance file with one row per shard.
+// The schema records shard discovery fields. Full mutable shard state remains
+// authoritative in the shard manifest files.
+//   shard_id: utf8
+//   shard_spec_id: uint32
+//   shard_field_{field_id}: typed per the matching ShardingField.result_type
+message MemWalIndexDetails {
+  // Snapshot timestamp (Unix timestamp in milliseconds).
+  int64 snapshot_ts_millis = 1;
+
+  // Number of shards in the snapshot.
+  // Used to determine storage format without reading the snapshot data.
+  uint32 num_shards = 2;
+
+  // Inline shard snapshots for small shard counts.
+  // When num_shards <= threshold (implementation-defined, e.g., 100),
+  // snapshots are stored inline as serialized bytes.
+  // Format: Lance file bytes with the shard snapshot schema.
+  optional bytes inline_snapshots = 3;
+
+  // Sharding specs defining how to derive shard identifiers.
+  // This configuration determines how rows are partitioned into shards.
+  repeated ShardingSpec sharding_specs = 7;
+
+  // Indexes from the base table to maintain in MemTables.
+  // These are index names referencing indexes defined on the base table.
+  // The primary key btree index is always maintained implicitly and
+  // should not be listed here.
+  //
+  // For vector indexes, MemTables inherit quantization parameters (PQ codebook,
+  // SQ params) from the base table index to ensure distance comparability.
+  repeated string maintained_indexes = 8;
+
+  // Last generation merged to base table for each shard.
+  // This is updated atomically with merge-insert data commits, enabling
+  // conflict resolution when multiple mergers operate concurrently.
+  //
+  // Note: This is separate from shard snapshots because:
+  // 1. merged_generations is updated by mergers (atomic with data commit)
+  // 2. shard snapshots are updated by background index builder
+  repeated MergedGeneration merged_generations = 9;
+
+  // Per-index catchup progress tracking.
+  // When data is merged to the base table, base table indexes are rebuilt
+  // asynchronously. This field tracks which generation each index covers.
+  //
+  // For indexed queries, if an index's caught_up_generation < merged_generation,
+  // readers should use flushed MemTable indexes for the gap instead of
+  // scanning unindexed data in the base table.
+  //
+  // If an index is not present in this list, it is assumed to be fully caught up.
+  repeated IndexCatchupProgress index_catchup = 10;
+
+  // Default ShardWriter configuration values for this MemWAL index.
+  //
+  // A free-form string map persisted so that every writer — across
+  // processes and restarts — starts from the same default writer
+  // configuration. These are defaults only: an individual writer may
+  // still override any value at runtime in its own ShardWriterConfig
+  // (which is not persisted).
+  map<string, string> writer_config_defaults = 11;
+}
+
+// Sharding spec definition.
+message ShardingSpec {
+  // Unique identifier for this spec within the index.
+  // IDs are never reused.
+  uint32 spec_id = 1;
+
+  // Sharding field definitions that determine how to compute shard identifiers.
+  repeated ShardingField fields = 2;
+}
+
+// Sharding field definition.
+message ShardingField {
+  // Unique string identifier for this shard field.
+  string field_id = 1;
+
+  // Field IDs referencing source columns in the schema.
+  repeated int32 source_ids = 2;
+
+  // Well-known shard transform name (e.g., "identity", "year", "bucket").
+  // Mutually exclusive with expression.
+  optional string transform = 3;
+
+  // DataFusion SQL expression for custom logic.
+  // Mutually exclusive with transform.
+  optional string expression = 4;
+
+  // Output type of the shard value (Arrow type name).
+  string result_type = 5;
+
+  // Transform parameters (e.g., num_buckets for bucket transform).
+  map<string, string> parameters = 6;
+}
--- a/vendor/lance-table/protos/table_identifier.proto
+++ b/vendor/lance-table/protos/table_identifier.proto
@ -0,0 +1,19 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+package lance.datafusion;
+
+// Identifies a Lance dataset for remote reconstruction.
+//
+// Two modes:
+//   1. uri + serialized_manifest (fast): remote executor skips manifest read.
+//   2. uri + version + etag (lightweight): remote executor loads manifest from storage.
+message TableIdentifier {
+  string uri = 1;
+  uint64 version = 2;
+  optional string manifest_etag = 3;
+  optional bytes serialized_manifest = 4;
+  map<string, string> storage_options = 5;
+}
--- a/vendor/lance-table/protos/transaction.proto
+++ b/vendor/lance-table/protos/transaction.proto
@ -0,0 +1,354 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+import "file.proto";
+import "table.proto";
+import "google/protobuf/any.proto";
+
+package lance.table;
+
+// A transaction represents the changes to a dataset.
+//
+// This has two purposes:
+// 1. When retrying a commit, the transaction can be used to re-build an updated
+//    manifest.
+// 2. When there's a conflict, this can be used to determine whether the other
+//    transaction is compatible with this one.
+message Transaction {
+  // The version of the dataset this transaction was built from.
+  //
+  // For example, for a delete transaction this means the version of the dataset
+  // that was read from while evaluating the deletion predicate.
+  uint64 read_version = 1;
+
+  // The UUID that uniquely identifies a transaction.
+  string uuid = 2;
+
+  // Optional version tag.
+  string tag = 3;
+
+  // Optional properties for the transaction
+  // __lance_commit_message is a reserved key
+  map<string, string> transaction_properties = 4;
+
+  // Add new rows to the dataset.
+  message Append {
+    // The new fragments to append.
+    //
+    // Fragment IDs are not yet assigned.
+    repeated DataFragment fragments = 1;
+  }
+
+  // Mark rows as deleted.
+  message Delete {
+    // The fragments to update
+    //
+    // The fragment IDs will match existing fragments in the dataset.
+    repeated DataFragment updated_fragments = 1;
+    // The fragments to delete entirely.
+    repeated uint64 deleted_fragment_ids = 2;
+    // The predicate that was evaluated
+    //
+    // This may be used to determine whether the delete would have affected 
+    // files written by a concurrent transaction.
+    string predicate = 3;
+  }
+
+  // Create or overwrite the entire dataset.
+  message Overwrite {
+    // The new fragments
+    //
+    // Fragment IDs are not yet assigned.
+    repeated DataFragment fragments = 1;
+    // The new schema
+    repeated lance.file.Field schema = 2;
+    // Schema metadata.
+    map<string, bytes> schema_metadata = 3;
+    // Key-value pairs to merge with existing config.
+    map<string, string> config_upsert_values = 4;
+    // The base paths to be added for the initial dataset creation
+    repeated BasePath initial_bases = 5;
+  }
+
+  // Add or replace a new secondary index.
+  //
+  // This is also used to remove an index (we are replacing it with nothing)
+  //
+  // - new_indices: the modified indices, empty if dropping indices only
+  // - removed_indices: the indices that are being replaced
+  message CreateIndex {
+    repeated IndexMetadata new_indices = 1;
+    repeated IndexMetadata removed_indices = 2;
+  }
+
+  // An operation that rewrites but does not change the data in the table. These
+  // kinds of operations just rearrange data.
+  message Rewrite {
+    // The old fragments that are being replaced
+    //
+    // DEPRECATED: use groups instead.
+    //
+    // These should all have existing fragment IDs.
+    repeated DataFragment old_fragments = 1;
+    // The new fragments
+    //
+    // DEPRECATED: use groups instead.
+    //
+    // These fragments IDs are not yet assigned.
+    repeated DataFragment new_fragments = 2;
+
+    // During a rewrite an index may be rewritten.  We only serialize the UUID
+    // since a rewrite should not change the other index parameters.
+    message RewrittenIndex {
+      // The id of the index that will be replaced
+      UUID old_id = 1;
+      // the id of the new index
+      UUID new_id = 2;
+      // the new index details
+      google.protobuf.Any new_index_details = 3;
+      // the version of the new index
+      uint32 new_index_version = 4;
+      // Files in the new index with their sizes.
+      // Empty if file sizes are not available (e.g. older writers).
+      repeated IndexFile new_index_files = 5;
+    }
+
+    // A group of rewrite files that are all part of the same rewrite.
+    message RewriteGroup {
+      // The old fragments that are being replaced
+      //
+      // These should have existing fragment IDs.
+      repeated DataFragment old_fragments = 1;
+      // The new fragments
+      //
+      // The IDs should have been reserved by an earlier
+      // reserve operation
+      repeated DataFragment new_fragments = 2;
+    }
+
+    // Groups of files that have been rewritten
+    repeated RewriteGroup groups = 3;
+    // Indices that have been rewritten
+    repeated RewrittenIndex rewritten_indices = 4;
+  }
+
+  // An operation that merges in a new column, altering the schema.
+  message Merge {
+    // The updated fragments
+    //
+    // These should all have existing fragment IDs.
+    repeated DataFragment fragments = 1;
+    // The new schema
+    repeated lance.file.Field schema = 2;
+    // Schema metadata.
+    map<string, bytes> schema_metadata = 3;
+  }
+
+  // An operation that projects a subset of columns, altering the schema.
+  message Project {
+    // The new schema
+    repeated lance.file.Field schema = 1;
+  }
+
+  // An operation that restores a dataset to a previous version.
+  message Restore {
+    // The version to restore to
+    uint64 version = 1;
+  }
+
+  // An operation that reserves fragment ids for future use in
+  // a rewrite operation.
+  message ReserveFragments {
+    uint32 num_fragments = 1;
+  }
+
+  // An operation that clones a dataset.
+  message Clone {
+    // - true:  Performs a metadata-only clone (copies manifest without data files).
+    //          The cloned dataset references original data through `base_paths`,
+    //          suitable for experimental scenarios or rapid metadata migration.
+    // - false: Performs a full deep clone using the underlying object storage's native
+    //          copy API (e.g., S3 CopyObject, GCS rewrite). This leverages server-side
+    //          bulk copy operations to bypass download/upload bottlenecks, achieving
+    //          near-linear speedup for large datasets (typically 3-10x faster than
+    //          manual file transfers). The operation maintains atomicity and data
+    //          integrity guarantees provided by the storage backend.
+    bool is_shallow = 1;
+    // the reference name in the source dataset
+    // in most cases it should be the branch or tag name in the source dataset
+    optional string ref_name = 2;
+    // the version of the source dataset for cloning
+    uint64 ref_version = 3;
+    // the absolute base path of the source dataset for cloning
+    string ref_path = 4;
+    // if the target dataset is a branch, this is the branch name of the target dataset
+    optional string branch_name = 5;
+  }
+  
+  // Exact set of key hashes for conflict detection.
+  // Used when the number of inserted rows is small.
+  message ExactKeySetFilter {
+    // 64-bit hashes of the inserted row keys.
+    repeated uint64 key_hashes = 1;
+  }
+
+  // Bloom filter for key existence tests.
+  // Used when the number of rows is large.
+  message BloomFilter {
+    // Bitset backing the bloom filter (SBBF format).
+    bytes bitmap = 1;
+    // Number of bits in the bitmap.
+    uint32 num_bits = 2;
+    // Number of items the filter was sized for.
+    // Used for intersection validation (filters with different sizes cannot be compared).
+    // Default: 8192
+    uint64 number_of_items = 3;
+    // False positive probability the filter was sized for.
+    // Used for intersection validation (filters with different parameters cannot be compared).
+    // Default: 0.00057
+    double probability = 4;
+  }
+
+  // A filter for checking key existence in the set of rows inserted by a merge insert operation.
+  // Only created when the merge insert's ON columns match the schema's unenforced primary key.
+  // The presence of this filter indicates strict primary key conflict detection should be used.
+  // Can use either an exact set (for small row counts) or a Bloom filter (for large row counts).
+  message KeyExistenceFilter {
+    // Field IDs of columns participating in the key (must match unenforced primary key).
+    repeated int32 field_ids = 1;
+    // The underlying data structure storing the key hashes.
+    oneof data {
+      // Exact set of key hashes (used for small number of rows).
+      ExactKeySetFilter exact = 2;
+      // Bloom filter (used for large number of rows).
+      BloomFilter bloom = 3;
+    }
+  }
+
+  // Serialized as sorted distinct local physical row offsets within the fragment (0-based).
+  message UInt32List {
+    repeated uint32 values = 1;
+  }
+
+  // An operation that updates rows but does not add or remove rows.
+  message Update {
+    // The fragments that have been removed. These are fragments where all rows
+    // have been updated and moved to a new fragment.
+    repeated uint64 removed_fragment_ids = 1;
+    // The fragments that have been updated.
+    repeated DataFragment updated_fragments = 2;
+    // The new fragments where updated rows have been moved to.
+    repeated DataFragment new_fragments = 3;
+    // The ids of the fields that have been modified.
+    repeated uint32 fields_modified = 4;
+    /// List of MemWAL shard generations to mark as merged after this transaction
+    repeated MergedGeneration merged_generations = 5;
+    /// The fields that are used to judge whether to preserve the new fragment's id in
+    /// the fragment bitmap of the specified indices.
+    repeated uint32 fields_for_preserving_frag_bitmap = 6;
+    // The mode of update
+    UpdateMode update_mode = 7;
+    // Filter for checking existence of keys in newly inserted rows, used for conflict detection.
+    // Only tracks keys from INSERT operations during merge insert, not updates.
+    optional KeyExistenceFilter inserted_rows = 8;
+    // Per-fragment physical row offsets that matched an update_columns hash join (RewriteColumns).
+    map<uint64, UInt32List> updated_fragment_offsets = 9;
+  }
+
+  // The mode of update operation
+  enum UpdateMode {
+
+    /// rows are deleted in current fragments and rewritten in new fragments.
+    /// This is most optimal when the majority of columns are being rewritten
+    /// or only a few rows are being updated.
+    REWRITE_ROWS = 0;
+
+    /// within each fragment, columns are fully rewritten and inserted as new data files.
+    /// Old versions of columns are tombstoned. This is most optimal when most rows are affected
+    /// but a small subset of columns are affected.
+    REWRITE_COLUMNS = 1;
+  }
+
+  // An entry for a map update. If value is not set, the key will be removed from the map.
+  message UpdateMapEntry {
+    // The key of the map entry to update.
+    string key = 1;
+    // The value to set for the key.
+    optional string value = 2;
+  }
+
+  message UpdateMap {
+    repeated UpdateMapEntry update_entries = 1;
+    // If true, the map will be replaced entirely with the new entries.
+    // If false, the new entries will be merged with the existing map.
+    bool replace = 2;
+  }
+  
+  // An operation that updates the table config, table metadata, schema metadata,
+  // or field metadata.
+  message UpdateConfig {
+    UpdateMap config_updates = 6;
+    UpdateMap table_metadata_updates = 7;
+    UpdateMap schema_metadata_updates = 8;
+    map<int32, UpdateMap> field_metadata_updates = 9;
+
+    // Deprecated -------------------------------
+    map<string, string> upsert_values = 1;
+    repeated string delete_keys = 2;
+    map<string, string> schema_metadata = 3;
+    map<uint32, FieldMetadataUpdate> field_metadata = 4;
+
+    message FieldMetadataUpdate {
+      map<string, string> metadata = 5;
+    }
+  }
+
+  message DataReplacementGroup {
+    uint64 fragment_id = 1;
+    DataFile new_file = 2;
+  }
+
+  // An operation that replaces the data in a region of the table with new data.
+  message DataReplacement {
+    repeated DataReplacementGroup replacements = 1;
+  }
+
+  // Update the merged generations in MemWAL index.
+  // This operation is used during merge-insert to atomically record which
+  // generations have been merged to the base table.
+  message UpdateMemWalState {
+    // Shards and generations being marked as merged.
+    repeated MergedGeneration merged_generations = 1;
+  }
+
+  // An operation that updates base paths in the dataset.
+  message UpdateBases {
+    // The new base paths to add to the manifest.
+    repeated BasePath new_bases = 1;
+  }
+
+  // The operation of this transaction.
+  oneof operation {
+    Append append = 100;
+    Delete delete = 101;
+    Overwrite overwrite = 102;
+    CreateIndex create_index = 103;
+    Rewrite rewrite = 104;
+    Merge merge = 105;
+    Restore restore = 106;
+    ReserveFragments reserve_fragments = 107;
+    Update update = 108;
+    Project project = 109;
+    UpdateConfig update_config = 110;
+    DataReplacement data_replacement = 111;
+    UpdateMemWalState update_mem_wal_state = 112;
+    Clone clone = 113;
+    UpdateBases update_bases = 114;
+  }
+
+  // Fields 200/202 (`blob_append` / `blob_overwrite`) previously represented blob dataset ops.
+  reserved 200, 202;
+  reserved "blob_append", "blob_overwrite";
+}
--- a/vendor/lance-table/src/feature_flags.rs
+++ b/vendor/lance-table/src/feature_flags.rs
@ -0,0 +1,184 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Feature flags
+
+use crate::format::Manifest;
+use lance_core::{Error, Result};
+
+/// Fragments may carry deletion files, i.e. the tombstones recorded for
+/// soft-deleted rows.
+pub const FLAG_DELETION_FILES: u64 = 0x01;
+/// Row ids stay stable across both moves and updates. Fragments contain an
+/// index that maps row ids to row addresses.
+pub const FLAG_STABLE_ROW_IDS: u64 = 0x02;
+/// Files are written with the new v2 format (this flag is no longer used)
+pub const FLAG_USE_V2_FORMAT_DEPRECATED: u64 = 0x04;
+/// A table config is present
+pub const FLAG_TABLE_CONFIG: u64 = 0x08;
+/// The dataset uses multiple base paths (shallow clones or multi-base datasets)
+pub const FLAG_BASE_PATHS: u64 = 0x10;
+/// Do not write a transaction file under _transaction/; set when only the inline transaction in the manifest should be written
+pub const FLAG_DISABLE_TRANSACTION_FILE: u64 = 0x20;
+/// The first bit that is not a known feature flag
+pub const FLAG_UNKNOWN: u64 = 0x40;
+
+/// Recompute the reader and writer feature flags of `manifest` from its contents.
+///
+/// Both flag words are cleared first, so previously set flags are dropped,
+/// including when this function goes on to return an error.
+///
+/// * `enable_stable_row_id` - require stable row ids even if no fragment
+///   carries row id metadata yet.
+/// * `disable_transaction_file` - set `FLAG_DISABLE_TRANSACTION_FILE` in the
+///   writer flags.
+///
+/// # Errors
+///
+/// Returns an invalid-input error when stable row ids apply (requested, or
+/// present on at least one fragment) but some fragment has no row id metadata.
+pub fn apply_feature_flags(
+    manifest: &mut Manifest,
+    enable_stable_row_id: bool,
+    disable_transaction_file: bool,
+) -> Result<()> {
+    // Start from a clean slate; everything below only ORs bits in.
+    manifest.reader_feature_flags = 0;
+    manifest.writer_feature_flags = 0;
+
+    // Deletion files must be understood by readers and writers alike.
+    if manifest
+        .fragments
+        .iter()
+        .any(|fragment| fragment.deletion_file.is_some())
+    {
+        manifest.reader_feature_flags |= FLAG_DELETION_FILES;
+        manifest.writer_feature_flags |= FLAG_DELETION_FILES;
+    }
+
+    // Row ids are all-or-nothing: once one fragment has them, every fragment must.
+    let fragments_with_row_ids = manifest
+        .fragments
+        .iter()
+        .filter(|fragment| fragment.row_id_meta.is_some())
+        .count();
+    if fragments_with_row_ids > 0 || enable_stable_row_id {
+        if fragments_with_row_ids != manifest.fragments.len() {
+            return Err(Error::invalid_input("All fragments must have row ids"));
+        }
+        manifest.reader_feature_flags |= FLAG_STABLE_ROW_IDS;
+        manifest.writer_feature_flags |= FLAG_STABLE_ROW_IDS;
+    }
+
+    // A non-empty table config only concerns writers.
+    let has_table_config = !manifest.config.is_empty();
+    if has_table_config {
+        manifest.writer_feature_flags |= FLAG_TABLE_CONFIG;
+    }
+
+    // Multiple base paths (shallow clones or multi-base datasets) concern both sides.
+    let has_base_paths = !manifest.base_paths.is_empty();
+    if has_base_paths {
+        manifest.reader_feature_flags |= FLAG_BASE_PATHS;
+        manifest.writer_feature_flags |= FLAG_BASE_PATHS;
+    }
+
+    if disable_transaction_file {
+        manifest.writer_feature_flags |= FLAG_DISABLE_TRANSACTION_FILE;
+    }
+
+    Ok(())
+}
+
+/// Returns `true` when `reader_flags` is strictly below [`FLAG_UNKNOWN`],
+/// i.e. no bit at or above the first unknown flag is set.
+pub fn can_read_dataset(reader_flags: u64) -> bool {
+    (0..FLAG_UNKNOWN).contains(&reader_flags)
+}
+
+/// Returns `true` when `writer_flags` is strictly below [`FLAG_UNKNOWN`],
+/// i.e. no bit at or above the first unknown flag is set.
+pub fn can_write_dataset(writer_flags: u64) -> bool {
+    FLAG_UNKNOWN > writer_flags
+}
+
+/// Returns `true` when the [`FLAG_USE_V2_FORMAT_DEPRECATED`] bit is set in `writer_flags`.
+pub fn has_deprecated_v2_feature_flag(writer_flags: u64) -> bool {
+    let deprecated_bit = writer_flags & FLAG_USE_V2_FORMAT_DEPRECATED;
+    deprecated_bit != 0
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::format::BasePath;
+
+    /// Every individual flag this version knows about.
+    const KNOWN_FLAGS: [u64; 6] = [
+        FLAG_DELETION_FILES,
+        FLAG_STABLE_ROW_IDS,
+        FLAG_USE_V2_FORMAT_DEPRECATED,
+        FLAG_TABLE_CONFIG,
+        FLAG_BASE_PATHS,
+        FLAG_DISABLE_TRANSACTION_FILE,
+    ];
+
+    #[test]
+    fn test_read_check() {
+        assert!(can_read_dataset(0));
+        for flag in KNOWN_FLAGS {
+            assert!(can_read_dataset(flag));
+        }
+        // A combination of known flags is still readable.
+        let combined = FLAG_DELETION_FILES | FLAG_STABLE_ROW_IDS | FLAG_USE_V2_FORMAT_DEPRECATED;
+        assert!(can_read_dataset(combined));
+        // The first unknown bit is not.
+        assert!(!can_read_dataset(FLAG_UNKNOWN));
+    }
+
+    #[test]
+    fn test_write_check() {
+        assert!(can_write_dataset(0));
+        for flag in KNOWN_FLAGS {
+            assert!(can_write_dataset(flag));
+        }
+        // A combination of known flags is still writable.
+        let combined = FLAG_DELETION_FILES
+            | FLAG_STABLE_ROW_IDS
+            | FLAG_USE_V2_FORMAT_DEPRECATED
+            | FLAG_TABLE_CONFIG
+            | FLAG_BASE_PATHS;
+        assert!(can_write_dataset(combined));
+        // The first unknown bit is not.
+        assert!(!can_write_dataset(FLAG_UNKNOWN));
+    }
+
+    #[test]
+    fn test_base_paths_feature_flags() {
+        use crate::format::{DataStorageFormat, Manifest};
+        use arrow_schema::{Field as ArrowField, Schema as ArrowSchema};
+        use lance_core::datatypes::Schema;
+        use std::collections::HashMap;
+        use std::sync::Arc;
+
+        // A minimal single-column schema shared by both manifests.
+        let test_field = ArrowField::new("test_field", arrow_schema::DataType::Int64, false);
+        let arrow_schema = ArrowSchema::new(vec![test_field]);
+        let schema = Schema::try_from(&arrow_schema).unwrap();
+
+        // Case 1: no base_paths, so FLAG_BASE_PATHS must stay clear.
+        let mut plain = Manifest::new(
+            schema.clone(),
+            Arc::new(vec![]),
+            DataStorageFormat::default(),
+            HashMap::new(),
+        );
+        apply_feature_flags(&mut plain, false, false).unwrap();
+        assert_eq!(plain.reader_feature_flags & FLAG_BASE_PATHS, 0);
+        assert_eq!(plain.writer_feature_flags & FLAG_BASE_PATHS, 0);
+
+        // Case 2: one base path (shallow clone or multi-base), so the flag must be set.
+        let original = BasePath::new(
+            1,
+            "file:///path/to/original".to_string(),
+            Some("test_ref".to_string()),
+            true,
+        );
+        let mut base_paths: HashMap<u32, BasePath> = HashMap::new();
+        base_paths.insert(1, original);
+        let mut multi_base = Manifest::new(
+            schema,
+            Arc::new(vec![]),
+            DataStorageFormat::default(),
+            base_paths,
+        );
+        apply_feature_flags(&mut multi_base, false, false).unwrap();
+        assert_ne!(multi_base.reader_feature_flags & FLAG_BASE_PATHS, 0);
+        assert_ne!(multi_base.writer_feature_flags & FLAG_BASE_PATHS, 0);
+    }
+}
--- a/vendor/lance-table/src/format.rs
+++ b/vendor/lance-table/src/format.rs
@ -0,0 +1,70 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use arrow_buffer::ToByteSlice;
+use uuid::Uuid;
+
+mod fragment;
+mod index;
+mod manifest;
+mod transaction;
+
+pub use crate::rowids::version::{
+    RowDatasetVersionMeta, RowDatasetVersionRun, RowDatasetVersionSequence,
+};
+pub use fragment::*;
+pub use index::{IndexFile, IndexMetadata, index_metadata_codec, list_index_files_with_sizes};
+
+pub use manifest::{
+    BasePath, DETACHED_VERSION_MASK, DataStorageFormat, Manifest, SelfDescribingFileReader,
+    WriterVersion, is_detached_version,
+};
+pub use transaction::Transaction;
+
+use lance_core::{Error, Result};
+
+// In 0.36.1 we renamed Index to IndexMetadata because Index conflicted too much with the
+// Index trait.  This is left in for backward compatibility.
+#[deprecated(since = "0.36.1", note = "Use IndexMetadata instead")]
+pub type Index = IndexMetadata;
+
+/// Protobuf definitions for Lance Format, pulled in at compile time from `lance.table.rs` under `OUT_DIR`.
+pub mod pb {
+    #![allow(clippy::all)] // lints below are silenced for the whole included body
+    #![allow(non_upper_case_globals)]
+    #![allow(non_camel_case_types)]
+    #![allow(non_snake_case)]
+    #![allow(unused)]
+    #![allow(improper_ctypes)]
+    #![allow(clippy::upper_case_acronyms)]
+    #![allow(clippy::use_self)]
+    include!(concat!(env!("OUT_DIR"), "/lance.table.rs")); // presumably generated from the .proto files by the build script (confirm in build.rs)
+}
+
+/// These version/magic values (`MAJOR_VERSION`, `MINOR_VERSION`, `MAGIC`) are written at the end of manifest files (e.g. versions/1.version)
+pub const MAJOR_VERSION: i16 = 0;
+pub const MINOR_VERSION: i16 = 1;
+pub const MAGIC: &[u8; 4] = b"LANC"; // four ASCII bytes
+
+impl TryFrom<&pb::Uuid> for Uuid {
+    type Error = Error;
+
+    /// Convert a protobuf UUID message into a [`Uuid`].
+    ///
+    /// # Errors
+    ///
+    /// Returns an invalid-input error unless the payload is exactly 16 bytes long.
+    fn try_from(p: &pb::Uuid) -> Result<Self> {
+        // The slice-to-array conversion succeeds only for a length of exactly 16.
+        let Ok(raw) = <[u8; 16]>::try_from(p.uuid.to_byte_slice()) else {
+            return Err(Error::invalid_input(
+                "Protobuf UUID is malformed".to_string(),
+            ));
+        };
+        Ok(Self::from_bytes(raw))
+    }
+}
+
+impl From<&Uuid> for pb::Uuid {
+    /// Copy the 16 raw bytes of `value` into a protobuf UUID message.
+    fn from(value: &Uuid) -> Self {
+        let uuid = value.as_bytes().to_vec();
+        Self { uuid }
+    }
+}
--- a/vendor/lance-table/src/format/fragment.rs
+++ b/vendor/lance-table/src/format/fragment.rs
@ -0,0 +1,841 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use std::collections::HashMap;
+use std::num::NonZero;
+use std::sync::Arc;
+
+use deepsize::DeepSizeOf;
+use lance_core::Error;
+use lance_file::format::{MAJOR_VERSION, MINOR_VERSION};
+use lance_file::version::LanceFileVersion;
+use lance_io::utils::CachedFileSize;
+use object_store::path::Path;
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
+
+use crate::format::pb;
+
+use crate::rowids::version::{
+    RowDatasetVersionMeta, created_at_version_meta_to_pb, last_updated_at_version_meta_to_pb,
+};
+use lance_core::datatypes::Schema;
+use lance_core::error::Result;
+
+/// Lance Data File
+///
+/// A data file is a single file that stores data.
+#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
+pub struct DataFile {
+    /// Relative path of the data file to dataset root.
+    pub path: String,
+    /// The ids of fields in this file.
+    ///
+    /// When identical across many fragments (common case), multiple `DataFile`
+    /// instances share a single heap allocation via `Arc`, significantly
+    /// reducing manifest memory for large tables.
+    pub fields: Arc<[i32]>,
+    /// The offsets of the fields listed in `fields`, empty in v1 files
+    ///
+    /// Note that -1 is a possibility and it indicates that the field has
+    /// no top-level column in the file.
+    ///
+    /// Columns that lack a field id may still exist as extra entries in
+    /// `column_indices`; such columns are ignored by field-id–based projection.
+    /// For example, some fields, such as blob fields, occupy multiple
+    /// columns in the file but only have a single field id.
+    pub column_indices: Arc<[i32]>,
+    /// The major version of the file format used to write this file.
+    pub file_major_version: u32,
+    /// The minor version of the file format used to write this file.
+    pub file_minor_version: u32,
+
+    /// The size of the file in bytes, if known.
+    pub file_size_bytes: CachedFileSize,
+
+    /// The base path of the datafile, when the datafile is outside the dataset.
+    pub base_id: Option<u32>,
+}
+
+// Custom Serialize: emit the Arc<[i32]> lists as plain slices so the JSON output is transparent
+impl Serialize for DataFile {
+    fn serialize<S: Serializer>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> {
+        use serde::ser::SerializeStruct;
+        let mut s = serializer.serialize_struct("DataFile", 7)?; // 7 = number of fields written below
+        s.serialize_field("path", &self.path)?;
+        s.serialize_field("fields", self.fields.as_ref())?;
+        s.serialize_field("column_indices", self.column_indices.as_ref())?;
+        s.serialize_field("file_major_version", &self.file_major_version)?;
+        s.serialize_field("file_minor_version", &self.file_minor_version)?;
+        s.serialize_field("file_size_bytes", &self.file_size_bytes)?;
+        s.serialize_field("base_id", &self.base_id)?;
+        s.end()
+    }
+}
+
+// Custom Deserialize: decode into an owned helper holding `Vec<i32>` lists,
+// then move those lists into the shared `Arc<[i32]>` representation.
+impl<'de> Deserialize<'de> for DataFile {
+    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<Self, D::Error> {
+        // Plain-data mirror of `DataFile` that serde can derive for directly.
+        #[derive(Deserialize)]
+        struct DataFileHelper {
+            path: String,
+            fields: Vec<i32>,
+            #[serde(default)]
+            column_indices: Vec<i32>,
+            #[serde(default)]
+            file_major_version: u32,
+            #[serde(default)]
+            file_minor_version: u32,
+            file_size_bytes: CachedFileSize,
+            base_id: Option<u32>,
+        }
+
+        let DataFileHelper {
+            path,
+            fields,
+            column_indices,
+            file_major_version,
+            file_minor_version,
+            file_size_bytes,
+            base_id,
+        } = DataFileHelper::deserialize(deserializer)?;
+
+        Ok(Self {
+            path,
+            fields: fields.into(),
+            column_indices: column_indices.into(),
+            file_major_version,
+            file_minor_version,
+            file_size_bytes,
+            base_id,
+        })
+    }
+}
+
+impl DataFile {
+    /// Build a `DataFile` from fully specified parts.
+    ///
+    /// The field-id and column-index vectors are moved into `Arc<[i32]>` slices.
+    pub fn new(
+        path: impl Into<String>,
+        fields: Vec<i32>,
+        column_indices: Vec<i32>,
+        file_major_version: u32,
+        file_minor_version: u32,
+        file_size_bytes: Option<NonZero<u64>>,
+        base_id: Option<u32>,
+    ) -> Self {
+        let path = path.into();
+        let fields: Arc<[i32]> = fields.into();
+        let column_indices: Arc<[i32]> = column_indices.into();
+        Self {
+            path,
+            fields,
+            column_indices,
+            file_major_version,
+            file_minor_version,
+            file_size_bytes: file_size_bytes.into(),
+            base_id,
+        }
+    }
+
+    /// Create a new `DataFile` whose fields and column_indices are expected to be filled in later
+    pub fn new_unstarted(
+        path: impl Into<String>,
+        file_major_version: u32,
+        file_minor_version: u32,
+    ) -> Self {
+        Self {
+            path: path.into(),
+            fields: Vec::new().into(),
+            column_indices: Vec::new().into(),
+            file_major_version,
+            file_minor_version,
+            file_size_bytes: CachedFileSize::default(),
+            base_id: None,
+        }
+    }
+
+    /// Build a data file stamped with the `lance_file::format` `MAJOR_VERSION` /
+    /// `MINOR_VERSION` from an explicit list of field ids, with no column indices
+    /// and no known file size.
+    pub fn new_legacy_from_fields(
+        path: impl Into<String>,
+        fields: Vec<i32>,
+        base_id: Option<u32>,
+    ) -> Self {
+        let (major, minor) = (MAJOR_VERSION as u32, MINOR_VERSION as u32);
+        Self::new(path, fields, Vec::new(), major, minor, None, base_id)
+    }
+
+    /// Build a data file stamped with the `lance_file::format` `MAJOR_VERSION` /
+    /// `MINOR_VERSION`, covering every field id of `schema` in ascending order.
+    pub fn new_legacy(
+        path: impl Into<String>,
+        schema: &Schema,
+        file_size_bytes: Option<NonZero<u64>>,
+        base_id: Option<u32>,
+    ) -> Self {
+        let mut sorted_ids = schema.field_ids();
+        sorted_ids.sort_unstable();
+        let (major, minor) = (MAJOR_VERSION as u32, MINOR_VERSION as u32);
+        Self::new(
+            path,
+            sorted_ids,
+            Vec::new(),
+            major,
+            minor,
+            file_size_bytes,
+            base_id,
+        )
+    }
+
+    /// Project `full_schema` down to the field ids recorded for this file.
+    pub fn schema(&self, full_schema: &Schema) -> Schema {
+        let ids: &[i32] = &self.fields;
+        full_schema.project_by_ids(ids, false)
+    }
+
+    /// Whether the recorded file version is 0.0, 0.1 or 0.2.
+    pub fn is_legacy_file(&self) -> bool {
+        matches!(
+            (self.file_major_version, self.file_minor_version),
+            (0, 0..=2)
+        )
+    }
+
+    /// Check the internal consistency of this data file's metadata.
+    ///
+    /// # Errors
+    ///
+    /// Returns a corrupt-file error (located at `base_path` joined with this
+    /// file's path) when a legacy file's field ids are not strictly increasing,
+    /// or when a non-legacy file has fewer column indices than fields.
+    pub fn validate(&self, base_path: &Path) -> Result<()> {
+        if self.is_legacy_file() {
+            let strictly_increasing = self.fields.windows(2).all(|pair| pair[0] < pair[1]);
+            if !strictly_increasing {
+                return Err(Error::corrupt_file(
+                    base_path.clone().join(self.path.clone()),
+                    "contained unsorted or duplicate field ids",
+                ));
+            }
+            return Ok(());
+        }
+
+        // Every recorded field id needs a column index; surplus columns without
+        // a field id are allowed.
+        if self.column_indices.len() < self.fields.len() {
+            return Err(Error::corrupt_file(
+                base_path.clone().join(self.path.clone()),
+                "contained fewer column_indices than fields",
+            ));
+        }
+        Ok(())
+    }
+}
+
+impl From<&DataFile> for pb::DataFile {
+    /// Copy `df` into its protobuf form; an unknown file size is encoded as 0.
+    fn from(df: &DataFile) -> Self {
+        let file_size_bytes = df
+            .file_size_bytes
+            .get()
+            .map(|size| size.get())
+            .unwrap_or(0);
+        Self {
+            path: df.path.clone(),
+            fields: df.fields.to_vec(),
+            column_indices: df.column_indices.to_vec(),
+            file_major_version: df.file_major_version,
+            file_minor_version: df.file_minor_version,
+            file_size_bytes,
+            base_id: df.base_id,
+        }
+    }
+}
+
+impl TryFrom<pb::DataFile> for DataFile {
+    type Error = Error;
+
+    /// Take ownership of a protobuf data file and rebuild the in-memory form.
+    ///
+    /// The conversion as written always returns `Ok`.
+    fn try_from(proto: pb::DataFile) -> Result<Self> {
+        let pb::DataFile {
+            path,
+            fields,
+            column_indices,
+            file_major_version,
+            file_minor_version,
+            file_size_bytes,
+            base_id,
+        } = proto;
+
+        Ok(Self {
+            path,
+            fields: fields.into(),
+            column_indices: column_indices.into(),
+            file_major_version,
+            file_minor_version,
+            file_size_bytes: CachedFileSize::new(file_size_bytes),
+            base_id,
+        })
+    }
+}
+
+/// Interns repeated data so that fragments with identical content share a
+/// single heap allocation via `Arc`.
+///
+/// At 20M fragments the deduplication typically saves multiple GB of heap
+/// because every fragment in a homogeneous table carries the same field list,
+/// and post-compaction fragments share identical version metadata bytes.
+///
+/// Uses a `Vec`-based linear scan when the cache is small (<=16 entries)
+/// and upgrades to `HashMap` for larger caches. In the common homogeneous
+/// case (1-3 unique values), linear scan avoids per-fragment hashing overhead.
+#[derive(Default)]
+pub struct DataFileFieldInterner {
+    fields: InternCache<i32>, // presumably the cache for `DataFile::fields` lists (confirm against the impl)
+    column_indices: InternCache<i32>, // presumably the cache for `DataFile::column_indices` lists (confirm against the impl)
+    inline_bytes: InternCache<u8>, // presumably the cache for inline version-metadata payloads (confirm against the impl)
+}
+
+/// A cache that uses linear scan for small sizes and HashMap for large.
+/// The threshold is chosen so that scan + compare is cheaper than hash for
+/// typical payload sizes (20-200 bytes).
+enum InternCache<T: Eq + std::hash::Hash + Clone> {
+    /// Entries are kept in insertion order and looked up by comparing slices one
+    /// by one.
+    Small(Vec<Arc<[T]>>),
+    /// Entries are the keys of a hash map; the unit value carries no data, so the
+    /// map acts as a set.
+    Large(HashMap<Arc<[T]>, ()>),
+}
+
+/// Largest number of entries an [`InternCache`] holds in its `Small` (linear scan)
+/// form. Storing one more distinct value converts the cache to its `Large` form.
+const INTERN_CACHE_UPGRADE_THRESHOLD: usize = 16;
+
+impl<T: Eq + std::hash::Hash + Clone> Default for InternCache<T> {
+    /// A new cache is empty and starts in the linear-scan (`Small`) form.
+    fn default() -> Self {
+        let entries = Vec::new();
+        Self::Small(entries)
+    }
+}
+
+impl<T: Eq + std::hash::Hash + Clone> InternCache<T> {
+    /// Returns a shared `Arc<[T]>` whose contents equal `v`.
+    ///
+    /// If an equal slice is already cached, a clone of that `Arc` is returned and
+    /// `v` is dropped. Otherwise `v` is converted into a new `Arc`, stored, and
+    /// returned. A `Small` cache that grows past
+    /// `INTERN_CACHE_UPGRADE_THRESHOLD` entries is converted to the `Large` form
+    /// as part of the same call.
+    fn intern(&mut self, v: Vec<T>) -> Arc<[T]> {
+        match self {
+            Self::Large(map) => {
+                if let Some((cached, _)) = map.get_key_value(v.as_slice()) {
+                    return Arc::clone(cached);
+                }
+                let shared: Arc<[T]> = v.into();
+                map.insert(Arc::clone(&shared), ());
+                shared
+            }
+            Self::Small(entries) => {
+                if let Some(cached) = entries.iter().find(|cached| &cached[..] == v.as_slice()) {
+                    return Arc::clone(cached);
+                }
+                let shared: Arc<[T]> = v.into();
+                entries.push(Arc::clone(&shared));
+                if entries.len() > INTERN_CACHE_UPGRADE_THRESHOLD {
+                    // Too many distinct values for scanning: move every entry into a map.
+                    let mut promoted = HashMap::with_capacity(entries.len());
+                    promoted.extend(entries.drain(..).map(|entry| (entry, ())));
+                    *self = Self::Large(promoted);
+                }
+                shared
+            }
+        }
+    }
+}
+
+impl DataFileFieldInterner {
+    /// Builds last-updated-at version metadata from its protobuf oneof value.
+    ///
+    /// Inline payloads go through `cache`, so equal byte strings end up sharing
+    /// one allocation. The oneof value is taken directly so the owned `Vec<u8>`
+    /// can be handed to the cache without building an intermediate `Arc<[u8]>`
+    /// just for the lookup. External references are copied field by field.
+    fn intern_last_updated_version_meta(
+        cache: &mut InternCache<u8>,
+        sequence: pb::data_fragment::LastUpdatedAtVersionSequence,
+    ) -> Result<RowDatasetVersionMeta> {
+        let meta = match sequence {
+            pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(
+                bytes,
+            ) => RowDatasetVersionMeta::Inline(cache.intern(bytes)),
+            pb::data_fragment::LastUpdatedAtVersionSequence::ExternalLastUpdatedAtVersions(
+                file,
+            ) => {
+                let external = ExternalFile {
+                    path: file.path,
+                    offset: file.offset,
+                    size: file.size,
+                };
+                RowDatasetVersionMeta::External(external)
+            }
+        };
+        Ok(meta)
+    }
+
+    /// Builds created-at version metadata from its protobuf oneof value.
+    ///
+    /// Inline payloads are deduplicated through `cache`; external references are
+    /// copied field by field.
+    fn intern_created_version_meta(
+        cache: &mut InternCache<u8>,
+        sequence: pb::data_fragment::CreatedAtVersionSequence,
+    ) -> Result<RowDatasetVersionMeta> {
+        let meta = match sequence {
+            pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(bytes) => {
+                RowDatasetVersionMeta::Inline(cache.intern(bytes))
+            }
+            pb::data_fragment::CreatedAtVersionSequence::ExternalCreatedAtVersions(file) => {
+                let external = ExternalFile {
+                    path: file.path,
+                    offset: file.offset,
+                    size: file.size,
+                };
+                RowDatasetVersionMeta::External(external)
+            }
+        };
+        Ok(meta)
+    }
+
+    /// Converts a protobuf `DataFile`, sharing its `fields` and `column_indices`
+    /// lists with any equal lists this interner has already seen.
+    pub fn intern_data_file(&mut self, proto: pb::DataFile) -> Result<DataFile> {
+        let pb::DataFile {
+            path,
+            fields,
+            column_indices,
+            file_major_version,
+            file_minor_version,
+            file_size_bytes,
+            base_id,
+        } = proto;
+        Ok(DataFile {
+            path,
+            fields: self.fields.intern(fields),
+            column_indices: self.column_indices.intern(column_indices),
+            file_major_version,
+            file_minor_version,
+            file_size_bytes: CachedFileSize::new(file_size_bytes),
+            base_id,
+        })
+    }
+
+    /// Converts a protobuf `DataFragment`, interning the field lists of its data
+    /// files and any inline version metadata.
+    ///
+    /// A `physical_rows` value of `0` is read as "unknown" (`None`).
+    ///
+    /// # Errors
+    ///
+    /// Fails when the deletion file or the row id sequence cannot be converted.
+    pub fn intern_fragment(&mut self, p: pb::DataFragment) -> Result<Fragment> {
+        let physical_rows = (p.physical_rows > 0).then_some(p.physical_rows as usize);
+
+        let last_updated_at_version_meta = match p.last_updated_at_version_sequence {
+            Some(sequence) => Some(Self::intern_last_updated_version_meta(
+                &mut self.inline_bytes,
+                sequence,
+            )?),
+            None => None,
+        };
+        let created_at_version_meta = match p.created_at_version_sequence {
+            Some(sequence) => Some(Self::intern_created_version_meta(
+                &mut self.inline_bytes,
+                sequence,
+            )?),
+            None => None,
+        };
+
+        let files: Vec<DataFile> = p
+            .files
+            .into_iter()
+            .map(|file| self.intern_data_file(file))
+            .collect::<Result<_>>()?;
+        let deletion_file = p.deletion_file.map(DeletionFile::try_from).transpose()?;
+        let row_id_meta = p.row_id_sequence.map(RowIdMeta::try_from).transpose()?;
+
+        Ok(Fragment {
+            id: p.id,
+            files,
+            deletion_file,
+            row_id_meta,
+            physical_rows,
+            last_updated_at_version_meta,
+            created_at_version_meta,
+        })
+    }
+}
+
+/// Kind of deletion file attached to a fragment.
+///
+/// With serde the variants are written in lowercase (`"array"`, `"bitmap"`).
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
+#[serde(rename_all = "lowercase")]
+pub enum DeletionFileType {
+    /// Corresponds to the protobuf `ArrowArray` deletion file type.
+    Array,
+    /// Corresponds to the protobuf `Bitmap` deletion file type.
+    Bitmap,
+}
+
+impl DeletionFileType {
+    /// Returns the suffix string for this deletion file type: `"arrow"` for
+    /// [`Self::Array`] and `"bin"` for [`Self::Bitmap`].
+    // TODO: pub(crate)
+    pub fn suffix(&self) -> &str {
+        match *self {
+            Self::Bitmap => "bin",
+            Self::Array => "arrow",
+        }
+    }
+}
+
+/// Metadata describing the deletion file of a fragment.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
+pub struct DeletionFile {
+    // NOTE(review): presumably the dataset version the deletions were computed
+    // against; confirm against the code that writes deletion files.
+    pub read_version: u64,
+    // NOTE(review): presumably an identifier distinguishing deletion files written
+    // at the same read version; confirm against the writer.
+    pub id: u64,
+    /// Encoding of the deletion file.
+    pub file_type: DeletionFileType,
+    /// Number of deleted rows in this file. If None, this is unknown.
+    pub num_deleted_rows: Option<usize>,
+    // NOTE(review): looks like an optional base path id, as on `DataFile`; confirm.
+    pub base_id: Option<u32>,
+}
+
+impl TryFrom<pb::DeletionFile> for DeletionFile {
+    type Error = Error;
+
+    /// Converts a protobuf deletion file description.
+    ///
+    /// A `num_deleted_rows` of `0` is read as "unknown" (`None`).
+    ///
+    /// # Errors
+    ///
+    /// Returns a "not supported" error when `file_type` is neither `0` (array) nor
+    /// `1` (bitmap).
+    fn try_from(value: pb::DeletionFile) -> Result<Self> {
+        let pb::DeletionFile {
+            read_version,
+            id,
+            file_type,
+            num_deleted_rows,
+            base_id,
+        } = value;
+
+        let file_type = match file_type {
+            0 => DeletionFileType::Array,
+            1 => DeletionFileType::Bitmap,
+            _ => {
+                return Err(Error::not_supported_source(
+                    "Unknown deletion file type".into(),
+                ));
+            }
+        };
+        let num_deleted_rows = match num_deleted_rows {
+            0 => None,
+            count => Some(count as usize),
+        };
+
+        Ok(Self {
+            read_version,
+            id,
+            file_type,
+            num_deleted_rows,
+            base_id,
+        })
+    }
+}
+
+/// A reference to a part of a file.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
+pub struct ExternalFile {
+    /// Path of the file that holds the referenced data.
+    pub path: String,
+    // NOTE(review): presumably the byte offset of the referenced region within
+    // the file; confirm against the reader.
+    pub offset: u64,
+    // NOTE(review): presumably the length of the referenced region in bytes;
+    // confirm against the reader.
+    pub size: u64,
+}
+
+/// Metadata about the location of the row id sequence.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
+pub enum RowIdMeta {
+    /// The row id sequence bytes are stored directly in the fragment metadata.
+    Inline(Vec<u8>),
+    /// The row id sequence is stored in part of a separate file.
+    External(ExternalFile),
+}
+
+impl TryFrom<pb::data_fragment::RowIdSequence> for RowIdMeta {
+    type Error = Error;
+
+    /// Converts the protobuf row id sequence oneof into a [`RowIdMeta`].
+    ///
+    /// Both variants map one-to-one, so this conversion always returns `Ok`.
+    fn try_from(value: pb::data_fragment::RowIdSequence) -> Result<Self> {
+        let meta = match value {
+            pb::data_fragment::RowIdSequence::ExternalRowIds(file) => {
+                // `file` is owned here, so its path can be moved rather than cloned.
+                let external = ExternalFile {
+                    path: file.path,
+                    offset: file.offset,
+                    size: file.size,
+                };
+                Self::External(external)
+            }
+            pb::data_fragment::RowIdSequence::InlineRowIds(data) => Self::Inline(data),
+        };
+        Ok(meta)
+    }
+}
+
+/// Data fragment.
+///
+/// A fragment is a set of files which represent the different columns of the same rows.
+/// If a column exists in the schema but the related file does not exist, the column is
+/// treated as `nulls`.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, DeepSizeOf)]
+pub struct Fragment {
+    /// Fragment ID
+    pub id: u64,
+
+    /// Files within the fragment.
+    pub files: Vec<DataFile>,
+
+    /// Optional file with deleted local row offsets.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub deletion_file: Option<DeletionFile>,
+
+    /// Location of the row id sequence for this fragment, if one is recorded.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub row_id_meta: Option<RowIdMeta>,
+
+    /// Original number of rows in the fragment. If this is None, then it is
+    /// unknown. This is only optional for legacy reasons. All new tables should
+    /// have this set.
+    pub physical_rows: Option<usize>,
+
+    /// Last updated at version metadata
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub last_updated_at_version_meta: Option<RowDatasetVersionMeta>,
+
+    /// Created at version metadata
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub created_at_version_meta: Option<RowDatasetVersionMeta>,
+}
+
+impl Fragment {
+    /// Creates an empty fragment with the given id: no data files, no deletion
+    /// file, no row id or version metadata, and an unknown physical row count.
+    pub fn new(id: u64) -> Self {
+        Self {
+            id,
+            files: vec![],
+            deletion_file: None,
+            row_id_meta: None,
+            physical_rows: None,
+            last_updated_at_version_meta: None,
+            created_at_version_meta: None,
+        }
+    }
+
+    /// Number of live rows, when it can be derived from this metadata alone.
+    ///
+    /// Returns the physical row count when there is no deletion file, and the
+    /// physical row count minus the recorded number of deleted rows when there is
+    /// one. Returns `None` when the physical row count is unknown, when the
+    /// deletion file does not record how many rows it deletes, or when the
+    /// recorded deletions exceed the physical row count (inconsistent metadata).
+    pub fn num_rows(&self) -> Option<usize> {
+        let physical_rows = self.physical_rows?;
+        match &self.deletion_file {
+            // Known fragment length, no deletion file.
+            None => Some(physical_rows),
+            // Known fragment length; the result is only known if the deletion file
+            // records its row count. `checked_sub` avoids an underflow panic (debug)
+            // or a wrapped, nonsensical count (release) on inconsistent metadata.
+            Some(deletion_file) => physical_rows.checked_sub(deletion_file.num_deleted_rows?),
+        }
+    }
+
+    /// Deserializes a fragment from its JSON representation.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when `json` cannot be deserialized into a `Fragment`.
+    pub fn from_json(json: &str) -> Result<Self> {
+        let fragment: Self = serde_json::from_str(json)?;
+        Ok(fragment)
+    }
+
+    /// Create a `Fragment` with one legacy [`DataFile`] built from `path` and
+    /// `schema`.
+    pub fn with_file_legacy(
+        id: u64,
+        path: &str,
+        schema: &Schema,
+        physical_rows: Option<usize>,
+    ) -> Self {
+        Self {
+            id,
+            files: vec![DataFile::new_legacy(path, schema, None, None)],
+            deletion_file: None,
+            physical_rows,
+            row_id_meta: None,
+            last_updated_at_version_meta: None,
+            created_at_version_meta: None,
+        }
+    }
+
+    /// Builder-style variant of [`Self::add_file`]: appends a data file and
+    /// returns the fragment.
+    pub fn with_file(
+        mut self,
+        path: impl Into<String>,
+        field_ids: Vec<i32>,
+        column_indices: Vec<i32>,
+        version: &LanceFileVersion,
+        file_size_bytes: Option<NonZero<u64>>,
+    ) -> Self {
+        self.add_file(path, field_ids, column_indices, version, file_size_bytes);
+        self
+    }
+
+    /// Records `physical_rows` as the fragment's original row count and returns
+    /// the fragment.
+    pub fn with_physical_rows(mut self, physical_rows: usize) -> Self {
+        self.physical_rows = Some(physical_rows);
+        self
+    }
+
+    /// Appends a data file at `path` with the given field ids and column indices.
+    ///
+    /// The file's major/minor version numbers are taken from `version`; no base id
+    /// is set on the new file.
+    pub fn add_file(
+        &mut self,
+        path: impl Into<String>,
+        field_ids: Vec<i32>,
+        column_indices: Vec<i32>,
+        version: &LanceFileVersion,
+        file_size_bytes: Option<NonZero<u64>>,
+    ) {
+        let (major, minor) = version.to_numbers();
+        self.files.push(DataFile::new(
+            path,
+            field_ids,
+            column_indices,
+            major,
+            minor,
+            file_size_bytes,
+            None,
+        ));
+    }
+
+    /// Add a new legacy [`DataFile`] to this fragment.
+    pub fn add_file_legacy(&mut self, path: &str, schema: &Schema) {
+        self.files
+            .push(DataFile::new_legacy(path, schema, None, None));
+    }
+
+    /// True if this fragment is made up of legacy v1 files, false otherwise.
+    ///
+    /// Only the first file is inspected: if any file in a fragment is legacy then
+    /// all files in the fragment must be. A fragment without files reports `false`
+    /// rather than panicking.
+    pub fn has_legacy_files(&self) -> bool {
+        self.files
+            .first()
+            .is_some_and(|file| file.is_legacy_file())
+    }
+
+    /// Infers the Lance file version shared by a set of fragments.
+    ///
+    /// Returns `Ok(None)` if none of the fragments has a data file.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a file's major/minor pair cannot be converted into a
+    /// [`LanceFileVersion`], or if the data files do not all have the same
+    /// version.
+    pub fn try_infer_version(fragments: &[Self]) -> Result<Option<LanceFileVersion>> {
+        // The first data file found sets the version every other file must match.
+        let Some(sample_file) = fragments.iter().find_map(|frag| frag.files.first()) else {
+            return Ok(None);
+        };
+        let file_version = LanceFileVersion::try_from_major_minor(
+            sample_file.file_major_version,
+            sample_file.file_minor_version,
+        )?;
+        // Ensure all files match
+        for file in fragments.iter().flat_map(|frag| frag.files.iter()) {
+            let this_file_version = LanceFileVersion::try_from_major_minor(
+                file.file_major_version,
+                file.file_minor_version,
+            )?;
+            if file_version != this_file_version {
+                return Err(Error::invalid_input(format!(
+                    "All data files must have the same version.  Detected both {} and {}",
+                    file_version, this_file_version
+                )));
+            }
+        }
+        Ok(Some(file_version))
+    }
+}
+
+impl TryFrom<pb::DataFragment> for Fragment {
+    type Error = Error;
+
+    /// Converts a protobuf `DataFragment` into a [`Fragment`].
+    ///
+    /// A `physical_rows` value of `0` is read as "unknown" (`None`).
+    ///
+    /// # Errors
+    ///
+    /// Fails when any data file, the deletion file, the row id sequence, or
+    /// either version sequence cannot be converted.
+    fn try_from(p: pb::DataFragment) -> Result<Self> {
+        let physical_rows = (p.physical_rows > 0).then_some(p.physical_rows as usize);
+
+        let files: Vec<DataFile> = p
+            .files
+            .into_iter()
+            .map(DataFile::try_from)
+            .collect::<Result<_>>()?;
+        let deletion_file = p.deletion_file.map(DeletionFile::try_from).transpose()?;
+        let row_id_meta = p.row_id_sequence.map(RowIdMeta::try_from).transpose()?;
+        let last_updated_at_version_meta = match p.last_updated_at_version_sequence {
+            Some(sequence) => Some(RowDatasetVersionMeta::try_from(sequence)?),
+            None => None,
+        };
+        let created_at_version_meta = match p.created_at_version_sequence {
+            Some(sequence) => Some(RowDatasetVersionMeta::try_from(sequence)?),
+            None => None,
+        };
+
+        Ok(Self {
+            id: p.id,
+            files,
+            deletion_file,
+            row_id_meta,
+            physical_rows,
+            last_updated_at_version_meta,
+            created_at_version_meta,
+        })
+    }
+}
+
+impl From<&Fragment> for pb::DataFragment {
+    /// Builds the protobuf message for a [`Fragment`].
+    ///
+    /// Unknown counts (`physical_rows`, a deletion file's `num_deleted_rows`) are
+    /// written as `0`.
+    fn from(f: &Fragment) -> Self {
+        let deletion_file = match &f.deletion_file {
+            Some(deletion) => {
+                let file_type = match deletion.file_type {
+                    DeletionFileType::Bitmap => pb::deletion_file::DeletionFileType::Bitmap,
+                    DeletionFileType::Array => pb::deletion_file::DeletionFileType::ArrowArray,
+                };
+                Some(pb::DeletionFile {
+                    read_version: deletion.read_version,
+                    id: deletion.id,
+                    file_type: file_type.into(),
+                    num_deleted_rows: deletion.num_deleted_rows.unwrap_or_default() as u64,
+                    base_id: deletion.base_id,
+                })
+            }
+            None => None,
+        };
+
+        let row_id_sequence = match &f.row_id_meta {
+            Some(RowIdMeta::Inline(data)) => {
+                Some(pb::data_fragment::RowIdSequence::InlineRowIds(data.clone()))
+            }
+            Some(RowIdMeta::External(file)) => Some(
+                pb::data_fragment::RowIdSequence::ExternalRowIds(pb::ExternalFile {
+                    path: file.path.clone(),
+                    offset: file.offset,
+                    size: file.size,
+                }),
+            ),
+            None => None,
+        };
+
+        let last_updated_at_version_sequence =
+            last_updated_at_version_meta_to_pb(&f.last_updated_at_version_meta);
+        let created_at_version_sequence = created_at_version_meta_to_pb(&f.created_at_version_meta);
+        let files = f.files.iter().map(pb::DataFile::from).collect();
+
+        Self {
+            id: f.id,
+            files,
+            deletion_file,
+            row_id_sequence,
+            physical_rows: f.physical_rows.unwrap_or_default() as u64,
+            last_updated_at_version_sequence,
+            created_at_version_sequence,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow_schema::{
+        DataType, Field as ArrowField, Fields as ArrowFields, Schema as ArrowSchema,
+    };
+    use object_store::path::Path;
+    use serde_json::{Value, json};
+
+    /// A legacy fragment built from a nested schema records one data file whose
+    /// field ids cover the struct, both of its children and the boolean column.
+    #[test]
+    fn test_new_fragment() {
+        let path = "foobar.lance";
+
+        let arrow_schema = ArrowSchema::new(vec![
+            ArrowField::new(
+                "s",
+                DataType::Struct(ArrowFields::from(vec![
+                    ArrowField::new("si", DataType::Int32, false),
+                    ArrowField::new("sb", DataType::Binary, true),
+                ])),
+                true,
+            ),
+            ArrowField::new("bool", DataType::Boolean, true),
+        ]);
+        let schema = Schema::try_from(&arrow_schema).unwrap();
+        let fragment = Fragment::with_file_legacy(123, path, &schema, Some(10));
+
+        assert_eq!(123, fragment.id);
+        assert_eq!(
+            fragment.files,
+            vec![DataFile::new_legacy_from_fields(
+                path.to_string(),
+                vec![0, 1, 2, 3],
+                None,
+            )]
+        )
+    }
+
+    /// Fragment -> protobuf -> Fragment is lossless, both with and without a
+    /// deletion file.
+    #[test]
+    fn test_roundtrip_fragment() {
+        let mut fragment = Fragment::new(123);
+        let schema = ArrowSchema::new(vec![ArrowField::new("x", DataType::Float16, true)]);
+        fragment.add_file_legacy("foobar.lance", &Schema::try_from(&schema).unwrap());
+        fragment.deletion_file = Some(DeletionFile {
+            read_version: 123,
+            id: 456,
+            file_type: DeletionFileType::Array,
+            num_deleted_rows: Some(10),
+            base_id: None,
+        });
+
+        let proto = pb::DataFragment::from(&fragment);
+        let fragment2 = Fragment::try_from(proto).unwrap();
+        assert_eq!(fragment, fragment2);
+
+        fragment.deletion_file = None;
+        let proto = pb::DataFragment::from(&fragment);
+        let fragment2 = Fragment::try_from(proto).unwrap();
+        assert_eq!(fragment, fragment2);
+    }
+
+    /// Pins the JSON shape of a fragment (absent optional metadata is omitted,
+    /// unknown `physical_rows` is `null`) and checks that `from_json` reads it back.
+    #[test]
+    fn test_to_json() {
+        let mut fragment = Fragment::new(123);
+        let schema = ArrowSchema::new(vec![ArrowField::new("x", DataType::Float16, true)]);
+        fragment.add_file_legacy("foobar.lance", &Schema::try_from(&schema).unwrap());
+        fragment.deletion_file = Some(DeletionFile {
+            read_version: 123,
+            id: 456,
+            file_type: DeletionFileType::Array,
+            num_deleted_rows: Some(10),
+            base_id: None,
+        });
+
+        let json = serde_json::to_string(&fragment).unwrap();
+
+        let value: Value = serde_json::from_str(&json).unwrap();
+        assert_eq!(
+            value,
+            json!({
+                "id": 123,
+                "files":[
+                    {"path": "foobar.lance", "fields": [0], "column_indices": [], 
+                     "file_major_version": MAJOR_VERSION, "file_minor_version": MINOR_VERSION,
+                     "file_size_bytes": null, "base_id": null }
+                ],
+                "deletion_file": {"read_version": 123, "id": 456, "file_type": "array",
+                                  "num_deleted_rows": 10, "base_id": null},
+                "physical_rows": None::<usize>}),
+        );
+
+        let frag2 = Fragment::from_json(&json).unwrap();
+        assert_eq!(fragment, frag2);
+    }
+
+    /// `DataFile::validate` accepts a file with more column indices than field
+    /// ids.
+    #[test]
+    fn data_file_validate_allows_extra_columns() {
+        let data_file = DataFile {
+            path: "foo.lance".to_string(),
+            fields: Arc::from([1, 2]),
+            // One extra column without a field id mapping
+            column_indices: Arc::from([0, 1, 2]),
+            file_major_version: MAJOR_VERSION as u32,
+            file_minor_version: MINOR_VERSION as u32,
+            file_size_bytes: Default::default(),
+            base_id: None,
+        };
+
+        let base_path = Path::from("base");
+        data_file
+            .validate(&base_path)
+            .expect("validation should allow extra columns without field ids");
+    }
+}
--- a/vendor/lance-table/src/format/index.rs
+++ b/vendor/lance-table/src/format/index.rs
@ -0,0 +1,368 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Metadata for index
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use chrono::{DateTime, Utc};
+use deepsize::DeepSizeOf;
+use futures::StreamExt;
+use lance_io::object_store::ObjectStore;
+use object_store::path::Path;
+use roaring::RoaringBitmap;
+use uuid::Uuid;
+
+use super::pb;
+use lance_core::{Error, Result};
+
+/// Metadata about a single file within an index segment.
+///
+/// Entries of this type make up [`IndexMetadata::files`].
+#[derive(Debug, Clone, PartialEq, DeepSizeOf)]
+pub struct IndexFile {
+    /// Path relative to the index directory (e.g., "index.idx", "auxiliary.idx")
+    pub path: String,
+    /// Size of the file in bytes
+    pub size_bytes: u64,
+}
+
+/// Index metadata
+#[derive(Debug, Clone, PartialEq)]
+pub struct IndexMetadata {
+    /// Unique ID across all dataset versions.
+    pub uuid: Uuid,
+
+    /// Fields to build the index.
+    pub fields: Vec<i32>,
+
+    /// Human-readable index name.
+    pub name: String,
+
+    /// The version of the dataset this index was last updated on.
+    ///
+    /// This is set when the index is created (based on the version used to train the index).
+    /// This is updated when the index is updated or remapped.
+    pub dataset_version: u64,
+
+    /// The fragment ids this index covers.
+    ///
+    /// This may contain fragment ids that no longer exist in the dataset.
+    ///
+    /// If this is None, then this is unknown.
+    pub fragment_bitmap: Option<RoaringBitmap>,
+
+    /// Metadata specific to the index type.
+    ///
+    /// This is an Option because older versions of Lance may not have this defined. However, it
+    /// should always be present in newer versions.
+    pub index_details: Option<Arc<prost_types::Any>>,
+
+    /// The index version.
+    pub index_version: i32,
+
+    /// Timestamp when the index was created.
+    ///
+    /// This field is optional for backward compatibility. For existing indices created before
+    /// this field was added, this will be None.
+    pub created_at: Option<DateTime<Utc>>,
+
+    /// The base path index of the index files. Used when the index is imported or referred from another dataset.
+    /// Lance uses it as the key into the base_paths field in Manifest to determine the actual base path of the index files.
+    pub base_id: Option<u32>,
+
+    /// List of files and their sizes for this index segment.
+    /// This enables skipping HEAD calls when opening indices and provides
+    /// visibility into index storage size via describe_indices().
+    /// This is None if the file sizes are unknown. This happens for indices created
+    /// before this field was added.
+    pub files: Option<Vec<IndexFile>>,
+}
+
+impl IndexMetadata {
+    /// Returns the fragments that are both covered by this index and present in
+    /// `existing_fragments`.
+    /// Returns None if the fragment bitmap of this index is unknown.
+    pub fn effective_fragment_bitmap(
+        &self,
+        existing_fragments: &RoaringBitmap,
+    ) -> Option<RoaringBitmap> {
+        self.fragment_bitmap
+            .as_ref()
+            .map(|covered| covered & existing_fragments)
+    }
+
+    /// Returns a map of relative file paths to their sizes.
+    /// Returns an empty map if file information is not available.
+    pub fn file_size_map(&self) -> HashMap<String, u64> {
+        let Some(files) = self.files.as_ref() else {
+            return HashMap::new();
+        };
+        files
+            .iter()
+            .map(|file| (file.path.clone(), file.size_bytes))
+            .collect()
+    }
+
+    /// Returns the total size of all files in this index segment in bytes.
+    /// Returns None if file information is not available.
+    pub fn total_size_bytes(&self) -> Option<u64> {
+        let files = self.files.as_ref()?;
+        Some(files.iter().map(|file| file.size_bytes).sum())
+    }
+
+    /// Returns the set of fragments which are part of the fragment bitmap
+    /// but no longer in the dataset.
+    /// Returns None if the fragment bitmap of this index is unknown.
+    pub fn deleted_fragment_bitmap(
+        &self,
+        existing_fragments: &RoaringBitmap,
+    ) -> Option<RoaringBitmap> {
+        self.fragment_bitmap
+            .as_ref()
+            .map(|covered| covered - existing_fragments)
+    }
+}
+
+impl DeepSizeOf for IndexMetadata {
+    /// Sums the heap usage of the uuid, field list, name, dataset version and
+    /// file list, and adds the serialized size of the fragment bitmap (0 when
+    /// there is none).
+    ///
+    /// `index_details`, `index_version`, `created_at` and `base_id` are not part
+    /// of the sum.
+    fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize {
+        let uuid_size = self.uuid.as_bytes().deep_size_of_children(context);
+        let fields_size = self.fields.deep_size_of_children(context);
+        let name_size = self.name.deep_size_of_children(context);
+        let version_size = self.dataset_version.deep_size_of_children(context);
+        let bitmap_size = match &self.fragment_bitmap {
+            Some(bitmap) => bitmap.serialized_size(),
+            None => 0,
+        };
+        let files_size = self.files.deep_size_of_children(context);
+
+        uuid_size + fields_size + name_size + version_size + bitmap_size + files_size
+    }
+}
+
+impl TryFrom<pb::IndexMetadata> for IndexMetadata {
+    type Error = Error;
+
+    /// Converts protobuf index metadata into an [`IndexMetadata`].
+    ///
+    /// An empty `fragment_bitmap` byte string and an empty `files` list are both
+    /// read as "unknown" (`None`). A missing `index_version` becomes `0`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when the fragment bitmap cannot be deserialized, when the
+    /// uuid is missing or invalid, or when `created_at` is not a representable
+    /// millisecond timestamp.
+    fn try_from(proto: pb::IndexMetadata) -> Result<Self> {
+        let fragment_bitmap = if proto.fragment_bitmap.is_empty() {
+            None
+        } else {
+            Some(RoaringBitmap::deserialize_from(
+                &mut proto.fragment_bitmap.as_slice(),
+            )?)
+        };
+
+        let files = if proto.files.is_empty() {
+            None
+        } else {
+            Some(
+                proto
+                    .files
+                    .into_iter()
+                    .map(|f| IndexFile {
+                        path: f.path,
+                        size_bytes: f.size_bytes,
+                    })
+                    .collect(),
+            )
+        };
+
+        let uuid = proto.uuid.as_ref().map(Uuid::try_from).ok_or_else(|| {
+            Error::invalid_input("uuid field does not exist in Index metadata".to_string())
+        })??;
+
+        // The timestamp comes from stored metadata, so a bad value must surface as an
+        // error, not a panic. `i64::try_from` also rejects values above `i64::MAX`
+        // instead of letting them wrap to a negative timestamp.
+        let created_at = proto
+            .created_at
+            .map(|millis| {
+                i64::try_from(millis)
+                    .ok()
+                    .and_then(DateTime::from_timestamp_millis)
+                    .ok_or_else(|| {
+                        Error::invalid_input(format!(
+                            "Invalid timestamp in index metadata: {} ms",
+                            millis
+                        ))
+                    })
+            })
+            .transpose()?;
+
+        Ok(Self {
+            uuid,
+            name: proto.name,
+            fields: proto.fields,
+            dataset_version: proto.dataset_version,
+            fragment_bitmap,
+            index_details: proto.index_details.map(Arc::new),
+            index_version: proto.index_version.unwrap_or_default(),
+            created_at,
+            base_id: proto.base_id,
+            files,
+        })
+    }
+}
+
+impl From<&IndexMetadata> for pb::IndexMetadata {
+    /// Builds the protobuf message for an [`IndexMetadata`].
+    ///
+    /// An unknown fragment bitmap is written as an empty byte string and an
+    /// unknown file list as an empty list. `created_at` is written as milliseconds.
+    fn from(idx: &IndexMetadata) -> Self {
+        let mut fragment_bitmap = Vec::new();
+        if let Some(bitmap) = idx.fragment_bitmap.as_ref()
+            && let Err(e) = bitmap.serialize_into(&mut fragment_bitmap)
+        {
+            // Serialization is not expected to fail. If it does, log it and drop any
+            // partial output so the bitmap is written as empty.
+            log::error!("Failed to serialize fragment bitmap: {}", e);
+            fragment_bitmap.clear();
+        }
+
+        let files = idx.files.as_ref().map_or_else(Vec::new, |files| {
+            files
+                .iter()
+                .map(|file| pb::IndexFile {
+                    path: file.path.clone(),
+                    size_bytes: file.size_bytes,
+                })
+                .collect()
+        });
+
+        let index_details = idx
+            .index_details
+            .as_ref()
+            .map(|details| details.as_ref().clone());
+        let created_at = idx
+            .created_at
+            .map(|created| created.timestamp_millis() as u64);
+
+        Self {
+            uuid: Some((&idx.uuid).into()),
+            name: idx.name.clone(),
+            fields: idx.fields.clone(),
+            dataset_version: idx.dataset_version,
+            fragment_bitmap,
+            index_details,
+            index_version: Some(idx.index_version),
+            created_at,
+            base_id: idx.base_id,
+            files,
+        }
+    }
+}
+
+/// Type-erased, shareable cache entry as passed to and returned from the codec
+/// functions below.
+type ArcAny = Arc<dyn std::any::Any + Send + Sync>;
+
+/// Serializes a cached `Vec<IndexMetadata>` into `writer` as an encoded
+/// `pb::IndexSection`.
+///
+/// # Panics
+///
+/// Panics if `any` does not hold a `Vec<IndexMetadata>`; that would be a bug in
+/// the cache layer, not a data error.
+fn serialize_index_metadata(
+    any: &ArcAny,
+    writer: &mut dyn std::io::Write,
+) -> lance_core::Result<()> {
+    use prost::Message;
+    let vec = any
+        .downcast_ref::<Vec<IndexMetadata>>()
+        .expect("index_metadata_codec: wrong type (this is a bug in the cache layer)");
+    let section = pb::IndexSection {
+        indices: vec.iter().map(pb::IndexMetadata::from).collect(),
+    };
+    writer.write_all(&section.encode_to_vec())?;
+    Ok(())
+}
+
+/// Decodes a `pb::IndexSection` from `data` and converts its entries back into a
+/// `Vec<IndexMetadata>`.
+///
+/// Returns an error if the bytes cannot be decoded or any entry fails to convert.
+fn deserialize_index_metadata(data: &bytes::Bytes) -> lance_core::Result<ArcAny> {
+    use prost::Message;
+    let section = pb::IndexSection::decode(data.as_ref())?;
+    let indices: Vec<IndexMetadata> = section
+        .indices
+        .into_iter()
+        .map(IndexMetadata::try_from)
+        .collect::<lance_core::Result<_>>()?;
+    Ok(Arc::new(indices))
+}
+
+/// Returns a [`CacheCodec`](lance_core::cache::CacheCodec) for `Vec<IndexMetadata>`.
+///
+/// Uses `pb::IndexSection` (which wraps `repeated IndexMetadata`) as the wire
+/// format, reusing the existing `TryFrom`/`From` conversions.
+///
+/// Uses [`CacheCodec::new`](lance_core::cache::CacheCodec::new) because the
+/// orphan rule prevents `impl CacheCodecImpl for Vec<IndexMetadata>`.
+pub fn index_metadata_codec() -> lance_core::cache::CacheCodec {
+    lance_core::cache::CacheCodec::new(serialize_index_metadata, deserialize_index_metadata)
+}
+
+/// Lists every file under an index directory together with its size.
+///
+/// Each returned [`IndexFile`] carries the path relative to `index_dir`. If a
+/// listed location does not start with `index_dir`, its file name is used
+/// instead (or an empty string when it has none). This is used to capture file
+/// metadata after index creation/modification.
+///
+/// Returns an error as soon as listing yields one.
+pub async fn list_index_files_with_sizes(
+    object_store: &ObjectStore,
+    index_dir: &Path,
+) -> Result<Vec<IndexFile>> {
+    let dir_prefix: &str = index_dir.as_ref();
+    let mut files = Vec::new();
+    let mut listing = object_store.read_dir_all(index_dir, None);
+    while let Some(entry) = listing.next().await {
+        let meta = entry?;
+        let path = match meta.location.as_ref().strip_prefix(dir_prefix) {
+            // Drop the separator left over after removing the directory prefix.
+            Some(rest) => rest.trim_start_matches('/').to_string(),
+            None => meta.location.filename().unwrap_or("").to_string(),
+        };
+        files.push(IndexFile {
+            path,
+            size_bytes: meta.size,
+        });
+    }
+    Ok(files)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::collections::HashMap;
+
+    /// Demonstrates the pattern a disk-backed cache backend would use:
+    /// serialize entries to bytes, store in a key-value map, then
+    /// deserialize on retrieval.
+    ///
+    /// The two entries cover both the populated and the `None` forms of
+    /// `fragment_bitmap`, `base_id` and `files`.
+    #[test]
+    fn test_index_metadata_codec_roundtrip() {
+        let codec = index_metadata_codec();
+
+        let original = vec![
+            IndexMetadata {
+                uuid: Uuid::new_v4(),
+                name: "my_index".to_string(),
+                fields: vec![0, 1],
+                dataset_version: 42,
+                fragment_bitmap: Some(RoaringBitmap::from_iter([1, 2, 3])),
+                index_details: None,
+                index_version: 1,
+                created_at: None,
+                base_id: None,
+                files: Some(vec![IndexFile {
+                    path: "index.idx".to_string(),
+                    size_bytes: 1024,
+                }]),
+            },
+            IndexMetadata {
+                uuid: Uuid::new_v4(),
+                name: "second_index".to_string(),
+                fields: vec![2],
+                dataset_version: 43,
+                fragment_bitmap: None,
+                index_details: None,
+                index_version: 2,
+                created_at: None,
+                base_id: Some(7),
+                files: None,
+            },
+        ];
+
+        // Simulate a disk-backed store: HashMap<String, Vec<u8>>
+        let mut store: HashMap<String, Vec<u8>> = HashMap::new();
+
+        // Serialize into the store
+        let key = "dataset/v42/Vec<IndexMetadata>".to_string();
+        let mut buf = Vec::new();
+        let entry: Arc<dyn std::any::Any + Send + Sync> = Arc::new(original.clone());
+        codec.serialize(&entry, &mut buf).unwrap();
+        store.insert(key.clone(), buf);
+
+        // Deserialize from the store
+        let bytes = store.get(&key).unwrap();
+        let recovered = codec
+            .deserialize(&bytes::Bytes::copy_from_slice(bytes))
+            .unwrap();
+        let recovered = recovered
+            .downcast::<Vec<IndexMetadata>>()
+            .expect("downcast should succeed");
+
+        // Compared field by field; `index_details` and `created_at` (both `None` in
+        // the fixtures) are not checked here.
+        assert_eq!(original.len(), recovered.len());
+        for (orig, rec) in original.iter().zip(recovered.iter()) {
+            assert_eq!(orig.uuid, rec.uuid);
+            assert_eq!(orig.name, rec.name);
+            assert_eq!(orig.fields, rec.fields);
+            assert_eq!(orig.dataset_version, rec.dataset_version);
+            assert_eq!(orig.fragment_bitmap, rec.fragment_bitmap);
+            assert_eq!(orig.index_version, rec.index_version);
+            assert_eq!(orig.base_id, rec.base_id);
+            assert_eq!(orig.files, rec.files);
+        }
+    }
+}
--- a/vendor/lance-table/src/format/manifest.rs
+++ b/vendor/lance-table/src/format/manifest.rs
--- a/vendor/lance-table/src/format/transaction.rs
+++ b/vendor/lance-table/src/format/transaction.rs
@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Transaction struct for lance-table format layer.
+//!
+//! This struct is introduced to provide a Struct-first API for passing transaction
+//! information within the lance-table crate. It mirrors the protobuf Transaction
+//! message at a semantic level while remaining crate-local, so lance-table does
+//! not depend on higher layers (e.g., lance crate).
+//!
+//! Conversion to protobuf occurs at the write boundary. See the `From<Transaction>`
+//! implementation below.
+
+use crate::format::pb;
+
+/// Crate-local wrapper around the protobuf [`pb::Transaction`] message.
+#[derive(Clone, Debug, PartialEq)]
+pub struct Transaction {
+    /// The wrapped protobuf message that backs this struct.
+    ///
+    /// Storing the protobuf value directly avoids circular ("ring")
+    /// dependencies on higher layers while still enabling struct-first
+    /// parameter passing in lance-table.
+    pub inner: pb::Transaction,
+}
+
+impl Transaction {
+    /// Borrows the wrapped protobuf message, e.g. for tests or internal inspection.
+    pub fn as_pb(&self) -> &pb::Transaction {
+        let Self { inner } = self;
+        inner
+    }
+}
+
+/// Write-boundary conversion: hands back the wrapped protobuf message so it
+/// can be serialized at the last step.
+impl From<Transaction> for pb::Transaction {
+    fn from(tx: Transaction) -> Self {
+        let Transaction { inner } = tx;
+        inner
+    }
+}
+
+/// Wraps a protobuf message in the crate-local [`Transaction`] struct.
+impl From<pb::Transaction> for Transaction {
+    fn from(pb_tx: pb::Transaction) -> Self {
+        let inner = pb_tx;
+        Self { inner }
+    }
+}
--- a/vendor/lance-table/src/io.rs
+++ b/vendor/lance-table/src/io.rs
@ -0,0 +1,6 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+pub mod commit;
+pub mod deletion;
+pub mod manifest;
--- a/vendor/lance-table/src/io/commit.rs
+++ b/vendor/lance-table/src/io/commit.rs
--- a/vendor/lance-table/src/io/commit/dynamodb.rs
+++ b/vendor/lance-table/src/io/commit/dynamodb.rs
@ -0,0 +1,495 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! DynamoDB based external manifest store
+//!
+
+use std::collections::HashSet;
+use std::sync::{Arc, LazyLock};
+
+use async_trait::async_trait;
+use aws_sdk_dynamodb::Client;
+use aws_sdk_dynamodb::error::SdkError;
+use aws_sdk_dynamodb::operation::RequestId;
+use aws_sdk_dynamodb::operation::delete_item::builders::DeleteItemFluentBuilder;
+use aws_sdk_dynamodb::operation::{
+    get_item::builders::GetItemFluentBuilder, put_item::builders::PutItemFluentBuilder,
+    query::builders::QueryFluentBuilder,
+};
+use aws_sdk_dynamodb::types::{AttributeValue, KeyType};
+use object_store::path::Path;
+use snafu::OptionExt;
+use tokio::sync::RwLock;
+use tracing::warn;
+
+use crate::io::commit::external_manifest::ExternalManifestStore;
+use lance_core::error::NotFoundSnafu;
+use lance_core::error::box_error;
+use lance_core::{Error, Result};
+
+use super::ManifestLocation;
+use super::external_manifest::detect_naming_scheme_from_path;
+
+/// Newtype over [`SdkError`] so that it can be converted into this crate's
+/// [`Error`] and given a custom `Display` rendering.
+#[derive(Debug)]
+struct WrappedSdkError<E>(SdkError<E>);
+
+impl<E> From<WrappedSdkError<E>> for Error
+where
+    E: std::error::Error + Send + Sync + 'static,
+{
+    fn from(e: WrappedSdkError<E>) -> Self {
+        Self::io_source(box_error(e))
+    }
+}
+
+/// Renders the request id (or `unknown`) followed by the raw response when
+/// the SDK error carries one.
+impl<E> std::fmt::Display for WrappedSdkError<E>
+where
+    E: std::error::Error + Send + Sync + 'static,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let request_id = self.0.request_id().unwrap_or("unknown");
+        write!(f, "WrappedSdkError: request_id: {}", request_id)?;
+        match self.0.raw_response() {
+            Some(raw) => write!(f, ", service_error: {:?}", raw),
+            None => write!(f, ", no service error"),
+        }
+    }
+}
+
+impl<E> std::error::Error for WrappedSdkError<E>
+where
+    E: std::error::Error + Send + Sync + 'static,
+{
+    /// Exposes the wrapped [`SdkError`] as the underlying cause.
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        let inner: &(dyn std::error::Error + 'static) = &self.0;
+        Some(inner)
+    }
+}
+
+/// Extension trait for turning a fallible SDK call result into this crate's
+/// [`Result`].
+trait SdkResultExt<T> {
+    /// Converts the error side into the crate error type; `Ok` values are
+    /// returned as-is.
+    fn wrap_err(self) -> Result<T>;
+}
+
+impl<T, E> SdkResultExt<T> for std::result::Result<T, SdkError<E>>
+where
+    E: std::error::Error + Send + Sync + 'static,
+{
+    /// Logs the SDK error together with its request id at `warn` level, then
+    /// wraps it into a crate [`Error`]. Successful values pass through unchanged.
+    fn wrap_err(self) -> Result<T> {
+        match self {
+            Ok(value) => Ok(value),
+            Err(err) => {
+                warn!(
+                    target: "lance::dynamodb",
+                    request_id = err.request_id().unwrap_or("unknown"),
+                    "DynamoDB SDK error: {err:?}",
+                );
+                Err(Error::from(WrappedSdkError(err)))
+            }
+        }
+    }
+}
+
+/// An external manifest store backed by DynamoDB.
+///
+/// When calling `DynamoDBExternalManifestStore::new_external_store()`
+/// the key schema, (PK, SK), is checked. If the table does not exist,
+/// or the key schema is not as expected, an error is returned.
+///
+/// The table schema is expected as follows:
+///
+/// - PK: `base_uri` -- string
+/// - SK: `version` -- number
+/// - `path` -- string
+/// - `committer` -- string
+///
+/// Writes additionally store a `size` (number) attribute and, when one is
+/// supplied, an `e_tag` (string) attribute.
+///
+/// Consistency: This store is expected to have read-after-write consistency;
+/// `consistent_read` should always be set to true.
+///
+/// Transaction Safety: This store uses DynamoDB conditional writes to ensure
+/// only one writer can win per version.
+#[derive(Debug)]
+pub struct DynamoDBExternalManifestStore {
+    /// Shared DynamoDB client used for every request.
+    client: Arc<Client>,
+    /// Name of the DynamoDB table holding the manifest records.
+    table_name: String,
+    /// Value written to the `committer` attribute of every record put by this store.
+    committer_name: String,
+}
+
+// Attribute names of the DynamoDB table. Each macro expands to a string
+// literal; they are macros (rather than plain values) so that they can be
+// used in a `match` pattern, as done in the key-schema check of
+// `new_external_store`.
+
+// Partition (HASH) key attribute.
+macro_rules! base_uri {
+    () => {
+        "base_uri"
+    };
+}
+// Sort (RANGE) key attribute.
+macro_rules! version {
+    () => {
+        "version"
+    };
+}
+// Attribute holding the manifest path.
+macro_rules! path {
+    () => {
+        "path"
+    };
+}
+// Attribute holding the name of the committer that wrote the record.
+macro_rules! committer {
+    () => {
+        "committer"
+    };
+}
+
+impl DynamoDBExternalManifestStore {
+    /// Builds a store for `table_name` after validating the table's key schema.
+    ///
+    /// Table names that already passed validation are remembered in a static
+    /// cache, so later calls for the same table skip the `DescribeTable`
+    /// request.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the describe request fails, if the table or its
+    /// key schema is missing, or if the keys are not HASH(`base_uri`) and
+    /// RANGE(`version`).
+    pub async fn new_external_store(
+        client: Arc<Client>,
+        table_name: &str,
+        committer_name: &str,
+    ) -> Result<Arc<dyn ExternalManifestStore>> {
+        // Names of tables whose key schema has already been validated.
+        static SANITY_CHECK_CACHE: LazyLock<RwLock<HashSet<String>>> =
+            LazyLock::new(|| RwLock::new(HashSet::new()));
+
+        let store = Arc::new(Self {
+            client: client.clone(),
+            table_name: table_name.to_string(),
+            committer_name: committer_name.to_string(),
+        });
+
+        // already checked this table before, skip
+        // this is to avoid checking the table schema every time
+        // because it's expensive to call DescribeTable
+        if SANITY_CHECK_CACHE.read().await.contains(table_name) {
+            return Ok(store);
+        }
+
+        // Check if the table schema is correct
+        let describe_result = client
+            .describe_table()
+            .table_name(table_name)
+            .send()
+            .await
+            .wrap_err()?;
+        let table = describe_result
+            .table
+            .ok_or_else(|| Error::io(format!("dynamodb table: {table_name} does not exist")))?;
+        let mut schema = table.key_schema.ok_or_else(|| {
+            Error::io(format!(
+                "dynamodb table: {table_name} does not have a key schema"
+            ))
+        })?;
+
+        let mut has_hash_key = false;
+        let mut has_range_key = false;
+
+        // there should be two keys, HASH(base_uri) and RANGE(version)
+        // The last two schema entries are popped; running out of entries or
+        // meeting any other (type, name) combination is an error.
+        for _ in 0..2 {
+            let key = schema.pop().ok_or_else(|| {
+                Error::io(format!(
+                    "dynamodb table: {table_name} must have HASH and RANGE keys"
+                ))
+            })?;
+            match (key.key_type, key.attribute_name.as_str()) {
+                (KeyType::Hash, base_uri!()) => {
+                    has_hash_key = true;
+                }
+                (KeyType::Range, version!()) => {
+                    has_range_key = true;
+                }
+                _ => {
+                    return Err(Error::io(format!(
+                        "dynamodb table: {} unknown key type encountered name:{}",
+                        table_name, key.attribute_name
+                    )));
+                }
+            }
+        }
+
+        // Both keys must be present
+        if !(has_hash_key && has_range_key) {
+            return Err(Error::io(format!(
+                "dynamodb table: {} must have HASH and RANGE keys, named `{}` and `{}` respectively",
+                table_name,
+                base_uri!(),
+                version!()
+            )));
+        }
+
+        // Only tables that passed validation are cached.
+        SANITY_CHECK_CACHE
+            .write()
+            .await
+            .insert(table_name.to_string());
+
+        Ok(store)
+    }
+
+    /// Starts a `PutItem` request bound to this store's table.
+    fn ddb_put(&self) -> PutItemFluentBuilder {
+        let request = self.client.put_item();
+        request.table_name(&self.table_name)
+    }
+
+    /// Starts a `GetItem` request bound to this store's table, with
+    /// consistent reads enabled.
+    fn ddb_get(&self) -> GetItemFluentBuilder {
+        let request = self.client.get_item().table_name(&self.table_name);
+        request.consistent_read(true)
+    }
+
+    /// Starts a `Query` request bound to this store's table, with
+    /// consistent reads enabled.
+    fn ddb_query(&self) -> QueryFluentBuilder {
+        let request = self.client.query().table_name(&self.table_name);
+        request.consistent_read(true)
+    }
+
+    /// Starts a `DeleteItem` request bound to this store's table.
+    fn ddb_delete(&self) -> DeleteItemFluentBuilder {
+        let request = self.client.delete_item();
+        request.table_name(&self.table_name)
+    }
+}
+
+#[async_trait]
+impl ExternalManifestStore for DynamoDBExternalManifestStore {
+    /// Looks up the record for (`base_uri`, `version`) and returns its
+    /// `path` attribute.
+    ///
+    /// Fails with a not-found error when the record or its `path` attribute
+    /// is missing, and with an invalid-input error when `path` is not a string.
+    async fn get(&self, base_uri: &str, version: u64) -> Result<String> {
+        let response = self
+            .ddb_get()
+            .key(base_uri!(), AttributeValue::S(base_uri.into()))
+            .key(version!(), AttributeValue::N(version.to_string()))
+            .send()
+            .await
+            .wrap_err()?;
+
+        let not_found = NotFoundSnafu {
+            uri: format!(
+                "dynamodb not found: base_uri: {}; version: {}",
+                base_uri, version
+            ),
+        };
+        let item = response.item.context(not_found)?;
+
+        let Some(attribute) = item.get(path!()) else {
+            return Err(Error::not_found(format!("key {} is not present", path!())));
+        };
+
+        if let AttributeValue::S(manifest_path) = attribute {
+            Ok(manifest_path.clone())
+        } else {
+            Err(Error::invalid_input(format!(
+                "key {} is not a string",
+                path!()
+            )))
+        }
+    }
+
+    /// Looks up the record for (`base_uri`, `version`) and builds a
+    /// [`ManifestLocation`] from its `path`, `size` and `e_tag` attributes.
+    ///
+    /// `size` and `e_tag` are optional: a missing or unparsable value yields
+    /// `None` rather than an error.
+    async fn get_manifest_location(
+        &self,
+        base_uri: &str,
+        version: u64,
+    ) -> Result<ManifestLocation> {
+        let response = self
+            .ddb_get()
+            .key(base_uri!(), AttributeValue::S(base_uri.into()))
+            .key(version!(), AttributeValue::N(version.to_string()))
+            .send()
+            .await
+            .wrap_err()?;
+
+        let not_found = NotFoundSnafu {
+            uri: format!(
+                "dynamodb not found: base_uri: {}; version: {}",
+                base_uri, version
+            ),
+        };
+        let item = response.item.context(not_found)?;
+
+        let path = match item.get(path!()) {
+            None => {
+                return Err(Error::not_found(format!(
+                    "key {} is not present",
+                    path!()
+                )));
+            }
+            Some(AttributeValue::S(raw)) => Path::from(raw.as_str()),
+            Some(_) => {
+                return Err(Error::invalid_input(format!(
+                    "key {} is not a string",
+                    path!()
+                )));
+            }
+        };
+
+        let size = match item.get("size") {
+            Some(AttributeValue::N(raw)) => raw.parse().ok(),
+            _ => None,
+        };
+
+        let e_tag = match item.get("e_tag") {
+            Some(AttributeValue::S(tag)) => Some(tag.clone()),
+            _ => None,
+        };
+
+        let naming_scheme = detect_naming_scheme_from_path(&path)?;
+
+        Ok(ManifestLocation {
+            version,
+            path,
+            size,
+            naming_scheme,
+            e_tag,
+        })
+    }
+
+    /// Returns the latest version recorded for `base_uri` together with its
+    /// manifest path, or `None` when no record exists.
+    async fn get_latest_version(&self, base_uri: &str) -> Result<Option<(u64, String)>> {
+        let latest = self.get_latest_manifest_location(base_uri).await?;
+        Ok(latest.map(|loc| (loc.version, loc.path.to_string())))
+    }
+
+    /// Queries the record with the highest `version` for `base_uri`
+    /// (descending scan, limit 1) and converts it into a [`ManifestLocation`].
+    ///
+    /// Returns `Ok(None)` when the query yields no items.
+    async fn get_latest_manifest_location(
+        &self,
+        base_uri: &str,
+    ) -> Result<Option<ManifestLocation>> {
+        let query_result = self
+            .ddb_query()
+            .key_condition_expression(format!("{} = :{}", base_uri!(), base_uri!()))
+            .expression_attribute_values(
+                format!(":{}", base_uri!()),
+                AttributeValue::S(base_uri.into()),
+            )
+            .scan_index_forward(false)
+            .limit(1)
+            .send()
+            .await
+            .wrap_err()?;
+
+        let Some(mut items) = query_result.items else {
+            return Ok(None);
+        };
+        if items.is_empty() {
+            return Ok(None);
+        }
+        if items.len() > 1 {
+            return Err(Error::invalid_input(format!(
+                "dynamodb table: {} returned unexpected number of items",
+                self.table_name
+            )));
+        }
+
+        let item = items.pop().expect("length checked");
+
+        let Some(version_attribute) = item.get(version!()) else {
+            return Err(Error::not_found(
+                format!("dynamodb error: found entries for {} but the returned data does not contain {} column", base_uri, version!())
+            ));
+        };
+        let Some(path_attribute) = item.get(path!()) else {
+            return Err(Error::not_found(
+                format!("dynamodb error: found entries for {} but the returned data does not contain {} column", base_uri, path!())
+            ));
+        };
+
+        // Optional attributes: anything missing or malformed becomes `None`.
+        let size = match item.get("size") {
+            Some(AttributeValue::N(size)) => size.parse().ok(),
+            _ => None,
+        };
+        let e_tag = match item.get("e_tag") {
+            Some(AttributeValue::S(tag)) => Some(tag.clone()),
+            _ => None,
+        };
+
+        let (AttributeValue::N(version), AttributeValue::S(path)) =
+            (version_attribute, path_attribute)
+        else {
+            return Err(Error::invalid_input(format!(
+                "dynamodb error: found entries for {base_uri} but the returned data is not number type"
+            )));
+        };
+
+        let version = version.parse().map_err(|e| Error::invalid_input(format!("dynamodb error: could not parse the version number returned {}, error: {}", version, e)))?;
+        let path = Path::from(path.as_str());
+        let naming_scheme = detect_naming_scheme_from_path(&path)?;
+
+        Ok(Some(ManifestLocation {
+            version,
+            path,
+            size,
+            naming_scheme,
+            e_tag,
+        }))
+    }
+
+    /// Writes the record for (`base_uri`, `version`) only if no such record
+    /// exists yet; the conditional write fails otherwise.
+    async fn put_if_not_exists(
+        &self,
+        base_uri: &str,
+        version: u64,
+        path: &str,
+        size: u64,
+        e_tag: Option<String>,
+    ) -> Result<()> {
+        let condition = format!(
+            "attribute_not_exists({}) AND attribute_not_exists({})",
+            base_uri!(),
+            version!(),
+        );
+
+        let request = self
+            .ddb_put()
+            .item(base_uri!(), AttributeValue::S(base_uri.into()))
+            .item(version!(), AttributeValue::N(version.to_string()))
+            .item(path!(), AttributeValue::S(path.to_string()))
+            .item(committer!(), AttributeValue::S(self.committer_name.clone()))
+            .item("size", AttributeValue::N(size.to_string()));
+
+        // The e_tag attribute is only written when the caller supplies one.
+        let request = match e_tag {
+            Some(tag) => request.item("e_tag", AttributeValue::S(tag)),
+            None => request,
+        };
+
+        request
+            .condition_expression(condition)
+            .send()
+            .await
+            .wrap_err()
+            .map(|_| ())
+    }
+
+    /// Overwrites the record for (`base_uri`, `version`) only if such a record
+    /// **already exists**; the conditional write fails otherwise.
+    async fn put_if_exists(
+        &self,
+        base_uri: &str,
+        version: u64,
+        path: &str,
+        size: u64,
+        e_tag: Option<String>,
+    ) -> Result<()> {
+        let condition = format!(
+            "attribute_exists({}) AND attribute_exists({})",
+            base_uri!(),
+            version!(),
+        );
+
+        let request = self
+            .ddb_put()
+            .item(base_uri!(), AttributeValue::S(base_uri.into()))
+            .item(version!(), AttributeValue::N(version.to_string()))
+            .item(path!(), AttributeValue::S(path.to_string()))
+            .item(committer!(), AttributeValue::S(self.committer_name.clone()))
+            .item("size", AttributeValue::N(size.to_string()));
+
+        // The e_tag attribute is only written when the caller supplies one.
+        let request = match e_tag {
+            Some(tag) => request.item("e_tag", AttributeValue::S(tag)),
+            None => request,
+        };
+
+        request
+            .condition_expression(condition)
+            .send()
+            .await
+            .wrap_err()
+            .map(|_| ())
+    }
+
+    /// Delete the manifest information for the given base_uri in dynamodb
+    ///
+    /// A single DynamoDB `Query` call returns at most one page of results, so
+    /// the query is repeated with the returned `LastEvaluatedKey` until every
+    /// version record for `base_uri` has been visited and deleted.
+    ///
+    /// # Errors
+    ///
+    /// Returns the first error raised by a query or delete request; records
+    /// deleted before the failure stay deleted.
+    async fn delete(&self, base_uri: &str) -> Result<()> {
+        // Pagination cursor: `None` requests the first page.
+        let mut exclusive_start_key = None;
+
+        loop {
+            let query_result = self
+                .ddb_query()
+                .key_condition_expression(format!("{} = :{}", base_uri!(), base_uri!()))
+                .expression_attribute_values(
+                    format!(":{}", base_uri!()),
+                    AttributeValue::S(base_uri.into()),
+                )
+                .set_exclusive_start_key(exclusive_start_key.take())
+                .send()
+                .await
+                .wrap_err()?;
+
+            for item in query_result.items.unwrap_or_default() {
+                // Records without a numeric version cannot be addressed by key; skip them.
+                if let Some(AttributeValue::N(version)) = item.get(version!()) {
+                    self.ddb_delete()
+                        .key(base_uri!(), AttributeValue::S(base_uri.to_string()))
+                        .key(version!(), AttributeValue::N(version.clone()))
+                        .send()
+                        .await
+                        .wrap_err()?;
+                }
+            }
+
+            // A missing (or empty) key means the last page has been consumed.
+            match query_result.last_evaluated_key {
+                Some(next_key) if !next_key.is_empty() => exclusive_start_key = Some(next_key),
+                _ => break,
+            }
+        }
+
+        Ok(())
+    }
+}
--- a/vendor/lance-table/src/io/commit/external_manifest.rs
+++ b/vendor/lance-table/src/io/commit/external_manifest.rs
@ -0,0 +1,515 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Trait for external manifest handler.
+//!
+//! This trait abstracts an external storage with put_if_not_exists semantics.
+
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use lance_core::utils::tracing::{
+    AUDIT_MODE_CREATE, AUDIT_MODE_DELETE, AUDIT_TYPE_MANIFEST, TRACE_FILE_AUDIT,
+};
+use lance_core::{Error, Result};
+use lance_io::object_store::ObjectStore;
+use log::warn;
+use object_store::ObjectMeta;
+use object_store::ObjectStoreExt;
+use object_store::{Error as ObjectStoreError, ObjectStore as OSObjectStore, path::Path};
+use tracing::info;
+
+use super::{
+    MANIFEST_EXTENSION, ManifestLocation, ManifestNamingScheme, current_manifest_path,
+    default_resolve_version, make_staging_manifest_path, write_version_hint,
+};
+use crate::format::{IndexMetadata, Manifest, Transaction};
+use crate::io::commit::{CommitError, CommitHandler};
+
+/// External manifest store
+///
+/// This trait abstracts an external storage for source of truth for manifests.
+/// The storage is expected to remember (uri, version) -> manifest_path
+/// and able to run transactions on the manifest_path.
+///
+/// This trait is called an **External** manifest store because the store is
+/// expected to work in tandem with the object store. We are only leveraging
+/// the external store for concurrent commit. Any manifest committed thru this
+/// trait should ultimately be materialized in the object store.
+/// For a visual explanation of the commit loop see
+/// <https://github.com/lance-format/lance/assets/12615154/b0822312-0826-432a-b554-3965f8d48d04>
+#[async_trait]
+pub trait ExternalManifestStore: std::fmt::Debug + Send + Sync {
+    /// Get the manifest path for a given base_uri and version
+    async fn get(&self, base_uri: &str, version: u64) -> Result<String>;
+
+    /// Get the manifest location for a given base_uri and version.
+    ///
+    /// The default implementation calls `get`, parses the returned string as
+    /// an object-store path and derives the naming scheme from it. `size` and
+    /// `e_tag` are left as `None`.
+    async fn get_manifest_location(
+        &self,
+        base_uri: &str,
+        version: u64,
+    ) -> Result<ManifestLocation> {
+        let raw_path = self.get(base_uri, version).await?;
+        let path = match Path::parse(&raw_path) {
+            Ok(parsed) => parsed,
+            Err(e) => return Err(Error::invalid_input(e.to_string())),
+        };
+        let naming_scheme = detect_naming_scheme_from_path(&path)?;
+        let location = ManifestLocation {
+            version,
+            path,
+            size: None,
+            naming_scheme,
+            e_tag: None,
+        };
+        Ok(location)
+    }
+
+    /// Get the latest version of a dataset at the base_uri, and the path to the manifest.
+    /// The path is provided as an optimization. The path is deterministic based on
+    /// the version and the store should not customize it.
+    async fn get_latest_version(&self, base_uri: &str) -> Result<Option<(u64, String)>>;
+
+    /// Get the latest manifest location for a given base_uri.
+    ///
+    /// The default implementation delegates to `get_latest_version` and leaves
+    /// `size` and `e_tag` unset. Implementations that store both the location
+    /// and the size of the latest manifest should override this method.
+    async fn get_latest_manifest_location(
+        &self,
+        base_uri: &str,
+    ) -> Result<Option<ManifestLocation>> {
+        let Some((version, uri)) = self.get_latest_version(base_uri).await? else {
+            return Ok(None);
+        };
+
+        let path = Path::parse(&uri).map_err(|e| Error::invalid_input(e.to_string()))?;
+        let naming_scheme = detect_naming_scheme_from_path(&path)?;
+
+        Ok(Some(ManifestLocation {
+            version,
+            path,
+            size: None,
+            naming_scheme,
+            e_tag: None,
+        }))
+    }
+
+    /// Put the manifest to the external store.
+    ///
+    /// The staging manifest has been written to `staging_path` on the object store.
+    /// This method should atomically claim the version and return the final manifest location.
+    ///
+    /// The default implementation uses put_if_not_exists and put_if_exists to
+    /// implement a staging-based workflow. Implementations that can write directly
+    /// (e.g., namespace-backed stores) should override this method.
+    ///
+    /// # Errors
+    ///
+    /// Any failure of the conditional writes or of the object-store copy, head
+    /// or delete calls is returned, except that a `NotFound` from the copy or
+    /// from the staging delete is tolerated.
+    #[allow(clippy::too_many_arguments)]
+    async fn put(
+        &self,
+        base_path: &Path,
+        version: u64,
+        staging_path: &Path,
+        size: u64,
+        e_tag: Option<String>,
+        object_store: &dyn OSObjectStore,
+        naming_scheme: ManifestNamingScheme,
+    ) -> Result<ManifestLocation> {
+        // Default implementation: staging-based workflow
+
+        // Step 1: Record staging path atomically
+        self.put_if_not_exists(
+            base_path.as_ref(),
+            version,
+            staging_path.as_ref(),
+            size,
+            e_tag.clone(),
+        )
+        .await?;
+
+        // Step 2: Copy staging to final path
+        let final_path = naming_scheme.manifest_path(base_path, version);
+        let copied = match object_store.copy(staging_path, &final_path).await {
+            Ok(_) => true,
+            // Staging manifest is already gone — presumably finalized by
+            // someone else (NOTE(review): confirm against the commit protocol).
+            Err(ObjectStoreError::NotFound { .. }) => false,
+            Err(e) => return Err(e.into()),
+        };
+        if copied {
+            info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_CREATE, r#type=AUDIT_TYPE_MANIFEST, path = final_path.as_ref());
+        }
+
+        // Get final e_tag (may change after copy for large files)
+        // The head call is skipped only when we copied a file smaller than
+        // 5 MiB; otherwise it also fails if the final manifest does not exist.
+        let e_tag = if copied && size < 5 * 1024 * 1024 {
+            e_tag
+        } else {
+            let meta = object_store.head(&final_path).await?;
+            meta.e_tag
+        };
+
+        let location = ManifestLocation {
+            version,
+            path: final_path.clone(),
+            size: Some(size),
+            naming_scheme,
+            e_tag: e_tag.clone(),
+        };
+
+        // Nothing was copied by this call, so steps 3 and 4 are skipped.
+        if !copied {
+            return Ok(location);
+        }
+
+        // Step 3: Update external store to final path
+        self.put_if_exists(
+            base_path.as_ref(),
+            version,
+            final_path.as_ref(),
+            size,
+            e_tag,
+        )
+        .await?;
+
+        // Step 4: Delete staging manifest
+        match object_store.delete(staging_path).await {
+            Ok(_) => {}
+            Err(ObjectStoreError::NotFound { .. }) => {}
+            Err(e) => return Err(e.into()),
+        }
+        info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = staging_path.as_ref());
+
+        Ok(location)
+    }
+
+    /// Put the manifest path for a given base_uri and version, should fail if the version already exists
+    async fn put_if_not_exists(
+        &self,
+        base_uri: &str,
+        version: u64,
+        path: &str,
+        size: u64,
+        e_tag: Option<String>,
+    ) -> Result<()>;
+
+    /// Put the manifest path for a given base_uri and version, should fail if the version **does not** already exist
+    async fn put_if_exists(
+        &self,
+        base_uri: &str,
+        version: u64,
+        path: &str,
+        size: u64,
+        e_tag: Option<String>,
+    ) -> Result<()>;
+
+    /// Delete the manifest information for given base_uri from the store
+    ///
+    /// The default implementation does nothing and returns `Ok(())`.
+    async fn delete(&self, _base_uri: &str) -> Result<()> {
+        Ok(())
+    }
+}
+
+/// Derives the manifest naming scheme from the file name of `path`.
+///
+/// The regular detection is tried first; if it does not recognise the name,
+/// the staging-file detection is used instead. A path without a file name is
+/// reported as a corrupt file.
+pub(crate) fn detect_naming_scheme_from_path(path: &Path) -> Result<ManifestNamingScheme> {
+    let Some(name) = path.filename() else {
+        return Err(Error::corrupt_file(
+            path.clone(),
+            "Path does not follow known manifest naming convention.",
+        ));
+    };
+
+    let scheme = ManifestNamingScheme::detect_scheme(name)
+        .unwrap_or_else(|| ManifestNamingScheme::detect_scheme_staging(name));
+    Ok(scheme)
+}
+
+/// External manifest commit handler.
+///
+/// This handler is used to commit a manifest to an external store.
+/// For the detailed design, see <https://github.com/lance-format/lance/issues/1183>
+#[derive(Debug)]
+pub struct ExternalManifestCommitHandler {
+    /// The external store consulted for manifest locations and updated on commit.
+    pub external_manifest_store: Arc<dyn ExternalManifestStore>,
+}
+
+impl ExternalManifestCommitHandler {
+    /// The manifest is considered committed once the staging manifest is written
+    /// to object store and that path is committed to the external store.
+    ///
+    /// However, to fully complete this, the staging manifest should be materialized
+    /// into the final path, the final path should be committed to the external store
+    /// and the staging manifest should be deleted. These steps may be completed
+    /// by any number of readers or writers, so care should be taken to ensure
+    /// that the manifest is not lost nor any errors occur due to duplicate
+    /// operations.
+    ///
+    /// Returns the location of the final manifest. When the staging manifest
+    /// is no longer present, the external store is left untouched and the
+    /// location is returned after checking that the final manifest exists.
+    #[allow(clippy::too_many_arguments)]
+    async fn finalize_manifest(
+        &self,
+        base_path: &Path,
+        staging_manifest_path: &Path,
+        version: u64,
+        size: u64,
+        e_tag: Option<String>,
+        store: &dyn OSObjectStore,
+        naming_scheme: ManifestNamingScheme,
+    ) -> std::result::Result<ManifestLocation, Error> {
+        // step 1: copy the manifest to the final location
+        let final_manifest_path = naming_scheme.manifest_path(base_path, version);
+
+        let copied = match store
+            .copy(staging_manifest_path, &final_manifest_path)
+            .await
+        {
+            Ok(_) => true,
+            Err(ObjectStoreError::NotFound { .. }) => false, // Another writer beat us to it.
+            Err(e) => return Err(e.into()),
+        };
+        if copied {
+            info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_CREATE, r#type=AUDIT_TYPE_MANIFEST, path = final_manifest_path.as_ref());
+        }
+
+        // On S3, the etag can change if originally was MultipartUpload and later was Copy
+        // https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html#AmazonS3-Type-Object-ETag
+        // We only do MultipartUpload for > 5MB files, so we can skip this check
+        // if size < 5MB. However, we need to double check the final_manifest_path
+        // exists before we change the external store, otherwise we may point to a
+        // non-existing manifest.
+        let e_tag = if copied && size < 5 * 1024 * 1024 {
+            e_tag
+        } else {
+            let meta = store.head(&final_manifest_path).await?;
+            meta.e_tag
+        };
+
+        let location = ManifestLocation {
+            version,
+            path: final_manifest_path,
+            size: Some(size),
+            naming_scheme,
+            e_tag,
+        };
+
+        // The copy found no staging manifest, so steps 2 and 3 are skipped.
+        if !copied {
+            return Ok(location);
+        }
+
+        // step 2: flip the external store to point to the final location
+        self.external_manifest_store
+            .put_if_exists(
+                base_path.as_ref(),
+                version,
+                location.path.as_ref(),
+                size,
+                location.e_tag.clone(),
+            )
+            .await?;
+
+        // step 3: delete the staging manifest
+        match store.delete(staging_manifest_path).await {
+            Ok(_) => {}
+            // An already-deleted staging manifest is not an error.
+            Err(ObjectStoreError::NotFound { .. }) => {}
+            Err(e) => return Err(e.into()),
+        }
+        info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = staging_manifest_path.as_ref());
+
+        Ok(location)
+    }
+}
+
+#[async_trait]
+impl CommitHandler for ExternalManifestCommitHandler {
+    /// Resolves the location of the latest manifest under `base_path`.
+    ///
+    /// The external store is asked first. If it points at a path with the
+    /// manifest extension, that location is returned directly; otherwise the
+    /// path is treated as a staging manifest and finalized via
+    /// `finalize_manifest` before being returned. When the external store has
+    /// no record for the dataset, the lookup falls back to
+    /// `current_manifest_path` on the object store.
+    async fn resolve_latest_location(
+        &self,
+        base_path: &Path,
+        object_store: &ObjectStore,
+    ) -> std::result::Result<ManifestLocation, Error> {
+        let location = self
+            .external_manifest_store
+            .get_latest_manifest_location(base_path.as_ref())
+            .await?;
+
+        match location {
+            Some(ManifestLocation {
+                version,
+                path,
+                size,
+                naming_scheme,
+                e_tag,
+            }) => {
+                // The path is finalized, no need to check object store
+                if path.extension() == Some(MANIFEST_EXTENSION) {
+                    return Ok(ManifestLocation {
+                        version,
+                        path,
+                        size,
+                        naming_scheme,
+                        e_tag,
+                    });
+                }
+
+                // Use the size recorded by the external store when present;
+                // otherwise read size and e_tag from the object store.
+                let (size, e_tag) = if let Some(size) = size {
+                    (size, e_tag)
+                } else {
+                    match object_store.inner.head(&path).await {
+                        Ok(meta) => (meta.size, meta.e_tag),
+                        Err(ObjectStoreError::NotFound { .. }) => {
+                            // there may be other threads that have finished executing finalize_manifest.
+                            let new_location = self
+                                .external_manifest_store
+                                .get_manifest_location(base_path.as_ref(), version)
+                                .await?;
+                            return Ok(new_location);
+                        }
+                        Err(e) => return Err(e.into()),
+                    }
+                };
+
+                let final_location = self
+                    .finalize_manifest(
+                        base_path,
+                        &path,
+                        version,
+                        size,
+                        e_tag.clone(),
+                        &object_store.inner,
+                        naming_scheme,
+                    )
+                    .await?;
+
+                Ok(final_location)
+            }
+            // Dataset not found in the external store, this could be because the dataset did not
+            // use external store for commit before. In this case, we search for the latest manifest
+            None => current_manifest_path(object_store, base_path).await,
+        }
+    }
+
+    /// Resolve the manifest location of a specific `version` of the dataset at
+    /// `base_path`.
+    ///
+    /// The external manifest store is asked first. When it reports the version
+    /// as not found, the version is resolved against the object store with
+    /// `default_resolve_version`; if the manifest exists there, a best-effort
+    /// `put_if_not_exists` records it in the external store (a failure is only
+    /// logged) and the object-store location is returned.
+    ///
+    /// When the external store knows the version and the path already has the
+    /// `MANIFEST_EXTENSION` extension, the location is returned as is;
+    /// otherwise the manifest is handed to `finalize_manifest`.
+    ///
+    /// # Errors
+    ///
+    /// Returns a not-found error when neither store has the version, and
+    /// propagates any other external-store or object-store error.
+    async fn resolve_version_location(
+        &self,
+        base_path: &Path,
+        version: u64,
+        object_store: &dyn OSObjectStore,
+    ) -> std::result::Result<ManifestLocation, Error> {
+        let location_res = self
+            .external_manifest_store
+            .get_manifest_location(base_path.as_ref(), version)
+            .await;
+
+        let location = match location_res {
+            Ok(p) => p,
+            // This version is not recorded in the external manifest store yet;
+            // go directly to the object store.
+            Err(Error::NotFound { .. }) => {
+                let path = default_resolve_version(base_path, version, object_store)
+                    .await
+                    .map_err(|_| Error::not_found(format!("{}@{}", base_path, version)))?
+                    .path;
+                match object_store.head(&path).await {
+                    Ok(ObjectMeta { size, e_tag, .. }) => {
+                        // Best effort: a failure to record the manifest in the
+                        // external store is logged but does not fail the read.
+                        let res = self
+                            .external_manifest_store
+                            .put_if_not_exists(
+                                base_path.as_ref(),
+                                version,
+                                path.as_ref(),
+                                size,
+                                e_tag.clone(),
+                            )
+                            .await;
+                        if let Err(e) = res {
+                            warn!(
+                                "could not update external manifest store during load, with error: {}",
+                                e
+                            );
+                        }
+                        // NOTE(review): `filename()` is unwrapped; presumably a
+                        // resolved manifest path always has a file name — confirm.
+                        let naming_scheme =
+                            ManifestNamingScheme::detect_scheme_staging(path.filename().unwrap());
+                        return Ok(ManifestLocation {
+                            version,
+                            path,
+                            size: Some(size),
+                            naming_scheme,
+                            e_tag,
+                        });
+                    }
+                    Err(ObjectStoreError::NotFound { .. }) => {
+                        return Err(Error::not_found(path.to_string()));
+                    }
+                    Err(e) => return Err(e.into()),
+                }
+            }
+            Err(e) => return Err(e),
+        };
+
+        // finalized path, just return
+        if location.path.extension() == Some(MANIFEST_EXTENSION) {
+            return Ok(location);
+        }
+
+        let naming_scheme =
+            ManifestNamingScheme::detect_scheme_staging(location.path.filename().unwrap());
+
+        // Prefer the size recorded in the external store; only issue a `head`
+        // request when the store entry carries no size.
+        let (size, e_tag) = if let Some(size) = location.size {
+            (size, location.e_tag.clone())
+        } else {
+            let meta = object_store.head(&location.path).await?;
+            (meta.size as u64, meta.e_tag)
+        };
+
+        self.finalize_manifest(
+            base_path,
+            &location.path,
+            version,
+            size,
+            e_tag,
+            object_store,
+            naming_scheme,
+        )
+        .await
+    }
+
+    /// Commit `manifest` through the external manifest store.
+    ///
+    /// The manifest is first written to a staging path derived from the final
+    /// manifest path, then registered with the external store via `put`. On
+    /// success a version hint is written and the location returned by the
+    /// store is passed back to the caller.
+    ///
+    /// # Errors
+    ///
+    /// * Errors from computing the staging path or from `manifest_writer` are
+    ///   propagated with `?`.
+    /// * Any error from the external store's `put` is reported as
+    ///   `CommitError::CommitConflict` after the staging manifest has been
+    ///   deleted; the original error value is discarded.
+    /// * A failure to delete the staging manifest (other than "not found") is
+    ///   returned as `CommitError::OtherError`.
+    async fn commit(
+        &self,
+        manifest: &mut Manifest,
+        indices: Option<Vec<IndexMetadata>>,
+        base_path: &Path,
+        object_store: &ObjectStore,
+        manifest_writer: super::ManifestWriter,
+        naming_scheme: ManifestNamingScheme,
+        transaction: Option<Transaction>,
+    ) -> std::result::Result<ManifestLocation, CommitError> {
+        // path we get here is the path to the manifest we want to write
+        // use object_store.base_path.as_ref() for getting the root of the dataset
+
+        // step 1: Write the manifest we want to commit to object store with a temporary name
+        let path = naming_scheme.manifest_path(base_path, manifest.version);
+        let staging_path = make_staging_manifest_path(&path)?;
+        let write_res =
+            manifest_writer(object_store, manifest, indices, &staging_path, transaction).await?;
+
+        // step 2 & 3: Put the manifest to external store
+        let result = self
+            .external_manifest_store
+            .put(
+                base_path,
+                manifest.version,
+                &staging_path,
+                write_res.size as u64,
+                write_res.e_tag,
+                &object_store.inner,
+                naming_scheme,
+            )
+            .await;
+
+        match result {
+            Ok(location) => {
+                write_version_hint(object_store, base_path, manifest.version).await;
+                Ok(location)
+            }
+            // NOTE(review): every `put` failure is mapped to a commit conflict and
+            // the underlying error is dropped without being logged — confirm this
+            // is intended for non-conflict failures as well.
+            Err(_) => {
+                // delete the staging manifest
+                match object_store.inner.delete(&staging_path).await {
+                    Ok(_) => {}
+                    Err(ObjectStoreError::NotFound { .. }) => {}
+                    Err(e) => return Err(CommitError::OtherError(e.into())),
+                }
+                // The audit event is emitted for both the deleted and the
+                // already-missing case.
+                info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_DELETE, r#type=AUDIT_TYPE_MANIFEST, path = staging_path.as_ref());
+                Err(CommitError::CommitConflict {})
+            }
+        }
+    }
+
+    /// Delete the external manifest store's records for the dataset at `base_path`.
+    async fn delete(&self, base_path: &Path) -> Result<()> {
+        let store = &self.external_manifest_store;
+        store.delete(base_path.as_ref()).await
+    }
+}
--- a/vendor/lance-table/src/io/deletion.rs
+++ b/vendor/lance-table/src/io/deletion.rs
@ -0,0 +1,370 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use std::{collections::HashSet, sync::Arc};
+
+use arrow_array::{RecordBatch, UInt32Array};
+use arrow_ipc::CompressionType;
+use arrow_ipc::reader::FileReader as ArrowFileReader;
+use arrow_ipc::writer::{FileWriter as ArrowFileWriter, IpcWriteOptions};
+use arrow_schema::{ArrowError, DataType, Field, Schema};
+use bytes::Buf;
+use lance_core::error::{CorruptFileSnafu, box_error};
+use lance_core::utils::deletion::DeletionVector;
+use lance_core::utils::tracing::{AUDIT_MODE_CREATE, AUDIT_TYPE_DELETION, TRACE_FILE_AUDIT};
+use lance_core::{Error, Result};
+use lance_io::object_store::ObjectStore;
+use object_store::path::Path;
+use rand::Rng;
+use roaring::bitmap::RoaringBitmap;
+use snafu::ResultExt;
+use tracing::{info, instrument};
+
+use crate::format::{DeletionFile, DeletionFileType};
+
+pub const DELETIONS_DIR: &str = "_deletions";
+
+/// Build the Arrow schema of an Arrow deletion file: a single non-nullable
+/// `row_id` column of type `UInt32`.
+fn deletion_arrow_schema() -> Arc<Schema> {
+    let row_id = Field::new("row_id", DataType::UInt32, false);
+    Arc::new(Schema::new(vec![row_id]))
+}
+
+/// Compute the path of a deletion file underneath `base`.
+///
+/// The file lives in the `_deletions` directory and is named
+/// `{fragment_id}-{read_version}-{id}.{suffix}`, where the suffix comes from
+/// the deletion file's type.
+pub fn deletion_file_path(base: &Path, fragment_id: u64, deletion_file: &DeletionFile) -> Path {
+    let read_version = deletion_file.read_version;
+    let id = deletion_file.id;
+    let suffix = deletion_file.file_type.suffix();
+    let file_name = format!("{fragment_id}-{read_version}-{id}.{suffix}");
+    base.clone().join(DELETIONS_DIR).join(file_name)
+}
+
+/// Format the path of a deletion file relative to the dataset root, i.e.
+/// `_deletions/{fragment_id}-{read_version}-{id}.{suffix}`.
+pub fn relative_deletion_file_path(fragment_id: u64, deletion_file: &DeletionFile) -> String {
+    let read_version = deletion_file.read_version;
+    let id = deletion_file.id;
+    let suffix = deletion_file.file_type.suffix();
+    format!("{DELETIONS_DIR}/{fragment_id}-{read_version}-{id}.{suffix}")
+}
+
+/// Persist the deletion vector of a fragment as a deletion file.
+///
+/// A `Set` vector is written as a ZSTD-compressed Arrow IPC file holding one
+/// `row_id` column; a `Bitmap` vector is written in Roaring's serialized form.
+/// The file id is a random `u64`.
+///
+/// Returns the metadata of the written file, or `Ok(None)` when
+/// `removed_rows` contains no deletions (nothing is written in that case).
+pub async fn write_deletion_file(
+    base: &Path,
+    fragment_id: u64,
+    read_version: u64,
+    removed_rows: &DeletionVector,
+    object_store: &ObjectStore,
+) -> Result<Option<DeletionFile>> {
+    // Step 1: encode the vector. Only the encoding differs between variants.
+    let (file_type, num_deleted_rows, payload) = match removed_rows {
+        DeletionVector::NoDeletions => return Ok(None),
+        DeletionVector::Set(set) => {
+            let schema = deletion_arrow_schema();
+            let row_ids = Arc::new(UInt32Array::from_iter(set.iter().copied()));
+            let batch = RecordBatch::try_new(schema.clone(), vec![row_ids])?;
+
+            let write_options =
+                IpcWriteOptions::default().try_with_compression(Some(CompressionType::ZSTD))?;
+            let mut payload: Vec<u8> = Vec::new();
+            {
+                // Scoped so the writer's borrow of `payload` ends before it is moved.
+                let mut writer = ArrowFileWriter::try_new_with_options(
+                    &mut payload,
+                    schema.as_ref(),
+                    write_options,
+                )?;
+                writer.write(&batch)?;
+                writer.finish()?;
+            }
+            (DeletionFileType::Array, set.len(), payload)
+        }
+        DeletionVector::Bitmap(bitmap) => {
+            let mut payload: Vec<u8> = Vec::new();
+            bitmap.serialize_into(&mut payload)?;
+            (DeletionFileType::Bitmap, bitmap.len() as usize, payload)
+        }
+    };
+
+    // Step 2: describe the file, upload it and record the audit event.
+    let deletion_file = DeletionFile {
+        read_version,
+        id: rand::rng().random::<u64>(),
+        file_type,
+        num_deleted_rows: Some(num_deleted_rows),
+        base_id: None,
+    };
+    let path = deletion_file_path(base, fragment_id, &deletion_file);
+
+    object_store.put(&path, &payload).await?;
+
+    info!(target: TRACE_FILE_AUDIT, mode=AUDIT_MODE_CREATE, r#type=AUDIT_TYPE_DELETION, path = path.to_string());
+
+    Ok(Some(deletion_file))
+}
+
+/// Load the deletion vector stored in `deletion_file` for `fragment_id`.
+///
+/// Array files must contain exactly one record batch with the deletion schema
+/// and no null values; bitmap files are parsed as a serialized Roaring bitmap.
+/// The number of bytes read is recorded on the current tracing span.
+///
+/// # Errors
+///
+/// Returns a corrupt-file error when the content violates the rules above,
+/// and propagates object-store read errors.
+#[instrument(
+    level = "debug",
+    skip(base, object_store),
+    fields(
+        base = base.as_ref(),
+        bytes_read = tracing::field::Empty
+    )
+)]
+pub async fn read_deletion_file(
+    fragment_id: u64,
+    deletion_file: &DeletionFile,
+    base: &Path,
+    object_store: &ObjectStore,
+) -> Result<DeletionVector> {
+    let span = tracing::Span::current();
+
+    // Locating and fetching the file is the same for both encodings.
+    let path = deletion_file_path(base, fragment_id, deletion_file);
+    let data = object_store.read_one_all(&path).await?;
+    span.record("bytes_read", data.len());
+
+    match deletion_file.file_type {
+        DeletionFileType::Array => {
+            let reader = ArrowFileReader::try_new(std::io::Cursor::new(data), None)?;
+            let mut batches: Vec<RecordBatch> = reader
+                .collect::<std::result::Result<_, ArrowError>>()
+                .map_err(box_error)
+                .context(CorruptFileSnafu { path: path.clone() })?;
+
+            if batches.len() != 1 {
+                let message = format!(
+                    "Expected exactly one batch in deletion file, got {}",
+                    batches.len()
+                );
+                return Err(Error::corrupt_file(path, message));
+            }
+            let batch = batches.pop().unwrap();
+
+            let expected_schema = deletion_arrow_schema();
+            if batch.schema() != expected_schema {
+                let message = format!(
+                    "Expected schema {:?} in deletion file, got {:?}",
+                    expected_schema,
+                    batch.schema()
+                );
+                return Err(Error::corrupt_file(path, message));
+            }
+
+            let row_ids = batch.columns()[0]
+                .as_any()
+                .downcast_ref::<UInt32Array>()
+                .unwrap();
+
+            // Collecting into `Option` yields `None` as soon as a null is seen.
+            let Some(set) = row_ids.iter().collect::<Option<HashSet<u32>>>() else {
+                return Err(Error::corrupt_file(
+                    path,
+                    "Null values are not allowed in deletion files",
+                ));
+            };
+
+            Ok(DeletionVector::Set(set))
+        }
+        DeletionFileType::Bitmap => {
+            let bitmap = RoaringBitmap::deserialize_from(data.reader())
+                .map_err(box_error)
+                .context(CorruptFileSnafu { path })?;
+
+            Ok(DeletionVector::Bitmap(bitmap))
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+
+    use super::*;
+    use object_store::ObjectStoreExt;
+
+    /// Writing an empty deletion vector must not produce a deletion file.
+    #[tokio::test]
+    async fn test_write_no_deletions() {
+        let (object_store, base) = ObjectStore::from_uri("memory:///no_deletion")
+            .await
+            .unwrap();
+
+        let written =
+            write_deletion_file(&base, 0, 0, &DeletionVector::NoDeletions, &object_store)
+                .await
+                .unwrap();
+        assert!(written.is_none());
+    }
+
+    /// A `Set` vector is written as an Arrow file at the expected path and
+    /// decodes back to the same row ids.
+    #[tokio::test]
+    async fn test_write_array() {
+        let fragment_id = 21;
+        let read_version = 12;
+        let dv = DeletionVector::Set(HashSet::from_iter(0..100));
+
+        let object_store = ObjectStore::memory();
+        let base = Path::from("/write");
+        let written = write_deletion_file(&base, fragment_id, read_version, &dv, &object_store)
+            .await
+            .unwrap();
+
+        assert!(matches!(
+            written,
+            Some(DeletionFile {
+                file_type: DeletionFileType::Array,
+                ..
+            })
+        ));
+        let written = written.unwrap();
+        assert_eq!(written.read_version, read_version);
+
+        let file_path = deletion_file_path(&base, fragment_id, &written);
+        let expected_path = Path::from(format!("/write/_deletions/21-12-{}.arrow", written.id));
+        assert_eq!(file_path, expected_path);
+
+        // Read the raw bytes back and decode them with the plain Arrow reader.
+        let raw = object_store
+            .inner
+            .get(&file_path)
+            .await
+            .unwrap()
+            .bytes()
+            .await
+            .unwrap();
+        let mut batches: Vec<RecordBatch> =
+            ArrowFileReader::try_new(std::io::Cursor::new(raw), None)
+                .unwrap()
+                .collect::<std::result::Result<_, ArrowError>>()
+                .unwrap();
+
+        assert_eq!(batches.len(), 1);
+        let batch = batches.pop().unwrap();
+        assert_eq!(batch.schema(), deletion_arrow_schema());
+
+        let row_ids = batch["row_id"]
+            .as_any()
+            .downcast_ref::<UInt32Array>()
+            .unwrap();
+        let read_dv = DeletionVector::from_iter(row_ids.iter().map(|v| v.unwrap()));
+        assert_eq!(read_dv, dv);
+    }
+
+    /// A `Bitmap` vector is written as a serialized Roaring bitmap at the
+    /// expected path and decodes back to the same row ids.
+    #[tokio::test]
+    async fn test_write_bitmap() {
+        let fragment_id = 21;
+        let read_version = 12;
+        let dv = DeletionVector::Bitmap(RoaringBitmap::from_iter(0..100));
+
+        let object_store = ObjectStore::memory();
+        let base = Path::from("/bitmap");
+        let written = write_deletion_file(&base, fragment_id, read_version, &dv, &object_store)
+            .await
+            .unwrap();
+
+        assert!(matches!(
+            written,
+            Some(DeletionFile {
+                file_type: DeletionFileType::Bitmap,
+                ..
+            })
+        ));
+        let written = written.unwrap();
+        assert_eq!(written.read_version, read_version);
+
+        let file_path = deletion_file_path(&base, fragment_id, &written);
+        let expected_path = Path::from(format!("/bitmap/_deletions/21-12-{}.bin", written.id));
+        assert_eq!(file_path, expected_path);
+
+        // Read the raw bytes back and decode them with Roaring directly.
+        let raw = object_store
+            .inner
+            .get(&file_path)
+            .await
+            .unwrap()
+            .bytes()
+            .await
+            .unwrap();
+        let read_bitmap = RoaringBitmap::deserialize_from(raw.reader()).unwrap();
+        assert_eq!(read_bitmap, dv.into_iter().collect::<RoaringBitmap>());
+    }
+
+    /// A `Set` vector survives a write followed by a read.
+    #[tokio::test]
+    async fn test_roundtrip_array() {
+        let fragment_id = 21;
+        let read_version = 12;
+        let dv = DeletionVector::Set(HashSet::from_iter(0..100));
+
+        let object_store = ObjectStore::memory();
+        let base = Path::from("/roundtrip");
+        let written = write_deletion_file(&base, fragment_id, read_version, &dv, &object_store)
+            .await
+            .unwrap()
+            .unwrap();
+
+        let read_dv = read_deletion_file(fragment_id, &written, &base, &object_store)
+            .await
+            .unwrap();
+        assert_eq!(read_dv, dv);
+    }
+
+    /// A `Bitmap` vector survives a write followed by a read.
+    #[tokio::test]
+    async fn test_roundtrip_bitmap() {
+        let fragment_id = 21;
+        let read_version = 12;
+        let dv = DeletionVector::Bitmap(RoaringBitmap::from_iter(0..100));
+
+        let object_store = ObjectStore::memory();
+        let base = Path::from("/bitmap");
+        let written = write_deletion_file(&base, fragment_id, read_version, &dv, &object_store)
+            .await
+            .unwrap()
+            .unwrap();
+
+        let read_dv = read_deletion_file(fragment_id, &written, &base, &object_store)
+            .await
+            .unwrap();
+        assert_eq!(read_dv, dv);
+    }
+}
--- a/vendor/lance-table/src/io/manifest.rs
+++ b/vendor/lance-table/src/io/manifest.rs
@ -0,0 +1,344 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use async_trait::async_trait;
+use byteorder::{ByteOrder, LittleEndian};
+use bytes::{Bytes, BytesMut};
+use lance_arrow::DataTypeExt;
+use lance_file::{
+    previous::writer::ManifestProvider as PreviousManifestProvider, version::LanceFileVersion,
+};
+use object_store::ObjectStoreExt;
+use object_store::path::Path;
+use prost::Message;
+use std::collections::HashMap;
+use std::{ops::Range, sync::Arc};
+use tracing::instrument;
+
+use lance_core::{Error, Result, datatypes::Schema};
+use lance_io::{
+    encodings::{Encoder, binary::BinaryEncoder, plain::PlainEncoder},
+    object_store::ObjectStore,
+    traits::{WriteExt, Writer},
+    utils::read_message,
+};
+
+use crate::format::{DataStorageFormat, IndexMetadata, MAGIC, Manifest, Transaction, pb};
+
+use super::commit::ManifestLocation;
+
+/// Read Manifest on URI.
+///
+/// This only reads manifest files. It does not read data files.
+///
+/// The last 64 KiB of the file are fetched first. The final 16 bytes are the
+/// footer: bytes `[len-16, len-8)` hold the little-endian offset at which the
+/// manifest message starts, and the file must end with `MAGIC`. The message
+/// itself is a 4-byte little-endian length followed by the protobuf payload.
+/// When the manifest is larger than the prefetched tail, one more range
+/// request fetches the missing head.
+///
+/// `known_size` avoids a `head` request. If the tail read with `known_size`
+/// does not look like a manifest footer, the read is retried once without it.
+///
+/// # Errors
+///
+/// Returns a corrupt-file error when the file is shorter than the footer, the
+/// magic number does not match, or the recorded manifest offset does not
+/// leave room for the length prefix and footer inside the file. Returns an
+/// invalid-input error when the recorded message length disagrees with the
+/// bytes found. Object-store and protobuf decoding errors are propagated.
+#[instrument(level = "debug", skip(object_store))]
+pub async fn read_manifest(
+    object_store: &ObjectStore,
+    path: &Path,
+    known_size: Option<u64>,
+) -> Result<Manifest> {
+    const PREFETCH_SIZE: u64 = 64 * 1024;
+    // 4-byte length prefix + 16-byte footer: the smallest region a manifest
+    // message can occupy (an empty protobuf payload).
+    const MIN_MANIFEST_LEN: u64 = 4 + 16;
+
+    let file_size = if let Some(known_size) = known_size {
+        known_size
+    } else {
+        object_store.inner.head(path).await?.size
+    };
+    let initial_start = file_size.saturating_sub(PREFETCH_SIZE);
+    let range = Range {
+        start: initial_start,
+        end: file_size,
+    };
+    let buf = object_store.inner.get_range(path, range).await?;
+
+    // In case of corruption, the known_size might be wrong. We can retry without
+    // the size to be more robust.
+    if (buf.len() < 16 || !buf.ends_with(MAGIC)) && known_size.is_some() {
+        return Box::pin(read_manifest(object_store, path, None)).await;
+    }
+
+    if buf.len() < 16 {
+        return Err(Error::corrupt_file(
+            path.clone(),
+            "Invalid format: file size is smaller than 16 bytes".to_string(),
+        ));
+    }
+    if !buf.ends_with(MAGIC) {
+        return Err(Error::corrupt_file(
+            path.clone(),
+            "Invalid format: magic number does not match".to_string(),
+        ));
+    }
+
+    // The offset comes from the file, so it must be validated before it is used
+    // in arithmetic or slicing: a negative or too-large value would otherwise
+    // underflow or slice out of bounds and panic instead of reporting corruption.
+    let raw_manifest_pos = LittleEndian::read_i64(&buf[buf.len() - 16..buf.len() - 8]);
+    let manifest_pos = u64::try_from(raw_manifest_pos)
+        .ok()
+        .filter(|pos| {
+            pos.checked_add(MIN_MANIFEST_LEN)
+                .is_some_and(|min_end| min_end <= file_size)
+        })
+        .ok_or_else(|| {
+            Error::corrupt_file(
+                path.clone(),
+                format!(
+                    "Invalid format: manifest position {} is out of bounds for file size {}",
+                    raw_manifest_pos, file_size
+                ),
+            )
+        })?;
+    let manifest_len = (file_size - manifest_pos) as usize;
+
+    let buf: Bytes = if manifest_len <= buf.len() {
+        // The prefetch captured the entire manifest. We just need to trim the buffer.
+        buf.slice(buf.len() - manifest_len..buf.len())
+    } else {
+        // The prefetch only captured part of the manifest. We need to make an
+        // additional range request to read the remainder, i.e. everything from
+        // the manifest start up to where the prefetched tail begins.
+        let mut buf2: BytesMut = object_store
+            .inner
+            .get_range(
+                path,
+                Range {
+                    start: manifest_pos,
+                    end: initial_start,
+                },
+            )
+            .await?
+            .into_iter()
+            .collect();
+        buf2.extend_from_slice(&buf);
+        buf2.freeze()
+    };
+
+    let recorded_length = LittleEndian::read_u32(&buf[0..4]) as usize;
+    // Need to trim the magic number at end and message length at beginning
+    let buf = buf.slice(4..buf.len() - 16);
+
+    if buf.len() != recorded_length {
+        return Err(Error::invalid_input(format!(
+            "Invalid format: manifest length does not match. Expected {}, got {}",
+            recorded_length,
+            buf.len()
+        )));
+    }
+
+    let proto = pb::Manifest::decode(buf)?;
+    Manifest::try_from(proto)
+}
+
+/// Read the index metadata section referenced by `manifest`.
+///
+/// Returns an empty list when the manifest has no index section. The
+/// manifest file is opened with the size from `location` when it is known.
+#[instrument(level = "debug", skip(object_store, manifest))]
+pub async fn read_manifest_indexes(
+    object_store: &ObjectStore,
+    location: &ManifestLocation,
+    manifest: &Manifest,
+) -> Result<Vec<IndexMetadata>> {
+    let Some(pos) = manifest.index_section.as_ref() else {
+        return Ok(vec![]);
+    };
+
+    let reader = match location.size {
+        Some(size) => {
+            object_store
+                .open_with_size(&location.path, size as usize)
+                .await?
+        }
+        None => object_store.open(&location.path).await?,
+    };
+    let section: pb::IndexSection = read_message(reader.as_ref(), *pos).await?;
+
+    let mut indices = Vec::with_capacity(section.indices.len());
+    for index in section.indices {
+        indices.push(IndexMetadata::try_from(index)?);
+    }
+    Ok(indices)
+}
+
+/// Write the optional index and transaction sections followed by the manifest.
+///
+/// The position of each section that is written is stored back into
+/// `manifest` before the manifest struct itself is written. Returns the value
+/// produced by `write_struct` for the manifest.
+async fn do_write_manifest(
+    writer: &mut dyn Writer,
+    manifest: &mut Manifest,
+    indices: Option<Vec<IndexMetadata>>,
+    transaction: Option<Transaction>,
+) -> Result<usize> {
+    // Index section, when indices were supplied.
+    if let Some(index_list) = &indices {
+        let section = pb::IndexSection {
+            indices: index_list.iter().map(Into::into).collect(),
+        };
+        manifest.index_section = Some(writer.write_protobuf(&section).await?);
+    }
+
+    // Inline transaction, converted to protobuf at the write boundary.
+    if let Some(tx) = transaction {
+        let pb_tx: pb::Transaction = tx.into();
+        manifest.transaction_section = Some(writer.write_protobuf(&pb_tx).await?);
+    }
+
+    writer.write_struct(manifest).await
+}
+
+/// Write manifest to an open file.
+///
+/// When the manifest uses the legacy storage format, the value array of every
+/// dictionary-typed field is written first and its offset and length are
+/// recorded in the field's dictionary info. The index section, inline
+/// transaction and manifest are then written by `do_write_manifest`, whose
+/// result is returned.
+///
+/// # Errors
+///
+/// Returns an I/O error when a dictionary field lacks its dictionary info or
+/// value array, a schema error when the dictionary value type is neither
+/// numeric nor binary-like, and propagates encoder and writer errors.
+pub async fn write_manifest(
+    writer: &mut dyn Writer,
+    manifest: &mut Manifest,
+    indices: Option<Vec<IndexMetadata>>,
+    transaction: Option<Transaction>,
+) -> Result<usize> {
+    // Write dictionary values.
+    // An empty schema yields -1, which makes the range below empty.
+    let max_field_id = manifest.schema.max_field_id().unwrap_or(-1);
+    let is_legacy_storage = manifest.should_use_legacy_format();
+    for field_id in 0..max_field_id + 1 {
+        if let Some(field) = manifest.schema.mut_field_by_id(field_id)
+            && field.data_type().is_dictionary()
+            && is_legacy_storage
+        {
+            let dict_info = field.dictionary.as_mut().ok_or_else(|| {
+                Error::io(format!("Lance field {} misses dictionary info", field.name))
+            })?;
+
+            let value_arr = dict_info.values.as_ref().ok_or_else(|| {
+                Error::io(format!(
+                    "Lance field {} is dictionary type, but misses the dictionary value array",
+                    field.name
+                ))
+            })?;
+
+            // Pick the encoder from the value type; `pos` is where the encoded
+            // values start in the output.
+            let data_type = value_arr.data_type();
+            let pos = match data_type {
+                dt if dt.is_numeric() => {
+                    let mut encoder = PlainEncoder::new(writer, dt);
+                    encoder.encode(&[value_arr]).await?
+                }
+                dt if dt.is_binary_like() => {
+                    let mut encoder = BinaryEncoder::new(writer);
+                    encoder.encode(&[value_arr]).await?
+                }
+                _ => {
+                    return Err(Error::schema(format!(
+                        "Does not support {} as dictionary value type",
+                        value_arr.data_type()
+                    )));
+                }
+            };
+            dict_info.offset = pos;
+            dict_info.length = value_arr.len();
+        }
+    }
+
+    do_write_manifest(writer, manifest, indices, transaction).await
+}
+
+/// A `ManifestProvider` that makes a Lance file self-describing by embedding
+/// a manifest holding only the schema and otherwise default fields.
+pub struct ManifestDescribing {}
+
+#[async_trait]
+impl PreviousManifestProvider for ManifestDescribing {
+    /// Write a schema-only legacy-format manifest through `object_writer` and
+    /// return the position reported by the write.
+    async fn store_schema(
+        object_writer: &mut dyn Writer,
+        schema: &Schema,
+    ) -> Result<Option<usize>> {
+        let no_fragments = Arc::new(vec![]);
+        let storage_format = DataStorageFormat::new(LanceFileVersion::Legacy);
+        let mut schema_only_manifest = Manifest::new(
+            schema.clone(),
+            no_fragments,
+            storage_format,
+            HashMap::new(),
+        );
+        do_write_manifest(object_writer, &mut schema_only_manifest, None, None)
+            .await
+            .map(Some)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use arrow_array::{Int32Array, RecordBatch};
+    use std::collections::HashMap;
+
+    use crate::format::SelfDescribingFileReader;
+    use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
+    use lance_file::format::{MAGIC, MAJOR_VERSION, MINOR_VERSION};
+    use lance_file::previous::{
+        reader::FileReader as PreviousFileReader, writer::FileWriter as PreviousFileWriter,
+    };
+    use rand::{Rng, distr::Alphanumeric};
+    use tokio::io::AsyncWriteExt;
+
+    use super::*;
+
+    /// Write a manifest after `prefix_size` bytes of unrelated data and check
+    /// that `read_manifest` returns an equal manifest.
+    ///
+    /// `manifest_min_size` is the length of the single field name, which
+    /// forces the serialized manifest to be at least that large, so values
+    /// above the reader's 64 KiB prefetch window exercise its second range
+    /// request.
+    async fn test_roundtrip_manifest(prefix_size: usize, manifest_min_size: usize) {
+        let store = ObjectStore::memory();
+        let path = Path::from("/read_large_manifest");
+
+        let mut writer = store.create(&path).await.unwrap();
+
+        // Write prefix we should ignore
+        let prefix: Vec<u8> = rand::rng()
+            .sample_iter(&Alphanumeric)
+            .take(prefix_size)
+            .collect();
+        writer.write_all(&prefix).await.unwrap();
+
+        let long_name: String = rand::rng()
+            .sample_iter(&Alphanumeric)
+            .take(manifest_min_size)
+            .map(char::from)
+            .collect();
+
+        let arrow_schema =
+            ArrowSchema::new(vec![ArrowField::new(long_name, DataType::Int64, false)]);
+        let schema = Schema::try_from(&arrow_schema).unwrap();
+
+        let mut manifest = Manifest::new(
+            schema,
+            Arc::new(vec![]),
+            DataStorageFormat::default(),
+            HashMap::new(),
+        );
+        let pos = write_manifest(writer.as_mut(), &mut manifest, None, None)
+            .await
+            .unwrap();
+        writer
+            .write_magics(pos, MAJOR_VERSION, MINOR_VERSION, MAGIC)
+            .await
+            .unwrap();
+        Writer::shutdown(writer.as_mut()).await.unwrap();
+
+        let roundtripped_manifest = read_manifest(&store, &path, None).await.unwrap();
+
+        assert_eq!(manifest, roundtripped_manifest);
+
+        store.inner.delete(&path).await.unwrap();
+    }
+
+    /// Round-trip manifests with and without a prefix, both larger and smaller
+    /// than the reader's prefetch window.
+    #[tokio::test]
+    async fn test_read_large_manifest() {
+        let cases = [(0, 100_000), (1000, 100_000), (1000, 1000)];
+        for (prefix_size, manifest_min_size) in cases {
+            test_roundtrip_manifest(prefix_size, manifest_min_size).await;
+        }
+    }
+
+    /// Metadata passed to `finish_with_metadata` shows up in the schema of a
+    /// self-described file read back from the store.
+    #[tokio::test]
+    async fn test_update_schema_metadata() {
+        let store = ObjectStore::memory();
+        let path = Path::from("/update_schema_metadata");
+
+        let int_field = ArrowField::new("i", DataType::Int32, false);
+        let arrow_schema = Arc::new(ArrowSchema::new(vec![int_field]));
+        let lance_schema = Schema::try_from(arrow_schema.as_ref()).unwrap();
+        let mut file_writer = PreviousFileWriter::<ManifestDescribing>::try_new(
+            &store,
+            &path,
+            lance_schema.clone(),
+            &Default::default(),
+        )
+        .await
+        .unwrap();
+
+        let values = Int32Array::from_iter_values(0..10);
+        let batch = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(values)]).unwrap();
+        file_writer
+            .write(std::slice::from_ref(&batch))
+            .await
+            .unwrap();
+
+        let mut extra_metadata = HashMap::new();
+        extra_metadata.insert(String::from("lance:extra"), String::from("for_test"));
+        file_writer
+            .finish_with_metadata(&extra_metadata)
+            .await
+            .unwrap();
+
+        let raw_reader = store.open(&path).await.unwrap();
+        let file_reader =
+            PreviousFileReader::try_new_self_described_from_reader(raw_reader.into(), None)
+                .await
+                .unwrap();
+        let read_schema = ArrowSchema::from(file_reader.schema());
+        assert_eq!(
+            read_schema.metadata().get("lance:extra").unwrap(),
+            "for_test"
+        );
+    }
+}
--- a/vendor/lance-table/src/lib.rs
+++ b/vendor/lance-table/src/lib.rs
@ -0,0 +1,8 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+pub mod feature_flags;
+pub mod format;
+pub mod io;
+pub mod rowids;
+pub mod utils;
--- a/vendor/lance-table/src/rowids.rs
+++ b/vendor/lance-table/src/rowids.rs
--- a/vendor/lance-table/src/rowids/bitmap.rs
+++ b/vendor/lance-table/src/rowids/bitmap.rs
@ -0,0 +1,314 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use deepsize::DeepSizeOf;
+
+/// A sequence of `len` bits packed into bytes.
+///
+/// Bit `i` lives in byte `i / 8` at bit position `i % 8`, so the
+/// least-significant bit of each byte holds the lowest index.
+#[derive(PartialEq, Eq, Clone, DeepSizeOf)]
+pub struct Bitmap {
+    /// Packed bit storage. The constructors allocate `len.div_ceil(8)` bytes.
+    pub data: Vec<u8>,
+    /// Number of bits in the bitmap.
+    pub len: usize,
+}
+
+impl std::fmt::Debug for Bitmap {
+    /// Renders the bitmap as `Bitmap { data: <bits>, len: <len> }`, where
+    /// `<bits>` is one `0`/`1` character per bit in index order.
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "Bitmap {{ data: ")?;
+        for is_set in (0..self.len).map(|idx| self.get(idx)) {
+            f.write_str(if is_set { "1" } else { "0" })?;
+        }
+        write!(f, ", len: {} }}", self.len)
+    }
+}
+
+impl Bitmap {
+    /// Creates a bitmap of `len` bits with every bit unset.
+    pub fn new_empty(len: usize) -> Self {
+        Self {
+            data: vec![0; len.div_ceil(8)],
+            len,
+        }
+    }
+
+    /// Creates a bitmap of `len` bits with every bit set.
+    ///
+    /// Padding bits in the final byte (positions at or beyond `len`) are left
+    /// unset, so byte-wise equality and `count_ones` only see real bits.
+    pub fn new_full(len: usize) -> Self {
+        let mut data = vec![0xff; len.div_ceil(8)];
+        let used_bits = len % 8;
+        if used_bits != 0 {
+            // Keep only the low `used_bits` bits of the trailing byte.
+            let keep_mask = (1u8 << used_bits) - 1;
+            if let Some(tail) = data.last_mut() {
+                *tail &= keep_mask;
+            }
+        }
+        Self { data, len }
+    }
+
+    /// Sets bit `i`. Panics if `i / 8` is outside the backing bytes.
+    pub fn set(&mut self, i: usize) {
+        let mask = 1 << (i % 8);
+        self.data[i / 8] |= mask;
+    }
+
+    /// Clears bit `i`. Panics if `i / 8` is outside the backing bytes.
+    pub fn clear(&mut self, i: usize) {
+        let mask = 1 << (i % 8);
+        self.data[i / 8] &= !mask;
+    }
+
+    /// Returns whether bit `i` is set. Panics if `i / 8` is outside the backing bytes.
+    pub fn get(&self, i: usize) -> bool {
+        let mask = 1 << (i % 8);
+        (self.data[i / 8] & mask) != 0
+    }
+
+    /// Number of bits in the bitmap.
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Returns a borrowed view of `len` bits beginning at bit `start`.
+    ///
+    /// No bounds are checked here.
+    pub fn slice(&self, start: usize, len: usize) -> BitmapSlice<'_> {
+        BitmapSlice {
+            bitmap: self,
+            start,
+            len,
+        }
+    }
+
+    /// Counts the set bits across all backing bytes.
+    pub fn count_ones(&self) -> usize {
+        self.data
+            .iter()
+            .fold(0, |total, byte| total + byte.count_ones() as usize)
+    }
+
+    /// Counts the unset bits, computed as `len - count_ones()`.
+    pub fn count_zeros(&self) -> usize {
+        self.len - self.count_ones()
+    }
+
+    /// Iterates over the bits in index order, yielding at most `len` values.
+    pub fn iter(&self) -> impl Iterator<Item = bool> + '_ {
+        self.data
+            .iter()
+            .copied()
+            .flat_map(|byte| (0..8u32).map(move |bit| (byte >> bit) & 1 == 1))
+            .take(self.len)
+    }
+}
+
+impl From<&[bool]> for Bitmap {
+    /// Builds a bitmap with one bit per input element, set where the element is `true`.
+    fn from(slice: &[bool]) -> Self {
+        let mut packed = Self::new_empty(slice.len());
+        slice
+            .iter()
+            .enumerate()
+            .filter(|(_, flag)| **flag)
+            .for_each(|(pos, _)| packed.set(pos));
+        packed
+    }
+}
+
+/// A borrowed view over `len` consecutive bits of a [`Bitmap`], starting at
+/// bit `start`.
+pub struct BitmapSlice<'a> {
+    // The bitmap being viewed.
+    bitmap: &'a Bitmap,
+    // Index of the first bit of the view within `bitmap`.
+    start: usize,
+    // Number of bits in the view.
+    len: usize,
+}
+
+impl BitmapSlice<'_> {
+    /// Counts the set bits inside the view.
+    ///
+    /// The first and last bytes touched by the view are masked down to the bits
+    /// that belong to it; any bytes strictly in between are counted whole.
+    pub fn count_ones(&self) -> usize {
+        if self.len == 0 {
+            return 0;
+        }
+        let bytes = &self.bitmap.data;
+        // Index of the last bit in the view (inclusive).
+        let final_bit = self.start + self.len - 1;
+        let head = self.start / 8;
+        let tail = final_bit / 8;
+        // Bits `start % 8 ..= 7` of the head byte.
+        let head_mask = 0xffu8 << (self.start % 8);
+        // Bits `0 ..= final_bit % 8` of the tail byte.
+        let tail_mask = 0xffu8 >> (7 - final_bit % 8);
+
+        if head == tail {
+            // The whole view sits inside one byte: apply both masks at once.
+            return (bytes[head] & head_mask & tail_mask).count_ones() as usize;
+        }
+
+        let head_count = (bytes[head] & head_mask).count_ones() as usize;
+        let tail_count = (bytes[tail] & tail_mask).count_ones() as usize;
+        let interior_count: usize = bytes[head + 1..tail]
+            .iter()
+            .map(|byte| byte.count_ones() as usize)
+            .sum();
+        head_count + tail_count + interior_count
+    }
+
+    /// Counts the unset bits inside the view, computed as `len - count_ones()`.
+    pub fn count_zeros(&self) -> usize {
+        self.len - self.count_ones()
+    }
+}
+
+impl From<BitmapSlice<'_>> for Bitmap {
+    /// Copies the bits of the view into a new bitmap whose bit 0 is the view's first bit.
+    fn from(slice: BitmapSlice) -> Self {
+        let BitmapSlice {
+            bitmap: source,
+            start,
+            len,
+        } = slice;
+        let mut copy = Self::new_empty(len);
+        (0..len)
+            .filter(|&offset| source.get(start + offset))
+            .for_each(|offset| copy.set(offset));
+        copy
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use proptest::prop_assert_eq;
+
+    /// Exercises set/clear/count_ones, the Debug rendering, and a slice count.
+    #[test]
+    fn test_bitmap() {
+        let mut bitmap = Bitmap::new_empty(10);
+        assert_eq!(bitmap.len(), 10);
+        assert_eq!(bitmap.count_ones(), 0);
+
+        bitmap.set(0);
+        bitmap.set(1);
+        bitmap.set(4);
+        bitmap.set(5);
+        bitmap.set(9);
+        assert_eq!(bitmap.count_ones(), 5);
+        assert_eq!(
+            format!("{:?}", bitmap),
+            "Bitmap { data: 1100110001, len: 10 }"
+        );
+
+        bitmap.clear(1);
+        bitmap.clear(4);
+        assert_eq!(bitmap.count_ones(), 3);
+        assert_eq!(
+            format!("{:?}", bitmap),
+            "Bitmap { data: 1000010001, len: 10 }"
+        );
+
+        // Bits 5..10 of the bitmap: bits 5 and 9 are set.
+        let bitmap_slice = bitmap.slice(5, 5);
+        assert_eq!(bitmap_slice.count_ones(), 2);
+    }
+
+    /// A bitmap built up from empty must equal one carved down from full.
+    #[test]
+    fn test_equality() {
+        // Lengths 48..56 cover every possible `len % 8`, i.e. every amount of
+        // padding in the final byte.
+        for len in 48..56 {
+            let mut bitmap1 = Bitmap::new_empty(len);
+            for i in 0..len {
+                if i % 2 == 0 {
+                    bitmap1.set(i);
+                }
+            }
+
+            let mut bitmap2 = Bitmap::new_full(len);
+            for i in 0..len {
+                if i % 2 == 1 {
+                    bitmap2.clear(i);
+                }
+            }
+
+            assert_eq!(bitmap1, bitmap2);
+        }
+    }
+
+    proptest::proptest! {
+        /// `BitmapSlice::count_ones` must agree with counting `true` values in
+        /// the matching sub-slice of the source booleans.
+        #[test]
+        fn test_bitmap_slice(
+            values in proptest::collection::vec(proptest::bool::ANY, 0..100),
+            mut start in 0..100usize,
+            mut len in 0..100usize,
+        ) {
+            // Clamp the generated window so it stays inside `values`.
+            if start > values.len() {
+                start = values.len();
+            }
+            if len > values.len() - start {
+                len = values.len() - start;
+            }
+
+            let bitmap = Bitmap::from(values.as_slice());
+            let slice = bitmap.slice(start, len);
+            let values_slice = values[start..(start + len)].to_vec();
+
+            prop_assert_eq!(slice.count_ones(), values_slice.iter().filter(|&&x| x).count());
+        }
+    }
+
+    /// Iterating an empty bitmap yields `len` false values.
+    #[test]
+    fn test_bitmap_iter_empty() {
+        let bitmap = Bitmap::new_empty(10);
+        let values: Vec<bool> = bitmap.iter().collect();
+        assert_eq!(values, vec![false; 10]);
+    }
+
+    /// Iterating a full bitmap yields `len` true values (padding bits are not yielded).
+    #[test]
+    fn test_bitmap_iter_full() {
+        let bitmap = Bitmap::new_full(10);
+        let values: Vec<bool> = bitmap.iter().collect();
+        assert_eq!(values, vec![true; 10]);
+    }
+
+    /// Iteration reports exactly the bits that were set, in index order.
+    #[test]
+    fn test_bitmap_iter_partial() {
+        let mut bitmap = Bitmap::new_empty(10);
+        bitmap.set(0);
+        bitmap.set(3);
+        bitmap.set(7);
+        bitmap.set(9);
+
+        let values: Vec<bool> = bitmap.iter().collect();
+        let expected = vec![
+            true,  // 0
+            false, // 1
+            false, // 2
+            true,  // 3
+            false, // 4
+            false, // 5
+            false, // 6
+            true,  // 7
+            false, // 8
+            true,  // 9
+        ];
+        assert_eq!(values, expected);
+    }
+
+    /// Iteration over a length that ends partway through the final byte.
+    #[test]
+    fn test_bitmap_iter_edge_cases() {
+        // Test with length that's not a multiple of 8
+        let mut bitmap = Bitmap::new_empty(15);
+        bitmap.set(0);
+        bitmap.set(7);
+        bitmap.set(14);
+
+        let values: Vec<bool> = bitmap.iter().collect();
+        let expected = vec![
+            true,  // 0
+            false, // 1
+            false, // 2
+            false, // 3
+            false, // 4
+            false, // 5
+            false, // 6
+            true,  // 7
+            false, // 8
+            false, // 9
+            false, // 10
+            false, // 11
+            false, // 12
+            false, // 13
+            true,  // 14
+        ];
+        assert_eq!(values, expected);
+    }
+
+    proptest::proptest! {
+        /// Round trip: building a bitmap from booleans and iterating it gives
+        /// the booleans back.
+        #[test]
+        fn test_bitmap_iter_property(
+            values in proptest::collection::vec(proptest::bool::ANY, 0..100)
+        ) {
+            let bitmap = Bitmap::from(values.as_slice());
+            let iter_values: Vec<bool> = bitmap.iter().collect();
+            assert_eq!(iter_values, values);
+        }
+    }
+}
--- a/vendor/lance-table/src/rowids/encoded_array.rs
+++ b/vendor/lance-table/src/rowids/encoded_array.rs
@ -0,0 +1,400 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use std::ops::Range;
+
+use deepsize::DeepSizeOf;
+
+/// Encoded array of u64 values.
+///
+/// This is an internal data type used as part of row id indices.
+///
+/// In the offset-based variants each value is `base + offset`. The `From`
+/// conversions in this file always choose the smallest value as `base`.
+#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
+pub enum EncodedU64Array {
+    /// u64 values represented as u16 offset from a base value.
+    ///
+    /// Useful when the min and max value are within u16 range (0..65535).
+    /// Only space saving when there are more than 2 values.
+    U16 { base: u64, offsets: Vec<u16> },
+    /// u64 values represented as u32 offset from a base value.
+    ///
+    /// Useful when the min and max value are within u32 range (0..~4 billion).
+    U32 { base: u64, offsets: Vec<u32> },
+    /// Just a plain vector of u64 values.
+    ///
+    /// For when the values cover a wide range.
+    U64(Vec<u64>),
+}
+
+impl EncodedU64Array {
+    /// Number of values stored in the array.
+    pub fn len(&self) -> usize {
+        match self {
+            Self::U64(values) => values.len(),
+            Self::U16 { offsets, .. } => offsets.len(),
+            Self::U32 { offsets, .. } => offsets.len(),
+        }
+    }
+
+    /// Iterates over the decoded values in storage order, from either end.
+    pub fn iter(&self) -> Box<dyn DoubleEndedIterator<Item = u64> + '_> {
+        match self {
+            Self::U16 { base, offsets } => {
+                Box::new(offsets.iter().map(move |&delta| base + u64::from(delta)))
+            }
+            Self::U32 { base, offsets } => {
+                Box::new(offsets.iter().map(move |&delta| base + u64::from(delta)))
+            }
+            Self::U64(values) => Box::new(values.iter().copied()),
+        }
+    }
+
+    /// Returns the decoded value at position `i`, or `None` when `i` is out of bounds.
+    pub fn get(&self, i: usize) -> Option<u64> {
+        match self {
+            Self::U16 { base, offsets } => offsets.get(i).map(|&delta| *base + u64::from(delta)),
+            Self::U32 { base, offsets } => offsets.get(i).map(|&delta| *base + u64::from(delta)),
+            Self::U64(values) => values.get(i).copied(),
+        }
+    }
+
+    /// Returns the smallest value, or `None` when the array is empty.
+    ///
+    /// The offset-based variants report `base` without scanning the offsets.
+    pub fn min(&self) -> Option<u64> {
+        match self {
+            Self::U16 { base, offsets } => (!offsets.is_empty()).then_some(*base),
+            Self::U32 { base, offsets } => (!offsets.is_empty()).then_some(*base),
+            Self::U64(values) => values.iter().copied().min(),
+        }
+    }
+
+    /// Returns the largest value, or `None` when the array is empty.
+    pub fn max(&self) -> Option<u64> {
+        match self {
+            Self::U16 { base, offsets } => offsets
+                .iter()
+                .copied()
+                .max()
+                .map(|delta| *base + u64::from(delta)),
+            Self::U32 { base, offsets } => offsets
+                .iter()
+                .copied()
+                .max()
+                .map(|delta| *base + u64::from(delta)),
+            Self::U64(values) => values.iter().copied().max(),
+        }
+    }
+
+    /// Returns the value at the first position, or `None` when the array is empty.
+    pub fn first(&self) -> Option<u64> {
+        match self {
+            Self::U16 { base, offsets } => offsets.first().map(|&delta| *base + u64::from(delta)),
+            Self::U32 { base, offsets } => offsets.first().map(|&delta| *base + u64::from(delta)),
+            Self::U64(values) => values.first().copied(),
+        }
+    }
+
+    /// Returns the value at the last position, or `None` when the array is empty.
+    pub fn last(&self) -> Option<u64> {
+        match self {
+            Self::U16 { base, offsets } => offsets.last().map(|&delta| *base + u64::from(delta)),
+            Self::U32 { base, offsets } => offsets.last().map(|&delta| *base + u64::from(delta)),
+            Self::U64(values) => values.last().copied(),
+        }
+    }
+
+    /// Binary-searches for `val`, with the same result convention as
+    /// `slice::binary_search`: `Ok(pos)` when found, `Err(insert_pos)` otherwise.
+    ///
+    /// For the offset variants, a `val` below `base` yields `Err(0)` and a `val`
+    /// too far above `base` to fit the offset type yields `Err(len)`.
+    pub fn binary_search(&self, val: u64) -> std::result::Result<usize, usize> {
+        match self {
+            Self::U16 { base, offsets } => {
+                let Some(delta) = val.checked_sub(*base) else {
+                    return Err(0);
+                };
+                match u16::try_from(delta) {
+                    Ok(narrow) => offsets.binary_search(&narrow),
+                    Err(_) => Err(offsets.len()),
+                }
+            }
+            Self::U32 { base, offsets } => {
+                let Some(delta) = val.checked_sub(*base) else {
+                    return Err(0);
+                };
+                match u32::try_from(delta) {
+                    Ok(narrow) => offsets.binary_search(&narrow),
+                    Err(_) => Err(offsets.len()),
+                }
+            }
+            Self::U64(values) => values.binary_search(&val),
+        }
+    }
+
+    /// Copies `len` values starting at position `offset` into a new array.
+    ///
+    /// Offset-based variants are decoded and re-encoded, so the result may use
+    /// a different base or variant. Panics if the window is out of bounds.
+    pub fn slice(&self, offset: usize, len: usize) -> Self {
+        match self {
+            Self::U16 { base, offsets } => {
+                let window = &offsets[offset..(offset + len)];
+                window
+                    .iter()
+                    .map(|&delta| *base + u64::from(delta))
+                    .collect::<Self>()
+            }
+            Self::U32 { base, offsets } => {
+                let window = &offsets[offset..(offset + len)];
+                window
+                    .iter()
+                    .map(|&delta| *base + u64::from(delta))
+                    .collect::<Self>()
+            }
+            Self::U64(values) => Self::U64(values[offset..(offset + len)].to_vec()),
+        }
+    }
+}
+
+impl From<Vec<u64>> for EncodedU64Array {
+    /// Picks the narrowest encoding whose offset type can hold `max - min`.
+    ///
+    /// An empty input becomes an empty `U64` array. Otherwise the minimum value
+    /// is used as the base of the offset-based variants.
+    fn from(values: Vec<u64>) -> Self {
+        let (lowest, highest) = match (
+            values.iter().copied().min(),
+            values.iter().copied().max(),
+        ) {
+            (Some(lo), Some(hi)) => (lo, hi),
+            _ => return Self::U64(Vec::new()),
+        };
+
+        let spread = highest - lowest;
+        if spread <= u16::MAX as u64 {
+            Self::U16 {
+                base: lowest,
+                offsets: values.iter().map(|v| (*v - lowest) as u16).collect(),
+            }
+        } else if spread <= u32::MAX as u64 {
+            Self::U32 {
+                base: lowest,
+                offsets: values.iter().map(|v| (*v - lowest) as u32).collect(),
+            }
+        } else {
+            Self::U64(values)
+        }
+    }
+}
+
+impl From<Range<u64>> for EncodedU64Array {
+    /// Encodes every value of a half-open range.
+    ///
+    /// The range start becomes the base and the offsets count up from zero.
+    /// Ranges with fewer than `u16::MAX` values use `U16`, ranges with fewer
+    /// than `u32::MAX` values use `U32`, and anything longer is materialized as
+    /// plain `U64` values.
+    ///
+    /// A reversed range (`start > end`) contains no values and produces an
+    /// empty `U64` array.
+    fn from(range: Range<u64>) -> Self {
+        let min = range.start;
+        let max = range.end;
+        // `max - min` would underflow for a reversed range: a panic in debug
+        // builds and a wrapped length in release builds. Such a range is simply
+        // empty, so return the empty array explicitly.
+        let range = match max.checked_sub(min) {
+            Some(len) => len,
+            None => return Self::U64(Vec::new()),
+        };
+        if range < u16::MAX as u64 {
+            let base = min;
+            let offsets = (0..range as u16).collect();
+            Self::U16 { base, offsets }
+        } else if range < u32::MAX as u64 {
+            let base = min;
+            let offsets = (0..range as u32).collect();
+            Self::U32 { base, offsets }
+        } else {
+            Self::U64((min..max).collect())
+        }
+    }
+}
+
+impl FromIterator<u64> for EncodedU64Array {
+    /// Buffers the iterator into a `Vec<u64>` and encodes it via `From<Vec<u64>>`.
+    fn from_iter<I: IntoIterator<Item = u64>>(iter: I) -> Self {
+        Self::from(iter.into_iter().collect::<Vec<u64>>())
+    }
+}
+
+impl IntoIterator for EncodedU64Array {
+    type Item = u64;
+    type IntoIter = Box<dyn DoubleEndedIterator<Item = u64>>;
+
+    /// Consumes the array and yields its decoded values in storage order.
+    fn into_iter(self) -> Self::IntoIter {
+        match self {
+            Self::U64(values) => Box::new(values.into_iter()),
+            Self::U16 { base, offsets } => {
+                Box::new(offsets.into_iter().map(move |delta| base + u64::from(delta)))
+            }
+            Self::U32 { base, offsets } => {
+                Box::new(offsets.into_iter().map(move |delta| base + u64::from(delta)))
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    /// Encoding a vector picks the expected variant and every accessor agrees
+    /// with the source values.
+    #[test]
+    fn test_encoded_array_from_vec() {
+        // Encodes `values`, checks the chosen representation against `expected`,
+        // then cross-checks len/first/last/min/max/iter/get and `FromIterator`.
+        fn roundtrip_array(values: Vec<u64>, expected: &EncodedU64Array) {
+            let encoded = EncodedU64Array::from(values.clone());
+            assert_eq!(&encoded, expected);
+
+            assert_eq!(values.len(), encoded.len());
+            assert_eq!(values.first(), encoded.first().as_ref());
+            assert_eq!(values.last(), encoded.last().as_ref());
+            assert_eq!(values.iter().min(), encoded.min().as_ref());
+            assert_eq!(values.iter().max(), encoded.max().as_ref());
+
+            let roundtripped = encoded.iter().collect::<Vec<_>>();
+            assert_eq!(values, roundtripped);
+
+            for (i, v) in values.iter().enumerate() {
+                assert_eq!(Some(*v), encoded.get(i));
+            }
+
+            let encoded2 = values.into_iter().collect::<EncodedU64Array>();
+            assert_eq!(&encoded2, expected);
+        }
+
+        // Empty
+        roundtrip_array(vec![], &EncodedU64Array::U64(vec![]));
+
+        // Single value
+        roundtrip_array(
+            vec![42],
+            &EncodedU64Array::U16 {
+                base: 42,
+                offsets: vec![0],
+            },
+        );
+
+        // u16 version, it can start beyond the u16 range, but the
+        // relative values must be within u16 range.
+        let relative_values = [42, 0, 43, u16::MAX as u64, 99];
+        let values = relative_values.map(|v| v + 2 * u16::MAX as u64).to_vec();
+        let expected = EncodedU64Array::U16 {
+            base: 2 * u16::MAX as u64,
+            offsets: relative_values.iter().map(|v| *v as u16).collect(),
+        };
+        roundtrip_array(values, &expected);
+
+        // u32 version
+        let relative_values = [42, 0, 43, u32::MAX as u64, 99];
+        let values = relative_values.map(|v| v + 2 * u32::MAX as u64).to_vec();
+        let expected = EncodedU64Array::U32 {
+            base: 2 * u32::MAX as u64,
+            offsets: relative_values.iter().map(|v| *v as u32).collect(),
+        };
+        roundtrip_array(values, &expected);
+
+        // u64 version
+        let values = [42, 0, 43, u64::MAX, 99].to_vec();
+        let expected = EncodedU64Array::U64(values.clone());
+        roundtrip_array(values, &expected);
+    }
+
+    /// The boxed iterator behaves as a proper double-ended iterator for every variant.
+    #[test]
+    fn test_double_ended_iter() {
+        let arrays = vec![
+            EncodedU64Array::U16 {
+                base: 42,
+                offsets: vec![0, 1, 2, 3, 4],
+            },
+            EncodedU64Array::U32 {
+                base: 42,
+                offsets: vec![0, 1, 2, 3, 4],
+            },
+            EncodedU64Array::U64(vec![42, 43, 44, 45, 46]),
+        ];
+        for array in arrays {
+            // Should be able to iterate forwards and backwards, and get the same thing.
+            let forwards = array.iter().collect::<Vec<_>>();
+            let mut backwards = array.iter().rev().collect::<Vec<_>>();
+            backwards.reverse();
+            assert_eq!(forwards, backwards);
+
+            // Should be able to pull from both sides in lockstep.
+            let mut expected = Vec::with_capacity(array.len());
+            let mut actual = Vec::with_capacity(array.len());
+            let mut iter = array.iter();
+            // Alternating forwards and backwards
+            for i in 0..array.len() {
+                if i % 2 == 0 {
+                    // Even steps take from the front: positions 0, 1, 2, ...
+                    actual.push(iter.next().unwrap());
+                    expected.push(array.get(i / 2).unwrap());
+                } else {
+                    // Odd steps take from the back: positions len-1, len-2, ...
+                    let i = array.len() - 1 - i / 2;
+                    actual.push(iter.next_back().unwrap());
+                    expected.push(array.get(i).unwrap());
+                };
+            }
+            assert_eq!(expected, actual);
+        }
+    }
+
+    /// Encoding a range picks the expected variant and round-trips its values.
+    #[test]
+    fn test_encoded_array_from_range() {
+        // u16 version
+        let range = (2 * u16::MAX as u64)..(40 + 2 * u16::MAX as u64);
+        let encoded = EncodedU64Array::from(range.clone());
+        let expected_base = 2 * u16::MAX as u64;
+        assert!(
+            matches!(
+                encoded,
+                EncodedU64Array::U16 {
+                    base,
+                    ..
+                } if base == expected_base
+            ),
+            "{:?}",
+            encoded
+        );
+        let roundtripped = encoded.into_iter().collect::<Vec<_>>();
+        assert_eq!(range.collect::<Vec<_>>(), roundtripped);
+
+        // u32 version
+        let range = (2 * u32::MAX as u64)..(u16::MAX as u64 + 10 + 2 * u32::MAX as u64);
+        let encoded = EncodedU64Array::from(range.clone());
+        let expected_base = 2 * u32::MAX as u64;
+        assert!(matches!(
+            encoded,
+            EncodedU64Array::U32 {
+                base,
+                ..
+            } if base == expected_base
+        ));
+        let roundtripped = encoded.into_iter().collect::<Vec<_>>();
+        assert_eq!(range.collect::<Vec<_>>(), roundtripped);
+
+        // We'll skip u64 since it would take a lot of memory.
+
+        // Empty one
+        let range = 42..42;
+        let encoded = EncodedU64Array::from(range);
+        assert_eq!(encoded.len(), 0);
+    }
+}
--- a/vendor/lance-table/src/rowids/index.rs
+++ b/vendor/lance-table/src/rowids/index.rs
@ -0,0 +1,822 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use std::ops::RangeInclusive;
+use std::sync::Arc;
+
+use super::{RowIdSequence, U64Segment};
+use deepsize::DeepSizeOf;
+use lance_core::{Error, Result};
+use lance_core::utils::address::RowAddress;
+use lance_core::utils::deletion::DeletionVector;
+use rangemap::RangeInclusiveMap;
+
+/// An index of row ids
+///
+/// This index is used to map row ids to their corresponding addresses. These
+/// addresses correspond to physical positions in the dataset. See [RowAddress].
+///
+/// This structure only contains rows that physically exist. However, it may
+/// map to addresses that have been tombstoned. A separate tombstone index is
+/// used to track tombstoned rows.
+// (Implementation)
+// Disjoint ranges of row ids are stored as the keys of the map. The values are
+// a pair of segments. The first segment is the row ids, and the second segment
+// is the addresses.
+// The two segments are parallel: a lookup finds the position of the row id in
+// the first segment and reads the address at that same position in the second.
+#[derive(Debug)]
+pub struct RowIdIndex(RangeInclusiveMap<u64, (U64Segment, U64Segment)>);
+
+/// Per-fragment input used to build a [`RowIdIndex`].
+pub struct FragmentRowIdIndex {
+    /// Id of the fragment; row addresses for this fragment start at
+    /// `RowAddress::first_row(fragment_id)`.
+    pub fragment_id: u32,
+    /// Row ids of the fragment. The position of an id in this sequence is its
+    /// offset from the fragment's first row address.
+    pub row_id_sequence: Arc<RowIdSequence>,
+    /// Offsets within the fragment whose rows are left out of the index.
+    pub deletion_vector: Arc<DeletionVector>,
+}
+
+impl RowIdIndex {
+    /// Create a new index from a list of fragment ids and their corresponding row id sequences.
+    ///
+    /// Each fragment is decomposed into chunks of (row id range, row ids,
+    /// addresses). Chunks whose row id ranges intersect are merged into a
+    /// single chunk before being inserted into the map.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error returned while merging overlapping chunks.
+    pub fn new(fragment_indices: &[FragmentRowIdIndex]) -> Result<Self> {
+        let chunks = fragment_indices
+            .iter()
+            .flat_map(decompose_sequence)
+            .collect::<Vec<_>>();
+
+        let mut final_chunks = Vec::new();
+        for processed_chunk in prep_index_chunks(chunks) {
+            match processed_chunk {
+                RawIndexChunk::NonOverlapping(chunk) => {
+                    final_chunks.push(chunk);
+                }
+                RawIndexChunk::Overlapping(_range, overlapping_chunks) => {
+                    // Intersecting row-id ranges don't imply intersecting id sets;
+                    // sparse ids and deletion holes leave the union short of the span.
+                    // The real invariant (no id in two fragments) is checked in the merge.
+                    let merged_chunk = merge_overlapping_chunks(overlapping_chunks)?;
+                    final_chunks.push(merged_chunk);
+                }
+            }
+        }
+
+        Ok(Self(RangeInclusiveMap::from_iter(final_chunks)))
+    }
+
+    /// Get the address for a given row id.
+    ///
+    /// Will return None if the row id does not exist in the index.
+    pub fn get(&self, row_id: u64) -> Option<RowAddress> {
+        // Find the chunk whose range covers the id; the range may still have
+        // holes, so the id must also be located inside the row id segment.
+        let (row_id_segment, address_segment) = self.0.get(&row_id)?;
+        let pos = row_id_segment.position(row_id)?;
+        let address = address_segment.get(pos)?;
+        Some(RowAddress::from(address))
+    }
+
+    /// Get addresses for many row ids in one pass over the index.
+    ///
+    /// Returns one entry per input id, in input order (`None` for missing).
+    /// Sorts a working copy of the input internally so the chunk iterator
+    /// is advanced at most once per chunk, amortizing the per-id tree walk
+    /// from O(N · log F) to O(F + N).
+    // NOTE(review): the O(F + N) figure above covers only the chunk walk; the
+    // sort of the working copy is not included in it.
+    pub fn get_many(&self, row_ids: &[u64]) -> Vec<Option<RowAddress>> {
+        let n = row_ids.len();
+        let mut out = vec![None; n];
+        if n == 0 {
+            return out;
+        }
+
+        // Pair each id with its original position so results can be written
+        // back in input order after sorting.
+        let mut sorted: Vec<(u64, usize)> = row_ids.iter().copied().zip(0..n).collect();
+        sorted.sort_unstable_by_key(|&(id, _)| id);
+
+        let mut chunks = self.0.iter().peekable();
+        for (id, orig_idx) in sorted {
+            // Advance past chunks that end before this id.
+            while let Some((range, _)) = chunks.peek() {
+                if *range.end() < id {
+                    chunks.next();
+                } else {
+                    break;
+                }
+            }
+            // No chunks left: every remaining id is larger and stays `None`.
+            let Some((range, (row_id_seg, addr_seg))) = chunks.peek() else {
+                break;
+            };
+            if id < *range.start() {
+                continue; // falls in a gap between chunks
+            }
+            // The id is within the chunk's range but may still be absent from it.
+            if let Some(pos) = row_id_seg.position(id)
+                && let Some(addr) = addr_seg.get(pos)
+            {
+                out[orig_idx] = Some(RowAddress::from(addr));
+            }
+        }
+        out
+    }
+}
+
+impl DeepSizeOf for RowIdIndex {
+    /// Sums, for every map entry, the size of its two `u64` range bounds, the
+    /// segment pair itself, and the children of both segments.
+    fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize {
+        // Size that every entry contributes regardless of segment contents.
+        let fixed_per_entry =
+            (2 * std::mem::size_of::<u64>()) + std::mem::size_of::<(U64Segment, U64Segment)>();
+
+        let mut total = 0;
+        for (_, (ids, addresses)) in self.0.iter() {
+            total += fixed_per_entry
+                + ids.deep_size_of_children(context)
+                + addresses.deep_size_of_children(context);
+        }
+        total
+    }
+}
+
+/// Splits a fragment's row id sequence into one index chunk per segment.
+///
+/// Addresses are assigned consecutively from the fragment's first row address,
+/// advancing by the full length of every segment. Segments that produce no
+/// chunk are omitted from the result.
+fn decompose_sequence(
+    frag_index: &FragmentRowIdIndex,
+) -> Vec<(RangeInclusive<u64>, (U64Segment, U64Segment))> {
+    let has_deletions = !frag_index.deletion_vector.is_empty();
+    // Address of the first row of the segment currently being processed.
+    let mut next_address: u64 = RowAddress::first_row(frag_index.fragment_id).into();
+    // Number of rows of the fragment consumed by earlier segments.
+    let mut rows_seen = 0u32;
+
+    let mut chunks = Vec::new();
+    for segment in frag_index.row_id_sequence.0.iter() {
+        let rows_in_segment = segment.len();
+
+        let chunk = if has_deletions {
+            decompose_segment_with_deletions(
+                segment,
+                next_address,
+                rows_seen,
+                &frag_index.deletion_vector,
+            )
+        } else {
+            decompose_segment_no_deletions(segment, next_address)
+        };
+
+        rows_seen += rows_in_segment as u32;
+        next_address += rows_in_segment as u64;
+
+        chunks.extend(chunk);
+    }
+    chunks
+}
+
+/// Turns `(row_id, address)` pairs into an `IndexChunk`.
+///
+/// Returns `None` when there are no pairs, or when the row id segment reports
+/// no range.
+fn build_chunk_from_pairs(pairs: Vec<(u64, u64)>) -> Option<IndexChunk> {
+    if pairs.is_empty() {
+        return None;
+    }
+
+    // Split the pairs into two parallel columns.
+    let mut ids = Vec::with_capacity(pairs.len());
+    let mut addresses = Vec::with_capacity(pairs.len());
+    for (id, address) in pairs {
+        ids.push(id);
+        addresses.push(address);
+    }
+
+    let id_segment = U64Segment::from_iter(ids);
+    let address_segment = U64Segment::from_iter(addresses);
+    id_segment
+        .range()
+        .map(|span| (span, (id_segment, address_segment)))
+}
+
+/// Builds the chunk for a segment of a fragment that has no deleted rows.
+///
+/// A non-empty `Range` segment maps directly onto a contiguous address range
+/// without iterating. Any other non-empty segment is walked row by row. Empty
+/// segments produce `None`.
+fn decompose_segment_no_deletions(segment: &U64Segment, start_address: u64) -> Option<IndexChunk> {
+    let contiguous = match segment {
+        U64Segment::Range(ids) if !ids.is_empty() => Some(ids),
+        _ => None,
+    };
+
+    if let Some(ids) = contiguous {
+        let count = ids.end - ids.start;
+        let id_segment = U64Segment::Range(ids.clone());
+        let address_segment = U64Segment::Range(start_address..start_address + count);
+        // `ids` is half-open; the coverage key is inclusive.
+        let span = ids.start..=ids.end - 1;
+        return Some((span, (id_segment, address_segment)));
+    }
+
+    if segment.is_empty() {
+        return None;
+    }
+
+    // Not a contiguous range: pair each row id with its address by position.
+    let mut pairs: Vec<(u64, u64)> = Vec::new();
+    for (position, row_id) in segment.iter().enumerate() {
+        pairs.push((row_id, start_address + position as u64));
+    }
+    build_chunk_from_pairs(pairs)
+}
+
+/// Builds the chunk for a segment of a fragment that has deleted rows.
+///
+/// Every row is checked against the deletion vector using its offset within
+/// the fragment (`current_offset` plus its position in the segment). Rows that
+/// are marked there are left out; the remaining rows keep the address that
+/// matches their original position.
+fn decompose_segment_with_deletions(
+    segment: &U64Segment,
+    start_address: u64,
+    current_offset: u32,
+    deletion_vector: &DeletionVector,
+) -> Option<IndexChunk> {
+    let mut surviving: Vec<(u64, u64)> = Vec::new();
+    for (position, row_id) in segment.iter().enumerate() {
+        let offset_in_fragment = current_offset + position as u32;
+        if deletion_vector.contains(offset_in_fragment) {
+            continue;
+        }
+        surviving.push((row_id, start_address + position as u64));
+    }
+    build_chunk_from_pairs(surviving)
+}
+
+/// One entry of the index: the inclusive row id range it covers, paired with
+/// the row id segment and the address segment (in that order).
+type IndexChunk = (RangeInclusive<u64>, (U64Segment, U64Segment));
+
+/// Output of `prep_index_chunks`: either a single chunk, or a group of chunks
+/// whose row id ranges overlap.
+#[derive(Debug)]
+enum RawIndexChunk {
+    /// A chunk whose row id range does not overlap its neighbours.
+    NonOverlapping(IndexChunk),
+    /// A group of chunks with overlapping row id ranges, together with the
+    /// inclusive range spanning the whole group.
+    Overlapping(RangeInclusive<u64>, Vec<IndexChunk>),
+}
+
+impl RawIndexChunk {
+    /// Inclusive upper bound of the row id range covered by this chunk or group.
+    fn range_end(&self) -> u64 {
+        let covered = match self {
+            Self::NonOverlapping((range, _)) | Self::Overlapping(range, _) => range,
+        };
+        *covered.end()
+    }
+}
+
+/// Given a vector of index chunks, sort them and return an iterator of index chunks.
+///
+/// The iterator will yield chunks that are non-overlapping or a set of chunks
+/// that are overlapping.
+///
+/// Chunks are visited in ascending order of range start. A chunk whose start is
+/// at or before the end of the previous entry is grouped with it into a
+/// [`RawIndexChunk::Overlapping`] entry; the group keeps growing while later
+/// chunks start at or before the group's running end. An empty input yields an
+/// empty iterator.
+fn prep_index_chunks(mut chunks: Vec<IndexChunk>) -> impl Iterator<Item = RawIndexChunk> {
+    // Sort by descending start so that `pop()` below hands chunks out in
+    // ascending start order.
+    chunks.sort_by_key(|(range, _)| u64::MAX - *range.start());
+
+    let mut output = Vec::new();
+
+    // Start assuming non-overlapping in first chunk.
+    if let Some(first_chunk) = chunks.pop() {
+        output.push(RawIndexChunk::NonOverlapping(first_chunk));
+    } else {
+        // Early return for empty.
+        return output.into_iter();
+    }
+
+    // Inclusive span of the overlap group being built; `0..=0` while no group
+    // is open.
+    let mut current_range = 0..=0;
+    // Members of the overlap group being built; empty while no group is open.
+    let mut current_overlap = Vec::new();
+    while let Some(chunk) = chunks.pop() {
+        // Invariant: `current_range` equals the min start / max end of
+        // `current_overlap` (both sides are 0 when no group is open).
+        debug_assert_eq!(
+            current_overlap
+                .iter()
+                .map(|(range, _): &IndexChunk| *range.start())
+                .min()
+                .unwrap_or_default(),
+            *current_range.start(),
+        );
+        debug_assert_eq!(
+            current_overlap
+                .iter()
+                .map(|(range, _): &IndexChunk| *range.end())
+                .max()
+                .unwrap_or_default(),
+            *current_range.end(),
+        );
+
+        if current_overlap.is_empty() {
+            // We haven't found overlap yet.
+            let last_chunk_end = output.last().unwrap().range_end();
+            if *chunk.0.start() <= last_chunk_end {
+                // We have found overlap.
+                match output.pop().unwrap() {
+                    RawIndexChunk::NonOverlapping(chunk) => {
+                        current_overlap.push(chunk);
+                    }
+                    // While no group is open the last output entry is always
+                    // `NonOverlapping`: inside this loop an `Overlapping` entry is
+                    // only pushed immediately before a `NonOverlapping` one.
+                    _ => unreachable!(),
+                }
+                current_overlap.push(chunk);
+
+                // The group holds exactly two chunks here: it starts at the first
+                // one's start and ends at the larger of the two ends.
+                let range_start = *current_overlap.first().unwrap().0.start();
+                let range_end = *current_overlap
+                    .last()
+                    .unwrap()
+                    .0
+                    .end()
+                    .max(current_overlap.first().unwrap().0.end());
+                current_range = range_start..=range_end;
+            } else {
+                // We are still in non-overlapping space.
+                output.push(RawIndexChunk::NonOverlapping(chunk));
+            }
+        } else {
+            // We are making an overlap chunk
+            if chunk.0.start() <= current_range.end() {
+                // We are still in overlap.
+                let range_end = *chunk.0.end().max(current_range.end());
+                current_range = *current_range.start()..=range_end;
+
+                current_overlap.push(chunk);
+            } else {
+                // We have exited overlap.
+                // Emit the finished group and reset both trackers to their
+                // "no group open" values.
+                output.push(RawIndexChunk::Overlapping(
+                    std::mem::replace(&mut current_range, 0..=0),
+                    std::mem::take(&mut current_overlap),
+                ));
+                output.push(RawIndexChunk::NonOverlapping(chunk));
+            }
+        }
+    }
+    // Same invariant as at the top of the loop, checked once more after the
+    // last chunk has been processed.
+    debug_assert_eq!(
+        current_overlap
+            .iter()
+            .map(|(range, _): &IndexChunk| *range.start())
+            .min()
+            .unwrap_or_default(),
+        *current_range.start(),
+    );
+    debug_assert_eq!(
+        current_overlap
+            .iter()
+            .map(|(range, _): &IndexChunk| *range.end())
+            .max()
+            .unwrap_or_default(),
+        *current_range.end(),
+    );
+
+    // Flush the group that was still open when the input ran out.
+    if !current_overlap.is_empty() {
+        output.push(RawIndexChunk::Overlapping(
+            current_range.clone(),
+            current_overlap,
+        ));
+    }
+
+    output.into_iter()
+}
+
+/// Flattens a group of overlapping chunks into one chunk ordered by row id.
+///
+/// All `(row id, row address)` pairs of the group are gathered and sorted by row
+/// id, then re-encoded as a single pair of segments. The returned range is the
+/// range reported by the merged row id segment.
+///
+/// # Errors
+///
+/// Returns an internal error when the same row id occurs more than once in the
+/// group, i.e. when two fragments both claim a live row id.
+///
+/// # Panics
+///
+/// Panics if the merged row id segment reports no range.
+fn merge_overlapping_chunks(overlapping_chunks: Vec<IndexChunk>) -> Result<IndexChunk> {
+    let pair_count = overlapping_chunks
+        .iter()
+        .map(|(_, (ids, _))| ids.len())
+        .sum::<usize>();
+    let mut pairs = Vec::with_capacity(pair_count);
+    for (_, (ids, addrs)) in &overlapping_chunks {
+        pairs.extend(ids.iter().zip(addrs.iter()));
+    }
+    pairs.sort_by_key(|&(id, _)| id);
+
+    // After sorting, equal ids sit next to each other. Gaps between ids are
+    // fine (sparse coverage); a repeated id is not, because it would mean two
+    // fragments hold the same live row.
+    for neighbours in pairs.windows(2) {
+        let (id, next_id) = (neighbours[0].0, neighbours[1].0);
+        if id == next_id {
+            return Err(Error::internal(format!(
+                "row id index corrupt: stable row id {} is live in multiple fragments",
+                id
+            )));
+        }
+    }
+
+    let row_id_segment = U64Segment::from_iter(pairs.iter().map(|&(id, _)| id));
+    let address_segment = U64Segment::from_iter(pairs.iter().map(|&(_, addr)| addr));
+    let covered = row_id_segment.range().unwrap();
+
+    Ok((covered, (row_id_segment, address_segment)))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use proptest::{prelude::Strategy, prop_assert_eq};
+
+    /// Builds an index over two fragments that together use all five segment
+    /// encodings and checks lookups at segment boundaries, in holes and past the
+    /// last id.
+    #[test]
+    fn test_new_index() {
+        let first_fragment = FragmentRowIdIndex {
+            fragment_id: 10,
+            row_id_sequence: Arc::new(RowIdSequence(vec![
+                U64Segment::Range(0..10),
+                U64Segment::RangeWithHoles {
+                    range: 10..17,
+                    holes: vec![12, 15].into(),
+                },
+                U64Segment::SortedArray(vec![20, 25, 30].into()),
+            ])),
+            deletion_vector: Arc::new(DeletionVector::default()),
+        };
+        let second_fragment = FragmentRowIdIndex {
+            fragment_id: 20,
+            row_id_sequence: Arc::new(RowIdSequence(vec![
+                U64Segment::RangeWithBitmap {
+                    range: 17..20,
+                    bitmap: [true, false, true].as_slice().into(),
+                },
+                U64Segment::Array(vec![40, 50, 60].into()),
+            ])),
+            deletion_vector: Arc::new(DeletionVector::default()),
+        };
+        let fragments = vec![first_fragment, second_fragment];
+
+        let index = RowIdIndex::new(&fragments).unwrap();
+
+        // (row id, expected address)
+        let expectations = [
+            (0, Some(RowAddress::new_from_parts(10, 0))),
+            (15, None),
+            (16, Some(RowAddress::new_from_parts(10, 14))),
+            (17, Some(RowAddress::new_from_parts(20, 0))),
+            (25, Some(RowAddress::new_from_parts(10, 16))),
+            (40, Some(RowAddress::new_from_parts(20, 2))),
+            (60, Some(RowAddress::new_from_parts(20, 4))),
+            (61, None),
+        ];
+        for (row_id, expected) in expectations {
+            assert_eq!(index.get(row_id), expected, "lookup of row id {}", row_id);
+        }
+    }
+
+    /// Three fragments whose sorted row ids interleave (3,6,9 / 2,5,8 / 1,4,7):
+    /// every id must resolve to its own fragment and local offset.
+    #[test]
+    fn test_new_index_overlap() {
+        // (fragment id, row ids stored in that fragment, in storage order)
+        let layout: [(u32, [u64; 3]); 3] = [(23, [3, 6, 9]), (42, [2, 5, 8]), (10, [1, 4, 7])];
+
+        let fragment_indices: Vec<FragmentRowIdIndex> = layout
+            .iter()
+            .map(|(fragment_id, ids)| FragmentRowIdIndex {
+                fragment_id: *fragment_id,
+                row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::SortedArray(
+                    ids.to_vec().into(),
+                )])),
+                deletion_vector: Arc::new(DeletionVector::default()),
+            })
+            .collect();
+
+        let index = RowIdIndex::new(&fragment_indices).unwrap();
+
+        // Each id lives at the offset it has inside its own fragment.
+        for (fragment_id, ids) in &layout {
+            for (offset, row_id) in ids.iter().enumerate() {
+                assert_eq!(
+                    index.get(*row_id),
+                    Some(RowAddress::new_from_parts(*fragment_id, offset as u32)),
+                    "lookup of row id {}",
+                    row_id
+                );
+            }
+        }
+    }
+
+    /// Row ids stored out of order inside each fragment must still resolve to
+    /// the offset at which they are physically stored.
+    #[test]
+    fn test_new_index_unsorted_row_ids() {
+        // (fragment id, row ids in storage order — deliberately unsorted)
+        let layout: [(u32, [u64; 3]); 3] = [(10, [9, 3, 6]), (20, [8, 2, 5]), (30, [7, 1, 4])];
+
+        let fragment_indices: Vec<FragmentRowIdIndex> = layout
+            .iter()
+            .map(|(fragment_id, ids)| FragmentRowIdIndex {
+                fragment_id: *fragment_id,
+                row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Array(
+                    ids.to_vec().into(),
+                )])),
+                deletion_vector: Arc::new(DeletionVector::default()),
+            })
+            .collect();
+
+        let index = RowIdIndex::new(&fragment_indices).unwrap();
+
+        // Every stored id is found at its storage offset.
+        for (fragment_id, ids) in &layout {
+            for (offset, row_id) in ids.iter().enumerate() {
+                assert_eq!(
+                    index.get(*row_id),
+                    Some(RowAddress::new_from_parts(*fragment_id, offset as u32)),
+                    "lookup of row id {}",
+                    row_id
+                );
+            }
+        }
+
+        // Ids just outside the stored set are absent.
+        for missing in [0, 10] {
+            assert_eq!(index.get(missing), None);
+        }
+    }
+
+    /// A one-row fragment (id 50) fills the single hole of a larger fragment, so
+    /// its range lies inside the larger fragment's range.
+    #[test]
+    fn test_new_index_partial_overlap() {
+        let outer = FragmentRowIdIndex {
+            fragment_id: 0,
+            row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::RangeWithHoles {
+                range: 0..100,
+                holes: vec![50].into(),
+            }])),
+            deletion_vector: Arc::new(DeletionVector::default()),
+        };
+        let inner = FragmentRowIdIndex {
+            fragment_id: 1,
+            row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(50..51)])),
+            deletion_vector: Arc::new(DeletionVector::default()),
+        };
+        let fragments = vec![outer, inner];
+
+        let index = RowIdIndex::new(&fragments).unwrap();
+
+        // (row id, fragment id, offset within the fragment)
+        let expectations = [(0, 0, 0), (49, 0, 49), (50, 1, 0), (51, 0, 50), (99, 0, 98)];
+        for (row_id, fragment_id, offset) in expectations {
+            assert_eq!(
+                index.get(row_id),
+                Some(RowAddress::new_from_parts(fragment_id, offset)),
+                "lookup of row id {}",
+                row_id
+            );
+        }
+    }
+
+    /// Two fragments with interleaved ids, one of which has a deleted row, so
+    /// the live ids do not tile the combined span. Live ids must resolve and
+    /// the deleted id must not.
+    #[test]
+    fn test_overlapping_chunks_sparse_with_deletions() {
+        let odd_ids = FragmentRowIdIndex {
+            fragment_id: 10,
+            row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::SortedArray(
+                vec![1, 3, 5, 7, 9].into(),
+            )])),
+            deletion_vector: Arc::new(DeletionVector::default()),
+        };
+        let even_ids = FragmentRowIdIndex {
+            fragment_id: 20,
+            row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::SortedArray(
+                vec![0, 2, 4, 6, 8].into(),
+            )])),
+            // Offset 2 holds id 4; deleting it punches a hole into the span.
+            deletion_vector: Arc::new(DeletionVector::from_iter(vec![2])),
+        };
+        let fragments = vec![odd_ids, even_ids];
+
+        let index = RowIdIndex::new(&fragments).unwrap();
+
+        // (row id, expected (fragment id, offset)); `None` marks the deleted id.
+        // Ids after the hole keep their original offsets (nothing is compacted).
+        let expectations = [
+            (0, Some((20, 0))),
+            (1, Some((10, 0))),
+            (2, Some((20, 1))),
+            (3, Some((10, 1))),
+            (4, None),
+            (6, Some((20, 3))),
+            (8, Some((20, 4))),
+            (9, Some((10, 4))),
+        ];
+        for (row_id, location) in expectations {
+            let expected =
+                location.map(|(fragment_id, offset)| RowAddress::new_from_parts(fragment_id, offset));
+            assert_eq!(index.get(row_id), expected, "lookup of row id {}", row_id);
+        }
+    }
+
+    /// A single contiguous fragment with two deleted offsets: the deleted ids
+    /// vanish from the index and the rest keep their offsets.
+    #[test]
+    fn test_index_with_deletion_vector() {
+        let fragments = vec![FragmentRowIdIndex {
+            fragment_id: 10,
+            row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(0..6)])),
+            deletion_vector: Arc::new(DeletionVector::from_iter(vec![2, 3])),
+        }];
+
+        let index = RowIdIndex::new(&fragments).unwrap();
+
+        // In this fragment the row id equals the offset.
+        for offset in [0u32, 1, 4, 5] {
+            assert_eq!(
+                index.get(u64::from(offset)),
+                Some(RowAddress::new_from_parts(10, offset))
+            );
+        }
+
+        for deleted_id in [2, 3] {
+            assert_eq!(index.get(deleted_id), None);
+        }
+    }
+
+    /// A fragment with an empty row id sequence must not disturb lookups into
+    /// the other fragment.
+    #[test]
+    fn test_empty_fragment_sequences() {
+        let empty_fragment = FragmentRowIdIndex {
+            fragment_id: 10,
+            row_id_sequence: Arc::new(RowIdSequence(vec![])),
+            deletion_vector: Arc::new(DeletionVector::default()),
+        };
+        let populated_fragment = FragmentRowIdIndex {
+            fragment_id: 20,
+            row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(5..8)])),
+            deletion_vector: Arc::new(DeletionVector::default()),
+        };
+        let fragments = vec![empty_fragment, populated_fragment];
+
+        let index = RowIdIndex::new(&fragments).unwrap();
+
+        // (row id, expected address)
+        let expectations = [
+            (5, Some(RowAddress::new_from_parts(20, 0))),
+            (7, Some(RowAddress::new_from_parts(20, 2))),
+            (4, None),
+        ];
+        for (row_id, expected) in expectations {
+            assert_eq!(index.get(row_id), expected, "lookup of row id {}", row_id);
+        }
+    }
+
+    /// An index built from no fragments at all answers every lookup with `None`.
+    #[test]
+    fn test_completely_empty_index() {
+        let no_fragments: Vec<FragmentRowIdIndex> = Vec::new();
+        let index = RowIdIndex::new(&no_fragments).unwrap();
+
+        for row_id in [0, 100] {
+            assert_eq!(index.get(row_id), None);
+        }
+    }
+
+    /// Three back-to-back contiguous fragments: checks the first and last id of
+    /// each one.
+    #[test]
+    fn test_non_overlapping_ranges() {
+        let mut fragment_indices = Vec::new();
+        for (fragment_id, ids) in [(10, 0..5), (20, 5..10), (30, 10..15)] {
+            fragment_indices.push(FragmentRowIdIndex {
+                fragment_id,
+                row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(ids)])),
+                deletion_vector: Arc::new(DeletionVector::default()),
+            });
+        }
+
+        let index = RowIdIndex::new(&fragment_indices).unwrap();
+
+        // (row id, fragment id, offset within the fragment)
+        let expectations = [
+            (0, 10, 0),
+            (4, 10, 4),
+            (5, 20, 0),
+            (9, 20, 4),
+            (10, 30, 0),
+            (14, 30, 4),
+        ];
+        for (row_id, fragment_id, offset) in expectations {
+            assert_eq!(
+                index.get(row_id),
+                Some(RowAddress::new_from_parts(fragment_id, offset)),
+                "lookup of row id {}",
+                row_id
+            );
+        }
+    }
+
+    /// Proptest strategy that randomly distributes the row ids `0..total` over a
+    /// set of fragments.
+    ///
+    /// The number of fragments is drawn from `num_fragments_range` and each
+    /// fragment's size from `frag_size_range`; `total` is the sum of the sizes.
+    /// The ids are shuffled and then cut into consecutive slices, one per
+    /// fragment, so every id appears in exactly one fragment.
+    ///
+    /// Each produced element is `(fragment id, row id sequence)`. The fragment
+    /// id is the position in the shuffled id list at which the fragment's slice
+    /// starts, not the fragment's ordinal.
+    fn arbitrary_row_ids(
+        num_fragments_range: std::ops::Range<usize>,
+        frag_size_range: std::ops::Range<usize>,
+    ) -> impl Strategy<Value = Vec<(u32, Arc<RowIdSequence>)>> {
+        let fragment_sizes = proptest::collection::vec(frag_size_range, num_fragments_range);
+        fragment_sizes.prop_flat_map(|fragment_sizes| {
+            let num_rows = fragment_sizes.iter().sum::<usize>() as u64;
+            let row_ids = 0..num_rows;
+            let row_ids = row_ids.collect::<Vec<_>>();
+            let row_ids_shuffled = proptest::strategy::Just(row_ids).prop_shuffle();
+            row_ids_shuffled.prop_map(move |row_ids| {
+                let mut sequences = Vec::with_capacity(fragment_sizes.len());
+                // `i` is the start of the next fragment's slice in `row_ids`.
+                let mut i = 0;
+                for size in &fragment_sizes {
+                    let end = i + size;
+                    let sequence =
+                        RowIdSequence(vec![U64Segment::from_slice(row_ids[i..end].into())]);
+                    sequences.push((i as u32, Arc::new(sequence)));
+                    i = end;
+                }
+                sequences
+            })
+        })
+    }
+
+    /// Builds an index over 100 contiguous fragments of 250,000 rows each with no
+    /// deletions, checks lookups at fragment boundaries, and asserts that the
+    /// build itself finished in under one second.
+    #[test]
+    fn test_large_range_segments_no_deletions() {
+        // Simulates a real-world scenario: many fragments with large Range segments
+        // and no deletions. Before optimization, this would iterate over all rows
+        // (O(total_rows)). After optimization, it's O(num_fragments).
+        let rows_per_fragment = 250_000u64;
+        let num_fragments = 100u32;
+        // Running start id of the next fragment; advanced inside the closure below.
+        let mut offset = 0u64;
+
+        let fragment_indices: Vec<FragmentRowIdIndex> = (0..num_fragments)
+            .map(|frag_id| {
+                let start = offset;
+                offset += rows_per_fragment;
+                FragmentRowIdIndex {
+                    fragment_id: frag_id,
+                    row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(
+                        start..start + rows_per_fragment,
+                    )])),
+                    deletion_vector: Arc::new(DeletionVector::default()),
+                }
+            })
+            .collect();
+
+        // Only the index construction is timed, not the fixture setup above.
+        let start = std::time::Instant::now();
+        let index = RowIdIndex::new(&fragment_indices).unwrap();
+        let elapsed = start.elapsed();
+
+        // Verify correctness at boundaries
+        assert_eq!(index.get(0), Some(RowAddress::new_from_parts(0, 0)));
+        assert_eq!(
+            index.get(rows_per_fragment - 1),
+            Some(RowAddress::new_from_parts(0, rows_per_fragment as u32 - 1))
+        );
+        assert_eq!(
+            index.get(rows_per_fragment),
+            Some(RowAddress::new_from_parts(1, 0))
+        );
+        let last_row = num_fragments as u64 * rows_per_fragment - 1;
+        assert_eq!(
+            index.get(last_row),
+            Some(RowAddress::new_from_parts(
+                num_fragments - 1,
+                rows_per_fragment as u32 - 1
+            ))
+        );
+        assert_eq!(index.get(last_row + 1), None);
+
+        // With the optimization, building an index for 25M rows across 100 fragments
+        // should complete in well under 1 second (typically < 1ms).
+        // NOTE(review): this is a wall-clock bound; it is presumably generous, but
+        // it could still fail spuriously on a heavily loaded machine — confirm.
+        assert!(
+            elapsed.as_secs() < 1,
+            "Index build took {:?} for {} fragments x {} rows = {} total rows. \
+             This suggests the O(rows) -> O(fragments) optimization is not working.",
+            elapsed,
+            num_fragments,
+            rows_per_fragment,
+            num_fragments as u64 * rows_per_fragment,
+        );
+    }
+
+    /// Builds an index over 10 contiguous fragments of 1,000 rows each, with
+    /// every third offset of each fragment deleted, and checks that deleted ids
+    /// are absent while live ids keep their original offsets.
+    #[test]
+    fn test_large_range_segments_with_deletions() {
+        let rows_per_fragment = 1_000u64;
+        let num_fragments = 10u32;
+        // Running start id of the next fragment; advanced inside the closure below.
+        let mut offset = 0u64;
+
+        let fragment_indices: Vec<FragmentRowIdIndex> = (0..num_fragments)
+            .map(|frag_id| {
+                let start = offset;
+                offset += rows_per_fragment;
+
+                // Delete every 3rd row (offsets 0, 3, 6, ...) within each fragment.
+                let mut deleted = roaring::RoaringBitmap::new();
+                for i in (0..rows_per_fragment as u32).step_by(3) {
+                    deleted.insert(i);
+                }
+
+                FragmentRowIdIndex {
+                    fragment_id: frag_id,
+                    row_id_sequence: Arc::new(RowIdSequence(vec![U64Segment::Range(
+                        start..start + rows_per_fragment,
+                    )])),
+                    deletion_vector: Arc::new(DeletionVector::Bitmap(deleted)),
+                }
+            })
+            .collect();
+
+        let index = RowIdIndex::new(&fragment_indices).unwrap();
+
+        // Deleted rows (offset 0, 3, 6, ...) should not be found.
+        // Row ID 0 has offset 0 in fragment 0 -> deleted.
+        assert_eq!(index.get(0), None);
+        // Row ID 3 has offset 3 in fragment 0 -> deleted.
+        assert_eq!(index.get(3), None);
+
+        // Non-deleted rows should resolve correctly.
+        // Row ID 1 has offset 1 in fragment 0 -> address (frag=0, row=1).
+        assert_eq!(index.get(1), Some(RowAddress::new_from_parts(0, 1)));
+        // Row ID 2 has offset 2 in fragment 0 -> address (frag=0, row=2).
+        assert_eq!(index.get(2), Some(RowAddress::new_from_parts(0, 2)));
+        // Row ID 4 has offset 4 in fragment 0 -> address (frag=0, row=4).
+        assert_eq!(index.get(4), Some(RowAddress::new_from_parts(0, 4)));
+
+        // Check second fragment: row IDs start at 1000.
+        // Row ID 1000 has offset 0 in fragment 1 -> deleted.
+        assert_eq!(index.get(rows_per_fragment), None);
+        // Row ID 1001 has offset 1 in fragment 1 -> address (frag=1, row=1).
+        assert_eq!(
+            index.get(rows_per_fragment + 1),
+            Some(RowAddress::new_from_parts(1, 1))
+        );
+
+        // Last fragment: the final row is deleted, the one before it is live.
+        // Row ID 9999 has offset 999 in fragment 9 -> 999 % 3 == 0 -> deleted.
+        let last_row = num_fragments as u64 * rows_per_fragment - 1;
+        assert_eq!(index.get(last_row), None);
+        // Row ID 9998 has offset 998 -> 998 % 3 == 2 -> not deleted.
+        assert_eq!(
+            index.get(last_row - 1),
+            Some(RowAddress::new_from_parts(num_fragments - 1, 998))
+        );
+
+        // Out of range.
+        assert_eq!(index.get(last_row + 1), None);
+    }
+
+    // Property test: for a random distribution of row ids over up to 4 fragments
+    // of up to 31 rows each (no deletions), every row id must resolve to the
+    // fragment that holds it and to its position inside that fragment.
+    proptest::proptest! {
+        #[test]
+        fn test_new_index_robustness(row_ids in arbitrary_row_ids(0..5, 0..32)) {
+            let fragment_indices: Vec<FragmentRowIdIndex> = row_ids
+                .iter()
+                .map(|(frag_id, sequence)| FragmentRowIdIndex {
+                    fragment_id: *frag_id,
+                    row_id_sequence: sequence.clone(),
+                    deletion_vector: Arc::new(DeletionVector::default()),
+                })
+                .collect();
+
+            let index = RowIdIndex::new(&fragment_indices).unwrap();
+            for (frag_id, sequence) in row_ids.iter() {
+                // `local_offset` is the id's position within its own sequence.
+                for (local_offset, row_id) in sequence.iter().enumerate() {
+                    prop_assert_eq!(
+                        index.get(row_id),
+                        Some(RowAddress::new_from_parts(*frag_id, local_offset as u32)),
+                        "Row id {} in sequence {:?} not found in index {:?}",
+                        row_id,
+                        sequence,
+                        index
+                    );
+                }
+            }
+        }
+    }
+}
--- a/vendor/lance-table/src/rowids/segment.rs
+++ b/vendor/lance-table/src/rowids/segment.rs
--- a/vendor/lance-table/src/rowids/serde.rs
+++ b/vendor/lance-table/src/rowids/serde.rs
@ -0,0 +1,239 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use crate::{format::pb, rowids::bitmap::Bitmap};
+use lance_core::{Error, Result};
+
+use super::{RowIdSequence, U64Segment, encoded_array::EncodedU64Array};
+use prost::Message;
+
+impl TryFrom<pb::RowIdSequence> for RowIdSequence {
+    type Error = Error;
+
+    /// Converts the protobuf message into a [`RowIdSequence`], decoding its
+    /// segments in order and stopping at the first segment that fails to
+    /// convert.
+    fn try_from(pb: pb::RowIdSequence) -> Result<Self> {
+        let mut segments = Vec::with_capacity(pb.segments.len());
+        for segment in pb.segments {
+            segments.push(U64Segment::try_from(segment)?);
+        }
+        Ok(Self(segments))
+    }
+}
+
+impl TryFrom<pb::U64Segment> for U64Segment {
+    type Error = Error;
+
+    /// Converts a protobuf segment into the in-memory [`U64Segment`].
+    ///
+    /// # Errors
+    ///
+    /// Returns an invalid-input error when the message carries no segment, when
+    /// a range-with-holes segment has no holes array, when a range-with-bitmap
+    /// segment has `end < start` (or a span that does not fit in `usize`), or
+    /// when a nested array fails to convert.
+    fn try_from(pb: pb::U64Segment) -> Result<Self> {
+        use pb::u64_segment as pb_seg;
+        use pb::u64_segment::Segment::*;
+        match pb.segment {
+            Some(Range(pb_seg::Range { start, end })) => Ok(Self::Range(start..end)),
+            Some(RangeWithHoles(pb_seg::RangeWithHoles { start, end, holes })) => {
+                let holes = holes
+                    .ok_or_else(|| Error::invalid_input("missing hole"))?
+                    .try_into()?;
+                Ok(Self::RangeWithHoles {
+                    range: start..end,
+                    holes,
+                })
+            }
+            Some(RangeWithBitmap(pb_seg::RangeWithBitmap { start, end, bitmap })) => {
+                // A plain `end - start` overflows for a malformed message with
+                // `end < start`: it panics in debug builds and wraps to a huge
+                // length in release builds. Reject such input instead.
+                let len = end
+                    .checked_sub(start)
+                    .and_then(|span| usize::try_from(span).ok())
+                    .ok_or_else(|| Error::invalid_input("invalid bitmap segment range"))?;
+                Ok(Self::RangeWithBitmap {
+                    range: start..end,
+                    bitmap: Bitmap { data: bitmap, len },
+                })
+            }
+            Some(SortedArray(array)) => Ok(Self::SortedArray(EncodedU64Array::try_from(array)?)),
+            Some(Array(array)) => Ok(Self::Array(EncodedU64Array::try_from(array)?)),
+            // TODO: why non-exhaustive?
+            // Some(_) => Err(Error::invalid_input("unknown segment type")),
+            None => Err(Error::invalid_input("missing segment type")),
+        }
+    }
+}
+
+impl TryFrom<pb::EncodedU64Array> for EncodedU64Array {
+    type Error = Error;
+
+    /// Converts a protobuf encoded array into the in-memory [`EncodedU64Array`].
+    ///
+    /// The element values are stored as little-endian bytes of 2, 4 or 8 bytes
+    /// per element depending on the variant.
+    ///
+    /// # Errors
+    ///
+    /// Returns an invalid-input error when the message carries no array, or when
+    /// the byte buffer length is not a multiple of the element width. Such input
+    /// is malformed serialized data, so it is reported to the caller rather than
+    /// treated as a panic-worthy invariant violation.
+    fn try_from(pb: pb::EncodedU64Array) -> Result<Self> {
+        use pb::encoded_u64_array as pb_arr;
+        use pb::encoded_u64_array::Array::*;
+        match pb.array {
+            Some(U16Array(pb_arr::U16Array { base, offsets })) => {
+                if offsets.len() % 2 != 0 {
+                    return Err(Error::invalid_input(
+                        "u16 array byte length must be a multiple of 2",
+                    ));
+                }
+                let offsets = offsets
+                    .chunks_exact(2)
+                    .map(|chunk| u16::from_le_bytes([chunk[0], chunk[1]]))
+                    .collect();
+                Ok(Self::U16 { base, offsets })
+            }
+            Some(U32Array(pb_arr::U32Array { base, offsets })) => {
+                if offsets.len() % 4 != 0 {
+                    return Err(Error::invalid_input(
+                        "u32 array byte length must be a multiple of 4",
+                    ));
+                }
+                let offsets = offsets
+                    .chunks_exact(4)
+                    .map(|chunk| u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
+                    .collect();
+                Ok(Self::U32 { base, offsets })
+            }
+            Some(U64Array(pb_arr::U64Array { values })) => {
+                if values.len() % 8 != 0 {
+                    return Err(Error::invalid_input(
+                        "u64 array byte length must be a multiple of 8",
+                    ));
+                }
+                let values = values
+                    .chunks_exact(8)
+                    .map(|chunk| {
+                        u64::from_le_bytes([
+                            chunk[0], chunk[1], chunk[2], chunk[3], chunk[4], chunk[5], chunk[6],
+                            chunk[7],
+                        ])
+                    })
+                    .collect();
+                Ok(Self::U64(values))
+            }
+            // TODO: shouldn't this enum be non-exhaustive?
+            // Some(_) => Err(Error::invalid_input("unknown array type")),
+            None => Err(Error::invalid_input("missing array type")),
+        }
+    }
+}
+
+impl From<RowIdSequence> for pb::RowIdSequence {
+    /// Converts the sequence into its protobuf message, segment by segment and
+    /// in order.
+    fn from(sequence: RowIdSequence) -> Self {
+        let RowIdSequence(segments) = sequence;
+        Self {
+            segments: segments.into_iter().map(pb::U64Segment::from).collect(),
+        }
+    }
+}
+
+impl From<U64Segment> for pb::U64Segment {
+    /// Converts an in-memory segment into its protobuf message. Every variant
+    /// maps to the protobuf variant of the same name; the resulting message
+    /// always has its `segment` field set.
+    fn from(segment: U64Segment) -> Self {
+        use pb::u64_segment::{self as pb_seg, Segment};
+
+        let encoded = match segment {
+            U64Segment::Range(range) => Segment::Range(pb_seg::Range {
+                start: range.start,
+                end: range.end,
+            }),
+            U64Segment::RangeWithHoles { range, holes } => {
+                Segment::RangeWithHoles(pb_seg::RangeWithHoles {
+                    start: range.start,
+                    end: range.end,
+                    holes: Some(holes.into()),
+                })
+            }
+            U64Segment::RangeWithBitmap { range, bitmap } => {
+                // Only the raw bitmap bytes are written; the range bounds are
+                // stored alongside them.
+                Segment::RangeWithBitmap(pb_seg::RangeWithBitmap {
+                    start: range.start,
+                    end: range.end,
+                    bitmap: bitmap.data,
+                })
+            }
+            U64Segment::SortedArray(array) => Segment::SortedArray(array.into()),
+            U64Segment::Array(array) => Segment::Array(array.into()),
+        };
+
+        Self {
+            segment: Some(encoded),
+        }
+    }
+}
+
+impl From<EncodedU64Array> for pb::EncodedU64Array {
+    /// Converts an in-memory encoded array into its protobuf message.
+    ///
+    /// Each element is written as its little-endian bytes (2, 4 or 8 bytes per
+    /// element depending on the variant), concatenated in element order.
+    fn from(array: EncodedU64Array) -> Self {
+        match array {
+            EncodedU64Array::U16 { base, offsets } => Self {
+                array: Some(pb::encoded_u64_array::Array::U16Array(
+                    pb::encoded_u64_array::U16Array {
+                        base,
+                        // The byte array is flattened directly; no temporary
+                        // `Vec` is allocated per element.
+                        offsets: offsets
+                            .iter()
+                            .flat_map(|&offset| offset.to_le_bytes())
+                            .collect(),
+                    },
+                )),
+            },
+            EncodedU64Array::U32 { base, offsets } => Self {
+                array: Some(pb::encoded_u64_array::Array::U32Array(
+                    pb::encoded_u64_array::U32Array {
+                        base,
+                        offsets: offsets
+                            .iter()
+                            .flat_map(|&offset| offset.to_le_bytes())
+                            .collect(),
+                    },
+                )),
+            },
+            EncodedU64Array::U64(values) => Self {
+                array: Some(pb::encoded_u64_array::Array::U64Array(
+                    pb::encoded_u64_array::U64Array {
+                        values: values
+                            .iter()
+                            .flat_map(|&value| value.to_le_bytes())
+                            .collect(),
+                    },
+                )),
+            },
+        }
+    }
+}
+
+/// Encodes a row id sequence as protobuf bytes.
+///
+/// The sequence is cloned, converted to its protobuf message and encoded into
+/// a freshly allocated buffer.
+pub fn write_row_ids(sequence: &RowIdSequence) -> Vec<u8> {
+    let message: pb::RowIdSequence = sequence.clone().into();
+    message.encode_to_vec()
+}
+
+/// Decodes a row id sequence from protobuf bytes.
+///
+/// # Errors
+///
+/// Fails when `reader` is not a valid protobuf message or when the decoded
+/// message cannot be converted into a [`RowIdSequence`].
+pub fn read_row_ids(reader: &[u8]) -> Result<RowIdSequence> {
+    let message = pb::RowIdSequence::decode(reader)?;
+    message.try_into()
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use pretty_assertions::assert_eq;
+
+    /// Round-trips a sequence containing every segment variant through
+    /// `write_row_ids` / `read_row_ids` and expects the segments back unchanged.
+    #[test]
+    fn test_write_read_row_ids() {
+        let mut sequence = RowIdSequence::from(0..20);
+        sequence.0.extend([
+            U64Segment::Range(30..100),
+            U64Segment::RangeWithHoles {
+                range: 100..200,
+                holes: EncodedU64Array::U64(vec![104, 108, 150]),
+            },
+            U64Segment::RangeWithBitmap {
+                range: 200..300,
+                bitmap: Bitmap::new_empty(100),
+            },
+            U64Segment::SortedArray(EncodedU64Array::U16 {
+                base: 200,
+                offsets: vec![1, 2, 3],
+            }),
+            U64Segment::Array(EncodedU64Array::U64(vec![1, 2, 3])),
+        ]);
+
+        let bytes = write_row_ids(&sequence);
+        let decoded = read_row_ids(&bytes).unwrap();
+
+        assert_eq!(sequence.0, decoded.0);
+    }
+}
--- a/vendor/lance-table/src/rowids/version.rs
+++ b/vendor/lance-table/src/rowids/version.rs
@ -0,0 +1,713 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Row version tracking for cross-version diff functionality
+//!
+//! This module provides data structures and functionality to track the latest
+//! update version for each row in a Lance dataset, enabling efficient
+//! cross-version diff operations.
+
+use std::sync::Arc;
+
+use deepsize::DeepSizeOf;
+use lance_core::Error;
+use lance_core::Result;
+use prost::Message;
+use serde::de::Deserializer;
+use serde::ser::Serializer;
+use serde::{Deserialize, Serialize};
+
+use crate::format::{ExternalFile, Fragment, pb};
+use crate::rowids::segment::U64Segment;
+use crate::rowids::{RowIdSequence, read_row_ids};
+
+/// A run of identical versions over a contiguous span of row positions.
+///
+/// Span is expressed as a U64Segment over row offsets (0..N within a fragment),
+/// not over row IDs. This keeps the encoding aligned with RowIdSequence order
+/// and enables zipped iteration without building a map.
+#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
+pub struct RowDatasetVersionRun {
+    /// Row positions covered by this run.
+    pub span: U64Segment,
+    /// Version shared by every row position in `span`.
+    pub version: u64,
+}
+
+impl RowDatasetVersionRun {
+    /// Number of rows covered by this run.
+    ///
+    /// This is the length of `span`.
+    pub fn len(&self) -> usize {
+        self.span.len()
+    }
+
+    /// Whether this run covers no rows.
+    ///
+    /// This is `true` exactly when `span` is empty.
+    pub fn is_empty(&self) -> bool {
+        self.span.is_empty()
+    }
+
+    /// The version value of this run.
+    pub fn version(&self) -> u64 {
+        self.version
+    }
+}
+
+/// Sequence of dataset versions
+///
+/// Stores version runs aligned to the positional order of RowIdSequence.
+/// Provides sequential iterators and optional lightweight indexing for
+/// efficient random access.
+#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf, Default)]
+pub struct RowDatasetVersionSequence {
+    /// Version runs in positional order; the global position of a row is the
+    /// sum of the lengths of the preceding runs plus its place in its own run.
+    pub runs: Vec<RowDatasetVersionRun>,
+}
+
+impl RowDatasetVersionSequence {
+    /// Create a new empty version sequence
+    pub fn new() -> Self {
+        Self { runs: Vec::new() }
+    }
+
+    /// Create a version sequence with a single uniform run of `row_count` rows.
+    ///
+    /// The run spans positions `0..row_count` and carries `version`. A
+    /// `row_count` of zero yields an empty sequence with no runs.
+    pub fn from_uniform_row_count(row_count: u64, version: u64) -> Self {
+        if row_count == 0 {
+            return Self::new();
+        }
+        let run = RowDatasetVersionRun {
+            span: U64Segment::Range(0..row_count),
+            version,
+        };
+        Self { runs: vec![run] }
+    }
+
+    /// Number of rows tracked by this sequence (sum of run lengths).
+    pub fn len(&self) -> u64 {
+        self.runs.iter().map(|s| s.len() as u64).sum()
+    }
+
+    /// Empty if there are no runs or all runs are empty.
+    pub fn is_empty(&self) -> bool {
+        self.runs.is_empty() || self.runs.iter().all(|s| s.is_empty())
+    }
+
+    /// Returns a forward iterator over versions, expanding runs lazily.
+    pub fn versions(&self) -> VersionsIter<'_> {
+        VersionsIter::new(&self.runs)
+    }
+
+    /// Random access: get the version at global row position `index`.
+    ///
+    /// Scans the runs in order, accumulating their lengths, and returns the
+    /// version of the run that contains `index`. Returns `None` when `index`
+    /// is at or beyond the total number of rows.
+    pub fn version_at(&self, index: usize) -> Option<u64> {
+        // Global position of the first row of the run being inspected.
+        let mut offset = 0usize;
+        for run in &self.runs {
+            let len = run.len();
+            if index < offset + len {
+                return Some(run.version());
+            }
+            offset += len;
+        }
+        None
+    }
+
+    /// Get the version associated with a specific row id.
+    /// This reconstructs the positional offset from RowIdSequence and then
+    /// performs `version_at` lookup.
+    ///
+    /// Returns `None` when no segment of `row_ids` yields a position for
+    /// `row_id`, or when the resulting position is not covered by this
+    /// sequence.
+    pub fn get_version_for_row_id(&self, row_ids: &RowIdSequence, row_id: u64) -> Option<u64> {
+        // Number of row ids in the segments that precede the current one.
+        let mut offset = 0usize;
+        for seg in &row_ids.0 {
+            // Only ask the segment for a position when its range contains the id.
+            if seg.range().is_some_and(|r| r.contains(&row_id))
+                && let Some(local) = seg.position(row_id)
+            {
+                return self.version_at(offset + local);
+            }
+            offset += seg.len();
+        }
+        None
+    }
+
+    /// Convenience: collect row IDs with version strictly greater than `threshold`.
+    ///
+    /// Row ids and versions are paired positionally with `zip`, so pairing
+    /// stops at the end of the shorter of the two.
+    pub fn rows_with_version_greater_than(
+        &self,
+        row_ids: &RowIdSequence,
+        threshold: u64,
+    ) -> Vec<u64> {
+        row_ids
+            .iter()
+            .zip(self.versions())
+            .filter_map(|(rid, v)| if v > threshold { Some(rid) } else { None })
+            .collect()
+    }
+
+    /// Delete rows by positional offsets (e.g., from a deletion vector)
+    ///
+    /// `positions` are global row positions in this sequence. Each one is
+    /// translated into a position local to the run that contains it and
+    /// forwarded to that run's span; runs that end up empty are removed.
+    ///
+    /// NOTE(review): the single forward pass and the `position - offset`
+    /// subtraction assume `positions` is sorted ascending; confirm with
+    /// callers. Positions at or beyond the total row count are never
+    /// consumed and are therefore ignored.
+    ///
+    /// The current implementation always returns `Ok(())`.
+    pub fn mask(&mut self, positions: impl IntoIterator<Item = u32>) -> Result<()> {
+        // Positions of the current run, relative to the start of that run.
+        let mut local_positions: Vec<u32> = Vec::new();
+        let mut positions_iter = positions.into_iter();
+        let mut curr_position = positions_iter.next();
+        // Global position of the first row of the current run.
+        let mut offset: usize = 0;
+        // Global position one past the last row of the current run.
+        let mut cutoff: usize = 0;
+
+        for run in self.runs.iter_mut() {
+            // Measured before masking, so it refers to the unmasked layout.
+            cutoff += run.span.len();
+            while let Some(position) = curr_position {
+                if position as usize >= cutoff {
+                    // Belongs to a later run; keep it for the next iteration.
+                    break;
+                }
+                local_positions.push(position - offset as u32);
+                curr_position = positions_iter.next();
+            }
+
+            if !local_positions.is_empty() {
+                run.span.mask(local_positions.as_slice());
+                local_positions.clear();
+            }
+            offset = cutoff;
+        }
+
+        // Drop runs whose rows were all masked out.
+        self.runs.retain(|r| !r.span.is_empty());
+        Ok(())
+    }
+}
+
+/// Lazy iterator that yields one version per row by walking the runs in order.
+pub struct VersionsIter<'a> {
+    /// Runs being iterated.
+    runs: &'a [RowDatasetVersionRun],
+    /// Index into `runs` of the run currently being expanded.
+    run_idx: usize,
+    /// Rows of the current run that have not been yielded yet.
+    remaining_in_run: usize,
+    /// Version of the current run.
+    current_version: u64,
+}
+
+impl<'a> VersionsIter<'a> {
+    /// Build an iterator positioned on the first run of `runs`, if there is one.
+    fn new(runs: &'a [RowDatasetVersionRun]) -> Self {
+        // With no runs at all, start with nothing remaining and version 0.
+        let (remaining_in_run, current_version) = runs
+            .first()
+            .map_or((0, 0), |run| (run.len(), run.version()));
+        Self {
+            runs,
+            run_idx: 0,
+            remaining_in_run,
+            current_version,
+        }
+    }
+
+    /// Load the length and version of the run at `run_idx`.
+    ///
+    /// When `run_idx` is past the last run, only the remaining count is reset
+    /// to zero; `current_version` keeps its previous value.
+    fn advance_run(&mut self) {
+        match self.runs.get(self.run_idx) {
+            Some(run) => {
+                self.remaining_in_run = run.len();
+                self.current_version = run.version();
+            }
+            None => self.remaining_in_run = 0,
+        }
+    }
+}
+
+impl<'a> Iterator for VersionsIter<'a> {
+    type Item = u64;
+
+    /// Yield the version of the next row, or `None` once every run is exhausted.
+    ///
+    /// Empty runs are skipped entirely. The previous implementation advanced
+    /// by only one run per call, so an empty run after the first one produced
+    /// a spurious item carrying that run's version.
+    fn next(&mut self) -> Option<Self::Item> {
+        // Keep advancing until a run with rows left is found or the runs end.
+        while self.remaining_in_run == 0 {
+            let next_idx = self.run_idx + 1;
+            if next_idx >= self.runs.len() {
+                // Leave `run_idx` untouched so repeated calls keep returning `None`.
+                return None;
+            }
+            self.run_idx = next_idx;
+            self.advance_run();
+        }
+        self.remaining_in_run -= 1;
+        Some(self.current_version)
+    }
+}
+
+/// Metadata about the location of dataset version sequence data
+/// Following the same pattern as RowIdMeta
+///
+/// When stored inline, identical byte sequences are shared across fragments
+/// via `Arc<[u8]>` to reduce manifest memory for large tables.
+///
+/// Inline bytes are decoded by `load_sequence` with `read_dataset_versions`;
+/// `from_sequence` produces them with `write_dataset_versions`.
+#[derive(Debug, Clone, PartialEq, Eq, DeepSizeOf)]
+pub enum RowDatasetVersionMeta {
+    /// Small sequences stored inline in the fragment metadata
+    Inline(Arc<[u8]>),
+    /// Large sequences stored in external files
+    ///
+    /// Loading this variant is not implemented yet (see `load_sequence`).
+    External(ExternalFile),
+}
+
+// Hand-written Serialize: the payload is borrowed so inline data is emitted from a
+// plain `&[u8]` instead of going through `Arc<[u8]>`.
+impl Serialize for RowDatasetVersionMeta {
+    fn serialize<S: Serializer>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> {
+        // Borrowed, untagged mirror of the enum that is used only for encoding.
+        #[derive(Serialize)]
+        #[serde(untagged)]
+        enum Helper<'a> {
+            Inline { inline: &'a [u8] },
+            External { external: &'a ExternalFile },
+        }
+
+        let helper = match self {
+            Self::Inline(data) => Helper::Inline {
+                inline: data.as_ref(),
+            },
+            Self::External(file) => Helper::External { external: file },
+        };
+        helper.serialize(serializer)
+    }
+}
+
+// Hand-written Deserialize: inline data is read as an owned `Vec<u8>` and then moved
+// into an `Arc<[u8]>`.
+impl<'de> Deserialize<'de> for RowDatasetVersionMeta {
+    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<Self, D::Error> {
+        // Owned, untagged mirror of the enum that is used only for decoding.
+        #[derive(Deserialize)]
+        #[serde(untagged)]
+        enum Helper {
+            Inline { inline: Vec<u8> },
+            External { external: ExternalFile },
+        }
+
+        Helper::deserialize(deserializer).map(|helper| match helper {
+            Helper::Inline { inline } => Self::Inline(Arc::from(inline)),
+            Helper::External { external } => Self::External(external),
+        })
+    }
+}
+
+impl RowDatasetVersionMeta {
+    /// Create inline metadata from a version sequence
+    ///
+    /// The sequence is encoded with `write_dataset_versions` and stored as
+    /// `Inline`. The current implementation always returns `Ok`.
+    pub fn from_sequence(sequence: &RowDatasetVersionSequence) -> lance_core::Result<Self> {
+        let bytes = write_dataset_versions(sequence);
+        Ok(Self::Inline(Arc::from(bytes)))
+    }
+
+    /// Create external metadata reference
+    ///
+    /// `path`, `offset` and `size` are stored unchanged in an `ExternalFile`;
+    /// nothing is read or validated here.
+    pub fn from_external_file(path: String, offset: u64, size: u64) -> Self {
+        Self::External(ExternalFile { path, offset, size })
+    }
+
+    /// Load the version sequence from this metadata
+    ///
+    /// # Errors
+    ///
+    /// Returns the error from `read_dataset_versions` when inline bytes
+    /// cannot be decoded.
+    ///
+    /// # Panics
+    ///
+    /// Panics via `todo!` for the `External` variant, which is not
+    /// implemented yet.
+    pub fn load_sequence(&self) -> lance_core::Result<RowDatasetVersionSequence> {
+        match self {
+            Self::Inline(data) => read_dataset_versions(data),
+            Self::External(_file) => {
+                todo!("External file loading not yet implemented")
+            }
+        }
+    }
+}
+
+/// Convert optional version metadata into the protobuf `last_updated_at` oneof.
+///
+/// Returns `None` when `meta` is `None`. Inline bytes are copied into a new
+/// vector; an external reference is copied field by field into a
+/// `pb::ExternalFile`.
+pub fn last_updated_at_version_meta_to_pb(
+    meta: &Option<RowDatasetVersionMeta>,
+) -> Option<pb::data_fragment::LastUpdatedAtVersionSequence> {
+    let converted = match meta.as_ref()? {
+        RowDatasetVersionMeta::Inline(bytes) => {
+            pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(
+                bytes.to_vec(),
+            )
+        }
+        RowDatasetVersionMeta::External(external) => {
+            let pb_file = pb::ExternalFile {
+                path: external.path.clone(),
+                offset: external.offset,
+                size: external.size,
+            };
+            pb::data_fragment::LastUpdatedAtVersionSequence::ExternalLastUpdatedAtVersions(
+                pb_file,
+            )
+        }
+    };
+    Some(converted)
+}
+
+/// Convert optional version metadata into the protobuf `created_at` oneof.
+///
+/// Returns `None` when `meta` is `None`. Inline bytes are copied into a new
+/// vector; an external reference is copied field by field into a
+/// `pb::ExternalFile`.
+pub fn created_at_version_meta_to_pb(
+    meta: &Option<RowDatasetVersionMeta>,
+) -> Option<pb::data_fragment::CreatedAtVersionSequence> {
+    let converted = match meta.as_ref()? {
+        RowDatasetVersionMeta::Inline(bytes) => {
+            pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(bytes.to_vec())
+        }
+        RowDatasetVersionMeta::External(external) => {
+            let pb_file = pb::ExternalFile {
+                path: external.path.clone(),
+                offset: external.offset,
+                size: external.size,
+            };
+            pb::data_fragment::CreatedAtVersionSequence::ExternalCreatedAtVersions(pb_file)
+        }
+    };
+    Some(converted)
+}
+
+/// Encode a dataset version sequence into a byte buffer (same approach as `write_row_ids`).
+///
+/// Each run becomes a `pb::RowDatasetVersionRun` holding its span converted to
+/// `pb::U64Segment` and its version; the runs are wrapped in a
+/// `pb::RowDatasetVersionSequence` and encoded with prost.
+pub fn write_dataset_versions(sequence: &RowDatasetVersionSequence) -> Vec<u8> {
+    let mut runs = Vec::with_capacity(sequence.runs.len());
+    for run in &sequence.runs {
+        runs.push(pb::RowDatasetVersionRun {
+            span: Some(pb::U64Segment::from(run.span.clone())),
+            version: run.version,
+        });
+    }
+
+    pb::RowDatasetVersionSequence { runs }.encode_to_vec()
+}
+
+/// Decode a dataset version sequence from bytes (same approach as `read_row_ids`).
+///
+/// # Errors
+///
+/// Returns an internal error when `data` is not a valid
+/// `pb::RowDatasetVersionSequence` message or when a run has no span, and
+/// forwards the error of the `U64Segment` conversion when a span is invalid.
+pub fn read_dataset_versions(data: &[u8]) -> lance_core::Result<RowDatasetVersionSequence> {
+    let pb_sequence = match pb::RowDatasetVersionSequence::decode(data) {
+        Ok(decoded) => decoded,
+        Err(e) => {
+            return Err(Error::internal(format!(
+                "Failed to decode RowDatasetVersionSequence: {}",
+                e
+            )));
+        }
+    };
+
+    let mut runs = Vec::with_capacity(pb_sequence.runs.len());
+    for pb_run in pb_sequence.runs {
+        // A run without a span cannot be placed; reject the whole sequence.
+        let Some(span_pb) = pb_run.span else {
+            return Err(Error::internal(
+                "Missing positions in RowDatasetVersionRun".to_string(),
+            ));
+        };
+        runs.push(RowDatasetVersionRun {
+            span: U64Segment::try_from(span_pb)?,
+            version: pb_run.version,
+        });
+    }
+
+    Ok(RowDatasetVersionSequence { runs })
+}
+
+/// Re-chunk a sequence of dataset version runs into new chunk sizes (aligned with RowIdSequence rechunking)
+///
+/// The runs of all input `sequences` are concatenated in order and then cut
+/// into one output sequence per entry of `chunk_sizes`, each holding that
+/// many rows. Runs that straddle a chunk boundary are sliced; the version of
+/// a run is carried over to every slice taken from it.
+///
+/// # Errors
+///
+/// Returns an invalid-input error when
+/// - the runs are exhausted before a chunk is filled and `allow_incomplete`
+///   is `false`, or
+/// - a run is still pending (including a partially consumed one) after all
+///   chunks were produced.
+///
+/// With `allow_incomplete` set, a chunk that cannot be filled is emitted
+/// short (possibly with no runs at all) instead of failing.
+pub fn rechunk_version_sequences(
+    sequences: impl IntoIterator<Item = RowDatasetVersionSequence>,
+    chunk_sizes: impl IntoIterator<Item = u64>,
+    allow_incomplete: bool,
+) -> Result<Vec<RowDatasetVersionSequence>> {
+    let chunk_sizes_vec: Vec<u64> = chunk_sizes.into_iter().collect();
+    let total_chunks = chunk_sizes_vec.len();
+    let mut chunked_sequences: Vec<RowDatasetVersionSequence> = Vec::with_capacity(total_chunks);
+
+    // Flatten all input sequences into one stream of runs; `peekable` lets a
+    // run stay at the head while it is consumed piecewise across chunks.
+    let mut run_iter = sequences
+        .into_iter()
+        .flat_map(|sequence| sequence.runs.into_iter())
+        .peekable();
+
+    let too_few_segments_error = |chunk_index: usize, expected_chunk_size: u64, remaining: u64| {
+        Error::invalid_input(format!(
+            "Got too few version runs for chunk {}. Expected chunk size: {}, remaining needed: {}",
+            chunk_index, expected_chunk_size, remaining
+        ))
+    };
+
+    let too_many_segments_error = |processed_chunks: usize, total_chunk_sizes: usize| {
+        Error::invalid_input(format!(
+            "Got too many version runs for the provided chunk lengths. Processed {} chunks out of {} expected",
+            processed_chunks, total_chunk_sizes
+        ))
+    };
+
+    // Rows of the run at the head of `run_iter` already handed to earlier chunks.
+    let mut segment_offset = 0_u64;
+
+    for (chunk_index, chunk_size) in chunk_sizes_vec.iter().enumerate() {
+        let chunk_size = *chunk_size;
+        let mut out_seq = RowDatasetVersionSequence::new();
+        // Rows still needed to fill the current chunk.
+        let mut remaining = chunk_size;
+
+        while remaining > 0 {
+            // Zero means either there is no run left or the head run has no
+            // unconsumed rows.
+            let remaining_in_segment = run_iter
+                .peek()
+                .map_or(0, |run| run.span.len() as u64 - segment_offset);
+
+            if remaining_in_segment == 0 {
+                if run_iter.next().is_some() {
+                    // Discarded a spent (or empty) run; retry with the next one.
+                    segment_offset = 0;
+                    continue;
+                } else if allow_incomplete {
+                    // Out of runs: emit the chunk short.
+                    break;
+                } else {
+                    return Err(too_few_segments_error(chunk_index, chunk_size, remaining));
+                }
+            }
+
+            match remaining_in_segment.cmp(&remaining) {
+                std::cmp::Ordering::Greater => {
+                    // The head run has more rows than the chunk needs: take a
+                    // slice and keep the run at the head for the next chunk.
+                    let run = run_iter.peek().unwrap();
+                    let seg = run.span.slice(segment_offset as usize, remaining as usize);
+                    out_seq.runs.push(RowDatasetVersionRun {
+                        span: seg,
+                        version: run.version,
+                    });
+                    segment_offset += remaining;
+                    remaining = 0;
+                }
+                std::cmp::Ordering::Equal | std::cmp::Ordering::Less => {
+                    // The rest of the head run fits into the chunk: consume it
+                    // entirely and move on to the next run.
+                    let run = run_iter.next().ok_or_else(|| {
+                        too_few_segments_error(chunk_index, chunk_size, remaining)
+                    })?;
+                    let seg = run
+                        .span
+                        .slice(segment_offset as usize, remaining_in_segment as usize);
+                    out_seq.runs.push(RowDatasetVersionRun {
+                        span: seg,
+                        version: run.version,
+                    });
+                    segment_offset = 0;
+                    remaining -= remaining_in_segment;
+                }
+            }
+        }
+
+        chunked_sequences.push(out_seq);
+    }
+
+    // Any run still pending means the inputs hold more rows than the chunks.
+    if run_iter.peek().is_some() {
+        return Err(too_many_segments_error(
+            chunked_sequences.len(),
+            total_chunks,
+        ));
+    }
+
+    Ok(chunked_sequences)
+}
+
+/// Build uniform version metadata for a fragment that reports physical rows.
+///
+/// Returns `None` when `physical_rows` is unset or zero. Otherwise every one
+/// of the `physical_rows` positions is assigned `current_version` and the
+/// result is returned as inline metadata.
+///
+/// # Panics
+///
+/// Panics when the fragment has physical rows but no `row_id_meta`, i.e. when
+/// stable row ids are not enabled.
+pub fn build_version_meta(
+    fragment: &Fragment,
+    current_version: u64,
+) -> Option<RowDatasetVersionMeta> {
+    let physical_rows = match fragment.physical_rows {
+        Some(rows) if rows > 0 => rows,
+        _ => return None,
+    };
+
+    // Sanity check: version tracking relies on stable row ids being enabled.
+    if fragment.row_id_meta.is_none() {
+        panic!("Can not find row id meta, please make sure you have enabled stable row id.")
+    }
+
+    // `physical_rows` is the authoritative row count; it stays correct even for
+    // compacted fragments where row_id_meta might have been partially copied.
+    let uniform =
+        RowDatasetVersionSequence::from_uniform_row_count(physical_rows as u64, current_version);
+
+    Some(RowDatasetVersionMeta::from_sequence(&uniform).unwrap())
+}
+
+/// Refresh row-level latest update version metadata for a full fragment rewrite-column update.
+///
+/// This sets a uniform version sequence for all rows in the fragment to `current_version`.
+///
+/// The row count is `physical_rows` when set, otherwise the length of the
+/// inline row id sequence. An external row id sequence, or no row id metadata
+/// at all, counts as zero rows; in that case the fragment is left untouched.
+///
+/// # Errors
+///
+/// Returns an error when the inline row id sequence cannot be decoded (this
+/// used to panic) or when the new metadata cannot be built.
+pub fn refresh_row_latest_update_meta_for_full_frag_rewrite_cols(
+    fragment: &mut Fragment,
+    current_version: u64,
+) -> Result<()> {
+    let row_count = if let Some(pr) = fragment.physical_rows {
+        pr as u64
+    } else if let Some(row_id_meta) = fragment.row_id_meta.as_ref() {
+        match row_id_meta {
+            // Propagate decode failures instead of panicking on corrupt metadata.
+            crate::format::RowIdMeta::Inline(data) => read_row_ids(data)?.len(),
+            // Follow existing behavior: external sequence not yet supported here
+            crate::format::RowIdMeta::External(_file) => 0,
+        }
+    } else {
+        0
+    };
+
+    if row_count > 0 {
+        let version_seq =
+            RowDatasetVersionSequence::from_uniform_row_count(row_count, current_version);
+        let version_meta = RowDatasetVersionMeta::from_sequence(&version_seq)?;
+        fragment.last_updated_at_version_meta = Some(version_meta);
+    }
+
+    Ok(())
+}
+
+/// Refresh row-level latest update version metadata for a partial fragment rewrite-column update.
+///
+/// `updated_offsets` are local row offsets (within the fragment) that have been updated.
+/// Existing version metadata is preserved and only the updated positions are set to `current_version`.
+/// Positions not covered by existing metadata default to `prev_version`; so do all positions
+/// when no metadata is present or it fails to load. Offsets at or beyond the fragment's row
+/// count are ignored.
+///
+/// The row count is `physical_rows` when set, otherwise the length of the inline row id
+/// sequence. With a row count of zero the fragment is left untouched.
+///
+/// # Errors
+///
+/// Returns an error when the inline row id sequence cannot be decoded (this used to panic)
+/// or when the new metadata cannot be built.
+///
+/// # Panics
+///
+/// Panics via `todo!` when the row count has to come from an external row id sequence, and
+/// when the existing version metadata is external (`load_sequence` is not implemented for it).
+pub fn refresh_row_latest_update_meta_for_partial_frag_rewrite_cols(
+    fragment: &mut Fragment,
+    updated_offsets: &[usize],
+    current_version: u64,
+    prev_version: u64,
+) -> Result<()> {
+    // Determine row count for fragment
+    let row_count_u64: u64 = if let Some(pr) = fragment.physical_rows {
+        pr as u64
+    } else if let Some(row_id_meta) = fragment.row_id_meta.as_ref() {
+        match row_id_meta {
+            // Propagate decode failures instead of panicking on corrupt metadata.
+            crate::format::RowIdMeta::Inline(data) => read_row_ids(data)?.len(),
+            crate::format::RowIdMeta::External(_file) => {
+                // Preserve original behavior for external sequences
+                todo!("External file loading not yet implemented")
+            }
+        }
+    } else {
+        0
+    };
+
+    if row_count_u64 == 0 {
+        return Ok(());
+    }
+    let row_count = row_count_u64 as usize;
+
+    // Build base version vector from existing meta or previous dataset version.
+    // The runs are expanded in a single pass (linear in rows + runs) rather than
+    // looking up every position separately, which rescanned the runs each time.
+    let mut base_versions: Vec<u64> = Vec::with_capacity(row_count);
+    if let Some(meta) = fragment.last_updated_at_version_meta.as_ref()
+        && let Ok(base_seq) = meta.load_sequence()
+    {
+        for run in &base_seq.runs {
+            let missing = row_count - base_versions.len();
+            if missing == 0 {
+                break;
+            }
+            let take = run.len().min(missing);
+            base_versions.extend(std::iter::repeat(run.version()).take(take));
+        }
+    }
+    // Positions the existing metadata does not cover fall back to `prev_version`.
+    base_versions.resize(row_count, prev_version);
+
+    // Apply updates to updated positions
+    for &pos in updated_offsets {
+        if pos < base_versions.len() {
+            base_versions[pos] = current_version;
+        }
+    }
+
+    // Compress into runs; `base_versions` is non-empty because `row_count > 0`.
+    let mut runs: Vec<RowDatasetVersionRun> = Vec::new();
+    let mut start = 0usize;
+    let mut curr_ver = base_versions[0];
+    for (idx, &ver) in base_versions.iter().enumerate().skip(1) {
+        if ver != curr_ver {
+            runs.push(RowDatasetVersionRun {
+                span: U64Segment::Range(start as u64..idx as u64),
+                version: curr_ver,
+            });
+            start = idx;
+            curr_ver = ver;
+        }
+    }
+    // Close the final run, which extends to the end of the fragment.
+    runs.push(RowDatasetVersionRun {
+        span: U64Segment::Range(start as u64..base_versions.len() as u64),
+        version: curr_ver,
+    });
+
+    let new_seq = RowDatasetVersionSequence { runs };
+    let new_meta = RowDatasetVersionMeta::from_sequence(&new_seq)?;
+    fragment.last_updated_at_version_meta = Some(new_meta);
+
+    Ok(())
+}
+
+// Conversions from the protobuf oneof types into `RowDatasetVersionMeta`.
+impl TryFrom<pb::data_fragment::LastUpdatedAtVersionSequence> for RowDatasetVersionMeta {
+    type Error = Error;
+
+    /// Convert the protobuf `last_updated_at` oneof into version metadata.
+    ///
+    /// Inline bytes are moved into an `Arc<[u8]>` and an external reference is
+    /// moved field by field into an `ExternalFile`. Always returns `Ok`.
+    fn try_from(value: pb::data_fragment::LastUpdatedAtVersionSequence) -> Result<Self> {
+        let meta = match value {
+            pb::data_fragment::LastUpdatedAtVersionSequence::InlineLastUpdatedAtVersions(
+                bytes,
+            ) => Self::Inline(Arc::from(bytes)),
+            pb::data_fragment::LastUpdatedAtVersionSequence::ExternalLastUpdatedAtVersions(
+                external,
+            ) => {
+                let pb::ExternalFile { path, offset, size } = external;
+                Self::External(ExternalFile { path, offset, size })
+            }
+        };
+        Ok(meta)
+    }
+}
+
+impl TryFrom<pb::data_fragment::CreatedAtVersionSequence> for RowDatasetVersionMeta {
+    type Error = Error;
+
+    /// Convert the protobuf `created_at` oneof into version metadata.
+    ///
+    /// Inline bytes are moved into an `Arc<[u8]>` and an external reference is
+    /// moved field by field into an `ExternalFile`. Always returns `Ok`.
+    fn try_from(value: pb::data_fragment::CreatedAtVersionSequence) -> Result<Self> {
+        let meta = match value {
+            pb::data_fragment::CreatedAtVersionSequence::InlineCreatedAtVersions(bytes) => {
+                Self::Inline(Arc::from(bytes))
+            }
+            pb::data_fragment::CreatedAtVersionSequence::ExternalCreatedAtVersions(external) => {
+                let pb::ExternalFile { path, offset, size } = external;
+                Self::External(ExternalFile { path, offset, size })
+            }
+        };
+        Ok(meta)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// `version_at` maps global positions onto runs by accumulated run length:
+    /// three runs of 3, 2 and 1 rows cover positions 0..=5, and position 6 is
+    /// out of range.
+    #[test]
+    fn test_version_random_access() {
+        let seq = RowDatasetVersionSequence {
+            runs: vec![
+                RowDatasetVersionRun {
+                    span: U64Segment::Range(0..3),
+                    version: 1,
+                },
+                RowDatasetVersionRun {
+                    span: U64Segment::Range(0..2),
+                    version: 2,
+                },
+                RowDatasetVersionRun {
+                    span: U64Segment::Range(0..1),
+                    version: 3,
+                },
+            ],
+        };
+        assert_eq!(seq.version_at(0), Some(1));
+        assert_eq!(seq.version_at(2), Some(1));
+        assert_eq!(seq.version_at(3), Some(2));
+        assert_eq!(seq.version_at(4), Some(2));
+        assert_eq!(seq.version_at(5), Some(3));
+        assert_eq!(seq.version_at(6), None);
+    }
+
+    /// A sequence written with `write_dataset_versions` and read back with
+    /// `read_dataset_versions` keeps its run count, total length and
+    /// per-position versions.
+    #[test]
+    fn test_serialization_round_trip() {
+        let seq = RowDatasetVersionSequence {
+            runs: vec![
+                RowDatasetVersionRun {
+                    span: U64Segment::Range(0..4),
+                    version: 42,
+                },
+                RowDatasetVersionRun {
+                    span: U64Segment::Range(0..3),
+                    version: 99,
+                },
+            ],
+        };
+        let bytes = write_dataset_versions(&seq);
+        let seq2 = read_dataset_versions(&bytes).unwrap();
+        assert_eq!(seq2.runs.len(), 2);
+        assert_eq!(seq2.len(), 7);
+        assert_eq!(seq2.version_at(0), Some(42));
+        assert_eq!(seq2.version_at(5), Some(99));
+    }
+
+    /// Row ids are resolved to positions through the row id sequence before
+    /// the version lookup; an id outside the sequence yields `None`.
+    #[test]
+    fn test_get_version_for_row_id() {
+        let seq = RowDatasetVersionSequence {
+            runs: vec![
+                RowDatasetVersionRun {
+                    span: U64Segment::Range(0..2),
+                    version: 8,
+                },
+                RowDatasetVersionRun {
+                    span: U64Segment::Range(0..2),
+                    version: 9,
+                },
+            ],
+        };
+        let rows = RowIdSequence::from(10..14); // row ids: 10,11,12,13
+        assert_eq!(seq.get_version_for_row_id(&rows, 10), Some(8));
+        assert_eq!(seq.get_version_for_row_id(&rows, 11), Some(8));
+        assert_eq!(seq.get_version_for_row_id(&rows, 12), Some(9));
+        assert_eq!(seq.get_version_for_row_id(&rows, 13), Some(9));
+        assert_eq!(seq.get_version_for_row_id(&rows, 99), None);
+    }
+}
--- a/vendor/lance-table/src/utils.rs
+++ b/vendor/lance-table/src/utils.rs
@ -0,0 +1,47 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+pub mod stream;
+
+/// Extension methods for iterators; implemented below for every `Iterator`.
+pub trait LanceIteratorExtension {
+    /// Wrap the iterator in an [`ExactSize`] that reports `size` as both the
+    /// lower and the upper bound of its `size_hint`.
+    ///
+    /// `size` is taken on trust: nothing checks that the iterator really
+    /// yields that many items.
+    fn exact_size(self, size: usize) -> ExactSize<Self>
+    where
+        Self: Sized;
+}
+
+impl<I: Iterator> LanceIteratorExtension for I {
+    fn exact_size(self, size: usize) -> ExactSize<Self>
+    where
+        Self: Sized,
+    {
+        ExactSize { inner: self, size }
+    }
+}
+
+/// An iterator that is tagged with a known size. This is useful when we are
+/// able to pre-compute the size of the iterator but the iterator implementation
+/// isn't able to itself. A common example is when using `flatten()`.
+///
+/// This is inspired by discussion in <https://github.com/rust-lang/rust/issues/68995>
+pub struct ExactSize<I> {
+    /// The wrapped iterator that produces the items.
+    inner: I,
+    /// Number of items still expected from `inner`.
+    size: usize,
+}
+
+impl<I: Iterator> Iterator for ExactSize<I> {
+    type Item = I::Item;
+
+    /// Forward to the inner iterator and count down the expected size.
+    ///
+    /// The counter saturates at zero: if the inner iterator yields more items
+    /// than were declared, the extra items are still returned instead of
+    /// overflowing the counter (a panic in debug builds, a wrap-around to a
+    /// huge `size_hint` in release builds).
+    fn next(&mut self) -> Option<Self::Item> {
+        let item = self.inner.next()?;
+        self.size = self.size.saturating_sub(1);
+        Some(item)
+    }
+
+    /// Report the remaining expected size as an exact bound.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (self.size, Some(self.size))
+    }
+}
--- a/vendor/lance-table/src/utils/stream.rs
+++ b/vendor/lance-table/src/utils/stream.rs
@ -0,0 +1,806 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use std::sync::Arc;
+
+use arrow_array::{BooleanArray, RecordBatch, RecordBatchOptions, UInt64Array, make_array};
+use arrow_buffer::NullBuffer;
+use futures::{
+    FutureExt, Stream, StreamExt,
+    future::BoxFuture,
+    stream::{BoxStream, FuturesOrdered},
+};
+use lance_arrow::RecordBatchExt;
+use lance_core::{
+    ROW_ADDR, ROW_ADDR_FIELD, ROW_CREATED_AT_VERSION_FIELD, ROW_ID, ROW_ID_FIELD,
+    ROW_LAST_UPDATED_AT_VERSION_FIELD, Result,
+    utils::{address::RowAddress, deletion::DeletionVector},
+};
+use lance_io::ReadBatchParams;
+use tracing::instrument;
+
+use crate::rowids::RowIdSequence;
+
+/// A boxed future that resolves to a single [`RecordBatch`] or an error.
+pub type ReadBatchFut = BoxFuture<'static, Result<RecordBatch>>;
+/// A task, emitted by a file reader, that will produce a batch (of the
+/// given size)
+pub struct ReadBatchTask {
+    /// The future that produces the batch.
+    pub task: ReadBatchFut,
+    /// The number of rows the produced batch is expected to have.
+    pub num_rows: u32,
+}
+/// A boxed stream of [`ReadBatchTask`]s.
+pub type ReadBatchTaskStream = BoxStream<'static, ReadBatchTask>;
+/// A boxed stream of [`ReadBatchFut`]s (batch futures without a row count).
+pub type ReadBatchFutStream = BoxStream<'static, ReadBatchFut>;
+
+// Stream adapter that pulls one task from each input stream per round and
+// emits a single task combining them.
+struct MergeStream {
+    // The input streams, polled in order.
+    streams: Vec<ReadBatchTaskStream>,
+    // Batch futures gathered so far for the round currently being assembled.
+    next_batch: FuturesOrdered<ReadBatchFut>,
+    // Row count announced by the first stream for the current round.
+    next_num_rows: u32,
+    // Index into `streams` of the next stream to poll in the current round.
+    index: usize,
+}
+
+impl MergeStream {
+    /// Packages the futures gathered for the current round into one task and
+    /// resets the per-round state.
+    ///
+    /// The returned task awaits the gathered futures in the order they were
+    /// pushed and merges every later batch into the first one.
+    fn emit(&mut self) -> ReadBatchTask {
+        let num_rows = std::mem::replace(&mut self.next_num_rows, 0);
+        let mut pending = std::mem::take(&mut self.next_batch);
+        let task: ReadBatchFut = async move {
+            // A round always holds at least one future; an empty round panics.
+            let mut merged = pending.next().await.unwrap()?;
+            loop {
+                match pending.next().await {
+                    Some(part) => merged = merged.merge(&part?)?,
+                    None => break Ok(merged),
+                }
+            }
+        }
+        .boxed();
+        ReadBatchTask { task, num_rows }
+    }
+}
+
+impl Stream for MergeStream {
+    type Item = ReadBatchTask;
+
+    /// Polls the input streams round-robin, starting at `self.index`.
+    ///
+    /// One task is taken from each input stream in order. Once every stream
+    /// has contributed to the round, the gathered futures are emitted as one
+    /// merged task. The stream ends as soon as any input stream ends, and it
+    /// ends immediately when there are no input streams at all.
+    fn poll_next(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Option<Self::Item>> {
+        // With no inputs there is nothing to merge. Without this guard the
+        // indexing below would panic on an empty `streams` vector.
+        if self.streams.is_empty() {
+            return std::task::Poll::Ready(None);
+        }
+        loop {
+            let index = self.index;
+            match self.streams[index].poll_next_unpin(cx) {
+                std::task::Poll::Ready(Some(batch_task)) => {
+                    if self.index == 0 {
+                        // The first stream defines the row count of the round.
+                        self.next_num_rows = batch_task.num_rows;
+                    } else {
+                        // Only checked when debug assertions are enabled.
+                        debug_assert_eq!(self.next_num_rows, batch_task.num_rows);
+                    }
+                    self.next_batch.push_back(batch_task.task);
+                    self.index += 1;
+                    if self.index == self.streams.len() {
+                        // Round complete: reset the cursor and hand out the
+                        // merged task.
+                        self.index = 0;
+                        let next_batch = self.emit();
+                        return std::task::Poll::Ready(Some(next_batch));
+                    }
+                }
+                std::task::Poll::Ready(None) => {
+                    return std::task::Poll::Ready(None);
+                }
+                std::task::Poll::Pending => {
+                    // `self.index` is kept, so the next poll resumes with the
+                    // same stream.
+                    return std::task::Poll::Pending;
+                }
+            }
+        }
+    }
+}
+
+/// Given multiple streams of batch tasks, merge them into a single stream
+///
+/// This pulls one batch from each stream and then combines the columns from
+/// all of the batches into a single batch.  The order of the batches in the
+/// streams is maintained and the merged batch columns will be in order from
+/// first to last stream.
+///
+/// This stream ends as soon as any of the input streams ends (we do not
+/// verify that the other input streams are finished as well)
+///
+/// When debug assertions are enabled, this will panic if any of the input
+/// streams return a batch with a different number of rows than the first
+/// stream. The check is a `debug_assert_eq!`, so it is not performed in builds
+/// without debug assertions.
+pub fn merge_streams(streams: Vec<ReadBatchTaskStream>) -> ReadBatchTaskStream {
+    MergeStream {
+        streams,
+        next_batch: FuturesOrdered::new(),
+        next_num_rows: 0,
+        index: 0,
+    }
+    .boxed()
+}
+
+/// Marks deleted rows by nulling their row id / row address values instead of
+/// removing the rows from the batch.
+///
+/// `mask` is interpreted as a validity bitmap: `true` keeps a row, `false`
+/// marks it as deleted. Only the `ROW_ID` and `ROW_ADDR` columns are touched;
+/// every other column is passed through unchanged, so the batch keeps its
+/// physical row count.
+///
+/// Nulling is cheaper than filtering, and it also serves callers that need the
+/// physical rows (for example when replacing a column via a UDF, where deleted
+/// rows still need a value so that fragments stay aligned).
+fn apply_deletions_as_nulls(batch: RecordBatch, mask: &BooleanArray) -> Result<RecordBatch> {
+    // A null buffer is really a validity buffer: a set bit means "not null",
+    // which here means "not deleted".
+    let deletions = NullBuffer::new(mask.values().clone());
+    if deletions.null_count() == 0 {
+        // Nothing is deleted, so the batch can be returned untouched.
+        return Ok(batch);
+    }
+
+    let schema = batch.schema();
+    let mut masked_columns = Vec::with_capacity(batch.num_columns());
+    for (field, column) in schema.fields().iter().zip(batch.columns()) {
+        let is_row_meta = field.name() == ROW_ID || field.name() == ROW_ADDR;
+        if !is_row_meta {
+            masked_columns.push(column.clone());
+            continue;
+        }
+
+        let data = column.to_data();
+        // Combine any existing validity bitmap with the deletion mask; when
+        // the column has none, the mask alone becomes the validity bitmap.
+        let validity = NullBuffer::union(data.nulls(), Some(&deletions));
+        let rebuilt = data
+            .into_builder()
+            .null_bit_buffer(validity.map(|v| v.buffer().clone()))
+            .build()
+            .map(make_array)?;
+        masked_columns.push(rebuilt);
+    }
+
+    // Pass the row count explicitly so that a batch without columns still
+    // carries its length.
+    let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows()));
+    Ok(RecordBatch::try_new_with_options(
+        schema,
+        masked_columns,
+        &options,
+    )?)
+}
+
+/// Extract version values for a batch selection by searching over precomputed
+/// RLE run offsets. Single-run fragments (the common case) take the O(1)
+/// fast path.
+///
+/// * `sequence` - the run-length encoded versions.
+/// * `params` - the full read selection.
+/// * `batch_offset` / `num_rows` - the slice of `params` covered by the batch.
+///
+/// Returns one version per selected position, in selection order.
+///
+/// # Errors
+///
+/// Returns an error if `params` cannot be sliced or converted into ranges, or
+/// if a selected position lies beyond the total length of all runs.
+fn version_values_for_selection(
+    sequence: &crate::rowids::version::RowDatasetVersionSequence,
+    params: &ReadBatchParams,
+    batch_offset: u32,
+    num_rows: u32,
+) -> Result<Vec<u64>> {
+    // Propagate an invalid selection as an error rather than panicking; this
+    // function already reports failures through its `Result`.
+    let selection = params
+        .slice(batch_offset as usize, num_rows as usize)?
+        .to_ranges()?;
+
+    if sequence.runs.len() == 1 {
+        return Ok(vec![sequence.runs[0].version(); num_rows as usize]);
+    }
+
+    // Start offset of every run, plus the combined length of all runs.
+    let mut total_len = 0usize;
+    let run_offsets: Vec<usize> = sequence
+        .runs
+        .iter()
+        .map(|run| {
+            let start = total_len;
+            total_len += run.len();
+            start
+        })
+        .collect();
+
+    let mut versions = Vec::with_capacity(num_rows as usize);
+    for r in &selection {
+        for pos in r.start..r.end {
+            let pos = pos as usize;
+            if pos >= total_len {
+                return Err(lance_core::Error::internal(format!(
+                    "version column position {} out of range (total_len={})",
+                    pos, total_len
+                )));
+            }
+            // Pick the last run that starts at or before `pos`. `pos <
+            // total_len` guarantees at least one run, and the first run starts
+            // at 0, so the partition point is never 0. Unlike `binary_search`,
+            // this is deterministic when zero-length runs produce duplicate
+            // start offsets.
+            let run_idx = run_offsets.partition_point(|&start| start <= pos) - 1;
+            versions.push(sequence.runs[run_idx].version());
+        }
+    }
+    Ok(versions)
+}
+
+/// Configuration needed to apply row ids and deletions to a batch
+#[derive(Debug)]
+pub struct RowIdAndDeletesConfig {
+    /// The row ids that were requested
+    pub params: ReadBatchParams,
+    /// Whether to include the row id column in the final batch
+    pub with_row_id: bool,
+    /// Whether to include the row address column in the final batch
+    pub with_row_addr: bool,
+    /// Whether to include the last updated at version column in the final batch
+    pub with_row_last_updated_at_version: bool,
+    /// Whether to include the created at version column in the final batch
+    pub with_row_created_at_version: bool,
+    /// An optional deletion vector to apply to the batch
+    pub deletion_vector: Option<Arc<DeletionVector>>,
+    /// An optional row id sequence to use for the row id column.
+    pub row_id_sequence: Option<Arc<RowIdSequence>>,
+    /// The last_updated_at version sequence
+    pub last_updated_at_sequence: Option<Arc<crate::rowids::version::RowDatasetVersionSequence>>,
+    /// The created_at version sequence
+    pub created_at_sequence: Option<Arc<crate::rowids::version::RowDatasetVersionSequence>>,
+    /// Whether to make deleted rows null instead of filtering them out
+    pub make_deletions_null: bool,
+    /// The total number of rows that will be loaded
+    ///
+    /// This is needed to convert `ReadBatchParams::RangeTo` into a valid range
+    pub total_num_rows: u32,
+}
+
+impl RowIdAndDeletesConfig {
+    /// Returns `true` when at least one system column (row id, row address,
+    /// last-updated-at version or created-at version) was requested.
+    fn has_system_cols(&self) -> bool {
+        [
+            self.with_row_id,
+            self.with_row_addr,
+            self.with_row_last_updated_at_version,
+            self.with_row_created_at_version,
+        ]
+        .contains(&true)
+    }
+}
+
+/// Adds the requested system columns to `batch` and applies the deletion
+/// vector from `config`.
+///
+/// * `batch` - the rows read for this batch.
+/// * `batch_offset` - offset of this batch within the selection described by
+///   `config.params`; together with the batch's row count it selects the part
+///   of `config.params` that the batch covers.
+/// * `fragment_id` - the fragment id used to build row addresses.
+/// * `config` - which columns to add and which deletions to apply.
+///
+/// Row ids are taken from `config.row_id_sequence` when it is present and are
+/// otherwise the row addresses. Version columns default to `1` when no
+/// version sequence is configured. Deleted rows are filtered out, or nulled
+/// when `config.make_deletions_null` is set.
+///
+/// # Errors
+///
+/// Returns an error if the selection cannot be sliced or converted into
+/// ranges, if a version lookup fails, or if a column cannot be added to the
+/// batch.
+#[instrument(level = "debug", skip_all)]
+pub fn apply_row_id_and_deletes(
+    batch: RecordBatch,
+    batch_offset: u32,
+    fragment_id: u32,
+    config: &RowIdAndDeletesConfig,
+) -> Result<RecordBatch> {
+    let mut deletion_vector = config.deletion_vector.as_ref();
+    // Convert Some(NoDeletions) into None to simplify logic below
+    if let Some(deletion_vector_inner) = deletion_vector
+        && matches!(deletion_vector_inner.as_ref(), DeletionVector::NoDeletions)
+    {
+        deletion_vector = None;
+    }
+    let has_deletions = deletion_vector.is_some();
+    debug_assert!(batch.num_columns() > 0 || config.has_system_cols() || has_deletions);
+
+    // If row id sequence is None, then row id IS row address. Row addresses
+    // are also what the deletion vector is evaluated against.
+    let should_fetch_row_addr = config.with_row_addr
+        || (config.with_row_id && config.row_id_sequence.is_none())
+        || has_deletions;
+
+    let num_rows = batch.num_rows() as u32;
+
+    let row_addrs = if should_fetch_row_addr {
+        let _rowaddrs = tracing::span!(tracing::Level::DEBUG, "fetch_row_addrs").entered();
+        let mut row_addrs = Vec::with_capacity(num_rows as usize);
+        // An invalid selection is reported as an error instead of a panic.
+        for offset_range in config
+            .params
+            .slice(batch_offset as usize, num_rows as usize)?
+            .iter_offset_ranges()?
+        {
+            row_addrs.extend(
+                offset_range
+                    .map(|row_offset| u64::from(RowAddress::new_from_parts(fragment_id, row_offset))),
+            );
+        }
+
+        Some(Arc::new(UInt64Array::from(row_addrs)))
+    } else {
+        None
+    };
+
+    let row_ids = if config.with_row_id {
+        let _rowids = tracing::span!(tracing::Level::DEBUG, "fetch_row_ids").entered();
+        if let Some(row_id_sequence) = &config.row_id_sequence {
+            let selection = config
+                .params
+                .slice(batch_offset as usize, num_rows as usize)?
+                .to_ranges()?;
+            let row_ids = row_id_sequence
+                .select(
+                    selection
+                        .iter()
+                        .flat_map(|r| r.start as usize..r.end as usize),
+                )
+                .collect::<UInt64Array>();
+            Some(Arc::new(row_ids))
+        } else {
+            // If we don't have a row id sequence, can assume the row ids are
+            // the same as the row addresses.
+            row_addrs.clone()
+        }
+    } else {
+        None
+    };
+
+    let span = tracing::span!(tracing::Level::DEBUG, "apply_deletions");
+    let _enter = span.enter();
+    let deletion_mask = deletion_vector.and_then(|v| {
+        let row_addrs: &[u64] = row_addrs
+            .as_ref()
+            .expect("row addresses are always computed when there are deletions")
+            .values();
+        v.build_predicate(row_addrs.iter())
+    });
+
+    let mut batch = batch;
+
+    if config.with_row_id {
+        let row_id_arr = row_ids.expect("row ids are always computed when with_row_id is set");
+        batch = batch.try_with_column(ROW_ID_FIELD.clone(), row_id_arr)?;
+    }
+
+    if config.with_row_addr {
+        let row_addr_arr =
+            row_addrs.expect("row addresses are always computed when with_row_addr is set");
+        batch = batch.try_with_column(ROW_ADDR_FIELD.clone(), row_addr_arr)?;
+    }
+
+    // Builds one version column: values come from the sequence when one is
+    // provided, otherwise every row defaults to version 1.
+    let version_column = |sequence: Option<
+        &Arc<crate::rowids::version::RowDatasetVersionSequence>,
+    >|
+     -> Result<Arc<UInt64Array>> {
+        let values = match sequence {
+            Some(sequence) => {
+                version_values_for_selection(sequence, &config.params, batch_offset, num_rows)?
+            }
+            None => vec![1u64; num_rows as usize],
+        };
+        Ok(Arc::new(UInt64Array::from(values)))
+    };
+
+    // Add version columns if requested
+    if config.with_row_last_updated_at_version {
+        let version_arr = version_column(config.last_updated_at_sequence.as_ref())?;
+        batch = batch.try_with_column(ROW_LAST_UPDATED_AT_VERSION_FIELD.clone(), version_arr)?;
+    }
+
+    if config.with_row_created_at_version {
+        let version_arr = version_column(config.created_at_sequence.as_ref())?;
+        batch = batch.try_with_column(ROW_CREATED_AT_VERSION_FIELD.clone(), version_arr)?;
+    }
+
+    // Deletions are applied last so that every added column is filtered (or
+    // nulled) together with the data columns.
+    match (deletion_mask, config.make_deletions_null) {
+        (None, _) => Ok(batch),
+        (Some(mask), false) => Ok(arrow::compute::filter_record_batch(&batch, &mask)?),
+        (Some(mask), true) => Ok(apply_deletions_as_nulls(batch, &mask)?),
+    }
+}
+
+/// Decorates every batch of `stream` with the row id column (when requested)
+/// and applies the deletion vector from `config`.
+///
+/// The output is a stream of plain batch futures rather than batch tasks:
+/// once a deletion vector is applied, the number of rows a batch will end up
+/// with is no longer known up front.
+pub fn wrap_with_row_id_and_delete(
+    stream: ReadBatchTaskStream,
+    fragment_id: u32,
+    config: RowIdAndDeletesConfig,
+) -> ReadBatchFutStream {
+    let shared_config = Arc::new(config);
+    // Running total of the rows announced by the tasks seen so far; this is
+    // the offset of the next batch.
+    let mut rows_seen: u32 = 0;
+    let decorate = move |ReadBatchTask { task, num_rows }: ReadBatchTask| -> ReadBatchFut {
+        let batch_offset = rows_seen;
+        rows_seen += num_rows;
+        let config = Arc::clone(&shared_config);
+        async move { apply_row_id_and_deletes(task.await?, batch_offset, fragment_id, &config) }
+            .boxed()
+    };
+    stream.map(decorate).boxed()
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::{array::AsArray, datatypes::UInt64Type};
+    use arrow_array::{RecordBatch, UInt32Array, types::Int32Type};
+    use arrow_schema::ArrowError;
+    use futures::{FutureExt, StreamExt, TryStreamExt, stream::BoxStream};
+    use lance_core::{
+        ROW_ID,
+        utils::{address::RowAddress, deletion::DeletionVector},
+    };
+    use lance_datagen::{BatchCount, RowCount};
+    use lance_io::{ReadBatchParams, stream::arrow_stream_to_lance_stream};
+    use roaring::RoaringBitmap;
+
+    use crate::utils::stream::ReadBatchTask;
+
+    use super::RowIdAndDeletesConfig;
+
+    /// Turns a datagen batch stream into a `ReadBatchTaskStream`.
+    ///
+    /// Each task wraps an already-resolved future and takes its `num_rows`
+    /// from the batch itself. Panics if the stream yields an error.
+    fn batch_task_stream(
+        datagen_stream: BoxStream<'static, std::result::Result<RecordBatch, ArrowError>>,
+    ) -> super::ReadBatchTaskStream {
+        arrow_stream_to_lance_stream(datagen_stream)
+            .map(|batch| ReadBatchTask {
+                num_rows: batch.as_ref().unwrap().num_rows() as u32,
+                task: std::future::ready(batch).boxed(),
+            })
+            .boxed()
+    }
+
+    /// Merging a stream with column `x` and a stream with column `y` must give
+    /// the same batches as generating `x` and `y` together.
+    #[tokio::test]
+    async fn test_basic_zip() {
+        let left = batch_task_stream(
+            lance_datagen::gen_batch()
+                .col("x", lance_datagen::array::step::<Int32Type>())
+                .into_reader_stream(RowCount::from(100), BatchCount::from(10))
+                .0,
+        );
+        let right = batch_task_stream(
+            lance_datagen::gen_batch()
+                .col("y", lance_datagen::array::step::<Int32Type>())
+                .into_reader_stream(RowCount::from(100), BatchCount::from(10))
+                .0,
+        );
+
+        let merged = super::merge_streams(vec![left, right])
+            .map(|batch_task| batch_task.task)
+            .buffered(1)
+            .try_collect::<Vec<_>>()
+            .await
+            .unwrap();
+
+        // Reference data: both columns generated side by side.
+        let expected = lance_datagen::gen_batch()
+            .col("x", lance_datagen::array::step::<Int32Type>())
+            .col("y", lance_datagen::array::step::<Int32Type>())
+            .into_reader_rows(RowCount::from(100), BatchCount::from(10))
+            .collect::<Result<Vec<_>, ArrowError>>()
+            .unwrap();
+        assert_eq!(merged, expected);
+    }
+
+    /// Reads with `params` and no row id sequence, then checks that the row id
+    /// column equals the row addresses built from `expected` row offsets.
+    ///
+    /// Runs with and without a data column, and for fragment ids 0 and 10.
+    async fn check_row_id(params: ReadBatchParams, expected: impl IntoIterator<Item = u32>) {
+        let expected = Vec::from_iter(expected);
+
+        for has_columns in [false, true] {
+            for fragment_id in [0, 10] {
+                // 100 rows across 10 batches of 10 rows
+                let mut datagen = lance_datagen::gen_batch();
+                if has_columns {
+                    datagen = datagen.col("x", lance_datagen::array::rand::<Int32Type>());
+                }
+                let data = batch_task_stream(
+                    datagen
+                        .into_reader_stream(RowCount::from(10), BatchCount::from(10))
+                        .0,
+                );
+
+                let config = RowIdAndDeletesConfig {
+                    params: params.clone(),
+                    with_row_id: true,
+                    with_row_addr: false,
+                    with_row_last_updated_at_version: false,
+                    with_row_created_at_version: false,
+                    deletion_vector: None,
+                    row_id_sequence: None,
+                    last_updated_at_sequence: None,
+                    created_at_sequence: None,
+                    make_deletions_null: false,
+                    total_num_rows: 100,
+                };
+                let stream = super::wrap_with_row_id_and_delete(data, fragment_id, config);
+                let batches = stream.buffered(1).try_collect::<Vec<_>>().await.unwrap();
+
+                // Walk the batches, comparing each against its window of the
+                // expected offsets.
+                let mut offset = 0;
+                let expected = expected.clone();
+                for batch in batches {
+                    let actual_row_ids =
+                        batch[ROW_ID].as_primitive::<UInt64Type>().values().to_vec();
+                    let expected_row_ids = expected[offset..offset + 10]
+                        .iter()
+                        .map(|row_offset| {
+                            RowAddress::new_from_parts(fragment_id, *row_offset).into()
+                        })
+                        .collect::<Vec<u64>>();
+                    assert_eq!(actual_row_ids, expected_row_ids);
+                    offset += batch.num_rows();
+                }
+            }
+        }
+    }
+
+    /// Checks row id generation for each kind of `ReadBatchParams` used here:
+    /// full range, explicit (reversed) indices, bounded range, open-ended
+    /// range and range-to.
+    #[tokio::test]
+    async fn test_row_id() {
+        let some_indices = (0..100).rev().collect::<Vec<u32>>();
+        let some_indices_arr = UInt32Array::from(some_indices.clone());
+        check_row_id(ReadBatchParams::RangeFull, 0..100).await;
+        check_row_id(ReadBatchParams::Indices(some_indices_arr), some_indices).await;
+        check_row_id(ReadBatchParams::Range(1000..1100), 1000..1100).await;
+        check_row_id(
+            ReadBatchParams::RangeFrom(std::ops::RangeFrom { start: 1000 }),
+            1000..1100,
+        )
+        .await;
+        check_row_id(
+            ReadBatchParams::RangeTo(std::ops::RangeTo { end: 1000 }),
+            0..100,
+        )
+        .await;
+    }
+
+    /// Exercises deletion handling across every combination of deletion vector
+    /// kind, presence of a data column, row id column, null-vs-filter mode and
+    /// fragment id, skipping the combinations marked invalid below.
+    #[tokio::test]
+    async fn test_deletes() {
+        let no_deletes: Option<Arc<DeletionVector>> = None;
+        let no_deletes_2 = Some(Arc::new(DeletionVector::NoDeletions));
+        let delete_some_bitmap = Some(Arc::new(DeletionVector::Bitmap(RoaringBitmap::from_iter(
+            0..35,
+        ))));
+        let delete_some_set = Some(Arc::new(DeletionVector::Set((0..35).collect())));
+
+        for deletion_vector in [
+            no_deletes,
+            no_deletes_2,
+            delete_some_bitmap,
+            delete_some_set,
+        ] {
+            for has_columns in [false, true] {
+                for with_row_id in [false, true] {
+                    for make_deletions_null in [false, true] {
+                        for frag_id in [0, 1] {
+                            let has_deletions = if let Some(dv) = &deletion_vector {
+                                !matches!(dv.as_ref(), DeletionVector::NoDeletions)
+                            } else {
+                                false
+                            };
+                            if !has_columns && !has_deletions && !with_row_id {
+                                // This is an invalid case and should be prevented upstream,
+                                // no meaningful work is being done!
+                                continue;
+                            }
+                            if make_deletions_null && !with_row_id {
+                                // This is an invalid case and should be prevented upstream
+                                // we cannot make the row_id column null if it isn't present
+                                continue;
+                            }
+
+                            let mut datagen = lance_datagen::gen_batch();
+                            if has_columns {
+                                datagen =
+                                    datagen.col("x", lance_datagen::array::rand::<Int32Type>());
+                            }
+                            // 100 rows across 10 batches of 10 rows
+                            let data = batch_task_stream(
+                                datagen
+                                    .into_reader_stream(RowCount::from(10), BatchCount::from(10))
+                                    .0,
+                            );
+
+                            let config = RowIdAndDeletesConfig {
+                                params: ReadBatchParams::RangeFull,
+                                with_row_id,
+                                with_row_addr: false,
+                                with_row_last_updated_at_version: false,
+                                with_row_created_at_version: false,
+                                deletion_vector: deletion_vector.clone(),
+                                row_id_sequence: None,
+                                last_updated_at_sequence: None,
+                                created_at_sequence: None,
+                                make_deletions_null,
+                                total_num_rows: 100,
+                            };
+                            let stream = super::wrap_with_row_id_and_delete(data, frag_id, config);
+                            // Collect the batches, dropping those that ended up empty.
+                            let batches = stream
+                                .buffered(1)
+                                .filter_map(|batch| {
+                                    std::future::ready(
+                                        batch
+                                            .map(|batch| {
+                                                if batch.num_rows() == 0 {
+                                                    None
+                                                } else {
+                                                    Some(batch)
+                                                }
+                                            })
+                                            .transpose(),
+                                    )
+                                })
+                                .try_collect::<Vec<_>>()
+                                .await
+                                .unwrap();
+
+                            // A deletion shows up either as a null row id or as a
+                            // row missing from the output.
+                            let total_num_rows =
+                                batches.iter().map(|b| b.num_rows()).sum::<usize>();
+                            let total_num_nulls = if make_deletions_null {
+                                batches
+                                    .iter()
+                                    .map(|b| b[ROW_ID].null_count())
+                                    .sum::<usize>()
+                            } else {
+                                0
+                            };
+                            let total_actually_deleted = total_num_nulls + (100 - total_num_rows);
+
+                            let expected_deletions = match &deletion_vector {
+                                None => 0,
+                                Some(deletion_vector) => match deletion_vector.as_ref() {
+                                    DeletionVector::NoDeletions => 0,
+                                    DeletionVector::Bitmap(b) => b.len() as usize,
+                                    DeletionVector::Set(s) => s.len(),
+                                },
+                            };
+                            assert_eq!(total_actually_deleted, expected_deletions);
+                            if expected_deletions > 0 && with_row_id {
+                                if make_deletions_null {
+                                    // If we make deletions null we get 3 batches of all-null and then
+                                    // a batch of half-null
+                                    assert_eq!(
+                                        batches[3][ROW_ID].as_primitive::<UInt64Type>().value(0),
+                                        u64::from(RowAddress::new_from_parts(frag_id, 30))
+                                    );
+                                    assert_eq!(batches[3][ROW_ID].null_count(), 5);
+                                } else {
+                                    // If we materialize deletions the first row will be 35
+                                    assert_eq!(
+                                        batches[0][ROW_ID].as_primitive::<UInt64Type>().value(0),
+                                        u64::from(RowAddress::new_from_parts(frag_id, 35))
+                                    );
+                                }
+                            }
+                            if !with_row_id {
+                                assert!(batches[0].column_by_name(ROW_ID).is_none());
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /// With a single version run (42) and rows 0..35 deleted, the 65 surviving
+    /// rows must all report created-at version 42.
+    #[tokio::test]
+    async fn test_version_column_with_deletions() {
+        use crate::rowids::segment::U64Segment;
+        use crate::rowids::version::{RowDatasetVersionRun, RowDatasetVersionSequence};
+
+        let seq = Arc::new(RowDatasetVersionSequence {
+            runs: vec![RowDatasetVersionRun {
+                span: U64Segment::Range(0..100),
+                version: 42,
+            }],
+        });
+
+        let data = batch_task_stream(
+            lance_datagen::gen_batch()
+                .col("x", lance_datagen::array::rand::<Int32Type>())
+                .into_reader_stream(RowCount::from(10), BatchCount::from(10))
+                .0,
+        );
+
+        let config = RowIdAndDeletesConfig {
+            params: ReadBatchParams::RangeFull,
+            with_row_id: true,
+            with_row_addr: false,
+            with_row_last_updated_at_version: false,
+            with_row_created_at_version: true,
+            deletion_vector: Some(Arc::new(DeletionVector::Bitmap(RoaringBitmap::from_iter(
+                0..35,
+            )))),
+            row_id_sequence: None,
+            last_updated_at_sequence: None,
+            created_at_sequence: Some(seq),
+            make_deletions_null: false,
+            total_num_rows: 100,
+        };
+        let stream = super::wrap_with_row_id_and_delete(data, 0, config);
+        // Batches that were deleted entirely are dropped before collecting.
+        let batches: Vec<_> = stream
+            .buffered(1)
+            .try_filter(|b| std::future::ready(b.num_rows() > 0))
+            .try_collect()
+            .await
+            .unwrap();
+
+        let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+        assert_eq!(total_rows, 65);
+
+        for batch in &batches {
+            let versions = batch
+                .column_by_name("_row_created_at_version")
+                .unwrap()
+                .as_primitive::<UInt64Type>()
+                .values();
+            assert!(versions.iter().all(|&v| v == 42));
+        }
+    }
+
+    /// With three version runs and deletions that cross a run boundary, each
+    /// surviving row must report the version of the run it came from.
+    #[tokio::test]
+    async fn test_version_column_multi_run() {
+        use crate::rowids::segment::U64Segment;
+        use crate::rowids::version::{RowDatasetVersionRun, RowDatasetVersionSequence};
+
+        // 3 runs: 0..40 v1, 40..70 v2, 70..100 v3
+        let seq = Arc::new(RowDatasetVersionSequence {
+            runs: vec![
+                RowDatasetVersionRun {
+                    span: U64Segment::Range(0..40),
+                    version: 1,
+                },
+                RowDatasetVersionRun {
+                    span: U64Segment::Range(40..70),
+                    version: 2,
+                },
+                RowDatasetVersionRun {
+                    span: U64Segment::Range(70..100),
+                    version: 3,
+                },
+            ],
+        });
+
+        // Delete 0..20 and 60..80 (spans run boundary).
+        // Survivors: 20..40 (v1), 40..60 (v2), 80..100 (v3) = 60 rows
+        let mut deletions = RoaringBitmap::from_iter(0..20);
+        deletions.extend(60..80);
+
+        let data = batch_task_stream(
+            lance_datagen::gen_batch()
+                .col("x", lance_datagen::array::rand::<Int32Type>())
+                .into_reader_stream(RowCount::from(10), BatchCount::from(10))
+                .0,
+        );
+
+        let config = RowIdAndDeletesConfig {
+            params: ReadBatchParams::RangeFull,
+            with_row_id: true,
+            with_row_addr: false,
+            with_row_last_updated_at_version: false,
+            with_row_created_at_version: true,
+            deletion_vector: Some(Arc::new(DeletionVector::Bitmap(deletions))),
+            row_id_sequence: None,
+            last_updated_at_sequence: None,
+            created_at_sequence: Some(seq),
+            make_deletions_null: false,
+            total_num_rows: 100,
+        };
+        let stream = super::wrap_with_row_id_and_delete(data, 0, config);
+        // Batches that were deleted entirely are dropped before collecting.
+        let batches: Vec<_> = stream
+            .buffered(1)
+            .try_filter(|b| std::future::ready(b.num_rows() > 0))
+            .try_collect()
+            .await
+            .unwrap();
+
+        let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+        assert_eq!(total_rows, 60);
+
+        // Flatten the version column of all surviving rows, in output order.
+        let all_versions: Vec<u64> = batches
+            .iter()
+            .flat_map(|b| {
+                b.column_by_name("_row_created_at_version")
+                    .unwrap()
+                    .as_primitive::<UInt64Type>()
+                    .values()
+                    .to_vec()
+            })
+            .collect();
+
+        assert!(all_versions[..20].iter().all(|&v| v == 1));
+        assert!(all_versions[20..40].iter().all(|&v| v == 2));
+        assert!(all_versions[40..60].iter().all(|&v| v == 3));
+    }
+}