From 2121d9f6c37b995ac9d256f0b5fcdd3bb57024be Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Tue, 12 May 2026 16:56:51 -0700 Subject: [PATCH 1/4] docs: storage stable-row-ids reflects every dataset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The L1 capability list claimed the flag was enabled "for the commit-graph and run-registry datasets" — stale. Every Lance dataset OmniGraph creates has enable_stable_row_ids: true; the run-registry datasets are gone since MR-771. Replace with a single paragraph capturing the invariant, the consequences (row-version columns available, CreateIndex × Rewrite not retryable, Lance reader version required), the legacy-dataset constraint (one-way at create, dump-and-reload to migrate), and a pointer to the regression test in staged_writes.rs. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/storage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/storage.md b/docs/storage.md index 825fbbe..b284bc2 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -7,7 +7,7 @@ Every node type and every edge type is its own Lance dataset: - **Columnar Arrow storage**: each property is a column; nullable per Arrow schema. - **Fragments**: data is partitioned into fragments; new writes create new fragments. - **Manifest versioning**: every commit produces a new dataset version; old versions remain readable. -- **Stable row IDs**: enabled by OmniGraph for the commit-graph and run-registry datasets so durable references survive compaction. +- **Stable row IDs**: `enable_stable_row_ids: true` is set on every Lance dataset OmniGraph creates — node and edge data tables, `__manifest`, `_graph_commits.lance`, `_graph_commit_recoveries.lance`, and any future system tables. This is an architectural invariant: the flag is one-way at dataset create per Lance's row-id-lineage spec, so a future change that introduces a Lance dataset must preserve it. Consequences: `_row_created_at_version` and `_row_last_updated_at_version` are available on every dataset (load-bearing for change-feed validators); `CreateIndex × Rewrite` is not a retryable conflict, so indices survive `omnigraph optimize` without needing the Fragment Reuse Index; readers must use a Lance build that recognises the flag (our pinned 4.0.0 is fine). Pre-0.4.x repos created before this code path settled may have datasets without the flag and cannot be retrofitted in place — the supported path is dump-and-reload. The `stage_overwrite` rewrite path (used by `schema_apply`) preserves the flag through `Operation::Overwrite`; pinned by `stage_overwrite_preserves_stable_row_ids` in `crates/omnigraph/tests/staged_writes.rs`. - **Append / delete / `merge_insert`**: native Lance write modes. - **Per-dataset branches** (Lance native): copy-on-write at the dataset level. - **Object-store agnostic**: file://, s3://, gs://, az://, http (read-only via Lance) — OmniGraph wires file:// and s3:// (`storage.rs`). From 549060f297197afe7cd4774c046736add5f1e878 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Tue, 12 May 2026 16:56:58 -0700 Subject: [PATCH 2/4] tests: pin stable-row-id preservation across stage_overwrite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit stage_overwrite is used by schema_apply to rewrite tables when an additive migration touches data. If Lance Operation::Overwrite ever stopped preserving the source dataset's enable_stable_row_ids flag, every schema_apply that triggers a rewrite would silently disable stable row IDs on the affected tables and downstream readers that depend on _rowid stability (change-feed validators, index reconcilers) would observe silent corruption. Empirically Lance 4.0.0 does preserve the flag through Overwrite even when WriteParams omits it — but the preservation isn't documented at the Lance spec level, so pin it here. Any future behaviour change surfaces as a test failure rather than silent corruption. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/omnigraph/tests/staged_writes.rs | 48 +++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/crates/omnigraph/tests/staged_writes.rs b/crates/omnigraph/tests/staged_writes.rs index 88b65e3..30ef28b 100644 --- a/crates/omnigraph/tests/staged_writes.rs +++ b/crates/omnigraph/tests/staged_writes.rs @@ -532,6 +532,54 @@ async fn stage_overwrite_does_not_advance_head_until_commit() { assert_eq!(collect_ids(&after), vec!["zoe"]); } +/// `stage_overwrite` is used by `schema_apply` to rewrite tables when +/// an additive migration touches data. The rewrite MUST preserve the +/// source dataset's `enable_stable_row_ids` flag — otherwise every +/// schema_apply that triggers a rewrite would silently disable stable +/// row IDs on the affected tables, and downstream readers depending on +/// `_rowid` stability (change-feed validators, index reconcilers) would +/// observe silent corruption. +/// +/// Pinned invariant — see `docs/storage.md` "Stable row IDs". +#[tokio::test] +async fn stage_overwrite_preserves_stable_row_ids() { + let dir = tempfile::tempdir().unwrap(); + let uri = format!("{}/people.lance", dir.path().to_str().unwrap()); + let store = TableStore::new(dir.path().to_str().unwrap()); + + // `write_dataset` creates with `enable_stable_row_ids: true` — see + // ADR 0001. We verify that as a precondition so a future change to + // the bootstrap helper that drops the flag surfaces here rather + // than turning this test into a silent no-op. + let ds = TableStore::write_dataset(&uri, person_batch(&[("alice", Some(30))])) + .await + .unwrap(); + assert!( + ds.manifest.uses_stable_row_ids(), + "precondition: TableStore::write_dataset must create datasets \ + with stable row IDs enabled — see ADR 0001" + ); + + let staged = store + .stage_overwrite(&ds, person_batch(&[("zoe", Some(99))])) + .await + .unwrap(); + let new_ds = store + .commit_staged(Arc::new(ds.clone()), staged.transaction) + .await + .unwrap(); + + assert!( + new_ds.manifest.uses_stable_row_ids(), + "stage_overwrite + commit_staged must preserve \ + enable_stable_row_ids from the source dataset. If this fails, \ + schema_apply has been silently disabling stable row IDs on \ + every additive migration that triggers a table rewrite. Fix \ + is in WriteParams at table_store.rs::stage_overwrite — see \ + ADR 0001." + ); +} + /// `stage_overwrite` semantically REPLACES every committed fragment. /// `removed_fragment_ids` lists every committed fragment so /// `scan_with_staged` shows only the staged rows (not committed + staged). From a30d1cc0dc252f089ee61c863f4ed60048459fab Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Tue, 12 May 2026 16:57:05 -0700 Subject: [PATCH 3/4] engine: stage_overwrite sets enable_stable_row_ids explicitly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defensive — Lance 4.0.0 preserves the source dataset's flag through Operation::Overwrite even when WriteParams omits it (pinned by the prior commit's test), but setting it explicitly matches the public overwrite_dataset path at line 454 and documents the dependency at the call site so a future refactor doesn't accidentally drop it. Setting it on a dataset created without stable row IDs is a no-op per Lance's row-id-lineage spec, so this stays correct for legacy datasets. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/omnigraph/src/table_store.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index 5b3fabf..e69067c 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -782,8 +782,19 @@ impl TableStore { "stage_overwrite called with empty batch".to_string(), )); } + // `enable_stable_row_ids: true` is defensive — empirically Lance 4.0.0 + // preserves the source dataset's flag through `Operation::Overwrite` + // when WriteParams omits it (pinned by + // `stage_overwrite_preserves_stable_row_ids` in tests/staged_writes.rs), + // but setting it explicitly matches the public `overwrite_dataset` + // path and keeps the invariant documented at every Overwrite site + // (see docs/storage.md "Stable row IDs"). Setting it on an existing + // dataset that was created without stable row IDs is a no-op per + // Lance's row-id-lineage spec, so this stays correct for legacy + // datasets. let params = WriteParams { mode: WriteMode::Overwrite, + enable_stable_row_ids: true, allow_external_blob_outside_bases: true, ..Default::default() }; From 3cc5c6a9a29fdb88d5c9903718041371a860cb10 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Tue, 12 May 2026 17:02:14 -0700 Subject: [PATCH 4/4] chore: gitignore the mdrip/ markdown snapshot cache npx mdrip writes fetched-page snapshots under mdrip/. The cache is a local-only working artifact (docs/lance.md is the curated index of upstream Lance pages we fetch on demand). Keep the cache out of the tree. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4207a6f..919d9d8 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,4 @@ demo/*.omni/ /schema-improvements.md /schema-issues.md /schema-prior-art.md +mdrip/