mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-06-09 01:35:18 +02:00
Merge pull request #85 from ModernRelay/ragnorc/survey-state
engine: pin stable-row-id preservation through stage_overwrite
This commit is contained in:
commit
53d41a30b4
4 changed files with 61 additions and 1 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -25,3 +25,4 @@ demo/*.omni/
|
|||
/schema-improvements.md
|
||||
/schema-issues.md
|
||||
/schema-prior-art.md
|
||||
mdrip/
|
||||
|
|
|
|||
|
|
@ -953,8 +953,19 @@ impl TableStore {
|
|||
"stage_overwrite called with empty batch".to_string(),
|
||||
));
|
||||
}
|
||||
// `enable_stable_row_ids: true` is defensive — empirically Lance 4.0.0
|
||||
// preserves the source dataset's flag through `Operation::Overwrite`
|
||||
// when WriteParams omits it (pinned by
|
||||
// `stage_overwrite_preserves_stable_row_ids` in tests/staged_writes.rs),
|
||||
// but setting it explicitly matches the public `overwrite_dataset`
|
||||
// path and keeps the invariant documented at every Overwrite site
|
||||
// (see docs/storage.md "Stable row IDs"). Setting it on an existing
|
||||
// dataset that was created without stable row IDs is a no-op per
|
||||
// Lance's row-id-lineage spec, so this stays correct for legacy
|
||||
// datasets.
|
||||
let params = WriteParams {
|
||||
mode: WriteMode::Overwrite,
|
||||
enable_stable_row_ids: true,
|
||||
allow_external_blob_outside_bases: true,
|
||||
..Default::default()
|
||||
};
|
||||
|
|
|
|||
|
|
@ -532,6 +532,54 @@ async fn stage_overwrite_does_not_advance_head_until_commit() {
|
|||
assert_eq!(collect_ids(&after), vec!["zoe"]);
|
||||
}
|
||||
|
||||
/// `stage_overwrite` is used by `schema_apply` to rewrite tables when
|
||||
/// an additive migration touches data. The rewrite MUST preserve the
|
||||
/// source dataset's `enable_stable_row_ids` flag — otherwise every
|
||||
/// schema_apply that triggers a rewrite would silently disable stable
|
||||
/// row IDs on the affected tables, and downstream readers depending on
|
||||
/// `_rowid` stability (change-feed validators, index reconcilers) would
|
||||
/// observe silent corruption.
|
||||
///
|
||||
/// Pinned invariant — see `docs/storage.md` "Stable row IDs".
|
||||
#[tokio::test]
|
||||
async fn stage_overwrite_preserves_stable_row_ids() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let uri = format!("{}/people.lance", dir.path().to_str().unwrap());
|
||||
let store = TableStore::new(dir.path().to_str().unwrap());
|
||||
|
||||
// `write_dataset` creates with `enable_stable_row_ids: true` — see
|
||||
// ADR 0001. We verify that as a precondition so a future change to
|
||||
// the bootstrap helper that drops the flag surfaces here rather
|
||||
// than turning this test into a silent no-op.
|
||||
let ds = TableStore::write_dataset(&uri, person_batch(&[("alice", Some(30))]))
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(
|
||||
ds.manifest.uses_stable_row_ids(),
|
||||
"precondition: TableStore::write_dataset must create datasets \
|
||||
with stable row IDs enabled — see ADR 0001"
|
||||
);
|
||||
|
||||
let staged = store
|
||||
.stage_overwrite(&ds, person_batch(&[("zoe", Some(99))]))
|
||||
.await
|
||||
.unwrap();
|
||||
let new_ds = store
|
||||
.commit_staged(Arc::new(ds.clone()), staged.transaction)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert!(
|
||||
new_ds.manifest.uses_stable_row_ids(),
|
||||
"stage_overwrite + commit_staged must preserve \
|
||||
enable_stable_row_ids from the source dataset. If this fails, \
|
||||
schema_apply has been silently disabling stable row IDs on \
|
||||
every additive migration that triggers a table rewrite. Fix \
|
||||
is in WriteParams at table_store.rs::stage_overwrite — see \
|
||||
ADR 0001."
|
||||
);
|
||||
}
|
||||
|
||||
/// `stage_overwrite` semantically REPLACES every committed fragment.
|
||||
/// `removed_fragment_ids` lists every committed fragment so
|
||||
/// `scan_with_staged` shows only the staged rows (not committed + staged).
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ Every node type and every edge type is its own Lance dataset:
|
|||
- **Columnar Arrow storage**: each property is a column; nullable per Arrow schema.
|
||||
- **Fragments**: data is partitioned into fragments; new writes create new fragments.
|
||||
- **Manifest versioning**: every commit produces a new dataset version; old versions remain readable.
|
||||
- **Stable row IDs**: enabled by OmniGraph for the commit-graph and run-registry datasets so durable references survive compaction.
|
||||
- **Stable row IDs**: `enable_stable_row_ids: true` is set on every Lance dataset OmniGraph creates — node and edge data tables, `__manifest`, `_graph_commits.lance`, `_graph_commit_recoveries.lance`, and any future system tables. This is an architectural invariant: the flag is one-way at dataset create per Lance's row-id-lineage spec, so a future change that introduces a Lance dataset must preserve it. Consequences: `_row_created_at_version` and `_row_last_updated_at_version` are available on every dataset (load-bearing for change-feed validators); `CreateIndex × Rewrite` is not a retryable conflict, so indices survive `omnigraph optimize` without needing the Fragment Reuse Index; readers must use a Lance build that recognises the flag (our pinned 4.0.0 is fine). Pre-0.4.x repos created before this code path settled may have datasets without the flag and cannot be retrofitted in place — the supported path is dump-and-reload. The `stage_overwrite` rewrite path (used by `schema_apply`) preserves the flag through `Operation::Overwrite`; pinned by `stage_overwrite_preserves_stable_row_ids` in `crates/omnigraph/tests/staged_writes.rs`.
|
||||
- **Append / delete / `merge_insert`**: native Lance write modes.
|
||||
- **Per-dataset branches** (Lance native): copy-on-write at the dataset level.
|
||||
- **Object-store agnostic**: file://, s3://, gs://, az://, http (read-only via Lance) — OmniGraph wires file:// and s3:// (`storage.rs`).
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue