diff --git a/crates/omnigraph/src/db/manifest.rs b/crates/omnigraph/src/db/manifest.rs index 7fcf7de..3b2886f 100644 --- a/crates/omnigraph/src/db/manifest.rs +++ b/crates/omnigraph/src/db/manifest.rs @@ -48,6 +48,22 @@ const OBJECT_TYPE_TABLE_VERSION: &str = "table_version"; const OBJECT_TYPE_TABLE_TOMBSTONE: &str = "table_tombstone"; const TABLE_VERSION_MANAGEMENT_KEY: &str = "table_version_management"; +/// Apply pending internal-schema migrations against `__manifest` on the +/// open-for-write path, independent of a publish. +/// +/// `Omnigraph::open(ReadWrite)` calls this before the coordinator reads branch +/// state, so branch-observing code (`branch_list`, the schema-apply +/// blocking-branch checks) sees the post-migration graph. In particular the +/// v2→v3 step sweeps legacy `__run__*` staging branches off `__manifest` +/// (MR-770); running it here closes the window where those branches would +/// otherwise block schema apply before the first publish runs the migration. +/// +/// Idempotent: a no-op stamp read when the on-disk version already matches. +pub(crate) async fn migrate_on_open(root_uri: &str) -> Result<()> { + let mut dataset = open_manifest_dataset(root_uri, None).await?; + migrations::migrate_internal_schema(&mut dataset).await +} + /// Immutable point-in-time view of the database. /// /// Cheap to create (no storage I/O). All reads within a query go through one diff --git a/crates/omnigraph/src/db/omnigraph.rs b/crates/omnigraph/src/db/omnigraph.rs index 998eb85..39f07d1 100644 --- a/crates/omnigraph/src/db/omnigraph.rs +++ b/crates/omnigraph/src/db/omnigraph.rs @@ -340,6 +340,16 @@ impl Omnigraph { mode: OpenMode, ) -> Result { let root = normalize_root_uri(uri)?; + // Apply pending internal-schema migrations before the coordinator reads + // branch state, so `branch_list` and the schema-apply blocking-branch + // checks observe the post-migration graph — notably the v2→v3 sweep of + // legacy `__run__*` staging branches (MR-770). ReadWrite only: a + // read-only open must not trigger object-store writes, so a read-only + // open of an unmigrated legacy graph still lists `__run__*` until its + // first read-write open (an accepted, documented limitation). + if matches!(mode, OpenMode::ReadWrite) { + crate::db::manifest::migrate_on_open(&root).await?; + } // Open the coordinator first so the schema-staging recovery sweep can // compare its snapshot against any leftover staging files. let mut coordinator = GraphCoordinator::open(&root, Arc::clone(&storage)).await?; @@ -2216,6 +2226,56 @@ edge WorksAt: Person -> Company assert!(result.applied, "schema apply should have applied"); } + /// Regression (MR-770): a pre-v0.4.0 graph that still carries a stale + /// `__run__*` branch on `__manifest` must not block schema apply. The + /// v2→v3 sweep runs in `Omnigraph::open(ReadWrite)` — before the + /// schema-apply blocking-branch check — so apply succeeds with no + /// intervening publish. + /// + /// Confirmed to fail before the open-time migration landed: the reopened + /// graph still listed `__run__legacy`, and `apply_schema` returned + /// "found non-main branches: __run__legacy". + #[tokio::test] + async fn legacy_run_branch_is_swept_on_open_and_does_not_block_schema_apply() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + // Synthesize a legacy graph: a stale `__run__` branch on `__manifest` + // plus the manifest stamp rewound to v2 (pre-sweep). + db.branch_create("__run__legacy").await.unwrap(); + drop(db); + { + let mut ds = lance::Dataset::open(&format!("{}/__manifest", uri)) + .await + .unwrap(); + ds.update_schema_metadata([( + "omnigraph:internal_schema_version".to_string(), + Some("2".to_string()), + )]) + .await + .unwrap(); + } + + // Reopen (ReadWrite): the open-time migration must sweep `__run__legacy` + // before any branch-observing code runs. + let db = Omnigraph::open(uri).await.unwrap(); + let branches = db.branch_list().await.unwrap(); + assert!( + !branches.iter().any(|b| b.starts_with("__run__")), + "open-time migration must sweep legacy __run__ branches; got {branches:?}", + ); + + // Schema apply must proceed with no intervening publish — the + // blocking-branch check no longer sees `__run__legacy`. + let desired = TEST_SCHEMA.replace( + " age: I32?\n}", + " age: I32?\n nickname: String?\n}", + ); + let result = db.apply_schema(&desired).await.unwrap(); + assert!(result.applied, "schema apply should have applied"); + } + #[tokio::test] async fn test_apply_schema_adds_index_for_existing_property() { let dir = tempfile::tempdir().unwrap(); diff --git a/crates/omnigraph/src/db/omnigraph/schema_apply.rs b/crates/omnigraph/src/db/omnigraph/schema_apply.rs index 0dcf0f9..b4ec5e4 100644 --- a/crates/omnigraph/src/db/omnigraph/schema_apply.rs +++ b/crates/omnigraph/src/db/omnigraph/schema_apply.rs @@ -94,11 +94,11 @@ pub(super) async fn apply_schema_with_lock( ) -> Result { db.ensure_schema_state_valid().await?; let branches = db.coordinator.read().await.all_branches().await?; - // Skip `main` and internal system branches. The schema-apply lock branch - // is excluded because it is the cluster-wide schema-apply serializer. - // `__run__*` branches are no longer created; the filter remains as - // defense-in-depth for legacy graphs with leftover staging branches. - // A future production sweep will let this guard go. + // Skip `main` and internal system branches (the schema-apply lock branch, + // the cluster-wide schema-apply serializer). Legacy `__run__*` staging + // branches were swept off `__manifest` by the v2→v3 migration that runs in + // `Omnigraph::open(ReadWrite)` before this check (MR-770), so they no + // longer appear here. let blocking_branches = branches .into_iter() .filter(|branch| branch != "main" && !is_internal_system_branch(branch)) diff --git a/docs/dev/writes.md b/docs/dev/writes.md index 0876ad3..8b692b4 100644 --- a/docs/dev/writes.md +++ b/docs/dev/writes.md @@ -248,9 +248,14 @@ list`. ## Migration code -`db/manifest/migrations.rs` does not change. Active deletion of -`_graph_runs.lance` belongs in MR-770 (the production sweep) — this PR -stops *creating* run state but does not destroy legacy bytes on disk. +`db/manifest/migrations.rs` carries the v2→v3 internal-schema step (MR-770): +a one-time sweep that deletes legacy `__run__*` staging branches off +`__manifest`. It runs in `Omnigraph::open(ReadWrite)` (via +`manifest::migrate_on_open`, before the coordinator reads branch state) and +again on the publisher's write path; both are idempotent once the stamp is at +v3. Deleting the inert `_graph_runs.lance` / `_graph_run_actors.lance` dataset +*bytes* is still deferred — it needs a `StorageAdapter::delete_prefix` +primitive — but those bytes are invisible to graph-level state. ## Mid-query partial failure: closed by MR-794 diff --git a/docs/user/storage.md b/docs/user/storage.md index e71fe71..d1c52b5 100644 --- a/docs/user/storage.md +++ b/docs/user/storage.md @@ -47,6 +47,7 @@ Adding a new on-disk shape change is one constant bump (`INTERNAL_MANIFEST_SCHEM |---|---| | v1 (implicit, pre-stamp) | `__manifest.object_id` had no PK annotation; publisher had no row-level CAS protection. | | v2 | `__manifest.object_id` carries `lance-schema:unenforced-primary-key=true`; row-level CAS engaged. Stamped as `omnigraph:internal_schema_version=2`. | +| v3 | One-time sweep of legacy `__run__*` staging branches (pre-v0.4.0 Run state machine, removed MR-771) off `__manifest`. Runs at `Omnigraph::open(ReadWrite)` and on publish. Stamped as `omnigraph:internal_schema_version=3`. | ## On-disk layout