diff --git a/crates/omnigraph/src/db/mod.rs b/crates/omnigraph/src/db/mod.rs index d0b292f..ed69797 100644 --- a/crates/omnigraph/src/db/mod.rs +++ b/crates/omnigraph/src/db/mod.rs @@ -3,7 +3,6 @@ pub mod graph_coordinator; pub mod manifest; mod omnigraph; mod recovery_audit; -mod run_registry; mod schema_state; pub(crate) mod write_queue; @@ -15,7 +14,6 @@ pub use omnigraph::{ CleanupPolicyOptions, InitOptions, MergeOutcome, Omnigraph, OpenMode, SchemaApplyOptions, SchemaApplyResult, TableCleanupStats, TableOptimizeStats, }; -pub(crate) use run_registry::is_internal_run_branch; pub(crate) const SCHEMA_APPLY_LOCK_BRANCH: &str = "__schema_apply_lock__"; @@ -69,5 +67,8 @@ pub(crate) fn is_schema_apply_lock_branch(name: &str) -> bool { } pub(crate) fn is_internal_system_branch(name: &str) -> bool { - is_internal_run_branch(name) || is_schema_apply_lock_branch(name) + // Legacy `__run__*` staging branches (Run state machine, removed MR-771) + // are swept off `__manifest` by the v2→v3 internal-schema migration, so the + // only internal branch the engine still creates is the schema-apply lock. + is_schema_apply_lock_branch(name) } diff --git a/crates/omnigraph/src/db/omnigraph.rs b/crates/omnigraph/src/db/omnigraph.rs index 5c92ac3..998eb85 100644 --- a/crates/omnigraph/src/db/omnigraph.rs +++ b/crates/omnigraph/src/db/omnigraph.rs @@ -1444,12 +1444,6 @@ pub(crate) fn normalize_branch_name(branch: &str) -> Result> { } pub(crate) fn ensure_public_branch_ref(branch: &str, operation: &str) -> Result<()> { - if super::is_internal_run_branch(branch) { - return Err(OmniError::manifest(format!( - "{} does not allow internal run ref '{}'", - operation, branch - ))); - } if is_internal_system_branch(branch) { return Err(OmniError::manifest(format!( "{} does not allow internal system ref '{}'", @@ -1853,7 +1847,6 @@ fn json_value_from_array(array: &dyn Array, row: usize) -> Result Company #[tokio::test] async fn test_apply_schema_succeeds_after_load() { // Historical: schema apply used to be blocked by leftover - // `__run__` branches. A defense-in-depth filter now skips - // internal system branches, and run branches were made - // ephemeral on every terminal state — so in practice no - // `__run__` branch survives publish. The filter still guards - // the invariant. + // `__run__` branches. The Run state machine was removed in + // MR-771, so a fresh graph never creates a `__run__` branch; + // legacy ones are swept by the v2→v3 manifest migration. This + // asserts the invariant a current graph upholds: publish leaves + // no `__run__` branch behind, so schema apply proceeds. let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); @@ -2210,8 +2203,8 @@ edge WorksAt: Person -> Company let all_branches = db.coordinator.read().await.all_branches().await.unwrap(); assert!( - !all_branches.iter().any(|b| is_internal_run_branch(b)), - "run branch should be deleted after publish, got: {:?}", + !all_branches.iter().any(|b| b.starts_with("__run__")), + "no __run__ branch should exist after publish, got: {:?}", all_branches ); diff --git a/crates/omnigraph/src/db/run_registry.rs b/crates/omnigraph/src/db/run_registry.rs deleted file mode 100644 index ee3d336..0000000 --- a/crates/omnigraph/src/db/run_registry.rs +++ /dev/null @@ -1,16 +0,0 @@ -// The Run state machine has been removed. Mutations now write directly -// to target tables and use the publisher's `expected_table_versions` -// CAS for cross-table OCC; `__run__` staging branches and the -// `_graph_runs.lance` state machine no longer exist. -// -// What remains is the branch-name predicate, kept as a defense-in-depth -// guard against users naming a public branch `__run__*`. A future -// production sweep of legacy `_graph_runs.lance` rows and stale -// `__run__*` branches will let this predicate (and this file) go too. - -pub(crate) const INTERNAL_RUN_BRANCH_PREFIX: &str = "__run__"; - -pub(crate) fn is_internal_run_branch(name: &str) -> bool { - name.trim_start_matches('/') - .starts_with(INTERNAL_RUN_BRANCH_PREFIX) -} diff --git a/crates/omnigraph/src/exec/merge.rs b/crates/omnigraph/src/exec/merge.rs index 2e5f32e..eb6c4a3 100644 --- a/crates/omnigraph/src/exec/merge.rs +++ b/crates/omnigraph/src/exec/merge.rs @@ -1087,9 +1087,9 @@ impl Omnigraph { target: &str, actor_id: Option<&str>, ) -> Result { - if is_internal_run_branch(source) || is_internal_run_branch(target) { + if is_internal_system_branch(source) || is_internal_system_branch(target) { return Err(OmniError::manifest(format!( - "branch_merge does not allow internal run refs ('{}' -> '{}')", + "branch_merge does not allow internal system refs ('{}' -> '{}')", source, target ))); } diff --git a/crates/omnigraph/src/exec/mod.rs b/crates/omnigraph/src/exec/mod.rs index 33a7e41..ce72d42 100644 --- a/crates/omnigraph/src/exec/mod.rs +++ b/crates/omnigraph/src/exec/mod.rs @@ -35,7 +35,7 @@ use time::format_description::well_known::Rfc3339; use crate::db::commit_graph::CommitGraph; use crate::db::manifest::ManifestCoordinator; -use crate::db::{MergeOutcome, Omnigraph, is_internal_run_branch}; +use crate::db::{MergeOutcome, Omnigraph, is_internal_system_branch}; use crate::db::{ReadTarget, Snapshot}; use crate::embedding::EmbeddingClient; use crate::error::{MergeConflict, MergeConflictKind, OmniError, Result}; diff --git a/crates/omnigraph/tests/writes.rs b/crates/omnigraph/tests/writes.rs index 13cb10f..0a309c9 100644 --- a/crates/omnigraph/tests/writes.rs +++ b/crates/omnigraph/tests/writes.rs @@ -371,11 +371,10 @@ async fn cancelled_mutation_future_leaves_no_state() { // Cancel-safety property: no graph-level run/staging state remains. // - // Note: `branch_list()` already filters `__run__*` via - // `is_internal_system_branch`, so a runtime "no `__run__` branches" check - // would be vacuous. The structural property that no `__run__` branches - // can ever be created is enforced by deletion of `begin_run` etc. in - // (verified by the build itself — those symbols no longer exist). + // No `__run__` branches can ever be created: the Run state machine + // (`begin_run` etc.) was deleted in MR-771 — verified by the build itself, + // those symbols no longer exist. Any legacy `__run__*` branch on an + // upgraded graph is swept by the v2→v3 manifest migration. // // (1) The branch list is unchanged: cancellation/completion cannot // synthesize new public branches. @@ -442,34 +441,40 @@ async fn repeated_loads_do_not_accumulate_branches() { assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]); } -/// User code must not be able to write to internal `__run__*` names. -/// The branch-name guard predicate is kept as defense-in-depth; it -/// will be removed once a future production sweep retires the legacy -/// branches. +/// After MR-770, `__run__*` is an ordinary branch name — the Run state machine +/// and its `is_internal_run_branch` guard are gone. The surviving internal-ref +/// guard still rejects the active `__schema_apply_lock__` branch on the public +/// create/merge APIs. #[tokio::test] -async fn public_branch_apis_reject_internal_run_refs() { +async fn public_branch_apis_reject_internal_system_refs() { let dir = tempfile::tempdir().unwrap(); let mut db = init_and_load(&dir).await; - let create_err = db.branch_create("__run__synthetic").await.unwrap_err(); + // `__run__*` is no longer reserved — creating it now succeeds. + db.branch_create("__run__formerly_reserved") + .await + .expect("__run__ prefix is a normal branch name post-MR-770"); + + // The schema-apply lock branch is still rejected on public branch APIs. + let create_err = db.branch_create("__schema_apply_lock__").await.unwrap_err(); let OmniError::Manifest(err) = create_err else { panic!("expected Manifest error"); }; assert!( - err.message.contains("internal run ref"), + err.message.contains("internal system ref"), "unexpected error: {}", err.message ); let merge_err = db - .branch_merge("__run__synthetic", "main") + .branch_merge("__schema_apply_lock__", "main") .await .unwrap_err(); let OmniError::Manifest(err) = merge_err else { panic!("expected Manifest error"); }; assert!( - err.message.contains("internal run refs"), + err.message.contains("internal system refs"), "unexpected error: {}", err.message ); diff --git a/docs/dev/writes.md b/docs/dev/writes.md index 974f7a6..0876ad3 100644 --- a/docs/dev/writes.md +++ b/docs/dev/writes.md @@ -14,8 +14,11 @@ publisher's row-level CAS on `__manifest` is the single fence. - No `RunRecord`, no `_graph_runs.lance`, no `_graph_run_actors.lance`. - No `omnigraph run *` CLI subcommands and no `/runs/*` HTTP endpoints. -- No `__run__` staging branches. (Legacy on-disk artifacts from - pre-MR-771 repos are inert; MR-770 sweeps them in production.) +- No `__run__` staging branches; `__run__*` is no longer a reserved + name. The branch-name guard was removed in MR-770, and any stale + `__run__*` branch on an upgraded graph is swept off `__manifest` by the + v2→v3 internal-schema migration on first read-write open. (The inert + `_graph_runs.lance` bytes remain until a `delete_prefix` primitive lands.) - Cancelled mutation futures leave **no graph-level state** — only orphaned Lance fragments, which the existing `omnigraph cleanup` pipe reclaims. diff --git a/docs/user/audit.md b/docs/user/audit.md index e8abe5b..ab028ac 100644 --- a/docs/user/audit.md +++ b/docs/user/audit.md @@ -4,4 +4,4 @@ - `_as` variants of every write API let callers override the actor: `mutate_as`, `ingest_as`, `branch_merge_as`, `apply_schema_as`, etc. - Actor IDs are persisted on `GraphCommit.actor_id` with split storage in `_graph_commit_actors.lance` (the commit graph is split into `_graph_commits.lance` for the linkage and `_graph_commit_actors.lance` for the actor map). - HTTP server uses the bearer-token actor automatically; CLI uses the local user / explicit env (no implicit actor). -- Pre-v0.4.0 graphs also stored actor IDs on `RunRecord.actor_id` in `_graph_runs.lance` / `_graph_run_actors.lance`. The Run state machine was removed in MR-771; those files are inert post-v0.4.0 and reclaimed by MR-770's production sweep. +- Pre-v0.4.0 graphs also stored actor IDs on `RunRecord.actor_id` in `_graph_runs.lance` / `_graph_run_actors.lance`. The Run state machine was removed in MR-771; those files are inert post-v0.4.0. The v2→v3 manifest migration sweeps any stale `__run__*` branches on first write-open (MR-770); the inert dataset bytes remain until a `delete_prefix` primitive lands. diff --git a/docs/user/branches-commits.md b/docs/user/branches-commits.md index de6c653..02cbe1a 100644 --- a/docs/user/branches-commits.md +++ b/docs/user/branches-commits.md @@ -9,7 +9,7 @@ Lance supports branching at the dataset level: a branch is a named lineage of ve OmniGraph builds *graph branches* on top by branching every sub-table coherently: - `branch_create(name)` / `branch_create_from(target, name)` — disallowed name `main`; fails if branch exists; ensures the schema-apply lock is idle. -- `branch_list()` — returns public branches, **filters internal** `__run__…` and `__schema_apply_lock__` prefixes. +- `branch_list()` — returns public branches, **filters the internal** `__schema_apply_lock__` branch. - `branch_delete(name)` — refuses if there are descendants or active runs on the branch; cleans up owned per-branch fragments. - **Lazy forking**: a branch only forks a sub-table when that sub-table is first mutated on it. Pure-read branches share fragments with their source. - `sync_branch(branch)` — re-binds the in-memory handle to the latest head of the branch. @@ -54,7 +54,7 @@ Notes: Filtered from `branch_list()` but visible to internals: - `__schema_apply_lock__` — serializes schema migrations. -- `__run__` — legacy from the pre-v0.4.0 Run state machine (removed in MR-771). The branch-name guard predicate `is_internal_run_branch` is kept as defense-in-depth so users cannot create a branch matching the legacy prefix; the filter will be removed once production legacy branches are swept (MR-770). +- `__run__` — legacy from the pre-v0.4.0 Run state machine (removed in MR-771). These are swept off `__manifest` on the first read-write open by the v2→v3 internal-schema migration (MR-770), and `__run__*` is no longer a reserved name. Known limitation: a pre-v0.4.0 graph opened **read-only** still surfaces any stale `__run__*` branch in `branch_list()` until its first read-write open (the migration is write-path-only, like all manifest migrations). ## L2 — Recovery audit trail diff --git a/docs/user/constants.md b/docs/user/constants.md index 527aaea..76560e3 100644 --- a/docs/user/constants.md +++ b/docs/user/constants.md @@ -4,11 +4,11 @@ |---|---|---| | `MANIFEST_DIR` | `__manifest` | `db/manifest/layout.rs` | | Commit graph dir | `_graph_commits.lance` | `db/commit_graph.rs` | -| Run registry dir (legacy, removed MR-771) | `_graph_runs.lance` | inert post-v0.4.0; reclaimed by MR-770 | -| Run branch prefix (legacy, removed MR-771) | `__run__` | filtered by `is_internal_run_branch` defense-in-depth | +| Run registry dir (legacy, removed MR-771) | `_graph_runs.lance` | inert post-v0.4.0; bytes remain until a `delete_prefix` primitive lands | +| Run branch prefix (legacy, removed MR-771/MR-770) | `__run__` | swept off `__manifest` by the v2→v3 migration; no longer a reserved name | | Schema apply lock | `__schema_apply_lock__` | `db/mod.rs` | | Manifest publisher retry budget | `PUBLISHER_RETRY_BUDGET = 5` | `db/manifest/publisher.rs` | -| Internal manifest schema version | `INTERNAL_MANIFEST_SCHEMA_VERSION = 2` | `db/manifest/migrations.rs` | +| Internal manifest schema version | `INTERNAL_MANIFEST_SCHEMA_VERSION = 3` | `db/manifest/migrations.rs` | | Merge stage batch | `MERGE_STAGE_BATCH_ROWS = 8192` | `exec/merge.rs` | | Maintenance concurrency | `OMNIGRAPH_MAINTENANCE_CONCURRENCY=8` | `db/omnigraph/optimize.rs` | | Graph index cache size | `8` (LRU) | `runtime_cache.rs` | diff --git a/docs/user/storage.md b/docs/user/storage.md index c22d4d6..e71fe71 100644 --- a/docs/user/storage.md +++ b/docs/user/storage.md @@ -22,7 +22,7 @@ OmniGraph is **not** a single Lance dataset; it is a *graph* of datasets coordin - `edges/{fnv1a64-hex(edge_type_name)}` — one Lance dataset per edge type - `__manifest/` — the catalog of all sub-tables and their published versions - `_graph_commits.lance` / `_graph_commit_actors.lance` — the commit graph and its actor map - - (legacy `_graph_runs.lance` / `_graph_run_actors.lance` from pre-v0.4.0 graphs are inert; the run state machine was removed in MR-771 and these files are cleaned up via MR-770's production sweep) + - (legacy `_graph_runs.lance` / `_graph_run_actors.lance` from pre-v0.4.0 graphs are inert; the run state machine was removed in MR-771. The v2→v3 manifest migration sweeps stale `__run__*` branches on first write-open; the inert dataset bytes themselves remain until a `delete_prefix` storage primitive lands) - **Manifest row schema** (`object_id, object_type, location, metadata, base_objects, table_key, table_version, table_branch, row_count`): - `object_type` ∈ `table | table_version | table_tombstone` - `table_key` ∈ `node: | edge:` @@ -91,7 +91,7 @@ flowchart TB - **Graph root** is one directory (or S3 prefix). Everything below is part of one OmniGraph graph. - **`__manifest/`** is a Lance dataset whose rows describe which sub-table version is published at which graph-branch. Reading a snapshot starts here. - **`nodes/`** and **`edges/`** are sibling directories holding one Lance dataset per declared type. Names are `fnv1a64-hex` of the type name to keep paths fixed-length and case-safe. -- **`_graph_commits.lance`** is an L2 dataset that records the graph-level commit DAG, with a paired `_graph_commit_actors.lance` for the actor map. (Pre-v0.4.0 graphs also have inert `_graph_runs.lance` / `_graph_run_actors.lance` from the removed Run state machine; MR-770 sweeps these in production.) +- **`_graph_commits.lance`** is an L2 dataset that records the graph-level commit DAG, with a paired `_graph_commit_actors.lance` for the actor map. (Pre-v0.4.0 graphs also have inert `_graph_runs.lance` / `_graph_run_actors.lance` from the removed Run state machine; the v2→v3 migration sweeps their stale `__run__*` branches, and the dataset bytes are reclaimed once `delete_prefix` lands.) - **`_graph_commit_recoveries.lance`** — one row per recovery sweep action. Joined to `_graph_commits.lance` by `graph_commit_id`; the linked commit row carries `actor_id=omnigraph:recovery`. Operators correlate recoveries with the original mutations they rolled forward / back via this join. See `crates/omnigraph/src/db/recovery_audit.rs`. - **`__recovery/{ulid}.json`** — transient sidecar files written by the four migrated writers (`MutationStaging::finalize`, `schema_apply`, `branch_merge`, `ensure_indices`) before Phase B begins, deleted after Phase C succeeds. A sidecar persisting after process exit means the writer crashed in the Phase B → Phase C window; the next `Omnigraph::open` recovery sweep processes it. Steady-state directory is empty. See `crates/omnigraph/src/db/manifest/recovery.rs`. - **`_refs/branches/{name}.json`** is graph-level branch metadata — pointers from a branch name to the manifest version it heads.