mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-06-09 01:35:18 +02:00
refactor(engine): remove the legacy __run__ branch guard (MR-770)
With the v2→v3 migration sweeping stale `__run__*` branches off `__manifest` on first read-write open, the defense-in-depth `is_internal_run_branch` guard is no longer needed.

- delete `db/run_registry.rs`; drop the module + re-export from `db/mod.rs`
- collapse `is_internal_system_branch` to the schema-apply-lock check only
- `ensure_public_branch_ref`: drop the run-ref rejection; `__run__*` is now an ordinary branch name
- `branch_merge`: reject `is_internal_system_branch` (was run-only) so the schema-apply lock is rejected consistently with create/delete — a small, deliberate tightening
- update the inline schema-apply test + the writes integration tests (`public_branch_apis_reject_internal_run_refs` → `public_branch_apis_reject_internal_system_refs`, which also asserts `__run__*` now creates successfully)
- docs: flip the "pending production sweep / defense-in-depth" notes to "auto-swept by the v2→v3 migration"; document the read-only-open limitation

Known residual: the inert `_graph_runs.lance` / `_graph_run_actors.lance` bytes remain until a `StorageAdapter::delete_prefix` primitive lands.
This commit is contained in:
parent
1aab951c25
commit
4ed2313a80
11 changed files with 46 additions and 60 deletions
|
|
@ -3,7 +3,6 @@ pub mod graph_coordinator;
|
|||
pub mod manifest;
|
||||
mod omnigraph;
|
||||
mod recovery_audit;
|
||||
mod run_registry;
|
||||
mod schema_state;
|
||||
pub(crate) mod write_queue;
|
||||
|
||||
|
|
@ -15,7 +14,6 @@ pub use omnigraph::{
|
|||
CleanupPolicyOptions, InitOptions, MergeOutcome, Omnigraph, OpenMode, SchemaApplyOptions,
|
||||
SchemaApplyResult, TableCleanupStats, TableOptimizeStats,
|
||||
};
|
||||
pub(crate) use run_registry::is_internal_run_branch;
|
||||
|
||||
pub(crate) const SCHEMA_APPLY_LOCK_BRANCH: &str = "__schema_apply_lock__";
|
||||
|
||||
|
|
@ -69,5 +67,8 @@ pub(crate) fn is_schema_apply_lock_branch(name: &str) -> bool {
|
|||
}
|
||||
|
||||
pub(crate) fn is_internal_system_branch(name: &str) -> bool {
|
||||
is_internal_run_branch(name) || is_schema_apply_lock_branch(name)
|
||||
// Legacy `__run__*` staging branches (Run state machine, removed MR-771)
|
||||
// are swept off `__manifest` by the v2→v3 internal-schema migration, so the
|
||||
// only internal branch the engine still creates is the schema-apply lock.
|
||||
is_schema_apply_lock_branch(name)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1444,12 +1444,6 @@ pub(crate) fn normalize_branch_name(branch: &str) -> Result<Option<String>> {
|
|||
}
|
||||
|
||||
pub(crate) fn ensure_public_branch_ref(branch: &str, operation: &str) -> Result<()> {
|
||||
if super::is_internal_run_branch(branch) {
|
||||
return Err(OmniError::manifest(format!(
|
||||
"{} does not allow internal run ref '{}'",
|
||||
operation, branch
|
||||
)));
|
||||
}
|
||||
if is_internal_system_branch(branch) {
|
||||
return Err(OmniError::manifest(format!(
|
||||
"{} does not allow internal system ref '{}'",
|
||||
|
|
@ -1853,7 +1847,6 @@ fn json_value_from_array(array: &dyn Array, row: usize) -> Result<serde_json::Va
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::db::is_internal_run_branch;
|
||||
use crate::db::manifest::ManifestCoordinator;
|
||||
use async_trait::async_trait;
|
||||
use serde_json::Value;
|
||||
|
|
@ -2191,11 +2184,11 @@ edge WorksAt: Person -> Company
|
|||
#[tokio::test]
|
||||
async fn test_apply_schema_succeeds_after_load() {
|
||||
// Historical: schema apply used to be blocked by leftover
|
||||
// `__run__` branches. A defense-in-depth filter now skips
|
||||
// internal system branches, and run branches were made
|
||||
// ephemeral on every terminal state — so in practice no
|
||||
// `__run__` branch survives publish. The filter still guards
|
||||
// the invariant.
|
||||
// `__run__` branches. The Run state machine was removed in
|
||||
// MR-771, so a fresh graph never creates a `__run__` branch;
|
||||
// legacy ones are swept by the v2→v3 manifest migration. This
|
||||
// asserts the invariant a current graph upholds: publish leaves
|
||||
// no `__run__` branch behind, so schema apply proceeds.
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let uri = dir.path().to_str().unwrap();
|
||||
let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
|
||||
|
|
@ -2210,8 +2203,8 @@ edge WorksAt: Person -> Company
|
|||
|
||||
let all_branches = db.coordinator.read().await.all_branches().await.unwrap();
|
||||
assert!(
|
||||
!all_branches.iter().any(|b| is_internal_run_branch(b)),
|
||||
"run branch should be deleted after publish, got: {:?}",
|
||||
!all_branches.iter().any(|b| b.starts_with("__run__")),
|
||||
"no __run__ branch should exist after publish, got: {:?}",
|
||||
all_branches
|
||||
);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,16 +0,0 @@
|
|||
// The Run state machine has been removed. Mutations now write directly
|
||||
// to target tables and use the publisher's `expected_table_versions`
|
||||
// CAS for cross-table OCC; `__run__<id>` staging branches and the
|
||||
// `_graph_runs.lance` state machine no longer exist.
|
||||
//
|
||||
// What remains is the branch-name predicate, kept as a defense-in-depth
|
||||
// guard against users naming a public branch `__run__*`. A future
|
||||
// production sweep of legacy `_graph_runs.lance` rows and stale
|
||||
// `__run__*` branches will let this predicate (and this file) go too.
|
||||
|
||||
pub(crate) const INTERNAL_RUN_BRANCH_PREFIX: &str = "__run__";
|
||||
|
||||
pub(crate) fn is_internal_run_branch(name: &str) -> bool {
|
||||
name.trim_start_matches('/')
|
||||
.starts_with(INTERNAL_RUN_BRANCH_PREFIX)
|
||||
}
|
||||
|
|
@ -1087,9 +1087,9 @@ impl Omnigraph {
|
|||
target: &str,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<MergeOutcome> {
|
||||
if is_internal_run_branch(source) || is_internal_run_branch(target) {
|
||||
if is_internal_system_branch(source) || is_internal_system_branch(target) {
|
||||
return Err(OmniError::manifest(format!(
|
||||
"branch_merge does not allow internal run refs ('{}' -> '{}')",
|
||||
"branch_merge does not allow internal system refs ('{}' -> '{}')",
|
||||
source, target
|
||||
)));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ use time::format_description::well_known::Rfc3339;
|
|||
|
||||
use crate::db::commit_graph::CommitGraph;
|
||||
use crate::db::manifest::ManifestCoordinator;
|
||||
use crate::db::{MergeOutcome, Omnigraph, is_internal_run_branch};
|
||||
use crate::db::{MergeOutcome, Omnigraph, is_internal_system_branch};
|
||||
use crate::db::{ReadTarget, Snapshot};
|
||||
use crate::embedding::EmbeddingClient;
|
||||
use crate::error::{MergeConflict, MergeConflictKind, OmniError, Result};
|
||||
|
|
|
|||
|
|
@ -371,11 +371,10 @@ async fn cancelled_mutation_future_leaves_no_state() {
|
|||
|
||||
// Cancel-safety property: no graph-level run/staging state remains.
|
||||
//
|
||||
// Note: `branch_list()` already filters `__run__*` via
|
||||
// `is_internal_system_branch`, so a runtime "no `__run__` branches" check
|
||||
// would be vacuous. The structural property that no `__run__` branches
|
||||
// can ever be created is enforced by deletion of `begin_run` etc. in
|
||||
// (verified by the build itself — those symbols no longer exist).
|
||||
// No `__run__` branches can ever be created: the Run state machine
|
||||
// (`begin_run` etc.) was deleted in MR-771 — verified by the build itself,
|
||||
// those symbols no longer exist. Any legacy `__run__*` branch on an
|
||||
// upgraded graph is swept by the v2→v3 manifest migration.
|
||||
//
|
||||
// (1) The branch list is unchanged: cancellation/completion cannot
|
||||
// synthesize new public branches.
|
||||
|
|
@ -442,34 +441,40 @@ async fn repeated_loads_do_not_accumulate_branches() {
|
|||
assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]);
|
||||
}
|
||||
|
||||
/// User code must not be able to write to internal `__run__*` names.
|
||||
/// The branch-name guard predicate is kept as defense-in-depth; it
|
||||
/// will be removed once a future production sweep retires the legacy
|
||||
/// branches.
|
||||
/// After MR-770, `__run__*` is an ordinary branch name — the Run state machine
|
||||
/// and its `is_internal_run_branch` guard are gone. The surviving internal-ref
|
||||
/// guard still rejects the active `__schema_apply_lock__` branch on the public
|
||||
/// create/merge APIs.
|
||||
#[tokio::test]
|
||||
async fn public_branch_apis_reject_internal_run_refs() {
|
||||
async fn public_branch_apis_reject_internal_system_refs() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let mut db = init_and_load(&dir).await;
|
||||
|
||||
let create_err = db.branch_create("__run__synthetic").await.unwrap_err();
|
||||
// `__run__*` is no longer reserved — creating it now succeeds.
|
||||
db.branch_create("__run__formerly_reserved")
|
||||
.await
|
||||
.expect("__run__ prefix is a normal branch name post-MR-770");
|
||||
|
||||
// The schema-apply lock branch is still rejected on public branch APIs.
|
||||
let create_err = db.branch_create("__schema_apply_lock__").await.unwrap_err();
|
||||
let OmniError::Manifest(err) = create_err else {
|
||||
panic!("expected Manifest error");
|
||||
};
|
||||
assert!(
|
||||
err.message.contains("internal run ref"),
|
||||
err.message.contains("internal system ref"),
|
||||
"unexpected error: {}",
|
||||
err.message
|
||||
);
|
||||
|
||||
let merge_err = db
|
||||
.branch_merge("__run__synthetic", "main")
|
||||
.branch_merge("__schema_apply_lock__", "main")
|
||||
.await
|
||||
.unwrap_err();
|
||||
let OmniError::Manifest(err) = merge_err else {
|
||||
panic!("expected Manifest error");
|
||||
};
|
||||
assert!(
|
||||
err.message.contains("internal run refs"),
|
||||
err.message.contains("internal system refs"),
|
||||
"unexpected error: {}",
|
||||
err.message
|
||||
);
|
||||
|
|
|
|||
|
|
@ -14,8 +14,11 @@ publisher's row-level CAS on `__manifest` is the single fence.
|
|||
|
||||
- No `RunRecord`, no `_graph_runs.lance`, no `_graph_run_actors.lance`.
|
||||
- No `omnigraph run *` CLI subcommands and no `/runs/*` HTTP endpoints.
|
||||
- No `__run__<id>` staging branches. (Legacy on-disk artifacts from
|
||||
pre-MR-771 repos are inert; MR-770 sweeps them in production.)
|
||||
- No `__run__<id>` staging branches; `__run__*` is no longer a reserved
|
||||
name. The branch-name guard was removed in MR-770, and any stale
|
||||
`__run__*` branch on an upgraded graph is swept off `__manifest` by the
|
||||
v2→v3 internal-schema migration on first read-write open. (The inert
|
||||
`_graph_runs.lance` / `_graph_run_actors.lance` bytes remain until a `delete_prefix` primitive lands.)
|
||||
- Cancelled mutation futures leave **no graph-level state** — only orphaned
|
||||
Lance fragments, which the existing `omnigraph cleanup` pipe reclaims.
|
||||
|
||||
|
|
|
|||
|
|
@ -4,4 +4,4 @@
|
|||
- `_as` variants of every write API let callers override the actor: `mutate_as`, `ingest_as`, `branch_merge_as`, `apply_schema_as`, etc.
|
||||
- Actor IDs are persisted on `GraphCommit.actor_id` with split storage in `_graph_commit_actors.lance` (the commit graph is split into `_graph_commits.lance` for the linkage and `_graph_commit_actors.lance` for the actor map).
|
||||
- HTTP server uses the bearer-token actor automatically; CLI uses the local user / explicit env (no implicit actor).
|
||||
- Pre-v0.4.0 graphs also stored actor IDs on `RunRecord.actor_id` in `_graph_runs.lance` / `_graph_run_actors.lance`. The Run state machine was removed in MR-771; those files are inert post-v0.4.0 and reclaimed by MR-770's production sweep.
|
||||
- Pre-v0.4.0 graphs also stored actor IDs on `RunRecord.actor_id` in `_graph_runs.lance` / `_graph_run_actors.lance`. The Run state machine was removed in MR-771; those files are inert post-v0.4.0. The v2→v3 manifest migration sweeps any stale `__run__*` branches on first write-open (MR-770); the inert dataset bytes remain until a `delete_prefix` primitive lands.
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ Lance supports branching at the dataset level: a branch is a named lineage of ve
|
|||
OmniGraph builds *graph branches* on top by branching every sub-table coherently:
|
||||
|
||||
- `branch_create(name)` / `branch_create_from(target, name)` — disallowed name `main`; fails if branch exists; ensures the schema-apply lock is idle.
|
||||
- `branch_list()` — returns public branches, **filters internal** `__run__…` and `__schema_apply_lock__` prefixes.
|
||||
- `branch_list()` — returns public branches, **filters the internal** `__schema_apply_lock__` branch.
|
||||
- `branch_delete(name)` — refuses if there are descendants or active runs on the branch; cleans up owned per-branch fragments.
|
||||
- **Lazy forking**: a branch only forks a sub-table when that sub-table is first mutated on it. Pure-read branches share fragments with their source.
|
||||
- `sync_branch(branch)` — re-binds the in-memory handle to the latest head of the branch.
|
||||
|
|
@ -54,7 +54,7 @@ Notes:
|
|||
Filtered from `branch_list()` but visible to internals:
|
||||
|
||||
- `__schema_apply_lock__` — serializes schema migrations.
|
||||
- `__run__<run-id>` — legacy from the pre-v0.4.0 Run state machine (removed in MR-771). The branch-name guard predicate `is_internal_run_branch` is kept as defense-in-depth so users cannot create a branch matching the legacy prefix; the filter will be removed once production legacy branches are swept (MR-770).
|
||||
- `__run__<run-id>` — legacy from the pre-v0.4.0 Run state machine (removed in MR-771). These are swept off `__manifest` on the first read-write open by the v2→v3 internal-schema migration (MR-770), and `__run__*` is no longer a reserved name. Known limitation: a pre-v0.4.0 graph opened **read-only** still surfaces any stale `__run__*` branch in `branch_list()` until its first read-write open (the migration is write-path-only, like all manifest migrations).
|
||||
|
||||
## L2 — Recovery audit trail
|
||||
|
||||
|
|
|
|||
|
|
@ -4,11 +4,11 @@
|
|||
|---|---|---|
|
||||
| `MANIFEST_DIR` | `__manifest` | `db/manifest/layout.rs` |
|
||||
| Commit graph dir | `_graph_commits.lance` | `db/commit_graph.rs` |
|
||||
| Run registry dir (legacy, removed MR-771) | `_graph_runs.lance` | inert post-v0.4.0; reclaimed by MR-770 |
|
||||
| Run branch prefix (legacy, removed MR-771) | `__run__` | filtered by `is_internal_run_branch` defense-in-depth |
|
||||
| Run registry dir (legacy, removed MR-771) | `_graph_runs.lance` | inert post-v0.4.0; bytes remain until a `delete_prefix` primitive lands |
|
||||
| Run branch prefix (legacy; Run state machine removed in MR-771, name guard removed in MR-770) | `__run__` | swept off `__manifest` by the v2→v3 migration; no longer a reserved name |
|
||||
| Schema apply lock | `__schema_apply_lock__` | `db/mod.rs` |
|
||||
| Manifest publisher retry budget | `PUBLISHER_RETRY_BUDGET = 5` | `db/manifest/publisher.rs` |
|
||||
| Internal manifest schema version | `INTERNAL_MANIFEST_SCHEMA_VERSION = 2` | `db/manifest/migrations.rs` |
|
||||
| Internal manifest schema version | `INTERNAL_MANIFEST_SCHEMA_VERSION = 3` | `db/manifest/migrations.rs` |
|
||||
| Merge stage batch | `MERGE_STAGE_BATCH_ROWS = 8192` | `exec/merge.rs` |
|
||||
| Maintenance concurrency | `OMNIGRAPH_MAINTENANCE_CONCURRENCY=8` | `db/omnigraph/optimize.rs` |
|
||||
| Graph index cache size | `8` (LRU) | `runtime_cache.rs` |
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ OmniGraph is **not** a single Lance dataset; it is a *graph* of datasets coordin
|
|||
- `edges/{fnv1a64-hex(edge_type_name)}` — one Lance dataset per edge type
|
||||
- `__manifest/` — the catalog of all sub-tables and their published versions
|
||||
- `_graph_commits.lance` / `_graph_commit_actors.lance` — the commit graph and its actor map
|
||||
- (legacy `_graph_runs.lance` / `_graph_run_actors.lance` from pre-v0.4.0 graphs are inert; the run state machine was removed in MR-771 and these files are cleaned up via MR-770's production sweep)
|
||||
- (legacy `_graph_runs.lance` / `_graph_run_actors.lance` from pre-v0.4.0 graphs are inert; the run state machine was removed in MR-771. The v2→v3 manifest migration sweeps stale `__run__*` branches on first write-open; the inert dataset bytes themselves remain until a `delete_prefix` storage primitive lands)
|
||||
- **Manifest row schema** (`object_id, object_type, location, metadata, base_objects, table_key, table_version, table_branch, row_count`):
|
||||
- `object_type` ∈ `table | table_version | table_tombstone`
|
||||
- `table_key` ∈ `node:<TypeName> | edge:<EdgeName>`
|
||||
|
|
@ -91,7 +91,7 @@ flowchart TB
|
|||
- **Graph root** is one directory (or S3 prefix). Everything below is part of one OmniGraph graph.
|
||||
- **`__manifest/`** is a Lance dataset whose rows describe which sub-table version is published at which graph-branch. Reading a snapshot starts here.
|
||||
- **`nodes/`** and **`edges/`** are sibling directories holding one Lance dataset per declared type. Names are `fnv1a64-hex` of the type name to keep paths fixed-length and case-safe.
|
||||
- **`_graph_commits.lance`** is an L2 dataset that records the graph-level commit DAG, with a paired `_graph_commit_actors.lance` for the actor map. (Pre-v0.4.0 graphs also have inert `_graph_runs.lance` / `_graph_run_actors.lance` from the removed Run state machine; MR-770 sweeps these in production.)
|
||||
- **`_graph_commits.lance`** is an L2 dataset that records the graph-level commit DAG, with a paired `_graph_commit_actors.lance` for the actor map. (Pre-v0.4.0 graphs also have inert `_graph_runs.lance` / `_graph_run_actors.lance` from the removed Run state machine; the v2→v3 migration sweeps the stale `__run__*` branches off `__manifest`, and the inert dataset bytes remain until a `delete_prefix` primitive lands.)
|
||||
- **`_graph_commit_recoveries.lance`** — one row per recovery sweep action. Joined to `_graph_commits.lance` by `graph_commit_id`; the linked commit row carries `actor_id=omnigraph:recovery`. Operators correlate recoveries with the original mutations they rolled forward / back via this join. See `crates/omnigraph/src/db/recovery_audit.rs`.
|
||||
- **`__recovery/{ulid}.json`** — transient sidecar files written by the four migrated writers (`MutationStaging::finalize`, `schema_apply`, `branch_merge`, `ensure_indices`) before Phase B begins, deleted after Phase C succeeds. A sidecar persisting after process exit means the writer crashed in the Phase B → Phase C window; the next `Omnigraph::open` recovery sweep processes it. Steady-state directory is empty. See `crates/omnigraph/src/db/manifest/recovery.rs`.
|
||||
- **`_refs/branches/{name}.json`** is graph-level branch metadata — pointers from a branch name to the manifest version it heads.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue