From fbb86dee0ea1583d8dde675cef8ff3df95a02560 Mon Sep 17 00:00:00 2001 From: aaltshuler Date: Thu, 11 Jun 2026 05:25:53 +0300 Subject: [PATCH 1/7] refactor(cluster): move the in-source test suite to tests.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verbatim move (indentation preserved — embedded raw-string fixtures are content). lib.rs drops from 7,857 to ~4,750 lines; `use super::*` resolves to the crate root through the #[path] module declaration unchanged. 95 tests green before and after. Co-Authored-By: Claude Fable 5 --- crates/omnigraph-cluster/src/lib.rs | 3019 +------------------------ crates/omnigraph-cluster/src/tests.rs | 3019 +++++++++++++++++++++++++ 2 files changed, 3022 insertions(+), 3016 deletions(-) create mode 100644 crates/omnigraph-cluster/src/tests.rs diff --git a/crates/omnigraph-cluster/src/lib.rs b/crates/omnigraph-cluster/src/lib.rs index bb0c66b..da80710 100644 --- a/crates/omnigraph-cluster/src/lib.rs +++ b/crates/omnigraph-cluster/src/lib.rs @@ -4838,3020 +4838,7 @@ fn display_path(path: &Path) -> String { path.display().to_string() } + #[cfg(test)] -mod tests { - use std::fs; - use std::path::Path; - - use omnigraph::db::Omnigraph; - use serde_json::json; - use tempfile::tempdir; - - use super::*; - - const SCHEMA: &str = r#" -node Person { - name: String @key - age: I32? -} -"#; - - const QUERY: &str = r#" -query find_person($name: String) { - match { $p: Person { name: $name } } - return { $p.name, $p.age } -} -"#; - - fn fixture() -> tempfile::TempDir { - let dir = tempdir().unwrap(); - fs::write(dir.path().join("people.pg"), SCHEMA).unwrap(); - fs::write(dir.path().join("people.gq"), QUERY).unwrap(); - fs::write(dir.path().join("base.policy.yaml"), "rules: []\n").unwrap(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - r#" -version: 1 -metadata: - name: test -state: - backend: cluster - lock: true -graphs: - knowledge: - schema: ./people.pg - queries: - find_person: - file: ./people.gq -policies: - base: - file: ./base.policy.yaml - applies_to: [knowledge] -"#, - ) - .unwrap(); - dir - } - - async fn init_derived_graph(root: &Path) { - let graph_dir = root.join(CLUSTER_GRAPHS_DIR); - fs::create_dir_all(&graph_dir).unwrap(); - let graph = graph_dir.join("knowledge.omni"); - Omnigraph::init(graph.to_string_lossy().as_ref(), SCHEMA) - .await - .unwrap(); - } - - fn write_lock_file(config_dir: &Path, lock_id: &str, operation: &str) { - let state_dir = config_dir.join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("lock.json"), - json!({ - "version": 1, - "lock_id": lock_id, - "operation": operation, - "created_at": "1970-01-01T00:00:00Z", - "pid": 123 - }) - .to_string(), - ) - .unwrap(); - } - - #[test] - fn valid_minimal_config() { - let dir = fixture(); - let out = validate_config_dir(dir.path()); - assert!(out.ok, "{:?}", out.diagnostics); - assert!(out.resource_digests.contains_key("graph.knowledge")); - assert!(out.resource_digests.contains_key("schema.knowledge")); - assert!( - out.dependencies - .iter() - .any(|dep| dep.from == "policy.base" && dep.to == "graph.knowledge") - ); - } - - #[test] - fn unknown_field_rejection() { - let dir = fixture(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - "version: 1\ngraphs: {}\nwat: true\n", - ) - .unwrap(); - let out = validate_config_dir(dir.path()); - assert!(!out.ok); - assert!(out.diagnostics[0].message.contains("unknown field")); - } - - #[test] - fn future_phase_field_rejection() { - let dir = fixture(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - "version: 1\ngraphs: {}\npipelines: {}\n", - ) - .unwrap(); - let out = validate_config_dir(dir.path()); - assert!(!out.ok); - assert_eq!(out.diagnostics[0].code, "future_phase_field"); - } - - #[test] - fn duplicate_yaml_key_rejection() { - let dir = fixture(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - "version: 1\ngraphs: {}\ngraphs: {}\n", - ) - .unwrap(); - let out = validate_config_dir(dir.path()); - assert!(!out.ok); - assert_eq!(out.diagnostics[0].code, "duplicate_yaml_key"); - } - - #[test] - fn duplicate_yaml_key_rejection_keeps_quoted_hashes() { - let diagnostics = - duplicate_key_diagnostics("\"name#display\": one\n\"name#display\": two\n"); - assert_eq!(diagnostics.len(), 1); - assert_eq!(diagnostics[0].code, "duplicate_yaml_key"); - } - - #[test] - fn missing_schema_query_and_policy_files() { - let dir = fixture(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - r#" -version: 1 -graphs: - knowledge: - schema: ./missing.pg - queries: - find_person: { file: ./missing.gq } -policies: - base: - file: ./missing.policy.yaml - applies_to: [knowledge] -"#, - ) - .unwrap(); - let out = validate_config_dir(dir.path()); - assert!(!out.ok); - let codes: BTreeSet<_> = out.diagnostics.iter().map(|d| d.code.as_str()).collect(); - assert!(codes.contains("schema_file_missing")); - assert!(codes.contains("query_file_missing")); - assert!(codes.contains("policy_file_missing")); - } - - #[test] - fn wrong_kind_and_dangling_refs_fail() { - let dir = fixture(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - r#" -version: 1 -graphs: - knowledge: - schema: ./people.pg -policies: - base: - file: ./base.policy.yaml - applies_to: [query.knowledge.find_person, missing] -"#, - ) - .unwrap(); - let out = validate_config_dir(dir.path()); - assert!(!out.ok); - let codes: BTreeSet<_> = out.diagnostics.iter().map(|d| d.code.as_str()).collect(); - assert!(codes.contains("wrong_kind_reference")); - assert!(codes.contains("dangling_graph_reference")); - } - - #[test] - fn query_key_mismatch_fails() { - let dir = fixture(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - r#" -version: 1 -graphs: - knowledge: - schema: ./people.pg - queries: - different: { file: ./people.gq } -"#, - ) - .unwrap(); - let out = validate_config_dir(dir.path()); - assert!(!out.ok); - assert_eq!(out.diagnostics[0].code, "query_key_mismatch"); - } - - #[test] - fn query_typecheck_failure_fails() { - let dir = fixture(); - fs::write( - dir.path().join("people.gq"), - "query find_person() { match { $d: DoesNotExist } return { $d.name } }\n", - ) - .unwrap(); - let out = validate_config_dir(dir.path()); - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "query_typecheck_error") - ); - } - - #[tokio::test] - async fn missing_state_plans_creates() { - let dir = fixture(); - let out = plan_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!(!out.state_observations.state_found); - assert!(!out.state_observations.locked); - assert!(out.state_observations.lock_acquired); - assert!( - out.changes - .iter() - .all(|c| c.operation == PlanOperation::Create) - ); - assert!(out.changes.iter().any(|c| c.resource == "graph.knowledge")); - assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); - } - - #[tokio::test] - async fn config_digest_ignores_yaml_comments_and_formatting() { - let dir = fixture(); - let first = plan_config_dir(dir.path()).await; - assert!(first.ok, "{:?}", first.diagnostics); - - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - r#" -# Same semantic config as the fixture, intentionally rendered differently. -version: 1 -metadata: { name: test } -state: { backend: cluster, lock: true } -graphs: - knowledge: - schema: ./people.pg - queries: { find_person: { file: ./people.gq } } -policies: - base: - file: ./base.policy.yaml - applies_to: - - knowledge -"#, - ) - .unwrap(); - - let second = plan_config_dir(dir.path()).await; - assert!(second.ok, "{:?}", second.diagnostics); - assert_eq!( - first.desired_revision.config_digest, - second.desired_revision.config_digest - ); - } - - #[tokio::test] - async fn existing_state_plans_update_and_delete_deterministically() { - let dir = fixture(); - let first = plan_config_dir(dir.path()).await; - let state_dir = dir.path().join("__cluster"); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("state.json"), - serde_json::to_string_pretty(&json!({ - "version": 1, - "applied_revision": { - "config_digest": "old", - "resources": { - "graph.knowledge": { "digest": first.resource_digests["graph.knowledge"] }, - "policy.old": { "digest": "abc" }, - "schema.knowledge": { "digest": "old-schema" } - } - } - })) - .unwrap(), - ) - .unwrap(); - - let out = plan_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - let rendered: Vec<_> = out - .changes - .iter() - .map(|change| (change.resource.as_str(), &change.operation)) - .collect(); - assert_eq!( - rendered, - vec![ - ("policy.base", &PlanOperation::Create), - ("policy.old", &PlanOperation::Delete), - ("query.knowledge.find_person", &PlanOperation::Create), - ("schema.knowledge", &PlanOperation::Update), - ] - ); - } - - #[tokio::test] - async fn old_minimal_state_json_still_plans_with_default_revision() { - let dir = fixture(); - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("state.json"), - r#"{ - "version": 1, - "applied_revision": { - "config_digest": "old", - "resources": { - "graph.knowledge": { "digest": "old-graph" } - } - } -}"#, - ) - .unwrap(); - - let out = plan_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert_eq!(out.state_observations.state_revision, 0); - assert!(out.state_observations.state_cas.is_some()); - assert!(out.changes.iter().any(|change| { - change.resource == "graph.knowledge" && change.operation == PlanOperation::Update - })); - } - - #[test] - fn extended_state_json_status_surfaces_statuses() { - let dir = fixture(); - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - let state = r#"{ - "version": 1, - "state_revision": 42, - "applied_revision": { - "config_digest": "applied-config", - "resources": { - "graph.knowledge": { "digest": "graph-digest" } - } - }, - "resource_statuses": { - "graph.knowledge": { - "status": "applied", - "conditions": ["healthy"], - "message": "ready" - } - }, - "approval_records": {}, - "recovery_records": {}, - "observations": { - "graph.knowledge": { "manifest_version": 12 } - } -}"#; - fs::write(state_dir.join("state.json"), state).unwrap(); - - let out = status_config_dir(dir.path()); - assert!(out.ok, "{:?}", out.diagnostics); - assert!(out.state_observations.state_found); - assert_eq!(out.state_observations.state_revision, 42); - assert_eq!( - out.state_observations.state_cas.as_deref(), - Some(format!("sha256:{}", sha256_hex(state.as_bytes())).as_str()) - ); - assert_eq!( - out.resource_digests - .get("graph.knowledge") - .map(String::as_str), - Some("graph-digest") - ); - assert_eq!( - out.resource_statuses["graph.knowledge"].status, - ResourceLifecycleStatus::Applied - ); - } - - #[test] - fn missing_state_status_succeeds_with_warning() { - let dir = fixture(); - let out = status_config_dir(dir.path()); - assert!(out.ok, "{:?}", out.diagnostics); - assert!(!out.state_observations.state_found); - assert_eq!(out.state_observations.state_revision, 0); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_missing") - ); - } - - #[test] - fn invalid_state_status_fails() { - let dir = fixture(); - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write(state_dir.join("state.json"), "{").unwrap(); - - let out = status_config_dir(dir.path()); - assert!(!out.ok); - assert!(out.state_observations.state_found); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "invalid_state_json") - ); - } - - #[test] - fn status_surfaces_full_lock_metadata() { - let dir = fixture(); - write_lock_file(dir.path(), "held-lock", "refresh"); - - let out = status_config_dir(dir.path()); - assert!(out.ok, "{:?}", out.diagnostics); - assert!(out.state_observations.locked); - assert_eq!(out.state_observations.lock_id.as_deref(), Some("held-lock")); - assert_eq!( - out.state_observations.lock_operation.as_deref(), - Some("refresh") - ); - assert_eq!( - out.state_observations.lock_created_at.as_deref(), - Some("1970-01-01T00:00:00Z") - ); - assert_eq!(out.state_observations.lock_pid, Some(123)); - assert!(out.state_observations.lock_age_seconds.is_some()); - } - - #[test] - fn force_unlock_matching_id_removes_lock() { - let dir = fixture(); - write_lock_file(dir.path(), "held-lock", "plan"); - - let out = force_unlock_config_dir(dir.path(), "held-lock"); - assert!(out.ok, "{:?}", out.diagnostics); - assert!(out.lock_removed); - assert_eq!(out.state_observations.lock_id.as_deref(), Some("held-lock")); - assert_eq!( - out.state_observations.lock_operation.as_deref(), - Some("plan") - ); - assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); - } - - #[test] - fn force_unlock_wrong_id_fails_and_preserves_lock() { - let dir = fixture(); - write_lock_file(dir.path(), "held-lock", "plan"); - - let out = force_unlock_config_dir(dir.path(), "other-lock"); - assert!(!out.ok); - assert!(!out.lock_removed); - assert_eq!(out.state_observations.lock_id.as_deref(), Some("held-lock")); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_lock_id_mismatch") - ); - assert!(dir.path().join(CLUSTER_LOCK_FILE).exists()); - } - - #[test] - fn force_unlock_missing_lock_fails() { - let dir = fixture(); - - let out = force_unlock_config_dir(dir.path(), "held-lock"); - assert!(!out.ok); - assert!(!out.lock_removed); - assert!(!out.state_observations.locked); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_lock_missing") - ); - } - - #[test] - fn force_unlock_invalid_lock_json_fails_and_preserves_lock() { - let dir = fixture(); - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write(state_dir.join("lock.json"), "{").unwrap(); - - let out = force_unlock_config_dir(dir.path(), "held-lock"); - assert!(!out.ok); - assert!(!out.lock_removed); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "invalid_state_lock") - ); - assert!(dir.path().join(CLUSTER_LOCK_FILE).exists()); - } - - #[test] - fn force_unlock_unsupported_lock_version_fails_and_preserves_lock() { - let dir = fixture(); - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("lock.json"), - r#"{"version":2,"lock_id":"held-lock","operation":"plan","created_at":"1970-01-01T00:00:00Z","pid":123}"#, - ) - .unwrap(); - - let out = force_unlock_config_dir(dir.path(), "held-lock"); - assert!(!out.ok); - assert!(!out.lock_removed); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "unsupported_state_lock_version") - ); - assert!(dir.path().join(CLUSTER_LOCK_FILE).exists()); - } - - #[test] - fn force_unlock_external_state_backend_rejected() { - let dir = fixture(); - write_lock_file(dir.path(), "held-lock", "plan"); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - r#" -version: 1 -state: - backend: s3://state-bucket/cluster -graphs: - knowledge: - schema: ./people.pg -"#, - ) - .unwrap(); - - let out = force_unlock_config_dir(dir.path(), "held-lock"); - assert!(!out.ok); - assert!(!out.lock_removed); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "unsupported_state_backend") - ); - assert!(dir.path().join(CLUSTER_LOCK_FILE).exists()); - } - - #[tokio::test] - async fn plan_succeeds_after_force_unlock() { - let dir = fixture(); - write_lock_file(dir.path(), "held-lock", "plan"); - - let locked = plan_config_dir(dir.path()).await; - assert!(!locked.ok); - assert!( - locked - .diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_lock_held") - ); - - let unlocked = force_unlock_config_dir(dir.path(), "held-lock"); - assert!(unlocked.ok, "{:?}", unlocked.diagnostics); - - let out = plan_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - } - - #[tokio::test] - async fn plan_reports_state_cas_revision_and_removes_lock() { - let dir = fixture(); - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - let state = r#"{ - "version": 1, - "state_revision": 7, - "applied_revision": { - "config_digest": "old", - "resources": { - "graph.knowledge": { "digest": "old-graph" } - } - } -}"#; - fs::write(state_dir.join("state.json"), state).unwrap(); - - let out = plan_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert_eq!(out.state_observations.state_revision, 7); - assert_eq!( - out.state_observations.state_cas.as_deref(), - Some(format!("sha256:{}", sha256_hex(state.as_bytes())).as_str()) - ); - assert!(!out.state_observations.locked); - assert!(out.state_observations.lock_id.is_none()); - assert!(out.state_observations.lock_acquired); - assert!(out.state_observations.acquired_lock_id.is_some()); - assert!( - !dir.path().join(CLUSTER_LOCK_FILE).exists(), - "plan must release lock before returning" - ); - } - - #[tokio::test] - async fn existing_lock_makes_plan_fail() { - let dir = fixture(); - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("lock.json"), - r#"{ - "version": 1, - "lock_id": "held-lock", - "operation": "plan", - "created_at": "2026-06-08T00:00:00Z", - "pid": 123 -}"#, - ) - .unwrap(); - - let out = plan_config_dir(dir.path()).await; - assert!(!out.ok); - assert!(out.state_observations.locked); - assert_eq!(out.state_observations.lock_id.as_deref(), Some("held-lock")); - assert!(!out.state_observations.lock_acquired); - assert!(out.state_observations.acquired_lock_id.is_none()); - assert_eq!( - out.state_observations.lock_operation.as_deref(), - Some("plan") - ); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_lock_held") - ); - assert!(out.diagnostics.iter().any(|diagnostic| { - diagnostic.code == "state_lock_held" - && diagnostic.message.contains("force-unlock held-lock") - })); - } - - #[tokio::test] - async fn state_lock_false_bypasses_lock_with_warning() { - let dir = fixture(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - r#" -version: 1 -state: - backend: cluster - lock: false -graphs: - knowledge: - schema: ./people.pg -"#, - ) - .unwrap(); - - let out = plan_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!(!out.state_observations.locked); - assert!(!out.state_observations.lock_acquired); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_lock_disabled") - ); - assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); - } - - #[test] - fn external_state_backend_rejected() { - let dir = fixture(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - "version: 1\nstate:\n backend: s3://bucket/state\ngraphs: {}\n", - ) - .unwrap(); - let out = validate_config_dir(dir.path()); - assert!(!out.ok); - assert_eq!(out.diagnostics[0].code, "unsupported_state_backend"); - } - - #[tokio::test] - async fn external_state_backend_plan_rejected() { - let dir = fixture(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - "version: 1\nstate:\n backend: s3://bucket/state\ngraphs: {}\n", - ) - .unwrap(); - let out = plan_config_dir(dir.path()).await; - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "unsupported_state_backend") - ); - } - - #[tokio::test] - async fn import_missing_state_creates_state_with_graph_observation() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - - let out = import_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert_eq!(out.state_observations.state_revision, 1); - assert!(out.state_observations.state_cas.is_some()); - assert!(!out.state_observations.locked); - assert!(out.state_observations.lock_acquired); - assert!(out.state_observations.acquired_lock_id.is_some()); - assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); - assert_eq!( - out.resource_digests - .get("schema.knowledge") - .map(String::as_str), - Some(sha256_hex(SCHEMA.as_bytes()).as_str()) - ); - assert!(out.observations["graph.knowledge"]["manifest_version"].is_number()); - assert_eq!( - out.observations["graph.knowledge"]["schema_matches_desired"], - true - ); - - let state: serde_json::Value = - serde_json::from_str(&fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap()) - .unwrap(); - assert_eq!(state["state_revision"], 1); - assert_eq!( - state["resource_statuses"]["graph.knowledge"]["status"], - "applied" - ); - } - - #[tokio::test] - async fn import_existing_state_fails() { - let dir = fixture(); - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("state.json"), - r#"{"version":1,"applied_revision":{"resources":{}}}"#, - ) - .unwrap(); - - let out = import_config_dir(dir.path()).await; - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_already_exists") - ); - } - - #[tokio::test] - async fn refresh_missing_state_fails() { - let dir = fixture(); - let out = refresh_config_dir(dir.path()).await; - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_missing") - ); - } - - #[tokio::test] - async fn refresh_existing_minimal_state_increments_revision_and_updates_cas() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("state.json"), - r#"{"version":1,"applied_revision":{"config_digest":"old","resources":{"graph.knowledge":{"digest":"old"}}}}"#, - ) - .unwrap(); - - let out = refresh_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert_eq!(out.state_observations.state_revision, 1); - assert!(out.state_observations.state_cas.is_some()); - assert!(!out.state_observations.locked); - assert!(out.state_observations.lock_acquired); - assert_eq!( - out.resource_statuses["graph.knowledge"].status, - ResourceLifecycleStatus::Applied - ); - assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); - } - - #[tokio::test] - async fn refresh_records_live_schema_digest_and_manifest_version() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("state.json"), - r#"{"version":1,"state_revision":4,"applied_revision":{"resources":{}}}"#, - ) - .unwrap(); - - let out = refresh_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert_eq!(out.state_observations.state_revision, 5); - assert_eq!( - out.observations["graph.knowledge"]["schema_digest"], - sha256_hex(SCHEMA.as_bytes()) - ); - assert!(out.observations["graph.knowledge"]["manifest_version"].is_u64()); - } - - #[tokio::test] - async fn missing_derived_graph_root_marks_drifted_and_plans_creates() { - let dir = fixture(); - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("state.json"), - r#"{"version":1,"applied_revision":{"resources":{"graph.knowledge":{"digest":"old-graph"},"schema.knowledge":{"digest":"old-schema"}}}}"#, - ) - .unwrap(); - - let out = refresh_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert_eq!( - out.resource_statuses["graph.knowledge"].status, - ResourceLifecycleStatus::Drifted - ); - assert!(!out.resource_digests.contains_key("graph.knowledge")); - assert_eq!(out.observations["graph.knowledge"]["exists"], false); - - let plan = plan_config_dir(dir.path()).await; - assert!(plan.ok, "{:?}", plan.diagnostics); - assert!(plan.changes.iter().any(|change| { - change.resource == "graph.knowledge" && change.operation == PlanOperation::Create - })); - assert!(plan.changes.iter().any(|change| { - change.resource == "schema.knowledge" && change.operation == PlanOperation::Create - })); - } - - #[tokio::test] - async fn live_schema_mismatch_marks_drifted_and_causes_plan_update() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - fs::write( - dir.path().join("people.pg"), - SCHEMA.replace("age: I32?", "age: I32?\n nickname: String?"), - ) - .unwrap(); - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("state.json"), - r#"{"version":1,"applied_revision":{"resources":{"graph.knowledge":{"digest":"old-graph"},"schema.knowledge":{"digest":"old-schema"}}}}"#, - ) - .unwrap(); - - let out = refresh_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert_eq!( - out.resource_statuses["schema.knowledge"].status, - ResourceLifecycleStatus::Drifted - ); - assert_eq!( - out.observations["graph.knowledge"]["schema_matches_desired"], - false - ); - - let plan = plan_config_dir(dir.path()).await; - assert!(plan.ok, "{:?}", plan.diagnostics); - assert!(plan.changes.iter().any(|change| { - change.resource == "schema.knowledge" && change.operation == PlanOperation::Update - })); - } - - #[tokio::test] - async fn existing_lock_makes_refresh_fail() { - let dir = fixture(); - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("state.json"), - r#"{"version":1,"applied_revision":{"resources":{}}}"#, - ) - .unwrap(); - fs::write( - state_dir.join("lock.json"), - r#"{"version":1,"lock_id":"held-lock","operation":"refresh","created_at":"2026-06-08T00:00:00Z","pid":123}"#, - ) - .unwrap(); - - let out = refresh_config_dir(dir.path()).await; - assert!(!out.ok); - assert!(out.state_observations.locked); - assert_eq!(out.state_observations.lock_id.as_deref(), Some("held-lock")); - assert!(!out.state_observations.lock_acquired); - assert_eq!( - out.state_observations.lock_operation.as_deref(), - Some("refresh") - ); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_lock_held") - ); - assert!(out.diagnostics.iter().any(|diagnostic| { - diagnostic.code == "state_lock_held" - && diagnostic.message.contains("force-unlock held-lock") - })); - } - - #[tokio::test] - async fn state_lock_false_bypasses_refresh_lock_with_warning() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - r#" -version: 1 -state: - backend: cluster - lock: false -graphs: - knowledge: - schema: ./people.pg -"#, - ) - .unwrap(); - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("state.json"), - r#"{"version":1,"applied_revision":{"resources":{}}}"#, - ) - .unwrap(); - - let out = refresh_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!(!out.state_observations.locked); - assert!(!out.state_observations.lock_acquired); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_lock_disabled") - ); - } - - #[tokio::test] - async fn external_state_backend_refresh_rejected() { - let dir = fixture(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - "version: 1\nstate:\n backend: s3://bucket/state\ngraphs: {}\n", - ) - .unwrap(); - - let out = refresh_config_dir(dir.path()).await; - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "unsupported_state_backend") - ); - } - - #[tokio::test] - async fn import_graph_open_error_does_not_create_state() { - let dir = fixture(); - fs::create_dir_all(dir.path().join(CLUSTER_GRAPHS_DIR).join("knowledge.omni")).unwrap(); - - let out = import_config_dir(dir.path()).await; - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "graph_observation_error") - ); - assert!(!dir.path().join(CLUSTER_STATE_FILE).exists()); - } - - // ---- config-only apply (Stage 3A) ---- - - /// Seed a state.json that simulates "graph exists with the desired schema, - /// queries/policies not yet applied" by borrowing the desired digests. - fn write_applyable_state(config_dir: &Path) { - let out = validate_config_dir(config_dir); - assert!(out.ok, "{:?}", out.diagnostics); - let schema_digest = out.resource_digests.get("schema.knowledge").unwrap().clone(); - let graph_composite = - graph_digest("knowledge", Some(&schema_digest), Some(&BTreeMap::new())); - write_state_resources( - config_dir, - &[ - ("graph.knowledge", graph_composite.as_str()), - ("schema.knowledge", schema_digest.as_str()), - ], - ); - } - - fn write_state_resources(config_dir: &Path, resources: &[(&str, &str)]) { - let resource_map: serde_json::Map = resources - .iter() - .map(|(address, digest)| ((*address).to_string(), json!({ "digest": digest }))) - .collect(); - let state_dir = config_dir.join(CLUSTER_STATE_DIR); - fs::create_dir_all(&state_dir).unwrap(); - fs::write( - state_dir.join("state.json"), - serde_json::to_string_pretty(&json!({ - "version": 1, - "state_revision": 1, - "applied_revision": { "resources": resource_map } - })) - .unwrap(), - ) - .unwrap(); - } - - fn read_state_json(config_dir: &Path) -> serde_json::Value { - serde_json::from_str(&fs::read_to_string(config_dir.join(CLUSTER_STATE_FILE)).unwrap()) - .unwrap() - } - - fn query_payload_path(config_dir: &Path, digest: &str) -> std::path::PathBuf { - config_dir - .join(CLUSTER_RESOURCES_DIR) - .join("query/knowledge/find_person") - .join(format!("{digest}.gq")) - } - - fn policy_payload_path(config_dir: &Path, digest: &str) -> std::path::PathBuf { - config_dir - .join(CLUSTER_RESOURCES_DIR) - .join("policy/base") - .join(format!("{digest}.yaml")) - } - - #[tokio::test] - async fn apply_without_state_fails_with_state_missing() { - let dir = fixture(); - let out = apply_config_dir(dir.path()).await; - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_missing" - && diagnostic.message.contains("cluster import")) - ); - assert!(!dir.path().join(CLUSTER_STATE_FILE).exists()); - assert!(!dir.path().join(CLUSTER_RESOURCES_DIR).exists()); - assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); - } - - #[tokio::test] - async fn apply_writes_payloads_state_and_statuses() { - let dir = fixture(); - write_applyable_state(dir.path()); - let desired = validate_config_dir(dir.path()); - let query_digest = desired - .resource_digests - .get("query.knowledge.find_person") - .unwrap() - .clone(); - let policy_digest = desired.resource_digests.get("policy.base").unwrap().clone(); - let schema_digest = desired - .resource_digests - .get("schema.knowledge") - .unwrap() - .clone(); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert_eq!(out.applied_count, 2); - assert_eq!(out.deferred_count, 0); - assert!(out.converged); - assert!(out.state_written); - - let query_blob = query_payload_path(dir.path(), &query_digest); - assert_eq!(fs::read_to_string(&query_blob).unwrap(), QUERY); - let policy_blob = policy_payload_path(dir.path(), &policy_digest); - assert_eq!(fs::read_to_string(&policy_blob).unwrap(), "rules: []\n"); - - let state = read_state_json(dir.path()); - assert_eq!(state["state_revision"], 2); - let resources = &state["applied_revision"]["resources"]; - assert_eq!( - resources["query.knowledge.find_person"]["digest"], - query_digest - ); - assert_eq!(resources["policy.base"]["digest"], policy_digest); - let expected_composite = graph_digest( - "knowledge", - Some(&schema_digest), - Some( - &[("find_person".to_string(), query_digest.clone())] - .into_iter() - .collect(), - ), - ); - assert_eq!(resources["graph.knowledge"]["digest"], expected_composite); - assert_eq!( - state["applied_revision"]["config_digest"], - desired_revision_digest(&out) - ); - assert_eq!( - state["resource_statuses"]["query.knowledge.find_person"]["status"], - "applied" - ); - assert_eq!(state["resource_statuses"]["policy.base"]["status"], "applied"); - assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); - } - - fn desired_revision_digest(out: &ApplyOutput) -> String { - out.desired_revision.config_digest.clone().unwrap() - } - - #[tokio::test] - async fn apply_update_changes_query_digest_and_keeps_old_blob() { - let dir = fixture(); - let desired = validate_config_dir(dir.path()); - let schema_digest = desired - .resource_digests - .get("schema.knowledge") - .unwrap() - .clone(); - let old_digest = "0".repeat(64); - let graph_composite = - graph_digest("knowledge", Some(&schema_digest), Some(&BTreeMap::new())); - write_state_resources( - dir.path(), - &[ - ("graph.knowledge", graph_composite.as_str()), - ("schema.knowledge", schema_digest.as_str()), - ("query.knowledge.find_person", old_digest.as_str()), - ], - ); - let old_blob = query_payload_path(dir.path(), &old_digest); - fs::create_dir_all(old_blob.parent().unwrap()).unwrap(); - fs::write(&old_blob, "old query source").unwrap(); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - let new_digest = desired - .resource_digests - .get("query.knowledge.find_person") - .unwrap(); - let state = read_state_json(dir.path()); - assert_eq!( - state["applied_revision"]["resources"]["query.knowledge.find_person"]["digest"], - *new_digest - ); - assert_eq!(fs::read_to_string(&old_blob).unwrap(), "old query source"); - assert!(query_payload_path(dir.path(), new_digest).exists()); - } - - #[tokio::test] - async fn apply_deletes_removed_resources_but_keeps_blobs() { - let dir = fixture(); - let desired = validate_config_dir(dir.path()); - let schema_digest = desired - .resource_digests - .get("schema.knowledge") - .unwrap() - .clone(); - let stale_query_digest = "1".repeat(64); - let stale_policy_digest = "2".repeat(64); - let graph_composite = - graph_digest("knowledge", Some(&schema_digest), Some(&BTreeMap::new())); - write_state_resources( - dir.path(), - &[ - ("graph.knowledge", graph_composite.as_str()), - ("schema.knowledge", schema_digest.as_str()), - ("query.knowledge.orphan", stale_query_digest.as_str()), - ("policy.old", stale_policy_digest.as_str()), - ], - ); - let stale_blob = dir - .path() - .join(CLUSTER_RESOURCES_DIR) - .join("policy/old") - .join(format!("{stale_policy_digest}.yaml")); - fs::create_dir_all(stale_blob.parent().unwrap()).unwrap(); - fs::write(&stale_blob, "old policy").unwrap(); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!(out.converged); - let state = read_state_json(dir.path()); - let resources = &state["applied_revision"]["resources"]; - assert!(resources.get("query.knowledge.orphan").is_none()); - assert!(resources.get("policy.old").is_none()); - assert!( - state["resource_statuses"] - .get("query.knowledge.orphan") - .is_none() - ); - // Deleted resources leave their content-addressed blobs in place; GC is - // a later stage. - assert_eq!(fs::read_to_string(&stale_blob).unwrap(), "old policy"); - // The composite no longer includes the orphan query. - let query_digest = desired - .resource_digests - .get("query.knowledge.find_person") - .unwrap() - .clone(); - let expected_composite = graph_digest( - "knowledge", - Some(&schema_digest), - Some(&[("find_person".to_string(), query_digest)].into_iter().collect()), - ); - assert_eq!(resources["graph.knowledge"]["digest"], expected_composite); - } - - #[tokio::test] - async fn apply_schema_update_and_dependent_query_in_one_run() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - write_applyable_state(dir.path()); - // Schema update + a query update that depends on the new field: one - // apply executes the schema migration first, then the catalog write. - fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap(); - fs::write( - dir.path().join("people.gq"), - "\nquery find_person($name: String) {\n match { $p: Person { name: $name } }\n return { $p.name, $p.bio }\n}\n", - ) - .unwrap(); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!(out.converged, "{out:?}"); - let by_resource: BTreeMap<&str, &PlanChange> = out - .changes - .iter() - .map(|change| (change.resource.as_str(), change)) - .collect(); - assert_eq!( - by_resource["schema.knowledge"].disposition, - Some(ApplyDisposition::Applied) - ); - assert_eq!( - by_resource["query.knowledge.find_person"].disposition, - Some(ApplyDisposition::Applied) - ); - assert_eq!( - by_resource["graph.knowledge"].disposition, - Some(ApplyDisposition::Derived) - ); - // The live graph carries the new schema. - let db = Omnigraph::open_read_only(&derived_graph_uri(dir.path(), "knowledge")) - .await - .unwrap(); - let desired = validate_config_dir(dir.path()); - assert_eq!( - sha256_hex(db.schema_source().as_bytes()), - desired.resource_digests["schema.knowledge"] - ); - let state = read_state_json(dir.path()); - assert_eq!( - state["applied_revision"]["resources"]["schema.knowledge"]["digest"], - desired.resource_digests["schema.knowledge"] - ); - // Sidecar retired after the CAS landed. - assert!( - !dir.path().join(CLUSTER_RECOVERIES_DIR).exists() - || fs::read_dir(dir.path().join(CLUSTER_RECOVERIES_DIR)) - .unwrap() - .next() - .is_none() - ); - } - - #[tokio::test] - async fn apply_unsupported_schema_change_fails_loudly() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - write_applyable_state(dir.path()); - // Property type changes are unsupported by the engine planner. - fs::write( - dir.path().join("people.pg"), - "\nnode Person {\n name: String @key\n age: I64?\n}\n", - ) - .unwrap(); - - let out = apply_config_dir(dir.path()).await; - assert!(!out.ok); - assert!(out.diagnostics.iter().any(|diagnostic| { - diagnostic.code == "schema_apply_failed" - && diagnostic.message.contains("changing property type") - })); - let by_resource: BTreeMap<&str, &PlanChange> = out - .changes - .iter() - .map(|change| (change.resource.as_str(), change)) - .collect(); - assert_eq!( - by_resource["schema.knowledge"].disposition, - Some(ApplyDisposition::Blocked) - ); - assert_eq!( - by_resource["schema.knowledge"].reason.as_deref(), - Some("schema_apply_failed") - ); - // The live schema and the ledger are unchanged. - let state = read_state_json(dir.path()); - let desired = validate_config_dir(dir.path()); - assert_ne!( - state["applied_revision"]["resources"]["schema.knowledge"]["digest"], - desired.resource_digests["schema.knowledge"] - ); - // Second run: the sweep retires the stale sidecar (ledger consistent) - // and the run fails just as loudly — idempotent loudness. - let second = apply_config_dir(dir.path()).await; - assert!(!second.ok); - assert!( - second - .diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "schema_apply_failed") - ); - } - - #[tokio::test] - async fn apply_blocks_schema_update_while_recovery_pending() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - write_state_resources(dir.path(), &[("schema.knowledge", "stale-digest")]); - fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap(); - // A pending sidecar whose intent matches neither live nor recorded. - write_schema_apply_sidecar(dir.path(), "knowledge", "intended-digest", "01PENDS"); - - let out = apply_config_dir(dir.path()).await; - let by_resource: BTreeMap<&str, &PlanChange> = out - .changes - .iter() - .map(|change| (change.resource.as_str(), change)) - .collect(); - assert_eq!( - by_resource["schema.knowledge"].disposition, - Some(ApplyDisposition::Blocked) - ); - assert_eq!( - by_resource["schema.knowledge"].reason.as_deref(), - Some("cluster_recovery_pending") - ); - } - - #[tokio::test] - async fn apply_creates_graph_and_unblocks_dependents() { - let dir = fixture(); - write_state_resources(dir.path(), &[]); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!(out.converged, "{out:?}"); - let by_resource: BTreeMap<&str, &PlanChange> = out - .changes - .iter() - .map(|change| (change.resource.as_str(), change)) - .collect(); - // Stage 4A: the create executes, and its dependents apply in-run. - assert_eq!( - by_resource["graph.knowledge"].disposition, - Some(ApplyDisposition::Applied) - ); - assert_eq!( - by_resource["schema.knowledge"].disposition, - Some(ApplyDisposition::Applied) - ); - assert_eq!( - by_resource["query.knowledge.find_person"].disposition, - Some(ApplyDisposition::Applied) - ); - assert_eq!( - by_resource["policy.base"].disposition, - Some(ApplyDisposition::Applied) - ); - // The graph exists on disk and opens; state records everything. - let graph_uri = derived_graph_uri(dir.path(), "knowledge"); - let db = Omnigraph::open_read_only(&graph_uri).await.unwrap(); - let desired = validate_config_dir(dir.path()); - assert_eq!( - sha256_hex(db.schema_source().as_bytes()), - desired.resource_digests["schema.knowledge"] - ); - let state = read_state_json(dir.path()); - assert_eq!( - state["applied_revision"]["resources"]["schema.knowledge"]["digest"], - desired.resource_digests["schema.knowledge"] - ); - assert_eq!( - state["resource_statuses"]["graph.knowledge"]["status"], - "applied" - ); - // The create's sidecar was retired after the state CAS landed. - assert!( - !dir.path().join(CLUSTER_RECOVERIES_DIR).exists() - || fs::read_dir(dir.path().join(CLUSTER_RECOVERIES_DIR)) - .unwrap() - .next() - .is_none() - ); - } - - #[tokio::test] - async fn apply_create_failure_blocks_dependents_and_keeps_sidecar() { - let dir = fixture(); - write_state_resources(dir.path(), &[]); - // Make the init fail its strict preflight: a junk _schema.pg already - // sits at the derived root (the engine refuses to overwrite it). - let root = dir.path().join(CLUSTER_GRAPHS_DIR).join("knowledge.omni"); - fs::create_dir_all(&root).unwrap(); - fs::write(root.join("_schema.pg"), "junk").unwrap(); - - let out = apply_config_dir(dir.path()).await; - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "graph_create_failed") - ); - let by_resource: BTreeMap<&str, &PlanChange> = out - .changes - .iter() - .map(|change| (change.resource.as_str(), change)) - .collect(); - // Dependents are demoted: the run tells the truth about what executed. - assert_eq!( - by_resource["graph.knowledge"].disposition, - Some(ApplyDisposition::Blocked) - ); - assert_eq!( - by_resource["query.knowledge.find_person"].disposition, - Some(ApplyDisposition::Blocked) - ); - assert_eq!( - by_resource["query.knowledge.find_person"].reason.as_deref(), - Some("dependency_not_applied") - ); - assert_eq!( - by_resource["policy.base"].disposition, - Some(ApplyDisposition::Blocked) - ); - assert!(!out.converged); - // The sidecar stays for the sweep to classify next run. - assert!( - fs::read_dir(dir.path().join(CLUSTER_RECOVERIES_DIR)) - .unwrap() - .next() - .is_some() - ); - // No graph digests moved. - let state = read_state_json(dir.path()); - assert!( - state["applied_revision"]["resources"] - .as_object() - .unwrap() - .is_empty() - ); - } - - #[tokio::test] - async fn apply_blocks_graph_delete_without_approval() { - let dir = fixture(); - let desired = validate_config_dir(dir.path()); - let schema_digest = desired - .resource_digests - .get("schema.knowledge") - .unwrap() - .clone(); - let graph_composite = - graph_digest("knowledge", Some(&schema_digest), Some(&BTreeMap::new())); - write_state_resources( - dir.path(), - &[ - ("graph.knowledge", graph_composite.as_str()), - ("schema.knowledge", schema_digest.as_str()), - ("graph.old", "3333"), - ("schema.old", "4444"), - ("query.old.q", "5555"), - ], - ); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!(!out.converged); - let by_resource: BTreeMap<&str, &PlanChange> = out - .changes - .iter() - .map(|change| (change.resource.as_str(), change)) - .collect(); - // Stage 4C: deletes are gated, not deferred — every subtree change - // blocks on the single graph-level approval. - assert_eq!( - by_resource["graph.old"].disposition, - Some(ApplyDisposition::Blocked) - ); - assert_eq!( - by_resource["graph.old"].reason.as_deref(), - Some("approval_required") - ); - assert_eq!( - by_resource["schema.old"].reason.as_deref(), - Some("approval_required") - ); - assert_eq!( - by_resource["query.old.q"].reason.as_deref(), - Some("approval_required") - ); - // State intact; nothing destroyed without the artifact. - let state = read_state_json(dir.path()); - let resources = &state["applied_revision"]["resources"]; - assert_eq!(resources["graph.old"]["digest"], "3333"); - assert_eq!(resources["schema.old"]["digest"], "4444"); - assert_eq!(resources["query.old.q"]["digest"], "5555"); - } - - #[tokio::test] - async fn approve_writes_digest_bound_artifact() { - let dir = fixture(); - write_applyable_state(dir.path()); - // Seed a deletable subtree. - let state = read_state_json(dir.path()); - let graph_digest_str = state["applied_revision"]["resources"]["graph.knowledge"]["digest"] - .as_str() - .unwrap() - .to_string(); - let schema_digest_str = state["applied_revision"]["resources"]["schema.knowledge"] - ["digest"] - .as_str() - .unwrap() - .to_string(); - write_state_resources( - dir.path(), - &[ - ("graph.knowledge", graph_digest_str.as_str()), - ("schema.knowledge", schema_digest_str.as_str()), - ("graph.old", "3333"), - ("schema.old", "4444"), - ], - ); - - let out = approve_config_dir(dir.path(), "graph.old", "andrew").await; - assert!(out.ok, "{:?}", out.diagnostics); - let approval_id = out.approval_id.clone().unwrap(); - let artifact: serde_json::Value = serde_json::from_str( - &fs::read_to_string( - dir.path() - .join(CLUSTER_APPROVALS_DIR) - .join(format!("{approval_id}.json")), - ) - .unwrap(), - ) - .unwrap(); - assert_eq!(artifact["resource"], "graph.old"); - assert_eq!(artifact["operation"], "delete"); - assert_eq!(artifact["approved_by"], "andrew"); - assert_eq!(artifact["bound_before_digest"], "3333"); - assert!(artifact["bound_after_digest"].is_null()); - assert!(artifact["bound_config_digest"].is_string()); - assert!(artifact["consumed_at"].is_null()); - - // A non-gated address is refused. - let not_gated = approve_config_dir(dir.path(), "query.knowledge.find_person", "andrew").await; - assert!(!not_gated.ok); - assert!( - not_gated - .diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "approval_not_required") - ); - } - - #[tokio::test] - async fn stale_approval_is_ignored() { - let dir = fixture(); - write_applyable_state(dir.path()); - let state = read_state_json(dir.path()); - let graph_digest_str = state["applied_revision"]["resources"]["graph.knowledge"]["digest"] - .as_str() - .unwrap() - .to_string(); - let schema_digest_str = state["applied_revision"]["resources"]["schema.knowledge"] - ["digest"] - .as_str() - .unwrap() - .to_string(); - write_state_resources( - dir.path(), - &[ - ("graph.knowledge", graph_digest_str.as_str()), - ("schema.knowledge", schema_digest_str.as_str()), - ("graph.old", "3333"), - ], - ); - let approved = approve_config_dir(dir.path(), "graph.old", "andrew").await; - assert!(approved.ok, "{:?}", approved.diagnostics); - // The config moves after approval: the bound config digest no longer - // matches and the artifact authorizes nothing. - fs::write(dir.path().join("base.policy.yaml"), "rules: [] # moved\n").unwrap(); - - let out = apply_config_dir(dir.path()).await; - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "approval_stale"), - "{:?}", - out.diagnostics - ); - let by_resource: BTreeMap<&str, &PlanChange> = out - .changes - .iter() - .map(|change| (change.resource.as_str(), change)) - .collect(); - assert_eq!( - by_resource["graph.old"].reason.as_deref(), - Some("approval_required") - ); - let state = read_state_json(dir.path()); - assert_eq!( - state["applied_revision"]["resources"]["graph.old"]["digest"], - "3333" - ); - } - - #[tokio::test] - async fn compute_approvals_one_gate_per_subtree() { - let dir = fixture(); - write_applyable_state(dir.path()); - let state = read_state_json(dir.path()); - let g = state["applied_revision"]["resources"]["graph.knowledge"]["digest"] - .as_str() - .unwrap() - .to_string(); - let sc = state["applied_revision"]["resources"]["schema.knowledge"]["digest"] - .as_str() - .unwrap() - .to_string(); - write_state_resources( - dir.path(), - &[ - ("graph.knowledge", g.as_str()), - ("schema.knowledge", sc.as_str()), - ("graph.old", "3333"), - ("schema.old", "4444"), - ("query.old.q", "5555"), - ], - ); - let plan = plan_config_dir(dir.path()).await; - let gated: Vec<&str> = plan - .approvals_required - .iter() - .map(|gate| gate.resource.as_str()) - .collect(); - assert_eq!(gated, vec!["graph.old"], "{plan:?}"); - assert!(!plan.approvals_required[0].satisfied); - } - - #[tokio::test] - async fn apply_is_idempotent() { - let dir = fixture(); - write_applyable_state(dir.path()); - - let first = apply_config_dir(dir.path()).await; - assert!(first.ok, "{:?}", first.diagnostics); - assert!(first.state_written); - let state_after_first = fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap(); - - let second = apply_config_dir(dir.path()).await; - assert!(second.ok, "{:?}", second.diagnostics); - assert!(second.changes.is_empty()); - assert_eq!(second.applied_count, 0); - assert!(second.converged); - assert!(!second.state_written); - let state_after_second = fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap(); - assert_eq!(state_after_first, state_after_second); - assert_eq!(second.state_observations.state_revision, 2); - } - - #[tokio::test] - async fn apply_respects_held_lock() { - let dir = fixture(); - write_applyable_state(dir.path()); - write_lock_file(dir.path(), "held-lock", "plan"); - - let out = apply_config_dir(dir.path()).await; - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_lock_held") - ); - // The held lock survives a refused apply, and nothing was written. - assert!(dir.path().join(CLUSTER_LOCK_FILE).exists()); - assert!(!dir.path().join(CLUSTER_RESOURCES_DIR).exists()); - let state = read_state_json(dir.path()); - assert_eq!(state["state_revision"], 1); - } - - #[tokio::test] - async fn apply_state_lock_false_bypasses_with_warning() { - let dir = fixture(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - r#" -version: 1 -state: - backend: cluster - lock: false -graphs: - knowledge: - schema: ./people.pg - queries: - find_person: - file: ./people.gq -"#, - ) - .unwrap(); - write_applyable_state(dir.path()); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!(out.state_written); - assert!(!out.state_observations.lock_acquired); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_lock_disabled") - ); - assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); - } - - #[tokio::test] - async fn apply_skips_existing_payload_blob() { - let dir = fixture(); - write_applyable_state(dir.path()); - let desired = validate_config_dir(dir.path()); - let query_digest = desired - .resource_digests - .get("query.knowledge.find_person") - .unwrap() - .clone(); - // Content-addressed blobs are trusted by name: an existing file is - // never rewritten. - let blob = query_payload_path(dir.path(), &query_digest); - fs::create_dir_all(blob.parent().unwrap()).unwrap(); - fs::write(&blob, "pre-existing").unwrap(); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert_eq!(fs::read_to_string(&blob).unwrap(), "pre-existing"); - } - - #[tokio::test] - async fn apply_invalid_config_fails_before_lock() { - let dir = fixture(); - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - "version: 1\nnot_a_field: true\n", - ) - .unwrap(); - - let out = apply_config_dir(dir.path()).await; - assert!(!out.ok); - // Config errors bail before the lock or any state directory exists. - assert!(!dir.path().join(CLUSTER_STATE_DIR).exists()); - } - - /// When the state write fails after payloads landed, the output must - /// report the statuses actually on disk — not the unpersisted in-memory - /// mutations (phantom `applied` entries would mislead automation that - /// reads `resource_statuses` independently of `ok`). - #[cfg(unix)] - #[tokio::test] - async fn apply_state_write_failure_reports_persisted_statuses() { - use std::os::unix::fs::PermissionsExt; - - let dir = fixture(); - // lock: false so the only write into __cluster/ is state.json itself. - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - r#" -version: 1 -state: - backend: cluster - lock: false -graphs: - knowledge: - schema: ./people.pg - queries: - find_person: - file: ./people.gq -"#, - ) - .unwrap(); - write_applyable_state(dir.path()); - // Pre-create the payload blob so the payload phase is a no-op and the - // failure lands exactly at the state write. - let desired = validate_config_dir(dir.path()); - let query_digest = desired - .resource_digests - .get("query.knowledge.find_person") - .unwrap(); - let blob = query_payload_path(dir.path(), query_digest); - fs::create_dir_all(blob.parent().unwrap()).unwrap(); - fs::write(&blob, QUERY).unwrap(); - - let state_dir = dir.path().join(CLUSTER_STATE_DIR); - fs::set_permissions(&state_dir, fs::Permissions::from_mode(0o555)).unwrap(); - // Running as root ignores permission bits; skip rather than flake. - if fs::write(state_dir.join("probe"), b"x").is_ok() { - let _ = fs::remove_file(state_dir.join("probe")); - fs::set_permissions(&state_dir, fs::Permissions::from_mode(0o755)).unwrap(); - eprintln!("skipping: permissions are not enforced (running as root)"); - return; - } - - let out = apply_config_dir(dir.path()).await; - fs::set_permissions(&state_dir, fs::Permissions::from_mode(0o755)).unwrap(); - - assert!(!out.ok); - assert!(!out.state_written); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "state_write_error"), - "{:?}", - out.diagnostics - ); - // The seeded state has no statuses; the failed apply must not invent - // the in-memory `applied` ones it failed to persist. - assert!( - out.resource_statuses.is_empty(), - "unpersisted statuses leaked into output: {:?}", - out.resource_statuses - ); - } - - // ---- catalog payload verification (Stage 3B) ---- - - /// Converge a fixture dir and return the query blob path. - async fn converge_fixture(config_dir: &Path) -> std::path::PathBuf { - write_applyable_state(config_dir); - let out = apply_config_dir(config_dir).await; - assert!(out.ok && out.converged, "{:?}", out.diagnostics); - let desired = validate_config_dir(config_dir); - query_payload_path( - config_dir, - desired - .resource_digests - .get("query.knowledge.find_person") - .unwrap(), - ) - } - - #[tokio::test] - async fn status_reports_missing_payload_read_only() { - let dir = fixture(); - let blob = converge_fixture(dir.path()).await; - let state_before = fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap(); - fs::remove_file(&blob).unwrap(); - - let out = status_config_dir(dir.path()); - assert!(out.ok, "{:?}", out.diagnostics); - assert!(out.diagnostics.iter().any(|diagnostic| { - diagnostic.code == "catalog_payload_missing" - && diagnostic.path == "query.knowledge.find_person" - })); - // Read-only: persisted statuses and state bytes untouched. - assert_eq!( - out.resource_statuses["query.knowledge.find_person"].status, - ResourceLifecycleStatus::Applied - ); - assert_eq!( - fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap(), - state_before - ); - } - - #[tokio::test] - async fn refresh_removes_digest_and_drifts_on_missing_payload() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - let blob = converge_fixture(dir.path()).await; - fs::remove_file(&blob).unwrap(); - - let out = refresh_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "catalog_payload_missing") - ); - let status = &out.resource_statuses["query.knowledge.find_person"]; - assert_eq!(status.status, ResourceLifecycleStatus::Drifted); - assert!(status.conditions.contains(&"payload_missing".to_string())); - let state = read_state_json(dir.path()); - assert!( - state["applied_revision"]["resources"] - .get("query.knowledge.find_person") - .is_none(), - "{state}" - ); - } - - #[tokio::test] - async fn refresh_drifts_on_corrupted_payload() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - let blob = converge_fixture(dir.path()).await; - fs::write(&blob, "corrupted content").unwrap(); - - let out = refresh_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - let status = &out.resource_statuses["query.knowledge.find_person"]; - assert_eq!(status.status, ResourceLifecycleStatus::Drifted); - assert!(status.conditions.contains(&"payload_mismatch".to_string())); - let state = read_state_json(dir.path()); - assert!( - state["applied_revision"]["resources"] - .get("query.knowledge.find_person") - .is_none() - ); - } - - #[tokio::test] - async fn refresh_flags_unreadable_payload_as_error() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - let blob = converge_fixture(dir.path()).await; - // A same-named directory yields a non-NotFound IO error portably. - fs::remove_file(&blob).unwrap(); - fs::create_dir(&blob).unwrap(); - - let out = refresh_config_dir(dir.path()).await; - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "catalog_payload_read_error") - ); - let status = &out.resource_statuses["query.knowledge.find_person"]; - assert_eq!(status.status, ResourceLifecycleStatus::Error); - assert!(status.conditions.contains(&"payload_read_error".to_string())); - // Transient IO keeps the digest: no spurious republish. - let state = read_state_json(dir.path()); - assert!( - state["applied_revision"]["resources"] - .get("query.knowledge.find_person") - .is_some() - ); - } - - #[tokio::test] - async fn payload_drift_self_heals_through_refresh_plan_apply() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - let blob = converge_fixture(dir.path()).await; - let original = fs::read_to_string(&blob).unwrap(); - fs::remove_file(&blob).unwrap(); - - let refresh = refresh_config_dir(dir.path()).await; - assert!(refresh.ok, "{:?}", refresh.diagnostics); - - let plan = plan_config_dir(dir.path()).await; - let query_change = plan - .changes - .iter() - .find(|change| change.resource == "query.knowledge.find_person") - .expect("plan must propose recreating the query"); - assert_eq!(query_change.operation, PlanOperation::Create); - assert_eq!(query_change.disposition, Some(ApplyDisposition::Applied)); - - let apply = apply_config_dir(dir.path()).await; - assert!(apply.ok && apply.converged, "{:?}", apply.diagnostics); - assert_eq!(fs::read_to_string(&blob).unwrap(), original); - - let status = status_config_dir(dir.path()); - assert!( - !status - .diagnostics - .iter() - .any(|diagnostic| diagnostic.code.starts_with("catalog_payload")), - "{:?}", - status.diagnostics - ); - } - - #[test] - fn verification_skips_graph_and_schema_resources() { - let dir = fixture(); - write_applyable_state(dir.path()); // graph + schema digests only, no blobs - - let out = status_config_dir(dir.path()); - assert!( - !out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code.starts_with("catalog_payload")), - "{:?}", - out.diagnostics - ); - } - - // ---- recovery sidecars + sweep (Stage 4A) ---- - - fn derived_graph_uri(config_dir: &Path, graph_id: &str) -> String { - display_path( - &config_dir - .join(CLUSTER_GRAPHS_DIR) - .join(format!("{graph_id}.omni")), - ) - } - - fn write_create_sidecar( - config_dir: &Path, - graph_id: &str, - desired_schema_digest: &str, - operation_id: &str, - ) -> PathBuf { - let dir = config_dir.join(CLUSTER_RECOVERIES_DIR); - fs::create_dir_all(&dir).unwrap(); - let path = dir.join(format!("{operation_id}.json")); - fs::write( - &path, - serde_json::to_string_pretty(&json!({ - "schema_version": 1, - "operation_id": operation_id, - "started_at": "1970-01-01T00:00:00Z", - "kind": "graph_create", - "graph_id": graph_id, - "graph_uri": derived_graph_uri(config_dir, graph_id), - "desired_schema_digest": desired_schema_digest, - })) - .unwrap(), - ) - .unwrap(); - path - } - - #[tokio::test] - async fn sweep_removes_sidecar_when_root_absent() { - let dir = fixture(); - write_applyable_state(dir.path()); - let sidecar = write_create_sidecar(dir.path(), "knowledge", "irrelevant", "01ROW1"); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - // Row 1: nothing moved; intent removed, run proceeds normally. - assert!(!sidecar.exists()); - assert!(out.converged); - } - - #[tokio::test] - async fn sweep_rolls_forward_completed_create() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - write_state_resources(dir.path(), &[]); // state predates the create - let desired = validate_config_dir(dir.path()); - let schema_digest = desired.resource_digests["schema.knowledge"].clone(); - let sidecar = write_create_sidecar(dir.path(), "knowledge", &schema_digest, "01ROW4"); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "cluster_recovery_rolled_forward") - ); - // Row 4: ledger converged to observable reality, audit recorded, - // sidecar retired after the CAS landed. - let state = read_state_json(dir.path()); - assert_eq!( - state["applied_revision"]["resources"]["schema.knowledge"]["digest"], - schema_digest - ); - assert!( - state["recovery_records"] - .as_object() - .unwrap() - .values() - .any(|record| record["outcome"] == "rolled_forward" - && record["graph_id"] == "knowledge") - ); - assert!(!sidecar.exists()); - // With the graph rolled forward, the same run converges the catalog. - assert!(out.converged, "{out:?}"); - } - - #[tokio::test] - async fn sweep_completes_already_recorded_create() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - write_applyable_state(dir.path()); // state already records graph+schema - let desired = validate_config_dir(dir.path()); - let sidecar = write_create_sidecar( - dir.path(), - "knowledge", - &desired.resource_digests["schema.knowledge"], - "01ROW2", - ); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - // Row 2: outcome was already durable; no audit entry, sidecar retired. - assert!(!sidecar.exists()); - let state = read_state_json(dir.path()); - assert!( - state["recovery_records"] - .as_object() - .is_none_or(|records| records.is_empty()), - "{state}" - ); - } - - #[tokio::test] - async fn sweep_keeps_sidecar_for_incomplete_root() { - let dir = fixture(); - write_applyable_state(dir.path()); - // A root that exists but cannot be opened: the engine's partial-init gap. - let root = dir.path().join(CLUSTER_GRAPHS_DIR).join("knowledge.omni"); - fs::create_dir_all(&root).unwrap(); - fs::write(root.join("_schema.pg"), "junk").unwrap(); - let sidecar = write_create_sidecar(dir.path(), "knowledge", "whatever", "01ROW5"); - - let out = apply_config_dir(dir.path()).await; - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "graph_create_incomplete") - ); - // Row 5: never auto-delete; sidecar and root stay for the operator, - // and the Error status is persisted by the run's state write. - assert!(sidecar.exists()); - assert!(root.exists()); - let state = read_state_json(dir.path()); - assert_eq!(state["resource_statuses"]["graph.knowledge"]["status"], "error"); - assert!( - state["resource_statuses"]["graph.knowledge"]["conditions"] - .as_array() - .unwrap() - .iter() - .any(|condition| condition == "graph_create_incomplete") - ); - } - - #[tokio::test] - async fn sweep_flags_unexpected_schema_as_pending() { - let dir = fixture(); - write_state_resources(dir.path(), &[]); - // Live graph exists with a schema the sidecar never intended. - let graph_dir = dir.path().join(CLUSTER_GRAPHS_DIR); - fs::create_dir_all(&graph_dir).unwrap(); - Omnigraph::init( - &derived_graph_uri(dir.path(), "knowledge"), - "\nnode Other {\n name: String @key\n}\n", - ) - .await - .unwrap(); - let desired = validate_config_dir(dir.path()); - let sidecar = write_create_sidecar( - dir.path(), - "knowledge", - &desired.resource_digests["schema.knowledge"], - "01ROW6", - ); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); // warning, not error - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "cluster_recovery_pending") - ); - // Row 6: refuse to guess; sidecar kept, Drifted persisted. - assert!(sidecar.exists()); - let state = read_state_json(dir.path()); - assert_eq!( - state["resource_statuses"]["graph.knowledge"]["status"], - "drifted" - ); - assert!( - state["resource_statuses"]["graph.knowledge"]["conditions"] - .as_array() - .unwrap() - .iter() - .any(|condition| condition == "actual_applied_state_pending") - ); - } - - #[tokio::test] - async fn apply_blocks_create_while_recovery_pending() { - let dir = fixture(); - write_state_resources(dir.path(), &[]); - // A kept (row 5) sidecar: partial root that cannot be opened. - let root = dir.path().join(CLUSTER_GRAPHS_DIR).join("knowledge.omni"); - fs::create_dir_all(&root).unwrap(); - fs::write(root.join("_schema.pg"), "junk").unwrap(); - let sidecar = write_create_sidecar(dir.path(), "knowledge", "whatever", "01PEND"); - - let out = apply_config_dir(dir.path()).await; - assert!(!out.ok); // row 5 is an error condition - let by_resource: BTreeMap<&str, &PlanChange> = out - .changes - .iter() - .map(|change| (change.resource.as_str(), change)) - .collect(); - // The pending recovery blocks the create and its dependents; the - // executor never attempts the init. - assert_eq!( - by_resource["graph.knowledge"].disposition, - Some(ApplyDisposition::Blocked) - ); - assert_eq!( - by_resource["graph.knowledge"].reason.as_deref(), - Some("cluster_recovery_pending") - ); - assert_eq!( - by_resource["query.knowledge.find_person"].reason.as_deref(), - Some("cluster_recovery_pending") - ); - assert_eq!( - by_resource["policy.base"].reason.as_deref(), - Some("cluster_recovery_pending") - ); - assert!(sidecar.exists()); - // The sweep's Error status is what persists — not a generic Blocked. - let state = read_state_json(dir.path()); - assert_eq!(state["resource_statuses"]["graph.knowledge"]["status"], "error"); - } - - #[tokio::test] - async fn plan_embeds_migration_preview_for_schema_update() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - write_applyable_state(dir.path()); - fs::write( - dir.path().join("people.pg"), - "\nnode Person {\n name: String @key\n age: I32?\n bio: String?\n}\n", - ) - .unwrap(); - - let out = plan_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - let schema_change = out - .changes - .iter() - .find(|change| change.resource == "schema.knowledge") - .unwrap(); - let migration = schema_change.migration.as_ref().expect("preview embedded"); - assert!(migration.supported); - assert!( - serde_json::to_string(&migration.steps) - .unwrap() - .contains("add_property"), - "{migration:?}" - ); - } - - #[tokio::test] - async fn plan_warns_when_preview_unavailable() { - let dir = fixture(); - write_applyable_state(dir.path()); // digests recorded, but no live root - fs::write( - dir.path().join("people.pg"), - "\nnode Person {\n name: String @key\n age: I32?\n bio: String?\n}\n", - ) - .unwrap(); - - let out = plan_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - let schema_change = out - .changes - .iter() - .find(|change| change.resource == "schema.knowledge") - .unwrap(); - assert!(schema_change.migration.is_none()); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "schema_preview_unavailable") - ); - } - - fn write_schema_apply_sidecar( - config_dir: &Path, - graph_id: &str, - desired_schema_digest: &str, - operation_id: &str, - ) -> PathBuf { - let dir = config_dir.join(CLUSTER_RECOVERIES_DIR); - fs::create_dir_all(&dir).unwrap(); - let path = dir.join(format!("{operation_id}.json")); - fs::write( - &path, - serde_json::to_string_pretty(&json!({ - "schema_version": 1, - "operation_id": operation_id, - "started_at": "1970-01-01T00:00:00Z", - "kind": "schema_apply", - "graph_id": graph_id, - "graph_uri": derived_graph_uri(config_dir, graph_id), - "desired_schema_digest": desired_schema_digest, - })) - .unwrap(), - ) - .unwrap(); - path - } - - const SCHEMA_V2: &str = "\nnode Person {\n name: String @key\n age: I32?\n bio: String?\n}\n"; - - #[tokio::test] - async fn sweep_retires_schema_sidecar_when_ledger_consistent() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - write_applyable_state(dir.path()); // state digest == live digest - let sidecar = - write_schema_apply_sidecar(dir.path(), "knowledge", "never-applied", "01SROW1"); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!(!sidecar.exists()); - let state = read_state_json(dir.path()); - assert!( - state["recovery_records"] - .as_object() - .is_none_or(|records| records.is_empty()) - ); - } - - #[tokio::test] - async fn sweep_rolls_forward_completed_schema_apply() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - write_applyable_state(dir.path()); - // The schema apply completed on the graph out-of-process... - let graph_uri = derived_graph_uri(dir.path(), "knowledge"); - let db = Omnigraph::open(&graph_uri).await.unwrap(); - db.apply_schema(SCHEMA_V2).await.unwrap(); - // ...the desired config matches it, and the sidecar records the intent. - fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap(); - let desired = validate_config_dir(dir.path()); - let v2_digest = desired.resource_digests["schema.knowledge"].clone(); - let sidecar = write_schema_apply_sidecar(dir.path(), "knowledge", &v2_digest, "01SROW3"); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "cluster_recovery_rolled_forward") - ); - assert!(!sidecar.exists()); - let state = read_state_json(dir.path()); - assert_eq!( - state["applied_revision"]["resources"]["schema.knowledge"]["digest"], - v2_digest - ); - assert!( - state["recovery_records"] - .as_object() - .unwrap() - .values() - .any(|record| record["kind"] == "schema_apply" - && record["outcome"] == "rolled_forward") - ); - assert!(out.converged, "{out:?}"); - } - - #[tokio::test] - async fn sweep_flags_unexpected_schema_apply_state_as_pending() { - let dir = fixture(); - init_derived_graph(dir.path()).await; // live = v1 - write_state_resources(dir.path(), &[("schema.knowledge", "stale-digest")]); - // Sidecar intended a digest that is neither live nor recorded. - let sidecar = - write_schema_apply_sidecar(dir.path(), "knowledge", "intended-digest", "01SROW6"); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); // warnings only - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "cluster_recovery_pending") - ); - assert!(sidecar.exists()); - let state = read_state_json(dir.path()); - assert_eq!( - state["resource_statuses"]["schema.knowledge"]["status"], - "drifted" - ); - } - - #[tokio::test] - async fn sweep_keeps_schema_sidecar_for_unopenable_root() { - let dir = fixture(); - write_applyable_state(dir.path()); - let root = dir.path().join(CLUSTER_GRAPHS_DIR).join("knowledge.omni"); - fs::create_dir_all(&root).unwrap(); // exists, won't open - let sidecar = - write_schema_apply_sidecar(dir.path(), "knowledge", "whatever", "01SROWX"); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); // warning: cannot verify - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "cluster_recovery_pending") - ); - assert!(sidecar.exists()); - } - - /// Seed: converged knowledge subtree + a stale `old` graph subtree with a - /// real directory on disk. - fn seed_deletable_state(config_dir: &Path) { - write_applyable_state(config_dir); - let state = read_state_json(config_dir); - let g = state["applied_revision"]["resources"]["graph.knowledge"]["digest"] - .as_str() - .unwrap() - .to_string(); - let sc = state["applied_revision"]["resources"]["schema.knowledge"]["digest"] - .as_str() - .unwrap() - .to_string(); - write_state_resources( - config_dir, - &[ - ("graph.knowledge", g.as_str()), - ("schema.knowledge", sc.as_str()), - ("graph.old", "3333"), - ("schema.old", "4444"), - ("query.old.q", "5555"), - ], - ); - let root = config_dir.join(CLUSTER_GRAPHS_DIR).join("old.omni"); - fs::create_dir_all(&root).unwrap(); - fs::write(root.join("_schema.pg"), "stale").unwrap(); - } - - #[tokio::test] - async fn apply_executes_approved_graph_delete() { - let dir = fixture(); - seed_deletable_state(dir.path()); - let approved = approve_config_dir(dir.path(), "graph.old", "andrew").await; - assert!(approved.ok, "{:?}", approved.diagnostics); - let approval_id = approved.approval_id.clone().unwrap(); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!(out.converged, "{out:?}"); - let by_resource: BTreeMap<&str, &PlanChange> = out - .changes - .iter() - .map(|change| (change.resource.as_str(), change)) - .collect(); - assert_eq!(by_resource["graph.old"].disposition, Some(ApplyDisposition::Applied)); - assert_eq!(by_resource["schema.old"].disposition, Some(ApplyDisposition::Applied)); - assert_eq!(by_resource["query.old.q"].disposition, Some(ApplyDisposition::Applied)); - // The root is gone; the subtree is tombstoned out of the ledger. - assert!(!dir.path().join(CLUSTER_GRAPHS_DIR).join("old.omni").exists()); - let state = read_state_json(dir.path()); - let resources = state["applied_revision"]["resources"].as_object().unwrap(); - assert!(!resources.contains_key("graph.old")); - assert!(!resources.contains_key("schema.old")); - assert!(!resources.contains_key("query.old.q")); - assert_eq!(state["observations"]["graph.old"]["kind"], "tombstone"); - assert_eq!(state["observations"]["graph.old"]["approval_id"], approval_id); - // Approval consumed in BOTH stores: ledger summary + artifact file. - assert!(state["approval_records"][&approval_id]["consumed_at"].is_string()); - let artifact: serde_json::Value = serde_json::from_str( - &fs::read_to_string( - dir.path() - .join(CLUSTER_APPROVALS_DIR) - .join(format!("{approval_id}.json")), - ) - .unwrap(), - ) - .unwrap(); - assert!(artifact["consumed_at"].is_string(), "{artifact}"); - // Sidecar retired. - assert!( - fs::read_dir(dir.path().join(CLUSTER_RECOVERIES_DIR)) - .map(|mut entries| entries.next().is_none()) - .unwrap_or(true) - ); - // A consumed approval authorizes nothing further (idempotent re-apply). - let again = apply_config_dir(dir.path()).await; - assert!(again.ok && again.converged && !again.state_written, "{again:?}"); - } - - fn write_delete_sidecar( - config_dir: &Path, - graph_id: &str, - approval_id: Option<&str>, - operation_id: &str, - ) -> PathBuf { - let dir = config_dir.join(CLUSTER_RECOVERIES_DIR); - fs::create_dir_all(&dir).unwrap(); - let path = dir.join(format!("{operation_id}.json")); - fs::write( - &path, - serde_json::to_string_pretty(&json!({ - "schema_version": 1, - "operation_id": operation_id, - "started_at": "1970-01-01T00:00:00Z", - "kind": "graph_delete", - "graph_id": graph_id, - "graph_uri": derived_graph_uri(config_dir, graph_id), - "desired_schema_digest": "", - "approval_id": approval_id, - })) - .unwrap(), - ) - .unwrap(); - path - } - - #[tokio::test] - async fn sweep_retires_delete_sidecar_when_tombstoned() { - let dir = fixture(); - write_applyable_state(dir.path()); // no graph.old in state, no root - let sidecar = write_delete_sidecar(dir.path(), "old", None, "01DROW7"); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!(!sidecar.exists()); - let state = read_state_json(dir.path()); - assert!( - state["recovery_records"] - .as_object() - .is_none_or(|records| records.is_empty()) - ); - } - - #[tokio::test] - async fn sweep_rolls_forward_completed_delete() { - let dir = fixture(); - seed_deletable_state(dir.path()); - // Approve, then simulate: root removed, state stale, sidecar present. - let approved = approve_config_dir(dir.path(), "graph.old", "andrew").await; - let approval_id = approved.approval_id.unwrap(); - fs::remove_dir_all(dir.path().join(CLUSTER_GRAPHS_DIR).join("old.omni")).unwrap(); - let sidecar = write_delete_sidecar(dir.path(), "old", Some(&approval_id), "01DROW7B"); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "cluster_recovery_rolled_forward") - ); - assert!(!sidecar.exists()); - let state = read_state_json(dir.path()); - assert!( - !state["applied_revision"]["resources"] - .as_object() - .unwrap() - .contains_key("graph.old") - ); - assert_eq!(state["observations"]["graph.old"]["kind"], "tombstone"); - assert!(state["approval_records"][&approval_id]["consumed_at"].is_string()); - assert!( - state["recovery_records"] - .as_object() - .unwrap() - .values() - .any(|record| record["kind"] == "graph_delete" - && record["outcome"] == "rolled_forward") - ); - // The artifact file is marked consumed post-CAS. - let artifact: serde_json::Value = serde_json::from_str( - &fs::read_to_string( - dir.path() - .join(CLUSTER_APPROVALS_DIR) - .join(format!("{approval_id}.json")), - ) - .unwrap(), - ) - .unwrap(); - assert!(artifact["consumed_at"].is_string()); - assert!(out.converged, "{out:?}"); - } - - #[tokio::test] - async fn sweep_reproposes_incomplete_delete() { - let dir = fixture(); - seed_deletable_state(dir.path()); // root present - let approved = approve_config_dir(dir.path(), "graph.old", "andrew").await; - assert!(approved.ok); - let sidecar = write_delete_sidecar(dir.path(), "old", approved.approval_id.as_deref(), "01DROW8"); - - // Row 8: the stale intent is retired with a warning, and the same run - // re-executes the still-approved delete to completion. - let out = apply_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "graph_delete_incomplete") - ); - assert!(!sidecar.exists()); - assert!(!dir.path().join(CLUSTER_GRAPHS_DIR).join("old.omni").exists()); - assert!(out.converged, "{out:?}"); - } - - // ---- policy bindings in the applied revision (5A) ---- - - #[tokio::test] - async fn apply_records_policy_bindings() { - let dir = fixture(); - write_applyable_state(dir.path()); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok && out.converged, "{:?}", out.diagnostics); - let state = read_state_json(dir.path()); - assert_eq!( - state["applied_revision"]["resources"]["policy.base"]["applies_to"], - serde_json::json!(["graph.knowledge"]), - "{state}" - ); - // Non-policy entries carry no bindings field at all. - assert!( - state["applied_revision"]["resources"]["query.knowledge.find_person"] - .get("applies_to") - .is_none() - ); - } - - #[tokio::test] - async fn binding_change_is_a_visible_plan_change() { - let dir = fixture(); - write_applyable_state(dir.path()); - let converge = apply_config_dir(dir.path()).await; - assert!(converge.converged, "{converge:?}"); - // Edit ONLY applies_to: the policy file digest is unchanged. - fs::write( - dir.path().join(CLUSTER_CONFIG_FILE), - r#" -version: 1 -metadata: - name: test -state: - backend: cluster - lock: true -graphs: - knowledge: - schema: ./people.pg - queries: - find_person: - file: ./people.gq -policies: - base: - file: ./base.policy.yaml - applies_to: [cluster, knowledge] -"#, - ) - .unwrap(); - - let plan = plan_config_dir(dir.path()).await; - let change = plan - .changes - .iter() - .find(|change| change.resource == "policy.base") - .expect("binding change must be visible in plan"); - assert!(change.binding_change); - assert_eq!(change.operation, PlanOperation::Update); - assert_eq!(change.before_digest, change.after_digest); - - let out = apply_config_dir(dir.path()).await; - assert!(out.ok && out.converged, "{out:?}"); - let state = read_state_json(dir.path()); - assert_eq!( - state["applied_revision"]["resources"]["policy.base"]["applies_to"], - serde_json::json!(["cluster", "graph.knowledge"]) - ); - // Idempotent: a second run sees no changes. - let again = apply_config_dir(dir.path()).await; - assert!(again.changes.is_empty() && !again.state_written, "{again:?}"); - } - - #[tokio::test] - async fn pre_5a_state_backfills_bindings() { - let dir = fixture(); - write_applyable_state(dir.path()); - let converge = apply_config_dir(dir.path()).await; - assert!(converge.converged, "{converge:?}"); - // Strip the bindings from the state entry (a pre-5A ledger). - let mut state: serde_json::Value = serde_json::from_str( - &fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap(), - ) - .unwrap(); - state["applied_revision"]["resources"]["policy.base"] - .as_object_mut() - .unwrap() - .remove("applies_to"); - fs::write( - dir.path().join(CLUSTER_STATE_FILE), - serde_json::to_string_pretty(&state).unwrap(), - ) - .unwrap(); - - let plan = plan_config_dir(dir.path()).await; - assert!( - plan.changes - .iter() - .any(|change| change.resource == "policy.base" && change.binding_change), - "{plan:?}" - ); - let out = apply_config_dir(dir.path()).await; - assert!(out.ok && out.converged, "{out:?}"); - let healed = read_state_json(dir.path()); - assert_eq!( - healed["applied_revision"]["resources"]["policy.base"]["applies_to"], - serde_json::json!(["graph.knowledge"]) - ); - } - - #[tokio::test] - async fn bindings_survive_refresh() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - write_applyable_state(dir.path()); - let converge = apply_config_dir(dir.path()).await; - assert!(converge.converged, "{converge:?}"); - - let refresh = refresh_config_dir(dir.path()).await; - assert!(refresh.ok, "{:?}", refresh.diagnostics); - let state = read_state_json(dir.path()); - assert_eq!( - state["applied_revision"]["resources"]["policy.base"]["applies_to"], - serde_json::json!(["graph.knowledge"]) - ); - } - - // ---- serving snapshot (5B read-only loader) ---- - - #[tokio::test] - async fn serving_snapshot_reads_converged_cluster() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - write_applyable_state(dir.path()); - let converge = apply_config_dir(dir.path()).await; - assert!(converge.converged, "{converge:?}"); - - let snapshot = read_serving_snapshot(dir.path()).expect("converged cluster must serve"); - assert_eq!(snapshot.graphs.len(), 1); - assert_eq!(snapshot.graphs[0].graph_id, "knowledge"); - assert!(snapshot.graphs[0].root.ends_with("graphs/knowledge.omni")); - assert_eq!(snapshot.queries.len(), 1); - assert_eq!(snapshot.queries[0].name, "find_person"); - assert!(snapshot.queries[0].source.contains("query find_person")); - assert_eq!(snapshot.policies.len(), 1); - assert_eq!(snapshot.policies[0].applies_to, vec!["graph.knowledge"]); - assert!(snapshot.policies[0].blob_path.exists()); - } - - #[test] - fn serving_snapshot_refuses_missing_state() { - let dir = fixture(); - let err = read_serving_snapshot(dir.path()).unwrap_err(); - assert!( - err.iter().any(|diagnostic| diagnostic.code == "cluster_state_missing"), - "{err:?}" - ); - } - - #[tokio::test] - async fn serving_snapshot_refuses_pending_recovery() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - write_applyable_state(dir.path()); - apply_config_dir(dir.path()).await; - write_schema_apply_sidecar(dir.path(), "knowledge", "whatever", "01SERVE"); - - let err = read_serving_snapshot(dir.path()).unwrap_err(); - assert!( - err.iter().any(|diagnostic| diagnostic.code == "cluster_recovery_pending"), - "{err:?}" - ); - } - - #[tokio::test] - async fn serving_snapshot_refuses_tampered_blob_and_stripped_bindings() { - let dir = fixture(); - init_derived_graph(dir.path()).await; - write_applyable_state(dir.path()); - apply_config_dir(dir.path()).await; - // Tamper with the query blob... - let snapshot = read_serving_snapshot(dir.path()).unwrap(); - let desired = validate_config_dir(dir.path()); - let query_digest = &desired.resource_digests["query.knowledge.find_person"]; - let blob = dir - .path() - .join(CLUSTER_RESOURCES_DIR) - .join("query/knowledge/find_person") - .join(format!("{query_digest}.gq")); - fs::write(&blob, "tampered").unwrap(); - // ...and strip the policy bindings (pre-5A ledger). - let mut state: serde_json::Value = serde_json::from_str( - &fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap(), - ) - .unwrap(); - state["applied_revision"]["resources"]["policy.base"] - .as_object_mut() - .unwrap() - .remove("applies_to"); - fs::write( - dir.path().join(CLUSTER_STATE_FILE), - serde_json::to_string_pretty(&state).unwrap(), - ) - .unwrap(); - - let err = read_serving_snapshot(dir.path()).unwrap_err(); - assert!( - err.iter() - .any(|diagnostic| diagnostic.code == "catalog_payload_digest_mismatch"), - "{err:?}" - ); - assert!( - err.iter().any(|diagnostic| diagnostic.code == "policy_bindings_missing"), - "{err:?}" - ); - let _ = snapshot; // the pre-tamper read succeeded - } - - #[test] - fn serving_snapshot_refuses_empty_cluster() { - let dir = fixture(); - write_state_resources(dir.path(), &[]); // state exists, no graphs - - let err = read_serving_snapshot(dir.path()).unwrap_err(); - assert!( - err.iter().any(|diagnostic| diagnostic.code == "cluster_empty"), - "{err:?}" - ); - } - - // ---- query discovery (Terraform-style declaration) ---- - - #[test] - fn queries_directory_discovers_every_declaration() { - let dir = tempfile::tempdir().unwrap(); - fs::write(dir.path().join("people.pg"), "\nnode Person {\n name: String @key\n}\n").unwrap(); - fs::create_dir(dir.path().join("queries")).unwrap(); - fs::write( - dir.path().join("queries/people.gq"), - "\nquery find_person($name: String) {\n match { $p: Person { name: $name } }\n return { $p.name }\n}\n\nquery all_people() {\n match { $p: Person }\n return { $p.name }\n}\n", - ) - .unwrap(); - fs::write( - dir.path().join("queries/extra.gq"), - "\nquery count_people() {\n match { $p: Person }\n return { count($p) }\n}\n", - ) - .unwrap(); - fs::write(dir.path().join("queries/notes.txt"), "ignored").unwrap(); - fs::write( - dir.path().join("cluster.yaml"), - "version: 1\ngraphs:\n knowledge:\n schema: ./people.pg\n queries: ./queries/\n", - ) - .unwrap(); - - let out = validate_config_dir(dir.path()); - assert!(out.ok, "{:?}", out.diagnostics); - let names: Vec<&str> = out - .resource_digests - .keys() - .filter_map(|address| address.strip_prefix("query.knowledge.")) - .collect(); - assert_eq!(names, vec!["all_people", "count_people", "find_person"]); - } - - #[test] - fn queries_list_and_single_file_forms_discover() { - let dir = tempfile::tempdir().unwrap(); - fs::write(dir.path().join("people.pg"), "\nnode Person {\n name: String @key\n}\n").unwrap(); - fs::write( - dir.path().join("a.gq"), - "\nquery find_person($name: String) {\n match { $p: Person { name: $name } }\n return { $p.name }\n}\n", - ) - .unwrap(); - fs::write( - dir.path().join("b.gq"), - "\nquery all_people() {\n match { $p: Person }\n return { $p.name }\n}\n", - ) - .unwrap(); - fs::write( - dir.path().join("cluster.yaml"), - "version: 1\ngraphs:\n knowledge:\n schema: ./people.pg\n queries: [./a.gq, ./b.gq]\n", - ) - .unwrap(); - let out = validate_config_dir(dir.path()); - assert!(out.ok, "{:?}", out.diagnostics); - assert!(out.resource_digests.contains_key("query.knowledge.find_person")); - assert!(out.resource_digests.contains_key("query.knowledge.all_people")); - - // Single-file string form - fs::write( - dir.path().join("cluster.yaml"), - "version: 1\ngraphs:\n knowledge:\n schema: ./people.pg\n queries: ./a.gq\n", - ) - .unwrap(); - let out = validate_config_dir(dir.path()); - assert!(out.ok, "{:?}", out.diagnostics); - assert!(out.resource_digests.contains_key("query.knowledge.find_person")); - assert!(!out.resource_digests.contains_key("query.knowledge.all_people")); - } - - #[test] - fn query_discovery_rejects_duplicates_and_parse_errors() { - let dir = tempfile::tempdir().unwrap(); - fs::write(dir.path().join("people.pg"), "\nnode Person {\n name: String @key\n}\n").unwrap(); - let decl = "\nquery find_person($name: String) {\n match { $p: Person { name: $name } }\n return { $p.name }\n}\n"; - fs::write(dir.path().join("a.gq"), decl).unwrap(); - fs::write(dir.path().join("b.gq"), decl).unwrap(); - fs::write( - dir.path().join("cluster.yaml"), - "version: 1\ngraphs:\n knowledge:\n schema: ./people.pg\n queries: [./a.gq, ./b.gq]\n", - ) - .unwrap(); - let out = validate_config_dir(dir.path()); - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "duplicate_query_name"), - "{:?}", - out.diagnostics - ); - - fs::write(dir.path().join("broken.gq"), "query {{{ nope").unwrap(); - fs::write( - dir.path().join("cluster.yaml"), - "version: 1\ngraphs:\n knowledge:\n schema: ./people.pg\n queries: ./broken.gq\n", - ) - .unwrap(); - let out = validate_config_dir(dir.path()); - assert!(!out.ok); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "query_parse_error"), - "{:?}", - out.diagnostics - ); - } - - #[test] - fn status_warns_on_pending_recovery_sidecar() { - let dir = fixture(); - write_applyable_state(dir.path()); - write_create_sidecar(dir.path(), "knowledge", "irrelevant", "01STATUS"); - - let out = status_config_dir(dir.path()); - assert!(out.ok, "{:?}", out.diagnostics); - assert!( - out.diagnostics - .iter() - .any(|diagnostic| diagnostic.code == "cluster_recovery_pending" - && diagnostic.severity == DiagnosticSeverity::Warning) - ); - } - - #[tokio::test] - async fn plan_annotates_apply_dispositions() { - let dir = fixture(); - let out = plan_config_dir(dir.path()).await; - assert!(out.ok, "{:?}", out.diagnostics); - let by_resource: BTreeMap<&str, &PlanChange> = out - .changes - .iter() - .map(|change| (change.resource.as_str(), change)) - .collect(); - // Stage 4A: graph/schema creates are executable, and dependents ride - // the same run — plan previews exactly that. - assert_eq!( - by_resource["graph.knowledge"].disposition, - Some(ApplyDisposition::Applied) - ); - assert_eq!( - by_resource["schema.knowledge"].disposition, - Some(ApplyDisposition::Applied) - ); - assert_eq!( - by_resource["query.knowledge.find_person"].disposition, - Some(ApplyDisposition::Applied) - ); - assert_eq!( - by_resource["policy.base"].disposition, - Some(ApplyDisposition::Applied) - ); - } -} +#[path = "tests.rs"] +mod tests; diff --git a/crates/omnigraph-cluster/src/tests.rs b/crates/omnigraph-cluster/src/tests.rs new file mode 100644 index 0000000..a03c522 --- /dev/null +++ b/crates/omnigraph-cluster/src/tests.rs @@ -0,0 +1,3019 @@ +//! In-source test suite, moved verbatim from lib.rs (modularization). +//! Indentation is preserved exactly — embedded raw-string fixtures +//! (cluster.yaml/JSON bodies) are content, not formatting. +#![allow(clippy::all)] + + use std::fs; + use std::path::Path; + + use omnigraph::db::Omnigraph; + use serde_json::json; + use tempfile::tempdir; + + use super::*; + + const SCHEMA: &str = r#" +node Person { + name: String @key + age: I32? +} +"#; + + const QUERY: &str = r#" +query find_person($name: String) { + match { $p: Person { name: $name } } + return { $p.name, $p.age } +} +"#; + + fn fixture() -> tempfile::TempDir { + let dir = tempdir().unwrap(); + fs::write(dir.path().join("people.pg"), SCHEMA).unwrap(); + fs::write(dir.path().join("people.gq"), QUERY).unwrap(); + fs::write(dir.path().join("base.policy.yaml"), "rules: []\n").unwrap(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + r#" +version: 1 +metadata: + name: test +state: + backend: cluster + lock: true +graphs: + knowledge: + schema: ./people.pg + queries: + find_person: + file: ./people.gq +policies: + base: + file: ./base.policy.yaml + applies_to: [knowledge] +"#, + ) + .unwrap(); + dir + } + + async fn init_derived_graph(root: &Path) { + let graph_dir = root.join(CLUSTER_GRAPHS_DIR); + fs::create_dir_all(&graph_dir).unwrap(); + let graph = graph_dir.join("knowledge.omni"); + Omnigraph::init(graph.to_string_lossy().as_ref(), SCHEMA) + .await + .unwrap(); + } + + fn write_lock_file(config_dir: &Path, lock_id: &str, operation: &str) { + let state_dir = config_dir.join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("lock.json"), + json!({ + "version": 1, + "lock_id": lock_id, + "operation": operation, + "created_at": "1970-01-01T00:00:00Z", + "pid": 123 + }) + .to_string(), + ) + .unwrap(); + } + + #[test] + fn valid_minimal_config() { + let dir = fixture(); + let out = validate_config_dir(dir.path()); + assert!(out.ok, "{:?}", out.diagnostics); + assert!(out.resource_digests.contains_key("graph.knowledge")); + assert!(out.resource_digests.contains_key("schema.knowledge")); + assert!( + out.dependencies + .iter() + .any(|dep| dep.from == "policy.base" && dep.to == "graph.knowledge") + ); + } + + #[test] + fn unknown_field_rejection() { + let dir = fixture(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + "version: 1\ngraphs: {}\nwat: true\n", + ) + .unwrap(); + let out = validate_config_dir(dir.path()); + assert!(!out.ok); + assert!(out.diagnostics[0].message.contains("unknown field")); + } + + #[test] + fn future_phase_field_rejection() { + let dir = fixture(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + "version: 1\ngraphs: {}\npipelines: {}\n", + ) + .unwrap(); + let out = validate_config_dir(dir.path()); + assert!(!out.ok); + assert_eq!(out.diagnostics[0].code, "future_phase_field"); + } + + #[test] + fn duplicate_yaml_key_rejection() { + let dir = fixture(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + "version: 1\ngraphs: {}\ngraphs: {}\n", + ) + .unwrap(); + let out = validate_config_dir(dir.path()); + assert!(!out.ok); + assert_eq!(out.diagnostics[0].code, "duplicate_yaml_key"); + } + + #[test] + fn duplicate_yaml_key_rejection_keeps_quoted_hashes() { + let diagnostics = + duplicate_key_diagnostics("\"name#display\": one\n\"name#display\": two\n"); + assert_eq!(diagnostics.len(), 1); + assert_eq!(diagnostics[0].code, "duplicate_yaml_key"); + } + + #[test] + fn missing_schema_query_and_policy_files() { + let dir = fixture(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + r#" +version: 1 +graphs: + knowledge: + schema: ./missing.pg + queries: + find_person: { file: ./missing.gq } +policies: + base: + file: ./missing.policy.yaml + applies_to: [knowledge] +"#, + ) + .unwrap(); + let out = validate_config_dir(dir.path()); + assert!(!out.ok); + let codes: BTreeSet<_> = out.diagnostics.iter().map(|d| d.code.as_str()).collect(); + assert!(codes.contains("schema_file_missing")); + assert!(codes.contains("query_file_missing")); + assert!(codes.contains("policy_file_missing")); + } + + #[test] + fn wrong_kind_and_dangling_refs_fail() { + let dir = fixture(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + r#" +version: 1 +graphs: + knowledge: + schema: ./people.pg +policies: + base: + file: ./base.policy.yaml + applies_to: [query.knowledge.find_person, missing] +"#, + ) + .unwrap(); + let out = validate_config_dir(dir.path()); + assert!(!out.ok); + let codes: BTreeSet<_> = out.diagnostics.iter().map(|d| d.code.as_str()).collect(); + assert!(codes.contains("wrong_kind_reference")); + assert!(codes.contains("dangling_graph_reference")); + } + + #[test] + fn query_key_mismatch_fails() { + let dir = fixture(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + r#" +version: 1 +graphs: + knowledge: + schema: ./people.pg + queries: + different: { file: ./people.gq } +"#, + ) + .unwrap(); + let out = validate_config_dir(dir.path()); + assert!(!out.ok); + assert_eq!(out.diagnostics[0].code, "query_key_mismatch"); + } + + #[test] + fn query_typecheck_failure_fails() { + let dir = fixture(); + fs::write( + dir.path().join("people.gq"), + "query find_person() { match { $d: DoesNotExist } return { $d.name } }\n", + ) + .unwrap(); + let out = validate_config_dir(dir.path()); + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "query_typecheck_error") + ); + } + + #[tokio::test] + async fn missing_state_plans_creates() { + let dir = fixture(); + let out = plan_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!(!out.state_observations.state_found); + assert!(!out.state_observations.locked); + assert!(out.state_observations.lock_acquired); + assert!( + out.changes + .iter() + .all(|c| c.operation == PlanOperation::Create) + ); + assert!(out.changes.iter().any(|c| c.resource == "graph.knowledge")); + assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); + } + + #[tokio::test] + async fn config_digest_ignores_yaml_comments_and_formatting() { + let dir = fixture(); + let first = plan_config_dir(dir.path()).await; + assert!(first.ok, "{:?}", first.diagnostics); + + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + r#" +# Same semantic config as the fixture, intentionally rendered differently. +version: 1 +metadata: { name: test } +state: { backend: cluster, lock: true } +graphs: + knowledge: + schema: ./people.pg + queries: { find_person: { file: ./people.gq } } +policies: + base: + file: ./base.policy.yaml + applies_to: + - knowledge +"#, + ) + .unwrap(); + + let second = plan_config_dir(dir.path()).await; + assert!(second.ok, "{:?}", second.diagnostics); + assert_eq!( + first.desired_revision.config_digest, + second.desired_revision.config_digest + ); + } + + #[tokio::test] + async fn existing_state_plans_update_and_delete_deterministically() { + let dir = fixture(); + let first = plan_config_dir(dir.path()).await; + let state_dir = dir.path().join("__cluster"); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("state.json"), + serde_json::to_string_pretty(&json!({ + "version": 1, + "applied_revision": { + "config_digest": "old", + "resources": { + "graph.knowledge": { "digest": first.resource_digests["graph.knowledge"] }, + "policy.old": { "digest": "abc" }, + "schema.knowledge": { "digest": "old-schema" } + } + } + })) + .unwrap(), + ) + .unwrap(); + + let out = plan_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + let rendered: Vec<_> = out + .changes + .iter() + .map(|change| (change.resource.as_str(), &change.operation)) + .collect(); + assert_eq!( + rendered, + vec![ + ("policy.base", &PlanOperation::Create), + ("policy.old", &PlanOperation::Delete), + ("query.knowledge.find_person", &PlanOperation::Create), + ("schema.knowledge", &PlanOperation::Update), + ] + ); + } + + #[tokio::test] + async fn old_minimal_state_json_still_plans_with_default_revision() { + let dir = fixture(); + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("state.json"), + r#"{ + "version": 1, + "applied_revision": { + "config_digest": "old", + "resources": { + "graph.knowledge": { "digest": "old-graph" } + } + } +}"#, + ) + .unwrap(); + + let out = plan_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert_eq!(out.state_observations.state_revision, 0); + assert!(out.state_observations.state_cas.is_some()); + assert!(out.changes.iter().any(|change| { + change.resource == "graph.knowledge" && change.operation == PlanOperation::Update + })); + } + + #[test] + fn extended_state_json_status_surfaces_statuses() { + let dir = fixture(); + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + let state = r#"{ + "version": 1, + "state_revision": 42, + "applied_revision": { + "config_digest": "applied-config", + "resources": { + "graph.knowledge": { "digest": "graph-digest" } + } + }, + "resource_statuses": { + "graph.knowledge": { + "status": "applied", + "conditions": ["healthy"], + "message": "ready" + } + }, + "approval_records": {}, + "recovery_records": {}, + "observations": { + "graph.knowledge": { "manifest_version": 12 } + } +}"#; + fs::write(state_dir.join("state.json"), state).unwrap(); + + let out = status_config_dir(dir.path()); + assert!(out.ok, "{:?}", out.diagnostics); + assert!(out.state_observations.state_found); + assert_eq!(out.state_observations.state_revision, 42); + assert_eq!( + out.state_observations.state_cas.as_deref(), + Some(format!("sha256:{}", sha256_hex(state.as_bytes())).as_str()) + ); + assert_eq!( + out.resource_digests + .get("graph.knowledge") + .map(String::as_str), + Some("graph-digest") + ); + assert_eq!( + out.resource_statuses["graph.knowledge"].status, + ResourceLifecycleStatus::Applied + ); + } + + #[test] + fn missing_state_status_succeeds_with_warning() { + let dir = fixture(); + let out = status_config_dir(dir.path()); + assert!(out.ok, "{:?}", out.diagnostics); + assert!(!out.state_observations.state_found); + assert_eq!(out.state_observations.state_revision, 0); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_missing") + ); + } + + #[test] + fn invalid_state_status_fails() { + let dir = fixture(); + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write(state_dir.join("state.json"), "{").unwrap(); + + let out = status_config_dir(dir.path()); + assert!(!out.ok); + assert!(out.state_observations.state_found); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "invalid_state_json") + ); + } + + #[test] + fn status_surfaces_full_lock_metadata() { + let dir = fixture(); + write_lock_file(dir.path(), "held-lock", "refresh"); + + let out = status_config_dir(dir.path()); + assert!(out.ok, "{:?}", out.diagnostics); + assert!(out.state_observations.locked); + assert_eq!(out.state_observations.lock_id.as_deref(), Some("held-lock")); + assert_eq!( + out.state_observations.lock_operation.as_deref(), + Some("refresh") + ); + assert_eq!( + out.state_observations.lock_created_at.as_deref(), + Some("1970-01-01T00:00:00Z") + ); + assert_eq!(out.state_observations.lock_pid, Some(123)); + assert!(out.state_observations.lock_age_seconds.is_some()); + } + + #[test] + fn force_unlock_matching_id_removes_lock() { + let dir = fixture(); + write_lock_file(dir.path(), "held-lock", "plan"); + + let out = force_unlock_config_dir(dir.path(), "held-lock"); + assert!(out.ok, "{:?}", out.diagnostics); + assert!(out.lock_removed); + assert_eq!(out.state_observations.lock_id.as_deref(), Some("held-lock")); + assert_eq!( + out.state_observations.lock_operation.as_deref(), + Some("plan") + ); + assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); + } + + #[test] + fn force_unlock_wrong_id_fails_and_preserves_lock() { + let dir = fixture(); + write_lock_file(dir.path(), "held-lock", "plan"); + + let out = force_unlock_config_dir(dir.path(), "other-lock"); + assert!(!out.ok); + assert!(!out.lock_removed); + assert_eq!(out.state_observations.lock_id.as_deref(), Some("held-lock")); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_lock_id_mismatch") + ); + assert!(dir.path().join(CLUSTER_LOCK_FILE).exists()); + } + + #[test] + fn force_unlock_missing_lock_fails() { + let dir = fixture(); + + let out = force_unlock_config_dir(dir.path(), "held-lock"); + assert!(!out.ok); + assert!(!out.lock_removed); + assert!(!out.state_observations.locked); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_lock_missing") + ); + } + + #[test] + fn force_unlock_invalid_lock_json_fails_and_preserves_lock() { + let dir = fixture(); + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write(state_dir.join("lock.json"), "{").unwrap(); + + let out = force_unlock_config_dir(dir.path(), "held-lock"); + assert!(!out.ok); + assert!(!out.lock_removed); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "invalid_state_lock") + ); + assert!(dir.path().join(CLUSTER_LOCK_FILE).exists()); + } + + #[test] + fn force_unlock_unsupported_lock_version_fails_and_preserves_lock() { + let dir = fixture(); + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("lock.json"), + r#"{"version":2,"lock_id":"held-lock","operation":"plan","created_at":"1970-01-01T00:00:00Z","pid":123}"#, + ) + .unwrap(); + + let out = force_unlock_config_dir(dir.path(), "held-lock"); + assert!(!out.ok); + assert!(!out.lock_removed); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "unsupported_state_lock_version") + ); + assert!(dir.path().join(CLUSTER_LOCK_FILE).exists()); + } + + #[test] + fn force_unlock_external_state_backend_rejected() { + let dir = fixture(); + write_lock_file(dir.path(), "held-lock", "plan"); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + r#" +version: 1 +state: + backend: s3://state-bucket/cluster +graphs: + knowledge: + schema: ./people.pg +"#, + ) + .unwrap(); + + let out = force_unlock_config_dir(dir.path(), "held-lock"); + assert!(!out.ok); + assert!(!out.lock_removed); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "unsupported_state_backend") + ); + assert!(dir.path().join(CLUSTER_LOCK_FILE).exists()); + } + + #[tokio::test] + async fn plan_succeeds_after_force_unlock() { + let dir = fixture(); + write_lock_file(dir.path(), "held-lock", "plan"); + + let locked = plan_config_dir(dir.path()).await; + assert!(!locked.ok); + assert!( + locked + .diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_lock_held") + ); + + let unlocked = force_unlock_config_dir(dir.path(), "held-lock"); + assert!(unlocked.ok, "{:?}", unlocked.diagnostics); + + let out = plan_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + } + + #[tokio::test] + async fn plan_reports_state_cas_revision_and_removes_lock() { + let dir = fixture(); + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + let state = r#"{ + "version": 1, + "state_revision": 7, + "applied_revision": { + "config_digest": "old", + "resources": { + "graph.knowledge": { "digest": "old-graph" } + } + } +}"#; + fs::write(state_dir.join("state.json"), state).unwrap(); + + let out = plan_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert_eq!(out.state_observations.state_revision, 7); + assert_eq!( + out.state_observations.state_cas.as_deref(), + Some(format!("sha256:{}", sha256_hex(state.as_bytes())).as_str()) + ); + assert!(!out.state_observations.locked); + assert!(out.state_observations.lock_id.is_none()); + assert!(out.state_observations.lock_acquired); + assert!(out.state_observations.acquired_lock_id.is_some()); + assert!( + !dir.path().join(CLUSTER_LOCK_FILE).exists(), + "plan must release lock before returning" + ); + } + + #[tokio::test] + async fn existing_lock_makes_plan_fail() { + let dir = fixture(); + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("lock.json"), + r#"{ + "version": 1, + "lock_id": "held-lock", + "operation": "plan", + "created_at": "2026-06-08T00:00:00Z", + "pid": 123 +}"#, + ) + .unwrap(); + + let out = plan_config_dir(dir.path()).await; + assert!(!out.ok); + assert!(out.state_observations.locked); + assert_eq!(out.state_observations.lock_id.as_deref(), Some("held-lock")); + assert!(!out.state_observations.lock_acquired); + assert!(out.state_observations.acquired_lock_id.is_none()); + assert_eq!( + out.state_observations.lock_operation.as_deref(), + Some("plan") + ); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_lock_held") + ); + assert!(out.diagnostics.iter().any(|diagnostic| { + diagnostic.code == "state_lock_held" + && diagnostic.message.contains("force-unlock held-lock") + })); + } + + #[tokio::test] + async fn state_lock_false_bypasses_lock_with_warning() { + let dir = fixture(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + r#" +version: 1 +state: + backend: cluster + lock: false +graphs: + knowledge: + schema: ./people.pg +"#, + ) + .unwrap(); + + let out = plan_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!(!out.state_observations.locked); + assert!(!out.state_observations.lock_acquired); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_lock_disabled") + ); + assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); + } + + #[test] + fn external_state_backend_rejected() { + let dir = fixture(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + "version: 1\nstate:\n backend: s3://bucket/state\ngraphs: {}\n", + ) + .unwrap(); + let out = validate_config_dir(dir.path()); + assert!(!out.ok); + assert_eq!(out.diagnostics[0].code, "unsupported_state_backend"); + } + + #[tokio::test] + async fn external_state_backend_plan_rejected() { + let dir = fixture(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + "version: 1\nstate:\n backend: s3://bucket/state\ngraphs: {}\n", + ) + .unwrap(); + let out = plan_config_dir(dir.path()).await; + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "unsupported_state_backend") + ); + } + + #[tokio::test] + async fn import_missing_state_creates_state_with_graph_observation() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + + let out = import_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert_eq!(out.state_observations.state_revision, 1); + assert!(out.state_observations.state_cas.is_some()); + assert!(!out.state_observations.locked); + assert!(out.state_observations.lock_acquired); + assert!(out.state_observations.acquired_lock_id.is_some()); + assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); + assert_eq!( + out.resource_digests + .get("schema.knowledge") + .map(String::as_str), + Some(sha256_hex(SCHEMA.as_bytes()).as_str()) + ); + assert!(out.observations["graph.knowledge"]["manifest_version"].is_number()); + assert_eq!( + out.observations["graph.knowledge"]["schema_matches_desired"], + true + ); + + let state: serde_json::Value = + serde_json::from_str(&fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap()) + .unwrap(); + assert_eq!(state["state_revision"], 1); + assert_eq!( + state["resource_statuses"]["graph.knowledge"]["status"], + "applied" + ); + } + + #[tokio::test] + async fn import_existing_state_fails() { + let dir = fixture(); + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("state.json"), + r#"{"version":1,"applied_revision":{"resources":{}}}"#, + ) + .unwrap(); + + let out = import_config_dir(dir.path()).await; + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_already_exists") + ); + } + + #[tokio::test] + async fn refresh_missing_state_fails() { + let dir = fixture(); + let out = refresh_config_dir(dir.path()).await; + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_missing") + ); + } + + #[tokio::test] + async fn refresh_existing_minimal_state_increments_revision_and_updates_cas() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("state.json"), + r#"{"version":1,"applied_revision":{"config_digest":"old","resources":{"graph.knowledge":{"digest":"old"}}}}"#, + ) + .unwrap(); + + let out = refresh_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert_eq!(out.state_observations.state_revision, 1); + assert!(out.state_observations.state_cas.is_some()); + assert!(!out.state_observations.locked); + assert!(out.state_observations.lock_acquired); + assert_eq!( + out.resource_statuses["graph.knowledge"].status, + ResourceLifecycleStatus::Applied + ); + assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); + } + + #[tokio::test] + async fn refresh_records_live_schema_digest_and_manifest_version() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("state.json"), + r#"{"version":1,"state_revision":4,"applied_revision":{"resources":{}}}"#, + ) + .unwrap(); + + let out = refresh_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert_eq!(out.state_observations.state_revision, 5); + assert_eq!( + out.observations["graph.knowledge"]["schema_digest"], + sha256_hex(SCHEMA.as_bytes()) + ); + assert!(out.observations["graph.knowledge"]["manifest_version"].is_u64()); + } + + #[tokio::test] + async fn missing_derived_graph_root_marks_drifted_and_plans_creates() { + let dir = fixture(); + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("state.json"), + r#"{"version":1,"applied_revision":{"resources":{"graph.knowledge":{"digest":"old-graph"},"schema.knowledge":{"digest":"old-schema"}}}}"#, + ) + .unwrap(); + + let out = refresh_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert_eq!( + out.resource_statuses["graph.knowledge"].status, + ResourceLifecycleStatus::Drifted + ); + assert!(!out.resource_digests.contains_key("graph.knowledge")); + assert_eq!(out.observations["graph.knowledge"]["exists"], false); + + let plan = plan_config_dir(dir.path()).await; + assert!(plan.ok, "{:?}", plan.diagnostics); + assert!(plan.changes.iter().any(|change| { + change.resource == "graph.knowledge" && change.operation == PlanOperation::Create + })); + assert!(plan.changes.iter().any(|change| { + change.resource == "schema.knowledge" && change.operation == PlanOperation::Create + })); + } + + #[tokio::test] + async fn live_schema_mismatch_marks_drifted_and_causes_plan_update() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + fs::write( + dir.path().join("people.pg"), + SCHEMA.replace("age: I32?", "age: I32?\n nickname: String?"), + ) + .unwrap(); + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("state.json"), + r#"{"version":1,"applied_revision":{"resources":{"graph.knowledge":{"digest":"old-graph"},"schema.knowledge":{"digest":"old-schema"}}}}"#, + ) + .unwrap(); + + let out = refresh_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert_eq!( + out.resource_statuses["schema.knowledge"].status, + ResourceLifecycleStatus::Drifted + ); + assert_eq!( + out.observations["graph.knowledge"]["schema_matches_desired"], + false + ); + + let plan = plan_config_dir(dir.path()).await; + assert!(plan.ok, "{:?}", plan.diagnostics); + assert!(plan.changes.iter().any(|change| { + change.resource == "schema.knowledge" && change.operation == PlanOperation::Update + })); + } + + #[tokio::test] + async fn existing_lock_makes_refresh_fail() { + let dir = fixture(); + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("state.json"), + r#"{"version":1,"applied_revision":{"resources":{}}}"#, + ) + .unwrap(); + fs::write( + state_dir.join("lock.json"), + r#"{"version":1,"lock_id":"held-lock","operation":"refresh","created_at":"2026-06-08T00:00:00Z","pid":123}"#, + ) + .unwrap(); + + let out = refresh_config_dir(dir.path()).await; + assert!(!out.ok); + assert!(out.state_observations.locked); + assert_eq!(out.state_observations.lock_id.as_deref(), Some("held-lock")); + assert!(!out.state_observations.lock_acquired); + assert_eq!( + out.state_observations.lock_operation.as_deref(), + Some("refresh") + ); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_lock_held") + ); + assert!(out.diagnostics.iter().any(|diagnostic| { + diagnostic.code == "state_lock_held" + && diagnostic.message.contains("force-unlock held-lock") + })); + } + + #[tokio::test] + async fn state_lock_false_bypasses_refresh_lock_with_warning() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + r#" +version: 1 +state: + backend: cluster + lock: false +graphs: + knowledge: + schema: ./people.pg +"#, + ) + .unwrap(); + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("state.json"), + r#"{"version":1,"applied_revision":{"resources":{}}}"#, + ) + .unwrap(); + + let out = refresh_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!(!out.state_observations.locked); + assert!(!out.state_observations.lock_acquired); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_lock_disabled") + ); + } + + #[tokio::test] + async fn external_state_backend_refresh_rejected() { + let dir = fixture(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + "version: 1\nstate:\n backend: s3://bucket/state\ngraphs: {}\n", + ) + .unwrap(); + + let out = refresh_config_dir(dir.path()).await; + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "unsupported_state_backend") + ); + } + + #[tokio::test] + async fn import_graph_open_error_does_not_create_state() { + let dir = fixture(); + fs::create_dir_all(dir.path().join(CLUSTER_GRAPHS_DIR).join("knowledge.omni")).unwrap(); + + let out = import_config_dir(dir.path()).await; + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "graph_observation_error") + ); + assert!(!dir.path().join(CLUSTER_STATE_FILE).exists()); + } + + // ---- config-only apply (Stage 3A) ---- + + /// Seed a state.json that simulates "graph exists with the desired schema, + /// queries/policies not yet applied" by borrowing the desired digests. + fn write_applyable_state(config_dir: &Path) { + let out = validate_config_dir(config_dir); + assert!(out.ok, "{:?}", out.diagnostics); + let schema_digest = out.resource_digests.get("schema.knowledge").unwrap().clone(); + let graph_composite = + graph_digest("knowledge", Some(&schema_digest), Some(&BTreeMap::new())); + write_state_resources( + config_dir, + &[ + ("graph.knowledge", graph_composite.as_str()), + ("schema.knowledge", schema_digest.as_str()), + ], + ); + } + + fn write_state_resources(config_dir: &Path, resources: &[(&str, &str)]) { + let resource_map: serde_json::Map = resources + .iter() + .map(|(address, digest)| ((*address).to_string(), json!({ "digest": digest }))) + .collect(); + let state_dir = config_dir.join(CLUSTER_STATE_DIR); + fs::create_dir_all(&state_dir).unwrap(); + fs::write( + state_dir.join("state.json"), + serde_json::to_string_pretty(&json!({ + "version": 1, + "state_revision": 1, + "applied_revision": { "resources": resource_map } + })) + .unwrap(), + ) + .unwrap(); + } + + fn read_state_json(config_dir: &Path) -> serde_json::Value { + serde_json::from_str(&fs::read_to_string(config_dir.join(CLUSTER_STATE_FILE)).unwrap()) + .unwrap() + } + + fn query_payload_path(config_dir: &Path, digest: &str) -> std::path::PathBuf { + config_dir + .join(CLUSTER_RESOURCES_DIR) + .join("query/knowledge/find_person") + .join(format!("{digest}.gq")) + } + + fn policy_payload_path(config_dir: &Path, digest: &str) -> std::path::PathBuf { + config_dir + .join(CLUSTER_RESOURCES_DIR) + .join("policy/base") + .join(format!("{digest}.yaml")) + } + + #[tokio::test] + async fn apply_without_state_fails_with_state_missing() { + let dir = fixture(); + let out = apply_config_dir(dir.path()).await; + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_missing" + && diagnostic.message.contains("cluster import")) + ); + assert!(!dir.path().join(CLUSTER_STATE_FILE).exists()); + assert!(!dir.path().join(CLUSTER_RESOURCES_DIR).exists()); + assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); + } + + #[tokio::test] + async fn apply_writes_payloads_state_and_statuses() { + let dir = fixture(); + write_applyable_state(dir.path()); + let desired = validate_config_dir(dir.path()); + let query_digest = desired + .resource_digests + .get("query.knowledge.find_person") + .unwrap() + .clone(); + let policy_digest = desired.resource_digests.get("policy.base").unwrap().clone(); + let schema_digest = desired + .resource_digests + .get("schema.knowledge") + .unwrap() + .clone(); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert_eq!(out.applied_count, 2); + assert_eq!(out.deferred_count, 0); + assert!(out.converged); + assert!(out.state_written); + + let query_blob = query_payload_path(dir.path(), &query_digest); + assert_eq!(fs::read_to_string(&query_blob).unwrap(), QUERY); + let policy_blob = policy_payload_path(dir.path(), &policy_digest); + assert_eq!(fs::read_to_string(&policy_blob).unwrap(), "rules: []\n"); + + let state = read_state_json(dir.path()); + assert_eq!(state["state_revision"], 2); + let resources = &state["applied_revision"]["resources"]; + assert_eq!( + resources["query.knowledge.find_person"]["digest"], + query_digest + ); + assert_eq!(resources["policy.base"]["digest"], policy_digest); + let expected_composite = graph_digest( + "knowledge", + Some(&schema_digest), + Some( + &[("find_person".to_string(), query_digest.clone())] + .into_iter() + .collect(), + ), + ); + assert_eq!(resources["graph.knowledge"]["digest"], expected_composite); + assert_eq!( + state["applied_revision"]["config_digest"], + desired_revision_digest(&out) + ); + assert_eq!( + state["resource_statuses"]["query.knowledge.find_person"]["status"], + "applied" + ); + assert_eq!(state["resource_statuses"]["policy.base"]["status"], "applied"); + assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); + } + + fn desired_revision_digest(out: &ApplyOutput) -> String { + out.desired_revision.config_digest.clone().unwrap() + } + + #[tokio::test] + async fn apply_update_changes_query_digest_and_keeps_old_blob() { + let dir = fixture(); + let desired = validate_config_dir(dir.path()); + let schema_digest = desired + .resource_digests + .get("schema.knowledge") + .unwrap() + .clone(); + let old_digest = "0".repeat(64); + let graph_composite = + graph_digest("knowledge", Some(&schema_digest), Some(&BTreeMap::new())); + write_state_resources( + dir.path(), + &[ + ("graph.knowledge", graph_composite.as_str()), + ("schema.knowledge", schema_digest.as_str()), + ("query.knowledge.find_person", old_digest.as_str()), + ], + ); + let old_blob = query_payload_path(dir.path(), &old_digest); + fs::create_dir_all(old_blob.parent().unwrap()).unwrap(); + fs::write(&old_blob, "old query source").unwrap(); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + let new_digest = desired + .resource_digests + .get("query.knowledge.find_person") + .unwrap(); + let state = read_state_json(dir.path()); + assert_eq!( + state["applied_revision"]["resources"]["query.knowledge.find_person"]["digest"], + *new_digest + ); + assert_eq!(fs::read_to_string(&old_blob).unwrap(), "old query source"); + assert!(query_payload_path(dir.path(), new_digest).exists()); + } + + #[tokio::test] + async fn apply_deletes_removed_resources_but_keeps_blobs() { + let dir = fixture(); + let desired = validate_config_dir(dir.path()); + let schema_digest = desired + .resource_digests + .get("schema.knowledge") + .unwrap() + .clone(); + let stale_query_digest = "1".repeat(64); + let stale_policy_digest = "2".repeat(64); + let graph_composite = + graph_digest("knowledge", Some(&schema_digest), Some(&BTreeMap::new())); + write_state_resources( + dir.path(), + &[ + ("graph.knowledge", graph_composite.as_str()), + ("schema.knowledge", schema_digest.as_str()), + ("query.knowledge.orphan", stale_query_digest.as_str()), + ("policy.old", stale_policy_digest.as_str()), + ], + ); + let stale_blob = dir + .path() + .join(CLUSTER_RESOURCES_DIR) + .join("policy/old") + .join(format!("{stale_policy_digest}.yaml")); + fs::create_dir_all(stale_blob.parent().unwrap()).unwrap(); + fs::write(&stale_blob, "old policy").unwrap(); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!(out.converged); + let state = read_state_json(dir.path()); + let resources = &state["applied_revision"]["resources"]; + assert!(resources.get("query.knowledge.orphan").is_none()); + assert!(resources.get("policy.old").is_none()); + assert!( + state["resource_statuses"] + .get("query.knowledge.orphan") + .is_none() + ); + // Deleted resources leave their content-addressed blobs in place; GC is + // a later stage. + assert_eq!(fs::read_to_string(&stale_blob).unwrap(), "old policy"); + // The composite no longer includes the orphan query. + let query_digest = desired + .resource_digests + .get("query.knowledge.find_person") + .unwrap() + .clone(); + let expected_composite = graph_digest( + "knowledge", + Some(&schema_digest), + Some(&[("find_person".to_string(), query_digest)].into_iter().collect()), + ); + assert_eq!(resources["graph.knowledge"]["digest"], expected_composite); + } + + #[tokio::test] + async fn apply_schema_update_and_dependent_query_in_one_run() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + write_applyable_state(dir.path()); + // Schema update + a query update that depends on the new field: one + // apply executes the schema migration first, then the catalog write. + fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap(); + fs::write( + dir.path().join("people.gq"), + "\nquery find_person($name: String) {\n match { $p: Person { name: $name } }\n return { $p.name, $p.bio }\n}\n", + ) + .unwrap(); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!(out.converged, "{out:?}"); + let by_resource: BTreeMap<&str, &PlanChange> = out + .changes + .iter() + .map(|change| (change.resource.as_str(), change)) + .collect(); + assert_eq!( + by_resource["schema.knowledge"].disposition, + Some(ApplyDisposition::Applied) + ); + assert_eq!( + by_resource["query.knowledge.find_person"].disposition, + Some(ApplyDisposition::Applied) + ); + assert_eq!( + by_resource["graph.knowledge"].disposition, + Some(ApplyDisposition::Derived) + ); + // The live graph carries the new schema. + let db = Omnigraph::open_read_only(&derived_graph_uri(dir.path(), "knowledge")) + .await + .unwrap(); + let desired = validate_config_dir(dir.path()); + assert_eq!( + sha256_hex(db.schema_source().as_bytes()), + desired.resource_digests["schema.knowledge"] + ); + let state = read_state_json(dir.path()); + assert_eq!( + state["applied_revision"]["resources"]["schema.knowledge"]["digest"], + desired.resource_digests["schema.knowledge"] + ); + // Sidecar retired after the CAS landed. + assert!( + !dir.path().join(CLUSTER_RECOVERIES_DIR).exists() + || fs::read_dir(dir.path().join(CLUSTER_RECOVERIES_DIR)) + .unwrap() + .next() + .is_none() + ); + } + + #[tokio::test] + async fn apply_unsupported_schema_change_fails_loudly() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + write_applyable_state(dir.path()); + // Property type changes are unsupported by the engine planner. + fs::write( + dir.path().join("people.pg"), + "\nnode Person {\n name: String @key\n age: I64?\n}\n", + ) + .unwrap(); + + let out = apply_config_dir(dir.path()).await; + assert!(!out.ok); + assert!(out.diagnostics.iter().any(|diagnostic| { + diagnostic.code == "schema_apply_failed" + && diagnostic.message.contains("changing property type") + })); + let by_resource: BTreeMap<&str, &PlanChange> = out + .changes + .iter() + .map(|change| (change.resource.as_str(), change)) + .collect(); + assert_eq!( + by_resource["schema.knowledge"].disposition, + Some(ApplyDisposition::Blocked) + ); + assert_eq!( + by_resource["schema.knowledge"].reason.as_deref(), + Some("schema_apply_failed") + ); + // The live schema and the ledger are unchanged. + let state = read_state_json(dir.path()); + let desired = validate_config_dir(dir.path()); + assert_ne!( + state["applied_revision"]["resources"]["schema.knowledge"]["digest"], + desired.resource_digests["schema.knowledge"] + ); + // Second run: the sweep retires the stale sidecar (ledger consistent) + // and the run fails just as loudly — idempotent loudness. + let second = apply_config_dir(dir.path()).await; + assert!(!second.ok); + assert!( + second + .diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "schema_apply_failed") + ); + } + + #[tokio::test] + async fn apply_blocks_schema_update_while_recovery_pending() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + write_state_resources(dir.path(), &[("schema.knowledge", "stale-digest")]); + fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap(); + // A pending sidecar whose intent matches neither live nor recorded. + write_schema_apply_sidecar(dir.path(), "knowledge", "intended-digest", "01PENDS"); + + let out = apply_config_dir(dir.path()).await; + let by_resource: BTreeMap<&str, &PlanChange> = out + .changes + .iter() + .map(|change| (change.resource.as_str(), change)) + .collect(); + assert_eq!( + by_resource["schema.knowledge"].disposition, + Some(ApplyDisposition::Blocked) + ); + assert_eq!( + by_resource["schema.knowledge"].reason.as_deref(), + Some("cluster_recovery_pending") + ); + } + + #[tokio::test] + async fn apply_creates_graph_and_unblocks_dependents() { + let dir = fixture(); + write_state_resources(dir.path(), &[]); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!(out.converged, "{out:?}"); + let by_resource: BTreeMap<&str, &PlanChange> = out + .changes + .iter() + .map(|change| (change.resource.as_str(), change)) + .collect(); + // Stage 4A: the create executes, and its dependents apply in-run. + assert_eq!( + by_resource["graph.knowledge"].disposition, + Some(ApplyDisposition::Applied) + ); + assert_eq!( + by_resource["schema.knowledge"].disposition, + Some(ApplyDisposition::Applied) + ); + assert_eq!( + by_resource["query.knowledge.find_person"].disposition, + Some(ApplyDisposition::Applied) + ); + assert_eq!( + by_resource["policy.base"].disposition, + Some(ApplyDisposition::Applied) + ); + // The graph exists on disk and opens; state records everything. + let graph_uri = derived_graph_uri(dir.path(), "knowledge"); + let db = Omnigraph::open_read_only(&graph_uri).await.unwrap(); + let desired = validate_config_dir(dir.path()); + assert_eq!( + sha256_hex(db.schema_source().as_bytes()), + desired.resource_digests["schema.knowledge"] + ); + let state = read_state_json(dir.path()); + assert_eq!( + state["applied_revision"]["resources"]["schema.knowledge"]["digest"], + desired.resource_digests["schema.knowledge"] + ); + assert_eq!( + state["resource_statuses"]["graph.knowledge"]["status"], + "applied" + ); + // The create's sidecar was retired after the state CAS landed. + assert!( + !dir.path().join(CLUSTER_RECOVERIES_DIR).exists() + || fs::read_dir(dir.path().join(CLUSTER_RECOVERIES_DIR)) + .unwrap() + .next() + .is_none() + ); + } + + #[tokio::test] + async fn apply_create_failure_blocks_dependents_and_keeps_sidecar() { + let dir = fixture(); + write_state_resources(dir.path(), &[]); + // Make the init fail its strict preflight: a junk _schema.pg already + // sits at the derived root (the engine refuses to overwrite it). + let root = dir.path().join(CLUSTER_GRAPHS_DIR).join("knowledge.omni"); + fs::create_dir_all(&root).unwrap(); + fs::write(root.join("_schema.pg"), "junk").unwrap(); + + let out = apply_config_dir(dir.path()).await; + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "graph_create_failed") + ); + let by_resource: BTreeMap<&str, &PlanChange> = out + .changes + .iter() + .map(|change| (change.resource.as_str(), change)) + .collect(); + // Dependents are demoted: the run tells the truth about what executed. + assert_eq!( + by_resource["graph.knowledge"].disposition, + Some(ApplyDisposition::Blocked) + ); + assert_eq!( + by_resource["query.knowledge.find_person"].disposition, + Some(ApplyDisposition::Blocked) + ); + assert_eq!( + by_resource["query.knowledge.find_person"].reason.as_deref(), + Some("dependency_not_applied") + ); + assert_eq!( + by_resource["policy.base"].disposition, + Some(ApplyDisposition::Blocked) + ); + assert!(!out.converged); + // The sidecar stays for the sweep to classify next run. + assert!( + fs::read_dir(dir.path().join(CLUSTER_RECOVERIES_DIR)) + .unwrap() + .next() + .is_some() + ); + // No graph digests moved. + let state = read_state_json(dir.path()); + assert!( + state["applied_revision"]["resources"] + .as_object() + .unwrap() + .is_empty() + ); + } + + #[tokio::test] + async fn apply_blocks_graph_delete_without_approval() { + let dir = fixture(); + let desired = validate_config_dir(dir.path()); + let schema_digest = desired + .resource_digests + .get("schema.knowledge") + .unwrap() + .clone(); + let graph_composite = + graph_digest("knowledge", Some(&schema_digest), Some(&BTreeMap::new())); + write_state_resources( + dir.path(), + &[ + ("graph.knowledge", graph_composite.as_str()), + ("schema.knowledge", schema_digest.as_str()), + ("graph.old", "3333"), + ("schema.old", "4444"), + ("query.old.q", "5555"), + ], + ); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!(!out.converged); + let by_resource: BTreeMap<&str, &PlanChange> = out + .changes + .iter() + .map(|change| (change.resource.as_str(), change)) + .collect(); + // Stage 4C: deletes are gated, not deferred — every subtree change + // blocks on the single graph-level approval. + assert_eq!( + by_resource["graph.old"].disposition, + Some(ApplyDisposition::Blocked) + ); + assert_eq!( + by_resource["graph.old"].reason.as_deref(), + Some("approval_required") + ); + assert_eq!( + by_resource["schema.old"].reason.as_deref(), + Some("approval_required") + ); + assert_eq!( + by_resource["query.old.q"].reason.as_deref(), + Some("approval_required") + ); + // State intact; nothing destroyed without the artifact. + let state = read_state_json(dir.path()); + let resources = &state["applied_revision"]["resources"]; + assert_eq!(resources["graph.old"]["digest"], "3333"); + assert_eq!(resources["schema.old"]["digest"], "4444"); + assert_eq!(resources["query.old.q"]["digest"], "5555"); + } + + #[tokio::test] + async fn approve_writes_digest_bound_artifact() { + let dir = fixture(); + write_applyable_state(dir.path()); + // Seed a deletable subtree. + let state = read_state_json(dir.path()); + let graph_digest_str = state["applied_revision"]["resources"]["graph.knowledge"]["digest"] + .as_str() + .unwrap() + .to_string(); + let schema_digest_str = state["applied_revision"]["resources"]["schema.knowledge"] + ["digest"] + .as_str() + .unwrap() + .to_string(); + write_state_resources( + dir.path(), + &[ + ("graph.knowledge", graph_digest_str.as_str()), + ("schema.knowledge", schema_digest_str.as_str()), + ("graph.old", "3333"), + ("schema.old", "4444"), + ], + ); + + let out = approve_config_dir(dir.path(), "graph.old", "andrew").await; + assert!(out.ok, "{:?}", out.diagnostics); + let approval_id = out.approval_id.clone().unwrap(); + let artifact: serde_json::Value = serde_json::from_str( + &fs::read_to_string( + dir.path() + .join(CLUSTER_APPROVALS_DIR) + .join(format!("{approval_id}.json")), + ) + .unwrap(), + ) + .unwrap(); + assert_eq!(artifact["resource"], "graph.old"); + assert_eq!(artifact["operation"], "delete"); + assert_eq!(artifact["approved_by"], "andrew"); + assert_eq!(artifact["bound_before_digest"], "3333"); + assert!(artifact["bound_after_digest"].is_null()); + assert!(artifact["bound_config_digest"].is_string()); + assert!(artifact["consumed_at"].is_null()); + + // A non-gated address is refused. + let not_gated = approve_config_dir(dir.path(), "query.knowledge.find_person", "andrew").await; + assert!(!not_gated.ok); + assert!( + not_gated + .diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "approval_not_required") + ); + } + + #[tokio::test] + async fn stale_approval_is_ignored() { + let dir = fixture(); + write_applyable_state(dir.path()); + let state = read_state_json(dir.path()); + let graph_digest_str = state["applied_revision"]["resources"]["graph.knowledge"]["digest"] + .as_str() + .unwrap() + .to_string(); + let schema_digest_str = state["applied_revision"]["resources"]["schema.knowledge"] + ["digest"] + .as_str() + .unwrap() + .to_string(); + write_state_resources( + dir.path(), + &[ + ("graph.knowledge", graph_digest_str.as_str()), + ("schema.knowledge", schema_digest_str.as_str()), + ("graph.old", "3333"), + ], + ); + let approved = approve_config_dir(dir.path(), "graph.old", "andrew").await; + assert!(approved.ok, "{:?}", approved.diagnostics); + // The config moves after approval: the bound config digest no longer + // matches and the artifact authorizes nothing. + fs::write(dir.path().join("base.policy.yaml"), "rules: [] # moved\n").unwrap(); + + let out = apply_config_dir(dir.path()).await; + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "approval_stale"), + "{:?}", + out.diagnostics + ); + let by_resource: BTreeMap<&str, &PlanChange> = out + .changes + .iter() + .map(|change| (change.resource.as_str(), change)) + .collect(); + assert_eq!( + by_resource["graph.old"].reason.as_deref(), + Some("approval_required") + ); + let state = read_state_json(dir.path()); + assert_eq!( + state["applied_revision"]["resources"]["graph.old"]["digest"], + "3333" + ); + } + + #[tokio::test] + async fn compute_approvals_one_gate_per_subtree() { + let dir = fixture(); + write_applyable_state(dir.path()); + let state = read_state_json(dir.path()); + let g = state["applied_revision"]["resources"]["graph.knowledge"]["digest"] + .as_str() + .unwrap() + .to_string(); + let sc = state["applied_revision"]["resources"]["schema.knowledge"]["digest"] + .as_str() + .unwrap() + .to_string(); + write_state_resources( + dir.path(), + &[ + ("graph.knowledge", g.as_str()), + ("schema.knowledge", sc.as_str()), + ("graph.old", "3333"), + ("schema.old", "4444"), + ("query.old.q", "5555"), + ], + ); + let plan = plan_config_dir(dir.path()).await; + let gated: Vec<&str> = plan + .approvals_required + .iter() + .map(|gate| gate.resource.as_str()) + .collect(); + assert_eq!(gated, vec!["graph.old"], "{plan:?}"); + assert!(!plan.approvals_required[0].satisfied); + } + + #[tokio::test] + async fn apply_is_idempotent() { + let dir = fixture(); + write_applyable_state(dir.path()); + + let first = apply_config_dir(dir.path()).await; + assert!(first.ok, "{:?}", first.diagnostics); + assert!(first.state_written); + let state_after_first = fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap(); + + let second = apply_config_dir(dir.path()).await; + assert!(second.ok, "{:?}", second.diagnostics); + assert!(second.changes.is_empty()); + assert_eq!(second.applied_count, 0); + assert!(second.converged); + assert!(!second.state_written); + let state_after_second = fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap(); + assert_eq!(state_after_first, state_after_second); + assert_eq!(second.state_observations.state_revision, 2); + } + + #[tokio::test] + async fn apply_respects_held_lock() { + let dir = fixture(); + write_applyable_state(dir.path()); + write_lock_file(dir.path(), "held-lock", "plan"); + + let out = apply_config_dir(dir.path()).await; + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_lock_held") + ); + // The held lock survives a refused apply, and nothing was written. + assert!(dir.path().join(CLUSTER_LOCK_FILE).exists()); + assert!(!dir.path().join(CLUSTER_RESOURCES_DIR).exists()); + let state = read_state_json(dir.path()); + assert_eq!(state["state_revision"], 1); + } + + #[tokio::test] + async fn apply_state_lock_false_bypasses_with_warning() { + let dir = fixture(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + r#" +version: 1 +state: + backend: cluster + lock: false +graphs: + knowledge: + schema: ./people.pg + queries: + find_person: + file: ./people.gq +"#, + ) + .unwrap(); + write_applyable_state(dir.path()); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!(out.state_written); + assert!(!out.state_observations.lock_acquired); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_lock_disabled") + ); + assert!(!dir.path().join(CLUSTER_LOCK_FILE).exists()); + } + + #[tokio::test] + async fn apply_skips_existing_payload_blob() { + let dir = fixture(); + write_applyable_state(dir.path()); + let desired = validate_config_dir(dir.path()); + let query_digest = desired + .resource_digests + .get("query.knowledge.find_person") + .unwrap() + .clone(); + // Content-addressed blobs are trusted by name: an existing file is + // never rewritten. + let blob = query_payload_path(dir.path(), &query_digest); + fs::create_dir_all(blob.parent().unwrap()).unwrap(); + fs::write(&blob, "pre-existing").unwrap(); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert_eq!(fs::read_to_string(&blob).unwrap(), "pre-existing"); + } + + #[tokio::test] + async fn apply_invalid_config_fails_before_lock() { + let dir = fixture(); + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + "version: 1\nnot_a_field: true\n", + ) + .unwrap(); + + let out = apply_config_dir(dir.path()).await; + assert!(!out.ok); + // Config errors bail before the lock or any state directory exists. + assert!(!dir.path().join(CLUSTER_STATE_DIR).exists()); + } + + /// When the state write fails after payloads landed, the output must + /// report the statuses actually on disk — not the unpersisted in-memory + /// mutations (phantom `applied` entries would mislead automation that + /// reads `resource_statuses` independently of `ok`). + #[cfg(unix)] + #[tokio::test] + async fn apply_state_write_failure_reports_persisted_statuses() { + use std::os::unix::fs::PermissionsExt; + + let dir = fixture(); + // lock: false so the only write into __cluster/ is state.json itself. + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + r#" +version: 1 +state: + backend: cluster + lock: false +graphs: + knowledge: + schema: ./people.pg + queries: + find_person: + file: ./people.gq +"#, + ) + .unwrap(); + write_applyable_state(dir.path()); + // Pre-create the payload blob so the payload phase is a no-op and the + // failure lands exactly at the state write. + let desired = validate_config_dir(dir.path()); + let query_digest = desired + .resource_digests + .get("query.knowledge.find_person") + .unwrap(); + let blob = query_payload_path(dir.path(), query_digest); + fs::create_dir_all(blob.parent().unwrap()).unwrap(); + fs::write(&blob, QUERY).unwrap(); + + let state_dir = dir.path().join(CLUSTER_STATE_DIR); + fs::set_permissions(&state_dir, fs::Permissions::from_mode(0o555)).unwrap(); + // Running as root ignores permission bits; skip rather than flake. + if fs::write(state_dir.join("probe"), b"x").is_ok() { + let _ = fs::remove_file(state_dir.join("probe")); + fs::set_permissions(&state_dir, fs::Permissions::from_mode(0o755)).unwrap(); + eprintln!("skipping: permissions are not enforced (running as root)"); + return; + } + + let out = apply_config_dir(dir.path()).await; + fs::set_permissions(&state_dir, fs::Permissions::from_mode(0o755)).unwrap(); + + assert!(!out.ok); + assert!(!out.state_written); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "state_write_error"), + "{:?}", + out.diagnostics + ); + // The seeded state has no statuses; the failed apply must not invent + // the in-memory `applied` ones it failed to persist. + assert!( + out.resource_statuses.is_empty(), + "unpersisted statuses leaked into output: {:?}", + out.resource_statuses + ); + } + + // ---- catalog payload verification (Stage 3B) ---- + + /// Converge a fixture dir and return the query blob path. + async fn converge_fixture(config_dir: &Path) -> std::path::PathBuf { + write_applyable_state(config_dir); + let out = apply_config_dir(config_dir).await; + assert!(out.ok && out.converged, "{:?}", out.diagnostics); + let desired = validate_config_dir(config_dir); + query_payload_path( + config_dir, + desired + .resource_digests + .get("query.knowledge.find_person") + .unwrap(), + ) + } + + #[tokio::test] + async fn status_reports_missing_payload_read_only() { + let dir = fixture(); + let blob = converge_fixture(dir.path()).await; + let state_before = fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap(); + fs::remove_file(&blob).unwrap(); + + let out = status_config_dir(dir.path()); + assert!(out.ok, "{:?}", out.diagnostics); + assert!(out.diagnostics.iter().any(|diagnostic| { + diagnostic.code == "catalog_payload_missing" + && diagnostic.path == "query.knowledge.find_person" + })); + // Read-only: persisted statuses and state bytes untouched. + assert_eq!( + out.resource_statuses["query.knowledge.find_person"].status, + ResourceLifecycleStatus::Applied + ); + assert_eq!( + fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap(), + state_before + ); + } + + #[tokio::test] + async fn refresh_removes_digest_and_drifts_on_missing_payload() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + let blob = converge_fixture(dir.path()).await; + fs::remove_file(&blob).unwrap(); + + let out = refresh_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "catalog_payload_missing") + ); + let status = &out.resource_statuses["query.knowledge.find_person"]; + assert_eq!(status.status, ResourceLifecycleStatus::Drifted); + assert!(status.conditions.contains(&"payload_missing".to_string())); + let state = read_state_json(dir.path()); + assert!( + state["applied_revision"]["resources"] + .get("query.knowledge.find_person") + .is_none(), + "{state}" + ); + } + + #[tokio::test] + async fn refresh_drifts_on_corrupted_payload() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + let blob = converge_fixture(dir.path()).await; + fs::write(&blob, "corrupted content").unwrap(); + + let out = refresh_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + let status = &out.resource_statuses["query.knowledge.find_person"]; + assert_eq!(status.status, ResourceLifecycleStatus::Drifted); + assert!(status.conditions.contains(&"payload_mismatch".to_string())); + let state = read_state_json(dir.path()); + assert!( + state["applied_revision"]["resources"] + .get("query.knowledge.find_person") + .is_none() + ); + } + + #[tokio::test] + async fn refresh_flags_unreadable_payload_as_error() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + let blob = converge_fixture(dir.path()).await; + // A same-named directory yields a non-NotFound IO error portably. + fs::remove_file(&blob).unwrap(); + fs::create_dir(&blob).unwrap(); + + let out = refresh_config_dir(dir.path()).await; + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "catalog_payload_read_error") + ); + let status = &out.resource_statuses["query.knowledge.find_person"]; + assert_eq!(status.status, ResourceLifecycleStatus::Error); + assert!(status.conditions.contains(&"payload_read_error".to_string())); + // Transient IO keeps the digest: no spurious republish. + let state = read_state_json(dir.path()); + assert!( + state["applied_revision"]["resources"] + .get("query.knowledge.find_person") + .is_some() + ); + } + + #[tokio::test] + async fn payload_drift_self_heals_through_refresh_plan_apply() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + let blob = converge_fixture(dir.path()).await; + let original = fs::read_to_string(&blob).unwrap(); + fs::remove_file(&blob).unwrap(); + + let refresh = refresh_config_dir(dir.path()).await; + assert!(refresh.ok, "{:?}", refresh.diagnostics); + + let plan = plan_config_dir(dir.path()).await; + let query_change = plan + .changes + .iter() + .find(|change| change.resource == "query.knowledge.find_person") + .expect("plan must propose recreating the query"); + assert_eq!(query_change.operation, PlanOperation::Create); + assert_eq!(query_change.disposition, Some(ApplyDisposition::Applied)); + + let apply = apply_config_dir(dir.path()).await; + assert!(apply.ok && apply.converged, "{:?}", apply.diagnostics); + assert_eq!(fs::read_to_string(&blob).unwrap(), original); + + let status = status_config_dir(dir.path()); + assert!( + !status + .diagnostics + .iter() + .any(|diagnostic| diagnostic.code.starts_with("catalog_payload")), + "{:?}", + status.diagnostics + ); + } + + #[test] + fn verification_skips_graph_and_schema_resources() { + let dir = fixture(); + write_applyable_state(dir.path()); // graph + schema digests only, no blobs + + let out = status_config_dir(dir.path()); + assert!( + !out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code.starts_with("catalog_payload")), + "{:?}", + out.diagnostics + ); + } + + // ---- recovery sidecars + sweep (Stage 4A) ---- + + fn derived_graph_uri(config_dir: &Path, graph_id: &str) -> String { + display_path( + &config_dir + .join(CLUSTER_GRAPHS_DIR) + .join(format!("{graph_id}.omni")), + ) + } + + fn write_create_sidecar( + config_dir: &Path, + graph_id: &str, + desired_schema_digest: &str, + operation_id: &str, + ) -> PathBuf { + let dir = config_dir.join(CLUSTER_RECOVERIES_DIR); + fs::create_dir_all(&dir).unwrap(); + let path = dir.join(format!("{operation_id}.json")); + fs::write( + &path, + serde_json::to_string_pretty(&json!({ + "schema_version": 1, + "operation_id": operation_id, + "started_at": "1970-01-01T00:00:00Z", + "kind": "graph_create", + "graph_id": graph_id, + "graph_uri": derived_graph_uri(config_dir, graph_id), + "desired_schema_digest": desired_schema_digest, + })) + .unwrap(), + ) + .unwrap(); + path + } + + #[tokio::test] + async fn sweep_removes_sidecar_when_root_absent() { + let dir = fixture(); + write_applyable_state(dir.path()); + let sidecar = write_create_sidecar(dir.path(), "knowledge", "irrelevant", "01ROW1"); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + // Row 1: nothing moved; intent removed, run proceeds normally. + assert!(!sidecar.exists()); + assert!(out.converged); + } + + #[tokio::test] + async fn sweep_rolls_forward_completed_create() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + write_state_resources(dir.path(), &[]); // state predates the create + let desired = validate_config_dir(dir.path()); + let schema_digest = desired.resource_digests["schema.knowledge"].clone(); + let sidecar = write_create_sidecar(dir.path(), "knowledge", &schema_digest, "01ROW4"); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "cluster_recovery_rolled_forward") + ); + // Row 4: ledger converged to observable reality, audit recorded, + // sidecar retired after the CAS landed. + let state = read_state_json(dir.path()); + assert_eq!( + state["applied_revision"]["resources"]["schema.knowledge"]["digest"], + schema_digest + ); + assert!( + state["recovery_records"] + .as_object() + .unwrap() + .values() + .any(|record| record["outcome"] == "rolled_forward" + && record["graph_id"] == "knowledge") + ); + assert!(!sidecar.exists()); + // With the graph rolled forward, the same run converges the catalog. + assert!(out.converged, "{out:?}"); + } + + #[tokio::test] + async fn sweep_completes_already_recorded_create() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + write_applyable_state(dir.path()); // state already records graph+schema + let desired = validate_config_dir(dir.path()); + let sidecar = write_create_sidecar( + dir.path(), + "knowledge", + &desired.resource_digests["schema.knowledge"], + "01ROW2", + ); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + // Row 2: outcome was already durable; no audit entry, sidecar retired. + assert!(!sidecar.exists()); + let state = read_state_json(dir.path()); + assert!( + state["recovery_records"] + .as_object() + .is_none_or(|records| records.is_empty()), + "{state}" + ); + } + + #[tokio::test] + async fn sweep_keeps_sidecar_for_incomplete_root() { + let dir = fixture(); + write_applyable_state(dir.path()); + // A root that exists but cannot be opened: the engine's partial-init gap. + let root = dir.path().join(CLUSTER_GRAPHS_DIR).join("knowledge.omni"); + fs::create_dir_all(&root).unwrap(); + fs::write(root.join("_schema.pg"), "junk").unwrap(); + let sidecar = write_create_sidecar(dir.path(), "knowledge", "whatever", "01ROW5"); + + let out = apply_config_dir(dir.path()).await; + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "graph_create_incomplete") + ); + // Row 5: never auto-delete; sidecar and root stay for the operator, + // and the Error status is persisted by the run's state write. + assert!(sidecar.exists()); + assert!(root.exists()); + let state = read_state_json(dir.path()); + assert_eq!(state["resource_statuses"]["graph.knowledge"]["status"], "error"); + assert!( + state["resource_statuses"]["graph.knowledge"]["conditions"] + .as_array() + .unwrap() + .iter() + .any(|condition| condition == "graph_create_incomplete") + ); + } + + #[tokio::test] + async fn sweep_flags_unexpected_schema_as_pending() { + let dir = fixture(); + write_state_resources(dir.path(), &[]); + // Live graph exists with a schema the sidecar never intended. + let graph_dir = dir.path().join(CLUSTER_GRAPHS_DIR); + fs::create_dir_all(&graph_dir).unwrap(); + Omnigraph::init( + &derived_graph_uri(dir.path(), "knowledge"), + "\nnode Other {\n name: String @key\n}\n", + ) + .await + .unwrap(); + let desired = validate_config_dir(dir.path()); + let sidecar = write_create_sidecar( + dir.path(), + "knowledge", + &desired.resource_digests["schema.knowledge"], + "01ROW6", + ); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); // warning, not error + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "cluster_recovery_pending") + ); + // Row 6: refuse to guess; sidecar kept, Drifted persisted. + assert!(sidecar.exists()); + let state = read_state_json(dir.path()); + assert_eq!( + state["resource_statuses"]["graph.knowledge"]["status"], + "drifted" + ); + assert!( + state["resource_statuses"]["graph.knowledge"]["conditions"] + .as_array() + .unwrap() + .iter() + .any(|condition| condition == "actual_applied_state_pending") + ); + } + + #[tokio::test] + async fn apply_blocks_create_while_recovery_pending() { + let dir = fixture(); + write_state_resources(dir.path(), &[]); + // A kept (row 5) sidecar: partial root that cannot be opened. + let root = dir.path().join(CLUSTER_GRAPHS_DIR).join("knowledge.omni"); + fs::create_dir_all(&root).unwrap(); + fs::write(root.join("_schema.pg"), "junk").unwrap(); + let sidecar = write_create_sidecar(dir.path(), "knowledge", "whatever", "01PEND"); + + let out = apply_config_dir(dir.path()).await; + assert!(!out.ok); // row 5 is an error condition + let by_resource: BTreeMap<&str, &PlanChange> = out + .changes + .iter() + .map(|change| (change.resource.as_str(), change)) + .collect(); + // The pending recovery blocks the create and its dependents; the + // executor never attempts the init. + assert_eq!( + by_resource["graph.knowledge"].disposition, + Some(ApplyDisposition::Blocked) + ); + assert_eq!( + by_resource["graph.knowledge"].reason.as_deref(), + Some("cluster_recovery_pending") + ); + assert_eq!( + by_resource["query.knowledge.find_person"].reason.as_deref(), + Some("cluster_recovery_pending") + ); + assert_eq!( + by_resource["policy.base"].reason.as_deref(), + Some("cluster_recovery_pending") + ); + assert!(sidecar.exists()); + // The sweep's Error status is what persists — not a generic Blocked. + let state = read_state_json(dir.path()); + assert_eq!(state["resource_statuses"]["graph.knowledge"]["status"], "error"); + } + + #[tokio::test] + async fn plan_embeds_migration_preview_for_schema_update() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + write_applyable_state(dir.path()); + fs::write( + dir.path().join("people.pg"), + "\nnode Person {\n name: String @key\n age: I32?\n bio: String?\n}\n", + ) + .unwrap(); + + let out = plan_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + let schema_change = out + .changes + .iter() + .find(|change| change.resource == "schema.knowledge") + .unwrap(); + let migration = schema_change.migration.as_ref().expect("preview embedded"); + assert!(migration.supported); + assert!( + serde_json::to_string(&migration.steps) + .unwrap() + .contains("add_property"), + "{migration:?}" + ); + } + + #[tokio::test] + async fn plan_warns_when_preview_unavailable() { + let dir = fixture(); + write_applyable_state(dir.path()); // digests recorded, but no live root + fs::write( + dir.path().join("people.pg"), + "\nnode Person {\n name: String @key\n age: I32?\n bio: String?\n}\n", + ) + .unwrap(); + + let out = plan_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + let schema_change = out + .changes + .iter() + .find(|change| change.resource == "schema.knowledge") + .unwrap(); + assert!(schema_change.migration.is_none()); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "schema_preview_unavailable") + ); + } + + fn write_schema_apply_sidecar( + config_dir: &Path, + graph_id: &str, + desired_schema_digest: &str, + operation_id: &str, + ) -> PathBuf { + let dir = config_dir.join(CLUSTER_RECOVERIES_DIR); + fs::create_dir_all(&dir).unwrap(); + let path = dir.join(format!("{operation_id}.json")); + fs::write( + &path, + serde_json::to_string_pretty(&json!({ + "schema_version": 1, + "operation_id": operation_id, + "started_at": "1970-01-01T00:00:00Z", + "kind": "schema_apply", + "graph_id": graph_id, + "graph_uri": derived_graph_uri(config_dir, graph_id), + "desired_schema_digest": desired_schema_digest, + })) + .unwrap(), + ) + .unwrap(); + path + } + + const SCHEMA_V2: &str = "\nnode Person {\n name: String @key\n age: I32?\n bio: String?\n}\n"; + + #[tokio::test] + async fn sweep_retires_schema_sidecar_when_ledger_consistent() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + write_applyable_state(dir.path()); // state digest == live digest + let sidecar = + write_schema_apply_sidecar(dir.path(), "knowledge", "never-applied", "01SROW1"); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!(!sidecar.exists()); + let state = read_state_json(dir.path()); + assert!( + state["recovery_records"] + .as_object() + .is_none_or(|records| records.is_empty()) + ); + } + + #[tokio::test] + async fn sweep_rolls_forward_completed_schema_apply() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + write_applyable_state(dir.path()); + // The schema apply completed on the graph out-of-process... + let graph_uri = derived_graph_uri(dir.path(), "knowledge"); + let db = Omnigraph::open(&graph_uri).await.unwrap(); + db.apply_schema(SCHEMA_V2).await.unwrap(); + // ...the desired config matches it, and the sidecar records the intent. + fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap(); + let desired = validate_config_dir(dir.path()); + let v2_digest = desired.resource_digests["schema.knowledge"].clone(); + let sidecar = write_schema_apply_sidecar(dir.path(), "knowledge", &v2_digest, "01SROW3"); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "cluster_recovery_rolled_forward") + ); + assert!(!sidecar.exists()); + let state = read_state_json(dir.path()); + assert_eq!( + state["applied_revision"]["resources"]["schema.knowledge"]["digest"], + v2_digest + ); + assert!( + state["recovery_records"] + .as_object() + .unwrap() + .values() + .any(|record| record["kind"] == "schema_apply" + && record["outcome"] == "rolled_forward") + ); + assert!(out.converged, "{out:?}"); + } + + #[tokio::test] + async fn sweep_flags_unexpected_schema_apply_state_as_pending() { + let dir = fixture(); + init_derived_graph(dir.path()).await; // live = v1 + write_state_resources(dir.path(), &[("schema.knowledge", "stale-digest")]); + // Sidecar intended a digest that is neither live nor recorded. + let sidecar = + write_schema_apply_sidecar(dir.path(), "knowledge", "intended-digest", "01SROW6"); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); // warnings only + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "cluster_recovery_pending") + ); + assert!(sidecar.exists()); + let state = read_state_json(dir.path()); + assert_eq!( + state["resource_statuses"]["schema.knowledge"]["status"], + "drifted" + ); + } + + #[tokio::test] + async fn sweep_keeps_schema_sidecar_for_unopenable_root() { + let dir = fixture(); + write_applyable_state(dir.path()); + let root = dir.path().join(CLUSTER_GRAPHS_DIR).join("knowledge.omni"); + fs::create_dir_all(&root).unwrap(); // exists, won't open + let sidecar = + write_schema_apply_sidecar(dir.path(), "knowledge", "whatever", "01SROWX"); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); // warning: cannot verify + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "cluster_recovery_pending") + ); + assert!(sidecar.exists()); + } + + /// Seed: converged knowledge subtree + a stale `old` graph subtree with a + /// real directory on disk. + fn seed_deletable_state(config_dir: &Path) { + write_applyable_state(config_dir); + let state = read_state_json(config_dir); + let g = state["applied_revision"]["resources"]["graph.knowledge"]["digest"] + .as_str() + .unwrap() + .to_string(); + let sc = state["applied_revision"]["resources"]["schema.knowledge"]["digest"] + .as_str() + .unwrap() + .to_string(); + write_state_resources( + config_dir, + &[ + ("graph.knowledge", g.as_str()), + ("schema.knowledge", sc.as_str()), + ("graph.old", "3333"), + ("schema.old", "4444"), + ("query.old.q", "5555"), + ], + ); + let root = config_dir.join(CLUSTER_GRAPHS_DIR).join("old.omni"); + fs::create_dir_all(&root).unwrap(); + fs::write(root.join("_schema.pg"), "stale").unwrap(); + } + + #[tokio::test] + async fn apply_executes_approved_graph_delete() { + let dir = fixture(); + seed_deletable_state(dir.path()); + let approved = approve_config_dir(dir.path(), "graph.old", "andrew").await; + assert!(approved.ok, "{:?}", approved.diagnostics); + let approval_id = approved.approval_id.clone().unwrap(); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!(out.converged, "{out:?}"); + let by_resource: BTreeMap<&str, &PlanChange> = out + .changes + .iter() + .map(|change| (change.resource.as_str(), change)) + .collect(); + assert_eq!(by_resource["graph.old"].disposition, Some(ApplyDisposition::Applied)); + assert_eq!(by_resource["schema.old"].disposition, Some(ApplyDisposition::Applied)); + assert_eq!(by_resource["query.old.q"].disposition, Some(ApplyDisposition::Applied)); + // The root is gone; the subtree is tombstoned out of the ledger. + assert!(!dir.path().join(CLUSTER_GRAPHS_DIR).join("old.omni").exists()); + let state = read_state_json(dir.path()); + let resources = state["applied_revision"]["resources"].as_object().unwrap(); + assert!(!resources.contains_key("graph.old")); + assert!(!resources.contains_key("schema.old")); + assert!(!resources.contains_key("query.old.q")); + assert_eq!(state["observations"]["graph.old"]["kind"], "tombstone"); + assert_eq!(state["observations"]["graph.old"]["approval_id"], approval_id); + // Approval consumed in BOTH stores: ledger summary + artifact file. + assert!(state["approval_records"][&approval_id]["consumed_at"].is_string()); + let artifact: serde_json::Value = serde_json::from_str( + &fs::read_to_string( + dir.path() + .join(CLUSTER_APPROVALS_DIR) + .join(format!("{approval_id}.json")), + ) + .unwrap(), + ) + .unwrap(); + assert!(artifact["consumed_at"].is_string(), "{artifact}"); + // Sidecar retired. + assert!( + fs::read_dir(dir.path().join(CLUSTER_RECOVERIES_DIR)) + .map(|mut entries| entries.next().is_none()) + .unwrap_or(true) + ); + // A consumed approval authorizes nothing further (idempotent re-apply). + let again = apply_config_dir(dir.path()).await; + assert!(again.ok && again.converged && !again.state_written, "{again:?}"); + } + + fn write_delete_sidecar( + config_dir: &Path, + graph_id: &str, + approval_id: Option<&str>, + operation_id: &str, + ) -> PathBuf { + let dir = config_dir.join(CLUSTER_RECOVERIES_DIR); + fs::create_dir_all(&dir).unwrap(); + let path = dir.join(format!("{operation_id}.json")); + fs::write( + &path, + serde_json::to_string_pretty(&json!({ + "schema_version": 1, + "operation_id": operation_id, + "started_at": "1970-01-01T00:00:00Z", + "kind": "graph_delete", + "graph_id": graph_id, + "graph_uri": derived_graph_uri(config_dir, graph_id), + "desired_schema_digest": "", + "approval_id": approval_id, + })) + .unwrap(), + ) + .unwrap(); + path + } + + #[tokio::test] + async fn sweep_retires_delete_sidecar_when_tombstoned() { + let dir = fixture(); + write_applyable_state(dir.path()); // no graph.old in state, no root + let sidecar = write_delete_sidecar(dir.path(), "old", None, "01DROW7"); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!(!sidecar.exists()); + let state = read_state_json(dir.path()); + assert!( + state["recovery_records"] + .as_object() + .is_none_or(|records| records.is_empty()) + ); + } + + #[tokio::test] + async fn sweep_rolls_forward_completed_delete() { + let dir = fixture(); + seed_deletable_state(dir.path()); + // Approve, then simulate: root removed, state stale, sidecar present. + let approved = approve_config_dir(dir.path(), "graph.old", "andrew").await; + let approval_id = approved.approval_id.unwrap(); + fs::remove_dir_all(dir.path().join(CLUSTER_GRAPHS_DIR).join("old.omni")).unwrap(); + let sidecar = write_delete_sidecar(dir.path(), "old", Some(&approval_id), "01DROW7B"); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "cluster_recovery_rolled_forward") + ); + assert!(!sidecar.exists()); + let state = read_state_json(dir.path()); + assert!( + !state["applied_revision"]["resources"] + .as_object() + .unwrap() + .contains_key("graph.old") + ); + assert_eq!(state["observations"]["graph.old"]["kind"], "tombstone"); + assert!(state["approval_records"][&approval_id]["consumed_at"].is_string()); + assert!( + state["recovery_records"] + .as_object() + .unwrap() + .values() + .any(|record| record["kind"] == "graph_delete" + && record["outcome"] == "rolled_forward") + ); + // The artifact file is marked consumed post-CAS. + let artifact: serde_json::Value = serde_json::from_str( + &fs::read_to_string( + dir.path() + .join(CLUSTER_APPROVALS_DIR) + .join(format!("{approval_id}.json")), + ) + .unwrap(), + ) + .unwrap(); + assert!(artifact["consumed_at"].is_string()); + assert!(out.converged, "{out:?}"); + } + + #[tokio::test] + async fn sweep_reproposes_incomplete_delete() { + let dir = fixture(); + seed_deletable_state(dir.path()); // root present + let approved = approve_config_dir(dir.path(), "graph.old", "andrew").await; + assert!(approved.ok); + let sidecar = write_delete_sidecar(dir.path(), "old", approved.approval_id.as_deref(), "01DROW8"); + + // Row 8: the stale intent is retired with a warning, and the same run + // re-executes the still-approved delete to completion. + let out = apply_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "graph_delete_incomplete") + ); + assert!(!sidecar.exists()); + assert!(!dir.path().join(CLUSTER_GRAPHS_DIR).join("old.omni").exists()); + assert!(out.converged, "{out:?}"); + } + + // ---- policy bindings in the applied revision (5A) ---- + + #[tokio::test] + async fn apply_records_policy_bindings() { + let dir = fixture(); + write_applyable_state(dir.path()); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok && out.converged, "{:?}", out.diagnostics); + let state = read_state_json(dir.path()); + assert_eq!( + state["applied_revision"]["resources"]["policy.base"]["applies_to"], + serde_json::json!(["graph.knowledge"]), + "{state}" + ); + // Non-policy entries carry no bindings field at all. + assert!( + state["applied_revision"]["resources"]["query.knowledge.find_person"] + .get("applies_to") + .is_none() + ); + } + + #[tokio::test] + async fn binding_change_is_a_visible_plan_change() { + let dir = fixture(); + write_applyable_state(dir.path()); + let converge = apply_config_dir(dir.path()).await; + assert!(converge.converged, "{converge:?}"); + // Edit ONLY applies_to: the policy file digest is unchanged. + fs::write( + dir.path().join(CLUSTER_CONFIG_FILE), + r#" +version: 1 +metadata: + name: test +state: + backend: cluster + lock: true +graphs: + knowledge: + schema: ./people.pg + queries: + find_person: + file: ./people.gq +policies: + base: + file: ./base.policy.yaml + applies_to: [cluster, knowledge] +"#, + ) + .unwrap(); + + let plan = plan_config_dir(dir.path()).await; + let change = plan + .changes + .iter() + .find(|change| change.resource == "policy.base") + .expect("binding change must be visible in plan"); + assert!(change.binding_change); + assert_eq!(change.operation, PlanOperation::Update); + assert_eq!(change.before_digest, change.after_digest); + + let out = apply_config_dir(dir.path()).await; + assert!(out.ok && out.converged, "{out:?}"); + let state = read_state_json(dir.path()); + assert_eq!( + state["applied_revision"]["resources"]["policy.base"]["applies_to"], + serde_json::json!(["cluster", "graph.knowledge"]) + ); + // Idempotent: a second run sees no changes. + let again = apply_config_dir(dir.path()).await; + assert!(again.changes.is_empty() && !again.state_written, "{again:?}"); + } + + #[tokio::test] + async fn pre_5a_state_backfills_bindings() { + let dir = fixture(); + write_applyable_state(dir.path()); + let converge = apply_config_dir(dir.path()).await; + assert!(converge.converged, "{converge:?}"); + // Strip the bindings from the state entry (a pre-5A ledger). + let mut state: serde_json::Value = serde_json::from_str( + &fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap(), + ) + .unwrap(); + state["applied_revision"]["resources"]["policy.base"] + .as_object_mut() + .unwrap() + .remove("applies_to"); + fs::write( + dir.path().join(CLUSTER_STATE_FILE), + serde_json::to_string_pretty(&state).unwrap(), + ) + .unwrap(); + + let plan = plan_config_dir(dir.path()).await; + assert!( + plan.changes + .iter() + .any(|change| change.resource == "policy.base" && change.binding_change), + "{plan:?}" + ); + let out = apply_config_dir(dir.path()).await; + assert!(out.ok && out.converged, "{out:?}"); + let healed = read_state_json(dir.path()); + assert_eq!( + healed["applied_revision"]["resources"]["policy.base"]["applies_to"], + serde_json::json!(["graph.knowledge"]) + ); + } + + #[tokio::test] + async fn bindings_survive_refresh() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + write_applyable_state(dir.path()); + let converge = apply_config_dir(dir.path()).await; + assert!(converge.converged, "{converge:?}"); + + let refresh = refresh_config_dir(dir.path()).await; + assert!(refresh.ok, "{:?}", refresh.diagnostics); + let state = read_state_json(dir.path()); + assert_eq!( + state["applied_revision"]["resources"]["policy.base"]["applies_to"], + serde_json::json!(["graph.knowledge"]) + ); + } + + // ---- serving snapshot (5B read-only loader) ---- + + #[tokio::test] + async fn serving_snapshot_reads_converged_cluster() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + write_applyable_state(dir.path()); + let converge = apply_config_dir(dir.path()).await; + assert!(converge.converged, "{converge:?}"); + + let snapshot = read_serving_snapshot(dir.path()).expect("converged cluster must serve"); + assert_eq!(snapshot.graphs.len(), 1); + assert_eq!(snapshot.graphs[0].graph_id, "knowledge"); + assert!(snapshot.graphs[0].root.ends_with("graphs/knowledge.omni")); + assert_eq!(snapshot.queries.len(), 1); + assert_eq!(snapshot.queries[0].name, "find_person"); + assert!(snapshot.queries[0].source.contains("query find_person")); + assert_eq!(snapshot.policies.len(), 1); + assert_eq!(snapshot.policies[0].applies_to, vec!["graph.knowledge"]); + assert!(snapshot.policies[0].blob_path.exists()); + } + + #[test] + fn serving_snapshot_refuses_missing_state() { + let dir = fixture(); + let err = read_serving_snapshot(dir.path()).unwrap_err(); + assert!( + err.iter().any(|diagnostic| diagnostic.code == "cluster_state_missing"), + "{err:?}" + ); + } + + #[tokio::test] + async fn serving_snapshot_refuses_pending_recovery() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + write_applyable_state(dir.path()); + apply_config_dir(dir.path()).await; + write_schema_apply_sidecar(dir.path(), "knowledge", "whatever", "01SERVE"); + + let err = read_serving_snapshot(dir.path()).unwrap_err(); + assert!( + err.iter().any(|diagnostic| diagnostic.code == "cluster_recovery_pending"), + "{err:?}" + ); + } + + #[tokio::test] + async fn serving_snapshot_refuses_tampered_blob_and_stripped_bindings() { + let dir = fixture(); + init_derived_graph(dir.path()).await; + write_applyable_state(dir.path()); + apply_config_dir(dir.path()).await; + // Tamper with the query blob... + let snapshot = read_serving_snapshot(dir.path()).unwrap(); + let desired = validate_config_dir(dir.path()); + let query_digest = &desired.resource_digests["query.knowledge.find_person"]; + let blob = dir + .path() + .join(CLUSTER_RESOURCES_DIR) + .join("query/knowledge/find_person") + .join(format!("{query_digest}.gq")); + fs::write(&blob, "tampered").unwrap(); + // ...and strip the policy bindings (pre-5A ledger). + let mut state: serde_json::Value = serde_json::from_str( + &fs::read_to_string(dir.path().join(CLUSTER_STATE_FILE)).unwrap(), + ) + .unwrap(); + state["applied_revision"]["resources"]["policy.base"] + .as_object_mut() + .unwrap() + .remove("applies_to"); + fs::write( + dir.path().join(CLUSTER_STATE_FILE), + serde_json::to_string_pretty(&state).unwrap(), + ) + .unwrap(); + + let err = read_serving_snapshot(dir.path()).unwrap_err(); + assert!( + err.iter() + .any(|diagnostic| diagnostic.code == "catalog_payload_digest_mismatch"), + "{err:?}" + ); + assert!( + err.iter().any(|diagnostic| diagnostic.code == "policy_bindings_missing"), + "{err:?}" + ); + let _ = snapshot; // the pre-tamper read succeeded + } + + #[test] + fn serving_snapshot_refuses_empty_cluster() { + let dir = fixture(); + write_state_resources(dir.path(), &[]); // state exists, no graphs + + let err = read_serving_snapshot(dir.path()).unwrap_err(); + assert!( + err.iter().any(|diagnostic| diagnostic.code == "cluster_empty"), + "{err:?}" + ); + } + + // ---- query discovery (Terraform-style declaration) ---- + + #[test] + fn queries_directory_discovers_every_declaration() { + let dir = tempfile::tempdir().unwrap(); + fs::write(dir.path().join("people.pg"), "\nnode Person {\n name: String @key\n}\n").unwrap(); + fs::create_dir(dir.path().join("queries")).unwrap(); + fs::write( + dir.path().join("queries/people.gq"), + "\nquery find_person($name: String) {\n match { $p: Person { name: $name } }\n return { $p.name }\n}\n\nquery all_people() {\n match { $p: Person }\n return { $p.name }\n}\n", + ) + .unwrap(); + fs::write( + dir.path().join("queries/extra.gq"), + "\nquery count_people() {\n match { $p: Person }\n return { count($p) }\n}\n", + ) + .unwrap(); + fs::write(dir.path().join("queries/notes.txt"), "ignored").unwrap(); + fs::write( + dir.path().join("cluster.yaml"), + "version: 1\ngraphs:\n knowledge:\n schema: ./people.pg\n queries: ./queries/\n", + ) + .unwrap(); + + let out = validate_config_dir(dir.path()); + assert!(out.ok, "{:?}", out.diagnostics); + let names: Vec<&str> = out + .resource_digests + .keys() + .filter_map(|address| address.strip_prefix("query.knowledge.")) + .collect(); + assert_eq!(names, vec!["all_people", "count_people", "find_person"]); + } + + #[test] + fn queries_list_and_single_file_forms_discover() { + let dir = tempfile::tempdir().unwrap(); + fs::write(dir.path().join("people.pg"), "\nnode Person {\n name: String @key\n}\n").unwrap(); + fs::write( + dir.path().join("a.gq"), + "\nquery find_person($name: String) {\n match { $p: Person { name: $name } }\n return { $p.name }\n}\n", + ) + .unwrap(); + fs::write( + dir.path().join("b.gq"), + "\nquery all_people() {\n match { $p: Person }\n return { $p.name }\n}\n", + ) + .unwrap(); + fs::write( + dir.path().join("cluster.yaml"), + "version: 1\ngraphs:\n knowledge:\n schema: ./people.pg\n queries: [./a.gq, ./b.gq]\n", + ) + .unwrap(); + let out = validate_config_dir(dir.path()); + assert!(out.ok, "{:?}", out.diagnostics); + assert!(out.resource_digests.contains_key("query.knowledge.find_person")); + assert!(out.resource_digests.contains_key("query.knowledge.all_people")); + + // Single-file string form + fs::write( + dir.path().join("cluster.yaml"), + "version: 1\ngraphs:\n knowledge:\n schema: ./people.pg\n queries: ./a.gq\n", + ) + .unwrap(); + let out = validate_config_dir(dir.path()); + assert!(out.ok, "{:?}", out.diagnostics); + assert!(out.resource_digests.contains_key("query.knowledge.find_person")); + assert!(!out.resource_digests.contains_key("query.knowledge.all_people")); + } + + #[test] + fn query_discovery_rejects_duplicates_and_parse_errors() { + let dir = tempfile::tempdir().unwrap(); + fs::write(dir.path().join("people.pg"), "\nnode Person {\n name: String @key\n}\n").unwrap(); + let decl = "\nquery find_person($name: String) {\n match { $p: Person { name: $name } }\n return { $p.name }\n}\n"; + fs::write(dir.path().join("a.gq"), decl).unwrap(); + fs::write(dir.path().join("b.gq"), decl).unwrap(); + fs::write( + dir.path().join("cluster.yaml"), + "version: 1\ngraphs:\n knowledge:\n schema: ./people.pg\n queries: [./a.gq, ./b.gq]\n", + ) + .unwrap(); + let out = validate_config_dir(dir.path()); + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "duplicate_query_name"), + "{:?}", + out.diagnostics + ); + + fs::write(dir.path().join("broken.gq"), "query {{{ nope").unwrap(); + fs::write( + dir.path().join("cluster.yaml"), + "version: 1\ngraphs:\n knowledge:\n schema: ./people.pg\n queries: ./broken.gq\n", + ) + .unwrap(); + let out = validate_config_dir(dir.path()); + assert!(!out.ok); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "query_parse_error"), + "{:?}", + out.diagnostics + ); + } + + #[test] + fn status_warns_on_pending_recovery_sidecar() { + let dir = fixture(); + write_applyable_state(dir.path()); + write_create_sidecar(dir.path(), "knowledge", "irrelevant", "01STATUS"); + + let out = status_config_dir(dir.path()); + assert!(out.ok, "{:?}", out.diagnostics); + assert!( + out.diagnostics + .iter() + .any(|diagnostic| diagnostic.code == "cluster_recovery_pending" + && diagnostic.severity == DiagnosticSeverity::Warning) + ); + } + + #[tokio::test] + async fn plan_annotates_apply_dispositions() { + let dir = fixture(); + let out = plan_config_dir(dir.path()).await; + assert!(out.ok, "{:?}", out.diagnostics); + let by_resource: BTreeMap<&str, &PlanChange> = out + .changes + .iter() + .map(|change| (change.resource.as_str(), change)) + .collect(); + // Stage 4A: graph/schema creates are executable, and dependents ride + // the same run — plan previews exactly that. + assert_eq!( + by_resource["graph.knowledge"].disposition, + Some(ApplyDisposition::Applied) + ); + assert_eq!( + by_resource["schema.knowledge"].disposition, + Some(ApplyDisposition::Applied) + ); + assert_eq!( + by_resource["query.knowledge.find_person"].disposition, + Some(ApplyDisposition::Applied) + ); + assert_eq!( + by_resource["policy.base"].disposition, + Some(ApplyDisposition::Applied) + ); + } From 5a8047e5d0615f4f21cd62cb4abab1680aeada91 Mon Sep 17 00:00:00 2001 From: aaltshuler Date: Thu, 11 Jun 2026 05:28:04 +0300 Subject: [PATCH 2/7] refactor(cluster): move the storage backend to store.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verbatim move of LocalStateBackend, StateSnapshot, StateLockGuard and their impls — the single home for stored-state I/O (state ledger, lock, recovery sidecars, approval artifacts), where the RFC-006 object-storage port lands next as a focused diff. Visibility bumps (pub(crate)) only; 95 tests green before and after. Co-Authored-By: Claude Fable 5 --- crates/omnigraph-cluster/src/lib.rs | 556 +------------------------ crates/omnigraph-cluster/src/store.rs | 561 ++++++++++++++++++++++++++ 2 files changed, 564 insertions(+), 553 deletions(-) create mode 100644 crates/omnigraph-cluster/src/store.rs diff --git a/crates/omnigraph-cluster/src/lib.rs b/crates/omnigraph-cluster/src/lib.rs index da80710..17dd8a6 100644 --- a/crates/omnigraph-cluster/src/lib.rs +++ b/crates/omnigraph-cluster/src/lib.rs @@ -19,6 +19,9 @@ use ulid::Ulid; pub mod failpoints; +mod store; +use store::{LocalStateBackend, StateLockGuard, StateSnapshot}; + pub const CLUSTER_CONFIG_FILE: &str = "cluster.yaml"; pub const CLUSTER_GRAPHS_DIR: &str = "graphs"; pub const CLUSTER_STATE_DIR: &str = "__cluster"; @@ -666,25 +669,6 @@ struct SweepOutcome { consumed_approvals: Vec, } -#[derive(Debug)] -struct LocalStateBackend { - state_dir: PathBuf, - state_path: PathBuf, - lock_path: PathBuf, - recoveries_dir: PathBuf, - approvals_dir: PathBuf, -} - -#[derive(Debug)] -struct StateSnapshot { - state: Option, - state_cas: Option, -} - -#[derive(Debug)] -struct StateLockGuard { - path: PathBuf, -} pub fn validate_config_dir(config_dir: impl AsRef) -> ValidateOutput { let outcome = load_desired(config_dir.as_ref()); @@ -2436,540 +2420,6 @@ fn validate_cluster_header( } } -impl LocalStateBackend { - fn new(config_dir: &Path) -> Self { - let state_dir = config_dir.join(CLUSTER_STATE_DIR); - Self { - state_path: config_dir.join(CLUSTER_STATE_FILE), - lock_path: config_dir.join(CLUSTER_LOCK_FILE), - recoveries_dir: config_dir.join(CLUSTER_RECOVERIES_DIR), - approvals_dir: config_dir.join(CLUSTER_APPROVALS_DIR), - state_dir, - } - } - - /// List approval artifacts in ULID (filename) order; unparseable files - /// warn and stay on disk for the operator. - fn list_approval_artifacts( - &self, - diagnostics: &mut Vec, - ) -> Vec<(PathBuf, ApprovalArtifact)> { - let mut paths = Vec::new(); - match fs::read_dir(&self.approvals_dir) { - Ok(entries) => { - for entry in entries.flatten() { - let path = entry.path(); - if path.extension().is_some_and(|ext| ext == "json") { - paths.push(path); - } - } - } - Err(err) if err.kind() == ErrorKind::NotFound => {} - Err(err) => diagnostics.push(Diagnostic::warning( - "approval_read_error", - CLUSTER_APPROVALS_DIR, - format!("could not list approval artifacts: {err}"), - )), - } - paths.sort(); - let mut artifacts = Vec::new(); - for path in paths { - match fs::read_to_string(&path) - .map_err(|err| err.to_string()) - .and_then(|text| { - serde_json::from_str::(&text).map_err(|err| err.to_string()) - }) { - Ok(artifact) if artifact.schema_version == 1 => artifacts.push((path, artifact)), - Ok(artifact) => diagnostics.push(Diagnostic::warning( - "unsupported_approval_version", - display_path(&path), - format!( - "unsupported approval artifact version {}; leaving it in place", - artifact.schema_version - ), - )), - Err(err) => diagnostics.push(Diagnostic::warning( - "invalid_approval_artifact", - display_path(&path), - format!("could not parse approval artifact ({err}); leaving it in place"), - )), - } - } - artifacts - } - - /// Atomically write (or rewrite, e.g. on consumption) an approval artifact. - fn write_approval_artifact(&self, artifact: &ApprovalArtifact) -> Result { - fs::create_dir_all(&self.approvals_dir).map_err(|err| { - Diagnostic::error( - "approval_write_error", - CLUSTER_APPROVALS_DIR, - format!("could not create approvals directory: {err}"), - ) - })?; - let target = self - .approvals_dir - .join(format!("{}.json", artifact.approval_id)); - let mut payload = serde_json::to_string_pretty(artifact).map_err(|err| { - Diagnostic::error( - "approval_write_error", - display_path(&target), - format!("could not encode approval artifact: {err}"), - ) - })?; - payload.push('\n'); - let tmp_path = self - .approvals_dir - .join(format!("{}.json.tmp.{}", artifact.approval_id, Ulid::new())); - fs::write(&tmp_path, payload.as_bytes()).map_err(|err| { - Diagnostic::error( - "approval_write_error", - display_path(&tmp_path), - format!("could not write approval artifact: {err}"), - ) - })?; - if let Err(err) = fs::rename(&tmp_path, &target) { - let _ = fs::remove_file(&tmp_path); - return Err(Diagnostic::error( - "approval_write_error", - display_path(&target), - format!("could not move approval artifact into place: {err}"), - )); - } - Ok(target) - } - - /// List recovery sidecars in ULID (filename) order. Unparseable files are - /// reported as warnings and skipped — they stay on disk for the operator. - fn list_recovery_sidecars( - &self, - diagnostics: &mut Vec, - ) -> Vec<(PathBuf, RecoverySidecar)> { - let mut paths = Vec::new(); - match fs::read_dir(&self.recoveries_dir) { - Ok(entries) => { - for entry in entries.flatten() { - let path = entry.path(); - if path.extension().is_some_and(|ext| ext == "json") { - paths.push(path); - } - } - } - Err(err) if err.kind() == ErrorKind::NotFound => {} - Err(err) => { - diagnostics.push(Diagnostic::warning( - "recovery_sidecar_read_error", - CLUSTER_RECOVERIES_DIR, - format!("could not list recovery sidecars: {err}"), - )); - } - } - paths.sort(); - let mut sidecars = Vec::new(); - for path in paths { - match fs::read_to_string(&path) - .map_err(|err| err.to_string()) - .and_then(|text| { - serde_json::from_str::(&text).map_err(|err| err.to_string()) - }) { - Ok(sidecar) if sidecar.schema_version == 1 => sidecars.push((path, sidecar)), - Ok(sidecar) => diagnostics.push(Diagnostic::warning( - "unsupported_recovery_sidecar_version", - display_path(&path), - format!( - "unsupported recovery sidecar version {}; leaving it in place", - sidecar.schema_version - ), - )), - Err(err) => diagnostics.push(Diagnostic::warning( - "invalid_recovery_sidecar", - display_path(&path), - format!("could not parse recovery sidecar ({err}); leaving it in place"), - )), - } - } - sidecars - } - - /// Atomically write (or rewrite) a recovery sidecar; returns its path. - fn write_recovery_sidecar(&self, sidecar: &RecoverySidecar) -> Result { - fs::create_dir_all(&self.recoveries_dir).map_err(|err| { - Diagnostic::error( - "recovery_sidecar_write_error", - CLUSTER_RECOVERIES_DIR, - format!("could not create recoveries directory: {err}"), - ) - })?; - let target = self - .recoveries_dir - .join(format!("{}.json", sidecar.operation_id)); - let mut payload = serde_json::to_string_pretty(sidecar).map_err(|err| { - Diagnostic::error( - "recovery_sidecar_write_error", - display_path(&target), - format!("could not encode recovery sidecar: {err}"), - ) - })?; - payload.push('\n'); - let tmp_path = self - .recoveries_dir - .join(format!("{}.json.tmp.{}", sidecar.operation_id, Ulid::new())); - fs::write(&tmp_path, payload.as_bytes()).map_err(|err| { - Diagnostic::error( - "recovery_sidecar_write_error", - display_path(&tmp_path), - format!("could not write recovery sidecar: {err}"), - ) - })?; - if let Err(err) = fs::rename(&tmp_path, &target) { - let _ = fs::remove_file(&tmp_path); - return Err(Diagnostic::error( - "recovery_sidecar_write_error", - display_path(&target), - format!("could not move recovery sidecar into place: {err}"), - )); - } - Ok(target) - } - - fn observations(&self) -> StateObservations { - StateObservations { - state_path: display_path(&self.state_path), - lock_path: display_path(&self.lock_path), - state_found: false, - applied_config_digest: None, - state_revision: 0, - state_cas: None, - resource_count: 0, - locked: false, - lock_id: None, - lock_acquired: false, - acquired_lock_id: None, - lock_operation: None, - lock_created_at: None, - lock_pid: None, - lock_age_seconds: None, - } - } - - fn read_state( - &self, - observations: &mut StateObservations, - ) -> Result { - let text = match fs::read_to_string(&self.state_path) { - Ok(text) => text, - Err(err) if err.kind() == ErrorKind::NotFound => { - return Ok(StateSnapshot { - state: None, - state_cas: None, - }); - } - Err(err) => { - return Err(Diagnostic::error( - "state_read_error", - CLUSTER_STATE_FILE, - format!("could not read state file: {err}"), - )); - } - }; - - observations.state_found = true; - let state_cas = format!("sha256:{}", sha256_hex(text.as_bytes())); - observations.state_cas = Some(state_cas.clone()); - - let state = serde_json::from_str::(&text).map_err(|err| { - Diagnostic::error( - "invalid_state_json", - CLUSTER_STATE_FILE, - format!("could not parse state JSON: {err}"), - ) - })?; - - if state.version != 1 { - return Err(Diagnostic::error( - "unsupported_state_version", - "state.version", - format!( - "unsupported cluster state version {}; this build supports version 1", - state.version - ), - )); - } - - observations.applied_config_digest = state.applied_revision.config_digest.clone(); - observations.state_revision = state.state_revision; - observations.resource_count = state.applied_revision.resources.len(); - - Ok(StateSnapshot { - state: Some(state), - state_cas: Some(state_cas), - }) - } - - fn write_state( - &self, - state: &ClusterState, - expected_cas: Option<&str>, - observations: &mut StateObservations, - ) -> Result<(), Diagnostic> { - fs::create_dir_all(&self.state_dir).map_err(|err| { - Diagnostic::error( - "state_write_error", - CLUSTER_STATE_DIR, - format!("could not create cluster state directory: {err}"), - ) - })?; - - let current_cas = self.current_state_cas()?; - if current_cas.as_deref() != expected_cas { - return Err(Diagnostic::error( - "state_cas_mismatch", - CLUSTER_STATE_FILE, - "state.json changed while the command was running; re-run the command against the latest state", - )); - } - - let mut payload = serde_json::to_string_pretty(state).map_err(|err| { - Diagnostic::error( - "state_write_error", - CLUSTER_STATE_FILE, - format!("could not encode state JSON: {err}"), - ) - })?; - payload.push('\n'); - - let tmp_path = self - .state_dir - .join(format!("state.json.tmp.{}", Ulid::new())); - let mut file = OpenOptions::new() - .write(true) - .create_new(true) - .open(&tmp_path) - .map_err(|err| { - Diagnostic::error( - "state_write_error", - display_path(&tmp_path), - format!("could not create temporary state file: {err}"), - ) - })?; - file.write_all(payload.as_bytes()).map_err(|err| { - Diagnostic::error( - "state_write_error", - display_path(&tmp_path), - format!("could not write temporary state file: {err}"), - ) - })?; - file.sync_all().map_err(|err| { - Diagnostic::error( - "state_write_error", - display_path(&tmp_path), - format!("could not sync temporary state file: {err}"), - ) - })?; - drop(file); - - if let Err(err) = fs::rename(&tmp_path, &self.state_path) { - let _ = fs::remove_file(&tmp_path); - return Err(Diagnostic::error( - "state_write_error", - CLUSTER_STATE_FILE, - format!("could not replace state.json atomically: {err}"), - )); - } - - let written = fs::read_to_string(&self.state_path).map_err(|err| { - Diagnostic::error( - "state_write_error", - CLUSTER_STATE_FILE, - format!("could not read state.json after write: {err}"), - ) - })?; - observations.state_found = true; - observations.applied_config_digest = state.applied_revision.config_digest.clone(); - observations.state_revision = state.state_revision; - observations.state_cas = Some(format!("sha256:{}", sha256_hex(written.as_bytes()))); - observations.resource_count = state.applied_revision.resources.len(); - - Ok(()) - } - - fn current_state_cas(&self) -> Result, Diagnostic> { - match fs::read(&self.state_path) { - Ok(bytes) => Ok(Some(format!("sha256:{}", sha256_hex(&bytes)))), - Err(err) if err.kind() == ErrorKind::NotFound => Ok(None), - Err(err) => Err(Diagnostic::error( - "state_read_error", - CLUSTER_STATE_FILE, - format!("could not read state file for CAS check: {err}"), - )), - } - } - - fn acquire_lock( - &self, - operation: &str, - observations: &mut StateObservations, - ) -> Result { - fs::create_dir_all(&self.state_dir).map_err(|err| { - Diagnostic::error( - "state_lock_error", - CLUSTER_STATE_DIR, - format!("could not create cluster state directory: {err}"), - ) - })?; - - let lock_id = Ulid::new().to_string(); - let lock = StateLockFile { - version: 1, - lock_id: lock_id.clone(), - operation: operation.to_string(), - created_at: OffsetDateTime::now_utc() - .format(&Rfc3339) - .unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string()), - pid: process::id(), - }; - let payload = serde_json::to_string_pretty(&lock).map_err(|err| { - Diagnostic::error( - "state_lock_error", - CLUSTER_LOCK_FILE, - format!("could not encode state lock: {err}"), - ) - })?; - - match OpenOptions::new() - .write(true) - .create_new(true) - .open(&self.lock_path) - { - Ok(mut file) => { - if let Err(err) = file.write_all(payload.as_bytes()) { - // No guard exists yet, so clean up the create-new file here - // instead of leaving a stale partial lock for the next run. - drop(file); - let _ = fs::remove_file(&self.lock_path); - return Err(Diagnostic::error( - "state_lock_error", - CLUSTER_LOCK_FILE, - format!("could not write state lock: {err}"), - )); - } - observations.lock_acquired = true; - observations.acquired_lock_id = Some(lock_id.clone()); - Ok(StateLockGuard { - path: self.lock_path.clone(), - }) - } - Err(err) if err.kind() == ErrorKind::AlreadyExists => { - self.observe_lock_metadata_lossy(observations); - Err(Diagnostic::error( - "state_lock_held", - CLUSTER_LOCK_FILE, - state_lock_held_message(observations), - )) - } - Err(err) => Err(Diagnostic::error( - "state_lock_error", - CLUSTER_LOCK_FILE, - format!("could not acquire state lock: {err}"), - )), - } - } - - fn force_unlock( - &self, - requested_lock_id: &str, - observations: &mut StateObservations, - ) -> Result<(), Diagnostic> { - let text = match fs::read_to_string(&self.lock_path) { - Ok(text) => text, - Err(err) if err.kind() == ErrorKind::NotFound => { - return Err(Diagnostic::error( - "state_lock_missing", - CLUSTER_LOCK_FILE, - "cluster state lock is not present; nothing was unlocked", - )); - } - Err(err) => { - return Err(Diagnostic::error( - "state_lock_read_error", - CLUSTER_LOCK_FILE, - format!("could not read state lock: {err}"), - )); - } - }; - observations.locked = true; - let lock = parse_lock_file_for_unlock(&text)?; - observations.observe_lock_metadata(&lock); - - if lock.lock_id != requested_lock_id { - return Err(Diagnostic::error( - "state_lock_id_mismatch", - CLUSTER_LOCK_FILE, - format!( - "cluster state lock id is {}; refusing to unlock with requested id {requested_lock_id}", - lock.lock_id - ), - )); - } - - fs::remove_file(&self.lock_path).map_err(|err| { - Diagnostic::error( - "state_unlock_error", - CLUSTER_LOCK_FILE, - format!("could not remove state lock: {err}"), - ) - }) - } - - fn observe_lock( - &self, - observations: &mut StateObservations, - diagnostics: &mut Vec, - ) { - if self.lock_path.exists() { - observations.locked = true; - match fs::read_to_string(&self.lock_path) { - Ok(text) => match serde_json::from_str::(&text) { - Ok(lock) if lock.version == 1 => { - observations.observe_lock_metadata(&lock); - } - Ok(lock) => diagnostics.push(Diagnostic::warning( - "unsupported_state_lock_version", - CLUSTER_LOCK_FILE, - format!("unsupported cluster state lock version {}", lock.version), - )), - Err(err) => diagnostics.push(Diagnostic::warning( - "invalid_state_lock", - CLUSTER_LOCK_FILE, - format!("could not parse state lock: {err}"), - )), - }, - Err(err) => diagnostics.push(Diagnostic::warning( - "state_lock_read_error", - CLUSTER_LOCK_FILE, - format!("could not read state lock: {err}"), - )), - } - } - } - - fn observe_lock_metadata_lossy(&self, observations: &mut StateObservations) { - observations.locked = true; - if let Ok(text) = fs::read_to_string(&self.lock_path) { - if let Ok(lock) = serde_json::from_str::(&text) { - if lock.version == 1 { - observations.observe_lock_metadata(&lock); - } - } - } - } -} - -impl Drop for StateLockGuard { - fn drop(&mut self) { - let _ = fs::remove_file(&self.path); - } -} fn parse_lock_file_for_unlock(text: &str) -> Result { let lock = serde_json::from_str::(text).map_err(|err| { diff --git a/crates/omnigraph-cluster/src/store.rs b/crates/omnigraph-cluster/src/store.rs new file mode 100644 index 0000000..f378660 --- /dev/null +++ b/crates/omnigraph-cluster/src/store.rs @@ -0,0 +1,561 @@ +//! The cluster's storage backend: state ledger, lock, recovery +//! sidecars, approval artifacts (moved verbatim from lib.rs in the +//! modularization). The object-storage port (RFC-006) lands here as a +//! follow-up — this module is the single home for stored-state I/O. + +use super::*; + +#[derive(Debug)] +pub(crate) struct LocalStateBackend { + state_dir: PathBuf, + state_path: PathBuf, + lock_path: PathBuf, + recoveries_dir: PathBuf, + approvals_dir: PathBuf, +} + +#[derive(Debug)] +pub(crate) struct StateSnapshot { + pub(crate) state: Option, + pub(crate) state_cas: Option, +} + +#[derive(Debug)] +pub(crate) struct StateLockGuard { + path: PathBuf, +} + +impl LocalStateBackend { + pub(crate) fn new(config_dir: &Path) -> Self { + let state_dir = config_dir.join(CLUSTER_STATE_DIR); + Self { + state_path: config_dir.join(CLUSTER_STATE_FILE), + lock_path: config_dir.join(CLUSTER_LOCK_FILE), + recoveries_dir: config_dir.join(CLUSTER_RECOVERIES_DIR), + approvals_dir: config_dir.join(CLUSTER_APPROVALS_DIR), + state_dir, + } + } + + /// List approval artifacts in ULID (filename) order; unparseable files + /// warn and stay on disk for the operator. + pub(crate) fn list_approval_artifacts( + &self, + diagnostics: &mut Vec, + ) -> Vec<(PathBuf, ApprovalArtifact)> { + let mut paths = Vec::new(); + match fs::read_dir(&self.approvals_dir) { + Ok(entries) => { + for entry in entries.flatten() { + let path = entry.path(); + if path.extension().is_some_and(|ext| ext == "json") { + paths.push(path); + } + } + } + Err(err) if err.kind() == ErrorKind::NotFound => {} + Err(err) => diagnostics.push(Diagnostic::warning( + "approval_read_error", + CLUSTER_APPROVALS_DIR, + format!("could not list approval artifacts: {err}"), + )), + } + paths.sort(); + let mut artifacts = Vec::new(); + for path in paths { + match fs::read_to_string(&path) + .map_err(|err| err.to_string()) + .and_then(|text| { + serde_json::from_str::(&text).map_err(|err| err.to_string()) + }) { + Ok(artifact) if artifact.schema_version == 1 => artifacts.push((path, artifact)), + Ok(artifact) => diagnostics.push(Diagnostic::warning( + "unsupported_approval_version", + display_path(&path), + format!( + "unsupported approval artifact version {}; leaving it in place", + artifact.schema_version + ), + )), + Err(err) => diagnostics.push(Diagnostic::warning( + "invalid_approval_artifact", + display_path(&path), + format!("could not parse approval artifact ({err}); leaving it in place"), + )), + } + } + artifacts + } + + /// Atomically write (or rewrite, e.g. on consumption) an approval artifact. + pub(crate) fn write_approval_artifact(&self, artifact: &ApprovalArtifact) -> Result { + fs::create_dir_all(&self.approvals_dir).map_err(|err| { + Diagnostic::error( + "approval_write_error", + CLUSTER_APPROVALS_DIR, + format!("could not create approvals directory: {err}"), + ) + })?; + let target = self + .approvals_dir + .join(format!("{}.json", artifact.approval_id)); + let mut payload = serde_json::to_string_pretty(artifact).map_err(|err| { + Diagnostic::error( + "approval_write_error", + display_path(&target), + format!("could not encode approval artifact: {err}"), + ) + })?; + payload.push('\n'); + let tmp_path = self + .approvals_dir + .join(format!("{}.json.tmp.{}", artifact.approval_id, Ulid::new())); + fs::write(&tmp_path, payload.as_bytes()).map_err(|err| { + Diagnostic::error( + "approval_write_error", + display_path(&tmp_path), + format!("could not write approval artifact: {err}"), + ) + })?; + if let Err(err) = fs::rename(&tmp_path, &target) { + let _ = fs::remove_file(&tmp_path); + return Err(Diagnostic::error( + "approval_write_error", + display_path(&target), + format!("could not move approval artifact into place: {err}"), + )); + } + Ok(target) + } + + /// List recovery sidecars in ULID (filename) order. Unparseable files are + /// reported as warnings and skipped — they stay on disk for the operator. + pub(crate) fn list_recovery_sidecars( + &self, + diagnostics: &mut Vec, + ) -> Vec<(PathBuf, RecoverySidecar)> { + let mut paths = Vec::new(); + match fs::read_dir(&self.recoveries_dir) { + Ok(entries) => { + for entry in entries.flatten() { + let path = entry.path(); + if path.extension().is_some_and(|ext| ext == "json") { + paths.push(path); + } + } + } + Err(err) if err.kind() == ErrorKind::NotFound => {} + Err(err) => { + diagnostics.push(Diagnostic::warning( + "recovery_sidecar_read_error", + CLUSTER_RECOVERIES_DIR, + format!("could not list recovery sidecars: {err}"), + )); + } + } + paths.sort(); + let mut sidecars = Vec::new(); + for path in paths { + match fs::read_to_string(&path) + .map_err(|err| err.to_string()) + .and_then(|text| { + serde_json::from_str::(&text).map_err(|err| err.to_string()) + }) { + Ok(sidecar) if sidecar.schema_version == 1 => sidecars.push((path, sidecar)), + Ok(sidecar) => diagnostics.push(Diagnostic::warning( + "unsupported_recovery_sidecar_version", + display_path(&path), + format!( + "unsupported recovery sidecar version {}; leaving it in place", + sidecar.schema_version + ), + )), + Err(err) => diagnostics.push(Diagnostic::warning( + "invalid_recovery_sidecar", + display_path(&path), + format!("could not parse recovery sidecar ({err}); leaving it in place"), + )), + } + } + sidecars + } + + /// Atomically write (or rewrite) a recovery sidecar; returns its path. + pub(crate) fn write_recovery_sidecar(&self, sidecar: &RecoverySidecar) -> Result { + fs::create_dir_all(&self.recoveries_dir).map_err(|err| { + Diagnostic::error( + "recovery_sidecar_write_error", + CLUSTER_RECOVERIES_DIR, + format!("could not create recoveries directory: {err}"), + ) + })?; + let target = self + .recoveries_dir + .join(format!("{}.json", sidecar.operation_id)); + let mut payload = serde_json::to_string_pretty(sidecar).map_err(|err| { + Diagnostic::error( + "recovery_sidecar_write_error", + display_path(&target), + format!("could not encode recovery sidecar: {err}"), + ) + })?; + payload.push('\n'); + let tmp_path = self + .recoveries_dir + .join(format!("{}.json.tmp.{}", sidecar.operation_id, Ulid::new())); + fs::write(&tmp_path, payload.as_bytes()).map_err(|err| { + Diagnostic::error( + "recovery_sidecar_write_error", + display_path(&tmp_path), + format!("could not write recovery sidecar: {err}"), + ) + })?; + if let Err(err) = fs::rename(&tmp_path, &target) { + let _ = fs::remove_file(&tmp_path); + return Err(Diagnostic::error( + "recovery_sidecar_write_error", + display_path(&target), + format!("could not move recovery sidecar into place: {err}"), + )); + } + Ok(target) + } + + pub(crate) fn observations(&self) -> StateObservations { + StateObservations { + state_path: display_path(&self.state_path), + lock_path: display_path(&self.lock_path), + state_found: false, + applied_config_digest: None, + state_revision: 0, + state_cas: None, + resource_count: 0, + locked: false, + lock_id: None, + lock_acquired: false, + acquired_lock_id: None, + lock_operation: None, + lock_created_at: None, + lock_pid: None, + lock_age_seconds: None, + } + } + + pub(crate) fn read_state( + &self, + observations: &mut StateObservations, + ) -> Result { + let text = match fs::read_to_string(&self.state_path) { + Ok(text) => text, + Err(err) if err.kind() == ErrorKind::NotFound => { + return Ok(StateSnapshot { + state: None, + state_cas: None, + }); + } + Err(err) => { + return Err(Diagnostic::error( + "state_read_error", + CLUSTER_STATE_FILE, + format!("could not read state file: {err}"), + )); + } + }; + + observations.state_found = true; + let state_cas = format!("sha256:{}", sha256_hex(text.as_bytes())); + observations.state_cas = Some(state_cas.clone()); + + let state = serde_json::from_str::(&text).map_err(|err| { + Diagnostic::error( + "invalid_state_json", + CLUSTER_STATE_FILE, + format!("could not parse state JSON: {err}"), + ) + })?; + + if state.version != 1 { + return Err(Diagnostic::error( + "unsupported_state_version", + "state.version", + format!( + "unsupported cluster state version {}; this build supports version 1", + state.version + ), + )); + } + + observations.applied_config_digest = state.applied_revision.config_digest.clone(); + observations.state_revision = state.state_revision; + observations.resource_count = state.applied_revision.resources.len(); + + Ok(StateSnapshot { + state: Some(state), + state_cas: Some(state_cas), + }) + } + + pub(crate) fn write_state( + &self, + state: &ClusterState, + expected_cas: Option<&str>, + observations: &mut StateObservations, + ) -> Result<(), Diagnostic> { + fs::create_dir_all(&self.state_dir).map_err(|err| { + Diagnostic::error( + "state_write_error", + CLUSTER_STATE_DIR, + format!("could not create cluster state directory: {err}"), + ) + })?; + + let current_cas = self.current_state_cas()?; + if current_cas.as_deref() != expected_cas { + return Err(Diagnostic::error( + "state_cas_mismatch", + CLUSTER_STATE_FILE, + "state.json changed while the command was running; re-run the command against the latest state", + )); + } + + let mut payload = serde_json::to_string_pretty(state).map_err(|err| { + Diagnostic::error( + "state_write_error", + CLUSTER_STATE_FILE, + format!("could not encode state JSON: {err}"), + ) + })?; + payload.push('\n'); + + let tmp_path = self + .state_dir + .join(format!("state.json.tmp.{}", Ulid::new())); + let mut file = OpenOptions::new() + .write(true) + .create_new(true) + .open(&tmp_path) + .map_err(|err| { + Diagnostic::error( + "state_write_error", + display_path(&tmp_path), + format!("could not create temporary state file: {err}"), + ) + })?; + file.write_all(payload.as_bytes()).map_err(|err| { + Diagnostic::error( + "state_write_error", + display_path(&tmp_path), + format!("could not write temporary state file: {err}"), + ) + })?; + file.sync_all().map_err(|err| { + Diagnostic::error( + "state_write_error", + display_path(&tmp_path), + format!("could not sync temporary state file: {err}"), + ) + })?; + drop(file); + + if let Err(err) = fs::rename(&tmp_path, &self.state_path) { + let _ = fs::remove_file(&tmp_path); + return Err(Diagnostic::error( + "state_write_error", + CLUSTER_STATE_FILE, + format!("could not replace state.json atomically: {err}"), + )); + } + + let written = fs::read_to_string(&self.state_path).map_err(|err| { + Diagnostic::error( + "state_write_error", + CLUSTER_STATE_FILE, + format!("could not read state.json after write: {err}"), + ) + })?; + observations.state_found = true; + observations.applied_config_digest = state.applied_revision.config_digest.clone(); + observations.state_revision = state.state_revision; + observations.state_cas = Some(format!("sha256:{}", sha256_hex(written.as_bytes()))); + observations.resource_count = state.applied_revision.resources.len(); + + Ok(()) + } + + pub(crate) fn current_state_cas(&self) -> Result, Diagnostic> { + match fs::read(&self.state_path) { + Ok(bytes) => Ok(Some(format!("sha256:{}", sha256_hex(&bytes)))), + Err(err) if err.kind() == ErrorKind::NotFound => Ok(None), + Err(err) => Err(Diagnostic::error( + "state_read_error", + CLUSTER_STATE_FILE, + format!("could not read state file for CAS check: {err}"), + )), + } + } + + pub(crate) fn acquire_lock( + &self, + operation: &str, + observations: &mut StateObservations, + ) -> Result { + fs::create_dir_all(&self.state_dir).map_err(|err| { + Diagnostic::error( + "state_lock_error", + CLUSTER_STATE_DIR, + format!("could not create cluster state directory: {err}"), + ) + })?; + + let lock_id = Ulid::new().to_string(); + let lock = StateLockFile { + version: 1, + lock_id: lock_id.clone(), + operation: operation.to_string(), + created_at: OffsetDateTime::now_utc() + .format(&Rfc3339) + .unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string()), + pid: process::id(), + }; + let payload = serde_json::to_string_pretty(&lock).map_err(|err| { + Diagnostic::error( + "state_lock_error", + CLUSTER_LOCK_FILE, + format!("could not encode state lock: {err}"), + ) + })?; + + match OpenOptions::new() + .write(true) + .create_new(true) + .open(&self.lock_path) + { + Ok(mut file) => { + if let Err(err) = file.write_all(payload.as_bytes()) { + // No guard exists yet, so clean up the create-new file here + // instead of leaving a stale partial lock for the next run. + drop(file); + let _ = fs::remove_file(&self.lock_path); + return Err(Diagnostic::error( + "state_lock_error", + CLUSTER_LOCK_FILE, + format!("could not write state lock: {err}"), + )); + } + observations.lock_acquired = true; + observations.acquired_lock_id = Some(lock_id.clone()); + Ok(StateLockGuard { + path: self.lock_path.clone(), + }) + } + Err(err) if err.kind() == ErrorKind::AlreadyExists => { + self.observe_lock_metadata_lossy(observations); + Err(Diagnostic::error( + "state_lock_held", + CLUSTER_LOCK_FILE, + state_lock_held_message(observations), + )) + } + Err(err) => Err(Diagnostic::error( + "state_lock_error", + CLUSTER_LOCK_FILE, + format!("could not acquire state lock: {err}"), + )), + } + } + + pub(crate) fn force_unlock( + &self, + requested_lock_id: &str, + observations: &mut StateObservations, + ) -> Result<(), Diagnostic> { + let text = match fs::read_to_string(&self.lock_path) { + Ok(text) => text, + Err(err) if err.kind() == ErrorKind::NotFound => { + return Err(Diagnostic::error( + "state_lock_missing", + CLUSTER_LOCK_FILE, + "cluster state lock is not present; nothing was unlocked", + )); + } + Err(err) => { + return Err(Diagnostic::error( + "state_lock_read_error", + CLUSTER_LOCK_FILE, + format!("could not read state lock: {err}"), + )); + } + }; + observations.locked = true; + let lock = parse_lock_file_for_unlock(&text)?; + observations.observe_lock_metadata(&lock); + + if lock.lock_id != requested_lock_id { + return Err(Diagnostic::error( + "state_lock_id_mismatch", + CLUSTER_LOCK_FILE, + format!( + "cluster state lock id is {}; refusing to unlock with requested id {requested_lock_id}", + lock.lock_id + ), + )); + } + + fs::remove_file(&self.lock_path).map_err(|err| { + Diagnostic::error( + "state_unlock_error", + CLUSTER_LOCK_FILE, + format!("could not remove state lock: {err}"), + ) + }) + } + + pub(crate) fn observe_lock( + &self, + observations: &mut StateObservations, + diagnostics: &mut Vec, + ) { + if self.lock_path.exists() { + observations.locked = true; + match fs::read_to_string(&self.lock_path) { + Ok(text) => match serde_json::from_str::(&text) { + Ok(lock) if lock.version == 1 => { + observations.observe_lock_metadata(&lock); + } + Ok(lock) => diagnostics.push(Diagnostic::warning( + "unsupported_state_lock_version", + CLUSTER_LOCK_FILE, + format!("unsupported cluster state lock version {}", lock.version), + )), + Err(err) => diagnostics.push(Diagnostic::warning( + "invalid_state_lock", + CLUSTER_LOCK_FILE, + format!("could not parse state lock: {err}"), + )), + }, + Err(err) => diagnostics.push(Diagnostic::warning( + "state_lock_read_error", + CLUSTER_LOCK_FILE, + format!("could not read state lock: {err}"), + )), + } + } + } + + pub(crate) fn observe_lock_metadata_lossy(&self, observations: &mut StateObservations) { + observations.locked = true; + if let Ok(text) = fs::read_to_string(&self.lock_path) { + if let Ok(lock) = serde_json::from_str::(&text) { + if lock.version == 1 { + observations.observe_lock_metadata(&lock); + } + } + } + } +} + +impl Drop for StateLockGuard { + fn drop(&mut self) { + let _ = fs::remove_file(&self.path); + } +} From 00fc5cf5378f8018023bf362d1aa3119c526c2a8 Mon Sep 17 00:00:00 2001 From: aaltshuler Date: Thu, 11 Jun 2026 05:29:44 +0300 Subject: [PATCH 3/7] refactor(cluster): move the serving snapshot to serve.rs Verbatim move of the Serving* types, read_serving_snapshot, and read_verified_payload; public re-exports preserved (the server's imports are unchanged). 95 tests green. Co-Authored-By: Claude Fable 5 --- crates/omnigraph-cluster/src/lib.rs | 187 +------------------------ crates/omnigraph-cluster/src/serve.rs | 189 ++++++++++++++++++++++++++ 2 files changed, 192 insertions(+), 184 deletions(-) create mode 100644 crates/omnigraph-cluster/src/serve.rs diff --git a/crates/omnigraph-cluster/src/lib.rs b/crates/omnigraph-cluster/src/lib.rs index 17dd8a6..f26110e 100644 --- a/crates/omnigraph-cluster/src/lib.rs +++ b/crates/omnigraph-cluster/src/lib.rs @@ -19,8 +19,11 @@ use ulid::Ulid; pub mod failpoints; +mod serve; mod store; use store::{LocalStateBackend, StateLockGuard, StateSnapshot}; +pub use serve::{ServingGraph, ServingPolicy, ServingQuery, ServingSnapshot, read_serving_snapshot}; +use serve::read_verified_payload; pub const CLUSTER_CONFIG_FILE: &str = "cluster.yaml"; pub const CLUSTER_GRAPHS_DIR: &str = "graphs"; @@ -1817,190 +1820,6 @@ pub async fn approve_config_dir( } } -/// One graph in a serving snapshot: its id and on-disk root. -#[derive(Debug, Clone)] -pub struct ServingGraph { - pub graph_id: String, - pub root: PathBuf, -} - -/// One stored query: its graph binding, registry name, and verified source. -#[derive(Debug, Clone)] -pub struct ServingQuery { - pub graph_id: String, - pub name: String, - pub source: String, -} - -/// One policy bundle: its verified catalog blob path and applied bindings -/// (normalized typed refs: `cluster` | `graph.`). -#[derive(Debug, Clone)] -pub struct ServingPolicy { - pub name: String, - pub blob_path: PathBuf, - pub applies_to: Vec, -} - -/// Everything a server needs to boot from the cluster catalog (RFC-005 §D2). -#[derive(Debug, Clone)] -pub struct ServingSnapshot { - pub graphs: Vec, - pub queries: Vec, - pub policies: Vec, -} - -/// Read the applied revision as a serving snapshot — the read-only loader for -/// the Phase-5 server boot. All-or-nothing per RFC-005 §D4: every readiness -/// failure is collected and the whole snapshot refused; no partial serving. -/// Takes no lock: the state file is replaced atomically, so this reads a -/// consistent point-in-time ledger. -pub fn read_serving_snapshot(config_dir: impl AsRef) -> Result> { - let config_dir = config_dir.as_ref().to_path_buf(); - let backend = LocalStateBackend::new(&config_dir); - let mut diagnostics: Vec = Vec::new(); - - // A ledger a sweep is about to rewrite must not start serving. - let sidecars = backend.list_recovery_sidecars(&mut diagnostics); - if !sidecars.is_empty() { - diagnostics.push(Diagnostic::error( - "cluster_recovery_pending", - CLUSTER_RECOVERIES_DIR, - format!( - "{} interrupted operation(s) await recovery; run any state-mutating cluster command (e.g. `cluster apply`) to sweep, then retry", - sidecars.len() - ), - )); - } - - let mut observations = backend.observations(); - let state = match backend.read_state(&mut observations) { - Ok(snapshot) => match snapshot.state { - Some(state) => Some(state), - None => { - diagnostics.push(Diagnostic::error( - "cluster_state_missing", - CLUSTER_STATE_FILE, - "no cluster state ledger; run `cluster import` and `cluster apply` first", - )); - None - } - }, - Err(diagnostic) => { - diagnostics.push(diagnostic); - None - } - }; - let Some(state) = state else { - return Err(diagnostics); - }; - - let mut graphs = Vec::new(); - let mut queries = Vec::new(); - let mut policies = Vec::new(); - for (address, entry) in &state.applied_revision.resources { - match resource_kind(address) { - ResourceKind::Graph(graph_id) => { - graphs.push(ServingGraph { - root: config_dir - .join(CLUSTER_GRAPHS_DIR) - .join(format!("{graph_id}.omni")), - graph_id, - }); - } - ResourceKind::Schema(_) => {} - kind @ ResourceKind::Query { .. } => { - let ResourceKind::Query { graph, name } = &kind else { - unreachable!() - }; - match read_verified_payload(&config_dir, &kind, &entry.digest, address) { - Ok(source) => queries.push(ServingQuery { - graph_id: graph.clone(), - name: name.clone(), - source, - }), - Err(diagnostic) => diagnostics.push(diagnostic), - } - } - kind @ ResourceKind::Policy(_) => { - let ResourceKind::Policy(name) = &kind else { - unreachable!() - }; - let Some(applies_to) = entry.applies_to.clone() else { - diagnostics.push(Diagnostic::error( - "policy_bindings_missing", - address.clone(), - "no applied applies_to bindings recorded (ledger predates binding metadata); re-run `cluster apply` to backfill", - )); - continue; - }; - match read_verified_payload(&config_dir, &kind, &entry.digest, address) { - Ok(_) => policies.push(ServingPolicy { - name: name.clone(), - blob_path: payload_path(&config_dir, &kind, &entry.digest) - .expect("policy kind always has a payload path"), - applies_to, - }), - Err(diagnostic) => diagnostics.push(diagnostic), - } - } - ResourceKind::Unknown => {} - } - } - - if graphs.is_empty() { - diagnostics.push(Diagnostic::error( - "cluster_empty", - CLUSTER_STATE_FILE, - "the applied revision records no graphs; apply a cluster with at least one graph before serving from it", - )); - } - if has_errors(&diagnostics) { - return Err(diagnostics); - } - Ok(ServingSnapshot { - graphs, - queries, - policies, - }) -} - -/// Read a catalog blob and verify it against the recorded digest. -fn read_verified_payload( - config_dir: &Path, - kind: &ResourceKind, - digest: &str, - address: &str, -) -> Result { - let path = payload_path(config_dir, kind, digest) - .expect("query/policy kinds always have a payload path"); - let bytes = fs::read(&path).map_err(|err| { - Diagnostic::error( - "catalog_payload_missing", - address, - format!( - "catalog blob '{}' unreadable ({err}); run `cluster refresh` then `cluster apply`, and restart", - display_path(&path) - ), - ) - })?; - if sha256_hex(&bytes) != digest { - return Err(Diagnostic::error( - "catalog_payload_digest_mismatch", - address, - format!( - "catalog blob '{}' does not match its recorded digest; run `cluster refresh` then `cluster apply`, and restart", - display_path(&path) - ), - )); - } - String::from_utf8(bytes).map_err(|err| { - Diagnostic::error( - "catalog_payload_invalid", - address, - format!("catalog blob is not valid UTF-8: {err}"), - ) - }) -} pub fn status_config_dir(config_dir: impl AsRef) -> StatusOutput { let parsed = parse_cluster_config(config_dir.as_ref()); diff --git a/crates/omnigraph-cluster/src/serve.rs b/crates/omnigraph-cluster/src/serve.rs new file mode 100644 index 0000000..0152bc4 --- /dev/null +++ b/crates/omnigraph-cluster/src/serve.rs @@ -0,0 +1,189 @@ +//! Phase-5 serving snapshot: the read-only loader a `--cluster` server +//! boots from (moved verbatim from lib.rs in the modularization). + +use super::*; + +/// One graph in a serving snapshot: its id and on-disk root. +#[derive(Debug, Clone)] +pub struct ServingGraph { + pub graph_id: String, + pub root: PathBuf, +} + +/// One stored query: its graph binding, registry name, and verified source. +#[derive(Debug, Clone)] +pub struct ServingQuery { + pub graph_id: String, + pub name: String, + pub source: String, +} + +/// One policy bundle: its verified catalog blob path and applied bindings +/// (normalized typed refs: `cluster` | `graph.`). +#[derive(Debug, Clone)] +pub struct ServingPolicy { + pub name: String, + pub blob_path: PathBuf, + pub applies_to: Vec, +} + +/// Everything a server needs to boot from the cluster catalog (RFC-005 §D2). +#[derive(Debug, Clone)] +pub struct ServingSnapshot { + pub graphs: Vec, + pub queries: Vec, + pub policies: Vec, +} + +/// Read the applied revision as a serving snapshot — the read-only loader for +/// the Phase-5 server boot. All-or-nothing per RFC-005 §D4: every readiness +/// failure is collected and the whole snapshot refused; no partial serving. +/// Takes no lock: the state file is replaced atomically, so this reads a +/// consistent point-in-time ledger. +pub fn read_serving_snapshot(config_dir: impl AsRef) -> Result> { + let config_dir = config_dir.as_ref().to_path_buf(); + let backend = LocalStateBackend::new(&config_dir); + let mut diagnostics: Vec = Vec::new(); + + // A ledger a sweep is about to rewrite must not start serving. + let sidecars = backend.list_recovery_sidecars(&mut diagnostics); + if !sidecars.is_empty() { + diagnostics.push(Diagnostic::error( + "cluster_recovery_pending", + CLUSTER_RECOVERIES_DIR, + format!( + "{} interrupted operation(s) await recovery; run any state-mutating cluster command (e.g. `cluster apply`) to sweep, then retry", + sidecars.len() + ), + )); + } + + let mut observations = backend.observations(); + let state = match backend.read_state(&mut observations) { + Ok(snapshot) => match snapshot.state { + Some(state) => Some(state), + None => { + diagnostics.push(Diagnostic::error( + "cluster_state_missing", + CLUSTER_STATE_FILE, + "no cluster state ledger; run `cluster import` and `cluster apply` first", + )); + None + } + }, + Err(diagnostic) => { + diagnostics.push(diagnostic); + None + } + }; + let Some(state) = state else { + return Err(diagnostics); + }; + + let mut graphs = Vec::new(); + let mut queries = Vec::new(); + let mut policies = Vec::new(); + for (address, entry) in &state.applied_revision.resources { + match resource_kind(address) { + ResourceKind::Graph(graph_id) => { + graphs.push(ServingGraph { + root: config_dir + .join(CLUSTER_GRAPHS_DIR) + .join(format!("{graph_id}.omni")), + graph_id, + }); + } + ResourceKind::Schema(_) => {} + kind @ ResourceKind::Query { .. } => { + let ResourceKind::Query { graph, name } = &kind else { + unreachable!() + }; + match read_verified_payload(&config_dir, &kind, &entry.digest, address) { + Ok(source) => queries.push(ServingQuery { + graph_id: graph.clone(), + name: name.clone(), + source, + }), + Err(diagnostic) => diagnostics.push(diagnostic), + } + } + kind @ ResourceKind::Policy(_) => { + let ResourceKind::Policy(name) = &kind else { + unreachable!() + }; + let Some(applies_to) = entry.applies_to.clone() else { + diagnostics.push(Diagnostic::error( + "policy_bindings_missing", + address.clone(), + "no applied applies_to bindings recorded (ledger predates binding metadata); re-run `cluster apply` to backfill", + )); + continue; + }; + match read_verified_payload(&config_dir, &kind, &entry.digest, address) { + Ok(_) => policies.push(ServingPolicy { + name: name.clone(), + blob_path: payload_path(&config_dir, &kind, &entry.digest) + .expect("policy kind always has a payload path"), + applies_to, + }), + Err(diagnostic) => diagnostics.push(diagnostic), + } + } + ResourceKind::Unknown => {} + } + } + + if graphs.is_empty() { + diagnostics.push(Diagnostic::error( + "cluster_empty", + CLUSTER_STATE_FILE, + "the applied revision records no graphs; apply a cluster with at least one graph before serving from it", + )); + } + if has_errors(&diagnostics) { + return Err(diagnostics); + } + Ok(ServingSnapshot { + graphs, + queries, + policies, + }) +} + +/// Read a catalog blob and verify it against the recorded digest. +pub(crate) fn read_verified_payload( + config_dir: &Path, + kind: &ResourceKind, + digest: &str, + address: &str, +) -> Result { + let path = payload_path(config_dir, kind, digest) + .expect("query/policy kinds always have a payload path"); + let bytes = fs::read(&path).map_err(|err| { + Diagnostic::error( + "catalog_payload_missing", + address, + format!( + "catalog blob '{}' unreadable ({err}); run `cluster refresh` then `cluster apply`, and restart", + display_path(&path) + ), + ) + })?; + if sha256_hex(&bytes) != digest { + return Err(Diagnostic::error( + "catalog_payload_digest_mismatch", + address, + format!( + "catalog blob '{}' does not match its recorded digest; run `cluster refresh` then `cluster apply`, and restart", + display_path(&path) + ), + )); + } + String::from_utf8(bytes).map_err(|err| { + Diagnostic::error( + "catalog_payload_invalid", + address, + format!("catalog blob is not valid UTF-8: {err}"), + ) + }) +} From 9c3e09e838695fc0f55a04bf1e04ced25ab3ed4b Mon Sep 17 00:00:00 2001 From: aaltshuler Date: Thu, 11 Jun 2026 05:30:55 +0300 Subject: [PATCH 4/7] refactor(cluster): move the recovery sweep to sweep.rs Verbatim move of the sidecar classification (all RFC-004 D3 rows), tombstoning, and approval-consumption helpers. 95 tests green. Co-Authored-By: Claude Fable 5 --- crates/omnigraph-cluster/src/lib.rs | 383 +------------------------ crates/omnigraph-cluster/src/sweep.rs | 386 ++++++++++++++++++++++++++ 2 files changed, 388 insertions(+), 381 deletions(-) create mode 100644 crates/omnigraph-cluster/src/sweep.rs diff --git a/crates/omnigraph-cluster/src/lib.rs b/crates/omnigraph-cluster/src/lib.rs index f26110e..3faacaa 100644 --- a/crates/omnigraph-cluster/src/lib.rs +++ b/crates/omnigraph-cluster/src/lib.rs @@ -20,10 +20,12 @@ use ulid::Ulid; pub mod failpoints; mod serve; +mod sweep; mod store; use store::{LocalStateBackend, StateLockGuard, StateSnapshot}; pub use serve::{ServingGraph, ServingPolicy, ServingQuery, ServingSnapshot, read_serving_snapshot}; use serve::read_verified_payload; +use sweep::{mark_approvals_consumed, record_approval_consumed, sweep_recovery_sidecars, tombstone_graph_subtree, warn_pending_recovery_sidecars}; pub const CLUSTER_CONFIG_FILE: &str = "cluster.yaml"; pub const CLUSTER_GRAPHS_DIR: &str = "graphs"; @@ -2291,387 +2293,6 @@ fn initial_import_state(desired: &DesiredCluster) -> ClusterState { } } -/// Recovery sweep (RFC-004 §D3): runs at the start of every state-mutating -/// cluster command, under the state lock, before the command's own work. -/// Roll-forward-only — the engine's own sidecars make each graph-level -/// operation atomic within the graph, so the cluster never rolls a graph -/// back; it converges the ledger to observable reality or refuses loudly. -/// Mutations ride the calling command's CAS-checked state write; completed -/// sidecars are deleted only after that write lands. -async fn sweep_recovery_sidecars( - backend: &LocalStateBackend, - state: &mut ClusterState, - diagnostics: &mut Vec, -) -> SweepOutcome { - let mut outcome = SweepOutcome::default(); - for (path, sidecar) in backend.list_recovery_sidecars(diagnostics) { - match sidecar.kind { - RecoverySidecarKind::GraphCreate => { - sweep_graph_create_sidecar(path, sidecar, state, diagnostics, &mut outcome).await; - } - RecoverySidecarKind::SchemaApply => { - sweep_schema_apply_sidecar(path, sidecar, state, diagnostics, &mut outcome).await; - } - RecoverySidecarKind::GraphDelete => { - sweep_graph_delete_sidecar(path, sidecar, state, diagnostics, &mut outcome); - } - } - } - outcome -} - -async fn sweep_graph_create_sidecar( - path: PathBuf, - sidecar: RecoverySidecar, - state: &mut ClusterState, - diagnostics: &mut Vec, - outcome: &mut SweepOutcome, -) { - let graph_address = graph_address(&sidecar.graph_id); - let schema_addr = schema_address(&sidecar.graph_id); - let graph_path = PathBuf::from(&sidecar.graph_uri); - - // Row 1: nothing moved — the init never landed. The sidecar is pure - // intent; remove it and let the command's own plan re-propose the create. - if !graph_path.exists() { - let _ = fs::remove_file(&path); - return; - } - - match Omnigraph::open_read_only(&sidecar.graph_uri).await { - Ok(db) => { - let live_digest = sha256_hex(db.schema_source().as_bytes()); - let recorded = state - .applied_revision - .resources - .get(&schema_addr) - .map(|resource| resource.digest.clone()); - if recorded.as_deref() == Some(live_digest.as_str()) { - // Row 2: crash fell between the state CAS and sidecar delete. - outcome.completed_sidecars.push(path); - } else if live_digest == sidecar.desired_schema_digest { - // Row 4: the create completed on the graph; roll the cluster - // state forward to observable reality. - state.applied_revision.resources.insert( - schema_addr.clone(), - StateResource { - digest: live_digest.clone(), - applies_to: None, - }, - ); - let query_digests = state_query_digests_for_graph(state, &sidecar.graph_id); - let composite = - graph_digest(&sidecar.graph_id, Some(&live_digest), Some(&query_digests)); - state - .applied_revision - .resources - .insert(graph_address.clone(), StateResource { digest: composite, applies_to: None }); - set_resource_status_applied(state, &graph_address); - set_resource_status_applied(state, &schema_addr); - state.recovery_records.insert( - sidecar.operation_id.clone(), - json!({ - "kind": "graph_create", - "graph_id": sidecar.graph_id, - "outcome": "rolled_forward", - "recovered_at": now_rfc3339(), - "actor": sidecar.actor, - }), - ); - diagnostics.push(Diagnostic::warning( - "cluster_recovery_rolled_forward", - graph_address.clone(), - "an interrupted graph create had completed on the graph; cluster state was rolled forward to match", - )); - outcome.completed_sidecars.push(path); - } else { - // Row 6: the graph moved to something the sidecar did not - // intend. Refuse to guess; require refresh + operator re-plan. - set_resource_status( - state, - &graph_address, - ResourceLifecycleStatus::Drifted, - "actual_applied_state_pending", - "graph state does not match the interrupted operation; run `cluster refresh` and re-plan", - ); - set_resource_status( - state, - &schema_addr, - ResourceLifecycleStatus::Drifted, - "actual_applied_state_pending", - "graph state does not match the interrupted operation; run `cluster refresh` and re-plan", - ); - diagnostics.push(Diagnostic::warning( - "cluster_recovery_pending", - graph_address.clone(), - "an interrupted graph create left unexpected graph state; graph-moving work is blocked until repaired", - )); - outcome.pending_graphs.insert(sidecar.graph_id.clone()); - } - } - Err(err) => { - // Row 5: partial root (the engine's documented init gap). Never - // auto-delete — reconciler deletes are the same data-loss class - // as human deletes; the operator removes the root explicitly. - set_resource_status( - state, - &graph_address, - ResourceLifecycleStatus::Error, - "graph_create_incomplete", - "graph root exists but cannot be opened; remove the graph root and re-run `cluster apply`", - ); - set_resource_status( - state, - &schema_addr, - ResourceLifecycleStatus::Error, - "graph_create_incomplete", - "graph root exists but cannot be opened; remove the graph root and re-run `cluster apply`", - ); - diagnostics.push(Diagnostic::error( - "graph_create_incomplete", - graph_address.clone(), - format!( - "graph root '{}' exists but cannot be opened ({err}); remove the graph root and re-run `cluster apply`", - sidecar.graph_uri - ), - )); - outcome.pending_graphs.insert(sidecar.graph_id.clone()); - } - } -} - -async fn sweep_schema_apply_sidecar( - path: PathBuf, - sidecar: RecoverySidecar, - state: &mut ClusterState, - diagnostics: &mut Vec, - outcome: &mut SweepOutcome, -) { - let graph_address = graph_address(&sidecar.graph_id); - let schema_addr = schema_address(&sidecar.graph_id); - - // Digest-based classification: robust to unrelated manifest movement; - // the sidecar's version pins stay forensic. - let live_digest = match Omnigraph::open_read_only(&sidecar.graph_uri).await { - Ok(db) => sha256_hex(db.schema_source().as_bytes()), - Err(err) => { - // Cannot verify the interrupted operation — refuse to guess. - diagnostics.push(Diagnostic::warning( - "cluster_recovery_pending", - graph_address.clone(), - format!( - "an interrupted schema apply cannot be verified (graph '{}' did not open: {err}); graph-moving work is blocked until repaired", - sidecar.graph_uri - ), - )); - outcome.pending_graphs.insert(sidecar.graph_id.clone()); - return; - } - }; - - let recorded = state - .applied_revision - .resources - .get(&schema_addr) - .map(|resource| resource.digest.clone()); - if recorded.as_deref() == Some(live_digest.as_str()) { - // Ledger consistent with the live graph (the apply never landed, or - // landed and was recorded): the sidecar is stale intent — retire it. - outcome.completed_sidecars.push(path); - } else if live_digest == sidecar.desired_schema_digest { - // RFC-004 §D3 row 3: the schema apply completed on the graph; roll - // the cluster state forward to observable reality. - state.applied_revision.resources.insert( - schema_addr.clone(), - StateResource { - digest: live_digest.clone(), - applies_to: None, - }, - ); - let query_digests = state_query_digests_for_graph(state, &sidecar.graph_id); - let composite = graph_digest(&sidecar.graph_id, Some(&live_digest), Some(&query_digests)); - state - .applied_revision - .resources - .insert(graph_address.clone(), StateResource { digest: composite, applies_to: None }); - set_resource_status_applied(state, &graph_address); - set_resource_status_applied(state, &schema_addr); - state.recovery_records.insert( - sidecar.operation_id.clone(), - json!({ - "kind": "schema_apply", - "graph_id": sidecar.graph_id, - "outcome": "rolled_forward", - "recovered_at": now_rfc3339(), - "actor": sidecar.actor, - }), - ); - diagnostics.push(Diagnostic::warning( - "cluster_recovery_rolled_forward", - graph_address.clone(), - "an interrupted schema apply had completed on the graph; cluster state was rolled forward to match", - )); - outcome.completed_sidecars.push(path); - } else { - // Row 6: live schema is neither the recorded nor the desired digest. - set_resource_status( - state, - &graph_address, - ResourceLifecycleStatus::Drifted, - "actual_applied_state_pending", - "graph state does not match the interrupted operation; run `cluster refresh` and re-plan", - ); - set_resource_status( - state, - &schema_addr, - ResourceLifecycleStatus::Drifted, - "actual_applied_state_pending", - "graph state does not match the interrupted operation; run `cluster refresh` and re-plan", - ); - diagnostics.push(Diagnostic::warning( - "cluster_recovery_pending", - graph_address.clone(), - "an interrupted schema apply left unexpected graph state; graph-moving work is blocked until repaired", - )); - outcome.pending_graphs.insert(sidecar.graph_id.clone()); - } -} - -fn sweep_graph_delete_sidecar( - path: PathBuf, - sidecar: RecoverySidecar, - state: &mut ClusterState, - diagnostics: &mut Vec, - outcome: &mut SweepOutcome, -) { - let graph_address = graph_address(&sidecar.graph_id); - let root = PathBuf::from(&sidecar.graph_uri); - - if root.exists() { - // Row 8: the delete never completed. Prefix removal is idempotent and - // works on partial roots, so the repair is simply the re-proposed, - // still-approved delete on a later run — retire the stale intent. - diagnostics.push(Diagnostic::warning( - "graph_delete_incomplete", - graph_address, - "a previous graph delete did not complete; it will be re-proposed by plan and can be retried under its approval", - )); - outcome.completed_sidecars.push(path); - return; - } - - if !state.applied_revision.resources.contains_key(&graph_address) { - // Row 7: already tombstoned (or never recorded); crash fell between - // the state CAS and sidecar delete. - outcome.completed_sidecars.push(path); - return; - } - - // Row 7b: the root is gone, the ledger is stale — roll forward the - // tombstone, consume the approval the sidecar carries, audit. - tombstone_graph_subtree(state, &sidecar.graph_id, sidecar.approval_id.as_deref(), sidecar.actor.as_deref()); - state.recovery_records.insert( - sidecar.operation_id.clone(), - json!({ - "kind": "graph_delete", - "graph_id": sidecar.graph_id, - "outcome": "rolled_forward", - "recovered_at": now_rfc3339(), - "actor": sidecar.actor, - }), - ); - if let Some(approval_id) = &sidecar.approval_id { - record_approval_consumed(state, approval_id, &sidecar.operation_id); - outcome.consumed_approvals.push(approval_id.clone()); - } - diagnostics.push(Diagnostic::warning( - "cluster_recovery_rolled_forward", - graph_address, - "an interrupted graph delete had completed on disk; cluster state was rolled forward to match", - )); - outcome.completed_sidecars.push(path); -} - -/// Remove a graph's subtree (graph, schema, queries) from the ledger and -/// leave a tombstone observation. Idempotent. -fn tombstone_graph_subtree( - state: &mut ClusterState, - graph_id: &str, - approval_id: Option<&str>, - actor: Option<&str>, -) { - let graph_addr = graph_address(graph_id); - let schema_addr = schema_address(graph_id); - let query_prefix = format!("query.{graph_id}."); - state.applied_revision.resources.remove(&graph_addr); - state.applied_revision.resources.remove(&schema_addr); - state - .applied_revision - .resources - .retain(|address, _| !address.starts_with(&query_prefix)); - state.resource_statuses.remove(&graph_addr); - state.resource_statuses.remove(&schema_addr); - state - .resource_statuses - .retain(|address, _| !address.starts_with(&query_prefix)); - state.observations.insert( - graph_addr, - json!({ - "kind": "tombstone", - "deleted_at": now_rfc3339(), - "approval_id": approval_id, - "actor": actor, - }), - ); -} - -/// Record approval consumption in the state ledger. The artifact FILE is -/// rewritten with consumed_at only after the state write lands, so a failed -/// CAS leaves the approval valid for the retry. -fn record_approval_consumed(state: &mut ClusterState, approval_id: &str, operation_id: &str) { - state.approval_records.insert( - approval_id.to_string(), - json!({ - "consumed_at": now_rfc3339(), - "consumed_by_operation": operation_id, - }), - ); -} - -/// Mark approval artifact files consumed on disk (post-CAS). -fn mark_approvals_consumed(backend: &LocalStateBackend, approval_ids: &[String]) { - if approval_ids.is_empty() { - return; - } - let mut sink = Vec::new(); - for (_, mut artifact) in backend.list_approval_artifacts(&mut sink) { - if approval_ids.contains(&artifact.approval_id) && artifact.consumed_at.is_none() { - artifact.consumed_at = Some(now_rfc3339()); - let _ = backend.write_approval_artifact(&artifact); - } - } -} - -/// Read-only commands report pending sidecars without acting on them. -fn warn_pending_recovery_sidecars(config_dir: &Path, diagnostics: &mut Vec) { - let recoveries_dir = config_dir.join(CLUSTER_RECOVERIES_DIR); - let Ok(entries) = fs::read_dir(&recoveries_dir) else { - return; - }; - let mut names: Vec = entries - .flatten() - .filter(|entry| entry.path().extension().is_some_and(|ext| ext == "json")) - .map(|entry| entry.file_name().to_string_lossy().into_owned()) - .collect(); - names.sort(); - for name in names { - diagnostics.push(Diagnostic::warning( - "cluster_recovery_pending", - format!("{CLUSTER_RECOVERIES_DIR}/{name}"), - "a recovery sidecar from an interrupted apply is pending; the next state-mutating command will classify it", - )); - } -} async fn observe_declared_graphs(desired: &DesiredCluster, state: &mut ClusterState) -> usize { let mut graph_error_count = 0; diff --git a/crates/omnigraph-cluster/src/sweep.rs b/crates/omnigraph-cluster/src/sweep.rs new file mode 100644 index 0000000..77ad8c5 --- /dev/null +++ b/crates/omnigraph-cluster/src/sweep.rs @@ -0,0 +1,386 @@ +//! The recovery sweep: RFC-004's roll-forward-only sidecar +//! classification (moved verbatim from lib.rs in the modularization). + +use super::*; + +/// Recovery sweep (RFC-004 §D3): runs at the start of every state-mutating +/// cluster command, under the state lock, before the command's own work. +/// Roll-forward-only — the engine's own sidecars make each graph-level +/// operation atomic within the graph, so the cluster never rolls a graph +/// back; it converges the ledger to observable reality or refuses loudly. +/// Mutations ride the calling command's CAS-checked state write; completed +/// sidecars are deleted only after that write lands. +pub(crate) async fn sweep_recovery_sidecars( + backend: &LocalStateBackend, + state: &mut ClusterState, + diagnostics: &mut Vec, +) -> SweepOutcome { + let mut outcome = SweepOutcome::default(); + for (path, sidecar) in backend.list_recovery_sidecars(diagnostics) { + match sidecar.kind { + RecoverySidecarKind::GraphCreate => { + sweep_graph_create_sidecar(path, sidecar, state, diagnostics, &mut outcome).await; + } + RecoverySidecarKind::SchemaApply => { + sweep_schema_apply_sidecar(path, sidecar, state, diagnostics, &mut outcome).await; + } + RecoverySidecarKind::GraphDelete => { + sweep_graph_delete_sidecar(path, sidecar, state, diagnostics, &mut outcome); + } + } + } + outcome +} + +pub(crate) async fn sweep_graph_create_sidecar( + path: PathBuf, + sidecar: RecoverySidecar, + state: &mut ClusterState, + diagnostics: &mut Vec, + outcome: &mut SweepOutcome, +) { + let graph_address = graph_address(&sidecar.graph_id); + let schema_addr = schema_address(&sidecar.graph_id); + let graph_path = PathBuf::from(&sidecar.graph_uri); + + // Row 1: nothing moved — the init never landed. The sidecar is pure + // intent; remove it and let the command's own plan re-propose the create. + if !graph_path.exists() { + let _ = fs::remove_file(&path); + return; + } + + match Omnigraph::open_read_only(&sidecar.graph_uri).await { + Ok(db) => { + let live_digest = sha256_hex(db.schema_source().as_bytes()); + let recorded = state + .applied_revision + .resources + .get(&schema_addr) + .map(|resource| resource.digest.clone()); + if recorded.as_deref() == Some(live_digest.as_str()) { + // Row 2: crash fell between the state CAS and sidecar delete. + outcome.completed_sidecars.push(path); + } else if live_digest == sidecar.desired_schema_digest { + // Row 4: the create completed on the graph; roll the cluster + // state forward to observable reality. + state.applied_revision.resources.insert( + schema_addr.clone(), + StateResource { + digest: live_digest.clone(), + applies_to: None, + }, + ); + let query_digests = state_query_digests_for_graph(state, &sidecar.graph_id); + let composite = + graph_digest(&sidecar.graph_id, Some(&live_digest), Some(&query_digests)); + state + .applied_revision + .resources + .insert(graph_address.clone(), StateResource { digest: composite, applies_to: None }); + set_resource_status_applied(state, &graph_address); + set_resource_status_applied(state, &schema_addr); + state.recovery_records.insert( + sidecar.operation_id.clone(), + json!({ + "kind": "graph_create", + "graph_id": sidecar.graph_id, + "outcome": "rolled_forward", + "recovered_at": now_rfc3339(), + "actor": sidecar.actor, + }), + ); + diagnostics.push(Diagnostic::warning( + "cluster_recovery_rolled_forward", + graph_address.clone(), + "an interrupted graph create had completed on the graph; cluster state was rolled forward to match", + )); + outcome.completed_sidecars.push(path); + } else { + // Row 6: the graph moved to something the sidecar did not + // intend. Refuse to guess; require refresh + operator re-plan. + set_resource_status( + state, + &graph_address, + ResourceLifecycleStatus::Drifted, + "actual_applied_state_pending", + "graph state does not match the interrupted operation; run `cluster refresh` and re-plan", + ); + set_resource_status( + state, + &schema_addr, + ResourceLifecycleStatus::Drifted, + "actual_applied_state_pending", + "graph state does not match the interrupted operation; run `cluster refresh` and re-plan", + ); + diagnostics.push(Diagnostic::warning( + "cluster_recovery_pending", + graph_address.clone(), + "an interrupted graph create left unexpected graph state; graph-moving work is blocked until repaired", + )); + outcome.pending_graphs.insert(sidecar.graph_id.clone()); + } + } + Err(err) => { + // Row 5: partial root (the engine's documented init gap). Never + // auto-delete — reconciler deletes are the same data-loss class + // as human deletes; the operator removes the root explicitly. + set_resource_status( + state, + &graph_address, + ResourceLifecycleStatus::Error, + "graph_create_incomplete", + "graph root exists but cannot be opened; remove the graph root and re-run `cluster apply`", + ); + set_resource_status( + state, + &schema_addr, + ResourceLifecycleStatus::Error, + "graph_create_incomplete", + "graph root exists but cannot be opened; remove the graph root and re-run `cluster apply`", + ); + diagnostics.push(Diagnostic::error( + "graph_create_incomplete", + graph_address.clone(), + format!( + "graph root '{}' exists but cannot be opened ({err}); remove the graph root and re-run `cluster apply`", + sidecar.graph_uri + ), + )); + outcome.pending_graphs.insert(sidecar.graph_id.clone()); + } + } +} + +pub(crate) async fn sweep_schema_apply_sidecar( + path: PathBuf, + sidecar: RecoverySidecar, + state: &mut ClusterState, + diagnostics: &mut Vec, + outcome: &mut SweepOutcome, +) { + let graph_address = graph_address(&sidecar.graph_id); + let schema_addr = schema_address(&sidecar.graph_id); + + // Digest-based classification: robust to unrelated manifest movement; + // the sidecar's version pins stay forensic. + let live_digest = match Omnigraph::open_read_only(&sidecar.graph_uri).await { + Ok(db) => sha256_hex(db.schema_source().as_bytes()), + Err(err) => { + // Cannot verify the interrupted operation — refuse to guess. + diagnostics.push(Diagnostic::warning( + "cluster_recovery_pending", + graph_address.clone(), + format!( + "an interrupted schema apply cannot be verified (graph '{}' did not open: {err}); graph-moving work is blocked until repaired", + sidecar.graph_uri + ), + )); + outcome.pending_graphs.insert(sidecar.graph_id.clone()); + return; + } + }; + + let recorded = state + .applied_revision + .resources + .get(&schema_addr) + .map(|resource| resource.digest.clone()); + if recorded.as_deref() == Some(live_digest.as_str()) { + // Ledger consistent with the live graph (the apply never landed, or + // landed and was recorded): the sidecar is stale intent — retire it. + outcome.completed_sidecars.push(path); + } else if live_digest == sidecar.desired_schema_digest { + // RFC-004 §D3 row 3: the schema apply completed on the graph; roll + // the cluster state forward to observable reality. + state.applied_revision.resources.insert( + schema_addr.clone(), + StateResource { + digest: live_digest.clone(), + applies_to: None, + }, + ); + let query_digests = state_query_digests_for_graph(state, &sidecar.graph_id); + let composite = graph_digest(&sidecar.graph_id, Some(&live_digest), Some(&query_digests)); + state + .applied_revision + .resources + .insert(graph_address.clone(), StateResource { digest: composite, applies_to: None }); + set_resource_status_applied(state, &graph_address); + set_resource_status_applied(state, &schema_addr); + state.recovery_records.insert( + sidecar.operation_id.clone(), + json!({ + "kind": "schema_apply", + "graph_id": sidecar.graph_id, + "outcome": "rolled_forward", + "recovered_at": now_rfc3339(), + "actor": sidecar.actor, + }), + ); + diagnostics.push(Diagnostic::warning( + "cluster_recovery_rolled_forward", + graph_address.clone(), + "an interrupted schema apply had completed on the graph; cluster state was rolled forward to match", + )); + outcome.completed_sidecars.push(path); + } else { + // Row 6: live schema is neither the recorded nor the desired digest. + set_resource_status( + state, + &graph_address, + ResourceLifecycleStatus::Drifted, + "actual_applied_state_pending", + "graph state does not match the interrupted operation; run `cluster refresh` and re-plan", + ); + set_resource_status( + state, + &schema_addr, + ResourceLifecycleStatus::Drifted, + "actual_applied_state_pending", + "graph state does not match the interrupted operation; run `cluster refresh` and re-plan", + ); + diagnostics.push(Diagnostic::warning( + "cluster_recovery_pending", + graph_address.clone(), + "an interrupted schema apply left unexpected graph state; graph-moving work is blocked until repaired", + )); + outcome.pending_graphs.insert(sidecar.graph_id.clone()); + } +} + +pub(crate) fn sweep_graph_delete_sidecar( + path: PathBuf, + sidecar: RecoverySidecar, + state: &mut ClusterState, + diagnostics: &mut Vec, + outcome: &mut SweepOutcome, +) { + let graph_address = graph_address(&sidecar.graph_id); + let root = PathBuf::from(&sidecar.graph_uri); + + if root.exists() { + // Row 8: the delete never completed. Prefix removal is idempotent and + // works on partial roots, so the repair is simply the re-proposed, + // still-approved delete on a later run — retire the stale intent. + diagnostics.push(Diagnostic::warning( + "graph_delete_incomplete", + graph_address, + "a previous graph delete did not complete; it will be re-proposed by plan and can be retried under its approval", + )); + outcome.completed_sidecars.push(path); + return; + } + + if !state.applied_revision.resources.contains_key(&graph_address) { + // Row 7: already tombstoned (or never recorded); crash fell between + // the state CAS and sidecar delete. + outcome.completed_sidecars.push(path); + return; + } + + // Row 7b: the root is gone, the ledger is stale — roll forward the + // tombstone, consume the approval the sidecar carries, audit. + tombstone_graph_subtree(state, &sidecar.graph_id, sidecar.approval_id.as_deref(), sidecar.actor.as_deref()); + state.recovery_records.insert( + sidecar.operation_id.clone(), + json!({ + "kind": "graph_delete", + "graph_id": sidecar.graph_id, + "outcome": "rolled_forward", + "recovered_at": now_rfc3339(), + "actor": sidecar.actor, + }), + ); + if let Some(approval_id) = &sidecar.approval_id { + record_approval_consumed(state, approval_id, &sidecar.operation_id); + outcome.consumed_approvals.push(approval_id.clone()); + } + diagnostics.push(Diagnostic::warning( + "cluster_recovery_rolled_forward", + graph_address, + "an interrupted graph delete had completed on disk; cluster state was rolled forward to match", + )); + outcome.completed_sidecars.push(path); +} + +/// Remove a graph's subtree (graph, schema, queries) from the ledger and +/// leave a tombstone observation. Idempotent. +pub(crate) fn tombstone_graph_subtree( + state: &mut ClusterState, + graph_id: &str, + approval_id: Option<&str>, + actor: Option<&str>, +) { + let graph_addr = graph_address(graph_id); + let schema_addr = schema_address(graph_id); + let query_prefix = format!("query.{graph_id}."); + state.applied_revision.resources.remove(&graph_addr); + state.applied_revision.resources.remove(&schema_addr); + state + .applied_revision + .resources + .retain(|address, _| !address.starts_with(&query_prefix)); + state.resource_statuses.remove(&graph_addr); + state.resource_statuses.remove(&schema_addr); + state + .resource_statuses + .retain(|address, _| !address.starts_with(&query_prefix)); + state.observations.insert( + graph_addr, + json!({ + "kind": "tombstone", + "deleted_at": now_rfc3339(), + "approval_id": approval_id, + "actor": actor, + }), + ); +} + +/// Record approval consumption in the state ledger. The artifact FILE is +/// rewritten with consumed_at only after the state write lands, so a failed +/// CAS leaves the approval valid for the retry. +pub(crate) fn record_approval_consumed(state: &mut ClusterState, approval_id: &str, operation_id: &str) { + state.approval_records.insert( + approval_id.to_string(), + json!({ + "consumed_at": now_rfc3339(), + "consumed_by_operation": operation_id, + }), + ); +} + +/// Mark approval artifact files consumed on disk (post-CAS). +pub(crate) fn mark_approvals_consumed(backend: &LocalStateBackend, approval_ids: &[String]) { + if approval_ids.is_empty() { + return; + } + let mut sink = Vec::new(); + for (_, mut artifact) in backend.list_approval_artifacts(&mut sink) { + if approval_ids.contains(&artifact.approval_id) && artifact.consumed_at.is_none() { + artifact.consumed_at = Some(now_rfc3339()); + let _ = backend.write_approval_artifact(&artifact); + } + } +} + +/// Read-only commands report pending sidecars without acting on them. +pub(crate) fn warn_pending_recovery_sidecars(config_dir: &Path, diagnostics: &mut Vec) { + let recoveries_dir = config_dir.join(CLUSTER_RECOVERIES_DIR); + let Ok(entries) = fs::read_dir(&recoveries_dir) else { + return; + }; + let mut names: Vec = entries + .flatten() + .filter(|entry| entry.path().extension().is_some_and(|ext| ext == "json")) + .map(|entry| entry.file_name().to_string_lossy().into_owned()) + .collect(); + names.sort(); + for name in names { + diagnostics.push(Diagnostic::warning( + "cluster_recovery_pending", + format!("{CLUSTER_RECOVERIES_DIR}/{name}"), + "a recovery sidecar from an interrupted apply is pending; the next state-mutating command will classify it", + )); + } +} From dd17c0c50f1ebd8a350373b5df4c436d5d1fb578 Mon Sep 17 00:00:00 2001 From: aaltshuler Date: Thu, 11 Jun 2026 05:33:13 +0300 Subject: [PATCH 5/7] refactor(cluster): move diffing and classification to diff.rs Verbatim move of diff_resources, binding-change diffing, blast radius, approval gating, ResourceKind, classify_changes, and demotion. 95 tests green. Co-Authored-By: Claude Fable 5 --- crates/omnigraph-cluster/src/diff.rs | 420 +++++++++++++++++++++++++++ crates/omnigraph-cluster/src/lib.rs | 414 +------------------------- 2 files changed, 422 insertions(+), 412 deletions(-) create mode 100644 crates/omnigraph-cluster/src/diff.rs diff --git a/crates/omnigraph-cluster/src/diff.rs b/crates/omnigraph-cluster/src/diff.rs new file mode 100644 index 0000000..e75db4d --- /dev/null +++ b/crates/omnigraph-cluster/src/diff.rs @@ -0,0 +1,420 @@ +//! Plan/apply classification: resource diffing, dispositions, approval +//! gating, demotion (moved verbatim from lib.rs in the modularization). + +use super::*; + +pub(crate) fn diff_resources( + prior: &BTreeMap, + desired: &BTreeMap, +) -> Vec { + let mut changes = Vec::new(); + for (address, after) in desired { + match prior.get(address) { + None => changes.push(PlanChange { + resource: address.clone(), + operation: PlanOperation::Create, + before_digest: None, + after_digest: Some(after.clone()), + disposition: None, + reason: None, + binding_change: false, + migration: None, + }), + Some(before) if before != after => changes.push(PlanChange { + resource: address.clone(), + operation: PlanOperation::Update, + before_digest: Some(before.clone()), + after_digest: Some(after.clone()), + disposition: None, + reason: None, + binding_change: false, + migration: None, + }), + Some(_) => {} + } + } + for (address, before) in prior { + if !desired.contains_key(address) { + changes.push(PlanChange { + resource: address.clone(), + operation: PlanOperation::Delete, + before_digest: Some(before.clone()), + after_digest: None, + disposition: None, + reason: None, + binding_change: false, + migration: None, + }); + } + } + changes.sort_by(|a, b| a.resource.cmp(&b.resource)); + changes +} + +/// Binding-only policy changes: the file digest is unchanged (so +/// `diff_resources` saw nothing) but the applied `applies_to` differs from +/// the desired bindings — including the pre-5A case where the state entry +/// has no bindings recorded yet. These are first-class plan changes: without +/// this pass a binding edit would silently rot or silently converge. +pub(crate) fn append_policy_binding_changes( + changes: &mut Vec, + prior_state: Option<&ClusterState>, + desired: &DesiredCluster, +) { + let Some(state) = prior_state else { + return; // no state: everything is already a Create carrying bindings + }; + for (address, desired_bindings) in &desired.policy_bindings { + if changes.iter().any(|change| &change.resource == address) { + continue; // content change already covers it + } + let Some(entry) = state.applied_revision.resources.get(address) else { + continue; // not applied yet: the Create covers it + }; + if entry.applies_to.as_ref() == Some(desired_bindings) { + continue; + } + changes.push(PlanChange { + resource: address.clone(), + operation: PlanOperation::Update, + before_digest: Some(entry.digest.clone()), + after_digest: Some(entry.digest.clone()), + disposition: None, + reason: None, + binding_change: true, + migration: None, + }); + } + changes.sort_by(|a, b| a.resource.cmp(&b.resource)); +} + +pub(crate) fn compute_blast_radius( + changes: &[PlanChange], + dependencies: &[Dependency], +) -> Vec { + changes + .iter() + .filter_map(|change| { + let affected: Vec<_> = dependencies + .iter() + .filter_map(|dep| (dep.to == change.resource).then_some(dep.from.clone())) + .collect(); + (!affected.is_empty()).then(|| BlastRadius { + resource: change.resource.clone(), + affected, + }) + }) + .collect() +} + +pub(crate) fn compute_approvals( + changes: &[PlanChange], + approved: &BTreeSet, +) -> Vec { + // One gate per subtree: the graph. delete carries its schema and + // queries, so a schema delete whose graph is also deleted is not listed. + let graph_deletes: BTreeSet = changes + .iter() + .filter(|change| change.operation == PlanOperation::Delete) + .filter_map(|change| change.resource.strip_prefix("graph.").map(str::to_string)) + .collect(); + changes + .iter() + .filter_map(|change| { + if change.operation != PlanOperation::Delete { + return None; + } + let gated = match resource_kind(&change.resource) { + ResourceKind::Graph(_) => true, + ResourceKind::Schema(graph) => !graph_deletes.contains(&graph), + _ => false, + }; + gated.then(|| ApprovalRequirement { + resource: change.resource.clone(), + reason: "delete may remove deployed graph or schema definition".to_string(), + satisfied: approved.contains(&change.resource), + }) + }) + .collect() +} + +/// Resources with a valid (digest-matching, unconsumed) pending approval. +/// Near-misses — an artifact for the same resource whose bound digests no +/// longer match — warn as `approval_stale` and never authorize anything. +pub(crate) fn approved_resources( + artifacts: &[(PathBuf, ApprovalArtifact)], + changes: &[PlanChange], + config_digest: &str, + diagnostics: &mut Vec, +) -> BTreeSet { + let mut approved = BTreeSet::new(); + for change in changes { + let candidates: Vec<&ApprovalArtifact> = artifacts + .iter() + .map(|(_, artifact)| artifact) + .filter(|artifact| artifact.consumed_at.is_none() && artifact.resource == change.resource) + .collect(); + if candidates.is_empty() { + continue; + } + let matched = candidates.iter().any(|artifact| { + artifact.bound_config_digest == config_digest + && artifact.bound_before_digest == change.before_digest + && artifact.bound_after_digest == change.after_digest + }); + if matched { + approved.insert(change.resource.clone()); + } else { + diagnostics.push(Diagnostic::warning( + "approval_stale", + change.resource.clone(), + "an approval artifact exists but its bound digests no longer match the plan; re-run `cluster approve`", + )); + } + } + approved +} + +#[derive(Debug, PartialEq, Eq)] +pub(crate) enum ResourceKind { + Graph(String), + Schema(String), + Query { graph: String, name: String }, + Policy(String), + Unknown, +} + +pub(crate) fn resource_kind(address: &str) -> ResourceKind { + if let Some(graph) = address.strip_prefix("graph.") { + ResourceKind::Graph(graph.to_string()) + } else if let Some(graph) = address.strip_prefix("schema.") { + ResourceKind::Schema(graph.to_string()) + } else if let Some(rest) = address.strip_prefix("query.") { + match rest.split_once('.') { + Some((graph, name)) => ResourceKind::Query { + graph: graph.to_string(), + name: name.to_string(), + }, + None => ResourceKind::Unknown, + } + } else if let Some(name) = address.strip_prefix("policy.") { + ResourceKind::Policy(name.to_string()) + } else { + ResourceKind::Unknown + } +} + +/// Classify every planned change with the disposition config-only apply gives +/// it. Stage 3A executes only query/policy catalog writes; graph/schema +/// movement is a later phase, and `graph.` composite updates whose schema +/// component is unchanged converge automatically once query digests land. +pub(crate) fn classify_changes( + changes: &mut [PlanChange], + dependencies: &[Dependency], + pending_recovery: &BTreeSet, + approved: &BTreeSet, +) { + let mut schema_creates = BTreeSet::new(); + let mut schema_pending = BTreeSet::new(); + let mut graph_creates = BTreeSet::new(); + let mut graph_deletes = BTreeSet::new(); + for change in changes.iter() { + match resource_kind(&change.resource) { + ResourceKind::Schema(graph) => match change.operation { + PlanOperation::Create => { + schema_creates.insert(graph); + } + // Schema updates execute in-run before catalog writes (4B) + // and no longer block dependents; deletes (4C) still do. + PlanOperation::Update => {} + PlanOperation::Delete => { + schema_pending.insert(graph); + } + }, + ResourceKind::Graph(graph) => match change.operation { + PlanOperation::Create => { + graph_creates.insert(graph); + } + PlanOperation::Delete => { + graph_deletes.insert(graph); + } + PlanOperation::Update => {} + }, + _ => {} + } + } + // A schema Create is satisfied by its paired graph create (the init + // carries the schema); a standalone schema Create stays pending. + for graph in &schema_creates { + if !graph_creates.contains(graph) { + schema_pending.insert(graph.clone()); + } + } + // Subtree deletes ride the approved graph delete. + let rides_approved_delete = |graph: &str| { + graph_deletes.contains(graph) + && approved.contains(&graph_address(graph)) + && !pending_recovery.contains(graph) + }; + + for change in changes.iter_mut() { + let (disposition, reason) = match resource_kind(&change.resource) { + ResourceKind::Schema(graph) => match change.operation { + PlanOperation::Create + if graph_creates.contains(&graph) + && !pending_recovery.contains(&graph) => + { + // Applied with the graph create — the init carries it. + (ApplyDisposition::Applied, None) + } + PlanOperation::Update if !pending_recovery.contains(&graph) => { + // Stage 4B: schema updates execute via the engine's + // schema apply (soft drops only; allow_data_loss is 4C). + (ApplyDisposition::Applied, None) + } + PlanOperation::Create | PlanOperation::Update => { + (ApplyDisposition::Blocked, Some("cluster_recovery_pending")) + } + PlanOperation::Delete if graph_deletes.contains(&graph) => { + if rides_approved_delete(&graph) { + (ApplyDisposition::Applied, None) + } else if pending_recovery.contains(&graph) { + (ApplyDisposition::Blocked, Some("cluster_recovery_pending")) + } else { + (ApplyDisposition::Blocked, Some("approval_required")) + } + } + _ => (ApplyDisposition::Deferred, Some("apply_unsupported_kind")), + }, + ResourceKind::Graph(graph) => match change.operation { + PlanOperation::Create => { + if pending_recovery.contains(&graph) { + (ApplyDisposition::Blocked, Some("cluster_recovery_pending")) + } else { + (ApplyDisposition::Applied, None) + } + } + PlanOperation::Update if !schema_pending.contains(&graph) => { + (ApplyDisposition::Derived, None) + } + // Stage 4C: an approved graph delete executes (the + // irreversible tier — gated by a digest-bound artifact). + PlanOperation::Delete => { + if pending_recovery.contains(&graph) { + (ApplyDisposition::Blocked, Some("cluster_recovery_pending")) + } else if rides_approved_delete(&graph) { + (ApplyDisposition::Applied, None) + } else { + (ApplyDisposition::Blocked, Some("approval_required")) + } + } + _ => (ApplyDisposition::Deferred, Some("apply_unsupported_kind")), + }, + ResourceKind::Query { graph, .. } => match change.operation { + PlanOperation::Delete => { + if rides_approved_delete(&graph) { + // Tombstoned with the approved graph delete. + (ApplyDisposition::Applied, None) + } else if graph_deletes.contains(&graph) { + (ApplyDisposition::Blocked, Some("approval_required")) + } else { + (ApplyDisposition::Applied, None) + } + } + PlanOperation::Create | PlanOperation::Update => { + if pending_recovery.contains(&graph) { + (ApplyDisposition::Blocked, Some("cluster_recovery_pending")) + } else if schema_pending.contains(&graph) { + ( + ApplyDisposition::Blocked, + Some("dependency_not_applied"), + ) + } else { + // A graph create in the same plan no longer blocks: + // creates execute first in the same apply run. + (ApplyDisposition::Applied, None) + } + } + }, + ResourceKind::Policy(_) => match change.operation { + PlanOperation::Delete => (ApplyDisposition::Applied, None), + PlanOperation::Create | PlanOperation::Update => { + let blocked_pending = dependencies.iter().any(|dep| { + dep.from == change.resource + && dep + .to + .strip_prefix("graph.") + .is_some_and(|graph| pending_recovery.contains(graph)) + }); + if blocked_pending { + (ApplyDisposition::Blocked, Some("cluster_recovery_pending")) + } else { + (ApplyDisposition::Applied, None) + } + } + }, + ResourceKind::Unknown => { + (ApplyDisposition::Deferred, Some("apply_unsupported_kind")) + } + }; + change.disposition = Some(disposition); + change.reason = reason.map(str::to_string); + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum FailedGraphOrigin { + GraphCreate, + SchemaApply, + GraphDelete, +} + +/// After a graph-moving operation fails mid-run, every change that depended +/// on that graph flips from Applied to Blocked so the output and the +/// persisted statuses tell the truth about what this run actually executed. +/// The originating change carries the failure code; dependents carry +/// `dependency_not_applied`. +pub(crate) fn demote_dependents_of_failed_graphs( + changes: &mut [PlanChange], + failed: &BTreeMap, + dependencies: &[Dependency], +) { + for change in changes.iter_mut() { + if change.disposition != Some(ApplyDisposition::Applied) { + continue; + } + let demote_reason = match resource_kind(&change.resource) { + ResourceKind::Graph(graph) => match failed.get(&graph) { + Some(FailedGraphOrigin::GraphCreate) => Some("graph_create_failed"), + Some(FailedGraphOrigin::GraphDelete) => Some("graph_delete_failed"), + Some(FailedGraphOrigin::SchemaApply) => Some("dependency_not_applied"), + None => None, + }, + ResourceKind::Schema(graph) => match failed.get(&graph) { + Some(FailedGraphOrigin::SchemaApply) => Some("schema_apply_failed"), + Some(FailedGraphOrigin::GraphCreate) | Some(FailedGraphOrigin::GraphDelete) => { + Some("dependency_not_applied") + } + None => None, + }, + ResourceKind::Query { graph, .. } if failed.contains_key(&graph) => { + Some("dependency_not_applied") + } + ResourceKind::Policy(_) => { + let blocked = dependencies.iter().any(|dep| { + dep.from == change.resource + && dep + .to + .strip_prefix("graph.") + .is_some_and(|graph| failed.contains_key(graph)) + }); + blocked.then_some("dependency_not_applied") + } + _ => None, + }; + if let Some(reason) = demote_reason { + change.disposition = Some(ApplyDisposition::Blocked); + change.reason = Some(reason.to_string()); + } + } +} diff --git a/crates/omnigraph-cluster/src/lib.rs b/crates/omnigraph-cluster/src/lib.rs index 3faacaa..c54d245 100644 --- a/crates/omnigraph-cluster/src/lib.rs +++ b/crates/omnigraph-cluster/src/lib.rs @@ -19,12 +19,14 @@ use ulid::Ulid; pub mod failpoints; +mod diff; mod serve; mod sweep; mod store; use store::{LocalStateBackend, StateLockGuard, StateSnapshot}; pub use serve::{ServingGraph, ServingPolicy, ServingQuery, ServingSnapshot, read_serving_snapshot}; use serve::read_verified_payload; +use diff::{FailedGraphOrigin, ResourceKind, append_policy_binding_changes, approved_resources, classify_changes, compute_approvals, compute_blast_radius, demote_dependents_of_failed_graphs, diff_resources, resource_kind}; use sweep::{mark_approvals_consumed, record_approval_consumed, sweep_recovery_sidecars, tombstone_graph_subtree, warn_pending_recovery_sidecars}; pub const CLUSTER_CONFIG_FILE: &str = "cluster.yaml"; @@ -2856,418 +2858,6 @@ fn validate_query_source( } } -fn diff_resources( - prior: &BTreeMap, - desired: &BTreeMap, -) -> Vec { - let mut changes = Vec::new(); - for (address, after) in desired { - match prior.get(address) { - None => changes.push(PlanChange { - resource: address.clone(), - operation: PlanOperation::Create, - before_digest: None, - after_digest: Some(after.clone()), - disposition: None, - reason: None, - binding_change: false, - migration: None, - }), - Some(before) if before != after => changes.push(PlanChange { - resource: address.clone(), - operation: PlanOperation::Update, - before_digest: Some(before.clone()), - after_digest: Some(after.clone()), - disposition: None, - reason: None, - binding_change: false, - migration: None, - }), - Some(_) => {} - } - } - for (address, before) in prior { - if !desired.contains_key(address) { - changes.push(PlanChange { - resource: address.clone(), - operation: PlanOperation::Delete, - before_digest: Some(before.clone()), - after_digest: None, - disposition: None, - reason: None, - binding_change: false, - migration: None, - }); - } - } - changes.sort_by(|a, b| a.resource.cmp(&b.resource)); - changes -} - -/// Binding-only policy changes: the file digest is unchanged (so -/// `diff_resources` saw nothing) but the applied `applies_to` differs from -/// the desired bindings — including the pre-5A case where the state entry -/// has no bindings recorded yet. These are first-class plan changes: without -/// this pass a binding edit would silently rot or silently converge. -fn append_policy_binding_changes( - changes: &mut Vec, - prior_state: Option<&ClusterState>, - desired: &DesiredCluster, -) { - let Some(state) = prior_state else { - return; // no state: everything is already a Create carrying bindings - }; - for (address, desired_bindings) in &desired.policy_bindings { - if changes.iter().any(|change| &change.resource == address) { - continue; // content change already covers it - } - let Some(entry) = state.applied_revision.resources.get(address) else { - continue; // not applied yet: the Create covers it - }; - if entry.applies_to.as_ref() == Some(desired_bindings) { - continue; - } - changes.push(PlanChange { - resource: address.clone(), - operation: PlanOperation::Update, - before_digest: Some(entry.digest.clone()), - after_digest: Some(entry.digest.clone()), - disposition: None, - reason: None, - binding_change: true, - migration: None, - }); - } - changes.sort_by(|a, b| a.resource.cmp(&b.resource)); -} - -fn compute_blast_radius(changes: &[PlanChange], dependencies: &[Dependency]) -> Vec { - changes - .iter() - .filter_map(|change| { - let affected: Vec<_> = dependencies - .iter() - .filter_map(|dep| (dep.to == change.resource).then_some(dep.from.clone())) - .collect(); - (!affected.is_empty()).then(|| BlastRadius { - resource: change.resource.clone(), - affected, - }) - }) - .collect() -} - -fn compute_approvals( - changes: &[PlanChange], - approved: &BTreeSet, -) -> Vec { - // One gate per subtree: the graph. delete carries its schema and - // queries, so a schema delete whose graph is also deleted is not listed. - let graph_deletes: BTreeSet = changes - .iter() - .filter(|change| change.operation == PlanOperation::Delete) - .filter_map(|change| change.resource.strip_prefix("graph.").map(str::to_string)) - .collect(); - changes - .iter() - .filter_map(|change| { - if change.operation != PlanOperation::Delete { - return None; - } - let gated = match resource_kind(&change.resource) { - ResourceKind::Graph(_) => true, - ResourceKind::Schema(graph) => !graph_deletes.contains(&graph), - _ => false, - }; - gated.then(|| ApprovalRequirement { - resource: change.resource.clone(), - reason: "delete may remove deployed graph or schema definition".to_string(), - satisfied: approved.contains(&change.resource), - }) - }) - .collect() -} - -/// Resources with a valid (digest-matching, unconsumed) pending approval. -/// Near-misses — an artifact for the same resource whose bound digests no -/// longer match — warn as `approval_stale` and never authorize anything. -fn approved_resources( - artifacts: &[(PathBuf, ApprovalArtifact)], - changes: &[PlanChange], - config_digest: &str, - diagnostics: &mut Vec, -) -> BTreeSet { - let mut approved = BTreeSet::new(); - for change in changes { - let candidates: Vec<&ApprovalArtifact> = artifacts - .iter() - .map(|(_, artifact)| artifact) - .filter(|artifact| artifact.consumed_at.is_none() && artifact.resource == change.resource) - .collect(); - if candidates.is_empty() { - continue; - } - let matched = candidates.iter().any(|artifact| { - artifact.bound_config_digest == config_digest - && artifact.bound_before_digest == change.before_digest - && artifact.bound_after_digest == change.after_digest - }); - if matched { - approved.insert(change.resource.clone()); - } else { - diagnostics.push(Diagnostic::warning( - "approval_stale", - change.resource.clone(), - "an approval artifact exists but its bound digests no longer match the plan; re-run `cluster approve`", - )); - } - } - approved -} - -#[derive(Debug, PartialEq, Eq)] -enum ResourceKind { - Graph(String), - Schema(String), - Query { graph: String, name: String }, - Policy(String), - Unknown, -} - -fn resource_kind(address: &str) -> ResourceKind { - if let Some(graph) = address.strip_prefix("graph.") { - ResourceKind::Graph(graph.to_string()) - } else if let Some(graph) = address.strip_prefix("schema.") { - ResourceKind::Schema(graph.to_string()) - } else if let Some(rest) = address.strip_prefix("query.") { - match rest.split_once('.') { - Some((graph, name)) => ResourceKind::Query { - graph: graph.to_string(), - name: name.to_string(), - }, - None => ResourceKind::Unknown, - } - } else if let Some(name) = address.strip_prefix("policy.") { - ResourceKind::Policy(name.to_string()) - } else { - ResourceKind::Unknown - } -} - -/// Classify every planned change with the disposition config-only apply gives -/// it. Stage 3A executes only query/policy catalog writes; graph/schema -/// movement is a later phase, and `graph.` composite updates whose schema -/// component is unchanged converge automatically once query digests land. -fn classify_changes( - changes: &mut [PlanChange], - dependencies: &[Dependency], - pending_recovery: &BTreeSet, - approved: &BTreeSet, -) { - let mut schema_creates = BTreeSet::new(); - let mut schema_pending = BTreeSet::new(); - let mut graph_creates = BTreeSet::new(); - let mut graph_deletes = BTreeSet::new(); - for change in changes.iter() { - match resource_kind(&change.resource) { - ResourceKind::Schema(graph) => match change.operation { - PlanOperation::Create => { - schema_creates.insert(graph); - } - // Schema updates execute in-run before catalog writes (4B) - // and no longer block dependents; deletes (4C) still do. - PlanOperation::Update => {} - PlanOperation::Delete => { - schema_pending.insert(graph); - } - }, - ResourceKind::Graph(graph) => match change.operation { - PlanOperation::Create => { - graph_creates.insert(graph); - } - PlanOperation::Delete => { - graph_deletes.insert(graph); - } - PlanOperation::Update => {} - }, - _ => {} - } - } - // A schema Create is satisfied by its paired graph create (the init - // carries the schema); a standalone schema Create stays pending. - for graph in &schema_creates { - if !graph_creates.contains(graph) { - schema_pending.insert(graph.clone()); - } - } - // Subtree deletes ride the approved graph delete. - let rides_approved_delete = |graph: &str| { - graph_deletes.contains(graph) - && approved.contains(&graph_address(graph)) - && !pending_recovery.contains(graph) - }; - - for change in changes.iter_mut() { - let (disposition, reason) = match resource_kind(&change.resource) { - ResourceKind::Schema(graph) => match change.operation { - PlanOperation::Create - if graph_creates.contains(&graph) - && !pending_recovery.contains(&graph) => - { - // Applied with the graph create — the init carries it. - (ApplyDisposition::Applied, None) - } - PlanOperation::Update if !pending_recovery.contains(&graph) => { - // Stage 4B: schema updates execute via the engine's - // schema apply (soft drops only; allow_data_loss is 4C). - (ApplyDisposition::Applied, None) - } - PlanOperation::Create | PlanOperation::Update => { - (ApplyDisposition::Blocked, Some("cluster_recovery_pending")) - } - PlanOperation::Delete if graph_deletes.contains(&graph) => { - if rides_approved_delete(&graph) { - (ApplyDisposition::Applied, None) - } else if pending_recovery.contains(&graph) { - (ApplyDisposition::Blocked, Some("cluster_recovery_pending")) - } else { - (ApplyDisposition::Blocked, Some("approval_required")) - } - } - _ => (ApplyDisposition::Deferred, Some("apply_unsupported_kind")), - }, - ResourceKind::Graph(graph) => match change.operation { - PlanOperation::Create => { - if pending_recovery.contains(&graph) { - (ApplyDisposition::Blocked, Some("cluster_recovery_pending")) - } else { - (ApplyDisposition::Applied, None) - } - } - PlanOperation::Update if !schema_pending.contains(&graph) => { - (ApplyDisposition::Derived, None) - } - // Stage 4C: an approved graph delete executes (the - // irreversible tier — gated by a digest-bound artifact). - PlanOperation::Delete => { - if pending_recovery.contains(&graph) { - (ApplyDisposition::Blocked, Some("cluster_recovery_pending")) - } else if rides_approved_delete(&graph) { - (ApplyDisposition::Applied, None) - } else { - (ApplyDisposition::Blocked, Some("approval_required")) - } - } - _ => (ApplyDisposition::Deferred, Some("apply_unsupported_kind")), - }, - ResourceKind::Query { graph, .. } => match change.operation { - PlanOperation::Delete => { - if rides_approved_delete(&graph) { - // Tombstoned with the approved graph delete. - (ApplyDisposition::Applied, None) - } else if graph_deletes.contains(&graph) { - (ApplyDisposition::Blocked, Some("approval_required")) - } else { - (ApplyDisposition::Applied, None) - } - } - PlanOperation::Create | PlanOperation::Update => { - if pending_recovery.contains(&graph) { - (ApplyDisposition::Blocked, Some("cluster_recovery_pending")) - } else if schema_pending.contains(&graph) { - ( - ApplyDisposition::Blocked, - Some("dependency_not_applied"), - ) - } else { - // A graph create in the same plan no longer blocks: - // creates execute first in the same apply run. - (ApplyDisposition::Applied, None) - } - } - }, - ResourceKind::Policy(_) => match change.operation { - PlanOperation::Delete => (ApplyDisposition::Applied, None), - PlanOperation::Create | PlanOperation::Update => { - let blocked_pending = dependencies.iter().any(|dep| { - dep.from == change.resource - && dep - .to - .strip_prefix("graph.") - .is_some_and(|graph| pending_recovery.contains(graph)) - }); - if blocked_pending { - (ApplyDisposition::Blocked, Some("cluster_recovery_pending")) - } else { - (ApplyDisposition::Applied, None) - } - } - }, - ResourceKind::Unknown => { - (ApplyDisposition::Deferred, Some("apply_unsupported_kind")) - } - }; - change.disposition = Some(disposition); - change.reason = reason.map(str::to_string); - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -enum FailedGraphOrigin { - GraphCreate, - SchemaApply, - GraphDelete, -} - -/// After a graph-moving operation fails mid-run, every change that depended -/// on that graph flips from Applied to Blocked so the output and the -/// persisted statuses tell the truth about what this run actually executed. -/// The originating change carries the failure code; dependents carry -/// `dependency_not_applied`. -fn demote_dependents_of_failed_graphs( - changes: &mut [PlanChange], - failed: &BTreeMap, - dependencies: &[Dependency], -) { - for change in changes.iter_mut() { - if change.disposition != Some(ApplyDisposition::Applied) { - continue; - } - let demote_reason = match resource_kind(&change.resource) { - ResourceKind::Graph(graph) => match failed.get(&graph) { - Some(FailedGraphOrigin::GraphCreate) => Some("graph_create_failed"), - Some(FailedGraphOrigin::GraphDelete) => Some("graph_delete_failed"), - Some(FailedGraphOrigin::SchemaApply) => Some("dependency_not_applied"), - None => None, - }, - ResourceKind::Schema(graph) => match failed.get(&graph) { - Some(FailedGraphOrigin::SchemaApply) => Some("schema_apply_failed"), - Some(FailedGraphOrigin::GraphCreate) | Some(FailedGraphOrigin::GraphDelete) => { - Some("dependency_not_applied") - } - None => None, - }, - ResourceKind::Query { graph, .. } if failed.contains_key(&graph) => { - Some("dependency_not_applied") - } - ResourceKind::Policy(_) => { - let blocked = dependencies.iter().any(|dep| { - dep.from == change.resource - && dep - .to - .strip_prefix("graph.") - .is_some_and(|graph| failed.contains_key(graph)) - }); - blocked.then_some("dependency_not_applied") - } - _ => None, - }; - if let Some(reason) = demote_reason { - change.disposition = Some(ApplyDisposition::Blocked); - change.reason = Some(reason.to_string()); - } - } -} /// Content-addressed catalog path for an applied resource payload. Extensions /// are fixed per kind (`.gq` / `.yaml`) regardless of the source file's name, From dc0a1fc5a5720d44f78a59f3581f2408d8c79705 Mon Sep 17 00:00:00 2001 From: aaltshuler Date: Thu, 11 Jun 2026 05:37:20 +0300 Subject: [PATCH 6/7] refactor(cluster): move declared-config loading to config.rs Verbatim move of cluster.yaml parsing, query discovery, source digesting, header/id validation, path resolution, and live-graph observation. Two helpers that the cut swept along were relocated to their right homes (state-status helpers back to lib.rs, lock-file helpers to store.rs). 95 tests green. Co-Authored-By: Claude Fable 5 --- crates/omnigraph-cluster/src/config.rs | 881 ++++++++++++++++++++++ crates/omnigraph-cluster/src/lib.rs | 971 +------------------------ crates/omnigraph-cluster/src/store.rs | 27 + 3 files changed, 947 insertions(+), 932 deletions(-) create mode 100644 crates/omnigraph-cluster/src/config.rs diff --git a/crates/omnigraph-cluster/src/config.rs b/crates/omnigraph-cluster/src/config.rs new file mode 100644 index 0000000..ecdc71c --- /dev/null +++ b/crates/omnigraph-cluster/src/config.rs @@ -0,0 +1,881 @@ +//! Declared-configuration loading: cluster.yaml parsing, query +//! discovery, source digesting, validation (moved verbatim from lib.rs +//! in the modularization). Reads the operator's WORKING TREE — stored +//! state never lives here (see store.rs). + +use super::*; + +/// How a graph declares its stored queries. Terraform-style: the `.gq` +/// files ARE the declaration — point at them (or a directory) and every +/// `query ` they contain is discovered. The explicit name->file map +/// remains for fine-grained control. +#[derive(Debug, Serialize, Deserialize)] +#[serde(untagged)] +pub(crate) enum QueriesDecl { + /// `queries: ./queries/` — a directory (top-level `*.gq`, sorted) or a + /// single `.gq` file; every declaration inside is registered. + Discover(PathBuf), + /// `queries: [./queries/, ./extra.gq]` — several directories/files. + DiscoverMany(Vec), + /// `queries: { name: { file: ... } }` — explicit registry. + Explicit(BTreeMap), +} + +impl Default for QueriesDecl { + fn default() -> Self { + QueriesDecl::Explicit(BTreeMap::new()) + } +} + +/// Expand a graph's query declaration into the canonical name->file map. +/// Discovery reads and parses each `.gq`; unreadable or unparseable files +/// and duplicate query names are loud validation errors — a declaration the +/// tool cannot enumerate is broken, not partially usable. +pub(crate) fn resolve_query_decls( + config_dir: &Path, + graph_id: &str, + decl: &QueriesDecl, + diagnostics: &mut Vec, +) -> (BTreeMap, BTreeMap) { + let paths: Vec = match decl { + QueriesDecl::Explicit(map) => { + return ( + map.iter() + .map(|(name, config)| { + (name.clone(), QueryConfig { file: config.file.clone() }) + }) + .collect(), + BTreeMap::new(), + ); + } + QueriesDecl::Discover(path) => vec![path.clone()], + QueriesDecl::DiscoverMany(paths) => paths.clone(), + }; + + let mut files: Vec<(PathBuf, PathBuf)> = Vec::new(); // (declared-relative, resolved) + for declared in &paths { + let resolved = resolve_config_path(config_dir, declared); + if resolved.is_dir() { + let mut entries: Vec = match fs::read_dir(&resolved) { + Ok(read) => read + .flatten() + .map(|entry| entry.path()) + .filter(|path| path.extension().is_some_and(|ext| ext == "gq")) + .collect(), + Err(err) => { + diagnostics.push(Diagnostic::error( + "query_dir_unreadable", + format!("graphs.{graph_id}.queries"), + format!("could not list query directory '{}': {err}", resolved.display()), + )); + continue; + } + }; + entries.sort(); + if entries.is_empty() { + diagnostics.push(Diagnostic::warning( + "query_dir_empty", + format!("graphs.{graph_id}.queries"), + format!("query directory '{}' contains no .gq files", resolved.display()), + )); + } + for path in entries { + let relative = declared.join(path.file_name().expect("dir entries have names")); + files.push((relative, path)); + } + } else { + files.push((declared.clone(), resolved)); + } + } + + let mut registry: BTreeMap = BTreeMap::new(); + let mut origin: BTreeMap = BTreeMap::new(); + // Content read once at discovery and handed to the caller — the per-query + // digest/typecheck pass reuses it instead of re-reading (no N+1 reads, no + // window for the file to change between enumeration and validation). + let mut contents: BTreeMap = BTreeMap::new(); + for (declared, resolved) in files { + let source = match fs::read_to_string(&resolved) { + Ok(source) => source, + Err(err) => { + diagnostics.push(Diagnostic::error( + "query_file_missing", + format!("graphs.{graph_id}.queries"), + format!("could not read query file '{}': {err}", resolved.display()), + )); + continue; + } + }; + let parsed = match parse_query(&source) { + Ok(parsed) => parsed, + Err(err) => { + diagnostics.push(Diagnostic::error( + "query_parse_error", + format!("graphs.{graph_id}.queries"), + format!("'{}' does not parse: {err}", resolved.display()), + )); + continue; + } + }; + for query_decl in &parsed.queries { + let name = query_decl.name.clone(); + if let Some(previous) = origin.get(&name) { + diagnostics.push(Diagnostic::error( + "duplicate_query_name", + format!("graphs.{graph_id}.queries.{name}"), + format!( + "query '{name}' is declared in both '{}' and '{}'", + previous.display(), + declared.display() + ), + )); + continue; + } + origin.insert(name.clone(), declared.clone()); + registry.insert(name, QueryConfig { file: declared.clone() }); + } + contents.insert(declared, source); + } + (registry, contents) +} + +pub(crate) fn parse_cluster_config(config_dir: &Path) -> ParsedConfig { + let config_dir = config_dir.to_path_buf(); + let config_file = config_dir.join(CLUSTER_CONFIG_FILE); + let mut diagnostics = Vec::new(); + + if !config_dir.is_dir() { + diagnostics.push(Diagnostic::error( + "config_dir_not_found", + display_path(&config_dir), + "`--config` must point at a directory containing cluster.yaml", + )); + return ParsedConfig { + raw: None, + diagnostics, + config_dir, + config_file, + }; + } + + let text = match fs::read_to_string(&config_file) { + Ok(text) => text, + Err(err) => { + diagnostics.push(Diagnostic::error( + "cluster_config_read_error", + CLUSTER_CONFIG_FILE, + format!("could not read cluster.yaml: {err}"), + )); + return ParsedConfig { + raw: None, + diagnostics, + config_dir, + config_file, + }; + } + }; + + diagnostics.extend(duplicate_key_diagnostics(&text)); + diagnostics.extend(future_field_diagnostics(&text)); + if has_errors(&diagnostics) { + return ParsedConfig { + raw: None, + diagnostics, + config_dir, + config_file, + }; + } + + let raw = match serde_yaml::from_str::(&text) { + Ok(raw) => Some(raw), + Err(err) => { + diagnostics.push(Diagnostic::error( + "invalid_cluster_yaml", + CLUSTER_CONFIG_FILE, + format!("could not parse cluster.yaml: {err}"), + )); + None + } + }; + + ParsedConfig { + raw, + diagnostics, + config_dir, + config_file, + } +} + +pub(crate) fn validate_cluster_header( + raw: &RawClusterConfig, + diagnostics: &mut Vec, +) -> ClusterSettings { + if raw.version != 1 { + diagnostics.push(Diagnostic::error( + "unsupported_cluster_config_version", + "version", + format!( + "unsupported cluster config version {}; this build supports version 1", + raw.version + ), + )); + } + if let Some(name) = raw.metadata.name.as_deref() { + if name.trim().is_empty() { + diagnostics.push(Diagnostic::error( + "empty_metadata_name", + "metadata.name", + "metadata.name must not be empty when provided", + )); + } + } + if let Some(backend) = raw.state.backend.as_deref() { + if backend != "cluster" { + diagnostics.push(Diagnostic::error( + "unsupported_state_backend", + "state.backend", + "Stage 2C supports only omitted state.backend or `cluster`", + )); + } + } + + ClusterSettings { + state_lock: raw.state.lock.unwrap_or(true), + } +} + + + +pub(crate) fn state_resource_digests(state: &ClusterState) -> BTreeMap { + state + .applied_revision + .resources + .iter() + .map(|(address, resource)| (address.clone(), resource.digest.clone())) + .collect() +} + +pub(crate) fn initial_import_state(desired: &DesiredCluster) -> ClusterState { + ClusterState { + version: 1, + state_revision: 0, + applied_revision: AppliedRevisionState { + config_digest: Some(desired.config_digest.clone()), + resources: BTreeMap::new(), + }, + resource_statuses: BTreeMap::new(), + approval_records: BTreeMap::new(), + recovery_records: BTreeMap::new(), + observations: BTreeMap::new(), + } +} + + +pub(crate) async fn observe_declared_graphs(desired: &DesiredCluster, state: &mut ClusterState) -> usize { + let mut graph_error_count = 0; + for graph in &desired.graphs { + let graph_address = graph_address(&graph.id); + let schema_address = schema_address(&graph.id); + let graph_path = desired + .config_dir + .join(CLUSTER_GRAPHS_DIR) + .join(format!("{}.omni", graph.id)); + let graph_uri = display_path(&graph_path); + let observed_at = now_rfc3339(); + + if !graph_path.exists() { + state.applied_revision.resources.remove(&graph_address); + state.applied_revision.resources.remove(&schema_address); + state.observations.insert( + graph_address.clone(), + graph_observation_json(GraphObservationJson { + address: &graph_address, + graph_uri: &graph_uri, + observed_at: &observed_at, + exists: false, + manifest_version: None, + schema_digest: None, + desired_schema_digest: &graph.schema_digest, + schema_matches_desired: Some(false), + error: Some("derived graph root is missing"), + }), + ); + set_resource_status( + state, + &graph_address, + ResourceLifecycleStatus::Drifted, + "graph_missing", + "derived graph root is missing", + ); + set_resource_status( + state, + &schema_address, + ResourceLifecycleStatus::Drifted, + "graph_missing", + "derived graph root is missing", + ); + continue; + } + + match observe_live_graph(&graph_uri).await { + Ok(observation) => { + let schema_matches = observation.schema_digest == graph.schema_digest; + state.applied_revision.resources.insert( + schema_address.clone(), + StateResource { + digest: observation.schema_digest.clone(), + applies_to: None, + }, + ); + let query_digests = state_query_digests_for_graph(state, &graph.id); + let graph_digest_value = graph_digest( + &graph.id, + Some(&observation.schema_digest), + Some(&query_digests), + ); + state.applied_revision.resources.insert( + graph_address.clone(), + StateResource { + digest: graph_digest_value, + applies_to: None, + }, + ); + state.observations.insert( + graph_address.clone(), + graph_observation_json(GraphObservationJson { + address: &graph_address, + graph_uri: &graph_uri, + observed_at: &observed_at, + exists: true, + manifest_version: Some(observation.manifest_version), + schema_digest: Some(observation.schema_digest.as_str()), + desired_schema_digest: &graph.schema_digest, + schema_matches_desired: Some(schema_matches), + error: None, + }), + ); + if schema_matches { + set_resource_status_applied(state, &graph_address); + set_resource_status_applied(state, &schema_address); + } else { + set_resource_status( + state, + &graph_address, + ResourceLifecycleStatus::Drifted, + "schema_mismatch", + "live schema digest differs from desired schema digest", + ); + set_resource_status( + state, + &schema_address, + ResourceLifecycleStatus::Drifted, + "schema_mismatch", + "live schema digest differs from desired schema digest", + ); + } + } + Err(error) => { + graph_error_count += 1; + state.observations.insert( + graph_address.clone(), + graph_observation_json(GraphObservationJson { + address: &graph_address, + graph_uri: &graph_uri, + observed_at: &observed_at, + exists: true, + manifest_version: None, + schema_digest: None, + desired_schema_digest: &graph.schema_digest, + schema_matches_desired: None, + error: Some(error.as_str()), + }), + ); + set_resource_status( + state, + &graph_address, + ResourceLifecycleStatus::Error, + "graph_observation_error", + error.as_str(), + ); + set_resource_status( + state, + &schema_address, + ResourceLifecycleStatus::Error, + "graph_observation_error", + error.as_str(), + ); + } + } + } + graph_error_count +} + +/// RFC-004 §D7: the data-aware preview — the engine's migration plan for a +/// desired schema against the live graph, computed read-only (no lock). +pub(crate) async fn preview_schema_migration( + graph_uri: &str, + schema_path: &str, +) -> Result { + let source = fs::read_to_string(schema_path).map_err(|err| err.to_string())?; + let db = Omnigraph::open_read_only(graph_uri) + .await + .map_err(|err| err.to_string())?; + let preview = db + .preview_schema_apply_with_options(&source, SchemaApplyOptions::default()) + .await + .map_err(|err| err.to_string())?; + Ok(preview.plan) +} + +struct LiveGraphObservation { + manifest_version: u64, + schema_digest: String, +} + +pub(crate) async fn observe_live_graph(graph_uri: &str) -> Result { + let db = Omnigraph::open_read_only(graph_uri) + .await + .map_err(|err| err.to_string())?; + let snapshot = db + .snapshot_of(ReadTarget::branch("main")) + .await + .map_err(|err| err.to_string())?; + let schema_source = db.schema_source(); + Ok(LiveGraphObservation { + manifest_version: snapshot.version(), + schema_digest: sha256_hex(schema_source.as_bytes()), + }) +} + +struct GraphObservationJson<'a> { + address: &'a str, + graph_uri: &'a str, + observed_at: &'a str, + exists: bool, + manifest_version: Option, + schema_digest: Option<&'a str>, + desired_schema_digest: &'a str, + schema_matches_desired: Option, + error: Option<&'a str>, +} + +pub(crate) fn graph_observation_json(observation: GraphObservationJson<'_>) -> serde_json::Value { + json!({ + "kind": "graph", + "address": observation.address, + "graph_uri": observation.graph_uri, + "observed_at": observation.observed_at, + "exists": observation.exists, + "manifest_version": observation.manifest_version, + "schema_digest": observation.schema_digest, + "desired_schema_digest": observation.desired_schema_digest, + "schema_matches_desired": observation.schema_matches_desired, + "error": observation.error, + }) +} + + +pub(crate) fn load_desired(config_dir: &Path) -> LoadOutcome { + let parsed = parse_cluster_config(config_dir); + let config_dir = parsed.config_dir; + let config_file = parsed.config_file; + let mut diagnostics = parsed.diagnostics; + let Some(raw) = parsed.raw else { + return LoadOutcome { + desired: None, + diagnostics, + config_dir, + config_file, + }; + }; + let settings = validate_cluster_header(&raw, &mut diagnostics); + + let mut resources = BTreeMap::new(); + let mut dependencies = BTreeSet::new(); + let mut graph_query_digests: BTreeMap> = BTreeMap::new(); + let mut graph_schema_digests: BTreeMap = BTreeMap::new(); + + for (graph_id, graph) in &raw.graphs { + validate_id( + "graph id", + &format!("graphs.{graph_id}"), + graph_id, + &mut diagnostics, + ); + let graph_address = graph_address(graph_id); + let schema_address = schema_address(graph_id); + dependencies.insert(Dependency { + from: schema_address.clone(), + to: graph_address.clone(), + }); + + let schema_path = resolve_config_path(&config_dir, &graph.schema); + let schema_source = match fs::read_to_string(&schema_path) { + Ok(source) => { + let digest = sha256_hex(source.as_bytes()); + graph_schema_digests.insert(graph_id.clone(), digest.clone()); + resources.insert( + schema_address.clone(), + ResourceSummary { + address: schema_address.clone(), + kind: "schema".to_string(), + digest, + path: Some(display_path(&schema_path)), + }, + ); + Some(source) + } + Err(err) => { + diagnostics.push(Diagnostic::error( + "schema_file_missing", + format!("graphs.{graph_id}.schema"), + format!( + "could not read schema file '{}': {err}", + schema_path.display() + ), + )); + None + } + }; + + let catalog = schema_source.and_then(|source| match parse_schema(&source) { + Ok(schema) => match build_catalog(&schema) { + Ok(catalog) => Some(catalog), + Err(err) => { + diagnostics.push(Diagnostic::error( + "schema_catalog_error", + format!("graphs.{graph_id}.schema"), + err.to_string(), + )); + None + } + }, + Err(err) => { + diagnostics.push(Diagnostic::error( + "schema_parse_error", + format!("graphs.{graph_id}.schema"), + err.to_string(), + )); + None + } + }); + + let (graph_queries, query_contents) = + resolve_query_decls(&config_dir, graph_id, &graph.queries, &mut diagnostics); + for (query_name, query) in &graph_queries { + validate_id( + "query name", + &format!("graphs.{graph_id}.queries.{query_name}"), + query_name, + &mut diagnostics, + ); + let query_address = query_address(graph_id, query_name); + dependencies.insert(Dependency { + from: query_address.clone(), + to: graph_address.clone(), + }); + dependencies.insert(Dependency { + from: query_address.clone(), + to: schema_address.clone(), + }); + + let query_path = resolve_config_path(&config_dir, &query.file); + let source = match query_contents.get(&query.file) { + Some(cached) => Ok(cached.clone()), + None => fs::read_to_string(&query_path), + }; + match source { + Ok(source) => { + let digest = sha256_hex(source.as_bytes()); + graph_query_digests + .entry(graph_id.clone()) + .or_default() + .insert(query_name.clone(), digest.clone()); + resources.insert( + query_address.clone(), + ResourceSummary { + address: query_address, + kind: "query".to_string(), + digest, + path: Some(display_path(&query_path)), + }, + ); + validate_query_source( + graph_id, + query_name, + &source, + catalog.as_ref(), + &mut diagnostics, + ); + } + Err(err) => diagnostics.push(Diagnostic::error( + "query_file_missing", + format!("graphs.{graph_id}.queries.{query_name}.file"), + format!( + "could not read query file '{}': {err}", + query_path.display() + ), + )), + } + } + } + + for graph_id in raw.graphs.keys() { + let digest = graph_digest( + graph_id, + graph_schema_digests.get(graph_id), + graph_query_digests.get(graph_id), + ); + resources.insert( + graph_address(graph_id), + ResourceSummary { + address: graph_address(graph_id), + kind: "graph".to_string(), + digest, + path: None, + }, + ); + } + + let mut policy_bindings: BTreeMap> = BTreeMap::new(); + for (policy_name, policy) in &raw.policies { + validate_id( + "policy name", + &format!("policies.{policy_name}"), + policy_name, + &mut diagnostics, + ); + if policy.applies_to.is_empty() { + diagnostics.push(Diagnostic::error( + "policy_missing_applies_to", + format!("policies.{policy_name}.applies_to"), + "policy.applies_to must name `cluster` or at least one graph", + )); + } + + let policy_address = policy_address(policy_name); + let mut normalized_bindings: Vec = Vec::new(); + for (idx, target) in policy.applies_to.iter().enumerate() { + match normalize_policy_target(target) { + PolicyTarget::Cluster => { + normalized_bindings.push("cluster".to_string()); + } + PolicyTarget::Graph(graph_id) => { + normalized_bindings.push(graph_address(&graph_id)); + if raw.graphs.contains_key(&graph_id) { + dependencies.insert(Dependency { + from: policy_address.clone(), + to: graph_address(&graph_id), + }); + } else { + diagnostics.push(Diagnostic::error( + "dangling_graph_reference", + format!("policies.{policy_name}.applies_to[{idx}]"), + format!( + "policy references graph `{graph_id}`, but no graph with that id is declared" + ), + )); + } + } + PolicyTarget::WrongKind(kind) => diagnostics.push(Diagnostic::error( + "wrong_kind_reference", + format!("policies.{policy_name}.applies_to[{idx}]"), + format!("policy applies_to expects graph refs or `cluster`, got `{kind}`"), + )), + } + } + + normalized_bindings.sort(); + normalized_bindings.dedup(); + policy_bindings.insert(policy_address.clone(), normalized_bindings); + + let policy_path = resolve_config_path(&config_dir, &policy.file); + match fs::read(&policy_path) { + Ok(bytes) => { + resources.insert( + policy_address.clone(), + ResourceSummary { + address: policy_address, + kind: "policy".to_string(), + digest: sha256_hex(&bytes), + path: Some(display_path(&policy_path)), + }, + ); + } + Err(err) => diagnostics.push(Diagnostic::error( + "policy_file_missing", + format!("policies.{policy_name}.file"), + format!( + "could not read policy file '{}': {err}", + policy_path.display() + ), + )), + } + } + + let mut resource_digests = BTreeMap::new(); + let mut resource_list = Vec::new(); + for (address, resource) in resources { + resource_digests.insert(address, resource.digest.clone()); + resource_list.push(resource); + } + let dependencies: Vec<_> = dependencies.into_iter().collect(); + let graphs = raw + .graphs + .keys() + .map(|graph_id| DesiredGraph { + id: graph_id.clone(), + schema_digest: graph_schema_digests + .get(graph_id) + .cloned() + .unwrap_or_default(), + }) + .collect(); + let config_digest = desired_config_digest(&raw, &resource_digests); + + LoadOutcome { + desired: Some(DesiredCluster { + config_dir: config_dir.clone(), + config_digest, + state_lock: settings.state_lock, + graphs, + resource_digests, + resources: resource_list, + dependencies, + policy_bindings, + }), + diagnostics, + config_dir, + config_file, + } +} + +pub(crate) fn validate_query_source( + graph_id: &str, + query_name: &str, + source: &str, + catalog: Option<&omnigraph_compiler::catalog::Catalog>, + diagnostics: &mut Vec, +) { + let path = format!("graphs.{graph_id}.queries.{query_name}"); + match parse_query(source) { + Ok(query_file) => { + let Some(query_decl) = query_file.queries.iter().find(|q| q.name == query_name) else { + diagnostics.push(Diagnostic::error( + "query_key_mismatch", + path, + format!("no `query {query_name}` declaration found in the referenced .gq file"), + )); + return; + }; + if let Some(catalog) = catalog { + if let Err(err) = typecheck_query_decl(catalog, query_decl) { + diagnostics.push(Diagnostic::error( + "query_typecheck_error", + format!("graphs.{graph_id}.queries.{query_name}"), + err.to_string(), + )); + } + } else { + diagnostics.push(Diagnostic::warning( + "query_typecheck_skipped", + format!("graphs.{graph_id}.queries.{query_name}"), + "query parsed, but type-check was skipped because the graph schema is invalid", + )); + } + } + Err(err) => diagnostics.push(Diagnostic::error( + "query_parse_error", + path, + err.to_string(), + )), + } +} + +pub(crate) fn future_field_diagnostics(text: &str) -> Vec { + let Ok(value) = serde_yaml::from_str::(text) else { + return Vec::new(); + }; + let Some(mapping) = value.as_mapping() else { + return Vec::new(); + }; + let future_fields = [ + "apply", + "env_file", + "providers", + "pipelines", + "embeddings", + "ui", + "aliases", + "bindings", + ]; + mapping + .keys() + .filter_map(|key| key.as_str()) + .filter(|key| future_fields.contains(key)) + .map(|key| { + Diagnostic::error( + "future_phase_field", + key, + format!("`{key}` is reserved for a later cluster-control phase"), + ) + }) + .collect() +} + +pub(crate) fn validate_id(kind: &str, path: &str, value: &str, diagnostics: &mut Vec) { + let mut chars = value.chars(); + let valid = chars + .next() + .is_some_and(|ch| ch.is_ascii_alphabetic() || ch == '_') + && chars.all(|ch| ch.is_ascii_alphanumeric() || ch == '_' || ch == '-'); + if !valid { + diagnostics.push(Diagnostic::error( + "invalid_resource_id", + path, + format!("{kind} `{value}` must start with a letter or `_` and contain only ASCII letters, digits, `_`, or `-`"), + )); + } +} + +enum PolicyTarget { + Cluster, + Graph(String), + WrongKind(String), +} + +pub(crate) fn normalize_policy_target(value: &str) -> PolicyTarget { + if value == "cluster" { + PolicyTarget::Cluster + } else if let Some(graph_id) = value.strip_prefix("graph.") { + PolicyTarget::Graph(graph_id.to_string()) + } else if value.contains('.') { + PolicyTarget::WrongKind(value.to_string()) + } else { + PolicyTarget::Graph(value.to_string()) + } +} + +pub(crate) fn graph_address(graph_id: &str) -> String { + format!("graph.{graph_id}") +} + +pub(crate) fn schema_address(graph_id: &str) -> String { + format!("schema.{graph_id}") +} + +pub(crate) fn query_address(graph_id: &str, query_name: &str) -> String { + format!("query.{graph_id}.{query_name}") +} + +pub(crate) fn policy_address(policy_name: &str) -> String { + format!("policy.{policy_name}") +} + +pub(crate) fn resolve_config_path(config_dir: &Path, path: &Path) -> PathBuf { + if path.is_absolute() { + path.to_path_buf() + } else { + config_dir.join(path) + } +} diff --git a/crates/omnigraph-cluster/src/lib.rs b/crates/omnigraph-cluster/src/lib.rs index c54d245..8b41fdf 100644 --- a/crates/omnigraph-cluster/src/lib.rs +++ b/crates/omnigraph-cluster/src/lib.rs @@ -19,6 +19,7 @@ use ulid::Ulid; pub mod failpoints; +mod config; mod diff; mod serve; mod sweep; @@ -26,6 +27,7 @@ mod store; use store::{LocalStateBackend, StateLockGuard, StateSnapshot}; pub use serve::{ServingGraph, ServingPolicy, ServingQuery, ServingSnapshot, read_serving_snapshot}; use serve::read_verified_payload; +use config::{QueriesDecl, observe_declared_graphs, validate_cluster_header, future_field_diagnostics, initial_import_state, observe_live_graph, preview_schema_migration, state_resource_digests, graph_address, policy_address, query_address, schema_address, load_desired, normalize_policy_target, parse_cluster_config, resolve_config_path, resolve_query_decls, validate_id, validate_query_source}; use diff::{FailedGraphOrigin, ResourceKind, append_policy_binding_changes, approved_resources, classify_changes, compute_approvals, compute_blast_radius, demote_dependents_of_failed_graphs, diff_resources, resource_kind}; use sweep::{mark_approvals_consumed, record_approval_consumed, sweep_recovery_sidecars, tombstone_graph_subtree, warn_pending_recovery_sidecars}; @@ -430,138 +432,6 @@ struct GraphConfig { /// How a graph declares its stored queries. Terraform-style: the `.gq` /// files ARE the declaration — point at them (or a directory) and every -/// `query ` they contain is discovered. The explicit name->file map -/// remains for fine-grained control. -#[derive(Debug, Serialize, Deserialize)] -#[serde(untagged)] -enum QueriesDecl { - /// `queries: ./queries/` — a directory (top-level `*.gq`, sorted) or a - /// single `.gq` file; every declaration inside is registered. - Discover(PathBuf), - /// `queries: [./queries/, ./extra.gq]` — several directories/files. - DiscoverMany(Vec), - /// `queries: { name: { file: ... } }` — explicit registry. - Explicit(BTreeMap), -} - -impl Default for QueriesDecl { - fn default() -> Self { - QueriesDecl::Explicit(BTreeMap::new()) - } -} - -/// Expand a graph's query declaration into the canonical name->file map. -/// Discovery reads and parses each `.gq`; unreadable or unparseable files -/// and duplicate query names are loud validation errors — a declaration the -/// tool cannot enumerate is broken, not partially usable. -fn resolve_query_decls( - config_dir: &Path, - graph_id: &str, - decl: &QueriesDecl, - diagnostics: &mut Vec, -) -> (BTreeMap, BTreeMap) { - let paths: Vec = match decl { - QueriesDecl::Explicit(map) => { - return ( - map.iter() - .map(|(name, config)| { - (name.clone(), QueryConfig { file: config.file.clone() }) - }) - .collect(), - BTreeMap::new(), - ); - } - QueriesDecl::Discover(path) => vec![path.clone()], - QueriesDecl::DiscoverMany(paths) => paths.clone(), - }; - - let mut files: Vec<(PathBuf, PathBuf)> = Vec::new(); // (declared-relative, resolved) - for declared in &paths { - let resolved = resolve_config_path(config_dir, declared); - if resolved.is_dir() { - let mut entries: Vec = match fs::read_dir(&resolved) { - Ok(read) => read - .flatten() - .map(|entry| entry.path()) - .filter(|path| path.extension().is_some_and(|ext| ext == "gq")) - .collect(), - Err(err) => { - diagnostics.push(Diagnostic::error( - "query_dir_unreadable", - format!("graphs.{graph_id}.queries"), - format!("could not list query directory '{}': {err}", resolved.display()), - )); - continue; - } - }; - entries.sort(); - if entries.is_empty() { - diagnostics.push(Diagnostic::warning( - "query_dir_empty", - format!("graphs.{graph_id}.queries"), - format!("query directory '{}' contains no .gq files", resolved.display()), - )); - } - for path in entries { - let relative = declared.join(path.file_name().expect("dir entries have names")); - files.push((relative, path)); - } - } else { - files.push((declared.clone(), resolved)); - } - } - - let mut registry: BTreeMap = BTreeMap::new(); - let mut origin: BTreeMap = BTreeMap::new(); - // Content read once at discovery and handed to the caller — the per-query - // digest/typecheck pass reuses it instead of re-reading (no N+1 reads, no - // window for the file to change between enumeration and validation). - let mut contents: BTreeMap = BTreeMap::new(); - for (declared, resolved) in files { - let source = match fs::read_to_string(&resolved) { - Ok(source) => source, - Err(err) => { - diagnostics.push(Diagnostic::error( - "query_file_missing", - format!("graphs.{graph_id}.queries"), - format!("could not read query file '{}': {err}", resolved.display()), - )); - continue; - } - }; - let parsed = match parse_query(&source) { - Ok(parsed) => parsed, - Err(err) => { - diagnostics.push(Diagnostic::error( - "query_parse_error", - format!("graphs.{graph_id}.queries"), - format!("'{}' does not parse: {err}", resolved.display()), - )); - continue; - } - }; - for query_decl in &parsed.queries { - let name = query_decl.name.clone(); - if let Some(previous) = origin.get(&name) { - diagnostics.push(Diagnostic::error( - "duplicate_query_name", - format!("graphs.{graph_id}.queries.{name}"), - format!( - "query '{name}' is declared in both '{}' and '{}'", - previous.display(), - declared.display() - ), - )); - continue; - } - origin.insert(name.clone(), declared.clone()); - registry.insert(name, QueryConfig { file: declared.clone() }); - } - contents.insert(declared, source); - } - (registry, contents) -} - #[derive(Debug, Serialize, Deserialize)] #[serde(deny_unknown_fields)] struct QueryConfig { @@ -2138,725 +2008,6 @@ async fn sync_config_dir(config_dir: &Path, operation: StateSyncOperation) -> St } } -fn parse_cluster_config(config_dir: &Path) -> ParsedConfig { - let config_dir = config_dir.to_path_buf(); - let config_file = config_dir.join(CLUSTER_CONFIG_FILE); - let mut diagnostics = Vec::new(); - - if !config_dir.is_dir() { - diagnostics.push(Diagnostic::error( - "config_dir_not_found", - display_path(&config_dir), - "`--config` must point at a directory containing cluster.yaml", - )); - return ParsedConfig { - raw: None, - diagnostics, - config_dir, - config_file, - }; - } - - let text = match fs::read_to_string(&config_file) { - Ok(text) => text, - Err(err) => { - diagnostics.push(Diagnostic::error( - "cluster_config_read_error", - CLUSTER_CONFIG_FILE, - format!("could not read cluster.yaml: {err}"), - )); - return ParsedConfig { - raw: None, - diagnostics, - config_dir, - config_file, - }; - } - }; - - diagnostics.extend(duplicate_key_diagnostics(&text)); - diagnostics.extend(future_field_diagnostics(&text)); - if has_errors(&diagnostics) { - return ParsedConfig { - raw: None, - diagnostics, - config_dir, - config_file, - }; - } - - let raw = match serde_yaml::from_str::(&text) { - Ok(raw) => Some(raw), - Err(err) => { - diagnostics.push(Diagnostic::error( - "invalid_cluster_yaml", - CLUSTER_CONFIG_FILE, - format!("could not parse cluster.yaml: {err}"), - )); - None - } - }; - - ParsedConfig { - raw, - diagnostics, - config_dir, - config_file, - } -} - -fn validate_cluster_header( - raw: &RawClusterConfig, - diagnostics: &mut Vec, -) -> ClusterSettings { - if raw.version != 1 { - diagnostics.push(Diagnostic::error( - "unsupported_cluster_config_version", - "version", - format!( - "unsupported cluster config version {}; this build supports version 1", - raw.version - ), - )); - } - if let Some(name) = raw.metadata.name.as_deref() { - if name.trim().is_empty() { - diagnostics.push(Diagnostic::error( - "empty_metadata_name", - "metadata.name", - "metadata.name must not be empty when provided", - )); - } - } - if let Some(backend) = raw.state.backend.as_deref() { - if backend != "cluster" { - diagnostics.push(Diagnostic::error( - "unsupported_state_backend", - "state.backend", - "Stage 2C supports only omitted state.backend or `cluster`", - )); - } - } - - ClusterSettings { - state_lock: raw.state.lock.unwrap_or(true), - } -} - - -fn parse_lock_file_for_unlock(text: &str) -> Result { - let lock = serde_json::from_str::(text).map_err(|err| { - Diagnostic::error( - "invalid_state_lock", - CLUSTER_LOCK_FILE, - format!("could not parse state lock: {err}"), - ) - })?; - if lock.version != 1 { - return Err(Diagnostic::error( - "unsupported_state_lock_version", - CLUSTER_LOCK_FILE, - format!("unsupported cluster state lock version {}", lock.version), - )); - } - Ok(lock) -} - -fn state_lock_held_message(observations: &StateObservations) -> String { - match observations.lock_id.as_deref() { - Some(lock_id) => format!( - "cluster state lock already exists (lock id {lock_id}); run `omnigraph cluster force-unlock {lock_id}` only after confirming no cluster operation is active" - ), - None => "cluster state lock already exists; remove it only after confirming no cluster operation is active".to_string(), - } -} - -fn state_resource_digests(state: &ClusterState) -> BTreeMap { - state - .applied_revision - .resources - .iter() - .map(|(address, resource)| (address.clone(), resource.digest.clone())) - .collect() -} - -fn initial_import_state(desired: &DesiredCluster) -> ClusterState { - ClusterState { - version: 1, - state_revision: 0, - applied_revision: AppliedRevisionState { - config_digest: Some(desired.config_digest.clone()), - resources: BTreeMap::new(), - }, - resource_statuses: BTreeMap::new(), - approval_records: BTreeMap::new(), - recovery_records: BTreeMap::new(), - observations: BTreeMap::new(), - } -} - - -async fn observe_declared_graphs(desired: &DesiredCluster, state: &mut ClusterState) -> usize { - let mut graph_error_count = 0; - for graph in &desired.graphs { - let graph_address = graph_address(&graph.id); - let schema_address = schema_address(&graph.id); - let graph_path = desired - .config_dir - .join(CLUSTER_GRAPHS_DIR) - .join(format!("{}.omni", graph.id)); - let graph_uri = display_path(&graph_path); - let observed_at = now_rfc3339(); - - if !graph_path.exists() { - state.applied_revision.resources.remove(&graph_address); - state.applied_revision.resources.remove(&schema_address); - state.observations.insert( - graph_address.clone(), - graph_observation_json(GraphObservationJson { - address: &graph_address, - graph_uri: &graph_uri, - observed_at: &observed_at, - exists: false, - manifest_version: None, - schema_digest: None, - desired_schema_digest: &graph.schema_digest, - schema_matches_desired: Some(false), - error: Some("derived graph root is missing"), - }), - ); - set_resource_status( - state, - &graph_address, - ResourceLifecycleStatus::Drifted, - "graph_missing", - "derived graph root is missing", - ); - set_resource_status( - state, - &schema_address, - ResourceLifecycleStatus::Drifted, - "graph_missing", - "derived graph root is missing", - ); - continue; - } - - match observe_live_graph(&graph_uri).await { - Ok(observation) => { - let schema_matches = observation.schema_digest == graph.schema_digest; - state.applied_revision.resources.insert( - schema_address.clone(), - StateResource { - digest: observation.schema_digest.clone(), - applies_to: None, - }, - ); - let query_digests = state_query_digests_for_graph(state, &graph.id); - let graph_digest_value = graph_digest( - &graph.id, - Some(&observation.schema_digest), - Some(&query_digests), - ); - state.applied_revision.resources.insert( - graph_address.clone(), - StateResource { - digest: graph_digest_value, - applies_to: None, - }, - ); - state.observations.insert( - graph_address.clone(), - graph_observation_json(GraphObservationJson { - address: &graph_address, - graph_uri: &graph_uri, - observed_at: &observed_at, - exists: true, - manifest_version: Some(observation.manifest_version), - schema_digest: Some(observation.schema_digest.as_str()), - desired_schema_digest: &graph.schema_digest, - schema_matches_desired: Some(schema_matches), - error: None, - }), - ); - if schema_matches { - set_resource_status_applied(state, &graph_address); - set_resource_status_applied(state, &schema_address); - } else { - set_resource_status( - state, - &graph_address, - ResourceLifecycleStatus::Drifted, - "schema_mismatch", - "live schema digest differs from desired schema digest", - ); - set_resource_status( - state, - &schema_address, - ResourceLifecycleStatus::Drifted, - "schema_mismatch", - "live schema digest differs from desired schema digest", - ); - } - } - Err(error) => { - graph_error_count += 1; - state.observations.insert( - graph_address.clone(), - graph_observation_json(GraphObservationJson { - address: &graph_address, - graph_uri: &graph_uri, - observed_at: &observed_at, - exists: true, - manifest_version: None, - schema_digest: None, - desired_schema_digest: &graph.schema_digest, - schema_matches_desired: None, - error: Some(error.as_str()), - }), - ); - set_resource_status( - state, - &graph_address, - ResourceLifecycleStatus::Error, - "graph_observation_error", - error.as_str(), - ); - set_resource_status( - state, - &schema_address, - ResourceLifecycleStatus::Error, - "graph_observation_error", - error.as_str(), - ); - } - } - } - graph_error_count -} - -/// RFC-004 §D7: the data-aware preview — the engine's migration plan for a -/// desired schema against the live graph, computed read-only (no lock). -async fn preview_schema_migration( - graph_uri: &str, - schema_path: &str, -) -> Result { - let source = fs::read_to_string(schema_path).map_err(|err| err.to_string())?; - let db = Omnigraph::open_read_only(graph_uri) - .await - .map_err(|err| err.to_string())?; - let preview = db - .preview_schema_apply_with_options(&source, SchemaApplyOptions::default()) - .await - .map_err(|err| err.to_string())?; - Ok(preview.plan) -} - -struct LiveGraphObservation { - manifest_version: u64, - schema_digest: String, -} - -async fn observe_live_graph(graph_uri: &str) -> Result { - let db = Omnigraph::open_read_only(graph_uri) - .await - .map_err(|err| err.to_string())?; - let snapshot = db - .snapshot_of(ReadTarget::branch("main")) - .await - .map_err(|err| err.to_string())?; - let schema_source = db.schema_source(); - Ok(LiveGraphObservation { - manifest_version: snapshot.version(), - schema_digest: sha256_hex(schema_source.as_bytes()), - }) -} - -struct GraphObservationJson<'a> { - address: &'a str, - graph_uri: &'a str, - observed_at: &'a str, - exists: bool, - manifest_version: Option, - schema_digest: Option<&'a str>, - desired_schema_digest: &'a str, - schema_matches_desired: Option, - error: Option<&'a str>, -} - -fn graph_observation_json(observation: GraphObservationJson<'_>) -> serde_json::Value { - json!({ - "kind": "graph", - "address": observation.address, - "graph_uri": observation.graph_uri, - "observed_at": observation.observed_at, - "exists": observation.exists, - "manifest_version": observation.manifest_version, - "schema_digest": observation.schema_digest, - "desired_schema_digest": observation.desired_schema_digest, - "schema_matches_desired": observation.schema_matches_desired, - "error": observation.error, - }) -} - -fn state_query_digests_for_graph(state: &ClusterState, graph_id: &str) -> BTreeMap { - let prefix = format!("query.{graph_id}."); - state - .applied_revision - .resources - .iter() - .filter_map(|(address, resource)| { - address - .strip_prefix(&prefix) - .map(|name| (name.to_string(), resource.digest.clone())) - }) - .collect() -} - -fn set_resource_status_applied(state: &mut ClusterState, address: &str) { - state.resource_statuses.insert( - address.to_string(), - ResourceStatusRecord { - status: ResourceLifecycleStatus::Applied, - conditions: Vec::new(), - message: None, - }, - ); -} - -fn set_resource_status( - state: &mut ClusterState, - address: &str, - status: ResourceLifecycleStatus, - condition: &str, - message: &str, -) { - state.resource_statuses.insert( - address.to_string(), - ResourceStatusRecord { - status, - conditions: vec![condition.to_string()], - message: Some(message.to_string()), - }, - ); -} - -fn load_desired(config_dir: &Path) -> LoadOutcome { - let parsed = parse_cluster_config(config_dir); - let config_dir = parsed.config_dir; - let config_file = parsed.config_file; - let mut diagnostics = parsed.diagnostics; - let Some(raw) = parsed.raw else { - return LoadOutcome { - desired: None, - diagnostics, - config_dir, - config_file, - }; - }; - let settings = validate_cluster_header(&raw, &mut diagnostics); - - let mut resources = BTreeMap::new(); - let mut dependencies = BTreeSet::new(); - let mut graph_query_digests: BTreeMap> = BTreeMap::new(); - let mut graph_schema_digests: BTreeMap = BTreeMap::new(); - - for (graph_id, graph) in &raw.graphs { - validate_id( - "graph id", - &format!("graphs.{graph_id}"), - graph_id, - &mut diagnostics, - ); - let graph_address = graph_address(graph_id); - let schema_address = schema_address(graph_id); - dependencies.insert(Dependency { - from: schema_address.clone(), - to: graph_address.clone(), - }); - - let schema_path = resolve_config_path(&config_dir, &graph.schema); - let schema_source = match fs::read_to_string(&schema_path) { - Ok(source) => { - let digest = sha256_hex(source.as_bytes()); - graph_schema_digests.insert(graph_id.clone(), digest.clone()); - resources.insert( - schema_address.clone(), - ResourceSummary { - address: schema_address.clone(), - kind: "schema".to_string(), - digest, - path: Some(display_path(&schema_path)), - }, - ); - Some(source) - } - Err(err) => { - diagnostics.push(Diagnostic::error( - "schema_file_missing", - format!("graphs.{graph_id}.schema"), - format!( - "could not read schema file '{}': {err}", - schema_path.display() - ), - )); - None - } - }; - - let catalog = schema_source.and_then(|source| match parse_schema(&source) { - Ok(schema) => match build_catalog(&schema) { - Ok(catalog) => Some(catalog), - Err(err) => { - diagnostics.push(Diagnostic::error( - "schema_catalog_error", - format!("graphs.{graph_id}.schema"), - err.to_string(), - )); - None - } - }, - Err(err) => { - diagnostics.push(Diagnostic::error( - "schema_parse_error", - format!("graphs.{graph_id}.schema"), - err.to_string(), - )); - None - } - }); - - let (graph_queries, query_contents) = - resolve_query_decls(&config_dir, graph_id, &graph.queries, &mut diagnostics); - for (query_name, query) in &graph_queries { - validate_id( - "query name", - &format!("graphs.{graph_id}.queries.{query_name}"), - query_name, - &mut diagnostics, - ); - let query_address = query_address(graph_id, query_name); - dependencies.insert(Dependency { - from: query_address.clone(), - to: graph_address.clone(), - }); - dependencies.insert(Dependency { - from: query_address.clone(), - to: schema_address.clone(), - }); - - let query_path = resolve_config_path(&config_dir, &query.file); - let source = match query_contents.get(&query.file) { - Some(cached) => Ok(cached.clone()), - None => fs::read_to_string(&query_path), - }; - match source { - Ok(source) => { - let digest = sha256_hex(source.as_bytes()); - graph_query_digests - .entry(graph_id.clone()) - .or_default() - .insert(query_name.clone(), digest.clone()); - resources.insert( - query_address.clone(), - ResourceSummary { - address: query_address, - kind: "query".to_string(), - digest, - path: Some(display_path(&query_path)), - }, - ); - validate_query_source( - graph_id, - query_name, - &source, - catalog.as_ref(), - &mut diagnostics, - ); - } - Err(err) => diagnostics.push(Diagnostic::error( - "query_file_missing", - format!("graphs.{graph_id}.queries.{query_name}.file"), - format!( - "could not read query file '{}': {err}", - query_path.display() - ), - )), - } - } - } - - for graph_id in raw.graphs.keys() { - let digest = graph_digest( - graph_id, - graph_schema_digests.get(graph_id), - graph_query_digests.get(graph_id), - ); - resources.insert( - graph_address(graph_id), - ResourceSummary { - address: graph_address(graph_id), - kind: "graph".to_string(), - digest, - path: None, - }, - ); - } - - let mut policy_bindings: BTreeMap> = BTreeMap::new(); - for (policy_name, policy) in &raw.policies { - validate_id( - "policy name", - &format!("policies.{policy_name}"), - policy_name, - &mut diagnostics, - ); - if policy.applies_to.is_empty() { - diagnostics.push(Diagnostic::error( - "policy_missing_applies_to", - format!("policies.{policy_name}.applies_to"), - "policy.applies_to must name `cluster` or at least one graph", - )); - } - - let policy_address = policy_address(policy_name); - let mut normalized_bindings: Vec = Vec::new(); - for (idx, target) in policy.applies_to.iter().enumerate() { - match normalize_policy_target(target) { - PolicyTarget::Cluster => { - normalized_bindings.push("cluster".to_string()); - } - PolicyTarget::Graph(graph_id) => { - normalized_bindings.push(graph_address(&graph_id)); - if raw.graphs.contains_key(&graph_id) { - dependencies.insert(Dependency { - from: policy_address.clone(), - to: graph_address(&graph_id), - }); - } else { - diagnostics.push(Diagnostic::error( - "dangling_graph_reference", - format!("policies.{policy_name}.applies_to[{idx}]"), - format!( - "policy references graph `{graph_id}`, but no graph with that id is declared" - ), - )); - } - } - PolicyTarget::WrongKind(kind) => diagnostics.push(Diagnostic::error( - "wrong_kind_reference", - format!("policies.{policy_name}.applies_to[{idx}]"), - format!("policy applies_to expects graph refs or `cluster`, got `{kind}`"), - )), - } - } - - normalized_bindings.sort(); - normalized_bindings.dedup(); - policy_bindings.insert(policy_address.clone(), normalized_bindings); - - let policy_path = resolve_config_path(&config_dir, &policy.file); - match fs::read(&policy_path) { - Ok(bytes) => { - resources.insert( - policy_address.clone(), - ResourceSummary { - address: policy_address, - kind: "policy".to_string(), - digest: sha256_hex(&bytes), - path: Some(display_path(&policy_path)), - }, - ); - } - Err(err) => diagnostics.push(Diagnostic::error( - "policy_file_missing", - format!("policies.{policy_name}.file"), - format!( - "could not read policy file '{}': {err}", - policy_path.display() - ), - )), - } - } - - let mut resource_digests = BTreeMap::new(); - let mut resource_list = Vec::new(); - for (address, resource) in resources { - resource_digests.insert(address, resource.digest.clone()); - resource_list.push(resource); - } - let dependencies: Vec<_> = dependencies.into_iter().collect(); - let graphs = raw - .graphs - .keys() - .map(|graph_id| DesiredGraph { - id: graph_id.clone(), - schema_digest: graph_schema_digests - .get(graph_id) - .cloned() - .unwrap_or_default(), - }) - .collect(); - let config_digest = desired_config_digest(&raw, &resource_digests); - - LoadOutcome { - desired: Some(DesiredCluster { - config_dir: config_dir.clone(), - config_digest, - state_lock: settings.state_lock, - graphs, - resource_digests, - resources: resource_list, - dependencies, - policy_bindings, - }), - diagnostics, - config_dir, - config_file, - } -} - -fn validate_query_source( - graph_id: &str, - query_name: &str, - source: &str, - catalog: Option<&omnigraph_compiler::catalog::Catalog>, - diagnostics: &mut Vec, -) { - let path = format!("graphs.{graph_id}.queries.{query_name}"); - match parse_query(source) { - Ok(query_file) => { - let Some(query_decl) = query_file.queries.iter().find(|q| q.name == query_name) else { - diagnostics.push(Diagnostic::error( - "query_key_mismatch", - path, - format!("no `query {query_name}` declaration found in the referenced .gq file"), - )); - return; - }; - if let Some(catalog) = catalog { - if let Err(err) = typecheck_query_decl(catalog, query_decl) { - diagnostics.push(Diagnostic::error( - "query_typecheck_error", - format!("graphs.{graph_id}.queries.{query_name}"), - err.to_string(), - )); - } - } else { - diagnostics.push(Diagnostic::warning( - "query_typecheck_skipped", - format!("graphs.{graph_id}.queries.{query_name}"), - "query parsed, but type-check was skipped because the graph schema is invalid", - )); - } - } - Err(err) => diagnostics.push(Diagnostic::error( - "query_parse_error", - path, - err.to_string(), - )), - } -} /// Content-addressed catalog path for an applied resource payload. Extensions @@ -3117,36 +2268,6 @@ fn duplicate_key_diagnostics(text: &str) -> Vec { diagnostics } -fn future_field_diagnostics(text: &str) -> Vec { - let Ok(value) = serde_yaml::from_str::(text) else { - return Vec::new(); - }; - let Some(mapping) = value.as_mapping() else { - return Vec::new(); - }; - let future_fields = [ - "apply", - "env_file", - "providers", - "pipelines", - "embeddings", - "ui", - "aliases", - "bindings", - ]; - mapping - .keys() - .filter_map(|key| key.as_str()) - .filter(|key| future_fields.contains(key)) - .map(|key| { - Diagnostic::error( - "future_phase_field", - key, - format!("`{key}` is reserved for a later cluster-control phase"), - ) - }) - .collect() -} fn strip_comment(line: &str) -> String { let mut in_single_quote = false; @@ -3170,61 +2291,47 @@ fn strip_comment(line: &str) -> String { line.to_string() } -fn validate_id(kind: &str, path: &str, value: &str, diagnostics: &mut Vec) { - let mut chars = value.chars(); - let valid = chars - .next() - .is_some_and(|ch| ch.is_ascii_alphabetic() || ch == '_') - && chars.all(|ch| ch.is_ascii_alphanumeric() || ch == '_' || ch == '-'); - if !valid { - diagnostics.push(Diagnostic::error( - "invalid_resource_id", - path, - format!("{kind} `{value}` must start with a letter or `_` and contain only ASCII letters, digits, `_`, or `-`"), - )); - } + +fn state_query_digests_for_graph(state: &ClusterState, graph_id: &str) -> BTreeMap { + let prefix = format!("query.{graph_id}."); + state + .applied_revision + .resources + .iter() + .filter_map(|(address, resource)| { + address + .strip_prefix(&prefix) + .map(|name| (name.to_string(), resource.digest.clone())) + }) + .collect() } -enum PolicyTarget { - Cluster, - Graph(String), - WrongKind(String), +fn set_resource_status_applied(state: &mut ClusterState, address: &str) { + state.resource_statuses.insert( + address.to_string(), + ResourceStatusRecord { + status: ResourceLifecycleStatus::Applied, + conditions: Vec::new(), + message: None, + }, + ); } -fn normalize_policy_target(value: &str) -> PolicyTarget { - if value == "cluster" { - PolicyTarget::Cluster - } else if let Some(graph_id) = value.strip_prefix("graph.") { - PolicyTarget::Graph(graph_id.to_string()) - } else if value.contains('.') { - PolicyTarget::WrongKind(value.to_string()) - } else { - PolicyTarget::Graph(value.to_string()) - } -} - -fn graph_address(graph_id: &str) -> String { - format!("graph.{graph_id}") -} - -fn schema_address(graph_id: &str) -> String { - format!("schema.{graph_id}") -} - -fn query_address(graph_id: &str, query_name: &str) -> String { - format!("query.{graph_id}.{query_name}") -} - -fn policy_address(policy_name: &str) -> String { - format!("policy.{policy_name}") -} - -fn resolve_config_path(config_dir: &Path, path: &Path) -> PathBuf { - if path.is_absolute() { - path.to_path_buf() - } else { - config_dir.join(path) - } +fn set_resource_status( + state: &mut ClusterState, + address: &str, + status: ResourceLifecycleStatus, + condition: &str, + message: &str, +) { + state.resource_statuses.insert( + address.to_string(), + ResourceStatusRecord { + status, + conditions: vec![condition.to_string()], + message: Some(message.to_string()), + }, + ); } fn graph_digest( diff --git a/crates/omnigraph-cluster/src/store.rs b/crates/omnigraph-cluster/src/store.rs index f378660..8a95661 100644 --- a/crates/omnigraph-cluster/src/store.rs +++ b/crates/omnigraph-cluster/src/store.rs @@ -559,3 +559,30 @@ impl Drop for StateLockGuard { let _ = fs::remove_file(&self.path); } } + +pub(crate) fn parse_lock_file_for_unlock(text: &str) -> Result { + let lock = serde_json::from_str::(text).map_err(|err| { + Diagnostic::error( + "invalid_state_lock", + CLUSTER_LOCK_FILE, + format!("could not parse state lock: {err}"), + ) + })?; + if lock.version != 1 { + return Err(Diagnostic::error( + "unsupported_state_lock_version", + CLUSTER_LOCK_FILE, + format!("unsupported cluster state lock version {}", lock.version), + )); + } + Ok(lock) +} + +pub(crate) fn state_lock_held_message(observations: &StateObservations) -> String { + match observations.lock_id.as_deref() { + Some(lock_id) => format!( + "cluster state lock already exists (lock id {lock_id}); run `omnigraph cluster force-unlock {lock_id}` only after confirming no cluster operation is active" + ), + None => "cluster state lock already exists; remove it only after confirming no cluster operation is active".to_string(), + } +} From db6fe03be1527b0c8f4dbfa74e5ff9033dec2466 Mon Sep 17 00:00:00 2001 From: aaltshuler Date: Thu, 11 Jun 2026 05:42:02 +0300 Subject: [PATCH 7/7] refactor(cluster): move type definitions to types.rs Verbatim move of the public output/diagnostic types and the internal state/sidecar/approval models; previously-private types and their fields get pub(crate) (they were crate-visible by position before). lib.rs is now the command pipeline + public API. 95 tests green; full workspace gate green. Co-Authored-By: Claude Fable 5 --- crates/omnigraph-cluster/src/lib.rs | 510 +------------------------- crates/omnigraph-cluster/src/types.rs | 510 ++++++++++++++++++++++++++ 2 files changed, 513 insertions(+), 507 deletions(-) create mode 100644 crates/omnigraph-cluster/src/types.rs diff --git a/crates/omnigraph-cluster/src/lib.rs b/crates/omnigraph-cluster/src/lib.rs index 8b41fdf..dc66408 100644 --- a/crates/omnigraph-cluster/src/lib.rs +++ b/crates/omnigraph-cluster/src/lib.rs @@ -20,11 +20,14 @@ use ulid::Ulid; pub mod failpoints; mod config; +mod types; mod diff; mod serve; mod sweep; mod store; use store::{LocalStateBackend, StateLockGuard, StateSnapshot}; +pub use types::*; +use types::*; pub use serve::{ServingGraph, ServingPolicy, ServingQuery, ServingSnapshot, read_serving_snapshot}; use serve::read_verified_payload; use config::{QueriesDecl, observe_declared_graphs, validate_cluster_header, future_field_diagnostics, initial_import_state, observe_live_graph, preview_schema_migration, state_resource_digests, graph_address, policy_address, query_address, schema_address, load_desired, normalize_policy_target, parse_cluster_config, resolve_config_path, resolve_query_decls, validate_id, validate_query_source}; @@ -40,513 +43,6 @@ pub const CLUSTER_RESOURCES_DIR: &str = "__cluster/resources"; pub const CLUSTER_RECOVERIES_DIR: &str = "__cluster/recoveries"; pub const CLUSTER_APPROVALS_DIR: &str = "__cluster/approvals"; -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -#[serde(rename_all = "snake_case")] -pub enum DiagnosticSeverity { - Error, - Warning, -} - -#[derive(Debug, Clone, Serialize, PartialEq, Eq)] -pub struct Diagnostic { - pub code: String, - pub severity: DiagnosticSeverity, - pub path: String, - pub message: String, -} - -impl Diagnostic { - fn error(code: impl Into, path: impl Into, message: impl Into) -> Self { - Self { - code: code.into(), - severity: DiagnosticSeverity::Error, - path: path.into(), - message: message.into(), - } - } - - fn warning( - code: impl Into, - path: impl Into, - message: impl Into, - ) -> Self { - Self { - code: code.into(), - severity: DiagnosticSeverity::Warning, - path: path.into(), - message: message.into(), - } - } -} - -#[derive(Debug, Clone, Serialize, PartialEq, Eq)] -pub struct ResourceSummary { - pub address: String, - pub kind: String, - pub digest: String, - #[serde(skip_serializing_if = "Option::is_none")] - pub path: Option, -} - -#[derive(Debug, Clone, Serialize, PartialEq, Eq, PartialOrd, Ord)] -pub struct Dependency { - pub from: String, - pub to: String, -} - -#[derive(Debug, Clone, Serialize)] -pub struct ValidateOutput { - pub ok: bool, - pub config_dir: String, - pub config_file: String, - pub resource_digests: BTreeMap, - pub resources: Vec, - pub dependencies: Vec, - pub diagnostics: Vec, -} - -#[derive(Debug, Clone, Serialize)] -pub struct DesiredRevision { - #[serde(skip_serializing_if = "Option::is_none")] - pub config_digest: Option, -} - -#[derive(Debug, Clone, Serialize)] -pub struct StateObservations { - pub state_path: String, - pub lock_path: String, - pub state_found: bool, - #[serde(skip_serializing_if = "Option::is_none")] - pub applied_config_digest: Option, - pub state_revision: u64, - #[serde(skip_serializing_if = "Option::is_none")] - pub state_cas: Option, - pub resource_count: usize, - pub locked: bool, - #[serde(skip_serializing_if = "Option::is_none")] - pub lock_id: Option, - pub lock_acquired: bool, - #[serde(skip_serializing_if = "Option::is_none")] - pub acquired_lock_id: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub lock_operation: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub lock_created_at: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub lock_pid: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub lock_age_seconds: Option, -} - -impl StateObservations { - fn observe_lock_metadata(&mut self, lock: &StateLockFile) { - self.locked = true; - self.lock_id = Some(lock.lock_id.clone()); - self.lock_operation = Some(lock.operation.clone()); - self.lock_created_at = Some(lock.created_at.clone()); - self.lock_pid = Some(lock.pid); - self.lock_age_seconds = lock_age_seconds(&lock.created_at); - } -} - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -#[serde(rename_all = "snake_case")] -pub enum ResourceLifecycleStatus { - Pending, - Planned, - Applying, - Applied, - Drifted, - Blocked, - Error, -} - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -#[serde(deny_unknown_fields)] -pub struct ResourceStatusRecord { - pub status: ResourceLifecycleStatus, - #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub conditions: Vec, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub message: Option, -} - -#[derive(Debug, Clone, Serialize, PartialEq, Eq)] -#[serde(rename_all = "snake_case")] -pub enum PlanOperation { - Create, - Update, - Delete, -} - -/// How `cluster apply` treats a planned change in the current stage. -/// -/// `Applied` changes execute (config-only query/policy catalog writes). -/// `Derived` marks a `graph.` composite-digest update that converges -/// automatically once its applied query digests land in state. `Deferred` -/// changes need a later phase (graph/schema lifecycle or schema content). -/// `Blocked` query/policy changes are gated by an unapplied or missing -/// dependency. -#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)] -#[serde(rename_all = "snake_case")] -pub enum ApplyDisposition { - Applied, - Derived, - Deferred, - Blocked, -} - -#[derive(Debug, Clone, Serialize, PartialEq)] -pub struct PlanChange { - pub resource: String, - pub operation: PlanOperation, - #[serde(skip_serializing_if = "Option::is_none")] - pub before_digest: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub after_digest: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub disposition: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub reason: Option, - /// True for a policy change whose file digest is unchanged but whose - /// `applies_to` bindings differ from the applied revision (including the - /// pre-5A backfill case). - #[serde(default, skip_serializing_if = "std::ops::Not::not")] - pub binding_change: bool, - /// For schema updates: the engine's migration plan against the live - /// graph (RFC-004 §D7's data-aware preview). Absent when the preview is - /// unavailable (warning `schema_preview_unavailable`). - #[serde(skip_serializing_if = "Option::is_none")] - pub migration: Option, -} - -#[derive(Debug, Clone, Serialize, PartialEq, Eq)] -pub struct BlastRadius { - pub resource: String, - pub affected: Vec, -} - -#[derive(Debug, Clone, Serialize, PartialEq, Eq)] -pub struct ApprovalRequirement { - pub resource: String, - pub reason: String, - /// True when a valid (digest-matching, unconsumed) approval artifact is - /// pending for this change. - pub satisfied: bool, -} - -#[derive(Debug, Clone, Serialize)] -pub struct PlanOutput { - pub ok: bool, - pub config_dir: String, - pub desired_revision: DesiredRevision, - pub resource_digests: BTreeMap, - pub dependencies: Vec, - pub state_observations: StateObservations, - pub changes: Vec, - pub blast_radius: Vec, - pub approvals_required: Vec, - pub diagnostics: Vec, -} - -#[derive(Debug, Clone, Serialize)] -pub struct StatusOutput { - pub ok: bool, - pub config_dir: String, - pub state_observations: StateObservations, - pub resource_digests: BTreeMap, - pub resource_statuses: BTreeMap, - pub observations: BTreeMap, - pub diagnostics: Vec, -} - -#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)] -#[serde(rename_all = "snake_case")] -pub enum StateSyncOperation { - Refresh, - Import, -} - -#[derive(Debug, Clone, Serialize)] -pub struct StateSyncOutput { - pub ok: bool, - pub operation: StateSyncOperation, - pub config_dir: String, - pub state_observations: StateObservations, - pub resource_digests: BTreeMap, - pub resource_statuses: BTreeMap, - pub observations: BTreeMap, - pub diagnostics: Vec, -} - -#[derive(Debug, Clone, Serialize)] -pub struct ForceUnlockOutput { - pub ok: bool, - pub config_dir: String, - pub state_observations: StateObservations, - pub lock_removed: bool, - pub diagnostics: Vec, -} - -/// Output of config-only `cluster apply`. "Applied" means recorded in the -/// local cluster catalog (`__cluster/`); nothing applied here serves traffic — -/// the server still boots from `omnigraph.yaml` until the server-boot stage. -#[derive(Debug, Clone, Serialize)] -pub struct ApplyOutput { - pub ok: bool, - pub config_dir: String, - #[serde(skip_serializing_if = "Option::is_none")] - pub actor: Option, - pub desired_revision: DesiredRevision, - pub state_observations: StateObservations, - /// Every planned change, with `disposition`/`reason` always populated. - pub changes: Vec, - pub applied_count: usize, - /// Deferred + Blocked changes (Derived composite updates count as neither). - pub deferred_count: usize, - /// True when state matches the desired revision after this apply. - pub converged: bool, - /// False for a no-op re-apply: state bytes (and revision) were left untouched. - pub state_written: bool, - /// The statuses as persisted: post-apply on success, the pre-apply on-disk - /// snapshot when the state write fails (never unpersisted in-memory state). - pub resource_statuses: BTreeMap, - pub diagnostics: Vec, -} - -/// A digest-bound human approval for an irreversible operation (RFC-004 -/// §D4). Written by `cluster approve`, consumed by apply. The file is never -/// deleted on consumption — it is rewritten with `consumed_at` and also -/// summarized into the state ledger's `approval_records`, so the audit fact -/// survives the loss of either store (axiom 11). -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct ApprovalArtifact { - schema_version: u32, - approval_id: String, - resource: String, - operation: String, - reason: String, - bound_config_digest: String, - #[serde(default)] - bound_before_digest: Option, - #[serde(default)] - bound_after_digest: Option, - approved_by: String, - created_at: String, - #[serde(default)] - consumed_at: Option, - #[serde(default)] - consumed_by_operation: Option, -} - -#[derive(Debug, Clone, Serialize)] -pub struct ApproveOutput { - pub ok: bool, - pub config_dir: String, - #[serde(skip_serializing_if = "Option::is_none")] - pub approval_id: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub resource: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub operation: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pub approved_by: Option, - pub diagnostics: Vec, -} - -#[derive(Debug, Clone)] -struct DesiredCluster { - config_dir: PathBuf, - config_digest: String, - state_lock: bool, - graphs: Vec, - resource_digests: BTreeMap, - resources: Vec, - dependencies: Vec, - /// `policy.` address -> normalized applies_to refs. - policy_bindings: BTreeMap>, -} - -#[derive(Debug, Clone)] -struct DesiredGraph { - id: String, - schema_digest: String, -} - -#[derive(Debug)] -struct ParsedConfig { - raw: Option, - diagnostics: Vec, - config_dir: PathBuf, - config_file: PathBuf, -} - -#[derive(Debug, Clone, Copy)] -struct ClusterSettings { - state_lock: bool, -} - -#[derive(Debug)] -struct LoadOutcome { - desired: Option, - diagnostics: Vec, - config_dir: PathBuf, - config_file: PathBuf, -} - -#[derive(Debug, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct RawClusterConfig { - version: u32, - #[serde(default)] - metadata: Metadata, - #[serde(default)] - state: StateConfig, - #[serde(default)] - graphs: BTreeMap, - #[serde(default)] - policies: BTreeMap, -} - -#[derive(Debug, Default, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct Metadata { - name: Option, -} - -#[derive(Debug, Default, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct StateConfig { - backend: Option, - lock: Option, -} - -#[derive(Debug, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct GraphConfig { - schema: PathBuf, - #[serde(default)] - queries: QueriesDecl, -} - -/// How a graph declares its stored queries. Terraform-style: the `.gq` -/// files ARE the declaration — point at them (or a directory) and every -#[derive(Debug, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct QueryConfig { - file: PathBuf, -} - -#[derive(Debug, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct PolicyConfig { - file: PathBuf, - applies_to: Vec, -} - -// Stage 2A/2B accept these forward-compatible state sections so existing -// ledgers won't churn while approval/recovery semantics are staged later. -#[allow(dead_code)] -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct ClusterState { - version: u32, - #[serde(default)] - state_revision: u64, - applied_revision: AppliedRevisionState, - #[serde(default)] - resource_statuses: BTreeMap, - #[serde(default)] - approval_records: BTreeMap, - #[serde(default)] - recovery_records: BTreeMap, - #[serde(default)] - observations: BTreeMap, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct AppliedRevisionState { - #[serde(default)] - config_digest: Option, - #[serde(default)] - resources: BTreeMap, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct StateResource { - digest: String, - /// Policy resources only: the applied `applies_to` bindings, normalized - /// to typed refs (`cluster` | `graph.`). Recorded so the state - /// ledger is serving-sufficient for the Phase-5 server boot (RFC-005 - /// §D3). Absent on pre-5A entries (backfilled by the next apply) and on - /// non-policy resources. - #[serde(default, skip_serializing_if = "Option::is_none")] - applies_to: Option>, -} - -#[derive(Debug, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct StateLockFile { - version: u32, - lock_id: String, - operation: String, - created_at: String, - pid: u32, -} - -/// Recovery-intent record for a graph-moving apply operation (RFC-004 §D2). -/// Written under the state lock before the engine call that can create or -/// move a graph manifest; deleted only after the cluster state CAS that -/// records the outcome lands. The sweep (§D3) classifies survivors. -#[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] -struct RecoverySidecar { - schema_version: u32, - operation_id: String, - started_at: String, - #[serde(default)] - actor: Option, - kind: RecoverySidecarKind, - graph_id: String, - graph_uri: String, - #[serde(default)] - observed_manifest_version: Option, - #[serde(default)] - expected_manifest_version: Option, - desired_schema_digest: String, - #[serde(default)] - state_cas_base: Option, - /// For graph_delete: the approval this operation consumes; lets a sweep - /// roll-forward consume it too. - #[serde(default)] - approval_id: Option, -} - -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] -#[serde(rename_all = "snake_case")] -enum RecoverySidecarKind { - GraphCreate, - SchemaApply, - GraphDelete, -} - -#[derive(Debug, Default)] -struct SweepOutcome { - /// Graphs whose sidecar was kept (rows 5/6): graph-moving work for them - /// is blocked until the operator repairs and re-observes. - pending_graphs: BTreeSet, - /// Sidecars whose outcome is recorded (rows 2/4): deleted only after the - /// command's state write lands, so a CAS failure re-sweeps them. - completed_sidecars: Vec, - /// Approval artifacts consumed by a roll-forward (delete row 7b): their - /// files are rewritten with consumed_at only after the state write lands. - consumed_approvals: Vec, -} - - pub fn validate_config_dir(config_dir: impl AsRef) -> ValidateOutput { let outcome = load_desired(config_dir.as_ref()); let (resource_digests, resources, dependencies) = match outcome.desired { diff --git a/crates/omnigraph-cluster/src/types.rs b/crates/omnigraph-cluster/src/types.rs new file mode 100644 index 0000000..c366f04 --- /dev/null +++ b/crates/omnigraph-cluster/src/types.rs @@ -0,0 +1,510 @@ +//! Public output/diagnostic types and internal state/sidecar/approval +//! models (moved verbatim from lib.rs in the modularization). + +use super::*; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum DiagnosticSeverity { + Error, + Warning, +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +pub struct Diagnostic { + pub code: String, + pub severity: DiagnosticSeverity, + pub path: String, + pub message: String, +} + +impl Diagnostic { + pub(crate) fn error(code: impl Into, path: impl Into, message: impl Into) -> Self { + Self { + code: code.into(), + severity: DiagnosticSeverity::Error, + path: path.into(), + message: message.into(), + } + } + + pub(crate) fn warning( + code: impl Into, + path: impl Into, + message: impl Into, + ) -> Self { + Self { + code: code.into(), + severity: DiagnosticSeverity::Warning, + path: path.into(), + message: message.into(), + } + } +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +pub struct ResourceSummary { + pub address: String, + pub kind: String, + pub digest: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub path: Option, +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq, PartialOrd, Ord)] +pub struct Dependency { + pub from: String, + pub to: String, +} + +#[derive(Debug, Clone, Serialize)] +pub struct ValidateOutput { + pub ok: bool, + pub config_dir: String, + pub config_file: String, + pub resource_digests: BTreeMap, + pub resources: Vec, + pub dependencies: Vec, + pub diagnostics: Vec, +} + +#[derive(Debug, Clone, Serialize)] +pub struct DesiredRevision { + #[serde(skip_serializing_if = "Option::is_none")] + pub config_digest: Option, +} + +#[derive(Debug, Clone, Serialize)] +pub struct StateObservations { + pub state_path: String, + pub lock_path: String, + pub state_found: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub applied_config_digest: Option, + pub state_revision: u64, + #[serde(skip_serializing_if = "Option::is_none")] + pub state_cas: Option, + pub resource_count: usize, + pub locked: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub lock_id: Option, + pub lock_acquired: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub acquired_lock_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub lock_operation: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub lock_created_at: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub lock_pid: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub lock_age_seconds: Option, +} + +impl StateObservations { + pub(crate) fn observe_lock_metadata(&mut self, lock: &StateLockFile) { + self.locked = true; + self.lock_id = Some(lock.lock_id.clone()); + self.lock_operation = Some(lock.operation.clone()); + self.lock_created_at = Some(lock.created_at.clone()); + self.lock_pid = Some(lock.pid); + self.lock_age_seconds = lock_age_seconds(&lock.created_at); + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ResourceLifecycleStatus { + Pending, + Planned, + Applying, + Applied, + Drifted, + Blocked, + Error, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct ResourceStatusRecord { + pub status: ResourceLifecycleStatus, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub conditions: Vec, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub message: Option, +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum PlanOperation { + Create, + Update, + Delete, +} + +/// How `cluster apply` treats a planned change in the current stage. +/// +/// `Applied` changes execute (config-only query/policy catalog writes). +/// `Derived` marks a `graph.` composite-digest update that converges +/// automatically once its applied query digests land in state. `Deferred` +/// changes need a later phase (graph/schema lifecycle or schema content). +/// `Blocked` query/policy changes are gated by an unapplied or missing +/// dependency. +#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ApplyDisposition { + Applied, + Derived, + Deferred, + Blocked, +} + +#[derive(Debug, Clone, Serialize, PartialEq)] +pub struct PlanChange { + pub resource: String, + pub operation: PlanOperation, + #[serde(skip_serializing_if = "Option::is_none")] + pub before_digest: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub after_digest: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub disposition: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub reason: Option, + /// True for a policy change whose file digest is unchanged but whose + /// `applies_to` bindings differ from the applied revision (including the + /// pre-5A backfill case). + #[serde(default, skip_serializing_if = "std::ops::Not::not")] + pub binding_change: bool, + /// For schema updates: the engine's migration plan against the live + /// graph (RFC-004 §D7's data-aware preview). Absent when the preview is + /// unavailable (warning `schema_preview_unavailable`). + #[serde(skip_serializing_if = "Option::is_none")] + pub migration: Option, +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +pub struct BlastRadius { + pub resource: String, + pub affected: Vec, +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +pub struct ApprovalRequirement { + pub resource: String, + pub reason: String, + /// True when a valid (digest-matching, unconsumed) approval artifact is + /// pending for this change. + pub satisfied: bool, +} + +#[derive(Debug, Clone, Serialize)] +pub struct PlanOutput { + pub ok: bool, + pub config_dir: String, + pub desired_revision: DesiredRevision, + pub resource_digests: BTreeMap, + pub dependencies: Vec, + pub state_observations: StateObservations, + pub changes: Vec, + pub blast_radius: Vec, + pub approvals_required: Vec, + pub diagnostics: Vec, +} + +#[derive(Debug, Clone, Serialize)] +pub struct StatusOutput { + pub ok: bool, + pub config_dir: String, + pub state_observations: StateObservations, + pub resource_digests: BTreeMap, + pub resource_statuses: BTreeMap, + pub observations: BTreeMap, + pub diagnostics: Vec, +} + +#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum StateSyncOperation { + Refresh, + Import, +} + +#[derive(Debug, Clone, Serialize)] +pub struct StateSyncOutput { + pub ok: bool, + pub operation: StateSyncOperation, + pub config_dir: String, + pub state_observations: StateObservations, + pub resource_digests: BTreeMap, + pub resource_statuses: BTreeMap, + pub observations: BTreeMap, + pub diagnostics: Vec, +} + +#[derive(Debug, Clone, Serialize)] +pub struct ForceUnlockOutput { + pub ok: bool, + pub config_dir: String, + pub state_observations: StateObservations, + pub lock_removed: bool, + pub diagnostics: Vec, +} + +/// Output of config-only `cluster apply`. "Applied" means recorded in the +/// local cluster catalog (`__cluster/`); nothing applied here serves traffic — +/// the server still boots from `omnigraph.yaml` until the server-boot stage. +#[derive(Debug, Clone, Serialize)] +pub struct ApplyOutput { + pub ok: bool, + pub config_dir: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub actor: Option, + pub desired_revision: DesiredRevision, + pub state_observations: StateObservations, + /// Every planned change, with `disposition`/`reason` always populated. + pub changes: Vec, + pub applied_count: usize, + /// Deferred + Blocked changes (Derived composite updates count as neither). + pub deferred_count: usize, + /// True when state matches the desired revision after this apply. + pub converged: bool, + /// False for a no-op re-apply: state bytes (and revision) were left untouched. + pub state_written: bool, + /// The statuses as persisted: post-apply on success, the pre-apply on-disk + /// snapshot when the state write fails (never unpersisted in-memory state). + pub resource_statuses: BTreeMap, + pub diagnostics: Vec, +} + +/// A digest-bound human approval for an irreversible operation (RFC-004 +/// §D4). Written by `cluster approve`, consumed by apply. The file is never +/// deleted on consumption — it is rewritten with `consumed_at` and also +/// summarized into the state ledger's `approval_records`, so the audit fact +/// survives the loss of either store (axiom 11). +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub(crate) struct ApprovalArtifact { + pub(crate) schema_version: u32, + pub(crate) approval_id: String, + pub(crate) resource: String, + pub(crate) operation: String, + pub(crate) reason: String, + pub(crate) bound_config_digest: String, + #[serde(default)] + pub(crate) bound_before_digest: Option, + #[serde(default)] + pub(crate) bound_after_digest: Option, + pub(crate) approved_by: String, + pub(crate) created_at: String, + #[serde(default)] + pub(crate) consumed_at: Option, + #[serde(default)] + pub(crate) consumed_by_operation: Option, +} + +#[derive(Debug, Clone, Serialize)] +pub struct ApproveOutput { + pub ok: bool, + pub config_dir: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub approval_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub resource: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub operation: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub approved_by: Option, + pub diagnostics: Vec, +} + +#[derive(Debug, Clone)] +pub(crate) struct DesiredCluster { + pub(crate) config_dir: PathBuf, + pub(crate) config_digest: String, + pub(crate) state_lock: bool, + pub(crate) graphs: Vec, + pub(crate) resource_digests: BTreeMap, + pub(crate) resources: Vec, + pub(crate) dependencies: Vec, + /// `policy.` address -> normalized applies_to refs. + pub(crate) policy_bindings: BTreeMap>, +} + +#[derive(Debug, Clone)] +pub(crate) struct DesiredGraph { + pub(crate) id: String, + pub(crate) schema_digest: String, +} + +#[derive(Debug)] +pub(crate) struct ParsedConfig { + pub(crate) raw: Option, + pub(crate) diagnostics: Vec, + pub(crate) config_dir: PathBuf, + pub(crate) config_file: PathBuf, +} + +#[derive(Debug, Clone, Copy)] +pub(crate) struct ClusterSettings { + pub(crate) state_lock: bool, +} + +#[derive(Debug)] +pub(crate) struct LoadOutcome { + pub(crate) desired: Option, + pub(crate) diagnostics: Vec, + pub(crate) config_dir: PathBuf, + pub(crate) config_file: PathBuf, +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub(crate) struct RawClusterConfig { + pub(crate) version: u32, + #[serde(default)] + pub(crate) metadata: Metadata, + #[serde(default)] + pub(crate) state: StateConfig, + #[serde(default)] + pub(crate) graphs: BTreeMap, + #[serde(default)] + pub(crate) policies: BTreeMap, +} + +#[derive(Debug, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub(crate) struct Metadata { + pub(crate) name: Option, +} + +#[derive(Debug, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub(crate) struct StateConfig { + pub(crate) backend: Option, + pub(crate) lock: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub(crate) struct GraphConfig { + pub(crate) schema: PathBuf, + #[serde(default)] + pub(crate) queries: QueriesDecl, +} + +/// How a graph declares its stored queries. Terraform-style: the `.gq` +/// files ARE the declaration — point at them (or a directory) and every +#[derive(Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub(crate) struct QueryConfig { + pub(crate) file: PathBuf, +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub(crate) struct PolicyConfig { + pub(crate) file: PathBuf, + pub(crate) applies_to: Vec, +} + +// Stage 2A/2B accept these forward-compatible state sections so existing +// ledgers won't churn while approval/recovery semantics are staged later. +#[allow(dead_code)] +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub(crate) struct ClusterState { + pub(crate) version: u32, + #[serde(default)] + pub(crate) state_revision: u64, + pub(crate) applied_revision: AppliedRevisionState, + #[serde(default)] + pub(crate) resource_statuses: BTreeMap, + #[serde(default)] + pub(crate) approval_records: BTreeMap, + #[serde(default)] + pub(crate) recovery_records: BTreeMap, + #[serde(default)] + pub(crate) observations: BTreeMap, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub(crate) struct AppliedRevisionState { + #[serde(default)] + pub(crate) config_digest: Option, + #[serde(default)] + pub(crate) resources: BTreeMap, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub(crate) struct StateResource { + pub(crate) digest: String, + /// Policy resources only: the applied `applies_to` bindings, normalized + /// to typed refs (`cluster` | `graph.`). Recorded so the state + /// ledger is serving-sufficient for the Phase-5 server boot (RFC-005 + /// §D3). Absent on pre-5A entries (backfilled by the next apply) and on + /// non-policy resources. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub(crate) applies_to: Option>, +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub(crate) struct StateLockFile { + pub(crate) version: u32, + pub(crate) lock_id: String, + pub(crate) operation: String, + pub(crate) created_at: String, + pub(crate) pid: u32, +} + +/// Recovery-intent record for a graph-moving apply operation (RFC-004 §D2). +/// Written under the state lock before the engine call that can create or +/// move a graph manifest; deleted only after the cluster state CAS that +/// records the outcome lands. The sweep (§D3) classifies survivors. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub(crate) struct RecoverySidecar { + pub(crate) schema_version: u32, + pub(crate) operation_id: String, + pub(crate) started_at: String, + #[serde(default)] + pub(crate) actor: Option, + pub(crate) kind: RecoverySidecarKind, + pub(crate) graph_id: String, + pub(crate) graph_uri: String, + #[serde(default)] + pub(crate) observed_manifest_version: Option, + #[serde(default)] + pub(crate) expected_manifest_version: Option, + pub(crate) desired_schema_digest: String, + #[serde(default)] + pub(crate) state_cas_base: Option, + /// For graph_delete: the approval this operation consumes; lets a sweep + /// roll-forward consume it too. + #[serde(default)] + pub(crate) approval_id: Option, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub(crate) enum RecoverySidecarKind { + GraphCreate, + SchemaApply, + GraphDelete, +} + +#[derive(Debug, Default)] +pub(crate) struct SweepOutcome { + /// Graphs whose sidecar was kept (rows 5/6): graph-moving work for them + /// is blocked until the operator repairs and re-observes. + pub(crate) pending_graphs: BTreeSet, + /// Sidecars whose outcome is recorded (rows 2/4): deleted only after the + /// command's state write lands, so a CAS failure re-sweeps them. + pub(crate) completed_sidecars: Vec, + /// Approval artifacts consumed by a roll-forward (delete row 7b): their + /// files are rewritten with consumed_at only after the state write lands. + pub(crate) consumed_approvals: Vec, +}