omnigraph/crates/omnigraph-server/tests/multi_graph.rs

//! Cluster-mode boot and the concurrent branch-ops matrix.
//! Moved verbatim from tests/server.rs in the modularization.

use std::fs;

use axum::body::{Body, to_bytes};
use axum::http::{Method, Request, StatusCode};
use omnigraph::db::Omnigraph;
use omnigraph::loader::{LoadMode, load_jsonl};
use omnigraph_server::api::{ErrorOutput, ReadRequest};
use omnigraph_server::{AppState, build_app};
use serde_json::Value;
use serial_test::serial;
use tower::ServiceExt;

mod support;
use support::*;

#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn concurrent_branch_ops_morphological_matrix() {
    // Cell a: Merge × Merge, distinct targets.
    // Pre-fix on b09a097/22d76db: branch_merge_impl's swap-restore race
    // landed feature_a's content in target_b instead of target_a (and
    // vice versa — symmetric swap). Identity asserts catch both
    // asymmetric and symmetric variants.
    {
        let cell = "a:merge×merge:distinct-targets";
        let h = matrix::Harness::new().await;
        h.create_branch("main", "feature-a-cella").await;
        h.insert_person("feature-a-cella", "EveA-cella", 22).await;
        h.create_branch("main", "feature-b-cella").await;
        h.insert_person("feature-b-cella", "FrankB-cella", 33).await;
        h.create_branch("main", "target-a-cella").await;
        h.create_branch("main", "target-b-cella").await;

        let (sa, sb) = h
            .run_pair(
                matrix::op_merge("feature-a-cella".to_string(), "target-a-cella".to_string()),
                matrix::op_merge("feature-b-cella".to_string(), "target-b-cella".to_string()),
            )
            .await;
        assert_eq!(sa.status, StatusCode::OK, "[{}] merge a", cell);
        assert_eq!(sb.status, StatusCode::OK, "[{}] merge b", cell);
        h.assert_persons("target-a-cella", cell, &["EveA-cella"], &["FrankB-cella"])
            .await;
        h.assert_persons("target-b-cella", cell, &["FrankB-cella"], &["EveA-cella"])
            .await;
        h.assert_post_op_sentinel(cell, "sentinel-cella").await;
    }

    // Cell b: Merge × Merge, same target / distinct sources.
    // Both want to land in main. merge_exclusive serializes; both should
    // succeed and main should contain BOTH sources' contributions.
    {
        let cell = "b:merge×merge:same-target-distinct-sources";
        let h = matrix::Harness::new().await;
        h.create_branch("main", "src-x-cellb").await;
        h.insert_person("src-x-cellb", "Xavier-cellb", 41).await;
        h.create_branch("main", "src-y-cellb").await;
        h.insert_person("src-y-cellb", "Yvonne-cellb", 42).await;

        let (sa, sb) = h
            .run_pair(
                matrix::op_merge("src-x-cellb".to_string(), "main".to_string()),
                matrix::op_merge("src-y-cellb".to_string(), "main".to_string()),
            )
            .await;
        assert_eq!(sa.status, StatusCode::OK, "[{}] merge x", cell);
        assert_eq!(sb.status, StatusCode::OK, "[{}] merge y", cell);
        h.assert_persons("main", cell, &["Xavier-cellb", "Yvonne-cellb"], &[])
            .await;
        h.assert_post_op_sentinel(cell, "sentinel-cellb").await;
    }

    // Cell c: Merge × Merge, same source / distinct targets (fanout).
    // One source merged into two targets simultaneously. merge_exclusive
    // serializes; both targets should reflect the source's content.
    {
        let cell = "c:merge×merge:same-source-distinct-targets";
        let h = matrix::Harness::new().await;
        h.create_branch("main", "src-shared-cellc").await;
        h.insert_person("src-shared-cellc", "Sharon-cellc", 50)
            .await;
        h.create_branch("main", "tgt-1-cellc").await;
        h.create_branch("main", "tgt-2-cellc").await;

        let (sa, sb) = h
            .run_pair(
                matrix::op_merge("src-shared-cellc".to_string(), "tgt-1-cellc".to_string()),
                matrix::op_merge("src-shared-cellc".to_string(), "tgt-2-cellc".to_string()),
            )
            .await;
        assert_eq!(sa.status, StatusCode::OK, "[{}] merge into tgt-1", cell);
        assert_eq!(sb.status, StatusCode::OK, "[{}] merge into tgt-2", cell);
        h.assert_persons("tgt-1-cellc", cell, &["Sharon-cellc"], &[])
            .await;
        h.assert_persons("tgt-2-cellc", cell, &["Sharon-cellc"], &[])
            .await;
        h.assert_post_op_sentinel(cell, "sentinel-cellc").await;
    }

    // Cell d: Merge × Change, both touching main. C2 permits both
    // succeed, or exactly one clean 409 if the merge detects target
    // movement after planning but before acquiring the queue.
    {
        let cell = "d:merge×change:into-target";
        let h = matrix::Harness::new().await;
        h.create_branch("main", "feature-celld").await;
        h.insert_person("feature-celld", "EveD-celld", 22).await;

        let (sa, sb) = h
            .run_pair(
                matrix::op_merge("feature-celld".to_string(), "main".to_string()),
                matrix::op_change_insert("main".to_string(), "FrankD-celld".to_string(), 33),
            )
            .await;
        assert_eq!(sb.status, StatusCode::OK, "[{}] change", cell);
        assert!(
            sa.status == StatusCode::OK || sa.status == StatusCode::CONFLICT,
            "[{}] merge must be 200 or clean 409, got {}",
            cell,
            sa.status
        );
        if sa.status == StatusCode::OK {
            h.assert_persons("main", cell, &["EveD-celld", "FrankD-celld"], &[])
                .await;
        } else {
            let error: ErrorOutput = serde_json::from_slice(&sa.body).unwrap();
            let conflict = error
                .manifest_conflict
                .expect("merge 409 must include manifest_conflict");
            assert_eq!(
                conflict.table_key, "node:Person",
                "[{}] conflict table",
                cell
            );
            h.assert_persons("main", cell, &["FrankD-celld"], &["EveD-celld"])
                .await;
        }
        h.assert_post_op_sentinel(cell, "sentinel-celld").await;
    }

    // Cell e: Merge × BranchCreateFrom-target. Concurrent fork off the
    // merge target while the merge runs. Both should succeed; the new
    // branch should have a coherent view (either pre- or post-merge,
    // both valid). After both, target = main has the merged content.
    {
        let cell = "e:merge×branch_create_from:target";
        let h = matrix::Harness::new().await;
        h.create_branch("main", "src-celle").await;
        h.insert_person("src-celle", "Eve-celle", 22).await;

        let (sa, sb) = h
            .run_pair(
                matrix::op_merge("src-celle".to_string(), "main".to_string()),
                matrix::op_branch_create("main".to_string(), "fork-celle".to_string()),
            )
            .await;
        assert_eq!(sa.status, StatusCode::OK, "[{}] merge", cell);
        assert_eq!(sb.status, StatusCode::OK, "[{}] branch_create_from", cell);
        // Main definitely has Eve.
        h.assert_persons("main", cell, &["Eve-celle"], &[]).await;
        // fork-celle was forked off main at SOME version; main's current
        // count is 5 (4 seeded + Eve). fork-celle has either 4 (pre-merge
        // snapshot) or 5 (post-merge snapshot); both are valid timings.
        let fork_count = h.person_count("fork-celle").await;
        assert!(
            fork_count == 4 || fork_count == 5,
            "[{}] fork-celle row count must be pre- or post-merge view (4 or 5), got {}",
            cell,
            fork_count
        );
        h.assert_post_op_sentinel(cell, "sentinel-celle").await;
    }

    // Cell f: BranchCreateFrom × BranchCreateFrom, distinct parents.
    // Pre-fix on f925ad1: swap-restore race in branch_create_from_impl
    // forked the new branch off the wrong parent. Identity asserts pin
    // that fork-from-A inherits A's content, fork-from-B inherits B's.
    {
        let cell = "f:branch_create_from×branch_create_from:distinct-parents";
        let h = matrix::Harness::new().await;
        h.create_branch("main", "alpha-cellf").await;
        h.insert_person("alpha-cellf", "Eve-cellf", 22).await;
        h.create_branch("main", "beta-cellf").await;

        let (sa, sb) = h
            .run_pair(
                matrix::op_branch_create("alpha-cellf".to_string(), "gamma-cellf".to_string()),
                matrix::op_branch_create("beta-cellf".to_string(), "delta-cellf".to_string()),
            )
            .await;
        assert_eq!(sa.status, StatusCode::OK, "[{}] gamma create", cell);
        assert_eq!(sb.status, StatusCode::OK, "[{}] delta create", cell);
        // gamma forks off alpha → must contain Eve.
        h.assert_persons("gamma-cellf", cell, &["Eve-cellf"], &[])
            .await;
        // delta forks off beta → must NOT contain Eve.
        h.assert_persons("delta-cellf", cell, &[], &["Eve-cellf"])
            .await;
        h.assert_post_op_sentinel(cell, "sentinel-cellf").await;
    }

    // Cell g: BranchCreateFrom × BranchDelete, unrelated branches.
    // Disjoint branches; both should complete cleanly without
    // interference.
    {
        let cell = "g:branch_create_from×branch_delete:unrelated";
        let h = matrix::Harness::new().await;
        h.create_branch("main", "doomed-cellg").await;

        let (sa, sb) = h
            .run_pair(
                matrix::op_branch_create("main".to_string(), "newborn-cellg".to_string()),
                matrix::op_branch_delete("doomed-cellg".to_string()),
            )
            .await;
        assert_eq!(sa.status, StatusCode::OK, "[{}] create newborn", cell);
        assert_eq!(sb.status, StatusCode::OK, "[{}] delete doomed", cell);
        // newborn-cellg exists with main's content.
        h.assert_persons("newborn-cellg", cell, &["Alice"], &[])
            .await;
        h.assert_post_op_sentinel(cell, "sentinel-cellg").await;
    }

    // Cell h: BranchDelete × BranchDelete, distinct branches. Both call
    // refresh() internally; verify no deadlock and both deletes land.
    {
        let cell = "h:branch_delete×branch_delete:distinct";
        let h = matrix::Harness::new().await;
        h.create_branch("main", "doomed1-cellh").await;
        h.create_branch("main", "doomed2-cellh").await;

        let (sa, sb) = h
            .run_pair(
                matrix::op_branch_delete("doomed1-cellh".to_string()),
                matrix::op_branch_delete("doomed2-cellh".to_string()),
            )
            .await;
        assert_eq!(sa.status, StatusCode::OK, "[{}] delete 1", cell);
        assert_eq!(sb.status, StatusCode::OK, "[{}] delete 2", cell);
        // Verify both gone via /branches list (snapshot would still work
        // for a deleted branch via parent fallback in some paths, so we
        // use the explicit list).
        let r = h
            .app
            .clone()
            .oneshot(
                Request::builder()
                    .uri(g("/branches"))
                    .method(Method::GET)
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(r.status(), StatusCode::OK);
        let body = to_bytes(r.into_body(), usize::MAX).await.unwrap();
        let list_body: Value = serde_json::from_slice(&body).unwrap();
        let branches: Vec<&str> = list_body["branches"]
            .as_array()
            .unwrap()
            .iter()
            .filter_map(|v| v.as_str())
            .collect();
        assert!(
            !branches.contains(&"doomed1-cellh"),
            "[{}] doomed1 still in branch list: {:?}",
            cell,
            branches
        );
        assert!(
            !branches.contains(&"doomed2-cellh"),
            "[{}] doomed2 still in branch list: {:?}",
            cell,
            branches
        );
        h.assert_post_op_sentinel(cell, "sentinel-cellh").await;
    }

    // Cell i: BranchDelete × Change, on a different branch. Delete one
    // branch while a /change runs on main. Both should succeed.
    {
        let cell = "i:branch_delete×change:distinct-branch";
        let h = matrix::Harness::new().await;
        h.create_branch("main", "doomed-celli").await;

        let (sa, sb) = h
            .run_pair(
                matrix::op_branch_delete("doomed-celli".to_string()),
                matrix::op_change_insert("main".to_string(), "Pat-celli".to_string(), 44),
            )
            .await;
        assert_eq!(sa.status, StatusCode::OK, "[{}] delete", cell);
        assert_eq!(sb.status, StatusCode::OK, "[{}] change", cell);
        h.assert_persons("main", cell, &["Pat-celli"], &[]).await;
        h.assert_post_op_sentinel(cell, "sentinel-celli").await;
    }

    // Cell j: BranchCreateFrom × Change, both on main. The fork timing
    // determines whether the new branch sees the change (pre or post).
    // Both valid. Main must contain the inserted row.
    {
        let cell = "j:branch_create_from×change:on-source";
        let h = matrix::Harness::new().await;

        let (sa, sb) = h
            .run_pair(
                matrix::op_branch_create("main".to_string(), "twin-cellj".to_string()),
                matrix::op_change_insert("main".to_string(), "Quincy-cellj".to_string(), 55),
            )
            .await;
        assert_eq!(sa.status, StatusCode::OK, "[{}] branch_create", cell);
        assert_eq!(sb.status, StatusCode::OK, "[{}] change", cell);
        h.assert_persons("main", cell, &["Quincy-cellj"], &[]).await;
        // twin-cellj has either pre-change view (no Quincy) or
        // post-change view (with Quincy); either is valid.
        let twin_has_quincy = h.person_exists("twin-cellj", "Quincy-cellj").await;
        let _ = twin_has_quincy; // either valid timing — just ensure no panic
        h.assert_post_op_sentinel(cell, "sentinel-cellj").await;
    }

    // Cell k: reopen consistency. Run a representative concurrent pair,
    // drop the engine, reopen on a separate handle, verify state matches.
    {
        let cell = "k:reopen-after-pair";
        let h = matrix::Harness::new().await;
        h.create_branch("main", "src-cellk").await;
        h.insert_person("src-cellk", "Rita-cellk", 36).await;

        let (sa, sb) = h
            .run_pair(
                matrix::op_merge("src-cellk".to_string(), "main".to_string()),
                matrix::op_change_insert("main".to_string(), "Steve-cellk".to_string(), 37),
            )
            .await;
        assert_eq!(sb.status, StatusCode::OK, "[{}] change", cell);
        assert!(
            sa.status == StatusCode::OK || sa.status == StatusCode::CONFLICT,
            "[{}] merge must be 200 or clean 409, got {}",
            cell,
            sa.status
        );
        if sa.status == StatusCode::OK {
            h.assert_persons("main", cell, &["Rita-cellk", "Steve-cellk"], &[])
                .await;
        } else {
            let error: ErrorOutput = serde_json::from_slice(&sa.body).unwrap();
            let conflict = error
                .manifest_conflict
                .expect("merge 409 must include manifest_conflict");
            assert_eq!(
                conflict.table_key, "node:Person",
                "[{}] conflict table",
                cell
            );
            h.assert_persons("main", cell, &["Steve-cellk"], &["Rita-cellk"])
                .await;
        }

        // Reopen via a fresh AppState on the same graph.
        let graph_uri = format!("{}/server.omni", h._temp.path().display());
        let reopened = AppState::open(graph_uri.clone()).await.unwrap();
        let app2 = build_app(reopened);
        // Sanity: the same identity check via the new app must see
        // Rita and Steve.
        let r = app2
            .clone()
            .oneshot(
                Request::builder()
                    .uri(g("/snapshot?branch=main"))
                    .method(Method::GET)
                    .body(Body::empty())
                    .unwrap(),
            )
            .await
            .unwrap();
        assert_eq!(r.status(), StatusCode::OK, "[{}] reopen snapshot", cell);
        let body = to_bytes(r.into_body(), usize::MAX).await.unwrap();
        let v: Value = serde_json::from_slice(&body).unwrap();
        let person_rows = v["tables"]
            .as_array()
            .and_then(|tables| {
                tables
                    .iter()
                    .find(|t| t["table_key"].as_str() == Some("node:Person"))
            })
            .and_then(|t| t["row_count"].as_u64())
            .expect("reopen snapshot must include node:Person row_count");
        let expected_rows = if sa.status == StatusCode::OK { 6 } else { 5 };
        assert_eq!(
            person_rows, expected_rows,
            "[{}] reopened main should include seed (4) + committed concurrent writes",
            cell,
        );
    }
}

#[tokio::test]
async fn cluster_boot_serves_applied_state() {
    let temp = converged_cluster_dir("").await;
    let settings = cluster_settings(temp.path()).await.unwrap();
    let omnigraph_server::ServerConfigMode::Multi {
        graphs,
        config_path,
        server_policy,
    } = settings.mode
    else {
        panic!("cluster boot must select multi-graph routing");
    };
    assert_eq!(graphs.len(), 1);
    assert_eq!(graphs[0].graph_id, "knowledge");
    assert!(server_policy.is_none());

    let state =
        omnigraph_server::open_multi_graph_state(graphs, Vec::new(), None, config_path, false)
            .await
            .unwrap();
    let app = build_app(state);

    // The management surface keeps its closed-by-default contract: without a
    // cluster-scoped policy bundle there is no server-level Cedar engine, so
    // GET /graphs refuses even in cluster mode.
    let (status, body) = json_response(
        &app,
        Request::builder()
            .uri("/graphs")
            .body(Body::empty())
            .unwrap(),
    )
    .await;
    assert_eq!(status, StatusCode::FORBIDDEN, "{body}");

    let (status, body) = json_response(
        &app,
        Request::builder()
            .uri("/graphs/knowledge/queries")
            .body(Body::empty())
            .unwrap(),
    )
    .await;
    assert_eq!(status, StatusCode::OK, "{body}");
    assert!(
        body["queries"]
            .as_array()
            .unwrap()
            .iter()
            .any(|q| q["name"] == "find_person"),
        "{body}"
    );

    let (status, body) = json_response(
        &app,
        Request::builder()
            .method(Method::POST)
            .uri("/graphs/knowledge/queries/find_person")
            .header("content-type", "application/json")
            .body(Body::from(r#"{"params":{"name":"nobody"}}"#))
            .unwrap(),
    )
    .await;
    assert_eq!(status, StatusCode::OK, "{body}");
}

#[tokio::test]
async fn cluster_boot_quarantines_graph_open_failures() {
    let temp = tempfile::tempdir().unwrap();
    let schema = "\nnode Person {\n  name: String @key\n}\n";
    let good_uri = temp.path().join("good.omni");
    Omnigraph::init(good_uri.to_string_lossy().as_ref(), schema)
        .await
        .unwrap();
    let bad_uri = temp.path().join("missing.omni");
    let server_policy = omnigraph_server::PolicySource::Inline(
        r#"
version: 1
kind: server
groups:
  admins: [act-admin]
rules:
  - id: admins-list-graphs
    allow:
      actors: { group: admins }
      actions: [graph_list]
"#
        .to_string(),
    );
    let graphs = vec![
        omnigraph_server::GraphStartupConfig {
            graph_id: "broken".to_string(),
            uri: bad_uri.to_string_lossy().to_string(),
            policy: None,
            embedding: None,
            queries: stored_query_registry(&[]),
        },
        omnigraph_server::GraphStartupConfig {
            graph_id: "good".to_string(),
            uri: good_uri.to_string_lossy().to_string(),
            policy: None,
            embedding: None,
            queries: stored_query_registry(&[]),
        },
    ];
    let strict_err = match omnigraph_server::open_multi_graph_state(
        graphs.clone(),
        vec![("act-admin".to_string(), "admin-token".to_string())],
        Some(&server_policy),
        temp.path().join("cluster.yaml"),
        true,
    )
    .await
    {
        Ok(_) => panic!("strict startup should reject a failed graph open"),
        Err(err) => err,
    };
    assert!(
        strict_err
            .to_string()
            .contains("strict multi-graph startup requires every graph to open"),
        "{strict_err}"
    );
    let state = omnigraph_server::open_multi_graph_state(
        graphs,
        vec![("act-admin".to_string(), "admin-token".to_string())],
        Some(&server_policy),
        temp.path().join("cluster.yaml"),
        false,
    )
    .await
    .unwrap();
    let mut ready: Vec<_> = state
        .routing()
        .registry
        .list()
        .iter()
        .map(|handle| handle.key.graph_id.as_str().to_string())
        .collect();
    ready.sort();
    assert_eq!(ready, vec!["good"]);
    let app = build_app(state);

    let (status, body) = json_response(
        &app,
        Request::builder()
            .uri("/graphs")
            .header("authorization", "Bearer admin-token")
            .body(Body::empty())
            .unwrap(),
    )
    .await;
    assert_eq!(status, StatusCode::OK, "{body}");
    assert_eq!(
        body["graphs"]
            .as_array()
            .unwrap()
            .iter()
            .map(|graph| graph["graph_id"].as_str().unwrap())
            .collect::<Vec<_>>(),
        vec!["good"]
    );

    let (status, body) = json_response(
        &app,
        Request::builder()
            .uri("/graphs/broken/queries")
            .header("authorization", "Bearer admin-token")
            .body(Body::empty())
            .unwrap(),
    )
    .await;
    assert_eq!(status, StatusCode::NOT_FOUND, "{body}");
}

#[tokio::test(flavor = "multi_thread")]
#[serial]
async fn cluster_boot_injects_embedding_provider_config() {
    const EMBED_SCHEMA: &str = r#"
node Doc {
    slug: String @key
    title: String @index
    embedding: Vector(4) @embed("title", model="cluster-mock") @index
}
"#;
    const EMBED_QUERY: &str = r#"
query vector_search_string($q: String) {
    match { $d: Doc }
    return { $d.slug, $d.title }
    order { nearest($d.embedding, $q) }
    limit 3
}
"#;

    let alpha = mock_embedding("alpha", 4);
    let beta = mock_embedding("beta", 4);
    let gamma = mock_embedding("gamma", 4);
    let data = format!(
        concat!(
            r#"{{"type":"Doc","data":{{"slug":"alpha-doc","title":"alpha guide","embedding":[{}]}}}}"#,
            "\n",
            r#"{{"type":"Doc","data":{{"slug":"beta-doc","title":"beta guide","embedding":[{}]}}}}"#,
            "\n",
            r#"{{"type":"Doc","data":{{"slug":"gamma-doc","title":"gamma handbook","embedding":[{}]}}}}"#
        ),
        format_vector(&alpha),
        format_vector(&beta),
        format_vector(&gamma),
    );

    let temp = tempfile::tempdir().unwrap();
    fs::write(temp.path().join("docs.pg"), EMBED_SCHEMA).unwrap();
    fs::write(temp.path().join("search.gq"), EMBED_QUERY).unwrap();
    fs::write(
        temp.path().join("cluster.yaml"),
        r#"
version: 1
providers:
  embedding:
    default:
      kind: mock
      model: cluster-mock
graphs:
  knowledge:
    schema: ./docs.pg
    embedding_provider: default
    queries:
      vector_search_string:
        file: ./search.gq
"#,
    )
    .unwrap();
    let import = omnigraph_cluster::import_config_dir(temp.path()).await;
    assert!(import.ok, "{:?}", import.diagnostics);
    let apply = omnigraph_cluster::apply_config_dir(temp.path()).await;
    assert!(apply.ok && apply.converged, "{:?}", apply.diagnostics);

    let graph_uri = temp
        .path()
        .join("graphs/knowledge.omni")
        .to_string_lossy()
        .to_string();
    let mut db = Omnigraph::open(&graph_uri).await.unwrap();
    load_jsonl(&mut db, &data, LoadMode::Overwrite)
        .await
        .unwrap();

    let _guard = EnvGuard::set(&[
        ("OMNIGRAPH_EMBEDDINGS_MOCK", None),
        ("OMNIGRAPH_EMBED_PROVIDER", None),
        ("OMNIGRAPH_EMBED_BASE_URL", None),
        ("OMNIGRAPH_EMBED_MODEL", None),
        ("OPENROUTER_API_KEY", None),
        ("OPENAI_API_KEY", None),
        ("GEMINI_API_KEY", None),
    ]);
    let settings = cluster_settings(temp.path()).await.unwrap();
    let omnigraph_server::ServerConfigMode::Multi {
        graphs,
        config_path,
        server_policy,
    } = settings.mode
    else {
        panic!("cluster boot must select multi-graph routing");
    };
    let state = omnigraph_server::open_multi_graph_state(
        graphs,
        Vec::new(),
        server_policy.as_ref(),
        config_path,
        false,
    )
    .await
    .unwrap();
    let app = build_app(state);

    let read = ReadRequest {
        query_source: EMBED_QUERY.to_string(),
        query_name: Some("vector_search_string".to_string()),
        params: Some(serde_json::json!({ "q": "alpha" })),
        branch: Some("main".to_string()),
        snapshot: None,
    };
    let (status, body) = json_response(
        &app,
        Request::builder()
            .uri("/graphs/knowledge/read")
            .method(Method::POST)
            .header("content-type", "application/json")
            .body(Body::from(serde_json::to_vec(&read).unwrap()))
            .unwrap(),
    )
    .await;

    assert_eq!(status, StatusCode::OK, "{body}");
    assert_eq!(body["row_count"], 3);
    assert_eq!(body["rows"][0]["d.slug"], "alpha-doc");
}

#[tokio::test(flavor = "multi_thread")]
#[serial]
async fn cluster_boot_refuses_missing_embedding_secret_env() {
    let temp = tempfile::tempdir().unwrap();
    fs::write(
        temp.path().join("people.pg"),
        "\nnode Person {\n  name: String @key\n}\n",
    )
    .unwrap();
    fs::write(
        temp.path().join("people.gq"),
        "\nquery find_person($name: String) {\n  match { $p: Person { name: $name } }\n  return { $p.name }\n}\n",
    )
    .unwrap();
    fs::write(
        temp.path().join("cluster.yaml"),
        r#"
version: 1
providers:
  embedding:
    default:
      kind: openai-compatible
      api_key: ${OG_TEST_MISSING_EMBED_KEY}
graphs:
  knowledge:
    schema: ./people.pg
    embedding_provider: default
    queries:
      find_person:
        file: ./people.gq
"#,
    )
    .unwrap();
    let import = omnigraph_cluster::import_config_dir(temp.path()).await;
    assert!(import.ok, "{:?}", import.diagnostics);
    let apply = omnigraph_cluster::apply_config_dir(temp.path()).await;
    assert!(apply.ok && apply.converged, "{:?}", apply.diagnostics);

    let _guard = EnvGuard::set(&[
        ("OG_TEST_MISSING_EMBED_KEY", None),
        ("OMNIGRAPH_EMBEDDINGS_MOCK", None),
    ]);
    let err = cluster_settings(temp.path()).await.unwrap_err();
    let message = err.to_string();
    assert!(
        message.contains("embedding provider for graph 'knowledge'"),
        "{message}"
    );
    assert!(message.contains("OG_TEST_MISSING_EMBED_KEY"), "{message}");
}

#[tokio::test]
async fn cluster_boot_wires_policy_bindings_into_cedar_slots() {
    let temp = tempfile::tempdir().unwrap();
    drop(temp);
    let policy_block = r#"policies:
  graph_rules:
    file: ./graph.policy.yaml
    applies_to: [knowledge]
  cluster_rules:
    file: ./cluster.policy.yaml
    applies_to: [cluster]
"#;
    let temp = {
        let temp = tempfile::tempdir().unwrap();
        fs::write(
            temp.path().join("people.pg"),
            "\nnode Person {\n  name: String @key\n}\n",
        )
        .unwrap();
        fs::write(
            temp.path().join("people.gq"),
            "\nquery find_person($name: String) {\n  match { $p: Person { name: $name } }\n  return { $p.name }\n}\n",
        )
        .unwrap();
        fs::write(
            temp.path().join("graph.policy.yaml"),
            permit_all_policy_yaml(&["default"]),
        )
        .unwrap();
        fs::write(
            temp.path().join("cluster.policy.yaml"),
            permit_all_policy_yaml(&["default"]).replace(
                "protected_branches: [main]\n",
                "protected_branches: [main]\nkind: server\n",
            ),
        )
        .unwrap();
        fs::write(
            temp.path().join("cluster.yaml"),
            format!(
                r#"
version: 1
graphs:
  knowledge:
    schema: ./people.pg
    queries:
      find_person:
        file: ./people.gq
{policy_block}"#
            ),
        )
        .unwrap();
        let import = omnigraph_cluster::import_config_dir(temp.path()).await;
        assert!(import.ok, "{:?}", import.diagnostics);
        let apply = omnigraph_cluster::apply_config_dir(temp.path()).await;
        assert!(apply.ok && apply.converged, "{:?}", apply.diagnostics);
        temp
    };

    let settings = cluster_settings(temp.path()).await.unwrap();
    let omnigraph_server::ServerConfigMode::Multi {
        graphs,
        server_policy,
        ..
    } = settings.mode
    else {
        panic!("cluster boot must select multi-graph routing");
    };
    // Cluster boots carry policy CONTENT (digest-verified catalog blobs),
    // not paths — the catalog may live on object storage.
    let omnigraph_server::PolicySource::Inline(graph_policy) =
        graphs[0].policy.as_ref().expect("graph-bound bundle")
    else {
        panic!("cluster-mode graph policy must be inline content");
    };
    assert!(graph_policy.contains("actors:"), "{graph_policy:?}");
    let omnigraph_server::PolicySource::Inline(server_policy) =
        server_policy.expect("cluster-bound bundle")
    else {
        panic!("cluster-mode server policy must be inline content");
    };
    assert!(server_policy.contains("kind: server"), "{server_policy:?}");
}

#[tokio::test]
async fn cluster_boot_refusals() {
    // RFC-011 cluster-only: with no --cluster, boot refuses with the
    // cluster-required remedy.
    let err = omnigraph_server::load_server_settings(None, None, true, false)
        .await
        .unwrap_err();
    assert!(err.to_string().contains("boots from a cluster"), "{err}");

    let temp = converged_cluster_dir("").await;
    let dir = temp.path().to_path_buf();

    // Tampered catalog blob refuses boot with the remedy.
    let blob_dir = dir.join("__cluster/resources/query/knowledge/find_person");
    let blob = fs::read_dir(&blob_dir)
        .unwrap()
        .next()
        .unwrap()
        .unwrap()
        .path();
    fs::write(&blob, "tampered").unwrap();
    let err = cluster_settings(&dir).await.unwrap_err();
    assert!(
        err.to_string().contains("catalog_payload_digest_mismatch"),
        "{err}"
    );
    assert!(err.to_string().contains("cluster refresh"), "{err}");

    // Missing state refuses with the import/apply remedy.
    let empty = tempfile::tempdir().unwrap();
    let err = cluster_settings(empty.path()).await.unwrap_err();
    assert!(err.to_string().contains("cluster_state_missing"), "{err}");
}
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								//! Cluster-mode boot and the concurrent branch-ops matrix.
 								//! Moved verbatim from tests/server.rs in the modularization.
 								use std::fs;
 								use axum::body::{Body, to_bytes};
 								use axum::http::{Method, Request, StatusCode};
-												Wire cluster embedding providers

											
										
										
											2026-06-16 04:02:08 +03:00
+								use omnigraph::db::Omnigraph;
 								use omnigraph::loader::{LoadMode, load_jsonl};
 								use omnigraph_server::api::{ErrorOutput, ReadRequest};
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								use omnigraph_server::{AppState, build_app};
 								use serde_json::Value;
-												Wire cluster embedding providers

											
										
										
											2026-06-16 04:02:08 +03:00
+								use serial_test::serial;
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								use tower::ServiceExt;
 								mod support;
 								use support::*;
 								#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
 								async fn concurrent_branch_ops_morphological_matrix() {
 								    // Cell a: Merge × Merge, distinct targets.
 								    // Pre-fix on b09a097/22d76db: branch_merge_impl's swap-restore race
 								    // landed feature_a's content in target_b instead of target_a (and
 								    // vice versa — symmetric swap). Identity asserts catch both
 								    // asymmetric and symmetric variants.
 								    {
 								        let cell = "a:merge×merge:distinct-targets";
 								        let h = matrix::Harness::new().await;
 								        h.create_branch("main", "feature-a-cella").await;
 								        h.insert_person("feature-a-cella", "EveA-cella", 22).await;
 								        h.create_branch("main", "feature-b-cella").await;
 								        h.insert_person("feature-b-cella", "FrankB-cella", 33).await;
 								        h.create_branch("main", "target-a-cella").await;
 								        h.create_branch("main", "target-b-cella").await;
 								        let (sa, sb) = h
 								            .run_pair(
 								                matrix::op_merge("feature-a-cella".to_string(), "target-a-cella".to_string()),
 								                matrix::op_merge("feature-b-cella".to_string(), "target-b-cella".to_string()),
 								            )
 								            .await;
 								        assert_eq!(sa.status, StatusCode::OK, "[{}] merge a", cell);
 								        assert_eq!(sb.status, StatusCode::OK, "[{}] merge b", cell);
 								        h.assert_persons("target-a-cella", cell, &["EveA-cella"], &["FrankB-cella"])
 								            .await;
 								        h.assert_persons("target-b-cella", cell, &["FrankB-cella"], &["EveA-cella"])
 								            .await;
 								        h.assert_post_op_sentinel(cell, "sentinel-cella").await;
 								    }
 								    // Cell b: Merge × Merge, same target / distinct sources.
 								    // Both want to land in main. merge_exclusive serializes; both should
 								    // succeed and main should contain BOTH sources' contributions.
 								    {
 								        let cell = "b:merge×merge:same-target-distinct-sources";
 								        let h = matrix::Harness::new().await;
 								        h.create_branch("main", "src-x-cellb").await;
 								        h.insert_person("src-x-cellb", "Xavier-cellb", 41).await;
 								        h.create_branch("main", "src-y-cellb").await;
 								        h.insert_person("src-y-cellb", "Yvonne-cellb", 42).await;
 								        let (sa, sb) = h
 								            .run_pair(
 								                matrix::op_merge("src-x-cellb".to_string(), "main".to_string()),
 								                matrix::op_merge("src-y-cellb".to_string(), "main".to_string()),
 								            )
 								            .await;
 								        assert_eq!(sa.status, StatusCode::OK, "[{}] merge x", cell);
 								        assert_eq!(sb.status, StatusCode::OK, "[{}] merge y", cell);
 								        h.assert_persons("main", cell, &["Xavier-cellb", "Yvonne-cellb"], &[])
 								            .await;
 								        h.assert_post_op_sentinel(cell, "sentinel-cellb").await;
 								    }
 								    // Cell c: Merge × Merge, same source / distinct targets (fanout).
 								    // One source merged into two targets simultaneously. merge_exclusive
 								    // serializes; both targets should reflect the source's content.
 								    {
 								        let cell = "c:merge×merge:same-source-distinct-targets";
 								        let h = matrix::Harness::new().await;
 								        h.create_branch("main", "src-shared-cellc").await;
 								        h.insert_person("src-shared-cellc", "Sharon-cellc", 50)
 								            .await;
 								        h.create_branch("main", "tgt-1-cellc").await;
 								        h.create_branch("main", "tgt-2-cellc").await;
 								        let (sa, sb) = h
 								            .run_pair(
 								                matrix::op_merge("src-shared-cellc".to_string(), "tgt-1-cellc".to_string()),
 								                matrix::op_merge("src-shared-cellc".to_string(), "tgt-2-cellc".to_string()),
 								            )
 								            .await;
 								        assert_eq!(sa.status, StatusCode::OK, "[{}] merge into tgt-1", cell);
 								        assert_eq!(sb.status, StatusCode::OK, "[{}] merge into tgt-2", cell);
 								        h.assert_persons("tgt-1-cellc", cell, &["Sharon-cellc"], &[])
 								            .await;
 								        h.assert_persons("tgt-2-cellc", cell, &["Sharon-cellc"], &[])
 								            .await;
 								        h.assert_post_op_sentinel(cell, "sentinel-cellc").await;
 								    }
 								    // Cell d: Merge × Change, both touching main. C2 permits both
 								    // succeed, or exactly one clean 409 if the merge detects target
 								    // movement after planning but before acquiring the queue.
 								    {
 								        let cell = "d:merge×change:into-target";
 								        let h = matrix::Harness::new().await;
 								        h.create_branch("main", "feature-celld").await;
 								        h.insert_person("feature-celld", "EveD-celld", 22).await;
 								        let (sa, sb) = h
 								            .run_pair(
 								                matrix::op_merge("feature-celld".to_string(), "main".to_string()),
 								                matrix::op_change_insert("main".to_string(), "FrankD-celld".to_string(), 33),
 								            )
 								            .await;
 								        assert_eq!(sb.status, StatusCode::OK, "[{}] change", cell);
 								        assert!(
 								            sa.status == StatusCode::OK || sa.status == StatusCode::CONFLICT,
 								            "[{}] merge must be 200 or clean 409, got {}",
 								            cell,
 								            sa.status
 								        );
 								        if sa.status == StatusCode::OK {
 								            h.assert_persons("main", cell, &["EveD-celld", "FrankD-celld"], &[])
 								                .await;
 								        } else {
 								            let error: ErrorOutput = serde_json::from_slice(&sa.body).unwrap();
 								            let conflict = error
 								                .manifest_conflict
 								                .expect("merge 409 must include manifest_conflict");
 								            assert_eq!(
 								                conflict.table_key, "node:Person",
 								                "[{}] conflict table",
 								                cell
 								            );
 								            h.assert_persons("main", cell, &["FrankD-celld"], &["EveD-celld"])
 								                .await;
 								        }
 								        h.assert_post_op_sentinel(cell, "sentinel-celld").await;
 								    }
 								    // Cell e: Merge × BranchCreateFrom-target. Concurrent fork off the
 								    // merge target while the merge runs. Both should succeed; the new
 								    // branch should have a coherent view (either pre- or post-merge,
 								    // both valid). After both, target = main has the merged content.
 								    {
 								        let cell = "e:merge×branch_create_from:target";
 								        let h = matrix::Harness::new().await;
 								        h.create_branch("main", "src-celle").await;
 								        h.insert_person("src-celle", "Eve-celle", 22).await;
 								        let (sa, sb) = h
 								            .run_pair(
 								                matrix::op_merge("src-celle".to_string(), "main".to_string()),
 								                matrix::op_branch_create("main".to_string(), "fork-celle".to_string()),
 								            )
 								            .await;
 								        assert_eq!(sa.status, StatusCode::OK, "[{}] merge", cell);
 								        assert_eq!(sb.status, StatusCode::OK, "[{}] branch_create_from", cell);
 								        // Main definitely has Eve.
 								        h.assert_persons("main", cell, &["Eve-celle"], &[]).await;
 								        // fork-celle was forked off main at SOME version; main's current
 								        // count is 5 (4 seeded + Eve). fork-celle has either 4 (pre-merge
 								        // snapshot) or 5 (post-merge snapshot); both are valid timings.
 								        let fork_count = h.person_count("fork-celle").await;
 								        assert!(
 								            fork_count == 4 || fork_count == 5,
 								            "[{}] fork-celle row count must be pre- or post-merge view (4 or 5), got {}",
 								            cell,
 								            fork_count
 								        );
 								        h.assert_post_op_sentinel(cell, "sentinel-celle").await;
 								    }
 								    // Cell f: BranchCreateFrom × BranchCreateFrom, distinct parents.
 								    // Pre-fix on f925ad1: swap-restore race in branch_create_from_impl
 								    // forked the new branch off the wrong parent. Identity asserts pin
 								    // that fork-from-A inherits A's content, fork-from-B inherits B's.
 								    {
 								        let cell = "f:branch_create_from×branch_create_from:distinct-parents";
 								        let h = matrix::Harness::new().await;
 								        h.create_branch("main", "alpha-cellf").await;
 								        h.insert_person("alpha-cellf", "Eve-cellf", 22).await;
 								        h.create_branch("main", "beta-cellf").await;
 								        let (sa, sb) = h
 								            .run_pair(
 								                matrix::op_branch_create("alpha-cellf".to_string(), "gamma-cellf".to_string()),
 								                matrix::op_branch_create("beta-cellf".to_string(), "delta-cellf".to_string()),
 								            )
 								            .await;
 								        assert_eq!(sa.status, StatusCode::OK, "[{}] gamma create", cell);
 								        assert_eq!(sb.status, StatusCode::OK, "[{}] delta create", cell);
 								        // gamma forks off alpha → must contain Eve.
 								        h.assert_persons("gamma-cellf", cell, &["Eve-cellf"], &[])
 								            .await;
 								        // delta forks off beta → must NOT contain Eve.
 								        h.assert_persons("delta-cellf", cell, &[], &["Eve-cellf"])
 								            .await;
 								        h.assert_post_op_sentinel(cell, "sentinel-cellf").await;
 								    }
 								    // Cell g: BranchCreateFrom × BranchDelete, unrelated branches.
 								    // Disjoint branches; both should complete cleanly without
 								    // interference.
 								    {
 								        let cell = "g:branch_create_from×branch_delete:unrelated";
 								        let h = matrix::Harness::new().await;
 								        h.create_branch("main", "doomed-cellg").await;
 								        let (sa, sb) = h
 								            .run_pair(
 								                matrix::op_branch_create("main".to_string(), "newborn-cellg".to_string()),
 								                matrix::op_branch_delete("doomed-cellg".to_string()),
 								            )
 								            .await;
 								        assert_eq!(sa.status, StatusCode::OK, "[{}] create newborn", cell);
 								        assert_eq!(sb.status, StatusCode::OK, "[{}] delete doomed", cell);
 								        // newborn-cellg exists with main's content.
 								        h.assert_persons("newborn-cellg", cell, &["Alice"], &[])
 								            .await;
 								        h.assert_post_op_sentinel(cell, "sentinel-cellg").await;
 								    }
 								    // Cell h: BranchDelete × BranchDelete, distinct branches. Both call
 								    // refresh() internally; verify no deadlock and both deletes land.
 								    {
 								        let cell = "h:branch_delete×branch_delete:distinct";
 								        let h = matrix::Harness::new().await;
 								        h.create_branch("main", "doomed1-cellh").await;
 								        h.create_branch("main", "doomed2-cellh").await;
 								        let (sa, sb) = h
 								            .run_pair(
 								                matrix::op_branch_delete("doomed1-cellh".to_string()),
 								                matrix::op_branch_delete("doomed2-cellh".to_string()),
 								            )
 								            .await;
 								        assert_eq!(sa.status, StatusCode::OK, "[{}] delete 1", cell);
 								        assert_eq!(sb.status, StatusCode::OK, "[{}] delete 2", cell);
 								        // Verify both gone via /branches list (snapshot would still work
 								        // for a deleted branch via parent fallback in some paths, so we
 								        // use the explicit list).
 								        let r = h
 								            .app
 								            .clone()
 								            .oneshot(
 								                Request::builder()
-												feat(server)!: cluster-only server — remove single-graph serving (RFC-011) (#250)

omnigraph-server boots only from --cluster; all HTTP is /graphs/<id>/…; flat single-graph routes and the omnigraph.yaml server boot are removed. GraphRouting/ServerConfigMode collapse to multi-only; openapi.json regenerated to the nested shape; ~100 server route tests migrated; parity/system_local boot from a converged cluster. Gate green (1410 tests).
											
										
										
											2026-06-15 20:17:25 +03:00
+								                    .uri(g("/branches"))
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								                    .method(Method::GET)
 								                    .body(Body::empty())
 								                    .unwrap(),
 								            )
 								            .await
 								            .unwrap();
 								        assert_eq!(r.status(), StatusCode::OK);
 								        let body = to_bytes(r.into_body(), usize::MAX).await.unwrap();
 								        let list_body: Value = serde_json::from_slice(&body).unwrap();
 								        let branches: Vec<&str> = list_body["branches"]
 								            .as_array()
 								            .unwrap()
 								            .iter()
 								            .filter_map(|v| v.as_str())
 								            .collect();
 								        assert!(
 								            !branches.contains(&"doomed1-cellh"),
 								            "[{}] doomed1 still in branch list: {:?}",
 								            cell,
 								            branches
 								        );
 								        assert!(
 								            !branches.contains(&"doomed2-cellh"),
 								            "[{}] doomed2 still in branch list: {:?}",
 								            cell,
 								            branches
 								        );
 								        h.assert_post_op_sentinel(cell, "sentinel-cellh").await;
 								    }
 								    // Cell i: BranchDelete × Change, on a different branch. Delete one
 								    // branch while a /change runs on main. Both should succeed.
 								    {
 								        let cell = "i:branch_delete×change:distinct-branch";
 								        let h = matrix::Harness::new().await;
 								        h.create_branch("main", "doomed-celli").await;
 								        let (sa, sb) = h
 								            .run_pair(
 								                matrix::op_branch_delete("doomed-celli".to_string()),
 								                matrix::op_change_insert("main".to_string(), "Pat-celli".to_string(), 44),
 								            )
 								            .await;
 								        assert_eq!(sa.status, StatusCode::OK, "[{}] delete", cell);
 								        assert_eq!(sb.status, StatusCode::OK, "[{}] change", cell);
 								        h.assert_persons("main", cell, &["Pat-celli"], &[]).await;
 								        h.assert_post_op_sentinel(cell, "sentinel-celli").await;
 								    }
 								    // Cell j: BranchCreateFrom × Change, both on main. The fork timing
 								    // determines whether the new branch sees the change (pre or post).
 								    // Both valid. Main must contain the inserted row.
 								    {
 								        let cell = "j:branch_create_from×change:on-source";
 								        let h = matrix::Harness::new().await;
 								        let (sa, sb) = h
 								            .run_pair(
 								                matrix::op_branch_create("main".to_string(), "twin-cellj".to_string()),
 								                matrix::op_change_insert("main".to_string(), "Quincy-cellj".to_string(), 55),
 								            )
 								            .await;
 								        assert_eq!(sa.status, StatusCode::OK, "[{}] branch_create", cell);
 								        assert_eq!(sb.status, StatusCode::OK, "[{}] change", cell);
 								        h.assert_persons("main", cell, &["Quincy-cellj"], &[]).await;
 								        // twin-cellj has either pre-change view (no Quincy) or
 								        // post-change view (with Quincy); either is valid.
 								        let twin_has_quincy = h.person_exists("twin-cellj", "Quincy-cellj").await;
 								        let _ = twin_has_quincy; // either valid timing — just ensure no panic
 								        h.assert_post_op_sentinel(cell, "sentinel-cellj").await;
 								    }
 								    // Cell k: reopen consistency. Run a representative concurrent pair,
 								    // drop the engine, reopen on a separate handle, verify state matches.
 								    {
 								        let cell = "k:reopen-after-pair";
 								        let h = matrix::Harness::new().await;
 								        h.create_branch("main", "src-cellk").await;
 								        h.insert_person("src-cellk", "Rita-cellk", 36).await;
 								        let (sa, sb) = h
 								            .run_pair(
 								                matrix::op_merge("src-cellk".to_string(), "main".to_string()),
 								                matrix::op_change_insert("main".to_string(), "Steve-cellk".to_string(), 37),
 								            )
 								            .await;
 								        assert_eq!(sb.status, StatusCode::OK, "[{}] change", cell);
 								        assert!(
 								            sa.status == StatusCode::OK || sa.status == StatusCode::CONFLICT,
 								            "[{}] merge must be 200 or clean 409, got {}",
 								            cell,
 								            sa.status
 								        );
 								        if sa.status == StatusCode::OK {
 								            h.assert_persons("main", cell, &["Rita-cellk", "Steve-cellk"], &[])
 								                .await;
 								        } else {
 								            let error: ErrorOutput = serde_json::from_slice(&sa.body).unwrap();
 								            let conflict = error
 								                .manifest_conflict
 								                .expect("merge 409 must include manifest_conflict");
 								            assert_eq!(
 								                conflict.table_key, "node:Person",
 								                "[{}] conflict table",
 								                cell
 								            );
 								            h.assert_persons("main", cell, &["Steve-cellk"], &["Rita-cellk"])
 								                .await;
 								        }
 								        // Reopen via a fresh AppState on the same graph.
 								        let graph_uri = format!("{}/server.omni", h._temp.path().display());
 								        let reopened = AppState::open(graph_uri.clone()).await.unwrap();
 								        let app2 = build_app(reopened);
 								        // Sanity: the same identity check via the new app must see
 								        // Rita and Steve.
 								        let r = app2
 								            .clone()
 								            .oneshot(
 								                Request::builder()
-												feat(server)!: cluster-only server — remove single-graph serving (RFC-011) (#250)

omnigraph-server boots only from --cluster; all HTTP is /graphs/<id>/…; flat single-graph routes and the omnigraph.yaml server boot are removed. GraphRouting/ServerConfigMode collapse to multi-only; openapi.json regenerated to the nested shape; ~100 server route tests migrated; parity/system_local boot from a converged cluster. Gate green (1410 tests).
											
										
										
											2026-06-15 20:17:25 +03:00
+								                    .uri(g("/snapshot?branch=main"))
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								                    .method(Method::GET)
 								                    .body(Body::empty())
 								                    .unwrap(),
 								            )
 								            .await
 								            .unwrap();
 								        assert_eq!(r.status(), StatusCode::OK, "[{}] reopen snapshot", cell);
 								        let body = to_bytes(r.into_body(), usize::MAX).await.unwrap();
 								        let v: Value = serde_json::from_slice(&body).unwrap();
 								        let person_rows = v["tables"]
 								            .as_array()
 								            .and_then(|tables| {
 								                tables
 								                    .iter()
 								                    .find(|t| t["table_key"].as_str() == Some("node:Person"))
 								            })
 								            .and_then(|t| t["row_count"].as_u64())
 								            .expect("reopen snapshot must include node:Person row_count");
 								        let expected_rows = if sa.status == StatusCode::OK { 6 } else { 5 };
 								        assert_eq!(
 								            person_rows, expected_rows,
 								            "[{}] reopened main should include seed (4) + committed concurrent writes",
 								            cell,
 								        );
 								    }
 								}
 								#[tokio::test]
 								async fn cluster_boot_serves_applied_state() {
 								    let temp = converged_cluster_dir("").await;
 								    let settings = cluster_settings(temp.path()).await.unwrap();
 								    let omnigraph_server::ServerConfigMode::Multi {
 								        graphs,
 								        config_path,
-												feat(cluster,server): inline policy content + config-free --cluster URI boot

Two serving changes that complete RFC-006's read side:

ServingPolicy carries the policy bundle CONTENT (digest-verified at
snapshot read) instead of a blob path — the catalog may live on object
storage, and the server must not re-read mutable state after the
snapshot. The server grows a PolicySource enum: File for omnigraph.yaml
deployments (unchanged), Inline for cluster boots, wired through
PolicyEngine::load_{graph,server}_from_source.

read_serving_snapshot_from_storage(uri) reads the applied revision
straight from a storage root, and --cluster accepts a scheme-qualified
URI (s3://bucket/prefix): config-free serving — a serving box needs only
the URI and credentials; the ledger and catalog on the bucket ARE the
deployment artifact. Bare paths keep the config-directory behavior.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:56:22 +03:00
+								        server_policy,
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								    } = settings.mode
 								    else {
 								        panic!("cluster boot must select multi-graph routing");
 								    };
 								    assert_eq!(graphs.len(), 1);
 								    assert_eq!(graphs[0].graph_id, "knowledge");
-												feat(cluster,server): inline policy content + config-free --cluster URI boot

Two serving changes that complete RFC-006's read side:

ServingPolicy carries the policy bundle CONTENT (digest-verified at
snapshot read) instead of a blob path — the catalog may live on object
storage, and the server must not re-read mutable state after the
snapshot. The server grows a PolicySource enum: File for omnigraph.yaml
deployments (unchanged), Inline for cluster boots, wired through
PolicyEngine::load_{graph,server}_from_source.

read_serving_snapshot_from_storage(uri) reads the applied revision
straight from a storage root, and --cluster accepts a scheme-qualified
URI (s3://bucket/prefix): config-free serving — a serving box needs only
the URI and credentials; the ledger and catalog on the bucket ARE the
deployment artifact. Bare paths keep the config-directory behavior.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:56:22 +03:00
+								    assert!(server_policy.is_none());
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
 								    let state =
-												fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap (#284)

* fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap

A `cluster apply` carrying a schema change against a graph that has
non-main branches, or an unsupported "needs backfill" migration, armed a
recovery sidecar *before* calling the engine, then left it behind when the
engine rejected the apply pre-movement. The server refuses to boot while
any sidecar is pending, and re-running apply re-armed a fresh sidecar — an
unescapable crash loop. None of the engine rejections are bugs; the trap
is in the apply/serve choreography.

Three coordinated changes:

1. Preview before arming the sidecar. `cluster apply` now runs
   `preview_schema_apply_with_options` before `write_recovery_sidecar`, so
   parser/planner rejections (non-main branches, unsupported plan) fail
   loudly without leaving recovery work behind. The post-preview engine
   error path now deletes the sidecar when the live schema still matches
   the recorded digest (nothing moved), and keeps it only on real
   mid-movement failure — both branches covered by new engine-failpoint
   tests (cluster failpoints now enable omnigraph/failpoints).

2. Per-graph quarantine at serve time instead of whole-cluster refusal.
   A graph-attributed pending sidecar, an unopenable graph root, a query
   parse failure, or an unresolvable embedding provider now quarantines
   just that graph (logged loudly at every boot layer) while healthy
   graphs serve; `/graphs` lists only ready graphs and quarantined routes
   404. Cluster-global problems (missing/unreadable state, malformed or
   unattributable sidecars, shared-catalog or cluster-policy errors, zero
   healthy graphs) stay fail-fast. `--require-all-graphs` /
   OMNIGRAPH_REQUIRE_ALL_GRAPHS=1 restores all-or-nothing boot.

3. Backfill embedding-provider profile metadata on apply. Mirrors the
   existing policy-binding backfill: a pre-5A ledger missing
   `embedding_profile` is now detected as a metadata-only change and
   backfilled by a no-op apply, instead of bricking serve with
   `embedding_provider_profile_missing` forever.

Tests: trap (no sidecar after a rejected apply), both digest-cleanup
branches, per-graph quarantine (cluster + server), embedding backfill.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* docs: resilient cluster boot + recovery-sidecar trap fix

Amend RFC-005 D4 readiness posture (cluster-global fail-fast vs graph-local
quarantine; deviation #5 for --require-all-graphs), add the v0.7.0 release
note, and update the user cluster/server/deployment docs and the
OMNIGRAPH_REQUIRE_ALL_GRAPHS env var.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(cluster): surface sidecar-cleanup failures; document severity promotion

Address Greptile review on PR #284:

- The pre-movement sidecar cleanup fast-path discarded `delete_object`'s
  result, so a transient delete failure left the graph quarantined with no
  signal. Add `try_delete_object` (Result-returning) and emit a
  `recovery_sidecar_cleanup_failed` warning diagnostic on failure; the
  fire-and-forget `delete_object` now delegates to it.
- Document why the serve-time loop promotes every `list_recovery_sidecars`
  diagnostic to a cluster-fatal error (the listing only emits genuine
  read/parse/version failures, as warnings, whose blast radius serving
  cannot prove) and note the promote-by-code path if that ever changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
											
										
										
											2026-06-19 03:34:15 +03:00
+								        omnigraph_server::open_multi_graph_state(graphs, Vec::new(), None, config_path, false)
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								            .await
 								            .unwrap();
 								    let app = build_app(state);
 								    // The management surface keeps its closed-by-default contract: without a
 								    // cluster-scoped policy bundle there is no server-level Cedar engine, so
 								    // GET /graphs refuses even in cluster mode.
 								    let (status, body) = json_response(
 								        &app,
-												fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap (#284)

* fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap

A `cluster apply` carrying a schema change against a graph that has
non-main branches, or an unsupported "needs backfill" migration, armed a
recovery sidecar *before* calling the engine, then left it behind when the
engine rejected the apply pre-movement. The server refuses to boot while
any sidecar is pending, and re-running apply re-armed a fresh sidecar — an
unescapable crash loop. None of the engine rejections are bugs; the trap
is in the apply/serve choreography.

Three coordinated changes:

1. Preview before arming the sidecar. `cluster apply` now runs
   `preview_schema_apply_with_options` before `write_recovery_sidecar`, so
   parser/planner rejections (non-main branches, unsupported plan) fail
   loudly without leaving recovery work behind. The post-preview engine
   error path now deletes the sidecar when the live schema still matches
   the recorded digest (nothing moved), and keeps it only on real
   mid-movement failure — both branches covered by new engine-failpoint
   tests (cluster failpoints now enable omnigraph/failpoints).

2. Per-graph quarantine at serve time instead of whole-cluster refusal.
   A graph-attributed pending sidecar, an unopenable graph root, a query
   parse failure, or an unresolvable embedding provider now quarantines
   just that graph (logged loudly at every boot layer) while healthy
   graphs serve; `/graphs` lists only ready graphs and quarantined routes
   404. Cluster-global problems (missing/unreadable state, malformed or
   unattributable sidecars, shared-catalog or cluster-policy errors, zero
   healthy graphs) stay fail-fast. `--require-all-graphs` /
   OMNIGRAPH_REQUIRE_ALL_GRAPHS=1 restores all-or-nothing boot.

3. Backfill embedding-provider profile metadata on apply. Mirrors the
   existing policy-binding backfill: a pre-5A ledger missing
   `embedding_profile` is now detected as a metadata-only change and
   backfilled by a no-op apply, instead of bricking serve with
   `embedding_provider_profile_missing` forever.

Tests: trap (no sidecar after a rejected apply), both digest-cleanup
branches, per-graph quarantine (cluster + server), embedding backfill.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* docs: resilient cluster boot + recovery-sidecar trap fix

Amend RFC-005 D4 readiness posture (cluster-global fail-fast vs graph-local
quarantine; deviation #5 for --require-all-graphs), add the v0.7.0 release
note, and update the user cluster/server/deployment docs and the
OMNIGRAPH_REQUIRE_ALL_GRAPHS env var.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(cluster): surface sidecar-cleanup failures; document severity promotion

Address Greptile review on PR #284:

- The pre-movement sidecar cleanup fast-path discarded `delete_object`'s
  result, so a transient delete failure left the graph quarantined with no
  signal. Add `try_delete_object` (Result-returning) and emit a
  `recovery_sidecar_cleanup_failed` warning diagnostic on failure; the
  fire-and-forget `delete_object` now delegates to it.
- Document why the serve-time loop promotes every `list_recovery_sidecars`
  diagnostic to a cluster-fatal error (the listing only emits genuine
  read/parse/version failures, as warnings, whose blast radius serving
  cannot prove) and note the promote-by-code path if that ever changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
											
										
										
											2026-06-19 03:34:15 +03:00
+								        Request::builder()
 								            .uri("/graphs")
 								            .body(Body::empty())
 								            .unwrap(),
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								    )
 								    .await;
 								    assert_eq!(status, StatusCode::FORBIDDEN, "{body}");
 								    let (status, body) = json_response(
 								        &app,
 								        Request::builder()
 								            .uri("/graphs/knowledge/queries")
 								            .body(Body::empty())
 								            .unwrap(),
 								    )
 								    .await;
 								    assert_eq!(status, StatusCode::OK, "{body}");
 								    assert!(
 								        body["queries"]
 								            .as_array()
 								            .unwrap()
 								            .iter()
 								            .any(|q| q["name"] == "find_person"),
 								        "{body}"
 								    );
 								    let (status, body) = json_response(
 								        &app,
 								        Request::builder()
 								            .method(Method::POST)
 								            .uri("/graphs/knowledge/queries/find_person")
 								            .header("content-type", "application/json")
 								            .body(Body::from(r#"{"params":{"name":"nobody"}}"#))
 								            .unwrap(),
 								    )
 								    .await;
 								    assert_eq!(status, StatusCode::OK, "{body}");
 								}
-												fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap (#284)

* fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap

A `cluster apply` carrying a schema change against a graph that has
non-main branches, or an unsupported "needs backfill" migration, armed a
recovery sidecar *before* calling the engine, then left it behind when the
engine rejected the apply pre-movement. The server refuses to boot while
any sidecar is pending, and re-running apply re-armed a fresh sidecar — an
unescapable crash loop. None of the engine rejections are bugs; the trap
is in the apply/serve choreography.

Three coordinated changes:

1. Preview before arming the sidecar. `cluster apply` now runs
   `preview_schema_apply_with_options` before `write_recovery_sidecar`, so
   parser/planner rejections (non-main branches, unsupported plan) fail
   loudly without leaving recovery work behind. The post-preview engine
   error path now deletes the sidecar when the live schema still matches
   the recorded digest (nothing moved), and keeps it only on real
   mid-movement failure — both branches covered by new engine-failpoint
   tests (cluster failpoints now enable omnigraph/failpoints).

2. Per-graph quarantine at serve time instead of whole-cluster refusal.
   A graph-attributed pending sidecar, an unopenable graph root, a query
   parse failure, or an unresolvable embedding provider now quarantines
   just that graph (logged loudly at every boot layer) while healthy
   graphs serve; `/graphs` lists only ready graphs and quarantined routes
   404. Cluster-global problems (missing/unreadable state, malformed or
   unattributable sidecars, shared-catalog or cluster-policy errors, zero
   healthy graphs) stay fail-fast. `--require-all-graphs` /
   OMNIGRAPH_REQUIRE_ALL_GRAPHS=1 restores all-or-nothing boot.

3. Backfill embedding-provider profile metadata on apply. Mirrors the
   existing policy-binding backfill: a pre-5A ledger missing
   `embedding_profile` is now detected as a metadata-only change and
   backfilled by a no-op apply, instead of bricking serve with
   `embedding_provider_profile_missing` forever.

Tests: trap (no sidecar after a rejected apply), both digest-cleanup
branches, per-graph quarantine (cluster + server), embedding backfill.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* docs: resilient cluster boot + recovery-sidecar trap fix

Amend RFC-005 D4 readiness posture (cluster-global fail-fast vs graph-local
quarantine; deviation #5 for --require-all-graphs), add the v0.7.0 release
note, and update the user cluster/server/deployment docs and the
OMNIGRAPH_REQUIRE_ALL_GRAPHS env var.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(cluster): surface sidecar-cleanup failures; document severity promotion

Address Greptile review on PR #284:

- The pre-movement sidecar cleanup fast-path discarded `delete_object`'s
  result, so a transient delete failure left the graph quarantined with no
  signal. Add `try_delete_object` (Result-returning) and emit a
  `recovery_sidecar_cleanup_failed` warning diagnostic on failure; the
  fire-and-forget `delete_object` now delegates to it.
- Document why the serve-time loop promotes every `list_recovery_sidecars`
  diagnostic to a cluster-fatal error (the listing only emits genuine
  read/parse/version failures, as warnings, whose blast radius serving
  cannot prove) and note the promote-by-code path if that ever changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
											
										
										
											2026-06-19 03:34:15 +03:00
+								#[tokio::test]
 								async fn cluster_boot_quarantines_graph_open_failures() {
 								    let temp = tempfile::tempdir().unwrap();
 								    let schema = "\nnode Person {\n  name: String @key\n}\n";
 								    let good_uri = temp.path().join("good.omni");
 								    Omnigraph::init(good_uri.to_string_lossy().as_ref(), schema)
 								        .await
 								        .unwrap();
 								    let bad_uri = temp.path().join("missing.omni");
 								    let server_policy = omnigraph_server::PolicySource::Inline(
 								        r#"
 								version: 1
 								kind: server
 								groups:
 								  admins: [act-admin]
 								rules:
 								  - id: admins-list-graphs
 								    allow:
 								      actors: { group: admins }
 								      actions: [graph_list]
 								"#
 								        .to_string(),
 								    );
 								    let graphs = vec![
 								        omnigraph_server::GraphStartupConfig {
 								            graph_id: "broken".to_string(),
 								            uri: bad_uri.to_string_lossy().to_string(),
 								            policy: None,
 								            embedding: None,
 								            queries: stored_query_registry(&[]),
 								        },
 								        omnigraph_server::GraphStartupConfig {
 								            graph_id: "good".to_string(),
 								            uri: good_uri.to_string_lossy().to_string(),
 								            policy: None,
 								            embedding: None,
 								            queries: stored_query_registry(&[]),
 								        },
 								    ];
 								    let strict_err = match omnigraph_server::open_multi_graph_state(
 								        graphs.clone(),
 								        vec![("act-admin".to_string(), "admin-token".to_string())],
 								        Some(&server_policy),
 								        temp.path().join("cluster.yaml"),
 								        true,
 								    )
 								    .await
 								    {
 								        Ok(_) => panic!("strict startup should reject a failed graph open"),
 								        Err(err) => err,
 								    };
 								    assert!(
 								        strict_err
 								            .to_string()
 								            .contains("strict multi-graph startup requires every graph to open"),
 								        "{strict_err}"
 								    );
 								    let state = omnigraph_server::open_multi_graph_state(
 								        graphs,
 								        vec![("act-admin".to_string(), "admin-token".to_string())],
 								        Some(&server_policy),
 								        temp.path().join("cluster.yaml"),
 								        false,
 								    )
 								    .await
 								    .unwrap();
 								    let mut ready: Vec<_> = state
 								        .routing()
 								        .registry
 								        .list()
 								        .iter()
 								        .map(|handle| handle.key.graph_id.as_str().to_string())
 								        .collect();
 								    ready.sort();
 								    assert_eq!(ready, vec!["good"]);
 								    let app = build_app(state);
 								    let (status, body) = json_response(
 								        &app,
 								        Request::builder()
 								            .uri("/graphs")
 								            .header("authorization", "Bearer admin-token")
 								            .body(Body::empty())
 								            .unwrap(),
 								    )
 								    .await;
 								    assert_eq!(status, StatusCode::OK, "{body}");
 								    assert_eq!(
 								        body["graphs"]
 								            .as_array()
 								            .unwrap()
 								            .iter()
 								            .map(|graph| graph["graph_id"].as_str().unwrap())
 								            .collect::<Vec<_>>(),
 								        vec!["good"]
 								    );
 								    let (status, body) = json_response(
 								        &app,
 								        Request::builder()
 								            .uri("/graphs/broken/queries")
 								            .header("authorization", "Bearer admin-token")
 								            .body(Body::empty())
 								            .unwrap(),
 								    )
 								    .await;
 								    assert_eq!(status, StatusCode::NOT_FOUND, "{body}");
 								}
-												Wire cluster embedding providers

											
										
										
											2026-06-16 04:02:08 +03:00
+								#[tokio::test(flavor = "multi_thread")]
 								#[serial]
 								async fn cluster_boot_injects_embedding_provider_config() {
 								    const EMBED_SCHEMA: &str = r#"
 								node Doc {
 								    slug: String @key
 								    title: String @index
 								    embedding: Vector(4) @embed("title", model="cluster-mock") @index
 								}
 								"#;
 								    const EMBED_QUERY: &str = r#"
 								query vector_search_string($q: String) {
 								    match { $d: Doc }
 								    return { $d.slug, $d.title }
 								    order { nearest($d.embedding, $q) }
 								    limit 3
 								}
 								"#;
 								    let alpha = mock_embedding("alpha", 4);
 								    let beta = mock_embedding("beta", 4);
 								    let gamma = mock_embedding("gamma", 4);
 								    let data = format!(
 								        concat!(
 								            r#"{{"type":"Doc","data":{{"slug":"alpha-doc","title":"alpha guide","embedding":[{}]}}}}"#,
 								            "\n",
 								            r#"{{"type":"Doc","data":{{"slug":"beta-doc","title":"beta guide","embedding":[{}]}}}}"#,
 								            "\n",
 								            r#"{{"type":"Doc","data":{{"slug":"gamma-doc","title":"gamma handbook","embedding":[{}]}}}}"#
 								        ),
 								        format_vector(&alpha),
 								        format_vector(&beta),
 								        format_vector(&gamma),
 								    );
 								    let temp = tempfile::tempdir().unwrap();
 								    fs::write(temp.path().join("docs.pg"), EMBED_SCHEMA).unwrap();
 								    fs::write(temp.path().join("search.gq"), EMBED_QUERY).unwrap();
 								    fs::write(
 								        temp.path().join("cluster.yaml"),
 								        r#"
 								version: 1
 								providers:
 								  embedding:
 								    default:
 								      kind: mock
 								      model: cluster-mock
 								graphs:
 								  knowledge:
 								    schema: ./docs.pg
 								    embedding_provider: default
 								    queries:
 								      vector_search_string:
 								        file: ./search.gq
 								"#,
 								    )
 								    .unwrap();
 								    let import = omnigraph_cluster::import_config_dir(temp.path()).await;
 								    assert!(import.ok, "{:?}", import.diagnostics);
 								    let apply = omnigraph_cluster::apply_config_dir(temp.path()).await;
 								    assert!(apply.ok && apply.converged, "{:?}", apply.diagnostics);
 								    let graph_uri = temp
 								        .path()
 								        .join("graphs/knowledge.omni")
 								        .to_string_lossy()
 								        .to_string();
 								    let mut db = Omnigraph::open(&graph_uri).await.unwrap();
 								    load_jsonl(&mut db, &data, LoadMode::Overwrite)
 								        .await
 								        .unwrap();
 								    let _guard = EnvGuard::set(&[
 								        ("OMNIGRAPH_EMBEDDINGS_MOCK", None),
 								        ("OMNIGRAPH_EMBED_PROVIDER", None),
 								        ("OMNIGRAPH_EMBED_BASE_URL", None),
 								        ("OMNIGRAPH_EMBED_MODEL", None),
 								        ("OPENROUTER_API_KEY", None),
 								        ("OPENAI_API_KEY", None),
 								        ("GEMINI_API_KEY", None),
 								    ]);
 								    let settings = cluster_settings(temp.path()).await.unwrap();
 								    let omnigraph_server::ServerConfigMode::Multi {
 								        graphs,
 								        config_path,
 								        server_policy,
 								    } = settings.mode
 								    else {
 								        panic!("cluster boot must select multi-graph routing");
 								    };
 								    let state = omnigraph_server::open_multi_graph_state(
 								        graphs,
 								        Vec::new(),
 								        server_policy.as_ref(),
 								        config_path,
-												fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap (#284)

* fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap

A `cluster apply` carrying a schema change against a graph that has
non-main branches, or an unsupported "needs backfill" migration, armed a
recovery sidecar *before* calling the engine, then left it behind when the
engine rejected the apply pre-movement. The server refuses to boot while
any sidecar is pending, and re-running apply re-armed a fresh sidecar — an
unescapable crash loop. None of the engine rejections are bugs; the trap
is in the apply/serve choreography.

Three coordinated changes:

1. Preview before arming the sidecar. `cluster apply` now runs
   `preview_schema_apply_with_options` before `write_recovery_sidecar`, so
   parser/planner rejections (non-main branches, unsupported plan) fail
   loudly without leaving recovery work behind. The post-preview engine
   error path now deletes the sidecar when the live schema still matches
   the recorded digest (nothing moved), and keeps it only on real
   mid-movement failure — both branches covered by new engine-failpoint
   tests (cluster failpoints now enable omnigraph/failpoints).

2. Per-graph quarantine at serve time instead of whole-cluster refusal.
   A graph-attributed pending sidecar, an unopenable graph root, a query
   parse failure, or an unresolvable embedding provider now quarantines
   just that graph (logged loudly at every boot layer) while healthy
   graphs serve; `/graphs` lists only ready graphs and quarantined routes
   404. Cluster-global problems (missing/unreadable state, malformed or
   unattributable sidecars, shared-catalog or cluster-policy errors, zero
   healthy graphs) stay fail-fast. `--require-all-graphs` /
   OMNIGRAPH_REQUIRE_ALL_GRAPHS=1 restores all-or-nothing boot.

3. Backfill embedding-provider profile metadata on apply. Mirrors the
   existing policy-binding backfill: a pre-5A ledger missing
   `embedding_profile` is now detected as a metadata-only change and
   backfilled by a no-op apply, instead of bricking serve with
   `embedding_provider_profile_missing` forever.

Tests: trap (no sidecar after a rejected apply), both digest-cleanup
branches, per-graph quarantine (cluster + server), embedding backfill.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* docs: resilient cluster boot + recovery-sidecar trap fix

Amend RFC-005 D4 readiness posture (cluster-global fail-fast vs graph-local
quarantine; deviation #5 for --require-all-graphs), add the v0.7.0 release
note, and update the user cluster/server/deployment docs and the
OMNIGRAPH_REQUIRE_ALL_GRAPHS env var.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(cluster): surface sidecar-cleanup failures; document severity promotion

Address Greptile review on PR #284:

- The pre-movement sidecar cleanup fast-path discarded `delete_object`'s
  result, so a transient delete failure left the graph quarantined with no
  signal. Add `try_delete_object` (Result-returning) and emit a
  `recovery_sidecar_cleanup_failed` warning diagnostic on failure; the
  fire-and-forget `delete_object` now delegates to it.
- Document why the serve-time loop promotes every `list_recovery_sidecars`
  diagnostic to a cluster-fatal error (the listing only emits genuine
  read/parse/version failures, as warnings, whose blast radius serving
  cannot prove) and note the promote-by-code path if that ever changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
											
										
										
											2026-06-19 03:34:15 +03:00
+								        false,
-												Wire cluster embedding providers

											
										
										
											2026-06-16 04:02:08 +03:00
+								    )
 								    .await
 								    .unwrap();
 								    let app = build_app(state);
 								    let read = ReadRequest {
 								        query_source: EMBED_QUERY.to_string(),
 								        query_name: Some("vector_search_string".to_string()),
 								        params: Some(serde_json::json!({ "q": "alpha" })),
 								        branch: Some("main".to_string()),
 								        snapshot: None,
 								    };
 								    let (status, body) = json_response(
 								        &app,
 								        Request::builder()
 								            .uri("/graphs/knowledge/read")
 								            .method(Method::POST)
 								            .header("content-type", "application/json")
 								            .body(Body::from(serde_json::to_vec(&read).unwrap()))
 								            .unwrap(),
 								    )
 								    .await;
 								    assert_eq!(status, StatusCode::OK, "{body}");
 								    assert_eq!(body["row_count"], 3);
 								    assert_eq!(body["rows"][0]["d.slug"], "alpha-doc");
 								}
 								#[tokio::test(flavor = "multi_thread")]
 								#[serial]
 								async fn cluster_boot_refuses_missing_embedding_secret_env() {
 								    let temp = tempfile::tempdir().unwrap();
 								    fs::write(
 								        temp.path().join("people.pg"),
 								        "\nnode Person {\n  name: String @key\n}\n",
 								    )
 								    .unwrap();
 								    fs::write(
 								        temp.path().join("people.gq"),
 								        "\nquery find_person($name: String) {\n  match { $p: Person { name: $name } }\n  return { $p.name }\n}\n",
 								    )
 								    .unwrap();
 								    fs::write(
 								        temp.path().join("cluster.yaml"),
 								        r#"
 								version: 1
 								providers:
 								  embedding:
 								    default:
 								      kind: openai-compatible
 								      api_key: ${OG_TEST_MISSING_EMBED_KEY}
 								graphs:
 								  knowledge:
 								    schema: ./people.pg
 								    embedding_provider: default
 								    queries:
 								      find_person:
 								        file: ./people.gq
 								"#,
 								    )
 								    .unwrap();
 								    let import = omnigraph_cluster::import_config_dir(temp.path()).await;
 								    assert!(import.ok, "{:?}", import.diagnostics);
 								    let apply = omnigraph_cluster::apply_config_dir(temp.path()).await;
 								    assert!(apply.ok && apply.converged, "{:?}", apply.diagnostics);
 								    let _guard = EnvGuard::set(&[
 								        ("OG_TEST_MISSING_EMBED_KEY", None),
 								        ("OMNIGRAPH_EMBEDDINGS_MOCK", None),
 								    ]);
 								    let err = cluster_settings(temp.path()).await.unwrap_err();
 								    let message = err.to_string();
 								    assert!(
 								        message.contains("embedding provider for graph 'knowledge'"),
 								        "{message}"
 								    );
 								    assert!(message.contains("OG_TEST_MISSING_EMBED_KEY"), "{message}");
 								}
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								#[tokio::test]
 								async fn cluster_boot_wires_policy_bindings_into_cedar_slots() {
 								    let temp = tempfile::tempdir().unwrap();
 								    drop(temp);
 								    let policy_block = r#"policies:
 								  graph_rules:
 								    file: ./graph.policy.yaml
 								    applies_to: [knowledge]
 								  cluster_rules:
 								    file: ./cluster.policy.yaml
 								    applies_to: [cluster]
 								"#;
 								    let temp = {
 								        let temp = tempfile::tempdir().unwrap();
 								        fs::write(
 								            temp.path().join("people.pg"),
 								            "\nnode Person {\n  name: String @key\n}\n",
 								        )
 								        .unwrap();
 								        fs::write(
 								            temp.path().join("people.gq"),
 								            "\nquery find_person($name: String) {\n  match { $p: Person { name: $name } }\n  return { $p.name }\n}\n",
 								        )
 								        .unwrap();
 								        fs::write(
 								            temp.path().join("graph.policy.yaml"),
 								            permit_all_policy_yaml(&["default"]),
 								        )
 								        .unwrap();
 								        fs::write(
 								            temp.path().join("cluster.policy.yaml"),
-												fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap (#284)

* fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap

A `cluster apply` carrying a schema change against a graph that has
non-main branches, or an unsupported "needs backfill" migration, armed a
recovery sidecar *before* calling the engine, then left it behind when the
engine rejected the apply pre-movement. The server refuses to boot while
any sidecar is pending, and re-running apply re-armed a fresh sidecar — an
unescapable crash loop. None of the engine rejections are bugs; the trap
is in the apply/serve choreography.

Three coordinated changes:

1. Preview before arming the sidecar. `cluster apply` now runs
   `preview_schema_apply_with_options` before `write_recovery_sidecar`, so
   parser/planner rejections (non-main branches, unsupported plan) fail
   loudly without leaving recovery work behind. The post-preview engine
   error path now deletes the sidecar when the live schema still matches
   the recorded digest (nothing moved), and keeps it only on real
   mid-movement failure — both branches covered by new engine-failpoint
   tests (cluster failpoints now enable omnigraph/failpoints).

2. Per-graph quarantine at serve time instead of whole-cluster refusal.
   A graph-attributed pending sidecar, an unopenable graph root, a query
   parse failure, or an unresolvable embedding provider now quarantines
   just that graph (logged loudly at every boot layer) while healthy
   graphs serve; `/graphs` lists only ready graphs and quarantined routes
   404. Cluster-global problems (missing/unreadable state, malformed or
   unattributable sidecars, shared-catalog or cluster-policy errors, zero
   healthy graphs) stay fail-fast. `--require-all-graphs` /
   OMNIGRAPH_REQUIRE_ALL_GRAPHS=1 restores all-or-nothing boot.

3. Backfill embedding-provider profile metadata on apply. Mirrors the
   existing policy-binding backfill: a pre-5A ledger missing
   `embedding_profile` is now detected as a metadata-only change and
   backfilled by a no-op apply, instead of bricking serve with
   `embedding_provider_profile_missing` forever.

Tests: trap (no sidecar after a rejected apply), both digest-cleanup
branches, per-graph quarantine (cluster + server), embedding backfill.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* docs: resilient cluster boot + recovery-sidecar trap fix

Amend RFC-005 D4 readiness posture (cluster-global fail-fast vs graph-local
quarantine; deviation #5 for --require-all-graphs), add the v0.7.0 release
note, and update the user cluster/server/deployment docs and the
OMNIGRAPH_REQUIRE_ALL_GRAPHS env var.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(cluster): surface sidecar-cleanup failures; document severity promotion

Address Greptile review on PR #284:

- The pre-movement sidecar cleanup fast-path discarded `delete_object`'s
  result, so a transient delete failure left the graph quarantined with no
  signal. Add `try_delete_object` (Result-returning) and emit a
  `recovery_sidecar_cleanup_failed` warning diagnostic on failure; the
  fire-and-forget `delete_object` now delegates to it.
- Document why the serve-time loop promotes every `list_recovery_sidecars`
  diagnostic to a cluster-fatal error (the listing only emits genuine
  read/parse/version failures, as warnings, whose blast radius serving
  cannot prove) and note the promote-by-code path if that ever changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
											
										
										
											2026-06-19 03:34:15 +03:00
+								            permit_all_policy_yaml(&["default"]).replace(
 								                "protected_branches: [main]\n",
 								                "protected_branches: [main]\nkind: server\n",
 								            ),
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								        )
 								        .unwrap();
 								        fs::write(
 								            temp.path().join("cluster.yaml"),
 								            format!(
 								                r#"
 								version: 1
 								graphs:
 								  knowledge:
 								    schema: ./people.pg
 								    queries:
 								      find_person:
 								        file: ./people.gq
 								{policy_block}"#
 								            ),
 								        )
 								        .unwrap();
 								        let import = omnigraph_cluster::import_config_dir(temp.path()).await;
 								        assert!(import.ok, "{:?}", import.diagnostics);
 								        let apply = omnigraph_cluster::apply_config_dir(temp.path()).await;
 								        assert!(apply.ok && apply.converged, "{:?}", apply.diagnostics);
 								        temp
 								    };
 								    let settings = cluster_settings(temp.path()).await.unwrap();
 								    let omnigraph_server::ServerConfigMode::Multi {
 								        graphs,
-												feat(cluster,server): inline policy content + config-free --cluster URI boot

Two serving changes that complete RFC-006's read side:

ServingPolicy carries the policy bundle CONTENT (digest-verified at
snapshot read) instead of a blob path — the catalog may live on object
storage, and the server must not re-read mutable state after the
snapshot. The server grows a PolicySource enum: File for omnigraph.yaml
deployments (unchanged), Inline for cluster boots, wired through
PolicyEngine::load_{graph,server}_from_source.

read_serving_snapshot_from_storage(uri) reads the applied revision
straight from a storage root, and --cluster accepts a scheme-qualified
URI (s3://bucket/prefix): config-free serving — a serving box needs only
the URI and credentials; the ledger and catalog on the bucket ARE the
deployment artifact. Bare paths keep the config-directory behavior.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:56:22 +03:00
+								        server_policy,
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								        ..
 								    } = settings.mode
 								    else {
 								        panic!("cluster boot must select multi-graph routing");
 								    };
-												feat(cluster,server): inline policy content + config-free --cluster URI boot

Two serving changes that complete RFC-006's read side:

ServingPolicy carries the policy bundle CONTENT (digest-verified at
snapshot read) instead of a blob path — the catalog may live on object
storage, and the server must not re-read mutable state after the
snapshot. The server grows a PolicySource enum: File for omnigraph.yaml
deployments (unchanged), Inline for cluster boots, wired through
PolicyEngine::load_{graph,server}_from_source.

read_serving_snapshot_from_storage(uri) reads the applied revision
straight from a storage root, and --cluster accepts a scheme-qualified
URI (s3://bucket/prefix): config-free serving — a serving box needs only
the URI and credentials; the ledger and catalog on the bucket ARE the
deployment artifact. Bare paths keep the config-directory behavior.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:56:22 +03:00
+								    // Cluster boots carry policy CONTENT (digest-verified catalog blobs),
 								    // not paths — the catalog may live on object storage.
 								    let omnigraph_server::PolicySource::Inline(graph_policy) =
 								        graphs[0].policy.as_ref().expect("graph-bound bundle")
 								    else {
 								        panic!("cluster-mode graph policy must be inline content");
 								    };
 								    assert!(graph_policy.contains("actors:"), "{graph_policy:?}");
 								    let omnigraph_server::PolicySource::Inline(server_policy) =
 								        server_policy.expect("cluster-bound bundle")
 								    else {
 								        panic!("cluster-mode server policy must be inline content");
 								    };
 								    assert!(server_policy.contains("kind: server"), "{server_policy:?}");
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								}
 								#[tokio::test]
 								async fn cluster_boot_refusals() {
-												feat(server)!: cluster-only server — remove single-graph serving (RFC-011) (#250)

omnigraph-server boots only from --cluster; all HTTP is /graphs/<id>/…; flat single-graph routes and the omnigraph.yaml server boot are removed. GraphRouting/ServerConfigMode collapse to multi-only; openapi.json regenerated to the nested shape; ~100 server route tests migrated; parity/system_local boot from a converged cluster. Gate green (1410 tests).
											
										
										
											2026-06-15 20:17:25 +03:00
+								    // RFC-011 cluster-only: with no --cluster, boot refuses with the
 								    // cluster-required remedy.
-												fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap (#284)

* fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap

A `cluster apply` carrying a schema change against a graph that has
non-main branches, or an unsupported "needs backfill" migration, armed a
recovery sidecar *before* calling the engine, then left it behind when the
engine rejected the apply pre-movement. The server refuses to boot while
any sidecar is pending, and re-running apply re-armed a fresh sidecar — an
unescapable crash loop. None of the engine rejections are bugs; the trap
is in the apply/serve choreography.

Three coordinated changes:

1. Preview before arming the sidecar. `cluster apply` now runs
   `preview_schema_apply_with_options` before `write_recovery_sidecar`, so
   parser/planner rejections (non-main branches, unsupported plan) fail
   loudly without leaving recovery work behind. The post-preview engine
   error path now deletes the sidecar when the live schema still matches
   the recorded digest (nothing moved), and keeps it only on real
   mid-movement failure — both branches covered by new engine-failpoint
   tests (cluster failpoints now enable omnigraph/failpoints).

2. Per-graph quarantine at serve time instead of whole-cluster refusal.
   A graph-attributed pending sidecar, an unopenable graph root, a query
   parse failure, or an unresolvable embedding provider now quarantines
   just that graph (logged loudly at every boot layer) while healthy
   graphs serve; `/graphs` lists only ready graphs and quarantined routes
   404. Cluster-global problems (missing/unreadable state, malformed or
   unattributable sidecars, shared-catalog or cluster-policy errors, zero
   healthy graphs) stay fail-fast. `--require-all-graphs` /
   OMNIGRAPH_REQUIRE_ALL_GRAPHS=1 restores all-or-nothing boot.

3. Backfill embedding-provider profile metadata on apply. Mirrors the
   existing policy-binding backfill: a pre-5A ledger missing
   `embedding_profile` is now detected as a metadata-only change and
   backfilled by a no-op apply, instead of bricking serve with
   `embedding_provider_profile_missing` forever.

Tests: trap (no sidecar after a rejected apply), both digest-cleanup
branches, per-graph quarantine (cluster + server), embedding backfill.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* docs: resilient cluster boot + recovery-sidecar trap fix

Amend RFC-005 D4 readiness posture (cluster-global fail-fast vs graph-local
quarantine; deviation #5 for --require-all-graphs), add the v0.7.0 release
note, and update the user cluster/server/deployment docs and the
OMNIGRAPH_REQUIRE_ALL_GRAPHS env var.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(cluster): surface sidecar-cleanup failures; document severity promotion

Address Greptile review on PR #284:

- The pre-movement sidecar cleanup fast-path discarded `delete_object`'s
  result, so a transient delete failure left the graph quarantined with no
  signal. Add `try_delete_object` (Result-returning) and emit a
  `recovery_sidecar_cleanup_failed` warning diagnostic on failure; the
  fire-and-forget `delete_object` now delegates to it.
- Document why the serve-time loop promotes every `list_recovery_sidecars`
  diagnostic to a cluster-fatal error (the listing only emits genuine
  read/parse/version failures, as warnings, whose blast radius serving
  cannot prove) and note the promote-by-code path if that ever changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
											
										
										
											2026-06-19 03:34:15 +03:00
+								    let err = omnigraph_server::load_server_settings(None, None, true, false)
-												feat(server)!: cluster-only server — remove single-graph serving (RFC-011) (#250)

omnigraph-server boots only from --cluster; all HTTP is /graphs/<id>/…; flat single-graph routes and the omnigraph.yaml server boot are removed. GraphRouting/ServerConfigMode collapse to multi-only; openapi.json regenerated to the nested shape; ~100 server route tests migrated; parity/system_local boot from a converged cluster. Gate green (1410 tests).
											
										
										
											2026-06-15 20:17:25 +03:00
+								        .await
 								        .unwrap_err();
 								    assert!(err.to_string().contains("boots from a cluster"), "{err}");
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								    let temp = converged_cluster_dir("").await;
 								    let dir = temp.path().to_path_buf();
 								    // Tampered catalog blob refuses boot with the remedy.
 								    let blob_dir = dir.join("__cluster/resources/query/knowledge/find_person");
-												fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap (#284)

* fix(cluster): stop cluster-apply crash-loops from the recovery-sidecar trap

A `cluster apply` carrying a schema change against a graph that has
non-main branches, or an unsupported "needs backfill" migration, armed a
recovery sidecar *before* calling the engine, then left it behind when the
engine rejected the apply pre-movement. The server refuses to boot while
any sidecar is pending, and re-running apply re-armed a fresh sidecar — an
unescapable crash loop. None of the engine rejections are bugs; the trap
is in the apply/serve choreography.

Three coordinated changes:

1. Preview before arming the sidecar. `cluster apply` now runs
   `preview_schema_apply_with_options` before `write_recovery_sidecar`, so
   parser/planner rejections (non-main branches, unsupported plan) fail
   loudly without leaving recovery work behind. The post-preview engine
   error path now deletes the sidecar when the live schema still matches
   the recorded digest (nothing moved), and keeps it only on real
   mid-movement failure — both branches covered by new engine-failpoint
   tests (cluster failpoints now enable omnigraph/failpoints).

2. Per-graph quarantine at serve time instead of whole-cluster refusal.
   A graph-attributed pending sidecar, an unopenable graph root, a query
   parse failure, or an unresolvable embedding provider now quarantines
   just that graph (logged loudly at every boot layer) while healthy
   graphs serve; `/graphs` lists only ready graphs and quarantined routes
   404. Cluster-global problems (missing/unreadable state, malformed or
   unattributable sidecars, shared-catalog or cluster-policy errors, zero
   healthy graphs) stay fail-fast. `--require-all-graphs` /
   OMNIGRAPH_REQUIRE_ALL_GRAPHS=1 restores all-or-nothing boot.

3. Backfill embedding-provider profile metadata on apply. Mirrors the
   existing policy-binding backfill: a pre-5A ledger missing
   `embedding_profile` is now detected as a metadata-only change and
   backfilled by a no-op apply, instead of bricking serve with
   `embedding_provider_profile_missing` forever.

Tests: trap (no sidecar after a rejected apply), both digest-cleanup
branches, per-graph quarantine (cluster + server), embedding backfill.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* docs: resilient cluster boot + recovery-sidecar trap fix

Amend RFC-005 D4 readiness posture (cluster-global fail-fast vs graph-local
quarantine; deviation #5 for --require-all-graphs), add the v0.7.0 release
note, and update the user cluster/server/deployment docs and the
OMNIGRAPH_REQUIRE_ALL_GRAPHS env var.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* fix(cluster): surface sidecar-cleanup failures; document severity promotion

Address Greptile review on PR #284:

- The pre-movement sidecar cleanup fast-path discarded `delete_object`'s
  result, so a transient delete failure left the graph quarantined with no
  signal. Add `try_delete_object` (Result-returning) and emit a
  `recovery_sidecar_cleanup_failed` warning diagnostic on failure; the
  fire-and-forget `delete_object` now delegates to it.
- Document why the serve-time loop promotes every `list_recovery_sidecars`
  diagnostic to a cluster-fatal error (the listing only emits genuine
  read/parse/version failures, as warnings, whose blast radius serving
  cannot prove) and note the promote-by-code path if that ever changes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
											
										
										
											2026-06-19 03:34:15 +03:00
+								    let blob = fs::read_dir(&blob_dir)
 								        .unwrap()
 								        .next()
 								        .unwrap()
 								        .unwrap()
 								        .path();
-												refactor(server): split the test monolith into area suites

tests/server.rs (6,517 lines, 110 tests) becomes seven area files —
auth_policy, data_routes, schema_routes, stored_queries, multi_graph,
boot_settings, s3 — with shared helpers in tests/support/mod.rs. Verbatim
moves + visibility bumps (pub on helpers, pub(super)->pub inside the
matrix harness); cargo fix stripped the per-file unused imports. All 110
tests pass in their new homes (289 across the crate including lib and
openapi).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>

											
										
										
											2026-06-11 15:03:51 +03:00
+								    fs::write(&blob, "tampered").unwrap();
 								    let err = cluster_settings(&dir).await.unwrap_err();
 								    assert!(
 								        err.to_string().contains("catalog_payload_digest_mismatch"),
 								        "{err}"
 								    );
 								    assert!(err.to_string().contains("cluster refresh"), "{err}");
 								    // Missing state refuses with the import/apply remedy.
 								    let empty = tempfile::tempdir().unwrap();
 								    let err = cluster_settings(empty.path()).await.unwrap_err();
 								    assert!(err.to_string().contains("cluster_state_missing"), "{err}");
 								}