Merge pull request #180 from ModernRelay/feat/cluster-local-config

feat(cli): per-operator actor for cluster ops; pin omnigraph.yaml isolation
This commit is contained in:
Andrew Altshuler 2026-06-10 23:57:31 +03:00 committed by GitHub
commit 2b5fb7197e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 320 additions and 29 deletions

View file

@ -6,7 +6,7 @@ use std::path::PathBuf;
use std::sync::Arc;
use clap::{Arg, ArgAction, Args, CommandFactory, FromArgMatches, Parser, Subcommand, ValueEnum};
use color_eyre::eyre::{Result, bail};
use color_eyre::eyre::{Result, WrapErr, bail};
use omnigraph::db::{Omnigraph, ReadTarget, SnapshotId};
use omnigraph::loader::LoadMode;
use omnigraph::storage::normalize_root_uri;
@ -1251,6 +1251,25 @@ async fn open_local_db_with_policy(graph: &ResolvedCliGraph) -> Result<Omnigraph
}
}
/// Actor resolution for cluster operations. Cluster FACTS stay unlayered
/// (cluster.yaml only), but the operator's identity is a per-operator fact —
/// the per-operator config's permanent job. An explicit --as never touches
/// any config (containers and CI stay config-free); without it, the standard
/// cwd omnigraph.yaml search supplies `cli.actor`, and a malformed config
/// fails loudly rather than silently dropping attribution. Deliberately
/// `load_config`, NOT `load_cli_config`: the latter also loads
/// `auth.env_file` into the process env — a second thing, violating the
/// documented "exactly one thing" contract.
fn resolve_cluster_actor(cli_as: Option<&str>) -> Result<Option<String>> {
if let Some(actor) = cli_as {
return Ok(Some(actor.to_string()));
}
let config = load_config(None).wrap_err(
"resolving the default actor from the per-operator omnigraph.yaml (pass --as <ACTOR> to skip this lookup)",
)?;
Ok(config.cli.actor.clone())
}
/// Resolve the CLI's effective actor identity for engine-layer policy
/// (MR-722). Precedence: `--as <ACTOR>` (top-level flag) overrides
/// `cli.actor` from `omnigraph.yaml`; both unset returns `None`. When
@ -3610,16 +3629,12 @@ async fn main() -> Result<()> {
finish_cluster_plan(&output, json)?;
}
ClusterCommand::Apply { config, json } => {
// The global --as actor attributes graph-moving operations
// (sidecars, audit entries, engine schema-apply commits).
// Cluster config stays unlayered: no omnigraph.yaml fallback.
let output = apply_config_dir_with_options(
config,
ApplyOptions {
actor: cli.as_actor.clone(),
},
)
.await;
// The actor attributes graph-moving operations (sidecars,
// audit entries, engine schema-apply commits). Cluster FACTS
// stay unlayered; the operator's identity resolves --as flag
// first, then the per-operator omnigraph.yaml `cli.actor`.
let actor = resolve_cluster_actor(cli.as_actor.as_deref())?;
let output = apply_config_dir_with_options(config, ApplyOptions { actor }).await;
finish_cluster_apply(&output, json)?;
}
ClusterCommand::Approve {
@ -3627,12 +3642,12 @@ async fn main() -> Result<()> {
config,
json,
} => {
let Some(approver) = cli.as_actor.as_deref() else {
let Some(approver) = resolve_cluster_actor(cli.as_actor.as_deref())? else {
bail!(
"`cluster approve` requires the global --as <ACTOR> flag: an approval without an approver is meaningless"
"`cluster approve` requires an approver: pass the global --as <ACTOR> flag or set `cli.actor` in your omnigraph.yaml — an approval without an approver is meaningless"
);
};
let output = approve_config_dir(config, &resource, approver).await;
let output = approve_config_dir(config, &resource, &approver).await;
finish_cluster_approve(&output, json)?;
}
ClusterCommand::Status { config, json } => {

View file

@ -4325,3 +4325,212 @@ fn queries_validate_positional_uri_ignores_default_graph() {
"positional URI must validate the top-level registry, not the cli.graph default; stdout:\n{stdout}"
);
}
// ---- per-operator local config (omnigraph.yaml) vs the cluster surfaces ----
/// Cluster ops resolve operator identity per-operator: --as wins, and
/// without it the cwd omnigraph.yaml's `cli.actor` is the default.
#[test]
fn cluster_apply_uses_cli_actor_from_local_config() {
let temp = tempdir().unwrap();
write_cluster_config_fixture(temp.path());
fs::write(
temp.path().join("omnigraph.yaml"),
"cli:\n actor: act-local\n",
)
.unwrap();
// Phase 1: import once (setup, not under test).
let output = cli()
.current_dir(temp.path())
.arg("cluster")
.arg("import")
.arg("--config")
.arg(temp.path())
.output()
.unwrap();
assert!(output.status.success(), "{output:?}");
// Phase 2: apply alone, capturing the echoed actor (idempotent re-runs).
let apply = |extra: &[&str]| {
let mut command = cli();
command.current_dir(temp.path());
for arg in extra {
command.arg(arg);
}
let output = command
.arg("cluster")
.arg("apply")
.arg("--config")
.arg(temp.path())
.arg("--json")
.output()
.unwrap();
let json: serde_json::Value =
serde_json::from_str(String::from_utf8_lossy(&output.stdout).trim()).unwrap();
json["actor"].clone()
};
assert_eq!(apply(&[]), "act-local", "cli.actor is the no-flag default");
assert_eq!(apply(&["--as", "andrew"]), "andrew", "--as overrides cli.actor");
}
#[test]
fn cluster_approve_uses_cli_actor_fallback() {
let temp = tempdir().unwrap();
write_cluster_config_fixture(temp.path());
fs::write(
temp.path().join("omnigraph.yaml"),
"cli:\n actor: act-local\n",
)
.unwrap();
// Converge, then remove the graph so a gated delete is pending.
for command in ["import", "apply"] {
let output = cli()
.current_dir(temp.path())
.arg("cluster")
.arg(command)
.arg("--config")
.arg(temp.path())
.output()
.unwrap();
assert!(output.status.success(), "cluster {command} failed");
}
fs::write(temp.path().join("cluster.yaml"), "version: 1\ngraphs: {}\n").unwrap();
let output = cli()
.current_dir(temp.path())
.arg("cluster")
.arg("approve")
.arg("graph.knowledge")
.arg("--config")
.arg(temp.path())
.arg("--json")
.output()
.unwrap();
assert!(output.status.success(), "{output:?}");
let json: serde_json::Value =
serde_json::from_str(String::from_utf8_lossy(&output.stdout).trim()).unwrap();
assert_eq!(json["approved_by"], "act-local");
// With neither flag nor config: refused with the actionable message.
let bare = tempdir().unwrap();
write_cluster_config_fixture(bare.path());
let output = output_failure(
cli()
.current_dir(bare.path())
.arg("cluster")
.arg("approve")
.arg("graph.knowledge")
.arg("--config")
.arg(bare.path()),
);
let stderr = String::from_utf8_lossy(&output.stderr);
assert!(stderr.contains("--as"), "{stderr}");
assert!(stderr.contains("cli.actor"), "{stderr}");
}
/// A malformed omnigraph.yaml in the cwd must never break cluster commands;
/// it is read for exactly one thing (the actor default when --as is absent),
/// and only that path fails loudly.
#[test]
fn cluster_commands_ignore_malformed_local_config() {
let temp = tempdir().unwrap();
write_cluster_config_fixture(temp.path());
fs::write(temp.path().join("omnigraph.yaml"), "{{{{ not yaml").unwrap();
for command in ["validate", "plan", "status"] {
let output = cli()
.current_dir(temp.path())
.arg("cluster")
.arg(command)
.arg("--config")
.arg(temp.path())
.arg("--json")
.output()
.unwrap();
assert!(
output.status.success() || command == "plan", // plan warns state-missing pre-import; still must not config-error
"cluster {command} affected by malformed omnigraph.yaml: {output:?}"
);
assert!(
!String::from_utf8_lossy(&output.stderr).contains("omnigraph.yaml"),
"cluster {command} touched omnigraph.yaml"
);
}
// import + apply with an explicit --as: the config is never loaded.
for (command, args) in [("import", vec![]), ("apply", vec!["--as", "andrew"])] {
let mut invocation = cli();
invocation.current_dir(temp.path());
for arg in &args {
invocation.arg(arg);
}
let output = invocation
.arg("cluster")
.arg(command)
.arg("--config")
.arg(temp.path())
.output()
.unwrap();
assert!(
output.status.success(),
"cluster {command} affected by malformed omnigraph.yaml: {}",
String::from_utf8_lossy(&output.stderr)
);
}
// Only the no-flag actor lookup is allowed to fail, and loudly.
let output = output_failure(
cli()
.current_dir(temp.path())
.arg("cluster")
.arg("apply")
.arg("--config")
.arg(temp.path()),
);
let stderr = String::from_utf8_lossy(&output.stderr);
assert!(
stderr.contains("omnigraph.yaml") && stderr.contains("--as"),
"the actor-default config read must fail loudly and actionably: {stderr}"
);
}
/// A well-formed omnigraph.yaml with a CONFLICTING world view (different
/// graphs, server bind) leaks nothing into cluster outputs.
#[test]
fn cluster_commands_ignore_conflicting_local_config() {
let baseline = tempdir().unwrap();
write_cluster_config_fixture(baseline.path());
let with_config = tempdir().unwrap();
write_cluster_config_fixture(with_config.path());
fs::write(
with_config.path().join("omnigraph.yaml"),
r#"
server:
bind: 0.0.0.0:9999
graphs:
phantom:
uri: ./phantom.omni
"#,
)
.unwrap();
let validate = |dir: &std::path::Path| {
let output = cli()
.current_dir(dir)
.arg("cluster")
.arg("validate")
.arg("--config")
.arg(dir)
.arg("--json")
.output()
.unwrap();
assert!(output.status.success(), "{output:?}");
serde_json::from_str::<serde_json::Value>(String::from_utf8_lossy(&output.stdout).trim())
.unwrap()
};
let (a, b) = (validate(baseline.path()), validate(with_config.path()));
// Compare the path-free invariants (paths embed each tempdir).
for key in ["ok", "diagnostics", "resource_digests", "dependencies"] {
assert_eq!(a[key], b[key], "conflicting omnigraph.yaml leaked into cluster validate ({key})");
}
let leaked = b.to_string();
assert!(!leaked.contains("phantom") && !leaked.contains("9999"), "{leaked}");
}

View file

@ -218,6 +218,18 @@ pub fn spawn_server_with_cluster(cluster_dir: &Path) -> TestServer {
spawn_server_process(command)
}
/// Cluster boot with the server process's cwd set explicitly — used to prove
/// rule 0 never touches the cwd omnigraph.yaml search.
pub fn spawn_server_with_cluster_in(cluster_dir: &Path, cwd: &Path) -> TestServer {
let mut command = server_process();
command
.arg("--cluster")
.arg(cluster_dir)
.arg("--unauthenticated")
.current_dir(cwd);
spawn_server_process(command)
}
pub fn spawn_server_with_cluster_env(cluster_dir: &Path, envs: &[(&str, &str)]) -> TestServer {
let mut command = server_process();
command.arg("--cluster").arg(cluster_dir);

View file

@ -595,8 +595,12 @@ policy: {{}}
),
);
// current_dir matters: `init` scaffolds an omnigraph.yaml into its cwd,
// and without this it pollutes the crate dir, breaking unrelated tests
// (anything resolving a graph target from the cwd config).
output_success(
cli()
.current_dir(query_root)
.arg("init")
.arg("--schema")
.arg(fixture("test.pg"))
@ -604,6 +608,7 @@ policy: {{}}
);
output_success(
cli()
.current_dir(query_root)
.arg("load")
.arg("--data")
.arg(fixture("test.jsonl"))
@ -2152,3 +2157,37 @@ policies:
// unknown query — the server's anti-probing contract.
assert_eq!(invoke("admin-token").status().as_u16(), 404);
}
/// Rule 0 (axiom 15): a --cluster server never reads omnigraph.yaml — not
/// even the implicit cwd search. A MALFORMED config in the process cwd must
/// not affect boot or serving.
#[test]
fn cluster_server_boot_ignores_local_config_in_cwd() {
let cluster = tempfile::tempdir().unwrap();
std::fs::write(
cluster.path().join("people.pg"),
"\nnode Person {\n name: String @key\n}\n",
)
.unwrap();
std::fs::write(
cluster.path().join("cluster.yaml"),
"version: 1\ngraphs:\n knowledge:\n schema: ./people.pg\n",
)
.unwrap();
for command in ["import", "apply"] {
let output = cli()
.arg("cluster")
.arg(command)
.arg("--config")
.arg(cluster.path())
.output()
.unwrap();
assert!(output.status.success(), "cluster {command} failed");
}
let cwd = tempfile::tempdir().unwrap();
std::fs::write(cwd.path().join("omnigraph.yaml"), "{{{{ not yaml").unwrap();
let server = spawn_server_with_cluster_in(cluster.path(), cwd.path());
let response = reqwest::blocking::get(format!("{}/healthz", server.base_url)).unwrap();
assert!(response.status().is_success());
}

View file

@ -19,7 +19,7 @@ Top-level command families and subcommands. Graph-targeting commands accept eith
| `commit list \| show` | inspect commit graph |
| `schema plan \| apply \| show (alias: get)` | migrations |
| `lint` (alias: `check`) | offline / graph-backed query validation. Replaces `query lint` / `query check`, which are kept as deprecated argv-level shims that print a one-line warning and rewrite to `omnigraph lint` |
| `cluster validate \| plan \| apply \| approve \| status \| refresh \| import \| force-unlock` | declarative cluster control plane. `validate` checks a local `cluster.yaml` folder and referenced schema/query/policy files; `plan` diffs it against local JSON state at `__cluster/state.json`, annotates dispositions, and embeds real schema-migration previews; `apply` converges the cluster — stored-query/policy catalog writes (content-addressed under `__cluster/resources/`), graph creates, schema updates (soft drops only; `--as` records the actor), and graph deletes behind a digest-bound approval from `cluster approve <resource> --as <actor>`; what apply converges is what an `omnigraph-server --cluster <dir>` deployment serves on its next restart (omnigraph.yaml deployments are unaffected); `status` reads the state ledger; `refresh`/`import` explicitly update local JSON state from read-only graph observations; `force-unlock <LOCK_ID>` manually removes a held local state lock by exact id |
| `cluster validate \| plan \| apply \| approve \| status \| refresh \| import \| force-unlock` | declarative cluster control plane. `validate` checks a local `cluster.yaml` folder and referenced schema/query/policy files; `plan` diffs it against local JSON state at `__cluster/state.json`, annotates dispositions, and embeds real schema-migration previews; `apply` converges the cluster — stored-query/policy catalog writes (content-addressed under `__cluster/resources/`), graph creates, schema updates (soft drops only; `--as` records the actor), and graph deletes behind a digest-bound approval from `cluster approve <resource> --as <actor>` (`apply`/`approve` default the actor from the per-operator `omnigraph.yaml`'s `cli.actor` when `--as` is omitted; nothing else in that file affects cluster commands); what apply converges is what an `omnigraph-server --cluster <dir>` deployment serves on its next restart (omnigraph.yaml deployments are unaffected); `status` reads the state ledger; `refresh`/`import` explicitly update local JSON state from read-only graph observations; `force-unlock <LOCK_ID>` manually removes a held local state lock by exact id |
| `optimize` | non-destructive Lance compaction (skips tables with `Blob` columns or uncovered drift; `--json` reports `skipped`) |
| `repair [--confirm] [--force]` | preview or explicitly publish uncovered manifest/head drift. `--confirm` heals verified maintenance drift and exits non-zero if suspicious/unverifiable drift is refused; `--force --confirm` publishes suspicious/unverifiable drift after operator review |
| `cleanup --keep N --older-than 7d --confirm` | destructive version GC |

View file

@ -36,15 +36,26 @@ omnigraph cluster force-unlock <LOCK_ID> --config ./company-brain --json
## Relationship to `omnigraph.yaml`
`cluster.yaml` does not replace `omnigraph.yaml`, and the two never describe
the same fact. `omnigraph.yaml` remains how the CLI and server are configured
today (graph targets, server bind, CLI defaults, credential env references) and
is its long-term home for per-operator settings. `cluster.yaml` is the shared
desired state of a whole deployment, read only by the `cluster` commands via
`--config`. In the current stage, nothing recorded in the cluster state ledger
affects what a server serves or what other CLI commands target — the cluster
catalog is inspectable, not live. When server boot from cluster state ships in
a later stage, it will be an explicit per-deployment mode switch, not a merge
of the two files.
the same fact. `omnigraph.yaml` is the permanent **per-operator** layer (CLI
defaults, the operator's identity and credential references, graph targets
for data-plane commands); `cluster.yaml` is the shared desired state of a
whole deployment, read only by the `cluster` commands via `--config`.
The exact contract:
- **Cluster commands read `omnigraph.yaml` for exactly one thing**: the
`cli.actor` default used by `apply`/`approve` when `--as` is omitted —
operator identity is a per-operator fact. With `--as` present, no config
is read at all. Nothing else (its graph set, targets, bind, queries,
policies) ever influences a cluster command; a malformed `omnigraph.yaml`
breaks only the no-flag actor lookup, loudly.
- **A `--cluster` server reads `omnigraph.yaml` for nothing** — not even the
implicit current-directory search runs (mode-inference rule 0). Boot from
cluster state XOR `omnigraph.yaml`, never a merge.
- **The other direction is ergonomics, not coupling**: a per-operator
`omnigraph.yaml` may point `graphs.<name>.uri` at a cluster's derived root
(`./company-brain/graphs/knowledge.omni`) so data-plane commands can use
`--target <name>` — an ordinary local path, no special handling.
## Supported `cluster.yaml`

View file

@ -109,8 +109,10 @@ omnigraph cluster apply --config ./company-brain --as andrew
```
`--as <actor>` attributes the run: it is recorded in recovery sidecars and
audit entries and threaded into the engine's commit history. Make it a habit
on every apply (it is required for `approve`).
audit entries and threaded into the engine's commit history. Set
`cli: { actor: <you> }` in your per-operator `omnigraph.yaml` to make it the
default when `--as` is omitted (the flag always wins; `approve` requires one
of the two).
What each change kind does:
@ -234,9 +236,12 @@ with an in-flight apply.
- **CI-driven convergence**: `validate` and `plan --json` are read-only and
safe in pipelines; gate `apply --as ci` on plan review. Approvals are the
human step by design — keep `cluster approve` out of automation.
- **`omnigraph.yaml` still has a job**: per-operator settings (CLI defaults,
credentials, active context). It just no longer describes the deployment —
a server boots from one source or the other, never a merge of both.
- **`omnigraph.yaml` still has a job**: per-operator settings — your
`cli.actor` default for `--as`, CLI defaults, credentials, and data-plane
ergonomics (point `graphs.<name>.uri` at a derived root like
`./company-brain/graphs/knowledge.omni` to use `--target <name>` for
loads). It just no longer describes the deployment — a server boots from
one source or the other, never a merge of both.
## What the control plane does not do (yet)