mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-06-12 01:45:14 +02:00
Merge pull request #180 from ModernRelay/feat/cluster-local-config
feat(cli): per-operator actor for cluster ops; pin omnigraph.yaml isolation
This commit is contained in:
commit
2b5fb7197e
7 changed files with 320 additions and 29 deletions
|
|
@ -6,7 +6,7 @@ use std::path::PathBuf;
|
|||
use std::sync::Arc;
|
||||
|
||||
use clap::{Arg, ArgAction, Args, CommandFactory, FromArgMatches, Parser, Subcommand, ValueEnum};
|
||||
use color_eyre::eyre::{Result, bail};
|
||||
use color_eyre::eyre::{Result, WrapErr, bail};
|
||||
use omnigraph::db::{Omnigraph, ReadTarget, SnapshotId};
|
||||
use omnigraph::loader::LoadMode;
|
||||
use omnigraph::storage::normalize_root_uri;
|
||||
|
|
@ -1251,6 +1251,25 @@ async fn open_local_db_with_policy(graph: &ResolvedCliGraph) -> Result<Omnigraph
|
|||
}
|
||||
}
|
||||
|
||||
/// Actor resolution for cluster operations. Cluster FACTS stay unlayered
|
||||
/// (cluster.yaml only), but the operator's identity is a per-operator fact —
|
||||
/// the per-operator config's permanent job. An explicit --as never touches
|
||||
/// any config (containers and CI stay config-free); without it, the standard
|
||||
/// cwd omnigraph.yaml search supplies `cli.actor`, and a malformed config
|
||||
/// fails loudly rather than silently dropping attribution. Deliberately
|
||||
/// `load_config`, NOT `load_cli_config`: the latter also loads
|
||||
/// `auth.env_file` into the process env — a second thing, violating the
|
||||
/// documented "exactly one thing" contract.
|
||||
fn resolve_cluster_actor(cli_as: Option<&str>) -> Result<Option<String>> {
|
||||
if let Some(actor) = cli_as {
|
||||
return Ok(Some(actor.to_string()));
|
||||
}
|
||||
let config = load_config(None).wrap_err(
|
||||
"resolving the default actor from the per-operator omnigraph.yaml (pass --as <ACTOR> to skip this lookup)",
|
||||
)?;
|
||||
Ok(config.cli.actor.clone())
|
||||
}
|
||||
|
||||
/// Resolve the CLI's effective actor identity for engine-layer policy
|
||||
/// (MR-722). Precedence: `--as <ACTOR>` (top-level flag) overrides
|
||||
/// `cli.actor` from `omnigraph.yaml`; both unset returns `None`. When
|
||||
|
|
@ -3610,16 +3629,12 @@ async fn main() -> Result<()> {
|
|||
finish_cluster_plan(&output, json)?;
|
||||
}
|
||||
ClusterCommand::Apply { config, json } => {
|
||||
// The global --as actor attributes graph-moving operations
|
||||
// (sidecars, audit entries, engine schema-apply commits).
|
||||
// Cluster config stays unlayered: no omnigraph.yaml fallback.
|
||||
let output = apply_config_dir_with_options(
|
||||
config,
|
||||
ApplyOptions {
|
||||
actor: cli.as_actor.clone(),
|
||||
},
|
||||
)
|
||||
.await;
|
||||
// The actor attributes graph-moving operations (sidecars,
|
||||
// audit entries, engine schema-apply commits). Cluster FACTS
|
||||
// stay unlayered; the operator's identity resolves --as flag
|
||||
// first, then the per-operator omnigraph.yaml `cli.actor`.
|
||||
let actor = resolve_cluster_actor(cli.as_actor.as_deref())?;
|
||||
let output = apply_config_dir_with_options(config, ApplyOptions { actor }).await;
|
||||
finish_cluster_apply(&output, json)?;
|
||||
}
|
||||
ClusterCommand::Approve {
|
||||
|
|
@ -3627,12 +3642,12 @@ async fn main() -> Result<()> {
|
|||
config,
|
||||
json,
|
||||
} => {
|
||||
let Some(approver) = cli.as_actor.as_deref() else {
|
||||
let Some(approver) = resolve_cluster_actor(cli.as_actor.as_deref())? else {
|
||||
bail!(
|
||||
"`cluster approve` requires the global --as <ACTOR> flag: an approval without an approver is meaningless"
|
||||
"`cluster approve` requires an approver: pass the global --as <ACTOR> flag or set `cli.actor` in your omnigraph.yaml — an approval without an approver is meaningless"
|
||||
);
|
||||
};
|
||||
let output = approve_config_dir(config, &resource, approver).await;
|
||||
let output = approve_config_dir(config, &resource, &approver).await;
|
||||
finish_cluster_approve(&output, json)?;
|
||||
}
|
||||
ClusterCommand::Status { config, json } => {
|
||||
|
|
|
|||
|
|
@ -4325,3 +4325,212 @@ fn queries_validate_positional_uri_ignores_default_graph() {
|
|||
"positional URI must validate the top-level registry, not the cli.graph default; stdout:\n{stdout}"
|
||||
);
|
||||
}
|
||||
|
||||
// ---- per-operator local config (omnigraph.yaml) vs the cluster surfaces ----
|
||||
|
||||
/// Cluster ops resolve operator identity per-operator: --as wins, and
|
||||
/// without it the cwd omnigraph.yaml's `cli.actor` is the default.
|
||||
#[test]
|
||||
fn cluster_apply_uses_cli_actor_from_local_config() {
|
||||
let temp = tempdir().unwrap();
|
||||
write_cluster_config_fixture(temp.path());
|
||||
fs::write(
|
||||
temp.path().join("omnigraph.yaml"),
|
||||
"cli:\n actor: act-local\n",
|
||||
)
|
||||
.unwrap();
|
||||
// Phase 1: import once (setup, not under test).
|
||||
let output = cli()
|
||||
.current_dir(temp.path())
|
||||
.arg("cluster")
|
||||
.arg("import")
|
||||
.arg("--config")
|
||||
.arg(temp.path())
|
||||
.output()
|
||||
.unwrap();
|
||||
assert!(output.status.success(), "{output:?}");
|
||||
|
||||
// Phase 2: apply alone, capturing the echoed actor (idempotent re-runs).
|
||||
let apply = |extra: &[&str]| {
|
||||
let mut command = cli();
|
||||
command.current_dir(temp.path());
|
||||
for arg in extra {
|
||||
command.arg(arg);
|
||||
}
|
||||
let output = command
|
||||
.arg("cluster")
|
||||
.arg("apply")
|
||||
.arg("--config")
|
||||
.arg(temp.path())
|
||||
.arg("--json")
|
||||
.output()
|
||||
.unwrap();
|
||||
let json: serde_json::Value =
|
||||
serde_json::from_str(String::from_utf8_lossy(&output.stdout).trim()).unwrap();
|
||||
json["actor"].clone()
|
||||
};
|
||||
assert_eq!(apply(&[]), "act-local", "cli.actor is the no-flag default");
|
||||
assert_eq!(apply(&["--as", "andrew"]), "andrew", "--as overrides cli.actor");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cluster_approve_uses_cli_actor_fallback() {
|
||||
let temp = tempdir().unwrap();
|
||||
write_cluster_config_fixture(temp.path());
|
||||
fs::write(
|
||||
temp.path().join("omnigraph.yaml"),
|
||||
"cli:\n actor: act-local\n",
|
||||
)
|
||||
.unwrap();
|
||||
// Converge, then remove the graph so a gated delete is pending.
|
||||
for command in ["import", "apply"] {
|
||||
let output = cli()
|
||||
.current_dir(temp.path())
|
||||
.arg("cluster")
|
||||
.arg(command)
|
||||
.arg("--config")
|
||||
.arg(temp.path())
|
||||
.output()
|
||||
.unwrap();
|
||||
assert!(output.status.success(), "cluster {command} failed");
|
||||
}
|
||||
fs::write(temp.path().join("cluster.yaml"), "version: 1\ngraphs: {}\n").unwrap();
|
||||
|
||||
let output = cli()
|
||||
.current_dir(temp.path())
|
||||
.arg("cluster")
|
||||
.arg("approve")
|
||||
.arg("graph.knowledge")
|
||||
.arg("--config")
|
||||
.arg(temp.path())
|
||||
.arg("--json")
|
||||
.output()
|
||||
.unwrap();
|
||||
assert!(output.status.success(), "{output:?}");
|
||||
let json: serde_json::Value =
|
||||
serde_json::from_str(String::from_utf8_lossy(&output.stdout).trim()).unwrap();
|
||||
assert_eq!(json["approved_by"], "act-local");
|
||||
|
||||
// With neither flag nor config: refused with the actionable message.
|
||||
let bare = tempdir().unwrap();
|
||||
write_cluster_config_fixture(bare.path());
|
||||
let output = output_failure(
|
||||
cli()
|
||||
.current_dir(bare.path())
|
||||
.arg("cluster")
|
||||
.arg("approve")
|
||||
.arg("graph.knowledge")
|
||||
.arg("--config")
|
||||
.arg(bare.path()),
|
||||
);
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
assert!(stderr.contains("--as"), "{stderr}");
|
||||
assert!(stderr.contains("cli.actor"), "{stderr}");
|
||||
}
|
||||
|
||||
/// A malformed omnigraph.yaml in the cwd must never break cluster commands;
|
||||
/// it is read for exactly one thing (the actor default when --as is absent),
|
||||
/// and only that path fails loudly.
|
||||
#[test]
|
||||
fn cluster_commands_ignore_malformed_local_config() {
|
||||
let temp = tempdir().unwrap();
|
||||
write_cluster_config_fixture(temp.path());
|
||||
fs::write(temp.path().join("omnigraph.yaml"), "{{{{ not yaml").unwrap();
|
||||
|
||||
for command in ["validate", "plan", "status"] {
|
||||
let output = cli()
|
||||
.current_dir(temp.path())
|
||||
.arg("cluster")
|
||||
.arg(command)
|
||||
.arg("--config")
|
||||
.arg(temp.path())
|
||||
.arg("--json")
|
||||
.output()
|
||||
.unwrap();
|
||||
assert!(
|
||||
output.status.success() || command == "plan", // plan warns state-missing pre-import; still must not config-error
|
||||
"cluster {command} affected by malformed omnigraph.yaml: {output:?}"
|
||||
);
|
||||
assert!(
|
||||
!String::from_utf8_lossy(&output.stderr).contains("omnigraph.yaml"),
|
||||
"cluster {command} touched omnigraph.yaml"
|
||||
);
|
||||
}
|
||||
// import + apply with an explicit --as: the config is never loaded.
|
||||
for (command, args) in [("import", vec![]), ("apply", vec!["--as", "andrew"])] {
|
||||
let mut invocation = cli();
|
||||
invocation.current_dir(temp.path());
|
||||
for arg in &args {
|
||||
invocation.arg(arg);
|
||||
}
|
||||
let output = invocation
|
||||
.arg("cluster")
|
||||
.arg(command)
|
||||
.arg("--config")
|
||||
.arg(temp.path())
|
||||
.output()
|
||||
.unwrap();
|
||||
assert!(
|
||||
output.status.success(),
|
||||
"cluster {command} affected by malformed omnigraph.yaml: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
}
|
||||
// Only the no-flag actor lookup is allowed to fail, and loudly.
|
||||
let output = output_failure(
|
||||
cli()
|
||||
.current_dir(temp.path())
|
||||
.arg("cluster")
|
||||
.arg("apply")
|
||||
.arg("--config")
|
||||
.arg(temp.path()),
|
||||
);
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
assert!(
|
||||
stderr.contains("omnigraph.yaml") && stderr.contains("--as"),
|
||||
"the actor-default config read must fail loudly and actionably: {stderr}"
|
||||
);
|
||||
}
|
||||
|
||||
/// A well-formed omnigraph.yaml with a CONFLICTING world view (different
|
||||
/// graphs, server bind) leaks nothing into cluster outputs.
|
||||
#[test]
|
||||
fn cluster_commands_ignore_conflicting_local_config() {
|
||||
let baseline = tempdir().unwrap();
|
||||
write_cluster_config_fixture(baseline.path());
|
||||
let with_config = tempdir().unwrap();
|
||||
write_cluster_config_fixture(with_config.path());
|
||||
fs::write(
|
||||
with_config.path().join("omnigraph.yaml"),
|
||||
r#"
|
||||
server:
|
||||
bind: 0.0.0.0:9999
|
||||
graphs:
|
||||
phantom:
|
||||
uri: ./phantom.omni
|
||||
"#,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let validate = |dir: &std::path::Path| {
|
||||
let output = cli()
|
||||
.current_dir(dir)
|
||||
.arg("cluster")
|
||||
.arg("validate")
|
||||
.arg("--config")
|
||||
.arg(dir)
|
||||
.arg("--json")
|
||||
.output()
|
||||
.unwrap();
|
||||
assert!(output.status.success(), "{output:?}");
|
||||
serde_json::from_str::<serde_json::Value>(String::from_utf8_lossy(&output.stdout).trim())
|
||||
.unwrap()
|
||||
};
|
||||
let (a, b) = (validate(baseline.path()), validate(with_config.path()));
|
||||
// Compare the path-free invariants (paths embed each tempdir).
|
||||
for key in ["ok", "diagnostics", "resource_digests", "dependencies"] {
|
||||
assert_eq!(a[key], b[key], "conflicting omnigraph.yaml leaked into cluster validate ({key})");
|
||||
}
|
||||
let leaked = b.to_string();
|
||||
assert!(!leaked.contains("phantom") && !leaked.contains("9999"), "{leaked}");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -218,6 +218,18 @@ pub fn spawn_server_with_cluster(cluster_dir: &Path) -> TestServer {
|
|||
spawn_server_process(command)
|
||||
}
|
||||
|
||||
/// Cluster boot with the server process's cwd set explicitly — used to prove
|
||||
/// rule 0 never touches the cwd omnigraph.yaml search.
|
||||
pub fn spawn_server_with_cluster_in(cluster_dir: &Path, cwd: &Path) -> TestServer {
|
||||
let mut command = server_process();
|
||||
command
|
||||
.arg("--cluster")
|
||||
.arg(cluster_dir)
|
||||
.arg("--unauthenticated")
|
||||
.current_dir(cwd);
|
||||
spawn_server_process(command)
|
||||
}
|
||||
|
||||
pub fn spawn_server_with_cluster_env(cluster_dir: &Path, envs: &[(&str, &str)]) -> TestServer {
|
||||
let mut command = server_process();
|
||||
command.arg("--cluster").arg(cluster_dir);
|
||||
|
|
|
|||
|
|
@ -595,8 +595,12 @@ policy: {{}}
|
|||
),
|
||||
);
|
||||
|
||||
// current_dir matters: `init` scaffolds an omnigraph.yaml into its cwd,
|
||||
// and without this it pollutes the crate dir, breaking unrelated tests
|
||||
// (anything resolving a graph target from the cwd config).
|
||||
output_success(
|
||||
cli()
|
||||
.current_dir(query_root)
|
||||
.arg("init")
|
||||
.arg("--schema")
|
||||
.arg(fixture("test.pg"))
|
||||
|
|
@ -604,6 +608,7 @@ policy: {{}}
|
|||
);
|
||||
output_success(
|
||||
cli()
|
||||
.current_dir(query_root)
|
||||
.arg("load")
|
||||
.arg("--data")
|
||||
.arg(fixture("test.jsonl"))
|
||||
|
|
@ -2152,3 +2157,37 @@ policies:
|
|||
// unknown query — the server's anti-probing contract.
|
||||
assert_eq!(invoke("admin-token").status().as_u16(), 404);
|
||||
}
|
||||
|
||||
/// Rule 0 (axiom 15): a --cluster server never reads omnigraph.yaml — not
|
||||
/// even the implicit cwd search. A MALFORMED config in the process cwd must
|
||||
/// not affect boot or serving.
|
||||
#[test]
|
||||
fn cluster_server_boot_ignores_local_config_in_cwd() {
|
||||
let cluster = tempfile::tempdir().unwrap();
|
||||
std::fs::write(
|
||||
cluster.path().join("people.pg"),
|
||||
"\nnode Person {\n name: String @key\n}\n",
|
||||
)
|
||||
.unwrap();
|
||||
std::fs::write(
|
||||
cluster.path().join("cluster.yaml"),
|
||||
"version: 1\ngraphs:\n knowledge:\n schema: ./people.pg\n",
|
||||
)
|
||||
.unwrap();
|
||||
for command in ["import", "apply"] {
|
||||
let output = cli()
|
||||
.arg("cluster")
|
||||
.arg(command)
|
||||
.arg("--config")
|
||||
.arg(cluster.path())
|
||||
.output()
|
||||
.unwrap();
|
||||
assert!(output.status.success(), "cluster {command} failed");
|
||||
}
|
||||
let cwd = tempfile::tempdir().unwrap();
|
||||
std::fs::write(cwd.path().join("omnigraph.yaml"), "{{{{ not yaml").unwrap();
|
||||
|
||||
let server = spawn_server_with_cluster_in(cluster.path(), cwd.path());
|
||||
let response = reqwest::blocking::get(format!("{}/healthz", server.base_url)).unwrap();
|
||||
assert!(response.status().is_success());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ Top-level command families and subcommands. Graph-targeting commands accept eith
|
|||
| `commit list \| show` | inspect commit graph |
|
||||
| `schema plan \| apply \| show (alias: get)` | migrations |
|
||||
| `lint` (alias: `check`) | offline / graph-backed query validation. Replaces `query lint` / `query check`, which are kept as deprecated argv-level shims that print a one-line warning and rewrite to `omnigraph lint` |
|
||||
| `cluster validate \| plan \| apply \| approve \| status \| refresh \| import \| force-unlock` | declarative cluster control plane. `validate` checks a local `cluster.yaml` folder and referenced schema/query/policy files; `plan` diffs it against local JSON state at `__cluster/state.json`, annotates dispositions, and embeds real schema-migration previews; `apply` converges the cluster — stored-query/policy catalog writes (content-addressed under `__cluster/resources/`), graph creates, schema updates (soft drops only; `--as` records the actor), and graph deletes behind a digest-bound approval from `cluster approve <resource> --as <actor>`; what apply converges is what an `omnigraph-server --cluster <dir>` deployment serves on its next restart (omnigraph.yaml deployments are unaffected); `status` reads the state ledger; `refresh`/`import` explicitly update local JSON state from read-only graph observations; `force-unlock <LOCK_ID>` manually removes a held local state lock by exact id |
|
||||
| `cluster validate \| plan \| apply \| approve \| status \| refresh \| import \| force-unlock` | declarative cluster control plane. `validate` checks a local `cluster.yaml` folder and referenced schema/query/policy files; `plan` diffs it against local JSON state at `__cluster/state.json`, annotates dispositions, and embeds real schema-migration previews; `apply` converges the cluster — stored-query/policy catalog writes (content-addressed under `__cluster/resources/`), graph creates, schema updates (soft drops only; `--as` records the actor), and graph deletes behind a digest-bound approval from `cluster approve <resource> --as <actor>` (`apply`/`approve` default the actor from the per-operator `omnigraph.yaml`'s `cli.actor` when `--as` is omitted; nothing else in that file affects cluster commands); what apply converges is what an `omnigraph-server --cluster <dir>` deployment serves on its next restart (omnigraph.yaml deployments are unaffected); `status` reads the state ledger; `refresh`/`import` explicitly update local JSON state from read-only graph observations; `force-unlock <LOCK_ID>` manually removes a held local state lock by exact id |
|
||||
| `optimize` | non-destructive Lance compaction (skips tables with `Blob` columns or uncovered drift; `--json` reports `skipped`) |
|
||||
| `repair [--confirm] [--force]` | preview or explicitly publish uncovered manifest/head drift. `--confirm` heals verified maintenance drift and exits non-zero if suspicious/unverifiable drift is refused; `--force --confirm` publishes suspicious/unverifiable drift after operator review |
|
||||
| `cleanup --keep N --older-than 7d --confirm` | destructive version GC |
|
||||
|
|
|
|||
|
|
@ -36,15 +36,26 @@ omnigraph cluster force-unlock <LOCK_ID> --config ./company-brain --json
|
|||
## Relationship to `omnigraph.yaml`
|
||||
|
||||
`cluster.yaml` does not replace `omnigraph.yaml`, and the two never describe
|
||||
the same fact. `omnigraph.yaml` remains how the CLI and server are configured
|
||||
today (graph targets, server bind, CLI defaults, credential env references) and
|
||||
is its long-term home for per-operator settings. `cluster.yaml` is the shared
|
||||
desired state of a whole deployment, read only by the `cluster` commands via
|
||||
`--config`. In the current stage, nothing recorded in the cluster state ledger
|
||||
affects what a server serves or what other CLI commands target — the cluster
|
||||
catalog is inspectable, not live. When server boot from cluster state ships in
|
||||
a later stage, it will be an explicit per-deployment mode switch, not a merge
|
||||
of the two files.
|
||||
the same fact. `omnigraph.yaml` is the permanent **per-operator** layer (CLI
|
||||
defaults, the operator's identity and credential references, graph targets
|
||||
for data-plane commands); `cluster.yaml` is the shared desired state of a
|
||||
whole deployment, read only by the `cluster` commands via `--config`.
|
||||
|
||||
The exact contract:
|
||||
|
||||
- **Cluster commands read `omnigraph.yaml` for exactly one thing**: the
|
||||
`cli.actor` default used by `apply`/`approve` when `--as` is omitted —
|
||||
operator identity is a per-operator fact. With `--as` present, no config
|
||||
is read at all. Nothing else (its graph set, targets, bind, queries,
|
||||
policies) ever influences a cluster command; a malformed `omnigraph.yaml`
|
||||
breaks only the no-flag actor lookup, loudly.
|
||||
- **A `--cluster` server reads `omnigraph.yaml` for nothing** — not even the
|
||||
implicit current-directory search runs (mode-inference rule 0). Boot from
|
||||
cluster state XOR `omnigraph.yaml`, never a merge.
|
||||
- **The other direction is ergonomics, not coupling**: a per-operator
|
||||
`omnigraph.yaml` may point `graphs.<name>.uri` at a cluster's derived root
|
||||
(`./company-brain/graphs/knowledge.omni`) so data-plane commands can use
|
||||
`--target <name>` — an ordinary local path, no special handling.
|
||||
|
||||
## Supported `cluster.yaml`
|
||||
|
||||
|
|
|
|||
|
|
@ -109,8 +109,10 @@ omnigraph cluster apply --config ./company-brain --as andrew
|
|||
```
|
||||
|
||||
`--as <actor>` attributes the run: it is recorded in recovery sidecars and
|
||||
audit entries and threaded into the engine's commit history. Make it a habit
|
||||
on every apply (it is required for `approve`).
|
||||
audit entries and threaded into the engine's commit history. Set
|
||||
`cli: { actor: <you> }` in your per-operator `omnigraph.yaml` to make it the
|
||||
default when `--as` is omitted (the flag always wins; `approve` requires one
|
||||
of the two).
|
||||
|
||||
What each change kind does:
|
||||
|
||||
|
|
@ -234,9 +236,12 @@ with an in-flight apply.
|
|||
- **CI-driven convergence**: `validate` and `plan --json` are read-only and
|
||||
safe in pipelines; gate `apply --as ci` on plan review. Approvals are the
|
||||
human step by design — keep `cluster approve` out of automation.
|
||||
- **`omnigraph.yaml` still has a job**: per-operator settings (CLI defaults,
|
||||
credentials, active context). It just no longer describes the deployment —
|
||||
a server boots from one source or the other, never a merge of both.
|
||||
- **`omnigraph.yaml` still has a job**: per-operator settings — your
|
||||
`cli.actor` default for `--as`, CLI defaults, credentials, and data-plane
|
||||
ergonomics (point `graphs.<name>.uri` at a derived root like
|
||||
`./company-brain/graphs/knowledge.omni` to use `--target <name>` for
|
||||
loads). It just no longer describes the deployment — a server boots from
|
||||
one source or the other, never a merge of both.
|
||||
|
||||
## What the control plane does not do (yet)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue