diff --git a/AGENTS.md b/AGENTS.md index a9cc9c0..27d1b7b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,7 +17,7 @@ Tools that support `@`-imports (Claude Code) auto-include all three files via th `CLAUDE.md` is a symlink to this file — there is exactly one source of truth. Edit `AGENTS.md`. **Version surveyed:** 0.6.0 -**Workspace crates:** `omnigraph-compiler`, `omnigraph` (engine), `omnigraph-cli`, `omnigraph-server` +**Workspace crates:** `omnigraph-compiler`, `omnigraph` (engine), `omnigraph-policy`, `omnigraph-cli`, `omnigraph-server` **Storage substrate:** Lance 6.x (columnar, versioned, branchable) **License:** MIT **Toolchain:** Rust stable, edition 2024 @@ -33,7 +33,7 @@ OmniGraph is a typed property-graph engine built as a coordination layer over ma - **Multi-modal querying**: vector ANN (`nearest`), full-text (`search`/`fuzzy`/`match_text`/`bm25`), Reciprocal Rank Fusion (`rrf`), and graph traversal (`Expand`, anti-join `not { … }`) in one runtime. - **Branches and commits across the whole graph**: Git-style — every successful publish appends to a commit DAG; merges are three-way at the row level. - **Atomic per-query writes**: `mutate_as` and `load` accumulate insert/update batches into an in-memory `MutationStaging.pending` per touched table; one `stage_*` + `commit_staged` per table runs at end-of-query, then `ManifestBatchPublisher::publish` commits the manifest atomically with per-table `expected_table_versions` CAS. A mid-query failure leaves Lance HEAD untouched on staged tables — no drift, no run state machine, no staging branches. Deletes still inline-commit; D₂ at parse time prevents inserts/updates and deletes from coexisting in one query. -- **HTTP server**: Axum + utoipa OpenAPI, bearer auth (SHA-256 hashed, optional AWS Secrets Manager). Cedar policy enforcement is engine-wide — every `_as` writer calls `Omnigraph::enforce(action, scope, actor)`, so HTTP, CLI, and embedded SDK consumers all hit the same gate. +- **HTTP server**: Axum + utoipa OpenAPI, bearer auth (SHA-256 hashed, optional AWS Secrets Manager). Cedar policy enforcement is engine-wide — every `_as` writer calls `Omnigraph::enforce(action, scope, actor)`, so HTTP, CLI, and embedded SDK consumers all hit the same gate. **Two modes** (v0.6.0+): single-graph (legacy flat routes) and multi-graph (`/graphs/{graph_id}/...` cluster routes + read-only `GET /graphs` enumeration). Per-graph + server-level Cedar policies. Runtime add/remove (`POST /graphs`, `DELETE /graphs/{id}`) is not exposed — operators edit `omnigraph.yaml` and restart. - **CLI** driven by a single `omnigraph.yaml`; multi-format output (json/jsonl/csv/kv/table). Throughout the docs, capabilities are split into **L1 — Inherited from Lance** vs **L2 — Added by OmniGraph**. @@ -226,8 +226,8 @@ omnigraph policy explain --actor act-alice --action change --branch main | Per-query atomic writes | — | In-memory `MutationStaging.pending` accumulator + `stage_*` / `commit_staged` per touched table at end-of-query + publisher CAS via `commit_with_expected` (single manifest commit per `mutate_as` / `load`); D₂ parse-time rule keeps inserts/updates and deletes from mixing | | Three-way row-level merge | — | `OrderedTableCursor` + `StagedTableWriter`, structured `MergeConflictKind` | | Change feeds | — | `diff_between` / `diff_commits` with manifest fast path + ID streaming | -| Cedar policy | — | 8 actions, branch / target_branch / protected scopes, validate/test/explain CLI. **Engine-wide enforcement** (MR-722): every `_as` writer (`apply_schema_as`, `mutate_as`, `load_as`, `ingest_as`, `branch_create_as` / `branch_create_from_as`, `branch_delete_as`, `branch_merge_as`) calls `Omnigraph::enforce(action, scope, actor)` — HTTP, CLI, embedded SDK all hit the same gate. | -| HTTP server | — | Axum, OpenAPI via utoipa, bearer auth (SHA-256, AWS Secrets Manager option), `authorize_request` at the HTTP boundary (resolves bearer→actor, applies admission control), NDJSON streaming export | +| Cedar policy | — | Per-graph actions plus server-scoped actions (see [docs/user/policy.md](docs/user/policy.md) for the current list), branch / target_branch / protected scopes, validate/test/explain CLI. **Engine-wide enforcement** (MR-722): every `_as` writer (`apply_schema_as`, `mutate_as`, `load_as`, `ingest_as`, `branch_create_as` / `branch_create_from_as`, `branch_delete_as`, `branch_merge_as`) calls `Omnigraph::enforce(action, scope, actor)` — HTTP, CLI, embedded SDK all hit the same gate. | +| HTTP server | — | Axum, OpenAPI via utoipa, bearer auth (SHA-256, AWS Secrets Manager option), `authorize_request` at the HTTP boundary (resolves bearer→actor, applies admission control), NDJSON streaming export, **multi-graph mode (v0.6.0+) with cluster routes + read-only `GET /graphs` enumeration + per-graph + server-level Cedar policies. Add/remove graphs by editing `omnigraph.yaml` and restarting.** | | CLI with config | — | `omnigraph.yaml`, aliases, multi-format output (json/jsonl/csv/kv/table) | | Audit / actor tracking | — | `_as` write APIs + actor map in commit graph | | Local RustFS bootstrap | — | `scripts/local-rustfs-bootstrap.sh` one-shot S3-backed dev environment | diff --git a/Cargo.lock b/Cargo.lock index f93315e..a3d6d62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4642,6 +4642,7 @@ dependencies = [ name = "omnigraph-server" version = "0.6.0" dependencies = [ + "arc-swap", "async-trait", "aws-config", "aws-sdk-secretsmanager", @@ -4655,6 +4656,7 @@ dependencies = [ "omnigraph-compiler", "omnigraph-engine", "omnigraph-policy", + "regex", "serde", "serde_json", "serde_yaml", @@ -4662,6 +4664,7 @@ dependencies = [ "sha2", "subtle", "tempfile", + "thiserror", "tokio", "tower", "tower-http", diff --git a/crates/omnigraph-cli/src/main.rs b/crates/omnigraph-cli/src/main.rs index 84f1c19..2cb4373 100644 --- a/crates/omnigraph-cli/src/main.rs +++ b/crates/omnigraph-cli/src/main.rs @@ -18,9 +18,10 @@ use omnigraph_compiler::{ use omnigraph_server::api::{ BranchCreateOutput, BranchCreateRequest, BranchDeleteOutput, BranchListOutput, BranchMergeOutput, BranchMergeRequest, ChangeOutput, ChangeRequest, CommitListOutput, - CommitOutput, ErrorOutput, ExportRequest, IngestOutput, IngestRequest, ReadOutput, ReadRequest, - SchemaApplyOutput, SchemaApplyRequest, SchemaOutput, SnapshotOutput, SnapshotTableOutput, - commit_output, ingest_output, read_output, schema_apply_output, snapshot_payload, + CommitOutput, ErrorOutput, ExportRequest, GraphListResponse, IngestOutput, IngestRequest, + ReadOutput, ReadRequest, SchemaApplyOutput, SchemaApplyRequest, SchemaOutput, SnapshotOutput, + SnapshotTableOutput, commit_output, ingest_output, read_output, schema_apply_output, + snapshot_payload, }; use omnigraph_server::{ AliasCommand, OmnigraphConfig, PolicyAction, PolicyDecision, PolicyEngine, PolicyRequest, @@ -72,6 +73,13 @@ enum Command { schema: PathBuf, /// Graph URI (local path or s3://) uri: String, + /// Overwrite existing schema artifacts at the URI. Without + /// this flag, init refuses to touch a URI that already holds + /// `_schema.pg`, `_schema.ir.json`, or `__schema_state.json` + /// — closes the re-init footgun (MR-668 follow-up). With the + /// flag, the operator opts in to destructive semantics. + #[arg(long)] + force: bool, }, /// Load data into a graph Load { @@ -253,6 +261,33 @@ enum Command { #[arg(long)] json: bool, }, + /// Manage graphs on a multi-graph server (MR-668) + Graphs { + #[command(subcommand)] + command: GraphsCommand, + }, +} + +/// Operations on the graph registry of a multi-graph server (MR-668). +/// +/// All operations target a remote multi-graph server URL (http:// or +/// https://). Local-URI invocations return a clear error. To add or +/// remove graphs, operators edit `omnigraph.yaml` directly and restart +/// the server — runtime mutation is not exposed in v0.6.0. +#[derive(Debug, Subcommand)] +enum GraphsCommand { + /// List every graph registered with the multi-graph server. + List { + /// Remote server URL (e.g. `https://server.example.com`). + #[arg(long)] + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + json: bool, + }, } #[derive(Debug, Subcommand)] @@ -694,7 +729,7 @@ fn resolve_policy_engine(config: &OmnigraphConfig) -> Result { let policy_file = config .resolve_policy_file() .ok_or_else(|| color_eyre::eyre::eyre!("policy.file must be set in omnigraph.yaml"))?; - PolicyEngine::load(&policy_file, &policy_graph_id(config)) + PolicyEngine::load_graph(&policy_file, &policy_graph_id(config)) } /// Open a local-URI graph and, when `policy.file` is configured in @@ -1305,12 +1340,12 @@ fn print_commit_human(commit: &CommitOutput) { println!("created_at: {}", commit.created_at); } -fn print_policy_explain(decision: &PolicyDecision, request: &PolicyRequest) { +fn print_policy_explain(decision: &PolicyDecision, actor_id: &str, request: &PolicyRequest) { println!( "decision: {}", if decision.allowed { "allow" } else { "deny" } ); - println!("actor: {}", request.actor_id); + println!("actor: {}", actor_id); println!("action: {}", request.action); if let Some(branch) = &request.branch { println!("branch: {}", branch); @@ -1718,10 +1753,15 @@ async fn main() -> Result<()> { print_embed_human(&output); } } - Command::Init { schema, uri } => { + Command::Init { schema, uri, force } => { let schema_source = fs::read_to_string(&schema)?; ensure_local_graph_parent(&uri)?; - Omnigraph::init(&uri, &schema_source).await?; + Omnigraph::init_with_options( + &uri, + &schema_source, + omnigraph::db::InitOptions { force }, + ) + .await?; scaffold_config_if_missing(&uri)?; println!("initialized {}", uri); } @@ -2443,13 +2483,12 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let engine = resolve_policy_engine(&config)?; let request = PolicyRequest { - actor_id: actor, action, branch, target_branch, }; - let decision = engine.authorize(&request)?; - print_policy_explain(&decision, &request); + let decision = engine.authorize(&actor, &request)?; + print_policy_explain(&decision, &actor, &request); } }, Command::Optimize { @@ -2556,6 +2595,41 @@ async fn main() -> Result<()> { ); } } + Command::Graphs { command } => match command { + GraphsCommand::List { + uri, + target, + config, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + if !is_remote_uri(&uri) { + bail!( + "`omnigraph graphs list` requires a remote multi-graph server URL \ + (http:// or https://). To enumerate local graphs, read `omnigraph.yaml` \ + directly." + ); + } + let payload = remote_json::( + &http_client, + Method::GET, + remote_url(&uri, "/graphs"), + None, + bearer_token.as_deref(), + ) + .await?; + if json { + print_json(&payload)?; + } else { + for entry in payload.graphs { + println!("{}\t{}", entry.graph_id, entry.uri); + } + } + } + }, } Ok(()) } diff --git a/crates/omnigraph-cli/tests/cli.rs b/crates/omnigraph-cli/tests/cli.rs index f7238b6..f956c86 100644 --- a/crates/omnigraph-cli/tests/cli.rs +++ b/crates/omnigraph-cli/tests/cli.rs @@ -2040,3 +2040,47 @@ fn schema_plan_parity_cli_and_sdk() { ); assert_eq!(cli_payload["supported"], plan.supported); } + +// ─── MR-668 PR 8 — omnigraph graphs subcommand ───────────────────────────── + +/// `omnigraph graphs --help` lists only the read-only `list` +/// subcommand. Runtime add (`create`) and remove (`delete`) are +/// deferred — operators add/remove graphs by editing `omnigraph.yaml` +/// and restarting. This test pins the deferral against accidental +/// re-introduction. +#[test] +fn graphs_subcommand_help_lists_list_only() { + let output = output_success(cli().arg("graphs").arg("--help")); + let stdout = stdout_string(&output); + assert!( + stdout.contains("list"), + "expected `list` subcommand in help output:\n{stdout}" + ); + let lowered = stdout.to_lowercase(); + assert!( + !lowered.contains("create a new graph"), + "graph create should not be in v0.6.0 help; got:\n{stdout}" + ); + assert!( + !lowered.contains("delete a graph"), + "graph delete should not be in v0.6.0 help; got:\n{stdout}" + ); +} + +/// `omnigraph graphs list` against a local URI errors with a clear +/// message — the CLI only operates against remote multi-graph servers. +#[test] +fn graphs_list_against_local_uri_errors_with_remote_only_message() { + let output = output_failure( + cli() + .arg("graphs") + .arg("list") + .arg("--uri") + .arg("/tmp/local"), + ); + let stderr = String::from_utf8_lossy(&output.stderr).into_owned(); + assert!( + stderr.contains("remote multi-graph server URL"), + "expected 'remote multi-graph server URL' rejection in stderr; got:\n{stderr}" + ); +} diff --git a/crates/omnigraph-cli/tests/system_remote.rs b/crates/omnigraph-cli/tests/system_remote.rs index 7131bec..a1c2901 100644 --- a/crates/omnigraph-cli/tests/system_remote.rs +++ b/crates/omnigraph-cli/tests/system_remote.rs @@ -37,6 +37,17 @@ rules: target_branch_scope: protected "#; +const GRAPH_LIST_SERVER_POLICY_YAML: &str = r#" +version: 1 +groups: + admins: [act-admin] +rules: + - id: admins-can-list-graphs + allow: + actors: { group: admins } + actions: [graph_list] +"#; + fn yaml_string(value: &str) -> String { format!("'{}'", value.replace('\'', "''")) } @@ -888,3 +899,112 @@ query insert_person($name: String, $age: I32) { assert_eq!(verify["row_count"], 1); assert_eq!(verify["rows"][0]["p.name"], "PolicyRemote"); } + +// ─── MR-668 PR 8 — omnigraph graphs list end-to-end ──────────────────────── + +/// Multi-graph server + CLI `omnigraph graphs list` end-to-end. +/// +/// Steps: +/// 1. Init a graph `alpha` on disk and write an `omnigraph.yaml` +/// whose `graphs:` map references it. +/// 2. Spawn the server with `--config `. +/// 3. `omnigraph graphs list` — expect to see `alpha`. +/// +/// Ignored by default — spawning servers needs loopback socket +/// permissions some sandboxes lack. +#[test] +#[ignore = "requires loopback socket permissions in sandboxed runners"] +fn graphs_list_against_multi_graph_server() { + let cfg_dir = tempfile::tempdir().unwrap(); + let schema_path = fixture("test.pg"); + + // Init `alpha` on disk. + let alpha_uri = cfg_dir.path().join("alpha.omni"); + tokio::runtime::Runtime::new().unwrap().block_on(async { + Omnigraph::init( + alpha_uri.to_str().unwrap(), + &fs::read_to_string(&schema_path).unwrap(), + ) + .await + .unwrap(); + }); + + fs::write( + cfg_dir.path().join("server-policy.yaml"), + GRAPH_LIST_SERVER_POLICY_YAML, + ) + .unwrap(); + + // Server config with `graphs:` map and no `server.graph` selector + // — multi mode (rule 4 of the inference matrix). `GET /graphs` is a + // server-scoped action, so the success path needs an explicit server + // policy and bearer token. + let server_config_path = cfg_dir.path().join("omnigraph.yaml"); + fs::write( + &server_config_path, + format!( + "\ +server: + policy: + file: ./server-policy.yaml +graphs: + alpha: + uri: {} +", + yaml_string(&alpha_uri.to_string_lossy()) + ), + ) + .unwrap(); + + let server = spawn_server_with_config_env( + &server_config_path, + &[( + "OMNIGRAPH_SERVER_BEARER_TOKENS_JSON", + r#"{"act-admin":"admin-token"}"#, + )], + ); + + // Client config — the CLI's `--target dev` resolves to `server.base_url`. + let client_config_path = cfg_dir.path().join("client.yaml"); + fs::write( + &client_config_path, + format!( + "\ +graphs: + dev: + uri: {} + bearer_token_env: GRAPH_LIST_TOKEN +cli: + graph: dev +auth: + env_file: ./.env.omni +", + yaml_string(&server.base_url) + ), + ) + .unwrap(); + fs::write( + cfg_dir.path().join(".env.omni"), + "GRAPH_LIST_TOKEN=admin-token\n", + ) + .unwrap(); + + // `graphs list` lists `alpha`. + let payload = parse_stdout_json(&output_success( + cli() + .arg("graphs") + .arg("list") + .arg("--config") + .arg(&client_config_path) + .arg("--json"), + )); + let ids: Vec<&str> = payload["graphs"] + .as_array() + .unwrap() + .iter() + .map(|g| g["graph_id"].as_str().unwrap()) + .collect(); + assert_eq!(ids, vec!["alpha"]); + + drop(server); +} diff --git a/crates/omnigraph-compiler/src/catalog/schema_plan.rs b/crates/omnigraph-compiler/src/catalog/schema_plan.rs index a20820d..a9e26b2 100644 --- a/crates/omnigraph-compiler/src/catalog/schema_plan.rs +++ b/crates/omnigraph-compiler/src/catalog/schema_plan.rs @@ -150,9 +150,7 @@ impl SchemaMigrationStep { /// non-`UnsupportedChange` variant). pub fn diagnostic(&self) -> Option<&'static crate::lint::DiagnosticCode> { match self { - Self::UnsupportedChange { - code: Some(c), .. - } => crate::lint::lookup(c), + Self::UnsupportedChange { code: Some(c), .. } => crate::lint::lookup(c), _ => None, } } @@ -1037,10 +1035,7 @@ node Person { .unwrap(); let plan = plan_schema_migration(&accepted, &desired).unwrap(); - assert!( - plan.supported, - "drop-type plan must be supported: {plan:?}" - ); + assert!(plan.supported, "drop-type plan must be supported: {plan:?}"); assert!( plan.steps.iter().any(|step| matches!( step, @@ -1182,8 +1177,7 @@ node Person @description("new") { for step in steps { let json = serde_json::to_string(&step).expect("serialize"); - let round_trip: SchemaMigrationStep = - serde_json::from_str(&json).expect("deserialize"); + let round_trip: SchemaMigrationStep = serde_json::from_str(&json).expect("deserialize"); assert_eq!(step, round_trip, "round-trip mismatch on {json}"); } } diff --git a/crates/omnigraph-compiler/src/ir/lower.rs b/crates/omnigraph-compiler/src/ir/lower.rs index c130d18..6999d69 100644 --- a/crates/omnigraph-compiler/src/ir/lower.rs +++ b/crates/omnigraph-compiler/src/ir/lower.rs @@ -271,9 +271,7 @@ fn lower_clauses( .traversals .iter() .find(|rt| { - rt.src == traversal.src - && rt.dst == traversal.dst - && rt.edge_type == edge.name + rt.src == traversal.src && rt.dst == traversal.dst && rt.edge_type == edge.name }) .map(|rt| rt.direction) .unwrap_or(Direction::Out); diff --git a/crates/omnigraph-compiler/src/ir/lower_tests.rs b/crates/omnigraph-compiler/src/ir/lower_tests.rs index 50ce93a..7aa140e 100644 --- a/crates/omnigraph-compiler/src/ir/lower_tests.rs +++ b/crates/omnigraph-compiler/src/ir/lower_tests.rs @@ -205,12 +205,8 @@ insert Knows { from: $name, to: $friend } let ir = lower_mutation_query(&qf.queries[0]).unwrap(); assert_eq!(ir.ops.len(), 2); - assert!( - matches!(&ir.ops[0], MutationOpIR::Insert { type_name, .. } if type_name == "Person") - ); - assert!( - matches!(&ir.ops[1], MutationOpIR::Insert { type_name, .. } if type_name == "Knows") - ); + assert!(matches!(&ir.ops[0], MutationOpIR::Insert { type_name, .. } if type_name == "Person")); + assert!(matches!(&ir.ops[1], MutationOpIR::Insert { type_name, .. } if type_name == "Knows")); } /// Destination binding is deferred: NodeScan + Expand + Filter (no cross-join). diff --git a/crates/omnigraph-compiler/src/lib.rs b/crates/omnigraph-compiler/src/lib.rs index 7ebc09a..ba1aba2 100644 --- a/crates/omnigraph-compiler/src/lib.rs +++ b/crates/omnigraph-compiler/src/lib.rs @@ -18,9 +18,9 @@ pub use catalog::schema_ir::{ pub use catalog::schema_plan::{ DropMode, SchemaMigrationPlan, SchemaMigrationStep, SchemaTypeKind, plan_schema_migration, }; -pub use lint::{DiagnosticCode, Family, SafetyTier, Severity}; pub use ir::ParamMap; pub use ir::lower::{lower_mutation_query, lower_query}; +pub use lint::{DiagnosticCode, Family, SafetyTier, Severity}; pub use query::ast::Literal; pub use query::lint::{ QueryLintFinding, QueryLintOutput, QueryLintQueryKind, QueryLintQueryResult, diff --git a/crates/omnigraph-compiler/src/lint/codes.rs b/crates/omnigraph-compiler/src/lint/codes.rs index e53bf31..ba870cf 100644 --- a/crates/omnigraph-compiler/src/lint/codes.rs +++ b/crates/omnigraph-compiler/src/lint/codes.rs @@ -116,7 +116,13 @@ pub const ALL_CODES: &[DiagnosticCode] = &[ ]; /// Codes actually emitted by the planner in v0 (i.e. not reserved). -pub const EMITTED_IN_V0: &[&str] = &["OG-DS-102", "OG-DS-103", "OG-DS-104", "OG-MF-103", "OG-MF-106"]; +pub const EMITTED_IN_V0: &[&str] = &[ + "OG-DS-102", + "OG-DS-103", + "OG-DS-104", + "OG-MF-103", + "OG-MF-106", +]; /// Look up a code by its string identifier. pub fn lookup(code: &str) -> Option<&'static DiagnosticCode> { diff --git a/crates/omnigraph-compiler/src/lint/mod.rs b/crates/omnigraph-compiler/src/lint/mod.rs index 79e9986..5c6c47d 100644 --- a/crates/omnigraph-compiler/src/lint/mod.rs +++ b/crates/omnigraph-compiler/src/lint/mod.rs @@ -24,5 +24,5 @@ pub mod codes; pub mod diagnostic; -pub use codes::{lookup, DiagnosticCode, ALL_CODES}; +pub use codes::{ALL_CODES, DiagnosticCode, lookup}; pub use diagnostic::{Family, SafetyTier, Severity}; diff --git a/crates/omnigraph-compiler/src/query/parser.rs b/crates/omnigraph-compiler/src/query/parser.rs index 20fedb8..4ba8476 100644 --- a/crates/omnigraph-compiler/src/query/parser.rs +++ b/crates/omnigraph-compiler/src/query/parser.rs @@ -137,12 +137,11 @@ fn parse_query_decl(pair: pest::iterators::Pair) -> Result { Rule::mutation_body => { for mutation_pair in body.into_inner() { if let Rule::mutation_stmt = mutation_pair.as_rule() { - let stmt = - mutation_pair.into_inner().next().ok_or_else(|| { - NanoError::Parse( - "mutation statement cannot be empty".to_string(), - ) - })?; + let stmt = mutation_pair.into_inner().next().ok_or_else(|| { + NanoError::Parse( + "mutation statement cannot be empty".to_string(), + ) + })?; mutations.push(parse_mutation_stmt(stmt)?); } } diff --git a/crates/omnigraph-compiler/src/schema/parser_tests.rs b/crates/omnigraph-compiler/src/schema/parser_tests.rs index 9b96a4e..2302cfb 100644 --- a/crates/omnigraph-compiler/src/schema/parser_tests.rs +++ b/crates/omnigraph-compiler/src/schema/parser_tests.rs @@ -271,9 +271,9 @@ age: I32? match &schema.declarations[0] { SchemaDecl::Node(n) => { assert!( - n.constraints.iter().any( - |c| matches!(c, Constraint::Range { property, .. } if property == "age") - ) + n.constraints + .iter() + .any(|c| matches!(c, Constraint::Range { property, .. } if property == "age")) ); } _ => panic!("expected Node"), diff --git a/crates/omnigraph-policy/src/lib.rs b/crates/omnigraph-policy/src/lib.rs index f24124f..6459fcd 100644 --- a/crates/omnigraph-policy/src/lib.rs +++ b/crates/omnigraph-policy/src/lib.rs @@ -39,6 +39,23 @@ pub enum PolicyAction { /// future shape. Avoid writing such rules until the first consumer /// endpoint ships to prevent confusion. Admin, + /// MR-668: management action that operates on the server's graph + /// registry, not on a single graph's contents. The Cedar `appliesTo` + /// declaration binds it to `resource: Server` instead of the + /// per-graph `resource: Graph`. Operators authorize a group with: + /// ```yaml + /// rules: + /// - id: admins-can-list-graphs + /// allow: + /// actors: { group: admins } + /// actions: [graph_list] + /// ``` + /// `branch_scope` and `target_branch_scope` are NOT supported for + /// this action — there's no branch context at the server level. + /// Runtime `graph_create` / `graph_delete` are intentionally omitted + /// from v0.6.0; operators add and remove graphs by editing + /// `omnigraph.yaml` and restarting. + GraphList, } impl PolicyAction { @@ -52,6 +69,7 @@ impl PolicyAction { Self::BranchDelete => "branch_delete", Self::BranchMerge => "branch_merge", Self::Admin => "admin", + Self::GraphList => "graph_list", } } @@ -65,6 +83,56 @@ impl PolicyAction { Self::BranchCreate | Self::SchemaApply | Self::BranchDelete | Self::BranchMerge ) } + + /// Which Cedar resource entity governs this action. + /// Per-graph actions (Read, Change, …) apply to `Omnigraph::Graph::""`. + /// Server-scoped management actions (GraphList) apply to + /// `Omnigraph::Server::"root"`. `Admin` is reserved without a current + /// call site; classified as per-graph until MR-724 picks a shape. + pub fn resource_kind(self) -> PolicyResourceKind { + match self { + Self::GraphList => PolicyResourceKind::Server, + Self::Read + | Self::Export + | Self::Change + | Self::SchemaApply + | Self::BranchCreate + | Self::BranchDelete + | Self::BranchMerge + | Self::Admin => PolicyResourceKind::Graph, + } + } +} + +/// Which Cedar entity an action's policies apply to. Internal to +/// `omnigraph-policy` — drives the `compile_policy_source` template +/// and the request-time resource UID construction. +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub enum PolicyResourceKind { + /// `Omnigraph::Graph::""` — per-graph actions. + Graph, + /// `Omnigraph::Server::"root"` — management actions. + Server, +} + +/// Which kind of policy file the caller is loading. Drives the +/// load-time validation that catches a "wrong action in wrong file" +/// mistake — a graph policy with `graph_list` rules, or a server +/// policy with `read` rules, both compile silently as Cedar but +/// never match any actual request. Typing the loader makes the +/// mistake a load-time error. +/// +/// Pairs with [`PolicyAction::resource_kind`]: every action's resource +/// kind must match the engine kind it's loaded under. +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +pub enum PolicyEngineKind { + /// Engine is loaded for a single graph; only actions whose + /// `resource_kind()` is `PolicyResourceKind::Graph` are allowed. + Graph, + /// Engine is loaded for server-level management endpoints; only + /// actions whose `resource_kind()` is `PolicyResourceKind::Server` + /// are allowed. + Server, } impl fmt::Display for PolicyAction { @@ -86,6 +154,7 @@ impl FromStr for PolicyAction { "branch_delete" => Ok(Self::BranchDelete), "branch_merge" => Ok(Self::BranchMerge), "admin" => Ok(Self::Admin), + "graph_list" => Ok(Self::GraphList), other => bail!("unknown policy action '{other}'"), } } @@ -153,9 +222,16 @@ pub enum PolicyExpectation { Deny, } +/// What a caller wants to do, sans identity. Actor identity flows +/// through a separate `actor_id: &str` parameter on +/// [`PolicyEngine::authorize`] / [`PolicyChecker::check`] — encoding +/// the architectural invariant that actor identity is server-authoritative +/// and must not be supplied by the same code path that supplies the +/// requested action. In the HTTP layer, the bearer-token middleware +/// resolves the actor and passes it independently; clients cannot +/// smuggle identity inside this struct. #[derive(Debug, Clone)] pub struct PolicyRequest { - pub actor_id: String, pub action: PolicyAction, pub branch: Option, pub target_branch: Option, @@ -262,6 +338,34 @@ impl PolicyConfig { } } } + // MR-668: server-scoped actions have no branch context and + // must not be mixed with per-graph actions in the same + // rule (each rule generates one Cedar `permit` referencing + // a specific resource kind). + let mut server_scoped = false; + let mut graph_scoped = false; + for action in &rule.allow.actions { + match action.resource_kind() { + PolicyResourceKind::Server => server_scoped = true, + PolicyResourceKind::Graph => graph_scoped = true, + } + } + if server_scoped && graph_scoped { + bail!( + "policy rule '{}' mixes the server-scoped action `graph_list` \ + with per-graph actions; split into separate rules", + rule.id + ); + } + if server_scoped + && (rule.allow.branch_scope.is_some() || rule.allow.target_branch_scope.is_some()) + { + bail!( + "policy rule '{}' uses branch_scope/target_branch_scope with a \ + server-scoped action; server-scoped actions have no branch context", + rule.id + ); + } } Ok(()) @@ -330,26 +434,61 @@ impl PolicyCompiler { } impl PolicyEngine { - pub fn load(path: &Path, graph_id: &str) -> Result { + /// Load a per-graph policy file. Rejects rules whose actions are + /// server-scoped (e.g. `graph_list`) — those belong in a server + /// policy file, not a per-graph one. + /// + /// `graph_id` is the label of the graph this engine governs; + /// becomes the Cedar `Omnigraph::Graph::""` resource + /// for every per-graph action evaluated against this engine. + pub fn load_graph(path: &Path, graph_id: &str) -> Result { let config = PolicyConfig::load(path)?; + validate_kind_alignment(&config, PolicyEngineKind::Graph)?; PolicyCompiler::compile(&config, graph_id) } - pub fn authorize(&self, request: &PolicyRequest) -> Result { - if !self.known_actors.contains(request.actor_id.as_str()) { + /// Load a server-level policy file. Rejects rules whose actions + /// are per-graph (e.g. `read`, `change`) — those belong in a + /// per-graph policy file, not the server one. Takes no `graph_id`: + /// server-scoped actions resolve against the singleton + /// `Omnigraph::Server::"root"` entity, never a Graph. + pub fn load_server(path: &Path) -> Result { + let config = PolicyConfig::load(path)?; + validate_kind_alignment(&config, PolicyEngineKind::Server)?; + // The Graph entity created by the compiler is never referenced + // by a server-scoped rule, so the label below is purely a + // placeholder. Use the canonical SERVER_RESOURCE_ID so any + // future inspection of an unreachable Graph entity at least + // points at the right concept. + PolicyCompiler::compile(&config, SERVER_RESOURCE_ID) + } + + /// Evaluate a request. `actor_id` is supplied as a separate + /// argument (not inside `PolicyRequest`) so the type system enforces + /// the "server-authoritative actor identity" invariant — clients + /// supplying a `PolicyRequest` cannot smuggle identity through the + /// same struct that carries the requested action. + pub fn authorize(&self, actor_id: &str, request: &PolicyRequest) -> Result { + if !self.known_actors.contains(actor_id) { return Ok(self.deny( - request, None, format!( "policy denied action '{}' for unknown actor '{}'", - request.action, request.actor_id + request.action, actor_id ), )); } - let principal = entity_uid("Actor", &request.actor_id)?; + let principal = entity_uid("Actor", actor_id)?; let action = entity_uid("Action", request.action.as_str())?; - let resource = entity_uid("Graph", &self.graph_id)?; + // Pick the resource entity based on the action's `resource_kind`. + // Server-scoped actions (`graph_list`) bind to + // `Omnigraph::Server::"root"`; per-graph actions bind to + // `Omnigraph::Graph::""`. + let resource = match request.action.resource_kind() { + PolicyResourceKind::Server => entity_uid("Server", SERVER_RESOURCE_ID)?, + PolicyResourceKind::Graph => entity_uid("Graph", &self.graph_id)?, + }; let context_value = json!({ "has_branch": request.branch.is_some(), "branch": request.branch.clone().unwrap_or_default(), @@ -386,7 +525,7 @@ impl PolicyEngine { matched_rule_id: matched_rule_id.clone(), message: format!( "policy allowed action '{}' for actor '{}'", - request.action, request.actor_id + request.action, actor_id ), }, Decision::Deny => { @@ -403,30 +542,27 @@ impl PolicyEngine { .as_deref() .map(|branch| format!(" targeting branch '{}'", branch)) .unwrap_or_default(), - request.actor_id + actor_id ); - self.deny(request, matched_rule_id, message) + self.deny(matched_rule_id, message) } }) } - pub fn validate_request(&self, request: &PolicyRequest) -> Result<()> { - let _ = self.authorize(request)?; - Ok(()) - } - pub fn run_tests(&self, tests: &PolicyTestConfig) -> Result<()> { if tests.version != 1 { bail!("policy test version must be 1"); } let mut failures = Vec::new(); for case in &tests.cases { - let decision = self.authorize(&PolicyRequest { - actor_id: case.actor.clone(), - action: case.action, - branch: case.branch.clone(), - target_branch: case.target_branch.clone(), - })?; + let decision = self.authorize( + &case.actor, + &PolicyRequest { + action: case.action, + branch: case.branch.clone(), + target_branch: case.target_branch.clone(), + }, + )?; let expected_allowed = matches!(case.expect, PolicyExpectation::Allow); if decision.allowed != expected_allowed { failures.push(format!( @@ -448,12 +584,7 @@ impl PolicyEngine { self.known_actors.len() } - fn deny( - &self, - _request: &PolicyRequest, - matched_rule_id: Option, - message: String, - ) -> PolicyDecision { + fn deny(&self, matched_rule_id: Option, message: String) -> PolicyDecision { PolicyDecision { allowed: false, matched_rule_id, @@ -462,6 +593,38 @@ impl PolicyEngine { } } +/// Reject any rule whose actions don't match the engine kind +/// being loaded. Closes the "wrong action in wrong file silently +/// no-ops" class — `graph_list` in a per-graph file or `read` in +/// a server file fails at load time instead of compiling cleanly +/// and never matching a request. +fn validate_kind_alignment(config: &PolicyConfig, kind: PolicyEngineKind) -> Result<()> { + let required = match kind { + PolicyEngineKind::Graph => PolicyResourceKind::Graph, + PolicyEngineKind::Server => PolicyResourceKind::Server, + }; + for rule in &config.rules { + for action in &rule.allow.actions { + if action.resource_kind() != required { + let (got, expected_file) = match action.resource_kind() { + PolicyResourceKind::Server => ("server-scoped", "server policy file"), + PolicyResourceKind::Graph => ("per-graph", "per-graph policy file"), + }; + bail!( + "policy rule '{}' uses {} action '{}' in a {:?} policy file; \ + move it to a {}", + rule.id, + got, + action, + kind, + expected_file + ); + } + } + } + Ok(()) +} + fn compile_entities(config: &PolicyConfig, graph_id: &str, schema: &Schema) -> Result { let mut group_entities = Vec::new(); for group in config.groups.keys() { @@ -505,6 +668,26 @@ fn compile_entities(config: &PolicyConfig, graph_id: &str, schema: &Schema) -> R entities.extend(group_entities); entities.extend(actor_entities); entities.push(graph_entity); + + // MR-668: include the `Omnigraph::Server::"root"` entity + // whenever any rule references a server-scoped action. Cedar's + // schema validator will otherwise reject the policy. Keeping this + // conditional (rather than always-on) avoids polluting test + // assertions for graph-only policies. + let any_server_scoped = config.rules.iter().any(|rule| { + rule.allow + .actions + .iter() + .any(|action| action.resource_kind() == PolicyResourceKind::Server) + }); + if any_server_scoped { + entities.push(Entity::new( + entity_uid("Server", SERVER_RESOURCE_ID)?, + HashMap::new(), + HashSet::::new(), + )?); + } + Ok(Entities::from_entities(entities, Some(schema))?) } @@ -543,16 +726,29 @@ fn compile_policy_source(rule: &PolicyRule, action: &PolicyAction, graph_id: &st format!("\nwhen {{ {} }}", conditions.join(" && ")) }; + // MR-668: emit the resource literal that matches the action's + // `resource_kind`. Per-graph actions reference the engine's + // `Omnigraph::Graph::""` instance; server-scoped + // actions reference the singleton `Omnigraph::Server::"root"`. + let resource_literal = match action.resource_kind() { + PolicyResourceKind::Graph => { + format!("Omnigraph::Graph::{}", cedar_literal(graph_id)) + } + PolicyResourceKind::Server => { + format!("Omnigraph::Server::{}", cedar_literal(SERVER_RESOURCE_ID)) + } + }; + format!( r#"permit ( principal in Omnigraph::Group::{group}, action == Omnigraph::Action::{action}, - resource == Omnigraph::Graph::{graph} + resource == {resource_literal} ){when};"#, group = cedar_literal(&rule.allow.actors.group), action = cedar_literal(action.as_str()), - graph = cedar_literal(graph_id), when = when, + resource_literal = resource_literal, ) } @@ -581,6 +777,11 @@ fn target_branch_scope_condition(scope: PolicyBranchScope) -> String { } fn policy_schema_source() -> &'static str { + // MR-668: `entity Server;` plus the `graph_list` action that + // binds to it. Per-graph actions stay bound to `Graph`. + // The Cedar schema string lives here (not on a fixture file) so any + // omnigraph-policy build picks up the new vocabulary in lock-step + // with the Rust code. r#" namespace Omnigraph { type RequestContext = { @@ -595,6 +796,7 @@ namespace Omnigraph { entity Actor in [Group]; entity Group; entity Graph; + entity Server; action "read" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; action "export" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; @@ -604,10 +806,17 @@ namespace Omnigraph { action "branch_delete" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; action "branch_merge" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; action "admin" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; + + action "graph_list" appliesTo { principal: Actor, resource: Server, context: RequestContext }; } "# } +/// Canonical id of the `Omnigraph::Server` Cedar entity. There's only one +/// (the running server); the id is fixed at `"root"` so Cedar rules can +/// reference it unambiguously: `resource == Omnigraph::Server::"root"`. +const SERVER_RESOURCE_ID: &str = "root"; + fn entity_uid(entity_type: &str, id: &str) -> Result { let typename = EntityTypeName::from_str(&format!("Omnigraph::{entity_type}"))?; let entity_id = EntityId::from_str(id).map_err(|err| eyre!(err.to_string()))?; @@ -619,10 +828,6 @@ fn cedar_literal(value: &str) -> String { } impl PolicyRequest { - pub fn actor_id(&self) -> &str { - &self.actor_id - } - pub fn action(&self) -> PolicyAction { self.action } @@ -761,13 +966,12 @@ impl PolicyChecker for PolicyEngine { ) -> Result<(), PolicyError> { let (branch, target_branch) = scope.to_branch_pair(); let request = PolicyRequest { - actor_id: actor.to_string(), action, branch: branch.map(|s| s.to_string()), target_branch: target_branch.map(|s| s.to_string()), }; let decision = self - .authorize(&request) + .authorize(actor, &request) .map_err(|e| PolicyError::Internal(e.to_string()))?; if decision.allowed { Ok(()) @@ -780,7 +984,7 @@ impl PolicyChecker for PolicyEngine { #[cfg(test)] mod tests { use super::{ - PolicyAction, PolicyCompiler, PolicyConfig, PolicyExpectation, PolicyRequest, + PolicyAction, PolicyCompiler, PolicyConfig, PolicyEngine, PolicyExpectation, PolicyRequest, PolicyTestCase, PolicyTestConfig, }; @@ -883,33 +1087,39 @@ rules: let engine = PolicyCompiler::compile(&policy, "graph").unwrap(); let allow = engine - .authorize(&PolicyRequest { - actor_id: "act-bruno".to_string(), - action: PolicyAction::Change, - branch: Some("feature".to_string()), - target_branch: None, - }) + .authorize( + "act-bruno", + &PolicyRequest { + action: PolicyAction::Change, + branch: Some("feature".to_string()), + target_branch: None, + }, + ) .unwrap(); assert!(allow.allowed); assert_eq!(allow.matched_rule_id.as_deref(), Some("team-write")); let deny = engine - .authorize(&PolicyRequest { - actor_id: "act-bruno".to_string(), - action: PolicyAction::BranchDelete, - branch: None, - target_branch: Some("main".to_string()), - }) + .authorize( + "act-bruno", + &PolicyRequest { + action: PolicyAction::BranchDelete, + branch: None, + target_branch: Some("main".to_string()), + }, + ) .unwrap(); assert!(!deny.allowed); let admin = engine - .authorize(&PolicyRequest { - actor_id: "act-andrew".to_string(), - action: PolicyAction::BranchDelete, - branch: None, - target_branch: Some("main".to_string()), - }) + .authorize( + "act-andrew", + &PolicyRequest { + action: PolicyAction::BranchDelete, + branch: None, + target_branch: Some("main".to_string()), + }, + ) .unwrap(); assert!(admin.allowed); assert_eq!(admin.matched_rule_id.as_deref(), Some("admins-promote")); @@ -978,23 +1188,305 @@ rules: let engine = PolicyCompiler::compile(&policy, "graph").unwrap(); let allow = engine - .authorize(&PolicyRequest { - actor_id: "act-ragnor".to_string(), - action: PolicyAction::SchemaApply, - branch: None, - target_branch: Some("main".to_string()), - }) + .authorize( + "act-ragnor", + &PolicyRequest { + action: PolicyAction::SchemaApply, + branch: None, + target_branch: Some("main".to_string()), + }, + ) .unwrap(); assert!(allow.allowed); let deny = engine - .authorize(&PolicyRequest { - actor_id: "act-ragnor".to_string(), - action: PolicyAction::SchemaApply, - branch: None, - target_branch: Some("feature".to_string()), - }) + .authorize( + "act-ragnor", + &PolicyRequest { + action: PolicyAction::SchemaApply, + branch: None, + target_branch: Some("feature".to_string()), + }, + ) .unwrap(); assert!(!deny.allowed); } + + // ─── MR-668 — server-scoped action (graph_list) ─ + + #[test] + fn graph_list_action_authorizes_against_server_resource() { + let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + admins: [act-andrew] + viewers: [act-bruno] +rules: + - id: admins-list-graphs + allow: + actors: { group: admins } + actions: [graph_list] +"#, + ) + .unwrap(); + + // The graph_label passed at compile time is irrelevant for + // server-scoped actions — they resolve against + // `Omnigraph::Server::"root"` regardless. We pass a sentinel + // so it's obvious the value isn't used. + let engine = PolicyCompiler::compile(&policy, "ignored").unwrap(); + + let allow = engine + .authorize( + "act-andrew", + &PolicyRequest { + action: PolicyAction::GraphList, + branch: None, + target_branch: None, + }, + ) + .unwrap(); + assert!(allow.allowed); + assert_eq!(allow.matched_rule_id.as_deref(), Some("admins-list-graphs")); + + // Different actor, same policy → deny. + let deny = engine + .authorize( + "act-bruno", + &PolicyRequest { + action: PolicyAction::GraphList, + branch: None, + target_branch: None, + }, + ) + .unwrap(); + assert!(!deny.allowed); + } + + #[test] + fn server_scoped_rule_cannot_use_branch_scope() { + let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + admins: [act-andrew] +rules: + - id: bad-branch-scope-on-graph-list + allow: + actors: { group: admins } + actions: [graph_list] + branch_scope: any +"#, + ) + .unwrap(); + let err = policy.validate().unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("branch_scope") || msg.contains("server-scoped"), + "expected branch_scope rejection for server-scoped action; got: {msg}" + ); + } + + #[test] + fn rule_mixing_server_and_per_graph_actions_is_rejected() { + // A single rule must reference exactly one resource kind. + // `graph_list` (Server) + `read` (Graph) in one allow block + // is invalid — operators must split the rule. + let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + admins: [act-andrew] +rules: + - id: mixed-resource-kinds + allow: + actors: { group: admins } + actions: [graph_list, read] +"#, + ) + .unwrap(); + let err = policy.validate().unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("server-scoped") || msg.contains("split into separate rules"), + "expected mix-resource-kinds rejection; got: {msg}" + ); + } + + #[test] + fn per_graph_rules_continue_to_work_alongside_server_rules() { + // Decision 6 contract: existing operator policies (which only + // reference per-graph actions) keep compiling and authorizing + // as before, even when the compiled-in schema now declares + // `Server` + `graph_*` actions. This pins the "Cedar refactor + // is operator-invisible" promise. + let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + team: [act-andrew] +protected_branches: [main] +rules: + - id: team-read + allow: + actors: { group: team } + actions: [read, export] + branch_scope: any +"#, + ) + .unwrap(); + let engine = PolicyCompiler::compile(&policy, "graph").unwrap(); + let allow = engine + .authorize( + "act-andrew", + &PolicyRequest { + action: PolicyAction::Read, + branch: Some("main".to_string()), + target_branch: None, + }, + ) + .unwrap(); + assert!(allow.allowed); + assert_eq!(allow.matched_rule_id.as_deref(), Some("team-read")); + } + + // ─── MR-668 follow-up — load_graph / load_server kind alignment ─ + + /// A per-graph policy file containing a `graph_list` rule fails + /// at load time. Pre-fix, the file compiled cleanly and the rule + /// silently never matched (per-graph engine never gets a + /// `graph_list` check). Closes the "wrong action, wrong file, + /// silent no-op" class. + #[test] + fn load_graph_rejects_server_scoped_action() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("bad-graph-policy.yaml"); + std::fs::write( + &path, + r#" +version: 1 +groups: + admins: [act-andrew] +rules: + - id: misplaced-graph-list + allow: + actors: { group: admins } + actions: [graph_list] +"#, + ) + .unwrap(); + let err = match PolicyEngine::load_graph(&path, "g1") { + Ok(_) => panic!("expected server-scoped action in per-graph file to be rejected"), + Err(e) => e, + }; + let msg = err.to_string(); + assert!( + msg.contains("server-scoped") && msg.contains("graph_list"), + "expected server-scoped-in-graph-file rejection, got: {msg}" + ); + } + + /// A server policy file containing a `read` rule fails at load + /// time. Pre-fix, the file compiled cleanly and the rule silently + /// never matched (server engine never gets a `read` check). + #[test] + fn load_server_rejects_per_graph_action() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("bad-server-policy.yaml"); + std::fs::write( + &path, + r#" +version: 1 +groups: + team: [act-andrew] +rules: + - id: misplaced-read + allow: + actors: { group: team } + actions: [read] + branch_scope: any +"#, + ) + .unwrap(); + let err = match PolicyEngine::load_server(&path) { + Ok(_) => panic!("expected per-graph action in server file to be rejected"), + Err(e) => e, + }; + let msg = err.to_string(); + assert!( + msg.contains("per-graph") && msg.contains("read"), + "expected per-graph-in-server-file rejection, got: {msg}" + ); + } + + /// Positive case: a properly-shaped per-graph policy loads via + /// `load_graph` and authorizes as expected. Verifies the + /// kind-alignment check is permissive when the file is correct. + #[test] + fn load_graph_accepts_per_graph_only_policy() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("ok-graph-policy.yaml"); + std::fs::write( + &path, + r#" +version: 1 +groups: + team: [act-andrew] +rules: + - id: team-read + allow: + actors: { group: team } + actions: [read] + branch_scope: any +"#, + ) + .unwrap(); + let engine = PolicyEngine::load_graph(&path, "g1").unwrap(); + let decision = engine + .authorize( + "act-andrew", + &PolicyRequest { + action: PolicyAction::Read, + branch: Some("main".to_string()), + target_branch: None, + }, + ) + .unwrap(); + assert!(decision.allowed); + } + + /// Positive case: a properly-shaped server policy loads via + /// `load_server` and authorizes the `graph_list` action. + #[test] + fn load_server_accepts_server_only_policy() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("ok-server-policy.yaml"); + std::fs::write( + &path, + r#" +version: 1 +groups: + admins: [act-andrew] +rules: + - id: admins-list-graphs + allow: + actors: { group: admins } + actions: [graph_list] +"#, + ) + .unwrap(); + let engine = PolicyEngine::load_server(&path).unwrap(); + let decision = engine + .authorize( + "act-andrew", + &PolicyRequest { + action: PolicyAction::GraphList, + branch: None, + target_branch: None, + }, + ) + .unwrap(); + assert!(decision.allowed); + } } diff --git a/crates/omnigraph-server/Cargo.toml b/crates/omnigraph-server/Cargo.toml index 0372adc..e9a0e46 100644 --- a/crates/omnigraph-server/Cargo.toml +++ b/crates/omnigraph-server/Cargo.toml @@ -37,7 +37,10 @@ futures = { workspace = true } sha2 = { workspace = true } subtle = { workspace = true } async-trait = { workspace = true } +arc-swap = { workspace = true } dashmap = "6" +regex = { workspace = true } +thiserror = { workspace = true } aws-config = { version = "1", optional = true, default-features = false, features = ["rustls", "rt-tokio", "credentials-process", "sso"] } aws-sdk-secretsmanager = { version = "1", optional = true, default-features = false, features = ["rustls", "rt-tokio"] } diff --git a/crates/omnigraph-server/src/api.rs b/crates/omnigraph-server/src/api.rs index 1195f12..5ff0787 100644 --- a/crates/omnigraph-server/src/api.rs +++ b/crates/omnigraph-server/src/api.rs @@ -235,7 +235,9 @@ pub struct CommitListOutput { pub struct ReadRequest { /// GQ query source. May declare one or more named queries; pick one with /// `query_name` if there is more than one. - #[schema(example = "query get_person($name: String) {\n match {\n $p: Person { name: $name }\n }\n return { $p.name, $p.age }\n}")] + #[schema( + example = "query get_person($name: String) {\n match {\n $p: Person { name: $name }\n }\n return { $p.name, $p.age }\n}" + )] pub query_source: String, /// Name of the query to run when `query_source` declares multiple. Optional /// when only one query is declared. @@ -252,7 +254,9 @@ pub struct ReadRequest { pub struct ChangeRequest { /// GQ mutation source containing `insert`, `update`, or `delete` statements. /// May declare multiple named mutations; pick one with `query_name`. - #[schema(example = "query insert_person($name: String, $age: I32) {\n insert Person { name: $name, age: $age }\n}")] + #[schema( + example = "query insert_person($name: String, $age: I32) {\n insert Person { name: $name, age: $age }\n}" + )] pub query_source: String, /// Name of the mutation to run when `query_source` declares multiple. pub query_name: Option, @@ -266,7 +270,9 @@ pub struct ChangeRequest { pub struct SchemaApplyRequest { /// Project schema in `.pg` source form. The diff against the current /// schema produces the migration steps that will be applied. - #[schema(example = "node Person {\n name: String @key\n age: I32?\n}\n\nedge Knows: Person -> Person")] + #[schema( + example = "node Person {\n name: String @key\n age: I32?\n}\n\nedge Knows: Person -> Person" + )] pub schema_source: String, /// When true, promote every `DropMode::Soft` step in the plan to /// `DropMode::Hard`, making the prior column data unreachable @@ -303,7 +309,9 @@ pub struct IngestRequest { pub mode: Option, /// NDJSON payload: one record per line, each shaped /// `{"type": "", "data": {...}}`. - #[schema(example = "{\"type\": \"Person\", \"data\": {\"name\": \"Alice\", \"age\": 30}}\n{\"type\": \"Person\", \"data\": {\"name\": \"Bob\", \"age\": 25}}")] + #[schema( + example = "{\"type\": \"Person\", \"data\": {\"name\": \"Alice\", \"age\": 30}}\n{\"type\": \"Person\", \"data\": {\"name\": \"Bob\", \"age\": 25}}" + )] pub data: String, } @@ -344,6 +352,11 @@ pub enum ErrorCode { Forbidden, BadRequest, NotFound, + /// 405 Method Not Allowed — the route exists but the active server + /// mode doesn't serve this method (e.g. `GET /graphs` in single-graph + /// mode). Distinct from 404 so clients can tell "wrong context" from + /// "no such resource." + MethodNotAllowed, Conflict, /// 429 Too Many Requests — per-actor admission cap exceeded. /// Clients should respect the `Retry-After` header. @@ -467,3 +480,23 @@ pub fn read_target_output(target: &ReadTarget) -> ReadTargetOutput { }, } } + +// ─── MR-668 — management endpoint shapes ────────────────────────────────── + +/// One entry in the response from `GET /graphs`. Cluster operators +/// consume this list to discover which graphs the server is currently +/// serving. The shape is intentionally minimal — `graph_id` and `uri` +/// are the only fields a routing client needs. +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct GraphInfo { + pub graph_id: String, + pub uri: String, +} + +/// Response from `GET /graphs`. Lists every graph registered with the +/// server in alphabetical order by `graph_id` (sorted server-side so +/// clients get deterministic output across requests). +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct GraphListResponse { + pub graphs: Vec, +} diff --git a/crates/omnigraph-server/src/auth.rs b/crates/omnigraph-server/src/auth.rs index 80b6ed5..4f05228 100644 --- a/crates/omnigraph-server/src/auth.rs +++ b/crates/omnigraph-server/src/auth.rs @@ -119,7 +119,10 @@ pub(crate) fn parse_json_secret_payload(payload: &str) -> Result) -> Result { - let config = - aws_config::load_defaults(aws_config::BehaviorVersion::latest()).await; + let config = aws_config::load_defaults(aws_config::BehaviorVersion::latest()).await; let client = aws_sdk_secretsmanager::Client::new(&config); Ok(Self { client, @@ -200,8 +202,8 @@ pub use aws::SecretsManagerTokenSource; #[cfg(test)] mod tests { use super::*; - use std::env; use serial_test::serial; + use std::env; fn clear_env() { unsafe { @@ -232,7 +234,10 @@ mod tests { unsafe { env::remove_var("OMNIGRAPH_SERVER_BEARER_TOKEN"); } - assert_eq!(tokens, vec![("default".to_string(), "some-token".to_string())]); + assert_eq!( + tokens, + vec![("default".to_string(), "some-token".to_string())] + ); } #[tokio::test] diff --git a/crates/omnigraph-server/src/config.rs b/crates/omnigraph-server/src/config.rs index 7145ff2..1272a7b 100644 --- a/crates/omnigraph-server/src/config.rs +++ b/crates/omnigraph-server/src/config.rs @@ -6,6 +6,7 @@ use std::path::{Path, PathBuf}; use clap::ValueEnum; use color_eyre::eyre::{Result, bail}; use serde::{Deserialize, Serialize}; + pub const DEFAULT_CONFIG_FILE: &str = "omnigraph.yaml"; #[derive(Debug, Clone, Default, Serialize, Deserialize)] @@ -17,6 +18,12 @@ pub struct ProjectConfig { pub struct TargetConfig { pub uri: String, pub bearer_token_env: Option, + /// Per-graph Cedar policy file (MR-668). In single-graph mode this + /// field is unused — the top-level `policy.file` applies. In + /// multi-graph mode, each `graphs..policy.file` governs that + /// graph's HTTP-layer Cedar enforcement. + #[serde(default)] + pub policy: PolicySettings, } #[derive(Debug, Clone, Copy, Default, Eq, PartialEq, Serialize, Deserialize, ValueEnum)] @@ -59,6 +66,12 @@ pub struct ServerDefaults { #[serde(rename = "graph")] pub graph: Option, pub bind: Option, + /// Server-level Cedar policy (MR-668). Governs management endpoints + /// — currently `GET /graphs`; future runtime add/remove endpoints + /// will plug in here too. In single-graph mode this is unused — the + /// top-level `policy.file` covers the single graph. + #[serde(default)] + pub policy: PolicySettings, } #[derive(Debug, Clone, Default, Serialize, Deserialize)] @@ -197,23 +210,46 @@ impl OmnigraphConfig { } pub fn resolve_auth_env_file(&self) -> Option { - let path = self.auth.env_file.as_deref()?; - let path = Path::new(path); - Some(if path.is_absolute() { - path.to_path_buf() - } else { - self.base_dir.join(path) - }) + self.auth + .env_file + .as_deref() + .map(|path| self.resolve_config_path(path)) } pub fn resolve_policy_file(&self) -> Option { - let path = self.policy.file.as_deref()?; - let path = Path::new(path); - Some(if path.is_absolute() { - path.to_path_buf() - } else { - self.base_dir.join(path) - }) + self.policy + .file + .as_deref() + .map(|path| self.resolve_config_path(path)) + } + + /// Resolve the per-graph policy file path for the named target, + /// relative to the config file's `base_dir`. Returns `None` if the + /// target is unknown or no per-graph `policy.file` is set. + pub fn resolve_target_policy_file(&self, target_name: &str) -> Option { + let target = self.graphs.get(target_name)?; + target + .policy + .file + .as_deref() + .map(|path| self.resolve_config_path(path)) + } + + /// Resolve the server-level policy file path (used by management + /// endpoints). Returns `None` if `server.policy.file` is not set. + pub fn resolve_server_policy_file(&self) -> Option { + self.server + .policy + .file + .as_deref() + .map(|path| self.resolve_config_path(path)) + } + + /// Resolve a raw config-supplied URI (which may be relative) to its + /// absolute form. URIs containing `://` are passed through as-is; + /// relative paths are joined with the config file's `base_dir`. + pub fn resolve_uri_value(&self, value: &str) -> String { + self.resolve_config_uri(value) } pub fn resolve_policy_tests_file(&self) -> Option { @@ -282,6 +318,15 @@ impl OmnigraphConfig { self.base_dir.join(path).to_string_lossy().to_string() } } + + fn resolve_config_path(&self, value: &str) -> PathBuf { + let path = Path::new(value); + if path.is_absolute() { + path.to_path_buf() + } else { + self.base_dir.join(path) + } + } } pub fn default_config_path() -> PathBuf { diff --git a/crates/omnigraph-server/src/graph_id.rs b/crates/omnigraph-server/src/graph_id.rs new file mode 100644 index 0000000..ffccd2a --- /dev/null +++ b/crates/omnigraph-server/src/graph_id.rs @@ -0,0 +1,254 @@ +//! `GraphId` — registry-level identity for a graph in multi-graph mode (MR-668). +//! +//! Validation lives in `GraphId::try_from(String)`; nothing else can construct a +//! `GraphId`. The newtype prevents `graph_id` strings from escaping the storage +//! root via path traversal or colliding with engine-reserved filenames. +//! +//! Regex: `^[a-zA-Z0-9-]{1,64}$` +//! +//! The engine reserves every filename starting with `_` at the graph root +//! (`_schema.pg`, `_schema.ir.json`, `__schema_state.json`, `__manifest/`, +//! `__recovery/`, etc.). Disallowing leading underscores at the regex level +//! means a `graph_id` can never collide with engine-managed files. Path +//! traversal (`..`, `/`) is unrepresentable. +//! +//! `policies` is additionally reserved as a future-proofing measure for a +//! potential `/graphs/policies/...` cluster route. + +use std::fmt; +use std::sync::OnceLock; + +use color_eyre::eyre::{Result, bail}; +use regex::Regex; +use serde::{Deserialize, Serialize}; + +/// Maximum length of a `GraphId` value. +pub const GRAPH_ID_MAX_LEN: usize = 64; + +/// Validated registry-level identity for a graph. +/// +/// Constructed only via `GraphId::try_from(String)` or +/// `GraphId::try_from(&str)`. The inner `String` is private to enforce the +/// validation contract. +#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize)] +#[serde(transparent)] +pub struct GraphId(String); + +impl GraphId { + /// View the validated identifier as `&str`. + pub fn as_str(&self) -> &str { + &self.0 + } +} + +impl fmt::Display for GraphId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +impl AsRef for GraphId { + fn as_ref(&self) -> &str { + &self.0 + } +} + +impl TryFrom for GraphId { + type Error = color_eyre::eyre::Error; + + fn try_from(value: String) -> Result { + validate(value.as_str())?; + Ok(Self(value)) + } +} + +impl TryFrom<&str> for GraphId { + type Error = color_eyre::eyre::Error; + + fn try_from(value: &str) -> Result { + validate(value)?; + Ok(Self(value.to_string())) + } +} + +// Custom Deserialize that re-runs validation. Otherwise a serde-derived impl +// would accept any String, defeating the newtype's guarantee. +impl<'de> Deserialize<'de> for GraphId { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + Self::try_from(s).map_err(serde::de::Error::custom) + } +} + +fn validate(value: &str) -> Result<()> { + if value.is_empty() { + bail!("graph_id must not be empty"); + } + if value.len() > GRAPH_ID_MAX_LEN { + bail!( + "graph_id '{}' is {} chars; max {}", + value, + value.len(), + GRAPH_ID_MAX_LEN + ); + } + if !regex().is_match(value) { + bail!( + "graph_id '{}' must match ^[a-zA-Z0-9-]{{1,64}}$ — \ + no underscores (engine reserves them), no path separators, no unicode", + value + ); + } + if is_reserved(value) { + bail!( + "graph_id '{}' is reserved (would collide with engine-managed names or \ + future cluster routes)", + value + ); + } + Ok(()) +} + +fn regex() -> &'static Regex { + static RE: OnceLock = OnceLock::new(); + RE.get_or_init(|| Regex::new(r"^[a-zA-Z0-9-]{1,64}$").expect("regex literal")) +} + +/// Reserved `graph_id` values that the regex alone wouldn't catch. +/// The leading-underscore rule already excludes every engine-managed +/// filename pattern (`_schema.pg`, `__manifest`, etc.); the regex +/// `^[a-zA-Z0-9-]{1,64}$` (see `regex()`) additionally rejects every +/// dot-containing name structurally — `openapi.json` and friends +/// never reach this check. +/// +/// This list only needs to cover route-prefix collisions and +/// top-level endpoint names whose spellings DO satisfy the regex +/// (no dots, no underscores). +fn is_reserved(value: &str) -> bool { + matches!(value, "policies" | "healthz" | "openapi" | "graphs") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_simple_alphanumeric_ids() { + for ok in ["alpha", "beta", "tenant-001", "A", "g", "X-9-z"] { + GraphId::try_from(ok).unwrap_or_else(|_| panic!("expected accept: {ok}")); + } + } + + #[test] + fn accepts_64_char_max() { + let max = "a".repeat(64); + GraphId::try_from(max.as_str()).unwrap(); + } + + #[test] + fn rejects_empty() { + assert!(GraphId::try_from("").is_err()); + } + + #[test] + fn rejects_over_64_chars() { + let too_long = "a".repeat(65); + assert!(GraphId::try_from(too_long.as_str()).is_err()); + } + + #[test] + fn rejects_leading_underscore() { + // Engine reserves every `_*` filename at the graph root. + assert!(GraphId::try_from("_internal").is_err()); + assert!(GraphId::try_from("__manifest").is_err()); + } + + #[test] + fn rejects_underscores_anywhere() { + // The regex doesn't allow `_` at all — keeps the disallow-leading-`_` + // rule cheap to enforce. If the rule changes later, we'd need to + // distinguish "starts with `_`" from "contains `_`". + assert!(GraphId::try_from("tenant_alpha").is_err()); + } + + #[test] + fn rejects_path_separators() { + for bad in ["alpha/beta", "../etc", "..", "alpha\\beta"] { + assert!(GraphId::try_from(bad).is_err(), "expected reject: {bad}"); + } + } + + #[test] + fn rejects_unicode() { + assert!(GraphId::try_from("αlpha").is_err()); + assert!(GraphId::try_from("graph-✨").is_err()); + } + + #[test] + fn rejects_whitespace() { + assert!(GraphId::try_from(" alpha").is_err()); + assert!(GraphId::try_from("alpha ").is_err()); + assert!(GraphId::try_from("alpha beta").is_err()); + assert!(GraphId::try_from("\talpha").is_err()); + } + + #[test] + fn rejects_dots() { + // Reserves the "extension"-shaped ids that look like filenames. + assert!(GraphId::try_from(".").is_err()); + assert!(GraphId::try_from("alpha.beta").is_err()); + assert!(GraphId::try_from("alpha.").is_err()); + } + + #[test] + fn rejects_reserved_route_names() { + // Names that satisfy the regex but are still reserved because + // they'd collide with top-level route prefixes / endpoint names. + // Dot-containing names (e.g. `openapi.json`) are rejected by the + // regex, not this list — `rejects_dots` above covers them. + for bad in ["policies", "healthz", "openapi", "graphs"] { + assert!( + GraphId::try_from(bad).is_err(), + "expected reject (reserved): {bad}" + ); + } + } + + #[test] + fn display_returns_inner_string() { + let id = GraphId::try_from("alpha").unwrap(); + assert_eq!(format!("{id}"), "alpha"); + assert_eq!(id.as_str(), "alpha"); + } + + #[test] + fn serialize_round_trips_via_json() { + let id = GraphId::try_from("tenant-007").unwrap(); + let json = serde_json::to_string(&id).unwrap(); + assert_eq!(json, "\"tenant-007\""); + let back: GraphId = serde_json::from_str(&json).unwrap(); + assert_eq!(back, id); + } + + #[test] + fn deserialize_runs_validation() { + // Hostile payload must not produce a GraphId. + let bad = serde_json::from_str::("\"_evil\""); + assert!(bad.is_err()); + let bad = serde_json::from_str::("\"../../etc\""); + assert!(bad.is_err()); + } + + #[test] + fn hash_equality_works_for_use_as_map_key() { + use std::collections::HashMap; + let a = GraphId::try_from("alpha").unwrap(); + let b = GraphId::try_from("alpha").unwrap(); + let mut m = HashMap::new(); + m.insert(a, 1u32); + assert_eq!(m.get(&b), Some(&1)); + } +} diff --git a/crates/omnigraph-server/src/identity.rs b/crates/omnigraph-server/src/identity.rs new file mode 100644 index 0000000..250640d --- /dev/null +++ b/crates/omnigraph-server/src/identity.rs @@ -0,0 +1,308 @@ +//! Identity types for the multi-graph server (MR-668) + forward-compatible +//! shapes for Cloud mode (RFC 0003) and OAuth provider (RFC 0004). +//! +//! Per decision 13 in the implementation plan: ship the type shapes that +//! Cloud mode will consume, without committing to any trait shape +//! (`TokenVerifier` stays draft in RFC 0001). Every Cluster-mode call site +//! constructs these types with their Cluster-mode-specific values: +//! +//! - `tenant_id: None` (Cloud will set `Some(...)` from the OAuth `org_id` claim) +//! - `scopes: vec![Scope::Full]` (Cloud will populate from the OAuth `scope` claim) +//! - `source: AuthSource::Static` (Cloud / OIDC will set `AuthSource::Oidc`) +//! +//! The enums use `#[non_exhaustive]` so RFC 0001 step 1 / RFC 0004 can +//! add variants without breaking exhaustive matches in callers. + +use std::fmt; +use std::sync::Arc; +use std::sync::OnceLock; + +use color_eyre::eyre::{Result, bail}; +use regex::Regex; +use serde::{Deserialize, Serialize}; + +use crate::graph_id::GraphId; + +/// Maximum length of a `TenantId` value. +pub const TENANT_ID_MAX_LEN: usize = 64; + +/// Cloud-mode tenant identifier. Validated with the same regex as +/// `GraphId` so the two interchange syntactically. +/// +/// `None` in Cluster mode; Cloud mode (RFC 0003) sets `Some(...)` from +/// the OAuth `org_id` claim. Constructed only via `try_from` so callers +/// cannot bypass validation. +#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize)] +#[serde(transparent)] +pub struct TenantId(String); + +impl TenantId { + pub fn as_str(&self) -> &str { + &self.0 + } +} + +impl fmt::Display for TenantId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +impl AsRef for TenantId { + fn as_ref(&self) -> &str { + &self.0 + } +} + +impl TryFrom for TenantId { + type Error = color_eyre::eyre::Error; + + fn try_from(value: String) -> Result { + validate_tenant_id(value.as_str())?; + Ok(Self(value)) + } +} + +impl TryFrom<&str> for TenantId { + type Error = color_eyre::eyre::Error; + + fn try_from(value: &str) -> Result { + validate_tenant_id(value)?; + Ok(Self(value.to_string())) + } +} + +impl<'de> Deserialize<'de> for TenantId { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + Self::try_from(s).map_err(serde::de::Error::custom) + } +} + +fn validate_tenant_id(value: &str) -> Result<()> { + if value.is_empty() { + bail!("tenant_id must not be empty"); + } + if value.len() > TENANT_ID_MAX_LEN { + bail!( + "tenant_id '{}' is {} chars; max {}", + value, + value.len(), + TENANT_ID_MAX_LEN + ); + } + if !tenant_id_regex().is_match(value) { + bail!("tenant_id '{}' must match ^[a-zA-Z0-9-]{{1,64}}$", value); + } + Ok(()) +} + +fn tenant_id_regex() -> &'static Regex { + static RE: OnceLock = OnceLock::new(); + RE.get_or_init(|| Regex::new(r"^[a-zA-Z0-9-]{1,64}$").expect("regex literal")) +} + +/// Registry HashMap key. Cluster mode populates `tenant_id: None`; +/// Cloud mode (RFC 0003) populates `tenant_id: Some(...)`. +/// +/// The `Option` field is the **single forward-compatibility seam** +/// between Cluster and Cloud modes. Every handler reaches the engine via +/// `state.registry.get(&key)` — the key shape stays stable, so handlers +/// don't get re-touched when Cloud mode lands. +#[derive(Debug, Clone, Eq, PartialEq, Hash)] +pub struct GraphKey { + pub tenant_id: Option, + pub graph_id: GraphId, +} + +impl GraphKey { + /// Cluster-mode constructor (`tenant_id: None`). + pub fn cluster(graph_id: GraphId) -> Self { + Self { + tenant_id: None, + graph_id, + } + } + + /// Cloud-mode constructor — reserved for RFC 0003; included here so + /// the seam is visible even though no Cluster-mode code path calls it. + pub fn cloud(tenant_id: TenantId, graph_id: GraphId) -> Self { + Self { + tenant_id: Some(tenant_id), + graph_id, + } + } +} + +impl fmt::Display for GraphKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match &self.tenant_id { + Some(t) => write!(f, "{}/{}", t, self.graph_id), + None => write!(f, "{}", self.graph_id), + } + } +} + +/// Authorization scope. Cluster mode: every authenticated actor gets +/// `Scope::Full`. Cloud mode (RFC 0004) adds OAuth-style scopes via the +/// dashboard-configured `graph:read`, `graph:write`, `graph:admin`, +/// `graph:*` set; those become additional variants here. +/// +/// `#[non_exhaustive]` so RFC 0004 can extend without breaking matches. +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] +#[non_exhaustive] +pub enum Scope { + /// Full access. The Cluster-mode default — every authenticated actor + /// has unrestricted access subject to Cedar policy. + Full, +} + +/// How the actor was authenticated. Cluster mode: every actor authenticates +/// via the existing SHA-256 hash compare against a static token set, so +/// `AuthSource::Static`. RFC 0001 step 1 adds `AuthSource::Oidc` when the +/// `OidcJwtVerifier` ships. +/// +/// `#[non_exhaustive]` so RFC 0001 can extend without breaking matches. +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] +#[non_exhaustive] +pub enum AuthSource { + /// Authenticated via the static bearer-token hash table. + Static, +} + +/// Server-resolved actor identity. Replaces the previous +/// `AuthenticatedActor(Arc)` from `lib.rs`. +/// +/// The fields are populated by `authenticate_bearer_token` after a successful +/// constant-time hash match. **Clients cannot set any of these fields directly** +/// — this is the MR-731 invariant. See `authorize_request` in `lib.rs` for the +/// chokepoint that overwrites any client-supplied actor identity. +/// +/// Cluster mode constructs this with `tenant_id: None`, `scopes: vec![Scope::Full]`, +/// `source: AuthSource::Static` via the convenience constructor below. +#[derive(Debug, Clone)] +pub struct ResolvedActor { + pub actor_id: Arc, + pub tenant_id: Option, + pub scopes: Vec, + pub source: AuthSource, +} + +impl ResolvedActor { + /// Cluster-mode constructor — Static auth, no tenant, Full scope. + /// Used by `authenticate_bearer_token` after a successful hash match. + pub fn cluster_static(actor_id: Arc) -> Self { + Self { + actor_id, + tenant_id: None, + scopes: vec![Scope::Full], + source: AuthSource::Static, + } + } + + /// View the actor identifier as `&str`. Stable across the Cluster/Cloud + /// boundary — Cedar always sees this value as the principal. + pub fn actor_id_str(&self) -> &str { + &self.actor_id + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn tenant_id_accepts_simple_values() { + for ok in ["alpha", "tenant-001", "X", "01HZWA0KT0H0V0V0V0V0V0V0V0"] { + TenantId::try_from(ok).unwrap_or_else(|_| panic!("expected accept: {ok}")); + } + } + + #[test] + fn tenant_id_rejects_empty_and_over_max() { + assert!(TenantId::try_from("").is_err()); + let too_long = "a".repeat(65); + assert!(TenantId::try_from(too_long.as_str()).is_err()); + } + + #[test] + fn tenant_id_rejects_path_traversal() { + assert!(TenantId::try_from("../etc").is_err()); + assert!(TenantId::try_from("alpha/beta").is_err()); + } + + #[test] + fn tenant_id_deserialize_runs_validation() { + let bad: Result = serde_json::from_str("\"../evil\""); + assert!(bad.is_err()); + } + + #[test] + fn graph_key_cluster_constructor_sets_no_tenant() { + let id = GraphId::try_from("alpha").unwrap(); + let key = GraphKey::cluster(id.clone()); + assert!(key.tenant_id.is_none()); + assert_eq!(key.graph_id, id); + } + + #[test] + fn graph_key_cloud_constructor_sets_tenant() { + let tenant = TenantId::try_from("acme").unwrap(); + let id = GraphId::try_from("alpha").unwrap(); + let key = GraphKey::cloud(tenant.clone(), id.clone()); + assert_eq!(key.tenant_id.as_ref(), Some(&tenant)); + assert_eq!(key.graph_id, id); + } + + #[test] + fn graph_key_displays_with_or_without_tenant() { + let id = GraphId::try_from("alpha").unwrap(); + let cluster_key = GraphKey::cluster(id.clone()); + assert_eq!(format!("{cluster_key}"), "alpha"); + + let tenant = TenantId::try_from("acme").unwrap(); + let cloud_key = GraphKey::cloud(tenant, id); + assert_eq!(format!("{cloud_key}"), "acme/alpha"); + } + + #[test] + fn graph_key_is_hashable_for_map_use() { + use std::collections::HashMap; + let a = GraphKey::cluster(GraphId::try_from("alpha").unwrap()); + let b = GraphKey::cluster(GraphId::try_from("alpha").unwrap()); + let mut m: HashMap = HashMap::new(); + m.insert(a, 1); + assert_eq!(m.get(&b), Some(&1)); + } + + #[test] + fn graph_key_distinguishes_tenants() { + let id = GraphId::try_from("alpha").unwrap(); + let t1 = TenantId::try_from("acme").unwrap(); + let t2 = TenantId::try_from("globex").unwrap(); + let k1 = GraphKey::cloud(t1, id.clone()); + let k2 = GraphKey::cloud(t2, id); + assert_ne!(k1, k2); + } + + #[test] + fn resolved_actor_cluster_defaults() { + let actor = ResolvedActor::cluster_static(Arc::::from("act-alice")); + assert_eq!(actor.actor_id_str(), "act-alice"); + assert!(actor.tenant_id.is_none()); + assert_eq!(actor.scopes, vec![Scope::Full]); + assert_eq!(actor.source, AuthSource::Static); + } + + #[test] + fn scope_and_auth_source_are_non_exhaustive() { + // Regression: keep the `#[non_exhaustive]` annotation. If someone + // removes it, this test still passes (matches are still legal); it's + // the cross-crate compile that catches it. Document the contract here. + let _scope = Scope::Full; + let _src = AuthSource::Static; + } +} diff --git a/crates/omnigraph-server/src/lib.rs b/crates/omnigraph-server/src/lib.rs index 934986f..6e3df89 100644 --- a/crates/omnigraph-server/src/lib.rs +++ b/crates/omnigraph-server/src/lib.rs @@ -1,9 +1,16 @@ pub mod api; pub mod auth; pub mod config; +pub mod graph_id; +pub mod identity; pub mod policy; +pub mod registry; pub mod workload; +pub use graph_id::GraphId; +pub use identity::{AuthSource, GraphKey, ResolvedActor, Scope, TenantId}; +pub use registry::{GraphHandle, GraphRegistry, InsertError, RegistryLookup, RegistrySnapshot}; + use std::collections::{HashMap, HashSet}; use std::fs; use std::io; @@ -14,14 +21,15 @@ use std::sync::Arc; use api::{ BranchCreateOutput, BranchCreateRequest, BranchDeleteOutput, BranchListOutput, BranchMergeOutput, BranchMergeRequest, ChangeOutput, ChangeRequest, CommitListOutput, - CommitListQuery, ErrorCode, ErrorOutput, ExportRequest, HealthOutput, IngestOutput, - IngestRequest, ReadOutput, ReadRequest, SchemaApplyOutput, SchemaApplyRequest, SchemaOutput, - SnapshotQuery, ingest_output, schema_apply_output, snapshot_payload, + CommitListQuery, ErrorCode, ErrorOutput, ExportRequest, GraphInfo, GraphListResponse, + HealthOutput, IngestOutput, IngestRequest, ReadOutput, ReadRequest, SchemaApplyOutput, + SchemaApplyRequest, SchemaOutput, SnapshotQuery, ingest_output, schema_apply_output, + snapshot_payload, }; pub use auth::{AWS_SECRET_ENV, EnvOrFileTokenSource, TokenSource, resolve_token_source}; use axum::body::{Body, Bytes}; use axum::extract::DefaultBodyLimit; -use axum::extract::{Extension, Path, Query, Request, State}; +use axum::extract::{Extension, OriginalUri, Path, Query, Request, State}; use axum::http::StatusCode; use axum::http::header::{AUTHORIZATION, CONTENT_TYPE}; use axum::middleware::{self, Next}; @@ -37,13 +45,15 @@ pub use config::{ use futures::stream; use omnigraph::db::{Omnigraph, ReadTarget}; use omnigraph::error::{ManifestConflictDetails, ManifestErrorKind, OmniError}; +use omnigraph::storage::normalize_root_uri; use omnigraph_compiler::json_params_to_param_map; use omnigraph_compiler::query::parser::parse_query; use omnigraph_compiler::{JsonParamMode, ParamMap}; pub use policy::{ PolicyAction, PolicyCompiler, PolicyConfig, PolicyDecision, PolicyEngine, PolicyExpectation, - PolicyRequest, PolicyTestConfig, + PolicyRequest, PolicyResourceKind, PolicyTestConfig, }; +use serde::Deserialize; use serde_json::Value; use sha2::{Digest, Sha256}; use subtle::ConstantTimeEq; @@ -53,6 +63,8 @@ use tower_http::trace::TraceLayer; use tracing::{error, info, warn}; use tracing_subscriber::EnvFilter; use utoipa::OpenApi; +use utoipa::openapi::path::{Parameter, ParameterIn}; +use utoipa::openapi::schema::{Object, Type}; use utoipa::openapi::security::{Http, HttpAuthScheme, SecurityScheme}; type BearerTokenHash = [u8; 32]; @@ -72,6 +84,7 @@ fn hash_bearer_token(token: &str) -> BearerTokenHash { ), paths( server_health, + server_graphs_list, server_snapshot, server_read, server_export, @@ -111,9 +124,13 @@ const SERVER_SOURCE_VERSION: Option<&str> = option_env!("OMNIGRAPH_SOURCE_VERSIO #[derive(Debug, Clone)] pub struct ServerConfig { - pub uri: String, + /// Server topology + the graphs to open at startup. Single-mode + /// invocations (`omnigraph-server ` or `--target `) + /// produce `ServerConfigMode::Single`; multi-mode invocations + /// (`--config omnigraph.yaml` with a non-empty `graphs:` map and + /// no single-mode selector) produce `ServerConfigMode::Multi`. + pub mode: ServerConfigMode, pub bind: String, - pub policy_file: Option, /// Operator opt-in for fully-unauthenticated dev mode (MR-723). /// When neither bearer tokens nor a policy file are configured, /// `serve()` refuses to start unless this is true (set via @@ -125,23 +142,100 @@ pub struct ServerConfig { pub allow_unauthenticated: bool, } -#[derive(Clone)] -pub struct AppState { - uri: String, - /// PR 2 (MR-686): the engine is now `Arc` — no global - /// write lock. Concurrent handlers call `&self` engine APIs - /// directly. Per-(table, branch) write queues inside the engine - /// serialize same-key writers; per-actor admission control on - /// `workload` isolates noisy actors. - engine: Arc, - /// Per-actor admission control. See `workload::WorkloadController`. - workload: Arc, - bearer_tokens: Arc<[(BearerTokenHash, Arc)]>, - policy_engine: Option>, +/// What `load_server_settings` produces after applying the four-rule +/// mode inference matrix (MR-668 decision 2). +#[derive(Debug, Clone)] +pub enum ServerConfigMode { + /// Legacy invocation — one graph at the given URI. Either: + /// * `omnigraph-server ` (CLI positional), or + /// * `omnigraph-server --target --config omnigraph.yaml`, or + /// * `omnigraph-server --config omnigraph.yaml` with `server.graph` + /// set to a named target. + Single { + uri: String, + /// Top-level `policy.file` (single-graph Cedar policy). + policy_file: Option, + }, + /// Multi-graph invocation — `--config omnigraph.yaml` with a + /// non-empty `graphs:` map and no single-mode selector. + Multi { + /// Per-graph startup configs, sorted by graph id (BTreeMap + /// iteration order). The parallel-open loop iterates this. + graphs: Vec, + /// Path to the config file the server was started from. Kept on + /// the mode so future runtime mutation (deferred — see release + /// notes) can locate the source of truth without re-parsing CLI + /// args. + config_path: PathBuf, + /// `server.policy.file` (server-level Cedar policy for the + /// management endpoints). Wired into `GET /graphs` authorization. + server_policy_file: Option, + }, } +/// One graph's startup-time configuration: id, opened URI, optional +/// per-graph policy file path. Constructed by `load_server_settings` +/// in multi mode; consumed by `serve`'s parallel open loop. #[derive(Debug, Clone)] -struct AuthenticatedActor(Arc); +pub struct GraphStartupConfig { + pub graph_id: String, + pub uri: String, + pub policy_file: Option, +} + +/// Runtime routing for the server. Single mode = legacy +/// `omnigraph-server ` invocation, one graph, flat HTTP routes. +/// Multi mode = `--config omnigraph.yaml` with a non-empty `graphs:` +/// map, N graphs, cluster routes (`/graphs/{graph_id}/...`). Mode is +/// determined at startup by `load_server_settings`. +/// +/// In single mode the handle lives here directly — there is no +/// registry, no sentinel key, no walk-and-assert. In multi mode the +/// registry carries N handles and the middleware dispatches on the +/// URL's `{graph_id}` segment. +/// +/// Both modes share the same handler bodies — the routing middleware +/// (`resolve_graph_handle`) injects `Arc` as a request +/// extension so handlers never see the routing discriminator. +#[derive(Clone)] +pub enum GraphRouting { + /// Single-graph deployment: one handle, flat routes (`/snapshot`, + /// `/read`, …). The `handle.uri` field carries the URI the engine + /// was opened from. Backward compatible with v0.6.0 deployments. + Single { handle: Arc }, + /// Multi-graph deployment: many handles, cluster routes + /// (`/graphs/{graph_id}/...`). `config_path` is the `omnigraph.yaml` + /// the server reads at startup; preserved here so future runtime + /// mutation (deferred) can find the source of truth without + /// re-parsing CLI args. The server treats the file as + /// operator-owned and never writes it. + Multi { + registry: Arc, + config_path: Option, + }, +} + +#[derive(Clone)] +pub struct AppState { + /// Runtime routing — the single source of truth for where each + /// request's graph lives. Single mode holds the handle directly; + /// multi mode holds the registry + config path. Both arms are + /// the same shape from a handler's perspective: middleware + /// extracts an `Arc` and injects it as a request + /// extension. + routing: GraphRouting, + /// Per-actor admission control. Process-wide (not per-graph) — + /// see MR-668 decision Q6. + workload: Arc, + bearer_tokens: Arc<[(BearerTokenHash, Arc)]>, + /// Server-level Cedar policy. Used by management endpoints (`POST + /// /graphs`, `GET /graphs`) which act on the registry resource, + /// not on a per-graph resource. Loaded from `server.policy.file` + /// in `omnigraph.yaml`. `None` outside multi mode and when no + /// server policy is configured. Per-graph policies live on each + /// `GraphHandle.policy`. + server_policy: Option>, +} struct ExportStreamWriter { sender: mpsc::UnboundedSender>, @@ -160,12 +254,6 @@ impl Write for ExportStreamWriter { } } -impl AuthenticatedActor { - fn as_str(&self) -> &str { - &self.0 - } -} - #[derive(Debug)] pub struct ApiError { status: StatusCode, @@ -176,8 +264,34 @@ pub struct ApiError { } impl AppState { + /// Canonical single-mode constructor. Every other `new_*` / `open_*` + /// helper is a thin convenience wrapper around this one. Builds the + /// engine + per-graph policy through `build_single_mode`, which + /// applies `Omnigraph::with_policy` so HTTP-layer and engine-layer + /// policy can never diverge — there is no "policy installed on HTTP + /// but not on engine" representable state (closes the prior + /// `with_policy_engine` footgun that reused the engine `Arc` + /// without re-applying `with_policy`). + pub fn new_single( + uri: String, + db: Omnigraph, + bearer_tokens: Vec<(String, String)>, + policy_engine: Option, + workload: workload::WorkloadController, + ) -> Self { + let bearer_tokens = hash_bearer_tokens(bearer_tokens); + let per_graph_policy = policy_engine.map(Arc::new); + Self::build_single_mode(uri, db, bearer_tokens, per_graph_policy, Arc::new(workload)) + } + pub fn new(uri: String, db: Omnigraph) -> Self { - Self::new_with_bearer_tokens(uri, db, Vec::new()) + Self::new_single( + uri, + db, + Vec::new(), + None, + workload::WorkloadController::from_env(), + ) } pub fn new_with_bearer_token(uri: String, db: Omnigraph, bearer_token: Option) -> Self { @@ -193,7 +307,13 @@ impl AppState { db: Omnigraph, bearer_tokens: Vec<(String, String)>, ) -> Self { - Self::new_with_bearer_tokens_and_policy(uri, db, bearer_tokens, None) + Self::new_single( + uri, + db, + bearer_tokens, + None, + workload::WorkloadController::from_env(), + ) } pub fn new_with_bearer_tokens_and_policy( @@ -202,68 +322,27 @@ impl AppState { bearer_tokens: Vec<(String, String)>, policy_engine: Option, ) -> Self { - let bearer_tokens: Vec<(BearerTokenHash, Arc)> = bearer_tokens - .into_iter() - .map(|(actor, token)| (hash_bearer_token(&token), Arc::::from(actor))) - .collect(); - let policy_engine: Option> = policy_engine.map(Arc::new); - // MR-722 chassis: inject the policy checker into the engine so - // `Omnigraph::apply_schema_as` (and PR #3's fan-out of the - // remaining writers) gates at engine-layer too. HTTP-layer - // `authorize_request` still fires first; the engine-layer gate - // is the redundant-but-correct backstop, plus the only path - // that protects SDK / embedded callers. PR #3 removes the HTTP - // redundancy once we're confident the engine gate covers it. - let db = if let Some(engine) = policy_engine.as_ref() { - // Unsizing coercion: Arc → Arc. - // Needs the explicit `as` cast — Rust 2024 doesn't infer it through - // `Arc::clone`. - let checker = Arc::clone(engine) as Arc; - db.with_policy(checker) - } else { - db - }; - Self { + Self::new_single( uri, - engine: Arc::new(db), - workload: Arc::new(workload::WorkloadController::from_env()), - bearer_tokens: Arc::from(bearer_tokens), + db, + bearer_tokens, policy_engine, - } + workload::WorkloadController::from_env(), + ) } /// Construct with a caller-provided [`workload::WorkloadController`]. /// Tests and benches use this to override per-actor caps without - /// mutating global env vars (which is unsafe in Rust 2024 once the - /// async runtime is up — `setenv` isn't thread-safe). + /// mutating global env vars (unsafe in Rust 2024 once the async + /// runtime is up — `setenv` isn't thread-safe). For tests that also + /// need a custom `PolicyEngine`, use [`new_single`] directly. pub fn new_with_workload( uri: String, db: Omnigraph, bearer_tokens: Vec<(String, String)>, workload: workload::WorkloadController, ) -> Self { - let bearer_tokens: Vec<(BearerTokenHash, Arc)> = bearer_tokens - .into_iter() - .map(|(actor, token)| (hash_bearer_token(&token), Arc::::from(actor))) - .collect(); - Self { - uri, - engine: Arc::new(db), - workload: Arc::new(workload), - bearer_tokens: Arc::from(bearer_tokens), - policy_engine: None, - } - } - - /// Install a `PolicyEngine` post-construction (MR-723). Used by - /// integration tests that need to thread custom workload limits - /// alongside a permit-all policy — the existing `new_with_*` and - /// `new_with_workload` constructors don't compose. Production - /// callers should use `open_with_bearer_tokens_and_policy` which - /// installs the policy on both the HTTP state and the engine. - pub fn with_policy_engine(mut self, engine: PolicyEngine) -> Self { - self.policy_engine = Some(Arc::new(engine)); - self + Self::new_single(uri, db, bearer_tokens, None, workload) } pub async fn open(uri: impl Into) -> Result { @@ -285,7 +364,7 @@ impl AppState { uri: impl Into, bearer_tokens: Vec<(String, String)>, ) -> Result { - let uri = uri.into(); + let uri = normalize_root_uri(&uri.into()).wrap_err("normalize graph URI")?; let db = Omnigraph::open(&uri).await?; Ok(Self::new_with_bearer_tokens(uri, db, bearer_tokens)) } @@ -295,15 +374,17 @@ impl AppState { bearer_tokens: Vec<(String, String)>, policy_file: Option<&PathBuf>, ) -> Result { - let uri = uri.into(); + // The "policy requires tokens" invariant is enforced once by + // `classify_server_runtime_state` in `serve()`, before either + // single-mode or multi-mode construction is reached. By the + // time we get here, the (policy, no-tokens) combination has + // already been rejected — no second bail needed. + let uri = normalize_root_uri(&uri.into()).wrap_err("normalize graph URI")?; let db = Omnigraph::open(&uri).await?; let policy_engine = match policy_file { - Some(path) => Some(PolicyEngine::load(path, &uri)?), + Some(path) => Some(PolicyEngine::load_graph(path, &uri)?), None => None, }; - if policy_engine.is_some() && bearer_tokens.is_empty() { - bail!("policy requires at least one configured bearer token actor"); - } Ok(Self::new_with_bearer_tokens_and_policy( uri, db, @@ -312,15 +393,109 @@ impl AppState { )) } - pub fn uri(&self) -> &str { - &self.uri + /// Single-mode shared construction: wraps the bare engine + per-graph + /// policy in a `GraphHandle` carried directly by `GraphRouting::Single`. + /// Per-graph policy enforcement on the engine (MR-722) is re-applied + /// via `Omnigraph::with_policy` so HTTP and engine layers can never + /// diverge. + fn build_single_mode( + uri: String, + db: Omnigraph, + bearer_tokens: Arc<[(BearerTokenHash, Arc)]>, + policy_engine: Option>, + workload: Arc, + ) -> Self { + // Engine-layer policy gate (MR-722). With a per-graph policy + // installed, every `_as` writer on `Omnigraph` calls into the + // PolicyChecker. HTTP-layer `authorize_request` is the first + // gate; engine-layer is the redundant-but-correct backstop. + let db = if let Some(policy) = policy_engine.as_ref() { + let checker = Arc::clone(policy) as Arc; + db.with_policy(checker) + } else { + db + }; + // `GraphHandle.key` is required by the struct, but in single + // mode it is never a registry key (there's no registry) and + // never compared against user input (routes are flat, no + // `{graph_id}` parameter). The label appears only in tracing + // output from `resolve_graph_handle`. The literal below is a + // log label, not a routing key — when the future cluster + // catalog ships, single mode may carry the catalog-assigned + // id here instead. + let uri = normalize_root_uri(&uri).unwrap_or(uri); + let key = GraphKey::cluster( + GraphId::try_from("default").expect("'default' is a valid GraphId log label"), + ); + let handle = Arc::new(GraphHandle { + key, + uri, + engine: Arc::new(db), + policy: policy_engine, + }); + Self { + routing: GraphRouting::Single { handle }, + workload, + bearer_tokens, + server_policy: None, + } + } + + /// Multi-mode constructor — used by the startup loop. Operators + /// reach this by invoking `omnigraph-server --config omnigraph.yaml` + /// with a non-empty `graphs:` map. + /// + /// Caller supplies the already-opened `GraphHandle`s and (optionally) + /// the path to the source config file. `server_policy` is loaded + /// from `server.policy.file` if configured. + pub fn new_multi( + handles: Vec>, + bearer_tokens: Vec<(String, String)>, + server_policy: Option, + workload: workload::WorkloadController, + config_path: Option, + ) -> std::result::Result { + let bearer_tokens = hash_bearer_tokens(bearer_tokens); + let registry = Arc::new(GraphRegistry::from_handles(handles)?); + Ok(Self { + routing: GraphRouting::Multi { + registry, + config_path, + }, + workload: Arc::new(workload), + bearer_tokens, + server_policy: server_policy.map(Arc::new), + }) + } + + /// Runtime routing accessor. Handlers don't typically inspect this — + /// they extract `Arc` via the routing middleware — but + /// `build_app` matches on it to decide flat vs nested route + /// mounting, and a handful of management endpoints (`GET /graphs`, + /// the OpenAPI cluster rewrite) match on the discriminant. + pub fn routing(&self) -> &GraphRouting { + &self.routing } fn requires_bearer_auth(&self) -> bool { - !self.bearer_tokens.is_empty() || self.policy_engine.is_some() + if !self.bearer_tokens.is_empty() { + return true; + } + if self.server_policy.is_some() { + return true; + } + // Any per-graph policy also requires auth — otherwise the + // policy gate would receive unauthenticated requests. Reading + // from `routing` is O(1) in both arms: single mode is a direct + // `handle.policy.is_some()` check, multi mode reads the + // cached `any_per_graph_policy` flag on the registry snapshot. + match &self.routing { + GraphRouting::Single { handle } => handle.policy.is_some(), + GraphRouting::Multi { registry, .. } => registry.snapshot_ref().any_per_graph_policy, + } } - fn authenticate_bearer_token(&self, provided_token: &str) -> Option> { + fn authenticate_bearer_token(&self, provided_token: &str) -> Option { // Hash the incoming token and compare against every stored digest in // constant time. Iterate all entries unconditionally so total work — // and therefore response timing — doesn't depend on which slot matches. @@ -331,12 +506,16 @@ impl AppState { matched = Some(Arc::clone(actor)); } } - matched + matched.map(ResolvedActor::cluster_static) } +} - fn policy_engine(&self) -> Option<&PolicyEngine> { - self.policy_engine.as_deref() - } +fn hash_bearer_tokens(bearer_tokens: Vec<(String, String)>) -> Arc<[(BearerTokenHash, Arc)]> { + let tokens: Vec<(BearerTokenHash, Arc)> = bearer_tokens + .into_iter() + .map(|(actor, token)| (hash_bearer_token(&token), Arc::::from(actor))) + .collect(); + Arc::from(tokens) } impl ApiError { @@ -380,6 +559,20 @@ impl ApiError { } } + /// HTTP 405 Method Not Allowed. Used when the route is mounted but + /// the active server mode doesn't serve it (`GET /graphs` in + /// single-graph mode returns this instead of 404 so clients can + /// distinguish "wrong context" from "no such resource"). + pub fn method_not_allowed(message: impl Into) -> Self { + Self { + status: StatusCode::METHOD_NOT_ALLOWED, + code: ErrorCode::MethodNotAllowed, + message: message.into(), + merge_conflicts: Vec::new(), + manifest_conflict: None, + } + } + pub fn conflict(message: impl Into) -> Self { Self { status: StatusCode::CONFLICT, @@ -484,6 +677,12 @@ impl ApiError { // engine gate fires, the bearer is valid — any failure from // the engine is a policy outcome, not an auth one. OmniError::Policy(message) => Self::forbidden(message), + // `Omnigraph::init` against an existing graph URI in strict + // mode. Not currently HTTP-reachable (POST /graphs was + // pulled), but mapping is wired so the variant has a + // single canonical translation when a future runtime + // create endpoint lands. + err @ OmniError::AlreadyInitialized { .. } => Self::conflict(err.to_string()), } } } @@ -555,10 +754,7 @@ pub fn load_server_settings( cli_allow_unauthenticated: bool, ) -> Result { let config = load_config(config_path)?; - let uri = - config.resolve_target_uri(cli_uri, cli_target.as_deref(), config.server_graph_name())?; let bind = cli_bind.unwrap_or_else(|| config.server_bind().to_string()); - let policy_file = config.resolve_policy_file(); // Either `--unauthenticated` or `OMNIGRAPH_UNAUTHENTICATED=1` flips // this. Treat any non-empty, non-"0"/"false" string as truthy — // standard 12-factor "any value is true" reading of the env var. @@ -571,14 +767,96 @@ pub fn load_server_settings( .unwrap_or(false); let allow_unauthenticated = cli_allow_unauthenticated || env_unauth; + // MR-668 decision 2 — four-rule mode inference matrix. + // + // 1. CLI `` positional → Single (URI = the value) + // 2. CLI `--target ` → Single (URI = graphs..uri) + // 3. `server.graph` in config → Single (URI = graphs..uri) + // 4. `--config` + non-empty `graphs:` + no single-mode selector + // → Multi (every entry in `graphs:`) + // 5. otherwise → error with migration hint + // + // Rules 1-3 are mutually compatible (CLI URI wins over `--target` + // wins over `server.graph`), reusing the existing + // `resolve_target_uri` precedence. + let has_cli_uri = cli_uri.is_some(); + let has_cli_target = cli_target.is_some(); + let has_server_graph = config.server_graph_name().is_some(); + let has_graphs_map = !config.graphs.is_empty(); + let has_explicit_config = config_path.is_some(); + + let mode = if has_cli_uri || has_cli_target || has_server_graph { + // Rules 1, 2, or 3 → Single mode. + let raw_uri = config.resolve_target_uri( + cli_uri, + cli_target.as_deref(), + config.server_graph_name(), + )?; + let uri = normalize_root_uri(&raw_uri).wrap_err_with(|| { + format!("normalize single-graph URI '{raw_uri}' from server settings") + })?; + let policy_file = config.resolve_policy_file(); + ServerConfigMode::Single { uri, policy_file } + } else if has_explicit_config && has_graphs_map { + if config.resolve_policy_file().is_some() { + bail!( + "top-level `policy.file` is single-graph/CLI-local policy only; \ + in multi-graph mode move per-graph rules to \ + `graphs..policy.file` and move `graph_list` rules to \ + `server.policy.file`." + ); + } + // Rule 4 → Multi mode. Build a startup config per graph. + let mut graphs = Vec::with_capacity(config.graphs.len()); + for (name, target) in &config.graphs { + // Validate the graph id can construct a `GraphId` newtype. + // Doing this here (not at registry insert) so a malformed + // omnigraph.yaml fails at startup with a clear error. + GraphId::try_from(name.clone()).map_err(|err| { + color_eyre::eyre::eyre!("invalid graph id '{name}' in omnigraph.yaml: {err}") + })?; + let raw_uri = config.resolve_uri_value(&target.uri); + let uri = normalize_root_uri(&raw_uri).wrap_err_with(|| { + format!("normalize URI '{raw_uri}' for graph '{name}' in omnigraph.yaml") + })?; + graphs.push(GraphStartupConfig { + graph_id: name.clone(), + uri, + policy_file: config.resolve_target_policy_file(name), + }); + } + let config_path = config_path + .cloned() + .expect("has_explicit_config implies config_path is Some"); + let server_policy_file = config.resolve_server_policy_file(); + ServerConfigMode::Multi { + graphs, + config_path, + server_policy_file, + } + } else { + // Rule 5 → error with migration hint. + bail!( + "no graph to serve: pass a URI (`omnigraph-server `), select a target \ + (`--target --config omnigraph.yaml`), set `server.graph: ` in \ + omnigraph.yaml, or for multi-graph mode add a `graphs:` map to the config \ + file referenced by `--config`." + ); + }; + Ok(ServerConfig { - uri, + mode, bind, - policy_file, allow_unauthenticated, }) } +/// Whether the loaded config will run the server in multi-graph mode. +/// Useful for the test that constructs `ServerConfig` directly. +pub fn server_config_is_multi(config: &ServerConfig) -> bool { + matches!(config.mode, ServerConfigMode::Multi { .. }) +} + /// MR-723 server runtime state, classified from the three-state matrix /// of (bearer tokens configured) × (policy file configured) at startup. /// @@ -591,10 +869,14 @@ pub fn load_server_settings( /// server requires a valid bearer token; once authenticated, every /// action except `Read` is denied with 403. Closes the "tokens but /// forgot the policy file" trap. -/// * **PolicyEnabled** — policy file configured. Cedar evaluates every -/// authenticated request. Tokens may also be configured (typical) or -/// not (unusual but valid — every request fails 401 without a -/// bearer, which is effectively "locked"). +/// * **PolicyEnabled** — policy file configured and at least one +/// bearer token configured. Cedar evaluates every authenticated +/// request. Policy without tokens is rejected at startup — +/// such a server would 401 every request, which is bug-shaped +/// rather than feature-shaped (operators wanting "deny all +/// unauthenticated traffic" should configure tokens plus a +/// deny-all policy to get meaningful 403s with policy-decision +/// logging instead). #[derive(Debug, Clone, Copy, Eq, PartialEq)] pub enum ServerRuntimeState { Open, @@ -603,8 +885,15 @@ pub enum ServerRuntimeState { } /// Compute the [`ServerRuntimeState`] from the configured inputs. -/// Pulled out as a pure function so the 3-state matrix is unit-testable +/// Pulled out as a pure function so the matrix is unit-testable /// without standing up the full server. +/// +/// The classifier is the **single source of truth** for "should we +/// start?" — both `serve()`'s single-mode and multi-mode branches +/// call this before constructing their `AppState`. Adding a startup +/// invariant here means both modes enforce it automatically; the +/// alternative (per-constructor `bail!`) drifts the moment a third +/// mode is added. pub fn classify_server_runtime_state( has_tokens: bool, has_policy: bool, @@ -619,12 +908,26 @@ pub fn classify_server_runtime_state( ), (false, false, true) => Ok(ServerRuntimeState::Open), (true, false, _) => Ok(ServerRuntimeState::DefaultDeny), - (_, true, _) => Ok(ServerRuntimeState::PolicyEnabled), + (false, true, _) => bail!( + "policy file is configured but no bearer tokens — every request would 401 \ + because no token can ever match. Configure at least one bearer token (see \ + docs/user/server.md), or remove the policy file. To deny all unauthenticated \ + traffic deliberately, configure tokens plus a deny-all Cedar rule — that \ + produces meaningful 403s with policy-decision logging instead of silent 401s." + ), + (true, true, _) => Ok(ServerRuntimeState::PolicyEnabled), } } pub fn build_app(state: AppState) -> Router { - let protected = Router::new() + // The per-graph protected routes, identical in single + multi mode. + // Two middleware layers wrap them (outer first, inner last): + // 1. `require_bearer_auth` — extracts the bearer token and injects + // `ResolvedActor` (or rejects 401). + // 2. `resolve_graph_handle` — injects `Arc` based on + // the active mode (single: the only handle; multi: lookup by + // `{graph_id}` in the URI path). + let per_graph_protected = Router::new() .route("/snapshot", get(server_snapshot)) .route("/export", post(server_export)) .route("/read", post(server_read)) @@ -643,11 +946,42 @@ pub fn build_app(state: AppState) -> Router { .route("/branches/merge", post(server_branch_merge)) .route("/commits", get(server_commit_list)) .route("/commits/{commit_id}", get(server_commit_show)) + .route_layer(middleware::from_fn_with_state( + state.clone(), + resolve_graph_handle, + )) .route_layer(middleware::from_fn_with_state( state.clone(), require_bearer_auth, )); + // Management endpoints (`GET /graphs`) live alongside the per-graph + // router. They go through bearer auth but NOT through + // `resolve_graph_handle` — they operate on the registry directly. + // The endpoint is mounted in both modes; in single mode the handler + // returns 405 so clients see "resource exists, wrong context" + // rather than 404 "no such resource." + // + // Runtime add/remove (`POST /graphs`, `DELETE /graphs/{id}`) is not + // exposed in v0.6.0 — operators add graphs by editing + // `omnigraph.yaml` and restarting. + let management = Router::new() + .route("/graphs", get(server_graphs_list)) + .route_layer(middleware::from_fn_with_state( + state.clone(), + require_bearer_auth, + )); + + // Mount the protected routes differently per mode: + // * Single → flat routes (legacy: `/snapshot`, `/read`, etc.) + // * Multi → nested under `/graphs/{graph_id}/...` + let protected: Router = match state.routing() { + GraphRouting::Single { .. } => per_graph_protected.merge(management), + GraphRouting::Multi { .. } => Router::new() + .nest("/graphs/{graph_id}", per_graph_protected) + .merge(management), + }; + Router::new() .route("/healthz", get(server_health)) .route("/openapi.json", get(server_openapi)) @@ -661,9 +995,22 @@ pub async fn serve(config: ServerConfig) -> Result<()> { let token_source = resolve_token_source().await?; info!(source = token_source.name(), "loaded bearer token source"); let tokens = token_source.load().await?; + + // For runtime-state classification, "any policy configured" means + // either the top-level/single-mode policy file OR a server-level + // policy OR any per-graph policy file. Mirrors the + // `requires_bearer_auth` semantics on AppState. + let has_policy_configured = match &config.mode { + ServerConfigMode::Single { policy_file, .. } => policy_file.is_some(), + ServerConfigMode::Multi { + graphs, + server_policy_file, + .. + } => server_policy_file.is_some() || graphs.iter().any(|g| g.policy_file.is_some()), + }; let runtime_state = classify_server_runtime_state( !tokens.is_empty(), - config.policy_file.is_some(), + has_policy_configured, config.allow_unauthenticated, )?; match runtime_state { @@ -679,20 +1026,112 @@ pub async fn serve(config: ServerConfig) -> Result<()> { ), ServerRuntimeState::PolicyEnabled => {} } - let state = AppState::open_with_bearer_tokens_and_policy( - config.uri.clone(), - tokens, - config.policy_file.as_ref(), - ) - .await?; - let listener = TcpListener::bind(&config.bind).await?; - info!(uri = %config.uri, bind = %config.bind, "serving omnigraph"); + + let bind = config.bind.clone(); + let state = match config.mode { + ServerConfigMode::Single { uri, policy_file } => { + let uri_for_log = uri.clone(); + info!(uri = %uri_for_log, bind = %bind, mode = "single", "serving omnigraph"); + AppState::open_with_bearer_tokens_and_policy(uri, tokens, policy_file.as_ref()).await? + } + ServerConfigMode::Multi { + graphs, + config_path, + server_policy_file, + } => { + info!( + bind = %bind, + mode = "multi", + graph_count = graphs.len(), + config = %config_path.display(), + "serving omnigraph" + ); + open_multi_graph_state(graphs, tokens, server_policy_file.as_ref(), config_path).await? + } + }; + + let listener = TcpListener::bind(&bind).await?; axum::serve(listener, build_app(state)) .with_graceful_shutdown(shutdown_signal()) .await?; Ok(()) } +/// Parallel open of every graph in the startup config, with bounded +/// concurrency (`buffer_unordered(4)`). Fail-fast — the first open error +/// aborts startup; other in-flight opens are dropped (their `Omnigraph` +/// instances close cleanly via Arc drop). +/// +/// The bound 4 is a rule-of-thumb for I/O-bound work. At N ≤ 10 this +/// trades startup latency for a small amount of concurrent S3 / Lance +/// open pressure. +async fn open_multi_graph_state( + graphs: Vec, + tokens: Vec<(String, String)>, + server_policy_file: Option<&PathBuf>, + config_path: PathBuf, +) -> Result { + use futures::{StreamExt, TryStreamExt}; + + if graphs.is_empty() { + bail!("multi-graph mode requires at least one graph in the `graphs:` map"); + } + + // Server-level policy (loaded once, applies to management endpoints). + // The placeholder graph_id `"server"` is the sentinel the Cedar + // resource-model refactor maps to the singleton + // `Omnigraph::Server::"root"` entity at evaluation time. + let server_policy = match server_policy_file { + Some(path) => Some(PolicyEngine::load_server(path)?), + None => None, + }; + + // `try_collect` propagates the first error eagerly, dropping every + // in-flight open. `buffer_unordered + collect::>` would drain + // the stream before checking errors — incorrect for the docstring's + // "fail-fast" claim and wasteful on S3-backed graphs. + let handles: Vec> = futures::stream::iter(graphs.into_iter()) + .map(|cfg| async move { open_single_graph(cfg).await }) + .buffer_unordered(4) + .try_collect() + .await?; + + let workload = workload::WorkloadController::from_env(); + let state = AppState::new_multi(handles, tokens, server_policy, workload, Some(config_path)) + .map_err(|err| color_eyre::eyre::eyre!("multi-graph registry: {err}"))?; + Ok(state) +} + +/// Open one graph and wrap it in a `GraphHandle`. Used at startup by +/// `open_multi_graph_state`. +async fn open_single_graph(cfg: GraphStartupConfig) -> Result> { + let graph_id = GraphId::try_from(cfg.graph_id.clone()) + .map_err(|err| color_eyre::eyre::eyre!("graph id '{}': {err}", cfg.graph_id))?; + let uri = normalize_root_uri(&cfg.uri) + .wrap_err_with(|| format!("normalize URI for graph '{}'", cfg.graph_id))?; + + let db = Omnigraph::open(&uri) + .await + .map_err(|err| color_eyre::eyre::eyre!("open graph '{}' at {}: {err}", graph_id, uri))?; + + let (policy_arc, db) = match &cfg.policy_file { + Some(path) => { + let policy = PolicyEngine::load_graph(path, graph_id.as_str())?; + let policy_arc: Arc = Arc::new(policy); + let checker = Arc::clone(&policy_arc) as Arc; + (Some(policy_arc), db.with_policy(checker)) + } + None => (None, db), + }; + + Ok(Arc::new(GraphHandle { + key: GraphKey::cluster(graph_id), + uri, + engine: Arc::new(db), + policy: policy_arc, + })) +} + async fn shutdown_signal() { if let Err(err) = tokio::signal::ctrl_c().await { error!(error = %err, "failed to install ctrl-c handler"); @@ -723,14 +1162,176 @@ async fn server_health() -> Json { }) } +#[utoipa::path( + get, + path = "/graphs", + tag = "management", + operation_id = "listGraphs", + responses( + (status = 200, description = "List of registered graphs", body = GraphListResponse), + (status = 401, description = "Unauthorized", body = ErrorOutput), + (status = 403, description = "Forbidden", body = ErrorOutput), + (status = 405, description = "Method not allowed (single-graph mode)", body = ErrorOutput), + ), + security(("bearer_token" = [])), +)] +/// List every graph currently registered with this server (MR-668). +/// +/// Multi-graph mode only. In single mode, the route returns 405 — there's +/// no registry to enumerate. Cedar-gated by the server-level policy via +/// the `graph_list` action against `Omnigraph::Server::"root"`. +/// +/// Order: alphabetical by `graph_id` (server-sorted so clients see +/// deterministic output across requests). +async fn server_graphs_list( + State(state): State, + actor: Option>, +) -> std::result::Result, ApiError> { + // 405 in single mode — there's no registry to enumerate, and the + // legacy URL surface didn't expose this endpoint. + let registry = match state.routing() { + GraphRouting::Single { .. } => { + return Err(ApiError::method_not_allowed( + "GET /graphs is only available in multi-graph mode", + )); + } + GraphRouting::Multi { registry, .. } => registry, + }; + + // Server-level Cedar gate. `state.server_policy` is loaded from + // `server.policy.file` in `omnigraph.yaml` at startup. When no + // server policy is configured, `authorize_request_server` falls + // through to the MR-723 default-deny semantics (every non-Read + // action denied for an authenticated actor). `GraphList` is not + // `Read`, so without a server policy the request gets 403 — which + // is the right default (don't leak the registry until the operator + // explicitly authorizes it). + authorize_request( + actor.as_ref().map(|Extension(actor)| actor), + state.server_policy.as_deref(), + PolicyRequest { + action: PolicyAction::GraphList, + branch: None, + target_branch: None, + }, + )?; + + let mut graphs: Vec = registry + .list() + .into_iter() + .map(|handle| GraphInfo { + graph_id: handle.key.graph_id.as_str().to_string(), + uri: handle.uri.clone(), + }) + .collect(); + graphs.sort_by(|a, b| a.graph_id.cmp(&b.graph_id)); + Ok(Json(GraphListResponse { graphs })) +} + async fn server_openapi(State(state): State) -> Json { let mut doc = ApiDoc::openapi(); if !state.requires_bearer_auth() { strip_security(&mut doc); } + // MR-668: in multi mode, the protected routes live under + // `/graphs/{graph_id}/...`. Rewrite the doc so the spec matches + // the routes the router actually serves. Public paths (`/healthz`) + // stay flat in both modes. + if matches!(state.routing(), GraphRouting::Multi { .. }) { + nest_paths_under_cluster_prefix(&mut doc); + } Json(doc) } +/// Path prefix used to namespace per-graph routes in multi mode. +/// Kept in sync with the `Router::nest(...)` invocation in `build_app`. +const CLUSTER_PATH_PREFIX: &str = "/graphs/{graph_id}"; + +/// Operation-id prefix applied to every cloned cluster operation. +/// Decision 7 in the implementation plan — keeps operation IDs unique +/// across the spec when both flat and nested variants ever appear in +/// the same generation pass. +const CLUSTER_OPERATION_ID_PREFIX: &str = "cluster_"; + +/// Paths that stay flat in every server mode (public or server-level, +/// no per-graph dependency). Update this list when adding new +/// always-flat endpoints. `/graphs` is the management enumeration — +/// it lives at the root in both single mode (405) and multi mode, and +/// must never be rewritten to `/graphs/{graph_id}/graphs`. +const ALWAYS_FLAT_PATHS: &[&str] = &["/healthz", "/graphs"]; + +/// In multi-mode `server_openapi`, every protected path-item is +/// reattached under the cluster prefix. Operation IDs gain the +/// `cluster_` prefix so SDK generators don't collide if/when both +/// surfaces are merged. Every rewritten operation also declares the +/// required `{graph_id}` path parameter so the served OpenAPI document +/// remains internally valid. +/// +/// Removing the flat protected paths matches the runtime router — +/// in multi mode, requests to `/snapshot` etc. return 404, so the +/// spec must agree. +fn nest_paths_under_cluster_prefix(doc: &mut utoipa::openapi::OpenApi) { + let original = std::mem::take(&mut doc.paths.paths); + let mut rewritten = std::collections::BTreeMap::new(); + for (path, mut item) in original { + if ALWAYS_FLAT_PATHS.contains(&path.as_str()) { + rewritten.insert(path, item); + continue; + } + rename_operation_ids(&mut item, CLUSTER_OPERATION_ID_PREFIX); + add_cluster_graph_id_parameter(&mut item); + let new_path = format!("{CLUSTER_PATH_PREFIX}{path}"); + rewritten.insert(new_path, item); + } + doc.paths.paths = rewritten; +} + +fn add_cluster_graph_id_parameter(item: &mut utoipa::openapi::PathItem) { + for op in path_item_operations_mut(item) { + let parameters = op.parameters.get_or_insert_with(Vec::new); + let has_graph_id = parameters + .iter() + .any(|param| param.name == "graph_id" && param.parameter_in == ParameterIn::Path); + if !has_graph_id { + parameters.insert(0, graph_id_path_parameter()); + } + } +} + +fn graph_id_path_parameter() -> Parameter { + let mut parameter = Parameter::new("graph_id"); + parameter.parameter_in = ParameterIn::Path; + parameter.description = Some("Graph id to route the request to.".to_string()); + parameter.schema = Some(Object::with_type(Type::String).into()); + parameter +} + +/// Prefix every operation_id in this PathItem with `prefix`. +fn rename_operation_ids(item: &mut utoipa::openapi::PathItem, prefix: &str) { + for op in path_item_operations_mut(item) { + if let Some(id) = op.operation_id.as_deref() { + op.operation_id = Some(format!("{prefix}{id}")); + } + } +} + +fn path_item_operations_mut( + item: &mut utoipa::openapi::PathItem, +) -> impl Iterator { + [ + item.get.as_mut(), + item.post.as_mut(), + item.put.as_mut(), + item.delete.as_mut(), + item.options.as_mut(), + item.head.as_mut(), + item.patch.as_mut(), + item.trace.as_mut(), + ] + .into_iter() + .flatten() +} + fn strip_security(doc: &mut utoipa::openapi::OpenApi) { if let Some(components) = doc.components.as_mut() { components.security_schemes.clear(); @@ -778,11 +1379,77 @@ async fn require_bearer_auth( let Some(actor) = state.authenticate_bearer_token(provided_token) else { return Err(ApiError::unauthorized("invalid bearer token")); }; - request.extensions_mut().insert(AuthenticatedActor(actor)); + request.extensions_mut().insert(actor); Ok(next.run(request).await) } +/// Routing middleware (MR-668). Resolves the active graph for the +/// request and injects `Arc` as an extension so handlers can +/// extract it via `Extension>`. +/// +/// **Single mode**: the routing field holds the single handle directly. +/// Routes are flat; every request resolves to that handle, regardless +/// of the URI path. No registry walk, no sentinel key, no +/// programmer-error guard. +/// +/// **Multi mode**: routes are nested under `/graphs/{graph_id}/...`. The +/// middleware extracts `{graph_id}` from the URI path and looks it up in +/// the registry. Returns 404 if the graph is not registered. +/// +/// The middleware fires AFTER `require_bearer_auth`, so the actor is +/// already in the request extensions (or auth was off entirely). +async fn resolve_graph_handle( + State(state): State, + mut request: Request, + next: Next, +) -> std::result::Result { + let handle = match &state.routing { + GraphRouting::Single { handle } => Arc::clone(handle), + GraphRouting::Multi { registry, .. } => { + // `Router::nest("/graphs/{graph_id}", inner)` rewrites + // `request.uri().path()` to the inner suffix (e.g. `/snapshot`). + // The pre-rewrite URI is preserved in the `OriginalUri` + // request extension by axum's router; we read from there to + // extract `{graph_id}`. Fall back to the current URI only if + // the extension is missing, which shouldn't happen for + // nested routes but is safe defensive code. + let original_path: String = request + .extensions() + .get::() + .map(|OriginalUri(uri)| uri.path().to_string()) + .unwrap_or_else(|| request.uri().path().to_string()); + let graph_id_str = original_path + .strip_prefix("/graphs/") + .and_then(|rest| rest.split('/').next()) + .filter(|s| !s.is_empty()) + .ok_or_else(|| { + ApiError::bad_request( + "cluster route missing /graphs/{graph_id} prefix".to_string(), + ) + })?; + let graph_id = GraphId::try_from(graph_id_str.to_string()) + .map_err(|err| ApiError::bad_request(err.to_string()))?; + let key = GraphKey::cluster(graph_id.clone()); + match registry.get(&key) { + RegistryLookup::Ready(handle) => handle, + RegistryLookup::Gone => { + return Err(ApiError::not_found(format!("graph '{graph_id}' not found"))); + } + } + } + }; + + // Per-request observability. `Span::current().record` would silently + // no-op here because no upstream `#[tracing::instrument(...)]` macro + // declares a `graph_id` field; emit an explicit event instead so the + // routing decision actually lands in logs. + info!(graph_id = %handle.key.graph_id, "graph routed"); + + request.extensions_mut().insert(handle); + Ok(next.run(request).await) +} + fn log_policy_decision(actor_id: &str, request: &PolicyRequest, decision: &PolicyDecision) { info!( actor_id = actor_id, @@ -795,26 +1462,58 @@ fn log_policy_decision(actor_id: &str, request: &PolicyRequest, decision: &Polic ); } +/// HTTP-layer Cedar policy gate. Two sources of the policy engine: +/// * Per-graph handler — passes `handle.policy.as_deref()` so the +/// graph's Cedar rules govern read/change/branch_*/schema_apply. +/// * Management handler — passes `state.server_policy.as_deref()` so +/// server-level Cedar rules govern `graph_list` (the only shipped +/// server-scoped action; runtime `graph_create` / `graph_delete` +/// are deferred until a managed cluster catalog lands). +/// +/// The MR-731 invariant lives inside this function: actor identity is +/// supplied as a separate argument from the resolved bearer match. The +/// `PolicyRequest` struct itself does not carry identity (the field was +/// dropped from the type), so handlers cannot smuggle it through the +/// request. See `actor_id_resolves_from_bearer_token_ignoring_client_supplied_headers` +/// at `tests/server.rs`. fn authorize_request( - state: &AppState, - actor: Option<&AuthenticatedActor>, - mut request: PolicyRequest, + actor: Option<&ResolvedActor>, + policy: Option<&PolicyEngine>, + request: PolicyRequest, ) -> std::result::Result<(), ApiError> { - let Some(engine) = state.policy_engine() else { - // MR-723 default-deny path. We're here when no PolicyEngine is - // installed. Two startup-validated shapes can reach this: + let Some(engine) = policy else { + // No PolicyEngine installed. Three runtime states can reach this: // // * **Open mode** (`--unauthenticated`): no tokens, no policy. - // `require_bearer_auth` short-circuits before this is called, - // but defense in depth — if a future change makes the - // middleware call here for an unauthenticated request, we - // want every action to remain Ok rather than 403. The - // operator opted in. + // Per-graph operations are open by operator opt-in (they + // accepted "trust the network" for graph data). // * **DefaultDeny mode**: tokens configured but no policy. The - // request went through bearer auth, so `actor` is Some and - // identifies a known actor. Only `Read` is permitted; every - // other action returns 403. This closes the "configured auth - // but forgot the policy file" trap from MR-723. + // request went through bearer auth, so `actor` is Some. Only + // per-graph `Read` is permitted; other per-graph actions + // return 403. Closes the "configured auth but forgot the + // policy file" trap from MR-723. + // * Either of the above with a **server-scoped** action + // (`graph_list`, future `graph_create`/`graph_delete`). + // + // Server-scoped actions are always denied here, regardless of + // mode or actor presence. The management surface leaks server + // topology (graph IDs + URIs that may contain S3 bucket paths + // or internal hostnames) — operators who opted into Open mode + // accepted exposure of graph DATA, not exposure of server + // topology. Closing the management surface by default in every + // runtime state means the docstring contract on + // `server_graphs_list` ("don't leak the registry until the + // operator explicitly authorizes it") holds uniformly; the + // operator's only path to enabling it is configuring an + // explicit `server.policy.file` in omnigraph.yaml. + if request.action.resource_kind() == PolicyResourceKind::Server { + return Err(ApiError::forbidden( + "server-scoped actions require an explicit `server.policy.file` \ + configured in omnigraph.yaml — the management surface is closed \ + by default in every runtime state, including --unauthenticated, \ + so that server topology is never exposed without operator opt-in.", + )); + } if actor.is_some() && request.action != PolicyAction::Read { return Err(ApiError::forbidden( "server runs in default-deny mode (bearer tokens configured but no \ @@ -827,26 +1526,22 @@ fn authorize_request( let Some(actor) = actor else { return Err(ApiError::unauthorized("missing bearer token")); }; - // SECURITY INVARIANT (MR-731): actor identity comes from the matched - // bearer token, never from a client-supplied request header, query - // parameter, or body field. This line is the single chokepoint where - // the authoritative actor (resolved from the bearer match by - // `require_bearer_auth`) overwrites whatever the handler put in the - // PolicyRequest. Removing or weakening it lets clients spoof identity — - // exactly the Supabase RLS footgun ("trusting raw_user_meta_data is - // asking the attacker if they're an admin"). The principle is codified - // in `docs/dev/invariants.md` Hard Invariant 11 ("clients cannot set + // SECURITY INVARIANT (MR-731): actor identity is supplied to the + // policy engine here as a separate argument, sourced from the + // bearer-token match resolved by `require_bearer_auth`. The + // `PolicyRequest` struct itself no longer carries `actor_id` (it + // was dropped from the type), so handlers cannot smuggle identity + // through the request body and there is no overwrite step that + // could be skipped. The principle is codified in + // `docs/dev/invariants.md` Hard Invariant 11 ("clients cannot set // actor identity directly") and pinned by the regression test // `actor_id_resolves_from_bearer_token_ignoring_client_supplied_headers` // in `crates/omnigraph-server/tests/server.rs`. - // - // Side effect: also prevents an empty-string default at any handler - // call site from ever reaching the engine as a policy subject. - request.actor_id = actor.as_str().to_string(); + let actor_id = actor.actor_id.as_ref(); let decision = engine - .authorize(&request) + .authorize(actor_id, &request) .map_err(|err| ApiError::internal(format!("policy: {err}")))?; - log_policy_decision(actor.as_str(), &request, &decision); + log_policy_decision(actor_id, &request, &decision); if decision.allowed { Ok(()) } else { @@ -873,26 +1568,22 @@ fn authorize_request( /// count) for every table on the branch. Defaults to `main` when `branch` is /// omitted. Read-only. async fn server_snapshot( - State(state): State, - actor: Option>, + Extension(handle): Extension>, + actor: Option>, Query(query): Query, ) -> std::result::Result, ApiError> { let branch = query.branch.unwrap_or_else(|| "main".to_string()); authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor - .as_ref() - .map(|Extension(actor)| actor.as_str().to_string()) - .unwrap_or_default(), action: PolicyAction::Read, branch: Some(branch.clone()), target_branch: None, }, )?; let snapshot = { - let db = &state.engine; + let db = &handle.engine; db.snapshot_of(ReadTarget::branch(branch.as_str())) .await .map_err(ApiError::from_omni)? @@ -922,8 +1613,8 @@ async fn server_snapshot( /// match the parameters declared by the query. Returns rows as a JSON array /// plus a `columns` list. Read-only. async fn server_read( - State(state): State, - actor: Option>, + Extension(handle): Extension>, + actor: Option>, Json(request): Json, ) -> std::result::Result, ApiError> { if request.branch.is_some() && request.snapshot.is_some() { @@ -935,8 +1626,8 @@ async fn server_read( let target = read_target_from_request(request.branch, request.snapshot); let policy_branch = match &target { ReadTarget::Branch(branch) => Some(branch.clone()), - ReadTarget::Snapshot(_) if state.policy_engine().is_some() && actor.is_some() => { - let db = &state.engine; + ReadTarget::Snapshot(_) if handle.policy.is_some() && actor.is_some() => { + let db = &handle.engine; db.resolved_branch_of(target.clone()) .await .map(|branch| branch.or_else(|| Some("main".to_string()))) @@ -945,13 +1636,9 @@ async fn server_read( ReadTarget::Snapshot(_) => None, }; authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor - .as_ref() - .map(|Extension(actor)| actor.as_str().to_string()) - .unwrap_or_default(), action: PolicyAction::Read, branch: policy_branch, target_branch: None, @@ -964,7 +1651,7 @@ async fn server_read( .map_err(|err| ApiError::bad_request(err.to_string()))?; let result = { - let db = &state.engine; + let db = &handle.engine; db.query( target.clone(), &request.query_source, @@ -998,25 +1685,21 @@ async fn server_read( /// streams the entire branch. Suitable for large exports — the response is /// streamed, not buffered. Read-only. async fn server_export( - State(state): State, - actor: Option>, + Extension(handle): Extension>, + actor: Option>, Json(request): Json, ) -> std::result::Result { let branch = request.branch.unwrap_or_else(|| "main".to_string()); authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor - .as_ref() - .map(|Extension(actor)| actor.as_str().to_string()) - .unwrap_or_default(), action: PolicyAction::Export, branch: Some(branch.clone()), target_branch: None, }, )?; - let engine = Arc::clone(&state.engine); + let engine = Arc::clone(&handle.engine); let type_names = request.type_names.clone(); let table_keys = request.table_keys.clone(); let (tx, rx) = mpsc::unbounded_channel::>(); @@ -1066,20 +1749,22 @@ async fn server_export( /// mutations may still acquire locks briefly. Returns 409 on merge conflict. async fn server_change( State(state): State, - actor: Option>, + Extension(handle): Extension>, + actor: Option>, Json(request): Json, ) -> std::result::Result, ApiError> { let branch = request.branch.unwrap_or_else(|| "main".to_string()); let actor_arc = actor .as_ref() - .map(|Extension(actor)| Arc::clone(&actor.0)) + .map(|Extension(actor)| Arc::clone(&actor.actor_id)) .unwrap_or_else(|| Arc::::from("anonymous")); - let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str()); + let actor_id = actor + .as_ref() + .map(|Extension(actor)| actor.actor_id.as_ref()); authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor_id.map(str::to_string).unwrap_or_default(), action: PolicyAction::Change, branch: Some(branch.clone()), target_branch: None, @@ -1106,7 +1791,7 @@ async fn server_change( .map_err(|err| ApiError::bad_request(err.to_string()))?; let result = { - let db = &state.engine; + let db = &handle.engine; db.mutate_as( &branch, &request.query_source, @@ -1144,24 +1829,20 @@ async fn server_change( /// Useful for clients that want to introspect available types and tables /// before constructing GQ queries. Read-only. async fn server_schema_get( - State(state): State, - actor: Option>, + Extension(handle): Extension>, + actor: Option>, ) -> std::result::Result, ApiError> { authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor - .as_ref() - .map(|Extension(actor)| actor.as_str().to_string()) - .unwrap_or_default(), action: PolicyAction::Read, branch: None, target_branch: None, }, )?; let schema_source = { - let db = &state.engine; + let db = &handle.engine; db.schema_source().to_string() }; Ok(Json(SchemaOutput { schema_source })) @@ -1190,19 +1871,21 @@ async fn server_schema_get( /// false the diff was unsupported and no changes were made. async fn server_schema_apply( State(state): State, - actor: Option>, + Extension(handle): Extension>, + actor: Option>, Json(request): Json, ) -> std::result::Result, ApiError> { let actor_arc = actor .as_ref() - .map(|Extension(actor)| Arc::clone(&actor.0)) + .map(|Extension(actor)| Arc::clone(&actor.actor_id)) .unwrap_or_else(|| Arc::::from("anonymous")); - let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str()); + let actor_id = actor + .as_ref() + .map(|Extension(actor)| actor.actor_id.as_ref()); authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor_id.map(str::to_string).unwrap_or_default(), action: PolicyAction::SchemaApply, branch: None, target_branch: Some("main".to_string()), @@ -1214,7 +1897,7 @@ async fn server_schema_apply( .try_admit(&actor_arc, est_bytes) .map_err(ApiError::from_workload_reject)?; let result = { - let db = &state.engine; + let db = &handle.engine; // Engine-layer policy enforcement (MR-722): pass the resolved // actor through so apply_schema_as can call enforce() with the // authoritative identity. With a policy installed in AppState, @@ -1231,7 +1914,7 @@ async fn server_schema_apply( .await .map_err(ApiError::from_omni)? }; - Ok(Json(schema_apply_output(state.uri(), result))) + Ok(Json(schema_apply_output(handle.uri.as_str(), result))) } #[utoipa::path( @@ -1258,7 +1941,8 @@ async fn server_schema_apply( /// `overwrite` or when ingest produces conflicting writes. async fn server_ingest( State(state): State, - actor: Option>, + Extension(handle): Extension>, + actor: Option>, Json(request): Json, ) -> std::result::Result, ApiError> { let branch = request.branch.unwrap_or_else(|| "main".to_string()); @@ -1266,12 +1950,14 @@ async fn server_ingest( let mode = request.mode.unwrap_or(omnigraph::loader::LoadMode::Merge); let actor_arc = actor .as_ref() - .map(|Extension(actor)| Arc::clone(&actor.0)) + .map(|Extension(actor)| Arc::clone(&actor.actor_id)) .unwrap_or_else(|| Arc::::from("anonymous")); - let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str()); + let actor_id = actor + .as_ref() + .map(|Extension(actor)| actor.actor_id.as_ref()); let branch_exists = { - let db = &state.engine; + let db = &handle.engine; db.branch_list() .await .map_err(ApiError::from_omni)? @@ -1281,10 +1967,9 @@ async fn server_ingest( if !branch_exists { authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor_id.map(str::to_string).unwrap_or_default(), action: PolicyAction::BranchCreate, branch: Some(from.clone()), target_branch: Some(branch.clone()), @@ -1292,10 +1977,9 @@ async fn server_ingest( )?; } authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor_id.map(str::to_string).unwrap_or_default(), action: PolicyAction::Change, branch: Some(branch.clone()), target_branch: None, @@ -1308,14 +1992,14 @@ async fn server_ingest( .map_err(ApiError::from_workload_reject)?; let result = { - let db = &state.engine; + let db = &handle.engine; db.ingest_as(&branch, Some(&from), &request.data, mode, actor_id) .await .map_err(ApiError::from_omni)? }; Ok(Json(ingest_output( - state.uri(), + handle.uri.as_str(), &result, actor_id.map(str::to_string), ))) @@ -1337,24 +2021,20 @@ async fn server_ingest( /// /// Returns branch names sorted alphabetically. Read-only. async fn server_branch_list( - State(state): State, - actor: Option>, + Extension(handle): Extension>, + actor: Option>, ) -> std::result::Result, ApiError> { authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor - .as_ref() - .map(|Extension(actor)| actor.as_str().to_string()) - .unwrap_or_default(), action: PolicyAction::Read, branch: None, target_branch: None, }, )?; let mut branches = { - let db = &state.engine; + let db = &handle.engine; db.branch_list().await.map_err(ApiError::from_omni)? }; branches.sort(); @@ -1384,22 +2064,19 @@ async fn server_branch_list( /// already exists. async fn server_branch_create( State(state): State, - actor: Option>, + Extension(handle): Extension>, + actor: Option>, Json(request): Json, ) -> std::result::Result, ApiError> { let from = request.from.unwrap_or_else(|| "main".to_string()); let actor_arc = actor .as_ref() - .map(|Extension(actor)| Arc::clone(&actor.0)) + .map(|Extension(actor)| Arc::clone(&actor.actor_id)) .unwrap_or_else(|| Arc::::from("anonymous")); authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor - .as_ref() - .map(|Extension(actor)| actor.as_str().to_string()) - .unwrap_or_default(), action: PolicyAction::BranchCreate, branch: Some(from.clone()), target_branch: Some(request.name.clone()), @@ -1413,23 +2090,37 @@ async fn server_branch_create( .try_admit(&actor_arc, 256) .map_err(ApiError::from_workload_reject)?; { - let db = &state.engine; + let db = &handle.engine; db.branch_create_from_as( ReadTarget::branch(&from), &request.name, - actor.as_ref().map(|Extension(a)| a.as_str()), + actor.as_ref().map(|Extension(a)| a.actor_id.as_ref()), ) .await .map_err(ApiError::from_omni)?; } Ok(Json(BranchCreateOutput { - uri: state.uri().to_string(), + uri: handle.uri.clone(), from, name: request.name, - actor_id: actor.map(|Extension(actor)| actor.as_str().to_string()), + actor_id: actor.map(|Extension(actor)| actor.actor_id.as_ref().to_string()), })) } +/// Path-param shape for [`server_branch_delete`]. Named-field +/// deserialization (rather than `Path` or `Path<(String,)>`) +/// keeps the extractor stable across single-mode flat routes and +/// multi-mode nested routes: the `{branch}` capture is picked by +/// name and any other captures in scope (e.g. `{graph_id}` in +/// multi-mode) are ignored without breaking deserialization. +/// +/// Closes the "handler path-extractor type is positional and breaks +/// when route nesting changes" class. +#[derive(Deserialize)] +struct BranchPath { + branch: String, +} + #[utoipa::path( delete, path = "/branches/{branch}", @@ -1454,19 +2145,21 @@ async fn server_branch_create( /// exist. async fn server_branch_delete( State(state): State, - actor: Option>, - Path(branch): Path, + Extension(handle): Extension>, + actor: Option>, + Path(BranchPath { branch }): Path, ) -> std::result::Result, ApiError> { let actor_arc = actor .as_ref() - .map(|Extension(actor)| Arc::clone(&actor.0)) + .map(|Extension(actor)| Arc::clone(&actor.actor_id)) .unwrap_or_else(|| Arc::::from("anonymous")); - let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str()); + let actor_id = actor + .as_ref() + .map(|Extension(actor)| actor.actor_id.as_ref()); authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor_id.map(str::to_string).unwrap_or_default(), action: PolicyAction::BranchDelete, branch: None, target_branch: Some(branch.clone()), @@ -1478,13 +2171,13 @@ async fn server_branch_delete( .try_admit(&actor_arc, 256) .map_err(ApiError::from_workload_reject)?; { - let db = &state.engine; + let db = &handle.engine; db.branch_delete_as(&branch, actor_id) .await .map_err(ApiError::from_omni)?; } Ok(Json(BranchDeleteOutput { - uri: state.uri().to_string(), + uri: handle.uri.clone(), name: branch, actor_id: actor_id.map(str::to_string), })) @@ -1514,20 +2207,22 @@ async fn server_branch_delete( /// unchanged in that case. **Destructive** to `target` on success. async fn server_branch_merge( State(state): State, - actor: Option>, + Extension(handle): Extension>, + actor: Option>, Json(request): Json, ) -> std::result::Result, ApiError> { let target = request.target.unwrap_or_else(|| "main".to_string()); let actor_arc = actor .as_ref() - .map(|Extension(actor)| Arc::clone(&actor.0)) + .map(|Extension(actor)| Arc::clone(&actor.actor_id)) .unwrap_or_else(|| Arc::::from("anonymous")); - let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str()); + let actor_id = actor + .as_ref() + .map(|Extension(actor)| actor.actor_id.as_ref()); authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor_id.map(str::to_string).unwrap_or_default(), action: PolicyAction::BranchMerge, branch: Some(request.source.clone()), target_branch: Some(target.clone()), @@ -1541,7 +2236,7 @@ async fn server_branch_merge( .try_admit(&actor_arc, 256) .map_err(ApiError::from_workload_reject)?; let outcome = { - let db = &state.engine; + let db = &handle.engine; db.branch_merge_as(&request.source, &target, actor_id) .await .map_err(ApiError::from_omni)? @@ -1572,25 +2267,21 @@ async fn server_branch_merge( /// Filter by `branch` to get the commits on a single branch (most recent /// first); omit to list across all branches. Read-only. async fn server_commit_list( - State(state): State, - actor: Option>, + Extension(handle): Extension>, + actor: Option>, Query(query): Query, ) -> std::result::Result, ApiError> { authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor - .as_ref() - .map(|Extension(actor)| actor.as_str().to_string()) - .unwrap_or_default(), action: PolicyAction::Read, branch: query.branch.clone(), target_branch: None, }, )?; let commits = { - let db = &state.engine; + let db = &handle.engine; db.list_commits(query.branch.as_deref()) .await .map_err(ApiError::from_omni)? @@ -1600,6 +2291,13 @@ async fn server_commit_list( })) } +/// Path-param shape for [`server_commit_show`]. See [`BranchPath`] +/// for the design rationale — same pattern, different field name. +#[derive(Deserialize)] +struct CommitPath { + commit_id: String, +} + #[utoipa::path( get, path = "/commits/{commit_id}", @@ -1616,30 +2314,27 @@ async fn server_commit_list( ), security(("bearer_token" = [])), )] + /// Get a single commit. /// /// Returns the commit's manifest version, parent commit(s), and creation /// metadata. Read-only. async fn server_commit_show( - State(state): State, - actor: Option>, - Path(commit_id): Path, + Extension(handle): Extension>, + actor: Option>, + Path(CommitPath { commit_id }): Path, ) -> std::result::Result, ApiError> { authorize_request( - &state, actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), PolicyRequest { - actor_id: actor - .as_ref() - .map(|Extension(actor)| actor.as_str().to_string()) - .unwrap_or_default(), action: PolicyAction::Read, branch: None, target_branch: None, }, )?; let commit = { - let db = &state.engine; + let db = &handle.engine; db.get_commit(&commit_id) .await .map_err(ApiError::from_omni)? @@ -1757,9 +2452,9 @@ fn server_bearer_tokens_from_env() -> Result> { #[cfg(test)] mod tests { use super::{ - ServerConfig, ServerRuntimeState, classify_server_runtime_state, hash_bearer_token, - load_server_settings, normalize_bearer_token, parse_bearer_tokens_json, serve, - server_bearer_tokens_from_env, + GraphStartupConfig, ServerConfig, ServerConfigMode, ServerRuntimeState, + classify_server_runtime_state, hash_bearer_token, load_server_settings, + normalize_bearer_token, parse_bearer_tokens_json, serve, server_bearer_tokens_from_env, }; use serial_test::serial; use std::env; @@ -1814,7 +2509,10 @@ server: .unwrap(); let settings = load_server_settings(Some(&config), None, None, None, false).unwrap(); - assert_eq!(settings.uri, "/tmp/demo.omni"); + match &settings.mode { + ServerConfigMode::Single { uri, .. } => assert_eq!(uri, "/tmp/demo.omni"), + ServerConfigMode::Multi { .. } => panic!("expected Single mode, got Multi"), + } assert_eq!(settings.bind, "0.0.0.0:9090"); } @@ -1843,7 +2541,10 @@ server: false, ) .unwrap(); - assert_eq!(settings.uri, "/tmp/override.omni"); + match &settings.mode { + ServerConfigMode::Single { uri, .. } => assert_eq!(uri, "/tmp/override.omni"), + ServerConfigMode::Multi { .. } => panic!("expected Single mode, got Multi"), + } assert_eq!(settings.bind, "0.0.0.0:9999"); } @@ -1869,13 +2570,19 @@ server: let settings = load_server_settings(Some(&config), None, Some("dev".to_string()), None, false) .unwrap(); - assert_eq!(settings.uri, "http://127.0.0.1:8080"); + match &settings.mode { + ServerConfigMode::Single { uri, .. } => assert_eq!(uri, "http://127.0.0.1:8080"), + ServerConfigMode::Multi { .. } => panic!("expected Single mode, got Multi"), + } } #[test] fn server_settings_require_uri_from_cli_or_config() { let error = load_server_settings(None, None, None, None, false).unwrap_err(); - assert!(error.to_string().contains("URI must be provided")); + assert!( + error.to_string().contains("no graph to serve"), + "expected mode-inference error, got: {error}", + ); } #[test] @@ -1910,6 +2617,57 @@ server: ); } + #[tokio::test] + #[serial] + async fn serve_refuses_to_start_with_policy_but_no_tokens_multi_mode() { + // Bug 2 from the bot-review pass: multi-mode startup was missing + // the "policy requires tokens" check that single-mode enforces. + // After centralizing the check in `classify_server_runtime_state`, + // both modes get the same enforcement. This test guards the + // multi-mode propagation path. + // + // Sibling test below pins single mode. Together they pin that + // the classifier is called from both branches of `serve()`. + let _guard = EnvGuard::set(&[ + ("OMNIGRAPH_SERVER_BEARER_TOKEN", None), + ("OMNIGRAPH_SERVER_BEARER_TOKENS_FILE", None), + ("OMNIGRAPH_SERVER_BEARER_TOKENS_JSON", None), + ("OMNIGRAPH_SERVER_BEARER_TOKENS_AWS_SECRET", None), + ("OMNIGRAPH_UNAUTHENTICATED", None), + ]); + let temp = tempdir().unwrap(); + // The classifier reads `has_policy_configured` from the config + // shape (does the Option contain a path?), not from file + // existence, so we can hand it a path without writing a real + // policy file — the bail fires before policy load. + let policy_path = temp.path().join("server-policy.yaml"); + let config = ServerConfig { + mode: ServerConfigMode::Multi { + graphs: vec![GraphStartupConfig { + graph_id: "alpha".to_string(), + uri: temp + .path() + .join("alpha.omni") + .to_string_lossy() + .into_owned(), + policy_file: None, + }], + config_path: temp.path().join("omnigraph.yaml"), + server_policy_file: Some(policy_path), + }, + bind: "127.0.0.1:0".to_string(), + allow_unauthenticated: false, + }; + let result = serve(config).await; + let err = result + .expect_err("serve should refuse to start in multi mode with policy but no tokens"); + let msg = format!("{:?}", err); + assert!( + msg.contains("policy file is configured but no bearer tokens"), + "expected policy-without-tokens rejection in multi mode, got: {msg}", + ); + } + #[tokio::test] #[serial] async fn serve_refuses_to_start_in_state_1_without_unauthenticated() { @@ -1934,13 +2692,15 @@ server: // Graph path doesn't need to exist — classifier fires before // `AppState::open_with_bearer_tokens_and_policy`. let config = ServerConfig { - uri: temp - .path() - .join("graph.omni") - .to_string_lossy() - .into_owned(), + mode: ServerConfigMode::Single { + uri: temp + .path() + .join("graph.omni") + .to_string_lossy() + .into_owned(), + policy_file: None, + }, bind: "127.0.0.1:0".to_string(), - policy_file: None, allow_unauthenticated: false, }; let result = serve(config).await; @@ -2023,25 +2783,43 @@ server: } #[test] - fn classify_policy_enabled_always_wins() { - // State 3: any setup with a policy file → PolicyEnabled. The - // flag doesn't matter and tokens-or-not doesn't matter (no - // tokens + policy is unusual but valid — every request fails - // 401 without a bearer, which is effectively "locked"). + fn classify_policy_enabled_requires_tokens() { + // State 3: tokens + policy → PolicyEnabled, regardless of the + // `allow_unauthenticated` flag (Cedar evaluates the bearer, + // the flag is moot once tokens exist). assert_eq!( classify_server_runtime_state(true, true, false).unwrap(), ServerRuntimeState::PolicyEnabled ); - assert_eq!( - classify_server_runtime_state(false, true, false).unwrap(), - ServerRuntimeState::PolicyEnabled - ); assert_eq!( classify_server_runtime_state(true, true, true).unwrap(), ServerRuntimeState::PolicyEnabled ); } + #[test] + fn classify_policy_without_tokens_is_rejected() { + // Closes the "policy installed but no tokens → silent 401 on + // every request" footgun. The same shape that single-mode + // `open_with_bearer_tokens_and_policy` used to bail on + // privately is now rejected by the classifier so both single + // and multi mode get the same enforcement from one source of + // truth. + for allow_unauthenticated in [false, true] { + let err = + classify_server_runtime_state(false, true, allow_unauthenticated).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("policy file is configured but no bearer tokens"), + "expected policy-without-tokens rejection message; got: {msg}" + ); + assert!( + msg.contains("every request would 401"), + "rejection message must name the failure mode; got: {msg}" + ); + } + } + #[test] fn normalize_bearer_token_trims_and_filters_blank_values() { assert_eq!(normalize_bearer_token(None), None); diff --git a/crates/omnigraph-server/src/registry.rs b/crates/omnigraph-server/src/registry.rs new file mode 100644 index 0000000..5897ad1 --- /dev/null +++ b/crates/omnigraph-server/src/registry.rs @@ -0,0 +1,558 @@ +//! `GraphRegistry` — the multi-graph routing substrate (MR-668). +//! +//! Holds the open `Arc` for every graph the server is currently +//! serving. Lock-free reads via `ArcSwap`; mutations +//! serialize through `mutate: Mutex<()>` for read-modify-write atomicity. +//! +//! **Deletion is deferred** in v0.6.0 (MR-668 scope cut). The registry has +//! no `tombstones` field, no `RegistryLookup::Tombstoned` variant, no +//! `tombstone()` / `clear_tombstone()` methods. When `DELETE /graphs/{id}` +//! lands in a follow-up release, those return without breaking caller +//! signatures (`Gone` is the closest semantic — the graph is no longer +//! in the registry). +//! +//! Engine instance survival across registry mutations: +//! a request that grabbed `Arc` before a registry swap keeps +//! the engine alive via its own `Arc` clone (see `server_export` at +//! `lib.rs:1019-1033` for the spawn-and-clone pattern). The engine drops +//! when the last `Arc` clone drops, regardless of the +//! registry's current state. + +use std::collections::HashMap; +use std::sync::Arc; + +use arc_swap::ArcSwap; +use omnigraph::db::Omnigraph; +use omnigraph::storage::normalize_root_uri; +#[cfg(test)] +use tokio::sync::Mutex; + +use crate::identity::GraphKey; +use crate::policy::PolicyEngine; + +/// Open handle for a single graph in the registry. Cheap to clone (`Arc`-wrapped +/// engine + policy). Cluster-mode handlers extract this via +/// `Extension>` injected by the routing middleware. +pub struct GraphHandle { + /// Registry key. In Cluster mode `key.tenant_id` is always `None`. + pub key: GraphKey, + /// The URI the engine was opened from (`s3://...` or local path). + /// Stable for the engine's lifetime; surfaced in responses like + /// `BranchCreateOutput.uri`. + pub uri: String, + /// Engine. Reads/writes go directly through `&self` methods on + /// `Omnigraph` (no `RwLock` — MR-686 preserved). + pub engine: Arc, + /// Per-graph Cedar policy. `None` means "no policy gate on engine-layer + /// `_as` writers"; the HTTP-layer `require_bearer_auth` middleware still + /// runs regardless. + pub policy: Option>, +} + +/// Immutable snapshot of the registry's current state. Replaced atomically +/// via `ArcSwap`; readers see a consistent view of all graphs without locking. +/// +/// Derived state (`any_per_graph_policy`) is computed at snapshot +/// construction so request-time middleware doesn't have to walk the +/// graph map every call. Construct only via [`RegistrySnapshot::new`] +/// (or `Default`) so the field stays in sync with `graphs`. +pub struct RegistrySnapshot { + pub graphs: HashMap>, + /// `true` iff any registered graph has a per-graph policy installed. + /// Used by `AppState::requires_bearer_auth` to decide whether the + /// auth middleware should challenge a request — a per-graph policy + /// implies bearer auth is required even when no server-level tokens + /// or policy are configured. + pub any_per_graph_policy: bool, +} + +impl RegistrySnapshot { + /// Build a snapshot from a graph map, deriving cached fields. + /// The only construction path — direct struct-literal use elsewhere + /// would let derived state drift from `graphs`. + pub fn new(graphs: HashMap>) -> Self { + let any_per_graph_policy = graphs.values().any(|h| h.policy.is_some()); + Self { + graphs, + any_per_graph_policy, + } + } +} + +impl Default for RegistrySnapshot { + fn default() -> Self { + Self::new(HashMap::new()) + } +} + +/// Result of a registry lookup. Two-valued — `Tombstoned` deferred with DELETE. +pub enum RegistryLookup { + /// Graph is open and ready to serve. + Ready(Arc), + /// Graph is not in the registry (never existed, or was unregistered in a + /// future release). Handlers respond with 404. + Gone, +} + +/// Why an `insert` was rejected. +#[derive(Debug, thiserror::Error)] +pub enum InsertError { + /// Another handle already exists for this `GraphKey`. Maps to HTTP 409. + #[error("graph '{0}' is already registered")] + DuplicateKey(GraphKey), + /// Another handle is open against this URI. Two graphs sharing a URI + /// would commit through the same Lance manifest and corrupt each other. + /// Maps to HTTP 409. + #[error("URI '{0}' is already registered as another graph")] + DuplicateUri(String), + /// A handle carried an invalid graph URI. Maps to startup failure. + #[error("URI '{uri}' is invalid: {message}")] + InvalidUri { uri: String, message: String }, +} + +pub struct GraphRegistry { + snapshot: ArcSwap, + /// Serializes runtime mutations through [`GraphRegistry::insert`]. + /// Gated with `insert` because they share a single contract — if + /// the consumer goes away, so does the lock. Re-introducing one + /// requires re-introducing the other. + #[cfg(test)] + mutate: Mutex<()>, +} + +impl GraphRegistry { + /// Empty registry. Used as a placeholder before startup populates it. + pub fn new() -> Self { + Self { + snapshot: ArcSwap::from_pointee(RegistrySnapshot::default()), + #[cfg(test)] + mutate: Mutex::new(()), + } + } + + /// Build a registry from a startup-time list of open handles. + /// Rejects duplicate `GraphKey`s and duplicate URIs. + pub fn from_handles(handles: Vec>) -> Result { + let mut graphs: HashMap> = HashMap::with_capacity(handles.len()); + let mut seen_uris: HashMap = HashMap::with_capacity(handles.len()); + for handle in handles { + let (canonical_uri, handle) = canonicalize_handle_uri(handle)?; + if graphs.contains_key(&handle.key) { + return Err(InsertError::DuplicateKey(handle.key.clone())); + } + if seen_uris.contains_key(&canonical_uri) { + return Err(InsertError::DuplicateUri(handle.uri.clone())); + } + seen_uris.insert(canonical_uri, handle.key.clone()); + graphs.insert(handle.key.clone(), handle); + } + Ok(Self { + snapshot: ArcSwap::from_pointee(RegistrySnapshot::new(graphs)), + #[cfg(test)] + mutate: Mutex::new(()), + }) + } + + /// Lock-free snapshot read. Callers that need derived state cached + /// on the snapshot (e.g. `any_per_graph_policy`) go through here; + /// callers that only need values of `graphs` should use [`list`] + /// or [`get`]. + pub fn snapshot_ref(&self) -> arc_swap::Guard> { + self.snapshot.load() + } + + /// Lock-free read. Returns `Ready` if the graph is in the current snapshot, + /// `Gone` otherwise. + pub fn get(&self, key: &GraphKey) -> RegistryLookup { + let snapshot = self.snapshot.load(); + match snapshot.graphs.get(key) { + Some(handle) => RegistryLookup::Ready(Arc::clone(handle)), + None => RegistryLookup::Gone, + } + } + + /// Snapshot the full set of currently-registered handles. Ordering + /// matches the underlying `HashMap` iteration (intentionally + /// non-deterministic — callers that need a stable order sort by + /// `handle.key.graph_id`). + pub fn list(&self) -> Vec> { + let snapshot = self.snapshot.load(); + snapshot.graphs.values().cloned().collect() + } + + /// Number of registered graphs (excluding any future tombstones). + pub fn len(&self) -> usize { + self.snapshot.load().graphs.len() + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Add a new handle. Async because the mutex is `tokio::sync::Mutex` + /// (a future managed-catalog flow may hold it across `.await` points + /// during atomic registry mutations). Rejects duplicate `GraphKey` + /// and duplicate `uri`. + /// + /// **Test-only surface.** No production code reaches this — startup + /// uses `from_handles`, and runtime add/remove is deferred. The + /// race-contract tests below pin the mutex linearization point so + /// that when a real consumer ships (managed cluster catalog), the + /// concurrency contract is already proven. Ungate by removing + /// `#[cfg(test)]` once that consumer is in scope. + /// + /// Race semantics (pinned by `concurrent_insert_same_key_exactly_one_succeeds`): + /// under N concurrent calls with the same key, exactly one returns + /// `Ok(())` and the rest return `Err(InsertError::DuplicateKey(_))`. + #[cfg(test)] + pub async fn insert(&self, handle: Arc) -> Result<(), InsertError> { + let _guard = self.mutate.lock().await; + let current = self.snapshot.load(); + let (canonical_uri, handle) = canonicalize_handle_uri(handle)?; + if current.graphs.contains_key(&handle.key) { + return Err(InsertError::DuplicateKey(handle.key.clone())); + } + for existing in current.graphs.values() { + let existing_uri = + normalize_root_uri(&existing.uri).map_err(|err| InsertError::InvalidUri { + uri: existing.uri.clone(), + message: err.to_string(), + })?; + if existing_uri == canonical_uri { + return Err(InsertError::DuplicateUri(handle.uri.clone())); + } + } + let mut new_graphs = current.graphs.clone(); + new_graphs.insert(handle.key.clone(), handle); + self.snapshot + .store(Arc::new(RegistrySnapshot::new(new_graphs))); + Ok(()) + } +} + +fn canonicalize_handle_uri( + handle: Arc, +) -> Result<(String, Arc), InsertError> { + let canonical_uri = normalize_root_uri(&handle.uri).map_err(|err| InsertError::InvalidUri { + uri: handle.uri.clone(), + message: err.to_string(), + })?; + if canonical_uri == handle.uri { + return Ok((canonical_uri, handle)); + } + let canonical_handle = Arc::new(GraphHandle { + key: handle.key.clone(), + uri: canonical_uri.clone(), + engine: Arc::clone(&handle.engine), + policy: handle.policy.clone(), + }); + Ok((canonical_uri, canonical_handle)) +} + +impl Default for GraphRegistry { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use std::path::Path; + + use tempfile::TempDir; + + use super::*; + use crate::graph_id::GraphId; + + const TEST_SCHEMA: &str = "node Person { name: String @key }\n"; + + async fn build_handle(graph_id: &str, dir: &Path) -> Arc { + let graph_uri = dir.join(graph_id).to_str().unwrap().to_string(); + let engine = Omnigraph::init(&graph_uri, TEST_SCHEMA) + .await + .expect("init engine for registry test"); + Arc::new(GraphHandle { + key: GraphKey::cluster(GraphId::try_from(graph_id).unwrap()), + uri: graph_uri, + engine: Arc::new(engine), + policy: None, + }) + } + + #[tokio::test] + async fn new_registry_is_empty() { + let registry = GraphRegistry::new(); + assert!(registry.is_empty()); + assert_eq!(registry.len(), 0); + assert!(registry.list().is_empty()); + } + + #[tokio::test] + async fn insert_then_get_returns_ready() { + let dir = TempDir::new().unwrap(); + let registry = GraphRegistry::new(); + let handle = build_handle("alpha", dir.path()).await; + registry.insert(Arc::clone(&handle)).await.unwrap(); + + match registry.get(&handle.key) { + RegistryLookup::Ready(found) => { + assert!(Arc::ptr_eq(&found, &handle)); + } + RegistryLookup::Gone => panic!("expected Ready, got Gone"), + } + } + + #[tokio::test] + async fn get_nonexistent_returns_gone() { + let registry = GraphRegistry::new(); + let key = GraphKey::cluster(GraphId::try_from("ghost").unwrap()); + match registry.get(&key) { + RegistryLookup::Gone => {} + RegistryLookup::Ready(_) => panic!("expected Gone"), + } + } + + #[tokio::test] + async fn insert_duplicate_key_returns_error() { + let dir = TempDir::new().unwrap(); + let registry = GraphRegistry::new(); + let h1 = build_handle("alpha", dir.path()).await; + // Same key, different URI sub-path (build_handle uses graph_id as subdir). + let dir2 = TempDir::new().unwrap(); + let h2 = build_handle("alpha", dir2.path()).await; + registry.insert(h1).await.unwrap(); + + match registry.insert(h2).await { + Err(InsertError::DuplicateKey(_)) => {} + other => panic!("expected DuplicateKey, got {other:?}"), + } + } + + #[tokio::test] + async fn insert_duplicate_uri_returns_error() { + let dir = TempDir::new().unwrap(); + // Two handles with the same URI but different keys. + let shared_uri = dir.path().join("shared").to_str().unwrap().to_string(); + let engine = Omnigraph::init(&shared_uri, TEST_SCHEMA).await.unwrap(); + let engine = Arc::new(engine); + let h1 = Arc::new(GraphHandle { + key: GraphKey::cluster(GraphId::try_from("alpha").unwrap()), + uri: shared_uri.clone(), + engine: Arc::clone(&engine), + policy: None, + }); + let h2 = Arc::new(GraphHandle { + key: GraphKey::cluster(GraphId::try_from("beta").unwrap()), + uri: shared_uri, + engine, + policy: None, + }); + + let registry = GraphRegistry::new(); + registry.insert(h1).await.unwrap(); + match registry.insert(h2).await { + Err(InsertError::DuplicateUri(_)) => {} + other => panic!("expected DuplicateUri, got {other:?}"), + } + } + + #[tokio::test] + async fn list_returns_all_inserted_handles() { + let dir = TempDir::new().unwrap(); + let registry = GraphRegistry::new(); + for name in ["alpha", "beta", "gamma"] { + let h = build_handle(name, dir.path()).await; + registry.insert(h).await.unwrap(); + } + assert_eq!(registry.len(), 3); + let mut ids: Vec<_> = registry + .list() + .into_iter() + .map(|h| h.key.graph_id.as_str().to_string()) + .collect(); + ids.sort(); + assert_eq!(ids, vec!["alpha", "beta", "gamma"]); + } + + #[tokio::test] + async fn from_handles_bulk_init_succeeds() { + let dir = TempDir::new().unwrap(); + let handles = vec![ + build_handle("alpha", dir.path()).await, + build_handle("beta", dir.path()).await, + ]; + let registry = GraphRegistry::from_handles(handles).unwrap(); + assert_eq!(registry.len(), 2); + } + + #[tokio::test] + async fn from_handles_rejects_duplicate_keys() { + let dir1 = TempDir::new().unwrap(); + let dir2 = TempDir::new().unwrap(); + let h1 = build_handle("alpha", dir1.path()).await; + let h2 = build_handle("alpha", dir2.path()).await; + let err = match GraphRegistry::from_handles(vec![h1, h2]) { + Ok(_) => panic!("expected DuplicateKey, got Ok"), + Err(err) => err, + }; + assert!( + matches!(err, InsertError::DuplicateKey(_)), + "expected DuplicateKey, got {err}", + ); + } + + #[tokio::test] + async fn from_handles_rejects_duplicate_uris() { + let dir = TempDir::new().unwrap(); + let shared_uri = dir.path().join("shared").to_str().unwrap().to_string(); + let engine = Arc::new(Omnigraph::init(&shared_uri, TEST_SCHEMA).await.unwrap()); + let h1 = Arc::new(GraphHandle { + key: GraphKey::cluster(GraphId::try_from("alpha").unwrap()), + uri: shared_uri.clone(), + engine: Arc::clone(&engine), + policy: None, + }); + let h2 = Arc::new(GraphHandle { + key: GraphKey::cluster(GraphId::try_from("beta").unwrap()), + uri: shared_uri, + engine, + policy: None, + }); + let err = match GraphRegistry::from_handles(vec![h1, h2]) { + Ok(_) => panic!("expected DuplicateUri, got Ok"), + Err(err) => err, + }; + assert!( + matches!(err, InsertError::DuplicateUri(_)), + "expected DuplicateUri, got {err}", + ); + } + + /// Race test modeled on `actor_admission_race_does_not_exceed_cap` + /// at `tests/server.rs:3596+`. Spawn N concurrent inserts with the + /// same `GraphKey` (each constructing its own `GraphHandle` against + /// its own tempdir). Exactly one must succeed; the others must + /// return `DuplicateKey`. No `unwrap` panic: the `Mutex<()>` + + /// in-mutex re-check is the linearization point. + #[tokio::test(flavor = "multi_thread")] + async fn concurrent_insert_same_key_exactly_one_succeeds() { + const N: usize = 8; + + let registry = Arc::new(GraphRegistry::new()); + // Pre-create N handles (each in its own tempdir; same key). + let mut handles = Vec::with_capacity(N); + let mut dirs = Vec::with_capacity(N); + for _ in 0..N { + let d = TempDir::new().unwrap(); + handles.push(build_handle("contested", d.path()).await); + dirs.push(d); + } + + let barrier = Arc::new(tokio::sync::Barrier::new(N)); + let mut tasks = Vec::with_capacity(N); + for handle in handles { + let registry = Arc::clone(®istry); + let barrier = Arc::clone(&barrier); + tasks.push(tokio::spawn(async move { + barrier.wait().await; + registry.insert(handle).await + })); + } + + let mut ok_count = 0usize; + let mut dup_count = 0usize; + for t in tasks { + match t.await.unwrap() { + Ok(()) => ok_count += 1, + Err(InsertError::DuplicateKey(_)) => dup_count += 1, + Err(other) => panic!("unexpected error: {other:?}"), + } + } + assert_eq!(ok_count, 1, "exactly one insert must succeed"); + assert_eq!(dup_count, N - 1, "the rest must return DuplicateKey"); + assert_eq!(registry.len(), 1); + + // Drop the dirs at the end (preserves engines until tasks finish). + drop(dirs); + } + + /// Concurrent inserts with **distinct** keys all succeed. + /// Linearizability over the mutex still serializes them. + #[tokio::test(flavor = "multi_thread")] + async fn concurrent_insert_distinct_keys_all_succeed() { + const N: usize = 8; + + let registry = Arc::new(GraphRegistry::new()); + // Pre-create N handles with distinct ids, each in its own tempdir. + let mut handles = Vec::with_capacity(N); + let mut dirs = Vec::with_capacity(N); + for i in 0..N { + let d = TempDir::new().unwrap(); + handles.push(build_handle(&format!("graph-{i}"), d.path()).await); + dirs.push(d); + } + + let barrier = Arc::new(tokio::sync::Barrier::new(N)); + let mut tasks = Vec::with_capacity(N); + for handle in handles { + let registry = Arc::clone(®istry); + let barrier = Arc::clone(&barrier); + tasks.push(tokio::spawn(async move { + barrier.wait().await; + registry.insert(handle).await + })); + } + for t in tasks { + t.await.unwrap().unwrap(); + } + assert_eq!(registry.len(), N); + drop(dirs); + } + + /// Concurrent reads during a write must always see a consistent + /// snapshot (no torn state). With `ArcSwap`, the read either sees + /// the old snapshot or the new one — never both, never neither. + #[tokio::test(flavor = "multi_thread")] + async fn concurrent_reads_during_inserts_see_consistent_snapshots() { + let dir = TempDir::new().unwrap(); + let registry = Arc::new(GraphRegistry::new()); + + // Spawn a writer that inserts graph-0..graph-9 sequentially. + const N_WRITES: usize = 10; + let writer_registry = Arc::clone(®istry); + let writer_dir = dir.path().to_path_buf(); + let writer = tokio::spawn(async move { + for i in 0..N_WRITES { + let h = build_handle(&format!("graph-{i}"), &writer_dir).await; + writer_registry.insert(h).await.unwrap(); + } + }); + + // Reader loop: repeatedly snapshot the registry until the writer + // finishes. Every snapshot's len must be in [0, N_WRITES], and + // for every key g in the snapshot, get(g) must return Ready. + let reader_registry = Arc::clone(®istry); + let reader = tokio::spawn(async move { + for _ in 0..200 { + let snap = reader_registry.list(); + assert!(snap.len() <= N_WRITES); + for handle in &snap { + match reader_registry.get(&handle.key) { + RegistryLookup::Ready(found) => { + assert!(Arc::ptr_eq(&found, handle)); + } + RegistryLookup::Gone => panic!( + "snapshot listed key {} but get() returned Gone", + handle.key.graph_id + ), + } + } + tokio::task::yield_now().await; + } + }); + + writer.await.unwrap(); + reader.await.unwrap(); + assert_eq!(registry.len(), N_WRITES); + } +} diff --git a/crates/omnigraph-server/src/workload.rs b/crates/omnigraph-server/src/workload.rs index efc7068..4e84532 100644 --- a/crates/omnigraph-server/src/workload.rs +++ b/crates/omnigraph-server/src/workload.rs @@ -270,12 +270,13 @@ mod tests { let err = controller .try_admit(&actor, 100) .expect_err("third should reject on count"); - assert!(matches!(err, RejectReason::InFlightCountExceeded { cap: 2 })); + assert!(matches!( + err, + RejectReason::InFlightCountExceeded { cap: 2 } + )); drop(g1); // After drop, a new admit succeeds again. - let _g3 = controller - .try_admit(&actor, 100) - .expect("admit after drop"); + let _g3 = controller.try_admit(&actor, 100).expect("admit after drop"); } #[tokio::test(flavor = "multi_thread", worker_threads = 4)] @@ -356,7 +357,9 @@ mod tests { let bob: Arc = "bob".into(); let _ga = controller.try_admit(&alice, 100).expect("alice ok"); // Alice over count cap, Bob unaffected. - let err = controller.try_admit(&alice, 100).expect_err("alice rejected"); + let err = controller + .try_admit(&alice, 100) + .expect_err("alice rejected"); assert!(matches!(err, RejectReason::InFlightCountExceeded { .. })); let _gb = controller.try_admit(&bob, 100).expect("bob ok"); } diff --git a/crates/omnigraph-server/tests/openapi.rs b/crates/omnigraph-server/tests/openapi.rs index e099ded..6b28f6c 100644 --- a/crates/omnigraph-server/tests/openapi.rs +++ b/crates/omnigraph-server/tests/openapi.rs @@ -161,6 +161,7 @@ fn openapi_info_contains_version() { const EXPECTED_PATHS: &[&str] = &[ "/healthz", + "/graphs", "/snapshot", "/read", "/export", @@ -957,3 +958,289 @@ fn openapi_spec_is_up_to_date() { "openapi.json is out of date. Run: OMNIGRAPH_UPDATE_OPENAPI=1 cargo test -p omnigraph-server --test openapi openapi_spec_is_up_to_date" ); } + +// --------------------------------------------------------------------------- +// MR-668 — multi-mode OpenAPI cluster filter +// --------------------------------------------------------------------------- +// +// In multi-graph mode, `/openapi.json` reports cluster routes +// (`/graphs/{graph_id}/...`) instead of the legacy flat routes. The +// only flat path that survives is `/healthz`. Operation IDs gain a +// `cluster_` prefix so SDK generators have stable, unique ids. +// +// These tests exercise the request-time `server_openapi` handler via +// `oneshot`, not the static `ApiDoc::openapi()` — the rewrite happens +// only on the served document. + +const EXPECTED_CLUSTER_PATHS: &[&str] = &[ + "/graphs/{graph_id}/snapshot", + "/graphs/{graph_id}/read", + "/graphs/{graph_id}/export", + "/graphs/{graph_id}/change", + "/graphs/{graph_id}/schema", + "/graphs/{graph_id}/schema/apply", + "/graphs/{graph_id}/ingest", + "/graphs/{graph_id}/branches", + "/graphs/{graph_id}/branches/{branch}", + "/graphs/{graph_id}/branches/merge", + "/graphs/{graph_id}/commits", + "/graphs/{graph_id}/commits/{commit_id}", +]; + +async fn app_for_multi_mode(graph_ids: &[&str]) -> (Vec, Router) { + use std::sync::Arc; + + use omnigraph_server::{GraphHandle, GraphId, GraphKey}; + + let mut dirs = Vec::with_capacity(graph_ids.len()); + let mut handles = Vec::with_capacity(graph_ids.len()); + for id in graph_ids { + let dir = tempfile::tempdir().unwrap(); + let graph_uri = dir.path().join(id).to_str().unwrap().to_string(); + let schema = fs::read_to_string(fixture("test.pg")).unwrap(); + let engine = Omnigraph::init(&graph_uri, &schema).await.unwrap(); + handles.push(Arc::new(GraphHandle { + key: GraphKey::cluster(GraphId::try_from(*id).unwrap()), + uri: graph_uri, + engine: Arc::new(engine), + policy: None, + })); + dirs.push(dir); + } + let workload = omnigraph_server::workload::WorkloadController::from_env(); + let state = AppState::new_multi(handles, Vec::new(), None, workload, None).unwrap(); + let app = build_app(state); + (dirs, app) +} + +#[tokio::test] +async fn multi_mode_openapi_lists_cluster_paths() { + let (_dirs, app) = app_for_multi_mode(&["alpha"]).await; + let request = Request::builder() + .method(Method::GET) + .uri("/openapi.json") + .body(Body::empty()) + .unwrap(); + let (status, json) = json_response(&app, request).await; + assert_eq!(status, StatusCode::OK); + let paths = json["paths"].as_object().expect("paths must be an object"); + let path_keys: HashSet<&str> = paths.keys().map(|k| k.as_str()).collect(); + for expected in EXPECTED_CLUSTER_PATHS { + assert!( + path_keys.contains(expected), + "missing cluster path in multi-mode spec: {expected}. \ + Found: {path_keys:?}" + ); + } +} + +#[tokio::test] +async fn multi_mode_openapi_drops_flat_protected_paths() { + let (_dirs, app) = app_for_multi_mode(&["alpha"]).await; + let request = Request::builder() + .method(Method::GET) + .uri("/openapi.json") + .body(Body::empty()) + .unwrap(); + let (_, json) = json_response(&app, request).await; + let paths = json["paths"].as_object().unwrap(); + // None of the legacy flat protected paths should appear in multi mode. + let flat_protected = [ + "/snapshot", + "/read", + "/export", + "/change", + "/schema", + "/schema/apply", + "/ingest", + "/branches", + "/branches/{branch}", + "/branches/merge", + "/commits", + "/commits/{commit_id}", + ]; + for flat in flat_protected { + assert!( + !paths.contains_key(flat), + "flat path {flat} must not appear in multi-mode spec; \ + cluster routes are the only protected surface" + ); + } +} + +#[tokio::test] +async fn multi_mode_openapi_keeps_management_paths_flat() { + let (_dirs, app) = app_for_multi_mode(&["alpha"]).await; + let request = Request::builder() + .method(Method::GET) + .uri("/openapi.json") + .body(Body::empty()) + .unwrap(); + let (_, json) = json_response(&app, request).await; + let paths = json["paths"].as_object().unwrap(); + for flat in ["/healthz", "/graphs"] { + assert!( + paths.contains_key(flat), + "{flat} must remain flat in multi mode" + ); + let nested = format!("/graphs/{{graph_id}}{flat}"); + assert!( + !paths.contains_key(&nested), + "{flat} must NOT be cluster-prefixed to {nested}" + ); + } +} + +#[tokio::test] +async fn multi_mode_openapi_prefixes_operation_ids_with_cluster() { + let (_dirs, app) = app_for_multi_mode(&["alpha"]).await; + let request = Request::builder() + .method(Method::GET) + .uri("/openapi.json") + .body(Body::empty()) + .unwrap(); + let (_, json) = json_response(&app, request).await; + // Every cluster path operation must have a `cluster_` operation_id. + // Flat-mounted paths (healthz, management /graphs) keep their + // original operation_ids — they're not per-graph. + let paths = json["paths"].as_object().unwrap(); + let mut checked = 0; + for (path, item) in paths { + if path == "/healthz" || path == "/graphs" { + continue; + } + for method in ["get", "post", "put", "delete", "patch"] { + if let Some(op) = item.get(method).filter(|v| v.is_object()) { + if let Some(id) = op["operationId"].as_str() { + assert!( + id.starts_with("cluster_"), + "operation_id at {path}.{method} must start with `cluster_`, got `{id}`" + ); + checked += 1; + } + } + } + } + assert!( + checked >= EXPECTED_CLUSTER_PATHS.len(), + "expected at least {} cluster operation_ids; checked {checked}", + EXPECTED_CLUSTER_PATHS.len() + ); +} + +#[tokio::test] +async fn multi_mode_openapi_declares_graph_id_path_parameter() { + let (_dirs, app) = app_for_multi_mode(&["alpha"]).await; + let request = Request::builder() + .method(Method::GET) + .uri("/openapi.json") + .body(Body::empty()) + .unwrap(); + let (_, json) = json_response(&app, request).await; + let paths = json["paths"].as_object().unwrap(); + + for expected_path in EXPECTED_CLUSTER_PATHS { + let item = paths + .get(*expected_path) + .unwrap_or_else(|| panic!("missing cluster path {expected_path}")); + for method in ["get", "post", "put", "delete", "patch"] { + let Some(operation) = item.get(method).filter(|value| value.is_object()) else { + continue; + }; + let parameters = operation["parameters"] + .as_array() + .unwrap_or_else(|| panic!("{expected_path}.{method} missing parameters")); + let graph_id = parameters + .iter() + .find(|param| param["name"] == "graph_id" && param["in"] == "path") + .unwrap_or_else(|| { + panic!("{expected_path}.{method} missing graph_id path parameter") + }); + assert_eq!( + graph_id["required"].as_bool(), + Some(true), + "{expected_path}.{method} graph_id parameter must be required" + ); + assert_eq!( + graph_id["schema"]["type"].as_str(), + Some("string"), + "{expected_path}.{method} graph_id parameter must be string typed" + ); + } + } + + for flat in ["/healthz", "/graphs"] { + let item = paths.get(flat).unwrap(); + for method in ["get", "post", "put", "delete", "patch"] { + if let Some(operation) = item.get(method).filter(|value| value.is_object()) { + let has_graph_id = operation["parameters"] + .as_array() + .map(|params| { + params + .iter() + .any(|param| param["name"] == "graph_id" && param["in"] == "path") + }) + .unwrap_or(false); + assert!( + !has_graph_id, + "{flat}.{method} must not declare graph_id; it remains flat" + ); + } + } + } +} + +#[tokio::test] +async fn multi_mode_operation_ids_are_unique() { + // Sanity check: the cluster_ prefix prevents collision with flat ids + // (which don't appear in multi mode, but the contract is "unique + // across the spec"). Verify every operation_id in the multi-mode + // spec is unique. + let (_dirs, app) = app_for_multi_mode(&["alpha"]).await; + let request = Request::builder() + .method(Method::GET) + .uri("/openapi.json") + .body(Body::empty()) + .unwrap(); + let (_, json) = json_response(&app, request).await; + let paths = json["paths"].as_object().unwrap(); + let mut seen_ids: HashSet = HashSet::new(); + for (_, item) in paths { + for method in ["get", "post", "put", "delete", "patch"] { + if let Some(op) = item.get(method).filter(|v| v.is_object()) { + if let Some(id) = op["operationId"].as_str() { + assert!( + seen_ids.insert(id.to_string()), + "duplicate operation_id `{id}` in multi-mode spec" + ); + } + } + } + } +} + +#[tokio::test] +async fn single_mode_openapi_unchanged_by_cluster_filter() { + // Regression: single mode still emits the legacy flat surface. + let (_temp, app) = app_for_loaded_graph().await; + let request = Request::builder() + .method(Method::GET) + .uri("/openapi.json") + .body(Body::empty()) + .unwrap(); + let (_, json) = json_response(&app, request).await; + let paths = json["paths"].as_object().unwrap(); + let path_keys: HashSet<&str> = paths.keys().map(|k| k.as_str()).collect(); + for expected in EXPECTED_PATHS { + assert!( + path_keys.contains(expected), + "single mode must still emit flat path: {expected}" + ); + } + for cluster in EXPECTED_CLUSTER_PATHS { + assert!( + !path_keys.contains(cluster), + "single mode must NOT emit cluster path: {cluster}" + ); + } +} diff --git a/crates/omnigraph-server/tests/server.rs b/crates/omnigraph-server/tests/server.rs index 2e04db1..a588cc7 100644 --- a/crates/omnigraph-server/tests/server.rs +++ b/crates/omnigraph-server/tests/server.rs @@ -3592,15 +3592,15 @@ async fn ingest_per_actor_admission_cap_returns_429() { let policy_path = temp.path().join("policy.yaml"); fs::write(&policy_path, permit_all_policy_yaml(&["act-flooder"])).unwrap(); let policy_engine = - omnigraph_server::PolicyEngine::load(&policy_path, graph.to_string_lossy().as_ref()) + omnigraph_server::PolicyEngine::load_graph(&policy_path, graph.to_string_lossy().as_ref()) .unwrap(); - let state = AppState::new_with_workload( + let state = AppState::new_single( graph.to_string_lossy().to_string(), db, vec![("act-flooder".to_string(), "flooder-token".to_string())], + Some(policy_engine), workload, - ) - .with_policy_engine(policy_engine); + ); let app = build_app(state); let _temp = temp; @@ -3693,6 +3693,79 @@ async fn ingest_per_actor_admission_cap_returns_429() { } } +/// Regression for B2 (MR-668): when an `AppState` is built with a +/// per-graph policy and a custom workload, the engine inside the +/// routing's `GraphHandle` MUST have the same policy applied via +/// `Omnigraph::with_policy`. Pre-fix, `new_with_workload(...).with_policy_engine(p)` +/// installed the policy only on the HTTP-layer `handle.policy`; the +/// underlying `Arc` was reused without `with_policy`, so any +/// caller reaching through `state.routing()` could bypass Cedar. +/// +/// This test reaches the engine the same way an embedded SDK consumer +/// or future routing code path would, and asserts the policy still +/// fires. The deny path is "act-blocked has a valid bearer but isn't in +/// the policy's allowed group" — i.e., authenticated-but-unauthorised. +#[tokio::test(flavor = "multi_thread")] +async fn engine_layer_policy_fires_via_direct_arc_omnigraph_from_new_single() { + use omnigraph_server::GraphRouting; + let temp = init_loaded_graph().await; + let graph = graph_path(temp.path()); + let db = Omnigraph::open(graph.to_str().unwrap()).await.unwrap(); + + // Permit `act-allowed` for change actions; `act-blocked` is not in + // any allowed group — every change request from them must deny. + let policy_path = temp.path().join("policy.yaml"); + fs::write(&policy_path, permit_all_policy_yaml(&["act-allowed"])).unwrap(); + let policy_engine = + omnigraph_server::PolicyEngine::load_graph(&policy_path, graph.to_string_lossy().as_ref()) + .unwrap(); + + let workload = omnigraph_server::workload::WorkloadController::new(100, 1_000_000_000); + let state = AppState::new_single( + graph.to_string_lossy().to_string(), + db, + vec![("act-blocked".to_string(), "block-token".to_string())], + Some(policy_engine), + workload, + ); + + // Reach into the routing and pull the engine the same way an + // embedded consumer holding `Arc` would. If `new_single` + // failed to apply `with_policy` to the engine, this `mutate_as` + // would succeed — the HTTP-layer is bypassed entirely. + let handle = match state.routing() { + GraphRouting::Single { handle } => Arc::clone(handle), + GraphRouting::Multi { .. } => panic!("expected single-mode routing"), + }; + let engine = Arc::clone(&handle.engine); + + let mut params: omnigraph_compiler::ParamMap = Default::default(); + params.insert( + "name".to_string(), + omnigraph_compiler::Literal::String("EngineLayerBlocked".to_string()), + ); + params.insert("age".to_string(), omnigraph_compiler::Literal::Integer(30)); + let result = engine + .mutate_as( + "main", + MUTATION_QUERIES, + "insert_person", + ¶ms, + Some("act-blocked"), + ) + .await; + match result { + Err(OmniError::Policy(_)) => { /* expected — engine-layer gate fired */ } + Ok(_) => panic!( + "engine-layer policy did NOT fire — act-blocked successfully ran mutate_as via \ + the engine pulled from the registry handle. AppState::new_single failed to apply \ + with_policy to the underlying Omnigraph engine. This is the B2 footgun the \ + with_policy_engine deletion was supposed to close." + ), + Err(other) => panic!("expected OmniError::Policy, got: {other:?}"), + } +} + #[tokio::test(flavor = "multi_thread")] async fn oversized_request_body_returns_payload_too_large() { let (_temp, app) = app_for_loaded_graph().await; @@ -3871,7 +3944,7 @@ async fn build_parity_graph() -> (tempfile::TempDir, PathBuf, PathBuf) { } async fn sdk_change_decision(graph: &Path, policy_path: &Path, actor: &str) -> ParityDecision { - let policy = PolicyEngine::load(policy_path, graph.to_string_lossy().as_ref()).unwrap(); + let policy = PolicyEngine::load_graph(policy_path, graph.to_string_lossy().as_ref()).unwrap(); let db = Omnigraph::open(graph.to_str().unwrap()) .await .unwrap() @@ -3939,7 +4012,7 @@ async fn http_change_decision( } async fn sdk_merge_decision(graph: &Path, policy_path: &Path, actor: &str) -> ParityDecision { - let policy = PolicyEngine::load(policy_path, graph.to_string_lossy().as_ref()).unwrap(); + let policy = PolicyEngine::load_graph(policy_path, graph.to_string_lossy().as_ref()).unwrap(); let db = Omnigraph::open(graph.to_str().unwrap()) .await .unwrap() @@ -4333,3 +4406,916 @@ async fn schema_apply_route_additive_property_preserves_existing_rows() { "AddProperty should preserve row count", ); } + +// ─── MR-668: multi-graph startup ────────────────────────────────────────── + +mod multi_graph_startup { + use super::*; + use omnigraph::storage::normalize_root_uri; + use omnigraph_server::{ + GraphHandle, GraphId, GraphKey, GraphRegistry, InsertError, ServerConfig, ServerConfigMode, + load_server_settings, + }; + use std::sync::Arc; + + async fn build_multi_mode_app(graph_ids: &[&str]) -> (Vec, Router) { + let mut dirs = Vec::with_capacity(graph_ids.len()); + let mut handles = Vec::with_capacity(graph_ids.len()); + for id in graph_ids { + let dir = tempfile::tempdir().unwrap(); + let graph_uri = dir.path().join(id).to_str().unwrap().to_string(); + let schema = fs::read_to_string(fixture("test.pg")).unwrap(); + let engine = Omnigraph::init(&graph_uri, &schema).await.unwrap(); + handles.push(Arc::new(GraphHandle { + key: GraphKey::cluster(GraphId::try_from(*id).unwrap()), + uri: graph_uri, + engine: Arc::new(engine), + policy: None, + })); + dirs.push(dir); + } + let workload = omnigraph_server::workload::WorkloadController::from_env(); + let state = AppState::new_multi(handles, Vec::new(), None, workload, None).unwrap(); + let app = build_app(state); + (dirs, app) + } + + /// Cluster route `/graphs/{graph_id}/snapshot` resolves to the right + /// engine. Two graphs side by side; assert each responds to its own + /// id and does NOT respond to the other's URL. + #[tokio::test(flavor = "multi_thread")] + async fn cluster_routes_dispatch_per_graph_handle() { + let (_dirs, app) = build_multi_mode_app(&["alpha", "beta"]).await; + for id in ["alpha", "beta"] { + let resp = app + .clone() + .oneshot( + Request::builder() + .method(Method::GET) + .uri(format!("/graphs/{id}/snapshot?branch=main")) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!( + resp.status(), + StatusCode::OK, + "graph '{id}' must respond OK on its cluster snapshot route" + ); + } + } + + /// Unknown graph id under the cluster prefix yields 404 (not 500, + /// not 410 — `Gone` is reserved for the future DELETE flow). + #[tokio::test(flavor = "multi_thread")] + async fn cluster_route_for_unknown_graph_returns_404() { + let (_dirs, app) = build_multi_mode_app(&["alpha"]).await; + let resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/graphs/nonexistent/snapshot?branch=main") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::NOT_FOUND); + } + + /// Coverage net for cluster-route regressions across every + /// protected handler — not just the few that have inner path + /// params. Bug-1 surfaced because only `/snapshot` was being + /// exercised in cluster mode, leaving the other six protected + /// routes implicitly untested. This sweep hits each one and + /// asserts the response shows the handler was reached: no 404 + /// (router didn't match), no 500 with "Wrong number of path + /// arguments" (path extractor broke), no 500 with "missing + /// extension" (routing middleware didn't inject the handle). + /// + /// Status codes are negative assertions because each handler's + /// happy-path inputs differ — what matters is "the request + /// reached the handler," not "the handler returned 200." The + /// individual handlers' logic is already tested in single mode. + #[tokio::test(flavor = "multi_thread")] + async fn all_protected_cluster_routes_resolve_to_their_handler() { + let (_dirs, app) = build_multi_mode_app(&["alpha"]).await; + + // (method, path, body) — one minimal request per protected + // cluster route. Bodies are valid enough that the router and + // extractors succeed; whether the engine ultimately returns + // 200 or 4xx is per-handler and not what this test pins. + let cases: &[(Method, &str, Option<&str>)] = &[ + (Method::GET, "/graphs/alpha/snapshot?branch=main", None), + (Method::GET, "/graphs/alpha/schema", None), + (Method::GET, "/graphs/alpha/branches", None), + (Method::GET, "/graphs/alpha/commits", None), + ( + Method::POST, + "/graphs/alpha/read", + Some(r#"{"query_source":"query q() { return {} }"}"#), + ), + ( + Method::POST, + "/graphs/alpha/change", + Some(r#"{"query_source":"query q() { return {} }"}"#), + ), + ( + Method::POST, + "/graphs/alpha/export", + Some(r#"{"branch":"main"}"#), + ), + ( + Method::POST, + "/graphs/alpha/schema/apply", + Some(r#"{"schema_source":"","allow_data_loss":false}"#), + ), + (Method::POST, "/graphs/alpha/ingest", Some(r#"{"data":""}"#)), + ( + Method::POST, + "/graphs/alpha/branches/merge", + Some(r#"{"source":"main","target":"main"}"#), + ), + ]; + + for (method, path, body) in cases { + let req_body = body + .map(|s| Body::from(s.to_string())) + .unwrap_or_else(Body::empty); + let req = Request::builder() + .method(method.clone()) + .uri(*path) + .header("content-type", "application/json") + .body(req_body) + .unwrap(); + let resp = app.clone().oneshot(req).await.unwrap(); + let status = resp.status(); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let body_str = String::from_utf8_lossy(&bytes); + + assert_ne!( + status, + StatusCode::NOT_FOUND, + "{} {} — router didn't match (cluster-route mounting regression). Body: {}", + method, + path, + body_str, + ); + assert!( + !(status == StatusCode::INTERNAL_SERVER_ERROR + && body_str.contains("Wrong number of path arguments")), + "{} {} — path extractor broke (Bug-1 class regression). Body: {}", + method, + path, + body_str, + ); + assert!( + !(status == StatusCode::INTERNAL_SERVER_ERROR + && body_str.to_lowercase().contains("missing extension")), + "{} {} — routing middleware didn't inject GraphHandle. Body: {}", + method, + path, + body_str, + ); + } + } + + /// Regression for the bot-surfaced path-extractor bug: cluster + /// routes whose inner path also captures a parameter + /// (`/graphs/{graph_id}/branches/{branch}`, + /// `/graphs/{graph_id}/commits/{commit_id}`) must extract the + /// inner param cleanly. Axum 0.8 propagates the outer `{graph_id}` + /// capture into nested handlers, so a `Path` extractor + /// would see two values and fail with "Wrong number of path + /// arguments. Expected 1 but got 2." Today both DELETE branch and + /// GET commit-by-id break in multi-mode because their handlers + /// use bare `Path` — this test pins the fix. + /// + /// The broader `all_protected_cluster_routes_resolve_to_their_handler` + /// test sweeps the full route surface; this one stays narrowly + /// targeted at the inner-path-param shape because that's the + /// specific regression class. + #[tokio::test(flavor = "multi_thread")] + async fn cluster_routes_with_inner_path_params_deserialize_correctly() { + let (_dirs, app) = build_multi_mode_app(&["alpha"]).await; + + // Create a branch we can then delete — DELETE /graphs/alpha/branches/feature + let create_resp = app + .clone() + .oneshot( + Request::builder() + .method(Method::POST) + .uri("/graphs/alpha/branches") + .header("content-type", "application/json") + .body(Body::from(r#"{"name":"feature"}"#)) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!( + create_resp.status(), + StatusCode::OK, + "branch create on the cluster route must succeed before delete can be tested" + ); + + // DELETE /graphs/{graph_id}/branches/{branch} — exercises a handler + // whose only Path extractor (`branch`) is inside a nested route + // that also captures `graph_id`. The handler must pick `branch` + // by name, not by position. + let delete_resp = app + .clone() + .oneshot( + Request::builder() + .method(Method::DELETE) + .uri("/graphs/alpha/branches/feature") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + let delete_status = delete_resp.status(); + let delete_body = to_bytes(delete_resp.into_body(), usize::MAX).await.unwrap(); + assert_eq!( + delete_status, + StatusCode::OK, + "DELETE /graphs/{{id}}/branches/{{branch}} must extract `branch` cleanly. \ + Body: {}", + String::from_utf8_lossy(&delete_body), + ); + + // GET /graphs/{graph_id}/commits/{commit_id} — same shape: the + // handler's only Path extractor is the inner `commit_id`, which + // must deserialize by name even though `graph_id` is also in scope. + // We don't know a real commit_id, but the failure mode under test + // is path extraction, not commit lookup — a 404 from the engine + // is fine; a 500 with "Wrong number of path arguments" is the bug. + let commit_resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/graphs/alpha/commits/0000000000000000") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + let commit_status = commit_resp.status(); + let commit_body = to_bytes(commit_resp.into_body(), usize::MAX).await.unwrap(); + let body_str = String::from_utf8_lossy(&commit_body); + assert!( + commit_status != StatusCode::INTERNAL_SERVER_ERROR + || !body_str.contains("Wrong number of path arguments"), + "GET /graphs/{{id}}/commits/{{commit_id}} must extract `commit_id` cleanly. \ + Got: {} | {}", + commit_status, + body_str, + ); + } + + /// Flat routes 404 in multi mode — the router only mounts under + /// `/graphs/{graph_id}/...` so `/snapshot` doesn't resolve. + #[tokio::test(flavor = "multi_thread")] + async fn flat_routes_404_in_multi_mode() { + let (_dirs, app) = build_multi_mode_app(&["alpha"]).await; + let resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/snapshot?branch=main") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::NOT_FOUND); + } + + /// `GraphId` validation runs at startup — a reserved name in + /// `omnigraph.yaml` produces a clear error rather than getting + /// rejected per-request. + #[test] + fn load_server_settings_rejects_reserved_graph_id() { + let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +graphs: + policies: + uri: /tmp/g1.omni +"#, + ) + .unwrap(); + let err = load_server_settings(Some(&config_path), None, None, None, false).unwrap_err(); + assert!( + err.to_string().contains("invalid graph id 'policies'"), + "expected reserved-name rejection, got: {err}" + ); + } + + #[tokio::test(flavor = "multi_thread")] + async fn registry_rejects_duplicate_normalized_graph_uris() { + let dir = tempfile::tempdir().unwrap(); + let graph_uri = dir.path().join("same").to_str().unwrap().to_string(); + let schema = fs::read_to_string(fixture("test.pg")).unwrap(); + let engine = Arc::new(Omnigraph::init(&graph_uri, &schema).await.unwrap()); + + let alpha = Arc::new(GraphHandle { + key: GraphKey::cluster(GraphId::try_from("alpha").unwrap()), + uri: graph_uri.clone(), + engine: Arc::clone(&engine), + policy: None, + }); + let beta = Arc::new(GraphHandle { + key: GraphKey::cluster(GraphId::try_from("beta").unwrap()), + uri: format!("file://{graph_uri}/"), + engine, + policy: None, + }); + + match GraphRegistry::from_handles(vec![alpha, beta]) { + Err(InsertError::DuplicateUri(uri)) => { + assert!( + normalize_root_uri(&uri).is_ok(), + "duplicate URI should still be parseable, got {uri}" + ); + } + Err(err) => panic!("expected DuplicateUri for normalized aliases, got {err:?}"), + Ok(_) => panic!("expected DuplicateUri for normalized aliases, got Ok"), + } + } + + #[tokio::test(flavor = "multi_thread")] + async fn registry_stores_canonical_graph_uri() { + let dir = tempfile::tempdir().unwrap(); + let graph_uri = dir.path().join("canonical").to_str().unwrap().to_string(); + let schema = fs::read_to_string(fixture("test.pg")).unwrap(); + let engine = Omnigraph::init(&graph_uri, &schema).await.unwrap(); + let handle = Arc::new(GraphHandle { + key: GraphKey::cluster(GraphId::try_from("alpha").unwrap()), + uri: format!("file://{graph_uri}/"), + engine: Arc::new(engine), + policy: None, + }); + + let registry = GraphRegistry::from_handles(vec![handle]).unwrap(); + let listed = registry.list(); + assert_eq!(listed.len(), 1); + assert_eq!(listed[0].uri, graph_uri); + } + + // ── Four-rule mode inference matrix ─────────────────────────────── + + /// Rule 1: CLI positional URI → Single. + #[test] + fn mode_inference_cli_uri_is_single() { + let settings = load_server_settings( + None, + Some("/tmp/cli.omni".to_string()), + None, + None, + true, // allow unauth so we get past the runtime-state check + ) + .unwrap(); + match settings.mode { + ServerConfigMode::Single { uri, .. } => assert_eq!(uri, "/tmp/cli.omni"), + ServerConfigMode::Multi { .. } => panic!("expected Single (rule 1), got Multi"), + } + } + + /// Rule 2: --target picks one graph from `graphs:` map → Single. + #[test] + fn mode_inference_cli_target_is_single() { + let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +graphs: + alpha: + uri: /tmp/alpha.omni + beta: + uri: /tmp/beta.omni +"#, + ) + .unwrap(); + let settings = + load_server_settings(Some(&config_path), None, Some("alpha".into()), None, true) + .unwrap(); + match settings.mode { + ServerConfigMode::Single { uri, .. } => assert_eq!(uri, "/tmp/alpha.omni"), + ServerConfigMode::Multi { .. } => panic!("expected Single (rule 2), got Multi"), + } + } + + /// Rule 3: `server.graph` set → Single (target picked from config). + #[test] + fn mode_inference_server_graph_is_single() { + let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +graphs: + alpha: + uri: /tmp/alpha.omni + beta: + uri: /tmp/beta.omni +server: + graph: beta +"#, + ) + .unwrap(); + let settings = load_server_settings(Some(&config_path), None, None, None, true).unwrap(); + match settings.mode { + ServerConfigMode::Single { uri, .. } => assert_eq!(uri, "/tmp/beta.omni"), + ServerConfigMode::Multi { .. } => panic!("expected Single (rule 3), got Multi"), + } + } + + /// Rule 4: `--config` + non-empty `graphs:` + no single-mode selector → Multi. + #[test] + fn mode_inference_config_plus_graphs_is_multi() { + let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +graphs: + alpha: + uri: /tmp/alpha.omni + beta: + uri: /tmp/beta.omni +"#, + ) + .unwrap(); + let settings = load_server_settings(Some(&config_path), None, None, None, true).unwrap(); + match settings.mode { + ServerConfigMode::Multi { graphs, .. } => { + let ids: Vec<&str> = graphs.iter().map(|g| g.graph_id.as_str()).collect(); + // BTreeMap iteration order is alphabetical. + assert_eq!(ids, vec!["alpha", "beta"]); + } + ServerConfigMode::Single { .. } => panic!("expected Multi (rule 4), got Single"), + } + } + + #[test] + fn mode_inference_multi_rejects_top_level_policy_file() { + let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +policy: + file: ./policy.yaml +graphs: + alpha: + uri: /tmp/alpha.omni +"#, + ) + .unwrap(); + let err = load_server_settings(Some(&config_path), None, None, None, true).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("top-level `policy.file` is single-graph/CLI-local policy only"), + "expected single-graph policy guidance, got: {msg}" + ); + assert!( + msg.contains("graphs..policy.file"), + "expected per-graph migration guidance, got: {msg}" + ); + assert!( + msg.contains("server.policy.file"), + "expected server policy migration guidance, got: {msg}" + ); + } + + #[test] + fn mode_inference_normalizes_multi_graph_uris() { + let temp = tempfile::tempdir().unwrap(); + let graph = temp.path().join("alpha.omni"); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + format!( + r#" +graphs: + alpha: + uri: file://{}/ +"#, + graph.display() + ), + ) + .unwrap(); + let settings = load_server_settings(Some(&config_path), None, None, None, true).unwrap(); + match settings.mode { + ServerConfigMode::Multi { graphs, .. } => { + assert_eq!(graphs[0].uri, graph.to_string_lossy()); + } + ServerConfigMode::Single { .. } => panic!("expected Multi"), + } + } + + /// Rule 5: nothing → error with migration hint. + #[test] + fn mode_inference_no_inputs_errors_with_migration_hint() { + let err = load_server_settings(None, None, None, None, true).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("no graph to serve"), + "expected migration-hint error, got: {msg}" + ); + } + + /// Rule 4 sub-case: `--config` with empty `graphs:` map and no + /// single-mode selector → rule 5 fires (no graph to serve). + #[test] + fn mode_inference_empty_graphs_map_errors() { + let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write(&config_path, "server:\n bind: 127.0.0.1:8080\n").unwrap(); + let err = load_server_settings(Some(&config_path), None, None, None, true).unwrap_err(); + assert!(err.to_string().contains("no graph to serve")); + } + + /// `--config` + `` together: URI wins → Single (the CLI URI + /// takes precedence over the config's graphs map). + #[test] + fn mode_inference_cli_uri_overrides_graphs_map() { + let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +graphs: + alpha: + uri: /tmp/alpha.omni +"#, + ) + .unwrap(); + let settings = load_server_settings( + Some(&config_path), + Some("/tmp/cli-override.omni".to_string()), + None, + None, + true, + ) + .unwrap(); + match settings.mode { + ServerConfigMode::Single { uri, .. } => { + assert_eq!( + uri, "/tmp/cli-override.omni", + "CLI URI must win over graphs: map" + ); + } + ServerConfigMode::Multi { .. } => { + panic!("expected Single (CLI URI wins), got Multi") + } + } + } + + /// Per-graph `policy.file` is resolved relative to the config base_dir. + #[test] + fn per_graph_policy_file_is_resolved_relative_to_base_dir() { + let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +graphs: + alpha: + uri: /tmp/alpha.omni + policy: + file: ./policies/alpha.yaml + beta: + uri: /tmp/beta.omni +"#, + ) + .unwrap(); + let settings = load_server_settings(Some(&config_path), None, None, None, true).unwrap(); + let graphs = match settings.mode { + ServerConfigMode::Multi { graphs, .. } => graphs, + _ => panic!("expected Multi"), + }; + // graphs is BTreeMap-iter order (alphabetical). + let alpha = &graphs[0]; + let beta = &graphs[1]; + assert_eq!(alpha.graph_id, "alpha"); + assert_eq!( + alpha.policy_file.as_ref().unwrap(), + &temp.path().join("policies/alpha.yaml") + ); + assert_eq!(beta.graph_id, "beta"); + assert!(beta.policy_file.is_none()); + } + + /// `server.policy.file` resolves alongside the graphs map. + #[test] + fn server_policy_file_is_resolved_relative_to_base_dir() { + let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +server: + policy: + file: ./server-policy.yaml +graphs: + alpha: + uri: /tmp/alpha.omni +"#, + ) + .unwrap(); + let settings = load_server_settings(Some(&config_path), None, None, None, true).unwrap(); + match settings.mode { + ServerConfigMode::Multi { + server_policy_file, .. + } => { + assert_eq!( + server_policy_file.unwrap(), + temp.path().join("server-policy.yaml") + ); + } + _ => panic!("expected Multi"), + } + } + + /// `GET /graphs` must NOT leak the registry in Open mode without + /// an explicit server policy. Operators who pass `--unauthenticated` + /// opted into trusting the network for graph DATA, not for leaking + /// server topology (graph IDs + URIs, which may contain S3 bucket + /// paths or internal hostnames). Cedar gating the management + /// surface is the documented contract for `server_graphs_list` + /// ("don't leak the registry until the operator explicitly + /// authorizes it"); enforcing that contract in every runtime + /// state — not just `PolicyEnabled` — is the correct-by-design + /// closure of the open-mode hole the bot-review pass surfaced. + /// + /// Today (pre-fix) this returns 200 because `authorize_request`'s + /// no-policy fallback only denies when `actor.is_some()`, so Open + /// mode (`actor: None`) falls through to `Ok(())`. The fix in the + /// next commit tightens the fallback so server-scoped actions + /// always require explicit policy. + /// + /// Sort-order coverage previously lived here; it has moved to + /// `get_graphs_with_server_policy_authorizes_per_cedar` where + /// the response body is now non-empty and operator-authorized. + #[tokio::test(flavor = "multi_thread")] + async fn get_graphs_denied_in_open_mode_without_server_policy() { + let (_dirs, app) = build_multi_mode_app(&["beta", "alpha"]).await; + let resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/graphs") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + let status = resp.status(); + let body = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let body_str = String::from_utf8_lossy(&body); + assert_eq!( + status, + StatusCode::FORBIDDEN, + "GET /graphs must require an explicit server policy in every \ + runtime state; Open-mode bypass would leak server topology. \ + Body: {body_str}", + ); + } + + /// `GET /graphs` returns 405 in single mode (resource exists in the + /// API surface, just not operational without a `graphs:` map). + #[tokio::test(flavor = "multi_thread")] + async fn get_graphs_returns_405_in_single_mode() { + let temp = init_loaded_graph().await; + let graph = graph_path(temp.path()); + let state = AppState::open(graph.to_string_lossy().to_string()) + .await + .unwrap(); + let app = build_app(state); + let resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/graphs") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::METHOD_NOT_ALLOWED); + } + + /// `GET /graphs` requires bearer auth when tokens are configured. + #[tokio::test(flavor = "multi_thread")] + async fn get_graphs_requires_bearer_auth_when_configured() { + use omnigraph_server::{GraphHandle, GraphId, GraphKey}; + // Build a multi-mode app with bearer tokens configured. + let dir = tempfile::tempdir().unwrap(); + let graph_uri = dir.path().join("alpha").to_str().unwrap().to_string(); + let schema = fs::read_to_string(fixture("test.pg")).unwrap(); + let engine = Omnigraph::init(&graph_uri, &schema).await.unwrap(); + let handle = Arc::new(GraphHandle { + key: GraphKey::cluster(GraphId::try_from("alpha").unwrap()), + uri: graph_uri, + engine: Arc::new(engine), + policy: None, + }); + let tokens = vec![("act-andrew".to_string(), "secret-token".to_string())]; + let workload = omnigraph_server::workload::WorkloadController::from_env(); + let state = AppState::new_multi(vec![handle], tokens, None, workload, None).unwrap(); + let app = build_app(state); + + // No Authorization header → 401. + let resp_no_auth = app + .clone() + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/graphs") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp_no_auth.status(), StatusCode::UNAUTHORIZED); + + // With auth but no server policy → 403 (default-deny, since + // GraphList is not Read). + let resp_authed = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/graphs") + .header("authorization", "Bearer secret-token") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp_authed.status(), StatusCode::FORBIDDEN); + } + + /// `GET /graphs` with a server policy that allows `graph_list` → 200 + /// and returns the registry sorted alphabetically by `graph_id`. + /// `GET /graphs` with a server policy that does NOT allow + /// `graph_list` (viewer group) → 403. + /// + /// This test owns the alphabetical-sort coverage that previously + /// lived in `get_graphs_lists_registered_graphs_in_multi_mode`. + /// That test now asserts denial in Open mode (server-scoped actions + /// require explicit policy in every runtime state), so the positive + /// body-shape assertions need a home where the response is + /// operator-authorized — here. + #[tokio::test(flavor = "multi_thread")] + async fn get_graphs_with_server_policy_authorizes_per_cedar() { + use omnigraph_policy::PolicyEngine; + use omnigraph_server::{GraphHandle, GraphId, GraphKey}; + + let dir = tempfile::tempdir().unwrap(); + + // Two graphs deliberately registered in non-alphabetical order + // so the test would fail if the handler relied on insertion + // order instead of server-side sorting. + let schema = fs::read_to_string(fixture("test.pg")).unwrap(); + let mut handles = Vec::new(); + for id in ["beta", "alpha"] { + let graph_uri = dir.path().join(id).to_str().unwrap().to_string(); + let engine = Omnigraph::init(&graph_uri, &schema).await.unwrap(); + handles.push(Arc::new(GraphHandle { + key: GraphKey::cluster(GraphId::try_from(id).unwrap()), + uri: graph_uri, + engine: Arc::new(engine), + policy: None, + })); + } + + // Server policy: admins can graph_list, viewers cannot. + let policy_path = dir.path().join("server-policy.yaml"); + fs::write( + &policy_path, + r#" +version: 1 +groups: + admins: [act-andrew] + viewers: [act-bruno] +rules: + - id: admins-list-graphs + allow: + actors: { group: admins } + actions: [graph_list] +"#, + ) + .unwrap(); + let server_policy = PolicyEngine::load_server(&policy_path).unwrap(); + + let tokens = vec![ + ("act-andrew".to_string(), "andrew-token".to_string()), + ("act-bruno".to_string(), "bruno-token".to_string()), + ]; + let workload = omnigraph_server::workload::WorkloadController::from_env(); + let state = + AppState::new_multi(handles, tokens, Some(server_policy), workload, None).unwrap(); + let app = build_app(state); + + // Admin → 200, body returns both graphs alphabetically sorted. + let resp_admin = app + .clone() + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/graphs") + .header("authorization", "Bearer andrew-token") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!( + resp_admin.status(), + StatusCode::OK, + "admin must be allowed graph_list" + ); + let body = to_bytes(resp_admin.into_body(), usize::MAX).await.unwrap(); + let json: Value = serde_json::from_slice(&body).unwrap(); + let graphs = json["graphs"].as_array().unwrap(); + assert_eq!(graphs.len(), 2, "response must list both registered graphs"); + assert_eq!( + graphs[0]["graph_id"].as_str().unwrap(), + "alpha", + "server must sort graphs alphabetically by graph_id (insertion order was 'beta', 'alpha')" + ); + assert_eq!(graphs[1]["graph_id"].as_str().unwrap(), "beta"); + + // Viewer → 403 + let resp_viewer = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/graphs") + .header("authorization", "Bearer bruno-token") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!( + resp_viewer.status(), + StatusCode::FORBIDDEN, + "viewer must be denied graph_list (Cedar gate)" + ); + } + + /// Loads an `omnigraph.yaml` with two graphs and verifies multi-mode + /// inference plus graph entry resolution. Cluster-route dispatch is + /// covered by the route tests above. + #[tokio::test(flavor = "multi_thread")] + async fn server_settings_load_multi_graph_config_entries() { + let cfg_dir = tempfile::tempdir().unwrap(); + // Real graph storage dirs (the URIs in the config must point to + // a graph init-able location). + let alpha_dir = cfg_dir.path().join("alpha.omni"); + let beta_dir = cfg_dir.path().join("beta.omni"); + let schema = fs::read_to_string(fixture("test.pg")).unwrap(); + Omnigraph::init(alpha_dir.to_str().unwrap(), &schema) + .await + .unwrap(); + Omnigraph::init(beta_dir.to_str().unwrap(), &schema) + .await + .unwrap(); + + let config_path = cfg_dir.path().join("omnigraph.yaml"); + fs::write( + &config_path, + format!( + r#" +graphs: + alpha: + uri: {alpha} + beta: + uri: {beta} +"#, + alpha = alpha_dir.display(), + beta = beta_dir.display(), + ), + ) + .unwrap(); + + let settings: ServerConfig = + load_server_settings(Some(&config_path), None, None, None, true).unwrap(); + assert!(matches!(settings.mode, ServerConfigMode::Multi { .. })); + + match settings.mode { + ServerConfigMode::Multi { graphs, .. } => { + assert_eq!(graphs.len(), 2); + let ids: Vec<&str> = graphs.iter().map(|g| g.graph_id.as_str()).collect(); + assert_eq!(ids, vec!["alpha", "beta"]); + } + _ => unreachable!(), + } + } +} diff --git a/crates/omnigraph/examples/bench_expand.rs b/crates/omnigraph/examples/bench_expand.rs index 1b0011a..c723b24 100644 --- a/crates/omnigraph/examples/bench_expand.rs +++ b/crates/omnigraph/examples/bench_expand.rs @@ -239,7 +239,9 @@ async fn main() { let jsonl = generate_jsonl(n, avg_deg, 42); let t = Instant::now(); - load_jsonl(&mut db, &jsonl, LoadMode::Overwrite).await.unwrap(); + load_jsonl(&mut db, &jsonl, LoadMode::Overwrite) + .await + .unwrap(); let load_elapsed = t.elapsed(); println!( diff --git a/crates/omnigraph/src/db/mod.rs b/crates/omnigraph/src/db/mod.rs index 6bdd9ee..d0b292f 100644 --- a/crates/omnigraph/src/db/mod.rs +++ b/crates/omnigraph/src/db/mod.rs @@ -10,11 +10,11 @@ pub(crate) mod write_queue; pub use commit_graph::GraphCommit; pub use graph_coordinator::{GraphCoordinator, ReadTarget, ResolvedTarget, SnapshotId}; pub use manifest::{Snapshot, SubTableEntry, SubTableUpdate}; +pub(crate) use omnigraph::ensure_public_branch_ref; pub use omnigraph::{ - CleanupPolicyOptions, MergeOutcome, Omnigraph, OpenMode, SchemaApplyOptions, + CleanupPolicyOptions, InitOptions, MergeOutcome, Omnigraph, OpenMode, SchemaApplyOptions, SchemaApplyResult, TableCleanupStats, TableOptimizeStats, }; -pub(crate) use omnigraph::ensure_public_branch_ref; pub(crate) use run_registry::is_internal_run_branch; pub(crate) const SCHEMA_APPLY_LOCK_BRANCH: &str = "__schema_apply_lock__"; @@ -59,9 +59,7 @@ impl MutationOpKind { pub(crate) fn strict_pre_stage_version_check(self) -> bool { match self { MutationOpKind::Insert | MutationOpKind::Merge => false, - MutationOpKind::Update - | MutationOpKind::Delete - | MutationOpKind::SchemaRewrite => true, + MutationOpKind::Update | MutationOpKind::Delete | MutationOpKind::SchemaRewrite => true, } } } diff --git a/crates/omnigraph/src/db/omnigraph.rs b/crates/omnigraph/src/db/omnigraph.rs index 30a8f14..5c92ac3 100644 --- a/crates/omnigraph/src/db/omnigraph.rs +++ b/crates/omnigraph/src/db/omnigraph.rs @@ -165,31 +165,137 @@ pub enum OpenMode { ReadOnly, } +/// Options for [`Omnigraph::init_with_options`]. +/// +/// `force` controls the safety preflight that prevents an +/// accidental re-init from overwriting an existing graph's schema +/// metadata. Default behavior (`force: false`) fails fast with +/// [`OmniError::AlreadyInitialized`] if any of `_schema.pg`, +/// `_schema.ir.json`, or `__schema_state.json` already exists at +/// the target URI. With `force: true` the preflight is skipped — +/// existing schema files are overwritten in place. Force does NOT +/// purge old Lance datasets or `__manifest/`; reclaiming those +/// still requires deleting the graph directory by hand (or via a +/// future `DELETE /graphs/{id}`). +#[derive(Debug, Clone, Copy, Default)] +pub struct InitOptions { + /// Skip the existing-graph preflight. Operators set this when + /// they actually mean to overwrite — e.g. `omnigraph init --force`. + pub force: bool, +} + impl Omnigraph { /// Create a new graph at `uri` from schema source. /// - /// Creates `_schema.pg`, per-type Lance datasets, and `__manifest`. + /// Strict mode: errors with [`OmniError::AlreadyInitialized`] if + /// `uri` already holds any of the three schema artifacts. To + /// overwrite an existing graph deliberately, call + /// [`Self::init_with_options`] with `InitOptions { force: true }`. pub async fn init(uri: &str, schema_source: &str) -> Result { - Self::init_with_storage(uri, schema_source, storage_for_uri(uri)?).await + Self::init_with_options(uri, schema_source, InitOptions::default()).await + } + + /// Create a new graph at `uri`, with explicit init-time options. + /// + /// See [`InitOptions`] for the safety contract — by default this + /// behaves identically to [`Self::init`]. + pub async fn init_with_options( + uri: &str, + schema_source: &str, + options: InitOptions, + ) -> Result { + Self::init_with_storage(uri, schema_source, storage_for_uri(uri)?, options).await } pub(crate) async fn init_with_storage( uri: &str, schema_source: &str, storage: Arc, + options: InitOptions, ) -> Result { let root = normalize_root_uri(uri)?; + + // Preflight: refuse to clobber an existing graph unless the + // operator passed `force`. This runs BEFORE any parse or + // write so a misdirected `init` against an existing graph + // URI cannot reach a code path that overwrites or, on a + // later cleanup, deletes the schema files. + // + // Closes the "init is destructive against existing state" + // class: there is no longer a code path where strict-mode + // `init` can mutate a populated graph root. + if !options.force { + for candidate in [ + schema_source_uri(&root), + schema_ir_uri(&root), + schema_state_uri(&root), + ] { + if storage.exists(&candidate).await? { + return Err(OmniError::AlreadyInitialized { uri: root.clone() }); + } + } + } + let schema_ir = read_schema_ir_from_source(schema_source)?; let mut catalog = build_catalog_from_ir(&schema_ir)?; fixup_blob_schemas(&mut catalog); - // Write _schema.pg - let schema_path = join_uri(&root, SCHEMA_SOURCE_FILENAME); - storage.write_text(&schema_path, schema_source).await?; - write_schema_contract(&root, storage.as_ref(), &schema_ir).await?; + // Establish an atomic ownership claim on `_schema.pg` before + // writing the remaining init artifacts. A check-then-write preflight + // is not enough under concurrent `init` calls: two callers can both + // observe an empty root, one can successfully initialize, and the + // loser can then fail in Lance `WriteMode::Create`. Only the caller + // that atomically created `_schema.pg` may clean up schema artifacts + // on later failure. + let schema_pg_claimed = if options.force { + false + } else { + let schema_path = join_uri(&root, SCHEMA_SOURCE_FILENAME); + if !storage + .write_text_if_absent(&schema_path, schema_source) + .await? + { + return Err(OmniError::AlreadyInitialized { uri: root.clone() }); + } + if let Err(err) = crate::failpoints::maybe_fail("init.after_schema_pg_written") { + best_effort_cleanup_init_artifacts(&root, storage.as_ref()).await; + return Err(err); + } + true + }; - // Create manifest + per-type datasets - let coordinator = GraphCoordinator::init(&root, &catalog, Arc::clone(&storage)).await?; + // Run the I/O phase. On any error, best-effort-clean schema + // artifacts only when this invocation owns them: strict mode owns + // them after the atomic `_schema.pg` claim above; force mode owns + // destructive overwrite semantics by explicit operator request. + // + // Coverage gap: Lance per-type datasets and `__manifest/` + // directory created by `GraphCoordinator::init` are NOT cleaned + // up here — fully recursive directory deletion requires a + // `StorageAdapter::delete_prefix` primitive that's deferred + // along with `DELETE /graphs/{id}` (PR 2b in the MR-668 plan + // is currently deferred). If `init` fails after coordinator + // init succeeds, operators may need to remove the graph + // directory manually before retrying `init` on the same URI. + // Documented in the PR 2a commit message and `init` rustdoc. + let coordinator = match init_storage_phase( + &root, + schema_source, + &schema_ir, + &catalog, + &storage, + !schema_pg_claimed, + ) + .await + { + Ok(coordinator) => coordinator, + Err(err) => { + if schema_pg_claimed || options.force { + best_effort_cleanup_init_artifacts(&root, storage.as_ref()).await; + } + return Err(err); + } + }; Ok(Self { root_uri: root.clone(), @@ -1477,6 +1583,71 @@ fn read_schema_ir_from_source(schema_source: &str) -> Result { build_schema_ir(&schema_ast).map_err(|err| OmniError::manifest(err.to_string())) } +/// I/O phase of `Omnigraph::init_with_storage`. Split out so the caller +/// can pattern-match on the result and run cleanup on error before +/// returning the original error. +/// +/// Failpoints fire at the phase boundaries: +/// * `init.after_schema_pg_written` — `_schema.pg` is on disk. In strict mode +/// this fires in the caller immediately after the atomic ownership claim; in +/// force mode it fires here after the explicit overwrite. +/// * `init.after_schema_contract_written` — `_schema.pg` + `_schema.ir.json` +/// + `__schema_state.json` are on disk. +/// * `init.after_coordinator_init` — all schema files plus Lance per-type +/// datasets and `__manifest/` are on disk. (The cleanup wrapper can only +/// remove the schema files; Lance directories need `delete_prefix` — +/// deferred along with `DELETE /graphs/{id}`.) +async fn init_storage_phase( + root: &str, + schema_source: &str, + schema_ir: &SchemaIR, + catalog: &Catalog, + storage: &Arc, + write_schema_pg: bool, +) -> Result { + if write_schema_pg { + let schema_path = join_uri(root, SCHEMA_SOURCE_FILENAME); + storage.write_text(&schema_path, schema_source).await?; + crate::failpoints::maybe_fail("init.after_schema_pg_written")?; + } + + write_schema_contract(root, storage.as_ref(), schema_ir).await?; + crate::failpoints::maybe_fail("init.after_schema_contract_written")?; + + let coordinator = GraphCoordinator::init(root, catalog, Arc::clone(storage)).await?; + crate::failpoints::maybe_fail("init.after_coordinator_init")?; + + Ok(coordinator) +} + +/// Best-effort cleanup of init-phase artifacts. Called from +/// `init_with_storage` on any error returned by `init_storage_phase`. +/// +/// Removes the three schema files: `_schema.pg`, `_schema.ir.json`, +/// `__schema_state.json`. Lance datasets and `__manifest/` are not +/// touched here — recursive directory deletion requires a +/// `StorageAdapter::delete_prefix` primitive that's deferred along +/// with `DELETE /graphs/{id}` (MR-668 PR 2b). +/// +/// Failures to delete are logged via `tracing::warn` and do not mask +/// the original init error. +async fn best_effort_cleanup_init_artifacts(root: &str, storage: &dyn StorageAdapter) { + for uri in [ + schema_source_uri(root), + schema_ir_uri(root), + schema_state_uri(root), + ] { + if let Err(err) = storage.delete(&uri).await { + tracing::warn!( + target: "omnigraph::init::cleanup", + uri = %uri, + error = %err, + "init failed; best-effort cleanup could not delete artifact", + ); + } + } +} + fn schema_table_key(type_kind: SchemaTypeKind, name: &str) -> String { match type_kind { SchemaTypeKind::Node => format!("node:{}", name), @@ -1686,7 +1857,7 @@ mod tests { use crate::db::manifest::ManifestCoordinator; use async_trait::async_trait; use serde_json::Value; - use std::sync::Mutex; + use std::sync::{Arc, Mutex}; use crate::storage::{LocalStorageAdapter, StorageAdapter, join_uri}; @@ -1740,6 +1911,11 @@ edge WorksAt: Person -> Company self.inner.write_text(uri, contents).await } + async fn write_text_if_absent(&self, uri: &str, contents: &str) -> Result { + self.writes.lock().unwrap().push(uri.to_string()); + self.inner.write_text_if_absent(uri, contents).await + } + async fn exists(&self, uri: &str) -> Result { self.exists_checks.lock().unwrap().push(uri.to_string()); self.inner.exists(uri).await @@ -1763,13 +1939,96 @@ edge WorksAt: Person -> Company } } + #[derive(Debug)] + struct InitRaceStorageAdapter { + inner: LocalStorageAdapter, + root: String, + barrier: Arc, + } + + #[async_trait] + impl StorageAdapter for InitRaceStorageAdapter { + async fn read_text(&self, uri: &str) -> Result { + self.inner.read_text(uri).await + } + + async fn write_text(&self, uri: &str, contents: &str) -> Result<()> { + self.inner.write_text(uri, contents).await + } + + async fn write_text_if_absent(&self, uri: &str, contents: &str) -> Result { + self.inner.write_text_if_absent(uri, contents).await + } + + async fn exists(&self, uri: &str) -> Result { + let exists = self.inner.exists(uri).await?; + if uri == schema_state_uri(&self.root) { + self.barrier.wait().await; + } + Ok(exists) + } + + async fn rename_text(&self, from_uri: &str, to_uri: &str) -> Result<()> { + self.inner.rename_text(from_uri, to_uri).await + } + + async fn delete(&self, uri: &str) -> Result<()> { + self.inner.delete(uri).await + } + + async fn list_dir(&self, dir_uri: &str) -> Result> { + self.inner.list_dir(dir_uri).await + } + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn concurrent_strict_init_does_not_delete_winning_schema_files() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let root = normalize_root_uri(&uri).unwrap(); + let storage: Arc = Arc::new(InitRaceStorageAdapter { + inner: LocalStorageAdapter, + root, + barrier: Arc::new(tokio::sync::Barrier::new(2)), + }); + + let left = Omnigraph::init_with_storage( + &uri, + TEST_SCHEMA, + Arc::clone(&storage), + InitOptions::default(), + ); + let right = Omnigraph::init_with_storage( + &uri, + TEST_SCHEMA, + Arc::clone(&storage), + InitOptions::default(), + ); + let (left, right) = tokio::join!(left, right); + let ok_count = usize::from(left.is_ok()) + usize::from(right.is_ok()); + assert_eq!(ok_count, 1, "exactly one concurrent init should win"); + + assert!( + dir.path().join("_schema.pg").exists(), + "winning init must leave _schema.pg in place" + ); + assert!( + dir.path().join("_schema.ir.json").exists(), + "winning init must leave _schema.ir.json in place" + ); + assert!( + dir.path().join("__schema_state.json").exists(), + "winning init must leave __schema_state.json in place" + ); + } + #[tokio::test] async fn test_init_and_open_route_graph_metadata_through_storage_adapter() { let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); let adapter = Arc::new(RecordingStorageAdapter::default()); - Omnigraph::init_with_storage(uri, TEST_SCHEMA, adapter.clone()) + Omnigraph::init_with_storage(uri, TEST_SCHEMA, adapter.clone(), InitOptions::default()) .await .unwrap(); assert!(adapter.writes().contains(&join_uri(uri, "_schema.pg"))); diff --git a/crates/omnigraph/src/db/omnigraph/export.rs b/crates/omnigraph/src/db/omnigraph/export.rs index 3fcd4f4..366f50a 100644 --- a/crates/omnigraph/src/db/omnigraph/export.rs +++ b/crates/omnigraph/src/db/omnigraph/export.rs @@ -16,7 +16,12 @@ pub(super) async fn entity_at( id: &str, version: u64, ) -> Result> { - let snap = db.coordinator.read().await.snapshot_at_version(version).await?; + let snap = db + .coordinator + .read() + .await + .snapshot_at_version(version) + .await?; entity_from_snapshot(db, &snap, table_key, id).await } diff --git a/crates/omnigraph/src/db/omnigraph/table_ops.rs b/crates/omnigraph/src/db/omnigraph/table_ops.rs index 717f263..0e89c45 100644 --- a/crates/omnigraph/src/db/omnigraph/table_ops.rs +++ b/crates/omnigraph/src/db/omnigraph/table_ops.rs @@ -22,7 +22,12 @@ pub(super) async fn graph_index_for_resolved( } pub(super) async fn ensure_indices(db: &Omnigraph) -> Result<()> { - let current_branch = db.coordinator.read().await.current_branch().map(str::to_string); + let current_branch = db + .coordinator + .read() + .await + .current_branch() + .map(str::to_string); ensure_indices_for_branch(db, current_branch.as_deref()).await } @@ -68,10 +73,7 @@ pub(super) async fn failpoint_publish_table_head_without_index_rebuild_for_test( .await } -pub(super) async fn ensure_indices_for_branch( - db: &Omnigraph, - branch: Option<&str>, -) -> Result<()> { +pub(super) async fn ensure_indices_for_branch(db: &Omnigraph, branch: Option<&str>) -> Result<()> { db.ensure_schema_state_valid().await?; db.ensure_schema_apply_idle("ensure_indices").await?; let resolved = db.resolved_branch_target(branch).await?; @@ -403,7 +405,12 @@ pub(super) async fn open_for_mutation( table_key: &str, op_kind: crate::db::MutationOpKind, ) -> Result<(Dataset, String, Option)> { - let current_branch = db.coordinator.read().await.current_branch().map(str::to_string); + let current_branch = db + .coordinator + .read() + .await + .current_branch() + .map(str::to_string); open_for_mutation_on_branch(db, current_branch.as_deref(), table_key, op_kind).await } @@ -807,7 +814,12 @@ pub(super) async fn commit_prepared_updates_on_branch( updates: &[crate::db::SubTableUpdate], actor_id: Option<&str>, ) -> Result { - let current_branch = db.coordinator.read().await.current_branch().map(str::to_string); + let current_branch = db + .coordinator + .read() + .await + .current_branch() + .map(str::to_string); let requested_branch = branch.map(str::to_string); if requested_branch == current_branch { return commit_prepared_updates(db, updates, actor_id).await; @@ -835,7 +847,12 @@ pub(super) async fn commit_prepared_updates_on_branch_with_expected( expected_table_versions: &std::collections::HashMap, actor_id: Option<&str>, ) -> Result { - let current_branch = db.coordinator.read().await.current_branch().map(str::to_string); + let current_branch = db + .coordinator + .read() + .await + .current_branch() + .map(str::to_string); let requested_branch = branch.map(str::to_string); if requested_branch == current_branch { return commit_prepared_updates_with_expected( @@ -870,7 +887,12 @@ pub(super) async fn commit_updates( updates: &[crate::db::SubTableUpdate], ) -> Result { db.ensure_schema_apply_not_locked("write commit").await?; - let current_branch = db.coordinator.read().await.current_branch().map(str::to_string); + let current_branch = db + .coordinator + .read() + .await + .current_branch() + .map(str::to_string); let prepared = prepare_updates_for_commit(db, current_branch.as_deref(), updates).await?; commit_prepared_updates(db, &prepared, None).await } @@ -879,7 +901,11 @@ pub(super) async fn commit_manifest_updates( db: &Omnigraph, updates: &[crate::db::SubTableUpdate], ) -> Result { - db.coordinator.write().await.commit_manifest_updates(updates).await + db.coordinator + .write() + .await + .commit_manifest_updates(updates) + .await } pub(super) async fn record_merge_commit( @@ -889,7 +915,9 @@ pub(super) async fn record_merge_commit( merged_parent_commit_id: &str, actor_id: Option<&str>, ) -> Result { - db.coordinator.write().await + db.coordinator + .write() + .await .record_merge_commit( manifest_version, parent_commit_id, @@ -923,7 +951,11 @@ pub(super) async fn commit_updates_on_branch_with_expected( } pub(super) async fn ensure_commit_graph_initialized(db: &Omnigraph) -> Result<()> { - db.coordinator.write().await.ensure_commit_graph_initialized().await + db.coordinator + .write() + .await + .ensure_commit_graph_initialized() + .await } pub(super) async fn invalidate_graph_index(db: &Omnigraph) { diff --git a/crates/omnigraph/src/db/write_queue.rs b/crates/omnigraph/src/db/write_queue.rs index bb03022..1f0c53a 100644 --- a/crates/omnigraph/src/db/write_queue.rs +++ b/crates/omnigraph/src/db/write_queue.rs @@ -91,10 +91,7 @@ impl WriteQueueManager { /// Empty input returns an empty Vec without touching the map. /// Duplicates in `keys` are deduped before acquisition (the same /// key acquired twice would deadlock against itself). - pub(crate) async fn acquire_many( - &self, - keys: &[TableQueueKey], - ) -> Vec> { + pub(crate) async fn acquire_many(&self, keys: &[TableQueueKey]) -> Vec> { if keys.is_empty() { return Vec::new(); } @@ -167,7 +164,10 @@ mod tests { qm2.acquire_many(&[z_clone, a_clone]).await }) .await; - assert!(result.is_err(), "acquire_many should block on `a`, the lex-first key"); + assert!( + result.is_err(), + "acquire_many should block on `a`, the lex-first key" + ); } #[tokio::test] @@ -180,9 +180,10 @@ mod tests { // Second acquire on same key should NOT complete within 200ms. let qm2 = Arc::clone(&qm); let k2 = k.clone(); - let blocked = timeout(Duration::from_millis(200), async move { - qm2.acquire(&k2).await - }) + let blocked = timeout( + Duration::from_millis(200), + async move { qm2.acquire(&k2).await }, + ) .await; assert!(blocked.is_err(), "second acquire on same key must block"); diff --git a/crates/omnigraph/src/error.rs b/crates/omnigraph/src/error.rs index 5d27fcb..11f4da0 100644 --- a/crates/omnigraph/src/error.rs +++ b/crates/omnigraph/src/error.rs @@ -92,6 +92,14 @@ pub enum OmniError { /// callers can match on this variant directly. #[error("policy: {0}")] Policy(String), + /// `Omnigraph::init` was called against a URI that already holds + /// schema artifacts from a previous init. Strict mode (the default) + /// fails fast with this error before touching disk so an existing + /// graph's metadata cannot be overwritten or destroyed. Operators + /// who actually want to overwrite pass `InitOptions { force: true }` + /// (CLI: `omnigraph init --force`). + #[error("graph already initialized at '{uri}'; pass --force to overwrite")] + AlreadyInitialized { uri: String }, } impl OmniError { diff --git a/crates/omnigraph/src/exec/mutation.rs b/crates/omnigraph/src/exec/mutation.rs index a5fc6c7..02b2a21 100644 --- a/crates/omnigraph/src/exec/mutation.rs +++ b/crates/omnigraph/src/exec/mutation.rs @@ -794,11 +794,8 @@ impl Omnigraph { // post_commit_pin) and tidies up. Failing the user // here would return an error for a write that // already landed. - if let Err(err) = crate::db::manifest::delete_sidecar( - &handle, - self.storage_adapter(), - ) - .await + if let Err(err) = + crate::db::manifest::delete_sidecar(&handle, self.storage_adapter()).await { tracing::warn!( error = %err, @@ -852,15 +849,8 @@ impl Omnigraph { assignments, predicate, } => { - self.execute_update( - type_name, - assignments, - predicate, - params, - branch, - staging, - ) - .await? + self.execute_update(type_name, assignments, predicate, params, branch, staging) + .await? } MutationOpIR::Delete { type_name, @@ -981,14 +971,8 @@ impl Omnigraph { // + iterate pending edges in-memory for the `src` column, // group-by-src. The pending side already includes the row // we just appended (above). - validate_edge_cardinality_with_pending( - self, - &ds, - staging, - &table_key, - edge_type, - ) - .await?; + validate_edge_cardinality_with_pending(self, &ds, staging, &table_key, edge_type) + .await?; self.invalidate_graph_index().await; @@ -1379,14 +1363,8 @@ async fn validate_edge_cardinality_with_pending( if edge_type.cardinality.is_default() { return Ok(()); } - let counts = super::staging::count_src_per_edge( - db, - committed_ds, - table_key, - staging, - None, - ) - .await?; + let counts = + super::staging::count_src_per_edge(db, committed_ds, table_key, staging, None).await?; super::staging::enforce_cardinality_bounds(edge_type, &counts) } diff --git a/crates/omnigraph/src/exec/projection.rs b/crates/omnigraph/src/exec/projection.rs index bcfae66..dec13a8 100644 --- a/crates/omnigraph/src/exec/projection.rs +++ b/crates/omnigraph/src/exec/projection.rs @@ -345,10 +345,7 @@ fn evaluate_projection( IRExpr::PropAccess { variable, property } => { let col_name = format!("{}.{}", variable, property); let col = wide_batch.column_by_name(&col_name).ok_or_else(|| { - OmniError::manifest(format!( - "column '{}' not found in wide batch", - col_name - )) + OmniError::manifest(format!("column '{}' not found in wide batch", col_name)) })?; Ok((col_name, col.clone())) } @@ -516,12 +513,10 @@ fn aggregate_return( } let num_groups = group_indices.len(); - let mut result_columns: Vec<(usize, String, ArrayRef)> = - Vec::with_capacity(projections.len()); + let mut result_columns: Vec<(usize, String, ArrayRef)> = Vec::with_capacity(projections.len()); for gk in &group_keys { - let first_row_indices: Vec = - group_indices.iter().map(|rows| rows[0] as u32).collect(); + let first_row_indices: Vec = group_indices.iter().map(|rows| rows[0] as u32).collect(); let take_idx = UInt32Array::from(first_row_indices); let col = arrow_select::take::take(gk.column.as_ref(), &take_idx, None) .map_err(|e| OmniError::Lance(e.to_string()))?; @@ -584,11 +579,19 @@ fn compute_aggregate( } } -fn compute_sum(arg: &ArrayRef, group_indices: &[Vec], num_groups: usize) -> Result { +fn compute_sum( + arg: &ArrayRef, + group_indices: &[Vec], + num_groups: usize, +) -> Result { macro_rules! sum_numeric { ($arr_type:ty, $arg:expr, $dt:expr) => {{ let arr = $arg.as_any().downcast_ref::<$arr_type>().ok_or_else(|| { - OmniError::manifest(format!("sum: expected {:?}, got {:?}", $dt, $arg.data_type())) + OmniError::manifest(format!( + "sum: expected {:?}, got {:?}", + $dt, + $arg.data_type() + )) })?; let mut builder = Float64Builder::with_capacity(num_groups); for group in group_indices { @@ -613,24 +616,42 @@ fn compute_sum(arg: &ArrayRef, group_indices: &[Vec], num_groups: usize) dt @ DataType::UInt64 => sum_numeric!(UInt64Array, arg, dt), dt @ DataType::Float32 => sum_numeric!(Float32Array, arg, dt), dt @ DataType::Float64 => sum_numeric!(Float64Array, arg, dt), - dt => Err(OmniError::manifest(format!("sum: unsupported type {:?}", dt))), + dt => Err(OmniError::manifest(format!( + "sum: unsupported type {:?}", + dt + ))), } } -fn compute_avg(arg: &ArrayRef, group_indices: &[Vec], num_groups: usize) -> Result { +fn compute_avg( + arg: &ArrayRef, + group_indices: &[Vec], + num_groups: usize, +) -> Result { macro_rules! avg_typed { ($arr_type:ty, $arg:expr) => {{ let arr = $arg.as_any().downcast_ref::<$arr_type>().ok_or_else(|| { - OmniError::manifest(format!("avg: expected {:?}, got {:?}", stringify!($arr_type), $arg.data_type())) + OmniError::manifest(format!( + "avg: expected {:?}, got {:?}", + stringify!($arr_type), + $arg.data_type() + )) })?; let mut builder = Float64Builder::with_capacity(num_groups); for group in group_indices { let mut sum = 0.0f64; let mut count = 0usize; for &i in group { - if !arr.is_null(i) { sum += arr.value(i) as f64; count += 1; } + if !arr.is_null(i) { + sum += arr.value(i) as f64; + count += 1; + } + } + if count > 0 { + builder.append_value(sum / count as f64); + } else { + builder.append_null(); } - if count > 0 { builder.append_value(sum / count as f64); } else { builder.append_null(); } } Ok(Arc::new(builder.finish()) as ArrayRef) }}; @@ -642,15 +663,27 @@ fn compute_avg(arg: &ArrayRef, group_indices: &[Vec], num_groups: usize) DataType::UInt64 => avg_typed!(UInt64Array, arg), DataType::Float32 => avg_typed!(Float32Array, arg), DataType::Float64 => avg_typed!(Float64Array, arg), - dt => Err(OmniError::manifest(format!("avg: unsupported type {:?}", dt))), + dt => Err(OmniError::manifest(format!( + "avg: unsupported type {:?}", + dt + ))), } } -fn compute_min_max(arg: &ArrayRef, group_indices: &[Vec], num_groups: usize, is_min: bool) -> Result { +fn compute_min_max( + arg: &ArrayRef, + group_indices: &[Vec], + num_groups: usize, + is_min: bool, +) -> Result { macro_rules! minmax_typed { ($arr_type:ty, $builder_type:ty, $arg:expr, $is_min:expr) => {{ let arr = $arg.as_any().downcast_ref::<$arr_type>().ok_or_else(|| { - OmniError::manifest(format!("min/max: expected {:?}, got {:?}", stringify!($arr_type), $arg.data_type())) + OmniError::manifest(format!( + "min/max: expected {:?}, got {:?}", + stringify!($arr_type), + $arg.data_type() + )) })?; let mut builder = <$builder_type>::with_capacity(num_groups); for group in group_indices { @@ -660,11 +693,20 @@ fn compute_min_max(arg: &ArrayRef, group_indices: &[Vec], num_groups: usi let v = arr.value(i); result = Some(match result { None => v, - Some(cur) => if $is_min { if v < cur { v } else { cur } } else { if v > cur { v } else { cur } }, + Some(cur) => { + if $is_min { + if v < cur { v } else { cur } + } else { + if v > cur { v } else { cur } + } + } }); } } - match result { Some(v) => builder.append_value(v), None => builder.append_null() } + match result { + Some(v) => builder.append_value(v), + None => builder.append_null(), + } } Ok(Arc::new(builder.finish()) as ArrayRef) }}; @@ -688,15 +730,27 @@ fn compute_min_max(arg: &ArrayRef, group_indices: &[Vec], num_groups: usi let v = arr.value(i); result = Some(match result { None => v, - Some(cur) => if is_min { if v < cur { v } else { cur } } else { if v > cur { v } else { cur } }, + Some(cur) => { + if is_min { + if v < cur { v } else { cur } + } else { + if v > cur { v } else { cur } + } + } }); } } - match result { Some(v) => builder.append_value(v), None => builder.append_null() } + match result { + Some(v) => builder.append_value(v), + None => builder.append_null(), + } } Ok(Arc::new(builder.finish()) as ArrayRef) } - dt => Err(OmniError::manifest(format!("min/max: unsupported type {:?}", dt))), + dt => Err(OmniError::manifest(format!( + "min/max: unsupported type {:?}", + dt + ))), } } @@ -715,7 +769,8 @@ fn build_empty_aggregate_result(projections: &[IRProjection]) -> Result { fields.push(Field::new(name, DataType::Float64, true)); - columns.push(Arc::new(Float64Array::from(vec![None as Option])) as ArrayRef); + columns + .push(Arc::new(Float64Array::from(vec![None as Option])) as ArrayRef); } }, _ => { diff --git a/crates/omnigraph/src/exec/query.rs b/crates/omnigraph/src/exec/query.rs index 24a8722..7590512 100644 --- a/crates/omnigraph/src/exec/query.rs +++ b/crates/omnigraph/src/exec/query.rs @@ -75,14 +75,7 @@ impl Omnigraph { None }; - execute_query( - &ir, - params, - &snapshot, - graph_index.as_deref(), - &catalog, - ) - .await + execute_query(&ir, params, &snapshot, graph_index.as_deref(), &catalog).await } } @@ -360,11 +353,23 @@ pub async fn execute_query( } let mut wide: Option = None; - execute_pipeline(&ir.pipeline, params, snapshot, graph_index, catalog, &mut wide, &search_mode).await?; + execute_pipeline( + &ir.pipeline, + params, + snapshot, + graph_index, + catalog, + &mut wide, + &search_mode, + ) + .await?; let wide_batch = wide.unwrap_or_else(|| RecordBatch::new_empty(Arc::new(Schema::empty()))); // Project return expressions - let has_aggregates = ir.return_exprs.iter().any(|p| matches!(&p.expr, IRExpr::Aggregate { .. })); + let has_aggregates = ir + .return_exprs + .iter() + .any(|p| matches!(&p.expr, IRExpr::Aggregate { .. })); let mut result_batch = project_return(&wide_batch, &ir.return_exprs, params)?; // Apply ordering (skip if search mode already ordered the results) @@ -516,9 +521,9 @@ async fn execute_rrf_query( } fn extract_id_column_by_name(batch: &RecordBatch, col_name: &str) -> Result> { - let col = batch - .column_by_name(col_name) - .ok_or_else(|| OmniError::manifest(format!("batch missing '{}' column for RRF", col_name)))?; + let col = batch.column_by_name(col_name).ok_or_else(|| { + OmniError::manifest(format!("batch missing '{}' column for RRF", col_name)) + })?; let ids = col .as_any() .downcast_ref::() @@ -653,8 +658,19 @@ fn execute_pipeline<'a>( })?; if let Some(batch) = wide.as_mut() { execute_expand( - batch, gi, snapshot, catalog, src_var, dst_var, edge_type, *direction, - dst_type, *min_hops, *max_hops, dst_filters, params, + batch, + gi, + snapshot, + catalog, + src_var, + dst_var, + edge_type, + *direction, + dst_type, + *min_hops, + *max_hops, + dst_filters, + params, ) .await?; } @@ -691,7 +707,9 @@ async fn execute_expand( let src_id_col_name = format!("{}.id", src_var); let src_ids = wide .column_by_name(&src_id_col_name) - .ok_or_else(|| OmniError::manifest(format!("wide batch missing '{}' column", src_id_col_name)))? + .ok_or_else(|| { + OmniError::manifest(format!("wide batch missing '{}' column", src_id_col_name)) + })? .as_any() .downcast_ref::() .ok_or_else(|| OmniError::manifest(format!("'{}' column is not Utf8", src_id_col_name)))? @@ -1421,22 +1439,39 @@ fn literal_to_expr(lit: &Literal) -> Option { } fn prefix_batch(batch: &RecordBatch, variable: &str) -> Result { - let fields: Vec = batch.schema().fields().iter().map(|f| { - Field::new(format!("{}.{}", variable, f.name()), f.data_type().clone(), f.is_nullable()) - }).collect(); + let fields: Vec = batch + .schema() + .fields() + .iter() + .map(|f| { + Field::new( + format!("{}.{}", variable, f.name()), + f.data_type().clone(), + f.is_nullable(), + ) + }) + .collect(); let schema = Arc::new(Schema::new(fields)); - RecordBatch::try_new(schema, batch.columns().to_vec()).map_err(|e| OmniError::Lance(e.to_string())) + RecordBatch::try_new(schema, batch.columns().to_vec()) + .map_err(|e| OmniError::Lance(e.to_string())) } fn cross_join_batches(left: &RecordBatch, right: &RecordBatch) -> Result { let n = left.num_rows(); let m = right.num_rows(); if n == 0 || m == 0 { - let mut fields: Vec = left.schema().fields().iter().map(|f| f.as_ref().clone()).collect(); + let mut fields: Vec = left + .schema() + .fields() + .iter() + .map(|f| f.as_ref().clone()) + .collect(); fields.extend(right.schema().fields().iter().map(|f| f.as_ref().clone())); return Ok(RecordBatch::new_empty(Arc::new(Schema::new(fields)))); } - let left_indices: Vec = (0..n as u32).flat_map(|i| std::iter::repeat(i).take(m)).collect(); + let left_indices: Vec = (0..n as u32) + .flat_map(|i| std::iter::repeat(i).take(m)) + .collect(); let right_indices: Vec = (0..n).flat_map(|_| 0..m as u32).collect(); let left_expanded = take_batch(left, &UInt32Array::from(left_indices))?; let right_expanded = take_batch(right, &UInt32Array::from(right_indices))?; @@ -1444,23 +1479,39 @@ fn cross_join_batches(left: &RecordBatch, right: &RecordBatch) -> Result Result { - let mut fields: Vec = left.schema().fields().iter().map(|f| f.as_ref().clone()).collect(); + let mut fields: Vec = left + .schema() + .fields() + .iter() + .map(|f| f.as_ref().clone()) + .collect(); if cfg!(debug_assertions) { let left_schema = left.schema(); - let left_names: HashSet<&str> = left_schema.fields().iter().map(|f| f.name().as_str()).collect(); + let left_names: HashSet<&str> = left_schema + .fields() + .iter() + .map(|f| f.name().as_str()) + .collect(); let right_schema = right.schema(); for f in right_schema.fields() { - debug_assert!(!left_names.contains(f.name().as_str()), "hconcat_batches: duplicate column '{}'", f.name()); + debug_assert!( + !left_names.contains(f.name().as_str()), + "hconcat_batches: duplicate column '{}'", + f.name() + ); } } fields.extend(right.schema().fields().iter().map(|f| f.as_ref().clone())); let mut columns: Vec = left.columns().to_vec(); columns.extend(right.columns().to_vec()); - RecordBatch::try_new(Arc::new(Schema::new(fields)), columns).map_err(|e| OmniError::Lance(e.to_string())) + RecordBatch::try_new(Arc::new(Schema::new(fields)), columns) + .map_err(|e| OmniError::Lance(e.to_string())) } fn take_batch(batch: &RecordBatch, indices: &UInt32Array) -> Result { - let columns: Vec = batch.columns().iter() + let columns: Vec = batch + .columns() + .iter() .map(|col| arrow_select::take::take(col.as_ref(), indices, None)) .collect::, _>>() .map_err(|e| OmniError::Lance(e.to_string()))?; diff --git a/crates/omnigraph/src/loader/mod.rs b/crates/omnigraph/src/loader/mod.rs index 878dcfe..cade1f4 100644 --- a/crates/omnigraph/src/loader/mod.rs +++ b/crates/omnigraph/src/loader/mod.rs @@ -212,12 +212,7 @@ impl Omnigraph { .await } - pub async fn load_file( - &self, - branch: &str, - path: &str, - mode: LoadMode, - ) -> Result { + pub async fn load_file(&self, branch: &str, path: &str, mode: LoadMode) -> Result { self.load_file_as(branch, path, mode, None).await } @@ -457,13 +452,7 @@ async fn load_jsonl_reader( for (edge_name, rows) in &edge_rows { let edge_type = &catalog.edge_types[edge_name]; let from_ids = if use_staging { - collect_node_ids_with_pending( - db, - branch, - &edge_type.from_type, - &staging, - ) - .await? + collect_node_ids_with_pending(db, branch, &edge_type.from_type, &staging).await? } else { collect_node_ids( db, @@ -476,13 +465,7 @@ async fn load_jsonl_reader( .await? }; let to_ids = if use_staging { - collect_node_ids_with_pending( - db, - branch, - &edge_type.to_type, - &staging, - ) - .await? + collect_node_ids_with_pending(db, branch, &edge_type.to_type, &staging).await? } else { collect_node_ids( db, @@ -581,12 +564,7 @@ async fn load_jsonl_reader( let table_key = format!("edge:{}", edge_name); if use_staging { validate_edge_cardinality_with_pending_loader( - db, - branch, - edge_type, - &table_key, - &staging, - mode, + db, branch, edge_type, &table_key, &staging, mode, ) .await?; } else if let Some(update) = overwrite_updates.iter().find(|u| u.table_key == table_key) { @@ -1699,8 +1677,7 @@ async fn validate_edge_cardinality_with_pending_loader( LoadMode::Append | LoadMode::Overwrite => None, }; let counts = - crate::exec::staging::count_src_per_edge(db, &ds, table_key, staging, dedupe_key) - .await?; + crate::exec::staging::count_src_per_edge(db, &ds, table_key, staging, dedupe_key).await?; crate::exec::staging::enforce_cardinality_bounds(edge_type, &counts) } diff --git a/crates/omnigraph/src/storage.rs b/crates/omnigraph/src/storage.rs index e90c693..564b577 100644 --- a/crates/omnigraph/src/storage.rs +++ b/crates/omnigraph/src/storage.rs @@ -7,7 +7,8 @@ use async_trait::async_trait; use futures::TryStreamExt; use object_store::aws::AmazonS3Builder; use object_store::path::Path as ObjectPath; -use object_store::{DynObjectStore, ObjectStore, PutPayload}; +use object_store::{DynObjectStore, ObjectStore, PutMode, PutPayload}; +use tokio::io::AsyncWriteExt; use url::Url; use crate::error::{OmniError, Result}; @@ -19,6 +20,13 @@ const S3_SCHEME_PREFIX: &str = "s3://"; pub trait StorageAdapter: Debug + Send + Sync { async fn read_text(&self, uri: &str) -> Result; async fn write_text(&self, uri: &str, contents: &str) -> Result<()>; + /// Write a text object only if no object exists at `uri`. + /// + /// Returns `Ok(true)` when this call created the object, `Ok(false)` + /// when the object already existed, and propagates every other storage + /// error. Callers use this to establish ownership before running + /// best-effort cleanup on partial failure. + async fn write_text_if_absent(&self, uri: &str, contents: &str) -> Result; async fn exists(&self, uri: &str) -> Result; /// Move a file from `from_uri` to `to_uri`, replacing any existing file at /// `to_uri`. Atomic on local POSIX; on S3 implemented as copy + delete @@ -77,6 +85,30 @@ impl StorageAdapter for LocalStorageAdapter { Ok(()) } + async fn write_text_if_absent(&self, uri: &str, contents: &str) -> Result { + let path = local_path_from_uri(uri)?; + if let Some(parent) = path.parent() { + if !parent.as_os_str().is_empty() { + tokio::fs::create_dir_all(parent).await?; + } + } + let mut file = match tokio::fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(&path) + .await + { + Ok(file) => file, + Err(err) if err.kind() == std::io::ErrorKind::AlreadyExists => return Ok(false), + Err(err) => return Err(err.into()), + }; + if let Err(err) = file.write_all(contents.as_bytes()).await { + let _ = tokio::fs::remove_file(&path).await; + return Err(err.into()); + } + Ok(true) + } + async fn exists(&self, uri: &str) -> Result { Ok(local_path_from_uri(uri)?.exists()) } @@ -146,6 +178,24 @@ impl StorageAdapter for S3StorageAdapter { Ok(()) } + async fn write_text_if_absent(&self, uri: &str, contents: &str) -> Result { + let location = self.object_path(uri)?; + match self + .store + .put_opts( + &location, + PutPayload::from(contents.as_bytes().to_vec()), + PutMode::Create.into(), + ) + .await + { + Ok(_) => Ok(true), + Err(object_store::Error::AlreadyExists { .. }) + | Err(object_store::Error::Precondition { .. }) => Ok(false), + Err(err) => Err(storage_backend_error("write_if_absent", uri, err)), + } + } + async fn exists(&self, uri: &str) -> Result { let location = self.object_path(uri)?; match self.store.head(&location).await { @@ -447,4 +497,16 @@ mod tests { assert_eq!(location.bucket, "bucket"); assert_eq!(location.key, "graph/_schema.pg"); } + + #[tokio::test] + async fn local_write_text_if_absent_creates_once_without_overwrite() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().join("claim.txt"); + let uri = uri.to_str().unwrap(); + let storage = LocalStorageAdapter; + + assert!(storage.write_text_if_absent(uri, "first").await.unwrap()); + assert!(!storage.write_text_if_absent(uri, "second").await.unwrap()); + assert_eq!(storage.read_text(uri).await.unwrap(), "first"); + } } diff --git a/crates/omnigraph/src/storage_layer.rs b/crates/omnigraph/src/storage_layer.rs index b0fc042..dac9482 100644 --- a/crates/omnigraph/src/storage_layer.rs +++ b/crates/omnigraph/src/storage_layer.rs @@ -94,7 +94,9 @@ impl SnapshotHandle { /// Construct from a Lance dataset. `pub(crate)` — only /// `TableStore` should produce these. pub(crate) fn new(ds: Dataset) -> Self { - Self { inner: Arc::new(ds) } + Self { + inner: Arc::new(ds), + } } /// Borrow the underlying Lance dataset. `pub(crate)` so only the @@ -242,16 +244,10 @@ pub trait TableStorage: sealed::Sealed + Send + Sync + Debug { async fn scan_batches(&self, snapshot: &SnapshotHandle) -> Result>; - async fn scan_batches_for_rewrite( - &self, - snapshot: &SnapshotHandle, - ) -> Result>; + async fn scan_batches_for_rewrite(&self, snapshot: &SnapshotHandle) + -> Result>; - async fn count_rows( - &self, - snapshot: &SnapshotHandle, - filter: Option, - ) -> Result; + async fn count_rows(&self, snapshot: &SnapshotHandle, filter: Option) -> Result; async fn count_rows_with_staged( &self, @@ -284,11 +280,8 @@ pub trait TableStorage: sealed::Sealed + Send + Sync + Debug { filter: &str, ) -> Result>; - async fn table_state( - &self, - dataset_uri: &str, - snapshot: &SnapshotHandle, - ) -> Result; + async fn table_state(&self, dataset_uri: &str, snapshot: &SnapshotHandle) + -> Result; // ── Staged writes (no HEAD advance) ──────────────────────────────── @@ -565,11 +558,7 @@ impl TableStorage for TableStore { TableStore::scan_batches_for_rewrite(self, snapshot.dataset()).await } - async fn count_rows( - &self, - snapshot: &SnapshotHandle, - filter: Option, - ) -> Result { + async fn count_rows(&self, snapshot: &SnapshotHandle, filter: Option) -> Result { TableStore::count_rows(self, snapshot.dataset(), filter).await } @@ -591,14 +580,8 @@ impl TableStorage for TableStore { filter: Option<&str>, ) -> Result> { let staged_writes = staged_handles_as_writes(staged); - TableStore::scan_with_staged( - self, - snapshot.dataset(), - &staged_writes, - projection, - filter, - ) - .await + TableStore::scan_with_staged(self, snapshot.dataset(), &staged_writes, projection, filter) + .await } async fn scan_with_pending( @@ -658,18 +641,10 @@ impl TableStorage for TableStore { when_matched: WhenMatched, when_not_matched: WhenNotMatched, ) -> Result { - let ds = Arc::try_unwrap(snapshot.into_arc()) - .unwrap_or_else(|arc| (*arc).clone()); - TableStore::stage_merge_insert( - self, - ds, - batch, - key_columns, - when_matched, - when_not_matched, - ) - .await - .map(StagedHandle::new) + let ds = Arc::try_unwrap(snapshot.into_arc()).unwrap_or_else(|arc| (*arc).clone()); + TableStore::stage_merge_insert(self, ds, batch, key_columns, when_matched, when_not_matched) + .await + .map(StagedHandle::new) } async fn commit_staged( @@ -720,8 +695,7 @@ impl TableStorage for TableStore { snapshot: SnapshotHandle, batch: RecordBatch, ) -> Result<(SnapshotHandle, TableState)> { - let mut ds = Arc::try_unwrap(snapshot.into_arc()) - .unwrap_or_else(|arc| (*arc).clone()); + let mut ds = Arc::try_unwrap(snapshot.into_arc()).unwrap_or_else(|arc| (*arc).clone()); let state = TableStore::append_batch(self, dataset_uri, &mut ds, batch).await?; Ok((SnapshotHandle::new(ds), state)) } @@ -735,8 +709,7 @@ impl TableStorage for TableStore { when_matched: WhenMatched, when_not_matched: WhenNotMatched, ) -> Result { - let ds = Arc::try_unwrap(snapshot.into_arc()) - .unwrap_or_else(|arc| (*arc).clone()); + let ds = Arc::try_unwrap(snapshot.into_arc()).unwrap_or_else(|arc| (*arc).clone()); TableStore::merge_insert_batches( self, dataset_uri, @@ -755,8 +728,7 @@ impl TableStorage for TableStore { snapshot: SnapshotHandle, batch: RecordBatch, ) -> Result<(SnapshotHandle, TableState)> { - let mut ds = Arc::try_unwrap(snapshot.into_arc()) - .unwrap_or_else(|arc| (*arc).clone()); + let mut ds = Arc::try_unwrap(snapshot.into_arc()).unwrap_or_else(|arc| (*arc).clone()); let state = TableStore::overwrite_batch(self, dataset_uri, &mut ds, batch).await?; Ok((SnapshotHandle::new(ds), state)) } @@ -767,8 +739,7 @@ impl TableStorage for TableStore { snapshot: SnapshotHandle, filter: &str, ) -> Result<(SnapshotHandle, DeleteState)> { - let mut ds = Arc::try_unwrap(snapshot.into_arc()) - .unwrap_or_else(|arc| (*arc).clone()); + let mut ds = Arc::try_unwrap(snapshot.into_arc()).unwrap_or_else(|arc| (*arc).clone()); let state = TableStore::delete_where(self, dataset_uri, &mut ds, filter).await?; Ok((SnapshotHandle::new(ds), state)) } @@ -790,8 +761,7 @@ impl TableStorage for TableStore { snapshot: SnapshotHandle, columns: &[&str], ) -> Result { - let mut ds = Arc::try_unwrap(snapshot.into_arc()) - .unwrap_or_else(|arc| (*arc).clone()); + let mut ds = Arc::try_unwrap(snapshot.into_arc()).unwrap_or_else(|arc| (*arc).clone()); TableStore::create_btree_index(self, &mut ds, columns).await?; Ok(SnapshotHandle::new(ds)) } @@ -801,8 +771,7 @@ impl TableStorage for TableStore { snapshot: SnapshotHandle, column: &str, ) -> Result { - let mut ds = Arc::try_unwrap(snapshot.into_arc()) - .unwrap_or_else(|arc| (*arc).clone()); + let mut ds = Arc::try_unwrap(snapshot.into_arc()).unwrap_or_else(|arc| (*arc).clone()); TableStore::create_inverted_index(self, &mut ds, column).await?; Ok(SnapshotHandle::new(ds)) } @@ -812,8 +781,7 @@ impl TableStorage for TableStore { snapshot: SnapshotHandle, column: &str, ) -> Result { - let mut ds = Arc::try_unwrap(snapshot.into_arc()) - .unwrap_or_else(|arc| (*arc).clone()); + let mut ds = Arc::try_unwrap(snapshot.into_arc()).unwrap_or_else(|arc| (*arc).clone()); TableStore::create_vector_index(self, &mut ds, column).await?; Ok(SnapshotHandle::new(ds)) } @@ -837,6 +805,13 @@ impl TableStorage for TableStore { // Note: existing TableStore::scan_stream is an associated fn that // takes &Dataset, so we delegate via the dataset reference held by // the snapshot. - TableStore::scan_stream(snapshot.dataset(), projection, filter, order_by, with_row_id).await + TableStore::scan_stream( + snapshot.dataset(), + projection, + filter, + order_by, + with_row_id, + ) + .await } } diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index c896b05..ddab706 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -1793,25 +1793,24 @@ mod tests { #[test] fn check_batch_unique_by_keys_errors_on_duplicate_id() { let batch = batch_with_ids(&["a", "b", "a"]); - let err = - check_batch_unique_by_keys(&batch, &["id".to_string()], "test").unwrap_err(); + let err = check_batch_unique_by_keys(&batch, &["id".to_string()], "test").unwrap_err(); let msg = err.to_string(); assert!( msg.contains("duplicate source row for key 'a'"), "unexpected error: {msg}" ); - assert!(msg.contains("MR-957"), "error should reference MR-957: {msg}"); + assert!( + msg.contains("MR-957"), + "error should reference MR-957: {msg}" + ); } #[test] fn check_batch_unique_by_keys_rejects_multi_column_keys() { let batch = batch_with_ids(&["a"]); - let err = check_batch_unique_by_keys( - &batch, - &["id".to_string(), "other".to_string()], - "test", - ) - .unwrap_err(); + let err = + check_batch_unique_by_keys(&batch, &["id".to_string(), "other".to_string()], "test") + .unwrap_err(); assert!(err.to_string().contains("single-column keys only")); } } diff --git a/crates/omnigraph/tests/end_to_end.rs b/crates/omnigraph/tests/end_to_end.rs index 0d9e58e..a0fdb0e 100644 --- a/crates/omnigraph/tests/end_to_end.rs +++ b/crates/omnigraph/tests/end_to_end.rs @@ -1910,9 +1910,14 @@ query docs_with_tag($tag: String) { return { $d.slug } } "#; - let result = query_main(&mut db, queries, "docs_with_tag", ¶ms(&[("$tag", "red")])) - .await - .unwrap(); + let result = query_main( + &mut db, + queries, + "docs_with_tag", + ¶ms(&[("$tag", "red")]), + ) + .await + .unwrap(); let batch = result.concat_batches().unwrap(); let slugs = batch diff --git a/crates/omnigraph/tests/failpoints.rs b/crates/omnigraph/tests/failpoints.rs index 11cff73..5ea71c5 100644 --- a/crates/omnigraph/tests/failpoints.rs +++ b/crates/omnigraph/tests/failpoints.rs @@ -1666,3 +1666,143 @@ async fn ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_neede "_graph_commit_recoveries.lance must NOT exist when no sidecar was processed" ); } + +// ─── MR-668 PR 2a: Omnigraph::init cleanup on partial failure ────────────── +// +// `init_with_storage` writes three schema artifacts before invoking +// `GraphCoordinator::init`. Without cleanup, a failure between any of those +// steps left orphan files behind, making the URI unusable for a retry of +// `init` (it would refuse because `_schema.pg` already exists). The tests +// below pin: on failpoint trigger at each of the three phase boundaries, +// the three schema files are removed before the error is returned. +// +// Coverage note: the third boundary (`init.after_coordinator_init`) only +// asserts cleanup of the schema files. Lance per-type directories and +// `__manifest/` are NOT cleaned up — that requires a recursive +// `StorageAdapter::delete_prefix` primitive deferred along with +// `DELETE /graphs/{id}` (MR-668 PR 2b). The orphan Lance directories +// after a coordinator-init-phase failure are documented as a known +// limitation. + +#[tokio::test] +async fn init_failpoint_after_schema_pg_written_cleans_up_schema_file() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let _failpoint = ScopedFailPoint::new("init.after_schema_pg_written", "return"); + + let err = match Omnigraph::init(uri, helpers::TEST_SCHEMA).await { + Ok(_) => panic!("expected Omnigraph::init to fail at the configured failpoint"), + Err(e) => e, + }; + assert!( + err.to_string() + .contains("injected failpoint triggered: init.after_schema_pg_written"), + "got: {err}" + ); + + // Only `_schema.pg` was written at this phase boundary, but the + // cleanup attempts all three — `delete` treats not-found as Ok, + // so the other two deletes are no-ops. + assert!( + !dir.path().join("_schema.pg").exists(), + "_schema.pg must be cleaned up after init failure" + ); +} + +#[tokio::test] +async fn init_failpoint_after_schema_contract_written_cleans_up_all_schema_files() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let _failpoint = ScopedFailPoint::new("init.after_schema_contract_written", "return"); + + let err = match Omnigraph::init(uri, helpers::TEST_SCHEMA).await { + Ok(_) => panic!("expected Omnigraph::init to fail at the configured failpoint"), + Err(e) => e, + }; + assert!( + err.to_string() + .contains("injected failpoint triggered: init.after_schema_contract_written"), + "got: {err}" + ); + + assert!( + !dir.path().join("_schema.pg").exists(), + "_schema.pg must be cleaned up" + ); + assert!( + !dir.path().join("_schema.ir.json").exists(), + "_schema.ir.json must be cleaned up" + ); + assert!( + !dir.path().join("__schema_state.json").exists(), + "__schema_state.json must be cleaned up" + ); +} + +#[tokio::test] +async fn init_failpoint_after_coordinator_init_cleans_up_schema_files() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let _failpoint = ScopedFailPoint::new("init.after_coordinator_init", "return"); + + let err = match Omnigraph::init(uri, helpers::TEST_SCHEMA).await { + Ok(_) => panic!("expected Omnigraph::init to fail at the configured failpoint"), + Err(e) => e, + }; + assert!( + err.to_string() + .contains("injected failpoint triggered: init.after_coordinator_init"), + "got: {err}" + ); + + // Schema files are cleaned up by `best_effort_cleanup_init_artifacts`. + assert!( + !dir.path().join("_schema.pg").exists(), + "_schema.pg must be cleaned up after late-phase init failure" + ); + assert!( + !dir.path().join("_schema.ir.json").exists(), + "_schema.ir.json must be cleaned up after late-phase init failure" + ); + assert!( + !dir.path().join("__schema_state.json").exists(), + "__schema_state.json must be cleaned up after late-phase init failure" + ); + + // Documented limitation: Lance per-type datasets and `__manifest/` + // created by `GraphCoordinator::init` are NOT cleaned up — recursive + // deletion requires the deferred `delete_prefix` primitive. This + // assertion does NOT check for their absence; it merely documents + // the boundary by noting we don't validate orphan directories here. + // When PR 2b lands, this test can be tightened to assert the graph + // root is fully empty. +} + +#[tokio::test] +async fn init_failpoint_returns_original_error_not_cleanup_error() { + // The cleanup is best-effort. If `storage.delete` fails (e.g. transient + // network blip on S3), the original init failpoint error must still + // surface — not be masked by a cleanup failure. This test triggers the + // failpoint and asserts the returned error references the failpoint, + // not the cleanup. (The cleanup currently logs via `tracing::warn`; + // we can't easily fault-inject delete failures without another seam, + // so this is a smoke test for the precedence contract.) + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let _failpoint = ScopedFailPoint::new("init.after_schema_pg_written", "return"); + + let err = match Omnigraph::init(uri, helpers::TEST_SCHEMA).await { + Ok(_) => panic!("expected Omnigraph::init to fail at the configured failpoint"), + Err(e) => e, + }; + // Failpoint message wins; no "cleanup" substring expected. + let msg = err.to_string(); + assert!( + msg.contains("init.after_schema_pg_written"), + "init error must surface the failpoint cause, got: {msg}" + ); +} diff --git a/crates/omnigraph/tests/forbidden_apis.rs b/crates/omnigraph/tests/forbidden_apis.rs index cc9f163..1936815 100644 --- a/crates/omnigraph/tests/forbidden_apis.rs +++ b/crates/omnigraph/tests/forbidden_apis.rs @@ -95,11 +95,11 @@ const FORBIDDEN_PATTERNS: &[&str] = &[ /// provide the staged primitives or to maintain the system tables /// (commit graph, manifest). const ALLOW_LIST_FILES: &[&str] = &[ - "table_store.rs", // The storage layer itself. - "storage_layer.rs", // The trait module. - "commit_graph.rs", // Maintains `_graph_commits.lance` system table. - "graph_coordinator.rs", // Drives the manifest publisher / branch coordinator. - "recovery_audit.rs", // Maintains `_graph_commit_recoveries.lance` (recovery audit trail). + "table_store.rs", // The storage layer itself. + "storage_layer.rs", // The trait module. + "commit_graph.rs", // Maintains `_graph_commits.lance` system table. + "graph_coordinator.rs", // Drives the manifest publisher / branch coordinator. + "recovery_audit.rs", // Maintains `_graph_commit_recoveries.lance` (recovery audit trail). ]; /// Directories exempt from the guard. Files under these paths may use @@ -168,10 +168,7 @@ fn engine_code_does_not_call_forbidden_lance_apis() { // comments are documentation, not code use. The trait // surface (sealed + trait-only) is the actual enforcement; // this test only catches code use. - if trimmed.starts_with("//") - || trimmed.starts_with("/*") - || trimmed.starts_with("*") - { + if trimmed.starts_with("//") || trimmed.starts_with("/*") || trimmed.starts_with("*") { continue; } // Allow lines marked with the sentinel on the SAME line or diff --git a/crates/omnigraph/tests/lifecycle.rs b/crates/omnigraph/tests/lifecycle.rs index e59dbaa..a56a80c 100644 --- a/crates/omnigraph/tests/lifecycle.rs +++ b/crates/omnigraph/tests/lifecycle.rs @@ -2,7 +2,7 @@ mod helpers; use std::fs; -use omnigraph::db::{Omnigraph, ReadTarget}; +use omnigraph::db::{InitOptions, Omnigraph, ReadTarget}; use omnigraph_compiler::schema::parser::parse_schema; use omnigraph_compiler::{build_schema_ir, schema_ir_pretty_json}; @@ -185,3 +185,122 @@ async fn snapshot_version_is_pinned() { assert_eq!(snap1.version(), v1); } + +/// Regression for the `Omnigraph::init` re-init footgun (MR-668 +/// follow-up): a second `init` against a URI that already holds a +/// graph must NOT modify or destroy the existing graph's schema +/// artifacts. Today's behavior is destructive either way — the +/// `write_text(_schema.pg, ...)` call at the top of +/// `init_storage_phase` overwrites the existing file before any +/// preflight, and `best_effort_cleanup_init_artifacts` will later +/// delete all three files if the inner `GraphCoordinator::init` +/// fails. Both outcomes corrupt an existing graph. +/// +/// After the fix: strict-mode `init` (no `force` flag) errors out +/// before touching any file, and the original schema artifacts +/// match their pre-attempt contents byte-for-byte. +#[tokio::test] +async fn init_on_existing_graph_uri_does_not_destroy_existing_schema() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + // Establish the first graph and snapshot its three schema files. + Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + let original_schema_pg = fs::read_to_string(dir.path().join("_schema.pg")).unwrap(); + let original_schema_ir = fs::read_to_string(dir.path().join("_schema.ir.json")).unwrap(); + let original_schema_state = fs::read_to_string(dir.path().join("__schema_state.json")).unwrap(); + + // Attempt a re-init with a deliberately different schema so any + // overwrite would be observable in the file contents. + let different_schema = "node Other { id: String @key }\n"; + let result = Omnigraph::init(uri, different_schema).await; + + // The new init must report the conflict, not silently mutate. + assert!( + result.is_err(), + "init against an existing graph URI must error, not silently overwrite" + ); + + // The three schema files must remain present and byte-identical to + // their pre-attempt contents. + assert!( + dir.path().join("_schema.pg").exists(), + "_schema.pg must not be deleted by a failed re-init" + ); + assert!( + dir.path().join("_schema.ir.json").exists(), + "_schema.ir.json must not be deleted by a failed re-init" + ); + assert!( + dir.path().join("__schema_state.json").exists(), + "__schema_state.json must not be deleted by a failed re-init" + ); + assert_eq!( + fs::read_to_string(dir.path().join("_schema.pg")).unwrap(), + original_schema_pg, + "_schema.pg contents must be preserved when re-init is rejected" + ); + assert_eq!( + fs::read_to_string(dir.path().join("_schema.ir.json")).unwrap(), + original_schema_ir, + "_schema.ir.json contents must be preserved when re-init is rejected" + ); + assert_eq!( + fs::read_to_string(dir.path().join("__schema_state.json")).unwrap(), + original_schema_state, + "__schema_state.json contents must be preserved when re-init is rejected" + ); +} + +/// Happy-path sibling to the strict re-init regression above: +/// `InitOptions { force: true }` must skip the schema-file preflight +/// when the operator deliberately wants to recover from orphan +/// schema artifacts (e.g. files left behind by a failed prior init). +/// +/// Documented semantics per `InitOptions::force`: skips the preflight +/// only. Force does NOT purge existing Lance datasets or `__manifest/` +/// — that needs `StorageAdapter::delete_prefix`, which is tracked +/// separately. The realistic recovery scenario is "schema files +/// exist but Lance state doesn't," which this test reproduces. +/// +/// Without this test, a future refactor could invert the `if !force` +/// branch and silently break the operator-facing escape hatch. +#[tokio::test] +async fn init_with_force_recovers_from_orphan_schema_files() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + // Simulate orphan schema files: write `_schema.pg` to disk + // without running a full init. The preflight will see it and + // bail in strict mode. + fs::write(dir.path().join("_schema.pg"), TEST_SCHEMA).unwrap(); + + // Strict mode refuses because `_schema.pg` exists. + let strict_err = match Omnigraph::init(uri, TEST_SCHEMA).await { + Ok(_) => panic!("strict init must refuse when orphan _schema.pg exists"), + Err(e) => e, + }; + assert!( + strict_err.to_string().contains("already initialized"), + "strict init must surface AlreadyInitialized (sanity check); got: {strict_err}" + ); + + // Force init succeeds: it skips the preflight, overwrites the + // orphan file, and proceeds to initialize Lance state (which + // didn't exist, so `GraphCoordinator::init` is unblocked). + let db = Omnigraph::init_with_options(uri, TEST_SCHEMA, InitOptions { force: true }) + .await + .expect("force init must succeed when only orphan schema files block strict init"); + + // Confirm the catalog is populated as expected — proves the + // graph is functional after force-recovery, not just that the + // call returned Ok. + assert!( + db.catalog().node_types.contains_key("Person"), + "force-recovered graph must have the new catalog installed" + ); + assert!( + dir.path().join("__schema_state.json").exists(), + "force-recovered graph must have full schema state written" + ); +} diff --git a/crates/omnigraph/tests/policy_engine_chassis.rs b/crates/omnigraph/tests/policy_engine_chassis.rs index b1f43d9..def5349 100644 --- a/crates/omnigraph/tests/policy_engine_chassis.rs +++ b/crates/omnigraph/tests/policy_engine_chassis.rs @@ -23,8 +23,8 @@ use std::path::Path; use std::sync::Arc; use omnigraph::db::{Omnigraph, ReadTarget, SchemaApplyOptions}; -use omnigraph::loader::LoadMode; use omnigraph::error::OmniError; +use omnigraph::loader::LoadMode; use omnigraph_policy::{PolicyChecker, PolicyEngine}; use helpers::*; @@ -58,13 +58,16 @@ rules: "#; fn additive_schema() -> String { - helpers::TEST_SCHEMA.replace(" age: I32?\n}", " age: I32?\n nickname: String?\n}") + helpers::TEST_SCHEMA.replace( + " age: I32?\n}", + " age: I32?\n nickname: String?\n}", + ) } fn install_policy(db: Omnigraph, dir_path: &Path) -> (Omnigraph, Arc) { let policy_path = dir_path.join("policy.yaml"); fs::write(&policy_path, POLICY_YAML).unwrap(); - let engine = PolicyEngine::load(&policy_path, dir_path.to_str().unwrap()).unwrap(); + let engine = PolicyEngine::load_graph(&policy_path, dir_path.to_str().unwrap()).unwrap(); let engine = Arc::new(engine); let db = db.with_policy(Arc::clone(&engine) as Arc); (db, engine) @@ -238,7 +241,12 @@ async fn load_as_denies_when_policy_rejects_actor() { let (db, _engine) = init_with_policy(&dir).await; let result = db - .load_as("main", ONE_PERSON_JSONL, LoadMode::Merge, Some("act-denied")) + .load_as( + "main", + ONE_PERSON_JSONL, + LoadMode::Merge, + Some("act-denied"), + ) .await; assert_denied(result, "load_as"); } diff --git a/crates/omnigraph/tests/runs.rs b/crates/omnigraph/tests/runs.rs index f2d7dc3..cfff3fc 100644 --- a/crates/omnigraph/tests/runs.rs +++ b/crates/omnigraph/tests/runs.rs @@ -127,10 +127,7 @@ async fn multi_statement_mutation_is_atomic_with_read_your_writes() { "main", MUTATION_QUERIES, "insert_person_and_friend", - &mixed_params( - &[("$name", "Eve"), ("$friend", "Alice")], - &[("$age", 22)], - ), + &mixed_params(&[("$name", "Eve"), ("$friend", "Alice")], &[("$age", 22)]), ) .await .unwrap(); @@ -187,10 +184,7 @@ async fn partial_failure_leaves_target_queryable_and_unblocks_next_mutation() { "main", MUTATION_QUERIES, "insert_person_and_friend", - &mixed_params( - &[("$name", "Eve"), ("$friend", "Missing")], - &[("$age", 22)], - ), + &mixed_params(&[("$name", "Eve"), ("$friend", "Missing")], &[("$age", 22)]), ) .await .expect_err("op-2 must fail"); @@ -543,10 +537,7 @@ async fn mutation_rejects_mixed_insert_and_delete_at_parse_time() { "main", STAGED_QUERIES, "mixed_insert_and_delete", - &mixed_params( - &[("$name", "Eve"), ("$victim", "Alice")], - &[("$age", 22)], - ), + &mixed_params(&[("$name", "Eve"), ("$victim", "Alice")], &[("$age", 22)]), ) .await .expect_err("D₂ must reject mixed insert+delete"); @@ -559,7 +550,9 @@ async fn mutation_rejects_mixed_insert_and_delete_at_parse_time() { manifest_err.message, ); assert!( - manifest_err.message.contains("split into separate mutations"), + manifest_err + .message + .contains("split into separate mutations"), "error message should direct user to split: {}", manifest_err.message, ); @@ -668,11 +661,7 @@ async fn multiple_appends_to_same_edge_coalesce_to_one_append() { "main", STAGED_QUERIES, "insert_two_friends", - ¶ms(&[ - ("$from", "Alice"), - ("$a", "Bob"), - ("$b", "Eve"), - ]), + ¶ms(&[("$from", "Alice"), ("$a", "Bob"), ("$b", "Eve")]), ) .await .unwrap(); @@ -782,8 +771,14 @@ async fn load_with_bad_edge_reference_unblocks_next_load() { // No write made it to disk: counts unchanged. let mid_persons = count_rows(&db, "node:Person").await; let mid_edges = count_rows(&db, "edge:Knows").await; - assert_eq!(mid_persons, pre_persons, "failed load must not advance Person count"); - assert_eq!(mid_edges, pre_edges, "failed load must not advance Knows count"); + assert_eq!( + mid_persons, pre_persons, + "failed load must not advance Person count" + ); + assert_eq!( + mid_edges, pre_edges, + "failed load must not advance Knows count" + ); // Second load against the same tables — succeeds (no HEAD drift). let good = r#"{"type": "Person", "data": {"name": "Pat", "age": 55}}"#; @@ -824,7 +819,9 @@ edge WorksAt: Person -> Company @card(0..1) {"type": "Company", "data": {"name": "Acme"}} {"type": "Company", "data": {"name": "Bigco"}} "#; - load_jsonl(&mut db, seed, LoadMode::Overwrite).await.unwrap(); + load_jsonl(&mut db, seed, LoadMode::Overwrite) + .await + .unwrap(); let pre_works = count_rows(&db, "edge:WorksAt").await; @@ -1014,7 +1011,10 @@ query cascade_then_explicit($name: String, $other: String) { // — Bob→Diana would survive. The exact-count check makes both ops // independently observable. let pre_knows = count_rows(&db, "edge:Knows").await; - assert_eq!(pre_knows, 3, "fixture invariant: TEST_DATA seeds 3 Knows edges"); + assert_eq!( + pre_knows, 3, + "fixture invariant: TEST_DATA seeds 3 Knows edges" + ); db.mutate( "main", @@ -1066,7 +1066,9 @@ query add_friend($from: String, $to: String) { let seed = r#"{"type": "Person", "data": {"name": "Alice"}} {"type": "Person", "data": {"name": "Bob"}} "#; - load_jsonl(&mut db, seed, LoadMode::Overwrite).await.unwrap(); + load_jsonl(&mut db, seed, LoadMode::Overwrite) + .await + .unwrap(); // Single insert: count=1 < min=2 → reject with clear message. let err = db @@ -1082,8 +1084,7 @@ query add_friend($from: String, $to: String) { panic!("expected Manifest error, got {err:?}"); }; assert!( - manifest_err.message.contains("@card violation") - && manifest_err.message.contains("min 2"), + manifest_err.message.contains("@card violation") && manifest_err.message.contains("min 2"), "unexpected error: {}", manifest_err.message, ); @@ -1121,7 +1122,9 @@ edge WorksAt: Person -> Company @card(0..1) {"type": "Company", "data": {"name": "Bigco"}} {"edge": "WorksAt", "from": "Alice", "to": "Acme", "data": {"id": "w1"}} "#; - load_jsonl(&mut db, seed, LoadMode::Overwrite).await.unwrap(); + load_jsonl(&mut db, seed, LoadMode::Overwrite) + .await + .unwrap(); // Merge-update the same edge id w1 to point at Bigco. Counted naively // as union, Alice has 2 WorksAt (committed Acme + pending Bigco) which @@ -1167,7 +1170,9 @@ edge WorksAt: Person -> Company @card(0..1) {"type": "Company", "data": {"name": "Acme"}} {"type": "Company", "data": {"name": "Bigco"}} "#; - load_jsonl(&mut db, seed, LoadMode::Overwrite).await.unwrap(); + load_jsonl(&mut db, seed, LoadMode::Overwrite) + .await + .unwrap(); // Merge load with the SAME edge id twice — the second row supersedes // the first in the finalize-time dedupe. If pending-counting doesn't @@ -1364,7 +1369,11 @@ query insert_then_update_note( ) .await .unwrap(); - assert_eq!(qr.num_rows(), 0, "letter must not be visible after early error"); + assert_eq!( + qr.num_rows(), + 0, + "letter must not be visible after early error" + ); } /// MR-920 regression: two sequential `update T set {f:v} where x=y` @@ -1446,5 +1455,9 @@ async fn second_sequential_update_on_same_row_succeeds() { } } } - assert_eq!(alice_age, Some(42), "Alice's age must reflect the second update"); + assert_eq!( + alice_age, + Some(42), + "Alice's age must reflect the second update" + ); } diff --git a/crates/omnigraph/tests/staged_writes.rs b/crates/omnigraph/tests/staged_writes.rs index 30ef28b..021b36e 100644 --- a/crates/omnigraph/tests/staged_writes.rs +++ b/crates/omnigraph/tests/staged_writes.rs @@ -132,7 +132,11 @@ async fn stage_merge_insert_dedupes_superseded_committed_fragment() { .await .unwrap(); let ids = collect_ids(&batches); - assert_eq!(ids, vec!["alice"], "merge_insert must not surface duplicates"); + assert_eq!( + ids, + vec!["alice"], + "merge_insert must not surface duplicates" + ); // Confirm the visible row is the rewritten one. let total: usize = batches.iter().map(|b| b.num_rows()).sum(); @@ -382,12 +386,7 @@ async fn scan_with_staged_with_filter_silently_drops_staged_rows() { // Actual: dave (staged, age=35) is dropped — only the committed matches // come back. let batches = store - .scan_with_staged( - &ds, - std::slice::from_ref(&staged), - None, - Some("age >= 30"), - ) + .scan_with_staged(&ds, std::slice::from_ref(&staged), None, Some("age >= 30")) .await .unwrap(); assert_eq!( @@ -403,12 +402,7 @@ async fn scan_with_staged_with_filter_silently_drops_staged_rows() { // Without filter, staged data IS visible — confirms the issue is // specifically filter pushdown, not fragment scanning per se. let unfiltered = store - .scan_with_staged( - &ds, - std::slice::from_ref(&staged), - None, - None, - ) + .scan_with_staged(&ds, std::slice::from_ref(&staged), None, None) .await .unwrap(); assert_eq!( @@ -686,10 +680,7 @@ async fn stage_create_inverted_index_does_not_advance_head_until_commit() { .unwrap(); let pre_version = ds.version().version; - let staged = store - .stage_create_inverted_index(&ds, "id") - .await - .unwrap(); + let staged = store.stage_create_inverted_index(&ds, "id").await.unwrap(); assert_eq!( ds.version().version, pre_version, @@ -781,13 +772,9 @@ async fn create_vector_index_advances_head_inline_documents_residual() { let id_arr = StringArray::from(ids); let flat: Vec = (0..(n_rows * dim)).map(|i| i as f32).collect(); let values = arrow_array::Float32Array::from(flat); - let vec_arr = - FixedSizeListArray::new(item_field, dim as i32, Arc::new(values), None); - let batch = RecordBatch::try_new( - schema.clone(), - vec![Arc::new(id_arr), Arc::new(vec_arr)], - ) - .unwrap(); + let vec_arr = FixedSizeListArray::new(item_field, dim as i32, Arc::new(values), None); + let batch = + RecordBatch::try_new(schema.clone(), vec![Arc::new(id_arr), Arc::new(vec_arr)]).unwrap(); let mut ds = TableStore::write_dataset(&uri, batch).await.unwrap(); let pre_version = ds.version().version; diff --git a/crates/omnigraph/tests/traversal.rs b/crates/omnigraph/tests/traversal.rs index 6b6fbe3..6efe7de 100644 --- a/crates/omnigraph/tests/traversal.rs +++ b/crates/omnigraph/tests/traversal.rs @@ -504,9 +504,21 @@ query fof_chain($name: String) { let batch = result.concat_batches().unwrap(); assert_eq!(batch.num_rows(), 1); - let col0 = batch.column(0).as_any().downcast_ref::().unwrap(); - let col1 = batch.column(1).as_any().downcast_ref::().unwrap(); - let col2 = batch.column(2).as_any().downcast_ref::().unwrap(); + let col0 = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let col1 = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + let col2 = batch + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); assert_eq!(col0.value(0), "Alice"); assert_eq!(col1.value(0), "Bob"); assert_eq!(col2.value(0), "Diana"); @@ -574,8 +586,16 @@ query at_acme_named() { let batch = result.concat_batches().unwrap(); assert_eq!(batch.num_rows(), 1); - let person = batch.column(0).as_any().downcast_ref::().unwrap(); - let company = batch.column(1).as_any().downcast_ref::().unwrap(); + let person = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let company = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); assert_eq!(person.value(0), "Alice"); assert_eq!(company.value(0), "Acme"); } @@ -608,8 +628,16 @@ query at_company($company: String) { let batch = result.concat_batches().unwrap(); assert_eq!(batch.num_rows(), 1); - let person = batch.column(0).as_any().downcast_ref::().unwrap(); - let company = batch.column(1).as_any().downcast_ref::().unwrap(); + let person = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let company = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); assert_eq!(person.value(0), "Bob"); assert_eq!(company.value(0), "Globex"); } @@ -633,19 +661,22 @@ query fan_out($name: String) { "#; // Alice knows Bob and Charlie, works at Acme. // Each friend paired with her company → 2 rows. - let result = query_main( - &mut db, - queries, - "fan_out", - ¶ms(&[("$name", "Alice")]), - ) - .await - .unwrap(); + let result = query_main(&mut db, queries, "fan_out", ¶ms(&[("$name", "Alice")])) + .await + .unwrap(); let batch = result.concat_batches().unwrap(); assert_eq!(batch.num_rows(), 2); - let friends = batch.column(0).as_any().downcast_ref::().unwrap(); - let companies = batch.column(1).as_any().downcast_ref::().unwrap(); + let friends = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let companies = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); let mut pairs: Vec<(&str, &str)> = (0..batch.num_rows()) .map(|i| (friends.value(i), companies.value(i))) diff --git a/crates/omnigraph/tests/validators.rs b/crates/omnigraph/tests/validators.rs index 96483d3..4c7a2f3 100644 --- a/crates/omnigraph/tests/validators.rs +++ b/crates/omnigraph/tests/validators.rs @@ -76,7 +76,9 @@ async fn init_with(schema: &str, data: &str) -> (tempfile::TempDir, Omnigraph) { let uri = dir.path().to_str().unwrap(); let mut db = Omnigraph::init(uri, schema).await.unwrap(); if !data.is_empty() { - load_jsonl(&mut db, data, LoadMode::Overwrite).await.unwrap(); + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); } (dir, db) } diff --git a/docs/releases/v0.6.0.md b/docs/releases/v0.6.0.md index 80cda02..978df0c 100644 --- a/docs/releases/v0.6.0.md +++ b/docs/releases/v0.6.0.md @@ -1,19 +1,127 @@ # Omnigraph v0.6.0 +Two pieces of work land in this release: + +1. The **graph terminology rename** (renamed `Repo` → `Graph` across the Cedar resource model, policy API, and query-lint schema source). +2. **Multi-graph server mode** — one `omnigraph-server` process can now serve 1–10 graphs concurrently behind cluster routes (`/graphs/{graph_id}/...`), with per-graph and server-level Cedar policy, read-only `GET /graphs` enumeration, and CLI parity (`omnigraph graphs list`). + +Runtime add/remove (`POST /graphs`, `DELETE /graphs/{id}`, `omnigraph graphs create`) is **not** in v0.6.0. Operators add or remove graphs by editing `omnigraph.yaml` and restarting. The first cut of `POST /graphs` shipped behind an atomic-YAML-rewrite design that we pulled before release once its concurrency guarantees were challenged (flock-on-renamed-inode race, duplicate-check outside the critical section, and an init-cleanup path that could destroy an existing graph's schema on re-init). The correct fix is a Lance-style cluster catalog (reserve → init → publish with recovery sidecars); that work is deferred. + ## Breaking Changes +### Graph terminology rename + - Renamed the Cedar resource entity from `Omnigraph::Repo` to `Omnigraph::Graph`. -- Renamed policy API terminology from `repo_id` to `graph_id` on `PolicyCompiler::compile` and `PolicyEngine::load`. +- Renamed policy API terminology from `repo_id` to `graph_id` on `PolicyCompiler::compile` (and on the new `PolicyEngine::load_graph` / `PolicyEngine::load_server` loaders described below). - Renamed query-lint schema source JSON from `"repo"` to `"graph"` for `schema_source.kind`. +### Multi-graph server mode + +- **Multi-graph deployments lose flat routes.** Single-graph invocation (`omnigraph-server `) is unchanged — same flat `/snapshot`, `/read`, `/branches`, etc. Multi-graph deployments serve those routes under `/graphs/{graph_id}/...`; bare flat paths return 404 in multi mode. +- **`ServerConfig` shape change** (programmatic embedders only): `ServerConfig { uri, policy_file }` is replaced by `ServerConfig { mode: ServerConfigMode }`, where `ServerConfigMode = Single { uri, policy_file } | Multi { graphs, config_path, server_policy_file }`. Callers that use `load_server_settings` are unaffected; callers that construct `ServerConfig` directly need to wrap their fields in `ServerConfigMode::Single`. +- **`AppState`'s routing surface** is `AppState::routing() -> &GraphRouting`, where `GraphRouting = Single { handle } | Multi { registry, config_path }`. The previous `AppState::uri()`, `AppState::mode()`, `AppState::registry()` accessors and the `ServerMode` enum are gone — embedders read `state.routing()` and match on the arm they need. Per-graph URIs live on `handle.uri`. +- **`AppState::new_multi`** is the new multi-graph constructor. Single-mode `new_*` / `open_*` constructors are unchanged. +- **`AuthenticatedActor(Arc)` → `ResolvedActor { actor_id, tenant_id, scopes, source }`** (programmatic embedders only). The struct shape changes, but the HTTP contract — bearer auth and the bearer-derived-actor-identity guarantee — is unchanged. Cluster-mode call sites construct with `tenant_id: None`, `scopes: vec![Scope::Full]`, `source: AuthSource::Static`. The new fields are forward-compat seams for future multi-tenant and OAuth deployments; they're inert in this release. +- **`PolicyEngine::load(path, graph_id)` removed** in favor of two kind-typed loaders: `PolicyEngine::load_graph(path, graph_id)` for per-graph policies and `PolicyEngine::load_server(path)` for server-level policies. Each loader rejects rules whose action `resource_kind()` doesn't match the engine kind — operators who put a `graph_list` rule in a per-graph file (or a `read` rule in a server file) now get a load-time error instead of a silently-never-matching rule. +- **`PolicyRequest::actor_id` field removed.** Actor identity is now a separate parameter on `PolicyEngine::authorize(actor_id, &request)`. The type system enforces the server-authoritative-actor invariant: actor identity is always sourced from the bearer-token match resolved at the auth boundary; handlers cannot smuggle identity through the request body. +- **`Omnigraph::init` is strict by default.** Initialization at a URI that already holds schema files now errors with `OmniError::AlreadyInitialized` instead of silently overwriting. Operators who actually want to overwrite use `InitOptions { force: true }` (CLI: `omnigraph init --force`). Closes the destructive-cleanup footgun where a failed re-init would delete an existing graph's schema files. +- **Top-level `policy.file` is rejected in multi-graph server mode.** It remains valid for single-graph / CLI-local policy. Multi-graph deployments must move graph rules to `graphs..policy.file` and server-scoped `graph_list` rules to `server.policy.file`. +- **Open server startup requires explicit opt-in.** A server with no bearer tokens and no policy now refuses to start unless passed `--unauthenticated` or `OMNIGRAPH_UNAUTHENTICATED=1`. +- **Policy requires bearer tokens.** Configuring any policy file without bearer tokens now refuses startup; otherwise every protected request would 401 before Cedar could evaluate it. +- **Tokens without policy default-deny non-read actions.** Existing authenticated deployments that relied on writes or admin routes without Cedar policy must add policy rules for those actions. +- **`GET /graphs` requires `server.policy.file` in every runtime state.** Even `--unauthenticated` mode keeps server topology closed until the operator explicitly authorizes `graph_list`. + +## New + +- **Multi-graph mode**. Invoke with `omnigraph-server --config omnigraph.yaml` where the YAML has a non-empty `graphs:` map and no single-mode selector (no `server.graph`, no CLI `` or `--target`). At startup the server opens every configured graph in parallel (bounded concurrency, fail-fast). +- **`GET /graphs`**. Lists every registered graph, sorted alphabetically by `graph_id`. Auth-required when bearer tokens are configured; Cedar-gated by `PolicyAction::GraphList` against `Omnigraph::Server::"root"`. Returns 405 in single mode. Server-scoped actions require an explicit `server.policy.file` in every runtime state — the management surface is closed by default even in `--unauthenticated` mode so that server topology is never exposed without operator opt-in. +- **CLI `omnigraph graphs list`**. Mirrors the HTTP surface. Rejects local URI targets with a clear message — for remote multi-graph servers only. +- **CLI `omnigraph init --force`**. Bypasses the strict-init preflight when an operator deliberately wants to recover from orphan schema files. Does NOT purge existing Lance datasets; recursive deletion needs `StorageAdapter::delete_prefix` (deferred — see below). +- **Per-graph Cedar policy**. Each entry in the `graphs:` map can carry a `policy.file` path, loaded at startup via `PolicyEngine::load_graph`. Cedar's `Omnigraph::Graph::""` resource is per-graph; the new `Omnigraph::Server::"root"` resource governs server-level actions. +- **Server-level Cedar policy**. `server.policy.file` in the config governs the `graph_list` action on `Omnigraph::Server::"root"`. Required to expose `GET /graphs` in every runtime state — without a server policy the default-deny posture rejects `graph_list`, including in `--unauthenticated` mode. +- **Cedar action vocabulary**: `graph_list` (server-scoped). Runtime `graph_create` / `graph_delete` are reserved but not shipped — see "Deferred." +- **Canonical graph URI identity.** Server startup normalizes graph root URIs before registry insertion and response output, so aliases such as `/tmp/g`, `/tmp/g/`, and `file:///tmp/g` cannot register as distinct graphs that actually share one Lance root. + +## Configuration + +`omnigraph.yaml` schema additions (all optional, single-mode unaffected): + +```yaml +server: + bind: 0.0.0.0:8080 + policy: + file: ./server-policy.yaml # server-level Cedar (graph_list) + +graphs: + alpha: + uri: s3://tenant-bucket/alpha + policy: + file: ./policies/alpha.yaml # per-graph Cedar + beta: + uri: s3://tenant-bucket/beta + # no per-graph policy → engine-layer enforcement is a no-op +``` + +## Deferred + +- **`POST /graphs` runtime graph creation** and **CLI `omnigraph graphs create`**. Pulled before release after the YAML-rewrite design's correctness story didn't survive review. A future release will add a managed cluster catalog (Lance-backed reserve → init → publish with recovery sidecars) and re-expose runtime creation on top of it. Until then, operators add graphs by editing `omnigraph.yaml` and restarting. +- **`DELETE /graphs/{id}`**. Never shipped in v0.6.0; deferred with the same cluster-catalog work. +- **`StorageAdapter::delete_prefix`**. The substrate primitive a managed catalog would need. Will land alongside runtime mutation. +- **`omnigraph init --force` purging Lance state.** Today `--force` only bypasses the schema-file preflight; recursive deletion of existing Lance datasets needs `delete_prefix`. +- **`X-Actor-Id` service delegation forwarding**. Needs durable both-actor audit on `_graph_commits.lance` — out of scope. +- **Hot policy reload**. Restart is cheap at N≤10 graphs. + ## User Impact -- No on-disk migration is required. Existing `.omni` graphs continue to open with the same storage layout. -- Supported YAML policy authoring is unchanged because the YAML schema does not expose the Cedar entity type name. -- Operators with unsupported raw Cedar policy files should update `Omnigraph::Repo` - resource references to `Omnigraph::Graph`. +- **No on-disk migration is required.** Existing `.omni` graphs from v0.5.0 (and earlier) open cleanly under v0.6.0 — Lance datasets, `__manifest`, `_schema.pg`, `_schema.ir.json`, `__schema_state.json`, `_graph_commits.lance`, `_graph_commit_recoveries.lance` all use unchanged formats. No conversion step. +- **Existing single-graph storage upgrades without migration.** Server deployments may need auth/policy config changes: explicitly pass `--unauthenticated` for local open mode, configure tokens when using policy, and add Cedar policy for non-read authenticated actions. +- **Multi-graph adoption is opt-in.** Add a `graphs:` map to `omnigraph.yaml` (and remove `server.graph`) to switch a deployment to multi mode. +- **Cluster routes are breaking for client SDKs targeting multi mode.** Generated clients from previous v0.5.0 OpenAPI specs will hit 404 on flat paths against a multi-mode server. Regenerate against the v0.6.0 `openapi.json`. +- **Supported YAML policy authoring is unchanged.** The Cedar `Omnigraph::Graph` and `Omnigraph::Server` entities are internally generated by `compile_policy_source` — operator YAML only references actions and groups. +- **Operators with unsupported raw Cedar policy files** should update `Omnigraph::Repo` resource references to `Omnigraph::Graph`. + +## Migration: single → multi + +```yaml +# Before (v0.5.0 single-mode invocation) +server: + graph: my-graph +graphs: + my-graph: + uri: /var/lib/omnigraph/my-graph +policy: + file: ./policy.yaml +``` + +```yaml +# After (v0.6.0 multi-mode — drop `server.graph` and the top-level `policy`) +server: + policy: + file: ./server-policy.yaml # NEW: governs GET /graphs +graphs: + my-graph: + uri: /var/lib/omnigraph/my-graph + policy: + file: ./policy.yaml # MOVED: was top-level +``` + +Same `omnigraph.yaml` file; restart the server. Clients targeting the old flat routes (`/snapshot`, `/read`, …) must update to `/graphs/my-graph/snapshot`, etc. + +To add a new graph after rollout: stop the server, append a new `graphs.` entry, restart. ## Documentation - Public docs, CLI help, examples, server docs, and test helpers now consistently use "graph" for the OmniGraph data artifact. - GitHub/source repository terminology remains spelled out as "repository" where needed. +- New: `docs/user/cli.md` documents `omnigraph graphs list`; `docs/user/server.md` documents the multi-graph mode and the cluster route convention; `docs/user/policy.md` documents the per-graph vs server-scoped action distinction. + +## Test coverage + +- `GraphId` newtype validation, registry race tests, init failpoints (still reachable from `omnigraph init` CLI). +- Mode-inference four-rule matrix, parallel multi-graph startup, cluster routing. +- Cedar `Server` resource refactor, backwards-compat for graph-only policies, kind-alignment rejection (server actions in graph files / vice versa). +- `GET /graphs` enumeration, 405-in-single-mode, 403-in-Open-mode-without-server-policy, Cedar admin/viewer authorization. +- Cluster routes with inner path params (`/branches/{branch}`, `/commits/{commit_id}`) deserialize correctly under axum 0.8 nested routing. +- Policy-requires-tokens startup invariant enforced uniformly across single and multi mode. +- The bearer-auth-derived-actor-identity regression test (client-supplied identity headers are ignored; the server-resolved actor is the only identity Cedar sees) stays green across the entire refactor. + diff --git a/docs/user/cli.md b/docs/user/cli.md index 743c284..8147d0b 100644 --- a/docs/user/cli.md +++ b/docs/user/cli.md @@ -44,6 +44,27 @@ omnigraph read \ If the server requires auth, set `OMNIGRAPH_SERVER_BEARER_TOKEN` on the server and configure the matching `bearer_token_env` in `omnigraph.yaml`. +## Multi-graph servers (v0.6.0+) + +Against a multi-graph server (started with `--config omnigraph.yaml` referencing a non-empty `graphs:` map), use `omnigraph graphs list` to enumerate the registered graphs. The server must configure bearer tokens and `server.policy.file` with a rule that allows `graph_list`; `/graphs` is closed by default even when the server runs with `--unauthenticated`. + +```bash +OMNIGRAPH_BEARER_TOKEN=admin-token \ + omnigraph graphs list --uri http://server.example.com --json +``` + +For config-driven clients, set the remote graph's `bearer_token_env` to an environment variable containing a token whose actor is authorized by `server.policy.file`. + +`list` rejects local URI targets — it's for remote multi-graph servers only. + +Runtime add/remove is **not** in v0.6.0. To add a graph, stop the server, add a `graphs.` entry to `omnigraph.yaml`, then restart. To remove, stop the server, delete the entry, restart. + +Per-graph URLs: hit a graph's cluster route from any subcommand by pointing `--uri` at it: + +```bash +omnigraph read --uri http://server.example.com/graphs/beta --query ./q.gq ... +``` + ## Runs, Policy, And Diagnostics ```bash diff --git a/docs/user/deployment.md b/docs/user/deployment.md index 7857077..94f79b0 100644 --- a/docs/user/deployment.md +++ b/docs/user/deployment.md @@ -109,7 +109,8 @@ docker run --rm -p 8080:8080 \ ## Auth -The server can run unauthenticated for local development, but any shared or +The server can run unauthenticated for local development only when explicitly +started with `--unauthenticated` or `OMNIGRAPH_UNAUTHENTICATED=1`. Any shared or internet-facing deployment should set a bearer token source. ### Token sources diff --git a/docs/user/policy.md b/docs/user/policy.md index b121213..749d3be 100644 --- a/docs/user/policy.md +++ b/docs/user/policy.md @@ -4,6 +4,8 @@ OmniGraph integrates AWS Cedar (`cedar-policy = 4.9`) for ABAC. ## Policy actions +Per-graph actions (bind to `Omnigraph::Graph::""`): + 1. `read` — query / snapshot / list branches & commits 2. `export` — NDJSON export 3. `change` — mutations @@ -13,12 +15,57 @@ OmniGraph integrates AWS Cedar (`cedar-policy = 4.9`) for ABAC. 7. `branch_merge` 8. `admin` — reserved for policy-management surfaces (hot reload, audit log, approvals). No call site today; see MR-724 for the reservation rationale. +Server-scoped action (v0.6.0+; binds to `Omnigraph::Server::"root"`): + +9. `graph_list` — `GET /graphs` registry enumeration (multi-graph mode) + +Server-scoped actions cannot use `branch_scope` or `target_branch_scope` — they operate on the registry, not on a graph's branches. A rule cannot mix server-scoped and per-graph actions; split into separate rules. (Runtime `graph_create` / `graph_delete` are reserved but not shipped in v0.6.0; operators add/remove graphs by editing `omnigraph.yaml` and restarting.) + ## Scope kinds - `branch_scope` — applied to source branch (`read`, `export`, `change`) - `target_branch_scope` — applied to destination (`schema_apply`, branch ops, run ops) - `protected_branches` — named list with special rules; rule scopes are `any | protected | unprotected` +## Per-graph vs. server-level policy (multi-graph mode) + +In multi mode (`omnigraph.yaml` with a non-empty `graphs:` map), policy files attach at two levels: + +```yaml +server: + policy: + file: ./server-policy.yaml # server-level: graph_list + +graphs: + alpha: + uri: s3://tenant-bucket/alpha + policy: + file: ./policies/alpha.yaml # per-graph: read, change, branch_*, schema_apply + beta: + uri: s3://tenant-bucket/beta + # no per-graph policy → no engine-layer Cedar enforcement on beta +``` + +Top-level `policy.file` is single-graph / CLI-local policy only. Multi-graph +server startup rejects it because applying one graph policy to every configured +graph is ambiguous. Move per-graph rules to `graphs..policy.file` and +move `graph_list` rules to `server.policy.file`. + +Each graph's HTTP request flows through its own per-graph policy. The management endpoint (`GET /graphs`) flows through the server-level policy. When `server.policy.file` is unset, `GET /graphs` is denied in every runtime state, including `--unauthenticated`; with bearer tokens configured, it returns 403 after admission control because `graph_list` is not a `read`-equivalent action. The operator must explicitly authorize via `server-policy.yaml` to expose `/graphs`. + +Example server-level policy: + +```yaml +version: 1 +groups: + admins: [act-andrew] +rules: + - id: admins-can-list-graphs + allow: + actors: { group: admins } + actions: [graph_list] +``` + ## Configuration `omnigraph.yaml`: @@ -32,7 +79,7 @@ cli: actor: act-andrew # default actor for CLI direct-engine writes ``` -Each rule must use exactly one of `branch_scope` or `target_branch_scope`. +Each per-graph rule may use at most one of `branch_scope` or `target_branch_scope`. Server-scoped rules (`graph_list`) take neither — they have no branch context. `cli.actor` is the default actor identity for CLI direct-engine writes when `policy.file` is configured. Override per-invocation with `--as @@ -74,12 +121,13 @@ reaches `authorize_request()` without a matching policy permit. |---|---|---|---| | **Open** | no | no | Every request is permitted. Refuses to start unless `--unauthenticated` or `OMNIGRAPH_UNAUTHENTICATED=1` is set — the operator must explicitly opt in. | | **DefaultDeny** | yes | no | Every authenticated request for an action other than `read` is rejected with HTTP 403. Closes the "tokens but forgot the policy file" trap — an operator who sets up auth and forgot to point at a policy file used to ship the illusion of protection. | -| **PolicyEnabled** | any | yes | Every request is evaluated by Cedar against the configured policy. | +| **PolicyEnabled** | yes | yes | Authenticated requests that reach a configured policy engine are evaluated by Cedar. Server-scoped actions still require `server.policy.file`. | The classifier is `classify_server_runtime_state` in `crates/omnigraph-server/src/lib.rs`; it returns `Err` for the "no -tokens, no policy, no flag" cell so the server refuses to start instead -of silently shipping an open instance. Tests pin every cell of the +tokens, no policy, no flag" cell and for "policy file, no tokens" so the +server refuses to start instead of silently shipping an open instance or +a policy-protected server that can only 401. Tests pin every cell of the matrix and the State-2 deny path. Server-side, `authorize_request()` still runs at the HTTP boundary — diff --git a/docs/user/server.md b/docs/user/server.md index 0c4fcbd..1a1bdb8 100644 --- a/docs/user/server.md +++ b/docs/user/server.md @@ -1,26 +1,64 @@ # HTTP Server (`omnigraph-server`) -Axum 0.8 + tokio + utoipa-generated OpenAPI. Single graph per process; deploy multiple processes for multi-tenant. +Axum 0.8 + tokio + utoipa-generated OpenAPI. **Two modes** (v0.6.0+): single-graph (legacy) and multi-graph (MR-668). Mode is inferred from CLI args + config shape. + +## Modes + +### Single-graph mode (legacy) + +`omnigraph-server ` or `omnigraph-server --target --config omnigraph.yaml`. Routes are flat — `/snapshot`, `/read`, `/branches`, etc. Behavior unchanged from v0.6.0. + +### Multi-graph mode (v0.6.0+) + +`omnigraph-server --config omnigraph.yaml` with a non-empty `graphs:` map and **no** single-mode selector (no `server.graph`, no ``, no `--target`). The server opens every configured graph in parallel at startup (bounded concurrency = 4, fail-fast on the first open error). Routes are nested under `/graphs/{graph_id}/...`. Bare flat paths return 404 in multi mode. + +Mode inference (four-rule matrix): + +1. CLI positional `` → single +2. CLI `--target ` → single +3. `server.graph` in config → single +4. `--config` + non-empty `graphs:` + no single-mode selector → **multi** +5. otherwise → error with migration hint ## Endpoint inventory +Per-graph endpoints — same body shape across modes; URLs differ: + +| Method | Single-mode path | Multi-mode path | Auth | Action | Handler | +|---|---|---|---|---|---| +| GET | `/healthz` | `/healthz` | none | — | `server_health` | +| GET | `/openapi.json` | `/openapi.json` | none | — | `server_openapi` (strips security if auth disabled; in multi mode emits cluster paths with `cluster_` operation-id prefix) | +| GET | `/snapshot?branch=` | `/graphs/{id}/snapshot?branch=` | bearer + `read` | snapshot of branch | `server_snapshot` | +| POST | `/read` | `/graphs/{id}/read` | bearer + `read` | run named query | `server_read` | +| POST | `/export` | `/graphs/{id}/export` | bearer + `export` | NDJSON stream | `server_export` | +| POST | `/change` | `/graphs/{id}/change` | bearer + `change` | mutation | `server_change` | +| GET | `/schema` | `/graphs/{id}/schema` | bearer + `read` | get current `.pg` source | `server_schema_get` | +| POST | `/schema/apply` | `/graphs/{id}/schema/apply` | bearer + `schema_apply` (target=`main`) | migrate | `server_schema_apply` | +| POST | `/ingest` | `/graphs/{id}/ingest` | bearer + `branch_create` (if new) + `change` | bulk load | `server_ingest` (32 MB body limit) | +| GET | `/branches` | `/graphs/{id}/branches` | bearer + `read` | list branches | `server_branch_list` | +| POST | `/branches` | `/graphs/{id}/branches` | bearer + `branch_create` | create | `server_branch_create` | +| DELETE | `/branches/{branch}` | `/graphs/{id}/branches/{branch}` | bearer + `branch_delete` | delete | `server_branch_delete` | +| POST | `/branches/merge` | `/graphs/{id}/branches/merge` | bearer + `branch_merge` | merge `source → target` | `server_branch_merge` | +| GET | `/commits?branch=` | `/graphs/{id}/commits?branch=` | bearer + `read` | list | `server_commit_list` | +| GET | `/commits/{commit_id}` | `/graphs/{id}/commits/{commit_id}` | bearer + `read` | show | `server_commit_show` | + +Server-level management endpoints (v0.6.0+): + | Method | Path | Auth | Action | Handler | |---|---|---|---|---| -| GET | `/healthz` | none | — | `server_health` | -| GET | `/openapi.json` | none | — | `server_openapi` (strips security if auth disabled) | -| GET | `/snapshot?branch=` | bearer + `read` | snapshot of branch | `server_snapshot` | -| POST | `/read` | bearer + `read` | run named query | `server_read` | -| POST | `/export` | bearer + `export` | NDJSON stream | `server_export` | -| POST | `/change` | bearer + `change` | mutation | `server_change` | -| GET | `/schema` | bearer + `read` | get current `.pg` source | `server_schema_get` | -| POST | `/schema/apply` | bearer + `schema_apply` (target=`main`) | migrate | `server_schema_apply` | -| POST | `/ingest` | bearer + `branch_create` (if new) + `change` | bulk load | `server_ingest` (32 MB body limit) | -| GET | `/branches` | bearer + `read` | list branches | `server_branch_list` | -| POST | `/branches` | bearer + `branch_create` | create | `server_branch_create` | -| DELETE | `/branches/{branch}` | bearer + `branch_delete` | delete | `server_branch_delete` | -| POST | `/branches/merge` | bearer + `branch_merge` | merge `source → target` | `server_branch_merge` | -| GET | `/commits?branch=` | bearer + `read` | list | `server_commit_list` | -| GET | `/commits/{commit_id}` | bearer + `read` | show | `server_commit_show` | +| GET | `/graphs` | bearer + `graph_list` on `Server::"root"` | list registered graphs | `server_graphs_list` (405 in single mode) | + +## Adding and removing graphs (multi mode) + +Runtime add/remove via API is **not** exposed in v0.6.0 — neither +`POST /graphs` nor `DELETE /graphs/{id}` is implemented. Operators add +or remove graphs by stopping the server, editing the `graphs:` map in +`omnigraph.yaml`, then restarting. The server treats `omnigraph.yaml` +as operator-owned configuration and never writes it. + +A future release may introduce a managed registry (Lance-backed, +catalog-style: reserve → init → publish with recovery sidecars) and +re-expose runtime mutation on top of it. ## Streaming @@ -79,7 +117,10 @@ endpoints (`/snapshot`, `/read`, `/export`, `/branches` GET, `/commits`, 1. `OMNIGRAPH_SERVER_BEARER_TOKENS_AWS_SECRET` — AWS Secrets Manager (build with `--features aws`) 2. `OMNIGRAPH_SERVER_BEARER_TOKENS_FILE` or `OMNIGRAPH_SERVER_BEARER_TOKENS_JSON` — JSON `{actor_id: token, …}` 3. `OMNIGRAPH_SERVER_BEARER_TOKEN` — single legacy token, actor `default` -- If no tokens configured, server runs unauthenticated (local dev) and `/openapi.json` strips the security scheme. +- If no tokens are configured, startup refuses unless `--unauthenticated` or + `OMNIGRAPH_UNAUTHENTICATED=1` explicitly opts into open local-dev mode. A + policy file without tokens is also rejected at startup. In open mode + `/openapi.json` strips the security scheme. See [deployment.md](deployment.md) for token-source operational details. @@ -98,4 +139,4 @@ See [deployment.md](deployment.md) for token-source operational details. admission control" above). No global rate limiter is configured; add `tower_http::limit` if a graph-wide cap is needed. - Pagination — none (commits/branches return everything; export streams). -- Multi-tenant routing — one graph per process. +- Runtime graph add/remove — edit `omnigraph.yaml` and restart. diff --git a/openapi.json b/openapi.json index 75c9379..1e76360 100644 --- a/openapi.json +++ b/openapi.json @@ -585,6 +585,63 @@ ] } }, + "/graphs": { + "get": { + "tags": [ + "management" + ], + "summary": "List every graph currently registered with this server (MR-668).", + "description": "Multi-graph mode only. In single mode, the route returns 405 — there's\nno registry to enumerate. Cedar-gated by the server-level policy via\nthe `graph_list` action against `Omnigraph::Server::\"root\"`.\n\nOrder: alphabetical by `graph_id` (server-sorted so clients see\ndeterministic output across requests).", + "operationId": "listGraphs", + "responses": { + "200": { + "description": "List of registered graphs", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/GraphListResponse" + } + } + } + }, + "401": { + "description": "Unauthorized", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "403": { + "description": "Forbidden", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "405": { + "description": "Method not allowed (single-graph mode)", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + } + }, + "security": [ + { + "bearer_token": [] + } + ] + } + }, "/healthz": { "get": { "tags": [ @@ -1199,6 +1256,7 @@ "forbidden", "bad_request", "not_found", + "method_not_allowed", "conflict", "too_many_requests", "internal" @@ -1268,6 +1326,37 @@ } } }, + "GraphInfo": { + "type": "object", + "description": "One entry in the response from `GET /graphs`. Cluster operators\nconsume this list to discover which graphs the server is currently\nserving. The shape is intentionally minimal — `graph_id` and `uri`\nare the only fields a routing client needs.", + "required": [ + "graph_id", + "uri" + ], + "properties": { + "graph_id": { + "type": "string" + }, + "uri": { + "type": "string" + } + } + }, + "GraphListResponse": { + "type": "object", + "description": "Response from `GET /graphs`. Lists every graph registered with the\nserver in alphabetical order by `graph_id` (sorted server-side so\nclients get deterministic output across requests).", + "required": [ + "graphs" + ], + "properties": { + "graphs": { + "type": "array", + "items": { + "$ref": "#/components/schemas/GraphInfo" + } + } + } + }, "HealthOutput": { "type": "object", "required": [