diff --git a/AGENTS.md b/AGENTS.md index 9dd090b..c851128 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -16,7 +16,7 @@ Tools that support `@`-imports (Claude Code) auto-include all three files via th `CLAUDE.md` is a symlink to this file — there is exactly one source of truth. Edit `AGENTS.md`. -**Version surveyed:** 0.3.1 +**Version surveyed:** 0.4.0 **Workspace crates:** `omnigraph-compiler`, `omnigraph` (engine), `omnigraph-cli`, `omnigraph-server` **Storage substrate:** Lance 4.x (columnar, versioned, branchable) **License:** MIT @@ -32,7 +32,7 @@ OmniGraph is a typed property-graph engine built as a coordination layer over ma - **Languages**: a `.pg` schema language and a `.gq` query language, both Pest-based, with a typed IR. - **Multi-modal querying**: vector ANN (`nearest`), full-text (`search`/`fuzzy`/`match_text`/`bm25`), Reciprocal Rank Fusion (`rrf`), and graph traversal (`Expand`, anti-join `not { … }`) in one runtime. - **Branches and commits across the whole graph**: Git-style — every successful publish appends to a commit DAG; merges are three-way at the row level. -- **Transactional runs**: ephemeral `__run__` branches for isolated mutation, fast-path or merge-path publish. +- **Atomic per-query writes**: `mutate_as` and `load` capture per-table `expected_table_versions` before writing and call `ManifestBatchPublisher::publish` once at the end. Cross-table OCC enforced inside the publisher's row-level CAS; no staging branches, no run state machine. - **HTTP server**: Axum + utoipa OpenAPI, bearer auth (SHA-256 hashed, optional AWS Secrets Manager), Cedar policy gating. - **CLI** driven by a single `omnigraph.yaml`; multi-format output (json/jsonl/csv/kv/table). @@ -77,7 +77,7 @@ Full diagram and concurrency model: [docs/architecture.md](docs/architecture.md) | Indexes (BTREE / inverted / vector / graph topology) | [docs/indexes.md](docs/indexes.md) | | Embeddings (compiler + engine clients, env vars, `@embed`) | [docs/embeddings.md](docs/embeddings.md) | | Branches, commit graph, snapshots, system branches | [docs/branches-commits.md](docs/branches-commits.md) | -| Runs (transactional graph mutations, `__run__`, publish paths) | [docs/runs.md](docs/runs.md) | +| Direct-publish writes (the former Run state machine, now demoted to publisher CAS) | [docs/runs.md](docs/runs.md) | | Three-way merge and conflict kinds | [docs/merge.md](docs/merge.md) | | Diff / change feed (`diff_between`, `diff_commits`) | [docs/changes.md](docs/changes.md) | | Query execution, mutation execution, bulk loader, `load` vs `ingest` | [docs/execution.md](docs/execution.md) | @@ -211,13 +211,13 @@ omnigraph policy explain --actor act-alice --action change --branch main | Query language | — | `.gq` + Pest grammar + IR + lowering + linter | | Schema migration planning | — | `plan_schema_migration` + `apply_schema` step types + `__schema_apply_lock__` | | Commit graph (DAG) across whole repo | — | `_graph_commits.lance` with linear + merge parents, ULID ids, actor map | -| Transactional runs | — | `_graph_runs.lance`, `__run__` ephemeral branches, fast-path & merge-path publish | +| Per-query atomic writes | — | `MutationStaging` accumulator + `commit_with_expected` publisher CAS, single commit per `mutate_as` / `load` | | Three-way row-level merge | — | `OrderedTableCursor` + `StagedTableWriter`, structured `MergeConflictKind` | | Change feeds | — | `diff_between` / `diff_commits` with manifest fast path + ID streaming | -| Cedar policy | — | 10 actions, branch / target_branch / protected scopes, validate/test/explain CLI | +| Cedar policy | — | 8 actions, branch / target_branch / protected scopes, validate/test/explain CLI | | HTTP server | — | Axum, OpenAPI via utoipa, bearer auth (SHA-256, AWS Secrets Manager option), policy gating, NDJSON streaming export | | CLI with config | — | `omnigraph.yaml`, aliases, multi-format output (json/jsonl/csv/kv/table) | -| Audit / actor tracking | — | `_as` write APIs + actor maps in commit & run datasets | +| Audit / actor tracking | — | `_as` write APIs + actor map in commit graph | | Local RustFS bootstrap | — | `scripts/local-rustfs-bootstrap.sh` one-shot S3-backed dev environment | --- diff --git a/crates/omnigraph-cli/src/main.rs b/crates/omnigraph-cli/src/main.rs index 03d1d14..f58fb1b 100644 --- a/crates/omnigraph-cli/src/main.rs +++ b/crates/omnigraph-cli/src/main.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; use clap::{Arg, ArgAction, Args, CommandFactory, FromArgMatches, Parser, Subcommand, ValueEnum}; use color_eyre::eyre::{Result, bail}; -use omnigraph::db::{Omnigraph, ReadTarget, RunId, SnapshotId}; +use omnigraph::db::{Omnigraph, ReadTarget, SnapshotId}; use omnigraph::loader::LoadMode; use omnigraph_compiler::query::parser::parse_query; use omnigraph_compiler::schema::parser::parse_schema; @@ -18,9 +18,8 @@ use omnigraph_server::api::{ BranchCreateOutput, BranchCreateRequest, BranchDeleteOutput, BranchListOutput, BranchMergeOutput, BranchMergeRequest, ChangeOutput, ChangeRequest, CommitListOutput, CommitOutput, ErrorOutput, ExportRequest, IngestOutput, IngestRequest, ReadOutput, ReadRequest, - RunListOutput, RunOutput, SchemaApplyOutput, SchemaApplyRequest, SchemaOutput, SnapshotOutput, - SnapshotTableOutput, commit_output, ingest_output, read_output, run_output, - schema_apply_output, snapshot_payload, + SchemaApplyOutput, SchemaApplyRequest, SchemaOutput, SnapshotOutput, SnapshotTableOutput, + commit_output, ingest_output, read_output, schema_apply_output, snapshot_payload, }; use omnigraph_server::{ AliasCommand, OmnigraphConfig, PolicyAction, PolicyDecision, PolicyEngine, PolicyRequest, @@ -143,11 +142,6 @@ enum Command { #[arg(long = "table")] table_keys: Vec, }, - /// Run operations - Run { - #[command(subcommand)] - command: RunCommand, - }, /// Commit history operations Commit { #[command(subcommand)] @@ -370,60 +364,6 @@ enum QueryCommand { }, } -#[derive(Debug, Subcommand)] -enum RunCommand { - /// List transactional runs - List { - /// Repo URI - uri: Option, - #[arg(long)] - target: Option, - #[arg(long)] - config: Option, - #[arg(long)] - json: bool, - }, - /// Show a transactional run - Show { - /// Repo URI - #[arg(long)] - uri: Option, - #[arg(long)] - target: Option, - #[arg(long)] - config: Option, - run_id: String, - #[arg(long)] - json: bool, - }, - /// Publish a transactional run - Publish { - /// Repo URI - #[arg(long)] - uri: Option, - #[arg(long)] - target: Option, - #[arg(long)] - config: Option, - run_id: String, - #[arg(long)] - json: bool, - }, - /// Abort a transactional run - Abort { - /// Repo URI - #[arg(long)] - uri: Option, - #[arg(long)] - target: Option, - #[arg(long)] - config: Option, - run_id: String, - #[arg(long)] - json: bool, - }, -} - #[derive(Debug, Subcommand)] enum CommitCommand { /// List graph commits @@ -1214,42 +1154,6 @@ fn print_change_human(output: &ChangeOutput) { } } -fn print_run_list_human(runs: &[RunOutput]) { - for run in runs { - println!( - "{} {} target={} branch={}{}", - run.run_id, - run.status, - run.target_branch, - run.run_branch, - run.actor_id - .as_deref() - .map(|actor| format!(" actor={}", actor)) - .unwrap_or_default() - ); - } -} - -fn print_run_human(run: &RunOutput) { - println!("run_id: {}", run.run_id); - println!("status: {}", run.status); - println!("target_branch: {}", run.target_branch); - println!("run_branch: {}", run.run_branch); - println!("base_snapshot_id: {}", run.base_snapshot_id); - println!("base_manifest_version: {}", run.base_manifest_version); - if let Some(actor_id) = &run.actor_id { - println!("actor_id: {}", actor_id); - } - if let Some(operation_hash) = &run.operation_hash { - println!("operation_hash: {}", operation_hash); - } - if let Some(snapshot_id) = &run.published_snapshot_id { - println!("published_snapshot_id: {}", snapshot_id); - } - println!("created_at: {}", run.created_at); - println!("updated_at: {}", run.updated_at); -} - fn print_commit_list_human(commits: &[CommitOutput]) { for commit in commits { let branch = commit.manifest_branch.as_deref().unwrap_or("main"); @@ -2190,133 +2094,6 @@ async fn main() -> Result<()> { .await?; } } - Command::Run { command } => match command { - RunCommand::List { - uri, - target, - config, - json, - } => { - let config = load_cli_config(config.as_ref())?; - let bearer_token = - resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; - let runs = if is_remote_uri(&uri) { - remote_json::( - &http_client, - Method::GET, - remote_url(&uri, "/runs"), - None, - bearer_token.as_deref(), - ) - .await? - .runs - } else { - let db = Omnigraph::open(&uri).await?; - db.list_runs() - .await? - .iter() - .map(run_output) - .collect::>() - }; - if json { - print_json(&RunListOutput { runs })?; - } else { - print_run_list_human(&runs); - } - } - RunCommand::Show { - uri, - target, - config, - run_id, - json, - } => { - let config = load_cli_config(config.as_ref())?; - let bearer_token = - resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; - let run = if is_remote_uri(&uri) { - remote_json::( - &http_client, - Method::GET, - remote_url(&uri, &format!("/runs/{}", run_id)), - None, - bearer_token.as_deref(), - ) - .await? - } else { - let db = Omnigraph::open(&uri).await?; - run_output(&db.get_run(&RunId::new(run_id)).await?) - }; - if json { - print_json(&run)?; - } else { - print_run_human(&run); - } - } - RunCommand::Publish { - uri, - target, - config, - run_id, - json, - } => { - let config = load_cli_config(config.as_ref())?; - let bearer_token = - resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; - let run = if is_remote_uri(&uri) { - remote_json::( - &http_client, - Method::POST, - remote_url(&uri, &format!("/runs/{}/publish", run_id)), - Some(serde_json::json!({})), - bearer_token.as_deref(), - ) - .await? - } else { - let mut db = Omnigraph::open(&uri).await?; - db.publish_run(&RunId::new(run_id.clone())).await?; - run_output(&db.get_run(&RunId::new(run_id)).await?) - }; - if json { - print_json(&run)?; - } else { - print_run_human(&run); - } - } - RunCommand::Abort { - uri, - target, - config, - run_id, - json, - } => { - let config = load_cli_config(config.as_ref())?; - let bearer_token = - resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; - let run = if is_remote_uri(&uri) { - remote_json::( - &http_client, - Method::POST, - remote_url(&uri, &format!("/runs/{}/abort", run_id)), - Some(serde_json::json!({})), - bearer_token.as_deref(), - ) - .await? - } else { - let mut db = Omnigraph::open(&uri).await?; - run_output(&db.abort_run(&RunId::new(run_id)).await?) - }; - if json { - print_json(&run)?; - } else { - print_run_human(&run); - } - } - }, Command::Read { uri, legacy_uri, diff --git a/crates/omnigraph-cli/tests/cli.rs b/crates/omnigraph-cli/tests/cli.rs index 8b710c5..b1c1398 100644 --- a/crates/omnigraph-cli/tests/cli.rs +++ b/crates/omnigraph-cli/tests/cli.rs @@ -29,7 +29,7 @@ rules: - id: admins-promote allow: actors: { group: admins } - actions: [branch_merge, run_publish] + actions: [branch_merge] target_branch_scope: protected "#; @@ -1905,119 +1905,7 @@ fn cli_fails_for_invalid_merge_requests() { ); } -#[test] -fn run_list_and_show_report_published_runs() { - let temp = tempdir().unwrap(); - let repo = repo_path(temp.path()); - init_repo(&repo); - load_fixture(&repo); - - let list_output = output_success(cli().arg("run").arg("list").arg(&repo).arg("--json")); - let list_payload: Value = serde_json::from_slice(&list_output.stdout).unwrap(); - let runs = list_payload["runs"].as_array().unwrap(); - assert_eq!(runs.len(), 1); - assert_eq!(runs[0]["status"], "published"); - let run_id = runs[0]["run_id"].as_str().unwrap(); - - let show_output = output_success( - cli() - .arg("run") - .arg("show") - .arg("--uri") - .arg(&repo) - .arg(run_id) - .arg("--json"), - ); - let show_payload: Value = serde_json::from_slice(&show_output.stdout).unwrap(); - assert_eq!(show_payload["run_id"], run_id); - assert_eq!(show_payload["status"], "published"); - assert_eq!(show_payload["target_branch"], "main"); -} - -#[test] -fn run_list_can_resolve_uri_from_config() { - let temp = tempdir().unwrap(); - let repo = repo_path(temp.path()); - let config = temp.path().join("omnigraph.yaml"); - init_repo(&repo); - load_fixture(&repo); - write_config(&config, &local_yaml_config(&repo)); - - let output = output_success( - cli() - .arg("run") - .arg("list") - .arg("--config") - .arg(&config) - .arg("--json"), - ); - let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); - assert_eq!(payload["runs"].as_array().unwrap().len(), 1); -} - -#[test] -fn run_publish_promotes_manual_running_run() { - let runtime = tokio::runtime::Runtime::new().unwrap(); - let temp = tempdir().unwrap(); - let repo = repo_path(temp.path()); - init_repo(&repo); - load_fixture(&repo); - - let run_id = runtime.block_on(begin_manual_run(&repo, "main")); - - let publish_output = output_success( - cli() - .arg("run") - .arg("publish") - .arg("--uri") - .arg(&repo) - .arg(&run_id) - .arg("--json"), - ); - let payload: Value = serde_json::from_slice(&publish_output.stdout).unwrap(); - assert_eq!(payload["run_id"], run_id); - assert_eq!(payload["status"], "published"); - assert!(payload["published_snapshot_id"].is_string()); - - let runtime = tokio::runtime::Runtime::new().unwrap(); - runtime.block_on(async { - let db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); - let result = db - .query( - ReadTarget::branch("main"), - include_str!("../../omnigraph/tests/fixtures/test.gq"), - "get_person", - &omnigraph_compiler::ir::ParamMap::from([( - "name".to_string(), - omnigraph_compiler::query::ast::Literal::String("Eve".to_string()), - )]), - ) - .await - .unwrap(); - assert_eq!(result.num_rows(), 1); - }); -} - -#[test] -fn run_abort_marks_manual_running_run_aborted() { - let runtime = tokio::runtime::Runtime::new().unwrap(); - let temp = tempdir().unwrap(); - let repo = repo_path(temp.path()); - init_repo(&repo); - load_fixture(&repo); - - let run_id = runtime.block_on(begin_manual_run(&repo, "main")); - - let abort_output = output_success( - cli() - .arg("run") - .arg("abort") - .arg("--uri") - .arg(&repo) - .arg(&run_id) - .arg("--json"), - ); - let payload: Value = serde_json::from_slice(&abort_output.stdout).unwrap(); - assert_eq!(payload["run_id"], run_id); - assert_eq!(payload["status"], "aborted"); -} +// MR-771: `omnigraph run list/show/publish/abort` subcommands removed +// alongside the run state machine. Direct-to-target writes leave nothing +// for these CLIs to manage. Audit history is now visible via +// `omnigraph commit list` reading the commit graph. diff --git a/crates/omnigraph-cli/tests/support/mod.rs b/crates/omnigraph-cli/tests/support/mod.rs index 4d3a8b3..31092ea 100644 --- a/crates/omnigraph-cli/tests/support/mod.rs +++ b/crates/omnigraph-cli/tests/support/mod.rs @@ -8,8 +8,6 @@ use std::thread::sleep; use std::time::Duration; use assert_cmd::Command; -use omnigraph::db::Omnigraph; -use omnigraph::loader::LoadMode; use reqwest::blocking::Client; use serde_json::Value; use tempfile::{TempDir, tempdir}; @@ -223,21 +221,6 @@ pub fn spawn_server_with_config_env(config: &Path, envs: &[(&str, &str)]) -> Tes spawn_server_process(command) } -pub async fn begin_manual_run(repo: &Path, target_branch: &str) -> String { - let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); - let run = db - .begin_run(target_branch, Some("cli-test-run")) - .await - .unwrap(); - db.load( - &run.run_branch, - r#"{"type":"Person","data":{"name":"Eve","age":29}}"#, - LoadMode::Append, - ) - .await - .unwrap(); - run.run_id.as_str().to_string() -} pub struct SystemRepo { _temp: TempDir, diff --git a/crates/omnigraph-cli/tests/system_local.rs b/crates/omnigraph-cli/tests/system_local.rs index 7128fd6..f5e75eb 100644 --- a/crates/omnigraph-cli/tests/system_local.rs +++ b/crates/omnigraph-cli/tests/system_local.rs @@ -2,12 +2,7 @@ mod support; use std::env; use std::fs; -use std::process::Stdio; -use std::thread::sleep; -use std::time::Duration; -use omnigraph::db::Omnigraph; -use omnigraph::loader::LoadMode; use reqwest::blocking::Client; use serde_json::Value; @@ -33,7 +28,7 @@ rules: - id: admins-promote allow: actors: { group: admins } - actions: [branch_merge, run_publish] + actions: [branch_merge] target_branch_scope: protected "#; @@ -117,42 +112,6 @@ fn snapshot_table_row_count_at(repo: &std::path::Path, table_key: &str) -> u64 { .unwrap() } -fn wait_for_running_run(repo: &SystemRepo) -> String { - let runtime = tokio::runtime::Runtime::new().unwrap(); - for _ in 0..200 { - let running = runtime.block_on(async { - let db = Omnigraph::open(repo.path().to_str().unwrap()) - .await - .unwrap(); - db.list_runs() - .await - .unwrap() - .into_iter() - .find(|run| run.target_branch == "main" && run.status.as_str() == "running") - .map(|run| run.run_id.to_string()) - }); - if let Some(run_id) = running { - return run_id; - } - sleep(Duration::from_millis(50)); - } - - panic!("timed out waiting for running run"); -} - -fn bulk_people_jsonl(count: usize) -> String { - let mut rows = String::new(); - for index in 0..count { - rows.push_str(&format!( - r#"{{"type":"Person","data":{{"name":"Bulk{:05}","age":{}}}}}"#, - index, - 20 + (index % 50) - )); - rows.push('\n'); - } - rows -} - fn gemini_base_url() -> String { env::var("OMNIGRAPH_GEMINI_BASE_URL") .ok() @@ -348,15 +307,17 @@ fn local_cli_end_to_end_branch_change_merge_flow() { assert_eq!(main_read["row_count"], 1); assert_eq!(main_read["rows"][0]["p.name"], "Zoe"); - let runs_payload = parse_stdout_json(&output_success( - cli().arg("run").arg("list").arg(repo.path()).arg("--json"), + // MR-771: `omnigraph run list` removed. Audit visible via commit list. + let commits_payload = parse_stdout_json(&output_success( + cli() + .arg("commit") + .arg("list") + .arg(repo.path()) + .arg("--branch") + .arg("main") + .arg("--json"), )); - let runs = runs_payload["runs"].as_array().unwrap(); - assert!(runs.len() >= 2); - assert!( - runs.iter() - .any(|run| run["target_branch"] == "feature" && run["status"] == "published") - ); + assert!(commits_payload["commits"].as_array().unwrap().len() >= 2); } #[test] @@ -636,17 +597,8 @@ fn local_cli_failed_load_keeps_target_state_unchanged() { snapshot_table_row_count(&repo, "edge:Knows"), knows_rows_before ); - - let runs_payload = parse_stdout_json(&output_success( - cli().arg("run").arg("list").arg(repo.path()).arg("--json"), - )); - assert!( - runs_payload["runs"] - .as_array() - .unwrap() - .iter() - .any(|run| run["target_branch"] == "main" && run["status"] == "failed") - ); + // MR-771: failed loads no longer leave a RunRecord. The atomicity + // guarantee is verified above (target tables are unchanged). } #[test] @@ -679,17 +631,9 @@ fn local_cli_failed_change_keeps_target_state_unchanged() { .arg("--json"), )); assert_eq!(friends_payload["row_count"], 2); - - let runs_payload = parse_stdout_json(&output_success( - cli().arg("run").arg("list").arg(repo.path()).arg("--json"), - )); - assert!( - runs_payload["runs"] - .as_array() - .unwrap() - .iter() - .any(|run| run["target_branch"] == "main" && run["status"] == "failed") - ); + // MR-771: failed mutations no longer leave a RunRecord. The atomicity + // guarantee is verified above (the friends_of read above shows main + // unchanged). } #[test] @@ -997,102 +941,12 @@ query vector_search($q: String) { assert_eq!(result["rows"][0]["d.slug"], "alpha-doc"); } -#[test] -fn local_cli_transactional_load_drift_fails_without_partial_publish() { - let repo = SystemRepo::loaded(); - let large_data = repo.write_jsonl("system-large-load.jsonl", &bulk_people_jsonl(250_000)); - let person_rows_before = snapshot_table_row_count(&repo, "node:Person"); - - let mut load = cli_process(); - load.arg("load") - .arg("--data") - .arg(&large_data) - .arg("--mode") - .arg("merge") - .arg(repo.path()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()); - let child = load.spawn().unwrap(); - - let run_id = wait_for_running_run(&repo); - - tokio::runtime::Runtime::new().unwrap().block_on(async { - let mut db = Omnigraph::open(repo.path().to_str().unwrap()) - .await - .unwrap(); - let interloper = db - .begin_run("main", Some("system-test-interloper")) - .await - .unwrap(); - db.load( - interloper.run_branch.as_str(), - r#"{"type":"Person","data":{"name":"Interloper","age":41}}"#, - LoadMode::Append, - ) - .await - .unwrap(); - db.publish_run(&interloper.run_id).await.unwrap(); - }); - - let output = child.wait_with_output().unwrap(); - assert!( - !output.status.success(), - "load unexpectedly succeeded\nstdout:\n{}\nstderr:\n{}", - String::from_utf8_lossy(&output.stdout), - String::from_utf8_lossy(&output.stderr) - ); - let stderr = String::from_utf8(output.stderr).unwrap(); - assert!( - stderr.contains("advanced during transactional load") - || stderr.contains("version drift") - || stderr.contains("retry"), - "unexpected load failure: {stderr}" - ); - - let run_payload = parse_stdout_json(&output_success( - cli() - .arg("run") - .arg("show") - .arg("--uri") - .arg(repo.path()) - .arg(&run_id) - .arg("--json"), - )); - assert_eq!(run_payload["status"], "failed"); - - assert_eq!( - snapshot_table_row_count(&repo, "node:Person"), - person_rows_before + 1 - ); - - let interloper = parse_stdout_json(&output_success( - cli() - .arg("read") - .arg(repo.path()) - .arg("--query") - .arg(fixture("test.gq")) - .arg("--name") - .arg("get_person") - .arg("--params") - .arg(r#"{"name":"Interloper"}"#) - .arg("--json"), - )); - assert_eq!(interloper["row_count"], 1); - - let bulk_row = parse_stdout_json(&output_success( - cli() - .arg("read") - .arg(repo.path()) - .arg("--query") - .arg(fixture("test.gq")) - .arg("--name") - .arg("get_person") - .arg("--params") - .arg(r#"{"name":"Bulk00000"}"#) - .arg("--json"), - )); - assert_eq!(bulk_row["row_count"], 0); -} +// MR-771: the publisher CAS conflict shape is verified end-to-end at the +// engine level in `crates/omnigraph/tests/runs.rs::concurrent_writers_one_succeeds_one_gets_expected_version_mismatch` +// and at the HTTP boundary in +// `crates/omnigraph-server/tests/server.rs::change_conflict_returns_manifest_conflict_409`. +// The pre-MR-771 CLI-level race was timing-dependent; with direct-publish +// the surface is the same engine path the unit test already covers. #[test] fn local_cli_policy_tooling_is_end_to_end_while_local_writes_stay_unenforced() { diff --git a/crates/omnigraph-cli/tests/system_remote.rs b/crates/omnigraph-cli/tests/system_remote.rs index bb3e5fd..cecd725 100644 --- a/crates/omnigraph-cli/tests/system_remote.rs +++ b/crates/omnigraph-cli/tests/system_remote.rs @@ -33,7 +33,7 @@ rules: - id: admins-promote allow: actors: { group: admins } - actions: [branch_merge, run_publish] + actions: [branch_merge] target_branch_scope: protected "#; @@ -192,30 +192,9 @@ query insert_person($name: String, $age: I32) { assert_eq!(local_verify["row_count"], 1); assert_eq!(local_verify["rows"][0]["p.name"], "Mina"); - let manual_run = tokio::runtime::Runtime::new() - .unwrap() - .block_on(begin_manual_run(repo.path(), "main")); - let publish_payload = parse_stdout_json(&output_success( - cli() - .arg("run") - .arg("publish") - .arg("--config") - .arg(&config) - .arg(&manual_run) - .arg("--json"), - )); - assert_eq!(publish_payload["run_id"], manual_run); - assert_eq!(publish_payload["status"], "published"); - - let runs_payload = parse_stdout_json(&output_success( - cli() - .arg("run") - .arg("list") - .arg("--config") - .arg(&config) - .arg("--json"), - )); - assert!(runs_payload["runs"].as_array().unwrap().len() >= 2); + // MR-771: `run publish` / `run list` removed. Direct-to-target writes + // already landed via the change call above; the commit graph is now + // the audit surface (verified separately by `commit list`). } #[test] diff --git a/crates/omnigraph-server/src/api.rs b/crates/omnigraph-server/src/api.rs index abb9a2d..9dd45ee 100644 --- a/crates/omnigraph-server/src/api.rs +++ b/crates/omnigraph-server/src/api.rs @@ -1,6 +1,4 @@ -use omnigraph::db::{ - GraphCommit, MergeOutcome, ReadTarget, RunRecord, SchemaApplyResult, Snapshot, -}; +use omnigraph::db::{GraphCommit, MergeOutcome, ReadTarget, SchemaApplyResult, Snapshot}; use omnigraph::error::{MergeConflict, MergeConflictKind}; use omnigraph::loader::{IngestResult, LoadMode}; use omnigraph_compiler::SchemaMigrationStep; @@ -41,30 +39,6 @@ pub struct SnapshotOutput { pub tables: Vec, } -#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] -pub struct RunOutput { - pub run_id: String, - pub target_branch: String, - pub run_branch: String, - pub base_snapshot_id: String, - pub base_manifest_version: u64, - pub operation_hash: Option, - pub actor_id: Option, - pub status: String, - pub published_snapshot_id: Option, - /// Run creation time as Unix epoch microseconds. - #[schema(example = 1714000000000000i64)] - pub created_at: i64, - /// Last status change as Unix epoch microseconds. - #[schema(example = 1714000000000000i64)] - pub updated_at: i64, -} - -#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] -pub struct RunListOutput { - pub runs: Vec, -} - #[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] pub struct BranchCreateRequest { /// Parent branch to fork from. Defaults to `main`. @@ -368,6 +342,17 @@ pub enum ErrorCode { Internal, } +/// Structured details for a publisher-level OCC failure. Surfaces alongside +/// HTTP 409 when a write was rejected because the caller's pre-write view of +/// one table's manifest version was stale relative to the current head. The +/// expected/actual fields tell the client which table to refresh. +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct ManifestConflictOutput { + pub table_key: String, + pub expected: u64, + pub actual: u64, +} + #[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] pub struct ErrorOutput { pub error: String, @@ -375,6 +360,12 @@ pub struct ErrorOutput { pub code: Option, #[serde(default, skip_serializing_if = "Vec::is_empty")] pub merge_conflicts: Vec, + /// Set when the conflict is a publisher CAS rejection + /// (`ManifestConflictDetails::ExpectedVersionMismatch`). The caller's + /// pre-write view of `table_key` was at version `expected` but the + /// manifest is now at `actual`. Refresh and retry. + #[serde(skip_serializing_if = "Option::is_none")] + pub manifest_conflict: Option, } pub fn snapshot_payload(branch: &str, snapshot: &Snapshot) -> SnapshotOutput { @@ -408,22 +399,6 @@ pub fn schema_apply_output(uri: &str, result: SchemaApplyResult) -> SchemaApplyO } } -pub fn run_output(run: &RunRecord) -> RunOutput { - RunOutput { - run_id: run.run_id.as_str().to_string(), - target_branch: run.target_branch.clone(), - run_branch: run.run_branch.clone(), - base_snapshot_id: run.base_snapshot_id.as_str().to_string(), - base_manifest_version: run.base_manifest_version, - operation_hash: run.operation_hash.clone(), - actor_id: run.actor_id.clone(), - status: run.status.as_str().to_string(), - published_snapshot_id: run.published_snapshot_id.clone(), - created_at: run.created_at, - updated_at: run.updated_at, - } -} - pub fn commit_output(commit: &GraphCommit) -> CommitOutput { CommitOutput { graph_commit_id: commit.graph_commit_id.clone(), diff --git a/crates/omnigraph-server/src/lib.rs b/crates/omnigraph-server/src/lib.rs index 19d374f..b18f2b7 100644 --- a/crates/omnigraph-server/src/lib.rs +++ b/crates/omnigraph-server/src/lib.rs @@ -14,8 +14,8 @@ use api::{ BranchCreateOutput, BranchCreateRequest, BranchDeleteOutput, BranchListOutput, BranchMergeOutput, BranchMergeRequest, ChangeOutput, ChangeRequest, CommitListOutput, CommitListQuery, ErrorCode, ErrorOutput, ExportRequest, HealthOutput, IngestOutput, - IngestRequest, ReadOutput, ReadRequest, RunListOutput, SchemaApplyOutput, SchemaApplyRequest, - SchemaOutput, SnapshotQuery, ingest_output, schema_apply_output, snapshot_payload, + IngestRequest, ReadOutput, ReadRequest, SchemaApplyOutput, SchemaApplyRequest, SchemaOutput, + SnapshotQuery, ingest_output, schema_apply_output, snapshot_payload, }; use axum::body::{Body, Bytes}; use axum::extract::DefaultBodyLimit; @@ -33,8 +33,8 @@ pub use config::{ load_config, }; use futures::stream; -use omnigraph::db::{Omnigraph, ReadTarget, RunId}; -use omnigraph::error::{ManifestErrorKind, OmniError}; +use omnigraph::db::{Omnigraph, ReadTarget}; +use omnigraph::error::{ManifestConflictDetails, ManifestErrorKind, OmniError}; use omnigraph_compiler::json_params_to_param_map; use omnigraph_compiler::query::parser::parse_query; use omnigraph_compiler::{JsonParamMode, ParamMap}; @@ -82,10 +82,6 @@ fn hash_bearer_token(token: &str) -> BearerTokenHash { server_branch_create, server_branch_delete, server_branch_merge, - server_run_list, - server_run_show, - server_run_publish, - server_run_abort, server_commit_list, server_commit_show, ), @@ -159,6 +155,7 @@ pub struct ApiError { code: ErrorCode, message: String, merge_conflicts: Vec, + manifest_conflict: Option, } impl AppState { @@ -280,6 +277,7 @@ impl ApiError { code: ErrorCode::Unauthorized, message: message.into(), merge_conflicts: Vec::new(), + manifest_conflict: None, } } @@ -289,6 +287,7 @@ impl ApiError { code: ErrorCode::Forbidden, message: message.into(), merge_conflicts: Vec::new(), + manifest_conflict: None, } } @@ -298,6 +297,7 @@ impl ApiError { code: ErrorCode::BadRequest, message: message.into(), merge_conflicts: Vec::new(), + manifest_conflict: None, } } @@ -307,6 +307,7 @@ impl ApiError { code: ErrorCode::NotFound, message: message.into(), merge_conflicts: Vec::new(), + manifest_conflict: None, } } @@ -316,6 +317,7 @@ impl ApiError { code: ErrorCode::Conflict, message: message.into(), merge_conflicts: Vec::new(), + manifest_conflict: None, } } @@ -325,6 +327,7 @@ impl ApiError { code: ErrorCode::Internal, message: message.into(), merge_conflicts: Vec::new(), + manifest_conflict: None, } } @@ -334,6 +337,20 @@ impl ApiError { code: ErrorCode::Conflict, message: summarize_merge_conflicts(&conflicts), merge_conflicts: conflicts, + manifest_conflict: None, + } + } + + fn manifest_version_conflict( + message: String, + details: api::ManifestConflictOutput, + ) -> Self { + Self { + status: StatusCode::CONFLICT, + code: ErrorCode::Conflict, + message, + merge_conflicts: Vec::new(), + manifest_conflict: Some(details), } } @@ -344,7 +361,21 @@ impl ApiError { OmniError::Manifest(err) => match err.kind { ManifestErrorKind::BadRequest => Self::bad_request(err.message), ManifestErrorKind::NotFound => Self::not_found(err.message), - ManifestErrorKind::Conflict => Self::conflict(err.message), + ManifestErrorKind::Conflict => match err.details { + Some(ManifestConflictDetails::ExpectedVersionMismatch { + table_key, + expected, + actual, + }) => Self::manifest_version_conflict( + err.message, + api::ManifestConflictOutput { + table_key, + expected, + actual, + }, + ), + _ => Self::conflict(err.message), + }, ManifestErrorKind::Internal => Self::internal(err.message), }, OmniError::MergeConflicts(conflicts) => Self::merge_conflict( @@ -395,6 +426,7 @@ impl IntoResponse for ApiError { error: self.message, code: Some(self.code), merge_conflicts: self.merge_conflicts, + manifest_conflict: self.manifest_conflict, }), ) .into_response() @@ -443,10 +475,6 @@ pub fn build_app(state: AppState) -> Router { ) .route("/branches/{branch}", delete(server_branch_delete)) .route("/branches/merge", post(server_branch_merge)) - .route("/runs", get(server_run_list)) - .route("/runs/{run_id}", get(server_run_show)) - .route("/runs/{run_id}/publish", post(server_run_publish)) - .route("/runs/{run_id}/abort", post(server_run_abort)) .route("/commits", get(server_commit_list)) .route("/commits/{commit_id}", get(server_commit_show)) .route_layer(middleware::from_fn_with_state( @@ -1219,203 +1247,6 @@ async fn server_branch_merge( })) } -#[utoipa::path( - get, - path = "/runs", - tag = "runs", - operation_id = "listRuns", - responses( - (status = 200, description = "List of runs", body = RunListOutput), - (status = 401, description = "Unauthorized", body = ErrorOutput), - (status = 403, description = "Forbidden", body = ErrorOutput), - ), - security(("bearer_token" = [])), -)] -/// List all runs. -/// -/// A run is an ephemeral branch produced by an agent or background job. The -/// list includes pending, in-progress, published, and aborted runs across -/// all target branches. Read-only. -async fn server_run_list( - State(state): State, - actor: Option>, -) -> std::result::Result, ApiError> { - authorize_request( - &state, - actor.as_ref().map(|Extension(actor)| actor), - PolicyRequest { - actor_id: actor - .as_ref() - .map(|Extension(actor)| actor.as_str().to_string()) - .unwrap_or_default(), - action: PolicyAction::Read, - branch: None, - target_branch: None, - }, - )?; - let runs = { - let db = Arc::clone(&state.db).read_owned().await; - db.list_runs().await.map_err(ApiError::from_omni)? - }; - Ok(Json(RunListOutput { - runs: runs.iter().map(api::run_output).collect(), - })) -} - -#[utoipa::path( - get, - path = "/runs/{run_id}", - tag = "runs", - operation_id = "getRun", - params( - ("run_id" = String, Path, description = "Run identifier"), - ), - responses( - (status = 200, description = "Run details", body = api::RunOutput), - (status = 401, description = "Unauthorized", body = ErrorOutput), - (status = 403, description = "Forbidden", body = ErrorOutput), - (status = 404, description = "Run not found", body = ErrorOutput), - ), - security(("bearer_token" = [])), -)] -/// Get a single run. -/// -/// Returns the run's status, target/run branches, base snapshot, and (if -/// published) the resulting snapshot id. Read-only. -async fn server_run_show( - State(state): State, - actor: Option>, - Path(run_id): Path, -) -> std::result::Result, ApiError> { - authorize_request( - &state, - actor.as_ref().map(|Extension(actor)| actor), - PolicyRequest { - actor_id: actor - .as_ref() - .map(|Extension(actor)| actor.as_str().to_string()) - .unwrap_or_default(), - action: PolicyAction::Read, - branch: None, - target_branch: None, - }, - )?; - let run = { - let db = Arc::clone(&state.db).read_owned().await; - db.get_run(&RunId::new(run_id)) - .await - .map_err(ApiError::from_omni)? - }; - Ok(Json(api::run_output(&run))) -} - -#[utoipa::path( - post, - path = "/runs/{run_id}/publish", - tag = "runs", - operation_id = "publishRun", - params( - ("run_id" = String, Path, description = "Run identifier"), - ), - responses( - (status = 200, description = "Run published", body = api::RunOutput), - (status = 401, description = "Unauthorized", body = ErrorOutput), - (status = 403, description = "Forbidden", body = ErrorOutput), - (status = 404, description = "Run not found", body = ErrorOutput), - ), - security(("bearer_token" = [])), -)] -/// Publish a run to its target branch. -/// -/// Promotes the run's snapshot onto its `target_branch` as a new commit. The -/// run must be in a publishable state. **Destructive** to the target branch. -async fn server_run_publish( - State(state): State, - actor: Option>, - Path(run_id): Path, -) -> std::result::Result, ApiError> { - let run_id = RunId::new(run_id); - let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str()); - let target_branch = { - let db = Arc::clone(&state.db).read_owned().await; - db.get_run(&run_id) - .await - .map_err(ApiError::from_omni)? - .target_branch - }; - authorize_request( - &state, - actor.as_ref().map(|Extension(actor)| actor), - PolicyRequest { - actor_id: actor_id.map(str::to_string).unwrap_or_default(), - action: PolicyAction::RunPublish, - branch: None, - target_branch: Some(target_branch), - }, - )?; - let run = { - let mut db = Arc::clone(&state.db).write_owned().await; - db.publish_run_as(&run_id, actor_id) - .await - .map_err(ApiError::from_omni)?; - db.get_run(&run_id).await.map_err(ApiError::from_omni)? - }; - Ok(Json(api::run_output(&run))) -} - -#[utoipa::path( - post, - path = "/runs/{run_id}/abort", - tag = "runs", - operation_id = "abortRun", - params( - ("run_id" = String, Path, description = "Run identifier"), - ), - responses( - (status = 200, description = "Run aborted", body = api::RunOutput), - (status = 401, description = "Unauthorized", body = ErrorOutput), - (status = 403, description = "Forbidden", body = ErrorOutput), - (status = 404, description = "Run not found", body = ErrorOutput), - ), - security(("bearer_token" = [])), -)] -/// Abort a run. -/// -/// Marks the run as aborted and releases its working branch. **Irreversible**: -/// the run cannot be resumed once aborted. -async fn server_run_abort( - State(state): State, - actor: Option>, - Path(run_id): Path, -) -> std::result::Result, ApiError> { - let run_id = RunId::new(run_id); - let target_branch = { - let db = Arc::clone(&state.db).read_owned().await; - db.get_run(&run_id) - .await - .map_err(ApiError::from_omni)? - .target_branch - }; - authorize_request( - &state, - actor.as_ref().map(|Extension(actor)| actor), - PolicyRequest { - actor_id: actor - .as_ref() - .map(|Extension(actor)| actor.as_str().to_string()) - .unwrap_or_default(), - action: PolicyAction::RunAbort, - branch: None, - target_branch: Some(target_branch), - }, - )?; - let run = { - let mut db = Arc::clone(&state.db).write_owned().await; - db.abort_run(&run_id).await.map_err(ApiError::from_omni)? - }; - Ok(Json(api::run_output(&run))) -} - #[utoipa::path( get, path = "/commits", diff --git a/crates/omnigraph-server/src/policy.rs b/crates/omnigraph-server/src/policy.rs index e6b43bc..4cf6412 100644 --- a/crates/omnigraph-server/src/policy.rs +++ b/crates/omnigraph-server/src/policy.rs @@ -23,8 +23,6 @@ pub enum PolicyAction { BranchCreate, BranchDelete, BranchMerge, - RunPublish, - RunAbort, Admin, } @@ -38,8 +36,6 @@ impl PolicyAction { Self::BranchCreate => "branch_create", Self::BranchDelete => "branch_delete", Self::BranchMerge => "branch_merge", - Self::RunPublish => "run_publish", - Self::RunAbort => "run_abort", Self::Admin => "admin", } } @@ -51,12 +47,7 @@ impl PolicyAction { fn uses_target_branch_scope(self) -> bool { matches!( self, - Self::BranchCreate - | Self::SchemaApply - | Self::BranchDelete - | Self::BranchMerge - | Self::RunPublish - | Self::RunAbort + Self::BranchCreate | Self::SchemaApply | Self::BranchDelete | Self::BranchMerge ) } } @@ -79,8 +70,6 @@ impl FromStr for PolicyAction { "branch_create" => Ok(Self::BranchCreate), "branch_delete" => Ok(Self::BranchDelete), "branch_merge" => Ok(Self::BranchMerge), - "run_publish" => Ok(Self::RunPublish), - "run_abort" => Ok(Self::RunAbort), "admin" => Ok(Self::Admin), other => bail!("unknown policy action '{other}'"), } @@ -599,8 +588,6 @@ namespace Omnigraph { action "branch_create" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; action "branch_delete" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; action "branch_merge" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; - action "run_publish" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; - action "run_abort" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; action "admin" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; } "# @@ -732,7 +719,7 @@ rules: - id: admins-promote allow: actors: { group: admins } - actions: [branch_delete, branch_merge, run_publish] + actions: [branch_delete, branch_merge] target_branch_scope: protected "#, ) diff --git a/crates/omnigraph-server/tests/openapi.rs b/crates/omnigraph-server/tests/openapi.rs index b257796..86a124d 100644 --- a/crates/omnigraph-server/tests/openapi.rs +++ b/crates/omnigraph-server/tests/openapi.rs @@ -167,10 +167,6 @@ const EXPECTED_PATHS: &[&str] = &[ "/branches", "/branches/{branch}", "/branches/merge", - "/runs", - "/runs/{run_id}", - "/runs/{run_id}/publish", - "/runs/{run_id}/abort", "/commits", "/commits/{commit_id}", ]; @@ -256,30 +252,6 @@ fn openapi_branch_merge_is_post() { assert!(doc["paths"]["/branches/merge"]["post"].is_object()); } -#[test] -fn openapi_runs_is_get() { - let doc = openapi_json(); - assert!(doc["paths"]["/runs"]["get"].is_object()); -} - -#[test] -fn openapi_run_show_is_get() { - let doc = openapi_json(); - assert!(doc["paths"]["/runs/{run_id}"]["get"].is_object()); -} - -#[test] -fn openapi_run_publish_is_post() { - let doc = openapi_json(); - assert!(doc["paths"]["/runs/{run_id}/publish"]["post"].is_object()); -} - -#[test] -fn openapi_run_abort_is_post() { - let doc = openapi_json(); - assert!(doc["paths"]["/runs/{run_id}/abort"]["post"].is_object()); -} - #[test] fn openapi_commits_is_get() { let doc = openapi_json(); @@ -321,10 +293,9 @@ const EXPECTED_SCHEMAS: &[&str] = &[ "ReadOutput", "ReadRequest", "ReadTargetOutput", + "ManifestConflictOutput", "SchemaApplyOutput", "SchemaApplyRequest", - "RunListOutput", - "RunOutput", "SnapshotOutput", "SnapshotTableOutput", ]; @@ -490,19 +461,17 @@ fn error_output_schema_has_expected_fields() { assert!(props.contains_key("error")); assert!(props.contains_key("code")); assert!(props.contains_key("merge_conflicts")); + assert!(props.contains_key("manifest_conflict")); } #[test] -fn run_output_schema_has_expected_fields() { +fn manifest_conflict_output_schema_has_expected_fields() { let doc = openapi_json(); - let schema = &doc["components"]["schemas"]["RunOutput"]; + let schema = &doc["components"]["schemas"]["ManifestConflictOutput"]; let props = schema["properties"].as_object().unwrap(); - assert!(props.contains_key("run_id")); - assert!(props.contains_key("target_branch")); - assert!(props.contains_key("run_branch")); - assert!(props.contains_key("status")); - assert!(props.contains_key("created_at")); - assert!(props.contains_key("updated_at")); + assert!(props.contains_key("table_key")); + assert!(props.contains_key("expected")); + assert!(props.contains_key("actual")); } #[test] @@ -621,10 +590,6 @@ fn protected_endpoints_reference_bearer_token_security() { ("/branches", "post"), ("/branches/{branch}", "delete"), ("/branches/merge", "post"), - ("/runs", "get"), - ("/runs/{run_id}", "get"), - ("/runs/{run_id}/publish", "post"), - ("/runs/{run_id}/abort", "post"), ("/commits", "get"), ("/commits/{commit_id}", "get"), ]; @@ -667,18 +632,6 @@ fn branch_delete_has_branch_path_parameter() { assert!(has_branch, "DELETE /branches/{{branch}} must have 'branch' path parameter"); } -#[test] -fn run_show_has_run_id_path_parameter() { - let doc = openapi_json(); - let params = doc["paths"]["/runs/{run_id}"]["get"]["parameters"] - .as_array() - .unwrap(); - let has_run_id = params.iter().any(|p| { - p["name"].as_str() == Some("run_id") && p["in"].as_str() == Some("path") - }); - assert!(has_run_id, "GET /runs/{{run_id}} must have 'run_id' path parameter"); -} - #[test] fn commit_show_has_commit_id_path_parameter() { let doc = openapi_json(); @@ -914,7 +867,6 @@ async fn auth_mode_spec_has_security_on_protected_operations() { ("/change", "post"), ("/snapshot", "get"), ("/branches", "get"), - ("/runs", "get"), ("/commits", "get"), ]; for (path, method) in protected_paths { diff --git a/crates/omnigraph-server/tests/server.rs b/crates/omnigraph-server/tests/server.rs index 77b9118..60d4043 100644 --- a/crates/omnigraph-server/tests/server.rs +++ b/crates/omnigraph-server/tests/server.rs @@ -54,11 +54,6 @@ rules: actors: { group: admins } actions: [branch_delete, branch_merge] target_branch_scope: protected - - id: admins-publish - allow: - actors: { group: admins } - actions: [run_publish] - target_branch_scope: protected "#; const POLICY_PROTECTED_READ_YAML: &str = r#" @@ -766,7 +761,7 @@ async fn protected_routes_require_bearer_token() { let (status, body) = json_response( &app, Request::builder() - .uri("/runs") + .uri("/branches") .method(Method::GET) .body(Body::empty()) .unwrap(), @@ -801,7 +796,7 @@ async fn protected_routes_accept_valid_bearer_token_while_healthz_stays_open() { let (status, body) = json_response( &app, Request::builder() - .uri("/runs") + .uri("/branches") .method(Method::GET) .header("authorization", "Bearer demo-token") .body(Body::empty()) @@ -810,7 +805,7 @@ async fn protected_routes_accept_valid_bearer_token_while_healthz_stays_open() { .await; assert_eq!(status, StatusCode::OK); - assert!(body["runs"].is_array()); + assert!(body["branches"].is_array()); } #[tokio::test(flavor = "multi_thread")] @@ -882,7 +877,7 @@ async fn protected_routes_accept_any_configured_team_bearer_token() { let (status, body) = json_response( &app, Request::builder() - .uri("/runs") + .uri("/branches") .method(Method::GET) .header("authorization", "Bearer token-two") .body(Body::empty()) @@ -891,7 +886,7 @@ async fn protected_routes_accept_any_configured_team_bearer_token() { .await; assert_eq!(status, StatusCode::OK); - assert!(body["runs"].is_array()); + assert!(body["branches"].is_array()); } /// Verifies the hashed-token lookup correctly resolves each bearer to its @@ -1350,66 +1345,9 @@ async fn policy_blocks_non_admin_merge_to_main_and_allows_admin() { } #[tokio::test(flavor = "multi_thread")] -async fn policy_blocks_non_admin_run_publish_to_main() { - let temp = init_loaded_repo().await; - let repo = repo_path(temp.path()); - let run_id = { - let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); - db.begin_run("main", Some("policy-publish")) - .await - .unwrap() - .run_id - .as_str() - .to_string() - }; - - let policy_path = temp.path().join("policy.yaml"); - fs::write(&policy_path, POLICY_YAML).unwrap(); - let state = AppState::open_with_bearer_tokens_and_policy( - repo.to_string_lossy().to_string(), - vec![ - ("act-bruno".to_string(), "team-token".to_string()), - ("act-ragnor".to_string(), "admin-token".to_string()), - ], - Some(&policy_path), - ) - .await - .unwrap(); - let app = build_app(state); - - let (deny_status, deny_body) = json_response( - &app, - Request::builder() - .uri(format!("/runs/{run_id}/publish")) - .method(Method::POST) - .header("authorization", "Bearer team-token") - .body(Body::empty()) - .unwrap(), - ) - .await; - let deny_error: ErrorOutput = serde_json::from_value(deny_body).unwrap(); - assert_eq!(deny_status, StatusCode::FORBIDDEN); - assert_eq!( - deny_error.code, - Some(omnigraph_server::api::ErrorCode::Forbidden) - ); - - let (allow_status, allow_body) = json_response( - &app, - Request::builder() - .uri(format!("/runs/{run_id}/publish")) - .method(Method::POST) - .header("authorization", "Bearer admin-token") - .body(Body::empty()) - .unwrap(), - ) - .await; - assert_eq!(allow_status, StatusCode::OK); - assert_eq!(allow_body["target_branch"], "main"); -} - -#[tokio::test(flavor = "multi_thread")] -async fn authenticated_change_stamps_actor_on_runs_and_commits() { +async fn authenticated_change_stamps_actor_on_commits() { + // MR-771: with the Run state machine removed, actor_id is recorded + // directly on the commit graph (no intermediate run record). let (_temp, app) = app_for_loaded_repo_with_auth_tokens(&[("act-andrew", "token-one")]).await; let change = ChangeRequest { @@ -1432,26 +1370,6 @@ async fn authenticated_change_stamps_actor_on_runs_and_commits() { assert_eq!(change_status, StatusCode::OK); assert_eq!(change_body["actor_id"], "act-andrew"); - let (runs_status, runs_body) = json_response( - &app, - Request::builder() - .uri("/runs") - .method(Method::GET) - .header("authorization", "Bearer token-one") - .body(Body::empty()) - .unwrap(), - ) - .await; - assert_eq!(runs_status, StatusCode::OK); - let run = runs_body["runs"] - .as_array() - .unwrap() - .iter() - .find(|run| run["operation_hash"] == "mutation:insert_person:branch=main") - .expect("mutation run should be present"); - assert_eq!(run["actor_id"], "act-andrew"); - assert_eq!(run["status"], "published"); - let (commits_status, commits_body) = json_response( &app, Request::builder() @@ -2189,94 +2107,74 @@ query vector_search_string($q: String) { } #[tokio::test(flavor = "multi_thread")] -async fn missing_run_returns_not_found() { - let (_temp, app) = app_for_loaded_repo().await; - let (status, body) = json_response( - &app, - Request::builder() - .uri("/runs/missing-run") - .method(Method::GET) - .body(Body::empty()) - .unwrap(), - ) - .await; - - let error: ErrorOutput = serde_json::from_value(body).unwrap(); - assert_eq!(status, StatusCode::NOT_FOUND); - assert_eq!(error.code, Some(omnigraph_server::api::ErrorCode::NotFound)); - assert!(error.error.contains("run 'missing-run' not found")); -} - -#[tokio::test(flavor = "multi_thread")] -async fn publish_conflict_returns_conflict_status() { +async fn change_conflict_returns_manifest_conflict_409() { + // MR-771: a write that races with another writer surfaces as HTTP 409 + // with a structured `manifest_conflict` body — `table_key`, `expected`, + // and `actual` — so clients can detect-and-retry without parsing the + // message. (Replaces the old run-publish merge-conflict shape.) let temp = init_loaded_repo().await; let repo = repo_path(temp.path()); - let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); - - let run_a = db - .begin_run("main", Some("server-conflict-a")) - .await - .unwrap(); - let run_b = db - .begin_run("main", Some("server-conflict-b")) - .await - .unwrap(); - db.mutate( - &run_a.run_branch, - MUTATION_QUERIES, - "set_age", - &omnigraph_compiler::json_params_to_param_map( - Some(&json!({"name": "Alice", "age": 31 })), - &omnigraph_compiler::find_named_query(MUTATION_QUERIES, "set_age") - .unwrap() - .params, - omnigraph_compiler::JsonParamMode::Standard, - ) - .unwrap(), - ) - .await - .unwrap(); - db.mutate( - &run_b.run_branch, - MUTATION_QUERIES, - "set_age", - &omnigraph_compiler::json_params_to_param_map( - Some(&json!({"name": "Alice", "age": 32 })), - &omnigraph_compiler::find_named_query(MUTATION_QUERIES, "set_age") - .unwrap() - .params, - omnigraph_compiler::JsonParamMode::Standard, - ) - .unwrap(), - ) - .await - .unwrap(); - db.publish_run(&run_a.run_id).await.unwrap(); - drop(db); + // Build the server first so its handle pins the pre-mutation manifest + // version. Then advance the manifest from outside the server. The + // server's next /change call will capture stale `expected_versions` + // (from its still-pinned snapshot) and the publisher's CAS rejects. let state = AppState::open(repo.to_string_lossy().to_string()) .await .unwrap(); let app = build_app(state); + + { + let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + db.mutate( + "main", + MUTATION_QUERIES, + "set_age", + &omnigraph_compiler::json_params_to_param_map( + Some(&json!({"name": "Alice", "age": 31 })), + &omnigraph_compiler::find_named_query(MUTATION_QUERIES, "set_age") + .unwrap() + .params, + omnigraph_compiler::JsonParamMode::Standard, + ) + .unwrap(), + ) + .await + .unwrap(); + } + let (status, body) = json_response( &app, Request::builder() - .uri(format!("/runs/{}/publish", run_b.run_id.as_str())) + .uri("/change") .method(Method::POST) .header("content-type", "application/json") - .body(Body::from(b"{}" as &[u8])) + .body(Body::from( + serde_json::to_vec(&ChangeRequest { + query_source: MUTATION_QUERIES.to_string(), + query_name: Some("set_age".to_string()), + params: Some(json!({ "name": "Alice", "age": 33 })), + branch: Some("main".to_string()), + }) + .unwrap(), + )) .unwrap(), ) .await; - let error: ErrorOutput = serde_json::from_value(body).unwrap(); assert_eq!(status, StatusCode::CONFLICT); + let error: ErrorOutput = serde_json::from_value(body).unwrap(); assert_eq!(error.code, Some(omnigraph_server::api::ErrorCode::Conflict)); - assert!(error.merge_conflicts.iter().any(|conflict| { - conflict.table_key == "node:Person" - && conflict.row_id.as_deref() == Some("Alice") - && conflict.kind == omnigraph_server::api::MergeConflictKindOutput::DivergentUpdate - })); + let conflict = error + .manifest_conflict + .expect("publisher CAS rejection must populate manifest_conflict body"); + assert_eq!(conflict.table_key, "node:Person"); + assert!( + conflict.actual > conflict.expected, + "actual ({}) should be ahead of expected ({})", + conflict.actual, + conflict.expected, + ); } #[tokio::test(flavor = "multi_thread")] diff --git a/crates/omnigraph/src/db/graph_coordinator.rs b/crates/omnigraph/src/db/graph_coordinator.rs index 03422f0..a721036 100644 --- a/crates/omnigraph/src/db/graph_coordinator.rs +++ b/crates/omnigraph/src/db/graph_coordinator.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::fmt; use std::sync::Arc; @@ -10,7 +11,6 @@ use crate::storage::{StorageAdapter, join_uri, normalize_root_uri}; use super::commit_graph::{CommitGraph, GraphCommit}; use super::is_internal_system_branch; use super::manifest::{ManifestChange, ManifestCoordinator, Snapshot, SubTableUpdate}; -use super::run_registry::{RunId, RunRecord, RunRegistry, graph_runs_uri}; const GRAPH_COMMITS_DIR: &str = "_graph_commits.lance"; @@ -93,7 +93,6 @@ pub struct GraphCoordinator { storage: Arc, manifest: ManifestCoordinator, commit_graph: Option, - run_registry: Option, bound_branch: Option, } @@ -111,7 +110,6 @@ impl GraphCoordinator { storage, manifest, commit_graph, - run_registry: None, bound_branch: None, }) } @@ -124,17 +122,11 @@ impl GraphCoordinator { } else { None }; - let run_registry = if storage.exists(&graph_runs_uri(&root)).await? { - Some(RunRegistry::open(&root).await?) - } else { - None - }; Ok(Self { root_uri: root, storage, manifest, commit_graph, - run_registry, bound_branch: None, }) } @@ -156,18 +148,12 @@ impl GraphCoordinator { } else { None }; - let run_registry = if storage.exists(&graph_runs_uri(&root)).await? { - Some(RunRegistry::open(&root).await?) - } else { - None - }; Ok(Self { root_uri: root, storage, manifest, commit_graph, - run_registry, bound_branch: Some(branch_name), }) } @@ -193,10 +179,6 @@ impl GraphCoordinator { if let Some(commit_graph) = &mut self.commit_graph { commit_graph.refresh().await?; } - if let Some(run_registry) = &mut self.run_registry { - let root_uri = self.root_uri.clone(); - run_registry.refresh(&root_uri).await?; - } Ok(()) } @@ -382,21 +364,6 @@ impl GraphCoordinator { Ok(()) } - pub(crate) async fn ensure_run_registry_initialized(&mut self) -> Result<()> { - if self.run_registry.is_some() { - return Ok(()); - } - if !self - .storage - .exists(&graph_runs_uri(self.root_uri())) - .await? - { - let _ = RunRegistry::init(self.root_uri()).await?; - } - self.run_registry = Some(RunRegistry::open(self.root_uri()).await?); - Ok(()) - } - pub(crate) async fn commit_updates_with_actor( &mut self, updates: &[SubTableUpdate], @@ -410,6 +377,27 @@ impl GraphCoordinator { }) } + /// Commit with publisher-level OCC fence. The `expected_table_versions` map + /// asserts the manifest's current latest non-tombstoned `table_version` for + /// each `table_key` matches what the caller observed before writing. + /// Mismatches surface as `OmniError::Manifest` with + /// `ManifestConflictDetails::ExpectedVersionMismatch`. + pub(crate) async fn commit_updates_with_actor_with_expected( + &mut self, + updates: &[SubTableUpdate], + expected_table_versions: &HashMap, + actor_id: Option<&str>, + ) -> Result { + let manifest_version = self + .commit_manifest_updates_with_expected(updates, expected_table_versions) + .await?; + let snapshot_id = self.record_graph_commit(manifest_version, actor_id).await?; + Ok(PublishedSnapshot { + manifest_version, + _snapshot_id: snapshot_id, + }) + } + pub(crate) async fn commit_manifest_updates( &mut self, updates: &[SubTableUpdate], @@ -419,6 +407,19 @@ impl GraphCoordinator { Ok(manifest_version) } + pub(crate) async fn commit_manifest_updates_with_expected( + &mut self, + updates: &[SubTableUpdate], + expected_table_versions: &HashMap, + ) -> Result { + let manifest_version = self + .manifest + .commit_with_expected(updates, expected_table_versions) + .await?; + failpoints::maybe_fail("graph_publish.after_manifest_commit")?; + Ok(manifest_version) + } + pub(crate) async fn commit_manifest_changes( &mut self, changes: &[ManifestChange], @@ -504,54 +505,6 @@ impl GraphCoordinator { Ok(Some(graph)) } - pub(crate) async fn append_run_record(&mut self, record: &RunRecord) -> Result<()> { - self.ensure_run_registry_initialized().await?; - let Some(run_registry) = &mut self.run_registry else { - return Err(OmniError::manifest( - "run registry not initialized".to_string(), - )); - }; - run_registry.append_record(record).await - } - - pub(crate) async fn get_run(&self, run_id: &RunId) -> Result { - if let Some(run_registry) = &self.run_registry { - if let Some(run) = run_registry.get_run(run_id).await? { - return Ok(run); - } - } - if !self - .storage - .exists(&graph_runs_uri(self.root_uri())) - .await? - { - return Err(OmniError::manifest_not_found(format!( - "run '{}' not found", - run_id - ))); - } - let run_registry = RunRegistry::open(self.root_uri()).await?; - run_registry - .get_run(run_id) - .await? - .ok_or_else(|| OmniError::manifest_not_found(format!("run '{}' not found", run_id))) - } - - pub(crate) async fn list_runs(&self) -> Result> { - if let Some(run_registry) = &self.run_registry { - return run_registry.list_runs().await; - } - if !self - .storage - .exists(&graph_runs_uri(self.root_uri())) - .await? - { - return Ok(Vec::new()); - } - let run_registry = RunRegistry::open(self.root_uri()).await?; - run_registry.list_runs().await - } - pub(crate) async fn list_commits(&self) -> Result> { if let Some(commit_graph) = &self.commit_graph { return commit_graph.load_commits().await; diff --git a/crates/omnigraph/src/db/mod.rs b/crates/omnigraph/src/db/mod.rs index 77c3d26..cc73315 100644 --- a/crates/omnigraph/src/db/mod.rs +++ b/crates/omnigraph/src/db/mod.rs @@ -13,7 +13,6 @@ pub use omnigraph::{ TableOptimizeStats, }; pub(crate) use run_registry::is_internal_run_branch; -pub use run_registry::{RunId, RunRecord, RunStatus}; pub(crate) const SCHEMA_APPLY_LOCK_BRANCH: &str = "__schema_apply_lock__"; diff --git a/crates/omnigraph/src/db/omnigraph.rs b/crates/omnigraph/src/db/omnigraph.rs index 302bb49..7af22be 100644 --- a/crates/omnigraph/src/db/omnigraph.rs +++ b/crates/omnigraph/src/db/omnigraph.rs @@ -22,8 +22,7 @@ use omnigraph_compiler::{ }; use crate::db::graph_coordinator::{GraphCoordinator, PublishedSnapshot}; -use crate::db::run_registry::{RunRecord, RunStatus}; -use crate::error::{ManifestErrorKind, OmniError, Result}; +use crate::error::{OmniError, Result}; use crate::runtime_cache::RuntimeCache; use crate::storage::{StorageAdapter, join_uri, normalize_root_uri, storage_for_uri}; use crate::table_store::TableStore; @@ -47,8 +46,8 @@ use super::schema_state::{ write_schema_contract, write_schema_contract_staging, }; use super::{ - ReadTarget, ResolvedTarget, RunId, SCHEMA_APPLY_LOCK_BRANCH, SnapshotId, - is_internal_system_branch, is_schema_apply_lock_branch, + ReadTarget, ResolvedTarget, SCHEMA_APPLY_LOCK_BRANCH, SnapshotId, is_internal_system_branch, + is_schema_apply_lock_branch, }; #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -530,17 +529,6 @@ impl Omnigraph { ))); } - for run in self.list_runs().await? { - if run.target_branch == branch && matches!(run.status, RunStatus::Running) { - return Err(OmniError::manifest_conflict(format!( - "cannot delete branch '{}' while run '{}' targeting it is {}", - branch, - run.run_id, - run.status.as_str() - ))); - } - } - for other_branch in branches .iter() .filter(|candidate| candidate.as_str() != branch) @@ -608,38 +596,6 @@ impl Omnigraph { .await } - async fn cleanup_terminal_run_branches_for_target(&mut self, branch: &str) -> Result<()> { - let terminal_run_branches = self - .list_runs() - .await? - .into_iter() - .filter(|run| { - run.target_branch == branch - && matches!( - run.status, - RunStatus::Published | RunStatus::Aborted | RunStatus::Failed - ) - }) - .map(|run| run.run_branch) - .collect::>(); - - let live_branches: HashSet = - self.coordinator.all_branches().await?.into_iter().collect(); - - for run_branch in terminal_run_branches { - if !live_branches.contains(&run_branch) { - continue; - } - match self.delete_branch_storage_only(&run_branch).await { - Ok(()) => {} - Err(OmniError::Manifest(err)) if err.kind == ManifestErrorKind::NotFound => {} - Err(err) => return Err(err), - } - } - - Ok(()) - } - pub(crate) fn normalize_branch_name(branch: &str) -> Result> { normalize_branch_name(branch) } @@ -720,73 +676,9 @@ impl Omnigraph { } self.ensure_branch_delete_safe(&branch, &branches).await?; - self.cleanup_terminal_run_branches_for_target(&branch) - .await?; self.delete_branch_storage_only(&branch).await } - pub(crate) async fn latest_branch_snapshot_id(&self, branch: &str) -> Result { - let normalized = normalize_branch_name(branch)?; - let fresh = self - .open_coordinator_for_branch(normalized.as_deref()) - .await?; - fresh.resolve_snapshot_id(branch).await - } - - pub async fn begin_run( - &mut self, - target_branch: &str, - operation_hash: Option<&str>, - ) -> Result { - self.begin_run_as(target_branch, operation_hash, None).await - } - - pub async fn begin_run_as( - &mut self, - target_branch: &str, - operation_hash: Option<&str>, - actor_id: Option<&str>, - ) -> Result { - self.ensure_schema_state_valid().await?; - self.ensure_schema_apply_idle("begin_run").await?; - ensure_public_branch_ref(target_branch, "begin_run")?; - let target_branch = - normalize_branch_name(target_branch)?.unwrap_or_else(|| "main".to_string()); - let fresh = self - .open_coordinator_for_branch(Self::normalize_branch_name(&target_branch)?.as_deref()) - .await?; - let base_snapshot_id = fresh.resolve_snapshot_id(&target_branch).await?; - let base_manifest_version = fresh.version(); - let record = RunRecord::new( - target_branch.clone(), - base_snapshot_id.as_str(), - base_manifest_version, - operation_hash.map(str::to_string), - actor_id - .map(str::to_string) - .or_else(|| self.current_audit_actor().map(str::to_string)), - )?; - - self.branch_create_from_impl( - ReadTarget::branch(target_branch.clone()), - &record.run_branch, - true, - ) - .await?; - self.coordinator.append_run_record(&record).await?; - Ok(record) - } - - pub async fn get_run(&self, run_id: &RunId) -> Result { - self.ensure_schema_state_valid().await?; - self.coordinator.get_run(run_id).await - } - - pub async fn list_runs(&self) -> Result> { - self.ensure_schema_state_valid().await?; - self.coordinator.list_runs().await - } - pub async fn get_commit(&self, commit_id: &str) -> Result { self.ensure_schema_state_valid().await?; self.coordinator @@ -804,238 +696,6 @@ impl Omnigraph { coordinator.list_commits().await } - pub async fn abort_run(&mut self, run_id: &RunId) -> Result { - self.ensure_schema_state_valid().await?; - let run = self.get_run(run_id).await?; - match run.status { - RunStatus::Running | RunStatus::Failed => { - self.terminate_run(&run, RunStatus::Aborted, None).await - } - RunStatus::Published => Err(OmniError::manifest_conflict(format!( - "run '{}' is already published", - run_id - ))), - RunStatus::Aborted => Err(OmniError::manifest_conflict(format!( - "run '{}' is already aborted", - run_id - ))), - } - } - - pub async fn fail_run(&mut self, run_id: &RunId) -> Result { - self.ensure_schema_state_valid().await?; - let run = self.get_run(run_id).await?; - match run.status { - RunStatus::Running => self.terminate_run(&run, RunStatus::Failed, None).await, - RunStatus::Failed => Ok(run), - RunStatus::Published => Err(OmniError::manifest_conflict(format!( - "run '{}' is already published", - run_id - ))), - RunStatus::Aborted => Err(OmniError::manifest_conflict(format!( - "run '{}' is already aborted", - run_id - ))), - } - } - - /// Append a terminal-state run record and delete the `__run__` branch. - /// The status record is authoritative; the branch is scaffolding. Delete - /// errors are swallowed — a later `branch_delete` of the target will - /// retry via `cleanup_terminal_run_branches_for_target`. - async fn terminate_run( - &mut self, - run: &RunRecord, - status: RunStatus, - published_snapshot_id: Option, - ) -> Result { - let updated = run.with_status(status, published_snapshot_id)?; - self.coordinator.append_run_record(&updated).await?; - let _ = self.delete_branch_storage_only(&updated.run_branch).await; - Ok(updated) - } - - pub async fn publish_run(&mut self, run_id: &RunId) -> Result { - self.publish_run_as(run_id, None).await - } - - pub async fn publish_run_as( - &mut self, - run_id: &RunId, - actor_id: Option<&str>, - ) -> Result { - self.ensure_schema_state_valid().await?; - self.ensure_schema_apply_idle("publish_run").await?; - let run = self.get_run(run_id).await?; - match run.status { - RunStatus::Running => {} - RunStatus::Published => { - return run - .published_snapshot_id - .clone() - .map(SnapshotId::new) - .ok_or_else(|| { - OmniError::manifest(format!( - "run '{}' is published but missing published snapshot id", - run_id - )) - }); - } - RunStatus::Failed | RunStatus::Aborted => { - return Err(OmniError::manifest_conflict(format!( - "run '{}' is not publishable from status '{}'", - run_id, - run.status.as_str() - ))); - } - } - - let publish_actor = actor_id - .map(str::to_string) - .or_else(|| run.actor_id.clone()); - let current_target_snapshot_id = self.resolve_snapshot(&run.target_branch).await?; - let previous_actor = self.audit_actor_id.clone(); - self.audit_actor_id = publish_actor.clone(); - let publish_result = if current_target_snapshot_id.as_str() == run.base_snapshot_id { - let run_for_promotion = run.clone(); - self.sync_branch(&run_for_promotion.target_branch).await?; - self.promote_run_snapshot_to_target(&run_for_promotion) - .await - } else { - let run_branch = run.run_branch.clone(); - let target_branch = run.target_branch.clone(); - self.branch_merge_internal(&run_branch, &target_branch) - .await?; - self.reify_internal_run_refs(&target_branch, &run_branch) - .await - }; - self.audit_actor_id = previous_actor; - publish_result?; - let published_snapshot_id = self.resolve_snapshot(&run.target_branch).await?; - self.terminate_run( - &run, - RunStatus::Published, - Some(published_snapshot_id.as_str().to_string()), - ) - .await?; - Ok(published_snapshot_id) - } - - async fn promote_run_snapshot_to_target(&mut self, run: &RunRecord) -> Result<()> { - let target_snapshot = self - .snapshot_of(ReadTarget::branch(run.target_branch.as_str())) - .await?; - let run_snapshot = self - .snapshot_of(ReadTarget::branch(run.run_branch.as_str())) - .await?; - let mut table_keys = std::collections::BTreeSet::new(); - for entry in target_snapshot.entries() { - table_keys.insert(entry.table_key.clone()); - } - for entry in run_snapshot.entries() { - table_keys.insert(entry.table_key.clone()); - } - - let mut updates = Vec::new(); - let mut changed_edge_tables = false; - let target_branch = normalize_branch_name(&run.target_branch)?; - - for table_key in table_keys { - let target_entry = target_snapshot.entry(&table_key); - let run_entry = run_snapshot.entry(&table_key); - if same_manifest_state(target_entry, run_entry) { - continue; - } - let Some(_run_entry) = run_entry else { - return Err(OmniError::manifest(format!( - "run '{}' removed table '{}' which publish_run does not support", - run.run_id, table_key - ))); - }; - - let source_ds = run_snapshot.open(&table_key).await?; - let batch = self.batch_for_table_rewrite(&source_ds, &table_key).await?; - - let (mut target_ds, full_path, table_branch) = self - .open_for_mutation_on_branch(target_branch.as_deref(), &table_key) - .await?; - let state = self - .table_store() - .overwrite_batch(&full_path, &mut target_ds, batch) - .await?; - updates.push(crate::db::SubTableUpdate { - table_key: table_key.clone(), - table_version: state.version, - table_branch, - row_count: state.row_count, - version_metadata: state.version_metadata, - }); - if table_key.starts_with("edge:") { - changed_edge_tables = true; - } - } - - if !updates.is_empty() { - self.commit_updates_on_branch(target_branch.as_deref(), &updates) - .await?; - if changed_edge_tables { - self.invalidate_graph_index().await; - } - } - - Ok(()) - } - - async fn reify_internal_run_refs( - &mut self, - target_branch: &str, - run_branch: &str, - ) -> Result<()> { - let target_snapshot = self.snapshot_of(ReadTarget::branch(target_branch)).await?; - let mut updates = Vec::new(); - let mut changed_edge_tables = false; - let target_branch = normalize_branch_name(target_branch)?; - - for entry in target_snapshot.entries() { - if entry.table_branch.as_deref() != Some(run_branch) { - continue; - } - - let source_ds = target_snapshot.open(&entry.table_key).await?; - let batch = self - .batch_for_table_rewrite(&source_ds, &entry.table_key) - .await?; - - let (mut target_ds, full_path, table_branch) = self - .open_for_mutation_on_branch(target_branch.as_deref(), &entry.table_key) - .await?; - let state = self - .table_store() - .overwrite_batch(&full_path, &mut target_ds, batch) - .await?; - updates.push(crate::db::SubTableUpdate { - table_key: entry.table_key.clone(), - table_version: state.version, - table_branch, - row_count: state.row_count, - version_metadata: state.version_metadata, - }); - if entry.table_key.starts_with("edge:") { - changed_edge_tables = true; - } - } - - if !updates.is_empty() { - self.commit_updates_on_branch(target_branch.as_deref(), &updates) - .await?; - if changed_edge_tables { - self.invalidate_graph_index().await; - } - } - - Ok(()) - } - /// Open a sub-table for mutation with version-drift guard. /// /// Checks that the dataset's current version matches the snapshot-pinned @@ -1112,6 +772,9 @@ impl Omnigraph { table_ops::build_indices_on_dataset_for_catalog(self, catalog, table_key, ds).await } + // Used only by in-tree tests (`#[cfg(test)]`); the runtime path now + // uses `commit_updates_on_branch_with_expected` exclusively. + #[cfg(test)] pub(crate) async fn commit_updates( &mut self, updates: &[crate::db::SubTableUpdate], @@ -1141,12 +804,19 @@ impl Omnigraph { .await } - pub(crate) async fn commit_updates_on_branch( + pub(crate) async fn commit_updates_on_branch_with_expected( &mut self, branch: Option<&str>, updates: &[crate::db::SubTableUpdate], + expected_table_versions: &std::collections::HashMap, ) -> Result { - table_ops::commit_updates_on_branch(self, branch, updates).await + table_ops::commit_updates_on_branch_with_expected( + self, + branch, + updates, + expected_table_versions, + ) + .await } pub(crate) async fn ensure_commit_graph_initialized(&mut self) -> Result<()> { @@ -1158,13 +828,6 @@ impl Omnigraph { table_ops::invalidate_graph_index(self).await } - async fn batch_for_table_rewrite( - &self, - source_ds: &Dataset, - table_key: &str, - ) -> Result { - schema_apply::batch_for_table_rewrite(self, source_ds, table_key).await - } } pub(crate) fn normalize_branch_name(branch: &str) -> Result> { @@ -1196,22 +859,6 @@ fn ensure_public_branch_ref(branch: &str, operation: &str) -> Result<()> { Ok(()) } -fn same_manifest_state( - left: Option<&crate::db::SubTableEntry>, - right: Option<&crate::db::SubTableEntry>, -) -> bool { - match (left, right) { - (None, None) => true, - (Some(left), Some(right)) => { - left.table_path == right.table_path - && left.table_version == right.table_version - && left.table_branch == right.table_branch - && left.row_count == right.row_count - } - _ => false, - } -} - fn concat_or_empty_batches(schema: Arc, batches: Vec) -> Result { if batches.is_empty() { return Ok(RecordBatch::new_empty(schema)); diff --git a/crates/omnigraph/src/db/omnigraph/schema_apply.rs b/crates/omnigraph/src/db/omnigraph/schema_apply.rs index 1ef5907..b294260 100644 --- a/crates/omnigraph/src/db/omnigraph/schema_apply.rs +++ b/crates/omnigraph/src/db/omnigraph/schema_apply.rs @@ -32,10 +32,11 @@ pub(super) async fn apply_schema_with_lock( ) -> Result { db.ensure_schema_state_valid().await?; let branches = db.coordinator.all_branches().await?; - // Skip `main`, the schema-apply lock branch, and any internal `__run__` - // branches. Stale run branches used to block schema apply forever after - // a publish (MR-670); the publish path now cleans them up, but this - // filter is defense-in-depth for legacy repos that predate the fix. + // Skip `main` and internal system branches. The schema-apply lock branch + // is excluded because it is the cluster-wide schema-apply serializer. + // `__run__*` branches are no longer created (MR-771); the filter remains + // as defense-in-depth for legacy repos with leftover staging branches — + // MR-770 will sweep them and this guard can go. let blocking_branches = branches .into_iter() .filter(|branch| branch != "main" && !is_internal_system_branch(branch)) @@ -454,23 +455,6 @@ pub(super) async fn ensure_snapshot_entry_head_matches( .ensure_expected_version(&ds, &entry.table_key, entry.table_version) } -pub(super) async fn batch_for_table_rewrite( - db: &Omnigraph, - source_ds: &Dataset, - table_key: &str, -) -> Result { - batch_for_schema_apply_rewrite( - db, - source_ds, - table_key, - &db.catalog, - table_key, - &db.catalog, - None, - ) - .await -} - pub(super) async fn batch_for_schema_apply_rewrite( db: &Omnigraph, source_ds: &Dataset, diff --git a/crates/omnigraph/src/db/omnigraph/table_ops.rs b/crates/omnigraph/src/db/omnigraph/table_ops.rs index 8250a75..afc59d1 100644 --- a/crates/omnigraph/src/db/omnigraph/table_ops.rs +++ b/crates/omnigraph/src/db/omnigraph/table_ops.rs @@ -425,6 +425,26 @@ async fn commit_prepared_updates( Ok(manifest_version) } +async fn commit_prepared_updates_with_expected( + db: &mut Omnigraph, + updates: &[crate::db::SubTableUpdate], + expected_table_versions: &std::collections::HashMap, +) -> Result { + let actor_id = db.current_audit_actor().map(str::to_string); + let PublishedSnapshot { + manifest_version, + _snapshot_id: _, + } = db + .coordinator + .commit_updates_with_actor_with_expected( + updates, + expected_table_versions, + actor_id.as_deref(), + ) + .await?; + Ok(manifest_version) +} + pub(super) async fn commit_prepared_updates_on_branch( db: &mut Omnigraph, branch: Option<&str>, @@ -452,6 +472,41 @@ pub(super) async fn commit_prepared_updates_on_branch( Ok(manifest_version) } +pub(super) async fn commit_prepared_updates_on_branch_with_expected( + db: &mut Omnigraph, + branch: Option<&str>, + updates: &[crate::db::SubTableUpdate], + expected_table_versions: &std::collections::HashMap, +) -> Result { + let current_branch = db.coordinator.current_branch().map(str::to_string); + let requested_branch = branch.map(str::to_string); + if requested_branch == current_branch { + return commit_prepared_updates_with_expected(db, updates, expected_table_versions).await; + } + + let mut coordinator = match requested_branch.as_deref() { + Some(branch) => { + GraphCoordinator::open_branch(db.uri(), branch, Arc::clone(&db.storage)).await? + } + None => GraphCoordinator::open(db.uri(), Arc::clone(&db.storage)).await?, + }; + let actor_id = db.current_audit_actor().map(str::to_string); + let PublishedSnapshot { + manifest_version, + _snapshot_id: _, + } = coordinator + .commit_updates_with_actor_with_expected( + updates, + expected_table_versions, + actor_id.as_deref(), + ) + .await?; + Ok(manifest_version) +} + +// Used only by in-tree tests (`#[cfg(test)]`); the runtime path now uses +// `commit_updates_on_branch_with_expected` exclusively. +#[cfg(test)] pub(super) async fn commit_updates( db: &mut Omnigraph, updates: &[crate::db::SubTableUpdate], @@ -487,14 +542,19 @@ pub(super) async fn record_merge_commit( .map(|snapshot_id| snapshot_id.as_str().to_string()) } -pub(super) async fn commit_updates_on_branch( +/// Commit updates with a publisher-level OCC fence. The +/// `expected_table_versions` map asserts the manifest's pre-write per-table +/// versions; mismatches surface as `ManifestConflictDetails::ExpectedVersionMismatch`. +pub(super) async fn commit_updates_on_branch_with_expected( db: &mut Omnigraph, branch: Option<&str>, updates: &[crate::db::SubTableUpdate], + expected_table_versions: &std::collections::HashMap, ) -> Result { db.ensure_schema_apply_not_locked("write commit").await?; let prepared = prepare_updates_for_commit(db, branch, updates).await?; - commit_prepared_updates_on_branch(db, branch, &prepared).await + commit_prepared_updates_on_branch_with_expected(db, branch, &prepared, expected_table_versions) + .await } pub(super) async fn ensure_commit_graph_initialized(db: &mut Omnigraph) -> Result<()> { diff --git a/crates/omnigraph/src/db/run_registry.rs b/crates/omnigraph/src/db/run_registry.rs index 70658dc..9f1c376 100644 --- a/crates/omnigraph/src/db/run_registry.rs +++ b/crates/omnigraph/src/db/run_registry.rs @@ -1,622 +1,16 @@ -use std::collections::HashMap; -use std::fmt; -use std::sync::Arc; -use std::time::{SystemTime, UNIX_EPOCH}; +// Run state-machine has been removed (MR-771). Mutations now write directly +// to target tables and use the publisher's `expected_table_versions` CAS for +// cross-table OCC; the `__run__` staging branches and `_graph_runs.lance` +// state machine no longer exist. +// +// What remains is the branch-name predicate, kept as a defense-in-depth guard +// against users naming a public branch `__run__*`. MR-770 owns the production +// sweep of legacy `_graph_runs.lance` rows and stale `__run__*` branches; once +// that lands the predicate (and this file) can go too. -use arrow_array::{ - Array, RecordBatch, RecordBatchIterator, StringArray, TimestampMicrosecondArray, UInt64Array, -}; -use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit}; -use futures::TryStreamExt; -use lance::Dataset; -use lance::dataset::{WriteMode, WriteParams}; -use lance_file::version::LanceFileVersion; - -use crate::error::{OmniError, Result}; - -const GRAPH_RUNS_DIR: &str = "_graph_runs.lance"; -const GRAPH_RUN_ACTORS_DIR: &str = "_graph_run_actors.lance"; pub(crate) const INTERNAL_RUN_BRANCH_PREFIX: &str = "__run__"; -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct RunId(String); - -impl RunId { - pub fn new(id: impl Into) -> Self { - Self(id.into()) - } - - pub fn as_str(&self) -> &str { - &self.0 - } -} - -impl fmt::Display for RunId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.0.fmt(f) - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum RunStatus { - Running, - Published, - Failed, - Aborted, -} - -impl RunStatus { - pub fn as_str(self) -> &'static str { - match self { - RunStatus::Running => "running", - RunStatus::Published => "published", - RunStatus::Failed => "failed", - RunStatus::Aborted => "aborted", - } - } - - fn parse(value: &str) -> Result { - match value { - "running" => Ok(Self::Running), - "published" => Ok(Self::Published), - "failed" => Ok(Self::Failed), - "aborted" => Ok(Self::Aborted), - other => Err(OmniError::manifest(format!( - "invalid run status '{}'", - other - ))), - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct RunRecord { - pub run_id: RunId, - pub target_branch: String, - pub run_branch: String, - pub base_snapshot_id: String, - pub base_manifest_version: u64, - pub operation_hash: Option, - pub actor_id: Option, - pub status: RunStatus, - pub published_snapshot_id: Option, - pub created_at: i64, - pub updated_at: i64, -} - -impl RunRecord { - pub fn new( - target_branch: impl Into, - base_snapshot_id: impl Into, - base_manifest_version: u64, - operation_hash: Option, - actor_id: Option, - ) -> Result { - let now = now_micros()?; - let run_id = RunId::new(ulid::Ulid::new().to_string()); - Ok(Self { - run_branch: internal_run_branch_name(&run_id), - run_id, - target_branch: target_branch.into(), - base_snapshot_id: base_snapshot_id.into(), - base_manifest_version, - operation_hash, - actor_id, - status: RunStatus::Running, - published_snapshot_id: None, - created_at: now, - updated_at: now, - }) - } - - pub fn with_status( - &self, - status: RunStatus, - published_snapshot_id: Option, - ) -> Result { - Ok(Self { - run_id: self.run_id.clone(), - target_branch: self.target_branch.clone(), - run_branch: self.run_branch.clone(), - base_snapshot_id: self.base_snapshot_id.clone(), - base_manifest_version: self.base_manifest_version, - operation_hash: self.operation_hash.clone(), - actor_id: self.actor_id.clone(), - status, - published_snapshot_id, - created_at: self.created_at, - updated_at: now_micros()?, - }) - } -} - -pub struct RunRegistry { - dataset: Dataset, - actor_dataset: Option, - latest_by_id: HashMap, - actor_by_run_id: HashMap, - root_uri: String, -} - -impl RunRegistry { - pub async fn init(root_uri: &str) -> Result { - let uri = graph_runs_uri(root_uri); - let batch = RecordBatch::new_empty(run_registry_schema()); - let reader = RecordBatchIterator::new(vec![Ok(batch)], run_registry_schema()); - let params = WriteParams { - mode: WriteMode::Create, - enable_stable_row_ids: true, - data_storage_version: Some(LanceFileVersion::V2_2), - ..Default::default() - }; - let dataset = Dataset::write(reader, &uri as &str, Some(params)) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - let actor_dataset = create_run_actor_dataset(root_uri).await?; - Ok(Self { - dataset, - actor_dataset: Some(actor_dataset), - latest_by_id: HashMap::new(), - actor_by_run_id: HashMap::new(), - root_uri: root_uri.to_string(), - }) - } - - pub async fn open(root_uri: &str) -> Result { - let dataset = Dataset::open(&graph_runs_uri(root_uri)) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - let actor_dataset = Dataset::open(&graph_run_actors_uri(root_uri)).await.ok(); - let actor_by_run_id = match &actor_dataset { - Some(dataset) => load_run_actor_cache(dataset).await?, - None => HashMap::new(), - }; - let latest_by_id = load_run_cache(&dataset, &actor_by_run_id).await?; - Ok(Self { - dataset, - actor_dataset, - latest_by_id, - actor_by_run_id, - root_uri: root_uri.to_string(), - }) - } - - pub async fn refresh(&mut self, root_uri: &str) -> Result<()> { - self.dataset = Dataset::open(&graph_runs_uri(root_uri)) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - self.actor_dataset = Dataset::open(&graph_run_actors_uri(root_uri)).await.ok(); - self.actor_by_run_id = match &self.actor_dataset { - Some(dataset) => load_run_actor_cache(dataset).await?, - None => HashMap::new(), - }; - self.latest_by_id = load_run_cache(&self.dataset, &self.actor_by_run_id).await?; - self.root_uri = root_uri.to_string(); - Ok(()) - } - - pub async fn append_record(&mut self, record: &RunRecord) -> Result<()> { - let batch = runs_to_batch(&[record.clone()])?; - let reader = RecordBatchIterator::new(vec![Ok(batch)], run_registry_schema()); - let mut ds = self.dataset.clone(); - ds.append(reader, None) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - self.dataset = ds; - if let Some(actor_id) = &record.actor_id { - self.append_actor(record.run_id.as_str(), actor_id).await?; - } - let mut record = record.clone(); - if record.actor_id.is_none() { - record.actor_id = self.actor_by_run_id.get(record.run_id.as_str()).cloned(); - } - merge_latest_run(&mut self.latest_by_id, record); - Ok(()) - } - - pub async fn get_run(&self, run_id: &RunId) -> Result> { - Ok(self.latest_by_id.get(run_id.as_str()).cloned()) - } - - pub async fn list_runs(&self) -> Result> { - self.load_runs().await - } - - pub async fn load_runs(&self) -> Result> { - let mut runs = self.latest_by_id.values().cloned().collect::>(); - runs.sort_by(|a, b| { - a.created_at - .cmp(&b.created_at) - .then_with(|| a.run_id.as_str().cmp(b.run_id.as_str())) - }); - Ok(runs) - } - - async fn append_actor(&mut self, run_id: &str, actor_id: &str) -> Result<()> { - if self - .actor_by_run_id - .get(run_id) - .is_some_and(|existing| existing == actor_id) - { - return Ok(()); - } - - let record = RunActorRecord { - run_id: run_id.to_string(), - actor_id: actor_id.to_string(), - created_at: now_micros()?, - }; - let batch = run_actors_to_batch(&[record])?; - let reader = RecordBatchIterator::new(vec![Ok(batch)], run_actor_schema()); - let mut dataset = match self.actor_dataset.take() { - Some(dataset) => dataset, - None => create_run_actor_dataset(&self.root_uri).await?, - }; - dataset - .append(reader, None) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - self.actor_by_run_id - .insert(run_id.to_string(), actor_id.to_string()); - self.actor_dataset = Some(dataset); - Ok(()) - } -} - pub(crate) fn is_internal_run_branch(name: &str) -> bool { name.trim_start_matches('/') .starts_with(INTERNAL_RUN_BRANCH_PREFIX) } - -pub(crate) fn internal_run_branch_name(run_id: &RunId) -> String { - format!("{}{}", INTERNAL_RUN_BRANCH_PREFIX, run_id.as_str()) -} - -pub(crate) fn graph_runs_uri(root_uri: &str) -> String { - format!("{}/{}", root_uri.trim_end_matches('/'), GRAPH_RUNS_DIR) -} - -fn graph_run_actors_uri(root_uri: &str) -> String { - format!( - "{}/{}", - root_uri.trim_end_matches('/'), - GRAPH_RUN_ACTORS_DIR - ) -} - -fn run_registry_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("run_id", DataType::Utf8, false), - Field::new("target_branch", DataType::Utf8, false), - Field::new("run_branch", DataType::Utf8, false), - Field::new("base_snapshot_id", DataType::Utf8, false), - Field::new("base_manifest_version", DataType::UInt64, false), - Field::new("operation_hash", DataType::Utf8, true), - Field::new("status", DataType::Utf8, false), - Field::new("published_snapshot_id", DataType::Utf8, true), - Field::new( - "created_at", - DataType::Timestamp(TimeUnit::Microsecond, None), - false, - ), - Field::new( - "updated_at", - DataType::Timestamp(TimeUnit::Microsecond, None), - false, - ), - ])) -} - -fn run_actor_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("run_id", DataType::Utf8, false), - Field::new("actor_id", DataType::Utf8, false), - Field::new( - "created_at", - DataType::Timestamp(TimeUnit::Microsecond, None), - false, - ), - ])) -} - -async fn create_run_actor_dataset(root_uri: &str) -> Result { - let batch = RecordBatch::new_empty(run_actor_schema()); - let reader = RecordBatchIterator::new(vec![Ok(batch)], run_actor_schema()); - let params = WriteParams { - mode: WriteMode::Create, - enable_stable_row_ids: true, - data_storage_version: Some(LanceFileVersion::V2_2), - ..Default::default() - }; - Dataset::write( - reader, - &graph_run_actors_uri(root_uri) as &str, - Some(params), - ) - .await - .map_err(|e| OmniError::Lance(e.to_string())) -} - -async fn load_run_cache( - dataset: &Dataset, - actor_by_run_id: &HashMap, -) -> Result> { - let batches: Vec = dataset - .scan() - .try_into_stream() - .await - .map_err(|e| OmniError::Lance(e.to_string()))? - .try_collect() - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - - let mut latest_by_id = HashMap::new(); - for mut record in load_runs_from_batches(&batches)? { - record.actor_id = actor_by_run_id.get(record.run_id.as_str()).cloned(); - merge_latest_run(&mut latest_by_id, record); - } - Ok(latest_by_id) -} - -async fn load_run_actor_cache(dataset: &Dataset) -> Result> { - let batches: Vec = dataset - .scan() - .try_into_stream() - .await - .map_err(|e| OmniError::Lance(e.to_string()))? - .try_collect() - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - - let mut actors = HashMap::new(); - for batch in batches { - let run_ids = string_column(&batch, "run_id", "run actor registry")?; - let actor_ids = string_column(&batch, "actor_id", "run actor registry")?; - for row in 0..batch.num_rows() { - actors.insert( - run_ids.value(row).to_string(), - actor_ids.value(row).to_string(), - ); - } - } - Ok(actors) -} - -fn load_runs_from_batches(batches: &[RecordBatch]) -> Result> { - let mut runs = Vec::new(); - for batch in batches { - let run_ids = string_column(batch, "run_id", "run registry")?; - let target_branches = string_column(batch, "target_branch", "run registry")?; - let run_branches = string_column(batch, "run_branch", "run registry")?; - let base_snapshot_ids = string_column(batch, "base_snapshot_id", "run registry")?; - let base_manifest_versions = u64_column(batch, "base_manifest_version", "run registry")?; - let operation_hashes = string_column(batch, "operation_hash", "run registry")?; - let statuses = string_column(batch, "status", "run registry")?; - let published_snapshot_ids = string_column(batch, "published_snapshot_id", "run registry")?; - let created_ats = timestamp_micros_column(batch, "created_at", "run registry")?; - let updated_ats = timestamp_micros_column(batch, "updated_at", "run registry")?; - - for row in 0..batch.num_rows() { - runs.push(RunRecord { - run_id: RunId::new(run_ids.value(row)), - target_branch: target_branches.value(row).to_string(), - run_branch: run_branches.value(row).to_string(), - base_snapshot_id: base_snapshot_ids.value(row).to_string(), - base_manifest_version: base_manifest_versions.value(row), - operation_hash: if operation_hashes.is_null(row) { - None - } else { - Some(operation_hashes.value(row).to_string()) - }, - actor_id: None, - status: RunStatus::parse(statuses.value(row))?, - published_snapshot_id: if published_snapshot_ids.is_null(row) { - None - } else { - Some(published_snapshot_ids.value(row).to_string()) - }, - created_at: created_ats.value(row), - updated_at: updated_ats.value(row), - }); - } - } - Ok(runs) -} - -fn merge_latest_run(latest_by_id: &mut HashMap, record: RunRecord) { - match latest_by_id.get(record.run_id.as_str()) { - Some(existing) - if existing.updated_at > record.updated_at - || (existing.updated_at == record.updated_at - && existing.created_at >= record.created_at) => {} - _ => { - latest_by_id.insert(record.run_id.as_str().to_string(), record); - } - } -} - -fn string_column<'a>(batch: &'a RecordBatch, name: &str, context: &str) -> Result<&'a StringArray> { - batch - .column_by_name(name) - .ok_or_else(|| { - OmniError::manifest_internal(format!("{context} batch missing '{name}' column")) - })? - .as_any() - .downcast_ref::() - .ok_or_else(|| { - OmniError::manifest_internal(format!("{context} column '{name}' is not Utf8")) - }) -} - -fn u64_column<'a>(batch: &'a RecordBatch, name: &str, context: &str) -> Result<&'a UInt64Array> { - batch - .column_by_name(name) - .ok_or_else(|| { - OmniError::manifest_internal(format!("{context} batch missing '{name}' column")) - })? - .as_any() - .downcast_ref::() - .ok_or_else(|| { - OmniError::manifest_internal(format!("{context} column '{name}' is not UInt64")) - }) -} - -fn timestamp_micros_column<'a>( - batch: &'a RecordBatch, - name: &str, - context: &str, -) -> Result<&'a TimestampMicrosecondArray> { - batch - .column_by_name(name) - .ok_or_else(|| { - OmniError::manifest_internal(format!("{context} batch missing '{name}' column")) - })? - .as_any() - .downcast_ref::() - .ok_or_else(|| { - OmniError::manifest_internal(format!( - "{context} column '{name}' is not Timestamp(Microsecond)" - )) - }) -} - -fn runs_to_batch(records: &[RunRecord]) -> Result { - let run_ids: Vec<&str> = records - .iter() - .map(|record| record.run_id.as_str()) - .collect(); - let target_branches: Vec<&str> = records - .iter() - .map(|record| record.target_branch.as_str()) - .collect(); - let run_branches: Vec<&str> = records - .iter() - .map(|record| record.run_branch.as_str()) - .collect(); - let base_snapshot_ids: Vec<&str> = records - .iter() - .map(|record| record.base_snapshot_id.as_str()) - .collect(); - let base_manifest_versions: Vec = records - .iter() - .map(|record| record.base_manifest_version) - .collect(); - let operation_hashes: Vec> = records - .iter() - .map(|record| record.operation_hash.as_deref()) - .collect(); - let statuses: Vec<&str> = records - .iter() - .map(|record| record.status.as_str()) - .collect(); - let published_snapshot_ids: Vec> = records - .iter() - .map(|record| record.published_snapshot_id.as_deref()) - .collect(); - let created_ats: Vec = records.iter().map(|record| record.created_at).collect(); - let updated_ats: Vec = records.iter().map(|record| record.updated_at).collect(); - - RecordBatch::try_new( - run_registry_schema(), - vec![ - Arc::new(StringArray::from(run_ids)), - Arc::new(StringArray::from(target_branches)), - Arc::new(StringArray::from(run_branches)), - Arc::new(StringArray::from(base_snapshot_ids)), - Arc::new(UInt64Array::from(base_manifest_versions)), - Arc::new(StringArray::from(operation_hashes)), - Arc::new(StringArray::from(statuses)), - Arc::new(StringArray::from(published_snapshot_ids)), - Arc::new(TimestampMicrosecondArray::from(created_ats)), - Arc::new(TimestampMicrosecondArray::from(updated_ats)), - ], - ) - .map_err(|e| OmniError::Lance(e.to_string())) -} - -#[derive(Debug, Clone, PartialEq, Eq)] -struct RunActorRecord { - run_id: String, - actor_id: String, - created_at: i64, -} - -fn run_actors_to_batch(records: &[RunActorRecord]) -> Result { - let run_ids: Vec<&str> = records - .iter() - .map(|record| record.run_id.as_str()) - .collect(); - let actor_ids: Vec<&str> = records - .iter() - .map(|record| record.actor_id.as_str()) - .collect(); - let created_ats: Vec = records.iter().map(|record| record.created_at).collect(); - - RecordBatch::try_new( - run_actor_schema(), - vec![ - Arc::new(StringArray::from(run_ids)), - Arc::new(StringArray::from(actor_ids)), - Arc::new(TimestampMicrosecondArray::from(created_ats)), - ], - ) - .map_err(|e| OmniError::Lance(e.to_string())) -} - -fn now_micros() -> Result { - let duration = SystemTime::now() - .duration_since(UNIX_EPOCH) - .map_err(|e| OmniError::manifest(format!("system clock error: {}", e)))?; - Ok(duration.as_micros() as i64) -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use arrow_schema::{DataType, Field, Schema}; - - use super::*; - - #[test] - fn load_runs_from_batches_returns_error_for_bad_schema() { - let batch = RecordBatch::try_new( - Arc::new(Schema::new(vec![ - Field::new("run_id", DataType::UInt64, false), - Field::new("target_branch", DataType::Utf8, false), - Field::new("run_branch", DataType::Utf8, false), - Field::new("base_snapshot_id", DataType::Utf8, false), - Field::new("base_manifest_version", DataType::UInt64, false), - Field::new("operation_hash", DataType::Utf8, true), - Field::new("status", DataType::Utf8, false), - Field::new("published_snapshot_id", DataType::Utf8, true), - Field::new( - "created_at", - DataType::Timestamp(TimeUnit::Microsecond, None), - false, - ), - Field::new( - "updated_at", - DataType::Timestamp(TimeUnit::Microsecond, None), - false, - ), - ])), - vec![ - Arc::new(UInt64Array::from(vec![1_u64])), - Arc::new(StringArray::from(vec!["main"])), - Arc::new(StringArray::from(vec!["__run__1"])), - Arc::new(StringArray::from(vec!["snap-1"])), - Arc::new(UInt64Array::from(vec![1_u64])), - Arc::new(StringArray::from(vec![None::<&str>])), - Arc::new(StringArray::from(vec!["running"])), - Arc::new(StringArray::from(vec![None::<&str>])), - Arc::new(TimestampMicrosecondArray::from(vec![1_i64])), - Arc::new(TimestampMicrosecondArray::from(vec![1_i64])), - ], - ) - .unwrap(); - - let err = load_runs_from_batches(&[batch]).unwrap_err(); - assert!(err.to_string().contains("run_id")); - } -} diff --git a/crates/omnigraph/src/exec/merge.rs b/crates/omnigraph/src/exec/merge.rs index c141870..c5c6802 100644 --- a/crates/omnigraph/src/exec/merge.rs +++ b/crates/omnigraph/src/exec/merge.rs @@ -996,32 +996,21 @@ impl Omnigraph { self.ensure_schema_apply_idle("branch_merge").await?; let previous_actor = self.audit_actor_id.clone(); self.audit_actor_id = actor_id.map(str::to_string); - let result = self.branch_merge_impl(source, target, false).await; + let result = self.branch_merge_impl(source, target).await; self.audit_actor_id = previous_actor; result } - pub(crate) async fn branch_merge_internal( - &mut self, - source: &str, - target: &str, - ) -> Result { - self.branch_merge_impl(source, target, true).await - } - async fn branch_merge_impl( &mut self, source: &str, target: &str, - allow_internal_refs: bool, ) -> Result { - if !allow_internal_refs { - if is_internal_run_branch(source) || is_internal_run_branch(target) { - return Err(OmniError::manifest(format!( - "branch_merge does not allow internal run refs ('{}' -> '{}')", - source, target - ))); - } + if is_internal_run_branch(source) || is_internal_run_branch(target) { + return Err(OmniError::manifest(format!( + "branch_merge does not allow internal run refs ('{}' -> '{}')", + source, target + ))); } let source_branch = Omnigraph::normalize_branch_name(source)?; let target_branch = Omnigraph::normalize_branch_name(target)?; diff --git a/crates/omnigraph/src/exec/mutation.rs b/crates/omnigraph/src/exec/mutation.rs index d4df8c5..c79af3f 100644 --- a/crates/omnigraph/src/exec/mutation.rs +++ b/crates/omnigraph/src/exec/mutation.rs @@ -340,6 +340,7 @@ fn build_insert_batch( async fn validate_edge_insert_endpoints( db: &Omnigraph, + staging: &MutationStaging, edge_name: &str, assignments: &HashMap, ) -> Result<()> { @@ -381,26 +382,46 @@ async fn validate_edge_insert_endpoints( } }; - ensure_node_id_exists(db, &edge_type.from_type, from, "src").await?; - ensure_node_id_exists(db, &edge_type.to_type, to, "dst").await?; + ensure_node_id_exists(db, staging, &edge_type.from_type, from, "src").await?; + ensure_node_id_exists(db, staging, &edge_type.to_type, to, "dst").await?; Ok(()) } async fn ensure_node_id_exists( db: &Omnigraph, + staging: &MutationStaging, node_type: &str, id: &str, label: &str, ) -> Result<()> { - let snapshot = db.snapshot(); let table_key = format!("node:{}", node_type); - let ds = snapshot.open(&table_key).await?; let filter = format!("id = '{}'", id.replace('\'', "''")); - let exists = ds - .count_rows(Some(filter)) - .await - .map_err(|e| OmniError::Lance(e.to_string()))? - > 0; + + // Prefer the in-query staged dataset so a same-query insert of the + // referenced node is visible to this validation. Fall back to the + // pre-mutation manifest snapshot when no prior op touched this table. + let exists = if let Some(staged) = staging.latest.get(&table_key) { + let ds = db + .reopen_for_mutation( + &table_key, + &staged.full_path, + staged.table_branch.as_deref(), + staged.table_version, + ) + .await?; + ds.count_rows(Some(filter)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))? + > 0 + } else { + let snapshot = db.snapshot(); + let ds = snapshot.open(&table_key).await?; + ds.count_rows(Some(filter)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))? + > 0 + }; + if exists { Ok(()) } else { @@ -507,6 +528,144 @@ fn apply_assignments( // ─── Mutation execution ────────────────────────────────────────────────────── +/// Per-query staging state for direct-to-target mutations. Replaces the +/// `__run__` staging branch with an in-memory accumulator. +/// +/// Each unique table touched by the mutation is captured at first-open time: +/// - `expected_versions[table_key]` records the manifest version we observed +/// pre-write — the publisher's CAS fence at end-of-query. +/// - `latest[table_key]` records the most recent post-write state on that +/// table — used to thread between ops within the query so subsequent ops +/// see prior writes (read-your-writes). +/// +/// **Known limitation (mid-query partial failure).** If op-N succeeds at the +/// Lance level (a fragment is committed, advancing the table's Lance HEAD) +/// and op-N+1 then fails before the publisher commits, the table is left +/// with `Lance HEAD > manifest_version`. The next `mutate_as` against the +/// same table will surface `ExpectedVersionMismatch` (Lance HEAD ahead of +/// the manifest snapshot). Lance's `restore()` is *not* a rewind — it +/// creates a new commit, monotonically advancing the version. A proper fix +/// requires per-table Lance-internal branches (write to a transient branch, +/// fast-forward main on success, drop branch on failure); tracked as a +/// follow-up to MR-771. In practice this path is narrow: most validation +/// runs before any Lance write, so single-statement mutations are +/// unaffected. See `docs/runs.md`. +#[derive(Default)] +struct MutationStaging { + expected_versions: HashMap, + latest: HashMap, +} + +struct StagedTable { + table_key: String, + table_branch: Option, + table_version: u64, + row_count: u64, + full_path: String, + version_metadata: crate::db::manifest::TableVersionMetadata, +} + +trait IntoStagedRecord { + fn version(&self) -> u64; + fn row_count(&self) -> u64; + fn version_metadata(&self) -> &crate::db::manifest::TableVersionMetadata; +} + +impl IntoStagedRecord for crate::table_store::TableState { + fn version(&self) -> u64 { + self.version + } + fn row_count(&self) -> u64 { + self.row_count + } + fn version_metadata(&self) -> &crate::db::manifest::TableVersionMetadata { + &self.version_metadata + } +} + +impl IntoStagedRecord for crate::table_store::DeleteState { + fn version(&self) -> u64 { + self.version + } + fn row_count(&self) -> u64 { + self.row_count + } + fn version_metadata(&self) -> &crate::db::manifest::TableVersionMetadata { + &self.version_metadata + } +} + +impl MutationStaging { + fn is_empty(&self) -> bool { + self.latest.is_empty() + } + + fn record( + &mut self, + table_key: String, + full_path: String, + table_branch: Option, + state: &S, + ) { + self.latest.insert( + table_key.clone(), + StagedTable { + table_key, + table_branch, + table_version: state.version(), + row_count: state.row_count(), + full_path, + version_metadata: state.version_metadata().clone(), + }, + ); + } + + fn into_updates(self) -> (Vec, HashMap) { + let updates = self + .latest + .into_values() + .map(|st| crate::db::SubTableUpdate { + table_key: st.table_key, + table_version: st.table_version, + table_branch: st.table_branch, + row_count: st.row_count, + version_metadata: st.version_metadata, + }) + .collect(); + (updates, self.expected_versions) + } +} + +/// Open a sub-table dataset for write in the current mutation query. On the +/// first touch of a table, captures the pre-write manifest version into +/// `staging.expected_versions` so the publisher can enforce OCC. On +/// subsequent touches, re-opens the dataset at the locally-staged version +/// (the version we wrote in a prior op of the same query) — bypassing the +/// manifest because nothing has been committed yet. +async fn open_for_mutation_in_query( + db: &Omnigraph, + staging: &mut MutationStaging, + table_key: &str, +) -> Result<(Dataset, String, Option)> { + if let Some(staged) = staging.latest.get(table_key) { + let ds = db + .reopen_for_mutation( + table_key, + &staged.full_path, + staged.table_branch.as_deref(), + staged.table_version, + ) + .await?; + return Ok((ds, staged.full_path.clone(), staged.table_branch.clone())); + } + let (ds, full_path, table_branch) = db.open_for_mutation(table_key).await?; + staging + .expected_versions + .entry(table_key.to_string()) + .or_insert(ds.version().version); + Ok((ds, full_path, table_branch)) +} + impl Omnigraph { pub async fn mutate( &mut self, @@ -546,88 +705,51 @@ impl Omnigraph { self.ensure_schema_state_valid().await?; let requested = Self::normalize_branch_name(branch)?; let resolved_params = enrich_mutation_params(params)?; - let operation = format!( - "mutation:{}:branch={}", - query_name, - requested.as_deref().unwrap_or("main") - ); - if requested.as_deref().is_some_and(is_internal_run_branch) { - return self - .execute_named_mutation_on_branch( - requested.as_deref(), - query_source, - query_name, - &resolved_params, - ) - .await; - } + // Direct-to-target write path. Per-query staging captures pre-write + // manifest versions (publisher CAS fence) and threads dataset state + // across ops to maintain read-your-writes within a multi-statement + // query without per-op manifest commits. + let mut staging = MutationStaging::default(); - let target_branch = requested.clone().unwrap_or_else(|| "main".to_string()); - let target_head_before = self.latest_branch_snapshot_id(&target_branch).await?; - let run = self - .begin_run(&target_branch, Some(operation.as_str())) - .await?; + let current = self.active_branch().map(str::to_string); + let needs_swap = requested.as_deref() != current.as_deref(); + let previous = if needs_swap { + Some(self.swap_coordinator_for_branch(requested.as_deref()).await?) + } else { + None + }; - let staged_result = match self - .execute_named_mutation_on_branch( - Some(run.run_branch.as_str()), - query_source, - query_name, - &resolved_params, - ) - .await - { - Ok(result) => result, - Err(err) => { - let _ = self.fail_run(&run.run_id).await; - return Err(err); + let exec_result = self + .execute_named_mutation(query_source, query_name, &resolved_params, &mut staging) + .await; + + let publish_result = match exec_result { + Err(e) => Err(e), + Ok(total) => { + if staging.is_empty() { + Ok(total) + } else { + let (updates, expected_versions) = staging.into_updates(); + match self + .commit_updates_on_branch_with_expected( + requested.as_deref(), + &updates, + &expected_versions, + ) + .await + { + Ok(_) => Ok(total), + Err(e) => Err(e), + } + } } }; - let target_head_now = self.latest_branch_snapshot_id(&target_branch).await?; - if target_head_now.as_str() != target_head_before.as_str() { - let _ = self.fail_run(&run.run_id).await; - return Err(OmniError::manifest_conflict(format!( - "target branch '{}' advanced during transactional mutation; retry", - target_branch - ))); + if let Some(previous) = previous { + self.restore_coordinator(previous); } - - if let Err(err) = self.publish_run(&run.run_id).await { - let _ = self.fail_run(&run.run_id).await; - return Err(err); - } - - Ok(staged_result) - } - - async fn execute_named_mutation_on_branch( - &mut self, - branch: Option<&str>, - query_source: &str, - query_name: &str, - params: &ParamMap, - ) -> Result { - let requested = match branch { - Some(branch) => Self::normalize_branch_name(branch)?, - None => None, - }; - let current = self.active_branch().map(str::to_string); - if requested == current { - return self - .execute_named_mutation(query_source, query_name, params) - .await; - } - - let previous = self - .swap_coordinator_for_branch(requested.as_deref()) - .await?; - let result = self - .execute_named_mutation(query_source, query_name, params) - .await; - self.restore_coordinator(previous); - result + publish_result } async fn execute_named_mutation( @@ -635,6 +757,7 @@ impl Omnigraph { query_source: &str, query_name: &str, params: &ParamMap, + staging: &mut MutationStaging, ) -> Result { let query_decl = omnigraph_compiler::find_named_query(query_source, query_name) .map_err(|e| OmniError::manifest(e.to_string()))?; @@ -657,19 +780,25 @@ impl Omnigraph { MutationOpIR::Insert { type_name, assignments, - } => self.execute_insert(type_name, assignments, params).await?, + } => { + self.execute_insert(type_name, assignments, params, staging) + .await? + } MutationOpIR::Update { type_name, assignments, predicate, } => { - self.execute_update(type_name, assignments, predicate, params) + self.execute_update(type_name, assignments, predicate, params, staging) .await? } MutationOpIR::Delete { type_name, predicate, - } => self.execute_delete(type_name, predicate, params).await?, + } => { + self.execute_delete(type_name, predicate, params, staging) + .await? + } }; total.affected_nodes += result.affected_nodes; total.affected_edges += result.affected_edges; @@ -682,6 +811,7 @@ impl Omnigraph { type_name: &str, assignments: &[IRAssignment], params: &ParamMap, + staging: &mut MutationStaging, ) -> Result { let mut resolved: HashMap = HashMap::new(); for a in assignments { @@ -722,21 +852,14 @@ impl Omnigraph { )?; } let has_key = node_type.key_property().is_some(); - let (state, table_branch) = if has_key { - self.upsert_batch(type_name, true, schema, batch).await? + let table_key = format!("node:{}", type_name); + let (state, full_path, table_branch) = if has_key { + self.upsert_batch_staged(&table_key, staging, schema, batch).await? } else { - self.append_batch(type_name, true, schema, batch).await? + self.append_batch_staged(&table_key, staging, schema, batch).await? }; - let table_key = format!("node:{}", type_name); - self.commit_updates(&[crate::db::SubTableUpdate { - table_key, - table_version: state.version, - table_branch, - row_count: state.row_count, - version_metadata: state.version_metadata, - }]) - .await?; + staging.record(table_key, full_path, table_branch, &state); Ok(MutationResult { affected_nodes: 1, @@ -749,7 +872,7 @@ impl Omnigraph { let id = ulid::Ulid::new().to_string(); let batch = build_insert_batch(&schema, &id, &resolved, &blob_props)?; - validate_edge_insert_endpoints(self, type_name, &resolved).await?; + validate_edge_insert_endpoints(self, staging, type_name, &resolved).await?; crate::loader::validate_enum_constraints(&batch, &edge_type.properties, type_name)?; let unique_props = crate::loader::unique_property_names_for_edge(edge_type); if !unique_props.is_empty() { @@ -760,7 +883,10 @@ impl Omnigraph { )?; } let active_branch = self.active_branch().map(str::to_string); - let (state, table_branch) = self.append_batch(type_name, false, schema, batch).await?; + let table_key = format!("edge:{}", type_name); + let (state, full_path, table_branch) = self + .append_batch_staged(&table_key, staging, schema, batch) + .await?; crate::loader::validate_edge_cardinality( self, @@ -771,15 +897,7 @@ impl Omnigraph { ) .await?; - let table_key = format!("edge:{}", type_name); - self.commit_updates(&[crate::db::SubTableUpdate { - table_key, - table_version: state.version, - table_branch, - row_count: state.row_count, - version_metadata: state.version_metadata, - }]) - .await?; + staging.record(table_key, full_path, table_branch, &state); self.invalidate_graph_index().await; @@ -792,42 +910,37 @@ impl Omnigraph { } } - /// Append a batch to a sub-table, returning (new_version, row_count). - async fn append_batch( + /// Append a batch to a sub-table within an in-flight mutation, routing + /// through `MutationStaging` so subsequent ops in the same query see the + /// write before publish. Returns the new state, the full path, and the + /// table branch. + async fn append_batch_staged( &self, - type_name: &str, - is_node: bool, + table_key: &str, + staging: &mut MutationStaging, _schema: SchemaRef, batch: RecordBatch, - ) -> Result<(crate::table_store::TableState, Option)> { - let table_key = if is_node { - format!("node:{}", type_name) - } else { - format!("edge:{}", type_name) - }; - let (mut ds, full_path, table_branch) = self.open_for_mutation(&table_key).await?; + ) -> Result<(crate::table_store::TableState, String, Option)> { + let (mut ds, full_path, table_branch) = + open_for_mutation_in_query(self, staging, table_key).await?; let state = self .table_store() .append_batch(&full_path, &mut ds, batch) .await?; - Ok((state, table_branch)) + Ok((state, full_path, table_branch)) } - /// Upsert a batch into a sub-table using merge_insert keyed by "id". - /// Used for @key node types to enforce uniqueness. - async fn upsert_batch( + /// Upsert a batch into a sub-table using merge_insert keyed by "id", + /// routing through `MutationStaging`. + async fn upsert_batch_staged( &self, - type_name: &str, - is_node: bool, + table_key: &str, + staging: &mut MutationStaging, _schema: SchemaRef, batch: RecordBatch, - ) -> Result<(crate::table_store::TableState, Option)> { - let table_key = if is_node { - format!("node:{}", type_name) - } else { - format!("edge:{}", type_name) - }; - let (ds, full_path, table_branch) = self.open_for_mutation(&table_key).await?; + ) -> Result<(crate::table_store::TableState, String, Option)> { + let (ds, full_path, table_branch) = + open_for_mutation_in_query(self, staging, table_key).await?; let state = self .table_store() .merge_insert_batch( @@ -839,7 +952,7 @@ impl Omnigraph { lance::dataset::WhenNotMatched::InsertAll, ) .await?; - Ok((state, table_branch)) + Ok((state, full_path, table_branch)) } async fn execute_update( @@ -848,6 +961,7 @@ impl Omnigraph { assignments: &[IRAssignment], predicate: &IRMutationPredicate, params: &ParamMap, + staging: &mut MutationStaging, ) -> Result { // Defense in depth: ensure this is a node type if !self.catalog().node_types.contains_key(type_name) { @@ -872,7 +986,8 @@ impl Omnigraph { let blob_props = self.catalog().node_types[type_name].blob_properties.clone(); let table_key = format!("node:{}", type_name); - let (ds, full_path, table_branch) = self.open_for_mutation(&table_key).await?; + let (ds, full_path, table_branch) = + open_for_mutation_in_query(self, staging, &table_key).await?; let initial_version = ds.version().version; let non_blob_cols: Vec<&str> = schema @@ -947,14 +1062,7 @@ impl Omnigraph { ) .await?; - self.commit_updates(&[crate::db::SubTableUpdate { - table_key, - table_version: update_state.version, - table_branch, - row_count: update_state.row_count, - version_metadata: update_state.version_metadata, - }]) - .await?; + staging.record(table_key, full_path, table_branch, &update_state); Ok(MutationResult { affected_nodes: affected_count, @@ -967,12 +1075,15 @@ impl Omnigraph { type_name: &str, predicate: &IRMutationPredicate, params: &ParamMap, + staging: &mut MutationStaging, ) -> Result { let is_node = self.catalog().node_types.contains_key(type_name); if is_node { - self.execute_delete_node(type_name, predicate, params).await + self.execute_delete_node(type_name, predicate, params, staging) + .await } else { - self.execute_delete_edge(type_name, predicate, params).await + self.execute_delete_edge(type_name, predicate, params, staging) + .await } } @@ -981,11 +1092,13 @@ impl Omnigraph { type_name: &str, predicate: &IRMutationPredicate, params: &ParamMap, + staging: &mut MutationStaging, ) -> Result { let pred_sql = predicate_to_sql(predicate, params, false)?; let table_key = format!("node:{}", type_name); - let (ds, full_path, table_branch) = self.open_for_mutation(&table_key).await?; + let (ds, full_path, table_branch) = + open_for_mutation_in_query(self, staging, &table_key).await?; let initial_version = ds.version().version; // Scan matching IDs for cascade @@ -1032,13 +1145,12 @@ impl Omnigraph { .delete_where(&full_path, &mut ds, &pred_sql) .await?; - let mut updates = vec![crate::db::SubTableUpdate { - table_key, - table_version: delete_state.version, - table_branch: table_branch.clone(), - row_count: delete_state.row_count, - version_metadata: delete_state.version_metadata, - }]; + staging.record( + table_key.clone(), + full_path.clone(), + table_branch.clone(), + &delete_state, + ); let mut affected_edges = 0usize; let escaped: Vec = deleted_ids @@ -1069,7 +1181,7 @@ impl Omnigraph { let edge_table_key = format!("edge:{}", edge_name); let cascade_filter = cascade_filters.join(" OR "); let (mut edge_ds, edge_full_path, edge_table_branch) = - self.open_for_mutation(&edge_table_key).await?; + open_for_mutation_in_query(self, staging, &edge_table_key).await?; let edge_delete = self .table_store() @@ -1079,18 +1191,15 @@ impl Omnigraph { affected_edges += edge_delete.deleted_rows; if edge_delete.deleted_rows > 0 { - updates.push(crate::db::SubTableUpdate { - table_key: edge_table_key, - table_version: edge_delete.version, - table_branch: edge_table_branch, - row_count: edge_delete.row_count, - version_metadata: edge_delete.version_metadata, - }); + staging.record( + edge_table_key, + edge_full_path, + edge_table_branch, + &edge_delete, + ); } } - self.commit_updates(&updates).await?; - if affected_edges > 0 { self.invalidate_graph_index().await; } @@ -1106,11 +1215,13 @@ impl Omnigraph { type_name: &str, predicate: &IRMutationPredicate, params: &ParamMap, + staging: &mut MutationStaging, ) -> Result { let pred_sql = predicate_to_sql(predicate, params, true)?; let table_key = format!("edge:{}", type_name); - let (mut ds, full_path, table_branch) = self.open_for_mutation(&table_key).await?; + let (mut ds, full_path, table_branch) = + open_for_mutation_in_query(self, staging, &table_key).await?; let delete_state = self .table_store() @@ -1119,15 +1230,7 @@ impl Omnigraph { let affected = delete_state.deleted_rows; if affected > 0 { - self.commit_updates(&[crate::db::SubTableUpdate { - table_key, - table_version: delete_state.version, - table_branch, - row_count: delete_state.row_count, - version_metadata: delete_state.version_metadata, - }]) - .await?; - + staging.record(table_key, full_path, table_branch, &delete_state); self.invalidate_graph_index().await; } diff --git a/crates/omnigraph/src/loader/mod.rs b/crates/omnigraph/src/loader/mod.rs index 83d1118..7fc0737 100644 --- a/crates/omnigraph/src/loader/mod.rs +++ b/crates/omnigraph/src/loader/mod.rs @@ -154,42 +154,17 @@ impl Omnigraph { pub async fn load(&mut self, branch: &str, data: &str, mode: LoadMode) -> Result { self.ensure_schema_state_valid().await?; - let requested = Self::normalize_branch_name(branch)?.unwrap_or_else(|| "main".to_string()); - if crate::db::is_internal_run_branch(&requested) { - return self - .load_direct_on_branch(Some(requested.as_str()), data, mode) - .await; - } - - let target_head_before = self.latest_branch_snapshot_id(&requested).await?; - let op = format!("load_jsonl:branch={}:mode={}", requested, mode.as_str()); - let run = self.begin_run(&requested, Some(op.as_str())).await?; - let staged_result = match self - .load_direct_on_branch(Some(run.run_branch.as_str()), data, mode) + // Branch convention: `None` represents `main`. Re-normalizing to + // `Some("main")` here would route the publisher commit through a + // separate coordinator (the cross-branch path in + // `commit_prepared_updates_on_branch_with_expected`) and leave + // `self.coordinator` with a stale manifest snapshot. + let requested = Self::normalize_branch_name(branch)?; + // Direct-to-target writes: no Run state machine, no `__run__` staging + // branch. Cross-table OCC is enforced by the publisher's + // `expected_table_versions` CAS inside `load_jsonl_reader`. + self.load_direct_on_branch(requested.as_deref(), data, mode) .await - { - Ok(result) => result, - Err(err) => { - let _ = self.fail_run(&run.run_id).await; - return Err(err); - } - }; - - let target_head_now = self.latest_branch_snapshot_id(&requested).await?; - if target_head_now.as_str() != target_head_before.as_str() { - let _ = self.fail_run(&run.run_id).await; - return Err(OmniError::manifest_conflict(format!( - "target branch '{}' advanced during transactional load; retry", - requested - ))); - } - - if let Err(err) = self.publish_run(&run.run_id).await { - let _ = self.fail_run(&run.run_id).await; - return Err(err); - } - - Ok(staged_result) } pub async fn load_file( @@ -334,6 +309,10 @@ async fn load_jsonl_reader( let mut updates = Vec::new(); let mut result = LoadResult::default(); let snapshot = db.snapshot_for_branch(branch).await?; + // Capture per-table manifest versions before any write so the publisher + // CAS at commit-time can detect concurrent writers landing between our + // read snapshot and our publish. + let mut expected_table_versions: HashMap = HashMap::new(); // Phase 2a: build and validate every node batch up front. Cheap and // synchronous — surfaces validation errors before any S3 traffic. @@ -350,9 +329,10 @@ async fn load_jsonl_reader( } let loaded_count = batch.num_rows(); let table_key = format!("node:{}", type_name); - snapshot + let entry = snapshot .entry(&table_key) .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; + expected_table_versions.insert(table_key.clone(), entry.table_version); prepared_nodes.push((type_name.clone(), table_key, batch, loaded_count)); } @@ -428,9 +408,10 @@ async fn load_jsonl_reader( } let loaded_count = batch.num_rows(); let table_key = format!("edge:{}", edge_name); - snapshot + let entry = snapshot .entry(&table_key) .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; + expected_table_versions.insert(table_key.clone(), entry.table_version); prepared_edges.push((edge_name.clone(), table_key, batch, loaded_count)); } @@ -464,8 +445,9 @@ async fn load_jsonl_reader( } } - // Phase 4: Atomic manifest commit - db.commit_updates_on_branch(branch, &updates).await?; + // Phase 4: Atomic manifest commit with publisher-level OCC. + db.commit_updates_on_branch_with_expected(branch, &updates, &expected_table_versions) + .await?; Ok(result) } diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index 981bdc8..f9d641f 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -147,13 +147,18 @@ impl TableStore { table_key: &str, expected_version: u64, ) -> Result<()> { - if ds.version().version != expected_version { - return Err(OmniError::manifest_conflict(format!( - "version drift on {}: snapshot pinned v{} but dataset is at v{} — call sync_branch() and retry", + let actual = ds.version().version; + if actual != expected_version { + // Use the structured ExpectedVersionMismatch variant so callers + // (and the HTTP server) can match on details rather than parsing + // the message. This drift is a publisher-style OCC failure: the + // caller's pre-write view of the table version is stale relative + // to the on-disk Lance head. + return Err(OmniError::manifest_expected_version_mismatch( table_key, expected_version, - ds.version().version - ))); + actual, + )); } Ok(()) } diff --git a/crates/omnigraph/tests/consistency.rs b/crates/omnigraph/tests/consistency.rs index 0a2872f..205893c 100644 --- a/crates/omnigraph/tests/consistency.rs +++ b/crates/omnigraph/tests/consistency.rs @@ -518,10 +518,15 @@ query high_value() { assert_eq!(values.value(8), 499); } -// ─── Regression: public mutation on stale handle still applies to latest head ────────────── +// ─── Stale handle must refresh-and-retry (no silent rebase) ────────────── #[tokio::test] -async fn stale_handle_public_mutation_uses_latest_target_head() { +async fn stale_handle_public_mutation_must_refresh_then_retry() { + // MR-771: with the Run state machine removed, the engine no longer + // auto-rebases stale-handle mutations onto the latest target head. The + // publisher's `expected_table_versions` CAS makes the contract explicit + // — a stale writer fails loudly with `ExpectedVersionMismatch` and the + // client decides whether to refresh-and-retry. let dir = tempfile::tempdir().unwrap(); let _db = init_and_load(&dir).await; drop(_db); @@ -530,7 +535,7 @@ async fn stale_handle_public_mutation_uses_latest_target_head() { let mut db1 = Omnigraph::open(uri).await.unwrap(); let mut db2 = Omnigraph::open(uri).await.unwrap(); - // Writer 1 inserts — advances the Person sub-table version + // Writer 1 inserts Eve — advances the Person sub-table. mutate_main( &mut db1, MUTATION_QUERIES, @@ -540,8 +545,26 @@ async fn stale_handle_public_mutation_uses_latest_target_head() { .await .unwrap(); - // Writer 2 (stale) mutates through the public transactional path. - // It should stage from the latest target head rather than replaying a stale write. + // Writer 2 is now stale. Its first attempt must fail with + // ExpectedVersionMismatch — no silent rebase. + let stale_err = mutate_main( + &mut db2, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 99)]), + ) + .await + .expect_err("stale writer must hit ExpectedVersionMismatch"); + let omnigraph::error::OmniError::Manifest(manifest_err) = stale_err else { + panic!("expected Manifest error"); + }; + assert!(matches!( + manifest_err.details, + Some(omnigraph::error::ManifestConflictDetails::ExpectedVersionMismatch { .. }) + )); + + // Refresh and retry — the canonical client recovery path. + db2.sync_branch("main").await.unwrap(); mutate_main( &mut db2, MUTATION_QUERIES, @@ -551,6 +574,7 @@ async fn stale_handle_public_mutation_uses_latest_target_head() { .await .unwrap(); + // Both Writer 1's insert and Writer 2's update are visible. let result = query_main( &mut db2, TEST_QUERIES, diff --git a/crates/omnigraph/tests/runs.rs b/crates/omnigraph/tests/runs.rs index 21e9aff..d418f84 100644 --- a/crates/omnigraph/tests/runs.rs +++ b/crates/omnigraph/tests/runs.rs @@ -1,283 +1,43 @@ +//! Tests for the direct-to-target write path (MR-771: Run state machine +//! removed). The Run/`__run__` staging branch / RunRecord state machine no +//! longer exists; mutations and loads write directly to target tables and +//! commit once via the publisher's `expected_table_versions` CAS. +//! +//! What this file covers: +//! - No `__run__*` branches are created by load or mutate. +//! - Cancellation of a mutation future leaves no graph-level state. +//! - Concurrent writers to the same table land exactly one publish; the +//! loser surfaces `ManifestConflictDetails::ExpectedVersionMismatch`. +//! - Failed mutations and loads leave the target unchanged. +//! - Multi-statement mutations are atomic (one commit per query). +//! - actor_id propagates through to the commit graph. + mod helpers; -use std::collections::HashMap; - -use arrow_array::{Array, RecordBatch, StringArray, TimestampMicrosecondArray}; -use futures::TryStreamExt; -use lance::Dataset; - use omnigraph::db::commit_graph::CommitGraph; -use omnigraph::db::{Omnigraph, ReadTarget, RunStatus}; -use omnigraph::error::{ManifestErrorKind, OmniError}; +use omnigraph::db::{Omnigraph, ReadTarget}; +use omnigraph::error::{ManifestConflictDetails, ManifestErrorKind, OmniError}; use omnigraph::loader::{LoadMode, load_jsonl}; use helpers::*; -#[derive(Debug, Clone)] -struct PersistedRun { - run_id: String, - target_branch: String, - run_branch: String, - status: String, - updated_at: i64, -} - -async fn latest_runs(uri: &str) -> Vec { - let runs_uri = format!("{}/_graph_runs.lance", uri); - let ds = Dataset::open(&runs_uri).await.unwrap(); - let batches: Vec = ds - .scan() - .try_into_stream() - .await - .unwrap() - .try_collect() - .await - .unwrap(); - - let mut latest: HashMap = HashMap::new(); - for batch in batches { - let run_ids = batch - .column_by_name("run_id") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - let target_branches = batch - .column_by_name("target_branch") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - let run_branches = batch - .column_by_name("run_branch") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - let statuses = batch - .column_by_name("status") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - let updated_ats = batch - .column_by_name("updated_at") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - - for row in 0..batch.num_rows() { - let record = PersistedRun { - run_id: run_ids.value(row).to_string(), - target_branch: target_branches.value(row).to_string(), - run_branch: run_branches.value(row).to_string(), - status: statuses.value(row).to_string(), - updated_at: updated_ats.value(row), - }; - match latest.get(record.run_id.as_str()) { - Some(existing) if existing.updated_at >= record.updated_at => {} - _ => { - latest.insert(record.run_id.clone(), record); - } - } - } - } - - let mut records = latest.into_values().collect::>(); - records.sort_by(|a, b| a.run_id.cmp(&b.run_id)); - records -} - +/// `omnigraph load` (no `--branch`) writes directly to the target — no +/// `__run__*` staging branch is created on success. #[tokio::test] -async fn begin_run_creates_hidden_internal_branch_and_isolates_writes() { - let dir = tempfile::tempdir().unwrap(); - let mut db = init_and_load(&dir).await; - let base_snapshot = db.resolve_snapshot("main").await.unwrap(); - - let run = db.begin_run("main", Some("test-load")).await.unwrap(); - - assert!(run.run_branch.starts_with("__run__")); - assert_eq!(run.target_branch, "main"); - assert_eq!(run.base_snapshot_id, base_snapshot.as_str()); - assert_eq!(run.status, RunStatus::Running); - assert_eq!(db.branch_list().await.unwrap(), vec!["main"]); - - db.load( - &run.run_branch, - r#"{"type":"Person","data":{"name":"Eve","age":22}}"#, - LoadMode::Append, - ) - .await - .unwrap(); - - let main_qr = db - .query( - ReadTarget::branch("main"), - TEST_QUERIES, - "get_person", - ¶ms(&[("$name", "Eve")]), - ) - .await - .unwrap(); - assert_eq!(main_qr.num_rows(), 0); - - let run_qr = db - .query( - ReadTarget::branch(run.run_branch.as_str()), - TEST_QUERIES, - "get_person", - ¶ms(&[("$name", "Eve")]), - ) - .await - .unwrap(); - assert_eq!(run_qr.num_rows(), 1); -} - -#[tokio::test] -async fn publish_run_merges_internal_branch_into_target_and_marks_record() { - let dir = tempfile::tempdir().unwrap(); - let mut db = init_and_load(&dir).await; - let run = db.begin_run("main", Some("publish-test")).await.unwrap(); - - db.load( - &run.run_branch, - r#"{"type":"Person","data":{"name":"Eve","age":22}}"#, - LoadMode::Append, - ) - .await - .unwrap(); - - let published_snapshot = db.publish_run(&run.run_id).await.unwrap(); - let record = db.get_run(&run.run_id).await.unwrap(); - - assert_eq!(record.status, RunStatus::Published); - assert_eq!( - record.published_snapshot_id.as_deref(), - Some(published_snapshot.as_str()) - ); - - let main_qr = db - .query( - ReadTarget::branch("main"), - TEST_QUERIES, - "get_person", - ¶ms(&[("$name", "Eve")]), - ) - .await - .unwrap(); - assert_eq!(main_qr.num_rows(), 1); -} - -#[tokio::test] -async fn abort_run_leaves_target_unchanged_and_deletes_run_branch() { - let dir = tempfile::tempdir().unwrap(); - let mut db = init_and_load(&dir).await; - let run = db.begin_run("main", Some("abort-test")).await.unwrap(); - - db.load( - &run.run_branch, - r#"{"type":"Person","data":{"name":"Eve","age":22}}"#, - LoadMode::Append, - ) - .await - .unwrap(); - - let aborted = db.abort_run(&run.run_id).await.unwrap(); - assert_eq!(aborted.status, RunStatus::Aborted); - - let main_qr = db - .query( - ReadTarget::branch("main"), - TEST_QUERIES, - "get_person", - ¶ms(&[("$name", "Eve")]), - ) - .await - .unwrap(); - assert_eq!(main_qr.num_rows(), 0); - - let err = db - .query( - ReadTarget::branch(run.run_branch.as_str()), - TEST_QUERIES, - "get_person", - ¶ms(&[("$name", "Eve")]), - ) - .await - .unwrap_err(); - assert!( - matches!(err, OmniError::Manifest(ref e) if e.kind == ManifestErrorKind::NotFound) - || matches!(err, OmniError::Lance(_)), - "run branch should be gone after abort, got: {}", - err - ); -} - -#[tokio::test] -async fn public_branch_apis_reject_internal_run_refs() { - let dir = tempfile::tempdir().unwrap(); - let mut db = init_and_load(&dir).await; - let run = db.begin_run("main", Some("guard-test")).await.unwrap(); - - let merge_err = db.branch_merge(&run.run_branch, "main").await.unwrap_err(); - match merge_err { - OmniError::Manifest(message) => assert!(message.message.contains("internal run refs")), - other => panic!("unexpected error: {}", other), - } - - let create_err = db.branch_create(&run.run_branch).await.unwrap_err(); - match create_err { - OmniError::Manifest(message) => assert!(message.message.contains("internal run ref")), - other => panic!("unexpected error: {}", other), - } - - let delete_err = db.branch_delete(&run.run_branch).await.unwrap_err(); - match delete_err { - OmniError::Manifest(message) => assert!(message.message.contains("internal run ref")), - other => panic!("unexpected error: {}", other), - } - - let fork_err = db - .branch_create_from(ReadTarget::branch(run.run_branch.as_str()), "child") - .await - .unwrap_err(); - match fork_err { - OmniError::Manifest(message) => assert!(message.message.contains("internal run ref")), - other => panic!("unexpected error: {}", other), - } -} - -#[tokio::test] -async fn branch_delete_rejects_target_branches_with_active_runs() { - let dir = tempfile::tempdir().unwrap(); - let mut db = init_and_load(&dir).await; - db.branch_create("feature").await.unwrap(); - let run = db.begin_run("feature", Some("delete-guard")).await.unwrap(); - - let err = db.branch_delete("feature").await.unwrap_err(); - assert!(err.to_string().contains(run.run_id.as_str())); - assert!(err.to_string().contains("targeting it is running")); -} - -#[tokio::test] -async fn public_load_uses_hidden_transactional_run_and_publishes_it() { +async fn load_does_not_create_run_branch() { let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); - let result = load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) .await .unwrap(); - assert_eq!(result.nodes_loaded.len(), 2); - assert_eq!(result.edges_loaded.len(), 2); - assert_eq!(db.branch_list().await.unwrap(), vec!["main"]); - let runs = latest_runs(uri).await; - assert_eq!(runs.len(), 1); - assert_eq!(runs[0].target_branch, "main"); - assert_eq!(runs[0].status, "published"); - assert!(runs[0].run_branch.starts_with("__run__")); + assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]); + assert!( + !std::path::Path::new(&format!("{}/_graph_runs.lance", uri)).exists(), + "run state machine should not write _graph_runs.lance", + ); let qr = db .query( @@ -291,64 +51,10 @@ async fn public_load_uses_hidden_transactional_run_and_publishes_it() { assert_eq!(qr.num_rows(), 1); } +/// `omnigraph change` writes directly to the target. After the call, +/// `branch_list()` shows only `main`; no run record exists. #[tokio::test] -async fn public_load_preserves_staged_edge_ids_on_publish() { - let dir = tempfile::tempdir().unwrap(); - let uri = dir.path().to_str().unwrap(); - let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); - - let run = db.begin_run("main", Some("preserve-ids-load")).await.unwrap(); - db.load(&run.run_branch, TEST_DATA, LoadMode::Overwrite) - .await - .unwrap(); - - let mut staged_ids = collect_column_strings( - &read_table_branch(&db, run.run_branch.as_str(), "edge:Knows").await, - "id", - ); - staged_ids.sort(); - - db.publish_run(&run.run_id).await.unwrap(); - - let mut main_ids = collect_column_strings(&read_table(&db, "edge:Knows").await, "id"); - main_ids.sort(); - assert_eq!(main_ids, staged_ids); -} - -#[tokio::test] -async fn failed_public_load_marks_run_failed_and_leaves_target_unchanged() { - let dir = tempfile::tempdir().unwrap(); - let uri = dir.path().to_str().unwrap(); - let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); - - let bad = r#"{"type":"Person","data":{"name":"Alice","age":30}} -{"edge":"Knows","from":"Alice","to":"Missing"}"#; - let err = load_jsonl(&mut db, bad, LoadMode::Overwrite) - .await - .unwrap_err(); - match err { - OmniError::Manifest(message) => assert!(message.message.contains("not found in Person")), - other => panic!("unexpected error: {}", other), - } - - let runs = latest_runs(uri).await; - assert_eq!(runs.len(), 1); - assert_eq!(runs[0].status, "failed"); - assert!(runs[0].run_branch.starts_with("__run__")); - - let snap = snapshot_main(&db).await.unwrap(); - let person_count = snap - .open("node:Person") - .await - .unwrap() - .count_rows(None) - .await - .unwrap(); - assert_eq!(person_count, 0); -} - -#[tokio::test] -async fn public_mutation_uses_hidden_transactional_run_and_publishes_it() { +async fn mutation_does_not_create_run_branch() { let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); let mut db = init_and_load(&dir).await; @@ -363,62 +69,19 @@ async fn public_mutation_uses_hidden_transactional_run_and_publishes_it() { .await .unwrap(); assert_eq!(result.affected_nodes, 1); - assert_eq!(result.affected_edges, 0); - let runs = latest_runs(uri).await; - assert!(!runs.is_empty()); - let latest = runs.last().unwrap(); - assert_eq!(latest.target_branch, "main"); - assert_eq!(latest.status, "published"); - assert!(latest.run_branch.starts_with("__run__")); - - let qr = db - .query( - ReadTarget::branch("main"), - TEST_QUERIES, - "get_person", - ¶ms(&[("$name", "Eve")]), - ) - .await - .unwrap(); - assert_eq!(qr.num_rows(), 1); -} - -#[tokio::test] -async fn public_mutation_preserves_staged_edge_ids_on_publish() { - let dir = tempfile::tempdir().unwrap(); - let mut db = init_and_load(&dir).await; - - let run = db - .begin_run("main", Some("preserve-ids-mutation")) - .await - .unwrap(); - db.mutate( - run.run_branch.as_str(), - MUTATION_QUERIES, - "add_friend", - ¶ms(&[("$from", "Alice"), ("$to", "Diana")]), - ) - .await - .unwrap(); - - let mut staged_ids = collect_column_strings( - &read_table_branch(&db, run.run_branch.as_str(), "edge:Knows").await, - "id", + assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]); + assert!( + !std::path::Path::new(&format!("{}/_graph_runs.lance", uri)).exists(), + "run state machine should not write _graph_runs.lance", ); - staged_ids.sort(); - - db.publish_run(&run.run_id).await.unwrap(); - - let mut main_ids = collect_column_strings(&read_table(&db, "edge:Knows").await, "id"); - main_ids.sort(); - assert_eq!(main_ids, staged_ids); } +/// A failed mutation (validation error mid-query) leaves the target branch's +/// observable state unchanged. There is nothing for cleanup to delete. #[tokio::test] -async fn failed_public_mutation_marks_run_failed_and_leaves_target_unchanged() { +async fn failed_mutation_leaves_target_unchanged() { let dir = tempfile::tempdir().unwrap(); - let uri = dir.path().to_str().unwrap(); let mut db = init_and_load(&dir).await; let err = db @@ -435,12 +98,6 @@ async fn failed_public_mutation_marks_run_failed_and_leaves_target_unchanged() { other => panic!("unexpected error: {}", other), } - let runs = latest_runs(uri).await; - assert!(!runs.is_empty()); - let latest = runs.last().unwrap(); - assert_eq!(latest.status, "failed"); - assert!(latest.run_branch.starts_with("__run__")); - let qr = db .query( ReadTarget::branch("main"), @@ -451,64 +108,289 @@ async fn failed_public_mutation_marks_run_failed_and_leaves_target_unchanged() { .await .unwrap(); assert_eq!(qr.num_rows(), 2); + + assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]); } +/// Multi-statement mutations are atomic at the query boundary. The +/// `insert_person_and_friend` query inserts a person and an edge that +/// references it; both must land together (read-your-writes within the +/// query, single publish at the end). #[tokio::test] -async fn concurrent_conflicting_run_publish_fails_cleanly() { +async fn multi_statement_mutation_is_atomic_with_read_your_writes() { let dir = tempfile::tempdir().unwrap(); let mut db = init_and_load(&dir).await; - let run_a = db.begin_run("main", Some("conflict-a")).await.unwrap(); - let run_b = db.begin_run("main", Some("conflict-b")).await.unwrap(); + let result = db + .mutate( + "main", + MUTATION_QUERIES, + "insert_person_and_friend", + &mixed_params( + &[("$name", "Eve"), ("$friend", "Alice")], + &[("$age", 22)], + ), + ) + .await + .unwrap(); + assert_eq!(result.affected_nodes, 1); + assert_eq!(result.affected_edges, 1); - db.mutate( - run_a.run_branch.as_str(), - MUTATION_QUERIES, - "set_age", - &mixed_params(&[("$name", "Alice")], &[("$age", 31)]), - ) - .await - .unwrap(); - db.mutate( - run_b.run_branch.as_str(), - MUTATION_QUERIES, - "set_age", - &mixed_params(&[("$name", "Alice")], &[("$age", 32)]), - ) - .await - .unwrap(); - - db.publish_run(&run_a.run_id).await.unwrap(); - let publish_b = db.publish_run(&run_b.run_id).await; - assert!(publish_b.is_err(), "second conflicting publish should fail"); - let err = publish_b.unwrap_err().to_string(); - assert!( - err.contains("conflict") || err.contains("divergent") || err.contains("Alice"), - "unexpected conflict error: {}", - err - ); - - let alice = db + // Both writes are visible after one publish. + let person = db .query( ReadTarget::branch("main"), TEST_QUERIES, "get_person", - ¶ms(&[("$name", "Alice")]), + ¶ms(&[("$name", "Eve")]), ) .await .unwrap(); - let rows = alice.to_rust_json(); - assert_eq!(alice.num_rows(), 1); - assert_eq!(rows[0]["p.age"], serde_json::json!(31)); + assert_eq!(person.num_rows(), 1); - let run_a_record = db.get_run(&run_a.run_id).await.unwrap(); - assert_eq!(run_a_record.status, RunStatus::Published); - let run_b_record = db.get_run(&run_b.run_id).await.unwrap(); - assert_eq!(run_b_record.status, RunStatus::Running); + let friends = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(friends.num_rows(), 1); } +/// Mid-query partial failure: op-1 writes a Lance fragment, op-2 fails. +/// Documents the *current* observable behavior — not the desired one. The +/// publisher never publishes (good — no manifest commit, target state +/// unchanged), but Lance HEAD on the touched table is now ahead of the +/// manifest-recorded version. The next mutation against that table fails +/// loudly with `ExpectedVersionMismatch` (the engine's +/// `ensure_expected_version` strict-equality check refuses the drift). +/// +/// **Known limitation, MR-771 follow-up**: a proper rollback requires +/// per-table Lance branches (write to a transient branch, fast-forward +/// main on success, drop on failure). Lance's `restore()` is not a rewind +/// — it appends a new commit, advancing HEAD further. See `docs/runs.md` +/// for the workaround (rare in practice: most validation runs before any +/// Lance write, so this only fires on multi-statement queries where a +/// late op fails after an earlier op committed). #[tokio::test] -async fn public_mutation_records_actor_on_run_and_published_commit() { +async fn partial_failure_observably_rolls_back_but_blocks_next_mutation_on_same_table() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // Op-1 inserts Person "Eve" successfully (advancing Lance HEAD on + // node:Person). Op-2 inserts Knows from Eve to "Missing" — fails at + // validate_edge_insert_endpoints because "Missing" doesn't exist. + // The query as a whole errors; the publisher never runs. + let err = db + .mutate( + "main", + MUTATION_QUERIES, + "insert_person_and_friend", + &mixed_params( + &[("$name", "Eve"), ("$friend", "Missing")], + &[("$age", 22)], + ), + ) + .await + .expect_err("op-2 must fail"); + let OmniError::Manifest(manifest_err) = err else { + panic!("expected Manifest error, got {err:?}"); + }; + assert!( + manifest_err.message.contains("not found"), + "unexpected error: {}", + manifest_err.message, + ); + + // Atomicity at the manifest level: Eve is *not* observable. The Lance + // fragment from op-1 exists on disk but is not referenced by the + // manifest; readers at the manifest's pinned version see the + // pre-mutation state. + let eve = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(eve.num_rows(), 0, "partial Lance write must not be visible"); + + // The next mutation against the *same* table fails loudly. Other + // tables are unaffected, and reads still work. + let blocked = db + .mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Frank")], &[("$age", 33)]), + ) + .await + .expect_err("next mutation on the touched table is blocked by orphan Lance HEAD"); + let OmniError::Manifest(blocked_err) = blocked else { + panic!("expected Manifest error, got {blocked:?}"); + }; + assert!(matches!( + blocked_err.details, + Some(omnigraph::error::ManifestConflictDetails::ExpectedVersionMismatch { .. }) + )); +} + +/// Concurrent writers to the same `(table, branch)` produce exactly one +/// success and one `ExpectedVersionMismatch`. The replacement for the old +/// `concurrent_conflicting_run_publish_fails_cleanly` test — the OCC fence +/// has moved from a graph-level run-publish merge into the publisher's +/// per-table CAS (MR-766 + MR-771). +/// +/// Drives the race by interleaving two handles that captured the same +/// pre-write manifest snapshot: A commits first; B's commit then sees +/// `expected_versions[node:Person] = pre` while the manifest is at +/// `pre + 1`, and the publisher rejects. +#[tokio::test] +async fn concurrent_writers_one_succeeds_one_gets_expected_version_mismatch() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_string_lossy().into_owned(); + + { + let mut db = Omnigraph::init(&uri, TEST_SCHEMA).await.unwrap(); + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + } + + // Open handle B first — it captures the pre-write snapshot. We don't + // actually mutate yet; we just want B's coordinator to be at the + // pre-A-commit state when we eventually call mutate. + let mut db_b = Omnigraph::open(&uri).await.unwrap(); + + // Writer A advances the manifest by inserting a new Person. + { + let mut db_a = Omnigraph::open(&uri).await.unwrap(); + db_a.mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "WriterA")], &[("$age", 41)]), + ) + .await + .unwrap(); + } + + // Writer B's coordinator is still at the pre-A snapshot. Its mutation + // captures expected_versions[node:Person] = pre (stale), then publishes + // — the publisher's CAS pre-check sees the manifest is now at post and + // rejects with ExpectedVersionMismatch. + let result_b = db_b + .mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "WriterB")], &[("$age", 42)]), + ) + .await; + + let err = result_b.expect_err("stale writer must hit ExpectedVersionMismatch"); + let OmniError::Manifest(manifest_err) = err else { + panic!("expected Manifest error, got {err:?}"); + }; + assert_eq!(manifest_err.kind, ManifestErrorKind::Conflict); + let Some(ManifestConflictDetails::ExpectedVersionMismatch { + ref table_key, + expected, + actual, + }) = manifest_err.details + else { + panic!( + "expected ExpectedVersionMismatch, got {:?}", + manifest_err.details, + ); + }; + assert_eq!(table_key, "node:Person"); + assert!( + actual > expected, + "actual ({actual}) should be ahead of expected ({expected})", + ); +} + +/// The cancellation hole that motivated MR-771: dropping a mutation future +/// mid-flight must not leave any graph-level state behind. With the run +/// state machine gone, only orphaned Lance fragments can remain — and those +/// are reclaimed by `omnigraph cleanup`. Verify by aborting the future and +/// asserting the branch list and manifest version are unchanged. +#[tokio::test] +async fn cancelled_mutation_future_leaves_no_state() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_string_lossy().into_owned(); + + { + let mut db = Omnigraph::init(&uri, TEST_SCHEMA).await.unwrap(); + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + } + + let manifest_version_before = { + let db = Omnigraph::open(&uri).await.unwrap(); + db.snapshot_of(ReadTarget::branch("main")) + .await + .unwrap() + .version() + }; + let branches_before = { + let db = Omnigraph::open(&uri).await.unwrap(); + db.branch_list().await.unwrap() + }; + + let uri_handle = uri.clone(); + let handle = tokio::spawn(async move { + let mut db = Omnigraph::open(&uri_handle).await.unwrap(); + db.mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + }); + + // Cancel the future. Whether the in-flight write managed to land a + // fragment is timing-dependent and irrelevant; what matters is that no + // graph-level state remains pointing at it. + handle.abort(); + let _ = handle.await; + + let db = Omnigraph::open(&uri).await.unwrap(); + let branches_after = db.branch_list().await.unwrap(); + let manifest_version_after = db + .snapshot_of(ReadTarget::branch("main")) + .await + .unwrap() + .version(); + + assert_eq!( + branches_after, branches_before, + "cancelled mutation must not leave a __run__* branch behind", + ); + assert_eq!( + manifest_version_after, manifest_version_before, + "cancelled mutation must not advance the manifest", + ); + assert!( + !std::path::Path::new(&format!("{}/_graph_runs.lance", uri)).exists(), + "no _graph_runs.lance after cancel — state machine is gone", + ); +} + +/// `actor_id` provided to `mutate_as` reaches the commit graph so audit can +/// reconstruct who published which commit. This used to be plumbed via the +/// run record; now it goes directly through the publisher and +/// `record_graph_commit`. +#[tokio::test] +async fn mutation_actor_id_lands_in_commit_graph() { let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); let mut db = init_and_load(&dir).await; @@ -523,14 +405,6 @@ async fn public_mutation_records_actor_on_run_and_published_commit() { .await .unwrap(); - let runs = db.list_runs().await.unwrap(); - let run = runs - .iter() - .find(|run| run.operation_hash.as_deref() == Some("mutation:set_age:branch=main")) - .expect("published mutation run should exist"); - assert_eq!(run.actor_id.as_deref(), Some("act-andrew")); - assert_eq!(run.status, RunStatus::Published); - let head = CommitGraph::open(uri) .await .unwrap() @@ -541,12 +415,11 @@ async fn public_mutation_records_actor_on_run_and_published_commit() { assert_eq!(head.actor_id.as_deref(), Some("act-andrew")); } +/// Repeated loads must not accumulate `__run__*` branches across calls. In +/// the post-demotion world there are no run branches at all — verify that +/// 10 sequential loads end with `branch_list() == ["main"]`. #[tokio::test] -async fn run_branches_do_not_accumulate_across_repeated_loads() { - // MR-674: run branches are transactional scaffolding. Every terminal - // state (Published, Aborted, Failed) deletes the branch. Verifying the - // invariant end-to-end: after 10 publishes and one abort, only main - // should remain. +async fn repeated_loads_do_not_accumulate_branches() { let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); @@ -561,41 +434,37 @@ async fn run_branches_do_not_accumulate_across_repeated_loads() { .unwrap(); } - let aborted_run = db.begin_run("main", Some("abort-me")).await.unwrap(); - db.abort_run(&aborted_run.run_id).await.unwrap(); - assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]); - let all_branches = Omnigraph::open(uri) - .await - .unwrap() - .branch_list() - .await - .unwrap(); - assert_eq!(all_branches, vec!["main".to_string()]); } +/// User code must not be able to write to internal `__run__*` names. The +/// branch-name guard predicate is kept as defense-in-depth (MR-770 will +/// remove it once production legacy branches are swept). #[tokio::test] -async fn failed_load_deletes_run_branch() { +async fn public_branch_apis_reject_internal_run_refs() { let dir = tempfile::tempdir().unwrap(); - let uri = dir.path().to_str().unwrap(); - let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + let mut db = init_and_load(&dir).await; - let bad = r#"{"type":"Person","data":{"name":"Alice","age":30}} -{"edge":"Knows","from":"Alice","to":"Missing"}"#; - let _ = load_jsonl(&mut db, bad, LoadMode::Overwrite).await; + let create_err = db.branch_create("__run__synthetic").await.unwrap_err(); + let OmniError::Manifest(err) = create_err else { + panic!("expected Manifest error"); + }; + assert!( + err.message.contains("internal run ref"), + "unexpected error: {}", + err.message + ); - let runs = latest_runs(uri).await; - assert_eq!(runs.len(), 1); - assert_eq!(runs[0].status, "failed"); - - let err = db - .snapshot_of(ReadTarget::branch(runs[0].run_branch.as_str())) + let merge_err = db + .branch_merge("__run__synthetic", "main") .await .unwrap_err(); + let OmniError::Manifest(err) = merge_err else { + panic!("expected Manifest error"); + }; assert!( - matches!(err, OmniError::Manifest(ref e) if e.kind == ManifestErrorKind::NotFound) - || matches!(err, OmniError::Lance(_)), - "failed run's branch should be gone, got: {}", - err + err.message.contains("internal run refs"), + "unexpected error: {}", + err.message ); } diff --git a/crates/omnigraph/tests/s3_storage.rs b/crates/omnigraph/tests/s3_storage.rs index a7c26ea..3a3d2be 100644 --- a/crates/omnigraph/tests/s3_storage.rs +++ b/crates/omnigraph/tests/s3_storage.rs @@ -44,27 +44,16 @@ async fn s3_compatible_repo_lifecycle_works() { .await .unwrap(); - let run = reopened - .begin_run("main", Some("s3-runtime-run")) - .await - .unwrap(); + // Direct-to-target load (MR-771): no run lifecycle, single publisher + // commit lands the row. reopened .load( - &run.run_branch, + "main", r#"{"type":"Person","data":{"name":"RunOnly","age":31}}"#, LoadMode::Append, ) .await .unwrap(); - reopened.publish_run(&run.run_id).await.unwrap(); - - let runs = reopened.list_runs().await.unwrap(); - assert!( - runs.iter() - .any(|record| { record.run_id == run.run_id && record.status.as_str() == "published" }), - "expected published run record in {:?}", - runs - ); let mut reopened_again = Omnigraph::open(&uri).await.unwrap(); let eve = query_main( @@ -164,15 +153,8 @@ async fn s3_public_load_uses_hidden_run_and_publishes() { .await .unwrap(); - let runs = db.list_runs().await.unwrap(); - assert!( - runs.iter().any(|record| { - record.target_branch == "main" && record.status.as_str() == "published" - }), - "expected published transactional run in {:?}", - runs - ); - + // Direct-to-target writes (MR-771): no run state machine, just the + // published commit lands the row. Verify by reopening and reading. let mut reopened = Omnigraph::open(&uri).await.unwrap(); let loaded = query_main( &mut reopened, diff --git a/docs/releases/v0.4.0.md b/docs/releases/v0.4.0.md new file mode 100644 index 0000000..dbab789 --- /dev/null +++ b/docs/releases/v0.4.0.md @@ -0,0 +1,89 @@ +# Omnigraph v0.4.0 + +Omnigraph v0.4.0 demotes the Run state machine to commit metadata via the +publisher's CAS, fixing the cancellation hole that motivated MR-771 and +reducing the engine's surface area. + +## Highlights + +- **Direct-to-target writes (MR-771)**: `mutate_as` and `load` write + directly to the target tables and call + `ManifestBatchPublisher::publish` once at the end with + `expected_table_versions`. No more `__run__` staging branches, no + more `RunRecord` state machine. Cross-table OCC is enforced inside the + publisher's row-level CAS on `__manifest`. +- **Cancellation safety by construction**: a dropped mutation future + leaves no graph-level state — only orphaned Lance fragments, reclaimed + by `omnigraph cleanup`. The "zombie run" cascade documented in + `.context/zombie-run-investigation.md` is gone. +- **Read-your-writes inside multi-statement mutations**: a `.gq` query + that inserts and then references a row in the same statement now sees + its own writes via an in-process `MutationStaging` cache, even though + no manifest commit happens between ops. +- **Structured conflict surface**: concurrent writers race through the + publisher's CAS; the loser surfaces as + `ManifestConflictDetails::ExpectedVersionMismatch { table_key, + expected, actual }`. The HTTP server maps this to **409 Conflict** with + a structured `manifest_conflict` body so clients can detect-and-retry + without parsing the message. + +## Removed + +This is a breaking release. Pre-0.4.0 / no SLA. + +- `omnigraph::db::{RunRecord, RunStatus, RunId}` types and the + `_graph_runs.lance` / `_graph_run_actors.lance` Lance datasets. +- Engine APIs `begin_run`, `begin_run_as`, `publish_run`, + `publish_run_as`, `abort_run`, `fail_run`, `terminate_run`, + `list_runs`, `get_run`. +- HTTP endpoints: `GET /runs`, `GET /runs/{run_id}`, `POST + /runs/{run_id}/publish`, `POST /runs/{run_id}/abort`. The + `RunListOutput` and `RunOutput` schemas are removed from the OpenAPI + document. +- CLI subcommands: `omnigraph run list`, `omnigraph run show`, `omnigraph + run publish`, `omnigraph run abort`. Use `omnigraph commit list` + reading the commit graph for audit history. +- Cedar policy actions `run_publish` and `run_abort`. Existing + `policy.yaml` files referencing these actions will fail validation — + remove the rules; the `change` action covers the equivalent gating. + +## Behavior changes + +- `mutate_as` / `load` are now **atomic per query, single publish at the + end**. A failed mutation leaves the target unchanged with no + intermediate manifest commits. +- The `OmniError::manifest_conflict` shape produced by concurrent + writers is now `ExpectedVersionMismatch` (was `MergeConflict::DivergentUpdate` + via the run merge path). Clients that match on the conflict body must + switch to inspecting `manifest_conflict.table_key/expected/actual`. + +## Known limitation + +A multi-statement mutation that writes a Lance fragment in op-N and then +fails in op-N+1 leaves the touched table with Lance HEAD ahead of the +manifest. The next mutation against that table fails with +`ExpectedVersionMismatch`. Most validation runs before any Lance write, +so single-statement mutations are unaffected; the narrow path is +multi-statement queries with late-op failures. Tracked as a follow-up; +see [docs/runs.md](../runs.md#known-limitation-mid-query-partial-failure-on-the-same-table) +for the workaround. + +## Upgrade notes + +- **Stale `__run__*` branches and `_graph_runs.lance`** in legacy v0.3.x + repos are *inert* — the engine no longer reads them — but they remain + on disk until production cleanup. MR-770 owns the destructive sweep; + this release deliberately does not touch legacy bytes. +- The `is_internal_run_branch` predicate is kept as a defense-in-depth + guard against users naming a branch `__run__*`. It will be removed in + a follow-up alongside MR-770. +- External scripts hitting `/runs/*` will now receive 404. Migrate them + to `/commits` for audit history; mutation status is implied by the + HTTP response on `/change` itself. + +## Included Changes + +- MR-771 — Demote Run: write directly to target via publisher +- MR-766 — `ManifestBatchPublisher::publish` accepts per-table + `expected_table_versions` (landed earlier; this release wires it in + end-to-end) diff --git a/docs/runs.md b/docs/runs.md index 5150ec5..789d91d 100644 --- a/docs/runs.md +++ b/docs/runs.md @@ -1,38 +1,98 @@ -# Runs (transactional graph mutations) +# Runs — REMOVED (MR-771) -`db/run_registry.rs` + run lifecycle in `db/omnigraph.rs`. Stored in `_graph_runs.lance` and `_graph_run_actors.lance`. +The Run state machine and `__run__` staging branches were removed in +MR-771. `mutate_as` and `load` now write **directly to the target table** +and call `ManifestBatchPublisher::publish` once at the end with +`expected_table_versions` (the per-table manifest versions captured before +the first write). Cross-table OCC is enforced inside the publisher; the +publisher's row-level CAS on `__manifest` is the single fence. -## RunRecord +## What this means in practice -``` -RunRecord { - run_id: RunId (ULID), - target_branch: String, // where the run will publish - run_branch: "__run__", // ephemeral isolation branch - base_snapshot_id: String, - base_manifest_version: u64, - operation_hash: Option, // idempotency key - actor_id: Option, - status: Running | Published | Failed | Aborted, - published_snapshot_id: Option, - created_at, updated_at: i64 (microseconds), -} -``` +- No `RunRecord`, no `_graph_runs.lance`, no `_graph_run_actors.lance`. +- No `omnigraph run *` CLI subcommands and no `/runs/*` HTTP endpoints. +- No `__run__` staging branches. (Legacy on-disk artifacts from + pre-MR-771 repos are inert; MR-770 sweeps them in production.) +- Cancelled mutation futures leave **no graph-level state** — only orphaned + Lance fragments, which the existing `omnigraph cleanup` pipe reclaims. -## Lifecycle +## Read-your-writes within a multi-statement mutation -1. `begin_run(target_branch, op_hash)` / `begin_run_as(target_branch, op_hash, actor_id)` — forks `__run__` from the target's current head, appends a `RunRecord`. -2. Mutations on `run_branch` (via the normal write APIs) — isolated from concurrent activity on the target. -3. `publish_run(id)` / `publish_run_as(id, actor)`: - - **Fast path**: if the target hasn't moved since `base_snapshot_id`, promote the run snapshot directly. - - **Merge path**: if it has moved, perform a three-way merge (see [merge.md](merge.md)) into the target. - - On success: `status = Published`, `published_snapshot_id` set, run branch cleaned up asynchronously. -4. `abort_run(id)` / `fail_run(id)` — terminal; cleans up run branch best-effort. +A `.gq` query with multiple ops (e.g. `insert Person … insert Knows …`) +must observe earlier ops' writes when validating later ops (referential +integrity, edge cardinality). After demotion this is implemented via an +in-process `MutationStaging` accumulator in +`crates/omnigraph/src/exec/mutation.rs`: -## Idempotency +- On the first touch of each table, the pre-write manifest version is + captured into `expected_versions[table_key]`. +- Subsequent ops on the same table re-open the dataset at the locally + staged Lance version (bypassing the manifest, which has not been + committed yet) so they see prior writes. +- One `commit_with_expected(updates, expected_versions)` at the end + publishes the lot atomically. Cross-table conflicts surface as + `ManifestConflictDetails::ExpectedVersionMismatch`. -`operation_hash` is an optional field clients can use to detect a duplicate `begin_run` retry. +This upholds [docs/invariants.md §VI.23](invariants.md) (atomicity per +query) and §VI.25 (read-your-writes within a multi-statement mutation — +previously aspirational, now upheld). -## Cleanup +## Conflict shape -`cleanup_terminal_run_branches_for_target(branch)` is called as branches change; failures are swallowed (lazy cleanup on next branch op). +Concurrent writers to the same `(table, branch)` produce exactly one +success and one failure. The losing writer's error is +`OmniError::Manifest` with kind `Conflict` and details +`ManifestConflictDetails::ExpectedVersionMismatch { table_key, expected, +actual }`. The HTTP server maps this to **409 Conflict** with body +`{"error": "...", "code": "conflict", "manifest_conflict": { "table_key": +"...", "expected": N, "actual": M }}` — see [docs/server.md](server.md). + +## Audit + +`actor_id` lands in `_graph_commits.lance` via `record_graph_commit` (no +intermediate run record). Audit history is queried via `omnigraph commit +list`. + +## Migration code + +`db/manifest/migrations.rs` does not change. Active deletion of +`_graph_runs.lance` belongs in MR-770 (the production sweep) — this PR +stops *creating* run state but does not destroy legacy bytes on disk. + +## Known limitation: mid-query partial failure on the same table + +A multi-statement `.gq` mutation where op-N writes a Lance fragment +successfully and op-N+1 then fails leaves the touched table at +`Lance HEAD = manifest_version + 1`. The query is atomic at the manifest +level (the publisher never publishes, so reads at the pinned manifest +version do *not* see op-N's data), but the *next* mutation against the +same table fails loudly with +`ManifestConflictDetails::ExpectedVersionMismatch` because +`ensure_expected_version` enforces strict equality between Lance HEAD and +the manifest's pinned version. + +**Why the engine doesn't auto-rollback**: Lance's `Dataset::restore()` is +*not* a rewind — it appends a new commit (containing the desired +historical version's data) and advances HEAD further. There is no Lance +API to delete a committed version. A proper fix requires writing each +mutation's per-table fragments to a *transient Lance branch* on the +sub-table, then fast-forwarding main on success or dropping the branch +on failure. That work is tracked as a follow-up to MR-771; in the +meantime: + +- **In practice this is rare.** Most schema-language validation + (`@key`, `@enum`, `@range`, intra-batch uniqueness, edge-endpoint + existence) runs *before* any Lance write inside the failing op, so + single-statement mutations never trip this. The narrow path is + multi-statement queries (`insert ... insert ...`, + `insert ... update ...`) where a late op fails on validation that + depends on earlier ops' staged data. +- **Workaround**: callers that hit this should refresh the handle and + retry the mutation; if Lance HEAD remains drifted the + `omnigraph cleanup` command will GC the orphan version once a later + successful commit on the same table moves HEAD past it. (`cleanup` + cannot reclaim an orphan that *is* the current Lance HEAD; that case + needs the per-table-branch follow-up to fully heal.) + +The cancellation case (future drop mid-mutation) has the same shape and +the same workaround. diff --git a/docs/server.md b/docs/server.md index 3c6242d..c705635 100644 --- a/docs/server.md +++ b/docs/server.md @@ -19,10 +19,6 @@ Axum 0.8 + tokio + utoipa-generated OpenAPI. Single repo per process; deploy mul | POST | `/branches` | bearer + `branch_create` | create | `server_branch_create` | | DELETE | `/branches/{branch}` | bearer + `branch_delete` | delete | `server_branch_delete` | | POST | `/branches/merge` | bearer + `branch_merge` | merge `source → target` | `server_branch_merge` | -| GET | `/runs` | bearer + `read` | list | `server_run_list` | -| GET | `/runs/{run_id}` | bearer + `read` | show | `server_run_show` | -| POST | `/runs/{run_id}/publish` | bearer + `run_publish` | publish | `server_run_publish` | -| POST | `/runs/{run_id}/abort` | bearer + `run_abort` | abort | `server_run_abort` | | GET | `/commits?branch=` | bearer + `read` | list | `server_commit_list` | | GET | `/commits/{commit_id}` | bearer + `read` | show | `server_commit_show` | @@ -32,7 +28,14 @@ Only `/export` streams (`application/x-ndjson`, MPSC channel + `Body::from_strea ## Error model -Uniform `ErrorOutput { error, code?, merge_conflicts[] }` with `code ∈ unauthorized | forbidden | bad_request | not_found | conflict | internal`. Merge conflicts attach structured `MergeConflictOutput { table_key, row_id?, kind, message }`. +Uniform `ErrorOutput { error, code?, merge_conflicts[], manifest_conflict? }` with `code ∈ unauthorized | forbidden | bad_request | not_found | conflict | internal`. Merge conflicts attach structured `MergeConflictOutput { table_key, row_id?, kind, message }`. + +`manifest_conflict` is set on **publisher CAS rejections** (HTTP 409): the +caller's pre-write view of one table's manifest version was stale. +`ManifestConflictOutput { table_key, expected, actual }` tells the client +which table to refresh and retry. This is the conflict shape produced by +concurrent `/change` or `/ingest` calls landing the same `(table, branch)` +race (MR-771 / MR-766). HTTP status codes used: 200, 400, 401, 403, 404, 409, 500. @@ -64,5 +67,5 @@ See [deployment.md](deployment.md) for token-source operational details. - CORS — not configured; add `tower_http::cors` if needed. - Rate limiting — none. -- Pagination — none (commits/branches/runs return everything; export streams). +- Pagination — none (commits/branches return everything; export streams). - Multi-tenant routing — one repo per process. diff --git a/openapi.json b/openapi.json index 172bc8d..60bfd61 100644 --- a/openapi.json +++ b/openapi.json @@ -701,257 +701,6 @@ ] } }, - "/runs": { - "get": { - "tags": [ - "runs" - ], - "summary": "List all runs.", - "description": "A run is an ephemeral branch produced by an agent or background job. The\nlist includes pending, in-progress, published, and aborted runs across\nall target branches. Read-only.", - "operationId": "listRuns", - "responses": { - "200": { - "description": "List of runs", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RunListOutput" - } - } - } - }, - "401": { - "description": "Unauthorized", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorOutput" - } - } - } - }, - "403": { - "description": "Forbidden", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorOutput" - } - } - } - } - }, - "security": [ - { - "bearer_token": [] - } - ] - } - }, - "/runs/{run_id}": { - "get": { - "tags": [ - "runs" - ], - "summary": "Get a single run.", - "description": "Returns the run's status, target/run branches, base snapshot, and (if\npublished) the resulting snapshot id. Read-only.", - "operationId": "getRun", - "parameters": [ - { - "name": "run_id", - "in": "path", - "description": "Run identifier", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Run details", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RunOutput" - } - } - } - }, - "401": { - "description": "Unauthorized", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorOutput" - } - } - } - }, - "403": { - "description": "Forbidden", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorOutput" - } - } - } - }, - "404": { - "description": "Run not found", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorOutput" - } - } - } - } - }, - "security": [ - { - "bearer_token": [] - } - ] - } - }, - "/runs/{run_id}/abort": { - "post": { - "tags": [ - "runs" - ], - "summary": "Abort a run.", - "description": "Marks the run as aborted and releases its working branch. **Irreversible**:\nthe run cannot be resumed once aborted.", - "operationId": "abortRun", - "parameters": [ - { - "name": "run_id", - "in": "path", - "description": "Run identifier", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Run aborted", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RunOutput" - } - } - } - }, - "401": { - "description": "Unauthorized", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorOutput" - } - } - } - }, - "403": { - "description": "Forbidden", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorOutput" - } - } - } - }, - "404": { - "description": "Run not found", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorOutput" - } - } - } - } - }, - "security": [ - { - "bearer_token": [] - } - ] - } - }, - "/runs/{run_id}/publish": { - "post": { - "tags": [ - "runs" - ], - "summary": "Publish a run to its target branch.", - "description": "Promotes the run's snapshot onto its `target_branch` as a new commit. The\nrun must be in a publishable state. **Destructive** to the target branch.", - "operationId": "publishRun", - "parameters": [ - { - "name": "run_id", - "in": "path", - "description": "Run identifier", - "required": true, - "schema": { - "type": "string" - } - } - ], - "responses": { - "200": { - "description": "Run published", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/RunOutput" - } - } - } - }, - "401": { - "description": "Unauthorized", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorOutput" - } - } - } - }, - "403": { - "description": "Forbidden", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorOutput" - } - } - } - }, - "404": { - "description": "Run not found", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorOutput" - } - } - } - } - }, - "security": [ - { - "bearer_token": [] - } - ] - } - }, "/schema": { "get": { "tags": [ @@ -1413,6 +1162,17 @@ "error": { "type": "string" }, + "manifest_conflict": { + "oneOf": [ + { + "type": "null" + }, + { + "$ref": "#/components/schemas/ManifestConflictOutput", + "description": "Set when the conflict is a publisher CAS rejection\n(`ManifestConflictDetails::ExpectedVersionMismatch`). The caller's\npre-write view of `table_key` was at version `expected` but the\nmanifest is now at `actual`. Refresh and retry." + } + ] + }, "merge_conflicts": { "type": "array", "items": { @@ -1571,6 +1331,30 @@ "merge" ] }, + "ManifestConflictOutput": { + "type": "object", + "description": "Structured details for a publisher-level OCC failure. Surfaces alongside\nHTTP 409 when a write was rejected because the caller's pre-write view of\none table's manifest version was stale relative to the current head. The\nexpected/actual fields tell the client which table to refresh.", + "required": [ + "table_key", + "expected", + "actual" + ], + "properties": { + "actual": { + "type": "integer", + "format": "int64", + "minimum": 0 + }, + "expected": { + "type": "integer", + "format": "int64", + "minimum": 0 + }, + "table_key": { + "type": "string" + } + } + }, "MergeConflictKindOutput": { "type": "string", "enum": [ @@ -1690,85 +1474,6 @@ } } }, - "RunListOutput": { - "type": "object", - "required": [ - "runs" - ], - "properties": { - "runs": { - "type": "array", - "items": { - "$ref": "#/components/schemas/RunOutput" - } - } - } - }, - "RunOutput": { - "type": "object", - "required": [ - "run_id", - "target_branch", - "run_branch", - "base_snapshot_id", - "base_manifest_version", - "status", - "created_at", - "updated_at" - ], - "properties": { - "actor_id": { - "type": [ - "string", - "null" - ] - }, - "base_manifest_version": { - "type": "integer", - "format": "int64", - "minimum": 0 - }, - "base_snapshot_id": { - "type": "string" - }, - "created_at": { - "type": "integer", - "format": "int64", - "description": "Run creation time as Unix epoch microseconds.", - "example": 1714000000000000 - }, - "operation_hash": { - "type": [ - "string", - "null" - ] - }, - "published_snapshot_id": { - "type": [ - "string", - "null" - ] - }, - "run_branch": { - "type": "string" - }, - "run_id": { - "type": "string" - }, - "status": { - "type": "string" - }, - "target_branch": { - "type": "string" - }, - "updated_at": { - "type": "integer", - "format": "int64", - "description": "Last status change as Unix epoch microseconds.", - "example": 1714000000000000 - } - } - }, "SchemaApplyOutput": { "type": "object", "required": [