MR-771: demote Run to direct-publish via expected_table_versions CAS

mutate_as and load now write directly to target tables and call the
publisher once at the end with per-table expected versions; the Run
state machine, _graph_runs.lance writers, __run__ staging branches,
and server /runs/* endpoints are removed. Multi-statement mutations
remain atomic at the manifest level via an in-memory MutationStaging
accumulator that gives read-your-writes within a query and a single
publish at the end. Concurrent-writer conflicts surface as
ExpectedVersionMismatch (HTTP 409 manifest_conflict) instead of the
old DivergentUpdate merge shape. Documents one known limitation in
docs/runs.md: a multi-statement mid-query failure where op-N writes
a Lance fragment and op-N+1 fails leaves Lance HEAD ahead of the
manifest until a follow-up introduces per-table Lance branches.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Ragnor Comerford 2026-04-30 08:52:50 +02:00
parent 4e5374a85e
commit 35be20cb05
No known key found for this signature in database
28 changed files with 1188 additions and 3216 deletions

View file

@ -1,6 +1,4 @@
use omnigraph::db::{
GraphCommit, MergeOutcome, ReadTarget, RunRecord, SchemaApplyResult, Snapshot,
};
use omnigraph::db::{GraphCommit, MergeOutcome, ReadTarget, SchemaApplyResult, Snapshot};
use omnigraph::error::{MergeConflict, MergeConflictKind};
use omnigraph::loader::{IngestResult, LoadMode};
use omnigraph_compiler::SchemaMigrationStep;
@ -41,30 +39,6 @@ pub struct SnapshotOutput {
pub tables: Vec<SnapshotTableOutput>,
}
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
pub struct RunOutput {
pub run_id: String,
pub target_branch: String,
pub run_branch: String,
pub base_snapshot_id: String,
pub base_manifest_version: u64,
pub operation_hash: Option<String>,
pub actor_id: Option<String>,
pub status: String,
pub published_snapshot_id: Option<String>,
/// Run creation time as Unix epoch microseconds.
#[schema(example = 1714000000000000i64)]
pub created_at: i64,
/// Last status change as Unix epoch microseconds.
#[schema(example = 1714000000000000i64)]
pub updated_at: i64,
}
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
pub struct RunListOutput {
pub runs: Vec<RunOutput>,
}
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
pub struct BranchCreateRequest {
/// Parent branch to fork from. Defaults to `main`.
@ -368,6 +342,17 @@ pub enum ErrorCode {
Internal,
}
/// Structured details for a publisher-level OCC failure. Surfaces alongside
/// HTTP 409 when a write was rejected because the caller's pre-write view of
/// one table's manifest version was stale relative to the current head. The
/// expected/actual fields tell the client which table to refresh.
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
pub struct ManifestConflictOutput {
pub table_key: String,
pub expected: u64,
pub actual: u64,
}
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
pub struct ErrorOutput {
pub error: String,
@ -375,6 +360,12 @@ pub struct ErrorOutput {
pub code: Option<ErrorCode>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub merge_conflicts: Vec<MergeConflictOutput>,
/// Set when the conflict is a publisher CAS rejection
/// (`ManifestConflictDetails::ExpectedVersionMismatch`). The caller's
/// pre-write view of `table_key` was at version `expected` but the
/// manifest is now at `actual`. Refresh and retry.
#[serde(skip_serializing_if = "Option::is_none")]
pub manifest_conflict: Option<ManifestConflictOutput>,
}
pub fn snapshot_payload(branch: &str, snapshot: &Snapshot) -> SnapshotOutput {
@ -408,22 +399,6 @@ pub fn schema_apply_output(uri: &str, result: SchemaApplyResult) -> SchemaApplyO
}
}
pub fn run_output(run: &RunRecord) -> RunOutput {
RunOutput {
run_id: run.run_id.as_str().to_string(),
target_branch: run.target_branch.clone(),
run_branch: run.run_branch.clone(),
base_snapshot_id: run.base_snapshot_id.as_str().to_string(),
base_manifest_version: run.base_manifest_version,
operation_hash: run.operation_hash.clone(),
actor_id: run.actor_id.clone(),
status: run.status.as_str().to_string(),
published_snapshot_id: run.published_snapshot_id.clone(),
created_at: run.created_at,
updated_at: run.updated_at,
}
}
pub fn commit_output(commit: &GraphCommit) -> CommitOutput {
CommitOutput {
graph_commit_id: commit.graph_commit_id.clone(),

View file

@ -14,8 +14,8 @@ use api::{
BranchCreateOutput, BranchCreateRequest, BranchDeleteOutput, BranchListOutput,
BranchMergeOutput, BranchMergeRequest, ChangeOutput, ChangeRequest, CommitListOutput,
CommitListQuery, ErrorCode, ErrorOutput, ExportRequest, HealthOutput, IngestOutput,
IngestRequest, ReadOutput, ReadRequest, RunListOutput, SchemaApplyOutput, SchemaApplyRequest,
SchemaOutput, SnapshotQuery, ingest_output, schema_apply_output, snapshot_payload,
IngestRequest, ReadOutput, ReadRequest, SchemaApplyOutput, SchemaApplyRequest, SchemaOutput,
SnapshotQuery, ingest_output, schema_apply_output, snapshot_payload,
};
use axum::body::{Body, Bytes};
use axum::extract::DefaultBodyLimit;
@ -33,8 +33,8 @@ pub use config::{
load_config,
};
use futures::stream;
use omnigraph::db::{Omnigraph, ReadTarget, RunId};
use omnigraph::error::{ManifestErrorKind, OmniError};
use omnigraph::db::{Omnigraph, ReadTarget};
use omnigraph::error::{ManifestConflictDetails, ManifestErrorKind, OmniError};
use omnigraph_compiler::json_params_to_param_map;
use omnigraph_compiler::query::parser::parse_query;
use omnigraph_compiler::{JsonParamMode, ParamMap};
@ -82,10 +82,6 @@ fn hash_bearer_token(token: &str) -> BearerTokenHash {
server_branch_create,
server_branch_delete,
server_branch_merge,
server_run_list,
server_run_show,
server_run_publish,
server_run_abort,
server_commit_list,
server_commit_show,
),
@ -159,6 +155,7 @@ pub struct ApiError {
code: ErrorCode,
message: String,
merge_conflicts: Vec<api::MergeConflictOutput>,
manifest_conflict: Option<api::ManifestConflictOutput>,
}
impl AppState {
@ -280,6 +277,7 @@ impl ApiError {
code: ErrorCode::Unauthorized,
message: message.into(),
merge_conflicts: Vec::new(),
manifest_conflict: None,
}
}
@ -289,6 +287,7 @@ impl ApiError {
code: ErrorCode::Forbidden,
message: message.into(),
merge_conflicts: Vec::new(),
manifest_conflict: None,
}
}
@ -298,6 +297,7 @@ impl ApiError {
code: ErrorCode::BadRequest,
message: message.into(),
merge_conflicts: Vec::new(),
manifest_conflict: None,
}
}
@ -307,6 +307,7 @@ impl ApiError {
code: ErrorCode::NotFound,
message: message.into(),
merge_conflicts: Vec::new(),
manifest_conflict: None,
}
}
@ -316,6 +317,7 @@ impl ApiError {
code: ErrorCode::Conflict,
message: message.into(),
merge_conflicts: Vec::new(),
manifest_conflict: None,
}
}
@ -325,6 +327,7 @@ impl ApiError {
code: ErrorCode::Internal,
message: message.into(),
merge_conflicts: Vec::new(),
manifest_conflict: None,
}
}
@ -334,6 +337,20 @@ impl ApiError {
code: ErrorCode::Conflict,
message: summarize_merge_conflicts(&conflicts),
merge_conflicts: conflicts,
manifest_conflict: None,
}
}
fn manifest_version_conflict(
message: String,
details: api::ManifestConflictOutput,
) -> Self {
Self {
status: StatusCode::CONFLICT,
code: ErrorCode::Conflict,
message,
merge_conflicts: Vec::new(),
manifest_conflict: Some(details),
}
}
@ -344,7 +361,21 @@ impl ApiError {
OmniError::Manifest(err) => match err.kind {
ManifestErrorKind::BadRequest => Self::bad_request(err.message),
ManifestErrorKind::NotFound => Self::not_found(err.message),
ManifestErrorKind::Conflict => Self::conflict(err.message),
ManifestErrorKind::Conflict => match err.details {
Some(ManifestConflictDetails::ExpectedVersionMismatch {
table_key,
expected,
actual,
}) => Self::manifest_version_conflict(
err.message,
api::ManifestConflictOutput {
table_key,
expected,
actual,
},
),
_ => Self::conflict(err.message),
},
ManifestErrorKind::Internal => Self::internal(err.message),
},
OmniError::MergeConflicts(conflicts) => Self::merge_conflict(
@ -395,6 +426,7 @@ impl IntoResponse for ApiError {
error: self.message,
code: Some(self.code),
merge_conflicts: self.merge_conflicts,
manifest_conflict: self.manifest_conflict,
}),
)
.into_response()
@ -443,10 +475,6 @@ pub fn build_app(state: AppState) -> Router {
)
.route("/branches/{branch}", delete(server_branch_delete))
.route("/branches/merge", post(server_branch_merge))
.route("/runs", get(server_run_list))
.route("/runs/{run_id}", get(server_run_show))
.route("/runs/{run_id}/publish", post(server_run_publish))
.route("/runs/{run_id}/abort", post(server_run_abort))
.route("/commits", get(server_commit_list))
.route("/commits/{commit_id}", get(server_commit_show))
.route_layer(middleware::from_fn_with_state(
@ -1219,203 +1247,6 @@ async fn server_branch_merge(
}))
}
#[utoipa::path(
get,
path = "/runs",
tag = "runs",
operation_id = "listRuns",
responses(
(status = 200, description = "List of runs", body = RunListOutput),
(status = 401, description = "Unauthorized", body = ErrorOutput),
(status = 403, description = "Forbidden", body = ErrorOutput),
),
security(("bearer_token" = [])),
)]
/// List all runs.
///
/// A run is an ephemeral branch produced by an agent or background job. The
/// list includes pending, in-progress, published, and aborted runs across
/// all target branches. Read-only.
async fn server_run_list(
State(state): State<AppState>,
actor: Option<Extension<AuthenticatedActor>>,
) -> std::result::Result<Json<RunListOutput>, ApiError> {
authorize_request(
&state,
actor.as_ref().map(|Extension(actor)| actor),
PolicyRequest {
actor_id: actor
.as_ref()
.map(|Extension(actor)| actor.as_str().to_string())
.unwrap_or_default(),
action: PolicyAction::Read,
branch: None,
target_branch: None,
},
)?;
let runs = {
let db = Arc::clone(&state.db).read_owned().await;
db.list_runs().await.map_err(ApiError::from_omni)?
};
Ok(Json(RunListOutput {
runs: runs.iter().map(api::run_output).collect(),
}))
}
#[utoipa::path(
get,
path = "/runs/{run_id}",
tag = "runs",
operation_id = "getRun",
params(
("run_id" = String, Path, description = "Run identifier"),
),
responses(
(status = 200, description = "Run details", body = api::RunOutput),
(status = 401, description = "Unauthorized", body = ErrorOutput),
(status = 403, description = "Forbidden", body = ErrorOutput),
(status = 404, description = "Run not found", body = ErrorOutput),
),
security(("bearer_token" = [])),
)]
/// Get a single run.
///
/// Returns the run's status, target/run branches, base snapshot, and (if
/// published) the resulting snapshot id. Read-only.
async fn server_run_show(
State(state): State<AppState>,
actor: Option<Extension<AuthenticatedActor>>,
Path(run_id): Path<String>,
) -> std::result::Result<Json<api::RunOutput>, ApiError> {
authorize_request(
&state,
actor.as_ref().map(|Extension(actor)| actor),
PolicyRequest {
actor_id: actor
.as_ref()
.map(|Extension(actor)| actor.as_str().to_string())
.unwrap_or_default(),
action: PolicyAction::Read,
branch: None,
target_branch: None,
},
)?;
let run = {
let db = Arc::clone(&state.db).read_owned().await;
db.get_run(&RunId::new(run_id))
.await
.map_err(ApiError::from_omni)?
};
Ok(Json(api::run_output(&run)))
}
#[utoipa::path(
post,
path = "/runs/{run_id}/publish",
tag = "runs",
operation_id = "publishRun",
params(
("run_id" = String, Path, description = "Run identifier"),
),
responses(
(status = 200, description = "Run published", body = api::RunOutput),
(status = 401, description = "Unauthorized", body = ErrorOutput),
(status = 403, description = "Forbidden", body = ErrorOutput),
(status = 404, description = "Run not found", body = ErrorOutput),
),
security(("bearer_token" = [])),
)]
/// Publish a run to its target branch.
///
/// Promotes the run's snapshot onto its `target_branch` as a new commit. The
/// run must be in a publishable state. **Destructive** to the target branch.
async fn server_run_publish(
State(state): State<AppState>,
actor: Option<Extension<AuthenticatedActor>>,
Path(run_id): Path<String>,
) -> std::result::Result<Json<api::RunOutput>, ApiError> {
let run_id = RunId::new(run_id);
let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
let target_branch = {
let db = Arc::clone(&state.db).read_owned().await;
db.get_run(&run_id)
.await
.map_err(ApiError::from_omni)?
.target_branch
};
authorize_request(
&state,
actor.as_ref().map(|Extension(actor)| actor),
PolicyRequest {
actor_id: actor_id.map(str::to_string).unwrap_or_default(),
action: PolicyAction::RunPublish,
branch: None,
target_branch: Some(target_branch),
},
)?;
let run = {
let mut db = Arc::clone(&state.db).write_owned().await;
db.publish_run_as(&run_id, actor_id)
.await
.map_err(ApiError::from_omni)?;
db.get_run(&run_id).await.map_err(ApiError::from_omni)?
};
Ok(Json(api::run_output(&run)))
}
#[utoipa::path(
post,
path = "/runs/{run_id}/abort",
tag = "runs",
operation_id = "abortRun",
params(
("run_id" = String, Path, description = "Run identifier"),
),
responses(
(status = 200, description = "Run aborted", body = api::RunOutput),
(status = 401, description = "Unauthorized", body = ErrorOutput),
(status = 403, description = "Forbidden", body = ErrorOutput),
(status = 404, description = "Run not found", body = ErrorOutput),
),
security(("bearer_token" = [])),
)]
/// Abort a run.
///
/// Marks the run as aborted and releases its working branch. **Irreversible**:
/// the run cannot be resumed once aborted.
async fn server_run_abort(
State(state): State<AppState>,
actor: Option<Extension<AuthenticatedActor>>,
Path(run_id): Path<String>,
) -> std::result::Result<Json<api::RunOutput>, ApiError> {
let run_id = RunId::new(run_id);
let target_branch = {
let db = Arc::clone(&state.db).read_owned().await;
db.get_run(&run_id)
.await
.map_err(ApiError::from_omni)?
.target_branch
};
authorize_request(
&state,
actor.as_ref().map(|Extension(actor)| actor),
PolicyRequest {
actor_id: actor
.as_ref()
.map(|Extension(actor)| actor.as_str().to_string())
.unwrap_or_default(),
action: PolicyAction::RunAbort,
branch: None,
target_branch: Some(target_branch),
},
)?;
let run = {
let mut db = Arc::clone(&state.db).write_owned().await;
db.abort_run(&run_id).await.map_err(ApiError::from_omni)?
};
Ok(Json(api::run_output(&run)))
}
#[utoipa::path(
get,
path = "/commits",

View file

@ -23,8 +23,6 @@ pub enum PolicyAction {
BranchCreate,
BranchDelete,
BranchMerge,
RunPublish,
RunAbort,
Admin,
}
@ -38,8 +36,6 @@ impl PolicyAction {
Self::BranchCreate => "branch_create",
Self::BranchDelete => "branch_delete",
Self::BranchMerge => "branch_merge",
Self::RunPublish => "run_publish",
Self::RunAbort => "run_abort",
Self::Admin => "admin",
}
}
@ -51,12 +47,7 @@ impl PolicyAction {
fn uses_target_branch_scope(self) -> bool {
matches!(
self,
Self::BranchCreate
| Self::SchemaApply
| Self::BranchDelete
| Self::BranchMerge
| Self::RunPublish
| Self::RunAbort
Self::BranchCreate | Self::SchemaApply | Self::BranchDelete | Self::BranchMerge
)
}
}
@ -79,8 +70,6 @@ impl FromStr for PolicyAction {
"branch_create" => Ok(Self::BranchCreate),
"branch_delete" => Ok(Self::BranchDelete),
"branch_merge" => Ok(Self::BranchMerge),
"run_publish" => Ok(Self::RunPublish),
"run_abort" => Ok(Self::RunAbort),
"admin" => Ok(Self::Admin),
other => bail!("unknown policy action '{other}'"),
}
@ -599,8 +588,6 @@ namespace Omnigraph {
action "branch_create" appliesTo { principal: Actor, resource: Repo, context: RequestContext };
action "branch_delete" appliesTo { principal: Actor, resource: Repo, context: RequestContext };
action "branch_merge" appliesTo { principal: Actor, resource: Repo, context: RequestContext };
action "run_publish" appliesTo { principal: Actor, resource: Repo, context: RequestContext };
action "run_abort" appliesTo { principal: Actor, resource: Repo, context: RequestContext };
action "admin" appliesTo { principal: Actor, resource: Repo, context: RequestContext };
}
"#
@ -732,7 +719,7 @@ rules:
- id: admins-promote
allow:
actors: { group: admins }
actions: [branch_delete, branch_merge, run_publish]
actions: [branch_delete, branch_merge]
target_branch_scope: protected
"#,
)