Merge main into blob merge fix

Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
2026-06-15 01:55:13 +02:00 · 2026-05-10 21:55:02 +00:00 · 2026-05-10 21:55:02 +00:00 · da89e18e62
commit da89e18e62
parent 01660faa26 19e9292ec0
38 changed files with 4380 additions and 534 deletions
--- a/crates/omnigraph-cli/Cargo.toml
+++ b/crates/omnigraph-cli/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "omnigraph-cli"
-version = "0.4.1"
+version = "0.4.2"
 edition = "2024"
 description = "CLI for the Omnigraph graph database."
 license = "MIT"
@ -13,9 +13,9 @@ name = "omnigraph"
 path = "src/main.rs"

 [dependencies]
-omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.4.1" }
-omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.1" }
-omnigraph-server = { path = "../omnigraph-server", version = "0.4.1" }
+omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.4.2" }
+omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.2" }
+omnigraph-server = { path = "../omnigraph-server", version = "0.4.2" }
 clap = { workspace = true }
 color-eyre = { workspace = true }
 serde = { workspace = true }
--- a/crates/omnigraph-cli/src/main.rs
+++ b/crates/omnigraph-cli/src/main.rs
@ -1425,7 +1425,7 @@ async fn execute_query_lint(
    let uri = resolve_local_uri(config, cli_uri, cli_target, "query lint")?;
    let db = Omnigraph::open(&uri).await?;
    Ok(lint_query_file(
-        db.catalog(),
+        &db.catalog(),
        &query_source,
        query_path,
        QueryLintSchemaSource::repo(uri),
--- a/crates/omnigraph-compiler/Cargo.toml
+++ b/crates/omnigraph-compiler/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "omnigraph-compiler"
-version = "0.4.1"
+version = "0.4.2"
 edition = "2024"
 description = "Schema/query compiler for Omnigraph. Zero Lance dependency."
 license = "MIT"
--- a/crates/omnigraph-server/Cargo.toml
+++ b/crates/omnigraph-server/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "omnigraph-server"
-version = "0.4.1"
+version = "0.4.2"
 edition = "2024"
 description = "HTTP server for the Omnigraph graph database."
 license = "MIT"
@ -19,8 +19,8 @@ default = []
 aws = ["dep:aws-config", "dep:aws-sdk-secretsmanager"]

 [dependencies]
-omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.4.1" }
-omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.1" }
+omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.4.2" }
+omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.2" }
 axum = { workspace = true }
 clap = { workspace = true }
 color-eyre = { workspace = true }
@ -37,6 +37,7 @@ futures = { workspace = true }
 sha2 = { workspace = true }
 subtle = { workspace = true }
 async-trait = { workspace = true }
+dashmap = "6"
 aws-config = { version = "1", optional = true, default-features = false, features = ["rustls", "rt-tokio", "credentials-process", "sso"] }
 aws-sdk-secretsmanager = { version = "1", optional = true, default-features = false, features = ["rustls", "rt-tokio"] }

--- a/crates/omnigraph-server/examples/bench_actor_isolation.rs
+++ b/crates/omnigraph-server/examples/bench_actor_isolation.rs
@ -0,0 +1,392 @@
+//! Actor-isolation benchmark for MR-686's `WorkloadController`.
+//!
+//! The handoff calls this out as the empirical proof of MR-686's central
+//! design promise: per-actor admission control isolates noisy actors so a
+//! heavy `/ingest` user does not starve light `/change` traffic. The
+//! per-`(table, branch)` queue pins the same-key serialization story; this
+//! bench pins actor isolation under load.
+//!
+//! Setup:
+//! - One "heavy" actor flooding `/ingest` with multi-row NDJSON bodies.
+//! - N "light" actors each running short bursts of `/change` inserts.
+//! - Each actor authenticates with its own bearer token so the
+//!   `WorkloadController` accounts them as distinct identities.
+//!
+//! Output: heavy-actor throughput / 429s, light-actor p50 / p95 / p99
+//! latency. Acceptance heuristic on local FS: light-actor p99 < 2 s
+//! while the heavy actor saturates its own per-actor cap.
+//!
+//! Usage:
+//! ```sh
+//! cargo run --release -p omnigraph-server --example bench_actor_isolation -- \
+//!     --light-actors 4 --light-ops-per-actor 50 \
+//!     --heavy-batches 200 --heavy-rows-per-batch 200 \
+//!     --inflight-cap 1 \
+//!     --output bench-results/after-pr2-phase2/actor-isolation.json
+//! ```
+
+use std::path::PathBuf;
+use std::time::{Duration, Instant};
+
+use axum::Router;
+use axum::body::{Body, to_bytes};
+use axum::http::{Method, Request, StatusCode};
+use clap::Parser;
+use omnigraph::db::Omnigraph;
+use omnigraph_server::api::{ChangeRequest, IngestRequest};
+use omnigraph_server::workload::WorkloadController;
+use omnigraph_server::{AppState, build_app};
+use serde::Serialize;
+use tower::ServiceExt;
+
+const SCHEMA: &str = "node Person {\n    name: String @key\n    age: I32?\n}\n";
+
+const HEAVY_TOKEN: &str = "heavy-actor-token";
+const HEAVY_ACTOR: &str = "act-heavy";
+
+#[derive(Parser, Debug)]
+#[command(about = "Actor-isolation HTTP bench for MR-686 WorkloadController")]
+struct Args {
+    /// Number of light actors driving /change traffic concurrently with the
+    /// heavy /ingest flood. Each gets its own bearer token.
+    #[arg(long, default_value_t = 4)]
+    light_actors: usize,
+    /// Number of /change ops per light actor.
+    #[arg(long, default_value_t = 50)]
+    light_ops_per_actor: usize,
+    /// Number of /ingest batches the heavy actor sends.
+    #[arg(long, default_value_t = 200)]
+    heavy_batches: usize,
+    /// NDJSON rows per heavy /ingest batch.
+    #[arg(long, default_value_t = 200)]
+    heavy_rows_per_batch: usize,
+    /// Concurrent in-flight /ingest tasks the heavy actor maintains. With
+    /// `inflight_cap` smaller than this, the heavy actor exercises its own
+    /// admission cap (and the bench reports `heavy_too_many_requests > 0`),
+    /// proving the gate fires without affecting light actors. Default 4
+    /// against cap=1 → expect ~3/4 batches rejected.
+    #[arg(long, default_value_t = 4)]
+    heavy_concurrency: usize,
+    /// Per-actor in-flight cap for the run. Passed directly into the
+    /// `WorkloadController` constructor (no env-var fiddling). Lower
+    /// values surface admission rejections faster.
+    #[arg(long, default_value_t = 1)]
+    inflight_cap: u32,
+    /// Per-actor byte budget (bytes). Default 1 GiB so byte budget
+    /// doesn't bottleneck the count gate during normal bench runs.
+    #[arg(long, default_value_t = 1_073_741_824)]
+    byte_cap: u64,
+    /// Output file for the JSON results. Stdout always gets a copy.
+    #[arg(long)]
+    output: Option<PathBuf>,
+    /// Optional label to record alongside results.
+    #[arg(long, default_value = "")]
+    label: String,
+}
+
+#[derive(Serialize, Debug)]
+struct BenchResults {
+    label: String,
+    inflight_cap: u32,
+    light_actors: usize,
+    light_ops_per_actor: usize,
+    heavy_batches: usize,
+    heavy_rows_per_batch: usize,
+    wall_time_ms: u64,
+    heavy_ok: usize,
+    heavy_too_many_requests: usize,
+    heavy_other_errors: usize,
+    heavy_throughput_attempts_per_sec: f64,
+    light_ok: usize,
+    light_too_many_requests: usize,
+    light_other_errors: usize,
+    light_p50_ms: f64,
+    light_p95_ms: f64,
+    light_p99_ms: f64,
+    light_p999_ms: f64,
+    light_max_ms: f64,
+    notes: &'static str,
+}
+
+fn build_heavy_body(batch_idx: usize, rows: usize) -> String {
+    let mut data = String::new();
+    for r in 0..rows {
+        data.push_str(&format!(
+            "{{\"type\":\"Person\",\"data\":{{\"name\":\"heavy-b{}-r{}\",\"age\":{}}}}}\n",
+            batch_idx,
+            r,
+            r % 100,
+        ));
+    }
+    serde_json::to_string(&IngestRequest {
+        data,
+        branch: Some("main".to_string()),
+        from: Some("main".to_string()),
+        mode: Some(omnigraph::loader::LoadMode::Merge),
+    })
+    .unwrap()
+}
+
+async fn send_heavy_batch(app: Router, batch_idx: usize, rows: usize) -> StatusCode {
+    let body = build_heavy_body(batch_idx, rows);
+    let req = Request::builder()
+        .method(Method::POST)
+        .uri("/ingest")
+        .header("authorization", format!("Bearer {HEAVY_TOKEN}"))
+        .header("content-type", "application/json")
+        .body(Body::from(body))
+        .unwrap();
+    match app.oneshot(req).await {
+        Ok(r) => r.status(),
+        Err(_) => StatusCode::INTERNAL_SERVER_ERROR,
+    }
+}
+
+/// Drive `batches` /ingest calls from the heavy actor with up to
+/// `concurrency` in flight at a time. With `concurrency > inflight_cap`,
+/// the heavy actor's own admission permits are exhausted at peak, and
+/// some batches return 429. Returns (ok, 429, other) counts.
+async fn drive_heavy_actor(
+    app: Router,
+    batches: usize,
+    rows_per_batch: usize,
+    concurrency: usize,
+) -> (usize, usize, usize) {
+    use tokio::sync::Semaphore;
+
+    // Asserted at startup in `main()`; check again here for defense in
+    // depth so a future caller can't pass 0 silently.
+    assert!(concurrency > 0, "drive_heavy_actor concurrency must be > 0");
+    let limiter = Arc::new(Semaphore::new(concurrency));
+    let mut handles = Vec::with_capacity(batches);
+    for b in 0..batches {
+        let app = app.clone();
+        let limiter = Arc::clone(&limiter);
+        handles.push(tokio::spawn(async move {
+            // Bound concurrency to `concurrency`; this is the bench's
+            // own pacing, not the server's admission control. The
+            // server's `WorkloadController` is what we're trying to
+            // exercise — and it has its own cap (potentially smaller).
+            let _permit = limiter.acquire_owned().await.unwrap();
+            send_heavy_batch(app, b, rows_per_batch).await
+        }));
+    }
+
+    let mut ok = 0usize;
+    let mut too_many = 0usize;
+    let mut other = 0usize;
+    for h in handles {
+        match h.await.unwrap_or(StatusCode::INTERNAL_SERVER_ERROR) {
+            StatusCode::OK => ok += 1,
+            StatusCode::TOO_MANY_REQUESTS => too_many += 1,
+            _ => other += 1,
+        }
+    }
+    (ok, too_many, other)
+}
+
+use std::sync::Arc;
+
+async fn drive_light_actor(
+    app: Router,
+    token: String,
+    actor_idx: usize,
+    ops: usize,
+) -> (Vec<Duration>, usize, usize, usize) {
+    let mut latencies = Vec::with_capacity(ops);
+    let mut ok = 0usize;
+    let mut too_many = 0usize;
+    let mut other = 0usize;
+    for op_idx in 0..ops {
+        let request_body = ChangeRequest {
+            query_source: "query insert_person($name: String, $age: I32) {\n    insert Person { name: $name, age: $age }\n}".to_string(),
+            query_name: Some("insert_person".to_string()),
+            params: Some(serde_json::json!({
+                "name": format!("light-{actor_idx}-{op_idx}"),
+                "age": op_idx as i32,
+            })),
+            branch: Some("main".to_string()),
+        };
+        let body = serde_json::to_vec(&request_body).unwrap();
+        let req = Request::builder()
+            .method(Method::POST)
+            .uri("/change")
+            .header("authorization", format!("Bearer {token}"))
+            .header("content-type", "application/json")
+            .body(Body::from(body))
+            .unwrap();
+        let start = Instant::now();
+        let response = match app.clone().oneshot(req).await {
+            Ok(r) => r,
+            Err(_) => {
+                other += 1;
+                continue;
+            }
+        };
+        let elapsed = start.elapsed();
+        match response.status() {
+            StatusCode::OK => {
+                ok += 1;
+                latencies.push(elapsed);
+            }
+            StatusCode::TOO_MANY_REQUESTS => {
+                too_many += 1;
+                // Drain to free the body resource.
+                let _ = to_bytes(response.into_body(), 16 * 1024).await;
+            }
+            _ => {
+                other += 1;
+                let _ = to_bytes(response.into_body(), 16 * 1024).await;
+            }
+        }
+    }
+    (latencies, ok, too_many, other)
+}
+
+#[tokio::main]
+async fn main() {
+    let args = Args::parse();
+    if args.light_actors == 0 || args.light_ops_per_actor == 0 || args.heavy_batches == 0 {
+        eprintln!("--light-actors, --light-ops-per-actor, --heavy-batches must all be > 0");
+        std::process::exit(2);
+    }
+    if args.heavy_concurrency == 0 {
+        eprintln!(
+            "--heavy-concurrency must be > 0 (zero would prevent the heavy actor from \
+             ever firing a batch; if you want to disable heavy traffic, set --heavy-batches=0)"
+        );
+        std::process::exit(2);
+    }
+
+    let temp = tempfile::tempdir().expect("tempdir");
+    let repo = temp.path().join("bench.omni");
+    Omnigraph::init(repo.to_str().unwrap(), SCHEMA)
+        .await
+        .expect("init repo");
+
+    // Build bearer tokens: one for the heavy actor + one per light actor.
+    let mut tokens: Vec<(String, String)> =
+        vec![(HEAVY_ACTOR.to_string(), HEAVY_TOKEN.to_string())];
+    for i in 0..args.light_actors {
+        tokens.push((format!("act-light-{i}"), format!("light-token-{i}")));
+    }
+    let db = Omnigraph::open(repo.to_str().unwrap())
+        .await
+        .expect("open repo");
+    // Construct a custom WorkloadController with the requested caps and
+    // pass it through `AppState::new_with_workload`. Avoids the
+    // `unsafe { std::env::set_var(...) }` antipattern that violates
+    // `setenv`'s thread-safety precondition once the multi-thread tokio
+    // runtime is up.
+    let workload = WorkloadController::new(args.inflight_cap, args.byte_cap);
+    let state = AppState::new_with_workload(
+        repo.to_string_lossy().to_string(),
+        db,
+        tokens,
+        workload,
+    );
+    let app = build_app(state);
+
+    eprintln!(
+        "running heavy={}x{} (concurrency={}) light={}x{} cap={}",
+        args.heavy_batches,
+        args.heavy_rows_per_batch,
+        args.heavy_concurrency,
+        args.light_actors,
+        args.light_ops_per_actor,
+        args.inflight_cap,
+    );
+
+    let start = Instant::now();
+    let heavy_app = app.clone();
+    let heavy_concurrency = args.heavy_concurrency;
+    let heavy_handle = tokio::spawn(async move {
+        drive_heavy_actor(
+            heavy_app,
+            args.heavy_batches,
+            args.heavy_rows_per_batch,
+            heavy_concurrency,
+        )
+        .await
+    });
+
+    let mut light_handles = Vec::with_capacity(args.light_actors);
+    for actor_idx in 0..args.light_actors {
+        let app = app.clone();
+        let token = format!("light-token-{actor_idx}");
+        let ops = args.light_ops_per_actor;
+        light_handles.push(tokio::spawn(async move {
+            drive_light_actor(app, token, actor_idx, ops).await
+        }));
+    }
+
+    let (heavy_ok, heavy_too_many, heavy_other) = heavy_handle.await.expect("heavy task panicked");
+    let mut light_latencies: Vec<Duration> =
+        Vec::with_capacity(args.light_actors * args.light_ops_per_actor);
+    let mut light_ok = 0usize;
+    let mut light_too_many = 0usize;
+    let mut light_other = 0usize;
+    for h in light_handles {
+        let (lats, ok, too_many, other) = h.await.expect("light task panicked");
+        light_latencies.extend(lats);
+        light_ok += ok;
+        light_too_many += too_many;
+        light_other += other;
+    }
+    let wall = start.elapsed();
+
+    light_latencies.sort();
+    let n = light_latencies.len();
+    let pct = |p: f64| -> f64 {
+        if n == 0 {
+            return 0.0;
+        }
+        let idx = ((n as f64 - 1.0) * p).round() as usize;
+        light_latencies[idx].as_secs_f64() * 1000.0
+    };
+    let max_ms = light_latencies
+        .last()
+        .map(|d| d.as_secs_f64() * 1000.0)
+        .unwrap_or(0.0);
+    let heavy_throughput = if wall.as_secs_f64() > 0.0 {
+        args.heavy_batches as f64 / wall.as_secs_f64()
+    } else {
+        0.0
+    };
+
+    let results = BenchResults {
+        label: args.label.clone(),
+        inflight_cap: args.inflight_cap,
+        light_actors: args.light_actors,
+        light_ops_per_actor: args.light_ops_per_actor,
+        heavy_batches: args.heavy_batches,
+        heavy_rows_per_batch: args.heavy_rows_per_batch,
+        wall_time_ms: wall.as_millis() as u64,
+        heavy_ok,
+        heavy_too_many_requests: heavy_too_many,
+        heavy_other_errors: heavy_other,
+        heavy_throughput_attempts_per_sec: heavy_throughput,
+        light_ok,
+        light_too_many_requests: light_too_many,
+        light_other_errors: light_other,
+        light_p50_ms: pct(0.50),
+        light_p95_ms: pct(0.95),
+        light_p99_ms: pct(0.99),
+        light_p999_ms: pct(0.999),
+        light_max_ms: max_ms,
+        notes: "MR-686 actor-isolation bench. Heavy /ingest + light /change concurrent.",
+    };
+
+    let json = serde_json::to_string_pretty(&results).unwrap();
+    println!("{json}");
+
+    if let Some(path) = args.output.as_ref() {
+        if let Some(parent) = path.parent()
+            && !parent.as_os_str().is_empty()
+        {
+            std::fs::create_dir_all(parent).expect("mkdir output parent");
+        }
+        std::fs::write(path, &json).expect("write output");
+        eprintln!("wrote {}", path.display());
+    }
+}
--- a/crates/omnigraph-server/examples/bench_concurrent_http.rs
+++ b/crates/omnigraph-server/examples/bench_concurrent_http.rs
@ -0,0 +1,267 @@
+//! Server-level concurrent HTTP benchmark for MR-686 (PR 0 baseline).
+//!
+//! Drives concurrent `/change` requests against an in-process Omnigraph HTTP
+//! server. Measures the global `Arc<RwLock<Omnigraph>>` lock penalty on
+//! current `main` so PR 1 + PR 2 can be evaluated against a real baseline.
+//!
+//! Per the MR-686 plan: this is the load-bearing bench. `Omnigraph::mutate_as`
+//! is `&mut self`, so an engine-level concurrent bench either serializes on the
+//! borrow checker (measures nothing) or drives multiple handles (measures Lance
+//! contention, not the server bottleneck). Driving the HTTP server is the only
+//! way to measure the actual `RwLock<Omnigraph>` contention this work removes.
+//!
+//! Usage:
+//! ```sh
+//! cargo run --release -p omnigraph-server --example bench_concurrent_http -- \
+//!     --tables 16 --actors 16 --ops-per-actor 1000 --mode disjoint \
+//!     --output bench-results/baseline-main/cross-table.json
+//! ```
+//!
+//! Modes:
+//! - `disjoint`: each actor writes to a distinct node type (cross-table fanout)
+//! - `same-key`: all actors write to the same node type (hot-key contention)
+//! - `mixed`: each actor writes to a different table per op (round-robin)
+
+use std::path::PathBuf;
+use std::time::{Duration, Instant};
+
+use axum::Router;
+use axum::body::{Body, to_bytes};
+use axum::http::{Method, Request, StatusCode};
+use clap::{Parser, ValueEnum};
+use omnigraph::db::Omnigraph;
+use omnigraph_server::api::ChangeRequest;
+use omnigraph_server::{AppState, build_app};
+use serde::Serialize;
+use tower::ServiceExt;
+
+#[derive(Parser, Debug)]
+#[command(about = "Concurrent HTTP bench for MR-686")]
+struct Args {
+    /// Number of distinct node types in the schema.
+    #[arg(long, default_value_t = 16)]
+    tables: usize,
+    /// Number of concurrent actors driving requests.
+    #[arg(long, default_value_t = 16)]
+    actors: usize,
+    /// Operations per actor.
+    #[arg(long, default_value_t = 100)]
+    ops_per_actor: usize,
+    /// Workload mode.
+    #[arg(long, value_enum, default_value_t = Mode::Disjoint)]
+    mode: Mode,
+    /// Output file for the JSON results. Stdout always gets a copy.
+    #[arg(long)]
+    output: Option<PathBuf>,
+    /// Optional label to record alongside results (e.g. "baseline-main").
+    #[arg(long, default_value = "")]
+    label: String,
+}
+
+#[derive(Clone, Copy, Debug, ValueEnum, Serialize)]
+#[serde(rename_all = "kebab-case")]
+enum Mode {
+    Disjoint,
+    SameKey,
+    Mixed,
+}
+
+#[derive(Serialize, Debug)]
+struct BenchResults {
+    label: String,
+    mode: Mode,
+    tables: usize,
+    actors: usize,
+    ops_per_actor: usize,
+    total_ops: usize,
+    error_count: usize,
+    wall_time_ms: u64,
+    throughput_ops_per_sec: f64,
+    p50_ms: f64,
+    p95_ms: f64,
+    p99_ms: f64,
+    p999_ms: f64,
+    max_ms: f64,
+    notes: &'static str,
+}
+
+fn build_schema(num_tables: usize) -> String {
+    let mut schema = String::new();
+    for i in 0..num_tables {
+        schema.push_str(&format!(
+            "node Item{i} {{\n    name: String @key\n    value: I32?\n}}\n\n"
+        ));
+    }
+    schema
+}
+
+fn build_query_source(table_idx: usize) -> String {
+    format!(
+        "query insert_item($name: String, $value: I32) {{\n    insert Item{table_idx} {{ name: $name, value: $value }}\n}}"
+    )
+}
+
+fn pick_table(actor_idx: usize, op_idx: usize, mode: Mode, num_tables: usize) -> usize {
+    match mode {
+        Mode::Disjoint => actor_idx % num_tables,
+        Mode::SameKey => 0,
+        Mode::Mixed => (actor_idx.wrapping_mul(7919) ^ op_idx) % num_tables,
+    }
+}
+
+async fn drive_actor(
+    app: Router,
+    actor_idx: usize,
+    ops: usize,
+    mode: Mode,
+    num_tables: usize,
+) -> (Vec<Duration>, usize) {
+    let mut latencies = Vec::with_capacity(ops);
+    let mut errors = 0usize;
+    for op_idx in 0..ops {
+        let table_idx = pick_table(actor_idx, op_idx, mode, num_tables);
+        let request_body = ChangeRequest {
+            query_source: build_query_source(table_idx),
+            query_name: Some("insert_item".to_string()),
+            params: Some(serde_json::json!({
+                "name": format!("a{actor_idx}_o{op_idx}"),
+                "value": op_idx as i32,
+            })),
+            branch: None,
+        };
+        let body = serde_json::to_vec(&request_body).unwrap();
+        let req = Request::builder()
+            .method(Method::POST)
+            .uri("/change")
+            .header("content-type", "application/json")
+            .body(Body::from(body))
+            .unwrap();
+
+        let start = Instant::now();
+        let response = match app.clone().oneshot(req).await {
+            Ok(r) => r,
+            Err(e) => {
+                eprintln!("actor {actor_idx} op {op_idx} transport error: {e:?}");
+                errors += 1;
+                continue;
+            }
+        };
+        let elapsed = start.elapsed();
+        let status = response.status();
+        if status != StatusCode::OK {
+            errors += 1;
+            // Drain body for logging on the first few failures.
+            if errors <= 3 {
+                let body = to_bytes(response.into_body(), 64 * 1024).await.unwrap_or_default();
+                eprintln!(
+                    "actor {actor_idx} op {op_idx} status {status} body {}",
+                    String::from_utf8_lossy(&body)
+                );
+            }
+        }
+        latencies.push(elapsed);
+    }
+    (latencies, errors)
+}
+
+#[tokio::main]
+async fn main() {
+    let args = Args::parse();
+    if args.tables == 0 || args.actors == 0 || args.ops_per_actor == 0 {
+        eprintln!("--tables, --actors, --ops-per-actor must all be > 0");
+        std::process::exit(2);
+    }
+
+    let temp = tempfile::tempdir().expect("tempdir");
+    let repo = temp.path().join("bench.omni");
+    let schema = build_schema(args.tables);
+    Omnigraph::init(repo.to_str().unwrap(), &schema)
+        .await
+        .expect("init repo");
+
+    let state = AppState::open(repo.to_string_lossy().to_string())
+        .await
+        .expect("open AppState");
+    let app = build_app(state);
+
+    eprintln!(
+        "running mode={:?} tables={} actors={} ops_per_actor={}",
+        args.mode, args.tables, args.actors, args.ops_per_actor
+    );
+
+    let start = Instant::now();
+    let mut handles = Vec::with_capacity(args.actors);
+    for actor_idx in 0..args.actors {
+        let app = app.clone();
+        let mode = args.mode;
+        let ops = args.ops_per_actor;
+        let num_tables = args.tables;
+        handles.push(tokio::spawn(async move {
+            drive_actor(app, actor_idx, ops, mode, num_tables).await
+        }));
+    }
+
+    let mut all_latencies: Vec<Duration> = Vec::with_capacity(args.actors * args.ops_per_actor);
+    let mut total_errors = 0usize;
+    for h in handles {
+        let (lats, errs) = h.await.expect("actor task panicked");
+        all_latencies.extend(lats);
+        total_errors += errs;
+    }
+    let wall = start.elapsed();
+
+    all_latencies.sort();
+    let n = all_latencies.len();
+    let pct = |p: f64| -> f64 {
+        if n == 0 {
+            return 0.0;
+        }
+        let idx = ((n as f64 - 1.0) * p).round() as usize;
+        all_latencies[idx].as_secs_f64() * 1000.0
+    };
+    let max_ms = all_latencies
+        .last()
+        .map(|d| d.as_secs_f64() * 1000.0)
+        .unwrap_or(0.0);
+    let throughput = if wall.as_secs_f64() > 0.0 {
+        n as f64 / wall.as_secs_f64()
+    } else {
+        0.0
+    };
+
+    let results = BenchResults {
+        label: args.label.clone(),
+        mode: args.mode,
+        tables: args.tables,
+        actors: args.actors,
+        ops_per_actor: args.ops_per_actor,
+        total_ops: n,
+        error_count: total_errors,
+        wall_time_ms: wall.as_millis() as u64,
+        throughput_ops_per_sec: throughput,
+        p50_ms: pct(0.50),
+        p95_ms: pct(0.95),
+        p99_ms: pct(0.99),
+        p999_ms: pct(0.999),
+        max_ms,
+        notes: "MR-686 PR 0 baseline. Drives /change via Tower oneshot.",
+    };
+
+    let json = serde_json::to_string_pretty(&results).unwrap();
+    println!("{json}");
+
+    if let Some(path) = args.output.as_ref() {
+        if let Some(parent) = path.parent()
+            && !parent.as_os_str().is_empty()
+        {
+            std::fs::create_dir_all(parent).expect("mkdir output parent");
+        }
+        std::fs::write(path, &json).expect("write output");
+        eprintln!("wrote {}", path.display());
+    }
+
+    if total_errors > 0 {
+        eprintln!("WARN: {total_errors} requests failed");
+        std::process::exit(1);
+    }
+}
--- a/crates/omnigraph-server/src/api.rs
+++ b/crates/omnigraph-server/src/api.rs
@ -339,6 +339,9 @@ pub enum ErrorCode {
    BadRequest,
    NotFound,
    Conflict,
+    /// 429 Too Many Requests — per-actor admission cap exceeded.
+    /// Clients should respect the `Retry-After` header.
+    TooManyRequests,
    Internal,
 }

--- a/crates/omnigraph-server/src/lib.rs
+++ b/crates/omnigraph-server/src/lib.rs
@ -2,6 +2,7 @@ pub mod api;
 pub mod auth;
 pub mod config;
 pub mod policy;
+pub mod workload;

 use std::collections::{HashMap, HashSet};
 use std::fs;
@ -47,7 +48,7 @@ use serde_json::Value;
 use sha2::{Digest, Sha256};
 use subtle::ConstantTimeEq;
 use tokio::net::TcpListener;
-use tokio::sync::{RwLock, mpsc};
+use tokio::sync::mpsc;
 use tower_http::trace::TraceLayer;
 use tracing::{error, info};
 use tracing_subscriber::EnvFilter;
@ -118,7 +119,14 @@ pub struct ServerConfig {
 #[derive(Clone)]
 pub struct AppState {
    uri: String,
-    db: Arc<RwLock<Omnigraph>>,
+    /// PR 2 (MR-686): the engine is now `Arc<Omnigraph>` — no global
+    /// write lock. Concurrent handlers call `&self` engine APIs
+    /// directly. Per-(table, branch) write queues inside the engine
+    /// serialize same-key writers; per-actor admission control on
+    /// `workload` isolates noisy actors.
+    engine: Arc<Omnigraph>,
+    /// Per-actor admission control. See `workload::WorkloadController`.
+    workload: Arc<workload::WorkloadController>,
    bearer_tokens: Arc<[(BearerTokenHash, Arc<str>)]>,
    policy_engine: Option<Arc<PolicyEngine>>,
 }
@ -191,12 +199,36 @@ impl AppState {
            .collect();
        Self {
            uri,
-            db: Arc::new(RwLock::new(db)),
+            engine: Arc::new(db),
+            workload: Arc::new(workload::WorkloadController::from_env()),
            bearer_tokens: Arc::from(bearer_tokens),
            policy_engine: policy_engine.map(Arc::new),
        }
    }

+    /// Construct with a caller-provided [`workload::WorkloadController`].
+    /// Tests and benches use this to override per-actor caps without
+    /// mutating global env vars (which is unsafe in Rust 2024 once the
+    /// async runtime is up — `setenv` isn't thread-safe).
+    pub fn new_with_workload(
+        uri: String,
+        db: Omnigraph,
+        bearer_tokens: Vec<(String, String)>,
+        workload: workload::WorkloadController,
+    ) -> Self {
+        let bearer_tokens: Vec<(BearerTokenHash, Arc<str>)> = bearer_tokens
+            .into_iter()
+            .map(|(actor, token)| (hash_bearer_token(&token), Arc::<str>::from(actor)))
+            .collect();
+        Self {
+            uri,
+            engine: Arc::new(db),
+            workload: Arc::new(workload),
+            bearer_tokens: Arc::from(bearer_tokens),
+            policy_engine: None,
+        }
+    }
+
    pub async fn open(uri: impl Into<String>) -> Result<Self> {
        Self::open_with_bearer_token(uri, None).await
    }
@ -331,6 +363,31 @@ impl ApiError {
        }
    }

+    /// HTTP 429 Too Many Requests — actor exceeded their per-actor
+    /// admission cap (count or byte budget). Clients should respect the
+    /// `Retry-After` header. Mapped from `RejectReason::InFlightCountExceeded`
+    /// and `RejectReason::ByteBudgetExceeded`.
+    pub fn too_many_requests(message: impl Into<String>) -> Self {
+        Self {
+            status: StatusCode::TOO_MANY_REQUESTS,
+            code: ErrorCode::TooManyRequests,
+            message: message.into(),
+            merge_conflicts: Vec::new(),
+            manifest_conflict: None,
+        }
+    }
+
+    /// Convert a `WorkloadController` rejection into the matching
+    /// `ApiError` variant.
+    pub fn from_workload_reject(reject: workload::RejectReason) -> Self {
+        match reject {
+            workload::RejectReason::InFlightCountExceeded { .. }
+            | workload::RejectReason::ByteBudgetExceeded { .. } => {
+                Self::too_many_requests(reject.to_string())
+            }
+        }
+    }
+
    fn merge_conflict(conflicts: Vec<api::MergeConflictOutput>) -> Self {
        Self {
            status: StatusCode::CONFLICT,
@ -418,10 +475,21 @@ fn summarize_merge_conflicts(conflicts: &[api::MergeConflictOutput]) -> String {
    format!("merge conflicts: {}{}", preview.join("; "), suffix)
 }

+/// Constant `Retry-After` value (seconds) emitted on 429 responses.
+const RETRY_AFTER_SECONDS: &str = "60";
+
 impl IntoResponse for ApiError {
    fn into_response(self) -> Response {
+        let mut headers = axum::http::HeaderMap::new();
+        if matches!(self.code, ErrorCode::TooManyRequests) {
+            headers.insert(
+                axum::http::header::RETRY_AFTER,
+                axum::http::HeaderValue::from_static(RETRY_AFTER_SECONDS),
+            );
+        }
        (
            self.status,
+            headers,
            Json(ErrorOutput {
                error: self.message,
                code: Some(self.code),
@ -674,7 +742,7 @@ async fn server_snapshot(
        },
    )?;
    let snapshot = {
-        let db = Arc::clone(&state.db).read_owned().await;
+        let db = &state.engine;
        db.snapshot_of(ReadTarget::branch(branch.as_str()))
            .await
            .map_err(ApiError::from_omni)?
@ -718,7 +786,7 @@ async fn server_read(
    let policy_branch = match &target {
        ReadTarget::Branch(branch) => Some(branch.clone()),
        ReadTarget::Snapshot(_) if state.policy_engine().is_some() && actor.is_some() => {
-            let db = Arc::clone(&state.db).read_owned().await;
+            let db = &state.engine;
            db.resolved_branch_of(target.clone())
                .await
                .map(|branch| branch.or_else(|| Some("main".to_string())))
@ -746,7 +814,7 @@ async fn server_read(
        .map_err(|err| ApiError::bad_request(err.to_string()))?;

    let result = {
-        let db = Arc::clone(&state.db).read_owned().await;
+        let db = &state.engine;
        db.query(
            target.clone(),
            &request.query_source,
@ -798,15 +866,15 @@ async fn server_export(
            target_branch: None,
        },
    )?;
-    let db = Arc::clone(&state.db);
+    let engine = Arc::clone(&state.engine);
    let type_names = request.type_names.clone();
    let table_keys = request.table_keys.clone();
    let (tx, rx) = mpsc::unbounded_channel::<std::result::Result<Bytes, io::Error>>();
    tokio::spawn(async move {
        let result = {
-            let db = db.read().await;
            let mut writer = ExportStreamWriter { sender: tx.clone() };
-            db.export_jsonl_to_writer(&branch, &type_names, &table_keys, &mut writer)
+            engine
+                .export_jsonl_to_writer(&branch, &type_names, &table_keys, &mut writer)
                .await
        };
        if let Err(err) = result {
@ -836,6 +904,7 @@ async fn server_export(
        (status = 401, description = "Unauthorized", body = ErrorOutput),
        (status = 403, description = "Forbidden", body = ErrorOutput),
        (status = 409, description = "Merge conflict", body = ErrorOutput),
+        (status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
    ),
    security(("bearer_token" = [])),
 )]
@ -851,6 +920,10 @@ async fn server_change(
    Json(request): Json<ChangeRequest>,
 ) -> std::result::Result<Json<ChangeOutput>, ApiError> {
    let branch = request.branch.unwrap_or_else(|| "main".to_string());
+    let actor_arc = actor
+        .as_ref()
+        .map(|Extension(actor)| Arc::clone(&actor.0))
+        .unwrap_or_else(|| Arc::<str>::from("anonymous"));
    let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
    authorize_request(
        &state,
@ -862,6 +935,20 @@ async fn server_change(
            target_branch: None,
        },
    )?;
+    // Per-actor admission: bound concurrent in-flight mutations and
+    // estimated bytes per actor. Cedar runs FIRST so denied requests
+    // don't consume admission slots. Estimate uses the request body
+    // size as a coarse proxy; engine memory pressure can run higher.
+    let est_bytes = request.query_source.len() as u64
+        + request
+            .params
+            .as_ref()
+            .map(|p| p.to_string().len() as u64)
+            .unwrap_or(0);
+    let _admission = state
+        .workload
+        .try_admit(&actor_arc, est_bytes)
+        .map_err(ApiError::from_workload_reject)?;
    let (selected_name, query_params) =
        select_named_query(&request.query_source, request.query_name.as_deref())
            .map_err(|err| ApiError::bad_request(err.to_string()))?;
@ -869,7 +956,7 @@ async fn server_change(
        .map_err(|err| ApiError::bad_request(err.to_string()))?;

    let result = {
-        let mut db = Arc::clone(&state.db).write_owned().await;
+        let db = &state.engine;
        db.mutate_as(
            &branch,
            &request.query_source,
@ -924,7 +1011,7 @@ async fn server_schema_get(
        },
    )?;
    let schema_source = {
-        let db = Arc::clone(&state.db).read_owned().await;
+        let db = &state.engine;
        db.schema_source().to_string()
    };
    Ok(Json(SchemaOutput { schema_source }))
@ -941,6 +1028,7 @@ async fn server_schema_get(
        (status = 400, description = "Bad request", body = ErrorOutput),
        (status = 401, description = "Unauthorized", body = ErrorOutput),
        (status = 403, description = "Forbidden", body = ErrorOutput),
+        (status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
    ),
    security(("bearer_token" = [])),
 )]
@ -955,6 +1043,10 @@ async fn server_schema_apply(
    actor: Option<Extension<AuthenticatedActor>>,
    Json(request): Json<SchemaApplyRequest>,
 ) -> std::result::Result<Json<SchemaApplyOutput>, ApiError> {
+    let actor_arc = actor
+        .as_ref()
+        .map(|Extension(actor)| Arc::clone(&actor.0))
+        .unwrap_or_else(|| Arc::<str>::from("anonymous"));
    let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
    authorize_request(
        &state,
@ -966,8 +1058,13 @@ async fn server_schema_apply(
            target_branch: Some("main".to_string()),
        },
    )?;
+    let est_bytes = request.schema_source.len() as u64;
+    let _admission = state
+        .workload
+        .try_admit(&actor_arc, est_bytes)
+        .map_err(ApiError::from_workload_reject)?;
    let result = {
-        let mut db = Arc::clone(&state.db).write_owned().await;
+        let db = &state.engine;
        db.apply_schema(&request.schema_source)
            .await
            .map_err(ApiError::from_omni)?
@ -986,6 +1083,7 @@ async fn server_schema_apply(
        (status = 400, description = "Bad request", body = ErrorOutput),
        (status = 401, description = "Unauthorized", body = ErrorOutput),
        (status = 403, description = "Forbidden", body = ErrorOutput),
+        (status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
    ),
    security(("bearer_token" = [])),
 )]
@ -1004,10 +1102,14 @@ async fn server_ingest(
    let branch = request.branch.unwrap_or_else(|| "main".to_string());
    let from = request.from.unwrap_or_else(|| "main".to_string());
    let mode = request.mode.unwrap_or(omnigraph::loader::LoadMode::Merge);
+    let actor_arc = actor
+        .as_ref()
+        .map(|Extension(actor)| Arc::clone(&actor.0))
+        .unwrap_or_else(|| Arc::<str>::from("anonymous"));
    let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());

    let branch_exists = {
-        let db = Arc::clone(&state.db).read_owned().await;
+        let db = &state.engine;
        db.branch_list()
            .await
            .map_err(ApiError::from_omni)?
@ -1037,9 +1139,14 @@ async fn server_ingest(
            target_branch: None,
        },
    )?;
+    let est_bytes = request.data.len() as u64;
+    let _admission = state
+        .workload
+        .try_admit(&actor_arc, est_bytes)
+        .map_err(ApiError::from_workload_reject)?;

    let result = {
-        let mut db = Arc::clone(&state.db).write_owned().await;
+        let db = &state.engine;
        db.ingest_as(&branch, Some(&from), &request.data, mode, actor_id)
            .await
            .map_err(ApiError::from_omni)?
@ -1085,7 +1192,7 @@ async fn server_branch_list(
        },
    )?;
    let mut branches = {
-        let db = Arc::clone(&state.db).read_owned().await;
+        let db = &state.engine;
        db.branch_list().await.map_err(ApiError::from_omni)?
    };
    branches.sort();
@ -1104,6 +1211,7 @@ async fn server_branch_list(
        (status = 401, description = "Unauthorized", body = ErrorOutput),
        (status = 403, description = "Forbidden", body = ErrorOutput),
        (status = 409, description = "Branch already exists", body = ErrorOutput),
+        (status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
    ),
    security(("bearer_token" = [])),
 )]
@ -1118,6 +1226,10 @@ async fn server_branch_create(
    Json(request): Json<BranchCreateRequest>,
 ) -> std::result::Result<Json<BranchCreateOutput>, ApiError> {
    let from = request.from.unwrap_or_else(|| "main".to_string());
+    let actor_arc = actor
+        .as_ref()
+        .map(|Extension(actor)| Arc::clone(&actor.0))
+        .unwrap_or_else(|| Arc::<str>::from("anonymous"));
    authorize_request(
        &state,
        actor.as_ref().map(|Extension(actor)| actor),
@ -1131,8 +1243,15 @@ async fn server_branch_create(
            target_branch: Some(request.name.clone()),
        },
    )?;
+    // Branch metadata only — small constant bytes estimate. The Lance
+    // shallow-clone work is bounded by the parent's manifest size, not
+    // the request body.
+    let _admission = state
+        .workload
+        .try_admit(&actor_arc, 256)
+        .map_err(ApiError::from_workload_reject)?;
    {
-        let mut db = Arc::clone(&state.db).write_owned().await;
+        let db = &state.engine;
        db.branch_create_from(ReadTarget::branch(&from), &request.name)
            .await
            .map_err(ApiError::from_omni)?;
@ -1158,6 +1277,7 @@ async fn server_branch_create(
        (status = 401, description = "Unauthorized", body = ErrorOutput),
        (status = 403, description = "Forbidden", body = ErrorOutput),
        (status = 404, description = "Branch not found", body = ErrorOutput),
+        (status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
    ),
    security(("bearer_token" = [])),
 )]
@ -1171,6 +1291,10 @@ async fn server_branch_delete(
    actor: Option<Extension<AuthenticatedActor>>,
    Path(branch): Path<String>,
 ) -> std::result::Result<Json<BranchDeleteOutput>, ApiError> {
+    let actor_arc = actor
+        .as_ref()
+        .map(|Extension(actor)| Arc::clone(&actor.0))
+        .unwrap_or_else(|| Arc::<str>::from("anonymous"));
    let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
    authorize_request(
        &state,
@ -1182,8 +1306,13 @@ async fn server_branch_delete(
            target_branch: Some(branch.clone()),
        },
    )?;
+    // Metadata-only manifest tombstone — small constant estimate.
+    let _admission = state
+        .workload
+        .try_admit(&actor_arc, 256)
+        .map_err(ApiError::from_workload_reject)?;
    {
-        let mut db = Arc::clone(&state.db).write_owned().await;
+        let db = &state.engine;
        db.branch_delete(&branch)
            .await
            .map_err(ApiError::from_omni)?;
@ -1207,6 +1336,7 @@ async fn server_branch_delete(
        (status = 401, description = "Unauthorized", body = ErrorOutput),
        (status = 403, description = "Forbidden", body = ErrorOutput),
        (status = 409, description = "Merge conflict", body = ErrorOutput),
+        (status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
    ),
    security(("bearer_token" = [])),
 )]
@ -1222,6 +1352,10 @@ async fn server_branch_merge(
    Json(request): Json<BranchMergeRequest>,
 ) -> std::result::Result<Json<BranchMergeOutput>, ApiError> {
    let target = request.target.unwrap_or_else(|| "main".to_string());
+    let actor_arc = actor
+        .as_ref()
+        .map(|Extension(actor)| Arc::clone(&actor.0))
+        .unwrap_or_else(|| Arc::<str>::from("anonymous"));
    let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
    authorize_request(
        &state,
@ -1233,8 +1367,15 @@ async fn server_branch_merge(
            target_branch: Some(target.clone()),
        },
    )?;
+    // Merge body is small JSON; the heavy work is in the engine but is
+    // bounded per-(table, branch) by the writer queue. Small constant
+    // estimate suffices for the actor in-flight count.
+    let _admission = state
+        .workload
+        .try_admit(&actor_arc, 256)
+        .map_err(ApiError::from_workload_reject)?;
    let outcome = {
-        let mut db = Arc::clone(&state.db).write_owned().await;
+        let db = &state.engine;
        db.branch_merge_as(&request.source, &target, actor_id)
            .await
            .map_err(ApiError::from_omni)?
@ -1283,7 +1424,7 @@ async fn server_commit_list(
        },
    )?;
    let commits = {
-        let db = Arc::clone(&state.db).read_owned().await;
+        let db = &state.engine;
        db.list_commits(query.branch.as_deref())
            .await
            .map_err(ApiError::from_omni)?
@ -1332,7 +1473,7 @@ async fn server_commit_show(
        },
    )?;
    let commit = {
-        let db = Arc::clone(&state.db).read_owned().await;
+        let db = &state.engine;
        db.get_commit(&commit_id)
            .await
            .map_err(ApiError::from_omni)?
--- a/crates/omnigraph-server/src/workload.rs
+++ b/crates/omnigraph-server/src/workload.rs
@ -0,0 +1,363 @@
+//! Per-actor admission control for the HTTP server (MR-686 §VII.A).
+//!
+//! The HTTP server's previous global `RwLock<Omnigraph>` serialized every
+//! mutating request across all actors. PR 2 removes that lock — engine
+//! APIs are now `&self`, so concurrent calls from different actors can
+//! run against `Arc<Omnigraph>` simultaneously. Without admission
+//! control, one heavy actor can exhaust shared capacity (Lance I/O
+//! threads, manifest churn, network) and starve other actors.
+//!
+//! This module provides:
+//!
+//! - **Per-actor in-flight count cap**: each actor has a
+//!   `tokio::sync::Semaphore` with `OMNIGRAPH_PER_ACTOR_INFLIGHT_MAX`
+//!   permits (default 16). `try_acquire_owned()` returns `Err` when
+//!   exhausted; the server maps this to HTTP 429.
+//!
+//! - **Per-actor in-flight byte budget**: each actor accumulates an
+//!   `AtomicU64` byte estimate. `fetch_add(est_bytes)` then a check
+//!   against `byte_cap` is race-free via decrement-on-rejection. The
+//!   server maps an over-budget result to HTTP 429 as well.
+//!
+//! Counts are governed by the semaphore (race-free `try_acquire_owned()`
+//! enforces the cap atomically); bytes use `fetch_add` + decrement-on-
+//! rejection. Both checks are atomic compare-and-act, never
+//! load-then-act — the test
+//! `actor_admission_race_does_not_exceed_cap` pins this contract by
+//! spawning 32 concurrent `try_admit` calls against a cap of 16 and
+//! asserting exactly 16 succeed.
+//!
+//! Acquisition order against the engine's per-(table, branch) write
+//! queue: admission FIRST (the HTTP handler reserves capacity before
+//! calling into the engine), engine queue SECOND (acquired inside
+//! `MutationStaging::commit_all`). This composes cleanly because
+//! admission is a single per-actor count + budget check, never
+//! cross-actor; nothing the engine does can change a peer actor's
+//! admission state.
+
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use dashmap::DashMap;
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};
+
+/// Default per-actor in-flight count cap. Override via
+/// `OMNIGRAPH_PER_ACTOR_INFLIGHT_MAX`.
+pub const DEFAULT_PER_ACTOR_INFLIGHT_MAX: u32 = 16;
+
+/// Default per-actor in-flight byte budget (4 GiB). Override via
+/// `OMNIGRAPH_PER_ACTOR_BYTES_MAX`.
+pub const DEFAULT_PER_ACTOR_BYTES_MAX: u64 = 4 * 1024 * 1024 * 1024;
+
+/// Why a `try_admit` call returned `Err`. The server maps each variant
+/// to a specific HTTP response code; see `WorkloadController` docs.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum RejectReason {
+    /// Actor exceeded the per-actor in-flight count cap. HTTP 429.
+    InFlightCountExceeded { cap: u32 },
+    /// Actor exceeded the per-actor in-flight byte budget. HTTP 429.
+    ByteBudgetExceeded { cap: u64, attempted: u64 },
+}
+
+impl std::fmt::Display for RejectReason {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            RejectReason::InFlightCountExceeded { cap } => {
+                write!(f, "actor in-flight count cap {} exceeded", cap)
+            }
+            RejectReason::ByteBudgetExceeded { cap, attempted } => write!(
+                f,
+                "actor byte budget exceeded: would use {} bytes against cap {}",
+                attempted, cap
+            ),
+        }
+    }
+}
+
+/// Per-actor counters. One instance per actor_id, lazily created on
+/// first admission attempt.
+#[derive(Debug)]
+pub(crate) struct ActorState {
+    /// Counts the number of concurrent in-flight requests for this
+    /// actor. `try_acquire_owned()` is the count-cap gate.
+    in_flight_sem: Arc<Semaphore>,
+    /// Total bytes estimated to be in flight for this actor across
+    /// concurrent requests. `fetch_add` + check + decrement-on-failure
+    /// keeps the cap atomic.
+    bytes: AtomicU64,
+    /// Per-actor byte cap (snapshot of `WorkloadController.byte_cap`
+    /// at construction; cap mutations don't propagate to existing
+    /// ActorStates by design — controller config changes apply on
+    /// next ActorState construction).
+    byte_cap: u64,
+    /// Per-actor count cap (same snapshot semantics as `byte_cap`).
+    inflight_cap: u32,
+}
+
+impl ActorState {
+    fn new(inflight_cap: u32, byte_cap: u64) -> Self {
+        Self {
+            in_flight_sem: Arc::new(Semaphore::new(inflight_cap as usize)),
+            bytes: AtomicU64::new(0),
+            byte_cap,
+            inflight_cap,
+        }
+    }
+}
+
+/// Server-side per-actor admission controller. Constructed once at
+/// server startup and shared via `Arc<WorkloadController>` on
+/// `AppState`.
+pub struct WorkloadController {
+    per_actor: DashMap<Arc<str>, Arc<ActorState>>,
+    inflight_cap: u32,
+    byte_cap: u64,
+}
+
+impl WorkloadController {
+    /// Construct from explicit caps. Tests can override.
+    pub fn new(inflight_cap: u32, byte_cap: u64) -> Self {
+        Self {
+            per_actor: DashMap::new(),
+            inflight_cap,
+            byte_cap,
+        }
+    }
+
+    /// Construct from environment variables, falling back to defaults.
+    /// Bad env values fall back to the default with a `tracing::warn!`.
+    pub fn from_env() -> Self {
+        let inflight_cap = parse_env_u32(
+            "OMNIGRAPH_PER_ACTOR_INFLIGHT_MAX",
+            DEFAULT_PER_ACTOR_INFLIGHT_MAX,
+        );
+        let byte_cap = parse_env_u64("OMNIGRAPH_PER_ACTOR_BYTES_MAX", DEFAULT_PER_ACTOR_BYTES_MAX);
+        Self::new(inflight_cap, byte_cap)
+    }
+
+    /// Construct with default caps. Suitable for tests / single-tenant
+    /// deployments without explicit configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(DEFAULT_PER_ACTOR_INFLIGHT_MAX, DEFAULT_PER_ACTOR_BYTES_MAX)
+    }
+
+    fn actor_state(&self, actor_id: &Arc<str>) -> Arc<ActorState> {
+        if let Some(existing) = self.per_actor.get(actor_id) {
+            return existing.clone();
+        }
+        // Race-on-construct is benign: DashMap's `entry().or_insert_with`
+        // serializes per-key construction; the loser's freshly-built
+        // ActorState gets dropped without observable effect.
+        self.per_actor
+            .entry(actor_id.clone())
+            .or_insert_with(|| Arc::new(ActorState::new(self.inflight_cap, self.byte_cap)))
+            .clone()
+    }
+
+    /// Reserve admission for one in-flight request from `actor_id`
+    /// estimated to consume `est_bytes`. Returns an `AdmissionGuard`
+    /// that releases the count permit + decrements the byte total
+    /// when dropped.
+    ///
+    /// On rejection, the byte counter is decremented before returning
+    /// — callers can retry without leaking budget.
+    pub fn try_admit(
+        &self,
+        actor_id: &Arc<str>,
+        est_bytes: u64,
+    ) -> Result<AdmissionGuard, RejectReason> {
+        let state = self.actor_state(actor_id);
+
+        // Count gate: race-free via `try_acquire_owned()`. If exhausted,
+        // immediately reject — no byte accounting needed for this request.
+        let permit = match Arc::clone(&state.in_flight_sem).try_acquire_owned() {
+            Ok(permit) => permit,
+            Err(TryAcquireError::NoPermits) => {
+                return Err(RejectReason::InFlightCountExceeded {
+                    cap: state.inflight_cap,
+                });
+            }
+            Err(TryAcquireError::Closed) => {
+                return Err(RejectReason::InFlightCountExceeded {
+                    cap: state.inflight_cap,
+                });
+            }
+        };
+
+        // Byte gate: atomic fetch_add then check; decrement on overflow.
+        // `Ordering::SeqCst` is conservative; per-actor accounting is
+        // not on the hot path of read queries.
+        let prior = state.bytes.fetch_add(est_bytes, Ordering::SeqCst);
+        let attempted = prior.saturating_add(est_bytes);
+        if attempted > state.byte_cap {
+            // Roll back the byte add. The permit drops with `permit`
+            // going out of scope below.
+            state.bytes.fetch_sub(est_bytes, Ordering::SeqCst);
+            return Err(RejectReason::ByteBudgetExceeded {
+                cap: state.byte_cap,
+                attempted,
+            });
+        }
+
+        Ok(AdmissionGuard {
+            _permit: permit,
+            actor_state: state,
+            est_bytes,
+        })
+    }
+}
+
+/// Drop-on-completion guard for an admitted request. Dropping releases
+/// the in-flight count permit (via `Drop` on the underlying semaphore
+/// permit) and decrements the actor's byte counter.
+#[derive(Debug)]
+pub struct AdmissionGuard {
+    _permit: OwnedSemaphorePermit,
+    actor_state: Arc<ActorState>,
+    est_bytes: u64,
+}
+
+impl Drop for AdmissionGuard {
+    fn drop(&mut self) {
+        self.actor_state
+            .bytes
+            .fetch_sub(self.est_bytes, Ordering::SeqCst);
+    }
+}
+
+fn parse_env_u32(name: &str, default: u32) -> u32 {
+    match std::env::var(name) {
+        Ok(v) => v.parse::<u32>().unwrap_or_else(|err| {
+            tracing::warn!(
+                env = name,
+                value = %v,
+                error = %err,
+                default,
+                "invalid env value, using default"
+            );
+            default
+        }),
+        Err(_) => default,
+    }
+}
+
+fn parse_env_u64(name: &str, default: u64) -> u64 {
+    match std::env::var(name) {
+        Ok(v) => v.parse::<u64>().unwrap_or_else(|err| {
+            tracing::warn!(
+                env = name,
+                value = %v,
+                error = %err,
+                default,
+                "invalid env value, using default"
+            );
+            default
+        }),
+        Err(_) => default,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn try_admit_admits_under_cap() {
+        let controller = WorkloadController::new(2, 1024);
+        let actor: Arc<str> = "alice".into();
+        let g1 = controller.try_admit(&actor, 100).expect("first admit");
+        let _g2 = controller.try_admit(&actor, 100).expect("second admit");
+        let err = controller
+            .try_admit(&actor, 100)
+            .expect_err("third should reject on count");
+        assert!(matches!(err, RejectReason::InFlightCountExceeded { cap: 2 }));
+        drop(g1);
+        // After drop, a new admit succeeds again.
+        let _g3 = controller
+            .try_admit(&actor, 100)
+            .expect("admit after drop");
+    }
+
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn byte_budget_caps_admission() {
+        let controller = WorkloadController::new(16, 1000);
+        let actor: Arc<str> = "alice".into();
+        let _g1 = controller.try_admit(&actor, 600).expect("first admit");
+        let err = controller
+            .try_admit(&actor, 600)
+            .expect_err("second should reject on bytes");
+        match err {
+            RejectReason::ByteBudgetExceeded { cap, attempted } => {
+                assert_eq!(cap, 1000);
+                assert_eq!(attempted, 1200);
+            }
+            other => panic!("expected ByteBudgetExceeded, got {:?}", other),
+        }
+        // Verify the byte counter was rolled back: a smaller request fits.
+        let _g2 = controller.try_admit(&actor, 300).expect("smaller admit");
+    }
+
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn actor_admission_race_does_not_exceed_cap() {
+        // Pin master plan §"WorkloadController" Finding 6: independent
+        // atomic load + check + add allows two concurrent callers to
+        // both pass a cap-N check. The Semaphore-based gate is
+        // race-free — exactly cap_count callers succeed.
+        //
+        // Each task holds its admission guard until released via a
+        // oneshot channel; this forces real contention because guards
+        // can't drop and free permits before all 32 calls have raced.
+        let controller = Arc::new(WorkloadController::new(16, u64::MAX / 4));
+        let actor: Arc<str> = "racer".into();
+
+        let (release_tx, _) = tokio::sync::broadcast::channel::<()>(1);
+
+        let mut handles = Vec::with_capacity(32);
+        for _ in 0..32 {
+            let controller = Arc::clone(&controller);
+            let actor = actor.clone();
+            let mut release_rx = release_tx.subscribe();
+            handles.push(tokio::spawn(async move {
+                let result = controller.try_admit(&actor, 1);
+                let success = result.is_ok();
+                // Hold the guard (if any) until the test signals release,
+                // so the cap-16 contention is observable across all 32
+                // tasks instead of permits being recycled task-by-task.
+                let _guard = result.ok();
+                let _ = release_rx.recv().await;
+                success
+            }));
+        }
+
+        // Give all 32 tasks a chance to hit `try_admit` before any can
+        // drop their guard. 50ms is plenty for tokio's scheduler on a
+        // 4-worker runtime.
+        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
+        // Release every task; collect succeed/reject counts.
+        let _ = release_tx.send(());
+
+        let mut accepted = 0u32;
+        let mut rejected = 0u32;
+        for h in handles {
+            if h.await.unwrap() {
+                accepted += 1;
+            } else {
+                rejected += 1;
+            }
+        }
+        assert_eq!(accepted, 16, "expected exactly 16 successful admits");
+        assert_eq!(rejected, 16, "expected exactly 16 rejections");
+    }
+
+    #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+    async fn per_actor_caps_independent() {
+        let controller = WorkloadController::new(1, 1024);
+        let alice: Arc<str> = "alice".into();
+        let bob: Arc<str> = "bob".into();
+        let _ga = controller.try_admit(&alice, 100).expect("alice ok");
+        // Alice over count cap, Bob unaffected.
+        let err = controller.try_admit(&alice, 100).expect_err("alice rejected");
+        assert!(matches!(err, RejectReason::InFlightCountExceeded { .. }));
+        let _gb = controller.try_admit(&bob, 100).expect("bob ok");
+    }
+}
--- a/crates/omnigraph-server/tests/server.rs
+++ b/crates/omnigraph-server/tests/server.rs
--- a/crates/omnigraph/Cargo.toml
+++ b/crates/omnigraph/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "omnigraph-engine"
-version = "0.4.1"
+version = "0.4.2"
 edition = "2024"
 description = "Runtime engine for the Omnigraph graph database."
 license = "MIT"
@ -16,7 +16,7 @@ default = []
 failpoints = ["dep:fail", "fail/failpoints"]

 [dependencies]
-omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.1" }
+omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.2" }
 lance = { workspace = true }
 lance-datafusion = { workspace = true }
 datafusion = { workspace = true }
@ -47,9 +47,10 @@ time = { workspace = true }
 async-trait = { workspace = true }
 url = { workspace = true }
 chrono = { workspace = true }
+arc-swap = { workspace = true }

 [dev-dependencies]
-omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.1" }
+omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.2" }
 tokio = { workspace = true }
 lance-namespace-impls = { workspace = true }
 serial_test = "3"
--- a/crates/omnigraph/src/db/mod.rs
+++ b/crates/omnigraph/src/db/mod.rs
@ -5,6 +5,7 @@ mod omnigraph;
 mod recovery_audit;
 mod run_registry;
 mod schema_state;
+pub(crate) mod write_queue;

 pub use commit_graph::GraphCommit;
 pub use graph_coordinator::{GraphCoordinator, ReadTarget, ResolvedTarget, SnapshotId};
@ -18,6 +19,53 @@ pub(crate) use run_registry::is_internal_run_branch;

 pub(crate) const SCHEMA_APPLY_LOCK_BRANCH: &str = "__schema_apply_lock__";

+/// Mutation kind, threaded through the version-check call sites so the
+/// engine can apply an op-kind-aware policy:
+///
+/// - `Insert` / `Merge`: skip the strict pre-stage `ensure_expected_version`
+///   check. Lance's `MergeInsertBuilder` rebases concurrent appends; the
+///   per-(table, branch) writer queue serializes `commit_staged`; the
+///   publisher's CAS (refreshed under the queue via
+///   `MutationStaging::commit_all`'s `snapshot_for_branch` call) catches
+///   genuine cross-process drift as `ManifestConflictDetails::ExpectedVersionMismatch`.
+///   The pre-stage strict check would over-reject in-process concurrent
+///   inserts, which is exactly the case PR 2 / MR-686 designed the
+///   per-table queue to allow.
+///
+/// - `Update` / `Delete`: keep the strict check. These have read-modify-write
+///   semantics; Lance moving between the read at stage time and the write
+///   at commit time means the staged batch is computed against stale state.
+///   The strict check guards the per-query SI invariant. SERIALIZABLE
+///   opt-in (§VI.36 future seam) is the long-term answer for tighter
+///   semantics; today, in-process update-update races on the same key
+///   stay rejected as 409 — acceptable.
+///
+/// - `SchemaRewrite`: keep the strict check. Schema apply runs under the
+///   graph-wide `__schema_apply_lock__` AND per-table queues; the strict
+///   check is uncontested at that point.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum MutationOpKind {
+    Insert,
+    Merge,
+    Update,
+    Delete,
+    SchemaRewrite,
+}
+
+impl MutationOpKind {
+    /// Whether the strict pre-stage `ensure_expected_version` check should
+    /// fire for this op kind. See [`MutationOpKind`] for the rationale per
+    /// kind.
+    pub(crate) fn strict_pre_stage_version_check(self) -> bool {
+        match self {
+            MutationOpKind::Insert | MutationOpKind::Merge => false,
+            MutationOpKind::Update
+            | MutationOpKind::Delete
+            | MutationOpKind::SchemaRewrite => true,
+        }
+    }
+}
+
 pub(crate) fn is_schema_apply_lock_branch(name: &str) -> bool {
    name.trim_start_matches('/') == SCHEMA_APPLY_LOCK_BRANCH
 }
--- a/crates/omnigraph/src/db/omnigraph.rs
+++ b/crates/omnigraph/src/db/omnigraph.rs
@ -2,6 +2,7 @@ use std::collections::{BTreeSet, HashMap, HashSet};
 use std::io::Write;
 use std::sync::Arc;

+use arc_swap::ArcSwap;
 use arrow_array::{
    Array, BinaryArray, BooleanArray, Date32Array, FixedSizeListArray, Float32Array, Float64Array,
    Int32Array, Int64Array, LargeBinaryArray, LargeListArray, LargeStringArray, ListArray,
@ -73,12 +74,60 @@ pub struct SchemaApplyResult {
 pub struct Omnigraph {
    root_uri: String,
    storage: Arc<dyn StorageAdapter>,
-    coordinator: GraphCoordinator,
+    /// Coordinator state behind a tokio `RwLock`. PR 2 (MR-686) wraps
+    /// this so engine write APIs can be `&self` (the HTTP server's
+    /// `AppState` holds `Arc<Omnigraph>` and dispatches concurrent
+    /// calls without a global write lock). Reads (`snapshot`, `version`,
+    /// `current_branch`, `branch_list`, `resolve_*`, `head_commit_id`,
+    /// `list_commits`, …) acquire `.read().await` and parallelize.
+    /// Writes (`refresh`, `branch_create`, `branch_delete`, `commit_*`,
+    /// `record_*`) acquire `.write().await` and serialize. The atomic
+    /// commit invariant — `commit_manifest_updates` followed by
+    /// `record_graph_commit` must be atomic — is preserved by the
+    /// single `.write()` covering both calls inside
+    /// `commit_updates_with_actor_with_expected`. PR 2 Phase 2
+    /// converted from `Mutex` to `RwLock` because the bench showed
+    /// the Mutex was the dominant serializer for disjoint-table
+    /// workloads. Lock acquisition order: always before `runtime_cache`
+    /// (when both are needed in one scope).
+    coordinator: Arc<tokio::sync::RwLock<GraphCoordinator>>,
    table_store: TableStore,
    runtime_cache: RuntimeCache,
-    catalog: Catalog,
-    schema_source: String,
-    pub(crate) audit_actor_id: Option<String>,
+    /// Read-heavy on every query, written only by `apply_schema`. ArcSwap
+    /// gives atomic pointer swap with zero-cost reads (`load()` returns a
+    /// `Guard<Arc<Catalog>>`), so concurrent queries on different actors
+    /// don't contend on a lock to read the catalog.
+    catalog: Arc<ArcSwap<Catalog>>,
+    /// Read-heavy on schema introspection paths, written only by
+    /// `apply_schema`. Same ArcSwap rationale as `catalog`.
+    schema_source: Arc<ArcSwap<String>>,
+    /// Per-`(table_key, branch)` writer queues. Reachable from engine
+    /// internals (mutation finalize, schema_apply, branch_merge,
+    /// ensure_indices, delete_where) and from future MR-870 recovery
+    /// reconciler. PR 1b adds the field; callers acquire in commits 4+.
+    write_queue: Arc<crate::db::write_queue::WriteQueueManager>,
+    /// Process-wide mutex held across the swap → operate → restore window
+    /// in `branch_merge_impl`. Two concurrent merges with distinct targets
+    /// would otherwise interleave their three separate
+    /// `coordinator.write().await` acquisitions, leaving each merge's
+    /// inner body running against the other's swapped coord. Pinned by
+    /// `concurrent_branch_merges_distinct_targets_do_not_swap_into_each_other`
+    /// in `crates/omnigraph-server/tests/server.rs`.
+    ///
+    /// Cost: serializes ALL concurrent branch merges process-wide.
+    /// Acceptable because branch merges are heavy (table rewrites, index
+    /// rebuilds), per-(table, branch) queues inside `commit_all` already
+    /// serialize the data path, and merges are rare relative to /change
+    /// or /ingest. A finer-grained per-target-branch mutex is a follow-up
+    /// if telemetry shows merge concurrency matters.
+    ///
+    /// The deeper fix — refactor `branch_merge_on_current_target` to take
+    /// an explicit target coord parameter so `self.coordinator` is never
+    /// used as scratch space — is the round-1 shape applied to
+    /// `branch_create_from_impl`. Deferred because it requires unwinding
+    /// every `self.snapshot()` and `self.ensure_commit_graph_initialized()`
+    /// call inside the merge body.
+    merge_exclusive: Arc<tokio::sync::Mutex<()>>,
 }

 /// Whether [`Omnigraph::open`] runs the open-time recovery sweep.
@ -128,12 +177,13 @@ impl Omnigraph {
        Ok(Self {
            root_uri: root.clone(),
            storage,
-            coordinator,
+            coordinator: Arc::new(tokio::sync::RwLock::new(coordinator)),
            table_store: TableStore::new(&root),
            runtime_cache: RuntimeCache::default(),
-            catalog,
-            schema_source: schema_source.to_string(),
-            audit_actor_id: None,
+            catalog: Arc::new(ArcSwap::from_pointee(catalog)),
+            schema_source: Arc::new(ArcSwap::from_pointee(schema_source.to_string())),
+            write_queue: Arc::new(crate::db::write_queue::WriteQueueManager::new()),
+            merge_exclusive: Arc::new(tokio::sync::Mutex::new(())),
        })
    }

@ -214,21 +264,39 @@ impl Omnigraph {
        Ok(Self {
            root_uri: root.clone(),
            storage,
-            coordinator,
+            coordinator: Arc::new(tokio::sync::RwLock::new(coordinator)),
            table_store: TableStore::new(&root),
            runtime_cache: RuntimeCache::default(),
-            catalog,
-            schema_source,
-            audit_actor_id: None,
+            catalog: Arc::new(ArcSwap::from_pointee(catalog)),
+            schema_source: Arc::new(ArcSwap::from_pointee(schema_source)),
+            write_queue: Arc::new(crate::db::write_queue::WriteQueueManager::new()),
+            merge_exclusive: Arc::new(tokio::sync::Mutex::new(())),
        })
    }

-    pub fn catalog(&self) -> &Catalog {
-        &self.catalog
+    /// Returns an `Arc<Catalog>` snapshot. Cheap clone of the current
+    /// catalog pointer; callers can hold the returned `Arc` across awaits
+    /// without blocking concurrent `apply_schema`.
+    pub fn catalog(&self) -> Arc<Catalog> {
+        self.catalog.load_full()
    }

-    pub fn schema_source(&self) -> &str {
-        &self.schema_source
+    /// Returns an `Arc<String>` snapshot of the schema source.
+    pub fn schema_source(&self) -> Arc<String> {
+        self.schema_source.load_full()
+    }
+
+    /// Atomically swap the in-memory catalog. Concurrent readers see
+    /// either the old or the new pointer; never a torn state. Used by
+    /// `apply_schema` and `reload_schema_if_source_changed`.
+    pub(crate) fn store_catalog(&self, catalog: Catalog) {
+        self.catalog.store(Arc::new(catalog));
+    }
+
+    /// Atomically swap the in-memory schema source. Same rationale as
+    /// [`store_catalog`](Self::store_catalog).
+    pub(crate) fn store_schema_source(&self, schema_source: String) {
+        self.schema_source.store(Arc::new(schema_source));
    }

    pub fn uri(&self) -> &str {
@ -243,11 +311,11 @@ impl Omnigraph {
        schema_apply::plan_schema(self, desired_schema_source).await
    }

-    pub async fn apply_schema(&mut self, desired_schema_source: &str) -> Result<SchemaApplyResult> {
+    pub async fn apply_schema(&self, desired_schema_source: &str) -> Result<SchemaApplyResult> {
        schema_apply::apply_schema(self, desired_schema_source).await
    }

-    pub(crate) async fn ensure_schema_apply_idle(&mut self, operation: &str) -> Result<()> {
+    pub(crate) async fn ensure_schema_apply_idle(&self, operation: &str) -> Result<()> {
        schema_apply::ensure_schema_apply_idle(self, operation).await
    }

@ -278,6 +346,26 @@ impl Omnigraph {
        self.storage.as_ref()
    }

+    /// Per-`(table_key, branch)` writer queues.
+    ///
+    /// Engine-internal writers (mutation finalize, schema_apply,
+    /// branch_merge, ensure_indices, delete_where) and the future MR-870
+    /// recovery reconciler reach the queue manager via this accessor.
+    /// Returns an `Arc` clone so callers can hold the manager across
+    /// `&mut self` engine API boundaries.
+    pub(crate) fn write_queue(&self) -> Arc<crate::db::write_queue::WriteQueueManager> {
+        Arc::clone(&self.write_queue)
+    }
+
+    /// Engine-internal access to the merge-exclusive mutex. Held across
+    /// the swap → operate → restore window in `branch_merge_impl` so
+    /// concurrent merges with distinct targets don't corrupt
+    /// `self.coordinator` mid-operation. See the field doc on
+    /// `Omnigraph::merge_exclusive` for the full design rationale.
+    pub(crate) fn merge_exclusive(&self) -> Arc<tokio::sync::Mutex<()>> {
+        Arc::clone(&self.merge_exclusive)
+    }
+
    /// Engine-level access to the repo's normalized root URI. Used by
    /// the recovery sidecar protocol to compute `__recovery/` paths.
    pub(crate) fn root_uri(&self) -> &str {
@ -297,15 +385,16 @@ impl Omnigraph {
    }

    pub(crate) async fn swap_coordinator_for_branch(
-        &mut self,
+        &self,
        branch: Option<&str>,
    ) -> Result<GraphCoordinator> {
        let next = self.open_coordinator_for_branch(branch).await?;
-        Ok(std::mem::replace(&mut self.coordinator, next))
+        let mut coord = self.coordinator.write().await;
+        Ok(std::mem::replace(&mut *coord, next))
    }

-    pub(crate) fn restore_coordinator(&mut self, coordinator: GraphCoordinator) {
-        self.coordinator = coordinator;
+    pub(crate) async fn restore_coordinator(&self, coordinator: GraphCoordinator) {
+        *self.coordinator.write().await = coordinator;
    }

    pub(crate) async fn resolved_branch_target(
@ -315,21 +404,19 @@ impl Omnigraph {
        self.ensure_schema_state_valid().await?;
        let requested = ReadTarget::Branch(branch.unwrap_or("main").to_string());
        let normalized = normalize_branch_name(branch.unwrap_or("main"))?;
-        if normalized.as_deref() == self.coordinator.current_branch() {
-            let snapshot_id = self.coordinator.head_commit_id().await?.unwrap_or_else(|| {
-                SnapshotId::synthetic(
-                    self.coordinator.current_branch(),
-                    self.coordinator.version(),
-                )
+        let coord = self.coordinator.read().await;
+        if normalized.as_deref() == coord.current_branch() {
+            let snapshot_id = coord.head_commit_id().await?.unwrap_or_else(|| {
+                SnapshotId::synthetic(coord.current_branch(), coord.version())
            });
            return Ok(ResolvedTarget {
                requested,
-                branch: self.coordinator.current_branch().map(str::to_string),
+                branch: coord.current_branch().map(str::to_string),
                snapshot_id,
-                snapshot: self.coordinator.snapshot(),
+                snapshot: coord.snapshot(),
            });
        }
-        self.coordinator.resolve_target(&requested).await
+        coord.resolve_target(&requested).await
    }

    pub(crate) async fn snapshot_for_branch(&self, branch: Option<&str>) -> Result<Snapshot> {
@ -338,13 +425,13 @@ impl Omnigraph {
            .map(|resolved| resolved.snapshot)
    }

-    pub(crate) fn version(&self) -> u64 {
-        self.coordinator.version()
+    pub(crate) async fn version(&self) -> u64 {
+        self.coordinator.read().await.version()
    }

    /// Return an immutable Snapshot from the known manifest state. No storage I/O.
-    pub(crate) fn snapshot(&self) -> Snapshot {
-        self.coordinator.snapshot()
+    pub(crate) async fn snapshot(&self) -> Snapshot {
+        self.coordinator.read().await.snapshot()
    }

    pub async fn snapshot_of(&self, target: impl Into<ReadTarget>) -> Result<Snapshot> {
@ -369,10 +456,11 @@ impl Omnigraph {
    }

    /// Synchronize this handle's write base to the latest head of the named branch.
-    pub async fn sync_branch(&mut self, branch: &str) -> Result<()> {
+    pub async fn sync_branch(&self, branch: &str) -> Result<()> {
        self.ensure_schema_state_valid().await?;
        let branch = normalize_branch_name(branch)?;
-        self.coordinator = self.open_coordinator_for_branch(branch.as_deref()).await?;
+        let next = self.open_coordinator_for_branch(branch.as_deref()).await?;
+        *self.coordinator.write().await = next;
        self.runtime_cache.invalidate_all().await;
        Ok(())
    }
@ -409,35 +497,44 @@ impl Omnigraph {
    /// (e.g. `schema_apply` mid-write) MUST use
    /// [`refresh_coordinator_only`](Self::refresh_coordinator_only) to
    /// avoid the recovery sweep racing their own sidecar.
-    pub async fn refresh(&mut self) -> Result<()> {
-        self.coordinator.refresh().await?;
-        let schema_state_recovery = recover_schema_state_files(
-            &self.root_uri,
-            Arc::clone(&self.storage),
-            &self.coordinator.snapshot(),
-        )
-        .await?;
-        crate::db::manifest::recover_manifest_drift(
-            &self.root_uri,
-            Arc::clone(&self.storage),
-            &mut self.coordinator,
-            crate::db::manifest::RecoveryMode::RollForwardOnly,
-            schema_state_recovery,
-        )
-        .await?;
+    pub async fn refresh(&self) -> Result<()> {
+        // Scope the coord write guard to the recovery section only.
+        // `reload_schema_if_source_changed` (below) acquires
+        // `self.coordinator.read().await` when the on-disk schema source
+        // has drifted from the cached `schema_source`. Tokio's RwLock is
+        // not reentrant, so holding the write across that call deadlocks.
+        // Pinned by `composite_flow_schema_apply_then_branch_ops_no_deadlock_in_refresh`.
+        {
+            let mut coord = self.coordinator.write().await;
+            coord.refresh().await?;
+            let schema_state_recovery = recover_schema_state_files(
+                &self.root_uri,
+                Arc::clone(&self.storage),
+                &coord.snapshot(),
+            )
+            .await?;
+            crate::db::manifest::recover_manifest_drift(
+                &self.root_uri,
+                Arc::clone(&self.storage),
+                &mut *coord,
+                crate::db::manifest::RecoveryMode::RollForwardOnly,
+                schema_state_recovery,
+            )
+            .await?;
+        } // ← write guard released before reload's read acquisition
        self.reload_schema_if_source_changed().await?;
        self.runtime_cache.invalidate_all().await;
        Ok(())
    }

-    async fn reload_schema_if_source_changed(&mut self) -> Result<()> {
+    async fn reload_schema_if_source_changed(&self) -> Result<()> {
        let schema_path = schema_source_uri(&self.root_uri);
        let schema_source = self.storage.read_text(&schema_path).await?;
-        if schema_source == self.schema_source {
+        if schema_source == *self.schema_source.load_full() {
            return Ok(());
        }
        let current_source_ir = read_schema_ir_from_source(&schema_source)?;
-        let branches = self.coordinator.branch_list().await?;
+        let branches = self.coordinator.read().await.branch_list().await?;
        let (accepted_ir, _) = load_or_bootstrap_schema_contract(
            &self.root_uri,
            Arc::clone(&self.storage),
@ -447,8 +544,8 @@ impl Omnigraph {
        .await?;
        let mut catalog = build_catalog_from_ir(&accepted_ir)?;
        fixup_blob_schemas(&mut catalog);
-        self.schema_source = schema_source;
-        self.catalog = catalog;
+        self.store_schema_source(schema_source);
+        self.store_catalog(catalog);
        Ok(())
    }

@ -459,15 +556,15 @@ impl Omnigraph {
    /// here would observe the caller's own sidecar, classify it as
    /// RolledPastExpected, and roll it forward — racing the caller's
    /// own publish path.
-    pub(crate) async fn refresh_coordinator_only(&mut self) -> Result<()> {
-        self.coordinator.refresh().await?;
+    pub(crate) async fn refresh_coordinator_only(&self) -> Result<()> {
+        self.coordinator.write().await.refresh().await?;
        self.runtime_cache.invalidate_all().await;
        Ok(())
    }

    pub async fn resolve_snapshot(&self, branch: &str) -> Result<SnapshotId> {
        self.ensure_schema_state_valid().await?;
-        self.coordinator.resolve_snapshot_id(branch).await
+        self.coordinator.read().await.resolve_snapshot_id(branch).await
    }

    pub(crate) async fn resolved_target(
@ -475,7 +572,7 @@ impl Omnigraph {
        target: impl Into<ReadTarget>,
    ) -> Result<ResolvedTarget> {
        self.ensure_schema_state_valid().await?;
-        self.coordinator.resolve_target(&target.into()).await
+        self.coordinator.read().await.resolve_target(&target.into()).await
    }

    // ─── Change detection ────────────────────────────────────────────────
@ -506,26 +603,20 @@ impl Omnigraph {
        to_commit_id: &str,
        filter: &crate::changes::ChangeFilter,
    ) -> Result<crate::changes::ChangeSet> {
-        let from_commit = self
-            .coordinator
-            .resolve_commit(&SnapshotId::new(from_commit_id))
-            .await?;
-        let to_commit = self
-            .coordinator
-            .resolve_commit(&SnapshotId::new(to_commit_id))
-            .await?;
-        let from_snap = self
-            .coordinator
+        let coord = self.coordinator.read().await;
+        let from_commit = coord.resolve_commit(&SnapshotId::new(from_commit_id)).await?;
+        let to_commit = coord.resolve_commit(&SnapshotId::new(to_commit_id)).await?;
+        let from_snap = coord
            .resolve_target(&ReadTarget::Snapshot(SnapshotId::new(
                from_commit.graph_commit_id.clone(),
            )))
            .await?;
-        let to_snap = self
-            .coordinator
+        let to_snap = coord
            .resolve_target(&ReadTarget::Snapshot(SnapshotId::new(
                to_commit.graph_commit_id.clone(),
            )))
            .await?;
+        drop(coord);
        crate::changes::diff_snapshots(
            self.uri(),
            &from_snap.snapshot,
@ -558,7 +649,7 @@ impl Omnigraph {
    /// Create a Snapshot at any historical manifest version.
    pub async fn snapshot_at_version(&self, version: u64) -> Result<Snapshot> {
        self.ensure_schema_state_valid().await?;
-        self.coordinator.snapshot_at_version(version).await
+        self.coordinator.read().await.snapshot_at_version(version).await
    }

    pub async fn export_jsonl(
@ -606,11 +697,11 @@ impl Omnigraph {
    /// unbranched subtables keep inheriting `main`, while subtables inherited
    /// from an ancestor branch are first forked into the active branch before
    /// their index metadata is updated.
-    pub async fn ensure_indices(&mut self) -> Result<()> {
+    pub async fn ensure_indices(&self) -> Result<()> {
        table_ops::ensure_indices(self).await
    }

-    pub async fn ensure_indices_on(&mut self, branch: &str) -> Result<()> {
+    pub async fn ensure_indices_on(&self, branch: &str) -> Result<()> {
        table_ops::ensure_indices_on(self, branch).await
    }

@ -633,7 +724,7 @@ impl Omnigraph {

    /// Compact small Lance fragments into fewer larger ones across every
    /// node + edge table on `main`. See [`optimize`] for details.
-    pub async fn optimize(&mut self) -> Result<Vec<optimize::TableOptimizeStats>> {
+    pub async fn optimize(&self) -> Result<Vec<optimize::TableOptimizeStats>> {
        optimize::optimize_all_tables(self).await
    }

@ -658,8 +749,8 @@ impl Omnigraph {
    /// ```
    pub async fn read_blob(&self, type_name: &str, id: &str, property: &str) -> Result<BlobFile> {
        self.ensure_schema_state_valid().await?;
-        let node_type = self
-            .catalog
+        let catalog = self.catalog();
+        let node_type = catalog
            .node_types
            .get(type_name)
            .ok_or_else(|| OmniError::manifest(format!("unknown node type '{}'", type_name)))?;
@ -670,7 +761,7 @@ impl Omnigraph {
            )));
        }

-        let snapshot = self.snapshot();
+        let snapshot = self.snapshot().await;
        let table_key = format!("node:{}", type_name);
        let ds = snapshot.open(&table_key).await?;

@ -698,12 +789,12 @@ impl Omnigraph {
        })
    }

-    pub(crate) fn active_branch(&self) -> Option<&str> {
-        self.coordinator.current_branch()
+    pub(crate) async fn active_branch(&self) -> Option<String> {
+        self.coordinator.read().await.current_branch().map(str::to_string)
    }

    async fn ensure_branch_delete_safe(&self, branch: &str, branches: &[String]) -> Result<()> {
-        let descendants = self.coordinator.branch_descendants(branch).await?;
+        let descendants = self.coordinator.read().await.branch_descendants(branch).await?;
        if let Some(descendant) = descendants.first() {
            return Err(OmniError::manifest_conflict(format!(
                "cannot delete branch '{}' because descendant branch '{}' still depends on it",
@ -758,8 +849,9 @@ impl Omnigraph {
        Ok(())
    }

-    async fn delete_branch_storage_only(&mut self, branch: &str) -> Result<()> {
-        if self.coordinator.current_branch() == Some(branch) {
+    async fn delete_branch_storage_only(&self, branch: &str) -> Result<()> {
+        let active = self.coordinator.read().await.current_branch().map(str::to_string);
+        if active.as_deref() == Some(branch) {
            return Err(OmniError::manifest_conflict(format!(
                "cannot delete currently active branch '{}'",
                branch
@ -773,7 +865,7 @@ impl Omnigraph {
            .map(|entry| (entry.table_key.clone(), entry.table_path.clone()))
            .collect::<Vec<_>>();

-        self.coordinator.branch_delete(branch).await?;
+        self.coordinator.write().await.branch_delete(branch).await?;
        self.cleanup_deleted_branch_tables(branch, &owned_tables)
            .await
    }
@ -794,19 +886,15 @@ impl Omnigraph {
            .map(|id| id.map(|snapshot_id| snapshot_id.as_str().to_string()))
    }

-    pub async fn branch_create(&mut self, name: &str) -> Result<()> {
+    pub async fn branch_create(&self, name: &str) -> Result<()> {
        self.ensure_schema_state_valid().await?;
        self.ensure_schema_apply_idle("branch_create").await?;
        ensure_public_branch_ref(name, "branch_create")?;
-        self.coordinator.branch_create(name).await
-    }
-
-    pub(crate) fn current_audit_actor(&self) -> Option<&str> {
-        self.audit_actor_id.as_deref()
+        self.coordinator.write().await.branch_create(name).await
    }

    pub async fn branch_create_from(
-        &mut self,
+        &self,
        from: impl Into<ReadTarget>,
        name: &str,
    ) -> Result<()> {
@ -815,7 +903,7 @@ impl Omnigraph {
    }

    async fn branch_create_from_impl(
-        &mut self,
+        &self,
        from: impl Into<ReadTarget>,
        name: &str,
        allow_internal_refs: bool,
@ -831,25 +919,39 @@ impl Omnigraph {
            ensure_public_branch_ref(name, "branch_create_from")?;
        }
        let branch = normalize_branch_name(&branch_name)?;
-        let previous = self.swap_coordinator_for_branch(branch.as_deref()).await?;
-        let result = self.coordinator.branch_create(name).await;
-        self.restore_coordinator(previous);
-        result
+        // Operate on a freshly-opened source coordinator that's owned locally
+        // — never touch `self.coordinator`. The pre-fix implementation used
+        // `swap_coordinator_for_branch` + operate + `restore_coordinator` as
+        // three separate `coordinator.write().await` acquisitions; under
+        // `&self` concurrency, a second `branch_create_from` could swap
+        // self.coordinator between this caller's swap and operate steps,
+        // making the operate run against the wrong source branch and
+        // forking off the wrong HEAD. Pinned by
+        // `concurrent_branch_create_from_distinct_parents_does_not_corrupt_coordinator`
+        // in `crates/omnigraph-server/tests/server.rs`.
+        //
+        // `branch_create` mutates only the local coord's commit-graph cache;
+        // the manifest write is durable on disk regardless of which
+        // coord-handle issued it. Discarding `source_coord` after the call
+        // is the right shape — the new branch is reachable from any
+        // subsequent open of any coord.
+        let mut source_coord = self.open_coordinator_for_branch(branch.as_deref()).await?;
+        source_coord.branch_create(name).await
    }

    pub async fn branch_list(&self) -> Result<Vec<String>> {
        self.ensure_schema_state_valid().await?;
-        self.coordinator.branch_list().await
+        self.coordinator.read().await.branch_list().await
    }

-    pub async fn branch_delete(&mut self, name: &str) -> Result<()> {
+    pub async fn branch_delete(&self, name: &str) -> Result<()> {
        self.ensure_schema_state_valid().await?;
        self.ensure_schema_apply_idle("branch_delete").await?;
        ensure_public_branch_ref(name, "branch_delete")?;
        self.refresh().await?;
        let branch = normalize_branch_name(name)?
            .ok_or_else(|| OmniError::manifest("cannot delete branch 'main'".to_string()))?;
-        let branches = self.coordinator.branch_list().await?;
+        let branches = self.coordinator.read().await.branch_list().await?;
        if !branches.iter().any(|candidate| candidate == &branch) {
            return Err(OmniError::manifest_not_found(format!(
                "branch '{}' not found",
@ -863,7 +965,7 @@ impl Omnigraph {

    pub async fn get_commit(&self, commit_id: &str) -> Result<GraphCommit> {
        self.ensure_schema_state_valid().await?;
-        self.coordinator
+        self.coordinator.read().await
            .resolve_commit(&SnapshotId::new(commit_id))
            .await
    }
@ -886,16 +988,18 @@ impl Omnigraph {
    pub(crate) async fn open_for_mutation(
        &self,
        table_key: &str,
+        op_kind: crate::db::MutationOpKind,
    ) -> Result<(Dataset, String, Option<String>)> {
-        table_ops::open_for_mutation(self, table_key).await
+        table_ops::open_for_mutation(self, table_key, op_kind).await
    }

    pub(crate) async fn open_for_mutation_on_branch(
        &self,
        branch: Option<&str>,
        table_key: &str,
+        op_kind: crate::db::MutationOpKind,
    ) -> Result<(Dataset, String, Option<String>)> {
-        table_ops::open_for_mutation_on_branch(self, branch, table_key).await
+        table_ops::open_for_mutation_on_branch(self, branch, table_key, op_kind).await
    }

    pub(crate) async fn fork_dataset_from_entry_state(
@ -923,9 +1027,17 @@ impl Omnigraph {
        full_path: &str,
        table_branch: Option<&str>,
        expected_version: u64,
+        op_kind: crate::db::MutationOpKind,
    ) -> Result<Dataset> {
-        table_ops::reopen_for_mutation(self, table_key, full_path, table_branch, expected_version)
-            .await
+        table_ops::reopen_for_mutation(
+            self,
+            table_key,
+            full_path,
+            table_branch,
+            expected_version,
+            op_kind,
+        )
+        .await
    }

    pub(crate) async fn open_dataset_at_state(
@ -965,43 +1077,47 @@ impl Omnigraph {
    }

    pub(crate) async fn commit_manifest_updates(
-        &mut self,
+        &self,
        updates: &[crate::db::SubTableUpdate],
    ) -> Result<u64> {
        table_ops::commit_manifest_updates(self, updates).await
    }

    pub(crate) async fn record_merge_commit(
-        &mut self,
+        &self,
        manifest_version: u64,
        parent_commit_id: &str,
        merged_parent_commit_id: &str,
+        actor_id: Option<&str>,
    ) -> Result<String> {
        table_ops::record_merge_commit(
            self,
            manifest_version,
            parent_commit_id,
            merged_parent_commit_id,
+            actor_id,
        )
        .await
    }

    pub(crate) async fn commit_updates_on_branch_with_expected(
-        &mut self,
+        &self,
        branch: Option<&str>,
        updates: &[crate::db::SubTableUpdate],
        expected_table_versions: &std::collections::HashMap<String, u64>,
+        actor_id: Option<&str>,
    ) -> Result<u64> {
        table_ops::commit_updates_on_branch_with_expected(
            self,
            branch,
            updates,
            expected_table_versions,
+            actor_id,
        )
        .await
    }

-    pub(crate) async fn ensure_commit_graph_initialized(&mut self) -> Result<()> {
+    pub(crate) async fn ensure_commit_graph_initialized(&self) -> Result<()> {
        table_ops::ensure_commit_graph_initialized(self).await
    }

@ -1495,7 +1611,7 @@ edge WorksAt: Person -> Company
    }

    async fn table_rows_json(db: &Omnigraph, table_key: &str) -> Vec<Value> {
-        let snapshot = db.snapshot();
+        let snapshot = db.snapshot().await;
        let ds = snapshot.open(table_key).await.unwrap();
        let batches = db.table_store().scan_batches(&ds).await.unwrap();
        batches
@ -1509,7 +1625,10 @@ edge WorksAt: Person -> Company
    }

    async fn seed_person_row(db: &mut Omnigraph, name: &str, age: Option<i32>) {
-        let (mut ds, full_path, table_branch) = db.open_for_mutation("node:Person").await.unwrap();
+        let (mut ds, full_path, table_branch) = db
+            .open_for_mutation("node:Person", crate::db::MutationOpKind::Insert)
+            .await
+            .unwrap();
        let schema: Arc<Schema> = Arc::new(ds.schema().into());
        let columns: Vec<Arc<dyn Array>> = schema
            .fields()
@ -1592,7 +1711,7 @@ edge WorksAt: Person -> Company
        let uri = dir.path().to_str().unwrap();
        let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
        seed_person_row(&mut db, "Alice", Some(30)).await;
-        let before_version = db.snapshot().version();
+        let before_version = db.snapshot().await.version();

        let desired = TEST_SCHEMA
            .replace("node Person {\n", "node Human @rename_from(\"Person\") {\n")
@ -1603,7 +1722,7 @@ edge WorksAt: Person -> Company
            );
        db.apply_schema(&desired).await.unwrap();

-        let head = db.snapshot();
+        let head = db.snapshot().await;
        assert!(head.entry("node:Person").is_none());
        assert!(head.entry("node:Human").is_some());
        let historical = ManifestCoordinator::snapshot_at(uri, None, before_version)
@ -1633,7 +1752,7 @@ edge WorksAt: Person -> Company
        .await
        .unwrap();

-        let all_branches = db.coordinator.all_branches().await.unwrap();
+        let all_branches = db.coordinator.read().await.all_branches().await.unwrap();
        assert!(
            !all_branches.iter().any(|b| is_internal_run_branch(b)),
            "run branch should be deleted after publish, got: {:?}",
@ -1657,7 +1776,7 @@ edge WorksAt: Person -> Company
        let desired = TEST_SCHEMA.replace("name: String @key", "name: String @key @index");
        db.apply_schema(&desired).await.unwrap();

-        let snapshot = db.snapshot();
+        let snapshot = db.snapshot().await;
        let ds = snapshot.open("node:Person").await.unwrap();
        assert!(db.table_store().has_fts_index(&ds, "name").await.unwrap());
    }
@ -1676,7 +1795,7 @@ edge WorksAt: Person -> Company
        );
        db.apply_schema(&desired).await.unwrap();

-        let snapshot = db.snapshot();
+        let snapshot = db.snapshot().await;
        let ds = snapshot.open("node:Person").await.unwrap();
        assert!(db.table_store().has_btree_index(&ds, "id").await.unwrap());
        assert!(db.table_store().has_fts_index(&ds, "name").await.unwrap());
@ -1689,11 +1808,16 @@ edge WorksAt: Person -> Company
        let db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
        let mut db = db;
        db.coordinator
+            .write()
+            .await
            .branch_create(SCHEMA_APPLY_LOCK_BRANCH)
            .await
            .unwrap();

-        let err = db.open_for_mutation("node:Person").await.unwrap_err();
+        let err = db
+            .open_for_mutation("node:Person", crate::db::MutationOpKind::Insert)
+            .await
+            .unwrap_err();
        assert!(
            err.to_string()
                .contains("write is unavailable while schema apply is in progress")
@ -1706,6 +1830,8 @@ edge WorksAt: Person -> Company
        let uri = dir.path().to_str().unwrap();
        let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
        db.coordinator
+            .write()
+            .await
            .branch_create(SCHEMA_APPLY_LOCK_BRANCH)
            .await
            .unwrap();
@ -1723,6 +1849,8 @@ edge WorksAt: Person -> Company
        let uri = dir.path().to_str().unwrap();
        let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
        db.coordinator
+            .write()
+            .await
            .branch_create(SCHEMA_APPLY_LOCK_BRANCH)
            .await
            .unwrap();
--- a/crates/omnigraph/src/db/omnigraph/export.rs
+++ b/crates/omnigraph/src/db/omnigraph/export.rs
@ -16,7 +16,7 @@ pub(super) async fn entity_at(
    id: &str,
    version: u64,
 ) -> Result<Option<serde_json::Value>> {
-    let snap = db.coordinator.snapshot_at_version(version).await?;
+    let snap = db.coordinator.read().await.snapshot_at_version(version).await?;
    entity_from_snapshot(db, &snap, table_key, id).await
 }

@ -142,7 +142,8 @@ async fn export_table_to_writer<W: Write>(
        .open_snapshot_table(snapshot, table_key)
        .await?;
    let ordering = Some(vec![ColumnOrdering::asc_nulls_last("id".to_string())]);
-    let blob_properties = blob_properties_for_table_key(db.catalog(), table_key)?;
+    let catalog = db.catalog();
+    let blob_properties = blob_properties_for_table_key(&catalog, table_key)?;

    if blob_properties.is_empty() {
        for batch in db.table_store.scan(&ds, None, None, ordering).await? {
@ -207,9 +208,9 @@ fn write_export_rows_from_batch<W: Write>(
    blob_values: Option<&HashMap<String, Vec<Option<String>>>>,
    writer: &mut W,
 ) -> Result<()> {
+    let catalog = db.catalog();
    if let Some(type_name) = table_key.strip_prefix("node:") {
-        let node_type = db
-            .catalog
+        let node_type = catalog
            .node_types
            .get(type_name)
            .ok_or_else(|| OmniError::manifest(format!("unknown node type '{}'", type_name)))?;
@ -243,8 +244,7 @@ fn write_export_rows_from_batch<W: Write>(
    }

    if let Some(edge_name) = table_key.strip_prefix("edge:") {
-        let edge_type = db
-            .catalog
+        let edge_type = catalog
            .edge_types
            .get(edge_name)
            .ok_or_else(|| OmniError::manifest(format!("unknown edge type '{}'", edge_name)))?;
--- a/crates/omnigraph/src/db/omnigraph/optimize.rs
+++ b/crates/omnigraph/src/db/omnigraph/optimize.rs
@ -74,14 +74,14 @@ pub struct TableCleanupStats {

 /// Run Lance `compact_files` on every node + edge table on `main`.
 /// Tables run in parallel (bounded concurrency).
-pub async fn optimize_all_tables(db: &mut Omnigraph) -> Result<Vec<TableOptimizeStats>> {
+pub async fn optimize_all_tables(db: &Omnigraph) -> Result<Vec<TableOptimizeStats>> {
    db.ensure_schema_state_valid().await?;
    db.ensure_schema_apply_idle("optimize").await?;

    let resolved = db.resolved_branch_target(None).await?;
    let snapshot = resolved.snapshot;

-    let table_tasks: Vec<_> = all_table_keys(&db.catalog)
+    let table_tasks: Vec<_> = all_table_keys(&db.catalog())
        .into_iter()
        .filter_map(|table_key| {
            let entry = snapshot.entry(&table_key)?;
@ -144,7 +144,7 @@ pub async fn cleanup_all_tables(
    let resolved = db.resolved_branch_target(None).await?;
    let snapshot = resolved.snapshot;

-    let table_tasks: Vec<_> = all_table_keys(&db.catalog)
+    let table_tasks: Vec<_> = all_table_keys(&db.catalog())
        .into_iter()
        .filter_map(|table_key| {
            let entry = snapshot.entry(&table_key)?;
--- a/crates/omnigraph/src/db/omnigraph/schema_apply.rs
+++ b/crates/omnigraph/src/db/omnigraph/schema_apply.rs
@ -12,7 +12,7 @@ pub(super) async fn plan_schema(
 }

 pub(super) async fn apply_schema(
-    db: &mut Omnigraph,
+    db: &Omnigraph,
    desired_schema_source: &str,
 ) -> Result<SchemaApplyResult> {
    acquire_schema_apply_lock(db).await?;
@ -27,11 +27,11 @@ pub(super) async fn apply_schema(
 }

 pub(super) async fn apply_schema_with_lock(
-    db: &mut Omnigraph,
+    db: &Omnigraph,
    desired_schema_source: &str,
 ) -> Result<SchemaApplyResult> {
    db.ensure_schema_state_valid().await?;
-    let branches = db.coordinator.all_branches().await?;
+    let branches = db.coordinator.read().await.all_branches().await?;
    // Skip `main` and internal system branches. The schema-apply lock branch
    // is excluded because it is the cluster-wide schema-apply serializer.
    // `__run__*` branches are no longer created; the filter remains as
@ -67,7 +67,7 @@ pub(super) async fn apply_schema_with_lock(
        return Ok(SchemaApplyResult {
            supported: true,
            applied: false,
-            manifest_version: db.version(),
+            manifest_version: db.version().await,
            steps: plan.steps,
        });
    }
@ -75,7 +75,7 @@ pub(super) async fn apply_schema_with_lock(
    let mut desired_catalog = build_catalog_from_ir(&desired_ir)?;
    fixup_blob_schemas(&mut desired_catalog);

-    let snapshot = db.snapshot();
+    let snapshot = db.snapshot().await;
    let base_manifest_version = snapshot.version();
    let mut added_tables = BTreeSet::new();
    let mut renamed_tables = HashMap::new();
@ -209,6 +209,26 @@ pub(super) async fn apply_schema_with_lock(
        });
    }

+    // Acquire per-(table_key, branch) queues for every existing table
+    // that schema_apply will rewrite or re-index. New tables (added or
+    // renamed targets) aren't acquired — they have no existing dataset
+    // to race against. Held across the per-table commit loop and the
+    // manifest publish via `commit_changes_with_actor` below.
+    //
+    // Schema-apply already holds the graph-wide `__schema_apply_lock__`
+    // sentinel branch, so under PR 1b's intermediate state these
+    // per-table acquisitions are uncontended. They exist for symmetry
+    // with future MR-870 recovery, which will need queue acquisition
+    // before any `Dataset::restore` it issues for SchemaApply sidecars.
+    let schema_apply_queue_keys: Vec<(String, Option<String>)> = recovery_pins
+        .iter()
+        .map(|pin| (pin.table_key.clone(), pin.table_branch.clone()))
+        .collect();
+    let _schema_apply_queue_guards = db
+        .write_queue()
+        .acquire_many(&schema_apply_queue_keys)
+        .await;
+
    let recovery_handle = if recovery_pins.is_empty()
        && sidecar_registrations.is_empty()
        && sidecar_tombstones.is_empty()
@ -225,7 +245,10 @@ pub(super) async fn apply_schema_with_lock(
        let mut sidecar = crate::db::manifest::new_sidecar(
            crate::db::manifest::SidecarKind::SchemaApply,
            None,
-            db.audit_actor_id.clone(),
+            // `apply_schema` doesn't currently take an actor (no `apply_schema_as`
+            // public API). The HTTP server's /schema/apply handler can pass actor
+            // context through a follow-up addition. For now, system-attributed.
+            None,
            recovery_pins,
        );
        sidecar.additional_registrations = sidecar_registrations;
@ -266,11 +289,12 @@ pub(super) async fn apply_schema_with_lock(
        })?;
        ensure_snapshot_entry_head_matches(db, source_entry).await?;
        let source_ds = snapshot.open(source_table_key).await?;
+        let current_catalog = db.catalog();
        let batch = batch_for_schema_apply_rewrite(
            db,
            &source_ds,
            source_table_key,
-            &db.catalog,
+            &current_catalog,
            target_table_key,
            &desired_catalog,
            property_renames.get(target_table_key),
@ -311,11 +335,12 @@ pub(super) async fn apply_schema_with_lock(
        })?;
        ensure_snapshot_entry_head_matches(db, entry).await?;
        let source_ds = snapshot.open(table_key).await?;
+        let current_catalog = db.catalog();
        let batch = batch_for_schema_apply_rewrite(
            db,
            &source_ds,
            table_key,
-            &db.catalog,
+            &current_catalog,
            table_key,
            &desired_catalog,
            property_renames.get(table_key),
@ -418,11 +443,11 @@ pub(super) async fn apply_schema_with_lock(
    }

    db.refresh_coordinator_only().await?;
-    if db.version() != base_manifest_version {
+    let current_manifest_version = db.version().await;
+    if current_manifest_version != base_manifest_version {
        return Err(OmniError::manifest_conflict(format!(
            "schema apply lost its write lease: main advanced from v{} to v{} while schema apply was in progress",
-            base_manifest_version,
-            db.version()
+            base_manifest_version, current_manifest_version,
        )));
    }

@ -444,13 +469,15 @@ pub(super) async fn apply_schema_with_lock(

    crate::failpoints::maybe_fail("schema_apply.after_staging_write")?;

-    let actor_id = db.current_audit_actor().map(str::to_string);
+    // `apply_schema` doesn't currently take an actor; system-attributed.
    let PublishedSnapshot {
        manifest_version,
        _snapshot_id: _,
    } = db
        .coordinator
-        .commit_changes_with_actor(&manifest_changes, actor_id.as_deref())
+        .write()
+        .await
+        .commit_changes_with_actor(&manifest_changes, None)
        .await?;

    crate::failpoints::maybe_fail("schema_apply.after_manifest_commit")?;
@ -471,9 +498,9 @@ pub(super) async fn apply_schema_with_lock(
        )
        .await?;

-    db.catalog = desired_catalog;
-    db.schema_source = desired_schema_source.to_string();
-    db.coordinator.refresh().await?;
+    db.store_catalog(desired_catalog);
+    db.store_schema_source(desired_schema_source.to_string());
+    db.coordinator.write().await.refresh().await?;
    db.runtime_cache.invalidate_all().await;
    if changed_edge_tables {
        db.invalidate_graph_index().await;
@ -504,15 +531,15 @@ pub(super) async fn apply_schema_with_lock(
    })
 }

-pub(super) async fn ensure_schema_apply_idle(db: &mut Omnigraph, operation: &str) -> Result<()> {
+pub(super) async fn ensure_schema_apply_idle(db: &Omnigraph, operation: &str) -> Result<()> {
    db.refresh_coordinator_only().await?;
    ensure_schema_apply_not_locked(db, operation).await
 }

-pub(super) async fn acquire_schema_apply_lock(db: &mut Omnigraph) -> Result<()> {
+pub(super) async fn acquire_schema_apply_lock(db: &Omnigraph) -> Result<()> {
    db.ensure_schema_state_valid().await?;
    db.refresh_coordinator_only().await?;
-    let branches = db.coordinator.all_branches().await?;
+    let branches = db.coordinator.read().await.all_branches().await?;
    if branches
        .iter()
        .any(|branch| is_schema_apply_lock_branch(branch))
@ -523,12 +550,16 @@ pub(super) async fn acquire_schema_apply_lock(db: &mut Omnigraph) -> Result<()>
    }

    db.coordinator
+        .write()
+        .await
        .branch_create(SCHEMA_APPLY_LOCK_BRANCH)
        .await?;
    db.refresh_coordinator_only().await?;

    let blocking_branches = db
        .coordinator
+        .read()
+        .await
        .all_branches()
        .await?
        .into_iter()
@ -545,8 +576,10 @@ pub(super) async fn acquire_schema_apply_lock(db: &mut Omnigraph) -> Result<()>
    Ok(())
 }

-pub(super) async fn release_schema_apply_lock(db: &mut Omnigraph) -> Result<()> {
+pub(super) async fn release_schema_apply_lock(db: &Omnigraph) -> Result<()> {
    db.coordinator
+        .write()
+        .await
        .branch_delete(SCHEMA_APPLY_LOCK_BRANCH)
        .await?;
    // Use refresh_coordinator_only — the full Omnigraph::refresh would
@ -560,6 +593,8 @@ pub(super) async fn release_schema_apply_lock(db: &mut Omnigraph) -> Result<()>
 pub(super) async fn ensure_schema_apply_not_locked(db: &Omnigraph, operation: &str) -> Result<()> {
    if db
        .coordinator
+        .read()
+        .await
        .all_branches()
        .await?
        .iter()
--- a/crates/omnigraph/src/db/omnigraph/table_ops.rs
+++ b/crates/omnigraph/src/db/omnigraph/table_ops.rs
@ -2,31 +2,31 @@ use super::*;

 pub(super) async fn graph_index(db: &Omnigraph) -> Result<Arc<crate::graph_index::GraphIndex>> {
    db.ensure_schema_state_valid().await?;
-    let resolved = db
-        .coordinator
+    let coord = db.coordinator.read().await;
+    let resolved = coord
        .resolve_target(&ReadTarget::Branch(
-            db.coordinator
-                .current_branch()
-                .unwrap_or("main")
-                .to_string(),
+            coord.current_branch().unwrap_or("main").to_string(),
        ))
        .await?;
-    db.runtime_cache.graph_index(&resolved, &db.catalog).await
+    drop(coord);
+    let catalog = db.catalog();
+    db.runtime_cache.graph_index(&resolved, &catalog).await
 }

 pub(super) async fn graph_index_for_resolved(
    db: &Omnigraph,
    resolved: &ResolvedTarget,
 ) -> Result<Arc<crate::graph_index::GraphIndex>> {
-    db.runtime_cache.graph_index(resolved, &db.catalog).await
+    let catalog = db.catalog();
+    db.runtime_cache.graph_index(resolved, &catalog).await
 }

-pub(super) async fn ensure_indices(db: &mut Omnigraph) -> Result<()> {
-    let current_branch = db.coordinator.current_branch().map(str::to_string);
+pub(super) async fn ensure_indices(db: &Omnigraph) -> Result<()> {
+    let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
    ensure_indices_for_branch(db, current_branch.as_deref()).await
 }

-pub(super) async fn ensure_indices_on(db: &mut Omnigraph, branch: &str) -> Result<()> {
+pub(super) async fn ensure_indices_on(db: &Omnigraph, branch: &str) -> Result<()> {
    let branch = normalize_branch_name(branch)?;
    ensure_indices_for_branch(db, branch.as_deref()).await
 }
@ -58,12 +58,18 @@ pub(super) async fn failpoint_publish_table_head_without_index_rebuild_for_test(
    };
    let mut expected = std::collections::HashMap::new();
    expected.insert(table_key.to_string(), entry.table_version);
-    commit_prepared_updates_on_branch_with_expected(db, branch.as_deref(), &[update], &expected)
-        .await
+    commit_prepared_updates_on_branch_with_expected(
+        db,
+        branch.as_deref(),
+        &[update],
+        &expected,
+        None,
+    )
+    .await
 }

 pub(super) async fn ensure_indices_for_branch(
-    db: &mut Omnigraph,
+    db: &Omnigraph,
    branch: Option<&str>,
 ) -> Result<()> {
    db.ensure_schema_state_valid().await?;
@ -72,6 +78,7 @@ pub(super) async fn ensure_indices_for_branch(
    let snapshot = resolved.snapshot;
    let mut updates = Vec::new();
    let active_branch = resolved.branch;
+    let catalog = db.catalog();

    // Recovery sidecar: protect the per-table commit_staged loop in
    // build_indices_on_dataset (one commit per index built). Only pins
@ -83,7 +90,7 @@ pub(super) async fn ensure_indices_for_branch(
    // committed work on sibling tables. Steady-state runs (everything
    // already indexed) skip the sidecar entirely.
    let mut recovery_pins: Vec<crate::db::manifest::SidecarTablePin> = Vec::new();
-    for type_name in db.catalog.node_types.keys() {
+    for type_name in catalog.node_types.keys() {
        let table_key = format!("node:{}", type_name);
        let Some(entry) = snapshot.entry(&table_key) else {
            continue;
@ -122,7 +129,7 @@ pub(super) async fn ensure_indices_for_branch(
            });
        }
    }
-    for edge_name in db.catalog.edge_types.keys() {
+    for edge_name in catalog.edge_types.keys() {
        let table_key = format!("edge:{}", edge_name);
        let Some(entry) = snapshot.entry(&table_key) else {
            continue;
@ -147,13 +154,28 @@ pub(super) async fn ensure_indices_for_branch(
            });
        }
    }
+    // Acquire per-(table_key, active_branch) queues for every table
+    // that needs index work. Held across the per-table commit loop and
+    // the manifest publish at the end of this function. Sorted-order
+    // acquisition prevents lock-order inversion against concurrent
+    // multi-table writers (mutation finalize, branch_merge, future
+    // MR-870 recovery). Under PR 1b's intermediate state (global server
+    // RwLock still in place), this acquisition is uncontended.
+    let queue_keys: Vec<(String, Option<String>)> = recovery_pins
+        .iter()
+        .map(|pin| (pin.table_key.clone(), pin.table_branch.clone()))
+        .collect();
+    let _queue_guards = db.write_queue().acquire_many(&queue_keys).await;
+
    let recovery_handle = if recovery_pins.is_empty() {
        None
    } else {
        let sidecar = crate::db::manifest::new_sidecar(
            crate::db::manifest::SidecarKind::EnsureIndices,
            active_branch.clone(),
-            db.audit_actor_id.clone(),
+            // `ensure_indices` doesn't currently take an actor; system-attributed.
+            // Future: add `ensure_indices_as` to thread actor context.
+            None,
            recovery_pins,
        );
        Some(
@ -162,7 +184,7 @@ pub(super) async fn ensure_indices_for_branch(
        )
    };

-    for type_name in db.catalog.node_types.keys() {
+    for type_name in catalog.node_types.keys() {
        let table_key = format!("node:{}", type_name);
        let Some(entry) = snapshot.entry(&table_key) else {
            continue;
@ -179,6 +201,7 @@ pub(super) async fn ensure_indices_for_branch(
                        entry.table_branch.as_deref(),
                        entry.table_version,
                        active_branch,
+                        crate::db::MutationOpKind::SchemaRewrite,
                    )
                    .await?
                }
@ -209,7 +232,7 @@ pub(super) async fn ensure_indices_for_branch(
        }
    }

-    for edge_name in db.catalog.edge_types.keys() {
+    for edge_name in catalog.edge_types.keys() {
        let table_key = format!("edge:{}", edge_name);
        let Some(entry) = snapshot.entry(&table_key) else {
            continue;
@ -226,6 +249,7 @@ pub(super) async fn ensure_indices_for_branch(
                        entry.table_branch.as_deref(),
                        entry.table_version,
                        active_branch,
+                        crate::db::MutationOpKind::SchemaRewrite,
                    )
                    .await?
                }
@ -264,7 +288,7 @@ pub(super) async fn ensure_indices_for_branch(
    crate::failpoints::maybe_fail("ensure_indices.post_phase_b_pre_manifest_commit")?;

    if !updates.is_empty() {
-        commit_prepared_updates_on_branch(db, branch, &updates).await?;
+        commit_prepared_updates_on_branch(db, branch, &updates, None).await?;
    }

    // Recovery sidecar lifecycle: delete after the manifest publish (or
@ -321,7 +345,8 @@ async fn needs_index_work_node(
    if !db.table_store.has_btree_index(&ds, "id").await? {
        return Ok(true);
    }
-    let Some(node_type) = db.catalog.node_types.get(type_name) else {
+    let catalog = db.catalog();
+    let Some(node_type) = catalog.node_types.get(type_name) else {
        return Ok(false);
    };
    for index_cols in &node_type.indices {
@ -376,15 +401,23 @@ async fn needs_index_work_edge(
 pub(super) async fn open_for_mutation(
    db: &Omnigraph,
    table_key: &str,
+    op_kind: crate::db::MutationOpKind,
 ) -> Result<(Dataset, String, Option<String>)> {
-    let current_branch = db.coordinator.current_branch().map(str::to_string);
-    open_for_mutation_on_branch(db, current_branch.as_deref(), table_key).await
+    let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
+    open_for_mutation_on_branch(db, current_branch.as_deref(), table_key, op_kind).await
 }

+/// Open a sub-table for mutation. The `op_kind` selects the strict-vs-relaxed
+/// pre-stage version-check policy — see [`crate::db::MutationOpKind`] for the
+/// rationale per kind. Insert / Merge skip the strict
+/// `ensure_expected_version` check (Lance's natural conflict resolver +
+/// per-(table, branch) queue + publisher CAS handle drift); Update / Delete /
+/// SchemaRewrite keep it (read-modify-write SI).
 pub(super) async fn open_for_mutation_on_branch(
    db: &Omnigraph,
    branch: Option<&str>,
    table_key: &str,
+    op_kind: crate::db::MutationOpKind,
 ) -> Result<(Dataset, String, Option<String>)> {
    db.ensure_schema_apply_not_locked("write").await?;
    let resolved = db.resolved_branch_target(branch).await?;
@ -399,8 +432,10 @@ pub(super) async fn open_for_mutation_on_branch(
                .table_store
                .open_dataset_head_for_write(table_key, &full_path, None)
                .await?;
-            db.table_store
-                .ensure_expected_version(&ds, table_key, entry.table_version)?;
+            if op_kind.strict_pre_stage_version_check() {
+                db.table_store
+                    .ensure_expected_version(&ds, table_key, entry.table_version)?;
+            }
            Ok((ds, full_path, None))
        }
        Some(active_branch) => {
@ -411,6 +446,7 @@ pub(super) async fn open_for_mutation_on_branch(
                entry.table_branch.as_deref(),
                entry.table_version,
                active_branch,
+                op_kind,
            )
            .await?;
            Ok((ds, full_path, table_branch))
@ -425,6 +461,7 @@ pub(super) async fn open_owned_dataset_for_branch_write(
    entry_branch: Option<&str>,
    entry_version: u64,
    active_branch: &str,
+    op_kind: crate::db::MutationOpKind,
 ) -> Result<(Dataset, Option<String>)> {
    match entry_branch {
        Some(branch) if branch == active_branch => {
@ -432,8 +469,10 @@ pub(super) async fn open_owned_dataset_for_branch_write(
                .table_store
                .open_dataset_head_for_write(table_key, full_path, Some(active_branch))
                .await?;
-            db.table_store
-                .ensure_expected_version(&ds, table_key, entry_version)?;
+            if op_kind.strict_pre_stage_version_check() {
+                db.table_store
+                    .ensure_expected_version(&ds, table_key, entry_version)?;
+            }
            Ok((ds, Some(active_branch.to_string())))
        }
        source_branch => {
@ -450,8 +489,10 @@ pub(super) async fn open_owned_dataset_for_branch_write(
                .table_store
                .open_dataset_head_for_write(table_key, full_path, Some(active_branch))
                .await?;
-            db.table_store
-                .ensure_expected_version(&ds, table_key, entry_version)?;
+            if op_kind.strict_pre_stage_version_check() {
+                db.table_store
+                    .ensure_expected_version(&ds, table_key, entry_version)?;
+            }
            Ok((ds, Some(active_branch.to_string())))
        }
    }
@ -482,11 +523,27 @@ pub(super) async fn reopen_for_mutation(
    full_path: &str,
    table_branch: Option<&str>,
    expected_version: u64,
+    op_kind: crate::db::MutationOpKind,
 ) -> Result<Dataset> {
    db.ensure_schema_apply_not_locked("write").await?;
-    db.table_store
-        .reopen_for_mutation(full_path, table_branch, table_key, expected_version)
-        .await
+    if op_kind.strict_pre_stage_version_check() {
+        db.table_store
+            .reopen_for_mutation(full_path, table_branch, table_key, expected_version)
+            .await
+    } else {
+        // Insert / Merge: skip the strict version check. Open at HEAD —
+        // Lance's natural conflict resolver at commit_staged time
+        // (rebase append, dedupe merge_insert) handles concurrent
+        // writers correctly; the publisher CAS in
+        // `MutationStaging::commit_all` (refreshed under the
+        // per-(table, branch) queue via `snapshot_for_branch`) catches
+        // genuine cross-process drift as 409. See
+        // [`crate::db::MutationOpKind`] for the policy rationale.
+        let _ = expected_version;
+        db.table_store
+            .open_dataset_head_for_write(table_key, full_path, table_branch)
+            .await
+    }
 }

 pub(super) async fn open_dataset_at_state(
@ -505,7 +562,8 @@ pub(super) async fn build_indices_on_dataset(
    table_key: &str,
    ds: &mut Dataset,
 ) -> Result<()> {
-    build_indices_on_dataset_for_catalog(db, &db.catalog, table_key, ds).await
+    let catalog = db.catalog();
+    build_indices_on_dataset_for_catalog(db, &catalog, table_key, ds).await
 }

 pub(super) async fn build_indices_on_dataset_for_catalog(
@ -680,12 +738,19 @@ async fn prepare_updates_for_commit(
        let mut prepared_update = update.clone();
        if prepared_update.row_count > 0 {
            let full_path = format!("{}/{}", db.root_uri, entry.table_path);
+            // Strict version check is correct here: this runs INSIDE
+            // the publisher commit path, after `commit_staged` already
+            // advanced Lance HEAD to `prepared_update.table_version`.
+            // The check is a defense-in-depth assertion that the
+            // dataset state matches what we just committed; not the
+            // pre-stage race the op-kind policy targets.
            let mut ds = reopen_for_mutation(
                db,
                &prepared_update.table_key,
                &full_path,
                prepared_update.table_branch.as_deref(),
                prepared_update.table_version,
+                crate::db::MutationOpKind::SchemaRewrite,
            )
            .await?;
            build_indices_on_dataset(db, &prepared_update.table_key, &mut ds).await?;
@ -702,49 +767,50 @@ async fn prepare_updates_for_commit(
 }

 async fn commit_prepared_updates(
-    db: &mut Omnigraph,
+    db: &Omnigraph,
    updates: &[crate::db::SubTableUpdate],
+    actor_id: Option<&str>,
 ) -> Result<u64> {
-    let actor_id = db.current_audit_actor().map(str::to_string);
    let PublishedSnapshot {
        manifest_version,
        _snapshot_id: _,
    } = db
        .coordinator
-        .commit_updates_with_actor(updates, actor_id.as_deref())
+        .write()
+        .await
+        .commit_updates_with_actor(updates, actor_id)
        .await?;
    Ok(manifest_version)
 }

 async fn commit_prepared_updates_with_expected(
-    db: &mut Omnigraph,
+    db: &Omnigraph,
    updates: &[crate::db::SubTableUpdate],
    expected_table_versions: &std::collections::HashMap<String, u64>,
+    actor_id: Option<&str>,
 ) -> Result<u64> {
-    let actor_id = db.current_audit_actor().map(str::to_string);
    let PublishedSnapshot {
        manifest_version,
        _snapshot_id: _,
    } = db
        .coordinator
-        .commit_updates_with_actor_with_expected(
-            updates,
-            expected_table_versions,
-            actor_id.as_deref(),
-        )
+        .write()
+        .await
+        .commit_updates_with_actor_with_expected(updates, expected_table_versions, actor_id)
        .await?;
    Ok(manifest_version)
 }

 pub(super) async fn commit_prepared_updates_on_branch(
-    db: &mut Omnigraph,
+    db: &Omnigraph,
    branch: Option<&str>,
    updates: &[crate::db::SubTableUpdate],
+    actor_id: Option<&str>,
 ) -> Result<u64> {
-    let current_branch = db.coordinator.current_branch().map(str::to_string);
+    let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
    let requested_branch = branch.map(str::to_string);
    if requested_branch == current_branch {
-        return commit_prepared_updates(db, updates).await;
+        return commit_prepared_updates(db, updates, actor_id).await;
    }

    let mut coordinator = match requested_branch.as_deref() {
@ -753,26 +819,32 @@ pub(super) async fn commit_prepared_updates_on_branch(
        }
        None => GraphCoordinator::open(db.uri(), Arc::clone(&db.storage)).await?,
    };
-    let actor_id = db.current_audit_actor().map(str::to_string);
    let PublishedSnapshot {
        manifest_version,
        _snapshot_id: _,
    } = coordinator
-        .commit_updates_with_actor(updates, actor_id.as_deref())
+        .commit_updates_with_actor(updates, actor_id)
        .await?;
    Ok(manifest_version)
 }

 pub(super) async fn commit_prepared_updates_on_branch_with_expected(
-    db: &mut Omnigraph,
+    db: &Omnigraph,
    branch: Option<&str>,
    updates: &[crate::db::SubTableUpdate],
    expected_table_versions: &std::collections::HashMap<String, u64>,
+    actor_id: Option<&str>,
 ) -> Result<u64> {
-    let current_branch = db.coordinator.current_branch().map(str::to_string);
+    let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
    let requested_branch = branch.map(str::to_string);
    if requested_branch == current_branch {
-        return commit_prepared_updates_with_expected(db, updates, expected_table_versions).await;
+        return commit_prepared_updates_with_expected(
+            db,
+            updates,
+            expected_table_versions,
+            actor_id,
+        )
+        .await;
    }

    let mut coordinator = match requested_branch.as_deref() {
@ -781,16 +853,11 @@ pub(super) async fn commit_prepared_updates_on_branch_with_expected(
        }
        None => GraphCoordinator::open(db.uri(), Arc::clone(&db.storage)).await?,
    };
-    let actor_id = db.current_audit_actor().map(str::to_string);
    let PublishedSnapshot {
        manifest_version,
        _snapshot_id: _,
    } = coordinator
-        .commit_updates_with_actor_with_expected(
-            updates,
-            expected_table_versions,
-            actor_id.as_deref(),
-        )
+        .commit_updates_with_actor_with_expected(updates, expected_table_versions, actor_id)
        .await?;
    Ok(manifest_version)
 }
@ -803,31 +870,31 @@ pub(super) async fn commit_updates(
    updates: &[crate::db::SubTableUpdate],
 ) -> Result<u64> {
    db.ensure_schema_apply_not_locked("write commit").await?;
-    let current_branch = db.coordinator.current_branch().map(str::to_string);
+    let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
    let prepared = prepare_updates_for_commit(db, current_branch.as_deref(), updates).await?;
-    commit_prepared_updates(db, &prepared).await
+    commit_prepared_updates(db, &prepared, None).await
 }

 pub(super) async fn commit_manifest_updates(
-    db: &mut Omnigraph,
+    db: &Omnigraph,
    updates: &[crate::db::SubTableUpdate],
 ) -> Result<u64> {
-    db.coordinator.commit_manifest_updates(updates).await
+    db.coordinator.write().await.commit_manifest_updates(updates).await
 }

 pub(super) async fn record_merge_commit(
-    db: &mut Omnigraph,
+    db: &Omnigraph,
    manifest_version: u64,
    parent_commit_id: &str,
    merged_parent_commit_id: &str,
+    actor_id: Option<&str>,
 ) -> Result<String> {
-    let actor_id = db.current_audit_actor().map(str::to_string);
-    db.coordinator
+    db.coordinator.write().await
        .record_merge_commit(
            manifest_version,
            parent_commit_id,
            merged_parent_commit_id,
-            actor_id.as_deref(),
+            actor_id,
        )
        .await
        .map(|snapshot_id| snapshot_id.as_str().to_string())
@ -837,19 +904,26 @@ pub(super) async fn record_merge_commit(
 /// `expected_table_versions` map asserts the manifest's pre-write per-table
 /// versions; mismatches surface as `ManifestConflictDetails::ExpectedVersionMismatch`.
 pub(super) async fn commit_updates_on_branch_with_expected(
-    db: &mut Omnigraph,
+    db: &Omnigraph,
    branch: Option<&str>,
    updates: &[crate::db::SubTableUpdate],
    expected_table_versions: &std::collections::HashMap<String, u64>,
+    actor_id: Option<&str>,
 ) -> Result<u64> {
    db.ensure_schema_apply_not_locked("write commit").await?;
    let prepared = prepare_updates_for_commit(db, branch, updates).await?;
-    commit_prepared_updates_on_branch_with_expected(db, branch, &prepared, expected_table_versions)
-        .await
+    commit_prepared_updates_on_branch_with_expected(
+        db,
+        branch,
+        &prepared,
+        expected_table_versions,
+        actor_id,
+    )
+    .await
 }

-pub(super) async fn ensure_commit_graph_initialized(db: &mut Omnigraph) -> Result<()> {
-    db.coordinator.ensure_commit_graph_initialized().await
+pub(super) async fn ensure_commit_graph_initialized(db: &Omnigraph) -> Result<()> {
+    db.coordinator.write().await.ensure_commit_graph_initialized().await
 }

 pub(super) async fn invalidate_graph_index(db: &Omnigraph) {
--- a/crates/omnigraph/src/db/write_queue.rs
+++ b/crates/omnigraph/src/db/write_queue.rs
@ -0,0 +1,231 @@
+//! Per-`(table_key, branch)` writer queues — MR-686 scaffolding.
+//!
+//! Today every server-layer write serializes on the global
+//! `Arc<RwLock<Omnigraph>>` in `AppState`. MR-686 replaces that with
+//! per-`(table_key, branch_ref)` queues so disjoint-key writes proceed
+//! concurrently. This module owns the queue data structure; callers in
+//! `MutationStaging::commit_all`, `branch_merge`, `schema_apply`,
+//! `ensure_indices`, `delete_where`, and the future MR-870 recovery
+//! reconciler acquire guards before any per-table Lance commit.
+//!
+//! ## Why exclusive `tokio::sync::Mutex<()>` per key
+//!
+//! Lance's `Dataset::restore` "wins" against concurrent Append/Update/
+//! Delete/CreateIndex/Merge per `check_restore_txn`, silently orphaning
+//! the concurrent writer's commit. The queue's *only* application-layer
+//! job is to serialize Restore against every other writer on the same
+//! `(table_key, branch_ref)`. Lance OCC handles the rest of the conflict
+//! matrix (Append vs Append fully compatible, Update vs Update rebases or
+//! retries, etc.) but cannot make Restore symmetric — that's an upstream
+//! design choice. Until Lance fixes Restore (or BatchCommitTables
+//! changes the protocol), every writer takes the same exclusive lock.
+//!
+//! `RwLock` (shared for normal writes, exclusive for Restore) is the
+//! natural follow-up but adds a writer-classification surface that's
+//! easy to get wrong; misclassifying any writer reintroduces the
+//! orphaning hazard. We start with `Mutex` and revisit based on
+//! production telemetry.
+//!
+//! ## Sorted-order acquisition
+//!
+//! `acquire_many` accepts a slice of keys and acquires them in
+//! lexicographic order. Multi-table writers (mutation finalize,
+//! branch_merge, future recovery reconciler) MUST go through
+//! `acquire_many` so all callers agree on acquisition order — this is
+//! how lock-order inversion deadlock is prevented.
+
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
+
+use tokio::sync::{Mutex as AsyncMutex, OwnedMutexGuard};
+
+/// Queue key: `(table_key, branch_ref)`. `branch_ref = None` means main.
+///
+/// Branch is part of the key because the same Lance dataset can be
+/// pinned at different versions on different branches; concurrent
+/// writes to the same `table_key` on disjoint branches must NOT
+/// serialize at the queue.
+pub(crate) type TableQueueKey = (String, Option<String>);
+
+/// Per-`(table_key, branch)` writer queue manager.
+///
+/// Lives on `Omnigraph` as `Arc<WriteQueueManager>` so HTTP handlers,
+/// engine internals, the CLI binary, and future background reconcilers
+/// (MR-870 recovery, MR-848 index) all reach it via the engine handle.
+#[derive(Default)]
+pub(crate) struct WriteQueueManager {
+    /// Held only briefly per `acquire` call: clone out the per-key Arc,
+    /// release the std mutex, then await the per-key tokio Mutex.
+    queues: Mutex<HashMap<TableQueueKey, Arc<AsyncMutex<()>>>>,
+}
+
+impl WriteQueueManager {
+    pub(crate) fn new() -> Self {
+        Self::default()
+    }
+
+    /// Get-or-create the per-key queue and clone its Arc.
+    fn slot(&self, key: &TableQueueKey) -> Arc<AsyncMutex<()>> {
+        let mut map = self.queues.lock().expect("write queue map poisoned");
+        if let Some(existing) = map.get(key) {
+            return Arc::clone(existing);
+        }
+        let fresh = Arc::new(AsyncMutex::new(()));
+        map.insert(key.clone(), Arc::clone(&fresh));
+        fresh
+    }
+
+    /// Acquire exclusive access to the queue for one `(table_key, branch)`.
+    ///
+    /// Blocks until the lock is available. Drop the returned guard to
+    /// release; the lock outlives the `WriteQueueManager` borrow.
+    pub(crate) async fn acquire(&self, key: &TableQueueKey) -> OwnedMutexGuard<()> {
+        self.slot(key).lock_owned().await
+    }
+
+    /// Acquire exclusive access to many `(table_key, branch)` keys
+    /// atomically, in lex-sorted order. Used by multi-table writers
+    /// (mutation finalize, branch_merge, recovery) so all callers
+    /// agree on acquisition order — prevents lock-order inversion.
+    ///
+    /// Empty input returns an empty Vec without touching the map.
+    /// Duplicates in `keys` are deduped before acquisition (the same
+    /// key acquired twice would deadlock against itself).
+    pub(crate) async fn acquire_many(
+        &self,
+        keys: &[TableQueueKey],
+    ) -> Vec<OwnedMutexGuard<()>> {
+        if keys.is_empty() {
+            return Vec::new();
+        }
+        let mut sorted: Vec<TableQueueKey> = keys.to_vec();
+        sorted.sort();
+        sorted.dedup();
+        let mut guards = Vec::with_capacity(sorted.len());
+        for key in &sorted {
+            guards.push(self.acquire(key).await);
+        }
+        guards
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::time::{Duration, Instant};
+    use tokio::time::timeout;
+
+    fn key(table: &str, branch: Option<&str>) -> TableQueueKey {
+        (table.to_string(), branch.map(str::to_string))
+    }
+
+    #[tokio::test]
+    async fn acquire_many_empty_returns_empty() {
+        let qm = WriteQueueManager::new();
+        let guards = qm.acquire_many(&[]).await;
+        assert!(guards.is_empty());
+    }
+
+    #[tokio::test]
+    async fn acquire_many_dedupes_repeated_keys() {
+        // Same key passed twice would deadlock if not deduped.
+        let qm = WriteQueueManager::new();
+        let k = key("t1", None);
+        let guards = timeout(
+            Duration::from_secs(2),
+            qm.acquire_many(&[k.clone(), k.clone(), k]),
+        )
+        .await
+        .expect("acquire_many with duplicates deadlocked");
+        assert_eq!(guards.len(), 1);
+    }
+
+    #[tokio::test]
+    async fn acquire_many_sorts_keys_deterministically() {
+        // Two callers passing keys in different orders must acquire in
+        // the same internal order. We test this indirectly: caller A
+        // passes [a, c] and caller B passes [c, a]; if they both
+        // acquire in sorted order the second caller blocks on `a` first,
+        // not `c` — same as A — so no deadlock under any interleaving.
+        // Direct sort observation: call acquire_many with a reversed
+        // input and verify it doesn't deadlock against a held guard on
+        // the sorted-first key.
+        let qm = Arc::new(WriteQueueManager::new());
+        let a = key("a", None);
+        let z = key("z", None);
+
+        // Hold `a` exclusively.
+        let _held = qm.acquire(&a).await;
+
+        // acquire_many([z, a]) — must sort to [a, z] internally and
+        // block on `a`. With a 200ms timeout we should NOT see it
+        // complete (it's blocked on `a`).
+        let qm2 = Arc::clone(&qm);
+        let z_clone = z.clone();
+        let a_clone = a.clone();
+        let result = timeout(Duration::from_millis(200), async move {
+            qm2.acquire_many(&[z_clone, a_clone]).await
+        })
+        .await;
+        assert!(result.is_err(), "acquire_many should block on `a`, the lex-first key");
+    }
+
+    #[tokio::test]
+    async fn same_key_acquire_serializes() {
+        let qm = Arc::new(WriteQueueManager::new());
+        let k = key("t1", None);
+
+        let first = qm.acquire(&k).await;
+
+        // Second acquire on same key should NOT complete within 200ms.
+        let qm2 = Arc::clone(&qm);
+        let k2 = k.clone();
+        let blocked = timeout(Duration::from_millis(200), async move {
+            qm2.acquire(&k2).await
+        })
+        .await;
+        assert!(blocked.is_err(), "second acquire on same key must block");
+
+        // Drop the first guard, then second acquire should succeed.
+        drop(first);
+        let _second = timeout(Duration::from_secs(2), qm.acquire(&k))
+            .await
+            .expect("second acquire after release should not block");
+    }
+
+    #[tokio::test]
+    async fn disjoint_keys_acquire_concurrently() {
+        let qm = Arc::new(WriteQueueManager::new());
+        let a = key("a", None);
+        let b = key("b", None);
+
+        // Hold `a` indefinitely.
+        let _held_a = qm.acquire(&a).await;
+
+        // Acquire `b` on a different task. Should complete promptly
+        // because `b` is disjoint from `a`.
+        let qm2 = Arc::clone(&qm);
+        let start = Instant::now();
+        let _held_b = timeout(Duration::from_secs(2), qm2.acquire(&b))
+            .await
+            .expect("disjoint key acquire must not block on unrelated held key");
+        assert!(
+            start.elapsed() < Duration::from_millis(500),
+            "disjoint acquire took {:?}, should be near-instant",
+            start.elapsed()
+        );
+    }
+
+    #[tokio::test]
+    async fn disjoint_branches_on_same_table_do_not_serialize() {
+        // (table, main) and (table, feature) are different keys.
+        let qm = Arc::new(WriteQueueManager::new());
+        let main_k = key("t1", None);
+        let feature_k = key("t1", Some("feature"));
+
+        let _held_main = qm.acquire(&main_k).await;
+        let _held_feature = timeout(Duration::from_secs(2), qm.acquire(&feature_k))
+            .await
+            .expect("same-table-different-branch should not serialize");
+    }
+}
--- a/crates/omnigraph/src/exec/merge.rs
+++ b/crates/omnigraph/src/exec/merge.rs
@ -855,8 +855,9 @@ async fn publish_adopted_source_state(
        .ok_or_else(|| OmniError::manifest(format!("missing source entry for {}", table_key)))?;
    let target_entry = target_snapshot.entry(table_key);

+    let target_active = target_db.active_branch().await;
    match (
-        target_db.active_branch(),
+        target_active.as_deref(),
        source_entry.table_branch.as_deref(),
    ) {
        // Both on main — pointer switch is safe (same lineage, version columns valid)
@ -945,7 +946,13 @@ async fn publish_rewritten_merge_table(
    table_key: &str,
    staged: &StagedMergeResult,
 ) -> Result<crate::db::SubTableUpdate> {
-    let (ds, full_path, table_branch) = target_db.open_for_mutation(table_key).await?;
+    // Branch merge's source-rewrite path is Merge-shaped (upsert from
+    // source onto target). The inline `delete_where` later in this
+    // function operates on rows the rewrite chose to remove, not
+    // user-facing predicates, so Merge is the correct policy here.
+    let (ds, full_path, table_branch) = target_db
+        .open_for_mutation(table_key, crate::db::MutationOpKind::Merge)
+        .await?;
    let mut current_ds = ds;

    // Phase 1: merge_insert changed/new rows (preserves _row_created_at_version for
@ -1045,25 +1052,26 @@ async fn publish_rewritten_merge_table(
 }

 impl Omnigraph {
-    pub async fn branch_merge(&mut self, source: &str, target: &str) -> Result<MergeOutcome> {
+    pub async fn branch_merge(&self, source: &str, target: &str) -> Result<MergeOutcome> {
        self.branch_merge_as(source, target, None).await
    }

    pub async fn branch_merge_as(
-        &mut self,
+        &self,
        source: &str,
        target: &str,
        actor_id: Option<&str>,
    ) -> Result<MergeOutcome> {
        self.ensure_schema_apply_idle("branch_merge").await?;
-        let previous_actor = self.audit_actor_id.clone();
-        self.audit_actor_id = actor_id.map(str::to_string);
-        let result = self.branch_merge_impl(source, target).await;
-        self.audit_actor_id = previous_actor;
-        result
+        self.branch_merge_impl(source, target, actor_id).await
    }

-    async fn branch_merge_impl(&mut self, source: &str, target: &str) -> Result<MergeOutcome> {
+    async fn branch_merge_impl(
+        &self,
+        source: &str,
+        target: &str,
+        actor_id: Option<&str>,
+    ) -> Result<MergeOutcome> {
        if is_internal_run_branch(source) || is_internal_run_branch(target) {
            return Err(OmniError::manifest(format!(
                "branch_merge does not allow internal run refs ('{}' -> '{}')",
@ -1113,7 +1121,17 @@ impl Omnigraph {
            ))
            .await?
            .snapshot;
-        let previous_branch = self.active_branch().map(str::to_string);
+        // Hold the merge-exclusive mutex across the full swap → operate
+        // → restore window. Two concurrent branch_merge calls would
+        // otherwise interleave their three separate `coordinator.write()`
+        // acquisitions, leaving each merge's body running against the
+        // other's swapped coord. Pinned by
+        // `concurrent_branch_merges_distinct_targets_do_not_swap_into_each_other`
+        // in `crates/omnigraph-server/tests/server.rs`.
+        let merge_exclusive = self.merge_exclusive();
+        let _merge_guard = merge_exclusive.lock().await;
+
+        let previous_branch = self.active_branch().await;
        let previous = self
            .swap_coordinator_for_branch(target_branch.as_deref())
            .await?;
@ -1124,9 +1142,10 @@ impl Omnigraph {
                &target_head_commit_id,
                &source_head_commit_id,
                is_fast_forward,
+                actor_id,
            )
            .await;
-        self.restore_coordinator(previous);
+        self.restore_coordinator(previous).await;

        if merge_result.is_ok() && previous_branch == target_branch {
            self.refresh().await?;
@ -1136,15 +1155,16 @@ impl Omnigraph {
    }

    async fn branch_merge_on_current_target(
-        &mut self,
+        &self,
        base_snapshot: &Snapshot,
        source_snapshot: &Snapshot,
        target_head_commit_id: &str,
        source_head_commit_id: &str,
        is_fast_forward: bool,
+        actor_id: Option<&str>,
    ) -> Result<MergeOutcome> {
        self.ensure_commit_graph_initialized().await?;
-        let target_snapshot = self.snapshot();
+        let target_snapshot = self.snapshot().await;

        let mut table_keys = HashSet::new();
        for entry in base_snapshot.entries() {
@ -1180,7 +1200,7 @@ impl Omnigraph {

            if let Some(staged) = stage_streaming_table_merge(
                table_key,
-                self.catalog(),
+                &self.catalog(),
                base_snapshot,
                source_snapshot,
                &target_snapshot,
@ -1227,6 +1247,55 @@ impl Omnigraph {
        // requires pre-computing source deltas during candidate
        // classification (a structural change to `CandidateTableState`)
        // and is left as follow-up work.
+        // Acquire per-(table_key, target_branch) queues for every table
+        // touched by the merge plan. Sorted-order acquisition prevents
+        // lock-order inversion against concurrent multi-table writers.
+        // The active branch (set by the caller's `swap_coordinator_for_branch`)
+        // is the merge target; queue keys are scoped to it because a
+        // branch_merge writes only to the target branch.
+        //
+        // Held across the per-table publish loop and the manifest
+        // commit + record_merge_commit calls below. Under PR 1b's
+        // intermediate state (global server RwLock still in place),
+        // this acquisition is uncontended.
+        let active_branch_for_keys = self.active_branch().await;
+        let merge_queue_keys: Vec<(String, Option<String>)> = ordered_table_keys
+            .iter()
+            .filter(|table_key| {
+                matches!(
+                    candidates.get(*table_key),
+                    Some(CandidateTableState::RewriteMerged(_))
+                        | Some(CandidateTableState::AdoptSourceState)
+                )
+            })
+            .map(|table_key| (table_key.clone(), active_branch_for_keys.clone()))
+            .collect();
+        let _merge_queue_guards = self.write_queue().acquire_many(&merge_queue_keys).await;
+
+        let post_queue_snapshot = self.snapshot().await;
+        for table_key in &ordered_table_keys {
+            let Some(candidate) = candidates.get(table_key) else {
+                continue;
+            };
+            if !matches!(
+                candidate,
+                CandidateTableState::RewriteMerged(_) | CandidateTableState::AdoptSourceState
+            ) {
+                continue;
+            }
+            let expected = target_snapshot.entry(table_key).map(|e| e.table_version);
+            let current = post_queue_snapshot
+                .entry(table_key)
+                .map(|e| e.table_version);
+            if expected != current {
+                return Err(OmniError::manifest_expected_version_mismatch(
+                    table_key.clone(),
+                    expected.unwrap_or(0),
+                    current.unwrap_or(0),
+                ));
+            }
+        }
+
        let recovery_pins: Vec<crate::db::manifest::SidecarTablePin> = ordered_table_keys
            .iter()
            .filter_map(|table_key| {
@ -1252,7 +1321,7 @@ impl Omnigraph {
                    // the orphaned post-Phase-B HEAD on the target ref.
                    // Same rationale as table_ops.rs:115-120 in
                    // ensure_indices_for_branch.
-                    table_branch: self.active_branch().map(str::to_string),
+                    table_branch: active_branch_for_keys.clone(),
                })
            })
            .collect();
@ -1268,11 +1337,11 @@ impl Omnigraph {
            // `branch_merge` calls `swap_coordinator_for_branch(target_branch)`
            // before invoking this function, so `self.active_branch()`
            // is the target.
-            let target_branch = self.active_branch().map(str::to_string);
+            let target_branch = active_branch_for_keys.clone();
            let mut sidecar = crate::db::manifest::new_sidecar(
                crate::db::manifest::SidecarKind::BranchMerge,
                target_branch,
-                self.audit_actor_id.clone(),
+                actor_id.map(str::to_string),
                recovery_pins,
            );
            // Carry the source branch's HEAD commit id so the recovery
@ -1301,7 +1370,7 @@ impl Omnigraph {
                CandidateTableState::AdoptSourceState => {
                    publish_adopted_source_state(
                        self,
-                        self.catalog(),
+                        &self.catalog(),
                        base_snapshot,
                        source_snapshot,
                        &target_snapshot,
@ -1326,7 +1395,7 @@ impl Omnigraph {
        crate::failpoints::maybe_fail("branch_merge.post_phase_b_pre_manifest_commit")?;

        let manifest_version = if updates.is_empty() {
-            self.version()
+            self.version().await
        } else {
            self.commit_manifest_updates(&updates).await?
        };
@ -1349,6 +1418,7 @@ impl Omnigraph {
            manifest_version,
            target_head_commit_id,
            source_head_commit_id,
+            actor_id,
        )
        .await?;

--- a/crates/omnigraph/src/exec/mutation.rs
+++ b/crates/omnigraph/src/exec/mutation.rs
@ -345,8 +345,8 @@ async fn validate_edge_insert_endpoints(
    edge_name: &str,
    assignments: &HashMap<String, Literal>,
 ) -> Result<()> {
-    let edge_type = db
-        .catalog()
+    let catalog = db.catalog();
+    let edge_type = catalog
        .edge_types
        .get(edge_name)
        .ok_or_else(|| OmniError::manifest(format!("unknown edge type '{}'", edge_name)))?;
@ -600,6 +600,7 @@ async fn open_table_for_mutation(
    staging: &mut MutationStaging,
    branch: Option<&str>,
    table_key: &str,
+    op_kind: crate::db::MutationOpKind,
 ) -> Result<(Dataset, String, Option<String>)> {
    if let Some(prior) = staging.inline_committed.get(table_key) {
        let path = staging.paths.get(table_key).ok_or_else(|| {
@ -614,18 +615,21 @@ async fn open_table_for_mutation(
                &path.full_path,
                path.table_branch.as_deref(),
                prior.table_version,
+                op_kind,
            )
            .await?;
        return Ok((ds, path.full_path.clone(), path.table_branch.clone()));
    }
-    let (ds, full_path, table_branch) =
-        db.open_for_mutation_on_branch(branch, table_key).await?;
+    let (ds, full_path, table_branch) = db
+        .open_for_mutation_on_branch(branch, table_key, op_kind)
+        .await?;
    let expected_version = ds.version().version;
    staging.ensure_path(
        table_key,
        full_path.clone(),
        table_branch.clone(),
        expected_version,
+        op_kind,
    );
    Ok((ds, full_path, table_branch))
 }
@ -670,7 +674,7 @@ fn enforce_no_mixed_destructive_constructive(

 impl Omnigraph {
    pub async fn mutate(
-        &mut self,
+        &self,
        branch: &str,
        query_source: &str,
        query_name: &str,
@ -681,28 +685,24 @@ impl Omnigraph {
    }

    pub async fn mutate_as(
-        &mut self,
+        &self,
        branch: &str,
        query_source: &str,
        query_name: &str,
        params: &ParamMap,
        actor_id: Option<&str>,
    ) -> Result<MutationResult> {
-        let previous_actor = self.audit_actor_id.clone();
-        self.audit_actor_id = actor_id.map(str::to_string);
-        let result = self
-            .mutate_with_current_actor(branch, query_source, query_name, params)
-            .await;
-        self.audit_actor_id = previous_actor;
-        result
+        self.mutate_with_current_actor(branch, query_source, query_name, params, actor_id)
+            .await
    }

    async fn mutate_with_current_actor(
-        &mut self,
+        &self,
        branch: &str,
        query_source: &str,
        query_name: &str,
        params: &ParamMap,
+        actor_id: Option<&str>,
    ) -> Result<MutationResult> {
        self.ensure_schema_state_valid().await?;
        let requested = Self::normalize_branch_name(branch)?;
@ -737,11 +737,19 @@ impl Omnigraph {
            Err(e) => Err(e),
            Ok(total) if staging.is_empty() => Ok(total),
            Ok(total) => {
-                let (updates, expected_versions, sidecar_handle) = staging
-                    .finalize(
+                let staged = staging.stage_all(self, requested.as_deref()).await?;
+                // `_queue_guards` holds per-(table_key, branch) write
+                // queues acquired inside `commit_all`. Held across the
+                // manifest publish below so no concurrent writer can
+                // interleave between our commit_staged and our publish
+                // (which would correctly fail our CAS but leave Lance
+                // HEAD advanced — the residual class MR-870 recovers).
+                let (updates, expected_versions, sidecar_handle, _queue_guards) = staged
+                    .commit_all(
                        self,
                        requested.as_deref(),
                        crate::db::manifest::SidecarKind::Mutation,
+                        actor_id,
                    )
                    .await?;
                // Failpoint that wedges the documented finalize→publisher
@ -759,6 +767,7 @@ impl Omnigraph {
                    requested.as_deref(),
                    &updates,
                    &expected_versions,
+                    actor_id,
                )
                .await?;
                // Phase C succeeded — sidecar can be deleted. If this
@ -794,7 +803,7 @@ impl Omnigraph {
    }

    async fn execute_named_mutation(
-        &mut self,
+        &self,
        query_source: &str,
        query_name: &str,
        params: &ParamMap,
@ -804,7 +813,7 @@ impl Omnigraph {
        let query_decl = omnigraph_compiler::find_named_query(query_source, query_name)
            .map_err(|e| OmniError::manifest(e.to_string()))?;

-        let checked = typecheck_query_decl(self.catalog(), &query_decl)?;
+        let checked = typecheck_query_decl(&self.catalog(), &query_decl)?;
        match checked {
            CheckedQuery::Mutation(_) => {}
            CheckedQuery::Read(_) => {
@ -858,7 +867,7 @@ impl Omnigraph {
    }

    async fn execute_insert(
-        &mut self,
+        &self,
        type_name: &str,
        assignments: &[IRAssignment],
        params: &ParamMap,
@ -906,8 +915,13 @@ impl Omnigraph {
            let has_key = node_type.key_property().is_some();
            let table_key = format!("node:{}", type_name);
            // Capture pre-write metadata on first touch (no Lance write).
+            let insert_kind = if has_key {
+                crate::db::MutationOpKind::Merge
+            } else {
+                crate::db::MutationOpKind::Insert
+            };
            let (_ds, _full_path, _table_branch) =
-                open_table_for_mutation(self, staging, branch, &table_key).await?;
+                open_table_for_mutation(self, staging, branch, &table_key, insert_kind).await?;
            // Accumulate. @key inserts go into the Merge stream (so a
            // later update on the same id coalesces correctly); no-key
            // inserts go into the Append stream.
@ -941,8 +955,14 @@ impl Omnigraph {
            }
            let table_key = format!("edge:{}", type_name);
            // Capture pre-write metadata on first touch (no Lance write).
-            let (ds, _full_path, _table_branch) =
-                open_table_for_mutation(self, staging, branch, &table_key).await?;
+            let (ds, _full_path, _table_branch) = open_table_for_mutation(
+                self,
+                staging,
+                branch,
+                &table_key,
+                crate::db::MutationOpKind::Insert,
+            )
+            .await?;
            // Accumulate the new edge row. Edge IDs are ULID-generated so
            // Append mode is correct (no key-based dedup needed).
            staging.append_batch(&table_key, schema, PendingMode::Append, batch.clone())?;
@ -972,7 +992,7 @@ impl Omnigraph {
    }

    async fn execute_update(
-        &mut self,
+        &self,
        type_name: &str,
        assignments: &[IRAssignment],
        predicate: &IRMutationPredicate,
@ -1003,8 +1023,14 @@ impl Omnigraph {
        let blob_props = self.catalog().node_types[type_name].blob_properties.clone();

        let table_key = format!("node:{}", type_name);
-        let (ds, _full_path, _table_branch) =
-            open_table_for_mutation(self, staging, branch, &table_key).await?;
+        let (ds, _full_path, _table_branch) = open_table_for_mutation(
+            self,
+            staging,
+            branch,
+            &table_key,
+            crate::db::MutationOpKind::Update,
+        )
+        .await?;

        // Scan committed via Lance + apply the same predicate to pending
        // batches via DataFusion `MemTable` (read-your-writes for prior
@ -1097,7 +1123,7 @@ impl Omnigraph {
    }

    async fn execute_delete(
-        &mut self,
+        &self,
        type_name: &str,
        predicate: &IRMutationPredicate,
        params: &ParamMap,
@ -1115,7 +1141,7 @@ impl Omnigraph {
    }

    async fn execute_delete_node(
-        &mut self,
+        &self,
        type_name: &str,
        predicate: &IRMutationPredicate,
        params: &ParamMap,
@ -1125,8 +1151,14 @@ impl Omnigraph {
        let pred_sql = predicate_to_sql(predicate, params, false)?;

        let table_key = format!("node:{}", type_name);
-        let (ds, full_path, table_branch) =
-            open_table_for_mutation(self, staging, branch, &table_key).await?;
+        let (ds, full_path, table_branch) = open_table_for_mutation(
+            self,
+            staging,
+            branch,
+            &table_key,
+            crate::db::MutationOpKind::Delete,
+        )
+        .await?;
        let initial_version = ds.version().version;

        // Scan matching IDs for cascade. Per D₂ this never overlaps with
@ -1171,8 +1203,10 @@ impl Omnigraph {
                &full_path,
                table_branch.as_deref(),
                initial_version,
+                crate::db::MutationOpKind::Delete,
            )
            .await?;
+        crate::failpoints::maybe_fail("mutation.delete_node_pre_primary_delete")?;
        let delete_state = self
            .table_store()
            .delete_where(&full_path, &mut ds, &pred_sql)
@ -1214,8 +1248,14 @@ impl Omnigraph {

            let edge_table_key = format!("edge:{}", edge_name);
            let cascade_filter = cascade_filters.join(" OR ");
-            let (mut edge_ds, edge_full_path, edge_table_branch) =
-                open_table_for_mutation(self, staging, branch, &edge_table_key).await?;
+            let (mut edge_ds, edge_full_path, edge_table_branch) = open_table_for_mutation(
+                self,
+                staging,
+                branch,
+                &edge_table_key,
+                crate::db::MutationOpKind::Delete,
+            )
+            .await?;

            let edge_delete = self
                .table_store()
@ -1246,7 +1286,7 @@ impl Omnigraph {
    }

    async fn execute_delete_edge(
-        &mut self,
+        &self,
        type_name: &str,
        predicate: &IRMutationPredicate,
        params: &ParamMap,
@ -1256,8 +1296,14 @@ impl Omnigraph {
        let pred_sql = predicate_to_sql(predicate, params, true)?;

        let table_key = format!("edge:{}", type_name);
-        let (mut ds, full_path, table_branch) =
-            open_table_for_mutation(self, staging, branch, &table_key).await?;
+        let (mut ds, full_path, table_branch) = open_table_for_mutation(
+            self,
+            staging,
+            branch,
+            &table_key,
+            crate::db::MutationOpKind::Delete,
+        )
+        .await?;

        let delete_state = self
            .table_store()
--- a/crates/omnigraph/src/exec/query.rs
+++ b/crates/omnigraph/src/exec/query.rs
@ -13,11 +13,12 @@ impl Omnigraph {
    ) -> Result<QueryResult> {
        self.ensure_schema_state_valid().await?;
        let resolved = self.resolved_target(target).await?;
+        let catalog = self.catalog();

        let query_decl = omnigraph_compiler::find_named_query(query_source, query_name)
            .map_err(|e| OmniError::manifest(e.to_string()))?;
-        let type_ctx = typecheck_query(self.catalog(), &query_decl)?;
-        let ir = lower_query(self.catalog(), &query_decl, &type_ctx)?;
+        let type_ctx = typecheck_query(&catalog, &query_decl)?;
+        let ir = lower_query(&catalog, &query_decl, &type_ctx)?;

        let needs_graph = ir
            .pipeline
@ -34,7 +35,7 @@ impl Omnigraph {
            params,
            &resolved.snapshot,
            graph_index.as_deref(),
-            self.catalog(),
+            &catalog,
        )
        .await
    }
@ -52,19 +53,19 @@ impl Omnigraph {
    ) -> Result<QueryResult> {
        self.ensure_schema_state_valid().await?;
        let snapshot = self.snapshot_at_version(version).await?;
+        let catalog = self.catalog();

        let query_decl = omnigraph_compiler::find_named_query(query_source, query_name)
            .map_err(|e| OmniError::manifest(e.to_string()))?;
-        let type_ctx = typecheck_query(self.catalog(), &query_decl)?;
-        let ir = lower_query(self.catalog(), &query_decl, &type_ctx)?;
+        let type_ctx = typecheck_query(&catalog, &query_decl)?;
+        let ir = lower_query(&catalog, &query_decl, &type_ctx)?;

        let needs_graph = ir
            .pipeline
            .iter()
            .any(|op| matches!(op, IROp::Expand { .. } | IROp::AntiJoin { .. }));
        let graph_index = if needs_graph {
-            let edge_types = self
-                .catalog()
+            let edge_types = catalog
                .edge_types
                .iter()
                .map(|(name, et)| (name.clone(), (et.from_type.clone(), et.to_type.clone())))
@ -79,7 +80,7 @@ impl Omnigraph {
            params,
            &snapshot,
            graph_index.as_deref(),
-            self.catalog(),
+            &catalog,
        )
        .await
    }
--- a/crates/omnigraph/src/exec/staging.rs
+++ b/crates/omnigraph/src/exec/staging.rs
@ -26,7 +26,7 @@ use arrow_schema::SchemaRef;
 use lance::Dataset;
 use omnigraph_compiler::catalog::EdgeType;

-use crate::db::SubTableUpdate;
+use crate::db::{MutationOpKind, SubTableUpdate};
 use crate::db::manifest::{
    new_sidecar, write_sidecar, RecoverySidecarHandle, SidecarKind, SidecarTablePin,
 };
@ -94,18 +94,30 @@ pub(crate) struct MutationStaging {
    /// Inline-committed updates from delete-touching ops (D₂ guarantees no
    /// pending batches exist on a delete-touched table).
    pub(crate) inline_committed: HashMap<String, SubTableUpdate>,
+    /// Strictest [`MutationOpKind`] seen per table within this query. Drives
+    /// the op-kind-aware drift check in [`StagedMutation::commit_all`]: for
+    /// tables whose first or any subsequent touch was a strict op
+    /// (Update / Delete / SchemaRewrite), commit_all fails fast with 409
+    /// when the staged dataset version drifts from the fresh manifest pin
+    /// rather than letting Lance's `commit_staged` surface
+    /// `RetryableCommitConflict` as a 500. See
+    /// [`MutationOpKind::strict_pre_stage_version_check`].
+    pub(crate) op_kinds: HashMap<String, MutationOpKind>,
 }

 impl MutationStaging {
    /// Capture pre-write metadata on first touch of a table. Subsequent
-    /// touches are no-ops (paths and `expected_version` are stable for the
-    /// lifetime of one query).
+    /// touches preserve the original `paths` and `expected_versions`
+    /// entries; `op_kinds` upgrades to the strictest kind seen so far so
+    /// that mixed insert+update on the same table still fires the strict
+    /// drift check at commit time.
    pub(crate) fn ensure_path(
        &mut self,
        table_key: &str,
        full_path: String,
        table_branch: Option<String>,
        expected_version: u64,
+        op_kind: MutationOpKind,
    ) {
        self.paths.entry(table_key.to_string()).or_insert(StagedTablePath {
            full_path,
@ -114,6 +126,19 @@ impl MutationStaging {
        self.expected_versions
            .entry(table_key.to_string())
            .or_insert(expected_version);
+        self.op_kinds
+            .entry(table_key.to_string())
+            .and_modify(|existing| {
+                // Upgrade to the stricter kind if a later op needs it.
+                // Insert + later Update → Update wins; Update + later Insert
+                // keeps Update.
+                if op_kind.strict_pre_stage_version_check()
+                    && !existing.strict_pre_stage_version_check()
+                {
+                    *existing = op_kind;
+                }
+            })
+            .or_insert(op_kind);
    }

    /// Append a batch to the per-table accumulator.
@ -210,101 +235,65 @@ impl MutationStaging {
    }

    /// End-of-query: for each pending table, concat batches and commit via
-    /// `stage_append` or `stage_merge_insert` followed by `commit_staged`.
-    /// Merge with inline-committed entries. Return `(updates,
-    /// expected_versions)` for `commit_updates_on_branch_with_expected`.
+    /// **Phase A** of the two-phase commit: stage uncommitted fragments
+    /// for every table in `pending`. No Lance HEAD movement, no sidecar,
+    /// no manifest publish. Returns a [`StagedMutation`] carrying the
+    /// staged transactions so a future MR-686 queue acquisition step can
+    /// run between staging (slow S3 PUTs, no queue) and commit (fast,
+    /// under per-`(table_key, branch)` queue).
    ///
-    /// Sequential per-table — no cross-table dependency, but a parallel
-    /// version is a perf optimization for multi-table writes (loader with
-    /// many node + edge types). v1 ships sequential; the fan-out can land
-    /// in a follow-up.
-    pub(crate) async fn finalize(
+    /// Sequential per-table for now — parallelizing across independent
+    /// Lance datasets is a perf follow-up; same loop structure as the
+    /// pre-split `finalize`.
+    pub(crate) async fn stage_all(
        self,
        db: &crate::db::Omnigraph,
-        branch: Option<&str>,
-        sidecar_kind: SidecarKind,
-    ) -> Result<(
-        Vec<SubTableUpdate>,
-        HashMap<String, u64>,
-        Option<RecoverySidecarHandle>,
-    )> {
+        _branch: Option<&str>,
+    ) -> Result<StagedMutation> {
        let MutationStaging {
            expected_versions,
            paths,
            pending,
            inline_committed,
+            op_kinds,
        } = self;

-        let mut updates: Vec<SubTableUpdate> =
-            inline_committed.into_values().collect();
-
-        // Sidecar protocol: build the per-table pin list BEFORE any Lance
-        // commit_staged runs, then write the sidecar so a crash between
-        // Phase B (this loop's commit_staged calls) and Phase C (the
-        // manifest publish in the caller) is recoverable on next open.
-        // Skipped when `pending` is empty (delete-only mutation; the D₂
-        // parse-time rule keeps deletes out of this code path so this
-        // branch is reached only for the inline-committed-only case).
-        let pins: Vec<SidecarTablePin> = pending
-            .iter()
-            .map(|(table_key, _)| {
-                let path = paths.get(table_key).ok_or_else(|| {
-                    OmniError::manifest_internal(format!(
-                        "MutationStaging::finalize: missing path for table '{}'",
-                        table_key,
-                    ))
-                })?;
-                let expected = *expected_versions.get(table_key).ok_or_else(|| {
-                    OmniError::manifest_internal(format!(
-                        "MutationStaging::finalize: missing expected version for table '{}'",
-                        table_key,
-                    ))
-                })?;
-                Ok::<SidecarTablePin, OmniError>(SidecarTablePin {
-                    table_key: table_key.clone(),
-                    table_path: path.full_path.clone(),
-                    expected_version: expected,
-                    post_commit_pin: expected + 1,
-                    table_branch: path.table_branch.clone(),
-                })
-            })
-            .collect::<Result<Vec<_>>>()?;
-
-        let sidecar_handle = if pins.is_empty() {
-            None
-        } else {
-            let sidecar = new_sidecar(
-                sidecar_kind,
-                branch.map(|s| s.to_string()),
-                db.audit_actor_id.clone(),
-                pins,
-            );
-            Some(write_sidecar(db.root_uri(), db.storage_adapter(), &sidecar).await?)
-        };
-
+        let mut staged_entries: Vec<StagedTableEntry> = Vec::with_capacity(pending.len());
        for (table_key, table) in pending {
-            let path = paths.get(&table_key).ok_or_else(|| {
+            let path = paths.get(&table_key).cloned().ok_or_else(|| {
                OmniError::manifest_internal(format!(
-                    "MutationStaging::finalize: missing path for table '{}'",
+                    "MutationStaging::stage_all: missing path for table '{}'",
                    table_key
                ))
            })?;
            let expected = *expected_versions.get(&table_key).ok_or_else(|| {
                OmniError::manifest_internal(format!(
-                    "MutationStaging::finalize: missing expected version for table '{}'",
+                    "MutationStaging::stage_all: missing expected version for table '{}'",
                    table_key
                ))
            })?;

-            // Reopen at the pre-write version. Lance HEAD has not advanced
-            // since `ensure_path` captured it — no prior op committed to
-            // this dataset.
+            // Reopen the dataset for staging. The op_kind reflects the
+            // accumulated PendingTable's mode: Append-mode batches are
+            // INSERT-shaped (no key-based dedup at commit_staged); Merge-
+            // mode batches are MERGE-shaped (key-dedup at commit_staged).
+            // Both skip the strict pre-stage version check under the
+            // [`MutationOpKind`] policy: Lance's natural rebase + the
+            // per-(table, branch) queue + the publisher CAS in
+            // `commit_all` handle drift; the strict check would
+            // over-reject in-process concurrent inserts (PR 2 / MR-686
+            // Phase 2).
+            let stage_kind = match table.mode {
+                PendingMode::Append => crate::db::MutationOpKind::Insert,
+                PendingMode::Merge => crate::db::MutationOpKind::Merge,
+            };
            let ds = db
                .reopen_for_mutation(
                    &table_key,
                    &path.full_path,
                    path.table_branch.as_deref(),
                    expected,
+                    stage_kind,
                )
                .await?;

@ -335,8 +324,8 @@ impl MutationStaging {
                }
            };

-            // Commit via Lance's two-phase write: stage produces
-            // uncommitted fragments + transaction; commit advances HEAD.
+            // Stage produces uncommitted fragments + transaction. No
+            // Lance HEAD advance until `commit_all` runs `commit_staged`.
            let staged = match table.mode {
                PendingMode::Append => {
                    db.table_store().stage_append(&ds, combined, &[]).await?
@ -353,16 +342,327 @@ impl MutationStaging {
                        .await?
                }
            };
+            staged_entries.push(StagedTableEntry {
+                table_key,
+                path,
+                expected_version: expected,
+                dataset: ds,
+                staged_write: staged,
+            });
+        }
+
+        Ok(StagedMutation {
+            inline_committed,
+            staged: staged_entries,
+            expected_versions,
+            paths,
+            op_kinds,
+        })
+    }
+}
+
+/// Output of [`MutationStaging::stage_all`]. Carries the staged Lance
+/// transactions (Phase A complete; uncommitted fragments written) plus
+/// the per-table metadata needed to write the recovery sidecar, run
+/// `commit_staged` (Phase B), and produce the publisher's input.
+///
+/// Splitting `stage_all` and `commit_all` is the structural prerequisite
+/// for MR-686: a future commit can drop queue acquisition + manifest-pin
+/// revalidation between Phase A and Phase B without touching staging
+/// logic.
+pub(crate) struct StagedMutation {
+    /// Updates from delete-touching ops (D₂ parse-time rule keeps
+    /// pending and inline_committed disjoint per table). Tables here
+    /// have already advanced Lance HEAD via inline `delete_where`;
+    /// `commit_all` builds sidecar pins for these too so the
+    /// commit→publish residual is recoverable for delete-only paths
+    /// (third-agent Finding 3).
+    inline_committed: HashMap<String, SubTableUpdate>,
+    /// One entry per table that had pending batches successfully staged.
+    staged: Vec<StagedTableEntry>,
+    /// Pre-write manifest version per table — the publisher's CAS fence.
+    expected_versions: HashMap<String, u64>,
+    /// Per-table identifiers from `MutationStaging::paths`. Carried
+    /// through so `commit_all` can build sidecar pins for both staged
+    /// and inline-committed tables.
+    paths: HashMap<String, StagedTablePath>,
+    /// Strictest op_kind per touched table, propagated from
+    /// `MutationStaging::op_kinds` so `commit_all`'s drift check
+    /// fires only on read-modify-write tables.
+    op_kinds: HashMap<String, MutationOpKind>,
+}
+
+/// Per-table state captured during `stage_all` and consumed by
+/// `commit_all`. Holds the opened `Dataset` so `commit_staged` doesn't
+/// re-open, and the `StagedWrite` whose `transaction` `commit_staged`
+/// will execute.
+struct StagedTableEntry {
+    table_key: String,
+    path: StagedTablePath,
+    expected_version: u64,
+    dataset: lance::Dataset,
+    staged_write: crate::table_store::StagedWrite,
+}
+
+impl StagedMutation {
+    /// **Phase B** of the two-phase commit: acquire per-`(table_key,
+    /// branch)` queues, revalidate manifest pins, write the recovery
+    /// sidecar, run `commit_staged` per table to advance Lance HEAD, and
+    /// return the publisher's input plus the queue guards.
+    ///
+    /// **Caller must hold the returned `_guards` Vec across the
+    /// subsequent manifest publish.** Releasing guards before publish
+    /// would let another writer interleave their commit_staged between
+    /// ours and our publish — which would correctly fail our CAS but
+    /// leave Lance HEAD advanced (the residual class MR-870 recovers
+    /// from). Holding the guards across publish keeps the residual
+    /// unreachable for op-execution failures on the happy path.
+    ///
+    /// Revalidation: between `stage_all` and `commit_all`, another
+    /// writer (in the same process or another process sharing the
+    /// repo) may have committed to one of our touched tables, advancing
+    /// the manifest pin past our `expected_version`. We revalidate
+    /// under the queue and fail-fast with `manifest_conflict` before
+    /// any `commit_staged` so the orphaned uncommitted fragments stay
+    /// unreferenced (cleaned by `cleanup_old_versions`'s age sweep)
+    /// rather than being committed and creating a Lance-HEAD-ahead
+    /// residual.
+    pub(crate) async fn commit_all(
+        self,
+        db: &crate::db::Omnigraph,
+        branch: Option<&str>,
+        sidecar_kind: SidecarKind,
+        actor_id: Option<&str>,
+    ) -> Result<(
+        Vec<SubTableUpdate>,
+        HashMap<String, u64>,
+        Option<RecoverySidecarHandle>,
+        Vec<tokio::sync::OwnedMutexGuard<()>>,
+    )> {
+        let StagedMutation {
+            inline_committed,
+            mut staged,
+            mut expected_versions,
+            paths,
+            op_kinds,
+        } = self;
+
+        // Acquire per-(table_key, branch) queues for every touched
+        // table — both staged and inline-committed. Sorted by
+        // `acquire_many` internally so all multi-table writers
+        // (mutation, branch_merge, schema_apply, future MR-870
+        // recovery) agree on acquisition order — prevents lock-order
+        // inversion deadlock.
+        //
+        // For inline-committed tables (delete-only mutations), Lance
+        // HEAD has already advanced inside `delete_where` before
+        // `commit_all` runs. Holding the queue here doesn't prevent
+        // that interleaving (commit 6 will move queue acquisition into
+        // `delete_where`'s call site); it does prevent another writer
+        // from interleaving between our delete and our publish, which
+        // would otherwise leave a Lance-HEAD-ahead residual the
+        // delete-only sidecar (added below) would have to recover.
+        let mut queue_keys: Vec<(String, Option<String>)> = Vec::with_capacity(
+            staged.len() + inline_committed.len(),
+        );
+        for entry in &staged {
+            queue_keys.push((entry.table_key.clone(), entry.path.table_branch.clone()));
+        }
+        for table_key in inline_committed.keys() {
+            let path = paths.get(table_key).ok_or_else(|| {
+                OmniError::manifest_internal(format!(
+                    "StagedMutation::commit_all: missing path for inline-committed table '{}'",
+                    table_key
+                ))
+            })?;
+            queue_keys.push((table_key.clone(), path.table_branch.clone()));
+        }
+        let guards = db.write_queue().acquire_many(&queue_keys).await;
+
+        // Re-capture manifest pins under the queue (PR 2 / MR-686).
+        //
+        // expected_versions was captured when the mutation first opened
+        // each table for mutation (the query's read-time pin). For
+        // non-strict inserts / merge-style appends, a writer may advance
+        // the table before we acquire the queue and Lance can still
+        // safely rebase the write, so we refresh expected_versions to
+        // the queued manifest pin.
+        //
+        // Strict read-modify-write ops (update / delete /
+        // schema-rewrite) are different: the staged batch was computed
+        // against the read-time pin, even if stage_all later re-opened
+        // the dataset at HEAD. For those ops, compare read-time
+        // expected_version to the queued manifest pin and fail before
+        // any Lance HEAD movement if the target drifted. This can
+        // over-reject a single mutation that inserts, then upgrades to
+        // update, while another writer advances the table between the
+        // two touches; that is safe-by-default and keeps one invariant
+        // until `ensure_path` learns how to bump expected_version on
+        // op-kind upgrade.
+        //
+        // Why per-branch (and not the bound-branch `db.snapshot()`):
+        // when the caller mutates a branch other than the engine's
+        // bound branch (e.g., feature-branch ingest from a server
+        // handle bound to main), `db.snapshot()` returns the bound
+        // branch's view of each table — which is the wrong pin for
+        // the publisher's CAS on a different branch. Using
+        // `snapshot_for_branch(branch)` resolves the per-branch
+        // entries correctly. The cost is one fresh manifest read per
+        // mutation; PR 1b's regression came from this same read, but
+        // that read is now strictly necessary for cross-branch
+        // correctness. Single-table same-branch mutations could still
+        // skip this read (queue exclusivity makes the publisher CAS a
+        // no-op), but the conditional adds complexity for marginal
+        // gain — left as a follow-up perf optimization.
+        //
+        // Multi-coordinator deployments (§VI.27 aspirational) get
+        // genuine cross-process drift detection from this read for
+        // free.
+        let snapshot = db.snapshot_for_branch(branch).await?;
+        for entry in staged.iter_mut() {
+            let current = snapshot
+                .entry(&entry.table_key)
+                .map(|e| e.table_version)
+                .ok_or_else(|| {
+                    OmniError::manifest_conflict(format!(
+                        "table '{}' missing from manifest at commit time",
+                        entry.table_key,
+                    ))
+                })?;
+
+            // Insert / Merge tables skip this check: concurrent inserts on
+            // disjoint keys legitimately coexist via Lance's auto-rebase, so
+            // the check would over-reject the existing Phase 2 same-key
+            // insert path (`change_concurrent_inserts_same_key_serialize_without_409`).
+            let strict = op_kinds
+                .get(&entry.table_key)
+                .map(|k| k.strict_pre_stage_version_check())
+                .unwrap_or(false);
+            if strict && entry.expected_version != current {
+                return Err(OmniError::manifest_expected_version_mismatch(
+                    entry.table_key.clone(),
+                    entry.expected_version,
+                    current,
+                ));
+            }
+
+            entry.expected_version = current;
+            expected_versions.insert(entry.table_key.clone(), current);
+        }
+        // Sidecar protocol: build the per-table pin list and write the
+        // sidecar BEFORE any later error can return after Lance HEAD has
+        // already moved. For staged tables this still happens before any
+        // Lance commit_staged runs. For inline-committed delete tables,
+        // Lance HEAD moved inside delete_where before commit_all, so the
+        // sidecar must also exist before the inline manifest-version check
+        // below can reject a stale query.
+        //
+        // Pins cover BOTH staged tables (Lance HEAD will advance below
+        // when `commit_staged` runs) AND inline-committed tables
+        // (Lance HEAD already advanced inside `delete_where` — we still
+        // need a sidecar so that an upcoming publish failure is
+        // recoverable on next open). This closes the third-agent
+        // Finding 3 hazard: delete-only mutations would otherwise skip
+        // the sidecar, leaving any commit→publish residual unreachable
+        // by recovery.
+        let mut pins: Vec<SidecarTablePin> = Vec::with_capacity(
+            staged.len() + inline_committed.len(),
+        );
+        for entry in &staged {
+            pins.push(SidecarTablePin {
+                table_key: entry.table_key.clone(),
+                table_path: entry.path.full_path.clone(),
+                expected_version: entry.expected_version,
+                post_commit_pin: entry.expected_version + 1,
+                table_branch: entry.path.table_branch.clone(),
+            });
+        }
+        for (table_key, update) in &inline_committed {
+            let path = paths.get(table_key).ok_or_else(|| {
+                OmniError::manifest_internal(format!(
+                    "StagedMutation::commit_all: missing path for inline-committed table '{}'",
+                    table_key
+                ))
+            })?;
+            let expected = *expected_versions.get(table_key).ok_or_else(|| {
+                OmniError::manifest_internal(format!(
+                    "StagedMutation::commit_all: missing expected version for inline-committed table '{}'",
+                    table_key
+                ))
+            })?;
+            pins.push(SidecarTablePin {
+                table_key: table_key.clone(),
+                table_path: path.full_path.clone(),
+                expected_version: expected,
+                // For inline-committed tables, the post-commit pin is
+                // the actual post-delete version recorded by
+                // `record_inline`, NOT `expected + 1` — `delete_where`
+                // can advance HEAD by more than one version (e.g.,
+                // when Lance internally compacts deletion vectors).
+                post_commit_pin: update.table_version,
+                table_branch: path.table_branch.clone(),
+            });
+        }
+
+        let sidecar_handle = if pins.is_empty() {
+            None
+        } else {
+            let sidecar = new_sidecar(
+                sidecar_kind,
+                branch.map(|s| s.to_string()),
+                actor_id.map(str::to_string),
+                pins,
+            );
+            Some(write_sidecar(db.root_uri(), db.storage_adapter(), &sidecar).await?)
+        };
+
+        for (table_key, _update) in inline_committed.iter() {
+            let current = snapshot
+                .entry(table_key)
+                .map(|e| e.table_version)
+                .ok_or_else(|| {
+                    OmniError::manifest_conflict(format!(
+                        "table '{}' missing from manifest at commit time",
+                        table_key,
+                    ))
+                })?;
+            let expected = expected_versions.get(table_key).copied().ok_or_else(|| {
+                OmniError::manifest_internal(format!(
+                    "StagedMutation::commit_all: missing expected version for inline-committed table '{}'",
+                    table_key
+                ))
+            })?;
+            if expected != current {
+                return Err(OmniError::manifest_expected_version_mismatch(
+                    table_key.clone(),
+                    expected,
+                    current,
+                ));
+            }
+            expected_versions.insert(table_key.clone(), current);
+        }
+
+        let mut updates: Vec<SubTableUpdate> = inline_committed.into_values().collect();
+
+        for entry in staged {
+            let StagedTableEntry {
+                table_key,
+                path,
+                expected_version: _,
+                dataset,
+                staged_write,
+            } = entry;
+
            let new_ds = db
                .table_store()
-                .commit_staged(Arc::new(ds), staged.transaction)
+                .commit_staged(Arc::new(dataset), staged_write.transaction)
                .await?;
            let state = db
                .table_store()
                .table_state(&path.full_path, &new_ds)
                .await?;
            updates.push(SubTableUpdate {
-                table_key: table_key.clone(),
+                table_key,
                table_version: state.version,
                table_branch: path.table_branch.clone(),
                row_count: state.row_count,
@ -370,7 +670,7 @@ impl MutationStaging {
            });
        }

-        Ok((updates, expected_versions, sidecar_handle))
+        Ok((updates, expected_versions, sidecar_handle, guards))
    }
 }

--- a/crates/omnigraph/src/loader/mod.rs
+++ b/crates/omnigraph/src/loader/mod.rs
@ -59,21 +59,21 @@ pub enum LoadMode {

 /// Load JSONL data into an Omnigraph database.
 pub async fn load_jsonl(db: &mut Omnigraph, data: &str, mode: LoadMode) -> Result<LoadResult> {
-    let current_branch = db.active_branch().map(str::to_string);
+    let current_branch = db.active_branch().await;
    let branch = current_branch.as_deref().unwrap_or("main");
    db.load(branch, data, mode).await
 }

 /// Load JSONL data from a file path.
 pub async fn load_jsonl_file(db: &mut Omnigraph, path: &str, mode: LoadMode) -> Result<LoadResult> {
-    let current_branch = db.active_branch().map(str::to_string);
+    let current_branch = db.active_branch().await;
    let branch = current_branch.as_deref().unwrap_or("main");
    db.load_file(branch, path, mode).await
 }

 impl Omnigraph {
    pub async fn ingest(
-        &mut self,
+        &self,
        branch: &str,
        from: Option<&str>,
        data: &str,
@ -83,24 +83,19 @@ impl Omnigraph {
    }

    pub async fn ingest_as(
-        &mut self,
+        &self,
        branch: &str,
        from: Option<&str>,
        data: &str,
        mode: LoadMode,
        actor_id: Option<&str>,
    ) -> Result<IngestResult> {
-        let previous_actor = self.audit_actor_id.clone();
-        self.audit_actor_id = actor_id.map(str::to_string);
-        let result = self
-            .ingest_with_current_actor(branch, from, data, mode)
-            .await;
-        self.audit_actor_id = previous_actor;
-        result
+        self.ingest_with_current_actor(branch, from, data, mode, actor_id)
+            .await
    }

    pub async fn ingest_file(
-        &mut self,
+        &self,
        branch: &str,
        from: Option<&str>,
        path: &str,
@ -110,7 +105,7 @@ impl Omnigraph {
    }

    pub async fn ingest_file_as(
-        &mut self,
+        &self,
        branch: &str,
        from: Option<&str>,
        path: &str,
@ -122,11 +117,12 @@ impl Omnigraph {
    }

    async fn ingest_with_current_actor(
-        &mut self,
+        &self,
        branch: &str,
        from: Option<&str>,
        data: &str,
        mode: LoadMode,
+        actor_id: Option<&str>,
    ) -> Result<IngestResult> {
        self.ensure_schema_state_valid().await?;
        let target_branch =
@ -143,7 +139,7 @@ impl Omnigraph {
                .await?;
        }

-        let result = self.load(&target_branch, data, mode).await?;
+        let result = self.load_as(&target_branch, data, mode, actor_id).await?;
        Ok(IngestResult {
            branch: target_branch,
            base_branch,
@ -153,7 +149,17 @@ impl Omnigraph {
        })
    }

-    pub async fn load(&mut self, branch: &str, data: &str, mode: LoadMode) -> Result<LoadResult> {
+    pub async fn load(&self, branch: &str, data: &str, mode: LoadMode) -> Result<LoadResult> {
+        self.load_as(branch, data, mode, None).await
+    }
+
+    pub async fn load_as(
+        &self,
+        branch: &str,
+        data: &str,
+        mode: LoadMode,
+        actor_id: Option<&str>,
+    ) -> Result<LoadResult> {
        self.ensure_schema_state_valid().await?;
        // Reject internal `__run__*` / system-prefixed branches at the
        // public write boundary. Direct-publish paths assert this
@ -169,12 +175,12 @@ impl Omnigraph {
        // Direct-to-target writes: no Run state machine, no `__run__` staging
        // branch. Cross-table OCC is enforced by the publisher's
        // `expected_table_versions` CAS inside `load_jsonl_reader`.
-        self.load_direct_on_branch(requested.as_deref(), data, mode)
+        self.load_direct_on_branch(requested.as_deref(), data, mode, actor_id)
            .await
    }

    pub async fn load_file(
-        &mut self,
+        &self,
        branch: &str,
        path: &str,
        mode: LoadMode,
@ -184,13 +190,14 @@ impl Omnigraph {
    }

    async fn load_direct_on_branch(
-        &mut self,
+        &self,
        branch: Option<&str>,
        data: &str,
        mode: LoadMode,
+        actor_id: Option<&str>,
    ) -> Result<LoadResult> {
        let reader = BufReader::new(Cursor::new(data.as_bytes()));
-        load_jsonl_reader(self, branch, reader, mode).await
+        load_jsonl_reader(self, branch, reader, mode, actor_id).await
    }
 }

@ -228,10 +235,11 @@ impl LoadResult {
 }

 async fn load_jsonl_reader<R: BufRead>(
-    db: &mut Omnigraph,
+    db: &Omnigraph,
    branch: Option<&str>,
    reader: R,
    mode: LoadMode,
+    actor_id: Option<&str>,
 ) -> Result<LoadResult> {
    let catalog = db.catalog().clone();

@ -327,6 +335,16 @@ async fn load_jsonl_reader<R: BufRead>(
        LoadMode::Append => PendingMode::Append,
        LoadMode::Overwrite => PendingMode::Append, // unused
    };
+    // Map LoadMode to MutationOpKind for the version-check policy.
+    // Append/Merge skip the strict pre-stage check (concurrency-safe
+    // under the per-(table, branch) queue + publisher CAS); Overwrite
+    // uses the strict check because it truncates and replaces the
+    // dataset — concurrent advances change what "replace" means.
+    let load_op_kind = match mode {
+        LoadMode::Append => crate::db::MutationOpKind::Insert,
+        LoadMode::Merge => crate::db::MutationOpKind::Merge,
+        LoadMode::Overwrite => crate::db::MutationOpKind::SchemaRewrite,
+    };

    // Phase 2a: build and validate every node batch up front. Cheap and
    // synchronous — surfaces validation errors before any S3 traffic.
@ -357,7 +375,7 @@ async fn load_jsonl_reader<R: BufRead>(
    if use_staging {
        for (type_name, table_key, batch, loaded_count) in prepared_nodes {
            let (ds, full_path, table_branch) = db
-                .open_for_mutation_on_branch(branch, &table_key)
+                .open_for_mutation_on_branch(branch, &table_key, load_op_kind)
                .await?;
            let expected_version = ds.version().version;
            staging.ensure_path(
@ -365,6 +383,7 @@ async fn load_jsonl_reader<R: BufRead>(
                full_path,
                table_branch,
                expected_version,
+                load_op_kind,
            );
            let schema = batch.schema();
            staging.append_batch(&table_key, schema, pending_mode, batch)?;
@ -478,7 +497,7 @@ async fn load_jsonl_reader<R: BufRead>(
    if use_staging {
        for (edge_name, table_key, batch, loaded_count) in prepared_edges {
            let (ds, full_path, table_branch) = db
-                .open_for_mutation_on_branch(branch, &table_key)
+                .open_for_mutation_on_branch(branch, &table_key, load_op_kind)
                .await?;
            let expected_version = ds.version().version;
            staging.ensure_path(
@ -486,6 +505,7 @@ async fn load_jsonl_reader<R: BufRead>(
                full_path,
                table_branch,
                expected_version,
+                load_op_kind,
            );
            let schema = batch.schema();
            staging.append_batch(&table_key, schema, pending_mode, batch)?;
@ -537,15 +557,19 @@ async fn load_jsonl_reader<R: BufRead>(

    // Phase 4: Atomic manifest commit with publisher-level OCC.
    if use_staging {
-        let (updates, expected_versions, sidecar_handle) = staging
-            .finalize(db, branch, crate::db::manifest::SidecarKind::Load)
+        let staged = staging.stage_all(db, branch).await?;
+        // `_queue_guards` holds per-(table_key, branch) write queues
+        // across the manifest publish below — see exec/mutation.rs for
+        // the rationale (interleaving prevention).
+        let (updates, expected_versions, sidecar_handle, _queue_guards) = staged
+            .commit_all(db, branch, crate::db::manifest::SidecarKind::Load, actor_id)
            .await?;
        // Same finalize → publisher residual as mutations: per-table
        // staged commits have advanced Lance HEAD, but the manifest
        // publish has not run yet. Reuse the mutation failpoint name so
        // one failpoint pins the shared `MutationStaging` boundary.
        crate::failpoints::maybe_fail("mutation.post_finalize_pre_publisher")?;
-        db.commit_updates_on_branch_with_expected(branch, &updates, &expected_versions)
+        db.commit_updates_on_branch_with_expected(branch, &updates, &expected_versions, actor_id)
            .await?;
        // The recovery sidecar protects the per-table commit_staged →
        // manifest publish window. Phase C succeeded — clean up
@ -574,6 +598,7 @@ async fn load_jsonl_reader<R: BufRead>(
            branch,
            &overwrite_updates,
            &overwrite_expected,
+            actor_id,
        )
        .await?;
    }
@ -1151,8 +1176,14 @@ async fn write_batch_to_dataset(
    batch: RecordBatch,
    mode: LoadMode,
 ) -> Result<(crate::table_store::TableState, Option<String>)> {
-    let (mut ds, full_path, table_branch) =
-        db.open_for_mutation_on_branch(branch, table_key).await?;
+    let op_kind = match mode {
+        LoadMode::Append => crate::db::MutationOpKind::Insert,
+        LoadMode::Merge => crate::db::MutationOpKind::Merge,
+        LoadMode::Overwrite => crate::db::MutationOpKind::SchemaRewrite,
+    };
+    let (mut ds, full_path, table_branch) = db
+        .open_for_mutation_on_branch(branch, table_key, op_kind)
+        .await?;
    let table_store = db.table_store();

    match mode {
@ -1817,7 +1848,7 @@ edge WorksAt: Person -> Company
            .unwrap();

        // Read back via snapshot
-        let snap = db.snapshot();
+        let snap = db.snapshot().await;
        let person_ds = snap.open("node:Person").await.unwrap();

        assert_eq!(person_ds.count_rows(None).await.unwrap(), 2);
@ -1854,7 +1885,7 @@ edge WorksAt: Person -> Company
            .await
            .unwrap();

-        let snap = db.snapshot();
+        let snap = db.snapshot().await;
        let knows_ds = snap.open("edge:Knows").await.unwrap();

        let batches: Vec<RecordBatch> = knows_ds
@ -1889,13 +1920,13 @@ edge WorksAt: Person -> Company
        let dir = tempfile::tempdir().unwrap();
        let uri = dir.path().to_str().unwrap();
        let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
-        let v1 = db.version();
+        let v1 = db.version().await;

        load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite)
            .await
            .unwrap();

-        assert!(db.version() > v1);
+        assert!(db.version().await > v1);
    }

    #[tokio::test]
@ -1912,7 +1943,7 @@ edge WorksAt: Person -> Company
            .unwrap();
        load_jsonl(&mut db, batch2, LoadMode::Append).await.unwrap();

-        let snap = db.snapshot();
+        let snap = db.snapshot().await;
        let person_ds = snap.open("node:Person").await.unwrap();
        assert_eq!(person_ds.count_rows(None).await.unwrap(), 2);
    }
--- a/crates/omnigraph/tests/composite_flow.rs
+++ b/crates/omnigraph/tests/composite_flow.rs
@ -396,6 +396,137 @@ async fn composite_flow_canonical_lifecycle() {
    assert!(!final_total.batches().is_empty());
 }

+/// Cross-handle sequence that exercises operations after a schema_apply
+/// invalidates a peer handle's cached `_schema.pg`. The narrow load-bearing
+/// pin is that `Omnigraph::refresh()` must not deadlock when its
+/// `reload_schema_if_source_changed()` step needs to acquire a read on the
+/// coordinator's `RwLock`. The broader sequencing — schema_apply →
+/// branch_create → branch_delete → branch_merge → mutate (using the new
+/// schema's added property) → reopen — pins that the fix doesn't regress
+/// any of the related call sites.
+///
+/// Pre-fix bug class: `Omnigraph::refresh()` held
+/// `coordinator.write().await` from start to finish, including across the
+/// `self.reload_schema_if_source_changed()` call. That helper's
+/// `self.coordinator.read().await` (only reached when the on-disk schema
+/// source differs from the in-memory cache) deadlocks against the outer
+/// write guard because tokio's `RwLock` is not reentrant. Reachable from
+/// every public refresh-using API: `branch_delete` (`omnigraph.rs:910`),
+/// `branch_merge` (post-merge refresh on bound target), and any caller
+/// that calls `Omnigraph::refresh` directly.
+///
+/// The cross-handle setup is the realistic trigger: handle A applies a
+/// schema, advancing `_schema.pg` on disk; handle B has stale in-memory
+/// schema_source. B's next `refresh()` (via branch_delete here) hits the
+/// read-after-write reload path. Single-handle is unreachable because
+/// `apply_schema` updates the local ArcSwap cache in-line.
+///
+/// Post-fix invariant: `refresh()` scopes its write guard to the recovery
+/// section only, releasing it before `reload_schema_if_source_changed()`.
+/// The reload's read acquisition is uncontested.
+#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+async fn composite_flow_schema_apply_then_branch_ops_no_deadlock_in_refresh() {
+    let dir = tempfile::tempdir().unwrap();
+    let uri = dir.path().to_str().unwrap();
+
+    // Step 1: init + load on handle A.
+    let mut db_a = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
+    load_jsonl(&mut db_a, TEST_DATA, LoadMode::Append).await.unwrap();
+    assert_eq!(count_rows(&db_a, "node:Person").await, 4);
+
+    // Step 2: open handle B on the same repo. B's in-memory schema_source
+    // cache is now a snapshot of `_schema.pg` at open time.
+    let db_b = Omnigraph::open(uri).await.unwrap();
+
+    // Step 3: A applies a schema that adds a nullable property to Person.
+    // A's on-disk `_schema.pg` is rewritten; A's in-memory cache is updated
+    // in-line by `apply_schema`. B's in-memory cache is now STALE relative
+    // to disk.
+    const TEST_SCHEMA_V2: &str = "node Person {\n    name: String @key\n    age: I32?\n    nickname: String?\n}\n\nnode Company {\n    name: String @key\n}\n\nedge Knows: Person -> Person {\n    since: Date?\n}\n\nedge WorksAt: Person -> Company\n";
+    let plan = db_a.apply_schema(TEST_SCHEMA_V2).await.unwrap();
+    assert!(plan.applied, "apply_schema must succeed on a clean repo");
+    assert!(
+        !plan.steps.is_empty(),
+        "apply_schema must record the AddProperty step"
+    );
+
+    // Step 4: deadlock vector. B.branch_delete calls B.refresh() internally
+    // (omnigraph.rs:910). refresh() pre-fix holds the coord write guard
+    // across reload_schema_if_source_changed; with B's cache stale, that
+    // helper takes the not-early-return branch and tries
+    // self.coordinator.read().await — deadlocks against the outer write.
+    //
+    // Wrap in tokio::time::timeout so a deadlock surfaces as a clean test
+    // panic instead of a stuck CI job. 15s is well above natural completion
+    // on local FS (sub-second under normal conditions).
+    db_b.branch_create("post-schema-apply-test").await.unwrap();
+    let delete_result = tokio::time::timeout(
+        std::time::Duration::from_secs(15),
+        db_b.branch_delete("post-schema-apply-test"),
+    )
+    .await;
+    assert!(
+        delete_result.is_ok(),
+        "branch_delete deadlocked in refresh() with stale schema cache. \
+         Pre-fix symptom: Omnigraph::refresh() holds coordinator.write().await \
+         across reload_schema_if_source_changed(), which acquires \
+         coordinator.read().await on the same non-reentrant RwLock when the \
+         on-disk schema source differs from the in-memory cache.",
+    );
+    delete_result
+        .unwrap()
+        .expect("branch_delete must succeed once refresh() releases its write guard");
+
+    // Step 5: continuing operations on B post-refresh — verify the broader
+    // sequence works. B's catalog should now reflect the new schema (the
+    // refresh path includes reload_schema_if_source_changed which calls
+    // store_catalog).
+    db_b.branch_create("feature-after-apply").await.unwrap();
+
+    // Step 6: branch_merge from B exercises the post-merge refresh() path
+    // (merge.rs:1100-1107) — same deadlock surface as branch_delete,
+    // sanity-pinned by reusing the same handle whose cache was just
+    // refreshed.
+    let _outcome = tokio::time::timeout(
+        std::time::Duration::from_secs(15),
+        db_b.branch_merge("feature-after-apply", "main"),
+    )
+    .await
+    .expect("branch_merge deadlocked in refresh() post-schema-apply")
+    .expect("branch_merge must succeed");
+
+    // Step 7: mutation on main using the new schema's added property —
+    // verifies the catalog reload completed and the engine accepts a
+    // mutation referencing `nickname`.
+    const NICKNAME_QUERY: &str = "query set_nickname($name: String, $nickname: String) {\n    update Person set { nickname: $nickname } where name = $name\n}";
+    db_b.mutate_as(
+        "main",
+        NICKNAME_QUERY,
+        "set_nickname",
+        &mixed_params(&[("$name", "Alice"), ("$nickname", "Ali")], &[]),
+        None,
+    )
+    .await
+    .expect("update using post-apply schema property must succeed");
+
+    // Step 8: reopen — final integration check that the post-deadlock-fix
+    // state persists across handle drop/open.
+    drop(db_a);
+    drop(db_b);
+    let db_c = Omnigraph::open(uri).await.unwrap();
+    assert_eq!(
+        count_rows(&db_c, "node:Person").await,
+        4,
+        "Person count consistent across reopen post-schema-apply",
+    );
+    let branches = db_c.branch_list().await.unwrap();
+    assert!(
+        !branches.iter().any(|b| b == "post-schema-apply-test"),
+        "deleted branch must stay deleted across reopen; got {:?}",
+        branches,
+    );
+}
+
 /// Multi-branch sequential merges with main writes interleaved between
 /// every diverge point. Catches compositional regressions that single-
 /// merge tests can't see:
--- a/crates/omnigraph/tests/failpoints.rs
+++ b/crates/omnigraph/tests/failpoints.rs
@ -3,6 +3,7 @@
 mod helpers;

 use fail::FailScenario;
+use futures::FutureExt;
 use omnigraph::db::Omnigraph;
 use omnigraph::failpoints::ScopedFailPoint;

@ -25,37 +26,12 @@ fn node_table_uri(root: &str, type_name: &str) -> String {
    format!("{}/nodes/{hash:016x}", root.trim_end_matches('/'))
 }

-fn person_batch(rows: &[(&str, &str, Option<i32>)]) -> arrow_array::RecordBatch {
-    use std::sync::Arc;
-
-    use arrow_array::{Int32Array, StringArray};
-    use arrow_schema::{DataType, Field, Schema};
-
-    let schema = Arc::new(Schema::new(vec![
-        Field::new("id", DataType::Utf8, false),
-        Field::new("age", DataType::Int32, true),
-        Field::new("name", DataType::Utf8, false),
-    ]));
-    let ids: Vec<&str> = rows.iter().map(|(id, _, _)| *id).collect();
-    let names: Vec<&str> = rows.iter().map(|(_, name, _)| *name).collect();
-    let ages: Vec<Option<i32>> = rows.iter().map(|(_, _, age)| *age).collect();
-    arrow_array::RecordBatch::try_new(
-        schema,
-        vec![
-            Arc::new(StringArray::from(ids)),
-            Arc::new(Int32Array::from(ages)),
-            Arc::new(StringArray::from(names)),
-        ],
-    )
-    .unwrap()
-}
-
 #[tokio::test]
 async fn branch_create_failpoint_triggers() {
    let _scenario = FailScenario::setup();
    let dir = tempfile::tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
-    let mut db = Omnigraph::init(uri, helpers::TEST_SCHEMA).await.unwrap();
+    let db = Omnigraph::init(uri, helpers::TEST_SCHEMA).await.unwrap();
    let _failpoint = ScopedFailPoint::new("branch_create.after_manifest_branch_create", "return");

    let err = db.branch_create("feature").await.unwrap_err();
@ -65,7 +41,7 @@ async fn branch_create_failpoint_triggers() {
    );
 }

-#[tokio::test]
+#[tokio::test(flavor = "multi_thread")]
 async fn graph_publish_failpoint_triggers_before_commit_append() {
    let _scenario = FailScenario::setup();
    let dir = tempfile::tempdir().unwrap();
@ -100,7 +76,7 @@ async fn schema_apply_pre_commit_crash_rolls_forward_via_sidecar() {
    let uri = dir.path().to_str().unwrap().to_string();

    {
-        let mut db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
+        let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
        let _failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "return");
        let err = db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap_err();
        assert!(
@ -122,7 +98,7 @@ async fn schema_apply_pre_commit_crash_rolls_forward_via_sidecar() {
    // behind is closed.
    let db = Omnigraph::open(&uri).await.unwrap();
    assert_eq!(
-        db.schema_source(),
+        db.schema_source().as_str(),
        SCHEMA_V2_ADDED_TYPE,
        "live schema must reflect the rolled-forward apply (Company added)"
    );
@ -143,7 +119,7 @@ async fn schema_apply_recovers_post_commit_crash() {
    let uri = dir.path().to_str().unwrap().to_string();

    {
-        let mut db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
+        let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
        let _failpoint = ScopedFailPoint::new("schema_apply.after_manifest_commit", "return");
        let err = db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap_err();
        assert!(
@ -157,7 +133,7 @@ async fn schema_apply_recovers_post_commit_crash() {
    // Reopen — manifest is at the new version, so recovery sweep should
    // complete the rename and the live schema matches v2.
    let db = Omnigraph::open(&uri).await.unwrap();
-    assert_eq!(db.schema_source(), SCHEMA_V2_ADDED_TYPE);
+    assert_eq!(db.schema_source().as_str(), SCHEMA_V2_ADDED_TYPE);
    assert_no_staging_files(dir.path());
 }

@ -172,7 +148,7 @@ async fn schema_apply_recovers_partial_rename() {
    let uri = dir.path().to_str().unwrap().to_string();

    {
-        let mut db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
+        let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
        db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap();
    }

@ -192,7 +168,7 @@ async fn schema_apply_recovers_partial_rename() {
    // Reopen — recovery should complete the rename (overwriting final files
    // with identical staging content) and remove the staging files.
    let db = Omnigraph::open(&uri).await.unwrap();
-    assert_eq!(db.schema_source(), SCHEMA_V2_ADDED_TYPE);
+    assert_eq!(db.schema_source().as_str(), SCHEMA_V2_ADDED_TYPE);
    assert_no_staging_files(dir.path());
 }

@ -312,6 +288,85 @@ async fn recovery_rolls_forward_after_finalize_publisher_failure() {
    );
 }

+#[tokio::test]
+async fn inline_delete_conflict_writes_sidecar_before_rejecting() {
+    let _scenario = FailScenario::setup();
+    let dir = tempfile::tempdir().unwrap();
+    let uri = dir.path().to_str().unwrap().to_string();
+    let db = helpers::init_and_load(&dir).await;
+
+    let pre_snapshot = db
+        .snapshot_of(omnigraph::db::ReadTarget::branch("main"))
+        .await
+        .unwrap();
+    let pre_person_pin = pre_snapshot.entry("node:Person").unwrap().table_version;
+    let person_uri = node_table_uri(&uri, "Person");
+
+    {
+        let _pause_delete = ScopedFailPoint::new("mutation.delete_node_pre_primary_delete", "pause");
+        let delete_params = helpers::params(&[("$name", "Alice")]);
+        let delete = db.mutate(
+            "main",
+            MUTATION_QUERIES,
+            "remove_person",
+            &delete_params,
+        );
+        tokio::pin!(delete);
+
+        let mut concurrent_update_succeeded = false;
+        for _ in 0..50 {
+            if delete.as_mut().now_or_never().is_some() {
+                panic!("delete mutation completed before primary-delete failpoint was released");
+            }
+            let mut concurrent = Omnigraph::open_read_only(&uri).await.unwrap();
+            if mutate_main(
+                &mut concurrent,
+                MUTATION_QUERIES,
+                "set_age",
+                &mixed_params(&[("$name", "Bob")], &[("$age", 26)]),
+            )
+                .await
+                .is_ok()
+            {
+                concurrent_update_succeeded = true;
+                break;
+            }
+            tokio::time::sleep(std::time::Duration::from_millis(20)).await;
+        }
+        assert!(concurrent_update_succeeded, "concurrent update must land while delete is paused");
+        fail::remove("mutation.delete_node_pre_primary_delete");
+
+        let err = delete.await.unwrap_err();
+        assert!(
+            err.to_string().contains("stale view of 'node:Person'")
+                || err.to_string().contains("ExpectedVersionMismatch")
+                || err.to_string().contains("expected version mismatch"),
+            "unexpected error: {err}",
+        );
+    }
+
+    let person_head = lance::Dataset::open(&person_uri)
+        .await
+        .unwrap()
+        .version()
+        .version;
+    assert!(
+        person_head > pre_person_pin,
+        "primary inline delete must have advanced node:Person before rejecting"
+    );
+    let db = Omnigraph::open(&uri).await.unwrap();
+    assert_eq!(
+        helpers::count_rows(&db, "node:Person").await,
+        4,
+        "manifest-conflicted delete must not remove net Person rows after recovery"
+    );
+    assert_eq!(
+        helpers::count_rows(&db, "edge:Knows").await,
+        3,
+        "manifest-conflicted delete must not remove net Knows rows after recovery"
+    );
+}
+
 #[tokio::test]
 async fn recovery_rolls_forward_load_on_feature_branch() {
    use omnigraph::loader::LoadMode;
@ -324,7 +379,7 @@ async fn recovery_rolls_forward_load_on_feature_branch() {
    let feature_parent_commit_id;

    {
-        let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
+        let db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
        db.branch_create("feature").await.unwrap();
        db.mutate(
            "feature",
@ -929,7 +984,7 @@ async fn schema_apply_without_schema_staging_rolls_back_on_next_open() {
    };

    {
-        let mut db = Omnigraph::open(&uri).await.unwrap();
+        let db = Omnigraph::open(&uri).await.unwrap();
        let _failpoint = ScopedFailPoint::new("schema_apply.before_staging_write", "return");
        let v2_schema = r#"node Person {
    name: String @key
@ -1029,7 +1084,7 @@ async fn schema_apply_phase_b_failure_recovered_on_next_open() {
    // (Lance HEAD advanced) AND AFTER the schema-state staging files are
    // written, but BEFORE the manifest publish. The recovery sidecar persists.
    {
-        let mut db = Omnigraph::open(&uri).await.unwrap();
+        let db = Omnigraph::open(&uri).await.unwrap();
        let _failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "return");
        // v2 schema: add a `city` property to Person AND add a new
        // `Tag` node type. The new property triggers the rewritten_tables
@ -1191,7 +1246,7 @@ async fn branch_merge_phase_b_failure_recovered_on_next_open() {
    // Setup: failpoint fires after the per-table publish loop completes
    // but before commit_manifest_updates. Sidecar persists.
    {
-        let mut db = Omnigraph::open(&uri).await.unwrap();
+        let db = Omnigraph::open(&uri).await.unwrap();
        let _failpoint =
            ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return");
        let err = db.branch_merge("feature", "main").await.unwrap_err();
@ -1367,7 +1422,7 @@ async fn branch_merge_phase_b_failure_recovered_on_non_main_target() {
    // but before commit_manifest_updates. Sidecar persists with
    // branch=Some("target_branch").
    {
-        let mut db = Omnigraph::open(&uri).await.unwrap();
+        let db = Omnigraph::open(&uri).await.unwrap();
        let _failpoint =
            ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return");
        let err = db
@ -1468,7 +1523,7 @@ async fn branch_merge_sidecar_pins_table_branch_to_active_branch() {
    }

    {
-        let mut db = Omnigraph::open(&uri).await.unwrap();
+        let db = Omnigraph::open(&uri).await.unwrap();
        let _failpoint =
            ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return");
        let _ = db
@ -1559,7 +1614,7 @@ async fn ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_neede
    // that genuinely need work); no sidecar is written. The failpoint
    // still fires, surfacing the Err.
    {
-        let mut db = Omnigraph::open(&uri).await.unwrap();
+        let db = Omnigraph::open(&uri).await.unwrap();
        let _failpoint =
            ScopedFailPoint::new("ensure_indices.post_phase_b_pre_manifest_commit", "return");
        let err = db.ensure_indices().await.unwrap_err();
--- a/crates/omnigraph/tests/staged_writes.rs
+++ b/crates/omnigraph/tests/staged_writes.rs
@ -860,6 +860,13 @@ async fn lance_restore_appends_one_commit_with_checked_out_content() {
 /// tables before invoking restore — otherwise this hazard becomes
 /// reachable during in-flight tenant traffic.
 ///
+/// MR-686 introduces those per-(table_key, branch) writer queues as the
+/// application-layer mechanism that closes this hazard once continuous
+/// in-process recovery (MR-870) lands. Until MR-686's queue is wired into
+/// the recovery path, the open-time-only invocation strategy is the
+/// only thing keeping this hazard out of production. See
+/// `docs/invariants.md` §VI.30, §VI.32, §VI.33.
+///
 /// This test is the load-bearing constraint any future reconciler must
 /// honor.
 #[tokio::test]