Merge main into blob merge fix

Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
This commit is contained in:
Devin AI 2026-05-10 21:55:02 +00:00
commit da89e18e62
38 changed files with 4380 additions and 534 deletions

View file

@ -1,6 +1,6 @@
[package]
name = "omnigraph-cli"
version = "0.4.1"
version = "0.4.2"
edition = "2024"
description = "CLI for the Omnigraph graph database."
license = "MIT"
@ -13,9 +13,9 @@ name = "omnigraph"
path = "src/main.rs"
[dependencies]
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.4.1" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.1" }
omnigraph-server = { path = "../omnigraph-server", version = "0.4.1" }
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.4.2" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.2" }
omnigraph-server = { path = "../omnigraph-server", version = "0.4.2" }
clap = { workspace = true }
color-eyre = { workspace = true }
serde = { workspace = true }

View file

@ -1425,7 +1425,7 @@ async fn execute_query_lint(
let uri = resolve_local_uri(config, cli_uri, cli_target, "query lint")?;
let db = Omnigraph::open(&uri).await?;
Ok(lint_query_file(
db.catalog(),
&db.catalog(),
&query_source,
query_path,
QueryLintSchemaSource::repo(uri),

View file

@ -1,6 +1,6 @@
[package]
name = "omnigraph-compiler"
version = "0.4.1"
version = "0.4.2"
edition = "2024"
description = "Schema/query compiler for Omnigraph. Zero Lance dependency."
license = "MIT"

View file

@ -1,6 +1,6 @@
[package]
name = "omnigraph-server"
version = "0.4.1"
version = "0.4.2"
edition = "2024"
description = "HTTP server for the Omnigraph graph database."
license = "MIT"
@ -19,8 +19,8 @@ default = []
aws = ["dep:aws-config", "dep:aws-sdk-secretsmanager"]
[dependencies]
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.4.1" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.1" }
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.4.2" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.2" }
axum = { workspace = true }
clap = { workspace = true }
color-eyre = { workspace = true }
@ -37,6 +37,7 @@ futures = { workspace = true }
sha2 = { workspace = true }
subtle = { workspace = true }
async-trait = { workspace = true }
dashmap = "6"
aws-config = { version = "1", optional = true, default-features = false, features = ["rustls", "rt-tokio", "credentials-process", "sso"] }
aws-sdk-secretsmanager = { version = "1", optional = true, default-features = false, features = ["rustls", "rt-tokio"] }

View file

@ -0,0 +1,392 @@
//! Actor-isolation benchmark for MR-686's `WorkloadController`.
//!
//! The handoff calls this out as the empirical proof of MR-686's central
//! design promise: per-actor admission control isolates noisy actors so a
//! heavy `/ingest` user does not starve light `/change` traffic. The
//! per-`(table, branch)` queue pins the same-key serialization story; this
//! bench pins actor isolation under load.
//!
//! Setup:
//! - One "heavy" actor flooding `/ingest` with multi-row NDJSON bodies.
//! - N "light" actors each running short bursts of `/change` inserts.
//! - Each actor authenticates with its own bearer token so the
//! `WorkloadController` accounts them as distinct identities.
//!
//! Output: heavy-actor throughput / 429s, light-actor p50 / p95 / p99
//! latency. Acceptance heuristic on local FS: light-actor p99 < 2 s
//! while the heavy actor saturates its own per-actor cap.
//!
//! Usage:
//! ```sh
//! cargo run --release -p omnigraph-server --example bench_actor_isolation -- \
//! --light-actors 4 --light-ops-per-actor 50 \
//! --heavy-batches 200 --heavy-rows-per-batch 200 \
//! --inflight-cap 1 \
//! --output bench-results/after-pr2-phase2/actor-isolation.json
//! ```
use std::path::PathBuf;
use std::time::{Duration, Instant};
use axum::Router;
use axum::body::{Body, to_bytes};
use axum::http::{Method, Request, StatusCode};
use clap::Parser;
use omnigraph::db::Omnigraph;
use omnigraph_server::api::{ChangeRequest, IngestRequest};
use omnigraph_server::workload::WorkloadController;
use omnigraph_server::{AppState, build_app};
use serde::Serialize;
use tower::ServiceExt;
const SCHEMA: &str = "node Person {\n name: String @key\n age: I32?\n}\n";
const HEAVY_TOKEN: &str = "heavy-actor-token";
const HEAVY_ACTOR: &str = "act-heavy";
#[derive(Parser, Debug)]
#[command(about = "Actor-isolation HTTP bench for MR-686 WorkloadController")]
struct Args {
/// Number of light actors driving /change traffic concurrently with the
/// heavy /ingest flood. Each gets its own bearer token.
#[arg(long, default_value_t = 4)]
light_actors: usize,
/// Number of /change ops per light actor.
#[arg(long, default_value_t = 50)]
light_ops_per_actor: usize,
/// Number of /ingest batches the heavy actor sends.
#[arg(long, default_value_t = 200)]
heavy_batches: usize,
/// NDJSON rows per heavy /ingest batch.
#[arg(long, default_value_t = 200)]
heavy_rows_per_batch: usize,
/// Concurrent in-flight /ingest tasks the heavy actor maintains. With
/// `inflight_cap` smaller than this, the heavy actor exercises its own
/// admission cap (and the bench reports `heavy_too_many_requests > 0`),
/// proving the gate fires without affecting light actors. Default 4
/// against cap=1 → expect ~3/4 batches rejected.
#[arg(long, default_value_t = 4)]
heavy_concurrency: usize,
/// Per-actor in-flight cap for the run. Passed directly into the
/// `WorkloadController` constructor (no env-var fiddling). Lower
/// values surface admission rejections faster.
#[arg(long, default_value_t = 1)]
inflight_cap: u32,
/// Per-actor byte budget (bytes). Default 1 GiB so byte budget
/// doesn't bottleneck the count gate during normal bench runs.
#[arg(long, default_value_t = 1_073_741_824)]
byte_cap: u64,
/// Output file for the JSON results. Stdout always gets a copy.
#[arg(long)]
output: Option<PathBuf>,
/// Optional label to record alongside results.
#[arg(long, default_value = "")]
label: String,
}
#[derive(Serialize, Debug)]
struct BenchResults {
label: String,
inflight_cap: u32,
light_actors: usize,
light_ops_per_actor: usize,
heavy_batches: usize,
heavy_rows_per_batch: usize,
wall_time_ms: u64,
heavy_ok: usize,
heavy_too_many_requests: usize,
heavy_other_errors: usize,
heavy_throughput_attempts_per_sec: f64,
light_ok: usize,
light_too_many_requests: usize,
light_other_errors: usize,
light_p50_ms: f64,
light_p95_ms: f64,
light_p99_ms: f64,
light_p999_ms: f64,
light_max_ms: f64,
notes: &'static str,
}
fn build_heavy_body(batch_idx: usize, rows: usize) -> String {
let mut data = String::new();
for r in 0..rows {
data.push_str(&format!(
"{{\"type\":\"Person\",\"data\":{{\"name\":\"heavy-b{}-r{}\",\"age\":{}}}}}\n",
batch_idx,
r,
r % 100,
));
}
serde_json::to_string(&IngestRequest {
data,
branch: Some("main".to_string()),
from: Some("main".to_string()),
mode: Some(omnigraph::loader::LoadMode::Merge),
})
.unwrap()
}
async fn send_heavy_batch(app: Router, batch_idx: usize, rows: usize) -> StatusCode {
let body = build_heavy_body(batch_idx, rows);
let req = Request::builder()
.method(Method::POST)
.uri("/ingest")
.header("authorization", format!("Bearer {HEAVY_TOKEN}"))
.header("content-type", "application/json")
.body(Body::from(body))
.unwrap();
match app.oneshot(req).await {
Ok(r) => r.status(),
Err(_) => StatusCode::INTERNAL_SERVER_ERROR,
}
}
/// Drive `batches` /ingest calls from the heavy actor with up to
/// `concurrency` in flight at a time. With `concurrency > inflight_cap`,
/// the heavy actor's own admission permits are exhausted at peak, and
/// some batches return 429. Returns (ok, 429, other) counts.
async fn drive_heavy_actor(
app: Router,
batches: usize,
rows_per_batch: usize,
concurrency: usize,
) -> (usize, usize, usize) {
use tokio::sync::Semaphore;
// Asserted at startup in `main()`; check again here for defense in
// depth so a future caller can't pass 0 silently.
assert!(concurrency > 0, "drive_heavy_actor concurrency must be > 0");
let limiter = Arc::new(Semaphore::new(concurrency));
let mut handles = Vec::with_capacity(batches);
for b in 0..batches {
let app = app.clone();
let limiter = Arc::clone(&limiter);
handles.push(tokio::spawn(async move {
// Bound concurrency to `concurrency`; this is the bench's
// own pacing, not the server's admission control. The
// server's `WorkloadController` is what we're trying to
// exercise — and it has its own cap (potentially smaller).
let _permit = limiter.acquire_owned().await.unwrap();
send_heavy_batch(app, b, rows_per_batch).await
}));
}
let mut ok = 0usize;
let mut too_many = 0usize;
let mut other = 0usize;
for h in handles {
match h.await.unwrap_or(StatusCode::INTERNAL_SERVER_ERROR) {
StatusCode::OK => ok += 1,
StatusCode::TOO_MANY_REQUESTS => too_many += 1,
_ => other += 1,
}
}
(ok, too_many, other)
}
use std::sync::Arc;
async fn drive_light_actor(
app: Router,
token: String,
actor_idx: usize,
ops: usize,
) -> (Vec<Duration>, usize, usize, usize) {
let mut latencies = Vec::with_capacity(ops);
let mut ok = 0usize;
let mut too_many = 0usize;
let mut other = 0usize;
for op_idx in 0..ops {
let request_body = ChangeRequest {
query_source: "query insert_person($name: String, $age: I32) {\n insert Person { name: $name, age: $age }\n}".to_string(),
query_name: Some("insert_person".to_string()),
params: Some(serde_json::json!({
"name": format!("light-{actor_idx}-{op_idx}"),
"age": op_idx as i32,
})),
branch: Some("main".to_string()),
};
let body = serde_json::to_vec(&request_body).unwrap();
let req = Request::builder()
.method(Method::POST)
.uri("/change")
.header("authorization", format!("Bearer {token}"))
.header("content-type", "application/json")
.body(Body::from(body))
.unwrap();
let start = Instant::now();
let response = match app.clone().oneshot(req).await {
Ok(r) => r,
Err(_) => {
other += 1;
continue;
}
};
let elapsed = start.elapsed();
match response.status() {
StatusCode::OK => {
ok += 1;
latencies.push(elapsed);
}
StatusCode::TOO_MANY_REQUESTS => {
too_many += 1;
// Drain to free the body resource.
let _ = to_bytes(response.into_body(), 16 * 1024).await;
}
_ => {
other += 1;
let _ = to_bytes(response.into_body(), 16 * 1024).await;
}
}
}
(latencies, ok, too_many, other)
}
#[tokio::main]
async fn main() {
let args = Args::parse();
if args.light_actors == 0 || args.light_ops_per_actor == 0 || args.heavy_batches == 0 {
eprintln!("--light-actors, --light-ops-per-actor, --heavy-batches must all be > 0");
std::process::exit(2);
}
if args.heavy_concurrency == 0 {
eprintln!(
"--heavy-concurrency must be > 0 (zero would prevent the heavy actor from \
ever firing a batch; if you want to disable heavy traffic, set --heavy-batches=0)"
);
std::process::exit(2);
}
let temp = tempfile::tempdir().expect("tempdir");
let repo = temp.path().join("bench.omni");
Omnigraph::init(repo.to_str().unwrap(), SCHEMA)
.await
.expect("init repo");
// Build bearer tokens: one for the heavy actor + one per light actor.
let mut tokens: Vec<(String, String)> =
vec![(HEAVY_ACTOR.to_string(), HEAVY_TOKEN.to_string())];
for i in 0..args.light_actors {
tokens.push((format!("act-light-{i}"), format!("light-token-{i}")));
}
let db = Omnigraph::open(repo.to_str().unwrap())
.await
.expect("open repo");
// Construct a custom WorkloadController with the requested caps and
// pass it through `AppState::new_with_workload`. Avoids the
// `unsafe { std::env::set_var(...) }` antipattern that violates
// `setenv`'s thread-safety precondition once the multi-thread tokio
// runtime is up.
let workload = WorkloadController::new(args.inflight_cap, args.byte_cap);
let state = AppState::new_with_workload(
repo.to_string_lossy().to_string(),
db,
tokens,
workload,
);
let app = build_app(state);
eprintln!(
"running heavy={}x{} (concurrency={}) light={}x{} cap={}",
args.heavy_batches,
args.heavy_rows_per_batch,
args.heavy_concurrency,
args.light_actors,
args.light_ops_per_actor,
args.inflight_cap,
);
let start = Instant::now();
let heavy_app = app.clone();
let heavy_concurrency = args.heavy_concurrency;
let heavy_handle = tokio::spawn(async move {
drive_heavy_actor(
heavy_app,
args.heavy_batches,
args.heavy_rows_per_batch,
heavy_concurrency,
)
.await
});
let mut light_handles = Vec::with_capacity(args.light_actors);
for actor_idx in 0..args.light_actors {
let app = app.clone();
let token = format!("light-token-{actor_idx}");
let ops = args.light_ops_per_actor;
light_handles.push(tokio::spawn(async move {
drive_light_actor(app, token, actor_idx, ops).await
}));
}
let (heavy_ok, heavy_too_many, heavy_other) = heavy_handle.await.expect("heavy task panicked");
let mut light_latencies: Vec<Duration> =
Vec::with_capacity(args.light_actors * args.light_ops_per_actor);
let mut light_ok = 0usize;
let mut light_too_many = 0usize;
let mut light_other = 0usize;
for h in light_handles {
let (lats, ok, too_many, other) = h.await.expect("light task panicked");
light_latencies.extend(lats);
light_ok += ok;
light_too_many += too_many;
light_other += other;
}
let wall = start.elapsed();
light_latencies.sort();
let n = light_latencies.len();
let pct = |p: f64| -> f64 {
if n == 0 {
return 0.0;
}
let idx = ((n as f64 - 1.0) * p).round() as usize;
light_latencies[idx].as_secs_f64() * 1000.0
};
let max_ms = light_latencies
.last()
.map(|d| d.as_secs_f64() * 1000.0)
.unwrap_or(0.0);
let heavy_throughput = if wall.as_secs_f64() > 0.0 {
args.heavy_batches as f64 / wall.as_secs_f64()
} else {
0.0
};
let results = BenchResults {
label: args.label.clone(),
inflight_cap: args.inflight_cap,
light_actors: args.light_actors,
light_ops_per_actor: args.light_ops_per_actor,
heavy_batches: args.heavy_batches,
heavy_rows_per_batch: args.heavy_rows_per_batch,
wall_time_ms: wall.as_millis() as u64,
heavy_ok,
heavy_too_many_requests: heavy_too_many,
heavy_other_errors: heavy_other,
heavy_throughput_attempts_per_sec: heavy_throughput,
light_ok,
light_too_many_requests: light_too_many,
light_other_errors: light_other,
light_p50_ms: pct(0.50),
light_p95_ms: pct(0.95),
light_p99_ms: pct(0.99),
light_p999_ms: pct(0.999),
light_max_ms: max_ms,
notes: "MR-686 actor-isolation bench. Heavy /ingest + light /change concurrent.",
};
let json = serde_json::to_string_pretty(&results).unwrap();
println!("{json}");
if let Some(path) = args.output.as_ref() {
if let Some(parent) = path.parent()
&& !parent.as_os_str().is_empty()
{
std::fs::create_dir_all(parent).expect("mkdir output parent");
}
std::fs::write(path, &json).expect("write output");
eprintln!("wrote {}", path.display());
}
}

View file

@ -0,0 +1,267 @@
//! Server-level concurrent HTTP benchmark for MR-686 (PR 0 baseline).
//!
//! Drives concurrent `/change` requests against an in-process Omnigraph HTTP
//! server. Measures the global `Arc<RwLock<Omnigraph>>` lock penalty on
//! current `main` so PR 1 + PR 2 can be evaluated against a real baseline.
//!
//! Per the MR-686 plan: this is the load-bearing bench. `Omnigraph::mutate_as`
//! is `&mut self`, so an engine-level concurrent bench either serializes on the
//! borrow checker (measures nothing) or drives multiple handles (measures Lance
//! contention, not the server bottleneck). Driving the HTTP server is the only
//! way to measure the actual `RwLock<Omnigraph>` contention this work removes.
//!
//! Usage:
//! ```sh
//! cargo run --release -p omnigraph-server --example bench_concurrent_http -- \
//! --tables 16 --actors 16 --ops-per-actor 1000 --mode disjoint \
//! --output bench-results/baseline-main/cross-table.json
//! ```
//!
//! Modes:
//! - `disjoint`: each actor writes to a distinct node type (cross-table fanout)
//! - `same-key`: all actors write to the same node type (hot-key contention)
//! - `mixed`: each actor writes to a different table per op (round-robin)
use std::path::PathBuf;
use std::time::{Duration, Instant};
use axum::Router;
use axum::body::{Body, to_bytes};
use axum::http::{Method, Request, StatusCode};
use clap::{Parser, ValueEnum};
use omnigraph::db::Omnigraph;
use omnigraph_server::api::ChangeRequest;
use omnigraph_server::{AppState, build_app};
use serde::Serialize;
use tower::ServiceExt;
#[derive(Parser, Debug)]
#[command(about = "Concurrent HTTP bench for MR-686")]
struct Args {
/// Number of distinct node types in the schema.
#[arg(long, default_value_t = 16)]
tables: usize,
/// Number of concurrent actors driving requests.
#[arg(long, default_value_t = 16)]
actors: usize,
/// Operations per actor.
#[arg(long, default_value_t = 100)]
ops_per_actor: usize,
/// Workload mode.
#[arg(long, value_enum, default_value_t = Mode::Disjoint)]
mode: Mode,
/// Output file for the JSON results. Stdout always gets a copy.
#[arg(long)]
output: Option<PathBuf>,
/// Optional label to record alongside results (e.g. "baseline-main").
#[arg(long, default_value = "")]
label: String,
}
#[derive(Clone, Copy, Debug, ValueEnum, Serialize)]
#[serde(rename_all = "kebab-case")]
enum Mode {
Disjoint,
SameKey,
Mixed,
}
#[derive(Serialize, Debug)]
struct BenchResults {
label: String,
mode: Mode,
tables: usize,
actors: usize,
ops_per_actor: usize,
total_ops: usize,
error_count: usize,
wall_time_ms: u64,
throughput_ops_per_sec: f64,
p50_ms: f64,
p95_ms: f64,
p99_ms: f64,
p999_ms: f64,
max_ms: f64,
notes: &'static str,
}
fn build_schema(num_tables: usize) -> String {
let mut schema = String::new();
for i in 0..num_tables {
schema.push_str(&format!(
"node Item{i} {{\n name: String @key\n value: I32?\n}}\n\n"
));
}
schema
}
fn build_query_source(table_idx: usize) -> String {
format!(
"query insert_item($name: String, $value: I32) {{\n insert Item{table_idx} {{ name: $name, value: $value }}\n}}"
)
}
fn pick_table(actor_idx: usize, op_idx: usize, mode: Mode, num_tables: usize) -> usize {
match mode {
Mode::Disjoint => actor_idx % num_tables,
Mode::SameKey => 0,
Mode::Mixed => (actor_idx.wrapping_mul(7919) ^ op_idx) % num_tables,
}
}
async fn drive_actor(
app: Router,
actor_idx: usize,
ops: usize,
mode: Mode,
num_tables: usize,
) -> (Vec<Duration>, usize) {
let mut latencies = Vec::with_capacity(ops);
let mut errors = 0usize;
for op_idx in 0..ops {
let table_idx = pick_table(actor_idx, op_idx, mode, num_tables);
let request_body = ChangeRequest {
query_source: build_query_source(table_idx),
query_name: Some("insert_item".to_string()),
params: Some(serde_json::json!({
"name": format!("a{actor_idx}_o{op_idx}"),
"value": op_idx as i32,
})),
branch: None,
};
let body = serde_json::to_vec(&request_body).unwrap();
let req = Request::builder()
.method(Method::POST)
.uri("/change")
.header("content-type", "application/json")
.body(Body::from(body))
.unwrap();
let start = Instant::now();
let response = match app.clone().oneshot(req).await {
Ok(r) => r,
Err(e) => {
eprintln!("actor {actor_idx} op {op_idx} transport error: {e:?}");
errors += 1;
continue;
}
};
let elapsed = start.elapsed();
let status = response.status();
if status != StatusCode::OK {
errors += 1;
// Drain body for logging on the first few failures.
if errors <= 3 {
let body = to_bytes(response.into_body(), 64 * 1024).await.unwrap_or_default();
eprintln!(
"actor {actor_idx} op {op_idx} status {status} body {}",
String::from_utf8_lossy(&body)
);
}
}
latencies.push(elapsed);
}
(latencies, errors)
}
#[tokio::main]
async fn main() {
let args = Args::parse();
if args.tables == 0 || args.actors == 0 || args.ops_per_actor == 0 {
eprintln!("--tables, --actors, --ops-per-actor must all be > 0");
std::process::exit(2);
}
let temp = tempfile::tempdir().expect("tempdir");
let repo = temp.path().join("bench.omni");
let schema = build_schema(args.tables);
Omnigraph::init(repo.to_str().unwrap(), &schema)
.await
.expect("init repo");
let state = AppState::open(repo.to_string_lossy().to_string())
.await
.expect("open AppState");
let app = build_app(state);
eprintln!(
"running mode={:?} tables={} actors={} ops_per_actor={}",
args.mode, args.tables, args.actors, args.ops_per_actor
);
let start = Instant::now();
let mut handles = Vec::with_capacity(args.actors);
for actor_idx in 0..args.actors {
let app = app.clone();
let mode = args.mode;
let ops = args.ops_per_actor;
let num_tables = args.tables;
handles.push(tokio::spawn(async move {
drive_actor(app, actor_idx, ops, mode, num_tables).await
}));
}
let mut all_latencies: Vec<Duration> = Vec::with_capacity(args.actors * args.ops_per_actor);
let mut total_errors = 0usize;
for h in handles {
let (lats, errs) = h.await.expect("actor task panicked");
all_latencies.extend(lats);
total_errors += errs;
}
let wall = start.elapsed();
all_latencies.sort();
let n = all_latencies.len();
let pct = |p: f64| -> f64 {
if n == 0 {
return 0.0;
}
let idx = ((n as f64 - 1.0) * p).round() as usize;
all_latencies[idx].as_secs_f64() * 1000.0
};
let max_ms = all_latencies
.last()
.map(|d| d.as_secs_f64() * 1000.0)
.unwrap_or(0.0);
let throughput = if wall.as_secs_f64() > 0.0 {
n as f64 / wall.as_secs_f64()
} else {
0.0
};
let results = BenchResults {
label: args.label.clone(),
mode: args.mode,
tables: args.tables,
actors: args.actors,
ops_per_actor: args.ops_per_actor,
total_ops: n,
error_count: total_errors,
wall_time_ms: wall.as_millis() as u64,
throughput_ops_per_sec: throughput,
p50_ms: pct(0.50),
p95_ms: pct(0.95),
p99_ms: pct(0.99),
p999_ms: pct(0.999),
max_ms,
notes: "MR-686 PR 0 baseline. Drives /change via Tower oneshot.",
};
let json = serde_json::to_string_pretty(&results).unwrap();
println!("{json}");
if let Some(path) = args.output.as_ref() {
if let Some(parent) = path.parent()
&& !parent.as_os_str().is_empty()
{
std::fs::create_dir_all(parent).expect("mkdir output parent");
}
std::fs::write(path, &json).expect("write output");
eprintln!("wrote {}", path.display());
}
if total_errors > 0 {
eprintln!("WARN: {total_errors} requests failed");
std::process::exit(1);
}
}

View file

@ -339,6 +339,9 @@ pub enum ErrorCode {
BadRequest,
NotFound,
Conflict,
/// 429 Too Many Requests — per-actor admission cap exceeded.
/// Clients should respect the `Retry-After` header.
TooManyRequests,
Internal,
}

View file

@ -2,6 +2,7 @@ pub mod api;
pub mod auth;
pub mod config;
pub mod policy;
pub mod workload;
use std::collections::{HashMap, HashSet};
use std::fs;
@ -47,7 +48,7 @@ use serde_json::Value;
use sha2::{Digest, Sha256};
use subtle::ConstantTimeEq;
use tokio::net::TcpListener;
use tokio::sync::{RwLock, mpsc};
use tokio::sync::mpsc;
use tower_http::trace::TraceLayer;
use tracing::{error, info};
use tracing_subscriber::EnvFilter;
@ -118,7 +119,14 @@ pub struct ServerConfig {
#[derive(Clone)]
pub struct AppState {
uri: String,
db: Arc<RwLock<Omnigraph>>,
/// PR 2 (MR-686): the engine is now `Arc<Omnigraph>` — no global
/// write lock. Concurrent handlers call `&self` engine APIs
/// directly. Per-(table, branch) write queues inside the engine
/// serialize same-key writers; per-actor admission control on
/// `workload` isolates noisy actors.
engine: Arc<Omnigraph>,
/// Per-actor admission control. See `workload::WorkloadController`.
workload: Arc<workload::WorkloadController>,
bearer_tokens: Arc<[(BearerTokenHash, Arc<str>)]>,
policy_engine: Option<Arc<PolicyEngine>>,
}
@ -191,12 +199,36 @@ impl AppState {
.collect();
Self {
uri,
db: Arc::new(RwLock::new(db)),
engine: Arc::new(db),
workload: Arc::new(workload::WorkloadController::from_env()),
bearer_tokens: Arc::from(bearer_tokens),
policy_engine: policy_engine.map(Arc::new),
}
}
/// Construct with a caller-provided [`workload::WorkloadController`].
/// Tests and benches use this to override per-actor caps without
/// mutating global env vars (which is unsafe in Rust 2024 once the
/// async runtime is up — `setenv` isn't thread-safe).
pub fn new_with_workload(
uri: String,
db: Omnigraph,
bearer_tokens: Vec<(String, String)>,
workload: workload::WorkloadController,
) -> Self {
let bearer_tokens: Vec<(BearerTokenHash, Arc<str>)> = bearer_tokens
.into_iter()
.map(|(actor, token)| (hash_bearer_token(&token), Arc::<str>::from(actor)))
.collect();
Self {
uri,
engine: Arc::new(db),
workload: Arc::new(workload),
bearer_tokens: Arc::from(bearer_tokens),
policy_engine: None,
}
}
pub async fn open(uri: impl Into<String>) -> Result<Self> {
Self::open_with_bearer_token(uri, None).await
}
@ -331,6 +363,31 @@ impl ApiError {
}
}
/// HTTP 429 Too Many Requests — actor exceeded their per-actor
/// admission cap (count or byte budget). Clients should respect the
/// `Retry-After` header. Mapped from `RejectReason::InFlightCountExceeded`
/// and `RejectReason::ByteBudgetExceeded`.
pub fn too_many_requests(message: impl Into<String>) -> Self {
Self {
status: StatusCode::TOO_MANY_REQUESTS,
code: ErrorCode::TooManyRequests,
message: message.into(),
merge_conflicts: Vec::new(),
manifest_conflict: None,
}
}
/// Convert a `WorkloadController` rejection into the matching
/// `ApiError` variant.
pub fn from_workload_reject(reject: workload::RejectReason) -> Self {
match reject {
workload::RejectReason::InFlightCountExceeded { .. }
| workload::RejectReason::ByteBudgetExceeded { .. } => {
Self::too_many_requests(reject.to_string())
}
}
}
fn merge_conflict(conflicts: Vec<api::MergeConflictOutput>) -> Self {
Self {
status: StatusCode::CONFLICT,
@ -418,10 +475,21 @@ fn summarize_merge_conflicts(conflicts: &[api::MergeConflictOutput]) -> String {
format!("merge conflicts: {}{}", preview.join("; "), suffix)
}
/// Constant `Retry-After` value (seconds) emitted on 429 responses.
const RETRY_AFTER_SECONDS: &str = "60";
impl IntoResponse for ApiError {
fn into_response(self) -> Response {
let mut headers = axum::http::HeaderMap::new();
if matches!(self.code, ErrorCode::TooManyRequests) {
headers.insert(
axum::http::header::RETRY_AFTER,
axum::http::HeaderValue::from_static(RETRY_AFTER_SECONDS),
);
}
(
self.status,
headers,
Json(ErrorOutput {
error: self.message,
code: Some(self.code),
@ -674,7 +742,7 @@ async fn server_snapshot(
},
)?;
let snapshot = {
let db = Arc::clone(&state.db).read_owned().await;
let db = &state.engine;
db.snapshot_of(ReadTarget::branch(branch.as_str()))
.await
.map_err(ApiError::from_omni)?
@ -718,7 +786,7 @@ async fn server_read(
let policy_branch = match &target {
ReadTarget::Branch(branch) => Some(branch.clone()),
ReadTarget::Snapshot(_) if state.policy_engine().is_some() && actor.is_some() => {
let db = Arc::clone(&state.db).read_owned().await;
let db = &state.engine;
db.resolved_branch_of(target.clone())
.await
.map(|branch| branch.or_else(|| Some("main".to_string())))
@ -746,7 +814,7 @@ async fn server_read(
.map_err(|err| ApiError::bad_request(err.to_string()))?;
let result = {
let db = Arc::clone(&state.db).read_owned().await;
let db = &state.engine;
db.query(
target.clone(),
&request.query_source,
@ -798,15 +866,15 @@ async fn server_export(
target_branch: None,
},
)?;
let db = Arc::clone(&state.db);
let engine = Arc::clone(&state.engine);
let type_names = request.type_names.clone();
let table_keys = request.table_keys.clone();
let (tx, rx) = mpsc::unbounded_channel::<std::result::Result<Bytes, io::Error>>();
tokio::spawn(async move {
let result = {
let db = db.read().await;
let mut writer = ExportStreamWriter { sender: tx.clone() };
db.export_jsonl_to_writer(&branch, &type_names, &table_keys, &mut writer)
engine
.export_jsonl_to_writer(&branch, &type_names, &table_keys, &mut writer)
.await
};
if let Err(err) = result {
@ -836,6 +904,7 @@ async fn server_export(
(status = 401, description = "Unauthorized", body = ErrorOutput),
(status = 403, description = "Forbidden", body = ErrorOutput),
(status = 409, description = "Merge conflict", body = ErrorOutput),
(status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
),
security(("bearer_token" = [])),
)]
@ -851,6 +920,10 @@ async fn server_change(
Json(request): Json<ChangeRequest>,
) -> std::result::Result<Json<ChangeOutput>, ApiError> {
let branch = request.branch.unwrap_or_else(|| "main".to_string());
let actor_arc = actor
.as_ref()
.map(|Extension(actor)| Arc::clone(&actor.0))
.unwrap_or_else(|| Arc::<str>::from("anonymous"));
let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
authorize_request(
&state,
@ -862,6 +935,20 @@ async fn server_change(
target_branch: None,
},
)?;
// Per-actor admission: bound concurrent in-flight mutations and
// estimated bytes per actor. Cedar runs FIRST so denied requests
// don't consume admission slots. Estimate uses the request body
// size as a coarse proxy; engine memory pressure can run higher.
let est_bytes = request.query_source.len() as u64
+ request
.params
.as_ref()
.map(|p| p.to_string().len() as u64)
.unwrap_or(0);
let _admission = state
.workload
.try_admit(&actor_arc, est_bytes)
.map_err(ApiError::from_workload_reject)?;
let (selected_name, query_params) =
select_named_query(&request.query_source, request.query_name.as_deref())
.map_err(|err| ApiError::bad_request(err.to_string()))?;
@ -869,7 +956,7 @@ async fn server_change(
.map_err(|err| ApiError::bad_request(err.to_string()))?;
let result = {
let mut db = Arc::clone(&state.db).write_owned().await;
let db = &state.engine;
db.mutate_as(
&branch,
&request.query_source,
@ -924,7 +1011,7 @@ async fn server_schema_get(
},
)?;
let schema_source = {
let db = Arc::clone(&state.db).read_owned().await;
let db = &state.engine;
db.schema_source().to_string()
};
Ok(Json(SchemaOutput { schema_source }))
@ -941,6 +1028,7 @@ async fn server_schema_get(
(status = 400, description = "Bad request", body = ErrorOutput),
(status = 401, description = "Unauthorized", body = ErrorOutput),
(status = 403, description = "Forbidden", body = ErrorOutput),
(status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
),
security(("bearer_token" = [])),
)]
@ -955,6 +1043,10 @@ async fn server_schema_apply(
actor: Option<Extension<AuthenticatedActor>>,
Json(request): Json<SchemaApplyRequest>,
) -> std::result::Result<Json<SchemaApplyOutput>, ApiError> {
let actor_arc = actor
.as_ref()
.map(|Extension(actor)| Arc::clone(&actor.0))
.unwrap_or_else(|| Arc::<str>::from("anonymous"));
let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
authorize_request(
&state,
@ -966,8 +1058,13 @@ async fn server_schema_apply(
target_branch: Some("main".to_string()),
},
)?;
let est_bytes = request.schema_source.len() as u64;
let _admission = state
.workload
.try_admit(&actor_arc, est_bytes)
.map_err(ApiError::from_workload_reject)?;
let result = {
let mut db = Arc::clone(&state.db).write_owned().await;
let db = &state.engine;
db.apply_schema(&request.schema_source)
.await
.map_err(ApiError::from_omni)?
@ -986,6 +1083,7 @@ async fn server_schema_apply(
(status = 400, description = "Bad request", body = ErrorOutput),
(status = 401, description = "Unauthorized", body = ErrorOutput),
(status = 403, description = "Forbidden", body = ErrorOutput),
(status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
),
security(("bearer_token" = [])),
)]
@ -1004,10 +1102,14 @@ async fn server_ingest(
let branch = request.branch.unwrap_or_else(|| "main".to_string());
let from = request.from.unwrap_or_else(|| "main".to_string());
let mode = request.mode.unwrap_or(omnigraph::loader::LoadMode::Merge);
let actor_arc = actor
.as_ref()
.map(|Extension(actor)| Arc::clone(&actor.0))
.unwrap_or_else(|| Arc::<str>::from("anonymous"));
let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
let branch_exists = {
let db = Arc::clone(&state.db).read_owned().await;
let db = &state.engine;
db.branch_list()
.await
.map_err(ApiError::from_omni)?
@ -1037,9 +1139,14 @@ async fn server_ingest(
target_branch: None,
},
)?;
let est_bytes = request.data.len() as u64;
let _admission = state
.workload
.try_admit(&actor_arc, est_bytes)
.map_err(ApiError::from_workload_reject)?;
let result = {
let mut db = Arc::clone(&state.db).write_owned().await;
let db = &state.engine;
db.ingest_as(&branch, Some(&from), &request.data, mode, actor_id)
.await
.map_err(ApiError::from_omni)?
@ -1085,7 +1192,7 @@ async fn server_branch_list(
},
)?;
let mut branches = {
let db = Arc::clone(&state.db).read_owned().await;
let db = &state.engine;
db.branch_list().await.map_err(ApiError::from_omni)?
};
branches.sort();
@ -1104,6 +1211,7 @@ async fn server_branch_list(
(status = 401, description = "Unauthorized", body = ErrorOutput),
(status = 403, description = "Forbidden", body = ErrorOutput),
(status = 409, description = "Branch already exists", body = ErrorOutput),
(status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
),
security(("bearer_token" = [])),
)]
@ -1118,6 +1226,10 @@ async fn server_branch_create(
Json(request): Json<BranchCreateRequest>,
) -> std::result::Result<Json<BranchCreateOutput>, ApiError> {
let from = request.from.unwrap_or_else(|| "main".to_string());
let actor_arc = actor
.as_ref()
.map(|Extension(actor)| Arc::clone(&actor.0))
.unwrap_or_else(|| Arc::<str>::from("anonymous"));
authorize_request(
&state,
actor.as_ref().map(|Extension(actor)| actor),
@ -1131,8 +1243,15 @@ async fn server_branch_create(
target_branch: Some(request.name.clone()),
},
)?;
// Branch metadata only — small constant bytes estimate. The Lance
// shallow-clone work is bounded by the parent's manifest size, not
// the request body.
let _admission = state
.workload
.try_admit(&actor_arc, 256)
.map_err(ApiError::from_workload_reject)?;
{
let mut db = Arc::clone(&state.db).write_owned().await;
let db = &state.engine;
db.branch_create_from(ReadTarget::branch(&from), &request.name)
.await
.map_err(ApiError::from_omni)?;
@ -1158,6 +1277,7 @@ async fn server_branch_create(
(status = 401, description = "Unauthorized", body = ErrorOutput),
(status = 403, description = "Forbidden", body = ErrorOutput),
(status = 404, description = "Branch not found", body = ErrorOutput),
(status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
),
security(("bearer_token" = [])),
)]
@ -1171,6 +1291,10 @@ async fn server_branch_delete(
actor: Option<Extension<AuthenticatedActor>>,
Path(branch): Path<String>,
) -> std::result::Result<Json<BranchDeleteOutput>, ApiError> {
let actor_arc = actor
.as_ref()
.map(|Extension(actor)| Arc::clone(&actor.0))
.unwrap_or_else(|| Arc::<str>::from("anonymous"));
let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
authorize_request(
&state,
@ -1182,8 +1306,13 @@ async fn server_branch_delete(
target_branch: Some(branch.clone()),
},
)?;
// Metadata-only manifest tombstone — small constant estimate.
let _admission = state
.workload
.try_admit(&actor_arc, 256)
.map_err(ApiError::from_workload_reject)?;
{
let mut db = Arc::clone(&state.db).write_owned().await;
let db = &state.engine;
db.branch_delete(&branch)
.await
.map_err(ApiError::from_omni)?;
@ -1207,6 +1336,7 @@ async fn server_branch_delete(
(status = 401, description = "Unauthorized", body = ErrorOutput),
(status = 403, description = "Forbidden", body = ErrorOutput),
(status = 409, description = "Merge conflict", body = ErrorOutput),
(status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
),
security(("bearer_token" = [])),
)]
@ -1222,6 +1352,10 @@ async fn server_branch_merge(
Json(request): Json<BranchMergeRequest>,
) -> std::result::Result<Json<BranchMergeOutput>, ApiError> {
let target = request.target.unwrap_or_else(|| "main".to_string());
let actor_arc = actor
.as_ref()
.map(|Extension(actor)| Arc::clone(&actor.0))
.unwrap_or_else(|| Arc::<str>::from("anonymous"));
let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
authorize_request(
&state,
@ -1233,8 +1367,15 @@ async fn server_branch_merge(
target_branch: Some(target.clone()),
},
)?;
// Merge body is small JSON; the heavy work is in the engine but is
// bounded per-(table, branch) by the writer queue. Small constant
// estimate suffices for the actor in-flight count.
let _admission = state
.workload
.try_admit(&actor_arc, 256)
.map_err(ApiError::from_workload_reject)?;
let outcome = {
let mut db = Arc::clone(&state.db).write_owned().await;
let db = &state.engine;
db.branch_merge_as(&request.source, &target, actor_id)
.await
.map_err(ApiError::from_omni)?
@ -1283,7 +1424,7 @@ async fn server_commit_list(
},
)?;
let commits = {
let db = Arc::clone(&state.db).read_owned().await;
let db = &state.engine;
db.list_commits(query.branch.as_deref())
.await
.map_err(ApiError::from_omni)?
@ -1332,7 +1473,7 @@ async fn server_commit_show(
},
)?;
let commit = {
let db = Arc::clone(&state.db).read_owned().await;
let db = &state.engine;
db.get_commit(&commit_id)
.await
.map_err(ApiError::from_omni)?

View file

@ -0,0 +1,363 @@
//! Per-actor admission control for the HTTP server (MR-686 §VII.A).
//!
//! The HTTP server's previous global `RwLock<Omnigraph>` serialized every
//! mutating request across all actors. PR 2 removes that lock — engine
//! APIs are now `&self`, so concurrent calls from different actors can
//! run against `Arc<Omnigraph>` simultaneously. Without admission
//! control, one heavy actor can exhaust shared capacity (Lance I/O
//! threads, manifest churn, network) and starve other actors.
//!
//! This module provides:
//!
//! - **Per-actor in-flight count cap**: each actor has a
//! `tokio::sync::Semaphore` with `OMNIGRAPH_PER_ACTOR_INFLIGHT_MAX`
//! permits (default 16). `try_acquire_owned()` returns `Err` when
//! exhausted; the server maps this to HTTP 429.
//!
//! - **Per-actor in-flight byte budget**: each actor accumulates an
//! `AtomicU64` byte estimate. `fetch_add(est_bytes)` then a check
//! against `byte_cap` is race-free via decrement-on-rejection. The
//! server maps an over-budget result to HTTP 429 as well.
//!
//! Counts are governed by the semaphore (race-free `try_acquire_owned()`
//! enforces the cap atomically); bytes use `fetch_add` + decrement-on-
//! rejection. Both checks are atomic compare-and-act, never
//! load-then-act — the test
//! `actor_admission_race_does_not_exceed_cap` pins this contract by
//! spawning 32 concurrent `try_admit` calls against a cap of 16 and
//! asserting exactly 16 succeed.
//!
//! Acquisition order against the engine's per-(table, branch) write
//! queue: admission FIRST (the HTTP handler reserves capacity before
//! calling into the engine), engine queue SECOND (acquired inside
//! `MutationStaging::commit_all`). This composes cleanly because
//! admission is a single per-actor count + budget check, never
//! cross-actor; nothing the engine does can change a peer actor's
//! admission state.
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use dashmap::DashMap;
use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};
/// Default per-actor in-flight count cap. Override via
/// `OMNIGRAPH_PER_ACTOR_INFLIGHT_MAX`.
pub const DEFAULT_PER_ACTOR_INFLIGHT_MAX: u32 = 16;
/// Default per-actor in-flight byte budget (4 GiB). Override via
/// `OMNIGRAPH_PER_ACTOR_BYTES_MAX`.
pub const DEFAULT_PER_ACTOR_BYTES_MAX: u64 = 4 * 1024 * 1024 * 1024;
/// Why a `try_admit` call returned `Err`. The server maps each variant
/// to a specific HTTP response code; see `WorkloadController` docs.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RejectReason {
/// Actor exceeded the per-actor in-flight count cap. HTTP 429.
InFlightCountExceeded { cap: u32 },
/// Actor exceeded the per-actor in-flight byte budget. HTTP 429.
ByteBudgetExceeded { cap: u64, attempted: u64 },
}
impl std::fmt::Display for RejectReason {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
RejectReason::InFlightCountExceeded { cap } => {
write!(f, "actor in-flight count cap {} exceeded", cap)
}
RejectReason::ByteBudgetExceeded { cap, attempted } => write!(
f,
"actor byte budget exceeded: would use {} bytes against cap {}",
attempted, cap
),
}
}
}
/// Per-actor counters. One instance per actor_id, lazily created on
/// first admission attempt.
#[derive(Debug)]
pub(crate) struct ActorState {
/// Counts the number of concurrent in-flight requests for this
/// actor. `try_acquire_owned()` is the count-cap gate.
in_flight_sem: Arc<Semaphore>,
/// Total bytes estimated to be in flight for this actor across
/// concurrent requests. `fetch_add` + check + decrement-on-failure
/// keeps the cap atomic.
bytes: AtomicU64,
/// Per-actor byte cap (snapshot of `WorkloadController.byte_cap`
/// at construction; cap mutations don't propagate to existing
/// ActorStates by design — controller config changes apply on
/// next ActorState construction).
byte_cap: u64,
/// Per-actor count cap (same snapshot semantics as `byte_cap`).
inflight_cap: u32,
}
impl ActorState {
fn new(inflight_cap: u32, byte_cap: u64) -> Self {
Self {
in_flight_sem: Arc::new(Semaphore::new(inflight_cap as usize)),
bytes: AtomicU64::new(0),
byte_cap,
inflight_cap,
}
}
}
/// Server-side per-actor admission controller. Constructed once at
/// server startup and shared via `Arc<WorkloadController>` on
/// `AppState`.
pub struct WorkloadController {
per_actor: DashMap<Arc<str>, Arc<ActorState>>,
inflight_cap: u32,
byte_cap: u64,
}
impl WorkloadController {
/// Construct from explicit caps. Tests can override.
pub fn new(inflight_cap: u32, byte_cap: u64) -> Self {
Self {
per_actor: DashMap::new(),
inflight_cap,
byte_cap,
}
}
/// Construct from environment variables, falling back to defaults.
/// Bad env values fall back to the default with a `tracing::warn!`.
pub fn from_env() -> Self {
let inflight_cap = parse_env_u32(
"OMNIGRAPH_PER_ACTOR_INFLIGHT_MAX",
DEFAULT_PER_ACTOR_INFLIGHT_MAX,
);
let byte_cap = parse_env_u64("OMNIGRAPH_PER_ACTOR_BYTES_MAX", DEFAULT_PER_ACTOR_BYTES_MAX);
Self::new(inflight_cap, byte_cap)
}
/// Construct with default caps. Suitable for tests / single-tenant
/// deployments without explicit configuration.
pub fn with_defaults() -> Self {
Self::new(DEFAULT_PER_ACTOR_INFLIGHT_MAX, DEFAULT_PER_ACTOR_BYTES_MAX)
}
fn actor_state(&self, actor_id: &Arc<str>) -> Arc<ActorState> {
if let Some(existing) = self.per_actor.get(actor_id) {
return existing.clone();
}
// Race-on-construct is benign: DashMap's `entry().or_insert_with`
// serializes per-key construction; the loser's freshly-built
// ActorState gets dropped without observable effect.
self.per_actor
.entry(actor_id.clone())
.or_insert_with(|| Arc::new(ActorState::new(self.inflight_cap, self.byte_cap)))
.clone()
}
/// Reserve admission for one in-flight request from `actor_id`
/// estimated to consume `est_bytes`. Returns an `AdmissionGuard`
/// that releases the count permit + decrements the byte total
/// when dropped.
///
/// On rejection, the byte counter is decremented before returning
/// — callers can retry without leaking budget.
pub fn try_admit(
&self,
actor_id: &Arc<str>,
est_bytes: u64,
) -> Result<AdmissionGuard, RejectReason> {
let state = self.actor_state(actor_id);
// Count gate: race-free via `try_acquire_owned()`. If exhausted,
// immediately reject — no byte accounting needed for this request.
let permit = match Arc::clone(&state.in_flight_sem).try_acquire_owned() {
Ok(permit) => permit,
Err(TryAcquireError::NoPermits) => {
return Err(RejectReason::InFlightCountExceeded {
cap: state.inflight_cap,
});
}
Err(TryAcquireError::Closed) => {
return Err(RejectReason::InFlightCountExceeded {
cap: state.inflight_cap,
});
}
};
// Byte gate: atomic fetch_add then check; decrement on overflow.
// `Ordering::SeqCst` is conservative; per-actor accounting is
// not on the hot path of read queries.
let prior = state.bytes.fetch_add(est_bytes, Ordering::SeqCst);
let attempted = prior.saturating_add(est_bytes);
if attempted > state.byte_cap {
// Roll back the byte add. The permit drops with `permit`
// going out of scope below.
state.bytes.fetch_sub(est_bytes, Ordering::SeqCst);
return Err(RejectReason::ByteBudgetExceeded {
cap: state.byte_cap,
attempted,
});
}
Ok(AdmissionGuard {
_permit: permit,
actor_state: state,
est_bytes,
})
}
}
/// Drop-on-completion guard for an admitted request. Dropping releases
/// the in-flight count permit (via `Drop` on the underlying semaphore
/// permit) and decrements the actor's byte counter.
#[derive(Debug)]
pub struct AdmissionGuard {
_permit: OwnedSemaphorePermit,
actor_state: Arc<ActorState>,
est_bytes: u64,
}
impl Drop for AdmissionGuard {
fn drop(&mut self) {
self.actor_state
.bytes
.fetch_sub(self.est_bytes, Ordering::SeqCst);
}
}
fn parse_env_u32(name: &str, default: u32) -> u32 {
match std::env::var(name) {
Ok(v) => v.parse::<u32>().unwrap_or_else(|err| {
tracing::warn!(
env = name,
value = %v,
error = %err,
default,
"invalid env value, using default"
);
default
}),
Err(_) => default,
}
}
fn parse_env_u64(name: &str, default: u64) -> u64 {
match std::env::var(name) {
Ok(v) => v.parse::<u64>().unwrap_or_else(|err| {
tracing::warn!(
env = name,
value = %v,
error = %err,
default,
"invalid env value, using default"
);
default
}),
Err(_) => default,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn try_admit_admits_under_cap() {
let controller = WorkloadController::new(2, 1024);
let actor: Arc<str> = "alice".into();
let g1 = controller.try_admit(&actor, 100).expect("first admit");
let _g2 = controller.try_admit(&actor, 100).expect("second admit");
let err = controller
.try_admit(&actor, 100)
.expect_err("third should reject on count");
assert!(matches!(err, RejectReason::InFlightCountExceeded { cap: 2 }));
drop(g1);
// After drop, a new admit succeeds again.
let _g3 = controller
.try_admit(&actor, 100)
.expect("admit after drop");
}
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn byte_budget_caps_admission() {
let controller = WorkloadController::new(16, 1000);
let actor: Arc<str> = "alice".into();
let _g1 = controller.try_admit(&actor, 600).expect("first admit");
let err = controller
.try_admit(&actor, 600)
.expect_err("second should reject on bytes");
match err {
RejectReason::ByteBudgetExceeded { cap, attempted } => {
assert_eq!(cap, 1000);
assert_eq!(attempted, 1200);
}
other => panic!("expected ByteBudgetExceeded, got {:?}", other),
}
// Verify the byte counter was rolled back: a smaller request fits.
let _g2 = controller.try_admit(&actor, 300).expect("smaller admit");
}
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn actor_admission_race_does_not_exceed_cap() {
// Pin master plan §"WorkloadController" Finding 6: independent
// atomic load + check + add allows two concurrent callers to
// both pass a cap-N check. The Semaphore-based gate is
// race-free — exactly cap_count callers succeed.
//
// Each task holds its admission guard until released via a
// oneshot channel; this forces real contention because guards
// can't drop and free permits before all 32 calls have raced.
let controller = Arc::new(WorkloadController::new(16, u64::MAX / 4));
let actor: Arc<str> = "racer".into();
let (release_tx, _) = tokio::sync::broadcast::channel::<()>(1);
let mut handles = Vec::with_capacity(32);
for _ in 0..32 {
let controller = Arc::clone(&controller);
let actor = actor.clone();
let mut release_rx = release_tx.subscribe();
handles.push(tokio::spawn(async move {
let result = controller.try_admit(&actor, 1);
let success = result.is_ok();
// Hold the guard (if any) until the test signals release,
// so the cap-16 contention is observable across all 32
// tasks instead of permits being recycled task-by-task.
let _guard = result.ok();
let _ = release_rx.recv().await;
success
}));
}
// Give all 32 tasks a chance to hit `try_admit` before any can
// drop their guard. 50ms is plenty for tokio's scheduler on a
// 4-worker runtime.
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
// Release every task; collect succeed/reject counts.
let _ = release_tx.send(());
let mut accepted = 0u32;
let mut rejected = 0u32;
for h in handles {
if h.await.unwrap() {
accepted += 1;
} else {
rejected += 1;
}
}
assert_eq!(accepted, 16, "expected exactly 16 successful admits");
assert_eq!(rejected, 16, "expected exactly 16 rejections");
}
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn per_actor_caps_independent() {
let controller = WorkloadController::new(1, 1024);
let alice: Arc<str> = "alice".into();
let bob: Arc<str> = "bob".into();
let _ga = controller.try_admit(&alice, 100).expect("alice ok");
// Alice over count cap, Bob unaffected.
let err = controller.try_admit(&alice, 100).expect_err("alice rejected");
assert!(matches!(err, RejectReason::InFlightCountExceeded { .. }));
let _gb = controller.try_admit(&bob, 100).expect("bob ok");
}
}

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,6 @@
[package]
name = "omnigraph-engine"
version = "0.4.1"
version = "0.4.2"
edition = "2024"
description = "Runtime engine for the Omnigraph graph database."
license = "MIT"
@ -16,7 +16,7 @@ default = []
failpoints = ["dep:fail", "fail/failpoints"]
[dependencies]
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.1" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.2" }
lance = { workspace = true }
lance-datafusion = { workspace = true }
datafusion = { workspace = true }
@ -47,9 +47,10 @@ time = { workspace = true }
async-trait = { workspace = true }
url = { workspace = true }
chrono = { workspace = true }
arc-swap = { workspace = true }
[dev-dependencies]
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.1" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.2" }
tokio = { workspace = true }
lance-namespace-impls = { workspace = true }
serial_test = "3"

View file

@ -5,6 +5,7 @@ mod omnigraph;
mod recovery_audit;
mod run_registry;
mod schema_state;
pub(crate) mod write_queue;
pub use commit_graph::GraphCommit;
pub use graph_coordinator::{GraphCoordinator, ReadTarget, ResolvedTarget, SnapshotId};
@ -18,6 +19,53 @@ pub(crate) use run_registry::is_internal_run_branch;
pub(crate) const SCHEMA_APPLY_LOCK_BRANCH: &str = "__schema_apply_lock__";
/// Mutation kind, threaded through the version-check call sites so the
/// engine can apply an op-kind-aware policy:
///
/// - `Insert` / `Merge`: skip the strict pre-stage `ensure_expected_version`
/// check. Lance's `MergeInsertBuilder` rebases concurrent appends; the
/// per-(table, branch) writer queue serializes `commit_staged`; the
/// publisher's CAS (refreshed under the queue via
/// `MutationStaging::commit_all`'s `snapshot_for_branch` call) catches
/// genuine cross-process drift as `ManifestConflictDetails::ExpectedVersionMismatch`.
/// The pre-stage strict check would over-reject in-process concurrent
/// inserts, which is exactly the case PR 2 / MR-686 designed the
/// per-table queue to allow.
///
/// - `Update` / `Delete`: keep the strict check. These have read-modify-write
/// semantics; Lance moving between the read at stage time and the write
/// at commit time means the staged batch is computed against stale state.
/// The strict check guards the per-query SI invariant. SERIALIZABLE
/// opt-in (§VI.36 future seam) is the long-term answer for tighter
/// semantics; today, in-process update-update races on the same key
/// stay rejected as 409 — acceptable.
///
/// - `SchemaRewrite`: keep the strict check. Schema apply runs under the
/// graph-wide `__schema_apply_lock__` AND per-table queues; the strict
/// check is uncontested at that point.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum MutationOpKind {
Insert,
Merge,
Update,
Delete,
SchemaRewrite,
}
impl MutationOpKind {
/// Whether the strict pre-stage `ensure_expected_version` check should
/// fire for this op kind. See [`MutationOpKind`] for the rationale per
/// kind.
pub(crate) fn strict_pre_stage_version_check(self) -> bool {
match self {
MutationOpKind::Insert | MutationOpKind::Merge => false,
MutationOpKind::Update
| MutationOpKind::Delete
| MutationOpKind::SchemaRewrite => true,
}
}
}
pub(crate) fn is_schema_apply_lock_branch(name: &str) -> bool {
name.trim_start_matches('/') == SCHEMA_APPLY_LOCK_BRANCH
}

View file

@ -2,6 +2,7 @@ use std::collections::{BTreeSet, HashMap, HashSet};
use std::io::Write;
use std::sync::Arc;
use arc_swap::ArcSwap;
use arrow_array::{
Array, BinaryArray, BooleanArray, Date32Array, FixedSizeListArray, Float32Array, Float64Array,
Int32Array, Int64Array, LargeBinaryArray, LargeListArray, LargeStringArray, ListArray,
@ -73,12 +74,60 @@ pub struct SchemaApplyResult {
pub struct Omnigraph {
root_uri: String,
storage: Arc<dyn StorageAdapter>,
coordinator: GraphCoordinator,
/// Coordinator state behind a tokio `RwLock`. PR 2 (MR-686) wraps
/// this so engine write APIs can be `&self` (the HTTP server's
/// `AppState` holds `Arc<Omnigraph>` and dispatches concurrent
/// calls without a global write lock). Reads (`snapshot`, `version`,
/// `current_branch`, `branch_list`, `resolve_*`, `head_commit_id`,
/// `list_commits`, …) acquire `.read().await` and parallelize.
/// Writes (`refresh`, `branch_create`, `branch_delete`, `commit_*`,
/// `record_*`) acquire `.write().await` and serialize. The atomic
/// commit invariant — `commit_manifest_updates` followed by
/// `record_graph_commit` must be atomic — is preserved by the
/// single `.write()` covering both calls inside
/// `commit_updates_with_actor_with_expected`. PR 2 Phase 2
/// converted from `Mutex` to `RwLock` because the bench showed
/// the Mutex was the dominant serializer for disjoint-table
/// workloads. Lock acquisition order: always before `runtime_cache`
/// (when both are needed in one scope).
coordinator: Arc<tokio::sync::RwLock<GraphCoordinator>>,
table_store: TableStore,
runtime_cache: RuntimeCache,
catalog: Catalog,
schema_source: String,
pub(crate) audit_actor_id: Option<String>,
/// Read-heavy on every query, written only by `apply_schema`. ArcSwap
/// gives atomic pointer swap with zero-cost reads (`load()` returns a
/// `Guard<Arc<Catalog>>`), so concurrent queries on different actors
/// don't contend on a lock to read the catalog.
catalog: Arc<ArcSwap<Catalog>>,
/// Read-heavy on schema introspection paths, written only by
/// `apply_schema`. Same ArcSwap rationale as `catalog`.
schema_source: Arc<ArcSwap<String>>,
/// Per-`(table_key, branch)` writer queues. Reachable from engine
/// internals (mutation finalize, schema_apply, branch_merge,
/// ensure_indices, delete_where) and from future MR-870 recovery
/// reconciler. PR 1b adds the field; callers acquire in commits 4+.
write_queue: Arc<crate::db::write_queue::WriteQueueManager>,
/// Process-wide mutex held across the swap → operate → restore window
/// in `branch_merge_impl`. Two concurrent merges with distinct targets
/// would otherwise interleave their three separate
/// `coordinator.write().await` acquisitions, leaving each merge's
/// inner body running against the other's swapped coord. Pinned by
/// `concurrent_branch_merges_distinct_targets_do_not_swap_into_each_other`
/// in `crates/omnigraph-server/tests/server.rs`.
///
/// Cost: serializes ALL concurrent branch merges process-wide.
/// Acceptable because branch merges are heavy (table rewrites, index
/// rebuilds), per-(table, branch) queues inside `commit_all` already
/// serialize the data path, and merges are rare relative to /change
/// or /ingest. A finer-grained per-target-branch mutex is a follow-up
/// if telemetry shows merge concurrency matters.
///
/// The deeper fix — refactor `branch_merge_on_current_target` to take
/// an explicit target coord parameter so `self.coordinator` is never
/// used as scratch space — is the round-1 shape applied to
/// `branch_create_from_impl`. Deferred because it requires unwinding
/// every `self.snapshot()` and `self.ensure_commit_graph_initialized()`
/// call inside the merge body.
merge_exclusive: Arc<tokio::sync::Mutex<()>>,
}
/// Whether [`Omnigraph::open`] runs the open-time recovery sweep.
@ -128,12 +177,13 @@ impl Omnigraph {
Ok(Self {
root_uri: root.clone(),
storage,
coordinator,
coordinator: Arc::new(tokio::sync::RwLock::new(coordinator)),
table_store: TableStore::new(&root),
runtime_cache: RuntimeCache::default(),
catalog,
schema_source: schema_source.to_string(),
audit_actor_id: None,
catalog: Arc::new(ArcSwap::from_pointee(catalog)),
schema_source: Arc::new(ArcSwap::from_pointee(schema_source.to_string())),
write_queue: Arc::new(crate::db::write_queue::WriteQueueManager::new()),
merge_exclusive: Arc::new(tokio::sync::Mutex::new(())),
})
}
@ -214,21 +264,39 @@ impl Omnigraph {
Ok(Self {
root_uri: root.clone(),
storage,
coordinator,
coordinator: Arc::new(tokio::sync::RwLock::new(coordinator)),
table_store: TableStore::new(&root),
runtime_cache: RuntimeCache::default(),
catalog,
schema_source,
audit_actor_id: None,
catalog: Arc::new(ArcSwap::from_pointee(catalog)),
schema_source: Arc::new(ArcSwap::from_pointee(schema_source)),
write_queue: Arc::new(crate::db::write_queue::WriteQueueManager::new()),
merge_exclusive: Arc::new(tokio::sync::Mutex::new(())),
})
}
pub fn catalog(&self) -> &Catalog {
&self.catalog
/// Returns an `Arc<Catalog>` snapshot. Cheap clone of the current
/// catalog pointer; callers can hold the returned `Arc` across awaits
/// without blocking concurrent `apply_schema`.
pub fn catalog(&self) -> Arc<Catalog> {
self.catalog.load_full()
}
pub fn schema_source(&self) -> &str {
&self.schema_source
/// Returns an `Arc<String>` snapshot of the schema source.
pub fn schema_source(&self) -> Arc<String> {
self.schema_source.load_full()
}
/// Atomically swap the in-memory catalog. Concurrent readers see
/// either the old or the new pointer; never a torn state. Used by
/// `apply_schema` and `reload_schema_if_source_changed`.
pub(crate) fn store_catalog(&self, catalog: Catalog) {
self.catalog.store(Arc::new(catalog));
}
/// Atomically swap the in-memory schema source. Same rationale as
/// [`store_catalog`](Self::store_catalog).
pub(crate) fn store_schema_source(&self, schema_source: String) {
self.schema_source.store(Arc::new(schema_source));
}
pub fn uri(&self) -> &str {
@ -243,11 +311,11 @@ impl Omnigraph {
schema_apply::plan_schema(self, desired_schema_source).await
}
pub async fn apply_schema(&mut self, desired_schema_source: &str) -> Result<SchemaApplyResult> {
pub async fn apply_schema(&self, desired_schema_source: &str) -> Result<SchemaApplyResult> {
schema_apply::apply_schema(self, desired_schema_source).await
}
pub(crate) async fn ensure_schema_apply_idle(&mut self, operation: &str) -> Result<()> {
pub(crate) async fn ensure_schema_apply_idle(&self, operation: &str) -> Result<()> {
schema_apply::ensure_schema_apply_idle(self, operation).await
}
@ -278,6 +346,26 @@ impl Omnigraph {
self.storage.as_ref()
}
/// Per-`(table_key, branch)` writer queues.
///
/// Engine-internal writers (mutation finalize, schema_apply,
/// branch_merge, ensure_indices, delete_where) and the future MR-870
/// recovery reconciler reach the queue manager via this accessor.
/// Returns an `Arc` clone so callers can hold the manager across
/// `&mut self` engine API boundaries.
pub(crate) fn write_queue(&self) -> Arc<crate::db::write_queue::WriteQueueManager> {
Arc::clone(&self.write_queue)
}
/// Engine-internal access to the merge-exclusive mutex. Held across
/// the swap → operate → restore window in `branch_merge_impl` so
/// concurrent merges with distinct targets don't corrupt
/// `self.coordinator` mid-operation. See the field doc on
/// `Omnigraph::merge_exclusive` for the full design rationale.
pub(crate) fn merge_exclusive(&self) -> Arc<tokio::sync::Mutex<()>> {
Arc::clone(&self.merge_exclusive)
}
/// Engine-level access to the repo's normalized root URI. Used by
/// the recovery sidecar protocol to compute `__recovery/` paths.
pub(crate) fn root_uri(&self) -> &str {
@ -297,15 +385,16 @@ impl Omnigraph {
}
pub(crate) async fn swap_coordinator_for_branch(
&mut self,
&self,
branch: Option<&str>,
) -> Result<GraphCoordinator> {
let next = self.open_coordinator_for_branch(branch).await?;
Ok(std::mem::replace(&mut self.coordinator, next))
let mut coord = self.coordinator.write().await;
Ok(std::mem::replace(&mut *coord, next))
}
pub(crate) fn restore_coordinator(&mut self, coordinator: GraphCoordinator) {
self.coordinator = coordinator;
pub(crate) async fn restore_coordinator(&self, coordinator: GraphCoordinator) {
*self.coordinator.write().await = coordinator;
}
pub(crate) async fn resolved_branch_target(
@ -315,21 +404,19 @@ impl Omnigraph {
self.ensure_schema_state_valid().await?;
let requested = ReadTarget::Branch(branch.unwrap_or("main").to_string());
let normalized = normalize_branch_name(branch.unwrap_or("main"))?;
if normalized.as_deref() == self.coordinator.current_branch() {
let snapshot_id = self.coordinator.head_commit_id().await?.unwrap_or_else(|| {
SnapshotId::synthetic(
self.coordinator.current_branch(),
self.coordinator.version(),
)
let coord = self.coordinator.read().await;
if normalized.as_deref() == coord.current_branch() {
let snapshot_id = coord.head_commit_id().await?.unwrap_or_else(|| {
SnapshotId::synthetic(coord.current_branch(), coord.version())
});
return Ok(ResolvedTarget {
requested,
branch: self.coordinator.current_branch().map(str::to_string),
branch: coord.current_branch().map(str::to_string),
snapshot_id,
snapshot: self.coordinator.snapshot(),
snapshot: coord.snapshot(),
});
}
self.coordinator.resolve_target(&requested).await
coord.resolve_target(&requested).await
}
pub(crate) async fn snapshot_for_branch(&self, branch: Option<&str>) -> Result<Snapshot> {
@ -338,13 +425,13 @@ impl Omnigraph {
.map(|resolved| resolved.snapshot)
}
pub(crate) fn version(&self) -> u64 {
self.coordinator.version()
pub(crate) async fn version(&self) -> u64 {
self.coordinator.read().await.version()
}
/// Return an immutable Snapshot from the known manifest state. No storage I/O.
pub(crate) fn snapshot(&self) -> Snapshot {
self.coordinator.snapshot()
pub(crate) async fn snapshot(&self) -> Snapshot {
self.coordinator.read().await.snapshot()
}
pub async fn snapshot_of(&self, target: impl Into<ReadTarget>) -> Result<Snapshot> {
@ -369,10 +456,11 @@ impl Omnigraph {
}
/// Synchronize this handle's write base to the latest head of the named branch.
pub async fn sync_branch(&mut self, branch: &str) -> Result<()> {
pub async fn sync_branch(&self, branch: &str) -> Result<()> {
self.ensure_schema_state_valid().await?;
let branch = normalize_branch_name(branch)?;
self.coordinator = self.open_coordinator_for_branch(branch.as_deref()).await?;
let next = self.open_coordinator_for_branch(branch.as_deref()).await?;
*self.coordinator.write().await = next;
self.runtime_cache.invalidate_all().await;
Ok(())
}
@ -409,35 +497,44 @@ impl Omnigraph {
/// (e.g. `schema_apply` mid-write) MUST use
/// [`refresh_coordinator_only`](Self::refresh_coordinator_only) to
/// avoid the recovery sweep racing their own sidecar.
pub async fn refresh(&mut self) -> Result<()> {
self.coordinator.refresh().await?;
let schema_state_recovery = recover_schema_state_files(
&self.root_uri,
Arc::clone(&self.storage),
&self.coordinator.snapshot(),
)
.await?;
crate::db::manifest::recover_manifest_drift(
&self.root_uri,
Arc::clone(&self.storage),
&mut self.coordinator,
crate::db::manifest::RecoveryMode::RollForwardOnly,
schema_state_recovery,
)
.await?;
pub async fn refresh(&self) -> Result<()> {
// Scope the coord write guard to the recovery section only.
// `reload_schema_if_source_changed` (below) acquires
// `self.coordinator.read().await` when the on-disk schema source
// has drifted from the cached `schema_source`. Tokio's RwLock is
// not reentrant, so holding the write across that call deadlocks.
// Pinned by `composite_flow_schema_apply_then_branch_ops_no_deadlock_in_refresh`.
{
let mut coord = self.coordinator.write().await;
coord.refresh().await?;
let schema_state_recovery = recover_schema_state_files(
&self.root_uri,
Arc::clone(&self.storage),
&coord.snapshot(),
)
.await?;
crate::db::manifest::recover_manifest_drift(
&self.root_uri,
Arc::clone(&self.storage),
&mut *coord,
crate::db::manifest::RecoveryMode::RollForwardOnly,
schema_state_recovery,
)
.await?;
} // ← write guard released before reload's read acquisition
self.reload_schema_if_source_changed().await?;
self.runtime_cache.invalidate_all().await;
Ok(())
}
async fn reload_schema_if_source_changed(&mut self) -> Result<()> {
async fn reload_schema_if_source_changed(&self) -> Result<()> {
let schema_path = schema_source_uri(&self.root_uri);
let schema_source = self.storage.read_text(&schema_path).await?;
if schema_source == self.schema_source {
if schema_source == *self.schema_source.load_full() {
return Ok(());
}
let current_source_ir = read_schema_ir_from_source(&schema_source)?;
let branches = self.coordinator.branch_list().await?;
let branches = self.coordinator.read().await.branch_list().await?;
let (accepted_ir, _) = load_or_bootstrap_schema_contract(
&self.root_uri,
Arc::clone(&self.storage),
@ -447,8 +544,8 @@ impl Omnigraph {
.await?;
let mut catalog = build_catalog_from_ir(&accepted_ir)?;
fixup_blob_schemas(&mut catalog);
self.schema_source = schema_source;
self.catalog = catalog;
self.store_schema_source(schema_source);
self.store_catalog(catalog);
Ok(())
}
@ -459,15 +556,15 @@ impl Omnigraph {
/// here would observe the caller's own sidecar, classify it as
/// RolledPastExpected, and roll it forward — racing the caller's
/// own publish path.
pub(crate) async fn refresh_coordinator_only(&mut self) -> Result<()> {
self.coordinator.refresh().await?;
pub(crate) async fn refresh_coordinator_only(&self) -> Result<()> {
self.coordinator.write().await.refresh().await?;
self.runtime_cache.invalidate_all().await;
Ok(())
}
pub async fn resolve_snapshot(&self, branch: &str) -> Result<SnapshotId> {
self.ensure_schema_state_valid().await?;
self.coordinator.resolve_snapshot_id(branch).await
self.coordinator.read().await.resolve_snapshot_id(branch).await
}
pub(crate) async fn resolved_target(
@ -475,7 +572,7 @@ impl Omnigraph {
target: impl Into<ReadTarget>,
) -> Result<ResolvedTarget> {
self.ensure_schema_state_valid().await?;
self.coordinator.resolve_target(&target.into()).await
self.coordinator.read().await.resolve_target(&target.into()).await
}
// ─── Change detection ────────────────────────────────────────────────
@ -506,26 +603,20 @@ impl Omnigraph {
to_commit_id: &str,
filter: &crate::changes::ChangeFilter,
) -> Result<crate::changes::ChangeSet> {
let from_commit = self
.coordinator
.resolve_commit(&SnapshotId::new(from_commit_id))
.await?;
let to_commit = self
.coordinator
.resolve_commit(&SnapshotId::new(to_commit_id))
.await?;
let from_snap = self
.coordinator
let coord = self.coordinator.read().await;
let from_commit = coord.resolve_commit(&SnapshotId::new(from_commit_id)).await?;
let to_commit = coord.resolve_commit(&SnapshotId::new(to_commit_id)).await?;
let from_snap = coord
.resolve_target(&ReadTarget::Snapshot(SnapshotId::new(
from_commit.graph_commit_id.clone(),
)))
.await?;
let to_snap = self
.coordinator
let to_snap = coord
.resolve_target(&ReadTarget::Snapshot(SnapshotId::new(
to_commit.graph_commit_id.clone(),
)))
.await?;
drop(coord);
crate::changes::diff_snapshots(
self.uri(),
&from_snap.snapshot,
@ -558,7 +649,7 @@ impl Omnigraph {
/// Create a Snapshot at any historical manifest version.
pub async fn snapshot_at_version(&self, version: u64) -> Result<Snapshot> {
self.ensure_schema_state_valid().await?;
self.coordinator.snapshot_at_version(version).await
self.coordinator.read().await.snapshot_at_version(version).await
}
pub async fn export_jsonl(
@ -606,11 +697,11 @@ impl Omnigraph {
/// unbranched subtables keep inheriting `main`, while subtables inherited
/// from an ancestor branch are first forked into the active branch before
/// their index metadata is updated.
pub async fn ensure_indices(&mut self) -> Result<()> {
pub async fn ensure_indices(&self) -> Result<()> {
table_ops::ensure_indices(self).await
}
pub async fn ensure_indices_on(&mut self, branch: &str) -> Result<()> {
pub async fn ensure_indices_on(&self, branch: &str) -> Result<()> {
table_ops::ensure_indices_on(self, branch).await
}
@ -633,7 +724,7 @@ impl Omnigraph {
/// Compact small Lance fragments into fewer larger ones across every
/// node + edge table on `main`. See [`optimize`] for details.
pub async fn optimize(&mut self) -> Result<Vec<optimize::TableOptimizeStats>> {
pub async fn optimize(&self) -> Result<Vec<optimize::TableOptimizeStats>> {
optimize::optimize_all_tables(self).await
}
@ -658,8 +749,8 @@ impl Omnigraph {
/// ```
pub async fn read_blob(&self, type_name: &str, id: &str, property: &str) -> Result<BlobFile> {
self.ensure_schema_state_valid().await?;
let node_type = self
.catalog
let catalog = self.catalog();
let node_type = catalog
.node_types
.get(type_name)
.ok_or_else(|| OmniError::manifest(format!("unknown node type '{}'", type_name)))?;
@ -670,7 +761,7 @@ impl Omnigraph {
)));
}
let snapshot = self.snapshot();
let snapshot = self.snapshot().await;
let table_key = format!("node:{}", type_name);
let ds = snapshot.open(&table_key).await?;
@ -698,12 +789,12 @@ impl Omnigraph {
})
}
pub(crate) fn active_branch(&self) -> Option<&str> {
self.coordinator.current_branch()
pub(crate) async fn active_branch(&self) -> Option<String> {
self.coordinator.read().await.current_branch().map(str::to_string)
}
async fn ensure_branch_delete_safe(&self, branch: &str, branches: &[String]) -> Result<()> {
let descendants = self.coordinator.branch_descendants(branch).await?;
let descendants = self.coordinator.read().await.branch_descendants(branch).await?;
if let Some(descendant) = descendants.first() {
return Err(OmniError::manifest_conflict(format!(
"cannot delete branch '{}' because descendant branch '{}' still depends on it",
@ -758,8 +849,9 @@ impl Omnigraph {
Ok(())
}
async fn delete_branch_storage_only(&mut self, branch: &str) -> Result<()> {
if self.coordinator.current_branch() == Some(branch) {
async fn delete_branch_storage_only(&self, branch: &str) -> Result<()> {
let active = self.coordinator.read().await.current_branch().map(str::to_string);
if active.as_deref() == Some(branch) {
return Err(OmniError::manifest_conflict(format!(
"cannot delete currently active branch '{}'",
branch
@ -773,7 +865,7 @@ impl Omnigraph {
.map(|entry| (entry.table_key.clone(), entry.table_path.clone()))
.collect::<Vec<_>>();
self.coordinator.branch_delete(branch).await?;
self.coordinator.write().await.branch_delete(branch).await?;
self.cleanup_deleted_branch_tables(branch, &owned_tables)
.await
}
@ -794,19 +886,15 @@ impl Omnigraph {
.map(|id| id.map(|snapshot_id| snapshot_id.as_str().to_string()))
}
pub async fn branch_create(&mut self, name: &str) -> Result<()> {
pub async fn branch_create(&self, name: &str) -> Result<()> {
self.ensure_schema_state_valid().await?;
self.ensure_schema_apply_idle("branch_create").await?;
ensure_public_branch_ref(name, "branch_create")?;
self.coordinator.branch_create(name).await
}
pub(crate) fn current_audit_actor(&self) -> Option<&str> {
self.audit_actor_id.as_deref()
self.coordinator.write().await.branch_create(name).await
}
pub async fn branch_create_from(
&mut self,
&self,
from: impl Into<ReadTarget>,
name: &str,
) -> Result<()> {
@ -815,7 +903,7 @@ impl Omnigraph {
}
async fn branch_create_from_impl(
&mut self,
&self,
from: impl Into<ReadTarget>,
name: &str,
allow_internal_refs: bool,
@ -831,25 +919,39 @@ impl Omnigraph {
ensure_public_branch_ref(name, "branch_create_from")?;
}
let branch = normalize_branch_name(&branch_name)?;
let previous = self.swap_coordinator_for_branch(branch.as_deref()).await?;
let result = self.coordinator.branch_create(name).await;
self.restore_coordinator(previous);
result
// Operate on a freshly-opened source coordinator that's owned locally
// — never touch `self.coordinator`. The pre-fix implementation used
// `swap_coordinator_for_branch` + operate + `restore_coordinator` as
// three separate `coordinator.write().await` acquisitions; under
// `&self` concurrency, a second `branch_create_from` could swap
// self.coordinator between this caller's swap and operate steps,
// making the operate run against the wrong source branch and
// forking off the wrong HEAD. Pinned by
// `concurrent_branch_create_from_distinct_parents_does_not_corrupt_coordinator`
// in `crates/omnigraph-server/tests/server.rs`.
//
// `branch_create` mutates only the local coord's commit-graph cache;
// the manifest write is durable on disk regardless of which
// coord-handle issued it. Discarding `source_coord` after the call
// is the right shape — the new branch is reachable from any
// subsequent open of any coord.
let mut source_coord = self.open_coordinator_for_branch(branch.as_deref()).await?;
source_coord.branch_create(name).await
}
pub async fn branch_list(&self) -> Result<Vec<String>> {
self.ensure_schema_state_valid().await?;
self.coordinator.branch_list().await
self.coordinator.read().await.branch_list().await
}
pub async fn branch_delete(&mut self, name: &str) -> Result<()> {
pub async fn branch_delete(&self, name: &str) -> Result<()> {
self.ensure_schema_state_valid().await?;
self.ensure_schema_apply_idle("branch_delete").await?;
ensure_public_branch_ref(name, "branch_delete")?;
self.refresh().await?;
let branch = normalize_branch_name(name)?
.ok_or_else(|| OmniError::manifest("cannot delete branch 'main'".to_string()))?;
let branches = self.coordinator.branch_list().await?;
let branches = self.coordinator.read().await.branch_list().await?;
if !branches.iter().any(|candidate| candidate == &branch) {
return Err(OmniError::manifest_not_found(format!(
"branch '{}' not found",
@ -863,7 +965,7 @@ impl Omnigraph {
pub async fn get_commit(&self, commit_id: &str) -> Result<GraphCommit> {
self.ensure_schema_state_valid().await?;
self.coordinator
self.coordinator.read().await
.resolve_commit(&SnapshotId::new(commit_id))
.await
}
@ -886,16 +988,18 @@ impl Omnigraph {
pub(crate) async fn open_for_mutation(
&self,
table_key: &str,
op_kind: crate::db::MutationOpKind,
) -> Result<(Dataset, String, Option<String>)> {
table_ops::open_for_mutation(self, table_key).await
table_ops::open_for_mutation(self, table_key, op_kind).await
}
pub(crate) async fn open_for_mutation_on_branch(
&self,
branch: Option<&str>,
table_key: &str,
op_kind: crate::db::MutationOpKind,
) -> Result<(Dataset, String, Option<String>)> {
table_ops::open_for_mutation_on_branch(self, branch, table_key).await
table_ops::open_for_mutation_on_branch(self, branch, table_key, op_kind).await
}
pub(crate) async fn fork_dataset_from_entry_state(
@ -923,9 +1027,17 @@ impl Omnigraph {
full_path: &str,
table_branch: Option<&str>,
expected_version: u64,
op_kind: crate::db::MutationOpKind,
) -> Result<Dataset> {
table_ops::reopen_for_mutation(self, table_key, full_path, table_branch, expected_version)
.await
table_ops::reopen_for_mutation(
self,
table_key,
full_path,
table_branch,
expected_version,
op_kind,
)
.await
}
pub(crate) async fn open_dataset_at_state(
@ -965,43 +1077,47 @@ impl Omnigraph {
}
pub(crate) async fn commit_manifest_updates(
&mut self,
&self,
updates: &[crate::db::SubTableUpdate],
) -> Result<u64> {
table_ops::commit_manifest_updates(self, updates).await
}
pub(crate) async fn record_merge_commit(
&mut self,
&self,
manifest_version: u64,
parent_commit_id: &str,
merged_parent_commit_id: &str,
actor_id: Option<&str>,
) -> Result<String> {
table_ops::record_merge_commit(
self,
manifest_version,
parent_commit_id,
merged_parent_commit_id,
actor_id,
)
.await
}
pub(crate) async fn commit_updates_on_branch_with_expected(
&mut self,
&self,
branch: Option<&str>,
updates: &[crate::db::SubTableUpdate],
expected_table_versions: &std::collections::HashMap<String, u64>,
actor_id: Option<&str>,
) -> Result<u64> {
table_ops::commit_updates_on_branch_with_expected(
self,
branch,
updates,
expected_table_versions,
actor_id,
)
.await
}
pub(crate) async fn ensure_commit_graph_initialized(&mut self) -> Result<()> {
pub(crate) async fn ensure_commit_graph_initialized(&self) -> Result<()> {
table_ops::ensure_commit_graph_initialized(self).await
}
@ -1495,7 +1611,7 @@ edge WorksAt: Person -> Company
}
async fn table_rows_json(db: &Omnigraph, table_key: &str) -> Vec<Value> {
let snapshot = db.snapshot();
let snapshot = db.snapshot().await;
let ds = snapshot.open(table_key).await.unwrap();
let batches = db.table_store().scan_batches(&ds).await.unwrap();
batches
@ -1509,7 +1625,10 @@ edge WorksAt: Person -> Company
}
async fn seed_person_row(db: &mut Omnigraph, name: &str, age: Option<i32>) {
let (mut ds, full_path, table_branch) = db.open_for_mutation("node:Person").await.unwrap();
let (mut ds, full_path, table_branch) = db
.open_for_mutation("node:Person", crate::db::MutationOpKind::Insert)
.await
.unwrap();
let schema: Arc<Schema> = Arc::new(ds.schema().into());
let columns: Vec<Arc<dyn Array>> = schema
.fields()
@ -1592,7 +1711,7 @@ edge WorksAt: Person -> Company
let uri = dir.path().to_str().unwrap();
let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
seed_person_row(&mut db, "Alice", Some(30)).await;
let before_version = db.snapshot().version();
let before_version = db.snapshot().await.version();
let desired = TEST_SCHEMA
.replace("node Person {\n", "node Human @rename_from(\"Person\") {\n")
@ -1603,7 +1722,7 @@ edge WorksAt: Person -> Company
);
db.apply_schema(&desired).await.unwrap();
let head = db.snapshot();
let head = db.snapshot().await;
assert!(head.entry("node:Person").is_none());
assert!(head.entry("node:Human").is_some());
let historical = ManifestCoordinator::snapshot_at(uri, None, before_version)
@ -1633,7 +1752,7 @@ edge WorksAt: Person -> Company
.await
.unwrap();
let all_branches = db.coordinator.all_branches().await.unwrap();
let all_branches = db.coordinator.read().await.all_branches().await.unwrap();
assert!(
!all_branches.iter().any(|b| is_internal_run_branch(b)),
"run branch should be deleted after publish, got: {:?}",
@ -1657,7 +1776,7 @@ edge WorksAt: Person -> Company
let desired = TEST_SCHEMA.replace("name: String @key", "name: String @key @index");
db.apply_schema(&desired).await.unwrap();
let snapshot = db.snapshot();
let snapshot = db.snapshot().await;
let ds = snapshot.open("node:Person").await.unwrap();
assert!(db.table_store().has_fts_index(&ds, "name").await.unwrap());
}
@ -1676,7 +1795,7 @@ edge WorksAt: Person -> Company
);
db.apply_schema(&desired).await.unwrap();
let snapshot = db.snapshot();
let snapshot = db.snapshot().await;
let ds = snapshot.open("node:Person").await.unwrap();
assert!(db.table_store().has_btree_index(&ds, "id").await.unwrap());
assert!(db.table_store().has_fts_index(&ds, "name").await.unwrap());
@ -1689,11 +1808,16 @@ edge WorksAt: Person -> Company
let db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
let mut db = db;
db.coordinator
.write()
.await
.branch_create(SCHEMA_APPLY_LOCK_BRANCH)
.await
.unwrap();
let err = db.open_for_mutation("node:Person").await.unwrap_err();
let err = db
.open_for_mutation("node:Person", crate::db::MutationOpKind::Insert)
.await
.unwrap_err();
assert!(
err.to_string()
.contains("write is unavailable while schema apply is in progress")
@ -1706,6 +1830,8 @@ edge WorksAt: Person -> Company
let uri = dir.path().to_str().unwrap();
let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
db.coordinator
.write()
.await
.branch_create(SCHEMA_APPLY_LOCK_BRANCH)
.await
.unwrap();
@ -1723,6 +1849,8 @@ edge WorksAt: Person -> Company
let uri = dir.path().to_str().unwrap();
let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
db.coordinator
.write()
.await
.branch_create(SCHEMA_APPLY_LOCK_BRANCH)
.await
.unwrap();

View file

@ -16,7 +16,7 @@ pub(super) async fn entity_at(
id: &str,
version: u64,
) -> Result<Option<serde_json::Value>> {
let snap = db.coordinator.snapshot_at_version(version).await?;
let snap = db.coordinator.read().await.snapshot_at_version(version).await?;
entity_from_snapshot(db, &snap, table_key, id).await
}
@ -142,7 +142,8 @@ async fn export_table_to_writer<W: Write>(
.open_snapshot_table(snapshot, table_key)
.await?;
let ordering = Some(vec![ColumnOrdering::asc_nulls_last("id".to_string())]);
let blob_properties = blob_properties_for_table_key(db.catalog(), table_key)?;
let catalog = db.catalog();
let blob_properties = blob_properties_for_table_key(&catalog, table_key)?;
if blob_properties.is_empty() {
for batch in db.table_store.scan(&ds, None, None, ordering).await? {
@ -207,9 +208,9 @@ fn write_export_rows_from_batch<W: Write>(
blob_values: Option<&HashMap<String, Vec<Option<String>>>>,
writer: &mut W,
) -> Result<()> {
let catalog = db.catalog();
if let Some(type_name) = table_key.strip_prefix("node:") {
let node_type = db
.catalog
let node_type = catalog
.node_types
.get(type_name)
.ok_or_else(|| OmniError::manifest(format!("unknown node type '{}'", type_name)))?;
@ -243,8 +244,7 @@ fn write_export_rows_from_batch<W: Write>(
}
if let Some(edge_name) = table_key.strip_prefix("edge:") {
let edge_type = db
.catalog
let edge_type = catalog
.edge_types
.get(edge_name)
.ok_or_else(|| OmniError::manifest(format!("unknown edge type '{}'", edge_name)))?;

View file

@ -74,14 +74,14 @@ pub struct TableCleanupStats {
/// Run Lance `compact_files` on every node + edge table on `main`.
/// Tables run in parallel (bounded concurrency).
pub async fn optimize_all_tables(db: &mut Omnigraph) -> Result<Vec<TableOptimizeStats>> {
pub async fn optimize_all_tables(db: &Omnigraph) -> Result<Vec<TableOptimizeStats>> {
db.ensure_schema_state_valid().await?;
db.ensure_schema_apply_idle("optimize").await?;
let resolved = db.resolved_branch_target(None).await?;
let snapshot = resolved.snapshot;
let table_tasks: Vec<_> = all_table_keys(&db.catalog)
let table_tasks: Vec<_> = all_table_keys(&db.catalog())
.into_iter()
.filter_map(|table_key| {
let entry = snapshot.entry(&table_key)?;
@ -144,7 +144,7 @@ pub async fn cleanup_all_tables(
let resolved = db.resolved_branch_target(None).await?;
let snapshot = resolved.snapshot;
let table_tasks: Vec<_> = all_table_keys(&db.catalog)
let table_tasks: Vec<_> = all_table_keys(&db.catalog())
.into_iter()
.filter_map(|table_key| {
let entry = snapshot.entry(&table_key)?;

View file

@ -12,7 +12,7 @@ pub(super) async fn plan_schema(
}
pub(super) async fn apply_schema(
db: &mut Omnigraph,
db: &Omnigraph,
desired_schema_source: &str,
) -> Result<SchemaApplyResult> {
acquire_schema_apply_lock(db).await?;
@ -27,11 +27,11 @@ pub(super) async fn apply_schema(
}
pub(super) async fn apply_schema_with_lock(
db: &mut Omnigraph,
db: &Omnigraph,
desired_schema_source: &str,
) -> Result<SchemaApplyResult> {
db.ensure_schema_state_valid().await?;
let branches = db.coordinator.all_branches().await?;
let branches = db.coordinator.read().await.all_branches().await?;
// Skip `main` and internal system branches. The schema-apply lock branch
// is excluded because it is the cluster-wide schema-apply serializer.
// `__run__*` branches are no longer created; the filter remains as
@ -67,7 +67,7 @@ pub(super) async fn apply_schema_with_lock(
return Ok(SchemaApplyResult {
supported: true,
applied: false,
manifest_version: db.version(),
manifest_version: db.version().await,
steps: plan.steps,
});
}
@ -75,7 +75,7 @@ pub(super) async fn apply_schema_with_lock(
let mut desired_catalog = build_catalog_from_ir(&desired_ir)?;
fixup_blob_schemas(&mut desired_catalog);
let snapshot = db.snapshot();
let snapshot = db.snapshot().await;
let base_manifest_version = snapshot.version();
let mut added_tables = BTreeSet::new();
let mut renamed_tables = HashMap::new();
@ -209,6 +209,26 @@ pub(super) async fn apply_schema_with_lock(
});
}
// Acquire per-(table_key, branch) queues for every existing table
// that schema_apply will rewrite or re-index. New tables (added or
// renamed targets) aren't acquired — they have no existing dataset
// to race against. Held across the per-table commit loop and the
// manifest publish via `commit_changes_with_actor` below.
//
// Schema-apply already holds the graph-wide `__schema_apply_lock__`
// sentinel branch, so under PR 1b's intermediate state these
// per-table acquisitions are uncontended. They exist for symmetry
// with future MR-870 recovery, which will need queue acquisition
// before any `Dataset::restore` it issues for SchemaApply sidecars.
let schema_apply_queue_keys: Vec<(String, Option<String>)> = recovery_pins
.iter()
.map(|pin| (pin.table_key.clone(), pin.table_branch.clone()))
.collect();
let _schema_apply_queue_guards = db
.write_queue()
.acquire_many(&schema_apply_queue_keys)
.await;
let recovery_handle = if recovery_pins.is_empty()
&& sidecar_registrations.is_empty()
&& sidecar_tombstones.is_empty()
@ -225,7 +245,10 @@ pub(super) async fn apply_schema_with_lock(
let mut sidecar = crate::db::manifest::new_sidecar(
crate::db::manifest::SidecarKind::SchemaApply,
None,
db.audit_actor_id.clone(),
// `apply_schema` doesn't currently take an actor (no `apply_schema_as`
// public API). The HTTP server's /schema/apply handler can pass actor
// context through a follow-up addition. For now, system-attributed.
None,
recovery_pins,
);
sidecar.additional_registrations = sidecar_registrations;
@ -266,11 +289,12 @@ pub(super) async fn apply_schema_with_lock(
})?;
ensure_snapshot_entry_head_matches(db, source_entry).await?;
let source_ds = snapshot.open(source_table_key).await?;
let current_catalog = db.catalog();
let batch = batch_for_schema_apply_rewrite(
db,
&source_ds,
source_table_key,
&db.catalog,
&current_catalog,
target_table_key,
&desired_catalog,
property_renames.get(target_table_key),
@ -311,11 +335,12 @@ pub(super) async fn apply_schema_with_lock(
})?;
ensure_snapshot_entry_head_matches(db, entry).await?;
let source_ds = snapshot.open(table_key).await?;
let current_catalog = db.catalog();
let batch = batch_for_schema_apply_rewrite(
db,
&source_ds,
table_key,
&db.catalog,
&current_catalog,
table_key,
&desired_catalog,
property_renames.get(table_key),
@ -418,11 +443,11 @@ pub(super) async fn apply_schema_with_lock(
}
db.refresh_coordinator_only().await?;
if db.version() != base_manifest_version {
let current_manifest_version = db.version().await;
if current_manifest_version != base_manifest_version {
return Err(OmniError::manifest_conflict(format!(
"schema apply lost its write lease: main advanced from v{} to v{} while schema apply was in progress",
base_manifest_version,
db.version()
base_manifest_version, current_manifest_version,
)));
}
@ -444,13 +469,15 @@ pub(super) async fn apply_schema_with_lock(
crate::failpoints::maybe_fail("schema_apply.after_staging_write")?;
let actor_id = db.current_audit_actor().map(str::to_string);
// `apply_schema` doesn't currently take an actor; system-attributed.
let PublishedSnapshot {
manifest_version,
_snapshot_id: _,
} = db
.coordinator
.commit_changes_with_actor(&manifest_changes, actor_id.as_deref())
.write()
.await
.commit_changes_with_actor(&manifest_changes, None)
.await?;
crate::failpoints::maybe_fail("schema_apply.after_manifest_commit")?;
@ -471,9 +498,9 @@ pub(super) async fn apply_schema_with_lock(
)
.await?;
db.catalog = desired_catalog;
db.schema_source = desired_schema_source.to_string();
db.coordinator.refresh().await?;
db.store_catalog(desired_catalog);
db.store_schema_source(desired_schema_source.to_string());
db.coordinator.write().await.refresh().await?;
db.runtime_cache.invalidate_all().await;
if changed_edge_tables {
db.invalidate_graph_index().await;
@ -504,15 +531,15 @@ pub(super) async fn apply_schema_with_lock(
})
}
pub(super) async fn ensure_schema_apply_idle(db: &mut Omnigraph, operation: &str) -> Result<()> {
pub(super) async fn ensure_schema_apply_idle(db: &Omnigraph, operation: &str) -> Result<()> {
db.refresh_coordinator_only().await?;
ensure_schema_apply_not_locked(db, operation).await
}
pub(super) async fn acquire_schema_apply_lock(db: &mut Omnigraph) -> Result<()> {
pub(super) async fn acquire_schema_apply_lock(db: &Omnigraph) -> Result<()> {
db.ensure_schema_state_valid().await?;
db.refresh_coordinator_only().await?;
let branches = db.coordinator.all_branches().await?;
let branches = db.coordinator.read().await.all_branches().await?;
if branches
.iter()
.any(|branch| is_schema_apply_lock_branch(branch))
@ -523,12 +550,16 @@ pub(super) async fn acquire_schema_apply_lock(db: &mut Omnigraph) -> Result<()>
}
db.coordinator
.write()
.await
.branch_create(SCHEMA_APPLY_LOCK_BRANCH)
.await?;
db.refresh_coordinator_only().await?;
let blocking_branches = db
.coordinator
.read()
.await
.all_branches()
.await?
.into_iter()
@ -545,8 +576,10 @@ pub(super) async fn acquire_schema_apply_lock(db: &mut Omnigraph) -> Result<()>
Ok(())
}
pub(super) async fn release_schema_apply_lock(db: &mut Omnigraph) -> Result<()> {
pub(super) async fn release_schema_apply_lock(db: &Omnigraph) -> Result<()> {
db.coordinator
.write()
.await
.branch_delete(SCHEMA_APPLY_LOCK_BRANCH)
.await?;
// Use refresh_coordinator_only — the full Omnigraph::refresh would
@ -560,6 +593,8 @@ pub(super) async fn release_schema_apply_lock(db: &mut Omnigraph) -> Result<()>
pub(super) async fn ensure_schema_apply_not_locked(db: &Omnigraph, operation: &str) -> Result<()> {
if db
.coordinator
.read()
.await
.all_branches()
.await?
.iter()

View file

@ -2,31 +2,31 @@ use super::*;
pub(super) async fn graph_index(db: &Omnigraph) -> Result<Arc<crate::graph_index::GraphIndex>> {
db.ensure_schema_state_valid().await?;
let resolved = db
.coordinator
let coord = db.coordinator.read().await;
let resolved = coord
.resolve_target(&ReadTarget::Branch(
db.coordinator
.current_branch()
.unwrap_or("main")
.to_string(),
coord.current_branch().unwrap_or("main").to_string(),
))
.await?;
db.runtime_cache.graph_index(&resolved, &db.catalog).await
drop(coord);
let catalog = db.catalog();
db.runtime_cache.graph_index(&resolved, &catalog).await
}
pub(super) async fn graph_index_for_resolved(
db: &Omnigraph,
resolved: &ResolvedTarget,
) -> Result<Arc<crate::graph_index::GraphIndex>> {
db.runtime_cache.graph_index(resolved, &db.catalog).await
let catalog = db.catalog();
db.runtime_cache.graph_index(resolved, &catalog).await
}
pub(super) async fn ensure_indices(db: &mut Omnigraph) -> Result<()> {
let current_branch = db.coordinator.current_branch().map(str::to_string);
pub(super) async fn ensure_indices(db: &Omnigraph) -> Result<()> {
let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
ensure_indices_for_branch(db, current_branch.as_deref()).await
}
pub(super) async fn ensure_indices_on(db: &mut Omnigraph, branch: &str) -> Result<()> {
pub(super) async fn ensure_indices_on(db: &Omnigraph, branch: &str) -> Result<()> {
let branch = normalize_branch_name(branch)?;
ensure_indices_for_branch(db, branch.as_deref()).await
}
@ -58,12 +58,18 @@ pub(super) async fn failpoint_publish_table_head_without_index_rebuild_for_test(
};
let mut expected = std::collections::HashMap::new();
expected.insert(table_key.to_string(), entry.table_version);
commit_prepared_updates_on_branch_with_expected(db, branch.as_deref(), &[update], &expected)
.await
commit_prepared_updates_on_branch_with_expected(
db,
branch.as_deref(),
&[update],
&expected,
None,
)
.await
}
pub(super) async fn ensure_indices_for_branch(
db: &mut Omnigraph,
db: &Omnigraph,
branch: Option<&str>,
) -> Result<()> {
db.ensure_schema_state_valid().await?;
@ -72,6 +78,7 @@ pub(super) async fn ensure_indices_for_branch(
let snapshot = resolved.snapshot;
let mut updates = Vec::new();
let active_branch = resolved.branch;
let catalog = db.catalog();
// Recovery sidecar: protect the per-table commit_staged loop in
// build_indices_on_dataset (one commit per index built). Only pins
@ -83,7 +90,7 @@ pub(super) async fn ensure_indices_for_branch(
// committed work on sibling tables. Steady-state runs (everything
// already indexed) skip the sidecar entirely.
let mut recovery_pins: Vec<crate::db::manifest::SidecarTablePin> = Vec::new();
for type_name in db.catalog.node_types.keys() {
for type_name in catalog.node_types.keys() {
let table_key = format!("node:{}", type_name);
let Some(entry) = snapshot.entry(&table_key) else {
continue;
@ -122,7 +129,7 @@ pub(super) async fn ensure_indices_for_branch(
});
}
}
for edge_name in db.catalog.edge_types.keys() {
for edge_name in catalog.edge_types.keys() {
let table_key = format!("edge:{}", edge_name);
let Some(entry) = snapshot.entry(&table_key) else {
continue;
@ -147,13 +154,28 @@ pub(super) async fn ensure_indices_for_branch(
});
}
}
// Acquire per-(table_key, active_branch) queues for every table
// that needs index work. Held across the per-table commit loop and
// the manifest publish at the end of this function. Sorted-order
// acquisition prevents lock-order inversion against concurrent
// multi-table writers (mutation finalize, branch_merge, future
// MR-870 recovery). Under PR 1b's intermediate state (global server
// RwLock still in place), this acquisition is uncontended.
let queue_keys: Vec<(String, Option<String>)> = recovery_pins
.iter()
.map(|pin| (pin.table_key.clone(), pin.table_branch.clone()))
.collect();
let _queue_guards = db.write_queue().acquire_many(&queue_keys).await;
let recovery_handle = if recovery_pins.is_empty() {
None
} else {
let sidecar = crate::db::manifest::new_sidecar(
crate::db::manifest::SidecarKind::EnsureIndices,
active_branch.clone(),
db.audit_actor_id.clone(),
// `ensure_indices` doesn't currently take an actor; system-attributed.
// Future: add `ensure_indices_as` to thread actor context.
None,
recovery_pins,
);
Some(
@ -162,7 +184,7 @@ pub(super) async fn ensure_indices_for_branch(
)
};
for type_name in db.catalog.node_types.keys() {
for type_name in catalog.node_types.keys() {
let table_key = format!("node:{}", type_name);
let Some(entry) = snapshot.entry(&table_key) else {
continue;
@ -179,6 +201,7 @@ pub(super) async fn ensure_indices_for_branch(
entry.table_branch.as_deref(),
entry.table_version,
active_branch,
crate::db::MutationOpKind::SchemaRewrite,
)
.await?
}
@ -209,7 +232,7 @@ pub(super) async fn ensure_indices_for_branch(
}
}
for edge_name in db.catalog.edge_types.keys() {
for edge_name in catalog.edge_types.keys() {
let table_key = format!("edge:{}", edge_name);
let Some(entry) = snapshot.entry(&table_key) else {
continue;
@ -226,6 +249,7 @@ pub(super) async fn ensure_indices_for_branch(
entry.table_branch.as_deref(),
entry.table_version,
active_branch,
crate::db::MutationOpKind::SchemaRewrite,
)
.await?
}
@ -264,7 +288,7 @@ pub(super) async fn ensure_indices_for_branch(
crate::failpoints::maybe_fail("ensure_indices.post_phase_b_pre_manifest_commit")?;
if !updates.is_empty() {
commit_prepared_updates_on_branch(db, branch, &updates).await?;
commit_prepared_updates_on_branch(db, branch, &updates, None).await?;
}
// Recovery sidecar lifecycle: delete after the manifest publish (or
@ -321,7 +345,8 @@ async fn needs_index_work_node(
if !db.table_store.has_btree_index(&ds, "id").await? {
return Ok(true);
}
let Some(node_type) = db.catalog.node_types.get(type_name) else {
let catalog = db.catalog();
let Some(node_type) = catalog.node_types.get(type_name) else {
return Ok(false);
};
for index_cols in &node_type.indices {
@ -376,15 +401,23 @@ async fn needs_index_work_edge(
pub(super) async fn open_for_mutation(
db: &Omnigraph,
table_key: &str,
op_kind: crate::db::MutationOpKind,
) -> Result<(Dataset, String, Option<String>)> {
let current_branch = db.coordinator.current_branch().map(str::to_string);
open_for_mutation_on_branch(db, current_branch.as_deref(), table_key).await
let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
open_for_mutation_on_branch(db, current_branch.as_deref(), table_key, op_kind).await
}
/// Open a sub-table for mutation. The `op_kind` selects the strict-vs-relaxed
/// pre-stage version-check policy — see [`crate::db::MutationOpKind`] for the
/// rationale per kind. Insert / Merge skip the strict
/// `ensure_expected_version` check (Lance's natural conflict resolver +
/// per-(table, branch) queue + publisher CAS handle drift); Update / Delete /
/// SchemaRewrite keep it (read-modify-write SI).
pub(super) async fn open_for_mutation_on_branch(
db: &Omnigraph,
branch: Option<&str>,
table_key: &str,
op_kind: crate::db::MutationOpKind,
) -> Result<(Dataset, String, Option<String>)> {
db.ensure_schema_apply_not_locked("write").await?;
let resolved = db.resolved_branch_target(branch).await?;
@ -399,8 +432,10 @@ pub(super) async fn open_for_mutation_on_branch(
.table_store
.open_dataset_head_for_write(table_key, &full_path, None)
.await?;
db.table_store
.ensure_expected_version(&ds, table_key, entry.table_version)?;
if op_kind.strict_pre_stage_version_check() {
db.table_store
.ensure_expected_version(&ds, table_key, entry.table_version)?;
}
Ok((ds, full_path, None))
}
Some(active_branch) => {
@ -411,6 +446,7 @@ pub(super) async fn open_for_mutation_on_branch(
entry.table_branch.as_deref(),
entry.table_version,
active_branch,
op_kind,
)
.await?;
Ok((ds, full_path, table_branch))
@ -425,6 +461,7 @@ pub(super) async fn open_owned_dataset_for_branch_write(
entry_branch: Option<&str>,
entry_version: u64,
active_branch: &str,
op_kind: crate::db::MutationOpKind,
) -> Result<(Dataset, Option<String>)> {
match entry_branch {
Some(branch) if branch == active_branch => {
@ -432,8 +469,10 @@ pub(super) async fn open_owned_dataset_for_branch_write(
.table_store
.open_dataset_head_for_write(table_key, full_path, Some(active_branch))
.await?;
db.table_store
.ensure_expected_version(&ds, table_key, entry_version)?;
if op_kind.strict_pre_stage_version_check() {
db.table_store
.ensure_expected_version(&ds, table_key, entry_version)?;
}
Ok((ds, Some(active_branch.to_string())))
}
source_branch => {
@ -450,8 +489,10 @@ pub(super) async fn open_owned_dataset_for_branch_write(
.table_store
.open_dataset_head_for_write(table_key, full_path, Some(active_branch))
.await?;
db.table_store
.ensure_expected_version(&ds, table_key, entry_version)?;
if op_kind.strict_pre_stage_version_check() {
db.table_store
.ensure_expected_version(&ds, table_key, entry_version)?;
}
Ok((ds, Some(active_branch.to_string())))
}
}
@ -482,11 +523,27 @@ pub(super) async fn reopen_for_mutation(
full_path: &str,
table_branch: Option<&str>,
expected_version: u64,
op_kind: crate::db::MutationOpKind,
) -> Result<Dataset> {
db.ensure_schema_apply_not_locked("write").await?;
db.table_store
.reopen_for_mutation(full_path, table_branch, table_key, expected_version)
.await
if op_kind.strict_pre_stage_version_check() {
db.table_store
.reopen_for_mutation(full_path, table_branch, table_key, expected_version)
.await
} else {
// Insert / Merge: skip the strict version check. Open at HEAD —
// Lance's natural conflict resolver at commit_staged time
// (rebase append, dedupe merge_insert) handles concurrent
// writers correctly; the publisher CAS in
// `MutationStaging::commit_all` (refreshed under the
// per-(table, branch) queue via `snapshot_for_branch`) catches
// genuine cross-process drift as 409. See
// [`crate::db::MutationOpKind`] for the policy rationale.
let _ = expected_version;
db.table_store
.open_dataset_head_for_write(table_key, full_path, table_branch)
.await
}
}
pub(super) async fn open_dataset_at_state(
@ -505,7 +562,8 @@ pub(super) async fn build_indices_on_dataset(
table_key: &str,
ds: &mut Dataset,
) -> Result<()> {
build_indices_on_dataset_for_catalog(db, &db.catalog, table_key, ds).await
let catalog = db.catalog();
build_indices_on_dataset_for_catalog(db, &catalog, table_key, ds).await
}
pub(super) async fn build_indices_on_dataset_for_catalog(
@ -680,12 +738,19 @@ async fn prepare_updates_for_commit(
let mut prepared_update = update.clone();
if prepared_update.row_count > 0 {
let full_path = format!("{}/{}", db.root_uri, entry.table_path);
// Strict version check is correct here: this runs INSIDE
// the publisher commit path, after `commit_staged` already
// advanced Lance HEAD to `prepared_update.table_version`.
// The check is a defense-in-depth assertion that the
// dataset state matches what we just committed; not the
// pre-stage race the op-kind policy targets.
let mut ds = reopen_for_mutation(
db,
&prepared_update.table_key,
&full_path,
prepared_update.table_branch.as_deref(),
prepared_update.table_version,
crate::db::MutationOpKind::SchemaRewrite,
)
.await?;
build_indices_on_dataset(db, &prepared_update.table_key, &mut ds).await?;
@ -702,49 +767,50 @@ async fn prepare_updates_for_commit(
}
async fn commit_prepared_updates(
db: &mut Omnigraph,
db: &Omnigraph,
updates: &[crate::db::SubTableUpdate],
actor_id: Option<&str>,
) -> Result<u64> {
let actor_id = db.current_audit_actor().map(str::to_string);
let PublishedSnapshot {
manifest_version,
_snapshot_id: _,
} = db
.coordinator
.commit_updates_with_actor(updates, actor_id.as_deref())
.write()
.await
.commit_updates_with_actor(updates, actor_id)
.await?;
Ok(manifest_version)
}
async fn commit_prepared_updates_with_expected(
db: &mut Omnigraph,
db: &Omnigraph,
updates: &[crate::db::SubTableUpdate],
expected_table_versions: &std::collections::HashMap<String, u64>,
actor_id: Option<&str>,
) -> Result<u64> {
let actor_id = db.current_audit_actor().map(str::to_string);
let PublishedSnapshot {
manifest_version,
_snapshot_id: _,
} = db
.coordinator
.commit_updates_with_actor_with_expected(
updates,
expected_table_versions,
actor_id.as_deref(),
)
.write()
.await
.commit_updates_with_actor_with_expected(updates, expected_table_versions, actor_id)
.await?;
Ok(manifest_version)
}
pub(super) async fn commit_prepared_updates_on_branch(
db: &mut Omnigraph,
db: &Omnigraph,
branch: Option<&str>,
updates: &[crate::db::SubTableUpdate],
actor_id: Option<&str>,
) -> Result<u64> {
let current_branch = db.coordinator.current_branch().map(str::to_string);
let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
let requested_branch = branch.map(str::to_string);
if requested_branch == current_branch {
return commit_prepared_updates(db, updates).await;
return commit_prepared_updates(db, updates, actor_id).await;
}
let mut coordinator = match requested_branch.as_deref() {
@ -753,26 +819,32 @@ pub(super) async fn commit_prepared_updates_on_branch(
}
None => GraphCoordinator::open(db.uri(), Arc::clone(&db.storage)).await?,
};
let actor_id = db.current_audit_actor().map(str::to_string);
let PublishedSnapshot {
manifest_version,
_snapshot_id: _,
} = coordinator
.commit_updates_with_actor(updates, actor_id.as_deref())
.commit_updates_with_actor(updates, actor_id)
.await?;
Ok(manifest_version)
}
pub(super) async fn commit_prepared_updates_on_branch_with_expected(
db: &mut Omnigraph,
db: &Omnigraph,
branch: Option<&str>,
updates: &[crate::db::SubTableUpdate],
expected_table_versions: &std::collections::HashMap<String, u64>,
actor_id: Option<&str>,
) -> Result<u64> {
let current_branch = db.coordinator.current_branch().map(str::to_string);
let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
let requested_branch = branch.map(str::to_string);
if requested_branch == current_branch {
return commit_prepared_updates_with_expected(db, updates, expected_table_versions).await;
return commit_prepared_updates_with_expected(
db,
updates,
expected_table_versions,
actor_id,
)
.await;
}
let mut coordinator = match requested_branch.as_deref() {
@ -781,16 +853,11 @@ pub(super) async fn commit_prepared_updates_on_branch_with_expected(
}
None => GraphCoordinator::open(db.uri(), Arc::clone(&db.storage)).await?,
};
let actor_id = db.current_audit_actor().map(str::to_string);
let PublishedSnapshot {
manifest_version,
_snapshot_id: _,
} = coordinator
.commit_updates_with_actor_with_expected(
updates,
expected_table_versions,
actor_id.as_deref(),
)
.commit_updates_with_actor_with_expected(updates, expected_table_versions, actor_id)
.await?;
Ok(manifest_version)
}
@ -803,31 +870,31 @@ pub(super) async fn commit_updates(
updates: &[crate::db::SubTableUpdate],
) -> Result<u64> {
db.ensure_schema_apply_not_locked("write commit").await?;
let current_branch = db.coordinator.current_branch().map(str::to_string);
let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
let prepared = prepare_updates_for_commit(db, current_branch.as_deref(), updates).await?;
commit_prepared_updates(db, &prepared).await
commit_prepared_updates(db, &prepared, None).await
}
pub(super) async fn commit_manifest_updates(
db: &mut Omnigraph,
db: &Omnigraph,
updates: &[crate::db::SubTableUpdate],
) -> Result<u64> {
db.coordinator.commit_manifest_updates(updates).await
db.coordinator.write().await.commit_manifest_updates(updates).await
}
pub(super) async fn record_merge_commit(
db: &mut Omnigraph,
db: &Omnigraph,
manifest_version: u64,
parent_commit_id: &str,
merged_parent_commit_id: &str,
actor_id: Option<&str>,
) -> Result<String> {
let actor_id = db.current_audit_actor().map(str::to_string);
db.coordinator
db.coordinator.write().await
.record_merge_commit(
manifest_version,
parent_commit_id,
merged_parent_commit_id,
actor_id.as_deref(),
actor_id,
)
.await
.map(|snapshot_id| snapshot_id.as_str().to_string())
@ -837,19 +904,26 @@ pub(super) async fn record_merge_commit(
/// `expected_table_versions` map asserts the manifest's pre-write per-table
/// versions; mismatches surface as `ManifestConflictDetails::ExpectedVersionMismatch`.
pub(super) async fn commit_updates_on_branch_with_expected(
db: &mut Omnigraph,
db: &Omnigraph,
branch: Option<&str>,
updates: &[crate::db::SubTableUpdate],
expected_table_versions: &std::collections::HashMap<String, u64>,
actor_id: Option<&str>,
) -> Result<u64> {
db.ensure_schema_apply_not_locked("write commit").await?;
let prepared = prepare_updates_for_commit(db, branch, updates).await?;
commit_prepared_updates_on_branch_with_expected(db, branch, &prepared, expected_table_versions)
.await
commit_prepared_updates_on_branch_with_expected(
db,
branch,
&prepared,
expected_table_versions,
actor_id,
)
.await
}
pub(super) async fn ensure_commit_graph_initialized(db: &mut Omnigraph) -> Result<()> {
db.coordinator.ensure_commit_graph_initialized().await
pub(super) async fn ensure_commit_graph_initialized(db: &Omnigraph) -> Result<()> {
db.coordinator.write().await.ensure_commit_graph_initialized().await
}
pub(super) async fn invalidate_graph_index(db: &Omnigraph) {

View file

@ -0,0 +1,231 @@
//! Per-`(table_key, branch)` writer queues — MR-686 scaffolding.
//!
//! Today every server-layer write serializes on the global
//! `Arc<RwLock<Omnigraph>>` in `AppState`. MR-686 replaces that with
//! per-`(table_key, branch_ref)` queues so disjoint-key writes proceed
//! concurrently. This module owns the queue data structure; callers in
//! `MutationStaging::commit_all`, `branch_merge`, `schema_apply`,
//! `ensure_indices`, `delete_where`, and the future MR-870 recovery
//! reconciler acquire guards before any per-table Lance commit.
//!
//! ## Why exclusive `tokio::sync::Mutex<()>` per key
//!
//! Lance's `Dataset::restore` "wins" against concurrent Append/Update/
//! Delete/CreateIndex/Merge per `check_restore_txn`, silently orphaning
//! the concurrent writer's commit. The queue's *only* application-layer
//! job is to serialize Restore against every other writer on the same
//! `(table_key, branch_ref)`. Lance OCC handles the rest of the conflict
//! matrix (Append vs Append fully compatible, Update vs Update rebases or
//! retries, etc.) but cannot make Restore symmetric — that's an upstream
//! design choice. Until Lance fixes Restore (or BatchCommitTables
//! changes the protocol), every writer takes the same exclusive lock.
//!
//! `RwLock` (shared for normal writes, exclusive for Restore) is the
//! natural follow-up but adds a writer-classification surface that's
//! easy to get wrong; misclassifying any writer reintroduces the
//! orphaning hazard. We start with `Mutex` and revisit based on
//! production telemetry.
//!
//! ## Sorted-order acquisition
//!
//! `acquire_many` accepts a slice of keys and acquires them in
//! lexicographic order. Multi-table writers (mutation finalize,
//! branch_merge, future recovery reconciler) MUST go through
//! `acquire_many` so all callers agree on acquisition order — this is
//! how lock-order inversion deadlock is prevented.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use tokio::sync::{Mutex as AsyncMutex, OwnedMutexGuard};
/// Queue key: `(table_key, branch_ref)`. `branch_ref = None` means main.
///
/// Branch is part of the key because the same Lance dataset can be
/// pinned at different versions on different branches; concurrent
/// writes to the same `table_key` on disjoint branches must NOT
/// serialize at the queue.
pub(crate) type TableQueueKey = (String, Option<String>);
/// Per-`(table_key, branch)` writer queue manager.
///
/// Lives on `Omnigraph` as `Arc<WriteQueueManager>` so HTTP handlers,
/// engine internals, the CLI binary, and future background reconcilers
/// (MR-870 recovery, MR-848 index) all reach it via the engine handle.
#[derive(Default)]
pub(crate) struct WriteQueueManager {
/// Held only briefly per `acquire` call: clone out the per-key Arc,
/// release the std mutex, then await the per-key tokio Mutex.
queues: Mutex<HashMap<TableQueueKey, Arc<AsyncMutex<()>>>>,
}
impl WriteQueueManager {
pub(crate) fn new() -> Self {
Self::default()
}
/// Get-or-create the per-key queue and clone its Arc.
fn slot(&self, key: &TableQueueKey) -> Arc<AsyncMutex<()>> {
let mut map = self.queues.lock().expect("write queue map poisoned");
if let Some(existing) = map.get(key) {
return Arc::clone(existing);
}
let fresh = Arc::new(AsyncMutex::new(()));
map.insert(key.clone(), Arc::clone(&fresh));
fresh
}
/// Acquire exclusive access to the queue for one `(table_key, branch)`.
///
/// Blocks until the lock is available. Drop the returned guard to
/// release; the lock outlives the `WriteQueueManager` borrow.
pub(crate) async fn acquire(&self, key: &TableQueueKey) -> OwnedMutexGuard<()> {
self.slot(key).lock_owned().await
}
/// Acquire exclusive access to many `(table_key, branch)` keys
/// atomically, in lex-sorted order. Used by multi-table writers
/// (mutation finalize, branch_merge, recovery) so all callers
/// agree on acquisition order — prevents lock-order inversion.
///
/// Empty input returns an empty Vec without touching the map.
/// Duplicates in `keys` are deduped before acquisition (the same
/// key acquired twice would deadlock against itself).
pub(crate) async fn acquire_many(
&self,
keys: &[TableQueueKey],
) -> Vec<OwnedMutexGuard<()>> {
if keys.is_empty() {
return Vec::new();
}
let mut sorted: Vec<TableQueueKey> = keys.to_vec();
sorted.sort();
sorted.dedup();
let mut guards = Vec::with_capacity(sorted.len());
for key in &sorted {
guards.push(self.acquire(key).await);
}
guards
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::time::{Duration, Instant};
use tokio::time::timeout;
fn key(table: &str, branch: Option<&str>) -> TableQueueKey {
(table.to_string(), branch.map(str::to_string))
}
#[tokio::test]
async fn acquire_many_empty_returns_empty() {
let qm = WriteQueueManager::new();
let guards = qm.acquire_many(&[]).await;
assert!(guards.is_empty());
}
#[tokio::test]
async fn acquire_many_dedupes_repeated_keys() {
// Same key passed twice would deadlock if not deduped.
let qm = WriteQueueManager::new();
let k = key("t1", None);
let guards = timeout(
Duration::from_secs(2),
qm.acquire_many(&[k.clone(), k.clone(), k]),
)
.await
.expect("acquire_many with duplicates deadlocked");
assert_eq!(guards.len(), 1);
}
#[tokio::test]
async fn acquire_many_sorts_keys_deterministically() {
// Two callers passing keys in different orders must acquire in
// the same internal order. We test this indirectly: caller A
// passes [a, c] and caller B passes [c, a]; if they both
// acquire in sorted order the second caller blocks on `a` first,
// not `c` — same as A — so no deadlock under any interleaving.
// Direct sort observation: call acquire_many with a reversed
// input and verify it doesn't deadlock against a held guard on
// the sorted-first key.
let qm = Arc::new(WriteQueueManager::new());
let a = key("a", None);
let z = key("z", None);
// Hold `a` exclusively.
let _held = qm.acquire(&a).await;
// acquire_many([z, a]) — must sort to [a, z] internally and
// block on `a`. With a 200ms timeout we should NOT see it
// complete (it's blocked on `a`).
let qm2 = Arc::clone(&qm);
let z_clone = z.clone();
let a_clone = a.clone();
let result = timeout(Duration::from_millis(200), async move {
qm2.acquire_many(&[z_clone, a_clone]).await
})
.await;
assert!(result.is_err(), "acquire_many should block on `a`, the lex-first key");
}
#[tokio::test]
async fn same_key_acquire_serializes() {
let qm = Arc::new(WriteQueueManager::new());
let k = key("t1", None);
let first = qm.acquire(&k).await;
// Second acquire on same key should NOT complete within 200ms.
let qm2 = Arc::clone(&qm);
let k2 = k.clone();
let blocked = timeout(Duration::from_millis(200), async move {
qm2.acquire(&k2).await
})
.await;
assert!(blocked.is_err(), "second acquire on same key must block");
// Drop the first guard, then second acquire should succeed.
drop(first);
let _second = timeout(Duration::from_secs(2), qm.acquire(&k))
.await
.expect("second acquire after release should not block");
}
#[tokio::test]
async fn disjoint_keys_acquire_concurrently() {
let qm = Arc::new(WriteQueueManager::new());
let a = key("a", None);
let b = key("b", None);
// Hold `a` indefinitely.
let _held_a = qm.acquire(&a).await;
// Acquire `b` on a different task. Should complete promptly
// because `b` is disjoint from `a`.
let qm2 = Arc::clone(&qm);
let start = Instant::now();
let _held_b = timeout(Duration::from_secs(2), qm2.acquire(&b))
.await
.expect("disjoint key acquire must not block on unrelated held key");
assert!(
start.elapsed() < Duration::from_millis(500),
"disjoint acquire took {:?}, should be near-instant",
start.elapsed()
);
}
#[tokio::test]
async fn disjoint_branches_on_same_table_do_not_serialize() {
// (table, main) and (table, feature) are different keys.
let qm = Arc::new(WriteQueueManager::new());
let main_k = key("t1", None);
let feature_k = key("t1", Some("feature"));
let _held_main = qm.acquire(&main_k).await;
let _held_feature = timeout(Duration::from_secs(2), qm.acquire(&feature_k))
.await
.expect("same-table-different-branch should not serialize");
}
}

View file

@ -855,8 +855,9 @@ async fn publish_adopted_source_state(
.ok_or_else(|| OmniError::manifest(format!("missing source entry for {}", table_key)))?;
let target_entry = target_snapshot.entry(table_key);
let target_active = target_db.active_branch().await;
match (
target_db.active_branch(),
target_active.as_deref(),
source_entry.table_branch.as_deref(),
) {
// Both on main — pointer switch is safe (same lineage, version columns valid)
@ -945,7 +946,13 @@ async fn publish_rewritten_merge_table(
table_key: &str,
staged: &StagedMergeResult,
) -> Result<crate::db::SubTableUpdate> {
let (ds, full_path, table_branch) = target_db.open_for_mutation(table_key).await?;
// Branch merge's source-rewrite path is Merge-shaped (upsert from
// source onto target). The inline `delete_where` later in this
// function operates on rows the rewrite chose to remove, not
// user-facing predicates, so Merge is the correct policy here.
let (ds, full_path, table_branch) = target_db
.open_for_mutation(table_key, crate::db::MutationOpKind::Merge)
.await?;
let mut current_ds = ds;
// Phase 1: merge_insert changed/new rows (preserves _row_created_at_version for
@ -1045,25 +1052,26 @@ async fn publish_rewritten_merge_table(
}
impl Omnigraph {
pub async fn branch_merge(&mut self, source: &str, target: &str) -> Result<MergeOutcome> {
pub async fn branch_merge(&self, source: &str, target: &str) -> Result<MergeOutcome> {
self.branch_merge_as(source, target, None).await
}
pub async fn branch_merge_as(
&mut self,
&self,
source: &str,
target: &str,
actor_id: Option<&str>,
) -> Result<MergeOutcome> {
self.ensure_schema_apply_idle("branch_merge").await?;
let previous_actor = self.audit_actor_id.clone();
self.audit_actor_id = actor_id.map(str::to_string);
let result = self.branch_merge_impl(source, target).await;
self.audit_actor_id = previous_actor;
result
self.branch_merge_impl(source, target, actor_id).await
}
async fn branch_merge_impl(&mut self, source: &str, target: &str) -> Result<MergeOutcome> {
async fn branch_merge_impl(
&self,
source: &str,
target: &str,
actor_id: Option<&str>,
) -> Result<MergeOutcome> {
if is_internal_run_branch(source) || is_internal_run_branch(target) {
return Err(OmniError::manifest(format!(
"branch_merge does not allow internal run refs ('{}' -> '{}')",
@ -1113,7 +1121,17 @@ impl Omnigraph {
))
.await?
.snapshot;
let previous_branch = self.active_branch().map(str::to_string);
// Hold the merge-exclusive mutex across the full swap → operate
// → restore window. Two concurrent branch_merge calls would
// otherwise interleave their three separate `coordinator.write()`
// acquisitions, leaving each merge's body running against the
// other's swapped coord. Pinned by
// `concurrent_branch_merges_distinct_targets_do_not_swap_into_each_other`
// in `crates/omnigraph-server/tests/server.rs`.
let merge_exclusive = self.merge_exclusive();
let _merge_guard = merge_exclusive.lock().await;
let previous_branch = self.active_branch().await;
let previous = self
.swap_coordinator_for_branch(target_branch.as_deref())
.await?;
@ -1124,9 +1142,10 @@ impl Omnigraph {
&target_head_commit_id,
&source_head_commit_id,
is_fast_forward,
actor_id,
)
.await;
self.restore_coordinator(previous);
self.restore_coordinator(previous).await;
if merge_result.is_ok() && previous_branch == target_branch {
self.refresh().await?;
@ -1136,15 +1155,16 @@ impl Omnigraph {
}
async fn branch_merge_on_current_target(
&mut self,
&self,
base_snapshot: &Snapshot,
source_snapshot: &Snapshot,
target_head_commit_id: &str,
source_head_commit_id: &str,
is_fast_forward: bool,
actor_id: Option<&str>,
) -> Result<MergeOutcome> {
self.ensure_commit_graph_initialized().await?;
let target_snapshot = self.snapshot();
let target_snapshot = self.snapshot().await;
let mut table_keys = HashSet::new();
for entry in base_snapshot.entries() {
@ -1180,7 +1200,7 @@ impl Omnigraph {
if let Some(staged) = stage_streaming_table_merge(
table_key,
self.catalog(),
&self.catalog(),
base_snapshot,
source_snapshot,
&target_snapshot,
@ -1227,6 +1247,55 @@ impl Omnigraph {
// requires pre-computing source deltas during candidate
// classification (a structural change to `CandidateTableState`)
// and is left as follow-up work.
// Acquire per-(table_key, target_branch) queues for every table
// touched by the merge plan. Sorted-order acquisition prevents
// lock-order inversion against concurrent multi-table writers.
// The active branch (set by the caller's `swap_coordinator_for_branch`)
// is the merge target; queue keys are scoped to it because a
// branch_merge writes only to the target branch.
//
// Held across the per-table publish loop and the manifest
// commit + record_merge_commit calls below. Under PR 1b's
// intermediate state (global server RwLock still in place),
// this acquisition is uncontended.
let active_branch_for_keys = self.active_branch().await;
let merge_queue_keys: Vec<(String, Option<String>)> = ordered_table_keys
.iter()
.filter(|table_key| {
matches!(
candidates.get(*table_key),
Some(CandidateTableState::RewriteMerged(_))
| Some(CandidateTableState::AdoptSourceState)
)
})
.map(|table_key| (table_key.clone(), active_branch_for_keys.clone()))
.collect();
let _merge_queue_guards = self.write_queue().acquire_many(&merge_queue_keys).await;
let post_queue_snapshot = self.snapshot().await;
for table_key in &ordered_table_keys {
let Some(candidate) = candidates.get(table_key) else {
continue;
};
if !matches!(
candidate,
CandidateTableState::RewriteMerged(_) | CandidateTableState::AdoptSourceState
) {
continue;
}
let expected = target_snapshot.entry(table_key).map(|e| e.table_version);
let current = post_queue_snapshot
.entry(table_key)
.map(|e| e.table_version);
if expected != current {
return Err(OmniError::manifest_expected_version_mismatch(
table_key.clone(),
expected.unwrap_or(0),
current.unwrap_or(0),
));
}
}
let recovery_pins: Vec<crate::db::manifest::SidecarTablePin> = ordered_table_keys
.iter()
.filter_map(|table_key| {
@ -1252,7 +1321,7 @@ impl Omnigraph {
// the orphaned post-Phase-B HEAD on the target ref.
// Same rationale as table_ops.rs:115-120 in
// ensure_indices_for_branch.
table_branch: self.active_branch().map(str::to_string),
table_branch: active_branch_for_keys.clone(),
})
})
.collect();
@ -1268,11 +1337,11 @@ impl Omnigraph {
// `branch_merge` calls `swap_coordinator_for_branch(target_branch)`
// before invoking this function, so `self.active_branch()`
// is the target.
let target_branch = self.active_branch().map(str::to_string);
let target_branch = active_branch_for_keys.clone();
let mut sidecar = crate::db::manifest::new_sidecar(
crate::db::manifest::SidecarKind::BranchMerge,
target_branch,
self.audit_actor_id.clone(),
actor_id.map(str::to_string),
recovery_pins,
);
// Carry the source branch's HEAD commit id so the recovery
@ -1301,7 +1370,7 @@ impl Omnigraph {
CandidateTableState::AdoptSourceState => {
publish_adopted_source_state(
self,
self.catalog(),
&self.catalog(),
base_snapshot,
source_snapshot,
&target_snapshot,
@ -1326,7 +1395,7 @@ impl Omnigraph {
crate::failpoints::maybe_fail("branch_merge.post_phase_b_pre_manifest_commit")?;
let manifest_version = if updates.is_empty() {
self.version()
self.version().await
} else {
self.commit_manifest_updates(&updates).await?
};
@ -1349,6 +1418,7 @@ impl Omnigraph {
manifest_version,
target_head_commit_id,
source_head_commit_id,
actor_id,
)
.await?;

View file

@ -345,8 +345,8 @@ async fn validate_edge_insert_endpoints(
edge_name: &str,
assignments: &HashMap<String, Literal>,
) -> Result<()> {
let edge_type = db
.catalog()
let catalog = db.catalog();
let edge_type = catalog
.edge_types
.get(edge_name)
.ok_or_else(|| OmniError::manifest(format!("unknown edge type '{}'", edge_name)))?;
@ -600,6 +600,7 @@ async fn open_table_for_mutation(
staging: &mut MutationStaging,
branch: Option<&str>,
table_key: &str,
op_kind: crate::db::MutationOpKind,
) -> Result<(Dataset, String, Option<String>)> {
if let Some(prior) = staging.inline_committed.get(table_key) {
let path = staging.paths.get(table_key).ok_or_else(|| {
@ -614,18 +615,21 @@ async fn open_table_for_mutation(
&path.full_path,
path.table_branch.as_deref(),
prior.table_version,
op_kind,
)
.await?;
return Ok((ds, path.full_path.clone(), path.table_branch.clone()));
}
let (ds, full_path, table_branch) =
db.open_for_mutation_on_branch(branch, table_key).await?;
let (ds, full_path, table_branch) = db
.open_for_mutation_on_branch(branch, table_key, op_kind)
.await?;
let expected_version = ds.version().version;
staging.ensure_path(
table_key,
full_path.clone(),
table_branch.clone(),
expected_version,
op_kind,
);
Ok((ds, full_path, table_branch))
}
@ -670,7 +674,7 @@ fn enforce_no_mixed_destructive_constructive(
impl Omnigraph {
pub async fn mutate(
&mut self,
&self,
branch: &str,
query_source: &str,
query_name: &str,
@ -681,28 +685,24 @@ impl Omnigraph {
}
pub async fn mutate_as(
&mut self,
&self,
branch: &str,
query_source: &str,
query_name: &str,
params: &ParamMap,
actor_id: Option<&str>,
) -> Result<MutationResult> {
let previous_actor = self.audit_actor_id.clone();
self.audit_actor_id = actor_id.map(str::to_string);
let result = self
.mutate_with_current_actor(branch, query_source, query_name, params)
.await;
self.audit_actor_id = previous_actor;
result
self.mutate_with_current_actor(branch, query_source, query_name, params, actor_id)
.await
}
async fn mutate_with_current_actor(
&mut self,
&self,
branch: &str,
query_source: &str,
query_name: &str,
params: &ParamMap,
actor_id: Option<&str>,
) -> Result<MutationResult> {
self.ensure_schema_state_valid().await?;
let requested = Self::normalize_branch_name(branch)?;
@ -737,11 +737,19 @@ impl Omnigraph {
Err(e) => Err(e),
Ok(total) if staging.is_empty() => Ok(total),
Ok(total) => {
let (updates, expected_versions, sidecar_handle) = staging
.finalize(
let staged = staging.stage_all(self, requested.as_deref()).await?;
// `_queue_guards` holds per-(table_key, branch) write
// queues acquired inside `commit_all`. Held across the
// manifest publish below so no concurrent writer can
// interleave between our commit_staged and our publish
// (which would correctly fail our CAS but leave Lance
// HEAD advanced — the residual class MR-870 recovers).
let (updates, expected_versions, sidecar_handle, _queue_guards) = staged
.commit_all(
self,
requested.as_deref(),
crate::db::manifest::SidecarKind::Mutation,
actor_id,
)
.await?;
// Failpoint that wedges the documented finalize→publisher
@ -759,6 +767,7 @@ impl Omnigraph {
requested.as_deref(),
&updates,
&expected_versions,
actor_id,
)
.await?;
// Phase C succeeded — sidecar can be deleted. If this
@ -794,7 +803,7 @@ impl Omnigraph {
}
async fn execute_named_mutation(
&mut self,
&self,
query_source: &str,
query_name: &str,
params: &ParamMap,
@ -804,7 +813,7 @@ impl Omnigraph {
let query_decl = omnigraph_compiler::find_named_query(query_source, query_name)
.map_err(|e| OmniError::manifest(e.to_string()))?;
let checked = typecheck_query_decl(self.catalog(), &query_decl)?;
let checked = typecheck_query_decl(&self.catalog(), &query_decl)?;
match checked {
CheckedQuery::Mutation(_) => {}
CheckedQuery::Read(_) => {
@ -858,7 +867,7 @@ impl Omnigraph {
}
async fn execute_insert(
&mut self,
&self,
type_name: &str,
assignments: &[IRAssignment],
params: &ParamMap,
@ -906,8 +915,13 @@ impl Omnigraph {
let has_key = node_type.key_property().is_some();
let table_key = format!("node:{}", type_name);
// Capture pre-write metadata on first touch (no Lance write).
let insert_kind = if has_key {
crate::db::MutationOpKind::Merge
} else {
crate::db::MutationOpKind::Insert
};
let (_ds, _full_path, _table_branch) =
open_table_for_mutation(self, staging, branch, &table_key).await?;
open_table_for_mutation(self, staging, branch, &table_key, insert_kind).await?;
// Accumulate. @key inserts go into the Merge stream (so a
// later update on the same id coalesces correctly); no-key
// inserts go into the Append stream.
@ -941,8 +955,14 @@ impl Omnigraph {
}
let table_key = format!("edge:{}", type_name);
// Capture pre-write metadata on first touch (no Lance write).
let (ds, _full_path, _table_branch) =
open_table_for_mutation(self, staging, branch, &table_key).await?;
let (ds, _full_path, _table_branch) = open_table_for_mutation(
self,
staging,
branch,
&table_key,
crate::db::MutationOpKind::Insert,
)
.await?;
// Accumulate the new edge row. Edge IDs are ULID-generated so
// Append mode is correct (no key-based dedup needed).
staging.append_batch(&table_key, schema, PendingMode::Append, batch.clone())?;
@ -972,7 +992,7 @@ impl Omnigraph {
}
async fn execute_update(
&mut self,
&self,
type_name: &str,
assignments: &[IRAssignment],
predicate: &IRMutationPredicate,
@ -1003,8 +1023,14 @@ impl Omnigraph {
let blob_props = self.catalog().node_types[type_name].blob_properties.clone();
let table_key = format!("node:{}", type_name);
let (ds, _full_path, _table_branch) =
open_table_for_mutation(self, staging, branch, &table_key).await?;
let (ds, _full_path, _table_branch) = open_table_for_mutation(
self,
staging,
branch,
&table_key,
crate::db::MutationOpKind::Update,
)
.await?;
// Scan committed via Lance + apply the same predicate to pending
// batches via DataFusion `MemTable` (read-your-writes for prior
@ -1097,7 +1123,7 @@ impl Omnigraph {
}
async fn execute_delete(
&mut self,
&self,
type_name: &str,
predicate: &IRMutationPredicate,
params: &ParamMap,
@ -1115,7 +1141,7 @@ impl Omnigraph {
}
async fn execute_delete_node(
&mut self,
&self,
type_name: &str,
predicate: &IRMutationPredicate,
params: &ParamMap,
@ -1125,8 +1151,14 @@ impl Omnigraph {
let pred_sql = predicate_to_sql(predicate, params, false)?;
let table_key = format!("node:{}", type_name);
let (ds, full_path, table_branch) =
open_table_for_mutation(self, staging, branch, &table_key).await?;
let (ds, full_path, table_branch) = open_table_for_mutation(
self,
staging,
branch,
&table_key,
crate::db::MutationOpKind::Delete,
)
.await?;
let initial_version = ds.version().version;
// Scan matching IDs for cascade. Per D₂ this never overlaps with
@ -1171,8 +1203,10 @@ impl Omnigraph {
&full_path,
table_branch.as_deref(),
initial_version,
crate::db::MutationOpKind::Delete,
)
.await?;
crate::failpoints::maybe_fail("mutation.delete_node_pre_primary_delete")?;
let delete_state = self
.table_store()
.delete_where(&full_path, &mut ds, &pred_sql)
@ -1214,8 +1248,14 @@ impl Omnigraph {
let edge_table_key = format!("edge:{}", edge_name);
let cascade_filter = cascade_filters.join(" OR ");
let (mut edge_ds, edge_full_path, edge_table_branch) =
open_table_for_mutation(self, staging, branch, &edge_table_key).await?;
let (mut edge_ds, edge_full_path, edge_table_branch) = open_table_for_mutation(
self,
staging,
branch,
&edge_table_key,
crate::db::MutationOpKind::Delete,
)
.await?;
let edge_delete = self
.table_store()
@ -1246,7 +1286,7 @@ impl Omnigraph {
}
async fn execute_delete_edge(
&mut self,
&self,
type_name: &str,
predicate: &IRMutationPredicate,
params: &ParamMap,
@ -1256,8 +1296,14 @@ impl Omnigraph {
let pred_sql = predicate_to_sql(predicate, params, true)?;
let table_key = format!("edge:{}", type_name);
let (mut ds, full_path, table_branch) =
open_table_for_mutation(self, staging, branch, &table_key).await?;
let (mut ds, full_path, table_branch) = open_table_for_mutation(
self,
staging,
branch,
&table_key,
crate::db::MutationOpKind::Delete,
)
.await?;
let delete_state = self
.table_store()

View file

@ -13,11 +13,12 @@ impl Omnigraph {
) -> Result<QueryResult> {
self.ensure_schema_state_valid().await?;
let resolved = self.resolved_target(target).await?;
let catalog = self.catalog();
let query_decl = omnigraph_compiler::find_named_query(query_source, query_name)
.map_err(|e| OmniError::manifest(e.to_string()))?;
let type_ctx = typecheck_query(self.catalog(), &query_decl)?;
let ir = lower_query(self.catalog(), &query_decl, &type_ctx)?;
let type_ctx = typecheck_query(&catalog, &query_decl)?;
let ir = lower_query(&catalog, &query_decl, &type_ctx)?;
let needs_graph = ir
.pipeline
@ -34,7 +35,7 @@ impl Omnigraph {
params,
&resolved.snapshot,
graph_index.as_deref(),
self.catalog(),
&catalog,
)
.await
}
@ -52,19 +53,19 @@ impl Omnigraph {
) -> Result<QueryResult> {
self.ensure_schema_state_valid().await?;
let snapshot = self.snapshot_at_version(version).await?;
let catalog = self.catalog();
let query_decl = omnigraph_compiler::find_named_query(query_source, query_name)
.map_err(|e| OmniError::manifest(e.to_string()))?;
let type_ctx = typecheck_query(self.catalog(), &query_decl)?;
let ir = lower_query(self.catalog(), &query_decl, &type_ctx)?;
let type_ctx = typecheck_query(&catalog, &query_decl)?;
let ir = lower_query(&catalog, &query_decl, &type_ctx)?;
let needs_graph = ir
.pipeline
.iter()
.any(|op| matches!(op, IROp::Expand { .. } | IROp::AntiJoin { .. }));
let graph_index = if needs_graph {
let edge_types = self
.catalog()
let edge_types = catalog
.edge_types
.iter()
.map(|(name, et)| (name.clone(), (et.from_type.clone(), et.to_type.clone())))
@ -79,7 +80,7 @@ impl Omnigraph {
params,
&snapshot,
graph_index.as_deref(),
self.catalog(),
&catalog,
)
.await
}

View file

@ -26,7 +26,7 @@ use arrow_schema::SchemaRef;
use lance::Dataset;
use omnigraph_compiler::catalog::EdgeType;
use crate::db::SubTableUpdate;
use crate::db::{MutationOpKind, SubTableUpdate};
use crate::db::manifest::{
new_sidecar, write_sidecar, RecoverySidecarHandle, SidecarKind, SidecarTablePin,
};
@ -94,18 +94,30 @@ pub(crate) struct MutationStaging {
/// Inline-committed updates from delete-touching ops (D₂ guarantees no
/// pending batches exist on a delete-touched table).
pub(crate) inline_committed: HashMap<String, SubTableUpdate>,
/// Strictest [`MutationOpKind`] seen per table within this query. Drives
/// the op-kind-aware drift check in [`StagedMutation::commit_all`]: for
/// tables whose first or any subsequent touch was a strict op
/// (Update / Delete / SchemaRewrite), commit_all fails fast with 409
/// when the staged dataset version drifts from the fresh manifest pin
/// rather than letting Lance's `commit_staged` surface
/// `RetryableCommitConflict` as a 500. See
/// [`MutationOpKind::strict_pre_stage_version_check`].
pub(crate) op_kinds: HashMap<String, MutationOpKind>,
}
impl MutationStaging {
/// Capture pre-write metadata on first touch of a table. Subsequent
/// touches are no-ops (paths and `expected_version` are stable for the
/// lifetime of one query).
/// touches preserve the original `paths` and `expected_versions`
/// entries; `op_kinds` upgrades to the strictest kind seen so far so
/// that mixed insert+update on the same table still fires the strict
/// drift check at commit time.
pub(crate) fn ensure_path(
&mut self,
table_key: &str,
full_path: String,
table_branch: Option<String>,
expected_version: u64,
op_kind: MutationOpKind,
) {
self.paths.entry(table_key.to_string()).or_insert(StagedTablePath {
full_path,
@ -114,6 +126,19 @@ impl MutationStaging {
self.expected_versions
.entry(table_key.to_string())
.or_insert(expected_version);
self.op_kinds
.entry(table_key.to_string())
.and_modify(|existing| {
// Upgrade to the stricter kind if a later op needs it.
// Insert + later Update → Update wins; Update + later Insert
// keeps Update.
if op_kind.strict_pre_stage_version_check()
&& !existing.strict_pre_stage_version_check()
{
*existing = op_kind;
}
})
.or_insert(op_kind);
}
/// Append a batch to the per-table accumulator.
@ -210,101 +235,65 @@ impl MutationStaging {
}
/// End-of-query: for each pending table, concat batches and commit via
/// `stage_append` or `stage_merge_insert` followed by `commit_staged`.
/// Merge with inline-committed entries. Return `(updates,
/// expected_versions)` for `commit_updates_on_branch_with_expected`.
/// **Phase A** of the two-phase commit: stage uncommitted fragments
/// for every table in `pending`. No Lance HEAD movement, no sidecar,
/// no manifest publish. Returns a [`StagedMutation`] carrying the
/// staged transactions so a future MR-686 queue acquisition step can
/// run between staging (slow S3 PUTs, no queue) and commit (fast,
/// under per-`(table_key, branch)` queue).
///
/// Sequential per-table — no cross-table dependency, but a parallel
/// version is a perf optimization for multi-table writes (loader with
/// many node + edge types). v1 ships sequential; the fan-out can land
/// in a follow-up.
pub(crate) async fn finalize(
/// Sequential per-table for now — parallelizing across independent
/// Lance datasets is a perf follow-up; same loop structure as the
/// pre-split `finalize`.
pub(crate) async fn stage_all(
self,
db: &crate::db::Omnigraph,
branch: Option<&str>,
sidecar_kind: SidecarKind,
) -> Result<(
Vec<SubTableUpdate>,
HashMap<String, u64>,
Option<RecoverySidecarHandle>,
)> {
_branch: Option<&str>,
) -> Result<StagedMutation> {
let MutationStaging {
expected_versions,
paths,
pending,
inline_committed,
op_kinds,
} = self;
let mut updates: Vec<SubTableUpdate> =
inline_committed.into_values().collect();
// Sidecar protocol: build the per-table pin list BEFORE any Lance
// commit_staged runs, then write the sidecar so a crash between
// Phase B (this loop's commit_staged calls) and Phase C (the
// manifest publish in the caller) is recoverable on next open.
// Skipped when `pending` is empty (delete-only mutation; the D₂
// parse-time rule keeps deletes out of this code path so this
// branch is reached only for the inline-committed-only case).
let pins: Vec<SidecarTablePin> = pending
.iter()
.map(|(table_key, _)| {
let path = paths.get(table_key).ok_or_else(|| {
OmniError::manifest_internal(format!(
"MutationStaging::finalize: missing path for table '{}'",
table_key,
))
})?;
let expected = *expected_versions.get(table_key).ok_or_else(|| {
OmniError::manifest_internal(format!(
"MutationStaging::finalize: missing expected version for table '{}'",
table_key,
))
})?;
Ok::<SidecarTablePin, OmniError>(SidecarTablePin {
table_key: table_key.clone(),
table_path: path.full_path.clone(),
expected_version: expected,
post_commit_pin: expected + 1,
table_branch: path.table_branch.clone(),
})
})
.collect::<Result<Vec<_>>>()?;
let sidecar_handle = if pins.is_empty() {
None
} else {
let sidecar = new_sidecar(
sidecar_kind,
branch.map(|s| s.to_string()),
db.audit_actor_id.clone(),
pins,
);
Some(write_sidecar(db.root_uri(), db.storage_adapter(), &sidecar).await?)
};
let mut staged_entries: Vec<StagedTableEntry> = Vec::with_capacity(pending.len());
for (table_key, table) in pending {
let path = paths.get(&table_key).ok_or_else(|| {
let path = paths.get(&table_key).cloned().ok_or_else(|| {
OmniError::manifest_internal(format!(
"MutationStaging::finalize: missing path for table '{}'",
"MutationStaging::stage_all: missing path for table '{}'",
table_key
))
})?;
let expected = *expected_versions.get(&table_key).ok_or_else(|| {
OmniError::manifest_internal(format!(
"MutationStaging::finalize: missing expected version for table '{}'",
"MutationStaging::stage_all: missing expected version for table '{}'",
table_key
))
})?;
// Reopen at the pre-write version. Lance HEAD has not advanced
// since `ensure_path` captured it — no prior op committed to
// this dataset.
// Reopen the dataset for staging. The op_kind reflects the
// accumulated PendingTable's mode: Append-mode batches are
// INSERT-shaped (no key-based dedup at commit_staged); Merge-
// mode batches are MERGE-shaped (key-dedup at commit_staged).
// Both skip the strict pre-stage version check under the
// [`MutationOpKind`] policy: Lance's natural rebase + the
// per-(table, branch) queue + the publisher CAS in
// `commit_all` handle drift; the strict check would
// over-reject in-process concurrent inserts (PR 2 / MR-686
// Phase 2).
let stage_kind = match table.mode {
PendingMode::Append => crate::db::MutationOpKind::Insert,
PendingMode::Merge => crate::db::MutationOpKind::Merge,
};
let ds = db
.reopen_for_mutation(
&table_key,
&path.full_path,
path.table_branch.as_deref(),
expected,
stage_kind,
)
.await?;
@ -335,8 +324,8 @@ impl MutationStaging {
}
};
// Commit via Lance's two-phase write: stage produces
// uncommitted fragments + transaction; commit advances HEAD.
// Stage produces uncommitted fragments + transaction. No
// Lance HEAD advance until `commit_all` runs `commit_staged`.
let staged = match table.mode {
PendingMode::Append => {
db.table_store().stage_append(&ds, combined, &[]).await?
@ -353,16 +342,327 @@ impl MutationStaging {
.await?
}
};
staged_entries.push(StagedTableEntry {
table_key,
path,
expected_version: expected,
dataset: ds,
staged_write: staged,
});
}
Ok(StagedMutation {
inline_committed,
staged: staged_entries,
expected_versions,
paths,
op_kinds,
})
}
}
/// Output of [`MutationStaging::stage_all`]. Carries the staged Lance
/// transactions (Phase A complete; uncommitted fragments written) plus
/// the per-table metadata needed to write the recovery sidecar, run
/// `commit_staged` (Phase B), and produce the publisher's input.
///
/// Splitting `stage_all` and `commit_all` is the structural prerequisite
/// for MR-686: a future commit can drop queue acquisition + manifest-pin
/// revalidation between Phase A and Phase B without touching staging
/// logic.
pub(crate) struct StagedMutation {
/// Updates from delete-touching ops (D₂ parse-time rule keeps
/// pending and inline_committed disjoint per table). Tables here
/// have already advanced Lance HEAD via inline `delete_where`;
/// `commit_all` builds sidecar pins for these too so the
/// commit→publish residual is recoverable for delete-only paths
/// (third-agent Finding 3).
inline_committed: HashMap<String, SubTableUpdate>,
/// One entry per table that had pending batches successfully staged.
staged: Vec<StagedTableEntry>,
/// Pre-write manifest version per table — the publisher's CAS fence.
expected_versions: HashMap<String, u64>,
/// Per-table identifiers from `MutationStaging::paths`. Carried
/// through so `commit_all` can build sidecar pins for both staged
/// and inline-committed tables.
paths: HashMap<String, StagedTablePath>,
/// Strictest op_kind per touched table, propagated from
/// `MutationStaging::op_kinds` so `commit_all`'s drift check
/// fires only on read-modify-write tables.
op_kinds: HashMap<String, MutationOpKind>,
}
/// Per-table state captured during `stage_all` and consumed by
/// `commit_all`. Holds the opened `Dataset` so `commit_staged` doesn't
/// re-open, and the `StagedWrite` whose `transaction` `commit_staged`
/// will execute.
struct StagedTableEntry {
table_key: String,
path: StagedTablePath,
expected_version: u64,
dataset: lance::Dataset,
staged_write: crate::table_store::StagedWrite,
}
impl StagedMutation {
/// **Phase B** of the two-phase commit: acquire per-`(table_key,
/// branch)` queues, revalidate manifest pins, write the recovery
/// sidecar, run `commit_staged` per table to advance Lance HEAD, and
/// return the publisher's input plus the queue guards.
///
/// **Caller must hold the returned `_guards` Vec across the
/// subsequent manifest publish.** Releasing guards before publish
/// would let another writer interleave their commit_staged between
/// ours and our publish — which would correctly fail our CAS but
/// leave Lance HEAD advanced (the residual class MR-870 recovers
/// from). Holding the guards across publish keeps the residual
/// unreachable for op-execution failures on the happy path.
///
/// Revalidation: between `stage_all` and `commit_all`, another
/// writer (in the same process or another process sharing the
/// repo) may have committed to one of our touched tables, advancing
/// the manifest pin past our `expected_version`. We revalidate
/// under the queue and fail-fast with `manifest_conflict` before
/// any `commit_staged` so the orphaned uncommitted fragments stay
/// unreferenced (cleaned by `cleanup_old_versions`'s age sweep)
/// rather than being committed and creating a Lance-HEAD-ahead
/// residual.
pub(crate) async fn commit_all(
self,
db: &crate::db::Omnigraph,
branch: Option<&str>,
sidecar_kind: SidecarKind,
actor_id: Option<&str>,
) -> Result<(
Vec<SubTableUpdate>,
HashMap<String, u64>,
Option<RecoverySidecarHandle>,
Vec<tokio::sync::OwnedMutexGuard<()>>,
)> {
let StagedMutation {
inline_committed,
mut staged,
mut expected_versions,
paths,
op_kinds,
} = self;
// Acquire per-(table_key, branch) queues for every touched
// table — both staged and inline-committed. Sorted by
// `acquire_many` internally so all multi-table writers
// (mutation, branch_merge, schema_apply, future MR-870
// recovery) agree on acquisition order — prevents lock-order
// inversion deadlock.
//
// For inline-committed tables (delete-only mutations), Lance
// HEAD has already advanced inside `delete_where` before
// `commit_all` runs. Holding the queue here doesn't prevent
// that interleaving (commit 6 will move queue acquisition into
// `delete_where`'s call site); it does prevent another writer
// from interleaving between our delete and our publish, which
// would otherwise leave a Lance-HEAD-ahead residual the
// delete-only sidecar (added below) would have to recover.
let mut queue_keys: Vec<(String, Option<String>)> = Vec::with_capacity(
staged.len() + inline_committed.len(),
);
for entry in &staged {
queue_keys.push((entry.table_key.clone(), entry.path.table_branch.clone()));
}
for table_key in inline_committed.keys() {
let path = paths.get(table_key).ok_or_else(|| {
OmniError::manifest_internal(format!(
"StagedMutation::commit_all: missing path for inline-committed table '{}'",
table_key
))
})?;
queue_keys.push((table_key.clone(), path.table_branch.clone()));
}
let guards = db.write_queue().acquire_many(&queue_keys).await;
// Re-capture manifest pins under the queue (PR 2 / MR-686).
//
// expected_versions was captured when the mutation first opened
// each table for mutation (the query's read-time pin). For
// non-strict inserts / merge-style appends, a writer may advance
// the table before we acquire the queue and Lance can still
// safely rebase the write, so we refresh expected_versions to
// the queued manifest pin.
//
// Strict read-modify-write ops (update / delete /
// schema-rewrite) are different: the staged batch was computed
// against the read-time pin, even if stage_all later re-opened
// the dataset at HEAD. For those ops, compare read-time
// expected_version to the queued manifest pin and fail before
// any Lance HEAD movement if the target drifted. This can
// over-reject a single mutation that inserts, then upgrades to
// update, while another writer advances the table between the
// two touches; that is safe-by-default and keeps one invariant
// until `ensure_path` learns how to bump expected_version on
// op-kind upgrade.
//
// Why per-branch (and not the bound-branch `db.snapshot()`):
// when the caller mutates a branch other than the engine's
// bound branch (e.g., feature-branch ingest from a server
// handle bound to main), `db.snapshot()` returns the bound
// branch's view of each table — which is the wrong pin for
// the publisher's CAS on a different branch. Using
// `snapshot_for_branch(branch)` resolves the per-branch
// entries correctly. The cost is one fresh manifest read per
// mutation; PR 1b's regression came from this same read, but
// that read is now strictly necessary for cross-branch
// correctness. Single-table same-branch mutations could still
// skip this read (queue exclusivity makes the publisher CAS a
// no-op), but the conditional adds complexity for marginal
// gain — left as a follow-up perf optimization.
//
// Multi-coordinator deployments (§VI.27 aspirational) get
// genuine cross-process drift detection from this read for
// free.
let snapshot = db.snapshot_for_branch(branch).await?;
for entry in staged.iter_mut() {
let current = snapshot
.entry(&entry.table_key)
.map(|e| e.table_version)
.ok_or_else(|| {
OmniError::manifest_conflict(format!(
"table '{}' missing from manifest at commit time",
entry.table_key,
))
})?;
// Insert / Merge tables skip this check: concurrent inserts on
// disjoint keys legitimately coexist via Lance's auto-rebase, so
// the check would over-reject the existing Phase 2 same-key
// insert path (`change_concurrent_inserts_same_key_serialize_without_409`).
let strict = op_kinds
.get(&entry.table_key)
.map(|k| k.strict_pre_stage_version_check())
.unwrap_or(false);
if strict && entry.expected_version != current {
return Err(OmniError::manifest_expected_version_mismatch(
entry.table_key.clone(),
entry.expected_version,
current,
));
}
entry.expected_version = current;
expected_versions.insert(entry.table_key.clone(), current);
}
// Sidecar protocol: build the per-table pin list and write the
// sidecar BEFORE any later error can return after Lance HEAD has
// already moved. For staged tables this still happens before any
// Lance commit_staged runs. For inline-committed delete tables,
// Lance HEAD moved inside delete_where before commit_all, so the
// sidecar must also exist before the inline manifest-version check
// below can reject a stale query.
//
// Pins cover BOTH staged tables (Lance HEAD will advance below
// when `commit_staged` runs) AND inline-committed tables
// (Lance HEAD already advanced inside `delete_where` — we still
// need a sidecar so that an upcoming publish failure is
// recoverable on next open). This closes the third-agent
// Finding 3 hazard: delete-only mutations would otherwise skip
// the sidecar, leaving any commit→publish residual unreachable
// by recovery.
let mut pins: Vec<SidecarTablePin> = Vec::with_capacity(
staged.len() + inline_committed.len(),
);
for entry in &staged {
pins.push(SidecarTablePin {
table_key: entry.table_key.clone(),
table_path: entry.path.full_path.clone(),
expected_version: entry.expected_version,
post_commit_pin: entry.expected_version + 1,
table_branch: entry.path.table_branch.clone(),
});
}
for (table_key, update) in &inline_committed {
let path = paths.get(table_key).ok_or_else(|| {
OmniError::manifest_internal(format!(
"StagedMutation::commit_all: missing path for inline-committed table '{}'",
table_key
))
})?;
let expected = *expected_versions.get(table_key).ok_or_else(|| {
OmniError::manifest_internal(format!(
"StagedMutation::commit_all: missing expected version for inline-committed table '{}'",
table_key
))
})?;
pins.push(SidecarTablePin {
table_key: table_key.clone(),
table_path: path.full_path.clone(),
expected_version: expected,
// For inline-committed tables, the post-commit pin is
// the actual post-delete version recorded by
// `record_inline`, NOT `expected + 1` — `delete_where`
// can advance HEAD by more than one version (e.g.,
// when Lance internally compacts deletion vectors).
post_commit_pin: update.table_version,
table_branch: path.table_branch.clone(),
});
}
let sidecar_handle = if pins.is_empty() {
None
} else {
let sidecar = new_sidecar(
sidecar_kind,
branch.map(|s| s.to_string()),
actor_id.map(str::to_string),
pins,
);
Some(write_sidecar(db.root_uri(), db.storage_adapter(), &sidecar).await?)
};
for (table_key, _update) in inline_committed.iter() {
let current = snapshot
.entry(table_key)
.map(|e| e.table_version)
.ok_or_else(|| {
OmniError::manifest_conflict(format!(
"table '{}' missing from manifest at commit time",
table_key,
))
})?;
let expected = expected_versions.get(table_key).copied().ok_or_else(|| {
OmniError::manifest_internal(format!(
"StagedMutation::commit_all: missing expected version for inline-committed table '{}'",
table_key
))
})?;
if expected != current {
return Err(OmniError::manifest_expected_version_mismatch(
table_key.clone(),
expected,
current,
));
}
expected_versions.insert(table_key.clone(), current);
}
let mut updates: Vec<SubTableUpdate> = inline_committed.into_values().collect();
for entry in staged {
let StagedTableEntry {
table_key,
path,
expected_version: _,
dataset,
staged_write,
} = entry;
let new_ds = db
.table_store()
.commit_staged(Arc::new(ds), staged.transaction)
.commit_staged(Arc::new(dataset), staged_write.transaction)
.await?;
let state = db
.table_store()
.table_state(&path.full_path, &new_ds)
.await?;
updates.push(SubTableUpdate {
table_key: table_key.clone(),
table_key,
table_version: state.version,
table_branch: path.table_branch.clone(),
row_count: state.row_count,
@ -370,7 +670,7 @@ impl MutationStaging {
});
}
Ok((updates, expected_versions, sidecar_handle))
Ok((updates, expected_versions, sidecar_handle, guards))
}
}

View file

@ -59,21 +59,21 @@ pub enum LoadMode {
/// Load JSONL data into an Omnigraph database.
pub async fn load_jsonl(db: &mut Omnigraph, data: &str, mode: LoadMode) -> Result<LoadResult> {
let current_branch = db.active_branch().map(str::to_string);
let current_branch = db.active_branch().await;
let branch = current_branch.as_deref().unwrap_or("main");
db.load(branch, data, mode).await
}
/// Load JSONL data from a file path.
pub async fn load_jsonl_file(db: &mut Omnigraph, path: &str, mode: LoadMode) -> Result<LoadResult> {
let current_branch = db.active_branch().map(str::to_string);
let current_branch = db.active_branch().await;
let branch = current_branch.as_deref().unwrap_or("main");
db.load_file(branch, path, mode).await
}
impl Omnigraph {
pub async fn ingest(
&mut self,
&self,
branch: &str,
from: Option<&str>,
data: &str,
@ -83,24 +83,19 @@ impl Omnigraph {
}
pub async fn ingest_as(
&mut self,
&self,
branch: &str,
from: Option<&str>,
data: &str,
mode: LoadMode,
actor_id: Option<&str>,
) -> Result<IngestResult> {
let previous_actor = self.audit_actor_id.clone();
self.audit_actor_id = actor_id.map(str::to_string);
let result = self
.ingest_with_current_actor(branch, from, data, mode)
.await;
self.audit_actor_id = previous_actor;
result
self.ingest_with_current_actor(branch, from, data, mode, actor_id)
.await
}
pub async fn ingest_file(
&mut self,
&self,
branch: &str,
from: Option<&str>,
path: &str,
@ -110,7 +105,7 @@ impl Omnigraph {
}
pub async fn ingest_file_as(
&mut self,
&self,
branch: &str,
from: Option<&str>,
path: &str,
@ -122,11 +117,12 @@ impl Omnigraph {
}
async fn ingest_with_current_actor(
&mut self,
&self,
branch: &str,
from: Option<&str>,
data: &str,
mode: LoadMode,
actor_id: Option<&str>,
) -> Result<IngestResult> {
self.ensure_schema_state_valid().await?;
let target_branch =
@ -143,7 +139,7 @@ impl Omnigraph {
.await?;
}
let result = self.load(&target_branch, data, mode).await?;
let result = self.load_as(&target_branch, data, mode, actor_id).await?;
Ok(IngestResult {
branch: target_branch,
base_branch,
@ -153,7 +149,17 @@ impl Omnigraph {
})
}
pub async fn load(&mut self, branch: &str, data: &str, mode: LoadMode) -> Result<LoadResult> {
pub async fn load(&self, branch: &str, data: &str, mode: LoadMode) -> Result<LoadResult> {
self.load_as(branch, data, mode, None).await
}
pub async fn load_as(
&self,
branch: &str,
data: &str,
mode: LoadMode,
actor_id: Option<&str>,
) -> Result<LoadResult> {
self.ensure_schema_state_valid().await?;
// Reject internal `__run__*` / system-prefixed branches at the
// public write boundary. Direct-publish paths assert this
@ -169,12 +175,12 @@ impl Omnigraph {
// Direct-to-target writes: no Run state machine, no `__run__` staging
// branch. Cross-table OCC is enforced by the publisher's
// `expected_table_versions` CAS inside `load_jsonl_reader`.
self.load_direct_on_branch(requested.as_deref(), data, mode)
self.load_direct_on_branch(requested.as_deref(), data, mode, actor_id)
.await
}
pub async fn load_file(
&mut self,
&self,
branch: &str,
path: &str,
mode: LoadMode,
@ -184,13 +190,14 @@ impl Omnigraph {
}
async fn load_direct_on_branch(
&mut self,
&self,
branch: Option<&str>,
data: &str,
mode: LoadMode,
actor_id: Option<&str>,
) -> Result<LoadResult> {
let reader = BufReader::new(Cursor::new(data.as_bytes()));
load_jsonl_reader(self, branch, reader, mode).await
load_jsonl_reader(self, branch, reader, mode, actor_id).await
}
}
@ -228,10 +235,11 @@ impl LoadResult {
}
async fn load_jsonl_reader<R: BufRead>(
db: &mut Omnigraph,
db: &Omnigraph,
branch: Option<&str>,
reader: R,
mode: LoadMode,
actor_id: Option<&str>,
) -> Result<LoadResult> {
let catalog = db.catalog().clone();
@ -327,6 +335,16 @@ async fn load_jsonl_reader<R: BufRead>(
LoadMode::Append => PendingMode::Append,
LoadMode::Overwrite => PendingMode::Append, // unused
};
// Map LoadMode to MutationOpKind for the version-check policy.
// Append/Merge skip the strict pre-stage check (concurrency-safe
// under the per-(table, branch) queue + publisher CAS); Overwrite
// uses the strict check because it truncates and replaces the
// dataset — concurrent advances change what "replace" means.
let load_op_kind = match mode {
LoadMode::Append => crate::db::MutationOpKind::Insert,
LoadMode::Merge => crate::db::MutationOpKind::Merge,
LoadMode::Overwrite => crate::db::MutationOpKind::SchemaRewrite,
};
// Phase 2a: build and validate every node batch up front. Cheap and
// synchronous — surfaces validation errors before any S3 traffic.
@ -357,7 +375,7 @@ async fn load_jsonl_reader<R: BufRead>(
if use_staging {
for (type_name, table_key, batch, loaded_count) in prepared_nodes {
let (ds, full_path, table_branch) = db
.open_for_mutation_on_branch(branch, &table_key)
.open_for_mutation_on_branch(branch, &table_key, load_op_kind)
.await?;
let expected_version = ds.version().version;
staging.ensure_path(
@ -365,6 +383,7 @@ async fn load_jsonl_reader<R: BufRead>(
full_path,
table_branch,
expected_version,
load_op_kind,
);
let schema = batch.schema();
staging.append_batch(&table_key, schema, pending_mode, batch)?;
@ -478,7 +497,7 @@ async fn load_jsonl_reader<R: BufRead>(
if use_staging {
for (edge_name, table_key, batch, loaded_count) in prepared_edges {
let (ds, full_path, table_branch) = db
.open_for_mutation_on_branch(branch, &table_key)
.open_for_mutation_on_branch(branch, &table_key, load_op_kind)
.await?;
let expected_version = ds.version().version;
staging.ensure_path(
@ -486,6 +505,7 @@ async fn load_jsonl_reader<R: BufRead>(
full_path,
table_branch,
expected_version,
load_op_kind,
);
let schema = batch.schema();
staging.append_batch(&table_key, schema, pending_mode, batch)?;
@ -537,15 +557,19 @@ async fn load_jsonl_reader<R: BufRead>(
// Phase 4: Atomic manifest commit with publisher-level OCC.
if use_staging {
let (updates, expected_versions, sidecar_handle) = staging
.finalize(db, branch, crate::db::manifest::SidecarKind::Load)
let staged = staging.stage_all(db, branch).await?;
// `_queue_guards` holds per-(table_key, branch) write queues
// across the manifest publish below — see exec/mutation.rs for
// the rationale (interleaving prevention).
let (updates, expected_versions, sidecar_handle, _queue_guards) = staged
.commit_all(db, branch, crate::db::manifest::SidecarKind::Load, actor_id)
.await?;
// Same finalize → publisher residual as mutations: per-table
// staged commits have advanced Lance HEAD, but the manifest
// publish has not run yet. Reuse the mutation failpoint name so
// one failpoint pins the shared `MutationStaging` boundary.
crate::failpoints::maybe_fail("mutation.post_finalize_pre_publisher")?;
db.commit_updates_on_branch_with_expected(branch, &updates, &expected_versions)
db.commit_updates_on_branch_with_expected(branch, &updates, &expected_versions, actor_id)
.await?;
// The recovery sidecar protects the per-table commit_staged →
// manifest publish window. Phase C succeeded — clean up
@ -574,6 +598,7 @@ async fn load_jsonl_reader<R: BufRead>(
branch,
&overwrite_updates,
&overwrite_expected,
actor_id,
)
.await?;
}
@ -1151,8 +1176,14 @@ async fn write_batch_to_dataset(
batch: RecordBatch,
mode: LoadMode,
) -> Result<(crate::table_store::TableState, Option<String>)> {
let (mut ds, full_path, table_branch) =
db.open_for_mutation_on_branch(branch, table_key).await?;
let op_kind = match mode {
LoadMode::Append => crate::db::MutationOpKind::Insert,
LoadMode::Merge => crate::db::MutationOpKind::Merge,
LoadMode::Overwrite => crate::db::MutationOpKind::SchemaRewrite,
};
let (mut ds, full_path, table_branch) = db
.open_for_mutation_on_branch(branch, table_key, op_kind)
.await?;
let table_store = db.table_store();
match mode {
@ -1817,7 +1848,7 @@ edge WorksAt: Person -> Company
.unwrap();
// Read back via snapshot
let snap = db.snapshot();
let snap = db.snapshot().await;
let person_ds = snap.open("node:Person").await.unwrap();
assert_eq!(person_ds.count_rows(None).await.unwrap(), 2);
@ -1854,7 +1885,7 @@ edge WorksAt: Person -> Company
.await
.unwrap();
let snap = db.snapshot();
let snap = db.snapshot().await;
let knows_ds = snap.open("edge:Knows").await.unwrap();
let batches: Vec<RecordBatch> = knows_ds
@ -1889,13 +1920,13 @@ edge WorksAt: Person -> Company
let dir = tempfile::tempdir().unwrap();
let uri = dir.path().to_str().unwrap();
let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
let v1 = db.version();
let v1 = db.version().await;
load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite)
.await
.unwrap();
assert!(db.version() > v1);
assert!(db.version().await > v1);
}
#[tokio::test]
@ -1912,7 +1943,7 @@ edge WorksAt: Person -> Company
.unwrap();
load_jsonl(&mut db, batch2, LoadMode::Append).await.unwrap();
let snap = db.snapshot();
let snap = db.snapshot().await;
let person_ds = snap.open("node:Person").await.unwrap();
assert_eq!(person_ds.count_rows(None).await.unwrap(), 2);
}

View file

@ -396,6 +396,137 @@ async fn composite_flow_canonical_lifecycle() {
assert!(!final_total.batches().is_empty());
}
/// Cross-handle sequence that exercises operations after a schema_apply
/// invalidates a peer handle's cached `_schema.pg`. The narrow load-bearing
/// pin is that `Omnigraph::refresh()` must not deadlock when its
/// `reload_schema_if_source_changed()` step needs to acquire a read on the
/// coordinator's `RwLock`. The broader sequencing — schema_apply →
/// branch_create → branch_delete → branch_merge → mutate (using the new
/// schema's added property) → reopen — pins that the fix doesn't regress
/// any of the related call sites.
///
/// Pre-fix bug class: `Omnigraph::refresh()` held
/// `coordinator.write().await` from start to finish, including across the
/// `self.reload_schema_if_source_changed()` call. That helper's
/// `self.coordinator.read().await` (only reached when the on-disk schema
/// source differs from the in-memory cache) deadlocks against the outer
/// write guard because tokio's `RwLock` is not reentrant. Reachable from
/// every public refresh-using API: `branch_delete` (`omnigraph.rs:910`),
/// `branch_merge` (post-merge refresh on bound target), and any caller
/// that calls `Omnigraph::refresh` directly.
///
/// The cross-handle setup is the realistic trigger: handle A applies a
/// schema, advancing `_schema.pg` on disk; handle B has stale in-memory
/// schema_source. B's next `refresh()` (via branch_delete here) hits the
/// read-after-write reload path. Single-handle is unreachable because
/// `apply_schema` updates the local ArcSwap cache in-line.
///
/// Post-fix invariant: `refresh()` scopes its write guard to the recovery
/// section only, releasing it before `reload_schema_if_source_changed()`.
/// The reload's read acquisition is uncontested.
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn composite_flow_schema_apply_then_branch_ops_no_deadlock_in_refresh() {
let dir = tempfile::tempdir().unwrap();
let uri = dir.path().to_str().unwrap();
// Step 1: init + load on handle A.
let mut db_a = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
load_jsonl(&mut db_a, TEST_DATA, LoadMode::Append).await.unwrap();
assert_eq!(count_rows(&db_a, "node:Person").await, 4);
// Step 2: open handle B on the same repo. B's in-memory schema_source
// cache is now a snapshot of `_schema.pg` at open time.
let db_b = Omnigraph::open(uri).await.unwrap();
// Step 3: A applies a schema that adds a nullable property to Person.
// A's on-disk `_schema.pg` is rewritten; A's in-memory cache is updated
// in-line by `apply_schema`. B's in-memory cache is now STALE relative
// to disk.
const TEST_SCHEMA_V2: &str = "node Person {\n name: String @key\n age: I32?\n nickname: String?\n}\n\nnode Company {\n name: String @key\n}\n\nedge Knows: Person -> Person {\n since: Date?\n}\n\nedge WorksAt: Person -> Company\n";
let plan = db_a.apply_schema(TEST_SCHEMA_V2).await.unwrap();
assert!(plan.applied, "apply_schema must succeed on a clean repo");
assert!(
!plan.steps.is_empty(),
"apply_schema must record the AddProperty step"
);
// Step 4: deadlock vector. B.branch_delete calls B.refresh() internally
// (omnigraph.rs:910). refresh() pre-fix holds the coord write guard
// across reload_schema_if_source_changed; with B's cache stale, that
// helper takes the not-early-return branch and tries
// self.coordinator.read().await — deadlocks against the outer write.
//
// Wrap in tokio::time::timeout so a deadlock surfaces as a clean test
// panic instead of a stuck CI job. 15s is well above natural completion
// on local FS (sub-second under normal conditions).
db_b.branch_create("post-schema-apply-test").await.unwrap();
let delete_result = tokio::time::timeout(
std::time::Duration::from_secs(15),
db_b.branch_delete("post-schema-apply-test"),
)
.await;
assert!(
delete_result.is_ok(),
"branch_delete deadlocked in refresh() with stale schema cache. \
Pre-fix symptom: Omnigraph::refresh() holds coordinator.write().await \
across reload_schema_if_source_changed(), which acquires \
coordinator.read().await on the same non-reentrant RwLock when the \
on-disk schema source differs from the in-memory cache.",
);
delete_result
.unwrap()
.expect("branch_delete must succeed once refresh() releases its write guard");
// Step 5: continuing operations on B post-refresh — verify the broader
// sequence works. B's catalog should now reflect the new schema (the
// refresh path includes reload_schema_if_source_changed which calls
// store_catalog).
db_b.branch_create("feature-after-apply").await.unwrap();
// Step 6: branch_merge from B exercises the post-merge refresh() path
// (merge.rs:1100-1107) — same deadlock surface as branch_delete,
// sanity-pinned by reusing the same handle whose cache was just
// refreshed.
let _outcome = tokio::time::timeout(
std::time::Duration::from_secs(15),
db_b.branch_merge("feature-after-apply", "main"),
)
.await
.expect("branch_merge deadlocked in refresh() post-schema-apply")
.expect("branch_merge must succeed");
// Step 7: mutation on main using the new schema's added property —
// verifies the catalog reload completed and the engine accepts a
// mutation referencing `nickname`.
const NICKNAME_QUERY: &str = "query set_nickname($name: String, $nickname: String) {\n update Person set { nickname: $nickname } where name = $name\n}";
db_b.mutate_as(
"main",
NICKNAME_QUERY,
"set_nickname",
&mixed_params(&[("$name", "Alice"), ("$nickname", "Ali")], &[]),
None,
)
.await
.expect("update using post-apply schema property must succeed");
// Step 8: reopen — final integration check that the post-deadlock-fix
// state persists across handle drop/open.
drop(db_a);
drop(db_b);
let db_c = Omnigraph::open(uri).await.unwrap();
assert_eq!(
count_rows(&db_c, "node:Person").await,
4,
"Person count consistent across reopen post-schema-apply",
);
let branches = db_c.branch_list().await.unwrap();
assert!(
!branches.iter().any(|b| b == "post-schema-apply-test"),
"deleted branch must stay deleted across reopen; got {:?}",
branches,
);
}
/// Multi-branch sequential merges with main writes interleaved between
/// every diverge point. Catches compositional regressions that single-
/// merge tests can't see:

View file

@ -3,6 +3,7 @@
mod helpers;
use fail::FailScenario;
use futures::FutureExt;
use omnigraph::db::Omnigraph;
use omnigraph::failpoints::ScopedFailPoint;
@ -25,37 +26,12 @@ fn node_table_uri(root: &str, type_name: &str) -> String {
format!("{}/nodes/{hash:016x}", root.trim_end_matches('/'))
}
fn person_batch(rows: &[(&str, &str, Option<i32>)]) -> arrow_array::RecordBatch {
use std::sync::Arc;
use arrow_array::{Int32Array, StringArray};
use arrow_schema::{DataType, Field, Schema};
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Utf8, false),
Field::new("age", DataType::Int32, true),
Field::new("name", DataType::Utf8, false),
]));
let ids: Vec<&str> = rows.iter().map(|(id, _, _)| *id).collect();
let names: Vec<&str> = rows.iter().map(|(_, name, _)| *name).collect();
let ages: Vec<Option<i32>> = rows.iter().map(|(_, _, age)| *age).collect();
arrow_array::RecordBatch::try_new(
schema,
vec![
Arc::new(StringArray::from(ids)),
Arc::new(Int32Array::from(ages)),
Arc::new(StringArray::from(names)),
],
)
.unwrap()
}
#[tokio::test]
async fn branch_create_failpoint_triggers() {
let _scenario = FailScenario::setup();
let dir = tempfile::tempdir().unwrap();
let uri = dir.path().to_str().unwrap();
let mut db = Omnigraph::init(uri, helpers::TEST_SCHEMA).await.unwrap();
let db = Omnigraph::init(uri, helpers::TEST_SCHEMA).await.unwrap();
let _failpoint = ScopedFailPoint::new("branch_create.after_manifest_branch_create", "return");
let err = db.branch_create("feature").await.unwrap_err();
@ -65,7 +41,7 @@ async fn branch_create_failpoint_triggers() {
);
}
#[tokio::test]
#[tokio::test(flavor = "multi_thread")]
async fn graph_publish_failpoint_triggers_before_commit_append() {
let _scenario = FailScenario::setup();
let dir = tempfile::tempdir().unwrap();
@ -100,7 +76,7 @@ async fn schema_apply_pre_commit_crash_rolls_forward_via_sidecar() {
let uri = dir.path().to_str().unwrap().to_string();
{
let mut db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
let _failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "return");
let err = db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap_err();
assert!(
@ -122,7 +98,7 @@ async fn schema_apply_pre_commit_crash_rolls_forward_via_sidecar() {
// behind is closed.
let db = Omnigraph::open(&uri).await.unwrap();
assert_eq!(
db.schema_source(),
db.schema_source().as_str(),
SCHEMA_V2_ADDED_TYPE,
"live schema must reflect the rolled-forward apply (Company added)"
);
@ -143,7 +119,7 @@ async fn schema_apply_recovers_post_commit_crash() {
let uri = dir.path().to_str().unwrap().to_string();
{
let mut db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
let _failpoint = ScopedFailPoint::new("schema_apply.after_manifest_commit", "return");
let err = db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap_err();
assert!(
@ -157,7 +133,7 @@ async fn schema_apply_recovers_post_commit_crash() {
// Reopen — manifest is at the new version, so recovery sweep should
// complete the rename and the live schema matches v2.
let db = Omnigraph::open(&uri).await.unwrap();
assert_eq!(db.schema_source(), SCHEMA_V2_ADDED_TYPE);
assert_eq!(db.schema_source().as_str(), SCHEMA_V2_ADDED_TYPE);
assert_no_staging_files(dir.path());
}
@ -172,7 +148,7 @@ async fn schema_apply_recovers_partial_rename() {
let uri = dir.path().to_str().unwrap().to_string();
{
let mut db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap();
}
@ -192,7 +168,7 @@ async fn schema_apply_recovers_partial_rename() {
// Reopen — recovery should complete the rename (overwriting final files
// with identical staging content) and remove the staging files.
let db = Omnigraph::open(&uri).await.unwrap();
assert_eq!(db.schema_source(), SCHEMA_V2_ADDED_TYPE);
assert_eq!(db.schema_source().as_str(), SCHEMA_V2_ADDED_TYPE);
assert_no_staging_files(dir.path());
}
@ -312,6 +288,85 @@ async fn recovery_rolls_forward_after_finalize_publisher_failure() {
);
}
#[tokio::test]
async fn inline_delete_conflict_writes_sidecar_before_rejecting() {
let _scenario = FailScenario::setup();
let dir = tempfile::tempdir().unwrap();
let uri = dir.path().to_str().unwrap().to_string();
let db = helpers::init_and_load(&dir).await;
let pre_snapshot = db
.snapshot_of(omnigraph::db::ReadTarget::branch("main"))
.await
.unwrap();
let pre_person_pin = pre_snapshot.entry("node:Person").unwrap().table_version;
let person_uri = node_table_uri(&uri, "Person");
{
let _pause_delete = ScopedFailPoint::new("mutation.delete_node_pre_primary_delete", "pause");
let delete_params = helpers::params(&[("$name", "Alice")]);
let delete = db.mutate(
"main",
MUTATION_QUERIES,
"remove_person",
&delete_params,
);
tokio::pin!(delete);
let mut concurrent_update_succeeded = false;
for _ in 0..50 {
if delete.as_mut().now_or_never().is_some() {
panic!("delete mutation completed before primary-delete failpoint was released");
}
let mut concurrent = Omnigraph::open_read_only(&uri).await.unwrap();
if mutate_main(
&mut concurrent,
MUTATION_QUERIES,
"set_age",
&mixed_params(&[("$name", "Bob")], &[("$age", 26)]),
)
.await
.is_ok()
{
concurrent_update_succeeded = true;
break;
}
tokio::time::sleep(std::time::Duration::from_millis(20)).await;
}
assert!(concurrent_update_succeeded, "concurrent update must land while delete is paused");
fail::remove("mutation.delete_node_pre_primary_delete");
let err = delete.await.unwrap_err();
assert!(
err.to_string().contains("stale view of 'node:Person'")
|| err.to_string().contains("ExpectedVersionMismatch")
|| err.to_string().contains("expected version mismatch"),
"unexpected error: {err}",
);
}
let person_head = lance::Dataset::open(&person_uri)
.await
.unwrap()
.version()
.version;
assert!(
person_head > pre_person_pin,
"primary inline delete must have advanced node:Person before rejecting"
);
let db = Omnigraph::open(&uri).await.unwrap();
assert_eq!(
helpers::count_rows(&db, "node:Person").await,
4,
"manifest-conflicted delete must not remove net Person rows after recovery"
);
assert_eq!(
helpers::count_rows(&db, "edge:Knows").await,
3,
"manifest-conflicted delete must not remove net Knows rows after recovery"
);
}
#[tokio::test]
async fn recovery_rolls_forward_load_on_feature_branch() {
use omnigraph::loader::LoadMode;
@ -324,7 +379,7 @@ async fn recovery_rolls_forward_load_on_feature_branch() {
let feature_parent_commit_id;
{
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
let db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
db.branch_create("feature").await.unwrap();
db.mutate(
"feature",
@ -929,7 +984,7 @@ async fn schema_apply_without_schema_staging_rolls_back_on_next_open() {
};
{
let mut db = Omnigraph::open(&uri).await.unwrap();
let db = Omnigraph::open(&uri).await.unwrap();
let _failpoint = ScopedFailPoint::new("schema_apply.before_staging_write", "return");
let v2_schema = r#"node Person {
name: String @key
@ -1029,7 +1084,7 @@ async fn schema_apply_phase_b_failure_recovered_on_next_open() {
// (Lance HEAD advanced) AND AFTER the schema-state staging files are
// written, but BEFORE the manifest publish. The recovery sidecar persists.
{
let mut db = Omnigraph::open(&uri).await.unwrap();
let db = Omnigraph::open(&uri).await.unwrap();
let _failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "return");
// v2 schema: add a `city` property to Person AND add a new
// `Tag` node type. The new property triggers the rewritten_tables
@ -1191,7 +1246,7 @@ async fn branch_merge_phase_b_failure_recovered_on_next_open() {
// Setup: failpoint fires after the per-table publish loop completes
// but before commit_manifest_updates. Sidecar persists.
{
let mut db = Omnigraph::open(&uri).await.unwrap();
let db = Omnigraph::open(&uri).await.unwrap();
let _failpoint =
ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return");
let err = db.branch_merge("feature", "main").await.unwrap_err();
@ -1367,7 +1422,7 @@ async fn branch_merge_phase_b_failure_recovered_on_non_main_target() {
// but before commit_manifest_updates. Sidecar persists with
// branch=Some("target_branch").
{
let mut db = Omnigraph::open(&uri).await.unwrap();
let db = Omnigraph::open(&uri).await.unwrap();
let _failpoint =
ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return");
let err = db
@ -1468,7 +1523,7 @@ async fn branch_merge_sidecar_pins_table_branch_to_active_branch() {
}
{
let mut db = Omnigraph::open(&uri).await.unwrap();
let db = Omnigraph::open(&uri).await.unwrap();
let _failpoint =
ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return");
let _ = db
@ -1559,7 +1614,7 @@ async fn ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_neede
// that genuinely need work); no sidecar is written. The failpoint
// still fires, surfacing the Err.
{
let mut db = Omnigraph::open(&uri).await.unwrap();
let db = Omnigraph::open(&uri).await.unwrap();
let _failpoint =
ScopedFailPoint::new("ensure_indices.post_phase_b_pre_manifest_commit", "return");
let err = db.ensure_indices().await.unwrap_err();

View file

@ -860,6 +860,13 @@ async fn lance_restore_appends_one_commit_with_checked_out_content() {
/// tables before invoking restore — otherwise this hazard becomes
/// reachable during in-flight tenant traffic.
///
/// MR-686 introduces those per-(table_key, branch) writer queues as the
/// application-layer mechanism that closes this hazard once continuous
/// in-process recovery (MR-870) lands. Until MR-686's queue is wired into
/// the recovery path, the open-time-only invocation strategy is the
/// only thing keeping this hazard out of production. See
/// `docs/invariants.md` §VI.30, §VI.32, §VI.33.
///
/// This test is the load-bearing constraint any future reconciler must
/// honor.
#[tokio::test]