mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-06-15 01:55:13 +02:00
Merge main into blob merge fix
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
This commit is contained in:
commit
da89e18e62
38 changed files with 4380 additions and 534 deletions
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "omnigraph-cli"
|
||||
version = "0.4.1"
|
||||
version = "0.4.2"
|
||||
edition = "2024"
|
||||
description = "CLI for the Omnigraph graph database."
|
||||
license = "MIT"
|
||||
|
|
@ -13,9 +13,9 @@ name = "omnigraph"
|
|||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
||||
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.4.1" }
|
||||
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.1" }
|
||||
omnigraph-server = { path = "../omnigraph-server", version = "0.4.1" }
|
||||
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.4.2" }
|
||||
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.2" }
|
||||
omnigraph-server = { path = "../omnigraph-server", version = "0.4.2" }
|
||||
clap = { workspace = true }
|
||||
color-eyre = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
|
|
|
|||
|
|
@ -1425,7 +1425,7 @@ async fn execute_query_lint(
|
|||
let uri = resolve_local_uri(config, cli_uri, cli_target, "query lint")?;
|
||||
let db = Omnigraph::open(&uri).await?;
|
||||
Ok(lint_query_file(
|
||||
db.catalog(),
|
||||
&db.catalog(),
|
||||
&query_source,
|
||||
query_path,
|
||||
QueryLintSchemaSource::repo(uri),
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "omnigraph-compiler"
|
||||
version = "0.4.1"
|
||||
version = "0.4.2"
|
||||
edition = "2024"
|
||||
description = "Schema/query compiler for Omnigraph. Zero Lance dependency."
|
||||
license = "MIT"
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "omnigraph-server"
|
||||
version = "0.4.1"
|
||||
version = "0.4.2"
|
||||
edition = "2024"
|
||||
description = "HTTP server for the Omnigraph graph database."
|
||||
license = "MIT"
|
||||
|
|
@ -19,8 +19,8 @@ default = []
|
|||
aws = ["dep:aws-config", "dep:aws-sdk-secretsmanager"]
|
||||
|
||||
[dependencies]
|
||||
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.4.1" }
|
||||
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.1" }
|
||||
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.4.2" }
|
||||
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.2" }
|
||||
axum = { workspace = true }
|
||||
clap = { workspace = true }
|
||||
color-eyre = { workspace = true }
|
||||
|
|
@ -37,6 +37,7 @@ futures = { workspace = true }
|
|||
sha2 = { workspace = true }
|
||||
subtle = { workspace = true }
|
||||
async-trait = { workspace = true }
|
||||
dashmap = "6"
|
||||
aws-config = { version = "1", optional = true, default-features = false, features = ["rustls", "rt-tokio", "credentials-process", "sso"] }
|
||||
aws-sdk-secretsmanager = { version = "1", optional = true, default-features = false, features = ["rustls", "rt-tokio"] }
|
||||
|
||||
|
|
|
|||
392
crates/omnigraph-server/examples/bench_actor_isolation.rs
Normal file
392
crates/omnigraph-server/examples/bench_actor_isolation.rs
Normal file
|
|
@ -0,0 +1,392 @@
|
|||
//! Actor-isolation benchmark for MR-686's `WorkloadController`.
|
||||
//!
|
||||
//! The handoff calls this out as the empirical proof of MR-686's central
|
||||
//! design promise: per-actor admission control isolates noisy actors so a
|
||||
//! heavy `/ingest` user does not starve light `/change` traffic. The
|
||||
//! per-`(table, branch)` queue pins the same-key serialization story; this
|
||||
//! bench pins actor isolation under load.
|
||||
//!
|
||||
//! Setup:
|
||||
//! - One "heavy" actor flooding `/ingest` with multi-row NDJSON bodies.
|
||||
//! - N "light" actors each running short bursts of `/change` inserts.
|
||||
//! - Each actor authenticates with its own bearer token so the
|
||||
//! `WorkloadController` accounts them as distinct identities.
|
||||
//!
|
||||
//! Output: heavy-actor throughput / 429s, light-actor p50 / p95 / p99
|
||||
//! latency. Acceptance heuristic on local FS: light-actor p99 < 2 s
|
||||
//! while the heavy actor saturates its own per-actor cap.
|
||||
//!
|
||||
//! Usage:
|
||||
//! ```sh
|
||||
//! cargo run --release -p omnigraph-server --example bench_actor_isolation -- \
|
||||
//! --light-actors 4 --light-ops-per-actor 50 \
|
||||
//! --heavy-batches 200 --heavy-rows-per-batch 200 \
|
||||
//! --inflight-cap 1 \
|
||||
//! --output bench-results/after-pr2-phase2/actor-isolation.json
|
||||
//! ```
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use axum::Router;
|
||||
use axum::body::{Body, to_bytes};
|
||||
use axum::http::{Method, Request, StatusCode};
|
||||
use clap::Parser;
|
||||
use omnigraph::db::Omnigraph;
|
||||
use omnigraph_server::api::{ChangeRequest, IngestRequest};
|
||||
use omnigraph_server::workload::WorkloadController;
|
||||
use omnigraph_server::{AppState, build_app};
|
||||
use serde::Serialize;
|
||||
use tower::ServiceExt;
|
||||
|
||||
const SCHEMA: &str = "node Person {\n name: String @key\n age: I32?\n}\n";
|
||||
|
||||
const HEAVY_TOKEN: &str = "heavy-actor-token";
|
||||
const HEAVY_ACTOR: &str = "act-heavy";
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(about = "Actor-isolation HTTP bench for MR-686 WorkloadController")]
|
||||
struct Args {
|
||||
/// Number of light actors driving /change traffic concurrently with the
|
||||
/// heavy /ingest flood. Each gets its own bearer token.
|
||||
#[arg(long, default_value_t = 4)]
|
||||
light_actors: usize,
|
||||
/// Number of /change ops per light actor.
|
||||
#[arg(long, default_value_t = 50)]
|
||||
light_ops_per_actor: usize,
|
||||
/// Number of /ingest batches the heavy actor sends.
|
||||
#[arg(long, default_value_t = 200)]
|
||||
heavy_batches: usize,
|
||||
/// NDJSON rows per heavy /ingest batch.
|
||||
#[arg(long, default_value_t = 200)]
|
||||
heavy_rows_per_batch: usize,
|
||||
/// Concurrent in-flight /ingest tasks the heavy actor maintains. With
|
||||
/// `inflight_cap` smaller than this, the heavy actor exercises its own
|
||||
/// admission cap (and the bench reports `heavy_too_many_requests > 0`),
|
||||
/// proving the gate fires without affecting light actors. Default 4
|
||||
/// against cap=1 → expect ~3/4 batches rejected.
|
||||
#[arg(long, default_value_t = 4)]
|
||||
heavy_concurrency: usize,
|
||||
/// Per-actor in-flight cap for the run. Passed directly into the
|
||||
/// `WorkloadController` constructor (no env-var fiddling). Lower
|
||||
/// values surface admission rejections faster.
|
||||
#[arg(long, default_value_t = 1)]
|
||||
inflight_cap: u32,
|
||||
/// Per-actor byte budget (bytes). Default 1 GiB so byte budget
|
||||
/// doesn't bottleneck the count gate during normal bench runs.
|
||||
#[arg(long, default_value_t = 1_073_741_824)]
|
||||
byte_cap: u64,
|
||||
/// Output file for the JSON results. Stdout always gets a copy.
|
||||
#[arg(long)]
|
||||
output: Option<PathBuf>,
|
||||
/// Optional label to record alongside results.
|
||||
#[arg(long, default_value = "")]
|
||||
label: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug)]
|
||||
struct BenchResults {
|
||||
label: String,
|
||||
inflight_cap: u32,
|
||||
light_actors: usize,
|
||||
light_ops_per_actor: usize,
|
||||
heavy_batches: usize,
|
||||
heavy_rows_per_batch: usize,
|
||||
wall_time_ms: u64,
|
||||
heavy_ok: usize,
|
||||
heavy_too_many_requests: usize,
|
||||
heavy_other_errors: usize,
|
||||
heavy_throughput_attempts_per_sec: f64,
|
||||
light_ok: usize,
|
||||
light_too_many_requests: usize,
|
||||
light_other_errors: usize,
|
||||
light_p50_ms: f64,
|
||||
light_p95_ms: f64,
|
||||
light_p99_ms: f64,
|
||||
light_p999_ms: f64,
|
||||
light_max_ms: f64,
|
||||
notes: &'static str,
|
||||
}
|
||||
|
||||
fn build_heavy_body(batch_idx: usize, rows: usize) -> String {
|
||||
let mut data = String::new();
|
||||
for r in 0..rows {
|
||||
data.push_str(&format!(
|
||||
"{{\"type\":\"Person\",\"data\":{{\"name\":\"heavy-b{}-r{}\",\"age\":{}}}}}\n",
|
||||
batch_idx,
|
||||
r,
|
||||
r % 100,
|
||||
));
|
||||
}
|
||||
serde_json::to_string(&IngestRequest {
|
||||
data,
|
||||
branch: Some("main".to_string()),
|
||||
from: Some("main".to_string()),
|
||||
mode: Some(omnigraph::loader::LoadMode::Merge),
|
||||
})
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
async fn send_heavy_batch(app: Router, batch_idx: usize, rows: usize) -> StatusCode {
|
||||
let body = build_heavy_body(batch_idx, rows);
|
||||
let req = Request::builder()
|
||||
.method(Method::POST)
|
||||
.uri("/ingest")
|
||||
.header("authorization", format!("Bearer {HEAVY_TOKEN}"))
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(body))
|
||||
.unwrap();
|
||||
match app.oneshot(req).await {
|
||||
Ok(r) => r.status(),
|
||||
Err(_) => StatusCode::INTERNAL_SERVER_ERROR,
|
||||
}
|
||||
}
|
||||
|
||||
/// Drive `batches` /ingest calls from the heavy actor with up to
|
||||
/// `concurrency` in flight at a time. With `concurrency > inflight_cap`,
|
||||
/// the heavy actor's own admission permits are exhausted at peak, and
|
||||
/// some batches return 429. Returns (ok, 429, other) counts.
|
||||
async fn drive_heavy_actor(
|
||||
app: Router,
|
||||
batches: usize,
|
||||
rows_per_batch: usize,
|
||||
concurrency: usize,
|
||||
) -> (usize, usize, usize) {
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
// Asserted at startup in `main()`; check again here for defense in
|
||||
// depth so a future caller can't pass 0 silently.
|
||||
assert!(concurrency > 0, "drive_heavy_actor concurrency must be > 0");
|
||||
let limiter = Arc::new(Semaphore::new(concurrency));
|
||||
let mut handles = Vec::with_capacity(batches);
|
||||
for b in 0..batches {
|
||||
let app = app.clone();
|
||||
let limiter = Arc::clone(&limiter);
|
||||
handles.push(tokio::spawn(async move {
|
||||
// Bound concurrency to `concurrency`; this is the bench's
|
||||
// own pacing, not the server's admission control. The
|
||||
// server's `WorkloadController` is what we're trying to
|
||||
// exercise — and it has its own cap (potentially smaller).
|
||||
let _permit = limiter.acquire_owned().await.unwrap();
|
||||
send_heavy_batch(app, b, rows_per_batch).await
|
||||
}));
|
||||
}
|
||||
|
||||
let mut ok = 0usize;
|
||||
let mut too_many = 0usize;
|
||||
let mut other = 0usize;
|
||||
for h in handles {
|
||||
match h.await.unwrap_or(StatusCode::INTERNAL_SERVER_ERROR) {
|
||||
StatusCode::OK => ok += 1,
|
||||
StatusCode::TOO_MANY_REQUESTS => too_many += 1,
|
||||
_ => other += 1,
|
||||
}
|
||||
}
|
||||
(ok, too_many, other)
|
||||
}
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
async fn drive_light_actor(
|
||||
app: Router,
|
||||
token: String,
|
||||
actor_idx: usize,
|
||||
ops: usize,
|
||||
) -> (Vec<Duration>, usize, usize, usize) {
|
||||
let mut latencies = Vec::with_capacity(ops);
|
||||
let mut ok = 0usize;
|
||||
let mut too_many = 0usize;
|
||||
let mut other = 0usize;
|
||||
for op_idx in 0..ops {
|
||||
let request_body = ChangeRequest {
|
||||
query_source: "query insert_person($name: String, $age: I32) {\n insert Person { name: $name, age: $age }\n}".to_string(),
|
||||
query_name: Some("insert_person".to_string()),
|
||||
params: Some(serde_json::json!({
|
||||
"name": format!("light-{actor_idx}-{op_idx}"),
|
||||
"age": op_idx as i32,
|
||||
})),
|
||||
branch: Some("main".to_string()),
|
||||
};
|
||||
let body = serde_json::to_vec(&request_body).unwrap();
|
||||
let req = Request::builder()
|
||||
.method(Method::POST)
|
||||
.uri("/change")
|
||||
.header("authorization", format!("Bearer {token}"))
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(body))
|
||||
.unwrap();
|
||||
let start = Instant::now();
|
||||
let response = match app.clone().oneshot(req).await {
|
||||
Ok(r) => r,
|
||||
Err(_) => {
|
||||
other += 1;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let elapsed = start.elapsed();
|
||||
match response.status() {
|
||||
StatusCode::OK => {
|
||||
ok += 1;
|
||||
latencies.push(elapsed);
|
||||
}
|
||||
StatusCode::TOO_MANY_REQUESTS => {
|
||||
too_many += 1;
|
||||
// Drain to free the body resource.
|
||||
let _ = to_bytes(response.into_body(), 16 * 1024).await;
|
||||
}
|
||||
_ => {
|
||||
other += 1;
|
||||
let _ = to_bytes(response.into_body(), 16 * 1024).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
(latencies, ok, too_many, other)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let args = Args::parse();
|
||||
if args.light_actors == 0 || args.light_ops_per_actor == 0 || args.heavy_batches == 0 {
|
||||
eprintln!("--light-actors, --light-ops-per-actor, --heavy-batches must all be > 0");
|
||||
std::process::exit(2);
|
||||
}
|
||||
if args.heavy_concurrency == 0 {
|
||||
eprintln!(
|
||||
"--heavy-concurrency must be > 0 (zero would prevent the heavy actor from \
|
||||
ever firing a batch; if you want to disable heavy traffic, set --heavy-batches=0)"
|
||||
);
|
||||
std::process::exit(2);
|
||||
}
|
||||
|
||||
let temp = tempfile::tempdir().expect("tempdir");
|
||||
let repo = temp.path().join("bench.omni");
|
||||
Omnigraph::init(repo.to_str().unwrap(), SCHEMA)
|
||||
.await
|
||||
.expect("init repo");
|
||||
|
||||
// Build bearer tokens: one for the heavy actor + one per light actor.
|
||||
let mut tokens: Vec<(String, String)> =
|
||||
vec![(HEAVY_ACTOR.to_string(), HEAVY_TOKEN.to_string())];
|
||||
for i in 0..args.light_actors {
|
||||
tokens.push((format!("act-light-{i}"), format!("light-token-{i}")));
|
||||
}
|
||||
let db = Omnigraph::open(repo.to_str().unwrap())
|
||||
.await
|
||||
.expect("open repo");
|
||||
// Construct a custom WorkloadController with the requested caps and
|
||||
// pass it through `AppState::new_with_workload`. Avoids the
|
||||
// `unsafe { std::env::set_var(...) }` antipattern that violates
|
||||
// `setenv`'s thread-safety precondition once the multi-thread tokio
|
||||
// runtime is up.
|
||||
let workload = WorkloadController::new(args.inflight_cap, args.byte_cap);
|
||||
let state = AppState::new_with_workload(
|
||||
repo.to_string_lossy().to_string(),
|
||||
db,
|
||||
tokens,
|
||||
workload,
|
||||
);
|
||||
let app = build_app(state);
|
||||
|
||||
eprintln!(
|
||||
"running heavy={}x{} (concurrency={}) light={}x{} cap={}",
|
||||
args.heavy_batches,
|
||||
args.heavy_rows_per_batch,
|
||||
args.heavy_concurrency,
|
||||
args.light_actors,
|
||||
args.light_ops_per_actor,
|
||||
args.inflight_cap,
|
||||
);
|
||||
|
||||
let start = Instant::now();
|
||||
let heavy_app = app.clone();
|
||||
let heavy_concurrency = args.heavy_concurrency;
|
||||
let heavy_handle = tokio::spawn(async move {
|
||||
drive_heavy_actor(
|
||||
heavy_app,
|
||||
args.heavy_batches,
|
||||
args.heavy_rows_per_batch,
|
||||
heavy_concurrency,
|
||||
)
|
||||
.await
|
||||
});
|
||||
|
||||
let mut light_handles = Vec::with_capacity(args.light_actors);
|
||||
for actor_idx in 0..args.light_actors {
|
||||
let app = app.clone();
|
||||
let token = format!("light-token-{actor_idx}");
|
||||
let ops = args.light_ops_per_actor;
|
||||
light_handles.push(tokio::spawn(async move {
|
||||
drive_light_actor(app, token, actor_idx, ops).await
|
||||
}));
|
||||
}
|
||||
|
||||
let (heavy_ok, heavy_too_many, heavy_other) = heavy_handle.await.expect("heavy task panicked");
|
||||
let mut light_latencies: Vec<Duration> =
|
||||
Vec::with_capacity(args.light_actors * args.light_ops_per_actor);
|
||||
let mut light_ok = 0usize;
|
||||
let mut light_too_many = 0usize;
|
||||
let mut light_other = 0usize;
|
||||
for h in light_handles {
|
||||
let (lats, ok, too_many, other) = h.await.expect("light task panicked");
|
||||
light_latencies.extend(lats);
|
||||
light_ok += ok;
|
||||
light_too_many += too_many;
|
||||
light_other += other;
|
||||
}
|
||||
let wall = start.elapsed();
|
||||
|
||||
light_latencies.sort();
|
||||
let n = light_latencies.len();
|
||||
let pct = |p: f64| -> f64 {
|
||||
if n == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
let idx = ((n as f64 - 1.0) * p).round() as usize;
|
||||
light_latencies[idx].as_secs_f64() * 1000.0
|
||||
};
|
||||
let max_ms = light_latencies
|
||||
.last()
|
||||
.map(|d| d.as_secs_f64() * 1000.0)
|
||||
.unwrap_or(0.0);
|
||||
let heavy_throughput = if wall.as_secs_f64() > 0.0 {
|
||||
args.heavy_batches as f64 / wall.as_secs_f64()
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let results = BenchResults {
|
||||
label: args.label.clone(),
|
||||
inflight_cap: args.inflight_cap,
|
||||
light_actors: args.light_actors,
|
||||
light_ops_per_actor: args.light_ops_per_actor,
|
||||
heavy_batches: args.heavy_batches,
|
||||
heavy_rows_per_batch: args.heavy_rows_per_batch,
|
||||
wall_time_ms: wall.as_millis() as u64,
|
||||
heavy_ok,
|
||||
heavy_too_many_requests: heavy_too_many,
|
||||
heavy_other_errors: heavy_other,
|
||||
heavy_throughput_attempts_per_sec: heavy_throughput,
|
||||
light_ok,
|
||||
light_too_many_requests: light_too_many,
|
||||
light_other_errors: light_other,
|
||||
light_p50_ms: pct(0.50),
|
||||
light_p95_ms: pct(0.95),
|
||||
light_p99_ms: pct(0.99),
|
||||
light_p999_ms: pct(0.999),
|
||||
light_max_ms: max_ms,
|
||||
notes: "MR-686 actor-isolation bench. Heavy /ingest + light /change concurrent.",
|
||||
};
|
||||
|
||||
let json = serde_json::to_string_pretty(&results).unwrap();
|
||||
println!("{json}");
|
||||
|
||||
if let Some(path) = args.output.as_ref() {
|
||||
if let Some(parent) = path.parent()
|
||||
&& !parent.as_os_str().is_empty()
|
||||
{
|
||||
std::fs::create_dir_all(parent).expect("mkdir output parent");
|
||||
}
|
||||
std::fs::write(path, &json).expect("write output");
|
||||
eprintln!("wrote {}", path.display());
|
||||
}
|
||||
}
|
||||
267
crates/omnigraph-server/examples/bench_concurrent_http.rs
Normal file
267
crates/omnigraph-server/examples/bench_concurrent_http.rs
Normal file
|
|
@ -0,0 +1,267 @@
|
|||
//! Server-level concurrent HTTP benchmark for MR-686 (PR 0 baseline).
|
||||
//!
|
||||
//! Drives concurrent `/change` requests against an in-process Omnigraph HTTP
|
||||
//! server. Measures the global `Arc<RwLock<Omnigraph>>` lock penalty on
|
||||
//! current `main` so PR 1 + PR 2 can be evaluated against a real baseline.
|
||||
//!
|
||||
//! Per the MR-686 plan: this is the load-bearing bench. `Omnigraph::mutate_as`
|
||||
//! is `&mut self`, so an engine-level concurrent bench either serializes on the
|
||||
//! borrow checker (measures nothing) or drives multiple handles (measures Lance
|
||||
//! contention, not the server bottleneck). Driving the HTTP server is the only
|
||||
//! way to measure the actual `RwLock<Omnigraph>` contention this work removes.
|
||||
//!
|
||||
//! Usage:
|
||||
//! ```sh
|
||||
//! cargo run --release -p omnigraph-server --example bench_concurrent_http -- \
|
||||
//! --tables 16 --actors 16 --ops-per-actor 1000 --mode disjoint \
|
||||
//! --output bench-results/baseline-main/cross-table.json
|
||||
//! ```
|
||||
//!
|
||||
//! Modes:
|
||||
//! - `disjoint`: each actor writes to a distinct node type (cross-table fanout)
|
||||
//! - `same-key`: all actors write to the same node type (hot-key contention)
|
||||
//! - `mixed`: each actor writes to a different table per op (round-robin)
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use axum::Router;
|
||||
use axum::body::{Body, to_bytes};
|
||||
use axum::http::{Method, Request, StatusCode};
|
||||
use clap::{Parser, ValueEnum};
|
||||
use omnigraph::db::Omnigraph;
|
||||
use omnigraph_server::api::ChangeRequest;
|
||||
use omnigraph_server::{AppState, build_app};
|
||||
use serde::Serialize;
|
||||
use tower::ServiceExt;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(about = "Concurrent HTTP bench for MR-686")]
|
||||
struct Args {
|
||||
/// Number of distinct node types in the schema.
|
||||
#[arg(long, default_value_t = 16)]
|
||||
tables: usize,
|
||||
/// Number of concurrent actors driving requests.
|
||||
#[arg(long, default_value_t = 16)]
|
||||
actors: usize,
|
||||
/// Operations per actor.
|
||||
#[arg(long, default_value_t = 100)]
|
||||
ops_per_actor: usize,
|
||||
/// Workload mode.
|
||||
#[arg(long, value_enum, default_value_t = Mode::Disjoint)]
|
||||
mode: Mode,
|
||||
/// Output file for the JSON results. Stdout always gets a copy.
|
||||
#[arg(long)]
|
||||
output: Option<PathBuf>,
|
||||
/// Optional label to record alongside results (e.g. "baseline-main").
|
||||
#[arg(long, default_value = "")]
|
||||
label: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, ValueEnum, Serialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
enum Mode {
|
||||
Disjoint,
|
||||
SameKey,
|
||||
Mixed,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug)]
|
||||
struct BenchResults {
|
||||
label: String,
|
||||
mode: Mode,
|
||||
tables: usize,
|
||||
actors: usize,
|
||||
ops_per_actor: usize,
|
||||
total_ops: usize,
|
||||
error_count: usize,
|
||||
wall_time_ms: u64,
|
||||
throughput_ops_per_sec: f64,
|
||||
p50_ms: f64,
|
||||
p95_ms: f64,
|
||||
p99_ms: f64,
|
||||
p999_ms: f64,
|
||||
max_ms: f64,
|
||||
notes: &'static str,
|
||||
}
|
||||
|
||||
fn build_schema(num_tables: usize) -> String {
|
||||
let mut schema = String::new();
|
||||
for i in 0..num_tables {
|
||||
schema.push_str(&format!(
|
||||
"node Item{i} {{\n name: String @key\n value: I32?\n}}\n\n"
|
||||
));
|
||||
}
|
||||
schema
|
||||
}
|
||||
|
||||
fn build_query_source(table_idx: usize) -> String {
|
||||
format!(
|
||||
"query insert_item($name: String, $value: I32) {{\n insert Item{table_idx} {{ name: $name, value: $value }}\n}}"
|
||||
)
|
||||
}
|
||||
|
||||
fn pick_table(actor_idx: usize, op_idx: usize, mode: Mode, num_tables: usize) -> usize {
|
||||
match mode {
|
||||
Mode::Disjoint => actor_idx % num_tables,
|
||||
Mode::SameKey => 0,
|
||||
Mode::Mixed => (actor_idx.wrapping_mul(7919) ^ op_idx) % num_tables,
|
||||
}
|
||||
}
|
||||
|
||||
async fn drive_actor(
|
||||
app: Router,
|
||||
actor_idx: usize,
|
||||
ops: usize,
|
||||
mode: Mode,
|
||||
num_tables: usize,
|
||||
) -> (Vec<Duration>, usize) {
|
||||
let mut latencies = Vec::with_capacity(ops);
|
||||
let mut errors = 0usize;
|
||||
for op_idx in 0..ops {
|
||||
let table_idx = pick_table(actor_idx, op_idx, mode, num_tables);
|
||||
let request_body = ChangeRequest {
|
||||
query_source: build_query_source(table_idx),
|
||||
query_name: Some("insert_item".to_string()),
|
||||
params: Some(serde_json::json!({
|
||||
"name": format!("a{actor_idx}_o{op_idx}"),
|
||||
"value": op_idx as i32,
|
||||
})),
|
||||
branch: None,
|
||||
};
|
||||
let body = serde_json::to_vec(&request_body).unwrap();
|
||||
let req = Request::builder()
|
||||
.method(Method::POST)
|
||||
.uri("/change")
|
||||
.header("content-type", "application/json")
|
||||
.body(Body::from(body))
|
||||
.unwrap();
|
||||
|
||||
let start = Instant::now();
|
||||
let response = match app.clone().oneshot(req).await {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
eprintln!("actor {actor_idx} op {op_idx} transport error: {e:?}");
|
||||
errors += 1;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let elapsed = start.elapsed();
|
||||
let status = response.status();
|
||||
if status != StatusCode::OK {
|
||||
errors += 1;
|
||||
// Drain body for logging on the first few failures.
|
||||
if errors <= 3 {
|
||||
let body = to_bytes(response.into_body(), 64 * 1024).await.unwrap_or_default();
|
||||
eprintln!(
|
||||
"actor {actor_idx} op {op_idx} status {status} body {}",
|
||||
String::from_utf8_lossy(&body)
|
||||
);
|
||||
}
|
||||
}
|
||||
latencies.push(elapsed);
|
||||
}
|
||||
(latencies, errors)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let args = Args::parse();
|
||||
if args.tables == 0 || args.actors == 0 || args.ops_per_actor == 0 {
|
||||
eprintln!("--tables, --actors, --ops-per-actor must all be > 0");
|
||||
std::process::exit(2);
|
||||
}
|
||||
|
||||
let temp = tempfile::tempdir().expect("tempdir");
|
||||
let repo = temp.path().join("bench.omni");
|
||||
let schema = build_schema(args.tables);
|
||||
Omnigraph::init(repo.to_str().unwrap(), &schema)
|
||||
.await
|
||||
.expect("init repo");
|
||||
|
||||
let state = AppState::open(repo.to_string_lossy().to_string())
|
||||
.await
|
||||
.expect("open AppState");
|
||||
let app = build_app(state);
|
||||
|
||||
eprintln!(
|
||||
"running mode={:?} tables={} actors={} ops_per_actor={}",
|
||||
args.mode, args.tables, args.actors, args.ops_per_actor
|
||||
);
|
||||
|
||||
let start = Instant::now();
|
||||
let mut handles = Vec::with_capacity(args.actors);
|
||||
for actor_idx in 0..args.actors {
|
||||
let app = app.clone();
|
||||
let mode = args.mode;
|
||||
let ops = args.ops_per_actor;
|
||||
let num_tables = args.tables;
|
||||
handles.push(tokio::spawn(async move {
|
||||
drive_actor(app, actor_idx, ops, mode, num_tables).await
|
||||
}));
|
||||
}
|
||||
|
||||
let mut all_latencies: Vec<Duration> = Vec::with_capacity(args.actors * args.ops_per_actor);
|
||||
let mut total_errors = 0usize;
|
||||
for h in handles {
|
||||
let (lats, errs) = h.await.expect("actor task panicked");
|
||||
all_latencies.extend(lats);
|
||||
total_errors += errs;
|
||||
}
|
||||
let wall = start.elapsed();
|
||||
|
||||
all_latencies.sort();
|
||||
let n = all_latencies.len();
|
||||
let pct = |p: f64| -> f64 {
|
||||
if n == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
let idx = ((n as f64 - 1.0) * p).round() as usize;
|
||||
all_latencies[idx].as_secs_f64() * 1000.0
|
||||
};
|
||||
let max_ms = all_latencies
|
||||
.last()
|
||||
.map(|d| d.as_secs_f64() * 1000.0)
|
||||
.unwrap_or(0.0);
|
||||
let throughput = if wall.as_secs_f64() > 0.0 {
|
||||
n as f64 / wall.as_secs_f64()
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let results = BenchResults {
|
||||
label: args.label.clone(),
|
||||
mode: args.mode,
|
||||
tables: args.tables,
|
||||
actors: args.actors,
|
||||
ops_per_actor: args.ops_per_actor,
|
||||
total_ops: n,
|
||||
error_count: total_errors,
|
||||
wall_time_ms: wall.as_millis() as u64,
|
||||
throughput_ops_per_sec: throughput,
|
||||
p50_ms: pct(0.50),
|
||||
p95_ms: pct(0.95),
|
||||
p99_ms: pct(0.99),
|
||||
p999_ms: pct(0.999),
|
||||
max_ms,
|
||||
notes: "MR-686 PR 0 baseline. Drives /change via Tower oneshot.",
|
||||
};
|
||||
|
||||
let json = serde_json::to_string_pretty(&results).unwrap();
|
||||
println!("{json}");
|
||||
|
||||
if let Some(path) = args.output.as_ref() {
|
||||
if let Some(parent) = path.parent()
|
||||
&& !parent.as_os_str().is_empty()
|
||||
{
|
||||
std::fs::create_dir_all(parent).expect("mkdir output parent");
|
||||
}
|
||||
std::fs::write(path, &json).expect("write output");
|
||||
eprintln!("wrote {}", path.display());
|
||||
}
|
||||
|
||||
if total_errors > 0 {
|
||||
eprintln!("WARN: {total_errors} requests failed");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
|
@ -339,6 +339,9 @@ pub enum ErrorCode {
|
|||
BadRequest,
|
||||
NotFound,
|
||||
Conflict,
|
||||
/// 429 Too Many Requests — per-actor admission cap exceeded.
|
||||
/// Clients should respect the `Retry-After` header.
|
||||
TooManyRequests,
|
||||
Internal,
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ pub mod api;
|
|||
pub mod auth;
|
||||
pub mod config;
|
||||
pub mod policy;
|
||||
pub mod workload;
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs;
|
||||
|
|
@ -47,7 +48,7 @@ use serde_json::Value;
|
|||
use sha2::{Digest, Sha256};
|
||||
use subtle::ConstantTimeEq;
|
||||
use tokio::net::TcpListener;
|
||||
use tokio::sync::{RwLock, mpsc};
|
||||
use tokio::sync::mpsc;
|
||||
use tower_http::trace::TraceLayer;
|
||||
use tracing::{error, info};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
|
@ -118,7 +119,14 @@ pub struct ServerConfig {
|
|||
#[derive(Clone)]
|
||||
pub struct AppState {
|
||||
uri: String,
|
||||
db: Arc<RwLock<Omnigraph>>,
|
||||
/// PR 2 (MR-686): the engine is now `Arc<Omnigraph>` — no global
|
||||
/// write lock. Concurrent handlers call `&self` engine APIs
|
||||
/// directly. Per-(table, branch) write queues inside the engine
|
||||
/// serialize same-key writers; per-actor admission control on
|
||||
/// `workload` isolates noisy actors.
|
||||
engine: Arc<Omnigraph>,
|
||||
/// Per-actor admission control. See `workload::WorkloadController`.
|
||||
workload: Arc<workload::WorkloadController>,
|
||||
bearer_tokens: Arc<[(BearerTokenHash, Arc<str>)]>,
|
||||
policy_engine: Option<Arc<PolicyEngine>>,
|
||||
}
|
||||
|
|
@ -191,12 +199,36 @@ impl AppState {
|
|||
.collect();
|
||||
Self {
|
||||
uri,
|
||||
db: Arc::new(RwLock::new(db)),
|
||||
engine: Arc::new(db),
|
||||
workload: Arc::new(workload::WorkloadController::from_env()),
|
||||
bearer_tokens: Arc::from(bearer_tokens),
|
||||
policy_engine: policy_engine.map(Arc::new),
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct with a caller-provided [`workload::WorkloadController`].
|
||||
/// Tests and benches use this to override per-actor caps without
|
||||
/// mutating global env vars (which is unsafe in Rust 2024 once the
|
||||
/// async runtime is up — `setenv` isn't thread-safe).
|
||||
pub fn new_with_workload(
|
||||
uri: String,
|
||||
db: Omnigraph,
|
||||
bearer_tokens: Vec<(String, String)>,
|
||||
workload: workload::WorkloadController,
|
||||
) -> Self {
|
||||
let bearer_tokens: Vec<(BearerTokenHash, Arc<str>)> = bearer_tokens
|
||||
.into_iter()
|
||||
.map(|(actor, token)| (hash_bearer_token(&token), Arc::<str>::from(actor)))
|
||||
.collect();
|
||||
Self {
|
||||
uri,
|
||||
engine: Arc::new(db),
|
||||
workload: Arc::new(workload),
|
||||
bearer_tokens: Arc::from(bearer_tokens),
|
||||
policy_engine: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn open(uri: impl Into<String>) -> Result<Self> {
|
||||
Self::open_with_bearer_token(uri, None).await
|
||||
}
|
||||
|
|
@ -331,6 +363,31 @@ impl ApiError {
|
|||
}
|
||||
}
|
||||
|
||||
/// HTTP 429 Too Many Requests — actor exceeded their per-actor
|
||||
/// admission cap (count or byte budget). Clients should respect the
|
||||
/// `Retry-After` header. Mapped from `RejectReason::InFlightCountExceeded`
|
||||
/// and `RejectReason::ByteBudgetExceeded`.
|
||||
pub fn too_many_requests(message: impl Into<String>) -> Self {
|
||||
Self {
|
||||
status: StatusCode::TOO_MANY_REQUESTS,
|
||||
code: ErrorCode::TooManyRequests,
|
||||
message: message.into(),
|
||||
merge_conflicts: Vec::new(),
|
||||
manifest_conflict: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a `WorkloadController` rejection into the matching
|
||||
/// `ApiError` variant.
|
||||
pub fn from_workload_reject(reject: workload::RejectReason) -> Self {
|
||||
match reject {
|
||||
workload::RejectReason::InFlightCountExceeded { .. }
|
||||
| workload::RejectReason::ByteBudgetExceeded { .. } => {
|
||||
Self::too_many_requests(reject.to_string())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_conflict(conflicts: Vec<api::MergeConflictOutput>) -> Self {
|
||||
Self {
|
||||
status: StatusCode::CONFLICT,
|
||||
|
|
@ -418,10 +475,21 @@ fn summarize_merge_conflicts(conflicts: &[api::MergeConflictOutput]) -> String {
|
|||
format!("merge conflicts: {}{}", preview.join("; "), suffix)
|
||||
}
|
||||
|
||||
/// Constant `Retry-After` value (seconds) emitted on 429 responses.
|
||||
const RETRY_AFTER_SECONDS: &str = "60";
|
||||
|
||||
impl IntoResponse for ApiError {
|
||||
fn into_response(self) -> Response {
|
||||
let mut headers = axum::http::HeaderMap::new();
|
||||
if matches!(self.code, ErrorCode::TooManyRequests) {
|
||||
headers.insert(
|
||||
axum::http::header::RETRY_AFTER,
|
||||
axum::http::HeaderValue::from_static(RETRY_AFTER_SECONDS),
|
||||
);
|
||||
}
|
||||
(
|
||||
self.status,
|
||||
headers,
|
||||
Json(ErrorOutput {
|
||||
error: self.message,
|
||||
code: Some(self.code),
|
||||
|
|
@ -674,7 +742,7 @@ async fn server_snapshot(
|
|||
},
|
||||
)?;
|
||||
let snapshot = {
|
||||
let db = Arc::clone(&state.db).read_owned().await;
|
||||
let db = &state.engine;
|
||||
db.snapshot_of(ReadTarget::branch(branch.as_str()))
|
||||
.await
|
||||
.map_err(ApiError::from_omni)?
|
||||
|
|
@ -718,7 +786,7 @@ async fn server_read(
|
|||
let policy_branch = match &target {
|
||||
ReadTarget::Branch(branch) => Some(branch.clone()),
|
||||
ReadTarget::Snapshot(_) if state.policy_engine().is_some() && actor.is_some() => {
|
||||
let db = Arc::clone(&state.db).read_owned().await;
|
||||
let db = &state.engine;
|
||||
db.resolved_branch_of(target.clone())
|
||||
.await
|
||||
.map(|branch| branch.or_else(|| Some("main".to_string())))
|
||||
|
|
@ -746,7 +814,7 @@ async fn server_read(
|
|||
.map_err(|err| ApiError::bad_request(err.to_string()))?;
|
||||
|
||||
let result = {
|
||||
let db = Arc::clone(&state.db).read_owned().await;
|
||||
let db = &state.engine;
|
||||
db.query(
|
||||
target.clone(),
|
||||
&request.query_source,
|
||||
|
|
@ -798,15 +866,15 @@ async fn server_export(
|
|||
target_branch: None,
|
||||
},
|
||||
)?;
|
||||
let db = Arc::clone(&state.db);
|
||||
let engine = Arc::clone(&state.engine);
|
||||
let type_names = request.type_names.clone();
|
||||
let table_keys = request.table_keys.clone();
|
||||
let (tx, rx) = mpsc::unbounded_channel::<std::result::Result<Bytes, io::Error>>();
|
||||
tokio::spawn(async move {
|
||||
let result = {
|
||||
let db = db.read().await;
|
||||
let mut writer = ExportStreamWriter { sender: tx.clone() };
|
||||
db.export_jsonl_to_writer(&branch, &type_names, &table_keys, &mut writer)
|
||||
engine
|
||||
.export_jsonl_to_writer(&branch, &type_names, &table_keys, &mut writer)
|
||||
.await
|
||||
};
|
||||
if let Err(err) = result {
|
||||
|
|
@ -836,6 +904,7 @@ async fn server_export(
|
|||
(status = 401, description = "Unauthorized", body = ErrorOutput),
|
||||
(status = 403, description = "Forbidden", body = ErrorOutput),
|
||||
(status = 409, description = "Merge conflict", body = ErrorOutput),
|
||||
(status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
|
||||
),
|
||||
security(("bearer_token" = [])),
|
||||
)]
|
||||
|
|
@ -851,6 +920,10 @@ async fn server_change(
|
|||
Json(request): Json<ChangeRequest>,
|
||||
) -> std::result::Result<Json<ChangeOutput>, ApiError> {
|
||||
let branch = request.branch.unwrap_or_else(|| "main".to_string());
|
||||
let actor_arc = actor
|
||||
.as_ref()
|
||||
.map(|Extension(actor)| Arc::clone(&actor.0))
|
||||
.unwrap_or_else(|| Arc::<str>::from("anonymous"));
|
||||
let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
|
||||
authorize_request(
|
||||
&state,
|
||||
|
|
@ -862,6 +935,20 @@ async fn server_change(
|
|||
target_branch: None,
|
||||
},
|
||||
)?;
|
||||
// Per-actor admission: bound concurrent in-flight mutations and
|
||||
// estimated bytes per actor. Cedar runs FIRST so denied requests
|
||||
// don't consume admission slots. Estimate uses the request body
|
||||
// size as a coarse proxy; engine memory pressure can run higher.
|
||||
let est_bytes = request.query_source.len() as u64
|
||||
+ request
|
||||
.params
|
||||
.as_ref()
|
||||
.map(|p| p.to_string().len() as u64)
|
||||
.unwrap_or(0);
|
||||
let _admission = state
|
||||
.workload
|
||||
.try_admit(&actor_arc, est_bytes)
|
||||
.map_err(ApiError::from_workload_reject)?;
|
||||
let (selected_name, query_params) =
|
||||
select_named_query(&request.query_source, request.query_name.as_deref())
|
||||
.map_err(|err| ApiError::bad_request(err.to_string()))?;
|
||||
|
|
@ -869,7 +956,7 @@ async fn server_change(
|
|||
.map_err(|err| ApiError::bad_request(err.to_string()))?;
|
||||
|
||||
let result = {
|
||||
let mut db = Arc::clone(&state.db).write_owned().await;
|
||||
let db = &state.engine;
|
||||
db.mutate_as(
|
||||
&branch,
|
||||
&request.query_source,
|
||||
|
|
@ -924,7 +1011,7 @@ async fn server_schema_get(
|
|||
},
|
||||
)?;
|
||||
let schema_source = {
|
||||
let db = Arc::clone(&state.db).read_owned().await;
|
||||
let db = &state.engine;
|
||||
db.schema_source().to_string()
|
||||
};
|
||||
Ok(Json(SchemaOutput { schema_source }))
|
||||
|
|
@ -941,6 +1028,7 @@ async fn server_schema_get(
|
|||
(status = 400, description = "Bad request", body = ErrorOutput),
|
||||
(status = 401, description = "Unauthorized", body = ErrorOutput),
|
||||
(status = 403, description = "Forbidden", body = ErrorOutput),
|
||||
(status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
|
||||
),
|
||||
security(("bearer_token" = [])),
|
||||
)]
|
||||
|
|
@ -955,6 +1043,10 @@ async fn server_schema_apply(
|
|||
actor: Option<Extension<AuthenticatedActor>>,
|
||||
Json(request): Json<SchemaApplyRequest>,
|
||||
) -> std::result::Result<Json<SchemaApplyOutput>, ApiError> {
|
||||
let actor_arc = actor
|
||||
.as_ref()
|
||||
.map(|Extension(actor)| Arc::clone(&actor.0))
|
||||
.unwrap_or_else(|| Arc::<str>::from("anonymous"));
|
||||
let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
|
||||
authorize_request(
|
||||
&state,
|
||||
|
|
@ -966,8 +1058,13 @@ async fn server_schema_apply(
|
|||
target_branch: Some("main".to_string()),
|
||||
},
|
||||
)?;
|
||||
let est_bytes = request.schema_source.len() as u64;
|
||||
let _admission = state
|
||||
.workload
|
||||
.try_admit(&actor_arc, est_bytes)
|
||||
.map_err(ApiError::from_workload_reject)?;
|
||||
let result = {
|
||||
let mut db = Arc::clone(&state.db).write_owned().await;
|
||||
let db = &state.engine;
|
||||
db.apply_schema(&request.schema_source)
|
||||
.await
|
||||
.map_err(ApiError::from_omni)?
|
||||
|
|
@ -986,6 +1083,7 @@ async fn server_schema_apply(
|
|||
(status = 400, description = "Bad request", body = ErrorOutput),
|
||||
(status = 401, description = "Unauthorized", body = ErrorOutput),
|
||||
(status = 403, description = "Forbidden", body = ErrorOutput),
|
||||
(status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
|
||||
),
|
||||
security(("bearer_token" = [])),
|
||||
)]
|
||||
|
|
@ -1004,10 +1102,14 @@ async fn server_ingest(
|
|||
let branch = request.branch.unwrap_or_else(|| "main".to_string());
|
||||
let from = request.from.unwrap_or_else(|| "main".to_string());
|
||||
let mode = request.mode.unwrap_or(omnigraph::loader::LoadMode::Merge);
|
||||
let actor_arc = actor
|
||||
.as_ref()
|
||||
.map(|Extension(actor)| Arc::clone(&actor.0))
|
||||
.unwrap_or_else(|| Arc::<str>::from("anonymous"));
|
||||
let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
|
||||
|
||||
let branch_exists = {
|
||||
let db = Arc::clone(&state.db).read_owned().await;
|
||||
let db = &state.engine;
|
||||
db.branch_list()
|
||||
.await
|
||||
.map_err(ApiError::from_omni)?
|
||||
|
|
@ -1037,9 +1139,14 @@ async fn server_ingest(
|
|||
target_branch: None,
|
||||
},
|
||||
)?;
|
||||
let est_bytes = request.data.len() as u64;
|
||||
let _admission = state
|
||||
.workload
|
||||
.try_admit(&actor_arc, est_bytes)
|
||||
.map_err(ApiError::from_workload_reject)?;
|
||||
|
||||
let result = {
|
||||
let mut db = Arc::clone(&state.db).write_owned().await;
|
||||
let db = &state.engine;
|
||||
db.ingest_as(&branch, Some(&from), &request.data, mode, actor_id)
|
||||
.await
|
||||
.map_err(ApiError::from_omni)?
|
||||
|
|
@ -1085,7 +1192,7 @@ async fn server_branch_list(
|
|||
},
|
||||
)?;
|
||||
let mut branches = {
|
||||
let db = Arc::clone(&state.db).read_owned().await;
|
||||
let db = &state.engine;
|
||||
db.branch_list().await.map_err(ApiError::from_omni)?
|
||||
};
|
||||
branches.sort();
|
||||
|
|
@ -1104,6 +1211,7 @@ async fn server_branch_list(
|
|||
(status = 401, description = "Unauthorized", body = ErrorOutput),
|
||||
(status = 403, description = "Forbidden", body = ErrorOutput),
|
||||
(status = 409, description = "Branch already exists", body = ErrorOutput),
|
||||
(status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
|
||||
),
|
||||
security(("bearer_token" = [])),
|
||||
)]
|
||||
|
|
@ -1118,6 +1226,10 @@ async fn server_branch_create(
|
|||
Json(request): Json<BranchCreateRequest>,
|
||||
) -> std::result::Result<Json<BranchCreateOutput>, ApiError> {
|
||||
let from = request.from.unwrap_or_else(|| "main".to_string());
|
||||
let actor_arc = actor
|
||||
.as_ref()
|
||||
.map(|Extension(actor)| Arc::clone(&actor.0))
|
||||
.unwrap_or_else(|| Arc::<str>::from("anonymous"));
|
||||
authorize_request(
|
||||
&state,
|
||||
actor.as_ref().map(|Extension(actor)| actor),
|
||||
|
|
@ -1131,8 +1243,15 @@ async fn server_branch_create(
|
|||
target_branch: Some(request.name.clone()),
|
||||
},
|
||||
)?;
|
||||
// Branch metadata only — small constant bytes estimate. The Lance
|
||||
// shallow-clone work is bounded by the parent's manifest size, not
|
||||
// the request body.
|
||||
let _admission = state
|
||||
.workload
|
||||
.try_admit(&actor_arc, 256)
|
||||
.map_err(ApiError::from_workload_reject)?;
|
||||
{
|
||||
let mut db = Arc::clone(&state.db).write_owned().await;
|
||||
let db = &state.engine;
|
||||
db.branch_create_from(ReadTarget::branch(&from), &request.name)
|
||||
.await
|
||||
.map_err(ApiError::from_omni)?;
|
||||
|
|
@ -1158,6 +1277,7 @@ async fn server_branch_create(
|
|||
(status = 401, description = "Unauthorized", body = ErrorOutput),
|
||||
(status = 403, description = "Forbidden", body = ErrorOutput),
|
||||
(status = 404, description = "Branch not found", body = ErrorOutput),
|
||||
(status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
|
||||
),
|
||||
security(("bearer_token" = [])),
|
||||
)]
|
||||
|
|
@ -1171,6 +1291,10 @@ async fn server_branch_delete(
|
|||
actor: Option<Extension<AuthenticatedActor>>,
|
||||
Path(branch): Path<String>,
|
||||
) -> std::result::Result<Json<BranchDeleteOutput>, ApiError> {
|
||||
let actor_arc = actor
|
||||
.as_ref()
|
||||
.map(|Extension(actor)| Arc::clone(&actor.0))
|
||||
.unwrap_or_else(|| Arc::<str>::from("anonymous"));
|
||||
let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
|
||||
authorize_request(
|
||||
&state,
|
||||
|
|
@ -1182,8 +1306,13 @@ async fn server_branch_delete(
|
|||
target_branch: Some(branch.clone()),
|
||||
},
|
||||
)?;
|
||||
// Metadata-only manifest tombstone — small constant estimate.
|
||||
let _admission = state
|
||||
.workload
|
||||
.try_admit(&actor_arc, 256)
|
||||
.map_err(ApiError::from_workload_reject)?;
|
||||
{
|
||||
let mut db = Arc::clone(&state.db).write_owned().await;
|
||||
let db = &state.engine;
|
||||
db.branch_delete(&branch)
|
||||
.await
|
||||
.map_err(ApiError::from_omni)?;
|
||||
|
|
@ -1207,6 +1336,7 @@ async fn server_branch_delete(
|
|||
(status = 401, description = "Unauthorized", body = ErrorOutput),
|
||||
(status = 403, description = "Forbidden", body = ErrorOutput),
|
||||
(status = 409, description = "Merge conflict", body = ErrorOutput),
|
||||
(status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput),
|
||||
),
|
||||
security(("bearer_token" = [])),
|
||||
)]
|
||||
|
|
@ -1222,6 +1352,10 @@ async fn server_branch_merge(
|
|||
Json(request): Json<BranchMergeRequest>,
|
||||
) -> std::result::Result<Json<BranchMergeOutput>, ApiError> {
|
||||
let target = request.target.unwrap_or_else(|| "main".to_string());
|
||||
let actor_arc = actor
|
||||
.as_ref()
|
||||
.map(|Extension(actor)| Arc::clone(&actor.0))
|
||||
.unwrap_or_else(|| Arc::<str>::from("anonymous"));
|
||||
let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str());
|
||||
authorize_request(
|
||||
&state,
|
||||
|
|
@ -1233,8 +1367,15 @@ async fn server_branch_merge(
|
|||
target_branch: Some(target.clone()),
|
||||
},
|
||||
)?;
|
||||
// Merge body is small JSON; the heavy work is in the engine but is
|
||||
// bounded per-(table, branch) by the writer queue. Small constant
|
||||
// estimate suffices for the actor in-flight count.
|
||||
let _admission = state
|
||||
.workload
|
||||
.try_admit(&actor_arc, 256)
|
||||
.map_err(ApiError::from_workload_reject)?;
|
||||
let outcome = {
|
||||
let mut db = Arc::clone(&state.db).write_owned().await;
|
||||
let db = &state.engine;
|
||||
db.branch_merge_as(&request.source, &target, actor_id)
|
||||
.await
|
||||
.map_err(ApiError::from_omni)?
|
||||
|
|
@ -1283,7 +1424,7 @@ async fn server_commit_list(
|
|||
},
|
||||
)?;
|
||||
let commits = {
|
||||
let db = Arc::clone(&state.db).read_owned().await;
|
||||
let db = &state.engine;
|
||||
db.list_commits(query.branch.as_deref())
|
||||
.await
|
||||
.map_err(ApiError::from_omni)?
|
||||
|
|
@ -1332,7 +1473,7 @@ async fn server_commit_show(
|
|||
},
|
||||
)?;
|
||||
let commit = {
|
||||
let db = Arc::clone(&state.db).read_owned().await;
|
||||
let db = &state.engine;
|
||||
db.get_commit(&commit_id)
|
||||
.await
|
||||
.map_err(ApiError::from_omni)?
|
||||
|
|
|
|||
363
crates/omnigraph-server/src/workload.rs
Normal file
363
crates/omnigraph-server/src/workload.rs
Normal file
|
|
@ -0,0 +1,363 @@
|
|||
//! Per-actor admission control for the HTTP server (MR-686 §VII.A).
|
||||
//!
|
||||
//! The HTTP server's previous global `RwLock<Omnigraph>` serialized every
|
||||
//! mutating request across all actors. PR 2 removes that lock — engine
|
||||
//! APIs are now `&self`, so concurrent calls from different actors can
|
||||
//! run against `Arc<Omnigraph>` simultaneously. Without admission
|
||||
//! control, one heavy actor can exhaust shared capacity (Lance I/O
|
||||
//! threads, manifest churn, network) and starve other actors.
|
||||
//!
|
||||
//! This module provides:
|
||||
//!
|
||||
//! - **Per-actor in-flight count cap**: each actor has a
|
||||
//! `tokio::sync::Semaphore` with `OMNIGRAPH_PER_ACTOR_INFLIGHT_MAX`
|
||||
//! permits (default 16). `try_acquire_owned()` returns `Err` when
|
||||
//! exhausted; the server maps this to HTTP 429.
|
||||
//!
|
||||
//! - **Per-actor in-flight byte budget**: each actor accumulates an
|
||||
//! `AtomicU64` byte estimate. `fetch_add(est_bytes)` then a check
|
||||
//! against `byte_cap` is race-free via decrement-on-rejection. The
|
||||
//! server maps an over-budget result to HTTP 429 as well.
|
||||
//!
|
||||
//! Counts are governed by the semaphore (race-free `try_acquire_owned()`
|
||||
//! enforces the cap atomically); bytes use `fetch_add` + decrement-on-
|
||||
//! rejection. Both checks are atomic compare-and-act, never
|
||||
//! load-then-act — the test
|
||||
//! `actor_admission_race_does_not_exceed_cap` pins this contract by
|
||||
//! spawning 32 concurrent `try_admit` calls against a cap of 16 and
|
||||
//! asserting exactly 16 succeed.
|
||||
//!
|
||||
//! Acquisition order against the engine's per-(table, branch) write
|
||||
//! queue: admission FIRST (the HTTP handler reserves capacity before
|
||||
//! calling into the engine), engine queue SECOND (acquired inside
|
||||
//! `MutationStaging::commit_all`). This composes cleanly because
|
||||
//! admission is a single per-actor count + budget check, never
|
||||
//! cross-actor; nothing the engine does can change a peer actor's
|
||||
//! admission state.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use dashmap::DashMap;
|
||||
use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};
|
||||
|
||||
/// Default per-actor in-flight count cap. Override via
|
||||
/// `OMNIGRAPH_PER_ACTOR_INFLIGHT_MAX`.
|
||||
pub const DEFAULT_PER_ACTOR_INFLIGHT_MAX: u32 = 16;
|
||||
|
||||
/// Default per-actor in-flight byte budget (4 GiB). Override via
|
||||
/// `OMNIGRAPH_PER_ACTOR_BYTES_MAX`.
|
||||
pub const DEFAULT_PER_ACTOR_BYTES_MAX: u64 = 4 * 1024 * 1024 * 1024;
|
||||
|
||||
/// Why a `try_admit` call returned `Err`. The server maps each variant
|
||||
/// to a specific HTTP response code; see `WorkloadController` docs.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum RejectReason {
|
||||
/// Actor exceeded the per-actor in-flight count cap. HTTP 429.
|
||||
InFlightCountExceeded { cap: u32 },
|
||||
/// Actor exceeded the per-actor in-flight byte budget. HTTP 429.
|
||||
ByteBudgetExceeded { cap: u64, attempted: u64 },
|
||||
}
|
||||
|
||||
impl std::fmt::Display for RejectReason {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
RejectReason::InFlightCountExceeded { cap } => {
|
||||
write!(f, "actor in-flight count cap {} exceeded", cap)
|
||||
}
|
||||
RejectReason::ByteBudgetExceeded { cap, attempted } => write!(
|
||||
f,
|
||||
"actor byte budget exceeded: would use {} bytes against cap {}",
|
||||
attempted, cap
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-actor counters. One instance per actor_id, lazily created on
|
||||
/// first admission attempt.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct ActorState {
|
||||
/// Counts the number of concurrent in-flight requests for this
|
||||
/// actor. `try_acquire_owned()` is the count-cap gate.
|
||||
in_flight_sem: Arc<Semaphore>,
|
||||
/// Total bytes estimated to be in flight for this actor across
|
||||
/// concurrent requests. `fetch_add` + check + decrement-on-failure
|
||||
/// keeps the cap atomic.
|
||||
bytes: AtomicU64,
|
||||
/// Per-actor byte cap (snapshot of `WorkloadController.byte_cap`
|
||||
/// at construction; cap mutations don't propagate to existing
|
||||
/// ActorStates by design — controller config changes apply on
|
||||
/// next ActorState construction).
|
||||
byte_cap: u64,
|
||||
/// Per-actor count cap (same snapshot semantics as `byte_cap`).
|
||||
inflight_cap: u32,
|
||||
}
|
||||
|
||||
impl ActorState {
|
||||
fn new(inflight_cap: u32, byte_cap: u64) -> Self {
|
||||
Self {
|
||||
in_flight_sem: Arc::new(Semaphore::new(inflight_cap as usize)),
|
||||
bytes: AtomicU64::new(0),
|
||||
byte_cap,
|
||||
inflight_cap,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Server-side per-actor admission controller. Constructed once at
|
||||
/// server startup and shared via `Arc<WorkloadController>` on
|
||||
/// `AppState`.
|
||||
pub struct WorkloadController {
|
||||
per_actor: DashMap<Arc<str>, Arc<ActorState>>,
|
||||
inflight_cap: u32,
|
||||
byte_cap: u64,
|
||||
}
|
||||
|
||||
impl WorkloadController {
|
||||
/// Construct from explicit caps. Tests can override.
|
||||
pub fn new(inflight_cap: u32, byte_cap: u64) -> Self {
|
||||
Self {
|
||||
per_actor: DashMap::new(),
|
||||
inflight_cap,
|
||||
byte_cap,
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct from environment variables, falling back to defaults.
|
||||
/// Bad env values fall back to the default with a `tracing::warn!`.
|
||||
pub fn from_env() -> Self {
|
||||
let inflight_cap = parse_env_u32(
|
||||
"OMNIGRAPH_PER_ACTOR_INFLIGHT_MAX",
|
||||
DEFAULT_PER_ACTOR_INFLIGHT_MAX,
|
||||
);
|
||||
let byte_cap = parse_env_u64("OMNIGRAPH_PER_ACTOR_BYTES_MAX", DEFAULT_PER_ACTOR_BYTES_MAX);
|
||||
Self::new(inflight_cap, byte_cap)
|
||||
}
|
||||
|
||||
/// Construct with default caps. Suitable for tests / single-tenant
|
||||
/// deployments without explicit configuration.
|
||||
pub fn with_defaults() -> Self {
|
||||
Self::new(DEFAULT_PER_ACTOR_INFLIGHT_MAX, DEFAULT_PER_ACTOR_BYTES_MAX)
|
||||
}
|
||||
|
||||
fn actor_state(&self, actor_id: &Arc<str>) -> Arc<ActorState> {
|
||||
if let Some(existing) = self.per_actor.get(actor_id) {
|
||||
return existing.clone();
|
||||
}
|
||||
// Race-on-construct is benign: DashMap's `entry().or_insert_with`
|
||||
// serializes per-key construction; the loser's freshly-built
|
||||
// ActorState gets dropped without observable effect.
|
||||
self.per_actor
|
||||
.entry(actor_id.clone())
|
||||
.or_insert_with(|| Arc::new(ActorState::new(self.inflight_cap, self.byte_cap)))
|
||||
.clone()
|
||||
}
|
||||
|
||||
/// Reserve admission for one in-flight request from `actor_id`
|
||||
/// estimated to consume `est_bytes`. Returns an `AdmissionGuard`
|
||||
/// that releases the count permit + decrements the byte total
|
||||
/// when dropped.
|
||||
///
|
||||
/// On rejection, the byte counter is decremented before returning
|
||||
/// — callers can retry without leaking budget.
|
||||
pub fn try_admit(
|
||||
&self,
|
||||
actor_id: &Arc<str>,
|
||||
est_bytes: u64,
|
||||
) -> Result<AdmissionGuard, RejectReason> {
|
||||
let state = self.actor_state(actor_id);
|
||||
|
||||
// Count gate: race-free via `try_acquire_owned()`. If exhausted,
|
||||
// immediately reject — no byte accounting needed for this request.
|
||||
let permit = match Arc::clone(&state.in_flight_sem).try_acquire_owned() {
|
||||
Ok(permit) => permit,
|
||||
Err(TryAcquireError::NoPermits) => {
|
||||
return Err(RejectReason::InFlightCountExceeded {
|
||||
cap: state.inflight_cap,
|
||||
});
|
||||
}
|
||||
Err(TryAcquireError::Closed) => {
|
||||
return Err(RejectReason::InFlightCountExceeded {
|
||||
cap: state.inflight_cap,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
// Byte gate: atomic fetch_add then check; decrement on overflow.
|
||||
// `Ordering::SeqCst` is conservative; per-actor accounting is
|
||||
// not on the hot path of read queries.
|
||||
let prior = state.bytes.fetch_add(est_bytes, Ordering::SeqCst);
|
||||
let attempted = prior.saturating_add(est_bytes);
|
||||
if attempted > state.byte_cap {
|
||||
// Roll back the byte add. The permit drops with `permit`
|
||||
// going out of scope below.
|
||||
state.bytes.fetch_sub(est_bytes, Ordering::SeqCst);
|
||||
return Err(RejectReason::ByteBudgetExceeded {
|
||||
cap: state.byte_cap,
|
||||
attempted,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(AdmissionGuard {
|
||||
_permit: permit,
|
||||
actor_state: state,
|
||||
est_bytes,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop-on-completion guard for an admitted request. Dropping releases
|
||||
/// the in-flight count permit (via `Drop` on the underlying semaphore
|
||||
/// permit) and decrements the actor's byte counter.
|
||||
#[derive(Debug)]
|
||||
pub struct AdmissionGuard {
|
||||
_permit: OwnedSemaphorePermit,
|
||||
actor_state: Arc<ActorState>,
|
||||
est_bytes: u64,
|
||||
}
|
||||
|
||||
impl Drop for AdmissionGuard {
|
||||
fn drop(&mut self) {
|
||||
self.actor_state
|
||||
.bytes
|
||||
.fetch_sub(self.est_bytes, Ordering::SeqCst);
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_env_u32(name: &str, default: u32) -> u32 {
|
||||
match std::env::var(name) {
|
||||
Ok(v) => v.parse::<u32>().unwrap_or_else(|err| {
|
||||
tracing::warn!(
|
||||
env = name,
|
||||
value = %v,
|
||||
error = %err,
|
||||
default,
|
||||
"invalid env value, using default"
|
||||
);
|
||||
default
|
||||
}),
|
||||
Err(_) => default,
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_env_u64(name: &str, default: u64) -> u64 {
|
||||
match std::env::var(name) {
|
||||
Ok(v) => v.parse::<u64>().unwrap_or_else(|err| {
|
||||
tracing::warn!(
|
||||
env = name,
|
||||
value = %v,
|
||||
error = %err,
|
||||
default,
|
||||
"invalid env value, using default"
|
||||
);
|
||||
default
|
||||
}),
|
||||
Err(_) => default,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
|
||||
async fn try_admit_admits_under_cap() {
|
||||
let controller = WorkloadController::new(2, 1024);
|
||||
let actor: Arc<str> = "alice".into();
|
||||
let g1 = controller.try_admit(&actor, 100).expect("first admit");
|
||||
let _g2 = controller.try_admit(&actor, 100).expect("second admit");
|
||||
let err = controller
|
||||
.try_admit(&actor, 100)
|
||||
.expect_err("third should reject on count");
|
||||
assert!(matches!(err, RejectReason::InFlightCountExceeded { cap: 2 }));
|
||||
drop(g1);
|
||||
// After drop, a new admit succeeds again.
|
||||
let _g3 = controller
|
||||
.try_admit(&actor, 100)
|
||||
.expect("admit after drop");
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
|
||||
async fn byte_budget_caps_admission() {
|
||||
let controller = WorkloadController::new(16, 1000);
|
||||
let actor: Arc<str> = "alice".into();
|
||||
let _g1 = controller.try_admit(&actor, 600).expect("first admit");
|
||||
let err = controller
|
||||
.try_admit(&actor, 600)
|
||||
.expect_err("second should reject on bytes");
|
||||
match err {
|
||||
RejectReason::ByteBudgetExceeded { cap, attempted } => {
|
||||
assert_eq!(cap, 1000);
|
||||
assert_eq!(attempted, 1200);
|
||||
}
|
||||
other => panic!("expected ByteBudgetExceeded, got {:?}", other),
|
||||
}
|
||||
// Verify the byte counter was rolled back: a smaller request fits.
|
||||
let _g2 = controller.try_admit(&actor, 300).expect("smaller admit");
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
|
||||
async fn actor_admission_race_does_not_exceed_cap() {
|
||||
// Pin master plan §"WorkloadController" Finding 6: independent
|
||||
// atomic load + check + add allows two concurrent callers to
|
||||
// both pass a cap-N check. The Semaphore-based gate is
|
||||
// race-free — exactly cap_count callers succeed.
|
||||
//
|
||||
// Each task holds its admission guard until released via a
|
||||
// oneshot channel; this forces real contention because guards
|
||||
// can't drop and free permits before all 32 calls have raced.
|
||||
let controller = Arc::new(WorkloadController::new(16, u64::MAX / 4));
|
||||
let actor: Arc<str> = "racer".into();
|
||||
|
||||
let (release_tx, _) = tokio::sync::broadcast::channel::<()>(1);
|
||||
|
||||
let mut handles = Vec::with_capacity(32);
|
||||
for _ in 0..32 {
|
||||
let controller = Arc::clone(&controller);
|
||||
let actor = actor.clone();
|
||||
let mut release_rx = release_tx.subscribe();
|
||||
handles.push(tokio::spawn(async move {
|
||||
let result = controller.try_admit(&actor, 1);
|
||||
let success = result.is_ok();
|
||||
// Hold the guard (if any) until the test signals release,
|
||||
// so the cap-16 contention is observable across all 32
|
||||
// tasks instead of permits being recycled task-by-task.
|
||||
let _guard = result.ok();
|
||||
let _ = release_rx.recv().await;
|
||||
success
|
||||
}));
|
||||
}
|
||||
|
||||
// Give all 32 tasks a chance to hit `try_admit` before any can
|
||||
// drop their guard. 50ms is plenty for tokio's scheduler on a
|
||||
// 4-worker runtime.
|
||||
tokio::time::sleep(std::time::Duration::from_millis(50)).await;
|
||||
// Release every task; collect succeed/reject counts.
|
||||
let _ = release_tx.send(());
|
||||
|
||||
let mut accepted = 0u32;
|
||||
let mut rejected = 0u32;
|
||||
for h in handles {
|
||||
if h.await.unwrap() {
|
||||
accepted += 1;
|
||||
} else {
|
||||
rejected += 1;
|
||||
}
|
||||
}
|
||||
assert_eq!(accepted, 16, "expected exactly 16 successful admits");
|
||||
assert_eq!(rejected, 16, "expected exactly 16 rejections");
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
|
||||
async fn per_actor_caps_independent() {
|
||||
let controller = WorkloadController::new(1, 1024);
|
||||
let alice: Arc<str> = "alice".into();
|
||||
let bob: Arc<str> = "bob".into();
|
||||
let _ga = controller.try_admit(&alice, 100).expect("alice ok");
|
||||
// Alice over count cap, Bob unaffected.
|
||||
let err = controller.try_admit(&alice, 100).expect_err("alice rejected");
|
||||
assert!(matches!(err, RejectReason::InFlightCountExceeded { .. }));
|
||||
let _gb = controller.try_admit(&bob, 100).expect("bob ok");
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "omnigraph-engine"
|
||||
version = "0.4.1"
|
||||
version = "0.4.2"
|
||||
edition = "2024"
|
||||
description = "Runtime engine for the Omnigraph graph database."
|
||||
license = "MIT"
|
||||
|
|
@ -16,7 +16,7 @@ default = []
|
|||
failpoints = ["dep:fail", "fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.1" }
|
||||
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.2" }
|
||||
lance = { workspace = true }
|
||||
lance-datafusion = { workspace = true }
|
||||
datafusion = { workspace = true }
|
||||
|
|
@ -47,9 +47,10 @@ time = { workspace = true }
|
|||
async-trait = { workspace = true }
|
||||
url = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
arc-swap = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.1" }
|
||||
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.2" }
|
||||
tokio = { workspace = true }
|
||||
lance-namespace-impls = { workspace = true }
|
||||
serial_test = "3"
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ mod omnigraph;
|
|||
mod recovery_audit;
|
||||
mod run_registry;
|
||||
mod schema_state;
|
||||
pub(crate) mod write_queue;
|
||||
|
||||
pub use commit_graph::GraphCommit;
|
||||
pub use graph_coordinator::{GraphCoordinator, ReadTarget, ResolvedTarget, SnapshotId};
|
||||
|
|
@ -18,6 +19,53 @@ pub(crate) use run_registry::is_internal_run_branch;
|
|||
|
||||
pub(crate) const SCHEMA_APPLY_LOCK_BRANCH: &str = "__schema_apply_lock__";
|
||||
|
||||
/// Mutation kind, threaded through the version-check call sites so the
|
||||
/// engine can apply an op-kind-aware policy:
|
||||
///
|
||||
/// - `Insert` / `Merge`: skip the strict pre-stage `ensure_expected_version`
|
||||
/// check. Lance's `MergeInsertBuilder` rebases concurrent appends; the
|
||||
/// per-(table, branch) writer queue serializes `commit_staged`; the
|
||||
/// publisher's CAS (refreshed under the queue via
|
||||
/// `MutationStaging::commit_all`'s `snapshot_for_branch` call) catches
|
||||
/// genuine cross-process drift as `ManifestConflictDetails::ExpectedVersionMismatch`.
|
||||
/// The pre-stage strict check would over-reject in-process concurrent
|
||||
/// inserts, which is exactly the case PR 2 / MR-686 designed the
|
||||
/// per-table queue to allow.
|
||||
///
|
||||
/// - `Update` / `Delete`: keep the strict check. These have read-modify-write
|
||||
/// semantics; Lance moving between the read at stage time and the write
|
||||
/// at commit time means the staged batch is computed against stale state.
|
||||
/// The strict check guards the per-query SI invariant. SERIALIZABLE
|
||||
/// opt-in (§VI.36 future seam) is the long-term answer for tighter
|
||||
/// semantics; today, in-process update-update races on the same key
|
||||
/// stay rejected as 409 — acceptable.
|
||||
///
|
||||
/// - `SchemaRewrite`: keep the strict check. Schema apply runs under the
|
||||
/// graph-wide `__schema_apply_lock__` AND per-table queues; the strict
|
||||
/// check is uncontested at that point.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub(crate) enum MutationOpKind {
|
||||
Insert,
|
||||
Merge,
|
||||
Update,
|
||||
Delete,
|
||||
SchemaRewrite,
|
||||
}
|
||||
|
||||
impl MutationOpKind {
|
||||
/// Whether the strict pre-stage `ensure_expected_version` check should
|
||||
/// fire for this op kind. See [`MutationOpKind`] for the rationale per
|
||||
/// kind.
|
||||
pub(crate) fn strict_pre_stage_version_check(self) -> bool {
|
||||
match self {
|
||||
MutationOpKind::Insert | MutationOpKind::Merge => false,
|
||||
MutationOpKind::Update
|
||||
| MutationOpKind::Delete
|
||||
| MutationOpKind::SchemaRewrite => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_schema_apply_lock_branch(name: &str) -> bool {
|
||||
name.trim_start_matches('/') == SCHEMA_APPLY_LOCK_BRANCH
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ use std::collections::{BTreeSet, HashMap, HashSet};
|
|||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
|
||||
use arc_swap::ArcSwap;
|
||||
use arrow_array::{
|
||||
Array, BinaryArray, BooleanArray, Date32Array, FixedSizeListArray, Float32Array, Float64Array,
|
||||
Int32Array, Int64Array, LargeBinaryArray, LargeListArray, LargeStringArray, ListArray,
|
||||
|
|
@ -73,12 +74,60 @@ pub struct SchemaApplyResult {
|
|||
pub struct Omnigraph {
|
||||
root_uri: String,
|
||||
storage: Arc<dyn StorageAdapter>,
|
||||
coordinator: GraphCoordinator,
|
||||
/// Coordinator state behind a tokio `RwLock`. PR 2 (MR-686) wraps
|
||||
/// this so engine write APIs can be `&self` (the HTTP server's
|
||||
/// `AppState` holds `Arc<Omnigraph>` and dispatches concurrent
|
||||
/// calls without a global write lock). Reads (`snapshot`, `version`,
|
||||
/// `current_branch`, `branch_list`, `resolve_*`, `head_commit_id`,
|
||||
/// `list_commits`, …) acquire `.read().await` and parallelize.
|
||||
/// Writes (`refresh`, `branch_create`, `branch_delete`, `commit_*`,
|
||||
/// `record_*`) acquire `.write().await` and serialize. The atomic
|
||||
/// commit invariant — `commit_manifest_updates` followed by
|
||||
/// `record_graph_commit` must be atomic — is preserved by the
|
||||
/// single `.write()` covering both calls inside
|
||||
/// `commit_updates_with_actor_with_expected`. PR 2 Phase 2
|
||||
/// converted from `Mutex` to `RwLock` because the bench showed
|
||||
/// the Mutex was the dominant serializer for disjoint-table
|
||||
/// workloads. Lock acquisition order: always before `runtime_cache`
|
||||
/// (when both are needed in one scope).
|
||||
coordinator: Arc<tokio::sync::RwLock<GraphCoordinator>>,
|
||||
table_store: TableStore,
|
||||
runtime_cache: RuntimeCache,
|
||||
catalog: Catalog,
|
||||
schema_source: String,
|
||||
pub(crate) audit_actor_id: Option<String>,
|
||||
/// Read-heavy on every query, written only by `apply_schema`. ArcSwap
|
||||
/// gives atomic pointer swap with zero-cost reads (`load()` returns a
|
||||
/// `Guard<Arc<Catalog>>`), so concurrent queries on different actors
|
||||
/// don't contend on a lock to read the catalog.
|
||||
catalog: Arc<ArcSwap<Catalog>>,
|
||||
/// Read-heavy on schema introspection paths, written only by
|
||||
/// `apply_schema`. Same ArcSwap rationale as `catalog`.
|
||||
schema_source: Arc<ArcSwap<String>>,
|
||||
/// Per-`(table_key, branch)` writer queues. Reachable from engine
|
||||
/// internals (mutation finalize, schema_apply, branch_merge,
|
||||
/// ensure_indices, delete_where) and from future MR-870 recovery
|
||||
/// reconciler. PR 1b adds the field; callers acquire in commits 4+.
|
||||
write_queue: Arc<crate::db::write_queue::WriteQueueManager>,
|
||||
/// Process-wide mutex held across the swap → operate → restore window
|
||||
/// in `branch_merge_impl`. Two concurrent merges with distinct targets
|
||||
/// would otherwise interleave their three separate
|
||||
/// `coordinator.write().await` acquisitions, leaving each merge's
|
||||
/// inner body running against the other's swapped coord. Pinned by
|
||||
/// `concurrent_branch_merges_distinct_targets_do_not_swap_into_each_other`
|
||||
/// in `crates/omnigraph-server/tests/server.rs`.
|
||||
///
|
||||
/// Cost: serializes ALL concurrent branch merges process-wide.
|
||||
/// Acceptable because branch merges are heavy (table rewrites, index
|
||||
/// rebuilds), per-(table, branch) queues inside `commit_all` already
|
||||
/// serialize the data path, and merges are rare relative to /change
|
||||
/// or /ingest. A finer-grained per-target-branch mutex is a follow-up
|
||||
/// if telemetry shows merge concurrency matters.
|
||||
///
|
||||
/// The deeper fix — refactor `branch_merge_on_current_target` to take
|
||||
/// an explicit target coord parameter so `self.coordinator` is never
|
||||
/// used as scratch space — is the round-1 shape applied to
|
||||
/// `branch_create_from_impl`. Deferred because it requires unwinding
|
||||
/// every `self.snapshot()` and `self.ensure_commit_graph_initialized()`
|
||||
/// call inside the merge body.
|
||||
merge_exclusive: Arc<tokio::sync::Mutex<()>>,
|
||||
}
|
||||
|
||||
/// Whether [`Omnigraph::open`] runs the open-time recovery sweep.
|
||||
|
|
@ -128,12 +177,13 @@ impl Omnigraph {
|
|||
Ok(Self {
|
||||
root_uri: root.clone(),
|
||||
storage,
|
||||
coordinator,
|
||||
coordinator: Arc::new(tokio::sync::RwLock::new(coordinator)),
|
||||
table_store: TableStore::new(&root),
|
||||
runtime_cache: RuntimeCache::default(),
|
||||
catalog,
|
||||
schema_source: schema_source.to_string(),
|
||||
audit_actor_id: None,
|
||||
catalog: Arc::new(ArcSwap::from_pointee(catalog)),
|
||||
schema_source: Arc::new(ArcSwap::from_pointee(schema_source.to_string())),
|
||||
write_queue: Arc::new(crate::db::write_queue::WriteQueueManager::new()),
|
||||
merge_exclusive: Arc::new(tokio::sync::Mutex::new(())),
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -214,21 +264,39 @@ impl Omnigraph {
|
|||
Ok(Self {
|
||||
root_uri: root.clone(),
|
||||
storage,
|
||||
coordinator,
|
||||
coordinator: Arc::new(tokio::sync::RwLock::new(coordinator)),
|
||||
table_store: TableStore::new(&root),
|
||||
runtime_cache: RuntimeCache::default(),
|
||||
catalog,
|
||||
schema_source,
|
||||
audit_actor_id: None,
|
||||
catalog: Arc::new(ArcSwap::from_pointee(catalog)),
|
||||
schema_source: Arc::new(ArcSwap::from_pointee(schema_source)),
|
||||
write_queue: Arc::new(crate::db::write_queue::WriteQueueManager::new()),
|
||||
merge_exclusive: Arc::new(tokio::sync::Mutex::new(())),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn catalog(&self) -> &Catalog {
|
||||
&self.catalog
|
||||
/// Returns an `Arc<Catalog>` snapshot. Cheap clone of the current
|
||||
/// catalog pointer; callers can hold the returned `Arc` across awaits
|
||||
/// without blocking concurrent `apply_schema`.
|
||||
pub fn catalog(&self) -> Arc<Catalog> {
|
||||
self.catalog.load_full()
|
||||
}
|
||||
|
||||
pub fn schema_source(&self) -> &str {
|
||||
&self.schema_source
|
||||
/// Returns an `Arc<String>` snapshot of the schema source.
|
||||
pub fn schema_source(&self) -> Arc<String> {
|
||||
self.schema_source.load_full()
|
||||
}
|
||||
|
||||
/// Atomically swap the in-memory catalog. Concurrent readers see
|
||||
/// either the old or the new pointer; never a torn state. Used by
|
||||
/// `apply_schema` and `reload_schema_if_source_changed`.
|
||||
pub(crate) fn store_catalog(&self, catalog: Catalog) {
|
||||
self.catalog.store(Arc::new(catalog));
|
||||
}
|
||||
|
||||
/// Atomically swap the in-memory schema source. Same rationale as
|
||||
/// [`store_catalog`](Self::store_catalog).
|
||||
pub(crate) fn store_schema_source(&self, schema_source: String) {
|
||||
self.schema_source.store(Arc::new(schema_source));
|
||||
}
|
||||
|
||||
pub fn uri(&self) -> &str {
|
||||
|
|
@ -243,11 +311,11 @@ impl Omnigraph {
|
|||
schema_apply::plan_schema(self, desired_schema_source).await
|
||||
}
|
||||
|
||||
pub async fn apply_schema(&mut self, desired_schema_source: &str) -> Result<SchemaApplyResult> {
|
||||
pub async fn apply_schema(&self, desired_schema_source: &str) -> Result<SchemaApplyResult> {
|
||||
schema_apply::apply_schema(self, desired_schema_source).await
|
||||
}
|
||||
|
||||
pub(crate) async fn ensure_schema_apply_idle(&mut self, operation: &str) -> Result<()> {
|
||||
pub(crate) async fn ensure_schema_apply_idle(&self, operation: &str) -> Result<()> {
|
||||
schema_apply::ensure_schema_apply_idle(self, operation).await
|
||||
}
|
||||
|
||||
|
|
@ -278,6 +346,26 @@ impl Omnigraph {
|
|||
self.storage.as_ref()
|
||||
}
|
||||
|
||||
/// Per-`(table_key, branch)` writer queues.
|
||||
///
|
||||
/// Engine-internal writers (mutation finalize, schema_apply,
|
||||
/// branch_merge, ensure_indices, delete_where) and the future MR-870
|
||||
/// recovery reconciler reach the queue manager via this accessor.
|
||||
/// Returns an `Arc` clone so callers can hold the manager across
|
||||
/// `&mut self` engine API boundaries.
|
||||
pub(crate) fn write_queue(&self) -> Arc<crate::db::write_queue::WriteQueueManager> {
|
||||
Arc::clone(&self.write_queue)
|
||||
}
|
||||
|
||||
/// Engine-internal access to the merge-exclusive mutex. Held across
|
||||
/// the swap → operate → restore window in `branch_merge_impl` so
|
||||
/// concurrent merges with distinct targets don't corrupt
|
||||
/// `self.coordinator` mid-operation. See the field doc on
|
||||
/// `Omnigraph::merge_exclusive` for the full design rationale.
|
||||
pub(crate) fn merge_exclusive(&self) -> Arc<tokio::sync::Mutex<()>> {
|
||||
Arc::clone(&self.merge_exclusive)
|
||||
}
|
||||
|
||||
/// Engine-level access to the repo's normalized root URI. Used by
|
||||
/// the recovery sidecar protocol to compute `__recovery/` paths.
|
||||
pub(crate) fn root_uri(&self) -> &str {
|
||||
|
|
@ -297,15 +385,16 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
pub(crate) async fn swap_coordinator_for_branch(
|
||||
&mut self,
|
||||
&self,
|
||||
branch: Option<&str>,
|
||||
) -> Result<GraphCoordinator> {
|
||||
let next = self.open_coordinator_for_branch(branch).await?;
|
||||
Ok(std::mem::replace(&mut self.coordinator, next))
|
||||
let mut coord = self.coordinator.write().await;
|
||||
Ok(std::mem::replace(&mut *coord, next))
|
||||
}
|
||||
|
||||
pub(crate) fn restore_coordinator(&mut self, coordinator: GraphCoordinator) {
|
||||
self.coordinator = coordinator;
|
||||
pub(crate) async fn restore_coordinator(&self, coordinator: GraphCoordinator) {
|
||||
*self.coordinator.write().await = coordinator;
|
||||
}
|
||||
|
||||
pub(crate) async fn resolved_branch_target(
|
||||
|
|
@ -315,21 +404,19 @@ impl Omnigraph {
|
|||
self.ensure_schema_state_valid().await?;
|
||||
let requested = ReadTarget::Branch(branch.unwrap_or("main").to_string());
|
||||
let normalized = normalize_branch_name(branch.unwrap_or("main"))?;
|
||||
if normalized.as_deref() == self.coordinator.current_branch() {
|
||||
let snapshot_id = self.coordinator.head_commit_id().await?.unwrap_or_else(|| {
|
||||
SnapshotId::synthetic(
|
||||
self.coordinator.current_branch(),
|
||||
self.coordinator.version(),
|
||||
)
|
||||
let coord = self.coordinator.read().await;
|
||||
if normalized.as_deref() == coord.current_branch() {
|
||||
let snapshot_id = coord.head_commit_id().await?.unwrap_or_else(|| {
|
||||
SnapshotId::synthetic(coord.current_branch(), coord.version())
|
||||
});
|
||||
return Ok(ResolvedTarget {
|
||||
requested,
|
||||
branch: self.coordinator.current_branch().map(str::to_string),
|
||||
branch: coord.current_branch().map(str::to_string),
|
||||
snapshot_id,
|
||||
snapshot: self.coordinator.snapshot(),
|
||||
snapshot: coord.snapshot(),
|
||||
});
|
||||
}
|
||||
self.coordinator.resolve_target(&requested).await
|
||||
coord.resolve_target(&requested).await
|
||||
}
|
||||
|
||||
pub(crate) async fn snapshot_for_branch(&self, branch: Option<&str>) -> Result<Snapshot> {
|
||||
|
|
@ -338,13 +425,13 @@ impl Omnigraph {
|
|||
.map(|resolved| resolved.snapshot)
|
||||
}
|
||||
|
||||
pub(crate) fn version(&self) -> u64 {
|
||||
self.coordinator.version()
|
||||
pub(crate) async fn version(&self) -> u64 {
|
||||
self.coordinator.read().await.version()
|
||||
}
|
||||
|
||||
/// Return an immutable Snapshot from the known manifest state. No storage I/O.
|
||||
pub(crate) fn snapshot(&self) -> Snapshot {
|
||||
self.coordinator.snapshot()
|
||||
pub(crate) async fn snapshot(&self) -> Snapshot {
|
||||
self.coordinator.read().await.snapshot()
|
||||
}
|
||||
|
||||
pub async fn snapshot_of(&self, target: impl Into<ReadTarget>) -> Result<Snapshot> {
|
||||
|
|
@ -369,10 +456,11 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
/// Synchronize this handle's write base to the latest head of the named branch.
|
||||
pub async fn sync_branch(&mut self, branch: &str) -> Result<()> {
|
||||
pub async fn sync_branch(&self, branch: &str) -> Result<()> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
let branch = normalize_branch_name(branch)?;
|
||||
self.coordinator = self.open_coordinator_for_branch(branch.as_deref()).await?;
|
||||
let next = self.open_coordinator_for_branch(branch.as_deref()).await?;
|
||||
*self.coordinator.write().await = next;
|
||||
self.runtime_cache.invalidate_all().await;
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -409,35 +497,44 @@ impl Omnigraph {
|
|||
/// (e.g. `schema_apply` mid-write) MUST use
|
||||
/// [`refresh_coordinator_only`](Self::refresh_coordinator_only) to
|
||||
/// avoid the recovery sweep racing their own sidecar.
|
||||
pub async fn refresh(&mut self) -> Result<()> {
|
||||
self.coordinator.refresh().await?;
|
||||
let schema_state_recovery = recover_schema_state_files(
|
||||
&self.root_uri,
|
||||
Arc::clone(&self.storage),
|
||||
&self.coordinator.snapshot(),
|
||||
)
|
||||
.await?;
|
||||
crate::db::manifest::recover_manifest_drift(
|
||||
&self.root_uri,
|
||||
Arc::clone(&self.storage),
|
||||
&mut self.coordinator,
|
||||
crate::db::manifest::RecoveryMode::RollForwardOnly,
|
||||
schema_state_recovery,
|
||||
)
|
||||
.await?;
|
||||
pub async fn refresh(&self) -> Result<()> {
|
||||
// Scope the coord write guard to the recovery section only.
|
||||
// `reload_schema_if_source_changed` (below) acquires
|
||||
// `self.coordinator.read().await` when the on-disk schema source
|
||||
// has drifted from the cached `schema_source`. Tokio's RwLock is
|
||||
// not reentrant, so holding the write across that call deadlocks.
|
||||
// Pinned by `composite_flow_schema_apply_then_branch_ops_no_deadlock_in_refresh`.
|
||||
{
|
||||
let mut coord = self.coordinator.write().await;
|
||||
coord.refresh().await?;
|
||||
let schema_state_recovery = recover_schema_state_files(
|
||||
&self.root_uri,
|
||||
Arc::clone(&self.storage),
|
||||
&coord.snapshot(),
|
||||
)
|
||||
.await?;
|
||||
crate::db::manifest::recover_manifest_drift(
|
||||
&self.root_uri,
|
||||
Arc::clone(&self.storage),
|
||||
&mut *coord,
|
||||
crate::db::manifest::RecoveryMode::RollForwardOnly,
|
||||
schema_state_recovery,
|
||||
)
|
||||
.await?;
|
||||
} // ← write guard released before reload's read acquisition
|
||||
self.reload_schema_if_source_changed().await?;
|
||||
self.runtime_cache.invalidate_all().await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn reload_schema_if_source_changed(&mut self) -> Result<()> {
|
||||
async fn reload_schema_if_source_changed(&self) -> Result<()> {
|
||||
let schema_path = schema_source_uri(&self.root_uri);
|
||||
let schema_source = self.storage.read_text(&schema_path).await?;
|
||||
if schema_source == self.schema_source {
|
||||
if schema_source == *self.schema_source.load_full() {
|
||||
return Ok(());
|
||||
}
|
||||
let current_source_ir = read_schema_ir_from_source(&schema_source)?;
|
||||
let branches = self.coordinator.branch_list().await?;
|
||||
let branches = self.coordinator.read().await.branch_list().await?;
|
||||
let (accepted_ir, _) = load_or_bootstrap_schema_contract(
|
||||
&self.root_uri,
|
||||
Arc::clone(&self.storage),
|
||||
|
|
@ -447,8 +544,8 @@ impl Omnigraph {
|
|||
.await?;
|
||||
let mut catalog = build_catalog_from_ir(&accepted_ir)?;
|
||||
fixup_blob_schemas(&mut catalog);
|
||||
self.schema_source = schema_source;
|
||||
self.catalog = catalog;
|
||||
self.store_schema_source(schema_source);
|
||||
self.store_catalog(catalog);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
|
@ -459,15 +556,15 @@ impl Omnigraph {
|
|||
/// here would observe the caller's own sidecar, classify it as
|
||||
/// RolledPastExpected, and roll it forward — racing the caller's
|
||||
/// own publish path.
|
||||
pub(crate) async fn refresh_coordinator_only(&mut self) -> Result<()> {
|
||||
self.coordinator.refresh().await?;
|
||||
pub(crate) async fn refresh_coordinator_only(&self) -> Result<()> {
|
||||
self.coordinator.write().await.refresh().await?;
|
||||
self.runtime_cache.invalidate_all().await;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn resolve_snapshot(&self, branch: &str) -> Result<SnapshotId> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
self.coordinator.resolve_snapshot_id(branch).await
|
||||
self.coordinator.read().await.resolve_snapshot_id(branch).await
|
||||
}
|
||||
|
||||
pub(crate) async fn resolved_target(
|
||||
|
|
@ -475,7 +572,7 @@ impl Omnigraph {
|
|||
target: impl Into<ReadTarget>,
|
||||
) -> Result<ResolvedTarget> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
self.coordinator.resolve_target(&target.into()).await
|
||||
self.coordinator.read().await.resolve_target(&target.into()).await
|
||||
}
|
||||
|
||||
// ─── Change detection ────────────────────────────────────────────────
|
||||
|
|
@ -506,26 +603,20 @@ impl Omnigraph {
|
|||
to_commit_id: &str,
|
||||
filter: &crate::changes::ChangeFilter,
|
||||
) -> Result<crate::changes::ChangeSet> {
|
||||
let from_commit = self
|
||||
.coordinator
|
||||
.resolve_commit(&SnapshotId::new(from_commit_id))
|
||||
.await?;
|
||||
let to_commit = self
|
||||
.coordinator
|
||||
.resolve_commit(&SnapshotId::new(to_commit_id))
|
||||
.await?;
|
||||
let from_snap = self
|
||||
.coordinator
|
||||
let coord = self.coordinator.read().await;
|
||||
let from_commit = coord.resolve_commit(&SnapshotId::new(from_commit_id)).await?;
|
||||
let to_commit = coord.resolve_commit(&SnapshotId::new(to_commit_id)).await?;
|
||||
let from_snap = coord
|
||||
.resolve_target(&ReadTarget::Snapshot(SnapshotId::new(
|
||||
from_commit.graph_commit_id.clone(),
|
||||
)))
|
||||
.await?;
|
||||
let to_snap = self
|
||||
.coordinator
|
||||
let to_snap = coord
|
||||
.resolve_target(&ReadTarget::Snapshot(SnapshotId::new(
|
||||
to_commit.graph_commit_id.clone(),
|
||||
)))
|
||||
.await?;
|
||||
drop(coord);
|
||||
crate::changes::diff_snapshots(
|
||||
self.uri(),
|
||||
&from_snap.snapshot,
|
||||
|
|
@ -558,7 +649,7 @@ impl Omnigraph {
|
|||
/// Create a Snapshot at any historical manifest version.
|
||||
pub async fn snapshot_at_version(&self, version: u64) -> Result<Snapshot> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
self.coordinator.snapshot_at_version(version).await
|
||||
self.coordinator.read().await.snapshot_at_version(version).await
|
||||
}
|
||||
|
||||
pub async fn export_jsonl(
|
||||
|
|
@ -606,11 +697,11 @@ impl Omnigraph {
|
|||
/// unbranched subtables keep inheriting `main`, while subtables inherited
|
||||
/// from an ancestor branch are first forked into the active branch before
|
||||
/// their index metadata is updated.
|
||||
pub async fn ensure_indices(&mut self) -> Result<()> {
|
||||
pub async fn ensure_indices(&self) -> Result<()> {
|
||||
table_ops::ensure_indices(self).await
|
||||
}
|
||||
|
||||
pub async fn ensure_indices_on(&mut self, branch: &str) -> Result<()> {
|
||||
pub async fn ensure_indices_on(&self, branch: &str) -> Result<()> {
|
||||
table_ops::ensure_indices_on(self, branch).await
|
||||
}
|
||||
|
||||
|
|
@ -633,7 +724,7 @@ impl Omnigraph {
|
|||
|
||||
/// Compact small Lance fragments into fewer larger ones across every
|
||||
/// node + edge table on `main`. See [`optimize`] for details.
|
||||
pub async fn optimize(&mut self) -> Result<Vec<optimize::TableOptimizeStats>> {
|
||||
pub async fn optimize(&self) -> Result<Vec<optimize::TableOptimizeStats>> {
|
||||
optimize::optimize_all_tables(self).await
|
||||
}
|
||||
|
||||
|
|
@ -658,8 +749,8 @@ impl Omnigraph {
|
|||
/// ```
|
||||
pub async fn read_blob(&self, type_name: &str, id: &str, property: &str) -> Result<BlobFile> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
let node_type = self
|
||||
.catalog
|
||||
let catalog = self.catalog();
|
||||
let node_type = catalog
|
||||
.node_types
|
||||
.get(type_name)
|
||||
.ok_or_else(|| OmniError::manifest(format!("unknown node type '{}'", type_name)))?;
|
||||
|
|
@ -670,7 +761,7 @@ impl Omnigraph {
|
|||
)));
|
||||
}
|
||||
|
||||
let snapshot = self.snapshot();
|
||||
let snapshot = self.snapshot().await;
|
||||
let table_key = format!("node:{}", type_name);
|
||||
let ds = snapshot.open(&table_key).await?;
|
||||
|
||||
|
|
@ -698,12 +789,12 @@ impl Omnigraph {
|
|||
})
|
||||
}
|
||||
|
||||
pub(crate) fn active_branch(&self) -> Option<&str> {
|
||||
self.coordinator.current_branch()
|
||||
pub(crate) async fn active_branch(&self) -> Option<String> {
|
||||
self.coordinator.read().await.current_branch().map(str::to_string)
|
||||
}
|
||||
|
||||
async fn ensure_branch_delete_safe(&self, branch: &str, branches: &[String]) -> Result<()> {
|
||||
let descendants = self.coordinator.branch_descendants(branch).await?;
|
||||
let descendants = self.coordinator.read().await.branch_descendants(branch).await?;
|
||||
if let Some(descendant) = descendants.first() {
|
||||
return Err(OmniError::manifest_conflict(format!(
|
||||
"cannot delete branch '{}' because descendant branch '{}' still depends on it",
|
||||
|
|
@ -758,8 +849,9 @@ impl Omnigraph {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_branch_storage_only(&mut self, branch: &str) -> Result<()> {
|
||||
if self.coordinator.current_branch() == Some(branch) {
|
||||
async fn delete_branch_storage_only(&self, branch: &str) -> Result<()> {
|
||||
let active = self.coordinator.read().await.current_branch().map(str::to_string);
|
||||
if active.as_deref() == Some(branch) {
|
||||
return Err(OmniError::manifest_conflict(format!(
|
||||
"cannot delete currently active branch '{}'",
|
||||
branch
|
||||
|
|
@ -773,7 +865,7 @@ impl Omnigraph {
|
|||
.map(|entry| (entry.table_key.clone(), entry.table_path.clone()))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
self.coordinator.branch_delete(branch).await?;
|
||||
self.coordinator.write().await.branch_delete(branch).await?;
|
||||
self.cleanup_deleted_branch_tables(branch, &owned_tables)
|
||||
.await
|
||||
}
|
||||
|
|
@ -794,19 +886,15 @@ impl Omnigraph {
|
|||
.map(|id| id.map(|snapshot_id| snapshot_id.as_str().to_string()))
|
||||
}
|
||||
|
||||
pub async fn branch_create(&mut self, name: &str) -> Result<()> {
|
||||
pub async fn branch_create(&self, name: &str) -> Result<()> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
self.ensure_schema_apply_idle("branch_create").await?;
|
||||
ensure_public_branch_ref(name, "branch_create")?;
|
||||
self.coordinator.branch_create(name).await
|
||||
}
|
||||
|
||||
pub(crate) fn current_audit_actor(&self) -> Option<&str> {
|
||||
self.audit_actor_id.as_deref()
|
||||
self.coordinator.write().await.branch_create(name).await
|
||||
}
|
||||
|
||||
pub async fn branch_create_from(
|
||||
&mut self,
|
||||
&self,
|
||||
from: impl Into<ReadTarget>,
|
||||
name: &str,
|
||||
) -> Result<()> {
|
||||
|
|
@ -815,7 +903,7 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
async fn branch_create_from_impl(
|
||||
&mut self,
|
||||
&self,
|
||||
from: impl Into<ReadTarget>,
|
||||
name: &str,
|
||||
allow_internal_refs: bool,
|
||||
|
|
@ -831,25 +919,39 @@ impl Omnigraph {
|
|||
ensure_public_branch_ref(name, "branch_create_from")?;
|
||||
}
|
||||
let branch = normalize_branch_name(&branch_name)?;
|
||||
let previous = self.swap_coordinator_for_branch(branch.as_deref()).await?;
|
||||
let result = self.coordinator.branch_create(name).await;
|
||||
self.restore_coordinator(previous);
|
||||
result
|
||||
// Operate on a freshly-opened source coordinator that's owned locally
|
||||
// — never touch `self.coordinator`. The pre-fix implementation used
|
||||
// `swap_coordinator_for_branch` + operate + `restore_coordinator` as
|
||||
// three separate `coordinator.write().await` acquisitions; under
|
||||
// `&self` concurrency, a second `branch_create_from` could swap
|
||||
// self.coordinator between this caller's swap and operate steps,
|
||||
// making the operate run against the wrong source branch and
|
||||
// forking off the wrong HEAD. Pinned by
|
||||
// `concurrent_branch_create_from_distinct_parents_does_not_corrupt_coordinator`
|
||||
// in `crates/omnigraph-server/tests/server.rs`.
|
||||
//
|
||||
// `branch_create` mutates only the local coord's commit-graph cache;
|
||||
// the manifest write is durable on disk regardless of which
|
||||
// coord-handle issued it. Discarding `source_coord` after the call
|
||||
// is the right shape — the new branch is reachable from any
|
||||
// subsequent open of any coord.
|
||||
let mut source_coord = self.open_coordinator_for_branch(branch.as_deref()).await?;
|
||||
source_coord.branch_create(name).await
|
||||
}
|
||||
|
||||
pub async fn branch_list(&self) -> Result<Vec<String>> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
self.coordinator.branch_list().await
|
||||
self.coordinator.read().await.branch_list().await
|
||||
}
|
||||
|
||||
pub async fn branch_delete(&mut self, name: &str) -> Result<()> {
|
||||
pub async fn branch_delete(&self, name: &str) -> Result<()> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
self.ensure_schema_apply_idle("branch_delete").await?;
|
||||
ensure_public_branch_ref(name, "branch_delete")?;
|
||||
self.refresh().await?;
|
||||
let branch = normalize_branch_name(name)?
|
||||
.ok_or_else(|| OmniError::manifest("cannot delete branch 'main'".to_string()))?;
|
||||
let branches = self.coordinator.branch_list().await?;
|
||||
let branches = self.coordinator.read().await.branch_list().await?;
|
||||
if !branches.iter().any(|candidate| candidate == &branch) {
|
||||
return Err(OmniError::manifest_not_found(format!(
|
||||
"branch '{}' not found",
|
||||
|
|
@ -863,7 +965,7 @@ impl Omnigraph {
|
|||
|
||||
pub async fn get_commit(&self, commit_id: &str) -> Result<GraphCommit> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
self.coordinator
|
||||
self.coordinator.read().await
|
||||
.resolve_commit(&SnapshotId::new(commit_id))
|
||||
.await
|
||||
}
|
||||
|
|
@ -886,16 +988,18 @@ impl Omnigraph {
|
|||
pub(crate) async fn open_for_mutation(
|
||||
&self,
|
||||
table_key: &str,
|
||||
op_kind: crate::db::MutationOpKind,
|
||||
) -> Result<(Dataset, String, Option<String>)> {
|
||||
table_ops::open_for_mutation(self, table_key).await
|
||||
table_ops::open_for_mutation(self, table_key, op_kind).await
|
||||
}
|
||||
|
||||
pub(crate) async fn open_for_mutation_on_branch(
|
||||
&self,
|
||||
branch: Option<&str>,
|
||||
table_key: &str,
|
||||
op_kind: crate::db::MutationOpKind,
|
||||
) -> Result<(Dataset, String, Option<String>)> {
|
||||
table_ops::open_for_mutation_on_branch(self, branch, table_key).await
|
||||
table_ops::open_for_mutation_on_branch(self, branch, table_key, op_kind).await
|
||||
}
|
||||
|
||||
pub(crate) async fn fork_dataset_from_entry_state(
|
||||
|
|
@ -923,9 +1027,17 @@ impl Omnigraph {
|
|||
full_path: &str,
|
||||
table_branch: Option<&str>,
|
||||
expected_version: u64,
|
||||
op_kind: crate::db::MutationOpKind,
|
||||
) -> Result<Dataset> {
|
||||
table_ops::reopen_for_mutation(self, table_key, full_path, table_branch, expected_version)
|
||||
.await
|
||||
table_ops::reopen_for_mutation(
|
||||
self,
|
||||
table_key,
|
||||
full_path,
|
||||
table_branch,
|
||||
expected_version,
|
||||
op_kind,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn open_dataset_at_state(
|
||||
|
|
@ -965,43 +1077,47 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
pub(crate) async fn commit_manifest_updates(
|
||||
&mut self,
|
||||
&self,
|
||||
updates: &[crate::db::SubTableUpdate],
|
||||
) -> Result<u64> {
|
||||
table_ops::commit_manifest_updates(self, updates).await
|
||||
}
|
||||
|
||||
pub(crate) async fn record_merge_commit(
|
||||
&mut self,
|
||||
&self,
|
||||
manifest_version: u64,
|
||||
parent_commit_id: &str,
|
||||
merged_parent_commit_id: &str,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<String> {
|
||||
table_ops::record_merge_commit(
|
||||
self,
|
||||
manifest_version,
|
||||
parent_commit_id,
|
||||
merged_parent_commit_id,
|
||||
actor_id,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn commit_updates_on_branch_with_expected(
|
||||
&mut self,
|
||||
&self,
|
||||
branch: Option<&str>,
|
||||
updates: &[crate::db::SubTableUpdate],
|
||||
expected_table_versions: &std::collections::HashMap<String, u64>,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<u64> {
|
||||
table_ops::commit_updates_on_branch_with_expected(
|
||||
self,
|
||||
branch,
|
||||
updates,
|
||||
expected_table_versions,
|
||||
actor_id,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn ensure_commit_graph_initialized(&mut self) -> Result<()> {
|
||||
pub(crate) async fn ensure_commit_graph_initialized(&self) -> Result<()> {
|
||||
table_ops::ensure_commit_graph_initialized(self).await
|
||||
}
|
||||
|
||||
|
|
@ -1495,7 +1611,7 @@ edge WorksAt: Person -> Company
|
|||
}
|
||||
|
||||
async fn table_rows_json(db: &Omnigraph, table_key: &str) -> Vec<Value> {
|
||||
let snapshot = db.snapshot();
|
||||
let snapshot = db.snapshot().await;
|
||||
let ds = snapshot.open(table_key).await.unwrap();
|
||||
let batches = db.table_store().scan_batches(&ds).await.unwrap();
|
||||
batches
|
||||
|
|
@ -1509,7 +1625,10 @@ edge WorksAt: Person -> Company
|
|||
}
|
||||
|
||||
async fn seed_person_row(db: &mut Omnigraph, name: &str, age: Option<i32>) {
|
||||
let (mut ds, full_path, table_branch) = db.open_for_mutation("node:Person").await.unwrap();
|
||||
let (mut ds, full_path, table_branch) = db
|
||||
.open_for_mutation("node:Person", crate::db::MutationOpKind::Insert)
|
||||
.await
|
||||
.unwrap();
|
||||
let schema: Arc<Schema> = Arc::new(ds.schema().into());
|
||||
let columns: Vec<Arc<dyn Array>> = schema
|
||||
.fields()
|
||||
|
|
@ -1592,7 +1711,7 @@ edge WorksAt: Person -> Company
|
|||
let uri = dir.path().to_str().unwrap();
|
||||
let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
|
||||
seed_person_row(&mut db, "Alice", Some(30)).await;
|
||||
let before_version = db.snapshot().version();
|
||||
let before_version = db.snapshot().await.version();
|
||||
|
||||
let desired = TEST_SCHEMA
|
||||
.replace("node Person {\n", "node Human @rename_from(\"Person\") {\n")
|
||||
|
|
@ -1603,7 +1722,7 @@ edge WorksAt: Person -> Company
|
|||
);
|
||||
db.apply_schema(&desired).await.unwrap();
|
||||
|
||||
let head = db.snapshot();
|
||||
let head = db.snapshot().await;
|
||||
assert!(head.entry("node:Person").is_none());
|
||||
assert!(head.entry("node:Human").is_some());
|
||||
let historical = ManifestCoordinator::snapshot_at(uri, None, before_version)
|
||||
|
|
@ -1633,7 +1752,7 @@ edge WorksAt: Person -> Company
|
|||
.await
|
||||
.unwrap();
|
||||
|
||||
let all_branches = db.coordinator.all_branches().await.unwrap();
|
||||
let all_branches = db.coordinator.read().await.all_branches().await.unwrap();
|
||||
assert!(
|
||||
!all_branches.iter().any(|b| is_internal_run_branch(b)),
|
||||
"run branch should be deleted after publish, got: {:?}",
|
||||
|
|
@ -1657,7 +1776,7 @@ edge WorksAt: Person -> Company
|
|||
let desired = TEST_SCHEMA.replace("name: String @key", "name: String @key @index");
|
||||
db.apply_schema(&desired).await.unwrap();
|
||||
|
||||
let snapshot = db.snapshot();
|
||||
let snapshot = db.snapshot().await;
|
||||
let ds = snapshot.open("node:Person").await.unwrap();
|
||||
assert!(db.table_store().has_fts_index(&ds, "name").await.unwrap());
|
||||
}
|
||||
|
|
@ -1676,7 +1795,7 @@ edge WorksAt: Person -> Company
|
|||
);
|
||||
db.apply_schema(&desired).await.unwrap();
|
||||
|
||||
let snapshot = db.snapshot();
|
||||
let snapshot = db.snapshot().await;
|
||||
let ds = snapshot.open("node:Person").await.unwrap();
|
||||
assert!(db.table_store().has_btree_index(&ds, "id").await.unwrap());
|
||||
assert!(db.table_store().has_fts_index(&ds, "name").await.unwrap());
|
||||
|
|
@ -1689,11 +1808,16 @@ edge WorksAt: Person -> Company
|
|||
let db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
|
||||
let mut db = db;
|
||||
db.coordinator
|
||||
.write()
|
||||
.await
|
||||
.branch_create(SCHEMA_APPLY_LOCK_BRANCH)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let err = db.open_for_mutation("node:Person").await.unwrap_err();
|
||||
let err = db
|
||||
.open_for_mutation("node:Person", crate::db::MutationOpKind::Insert)
|
||||
.await
|
||||
.unwrap_err();
|
||||
assert!(
|
||||
err.to_string()
|
||||
.contains("write is unavailable while schema apply is in progress")
|
||||
|
|
@ -1706,6 +1830,8 @@ edge WorksAt: Person -> Company
|
|||
let uri = dir.path().to_str().unwrap();
|
||||
let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
|
||||
db.coordinator
|
||||
.write()
|
||||
.await
|
||||
.branch_create(SCHEMA_APPLY_LOCK_BRANCH)
|
||||
.await
|
||||
.unwrap();
|
||||
|
|
@ -1723,6 +1849,8 @@ edge WorksAt: Person -> Company
|
|||
let uri = dir.path().to_str().unwrap();
|
||||
let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
|
||||
db.coordinator
|
||||
.write()
|
||||
.await
|
||||
.branch_create(SCHEMA_APPLY_LOCK_BRANCH)
|
||||
.await
|
||||
.unwrap();
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ pub(super) async fn entity_at(
|
|||
id: &str,
|
||||
version: u64,
|
||||
) -> Result<Option<serde_json::Value>> {
|
||||
let snap = db.coordinator.snapshot_at_version(version).await?;
|
||||
let snap = db.coordinator.read().await.snapshot_at_version(version).await?;
|
||||
entity_from_snapshot(db, &snap, table_key, id).await
|
||||
}
|
||||
|
||||
|
|
@ -142,7 +142,8 @@ async fn export_table_to_writer<W: Write>(
|
|||
.open_snapshot_table(snapshot, table_key)
|
||||
.await?;
|
||||
let ordering = Some(vec![ColumnOrdering::asc_nulls_last("id".to_string())]);
|
||||
let blob_properties = blob_properties_for_table_key(db.catalog(), table_key)?;
|
||||
let catalog = db.catalog();
|
||||
let blob_properties = blob_properties_for_table_key(&catalog, table_key)?;
|
||||
|
||||
if blob_properties.is_empty() {
|
||||
for batch in db.table_store.scan(&ds, None, None, ordering).await? {
|
||||
|
|
@ -207,9 +208,9 @@ fn write_export_rows_from_batch<W: Write>(
|
|||
blob_values: Option<&HashMap<String, Vec<Option<String>>>>,
|
||||
writer: &mut W,
|
||||
) -> Result<()> {
|
||||
let catalog = db.catalog();
|
||||
if let Some(type_name) = table_key.strip_prefix("node:") {
|
||||
let node_type = db
|
||||
.catalog
|
||||
let node_type = catalog
|
||||
.node_types
|
||||
.get(type_name)
|
||||
.ok_or_else(|| OmniError::manifest(format!("unknown node type '{}'", type_name)))?;
|
||||
|
|
@ -243,8 +244,7 @@ fn write_export_rows_from_batch<W: Write>(
|
|||
}
|
||||
|
||||
if let Some(edge_name) = table_key.strip_prefix("edge:") {
|
||||
let edge_type = db
|
||||
.catalog
|
||||
let edge_type = catalog
|
||||
.edge_types
|
||||
.get(edge_name)
|
||||
.ok_or_else(|| OmniError::manifest(format!("unknown edge type '{}'", edge_name)))?;
|
||||
|
|
|
|||
|
|
@ -74,14 +74,14 @@ pub struct TableCleanupStats {
|
|||
|
||||
/// Run Lance `compact_files` on every node + edge table on `main`.
|
||||
/// Tables run in parallel (bounded concurrency).
|
||||
pub async fn optimize_all_tables(db: &mut Omnigraph) -> Result<Vec<TableOptimizeStats>> {
|
||||
pub async fn optimize_all_tables(db: &Omnigraph) -> Result<Vec<TableOptimizeStats>> {
|
||||
db.ensure_schema_state_valid().await?;
|
||||
db.ensure_schema_apply_idle("optimize").await?;
|
||||
|
||||
let resolved = db.resolved_branch_target(None).await?;
|
||||
let snapshot = resolved.snapshot;
|
||||
|
||||
let table_tasks: Vec<_> = all_table_keys(&db.catalog)
|
||||
let table_tasks: Vec<_> = all_table_keys(&db.catalog())
|
||||
.into_iter()
|
||||
.filter_map(|table_key| {
|
||||
let entry = snapshot.entry(&table_key)?;
|
||||
|
|
@ -144,7 +144,7 @@ pub async fn cleanup_all_tables(
|
|||
let resolved = db.resolved_branch_target(None).await?;
|
||||
let snapshot = resolved.snapshot;
|
||||
|
||||
let table_tasks: Vec<_> = all_table_keys(&db.catalog)
|
||||
let table_tasks: Vec<_> = all_table_keys(&db.catalog())
|
||||
.into_iter()
|
||||
.filter_map(|table_key| {
|
||||
let entry = snapshot.entry(&table_key)?;
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ pub(super) async fn plan_schema(
|
|||
}
|
||||
|
||||
pub(super) async fn apply_schema(
|
||||
db: &mut Omnigraph,
|
||||
db: &Omnigraph,
|
||||
desired_schema_source: &str,
|
||||
) -> Result<SchemaApplyResult> {
|
||||
acquire_schema_apply_lock(db).await?;
|
||||
|
|
@ -27,11 +27,11 @@ pub(super) async fn apply_schema(
|
|||
}
|
||||
|
||||
pub(super) async fn apply_schema_with_lock(
|
||||
db: &mut Omnigraph,
|
||||
db: &Omnigraph,
|
||||
desired_schema_source: &str,
|
||||
) -> Result<SchemaApplyResult> {
|
||||
db.ensure_schema_state_valid().await?;
|
||||
let branches = db.coordinator.all_branches().await?;
|
||||
let branches = db.coordinator.read().await.all_branches().await?;
|
||||
// Skip `main` and internal system branches. The schema-apply lock branch
|
||||
// is excluded because it is the cluster-wide schema-apply serializer.
|
||||
// `__run__*` branches are no longer created; the filter remains as
|
||||
|
|
@ -67,7 +67,7 @@ pub(super) async fn apply_schema_with_lock(
|
|||
return Ok(SchemaApplyResult {
|
||||
supported: true,
|
||||
applied: false,
|
||||
manifest_version: db.version(),
|
||||
manifest_version: db.version().await,
|
||||
steps: plan.steps,
|
||||
});
|
||||
}
|
||||
|
|
@ -75,7 +75,7 @@ pub(super) async fn apply_schema_with_lock(
|
|||
let mut desired_catalog = build_catalog_from_ir(&desired_ir)?;
|
||||
fixup_blob_schemas(&mut desired_catalog);
|
||||
|
||||
let snapshot = db.snapshot();
|
||||
let snapshot = db.snapshot().await;
|
||||
let base_manifest_version = snapshot.version();
|
||||
let mut added_tables = BTreeSet::new();
|
||||
let mut renamed_tables = HashMap::new();
|
||||
|
|
@ -209,6 +209,26 @@ pub(super) async fn apply_schema_with_lock(
|
|||
});
|
||||
}
|
||||
|
||||
// Acquire per-(table_key, branch) queues for every existing table
|
||||
// that schema_apply will rewrite or re-index. New tables (added or
|
||||
// renamed targets) aren't acquired — they have no existing dataset
|
||||
// to race against. Held across the per-table commit loop and the
|
||||
// manifest publish via `commit_changes_with_actor` below.
|
||||
//
|
||||
// Schema-apply already holds the graph-wide `__schema_apply_lock__`
|
||||
// sentinel branch, so under PR 1b's intermediate state these
|
||||
// per-table acquisitions are uncontended. They exist for symmetry
|
||||
// with future MR-870 recovery, which will need queue acquisition
|
||||
// before any `Dataset::restore` it issues for SchemaApply sidecars.
|
||||
let schema_apply_queue_keys: Vec<(String, Option<String>)> = recovery_pins
|
||||
.iter()
|
||||
.map(|pin| (pin.table_key.clone(), pin.table_branch.clone()))
|
||||
.collect();
|
||||
let _schema_apply_queue_guards = db
|
||||
.write_queue()
|
||||
.acquire_many(&schema_apply_queue_keys)
|
||||
.await;
|
||||
|
||||
let recovery_handle = if recovery_pins.is_empty()
|
||||
&& sidecar_registrations.is_empty()
|
||||
&& sidecar_tombstones.is_empty()
|
||||
|
|
@ -225,7 +245,10 @@ pub(super) async fn apply_schema_with_lock(
|
|||
let mut sidecar = crate::db::manifest::new_sidecar(
|
||||
crate::db::manifest::SidecarKind::SchemaApply,
|
||||
None,
|
||||
db.audit_actor_id.clone(),
|
||||
// `apply_schema` doesn't currently take an actor (no `apply_schema_as`
|
||||
// public API). The HTTP server's /schema/apply handler can pass actor
|
||||
// context through a follow-up addition. For now, system-attributed.
|
||||
None,
|
||||
recovery_pins,
|
||||
);
|
||||
sidecar.additional_registrations = sidecar_registrations;
|
||||
|
|
@ -266,11 +289,12 @@ pub(super) async fn apply_schema_with_lock(
|
|||
})?;
|
||||
ensure_snapshot_entry_head_matches(db, source_entry).await?;
|
||||
let source_ds = snapshot.open(source_table_key).await?;
|
||||
let current_catalog = db.catalog();
|
||||
let batch = batch_for_schema_apply_rewrite(
|
||||
db,
|
||||
&source_ds,
|
||||
source_table_key,
|
||||
&db.catalog,
|
||||
¤t_catalog,
|
||||
target_table_key,
|
||||
&desired_catalog,
|
||||
property_renames.get(target_table_key),
|
||||
|
|
@ -311,11 +335,12 @@ pub(super) async fn apply_schema_with_lock(
|
|||
})?;
|
||||
ensure_snapshot_entry_head_matches(db, entry).await?;
|
||||
let source_ds = snapshot.open(table_key).await?;
|
||||
let current_catalog = db.catalog();
|
||||
let batch = batch_for_schema_apply_rewrite(
|
||||
db,
|
||||
&source_ds,
|
||||
table_key,
|
||||
&db.catalog,
|
||||
¤t_catalog,
|
||||
table_key,
|
||||
&desired_catalog,
|
||||
property_renames.get(table_key),
|
||||
|
|
@ -418,11 +443,11 @@ pub(super) async fn apply_schema_with_lock(
|
|||
}
|
||||
|
||||
db.refresh_coordinator_only().await?;
|
||||
if db.version() != base_manifest_version {
|
||||
let current_manifest_version = db.version().await;
|
||||
if current_manifest_version != base_manifest_version {
|
||||
return Err(OmniError::manifest_conflict(format!(
|
||||
"schema apply lost its write lease: main advanced from v{} to v{} while schema apply was in progress",
|
||||
base_manifest_version,
|
||||
db.version()
|
||||
base_manifest_version, current_manifest_version,
|
||||
)));
|
||||
}
|
||||
|
||||
|
|
@ -444,13 +469,15 @@ pub(super) async fn apply_schema_with_lock(
|
|||
|
||||
crate::failpoints::maybe_fail("schema_apply.after_staging_write")?;
|
||||
|
||||
let actor_id = db.current_audit_actor().map(str::to_string);
|
||||
// `apply_schema` doesn't currently take an actor; system-attributed.
|
||||
let PublishedSnapshot {
|
||||
manifest_version,
|
||||
_snapshot_id: _,
|
||||
} = db
|
||||
.coordinator
|
||||
.commit_changes_with_actor(&manifest_changes, actor_id.as_deref())
|
||||
.write()
|
||||
.await
|
||||
.commit_changes_with_actor(&manifest_changes, None)
|
||||
.await?;
|
||||
|
||||
crate::failpoints::maybe_fail("schema_apply.after_manifest_commit")?;
|
||||
|
|
@ -471,9 +498,9 @@ pub(super) async fn apply_schema_with_lock(
|
|||
)
|
||||
.await?;
|
||||
|
||||
db.catalog = desired_catalog;
|
||||
db.schema_source = desired_schema_source.to_string();
|
||||
db.coordinator.refresh().await?;
|
||||
db.store_catalog(desired_catalog);
|
||||
db.store_schema_source(desired_schema_source.to_string());
|
||||
db.coordinator.write().await.refresh().await?;
|
||||
db.runtime_cache.invalidate_all().await;
|
||||
if changed_edge_tables {
|
||||
db.invalidate_graph_index().await;
|
||||
|
|
@ -504,15 +531,15 @@ pub(super) async fn apply_schema_with_lock(
|
|||
})
|
||||
}
|
||||
|
||||
pub(super) async fn ensure_schema_apply_idle(db: &mut Omnigraph, operation: &str) -> Result<()> {
|
||||
pub(super) async fn ensure_schema_apply_idle(db: &Omnigraph, operation: &str) -> Result<()> {
|
||||
db.refresh_coordinator_only().await?;
|
||||
ensure_schema_apply_not_locked(db, operation).await
|
||||
}
|
||||
|
||||
pub(super) async fn acquire_schema_apply_lock(db: &mut Omnigraph) -> Result<()> {
|
||||
pub(super) async fn acquire_schema_apply_lock(db: &Omnigraph) -> Result<()> {
|
||||
db.ensure_schema_state_valid().await?;
|
||||
db.refresh_coordinator_only().await?;
|
||||
let branches = db.coordinator.all_branches().await?;
|
||||
let branches = db.coordinator.read().await.all_branches().await?;
|
||||
if branches
|
||||
.iter()
|
||||
.any(|branch| is_schema_apply_lock_branch(branch))
|
||||
|
|
@ -523,12 +550,16 @@ pub(super) async fn acquire_schema_apply_lock(db: &mut Omnigraph) -> Result<()>
|
|||
}
|
||||
|
||||
db.coordinator
|
||||
.write()
|
||||
.await
|
||||
.branch_create(SCHEMA_APPLY_LOCK_BRANCH)
|
||||
.await?;
|
||||
db.refresh_coordinator_only().await?;
|
||||
|
||||
let blocking_branches = db
|
||||
.coordinator
|
||||
.read()
|
||||
.await
|
||||
.all_branches()
|
||||
.await?
|
||||
.into_iter()
|
||||
|
|
@ -545,8 +576,10 @@ pub(super) async fn acquire_schema_apply_lock(db: &mut Omnigraph) -> Result<()>
|
|||
Ok(())
|
||||
}
|
||||
|
||||
pub(super) async fn release_schema_apply_lock(db: &mut Omnigraph) -> Result<()> {
|
||||
pub(super) async fn release_schema_apply_lock(db: &Omnigraph) -> Result<()> {
|
||||
db.coordinator
|
||||
.write()
|
||||
.await
|
||||
.branch_delete(SCHEMA_APPLY_LOCK_BRANCH)
|
||||
.await?;
|
||||
// Use refresh_coordinator_only — the full Omnigraph::refresh would
|
||||
|
|
@ -560,6 +593,8 @@ pub(super) async fn release_schema_apply_lock(db: &mut Omnigraph) -> Result<()>
|
|||
pub(super) async fn ensure_schema_apply_not_locked(db: &Omnigraph, operation: &str) -> Result<()> {
|
||||
if db
|
||||
.coordinator
|
||||
.read()
|
||||
.await
|
||||
.all_branches()
|
||||
.await?
|
||||
.iter()
|
||||
|
|
|
|||
|
|
@ -2,31 +2,31 @@ use super::*;
|
|||
|
||||
pub(super) async fn graph_index(db: &Omnigraph) -> Result<Arc<crate::graph_index::GraphIndex>> {
|
||||
db.ensure_schema_state_valid().await?;
|
||||
let resolved = db
|
||||
.coordinator
|
||||
let coord = db.coordinator.read().await;
|
||||
let resolved = coord
|
||||
.resolve_target(&ReadTarget::Branch(
|
||||
db.coordinator
|
||||
.current_branch()
|
||||
.unwrap_or("main")
|
||||
.to_string(),
|
||||
coord.current_branch().unwrap_or("main").to_string(),
|
||||
))
|
||||
.await?;
|
||||
db.runtime_cache.graph_index(&resolved, &db.catalog).await
|
||||
drop(coord);
|
||||
let catalog = db.catalog();
|
||||
db.runtime_cache.graph_index(&resolved, &catalog).await
|
||||
}
|
||||
|
||||
pub(super) async fn graph_index_for_resolved(
|
||||
db: &Omnigraph,
|
||||
resolved: &ResolvedTarget,
|
||||
) -> Result<Arc<crate::graph_index::GraphIndex>> {
|
||||
db.runtime_cache.graph_index(resolved, &db.catalog).await
|
||||
let catalog = db.catalog();
|
||||
db.runtime_cache.graph_index(resolved, &catalog).await
|
||||
}
|
||||
|
||||
pub(super) async fn ensure_indices(db: &mut Omnigraph) -> Result<()> {
|
||||
let current_branch = db.coordinator.current_branch().map(str::to_string);
|
||||
pub(super) async fn ensure_indices(db: &Omnigraph) -> Result<()> {
|
||||
let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
|
||||
ensure_indices_for_branch(db, current_branch.as_deref()).await
|
||||
}
|
||||
|
||||
pub(super) async fn ensure_indices_on(db: &mut Omnigraph, branch: &str) -> Result<()> {
|
||||
pub(super) async fn ensure_indices_on(db: &Omnigraph, branch: &str) -> Result<()> {
|
||||
let branch = normalize_branch_name(branch)?;
|
||||
ensure_indices_for_branch(db, branch.as_deref()).await
|
||||
}
|
||||
|
|
@ -58,12 +58,18 @@ pub(super) async fn failpoint_publish_table_head_without_index_rebuild_for_test(
|
|||
};
|
||||
let mut expected = std::collections::HashMap::new();
|
||||
expected.insert(table_key.to_string(), entry.table_version);
|
||||
commit_prepared_updates_on_branch_with_expected(db, branch.as_deref(), &[update], &expected)
|
||||
.await
|
||||
commit_prepared_updates_on_branch_with_expected(
|
||||
db,
|
||||
branch.as_deref(),
|
||||
&[update],
|
||||
&expected,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(super) async fn ensure_indices_for_branch(
|
||||
db: &mut Omnigraph,
|
||||
db: &Omnigraph,
|
||||
branch: Option<&str>,
|
||||
) -> Result<()> {
|
||||
db.ensure_schema_state_valid().await?;
|
||||
|
|
@ -72,6 +78,7 @@ pub(super) async fn ensure_indices_for_branch(
|
|||
let snapshot = resolved.snapshot;
|
||||
let mut updates = Vec::new();
|
||||
let active_branch = resolved.branch;
|
||||
let catalog = db.catalog();
|
||||
|
||||
// Recovery sidecar: protect the per-table commit_staged loop in
|
||||
// build_indices_on_dataset (one commit per index built). Only pins
|
||||
|
|
@ -83,7 +90,7 @@ pub(super) async fn ensure_indices_for_branch(
|
|||
// committed work on sibling tables. Steady-state runs (everything
|
||||
// already indexed) skip the sidecar entirely.
|
||||
let mut recovery_pins: Vec<crate::db::manifest::SidecarTablePin> = Vec::new();
|
||||
for type_name in db.catalog.node_types.keys() {
|
||||
for type_name in catalog.node_types.keys() {
|
||||
let table_key = format!("node:{}", type_name);
|
||||
let Some(entry) = snapshot.entry(&table_key) else {
|
||||
continue;
|
||||
|
|
@ -122,7 +129,7 @@ pub(super) async fn ensure_indices_for_branch(
|
|||
});
|
||||
}
|
||||
}
|
||||
for edge_name in db.catalog.edge_types.keys() {
|
||||
for edge_name in catalog.edge_types.keys() {
|
||||
let table_key = format!("edge:{}", edge_name);
|
||||
let Some(entry) = snapshot.entry(&table_key) else {
|
||||
continue;
|
||||
|
|
@ -147,13 +154,28 @@ pub(super) async fn ensure_indices_for_branch(
|
|||
});
|
||||
}
|
||||
}
|
||||
// Acquire per-(table_key, active_branch) queues for every table
|
||||
// that needs index work. Held across the per-table commit loop and
|
||||
// the manifest publish at the end of this function. Sorted-order
|
||||
// acquisition prevents lock-order inversion against concurrent
|
||||
// multi-table writers (mutation finalize, branch_merge, future
|
||||
// MR-870 recovery). Under PR 1b's intermediate state (global server
|
||||
// RwLock still in place), this acquisition is uncontended.
|
||||
let queue_keys: Vec<(String, Option<String>)> = recovery_pins
|
||||
.iter()
|
||||
.map(|pin| (pin.table_key.clone(), pin.table_branch.clone()))
|
||||
.collect();
|
||||
let _queue_guards = db.write_queue().acquire_many(&queue_keys).await;
|
||||
|
||||
let recovery_handle = if recovery_pins.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let sidecar = crate::db::manifest::new_sidecar(
|
||||
crate::db::manifest::SidecarKind::EnsureIndices,
|
||||
active_branch.clone(),
|
||||
db.audit_actor_id.clone(),
|
||||
// `ensure_indices` doesn't currently take an actor; system-attributed.
|
||||
// Future: add `ensure_indices_as` to thread actor context.
|
||||
None,
|
||||
recovery_pins,
|
||||
);
|
||||
Some(
|
||||
|
|
@ -162,7 +184,7 @@ pub(super) async fn ensure_indices_for_branch(
|
|||
)
|
||||
};
|
||||
|
||||
for type_name in db.catalog.node_types.keys() {
|
||||
for type_name in catalog.node_types.keys() {
|
||||
let table_key = format!("node:{}", type_name);
|
||||
let Some(entry) = snapshot.entry(&table_key) else {
|
||||
continue;
|
||||
|
|
@ -179,6 +201,7 @@ pub(super) async fn ensure_indices_for_branch(
|
|||
entry.table_branch.as_deref(),
|
||||
entry.table_version,
|
||||
active_branch,
|
||||
crate::db::MutationOpKind::SchemaRewrite,
|
||||
)
|
||||
.await?
|
||||
}
|
||||
|
|
@ -209,7 +232,7 @@ pub(super) async fn ensure_indices_for_branch(
|
|||
}
|
||||
}
|
||||
|
||||
for edge_name in db.catalog.edge_types.keys() {
|
||||
for edge_name in catalog.edge_types.keys() {
|
||||
let table_key = format!("edge:{}", edge_name);
|
||||
let Some(entry) = snapshot.entry(&table_key) else {
|
||||
continue;
|
||||
|
|
@ -226,6 +249,7 @@ pub(super) async fn ensure_indices_for_branch(
|
|||
entry.table_branch.as_deref(),
|
||||
entry.table_version,
|
||||
active_branch,
|
||||
crate::db::MutationOpKind::SchemaRewrite,
|
||||
)
|
||||
.await?
|
||||
}
|
||||
|
|
@ -264,7 +288,7 @@ pub(super) async fn ensure_indices_for_branch(
|
|||
crate::failpoints::maybe_fail("ensure_indices.post_phase_b_pre_manifest_commit")?;
|
||||
|
||||
if !updates.is_empty() {
|
||||
commit_prepared_updates_on_branch(db, branch, &updates).await?;
|
||||
commit_prepared_updates_on_branch(db, branch, &updates, None).await?;
|
||||
}
|
||||
|
||||
// Recovery sidecar lifecycle: delete after the manifest publish (or
|
||||
|
|
@ -321,7 +345,8 @@ async fn needs_index_work_node(
|
|||
if !db.table_store.has_btree_index(&ds, "id").await? {
|
||||
return Ok(true);
|
||||
}
|
||||
let Some(node_type) = db.catalog.node_types.get(type_name) else {
|
||||
let catalog = db.catalog();
|
||||
let Some(node_type) = catalog.node_types.get(type_name) else {
|
||||
return Ok(false);
|
||||
};
|
||||
for index_cols in &node_type.indices {
|
||||
|
|
@ -376,15 +401,23 @@ async fn needs_index_work_edge(
|
|||
pub(super) async fn open_for_mutation(
|
||||
db: &Omnigraph,
|
||||
table_key: &str,
|
||||
op_kind: crate::db::MutationOpKind,
|
||||
) -> Result<(Dataset, String, Option<String>)> {
|
||||
let current_branch = db.coordinator.current_branch().map(str::to_string);
|
||||
open_for_mutation_on_branch(db, current_branch.as_deref(), table_key).await
|
||||
let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
|
||||
open_for_mutation_on_branch(db, current_branch.as_deref(), table_key, op_kind).await
|
||||
}
|
||||
|
||||
/// Open a sub-table for mutation. The `op_kind` selects the strict-vs-relaxed
|
||||
/// pre-stage version-check policy — see [`crate::db::MutationOpKind`] for the
|
||||
/// rationale per kind. Insert / Merge skip the strict
|
||||
/// `ensure_expected_version` check (Lance's natural conflict resolver +
|
||||
/// per-(table, branch) queue + publisher CAS handle drift); Update / Delete /
|
||||
/// SchemaRewrite keep it (read-modify-write SI).
|
||||
pub(super) async fn open_for_mutation_on_branch(
|
||||
db: &Omnigraph,
|
||||
branch: Option<&str>,
|
||||
table_key: &str,
|
||||
op_kind: crate::db::MutationOpKind,
|
||||
) -> Result<(Dataset, String, Option<String>)> {
|
||||
db.ensure_schema_apply_not_locked("write").await?;
|
||||
let resolved = db.resolved_branch_target(branch).await?;
|
||||
|
|
@ -399,8 +432,10 @@ pub(super) async fn open_for_mutation_on_branch(
|
|||
.table_store
|
||||
.open_dataset_head_for_write(table_key, &full_path, None)
|
||||
.await?;
|
||||
db.table_store
|
||||
.ensure_expected_version(&ds, table_key, entry.table_version)?;
|
||||
if op_kind.strict_pre_stage_version_check() {
|
||||
db.table_store
|
||||
.ensure_expected_version(&ds, table_key, entry.table_version)?;
|
||||
}
|
||||
Ok((ds, full_path, None))
|
||||
}
|
||||
Some(active_branch) => {
|
||||
|
|
@ -411,6 +446,7 @@ pub(super) async fn open_for_mutation_on_branch(
|
|||
entry.table_branch.as_deref(),
|
||||
entry.table_version,
|
||||
active_branch,
|
||||
op_kind,
|
||||
)
|
||||
.await?;
|
||||
Ok((ds, full_path, table_branch))
|
||||
|
|
@ -425,6 +461,7 @@ pub(super) async fn open_owned_dataset_for_branch_write(
|
|||
entry_branch: Option<&str>,
|
||||
entry_version: u64,
|
||||
active_branch: &str,
|
||||
op_kind: crate::db::MutationOpKind,
|
||||
) -> Result<(Dataset, Option<String>)> {
|
||||
match entry_branch {
|
||||
Some(branch) if branch == active_branch => {
|
||||
|
|
@ -432,8 +469,10 @@ pub(super) async fn open_owned_dataset_for_branch_write(
|
|||
.table_store
|
||||
.open_dataset_head_for_write(table_key, full_path, Some(active_branch))
|
||||
.await?;
|
||||
db.table_store
|
||||
.ensure_expected_version(&ds, table_key, entry_version)?;
|
||||
if op_kind.strict_pre_stage_version_check() {
|
||||
db.table_store
|
||||
.ensure_expected_version(&ds, table_key, entry_version)?;
|
||||
}
|
||||
Ok((ds, Some(active_branch.to_string())))
|
||||
}
|
||||
source_branch => {
|
||||
|
|
@ -450,8 +489,10 @@ pub(super) async fn open_owned_dataset_for_branch_write(
|
|||
.table_store
|
||||
.open_dataset_head_for_write(table_key, full_path, Some(active_branch))
|
||||
.await?;
|
||||
db.table_store
|
||||
.ensure_expected_version(&ds, table_key, entry_version)?;
|
||||
if op_kind.strict_pre_stage_version_check() {
|
||||
db.table_store
|
||||
.ensure_expected_version(&ds, table_key, entry_version)?;
|
||||
}
|
||||
Ok((ds, Some(active_branch.to_string())))
|
||||
}
|
||||
}
|
||||
|
|
@ -482,11 +523,27 @@ pub(super) async fn reopen_for_mutation(
|
|||
full_path: &str,
|
||||
table_branch: Option<&str>,
|
||||
expected_version: u64,
|
||||
op_kind: crate::db::MutationOpKind,
|
||||
) -> Result<Dataset> {
|
||||
db.ensure_schema_apply_not_locked("write").await?;
|
||||
db.table_store
|
||||
.reopen_for_mutation(full_path, table_branch, table_key, expected_version)
|
||||
.await
|
||||
if op_kind.strict_pre_stage_version_check() {
|
||||
db.table_store
|
||||
.reopen_for_mutation(full_path, table_branch, table_key, expected_version)
|
||||
.await
|
||||
} else {
|
||||
// Insert / Merge: skip the strict version check. Open at HEAD —
|
||||
// Lance's natural conflict resolver at commit_staged time
|
||||
// (rebase append, dedupe merge_insert) handles concurrent
|
||||
// writers correctly; the publisher CAS in
|
||||
// `MutationStaging::commit_all` (refreshed under the
|
||||
// per-(table, branch) queue via `snapshot_for_branch`) catches
|
||||
// genuine cross-process drift as 409. See
|
||||
// [`crate::db::MutationOpKind`] for the policy rationale.
|
||||
let _ = expected_version;
|
||||
db.table_store
|
||||
.open_dataset_head_for_write(table_key, full_path, table_branch)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) async fn open_dataset_at_state(
|
||||
|
|
@ -505,7 +562,8 @@ pub(super) async fn build_indices_on_dataset(
|
|||
table_key: &str,
|
||||
ds: &mut Dataset,
|
||||
) -> Result<()> {
|
||||
build_indices_on_dataset_for_catalog(db, &db.catalog, table_key, ds).await
|
||||
let catalog = db.catalog();
|
||||
build_indices_on_dataset_for_catalog(db, &catalog, table_key, ds).await
|
||||
}
|
||||
|
||||
pub(super) async fn build_indices_on_dataset_for_catalog(
|
||||
|
|
@ -680,12 +738,19 @@ async fn prepare_updates_for_commit(
|
|||
let mut prepared_update = update.clone();
|
||||
if prepared_update.row_count > 0 {
|
||||
let full_path = format!("{}/{}", db.root_uri, entry.table_path);
|
||||
// Strict version check is correct here: this runs INSIDE
|
||||
// the publisher commit path, after `commit_staged` already
|
||||
// advanced Lance HEAD to `prepared_update.table_version`.
|
||||
// The check is a defense-in-depth assertion that the
|
||||
// dataset state matches what we just committed; not the
|
||||
// pre-stage race the op-kind policy targets.
|
||||
let mut ds = reopen_for_mutation(
|
||||
db,
|
||||
&prepared_update.table_key,
|
||||
&full_path,
|
||||
prepared_update.table_branch.as_deref(),
|
||||
prepared_update.table_version,
|
||||
crate::db::MutationOpKind::SchemaRewrite,
|
||||
)
|
||||
.await?;
|
||||
build_indices_on_dataset(db, &prepared_update.table_key, &mut ds).await?;
|
||||
|
|
@ -702,49 +767,50 @@ async fn prepare_updates_for_commit(
|
|||
}
|
||||
|
||||
async fn commit_prepared_updates(
|
||||
db: &mut Omnigraph,
|
||||
db: &Omnigraph,
|
||||
updates: &[crate::db::SubTableUpdate],
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<u64> {
|
||||
let actor_id = db.current_audit_actor().map(str::to_string);
|
||||
let PublishedSnapshot {
|
||||
manifest_version,
|
||||
_snapshot_id: _,
|
||||
} = db
|
||||
.coordinator
|
||||
.commit_updates_with_actor(updates, actor_id.as_deref())
|
||||
.write()
|
||||
.await
|
||||
.commit_updates_with_actor(updates, actor_id)
|
||||
.await?;
|
||||
Ok(manifest_version)
|
||||
}
|
||||
|
||||
async fn commit_prepared_updates_with_expected(
|
||||
db: &mut Omnigraph,
|
||||
db: &Omnigraph,
|
||||
updates: &[crate::db::SubTableUpdate],
|
||||
expected_table_versions: &std::collections::HashMap<String, u64>,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<u64> {
|
||||
let actor_id = db.current_audit_actor().map(str::to_string);
|
||||
let PublishedSnapshot {
|
||||
manifest_version,
|
||||
_snapshot_id: _,
|
||||
} = db
|
||||
.coordinator
|
||||
.commit_updates_with_actor_with_expected(
|
||||
updates,
|
||||
expected_table_versions,
|
||||
actor_id.as_deref(),
|
||||
)
|
||||
.write()
|
||||
.await
|
||||
.commit_updates_with_actor_with_expected(updates, expected_table_versions, actor_id)
|
||||
.await?;
|
||||
Ok(manifest_version)
|
||||
}
|
||||
|
||||
pub(super) async fn commit_prepared_updates_on_branch(
|
||||
db: &mut Omnigraph,
|
||||
db: &Omnigraph,
|
||||
branch: Option<&str>,
|
||||
updates: &[crate::db::SubTableUpdate],
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<u64> {
|
||||
let current_branch = db.coordinator.current_branch().map(str::to_string);
|
||||
let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
|
||||
let requested_branch = branch.map(str::to_string);
|
||||
if requested_branch == current_branch {
|
||||
return commit_prepared_updates(db, updates).await;
|
||||
return commit_prepared_updates(db, updates, actor_id).await;
|
||||
}
|
||||
|
||||
let mut coordinator = match requested_branch.as_deref() {
|
||||
|
|
@ -753,26 +819,32 @@ pub(super) async fn commit_prepared_updates_on_branch(
|
|||
}
|
||||
None => GraphCoordinator::open(db.uri(), Arc::clone(&db.storage)).await?,
|
||||
};
|
||||
let actor_id = db.current_audit_actor().map(str::to_string);
|
||||
let PublishedSnapshot {
|
||||
manifest_version,
|
||||
_snapshot_id: _,
|
||||
} = coordinator
|
||||
.commit_updates_with_actor(updates, actor_id.as_deref())
|
||||
.commit_updates_with_actor(updates, actor_id)
|
||||
.await?;
|
||||
Ok(manifest_version)
|
||||
}
|
||||
|
||||
pub(super) async fn commit_prepared_updates_on_branch_with_expected(
|
||||
db: &mut Omnigraph,
|
||||
db: &Omnigraph,
|
||||
branch: Option<&str>,
|
||||
updates: &[crate::db::SubTableUpdate],
|
||||
expected_table_versions: &std::collections::HashMap<String, u64>,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<u64> {
|
||||
let current_branch = db.coordinator.current_branch().map(str::to_string);
|
||||
let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
|
||||
let requested_branch = branch.map(str::to_string);
|
||||
if requested_branch == current_branch {
|
||||
return commit_prepared_updates_with_expected(db, updates, expected_table_versions).await;
|
||||
return commit_prepared_updates_with_expected(
|
||||
db,
|
||||
updates,
|
||||
expected_table_versions,
|
||||
actor_id,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
let mut coordinator = match requested_branch.as_deref() {
|
||||
|
|
@ -781,16 +853,11 @@ pub(super) async fn commit_prepared_updates_on_branch_with_expected(
|
|||
}
|
||||
None => GraphCoordinator::open(db.uri(), Arc::clone(&db.storage)).await?,
|
||||
};
|
||||
let actor_id = db.current_audit_actor().map(str::to_string);
|
||||
let PublishedSnapshot {
|
||||
manifest_version,
|
||||
_snapshot_id: _,
|
||||
} = coordinator
|
||||
.commit_updates_with_actor_with_expected(
|
||||
updates,
|
||||
expected_table_versions,
|
||||
actor_id.as_deref(),
|
||||
)
|
||||
.commit_updates_with_actor_with_expected(updates, expected_table_versions, actor_id)
|
||||
.await?;
|
||||
Ok(manifest_version)
|
||||
}
|
||||
|
|
@ -803,31 +870,31 @@ pub(super) async fn commit_updates(
|
|||
updates: &[crate::db::SubTableUpdate],
|
||||
) -> Result<u64> {
|
||||
db.ensure_schema_apply_not_locked("write commit").await?;
|
||||
let current_branch = db.coordinator.current_branch().map(str::to_string);
|
||||
let current_branch = db.coordinator.read().await.current_branch().map(str::to_string);
|
||||
let prepared = prepare_updates_for_commit(db, current_branch.as_deref(), updates).await?;
|
||||
commit_prepared_updates(db, &prepared).await
|
||||
commit_prepared_updates(db, &prepared, None).await
|
||||
}
|
||||
|
||||
pub(super) async fn commit_manifest_updates(
|
||||
db: &mut Omnigraph,
|
||||
db: &Omnigraph,
|
||||
updates: &[crate::db::SubTableUpdate],
|
||||
) -> Result<u64> {
|
||||
db.coordinator.commit_manifest_updates(updates).await
|
||||
db.coordinator.write().await.commit_manifest_updates(updates).await
|
||||
}
|
||||
|
||||
pub(super) async fn record_merge_commit(
|
||||
db: &mut Omnigraph,
|
||||
db: &Omnigraph,
|
||||
manifest_version: u64,
|
||||
parent_commit_id: &str,
|
||||
merged_parent_commit_id: &str,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<String> {
|
||||
let actor_id = db.current_audit_actor().map(str::to_string);
|
||||
db.coordinator
|
||||
db.coordinator.write().await
|
||||
.record_merge_commit(
|
||||
manifest_version,
|
||||
parent_commit_id,
|
||||
merged_parent_commit_id,
|
||||
actor_id.as_deref(),
|
||||
actor_id,
|
||||
)
|
||||
.await
|
||||
.map(|snapshot_id| snapshot_id.as_str().to_string())
|
||||
|
|
@ -837,19 +904,26 @@ pub(super) async fn record_merge_commit(
|
|||
/// `expected_table_versions` map asserts the manifest's pre-write per-table
|
||||
/// versions; mismatches surface as `ManifestConflictDetails::ExpectedVersionMismatch`.
|
||||
pub(super) async fn commit_updates_on_branch_with_expected(
|
||||
db: &mut Omnigraph,
|
||||
db: &Omnigraph,
|
||||
branch: Option<&str>,
|
||||
updates: &[crate::db::SubTableUpdate],
|
||||
expected_table_versions: &std::collections::HashMap<String, u64>,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<u64> {
|
||||
db.ensure_schema_apply_not_locked("write commit").await?;
|
||||
let prepared = prepare_updates_for_commit(db, branch, updates).await?;
|
||||
commit_prepared_updates_on_branch_with_expected(db, branch, &prepared, expected_table_versions)
|
||||
.await
|
||||
commit_prepared_updates_on_branch_with_expected(
|
||||
db,
|
||||
branch,
|
||||
&prepared,
|
||||
expected_table_versions,
|
||||
actor_id,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(super) async fn ensure_commit_graph_initialized(db: &mut Omnigraph) -> Result<()> {
|
||||
db.coordinator.ensure_commit_graph_initialized().await
|
||||
pub(super) async fn ensure_commit_graph_initialized(db: &Omnigraph) -> Result<()> {
|
||||
db.coordinator.write().await.ensure_commit_graph_initialized().await
|
||||
}
|
||||
|
||||
pub(super) async fn invalidate_graph_index(db: &Omnigraph) {
|
||||
|
|
|
|||
231
crates/omnigraph/src/db/write_queue.rs
Normal file
231
crates/omnigraph/src/db/write_queue.rs
Normal file
|
|
@ -0,0 +1,231 @@
|
|||
//! Per-`(table_key, branch)` writer queues — MR-686 scaffolding.
|
||||
//!
|
||||
//! Today every server-layer write serializes on the global
|
||||
//! `Arc<RwLock<Omnigraph>>` in `AppState`. MR-686 replaces that with
|
||||
//! per-`(table_key, branch_ref)` queues so disjoint-key writes proceed
|
||||
//! concurrently. This module owns the queue data structure; callers in
|
||||
//! `MutationStaging::commit_all`, `branch_merge`, `schema_apply`,
|
||||
//! `ensure_indices`, `delete_where`, and the future MR-870 recovery
|
||||
//! reconciler acquire guards before any per-table Lance commit.
|
||||
//!
|
||||
//! ## Why exclusive `tokio::sync::Mutex<()>` per key
|
||||
//!
|
||||
//! Lance's `Dataset::restore` "wins" against concurrent Append/Update/
|
||||
//! Delete/CreateIndex/Merge per `check_restore_txn`, silently orphaning
|
||||
//! the concurrent writer's commit. The queue's *only* application-layer
|
||||
//! job is to serialize Restore against every other writer on the same
|
||||
//! `(table_key, branch_ref)`. Lance OCC handles the rest of the conflict
|
||||
//! matrix (Append vs Append fully compatible, Update vs Update rebases or
|
||||
//! retries, etc.) but cannot make Restore symmetric — that's an upstream
|
||||
//! design choice. Until Lance fixes Restore (or BatchCommitTables
|
||||
//! changes the protocol), every writer takes the same exclusive lock.
|
||||
//!
|
||||
//! `RwLock` (shared for normal writes, exclusive for Restore) is the
|
||||
//! natural follow-up but adds a writer-classification surface that's
|
||||
//! easy to get wrong; misclassifying any writer reintroduces the
|
||||
//! orphaning hazard. We start with `Mutex` and revisit based on
|
||||
//! production telemetry.
|
||||
//!
|
||||
//! ## Sorted-order acquisition
|
||||
//!
|
||||
//! `acquire_many` accepts a slice of keys and acquires them in
|
||||
//! lexicographic order. Multi-table writers (mutation finalize,
|
||||
//! branch_merge, future recovery reconciler) MUST go through
|
||||
//! `acquire_many` so all callers agree on acquisition order — this is
|
||||
//! how lock-order inversion deadlock is prevented.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use tokio::sync::{Mutex as AsyncMutex, OwnedMutexGuard};
|
||||
|
||||
/// Queue key: `(table_key, branch_ref)`. `branch_ref = None` means main.
|
||||
///
|
||||
/// Branch is part of the key because the same Lance dataset can be
|
||||
/// pinned at different versions on different branches; concurrent
|
||||
/// writes to the same `table_key` on disjoint branches must NOT
|
||||
/// serialize at the queue.
|
||||
pub(crate) type TableQueueKey = (String, Option<String>);
|
||||
|
||||
/// Per-`(table_key, branch)` writer queue manager.
|
||||
///
|
||||
/// Lives on `Omnigraph` as `Arc<WriteQueueManager>` so HTTP handlers,
|
||||
/// engine internals, the CLI binary, and future background reconcilers
|
||||
/// (MR-870 recovery, MR-848 index) all reach it via the engine handle.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct WriteQueueManager {
|
||||
/// Held only briefly per `acquire` call: clone out the per-key Arc,
|
||||
/// release the std mutex, then await the per-key tokio Mutex.
|
||||
queues: Mutex<HashMap<TableQueueKey, Arc<AsyncMutex<()>>>>,
|
||||
}
|
||||
|
||||
impl WriteQueueManager {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Get-or-create the per-key queue and clone its Arc.
|
||||
fn slot(&self, key: &TableQueueKey) -> Arc<AsyncMutex<()>> {
|
||||
let mut map = self.queues.lock().expect("write queue map poisoned");
|
||||
if let Some(existing) = map.get(key) {
|
||||
return Arc::clone(existing);
|
||||
}
|
||||
let fresh = Arc::new(AsyncMutex::new(()));
|
||||
map.insert(key.clone(), Arc::clone(&fresh));
|
||||
fresh
|
||||
}
|
||||
|
||||
/// Acquire exclusive access to the queue for one `(table_key, branch)`.
|
||||
///
|
||||
/// Blocks until the lock is available. Drop the returned guard to
|
||||
/// release; the lock outlives the `WriteQueueManager` borrow.
|
||||
pub(crate) async fn acquire(&self, key: &TableQueueKey) -> OwnedMutexGuard<()> {
|
||||
self.slot(key).lock_owned().await
|
||||
}
|
||||
|
||||
/// Acquire exclusive access to many `(table_key, branch)` keys
|
||||
/// atomically, in lex-sorted order. Used by multi-table writers
|
||||
/// (mutation finalize, branch_merge, recovery) so all callers
|
||||
/// agree on acquisition order — prevents lock-order inversion.
|
||||
///
|
||||
/// Empty input returns an empty Vec without touching the map.
|
||||
/// Duplicates in `keys` are deduped before acquisition (the same
|
||||
/// key acquired twice would deadlock against itself).
|
||||
pub(crate) async fn acquire_many(
|
||||
&self,
|
||||
keys: &[TableQueueKey],
|
||||
) -> Vec<OwnedMutexGuard<()>> {
|
||||
if keys.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
let mut sorted: Vec<TableQueueKey> = keys.to_vec();
|
||||
sorted.sort();
|
||||
sorted.dedup();
|
||||
let mut guards = Vec::with_capacity(sorted.len());
|
||||
for key in &sorted {
|
||||
guards.push(self.acquire(key).await);
|
||||
}
|
||||
guards
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::time::timeout;
|
||||
|
||||
fn key(table: &str, branch: Option<&str>) -> TableQueueKey {
|
||||
(table.to_string(), branch.map(str::to_string))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn acquire_many_empty_returns_empty() {
|
||||
let qm = WriteQueueManager::new();
|
||||
let guards = qm.acquire_many(&[]).await;
|
||||
assert!(guards.is_empty());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn acquire_many_dedupes_repeated_keys() {
|
||||
// Same key passed twice would deadlock if not deduped.
|
||||
let qm = WriteQueueManager::new();
|
||||
let k = key("t1", None);
|
||||
let guards = timeout(
|
||||
Duration::from_secs(2),
|
||||
qm.acquire_many(&[k.clone(), k.clone(), k]),
|
||||
)
|
||||
.await
|
||||
.expect("acquire_many with duplicates deadlocked");
|
||||
assert_eq!(guards.len(), 1);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn acquire_many_sorts_keys_deterministically() {
|
||||
// Two callers passing keys in different orders must acquire in
|
||||
// the same internal order. We test this indirectly: caller A
|
||||
// passes [a, c] and caller B passes [c, a]; if they both
|
||||
// acquire in sorted order the second caller blocks on `a` first,
|
||||
// not `c` — same as A — so no deadlock under any interleaving.
|
||||
// Direct sort observation: call acquire_many with a reversed
|
||||
// input and verify it doesn't deadlock against a held guard on
|
||||
// the sorted-first key.
|
||||
let qm = Arc::new(WriteQueueManager::new());
|
||||
let a = key("a", None);
|
||||
let z = key("z", None);
|
||||
|
||||
// Hold `a` exclusively.
|
||||
let _held = qm.acquire(&a).await;
|
||||
|
||||
// acquire_many([z, a]) — must sort to [a, z] internally and
|
||||
// block on `a`. With a 200ms timeout we should NOT see it
|
||||
// complete (it's blocked on `a`).
|
||||
let qm2 = Arc::clone(&qm);
|
||||
let z_clone = z.clone();
|
||||
let a_clone = a.clone();
|
||||
let result = timeout(Duration::from_millis(200), async move {
|
||||
qm2.acquire_many(&[z_clone, a_clone]).await
|
||||
})
|
||||
.await;
|
||||
assert!(result.is_err(), "acquire_many should block on `a`, the lex-first key");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn same_key_acquire_serializes() {
|
||||
let qm = Arc::new(WriteQueueManager::new());
|
||||
let k = key("t1", None);
|
||||
|
||||
let first = qm.acquire(&k).await;
|
||||
|
||||
// Second acquire on same key should NOT complete within 200ms.
|
||||
let qm2 = Arc::clone(&qm);
|
||||
let k2 = k.clone();
|
||||
let blocked = timeout(Duration::from_millis(200), async move {
|
||||
qm2.acquire(&k2).await
|
||||
})
|
||||
.await;
|
||||
assert!(blocked.is_err(), "second acquire on same key must block");
|
||||
|
||||
// Drop the first guard, then second acquire should succeed.
|
||||
drop(first);
|
||||
let _second = timeout(Duration::from_secs(2), qm.acquire(&k))
|
||||
.await
|
||||
.expect("second acquire after release should not block");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn disjoint_keys_acquire_concurrently() {
|
||||
let qm = Arc::new(WriteQueueManager::new());
|
||||
let a = key("a", None);
|
||||
let b = key("b", None);
|
||||
|
||||
// Hold `a` indefinitely.
|
||||
let _held_a = qm.acquire(&a).await;
|
||||
|
||||
// Acquire `b` on a different task. Should complete promptly
|
||||
// because `b` is disjoint from `a`.
|
||||
let qm2 = Arc::clone(&qm);
|
||||
let start = Instant::now();
|
||||
let _held_b = timeout(Duration::from_secs(2), qm2.acquire(&b))
|
||||
.await
|
||||
.expect("disjoint key acquire must not block on unrelated held key");
|
||||
assert!(
|
||||
start.elapsed() < Duration::from_millis(500),
|
||||
"disjoint acquire took {:?}, should be near-instant",
|
||||
start.elapsed()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn disjoint_branches_on_same_table_do_not_serialize() {
|
||||
// (table, main) and (table, feature) are different keys.
|
||||
let qm = Arc::new(WriteQueueManager::new());
|
||||
let main_k = key("t1", None);
|
||||
let feature_k = key("t1", Some("feature"));
|
||||
|
||||
let _held_main = qm.acquire(&main_k).await;
|
||||
let _held_feature = timeout(Duration::from_secs(2), qm.acquire(&feature_k))
|
||||
.await
|
||||
.expect("same-table-different-branch should not serialize");
|
||||
}
|
||||
}
|
||||
|
|
@ -855,8 +855,9 @@ async fn publish_adopted_source_state(
|
|||
.ok_or_else(|| OmniError::manifest(format!("missing source entry for {}", table_key)))?;
|
||||
let target_entry = target_snapshot.entry(table_key);
|
||||
|
||||
let target_active = target_db.active_branch().await;
|
||||
match (
|
||||
target_db.active_branch(),
|
||||
target_active.as_deref(),
|
||||
source_entry.table_branch.as_deref(),
|
||||
) {
|
||||
// Both on main — pointer switch is safe (same lineage, version columns valid)
|
||||
|
|
@ -945,7 +946,13 @@ async fn publish_rewritten_merge_table(
|
|||
table_key: &str,
|
||||
staged: &StagedMergeResult,
|
||||
) -> Result<crate::db::SubTableUpdate> {
|
||||
let (ds, full_path, table_branch) = target_db.open_for_mutation(table_key).await?;
|
||||
// Branch merge's source-rewrite path is Merge-shaped (upsert from
|
||||
// source onto target). The inline `delete_where` later in this
|
||||
// function operates on rows the rewrite chose to remove, not
|
||||
// user-facing predicates, so Merge is the correct policy here.
|
||||
let (ds, full_path, table_branch) = target_db
|
||||
.open_for_mutation(table_key, crate::db::MutationOpKind::Merge)
|
||||
.await?;
|
||||
let mut current_ds = ds;
|
||||
|
||||
// Phase 1: merge_insert changed/new rows (preserves _row_created_at_version for
|
||||
|
|
@ -1045,25 +1052,26 @@ async fn publish_rewritten_merge_table(
|
|||
}
|
||||
|
||||
impl Omnigraph {
|
||||
pub async fn branch_merge(&mut self, source: &str, target: &str) -> Result<MergeOutcome> {
|
||||
pub async fn branch_merge(&self, source: &str, target: &str) -> Result<MergeOutcome> {
|
||||
self.branch_merge_as(source, target, None).await
|
||||
}
|
||||
|
||||
pub async fn branch_merge_as(
|
||||
&mut self,
|
||||
&self,
|
||||
source: &str,
|
||||
target: &str,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<MergeOutcome> {
|
||||
self.ensure_schema_apply_idle("branch_merge").await?;
|
||||
let previous_actor = self.audit_actor_id.clone();
|
||||
self.audit_actor_id = actor_id.map(str::to_string);
|
||||
let result = self.branch_merge_impl(source, target).await;
|
||||
self.audit_actor_id = previous_actor;
|
||||
result
|
||||
self.branch_merge_impl(source, target, actor_id).await
|
||||
}
|
||||
|
||||
async fn branch_merge_impl(&mut self, source: &str, target: &str) -> Result<MergeOutcome> {
|
||||
async fn branch_merge_impl(
|
||||
&self,
|
||||
source: &str,
|
||||
target: &str,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<MergeOutcome> {
|
||||
if is_internal_run_branch(source) || is_internal_run_branch(target) {
|
||||
return Err(OmniError::manifest(format!(
|
||||
"branch_merge does not allow internal run refs ('{}' -> '{}')",
|
||||
|
|
@ -1113,7 +1121,17 @@ impl Omnigraph {
|
|||
))
|
||||
.await?
|
||||
.snapshot;
|
||||
let previous_branch = self.active_branch().map(str::to_string);
|
||||
// Hold the merge-exclusive mutex across the full swap → operate
|
||||
// → restore window. Two concurrent branch_merge calls would
|
||||
// otherwise interleave their three separate `coordinator.write()`
|
||||
// acquisitions, leaving each merge's body running against the
|
||||
// other's swapped coord. Pinned by
|
||||
// `concurrent_branch_merges_distinct_targets_do_not_swap_into_each_other`
|
||||
// in `crates/omnigraph-server/tests/server.rs`.
|
||||
let merge_exclusive = self.merge_exclusive();
|
||||
let _merge_guard = merge_exclusive.lock().await;
|
||||
|
||||
let previous_branch = self.active_branch().await;
|
||||
let previous = self
|
||||
.swap_coordinator_for_branch(target_branch.as_deref())
|
||||
.await?;
|
||||
|
|
@ -1124,9 +1142,10 @@ impl Omnigraph {
|
|||
&target_head_commit_id,
|
||||
&source_head_commit_id,
|
||||
is_fast_forward,
|
||||
actor_id,
|
||||
)
|
||||
.await;
|
||||
self.restore_coordinator(previous);
|
||||
self.restore_coordinator(previous).await;
|
||||
|
||||
if merge_result.is_ok() && previous_branch == target_branch {
|
||||
self.refresh().await?;
|
||||
|
|
@ -1136,15 +1155,16 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
async fn branch_merge_on_current_target(
|
||||
&mut self,
|
||||
&self,
|
||||
base_snapshot: &Snapshot,
|
||||
source_snapshot: &Snapshot,
|
||||
target_head_commit_id: &str,
|
||||
source_head_commit_id: &str,
|
||||
is_fast_forward: bool,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<MergeOutcome> {
|
||||
self.ensure_commit_graph_initialized().await?;
|
||||
let target_snapshot = self.snapshot();
|
||||
let target_snapshot = self.snapshot().await;
|
||||
|
||||
let mut table_keys = HashSet::new();
|
||||
for entry in base_snapshot.entries() {
|
||||
|
|
@ -1180,7 +1200,7 @@ impl Omnigraph {
|
|||
|
||||
if let Some(staged) = stage_streaming_table_merge(
|
||||
table_key,
|
||||
self.catalog(),
|
||||
&self.catalog(),
|
||||
base_snapshot,
|
||||
source_snapshot,
|
||||
&target_snapshot,
|
||||
|
|
@ -1227,6 +1247,55 @@ impl Omnigraph {
|
|||
// requires pre-computing source deltas during candidate
|
||||
// classification (a structural change to `CandidateTableState`)
|
||||
// and is left as follow-up work.
|
||||
// Acquire per-(table_key, target_branch) queues for every table
|
||||
// touched by the merge plan. Sorted-order acquisition prevents
|
||||
// lock-order inversion against concurrent multi-table writers.
|
||||
// The active branch (set by the caller's `swap_coordinator_for_branch`)
|
||||
// is the merge target; queue keys are scoped to it because a
|
||||
// branch_merge writes only to the target branch.
|
||||
//
|
||||
// Held across the per-table publish loop and the manifest
|
||||
// commit + record_merge_commit calls below. Under PR 1b's
|
||||
// intermediate state (global server RwLock still in place),
|
||||
// this acquisition is uncontended.
|
||||
let active_branch_for_keys = self.active_branch().await;
|
||||
let merge_queue_keys: Vec<(String, Option<String>)> = ordered_table_keys
|
||||
.iter()
|
||||
.filter(|table_key| {
|
||||
matches!(
|
||||
candidates.get(*table_key),
|
||||
Some(CandidateTableState::RewriteMerged(_))
|
||||
| Some(CandidateTableState::AdoptSourceState)
|
||||
)
|
||||
})
|
||||
.map(|table_key| (table_key.clone(), active_branch_for_keys.clone()))
|
||||
.collect();
|
||||
let _merge_queue_guards = self.write_queue().acquire_many(&merge_queue_keys).await;
|
||||
|
||||
let post_queue_snapshot = self.snapshot().await;
|
||||
for table_key in &ordered_table_keys {
|
||||
let Some(candidate) = candidates.get(table_key) else {
|
||||
continue;
|
||||
};
|
||||
if !matches!(
|
||||
candidate,
|
||||
CandidateTableState::RewriteMerged(_) | CandidateTableState::AdoptSourceState
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
let expected = target_snapshot.entry(table_key).map(|e| e.table_version);
|
||||
let current = post_queue_snapshot
|
||||
.entry(table_key)
|
||||
.map(|e| e.table_version);
|
||||
if expected != current {
|
||||
return Err(OmniError::manifest_expected_version_mismatch(
|
||||
table_key.clone(),
|
||||
expected.unwrap_or(0),
|
||||
current.unwrap_or(0),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let recovery_pins: Vec<crate::db::manifest::SidecarTablePin> = ordered_table_keys
|
||||
.iter()
|
||||
.filter_map(|table_key| {
|
||||
|
|
@ -1252,7 +1321,7 @@ impl Omnigraph {
|
|||
// the orphaned post-Phase-B HEAD on the target ref.
|
||||
// Same rationale as table_ops.rs:115-120 in
|
||||
// ensure_indices_for_branch.
|
||||
table_branch: self.active_branch().map(str::to_string),
|
||||
table_branch: active_branch_for_keys.clone(),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
|
@ -1268,11 +1337,11 @@ impl Omnigraph {
|
|||
// `branch_merge` calls `swap_coordinator_for_branch(target_branch)`
|
||||
// before invoking this function, so `self.active_branch()`
|
||||
// is the target.
|
||||
let target_branch = self.active_branch().map(str::to_string);
|
||||
let target_branch = active_branch_for_keys.clone();
|
||||
let mut sidecar = crate::db::manifest::new_sidecar(
|
||||
crate::db::manifest::SidecarKind::BranchMerge,
|
||||
target_branch,
|
||||
self.audit_actor_id.clone(),
|
||||
actor_id.map(str::to_string),
|
||||
recovery_pins,
|
||||
);
|
||||
// Carry the source branch's HEAD commit id so the recovery
|
||||
|
|
@ -1301,7 +1370,7 @@ impl Omnigraph {
|
|||
CandidateTableState::AdoptSourceState => {
|
||||
publish_adopted_source_state(
|
||||
self,
|
||||
self.catalog(),
|
||||
&self.catalog(),
|
||||
base_snapshot,
|
||||
source_snapshot,
|
||||
&target_snapshot,
|
||||
|
|
@ -1326,7 +1395,7 @@ impl Omnigraph {
|
|||
crate::failpoints::maybe_fail("branch_merge.post_phase_b_pre_manifest_commit")?;
|
||||
|
||||
let manifest_version = if updates.is_empty() {
|
||||
self.version()
|
||||
self.version().await
|
||||
} else {
|
||||
self.commit_manifest_updates(&updates).await?
|
||||
};
|
||||
|
|
@ -1349,6 +1418,7 @@ impl Omnigraph {
|
|||
manifest_version,
|
||||
target_head_commit_id,
|
||||
source_head_commit_id,
|
||||
actor_id,
|
||||
)
|
||||
.await?;
|
||||
|
||||
|
|
|
|||
|
|
@ -345,8 +345,8 @@ async fn validate_edge_insert_endpoints(
|
|||
edge_name: &str,
|
||||
assignments: &HashMap<String, Literal>,
|
||||
) -> Result<()> {
|
||||
let edge_type = db
|
||||
.catalog()
|
||||
let catalog = db.catalog();
|
||||
let edge_type = catalog
|
||||
.edge_types
|
||||
.get(edge_name)
|
||||
.ok_or_else(|| OmniError::manifest(format!("unknown edge type '{}'", edge_name)))?;
|
||||
|
|
@ -600,6 +600,7 @@ async fn open_table_for_mutation(
|
|||
staging: &mut MutationStaging,
|
||||
branch: Option<&str>,
|
||||
table_key: &str,
|
||||
op_kind: crate::db::MutationOpKind,
|
||||
) -> Result<(Dataset, String, Option<String>)> {
|
||||
if let Some(prior) = staging.inline_committed.get(table_key) {
|
||||
let path = staging.paths.get(table_key).ok_or_else(|| {
|
||||
|
|
@ -614,18 +615,21 @@ async fn open_table_for_mutation(
|
|||
&path.full_path,
|
||||
path.table_branch.as_deref(),
|
||||
prior.table_version,
|
||||
op_kind,
|
||||
)
|
||||
.await?;
|
||||
return Ok((ds, path.full_path.clone(), path.table_branch.clone()));
|
||||
}
|
||||
let (ds, full_path, table_branch) =
|
||||
db.open_for_mutation_on_branch(branch, table_key).await?;
|
||||
let (ds, full_path, table_branch) = db
|
||||
.open_for_mutation_on_branch(branch, table_key, op_kind)
|
||||
.await?;
|
||||
let expected_version = ds.version().version;
|
||||
staging.ensure_path(
|
||||
table_key,
|
||||
full_path.clone(),
|
||||
table_branch.clone(),
|
||||
expected_version,
|
||||
op_kind,
|
||||
);
|
||||
Ok((ds, full_path, table_branch))
|
||||
}
|
||||
|
|
@ -670,7 +674,7 @@ fn enforce_no_mixed_destructive_constructive(
|
|||
|
||||
impl Omnigraph {
|
||||
pub async fn mutate(
|
||||
&mut self,
|
||||
&self,
|
||||
branch: &str,
|
||||
query_source: &str,
|
||||
query_name: &str,
|
||||
|
|
@ -681,28 +685,24 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
pub async fn mutate_as(
|
||||
&mut self,
|
||||
&self,
|
||||
branch: &str,
|
||||
query_source: &str,
|
||||
query_name: &str,
|
||||
params: &ParamMap,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<MutationResult> {
|
||||
let previous_actor = self.audit_actor_id.clone();
|
||||
self.audit_actor_id = actor_id.map(str::to_string);
|
||||
let result = self
|
||||
.mutate_with_current_actor(branch, query_source, query_name, params)
|
||||
.await;
|
||||
self.audit_actor_id = previous_actor;
|
||||
result
|
||||
self.mutate_with_current_actor(branch, query_source, query_name, params, actor_id)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn mutate_with_current_actor(
|
||||
&mut self,
|
||||
&self,
|
||||
branch: &str,
|
||||
query_source: &str,
|
||||
query_name: &str,
|
||||
params: &ParamMap,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<MutationResult> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
let requested = Self::normalize_branch_name(branch)?;
|
||||
|
|
@ -737,11 +737,19 @@ impl Omnigraph {
|
|||
Err(e) => Err(e),
|
||||
Ok(total) if staging.is_empty() => Ok(total),
|
||||
Ok(total) => {
|
||||
let (updates, expected_versions, sidecar_handle) = staging
|
||||
.finalize(
|
||||
let staged = staging.stage_all(self, requested.as_deref()).await?;
|
||||
// `_queue_guards` holds per-(table_key, branch) write
|
||||
// queues acquired inside `commit_all`. Held across the
|
||||
// manifest publish below so no concurrent writer can
|
||||
// interleave between our commit_staged and our publish
|
||||
// (which would correctly fail our CAS but leave Lance
|
||||
// HEAD advanced — the residual class MR-870 recovers).
|
||||
let (updates, expected_versions, sidecar_handle, _queue_guards) = staged
|
||||
.commit_all(
|
||||
self,
|
||||
requested.as_deref(),
|
||||
crate::db::manifest::SidecarKind::Mutation,
|
||||
actor_id,
|
||||
)
|
||||
.await?;
|
||||
// Failpoint that wedges the documented finalize→publisher
|
||||
|
|
@ -759,6 +767,7 @@ impl Omnigraph {
|
|||
requested.as_deref(),
|
||||
&updates,
|
||||
&expected_versions,
|
||||
actor_id,
|
||||
)
|
||||
.await?;
|
||||
// Phase C succeeded — sidecar can be deleted. If this
|
||||
|
|
@ -794,7 +803,7 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
async fn execute_named_mutation(
|
||||
&mut self,
|
||||
&self,
|
||||
query_source: &str,
|
||||
query_name: &str,
|
||||
params: &ParamMap,
|
||||
|
|
@ -804,7 +813,7 @@ impl Omnigraph {
|
|||
let query_decl = omnigraph_compiler::find_named_query(query_source, query_name)
|
||||
.map_err(|e| OmniError::manifest(e.to_string()))?;
|
||||
|
||||
let checked = typecheck_query_decl(self.catalog(), &query_decl)?;
|
||||
let checked = typecheck_query_decl(&self.catalog(), &query_decl)?;
|
||||
match checked {
|
||||
CheckedQuery::Mutation(_) => {}
|
||||
CheckedQuery::Read(_) => {
|
||||
|
|
@ -858,7 +867,7 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
async fn execute_insert(
|
||||
&mut self,
|
||||
&self,
|
||||
type_name: &str,
|
||||
assignments: &[IRAssignment],
|
||||
params: &ParamMap,
|
||||
|
|
@ -906,8 +915,13 @@ impl Omnigraph {
|
|||
let has_key = node_type.key_property().is_some();
|
||||
let table_key = format!("node:{}", type_name);
|
||||
// Capture pre-write metadata on first touch (no Lance write).
|
||||
let insert_kind = if has_key {
|
||||
crate::db::MutationOpKind::Merge
|
||||
} else {
|
||||
crate::db::MutationOpKind::Insert
|
||||
};
|
||||
let (_ds, _full_path, _table_branch) =
|
||||
open_table_for_mutation(self, staging, branch, &table_key).await?;
|
||||
open_table_for_mutation(self, staging, branch, &table_key, insert_kind).await?;
|
||||
// Accumulate. @key inserts go into the Merge stream (so a
|
||||
// later update on the same id coalesces correctly); no-key
|
||||
// inserts go into the Append stream.
|
||||
|
|
@ -941,8 +955,14 @@ impl Omnigraph {
|
|||
}
|
||||
let table_key = format!("edge:{}", type_name);
|
||||
// Capture pre-write metadata on first touch (no Lance write).
|
||||
let (ds, _full_path, _table_branch) =
|
||||
open_table_for_mutation(self, staging, branch, &table_key).await?;
|
||||
let (ds, _full_path, _table_branch) = open_table_for_mutation(
|
||||
self,
|
||||
staging,
|
||||
branch,
|
||||
&table_key,
|
||||
crate::db::MutationOpKind::Insert,
|
||||
)
|
||||
.await?;
|
||||
// Accumulate the new edge row. Edge IDs are ULID-generated so
|
||||
// Append mode is correct (no key-based dedup needed).
|
||||
staging.append_batch(&table_key, schema, PendingMode::Append, batch.clone())?;
|
||||
|
|
@ -972,7 +992,7 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
async fn execute_update(
|
||||
&mut self,
|
||||
&self,
|
||||
type_name: &str,
|
||||
assignments: &[IRAssignment],
|
||||
predicate: &IRMutationPredicate,
|
||||
|
|
@ -1003,8 +1023,14 @@ impl Omnigraph {
|
|||
let blob_props = self.catalog().node_types[type_name].blob_properties.clone();
|
||||
|
||||
let table_key = format!("node:{}", type_name);
|
||||
let (ds, _full_path, _table_branch) =
|
||||
open_table_for_mutation(self, staging, branch, &table_key).await?;
|
||||
let (ds, _full_path, _table_branch) = open_table_for_mutation(
|
||||
self,
|
||||
staging,
|
||||
branch,
|
||||
&table_key,
|
||||
crate::db::MutationOpKind::Update,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Scan committed via Lance + apply the same predicate to pending
|
||||
// batches via DataFusion `MemTable` (read-your-writes for prior
|
||||
|
|
@ -1097,7 +1123,7 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
async fn execute_delete(
|
||||
&mut self,
|
||||
&self,
|
||||
type_name: &str,
|
||||
predicate: &IRMutationPredicate,
|
||||
params: &ParamMap,
|
||||
|
|
@ -1115,7 +1141,7 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
async fn execute_delete_node(
|
||||
&mut self,
|
||||
&self,
|
||||
type_name: &str,
|
||||
predicate: &IRMutationPredicate,
|
||||
params: &ParamMap,
|
||||
|
|
@ -1125,8 +1151,14 @@ impl Omnigraph {
|
|||
let pred_sql = predicate_to_sql(predicate, params, false)?;
|
||||
|
||||
let table_key = format!("node:{}", type_name);
|
||||
let (ds, full_path, table_branch) =
|
||||
open_table_for_mutation(self, staging, branch, &table_key).await?;
|
||||
let (ds, full_path, table_branch) = open_table_for_mutation(
|
||||
self,
|
||||
staging,
|
||||
branch,
|
||||
&table_key,
|
||||
crate::db::MutationOpKind::Delete,
|
||||
)
|
||||
.await?;
|
||||
let initial_version = ds.version().version;
|
||||
|
||||
// Scan matching IDs for cascade. Per D₂ this never overlaps with
|
||||
|
|
@ -1171,8 +1203,10 @@ impl Omnigraph {
|
|||
&full_path,
|
||||
table_branch.as_deref(),
|
||||
initial_version,
|
||||
crate::db::MutationOpKind::Delete,
|
||||
)
|
||||
.await?;
|
||||
crate::failpoints::maybe_fail("mutation.delete_node_pre_primary_delete")?;
|
||||
let delete_state = self
|
||||
.table_store()
|
||||
.delete_where(&full_path, &mut ds, &pred_sql)
|
||||
|
|
@ -1214,8 +1248,14 @@ impl Omnigraph {
|
|||
|
||||
let edge_table_key = format!("edge:{}", edge_name);
|
||||
let cascade_filter = cascade_filters.join(" OR ");
|
||||
let (mut edge_ds, edge_full_path, edge_table_branch) =
|
||||
open_table_for_mutation(self, staging, branch, &edge_table_key).await?;
|
||||
let (mut edge_ds, edge_full_path, edge_table_branch) = open_table_for_mutation(
|
||||
self,
|
||||
staging,
|
||||
branch,
|
||||
&edge_table_key,
|
||||
crate::db::MutationOpKind::Delete,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let edge_delete = self
|
||||
.table_store()
|
||||
|
|
@ -1246,7 +1286,7 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
async fn execute_delete_edge(
|
||||
&mut self,
|
||||
&self,
|
||||
type_name: &str,
|
||||
predicate: &IRMutationPredicate,
|
||||
params: &ParamMap,
|
||||
|
|
@ -1256,8 +1296,14 @@ impl Omnigraph {
|
|||
let pred_sql = predicate_to_sql(predicate, params, true)?;
|
||||
|
||||
let table_key = format!("edge:{}", type_name);
|
||||
let (mut ds, full_path, table_branch) =
|
||||
open_table_for_mutation(self, staging, branch, &table_key).await?;
|
||||
let (mut ds, full_path, table_branch) = open_table_for_mutation(
|
||||
self,
|
||||
staging,
|
||||
branch,
|
||||
&table_key,
|
||||
crate::db::MutationOpKind::Delete,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let delete_state = self
|
||||
.table_store()
|
||||
|
|
|
|||
|
|
@ -13,11 +13,12 @@ impl Omnigraph {
|
|||
) -> Result<QueryResult> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
let resolved = self.resolved_target(target).await?;
|
||||
let catalog = self.catalog();
|
||||
|
||||
let query_decl = omnigraph_compiler::find_named_query(query_source, query_name)
|
||||
.map_err(|e| OmniError::manifest(e.to_string()))?;
|
||||
let type_ctx = typecheck_query(self.catalog(), &query_decl)?;
|
||||
let ir = lower_query(self.catalog(), &query_decl, &type_ctx)?;
|
||||
let type_ctx = typecheck_query(&catalog, &query_decl)?;
|
||||
let ir = lower_query(&catalog, &query_decl, &type_ctx)?;
|
||||
|
||||
let needs_graph = ir
|
||||
.pipeline
|
||||
|
|
@ -34,7 +35,7 @@ impl Omnigraph {
|
|||
params,
|
||||
&resolved.snapshot,
|
||||
graph_index.as_deref(),
|
||||
self.catalog(),
|
||||
&catalog,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
|
@ -52,19 +53,19 @@ impl Omnigraph {
|
|||
) -> Result<QueryResult> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
let snapshot = self.snapshot_at_version(version).await?;
|
||||
let catalog = self.catalog();
|
||||
|
||||
let query_decl = omnigraph_compiler::find_named_query(query_source, query_name)
|
||||
.map_err(|e| OmniError::manifest(e.to_string()))?;
|
||||
let type_ctx = typecheck_query(self.catalog(), &query_decl)?;
|
||||
let ir = lower_query(self.catalog(), &query_decl, &type_ctx)?;
|
||||
let type_ctx = typecheck_query(&catalog, &query_decl)?;
|
||||
let ir = lower_query(&catalog, &query_decl, &type_ctx)?;
|
||||
|
||||
let needs_graph = ir
|
||||
.pipeline
|
||||
.iter()
|
||||
.any(|op| matches!(op, IROp::Expand { .. } | IROp::AntiJoin { .. }));
|
||||
let graph_index = if needs_graph {
|
||||
let edge_types = self
|
||||
.catalog()
|
||||
let edge_types = catalog
|
||||
.edge_types
|
||||
.iter()
|
||||
.map(|(name, et)| (name.clone(), (et.from_type.clone(), et.to_type.clone())))
|
||||
|
|
@ -79,7 +80,7 @@ impl Omnigraph {
|
|||
params,
|
||||
&snapshot,
|
||||
graph_index.as_deref(),
|
||||
self.catalog(),
|
||||
&catalog,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ use arrow_schema::SchemaRef;
|
|||
use lance::Dataset;
|
||||
use omnigraph_compiler::catalog::EdgeType;
|
||||
|
||||
use crate::db::SubTableUpdate;
|
||||
use crate::db::{MutationOpKind, SubTableUpdate};
|
||||
use crate::db::manifest::{
|
||||
new_sidecar, write_sidecar, RecoverySidecarHandle, SidecarKind, SidecarTablePin,
|
||||
};
|
||||
|
|
@ -94,18 +94,30 @@ pub(crate) struct MutationStaging {
|
|||
/// Inline-committed updates from delete-touching ops (D₂ guarantees no
|
||||
/// pending batches exist on a delete-touched table).
|
||||
pub(crate) inline_committed: HashMap<String, SubTableUpdate>,
|
||||
/// Strictest [`MutationOpKind`] seen per table within this query. Drives
|
||||
/// the op-kind-aware drift check in [`StagedMutation::commit_all`]: for
|
||||
/// tables whose first or any subsequent touch was a strict op
|
||||
/// (Update / Delete / SchemaRewrite), commit_all fails fast with 409
|
||||
/// when the staged dataset version drifts from the fresh manifest pin
|
||||
/// rather than letting Lance's `commit_staged` surface
|
||||
/// `RetryableCommitConflict` as a 500. See
|
||||
/// [`MutationOpKind::strict_pre_stage_version_check`].
|
||||
pub(crate) op_kinds: HashMap<String, MutationOpKind>,
|
||||
}
|
||||
|
||||
impl MutationStaging {
|
||||
/// Capture pre-write metadata on first touch of a table. Subsequent
|
||||
/// touches are no-ops (paths and `expected_version` are stable for the
|
||||
/// lifetime of one query).
|
||||
/// touches preserve the original `paths` and `expected_versions`
|
||||
/// entries; `op_kinds` upgrades to the strictest kind seen so far so
|
||||
/// that mixed insert+update on the same table still fires the strict
|
||||
/// drift check at commit time.
|
||||
pub(crate) fn ensure_path(
|
||||
&mut self,
|
||||
table_key: &str,
|
||||
full_path: String,
|
||||
table_branch: Option<String>,
|
||||
expected_version: u64,
|
||||
op_kind: MutationOpKind,
|
||||
) {
|
||||
self.paths.entry(table_key.to_string()).or_insert(StagedTablePath {
|
||||
full_path,
|
||||
|
|
@ -114,6 +126,19 @@ impl MutationStaging {
|
|||
self.expected_versions
|
||||
.entry(table_key.to_string())
|
||||
.or_insert(expected_version);
|
||||
self.op_kinds
|
||||
.entry(table_key.to_string())
|
||||
.and_modify(|existing| {
|
||||
// Upgrade to the stricter kind if a later op needs it.
|
||||
// Insert + later Update → Update wins; Update + later Insert
|
||||
// keeps Update.
|
||||
if op_kind.strict_pre_stage_version_check()
|
||||
&& !existing.strict_pre_stage_version_check()
|
||||
{
|
||||
*existing = op_kind;
|
||||
}
|
||||
})
|
||||
.or_insert(op_kind);
|
||||
}
|
||||
|
||||
/// Append a batch to the per-table accumulator.
|
||||
|
|
@ -210,101 +235,65 @@ impl MutationStaging {
|
|||
}
|
||||
|
||||
/// End-of-query: for each pending table, concat batches and commit via
|
||||
/// `stage_append` or `stage_merge_insert` followed by `commit_staged`.
|
||||
/// Merge with inline-committed entries. Return `(updates,
|
||||
/// expected_versions)` for `commit_updates_on_branch_with_expected`.
|
||||
/// **Phase A** of the two-phase commit: stage uncommitted fragments
|
||||
/// for every table in `pending`. No Lance HEAD movement, no sidecar,
|
||||
/// no manifest publish. Returns a [`StagedMutation`] carrying the
|
||||
/// staged transactions so a future MR-686 queue acquisition step can
|
||||
/// run between staging (slow S3 PUTs, no queue) and commit (fast,
|
||||
/// under per-`(table_key, branch)` queue).
|
||||
///
|
||||
/// Sequential per-table — no cross-table dependency, but a parallel
|
||||
/// version is a perf optimization for multi-table writes (loader with
|
||||
/// many node + edge types). v1 ships sequential; the fan-out can land
|
||||
/// in a follow-up.
|
||||
pub(crate) async fn finalize(
|
||||
/// Sequential per-table for now — parallelizing across independent
|
||||
/// Lance datasets is a perf follow-up; same loop structure as the
|
||||
/// pre-split `finalize`.
|
||||
pub(crate) async fn stage_all(
|
||||
self,
|
||||
db: &crate::db::Omnigraph,
|
||||
branch: Option<&str>,
|
||||
sidecar_kind: SidecarKind,
|
||||
) -> Result<(
|
||||
Vec<SubTableUpdate>,
|
||||
HashMap<String, u64>,
|
||||
Option<RecoverySidecarHandle>,
|
||||
)> {
|
||||
_branch: Option<&str>,
|
||||
) -> Result<StagedMutation> {
|
||||
let MutationStaging {
|
||||
expected_versions,
|
||||
paths,
|
||||
pending,
|
||||
inline_committed,
|
||||
op_kinds,
|
||||
} = self;
|
||||
|
||||
let mut updates: Vec<SubTableUpdate> =
|
||||
inline_committed.into_values().collect();
|
||||
|
||||
// Sidecar protocol: build the per-table pin list BEFORE any Lance
|
||||
// commit_staged runs, then write the sidecar so a crash between
|
||||
// Phase B (this loop's commit_staged calls) and Phase C (the
|
||||
// manifest publish in the caller) is recoverable on next open.
|
||||
// Skipped when `pending` is empty (delete-only mutation; the D₂
|
||||
// parse-time rule keeps deletes out of this code path so this
|
||||
// branch is reached only for the inline-committed-only case).
|
||||
let pins: Vec<SidecarTablePin> = pending
|
||||
.iter()
|
||||
.map(|(table_key, _)| {
|
||||
let path = paths.get(table_key).ok_or_else(|| {
|
||||
OmniError::manifest_internal(format!(
|
||||
"MutationStaging::finalize: missing path for table '{}'",
|
||||
table_key,
|
||||
))
|
||||
})?;
|
||||
let expected = *expected_versions.get(table_key).ok_or_else(|| {
|
||||
OmniError::manifest_internal(format!(
|
||||
"MutationStaging::finalize: missing expected version for table '{}'",
|
||||
table_key,
|
||||
))
|
||||
})?;
|
||||
Ok::<SidecarTablePin, OmniError>(SidecarTablePin {
|
||||
table_key: table_key.clone(),
|
||||
table_path: path.full_path.clone(),
|
||||
expected_version: expected,
|
||||
post_commit_pin: expected + 1,
|
||||
table_branch: path.table_branch.clone(),
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
let sidecar_handle = if pins.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let sidecar = new_sidecar(
|
||||
sidecar_kind,
|
||||
branch.map(|s| s.to_string()),
|
||||
db.audit_actor_id.clone(),
|
||||
pins,
|
||||
);
|
||||
Some(write_sidecar(db.root_uri(), db.storage_adapter(), &sidecar).await?)
|
||||
};
|
||||
|
||||
let mut staged_entries: Vec<StagedTableEntry> = Vec::with_capacity(pending.len());
|
||||
for (table_key, table) in pending {
|
||||
let path = paths.get(&table_key).ok_or_else(|| {
|
||||
let path = paths.get(&table_key).cloned().ok_or_else(|| {
|
||||
OmniError::manifest_internal(format!(
|
||||
"MutationStaging::finalize: missing path for table '{}'",
|
||||
"MutationStaging::stage_all: missing path for table '{}'",
|
||||
table_key
|
||||
))
|
||||
})?;
|
||||
let expected = *expected_versions.get(&table_key).ok_or_else(|| {
|
||||
OmniError::manifest_internal(format!(
|
||||
"MutationStaging::finalize: missing expected version for table '{}'",
|
||||
"MutationStaging::stage_all: missing expected version for table '{}'",
|
||||
table_key
|
||||
))
|
||||
})?;
|
||||
|
||||
// Reopen at the pre-write version. Lance HEAD has not advanced
|
||||
// since `ensure_path` captured it — no prior op committed to
|
||||
// this dataset.
|
||||
// Reopen the dataset for staging. The op_kind reflects the
|
||||
// accumulated PendingTable's mode: Append-mode batches are
|
||||
// INSERT-shaped (no key-based dedup at commit_staged); Merge-
|
||||
// mode batches are MERGE-shaped (key-dedup at commit_staged).
|
||||
// Both skip the strict pre-stage version check under the
|
||||
// [`MutationOpKind`] policy: Lance's natural rebase + the
|
||||
// per-(table, branch) queue + the publisher CAS in
|
||||
// `commit_all` handle drift; the strict check would
|
||||
// over-reject in-process concurrent inserts (PR 2 / MR-686
|
||||
// Phase 2).
|
||||
let stage_kind = match table.mode {
|
||||
PendingMode::Append => crate::db::MutationOpKind::Insert,
|
||||
PendingMode::Merge => crate::db::MutationOpKind::Merge,
|
||||
};
|
||||
let ds = db
|
||||
.reopen_for_mutation(
|
||||
&table_key,
|
||||
&path.full_path,
|
||||
path.table_branch.as_deref(),
|
||||
expected,
|
||||
stage_kind,
|
||||
)
|
||||
.await?;
|
||||
|
||||
|
|
@ -335,8 +324,8 @@ impl MutationStaging {
|
|||
}
|
||||
};
|
||||
|
||||
// Commit via Lance's two-phase write: stage produces
|
||||
// uncommitted fragments + transaction; commit advances HEAD.
|
||||
// Stage produces uncommitted fragments + transaction. No
|
||||
// Lance HEAD advance until `commit_all` runs `commit_staged`.
|
||||
let staged = match table.mode {
|
||||
PendingMode::Append => {
|
||||
db.table_store().stage_append(&ds, combined, &[]).await?
|
||||
|
|
@ -353,16 +342,327 @@ impl MutationStaging {
|
|||
.await?
|
||||
}
|
||||
};
|
||||
staged_entries.push(StagedTableEntry {
|
||||
table_key,
|
||||
path,
|
||||
expected_version: expected,
|
||||
dataset: ds,
|
||||
staged_write: staged,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(StagedMutation {
|
||||
inline_committed,
|
||||
staged: staged_entries,
|
||||
expected_versions,
|
||||
paths,
|
||||
op_kinds,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Output of [`MutationStaging::stage_all`]. Carries the staged Lance
|
||||
/// transactions (Phase A complete; uncommitted fragments written) plus
|
||||
/// the per-table metadata needed to write the recovery sidecar, run
|
||||
/// `commit_staged` (Phase B), and produce the publisher's input.
|
||||
///
|
||||
/// Splitting `stage_all` and `commit_all` is the structural prerequisite
|
||||
/// for MR-686: a future commit can drop queue acquisition + manifest-pin
|
||||
/// revalidation between Phase A and Phase B without touching staging
|
||||
/// logic.
|
||||
pub(crate) struct StagedMutation {
|
||||
/// Updates from delete-touching ops (D₂ parse-time rule keeps
|
||||
/// pending and inline_committed disjoint per table). Tables here
|
||||
/// have already advanced Lance HEAD via inline `delete_where`;
|
||||
/// `commit_all` builds sidecar pins for these too so the
|
||||
/// commit→publish residual is recoverable for delete-only paths
|
||||
/// (third-agent Finding 3).
|
||||
inline_committed: HashMap<String, SubTableUpdate>,
|
||||
/// One entry per table that had pending batches successfully staged.
|
||||
staged: Vec<StagedTableEntry>,
|
||||
/// Pre-write manifest version per table — the publisher's CAS fence.
|
||||
expected_versions: HashMap<String, u64>,
|
||||
/// Per-table identifiers from `MutationStaging::paths`. Carried
|
||||
/// through so `commit_all` can build sidecar pins for both staged
|
||||
/// and inline-committed tables.
|
||||
paths: HashMap<String, StagedTablePath>,
|
||||
/// Strictest op_kind per touched table, propagated from
|
||||
/// `MutationStaging::op_kinds` so `commit_all`'s drift check
|
||||
/// fires only on read-modify-write tables.
|
||||
op_kinds: HashMap<String, MutationOpKind>,
|
||||
}
|
||||
|
||||
/// Per-table state captured during `stage_all` and consumed by
|
||||
/// `commit_all`. Holds the opened `Dataset` so `commit_staged` doesn't
|
||||
/// re-open, and the `StagedWrite` whose `transaction` `commit_staged`
|
||||
/// will execute.
|
||||
struct StagedTableEntry {
|
||||
table_key: String,
|
||||
path: StagedTablePath,
|
||||
expected_version: u64,
|
||||
dataset: lance::Dataset,
|
||||
staged_write: crate::table_store::StagedWrite,
|
||||
}
|
||||
|
||||
impl StagedMutation {
|
||||
/// **Phase B** of the two-phase commit: acquire per-`(table_key,
|
||||
/// branch)` queues, revalidate manifest pins, write the recovery
|
||||
/// sidecar, run `commit_staged` per table to advance Lance HEAD, and
|
||||
/// return the publisher's input plus the queue guards.
|
||||
///
|
||||
/// **Caller must hold the returned `_guards` Vec across the
|
||||
/// subsequent manifest publish.** Releasing guards before publish
|
||||
/// would let another writer interleave their commit_staged between
|
||||
/// ours and our publish — which would correctly fail our CAS but
|
||||
/// leave Lance HEAD advanced (the residual class MR-870 recovers
|
||||
/// from). Holding the guards across publish keeps the residual
|
||||
/// unreachable for op-execution failures on the happy path.
|
||||
///
|
||||
/// Revalidation: between `stage_all` and `commit_all`, another
|
||||
/// writer (in the same process or another process sharing the
|
||||
/// repo) may have committed to one of our touched tables, advancing
|
||||
/// the manifest pin past our `expected_version`. We revalidate
|
||||
/// under the queue and fail-fast with `manifest_conflict` before
|
||||
/// any `commit_staged` so the orphaned uncommitted fragments stay
|
||||
/// unreferenced (cleaned by `cleanup_old_versions`'s age sweep)
|
||||
/// rather than being committed and creating a Lance-HEAD-ahead
|
||||
/// residual.
|
||||
pub(crate) async fn commit_all(
|
||||
self,
|
||||
db: &crate::db::Omnigraph,
|
||||
branch: Option<&str>,
|
||||
sidecar_kind: SidecarKind,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<(
|
||||
Vec<SubTableUpdate>,
|
||||
HashMap<String, u64>,
|
||||
Option<RecoverySidecarHandle>,
|
||||
Vec<tokio::sync::OwnedMutexGuard<()>>,
|
||||
)> {
|
||||
let StagedMutation {
|
||||
inline_committed,
|
||||
mut staged,
|
||||
mut expected_versions,
|
||||
paths,
|
||||
op_kinds,
|
||||
} = self;
|
||||
|
||||
// Acquire per-(table_key, branch) queues for every touched
|
||||
// table — both staged and inline-committed. Sorted by
|
||||
// `acquire_many` internally so all multi-table writers
|
||||
// (mutation, branch_merge, schema_apply, future MR-870
|
||||
// recovery) agree on acquisition order — prevents lock-order
|
||||
// inversion deadlock.
|
||||
//
|
||||
// For inline-committed tables (delete-only mutations), Lance
|
||||
// HEAD has already advanced inside `delete_where` before
|
||||
// `commit_all` runs. Holding the queue here doesn't prevent
|
||||
// that interleaving (commit 6 will move queue acquisition into
|
||||
// `delete_where`'s call site); it does prevent another writer
|
||||
// from interleaving between our delete and our publish, which
|
||||
// would otherwise leave a Lance-HEAD-ahead residual the
|
||||
// delete-only sidecar (added below) would have to recover.
|
||||
let mut queue_keys: Vec<(String, Option<String>)> = Vec::with_capacity(
|
||||
staged.len() + inline_committed.len(),
|
||||
);
|
||||
for entry in &staged {
|
||||
queue_keys.push((entry.table_key.clone(), entry.path.table_branch.clone()));
|
||||
}
|
||||
for table_key in inline_committed.keys() {
|
||||
let path = paths.get(table_key).ok_or_else(|| {
|
||||
OmniError::manifest_internal(format!(
|
||||
"StagedMutation::commit_all: missing path for inline-committed table '{}'",
|
||||
table_key
|
||||
))
|
||||
})?;
|
||||
queue_keys.push((table_key.clone(), path.table_branch.clone()));
|
||||
}
|
||||
let guards = db.write_queue().acquire_many(&queue_keys).await;
|
||||
|
||||
// Re-capture manifest pins under the queue (PR 2 / MR-686).
|
||||
//
|
||||
// expected_versions was captured when the mutation first opened
|
||||
// each table for mutation (the query's read-time pin). For
|
||||
// non-strict inserts / merge-style appends, a writer may advance
|
||||
// the table before we acquire the queue and Lance can still
|
||||
// safely rebase the write, so we refresh expected_versions to
|
||||
// the queued manifest pin.
|
||||
//
|
||||
// Strict read-modify-write ops (update / delete /
|
||||
// schema-rewrite) are different: the staged batch was computed
|
||||
// against the read-time pin, even if stage_all later re-opened
|
||||
// the dataset at HEAD. For those ops, compare read-time
|
||||
// expected_version to the queued manifest pin and fail before
|
||||
// any Lance HEAD movement if the target drifted. This can
|
||||
// over-reject a single mutation that inserts, then upgrades to
|
||||
// update, while another writer advances the table between the
|
||||
// two touches; that is safe-by-default and keeps one invariant
|
||||
// until `ensure_path` learns how to bump expected_version on
|
||||
// op-kind upgrade.
|
||||
//
|
||||
// Why per-branch (and not the bound-branch `db.snapshot()`):
|
||||
// when the caller mutates a branch other than the engine's
|
||||
// bound branch (e.g., feature-branch ingest from a server
|
||||
// handle bound to main), `db.snapshot()` returns the bound
|
||||
// branch's view of each table — which is the wrong pin for
|
||||
// the publisher's CAS on a different branch. Using
|
||||
// `snapshot_for_branch(branch)` resolves the per-branch
|
||||
// entries correctly. The cost is one fresh manifest read per
|
||||
// mutation; PR 1b's regression came from this same read, but
|
||||
// that read is now strictly necessary for cross-branch
|
||||
// correctness. Single-table same-branch mutations could still
|
||||
// skip this read (queue exclusivity makes the publisher CAS a
|
||||
// no-op), but the conditional adds complexity for marginal
|
||||
// gain — left as a follow-up perf optimization.
|
||||
//
|
||||
// Multi-coordinator deployments (§VI.27 aspirational) get
|
||||
// genuine cross-process drift detection from this read for
|
||||
// free.
|
||||
let snapshot = db.snapshot_for_branch(branch).await?;
|
||||
for entry in staged.iter_mut() {
|
||||
let current = snapshot
|
||||
.entry(&entry.table_key)
|
||||
.map(|e| e.table_version)
|
||||
.ok_or_else(|| {
|
||||
OmniError::manifest_conflict(format!(
|
||||
"table '{}' missing from manifest at commit time",
|
||||
entry.table_key,
|
||||
))
|
||||
})?;
|
||||
|
||||
// Insert / Merge tables skip this check: concurrent inserts on
|
||||
// disjoint keys legitimately coexist via Lance's auto-rebase, so
|
||||
// the check would over-reject the existing Phase 2 same-key
|
||||
// insert path (`change_concurrent_inserts_same_key_serialize_without_409`).
|
||||
let strict = op_kinds
|
||||
.get(&entry.table_key)
|
||||
.map(|k| k.strict_pre_stage_version_check())
|
||||
.unwrap_or(false);
|
||||
if strict && entry.expected_version != current {
|
||||
return Err(OmniError::manifest_expected_version_mismatch(
|
||||
entry.table_key.clone(),
|
||||
entry.expected_version,
|
||||
current,
|
||||
));
|
||||
}
|
||||
|
||||
entry.expected_version = current;
|
||||
expected_versions.insert(entry.table_key.clone(), current);
|
||||
}
|
||||
// Sidecar protocol: build the per-table pin list and write the
|
||||
// sidecar BEFORE any later error can return after Lance HEAD has
|
||||
// already moved. For staged tables this still happens before any
|
||||
// Lance commit_staged runs. For inline-committed delete tables,
|
||||
// Lance HEAD moved inside delete_where before commit_all, so the
|
||||
// sidecar must also exist before the inline manifest-version check
|
||||
// below can reject a stale query.
|
||||
//
|
||||
// Pins cover BOTH staged tables (Lance HEAD will advance below
|
||||
// when `commit_staged` runs) AND inline-committed tables
|
||||
// (Lance HEAD already advanced inside `delete_where` — we still
|
||||
// need a sidecar so that an upcoming publish failure is
|
||||
// recoverable on next open). This closes the third-agent
|
||||
// Finding 3 hazard: delete-only mutations would otherwise skip
|
||||
// the sidecar, leaving any commit→publish residual unreachable
|
||||
// by recovery.
|
||||
let mut pins: Vec<SidecarTablePin> = Vec::with_capacity(
|
||||
staged.len() + inline_committed.len(),
|
||||
);
|
||||
for entry in &staged {
|
||||
pins.push(SidecarTablePin {
|
||||
table_key: entry.table_key.clone(),
|
||||
table_path: entry.path.full_path.clone(),
|
||||
expected_version: entry.expected_version,
|
||||
post_commit_pin: entry.expected_version + 1,
|
||||
table_branch: entry.path.table_branch.clone(),
|
||||
});
|
||||
}
|
||||
for (table_key, update) in &inline_committed {
|
||||
let path = paths.get(table_key).ok_or_else(|| {
|
||||
OmniError::manifest_internal(format!(
|
||||
"StagedMutation::commit_all: missing path for inline-committed table '{}'",
|
||||
table_key
|
||||
))
|
||||
})?;
|
||||
let expected = *expected_versions.get(table_key).ok_or_else(|| {
|
||||
OmniError::manifest_internal(format!(
|
||||
"StagedMutation::commit_all: missing expected version for inline-committed table '{}'",
|
||||
table_key
|
||||
))
|
||||
})?;
|
||||
pins.push(SidecarTablePin {
|
||||
table_key: table_key.clone(),
|
||||
table_path: path.full_path.clone(),
|
||||
expected_version: expected,
|
||||
// For inline-committed tables, the post-commit pin is
|
||||
// the actual post-delete version recorded by
|
||||
// `record_inline`, NOT `expected + 1` — `delete_where`
|
||||
// can advance HEAD by more than one version (e.g.,
|
||||
// when Lance internally compacts deletion vectors).
|
||||
post_commit_pin: update.table_version,
|
||||
table_branch: path.table_branch.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
let sidecar_handle = if pins.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let sidecar = new_sidecar(
|
||||
sidecar_kind,
|
||||
branch.map(|s| s.to_string()),
|
||||
actor_id.map(str::to_string),
|
||||
pins,
|
||||
);
|
||||
Some(write_sidecar(db.root_uri(), db.storage_adapter(), &sidecar).await?)
|
||||
};
|
||||
|
||||
for (table_key, _update) in inline_committed.iter() {
|
||||
let current = snapshot
|
||||
.entry(table_key)
|
||||
.map(|e| e.table_version)
|
||||
.ok_or_else(|| {
|
||||
OmniError::manifest_conflict(format!(
|
||||
"table '{}' missing from manifest at commit time",
|
||||
table_key,
|
||||
))
|
||||
})?;
|
||||
let expected = expected_versions.get(table_key).copied().ok_or_else(|| {
|
||||
OmniError::manifest_internal(format!(
|
||||
"StagedMutation::commit_all: missing expected version for inline-committed table '{}'",
|
||||
table_key
|
||||
))
|
||||
})?;
|
||||
if expected != current {
|
||||
return Err(OmniError::manifest_expected_version_mismatch(
|
||||
table_key.clone(),
|
||||
expected,
|
||||
current,
|
||||
));
|
||||
}
|
||||
expected_versions.insert(table_key.clone(), current);
|
||||
}
|
||||
|
||||
let mut updates: Vec<SubTableUpdate> = inline_committed.into_values().collect();
|
||||
|
||||
for entry in staged {
|
||||
let StagedTableEntry {
|
||||
table_key,
|
||||
path,
|
||||
expected_version: _,
|
||||
dataset,
|
||||
staged_write,
|
||||
} = entry;
|
||||
|
||||
let new_ds = db
|
||||
.table_store()
|
||||
.commit_staged(Arc::new(ds), staged.transaction)
|
||||
.commit_staged(Arc::new(dataset), staged_write.transaction)
|
||||
.await?;
|
||||
let state = db
|
||||
.table_store()
|
||||
.table_state(&path.full_path, &new_ds)
|
||||
.await?;
|
||||
updates.push(SubTableUpdate {
|
||||
table_key: table_key.clone(),
|
||||
table_key,
|
||||
table_version: state.version,
|
||||
table_branch: path.table_branch.clone(),
|
||||
row_count: state.row_count,
|
||||
|
|
@ -370,7 +670,7 @@ impl MutationStaging {
|
|||
});
|
||||
}
|
||||
|
||||
Ok((updates, expected_versions, sidecar_handle))
|
||||
Ok((updates, expected_versions, sidecar_handle, guards))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -59,21 +59,21 @@ pub enum LoadMode {
|
|||
|
||||
/// Load JSONL data into an Omnigraph database.
|
||||
pub async fn load_jsonl(db: &mut Omnigraph, data: &str, mode: LoadMode) -> Result<LoadResult> {
|
||||
let current_branch = db.active_branch().map(str::to_string);
|
||||
let current_branch = db.active_branch().await;
|
||||
let branch = current_branch.as_deref().unwrap_or("main");
|
||||
db.load(branch, data, mode).await
|
||||
}
|
||||
|
||||
/// Load JSONL data from a file path.
|
||||
pub async fn load_jsonl_file(db: &mut Omnigraph, path: &str, mode: LoadMode) -> Result<LoadResult> {
|
||||
let current_branch = db.active_branch().map(str::to_string);
|
||||
let current_branch = db.active_branch().await;
|
||||
let branch = current_branch.as_deref().unwrap_or("main");
|
||||
db.load_file(branch, path, mode).await
|
||||
}
|
||||
|
||||
impl Omnigraph {
|
||||
pub async fn ingest(
|
||||
&mut self,
|
||||
&self,
|
||||
branch: &str,
|
||||
from: Option<&str>,
|
||||
data: &str,
|
||||
|
|
@ -83,24 +83,19 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
pub async fn ingest_as(
|
||||
&mut self,
|
||||
&self,
|
||||
branch: &str,
|
||||
from: Option<&str>,
|
||||
data: &str,
|
||||
mode: LoadMode,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<IngestResult> {
|
||||
let previous_actor = self.audit_actor_id.clone();
|
||||
self.audit_actor_id = actor_id.map(str::to_string);
|
||||
let result = self
|
||||
.ingest_with_current_actor(branch, from, data, mode)
|
||||
.await;
|
||||
self.audit_actor_id = previous_actor;
|
||||
result
|
||||
self.ingest_with_current_actor(branch, from, data, mode, actor_id)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn ingest_file(
|
||||
&mut self,
|
||||
&self,
|
||||
branch: &str,
|
||||
from: Option<&str>,
|
||||
path: &str,
|
||||
|
|
@ -110,7 +105,7 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
pub async fn ingest_file_as(
|
||||
&mut self,
|
||||
&self,
|
||||
branch: &str,
|
||||
from: Option<&str>,
|
||||
path: &str,
|
||||
|
|
@ -122,11 +117,12 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
async fn ingest_with_current_actor(
|
||||
&mut self,
|
||||
&self,
|
||||
branch: &str,
|
||||
from: Option<&str>,
|
||||
data: &str,
|
||||
mode: LoadMode,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<IngestResult> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
let target_branch =
|
||||
|
|
@ -143,7 +139,7 @@ impl Omnigraph {
|
|||
.await?;
|
||||
}
|
||||
|
||||
let result = self.load(&target_branch, data, mode).await?;
|
||||
let result = self.load_as(&target_branch, data, mode, actor_id).await?;
|
||||
Ok(IngestResult {
|
||||
branch: target_branch,
|
||||
base_branch,
|
||||
|
|
@ -153,7 +149,17 @@ impl Omnigraph {
|
|||
})
|
||||
}
|
||||
|
||||
pub async fn load(&mut self, branch: &str, data: &str, mode: LoadMode) -> Result<LoadResult> {
|
||||
pub async fn load(&self, branch: &str, data: &str, mode: LoadMode) -> Result<LoadResult> {
|
||||
self.load_as(branch, data, mode, None).await
|
||||
}
|
||||
|
||||
pub async fn load_as(
|
||||
&self,
|
||||
branch: &str,
|
||||
data: &str,
|
||||
mode: LoadMode,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<LoadResult> {
|
||||
self.ensure_schema_state_valid().await?;
|
||||
// Reject internal `__run__*` / system-prefixed branches at the
|
||||
// public write boundary. Direct-publish paths assert this
|
||||
|
|
@ -169,12 +175,12 @@ impl Omnigraph {
|
|||
// Direct-to-target writes: no Run state machine, no `__run__` staging
|
||||
// branch. Cross-table OCC is enforced by the publisher's
|
||||
// `expected_table_versions` CAS inside `load_jsonl_reader`.
|
||||
self.load_direct_on_branch(requested.as_deref(), data, mode)
|
||||
self.load_direct_on_branch(requested.as_deref(), data, mode, actor_id)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn load_file(
|
||||
&mut self,
|
||||
&self,
|
||||
branch: &str,
|
||||
path: &str,
|
||||
mode: LoadMode,
|
||||
|
|
@ -184,13 +190,14 @@ impl Omnigraph {
|
|||
}
|
||||
|
||||
async fn load_direct_on_branch(
|
||||
&mut self,
|
||||
&self,
|
||||
branch: Option<&str>,
|
||||
data: &str,
|
||||
mode: LoadMode,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<LoadResult> {
|
||||
let reader = BufReader::new(Cursor::new(data.as_bytes()));
|
||||
load_jsonl_reader(self, branch, reader, mode).await
|
||||
load_jsonl_reader(self, branch, reader, mode, actor_id).await
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -228,10 +235,11 @@ impl LoadResult {
|
|||
}
|
||||
|
||||
async fn load_jsonl_reader<R: BufRead>(
|
||||
db: &mut Omnigraph,
|
||||
db: &Omnigraph,
|
||||
branch: Option<&str>,
|
||||
reader: R,
|
||||
mode: LoadMode,
|
||||
actor_id: Option<&str>,
|
||||
) -> Result<LoadResult> {
|
||||
let catalog = db.catalog().clone();
|
||||
|
||||
|
|
@ -327,6 +335,16 @@ async fn load_jsonl_reader<R: BufRead>(
|
|||
LoadMode::Append => PendingMode::Append,
|
||||
LoadMode::Overwrite => PendingMode::Append, // unused
|
||||
};
|
||||
// Map LoadMode to MutationOpKind for the version-check policy.
|
||||
// Append/Merge skip the strict pre-stage check (concurrency-safe
|
||||
// under the per-(table, branch) queue + publisher CAS); Overwrite
|
||||
// uses the strict check because it truncates and replaces the
|
||||
// dataset — concurrent advances change what "replace" means.
|
||||
let load_op_kind = match mode {
|
||||
LoadMode::Append => crate::db::MutationOpKind::Insert,
|
||||
LoadMode::Merge => crate::db::MutationOpKind::Merge,
|
||||
LoadMode::Overwrite => crate::db::MutationOpKind::SchemaRewrite,
|
||||
};
|
||||
|
||||
// Phase 2a: build and validate every node batch up front. Cheap and
|
||||
// synchronous — surfaces validation errors before any S3 traffic.
|
||||
|
|
@ -357,7 +375,7 @@ async fn load_jsonl_reader<R: BufRead>(
|
|||
if use_staging {
|
||||
for (type_name, table_key, batch, loaded_count) in prepared_nodes {
|
||||
let (ds, full_path, table_branch) = db
|
||||
.open_for_mutation_on_branch(branch, &table_key)
|
||||
.open_for_mutation_on_branch(branch, &table_key, load_op_kind)
|
||||
.await?;
|
||||
let expected_version = ds.version().version;
|
||||
staging.ensure_path(
|
||||
|
|
@ -365,6 +383,7 @@ async fn load_jsonl_reader<R: BufRead>(
|
|||
full_path,
|
||||
table_branch,
|
||||
expected_version,
|
||||
load_op_kind,
|
||||
);
|
||||
let schema = batch.schema();
|
||||
staging.append_batch(&table_key, schema, pending_mode, batch)?;
|
||||
|
|
@ -478,7 +497,7 @@ async fn load_jsonl_reader<R: BufRead>(
|
|||
if use_staging {
|
||||
for (edge_name, table_key, batch, loaded_count) in prepared_edges {
|
||||
let (ds, full_path, table_branch) = db
|
||||
.open_for_mutation_on_branch(branch, &table_key)
|
||||
.open_for_mutation_on_branch(branch, &table_key, load_op_kind)
|
||||
.await?;
|
||||
let expected_version = ds.version().version;
|
||||
staging.ensure_path(
|
||||
|
|
@ -486,6 +505,7 @@ async fn load_jsonl_reader<R: BufRead>(
|
|||
full_path,
|
||||
table_branch,
|
||||
expected_version,
|
||||
load_op_kind,
|
||||
);
|
||||
let schema = batch.schema();
|
||||
staging.append_batch(&table_key, schema, pending_mode, batch)?;
|
||||
|
|
@ -537,15 +557,19 @@ async fn load_jsonl_reader<R: BufRead>(
|
|||
|
||||
// Phase 4: Atomic manifest commit with publisher-level OCC.
|
||||
if use_staging {
|
||||
let (updates, expected_versions, sidecar_handle) = staging
|
||||
.finalize(db, branch, crate::db::manifest::SidecarKind::Load)
|
||||
let staged = staging.stage_all(db, branch).await?;
|
||||
// `_queue_guards` holds per-(table_key, branch) write queues
|
||||
// across the manifest publish below — see exec/mutation.rs for
|
||||
// the rationale (interleaving prevention).
|
||||
let (updates, expected_versions, sidecar_handle, _queue_guards) = staged
|
||||
.commit_all(db, branch, crate::db::manifest::SidecarKind::Load, actor_id)
|
||||
.await?;
|
||||
// Same finalize → publisher residual as mutations: per-table
|
||||
// staged commits have advanced Lance HEAD, but the manifest
|
||||
// publish has not run yet. Reuse the mutation failpoint name so
|
||||
// one failpoint pins the shared `MutationStaging` boundary.
|
||||
crate::failpoints::maybe_fail("mutation.post_finalize_pre_publisher")?;
|
||||
db.commit_updates_on_branch_with_expected(branch, &updates, &expected_versions)
|
||||
db.commit_updates_on_branch_with_expected(branch, &updates, &expected_versions, actor_id)
|
||||
.await?;
|
||||
// The recovery sidecar protects the per-table commit_staged →
|
||||
// manifest publish window. Phase C succeeded — clean up
|
||||
|
|
@ -574,6 +598,7 @@ async fn load_jsonl_reader<R: BufRead>(
|
|||
branch,
|
||||
&overwrite_updates,
|
||||
&overwrite_expected,
|
||||
actor_id,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
|
@ -1151,8 +1176,14 @@ async fn write_batch_to_dataset(
|
|||
batch: RecordBatch,
|
||||
mode: LoadMode,
|
||||
) -> Result<(crate::table_store::TableState, Option<String>)> {
|
||||
let (mut ds, full_path, table_branch) =
|
||||
db.open_for_mutation_on_branch(branch, table_key).await?;
|
||||
let op_kind = match mode {
|
||||
LoadMode::Append => crate::db::MutationOpKind::Insert,
|
||||
LoadMode::Merge => crate::db::MutationOpKind::Merge,
|
||||
LoadMode::Overwrite => crate::db::MutationOpKind::SchemaRewrite,
|
||||
};
|
||||
let (mut ds, full_path, table_branch) = db
|
||||
.open_for_mutation_on_branch(branch, table_key, op_kind)
|
||||
.await?;
|
||||
let table_store = db.table_store();
|
||||
|
||||
match mode {
|
||||
|
|
@ -1817,7 +1848,7 @@ edge WorksAt: Person -> Company
|
|||
.unwrap();
|
||||
|
||||
// Read back via snapshot
|
||||
let snap = db.snapshot();
|
||||
let snap = db.snapshot().await;
|
||||
let person_ds = snap.open("node:Person").await.unwrap();
|
||||
|
||||
assert_eq!(person_ds.count_rows(None).await.unwrap(), 2);
|
||||
|
|
@ -1854,7 +1885,7 @@ edge WorksAt: Person -> Company
|
|||
.await
|
||||
.unwrap();
|
||||
|
||||
let snap = db.snapshot();
|
||||
let snap = db.snapshot().await;
|
||||
let knows_ds = snap.open("edge:Knows").await.unwrap();
|
||||
|
||||
let batches: Vec<RecordBatch> = knows_ds
|
||||
|
|
@ -1889,13 +1920,13 @@ edge WorksAt: Person -> Company
|
|||
let dir = tempfile::tempdir().unwrap();
|
||||
let uri = dir.path().to_str().unwrap();
|
||||
let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
|
||||
let v1 = db.version();
|
||||
let v1 = db.version().await;
|
||||
|
||||
load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert!(db.version() > v1);
|
||||
assert!(db.version().await > v1);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
@ -1912,7 +1943,7 @@ edge WorksAt: Person -> Company
|
|||
.unwrap();
|
||||
load_jsonl(&mut db, batch2, LoadMode::Append).await.unwrap();
|
||||
|
||||
let snap = db.snapshot();
|
||||
let snap = db.snapshot().await;
|
||||
let person_ds = snap.open("node:Person").await.unwrap();
|
||||
assert_eq!(person_ds.count_rows(None).await.unwrap(), 2);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -396,6 +396,137 @@ async fn composite_flow_canonical_lifecycle() {
|
|||
assert!(!final_total.batches().is_empty());
|
||||
}
|
||||
|
||||
/// Cross-handle sequence that exercises operations after a schema_apply
|
||||
/// invalidates a peer handle's cached `_schema.pg`. The narrow load-bearing
|
||||
/// pin is that `Omnigraph::refresh()` must not deadlock when its
|
||||
/// `reload_schema_if_source_changed()` step needs to acquire a read on the
|
||||
/// coordinator's `RwLock`. The broader sequencing — schema_apply →
|
||||
/// branch_create → branch_delete → branch_merge → mutate (using the new
|
||||
/// schema's added property) → reopen — pins that the fix doesn't regress
|
||||
/// any of the related call sites.
|
||||
///
|
||||
/// Pre-fix bug class: `Omnigraph::refresh()` held
|
||||
/// `coordinator.write().await` from start to finish, including across the
|
||||
/// `self.reload_schema_if_source_changed()` call. That helper's
|
||||
/// `self.coordinator.read().await` (only reached when the on-disk schema
|
||||
/// source differs from the in-memory cache) deadlocks against the outer
|
||||
/// write guard because tokio's `RwLock` is not reentrant. Reachable from
|
||||
/// every public refresh-using API: `branch_delete` (`omnigraph.rs:910`),
|
||||
/// `branch_merge` (post-merge refresh on bound target), and any caller
|
||||
/// that calls `Omnigraph::refresh` directly.
|
||||
///
|
||||
/// The cross-handle setup is the realistic trigger: handle A applies a
|
||||
/// schema, advancing `_schema.pg` on disk; handle B has stale in-memory
|
||||
/// schema_source. B's next `refresh()` (via branch_delete here) hits the
|
||||
/// read-after-write reload path. Single-handle is unreachable because
|
||||
/// `apply_schema` updates the local ArcSwap cache in-line.
|
||||
///
|
||||
/// Post-fix invariant: `refresh()` scopes its write guard to the recovery
|
||||
/// section only, releasing it before `reload_schema_if_source_changed()`.
|
||||
/// The reload's read acquisition is uncontested.
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
|
||||
async fn composite_flow_schema_apply_then_branch_ops_no_deadlock_in_refresh() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let uri = dir.path().to_str().unwrap();
|
||||
|
||||
// Step 1: init + load on handle A.
|
||||
let mut db_a = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
|
||||
load_jsonl(&mut db_a, TEST_DATA, LoadMode::Append).await.unwrap();
|
||||
assert_eq!(count_rows(&db_a, "node:Person").await, 4);
|
||||
|
||||
// Step 2: open handle B on the same repo. B's in-memory schema_source
|
||||
// cache is now a snapshot of `_schema.pg` at open time.
|
||||
let db_b = Omnigraph::open(uri).await.unwrap();
|
||||
|
||||
// Step 3: A applies a schema that adds a nullable property to Person.
|
||||
// A's on-disk `_schema.pg` is rewritten; A's in-memory cache is updated
|
||||
// in-line by `apply_schema`. B's in-memory cache is now STALE relative
|
||||
// to disk.
|
||||
const TEST_SCHEMA_V2: &str = "node Person {\n name: String @key\n age: I32?\n nickname: String?\n}\n\nnode Company {\n name: String @key\n}\n\nedge Knows: Person -> Person {\n since: Date?\n}\n\nedge WorksAt: Person -> Company\n";
|
||||
let plan = db_a.apply_schema(TEST_SCHEMA_V2).await.unwrap();
|
||||
assert!(plan.applied, "apply_schema must succeed on a clean repo");
|
||||
assert!(
|
||||
!plan.steps.is_empty(),
|
||||
"apply_schema must record the AddProperty step"
|
||||
);
|
||||
|
||||
// Step 4: deadlock vector. B.branch_delete calls B.refresh() internally
|
||||
// (omnigraph.rs:910). refresh() pre-fix holds the coord write guard
|
||||
// across reload_schema_if_source_changed; with B's cache stale, that
|
||||
// helper takes the not-early-return branch and tries
|
||||
// self.coordinator.read().await — deadlocks against the outer write.
|
||||
//
|
||||
// Wrap in tokio::time::timeout so a deadlock surfaces as a clean test
|
||||
// panic instead of a stuck CI job. 15s is well above natural completion
|
||||
// on local FS (sub-second under normal conditions).
|
||||
db_b.branch_create("post-schema-apply-test").await.unwrap();
|
||||
let delete_result = tokio::time::timeout(
|
||||
std::time::Duration::from_secs(15),
|
||||
db_b.branch_delete("post-schema-apply-test"),
|
||||
)
|
||||
.await;
|
||||
assert!(
|
||||
delete_result.is_ok(),
|
||||
"branch_delete deadlocked in refresh() with stale schema cache. \
|
||||
Pre-fix symptom: Omnigraph::refresh() holds coordinator.write().await \
|
||||
across reload_schema_if_source_changed(), which acquires \
|
||||
coordinator.read().await on the same non-reentrant RwLock when the \
|
||||
on-disk schema source differs from the in-memory cache.",
|
||||
);
|
||||
delete_result
|
||||
.unwrap()
|
||||
.expect("branch_delete must succeed once refresh() releases its write guard");
|
||||
|
||||
// Step 5: continuing operations on B post-refresh — verify the broader
|
||||
// sequence works. B's catalog should now reflect the new schema (the
|
||||
// refresh path includes reload_schema_if_source_changed which calls
|
||||
// store_catalog).
|
||||
db_b.branch_create("feature-after-apply").await.unwrap();
|
||||
|
||||
// Step 6: branch_merge from B exercises the post-merge refresh() path
|
||||
// (merge.rs:1100-1107) — same deadlock surface as branch_delete,
|
||||
// sanity-pinned by reusing the same handle whose cache was just
|
||||
// refreshed.
|
||||
let _outcome = tokio::time::timeout(
|
||||
std::time::Duration::from_secs(15),
|
||||
db_b.branch_merge("feature-after-apply", "main"),
|
||||
)
|
||||
.await
|
||||
.expect("branch_merge deadlocked in refresh() post-schema-apply")
|
||||
.expect("branch_merge must succeed");
|
||||
|
||||
// Step 7: mutation on main using the new schema's added property —
|
||||
// verifies the catalog reload completed and the engine accepts a
|
||||
// mutation referencing `nickname`.
|
||||
const NICKNAME_QUERY: &str = "query set_nickname($name: String, $nickname: String) {\n update Person set { nickname: $nickname } where name = $name\n}";
|
||||
db_b.mutate_as(
|
||||
"main",
|
||||
NICKNAME_QUERY,
|
||||
"set_nickname",
|
||||
&mixed_params(&[("$name", "Alice"), ("$nickname", "Ali")], &[]),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.expect("update using post-apply schema property must succeed");
|
||||
|
||||
// Step 8: reopen — final integration check that the post-deadlock-fix
|
||||
// state persists across handle drop/open.
|
||||
drop(db_a);
|
||||
drop(db_b);
|
||||
let db_c = Omnigraph::open(uri).await.unwrap();
|
||||
assert_eq!(
|
||||
count_rows(&db_c, "node:Person").await,
|
||||
4,
|
||||
"Person count consistent across reopen post-schema-apply",
|
||||
);
|
||||
let branches = db_c.branch_list().await.unwrap();
|
||||
assert!(
|
||||
!branches.iter().any(|b| b == "post-schema-apply-test"),
|
||||
"deleted branch must stay deleted across reopen; got {:?}",
|
||||
branches,
|
||||
);
|
||||
}
|
||||
|
||||
/// Multi-branch sequential merges with main writes interleaved between
|
||||
/// every diverge point. Catches compositional regressions that single-
|
||||
/// merge tests can't see:
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
mod helpers;
|
||||
|
||||
use fail::FailScenario;
|
||||
use futures::FutureExt;
|
||||
use omnigraph::db::Omnigraph;
|
||||
use omnigraph::failpoints::ScopedFailPoint;
|
||||
|
||||
|
|
@ -25,37 +26,12 @@ fn node_table_uri(root: &str, type_name: &str) -> String {
|
|||
format!("{}/nodes/{hash:016x}", root.trim_end_matches('/'))
|
||||
}
|
||||
|
||||
fn person_batch(rows: &[(&str, &str, Option<i32>)]) -> arrow_array::RecordBatch {
|
||||
use std::sync::Arc;
|
||||
|
||||
use arrow_array::{Int32Array, StringArray};
|
||||
use arrow_schema::{DataType, Field, Schema};
|
||||
|
||||
let schema = Arc::new(Schema::new(vec![
|
||||
Field::new("id", DataType::Utf8, false),
|
||||
Field::new("age", DataType::Int32, true),
|
||||
Field::new("name", DataType::Utf8, false),
|
||||
]));
|
||||
let ids: Vec<&str> = rows.iter().map(|(id, _, _)| *id).collect();
|
||||
let names: Vec<&str> = rows.iter().map(|(_, name, _)| *name).collect();
|
||||
let ages: Vec<Option<i32>> = rows.iter().map(|(_, _, age)| *age).collect();
|
||||
arrow_array::RecordBatch::try_new(
|
||||
schema,
|
||||
vec![
|
||||
Arc::new(StringArray::from(ids)),
|
||||
Arc::new(Int32Array::from(ages)),
|
||||
Arc::new(StringArray::from(names)),
|
||||
],
|
||||
)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn branch_create_failpoint_triggers() {
|
||||
let _scenario = FailScenario::setup();
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let uri = dir.path().to_str().unwrap();
|
||||
let mut db = Omnigraph::init(uri, helpers::TEST_SCHEMA).await.unwrap();
|
||||
let db = Omnigraph::init(uri, helpers::TEST_SCHEMA).await.unwrap();
|
||||
let _failpoint = ScopedFailPoint::new("branch_create.after_manifest_branch_create", "return");
|
||||
|
||||
let err = db.branch_create("feature").await.unwrap_err();
|
||||
|
|
@ -65,7 +41,7 @@ async fn branch_create_failpoint_triggers() {
|
|||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[tokio::test(flavor = "multi_thread")]
|
||||
async fn graph_publish_failpoint_triggers_before_commit_append() {
|
||||
let _scenario = FailScenario::setup();
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
|
|
@ -100,7 +76,7 @@ async fn schema_apply_pre_commit_crash_rolls_forward_via_sidecar() {
|
|||
let uri = dir.path().to_str().unwrap().to_string();
|
||||
|
||||
{
|
||||
let mut db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
|
||||
let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
|
||||
let _failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "return");
|
||||
let err = db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap_err();
|
||||
assert!(
|
||||
|
|
@ -122,7 +98,7 @@ async fn schema_apply_pre_commit_crash_rolls_forward_via_sidecar() {
|
|||
// behind is closed.
|
||||
let db = Omnigraph::open(&uri).await.unwrap();
|
||||
assert_eq!(
|
||||
db.schema_source(),
|
||||
db.schema_source().as_str(),
|
||||
SCHEMA_V2_ADDED_TYPE,
|
||||
"live schema must reflect the rolled-forward apply (Company added)"
|
||||
);
|
||||
|
|
@ -143,7 +119,7 @@ async fn schema_apply_recovers_post_commit_crash() {
|
|||
let uri = dir.path().to_str().unwrap().to_string();
|
||||
|
||||
{
|
||||
let mut db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
|
||||
let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
|
||||
let _failpoint = ScopedFailPoint::new("schema_apply.after_manifest_commit", "return");
|
||||
let err = db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap_err();
|
||||
assert!(
|
||||
|
|
@ -157,7 +133,7 @@ async fn schema_apply_recovers_post_commit_crash() {
|
|||
// Reopen — manifest is at the new version, so recovery sweep should
|
||||
// complete the rename and the live schema matches v2.
|
||||
let db = Omnigraph::open(&uri).await.unwrap();
|
||||
assert_eq!(db.schema_source(), SCHEMA_V2_ADDED_TYPE);
|
||||
assert_eq!(db.schema_source().as_str(), SCHEMA_V2_ADDED_TYPE);
|
||||
assert_no_staging_files(dir.path());
|
||||
}
|
||||
|
||||
|
|
@ -172,7 +148,7 @@ async fn schema_apply_recovers_partial_rename() {
|
|||
let uri = dir.path().to_str().unwrap().to_string();
|
||||
|
||||
{
|
||||
let mut db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
|
||||
let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
|
||||
db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap();
|
||||
}
|
||||
|
||||
|
|
@ -192,7 +168,7 @@ async fn schema_apply_recovers_partial_rename() {
|
|||
// Reopen — recovery should complete the rename (overwriting final files
|
||||
// with identical staging content) and remove the staging files.
|
||||
let db = Omnigraph::open(&uri).await.unwrap();
|
||||
assert_eq!(db.schema_source(), SCHEMA_V2_ADDED_TYPE);
|
||||
assert_eq!(db.schema_source().as_str(), SCHEMA_V2_ADDED_TYPE);
|
||||
assert_no_staging_files(dir.path());
|
||||
}
|
||||
|
||||
|
|
@ -312,6 +288,85 @@ async fn recovery_rolls_forward_after_finalize_publisher_failure() {
|
|||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn inline_delete_conflict_writes_sidecar_before_rejecting() {
|
||||
let _scenario = FailScenario::setup();
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let uri = dir.path().to_str().unwrap().to_string();
|
||||
let db = helpers::init_and_load(&dir).await;
|
||||
|
||||
let pre_snapshot = db
|
||||
.snapshot_of(omnigraph::db::ReadTarget::branch("main"))
|
||||
.await
|
||||
.unwrap();
|
||||
let pre_person_pin = pre_snapshot.entry("node:Person").unwrap().table_version;
|
||||
let person_uri = node_table_uri(&uri, "Person");
|
||||
|
||||
{
|
||||
let _pause_delete = ScopedFailPoint::new("mutation.delete_node_pre_primary_delete", "pause");
|
||||
let delete_params = helpers::params(&[("$name", "Alice")]);
|
||||
let delete = db.mutate(
|
||||
"main",
|
||||
MUTATION_QUERIES,
|
||||
"remove_person",
|
||||
&delete_params,
|
||||
);
|
||||
tokio::pin!(delete);
|
||||
|
||||
let mut concurrent_update_succeeded = false;
|
||||
for _ in 0..50 {
|
||||
if delete.as_mut().now_or_never().is_some() {
|
||||
panic!("delete mutation completed before primary-delete failpoint was released");
|
||||
}
|
||||
let mut concurrent = Omnigraph::open_read_only(&uri).await.unwrap();
|
||||
if mutate_main(
|
||||
&mut concurrent,
|
||||
MUTATION_QUERIES,
|
||||
"set_age",
|
||||
&mixed_params(&[("$name", "Bob")], &[("$age", 26)]),
|
||||
)
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
concurrent_update_succeeded = true;
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(std::time::Duration::from_millis(20)).await;
|
||||
}
|
||||
assert!(concurrent_update_succeeded, "concurrent update must land while delete is paused");
|
||||
fail::remove("mutation.delete_node_pre_primary_delete");
|
||||
|
||||
let err = delete.await.unwrap_err();
|
||||
assert!(
|
||||
err.to_string().contains("stale view of 'node:Person'")
|
||||
|| err.to_string().contains("ExpectedVersionMismatch")
|
||||
|| err.to_string().contains("expected version mismatch"),
|
||||
"unexpected error: {err}",
|
||||
);
|
||||
}
|
||||
|
||||
let person_head = lance::Dataset::open(&person_uri)
|
||||
.await
|
||||
.unwrap()
|
||||
.version()
|
||||
.version;
|
||||
assert!(
|
||||
person_head > pre_person_pin,
|
||||
"primary inline delete must have advanced node:Person before rejecting"
|
||||
);
|
||||
let db = Omnigraph::open(&uri).await.unwrap();
|
||||
assert_eq!(
|
||||
helpers::count_rows(&db, "node:Person").await,
|
||||
4,
|
||||
"manifest-conflicted delete must not remove net Person rows after recovery"
|
||||
);
|
||||
assert_eq!(
|
||||
helpers::count_rows(&db, "edge:Knows").await,
|
||||
3,
|
||||
"manifest-conflicted delete must not remove net Knows rows after recovery"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn recovery_rolls_forward_load_on_feature_branch() {
|
||||
use omnigraph::loader::LoadMode;
|
||||
|
|
@ -324,7 +379,7 @@ async fn recovery_rolls_forward_load_on_feature_branch() {
|
|||
let feature_parent_commit_id;
|
||||
|
||||
{
|
||||
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
||||
let db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
||||
db.branch_create("feature").await.unwrap();
|
||||
db.mutate(
|
||||
"feature",
|
||||
|
|
@ -929,7 +984,7 @@ async fn schema_apply_without_schema_staging_rolls_back_on_next_open() {
|
|||
};
|
||||
|
||||
{
|
||||
let mut db = Omnigraph::open(&uri).await.unwrap();
|
||||
let db = Omnigraph::open(&uri).await.unwrap();
|
||||
let _failpoint = ScopedFailPoint::new("schema_apply.before_staging_write", "return");
|
||||
let v2_schema = r#"node Person {
|
||||
name: String @key
|
||||
|
|
@ -1029,7 +1084,7 @@ async fn schema_apply_phase_b_failure_recovered_on_next_open() {
|
|||
// (Lance HEAD advanced) AND AFTER the schema-state staging files are
|
||||
// written, but BEFORE the manifest publish. The recovery sidecar persists.
|
||||
{
|
||||
let mut db = Omnigraph::open(&uri).await.unwrap();
|
||||
let db = Omnigraph::open(&uri).await.unwrap();
|
||||
let _failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "return");
|
||||
// v2 schema: add a `city` property to Person AND add a new
|
||||
// `Tag` node type. The new property triggers the rewritten_tables
|
||||
|
|
@ -1191,7 +1246,7 @@ async fn branch_merge_phase_b_failure_recovered_on_next_open() {
|
|||
// Setup: failpoint fires after the per-table publish loop completes
|
||||
// but before commit_manifest_updates. Sidecar persists.
|
||||
{
|
||||
let mut db = Omnigraph::open(&uri).await.unwrap();
|
||||
let db = Omnigraph::open(&uri).await.unwrap();
|
||||
let _failpoint =
|
||||
ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return");
|
||||
let err = db.branch_merge("feature", "main").await.unwrap_err();
|
||||
|
|
@ -1367,7 +1422,7 @@ async fn branch_merge_phase_b_failure_recovered_on_non_main_target() {
|
|||
// but before commit_manifest_updates. Sidecar persists with
|
||||
// branch=Some("target_branch").
|
||||
{
|
||||
let mut db = Omnigraph::open(&uri).await.unwrap();
|
||||
let db = Omnigraph::open(&uri).await.unwrap();
|
||||
let _failpoint =
|
||||
ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return");
|
||||
let err = db
|
||||
|
|
@ -1468,7 +1523,7 @@ async fn branch_merge_sidecar_pins_table_branch_to_active_branch() {
|
|||
}
|
||||
|
||||
{
|
||||
let mut db = Omnigraph::open(&uri).await.unwrap();
|
||||
let db = Omnigraph::open(&uri).await.unwrap();
|
||||
let _failpoint =
|
||||
ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return");
|
||||
let _ = db
|
||||
|
|
@ -1559,7 +1614,7 @@ async fn ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_neede
|
|||
// that genuinely need work); no sidecar is written. The failpoint
|
||||
// still fires, surfacing the Err.
|
||||
{
|
||||
let mut db = Omnigraph::open(&uri).await.unwrap();
|
||||
let db = Omnigraph::open(&uri).await.unwrap();
|
||||
let _failpoint =
|
||||
ScopedFailPoint::new("ensure_indices.post_phase_b_pre_manifest_commit", "return");
|
||||
let err = db.ensure_indices().await.unwrap_err();
|
||||
|
|
|
|||
|
|
@ -860,6 +860,13 @@ async fn lance_restore_appends_one_commit_with_checked_out_content() {
|
|||
/// tables before invoking restore — otherwise this hazard becomes
|
||||
/// reachable during in-flight tenant traffic.
|
||||
///
|
||||
/// MR-686 introduces those per-(table_key, branch) writer queues as the
|
||||
/// application-layer mechanism that closes this hazard once continuous
|
||||
/// in-process recovery (MR-870) lands. Until MR-686's queue is wired into
|
||||
/// the recovery path, the open-time-only invocation strategy is the
|
||||
/// only thing keeping this hazard out of production. See
|
||||
/// `docs/invariants.md` §VI.30, §VI.32, §VI.33.
|
||||
///
|
||||
/// This test is the load-bearing constraint any future reconciler must
|
||||
/// honor.
|
||||
#[tokio::test]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue