From cdfbccbfdc57597c2bdad5c9079a1c73cc55f21c Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Fri, 1 May 2026 10:42:21 +0200 Subject: [PATCH 01/10] MR-794 step 2: scaffold MutationStaging accumulator + scan_with_pending MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the scaffolding for the in-memory staged-write rewire — no behavior change yet: * New crates/omnigraph/src/exec/staging.rs with MutationStaging, PendingTable, PendingMode, StagedTablePath, plus the end-of-query finalize() that issues one stage_* + commit_staged per pending table (Merge mode dedupes by id, last-write-wins). * TableStore::scan_with_pending and count_rows_with_pending helpers — Lance scan committed + DataFusion MemTable scan pending, concat. Sidesteps the Scanner::with_fragments filter-pushdown limitation documented on scan_with_staged. * Add datafusion = "52" to workspace + omnigraph-engine deps for MemTable (transitively pulled by Lance already). Engine code still uses the legacy MutationStaging shape; the rewire lands in subsequent commits. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 1 + Cargo.toml | 1 + crates/omnigraph/Cargo.toml | 1 + crates/omnigraph/src/exec/mod.rs | 1 + crates/omnigraph/src/exec/staging.rs | 391 +++++++++++++++++++++++++++ crates/omnigraph/src/table_store.rs | 140 ++++++++++ 6 files changed, 535 insertions(+) create mode 100644 crates/omnigraph/src/exec/staging.rs diff --git a/Cargo.lock b/Cargo.lock index 9eafcf9..77efe8f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4647,6 +4647,7 @@ dependencies = [ "async-trait", "base64", "chrono", + "datafusion", "fail", "futures", "lance", diff --git a/Cargo.toml b/Cargo.toml index 1766b07..2878bf7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ arrow-select = "57" arrow-cast = { version = "57", features = ["prettyprint"] } arrow-ord = "57" +datafusion = { version = "52", default-features = false } datafusion-physical-plan = "52" datafusion-physical-expr = "52" datafusion-execution = "52" diff --git a/crates/omnigraph/Cargo.toml b/crates/omnigraph/Cargo.toml index 0e1bae9..afa608b 100644 --- a/crates/omnigraph/Cargo.toml +++ b/crates/omnigraph/Cargo.toml @@ -19,6 +19,7 @@ failpoints = ["dep:fail", "fail/failpoints"] omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.3.1" } lance = { workspace = true } lance-datafusion = { workspace = true } +datafusion = { workspace = true } lance-file = { workspace = true } lance-index = { workspace = true } lance-linalg = { workspace = true } diff --git a/crates/omnigraph/src/exec/mod.rs b/crates/omnigraph/src/exec/mod.rs index 8171461..33a7e41 100644 --- a/crates/omnigraph/src/exec/mod.rs +++ b/crates/omnigraph/src/exec/mod.rs @@ -46,3 +46,4 @@ mod merge; mod mutation; mod projection; mod query; +pub(crate) mod staging; diff --git a/crates/omnigraph/src/exec/staging.rs b/crates/omnigraph/src/exec/staging.rs new file mode 100644 index 0000000..2b2c193 --- /dev/null +++ b/crates/omnigraph/src/exec/staging.rs @@ -0,0 +1,391 @@ +//! Per-query staging accumulator for direct-publish writes (MR-794 step 2+). +//! +//! `MutationStaging` accumulates per-table input batches in memory during a +//! `mutate_as` or `load` query, then at end-of-query commits each touched +//! table via Lance's distributed-write API (one `stage_*` + `commit_staged` +//! per table) and returns the publisher inputs (`SubTableUpdate` list + +//! `expected_table_versions`). +//! +//! Read-your-writes within the same query is satisfied by the in-memory +//! pending batches (see `pending_batches`) — read sites union the committed +//! Lance scan with the pending Arrow batches via DataFusion `MemTable` (see +//! `crate::table_store::TableStore::scan_with_pending`). +//! +//! This module is shared by the engine's mutation path (`exec/mutation.rs`) +//! and the bulk loader (`loader/mod.rs`); both feed insert/update batches +//! into `pending` and route end-of-query commits through `finalize`. +//! Deletes follow the inline-commit path and are recorded via +//! `record_inline` (parse-time D₂ rule prevents mixed insert/delete in a +//! single query, so no flushing is required). + +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use arrow_array::{Array, RecordBatch, StringArray, UInt32Array}; +use arrow_schema::SchemaRef; + +use crate::db::SubTableUpdate; +use crate::error::{OmniError, Result}; + +/// Whether the per-table accumulator should commit via `stage_append` +/// (no @key inserts, edge inserts) or `stage_merge_insert` (any @key insert +/// or update). Once set to `Merge` for a table within a query, subsequent +/// inserts on that table are rolled into the same merge — a `WhenNotMatched +/// = InsertAll` merge is correct for both cases. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum PendingMode { + Append, + Merge, +} + +/// Per-table accumulator. Each insert/update op pushes a `RecordBatch` into +/// `batches`; at end-of-query the accumulated batches concat into a single +/// stage call. +#[derive(Debug)] +pub(crate) struct PendingTable { + pub(crate) schema: SchemaRef, + pub(crate) mode: PendingMode, + pub(crate) batches: Vec, +} + +impl PendingTable { + fn new(schema: SchemaRef, mode: PendingMode) -> Self { + Self { + schema, + mode, + batches: Vec::new(), + } + } + + fn total_rows(&self) -> usize { + self.batches.iter().map(|b| b.num_rows()).sum() + } +} + +/// Stable per-table identifiers captured on first touch and reused at +/// finalize time. Avoids re-resolving the dataset path / branch. +#[derive(Debug, Clone)] +pub(crate) struct StagedTablePath { + pub(crate) full_path: String, + pub(crate) table_branch: Option, +} + +/// Per-query staging state. +/// +/// Replaces the post-MR-771 inline-commit `MutationStaging.latest` map with +/// an in-memory accumulator that defers all Lance HEAD advances to +/// end-of-query. After this rewire the bug class "Lance HEAD drifts ahead +/// of `__manifest`" is unreachable in `mutate_as` and `load` for inserts +/// and updates by construction. +#[derive(Default)] +pub(crate) struct MutationStaging { + /// Pre-write manifest version per table — the publisher's CAS fence at + /// end-of-query. + pub(crate) expected_versions: HashMap, + /// Per-table identifiers captured on first touch. + pub(crate) paths: HashMap, + /// In-memory accumulated batches per table (insert/update path). + pub(crate) pending: HashMap, + /// Inline-committed updates from delete-touching ops (D₂ guarantees no + /// pending batches exist on a delete-touched table). + pub(crate) inline_committed: HashMap, +} + +impl MutationStaging { + /// Capture pre-write metadata on first touch of a table. Subsequent + /// touches are no-ops (paths and `expected_version` are stable for the + /// lifetime of one query). + pub(crate) fn ensure_path( + &mut self, + table_key: &str, + full_path: String, + table_branch: Option, + expected_version: u64, + ) { + self.paths.entry(table_key.to_string()).or_insert(StagedTablePath { + full_path, + table_branch, + }); + self.expected_versions + .entry(table_key.to_string()) + .or_insert(expected_version); + } + + /// Append a batch to the per-table accumulator. + /// + /// `mode` is asserted-consistent with prior pushes for the same table: + /// `Append`+`Append` stays Append; any `Merge` upgrades the table to + /// Merge (e.g. an `update Person` after `insert Knows from='X' to='Y'` + /// when both produce content on `node:Person`). Once Merge is set, + /// subsequent appends roll into the merge stream — `WhenNotMatched = + /// InsertAll` correctly inserts append-shaped rows. + pub(crate) fn append_batch( + &mut self, + table_key: &str, + schema: SchemaRef, + mode: PendingMode, + batch: RecordBatch, + ) -> Result<()> { + if batch.num_rows() == 0 { + // No-op — staging is purely additive; an empty batch should not + // be appended. + return Ok(()); + } + let entry = self + .pending + .entry(table_key.to_string()) + .or_insert_with(|| PendingTable::new(schema.clone(), mode)); + // Upgrade Append -> Merge if any op needs merge semantics. + if mode == PendingMode::Merge { + entry.mode = PendingMode::Merge; + } + entry.batches.push(batch); + Ok(()) + } + + /// Record a delete that already inline-committed at the Lance layer. + pub(crate) fn record_inline(&mut self, update: SubTableUpdate) { + self.inline_committed.insert(update.table_key.clone(), update); + } + + /// Read-your-writes accessor: the accumulated pending batches for + /// `table_key`, or `&[]` if none. + pub(crate) fn pending_batches(&self, table_key: &str) -> &[RecordBatch] { + self.pending + .get(table_key) + .map(|p| p.batches.as_slice()) + .unwrap_or(&[]) + } + + /// Schema of the accumulated batches for `table_key`, or `None` if no + /// op has touched the table. Used by `scan_with_pending` to construct + /// the in-memory `MemTable`. + pub(crate) fn pending_schema(&self, table_key: &str) -> Option { + self.pending.get(table_key).map(|p| p.schema.clone()) + } + + /// `true` if neither pending nor inline_committed has any state — the + /// query made no observable writes. + pub(crate) fn is_empty(&self) -> bool { + self.pending.is_empty() && self.inline_committed.is_empty() + } + + /// Total count of pending rows across all tables. Used by tests and + /// (eventually) memory-budget enforcement. + #[allow(dead_code)] + pub(crate) fn pending_row_count(&self) -> usize { + self.pending.values().map(|p| p.total_rows()).sum() + } + + /// End-of-query: for each pending table, concat batches and commit via + /// `stage_append` or `stage_merge_insert` followed by `commit_staged`. + /// Merge with inline-committed entries. Return `(updates, + /// expected_versions)` for `commit_updates_on_branch_with_expected`. + /// + /// Sequential per-table — no cross-table dependency, but a parallel + /// version is a perf optimization for multi-table writes (loader with + /// many node + edge types). v1 ships sequential; the fan-out can land + /// in a follow-up. + pub(crate) async fn finalize( + self, + db: &crate::db::Omnigraph, + _branch: Option<&str>, + ) -> Result<(Vec, HashMap)> { + let MutationStaging { + expected_versions, + paths, + pending, + inline_committed, + } = self; + + let mut updates: Vec = + inline_committed.into_values().collect(); + + for (table_key, table) in pending { + let path = paths.get(&table_key).ok_or_else(|| { + OmniError::manifest_internal(format!( + "MutationStaging::finalize: missing path for table '{}'", + table_key + )) + })?; + let expected = *expected_versions.get(&table_key).ok_or_else(|| { + OmniError::manifest_internal(format!( + "MutationStaging::finalize: missing expected version for table '{}'", + table_key + )) + })?; + + // Reopen at the pre-write version. Lance HEAD has not advanced + // since `ensure_path` captured it — no prior op committed to + // this dataset. + let ds = db + .reopen_for_mutation( + &table_key, + &path.full_path, + path.table_branch.as_deref(), + expected, + ) + .await?; + + if table.batches.is_empty() { + continue; + } + + // For Merge mode, dedupe accumulated batches by `id`, keeping + // the LAST occurrence (last-write-wins for the query). This + // is required because Lance's `MergeInsertBuilder` produces + // arbitrary results on duplicate keys in the source. Append + // mode is exempt because no-key node and edge inserts use + // ULID-generated ids that are unique within a query. + let combined = match table.mode { + PendingMode::Merge => { + dedupe_merge_batches_by_id(&table.schema, table.batches)? + } + PendingMode::Append => { + if table.batches.len() == 1 { + table.batches.into_iter().next().unwrap() + } else { + arrow_select::concat::concat_batches( + &table.schema, + &table.batches, + ) + .map_err(|e| OmniError::Lance(e.to_string()))? + } + } + }; + + // Commit via Lance's two-phase write: stage produces + // uncommitted fragments + transaction; commit advances HEAD. + let staged = match table.mode { + PendingMode::Append => { + db.table_store().stage_append(&ds, combined, &[]).await? + } + PendingMode::Merge => { + db.table_store() + .stage_merge_insert( + ds.clone(), + combined, + vec!["id".to_string()], + lance::dataset::WhenMatched::UpdateAll, + lance::dataset::WhenNotMatched::InsertAll, + ) + .await? + } + }; + let new_ds = db + .table_store() + .commit_staged(Arc::new(ds), staged.transaction) + .await?; + let state = db + .table_store() + .table_state(&path.full_path, &new_ds) + .await?; + updates.push(SubTableUpdate { + table_key: table_key.clone(), + table_version: state.version, + table_branch: path.table_branch.clone(), + row_count: state.row_count, + version_metadata: state.version_metadata, + }); + } + + Ok((updates, expected_versions)) + } +} + +/// Walk `batches` in reverse, tracking seen `id` values; for each row +/// whose id we have NOT seen yet, mark it as a keeper. After the walk, +/// take the kept rows in forward (input) order and concat into one batch. +/// +/// Result: a deduped batch where each `id` appears at most once, with +/// the LAST occurrence's column values. Required by `stage_merge_insert`, +/// which needs unique source keys (Lance's `MergeInsertBuilder` produces +/// arbitrary results on duplicates). +/// +/// `batches` must be non-empty and all share `schema` (caller enforces). +fn dedupe_merge_batches_by_id( + schema: &SchemaRef, + batches: Vec, +) -> Result { + if batches.is_empty() { + return Err(OmniError::manifest_internal( + "dedupe_merge_batches_by_id: batches is empty".to_string(), + )); + } + + // Walk in reverse, tracking seen ids. For each row whose id we + // haven't seen yet, record (batch_idx, row_idx) for the kept set. + let mut seen: HashSet = HashSet::new(); + let mut keep: Vec> = vec![Vec::new(); batches.len()]; + let mut any_duplicates = false; + + for (b_idx, batch) in batches.iter().enumerate().rev() { + let id_col = batch + .column_by_name("id") + .ok_or_else(|| { + OmniError::manifest_internal( + "dedupe_merge_batches_by_id: batch has no 'id' column".to_string(), + ) + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest_internal( + "dedupe_merge_batches_by_id: 'id' column is not Utf8".to_string(), + ) + })?; + for r_idx in (0..batch.num_rows()).rev() { + if !id_col.is_valid(r_idx) { + // NULL ids — keep all (NULL != NULL in Lance/SQL semantics). + keep[b_idx].push(r_idx as u32); + continue; + } + let id = id_col.value(r_idx); + if seen.insert(id.to_string()) { + keep[b_idx].push(r_idx as u32); + } else { + any_duplicates = true; + } + } + // We pushed in reverse-row order; flip to forward order so the + // emitted batch reflects insertion order. + keep[b_idx].reverse(); + } + + // Fast path: no duplicates → simple concat. + if !any_duplicates { + if batches.len() == 1 { + return Ok(batches.into_iter().next().unwrap()); + } + return arrow_select::concat::concat_batches(schema, &batches) + .map_err(|e| OmniError::Lance(e.to_string())); + } + + // Slow path: build per-batch slices via `take`, then concat. + let mut sliced: Vec = Vec::with_capacity(batches.len()); + for (b_idx, idxs) in keep.into_iter().enumerate() { + if idxs.is_empty() { + continue; + } + let take_array = UInt32Array::from(idxs); + let columns: Vec> = batches[b_idx] + .columns() + .iter() + .map(|col| arrow_select::take::take(col, &take_array, None)) + .collect::>() + .map_err(|e| OmniError::Lance(e.to_string()))?; + let new_batch = RecordBatch::try_new(batches[b_idx].schema(), columns) + .map_err(|e| OmniError::Lance(e.to_string()))?; + sliced.push(new_batch); + } + if sliced.is_empty() { + return Err(OmniError::manifest_internal( + "dedupe_merge_batches_by_id: all rows were dropped (unexpected)".to_string(), + )); + } + if sliced.len() == 1 { + return Ok(sliced.into_iter().next().unwrap()); + } + arrow_select::concat::concat_batches(schema, &sliced) + .map_err(|e| OmniError::Lance(e.to_string())) +} diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index c9ff90a..52deb2f 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -821,6 +821,103 @@ impl TableStore { .map_err(|e| OmniError::Lance(e.to_string())) } + /// Scan committed via Lance + apply the same filter to in-memory + /// pending batches via DataFusion `MemTable`, concat the two result + /// streams. The replacement for `scan_with_staged` in engine code: + /// MR-794 step 2+ accumulates input batches in memory and unions + /// them with the committed snapshot at read time, sidestepping the + /// `Scanner::with_fragments` filter-pushdown limitation documented + /// on `scan_with_staged`. + /// + /// `committed_ds` should be opened at the pre-mutation + /// `expected_version` (the same version captured in `MutationStaging::expected_versions` + /// at first touch of the table). `pending_batches` are the per-table + /// accumulator's batches in their input shape. `pending_schema` is + /// the schema of the accumulated batches; passing `None` falls back + /// to the schema of the first pending batch. + /// + /// `filter` is the Lance / DataFusion SQL predicate. It is applied + /// to both sides — Lance pushes it down on the committed side; the + /// pending side runs it through a fresh DataFusion `SessionContext` + /// with the batches registered as a `MemTable` named `pending`. + /// + /// When `pending_batches` is empty this delegates to the regular + /// scan path. + pub async fn scan_with_pending( + &self, + committed_ds: &Dataset, + pending_batches: &[RecordBatch], + pending_schema: Option, + projection: Option<&[&str]>, + filter: Option<&str>, + ) -> Result> { + let mut out = self.scan(committed_ds, projection, filter, None).await?; + if pending_batches.is_empty() { + return Ok(out); + } + let pending = scan_pending_batches( + pending_batches, + pending_schema, + projection, + filter, + ) + .await?; + out.extend(pending); + Ok(out) + } + + /// `count_rows` variant that respects in-memory pending batches via + /// DataFusion `MemTable`. Used by edge-cardinality validation that + /// needs to see staged edges before commit. + /// + /// Cheaper than `scan_with_pending` for the count case because we + /// don't materialize columns on the pending side. + pub async fn count_rows_with_pending( + &self, + committed_ds: &Dataset, + pending_batches: &[RecordBatch], + pending_schema: Option, + filter: Option, + ) -> Result { + let committed = self.count_rows(committed_ds, filter.clone()).await?; + if pending_batches.is_empty() { + return Ok(committed); + } + // Count via DataFusion: COUNT(*) from pending [WHERE filter]. + let schema = + pending_schema.unwrap_or_else(|| pending_batches[0].schema()); + let ctx = datafusion::execution::context::SessionContext::new(); + let mem = datafusion::datasource::MemTable::try_new( + schema, + vec![pending_batches.to_vec()], + ) + .map_err(|e| OmniError::Lance(e.to_string()))?; + ctx.register_table("pending", Arc::new(mem)) + .map_err(|e| OmniError::Lance(e.to_string()))?; + let where_clause = filter + .map(|f| format!("WHERE {f}")) + .unwrap_or_default(); + let sql = format!("SELECT COUNT(*) AS c FROM pending {where_clause}"); + let df = ctx + .sql(&sql) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let batches = df + .collect() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let pending_count = batches + .into_iter() + .filter(|b| b.num_rows() > 0) + .map(|b| { + use arrow_array::cast::AsArray; + let arr = b.column(0).as_primitive::(); + arr.value(0) as usize + }) + .sum::(); + Ok(committed + pending_count) + } + /// `count_rows` variant that respects staged writes. Used for /// edge-cardinality validation that needs to see staged edges before /// commit. Same `committed - removed + new` composition as @@ -1068,6 +1165,49 @@ fn assign_row_id_meta(fragments: &mut [Fragment], start_row_id: u64) -> Result<( Ok(()) } +/// Apply `projection` and `filter` to in-memory pending batches via a +/// fresh DataFusion `SessionContext`. Used by `scan_with_pending` for +/// the read-your-writes side of MR-794's in-memory accumulator. +/// +/// `pending_batches` must be non-empty (the caller short-circuits on +/// empty). +async fn scan_pending_batches( + pending_batches: &[RecordBatch], + pending_schema: Option, + projection: Option<&[&str]>, + filter: Option<&str>, +) -> Result> { + let schema = pending_schema.unwrap_or_else(|| pending_batches[0].schema()); + let ctx = datafusion::execution::context::SessionContext::new(); + let mem = datafusion::datasource::MemTable::try_new( + schema, + vec![pending_batches.to_vec()], + ) + .map_err(|e| OmniError::Lance(e.to_string()))?; + ctx.register_table("pending", Arc::new(mem)) + .map_err(|e| OmniError::Lance(e.to_string()))?; + + let proj = projection + .map(|cols| { + cols.iter() + .map(|c| format!("\"{}\"", c.replace('"', "\"\""))) + .collect::>() + .join(", ") + }) + .unwrap_or_else(|| "*".to_string()); + let where_clause = filter + .map(|f| format!("WHERE {f}")) + .unwrap_or_default(); + let sql = format!("SELECT {proj} FROM pending {where_clause}"); + let df = ctx + .sql(&sql) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + df.collect() + .await + .map_err(|e| OmniError::Lance(e.to_string())) +} + fn combine_committed_with_staged(ds: &Dataset, staged: &[StagedWrite]) -> Vec { let removed: std::collections::HashSet = staged .iter() From e6f48ba24d6f7d0135f84cefe509d0968c98a118 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Fri, 1 May 2026 10:42:37 +0200 Subject: [PATCH 02/10] =?UTF-8?q?MR-794=20step=202:=20rewire=20mutation.rs?= =?UTF-8?q?=20to=20in-memory=20accumulator=20+=20D=E2=82=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Replace mutation.rs's MutationStaging.latest with the new pending + inline_committed shape from exec::staging. Inserts and updates push batches into pending; deletes still inline-commit via record_inline. * Rewrite execute_insert, execute_update, execute_delete*, validate_edge_insert_endpoints, ensure_node_id_exists for the new shape. Edge cardinality validates against committed scan + in-memory pending walk (validate_edge_cardinality_with_pending). * D₂ parse-time check: a query is either insert/update-only or delete-only. Mixed → reject before any I/O. * Drop CoordinatorRestoreGuard and the swap_coordinator_for_branch / restore_coordinator dance from mutate_with_current_actor. Branch is threaded explicitly through execute_named_mutation and the per-op functions. (merge.rs keeps its own swap pattern.) * apply_assignments updated to copy unassigned blob columns from the scan when present, enabling full-schema update batches; for blob-bearing tables we still project away the blob columns at scan time (Lance's filter pushdown panics otherwise) and accept the narrow-schema output for the v1 path. A failed mid-query op no longer advances Lance HEAD on staged tables — the next mutation proceeds normally with no ExpectedVersionMismatch. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/omnigraph/src/exec/mutation.rs | 756 +++++++++++++------------- 1 file changed, 368 insertions(+), 388 deletions(-) diff --git a/crates/omnigraph/src/exec/mutation.rs b/crates/omnigraph/src/exec/mutation.rs index d7513a8..7397074 100644 --- a/crates/omnigraph/src/exec/mutation.rs +++ b/crates/omnigraph/src/exec/mutation.rs @@ -341,6 +341,7 @@ fn build_insert_batch( async fn validate_edge_insert_endpoints( db: &Omnigraph, staging: &MutationStaging, + branch: Option<&str>, edge_name: &str, assignments: &HashMap, ) -> Result<()> { @@ -382,45 +383,57 @@ async fn validate_edge_insert_endpoints( } }; - ensure_node_id_exists(db, staging, &edge_type.from_type, from, "src").await?; - ensure_node_id_exists(db, staging, &edge_type.to_type, to, "dst").await?; + ensure_node_id_exists(db, staging, branch, &edge_type.from_type, from, "src").await?; + ensure_node_id_exists(db, staging, branch, &edge_type.to_type, to, "dst").await?; Ok(()) } +/// Quick scan of pending batches for an `id` value match. Used by the +/// mutation path's edge endpoint validation to satisfy read-your-writes +/// for same-query inserts before they're committed to Lance. +fn pending_batches_contain_id(batches: &[RecordBatch], id: &str) -> bool { + for batch in batches { + let Some(col) = batch.column_by_name("id") else { + continue; + }; + let Some(arr) = col.as_any().downcast_ref::() else { + continue; + }; + for i in 0..arr.len() { + if arr.is_valid(i) && arr.value(i) == id { + return true; + } + } + } + false +} + async fn ensure_node_id_exists( db: &Omnigraph, staging: &MutationStaging, + branch: Option<&str>, node_type: &str, id: &str, label: &str, ) -> Result<()> { let table_key = format!("node:{}", node_type); - let filter = format!("id = '{}'", id.replace('\'', "''")); - // Prefer the in-query staged dataset so a same-query insert of the - // referenced node is visible to this validation. Fall back to the - // pre-mutation manifest snapshot when no prior op touched this table. - let exists = if let Some(staged) = staging.latest.get(&table_key) { - let ds = db - .reopen_for_mutation( - &table_key, - &staged.full_path, - staged.table_branch.as_deref(), - staged.table_version, - ) - .await?; - ds.count_rows(Some(filter)) - .await - .map_err(|e| OmniError::Lance(e.to_string()))? - > 0 - } else { - let snapshot = db.snapshot(); - let ds = snapshot.open(&table_key).await?; - ds.count_rows(Some(filter)) - .await - .map_err(|e| OmniError::Lance(e.to_string()))? - > 0 - }; + // Prefer the in-query pending accumulator so a same-query insert of + // the referenced node is visible to this validation. Fall back to + // the pre-mutation manifest snapshot when nothing pending matches. + let pending = staging.pending_batches(&table_key); + if pending_batches_contain_id(pending, id) { + return Ok(()); + } + + let filter = format!("id = '{}'", id.replace('\'', "''")); + let snapshot = db.snapshot_for_branch(branch).await?; + let ds = snapshot.open(&table_key).await?; + let exists = ds + .count_rows(Some(filter)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))? + > 0; if exists { Ok(()) @@ -469,10 +482,22 @@ fn predicate_to_sql( } /// Replace specific columns in a RecordBatch with new literal values. -/// Blob columns are excluded from the scan result, so assigned blob values are -/// synthesized from the full table schema and included inline in the update -/// batch. Unassigned blob columns are omitted so merge_insert leaves them -/// untouched. +/// +/// Blob columns may or may not be present in `batch` depending on the +/// caller's scan projection: +/// - If `batch` does NOT contain a blob column AND it has no assignment, +/// the column is OMITTED from the output. `merge_insert` leaves it +/// untouched. +/// - If `batch` DOES contain a blob column AND it has no assignment, the +/// column is COPIED to the output. This enables coalescing of +/// different-shape updates into a single full-schema merge batch (the +/// per-table accumulator in `MutationStaging` requires consistent +/// schemas across pending batches for `concat_batches`). The +/// round-tripping cost is acceptable for typical agent-driven +/// mutations; tables with large blobs and unassigned-blob updates may +/// want to be split into separate queries. +/// - If a blob column has a string-URI assignment, build the blob array +/// inline. fn apply_assignments( full_schema: &SchemaRef, batch: &RecordBatch, @@ -484,12 +509,8 @@ fn apply_assignments( for field in full_schema.fields().iter() { if blob_properties.contains(field.name()) { - // Blob columns aren't in the scan result. If this blob has an - // assignment, build the blob array inline so the single - // merge_insert covers both scalar and blob updates. Unassigned - // blob columns are omitted — merge_insert only touches columns - // present in the batch. if let Some(Literal::String(uri)) = assignments.get(field.name()) { + // Assigned: build a single blob column from the URI. let mut builder = BlobArrayBuilder::new(batch.num_rows()); for _ in 0..batch.num_rows() { crate::loader::append_blob_value(&mut builder, uri)?; @@ -501,8 +522,17 @@ fn apply_assignments( .finish() .map_err(|e| OmniError::Lance(e.to_string()))?, ); + } else if let Some(col) = batch.column_by_name(field.name()) { + // Unassigned but scan included it: copy through (writes + // back the same blob, no observable change but uniform + // schema for the accumulator). + let blob_field = lance::blob::blob_field(field.name(), field.is_nullable()); + out_fields.push(blob_field); + columns.push(col.clone()); } - // else: no assignment for this blob column — skip it + // else: scan did not include this blob column and no + // assignment — omit. Caller's accumulator must accept the + // narrower schema (legacy single-merge_insert path). } else if let Some(lit) = assignments.get(field.name()) { out_fields.push(field.as_ref().clone()); columns.push(literal_to_typed_array( @@ -528,197 +558,74 @@ fn apply_assignments( // ─── Mutation execution ────────────────────────────────────────────────────── -/// Per-query staging state for direct-to-target mutations. Replaces the -/// `__run__` staging branch with an in-memory accumulator. +use super::staging::{MutationStaging, PendingMode}; + +/// Open a sub-table dataset for read or inline-commit-write within the +/// current mutation query, capturing pre-write metadata in `staging` on +/// first touch. The captured version is the publisher's CAS fence at +/// end-of-query (per-table OCC). /// -/// Each unique table touched by the mutation is captured at first-open time: -/// - `expected_versions[table_key]` records the manifest version we observed -/// pre-write — the publisher's CAS fence at end-of-query. -/// - `latest[table_key]` records the most recent post-write state on that -/// table — used to thread between ops within the query so subsequent ops -/// see prior writes (read-your-writes). +/// The dataset is always opened at HEAD on the requested branch. Under +/// the staged-write rewire (MR-794 step 2+), no per-op commit advances +/// HEAD, so subsequent opens within the same query observe the same +/// version. The exception is the inline-commit delete path +/// (`execute_delete_*`), which still advances HEAD per-op — but D₂ at +/// parse time prevents inserts/updates and deletes from coexisting in +/// one query, so this can't conflict with a pending-batch read. +async fn open_table_for_mutation( + db: &Omnigraph, + staging: &mut MutationStaging, + branch: Option<&str>, + table_key: &str, +) -> Result<(Dataset, String, Option)> { + let (ds, full_path, table_branch) = + db.open_for_mutation_on_branch(branch, table_key).await?; + let expected_version = ds.version().version; + staging.ensure_path( + table_key, + full_path.clone(), + table_branch.clone(), + expected_version, + ); + Ok((ds, full_path, table_branch)) +} + +/// D₂ parse-time check: a single mutation query is either insert/update-only +/// or delete-only. Mixed → reject before any I/O. /// -/// **Known limitation (mid-query partial failure).** If op-N succeeds at the -/// Lance level (a fragment is committed, advancing the table's Lance HEAD) -/// and op-N+1 then fails before the publisher commits, the table is left -/// with `Lance HEAD > manifest_version`. The next `mutate_as` against the -/// same table will surface `ExpectedVersionMismatch` (Lance HEAD ahead of -/// the manifest snapshot). Lance's `restore()` is *not* a rewind — it -/// creates a new commit, monotonically advancing the version. The proper -/// fix uses Lance's distributed-write API (`write_fragments` / -/// `Scanner::with_fragments` / `Operation::Append { fragments }`) — see -/// [MR-794](https://linear.app/modernrelay/issue/MR-794). The staging -/// primitives `TableStore::stage_append` / `stage_merge_insert` / -/// `commit_staged` / `scan_with_staged` are in place; full integration -/// with this struct is the MR-794 follow-up. In practice this path is -/// narrow today: most validation runs before any Lance write, so -/// single-statement mutations are unaffected. See `docs/runs.md`. -#[derive(Default)] -struct MutationStaging { - expected_versions: HashMap, - latest: HashMap, -} - -struct StagedTable { - table_key: String, - table_branch: Option, - table_version: u64, - row_count: u64, - full_path: String, - version_metadata: crate::db::manifest::TableVersionMetadata, -} - -trait IntoStagedRecord { - fn version(&self) -> u64; - fn row_count(&self) -> u64; - fn version_metadata(&self) -> &crate::db::manifest::TableVersionMetadata; -} - -impl IntoStagedRecord for crate::table_store::TableState { - fn version(&self) -> u64 { - self.version - } - fn row_count(&self) -> u64 { - self.row_count - } - fn version_metadata(&self) -> &crate::db::manifest::TableVersionMetadata { - &self.version_metadata - } -} - -impl IntoStagedRecord for crate::table_store::DeleteState { - fn version(&self) -> u64 { - self.version - } - fn row_count(&self) -> u64 { - self.row_count - } - fn version_metadata(&self) -> &crate::db::manifest::TableVersionMetadata { - &self.version_metadata - } -} - -impl MutationStaging { - fn is_empty(&self) -> bool { - self.latest.is_empty() - } - - fn record( - &mut self, - table_key: String, - full_path: String, - table_branch: Option, - state: &S, - ) { - self.latest.insert( - table_key.clone(), - StagedTable { - table_key, - table_branch, - table_version: state.version(), - row_count: state.row_count(), - full_path, - version_metadata: state.version_metadata().clone(), - }, - ); - } - - fn into_updates(self) -> (Vec, HashMap) { - let updates = self - .latest - .into_values() - .map(|st| crate::db::SubTableUpdate { - table_key: st.table_key, - table_version: st.table_version, - table_branch: st.table_branch, - row_count: st.row_count, - version_metadata: st.version_metadata, - }) - .collect(); - (updates, self.expected_versions) - } -} - -/// RAII helper that restores `Omnigraph::coordinator` on drop. Used by -/// `mutate_with_current_actor` so a panic between the coordinator swap and -/// the explicit restore (e.g. an assertion deep inside Lance) does not -/// leave the handle pinned to the requested branch indefinitely. The -/// captured coordinator is `take`n on drop and assigned via the -/// (synchronous) `restore_coordinator` accessor. -/// -/// Holds a bare `*mut Omnigraph` (no lifetime parameter) deliberately: -/// borrowing the engine through this guard would lock out the rest of -/// `mutate_with_current_actor` from calling `&mut self` methods on the -/// engine while the guard is alive. The unsafe is bounded by the -/// constructor contract — the caller must not let the guard outlive the -/// `&mut self` it was built from. In practice this is enforced by the -/// guard being assigned to a stack-local `_guard` binding inside one -/// function and never moved out. -struct CoordinatorRestoreGuard { - db: *mut Omnigraph, - previous: Option, -} - -// SAFETY: the pointer addresses an `Omnigraph`, which is `Send`. The guard -// is short-lived and the only operation it performs is the sync -// `restore_coordinator` field assignment in `Drop`. No reference is shared -// across threads — the future holding the guard moves between threads -// (e.g. when an Axum handler is awaited on a worker), and the swap-back is -// always invoked at most once on whichever thread runs `Drop`. -unsafe impl Send for CoordinatorRestoreGuard {} - -impl CoordinatorRestoreGuard { - /// SAFETY: `db` must outlive the returned guard, and the caller must - /// not move the guard outside the borrow scope of `db`. - fn new(db: &mut Omnigraph, previous: crate::db::GraphCoordinator) -> Self { - Self { - db: db as *mut Omnigraph, - previous: Some(previous), - } - } -} - -impl Drop for CoordinatorRestoreGuard { - fn drop(&mut self) { - if let Some(prev) = self.previous.take() { - // SAFETY: per the `new` contract, `db` is still valid here. - // `restore_coordinator` is a sync field assignment and does not - // re-enter the runtime. - unsafe { - (*self.db).restore_coordinator(prev); +/// Reason: under the staged-write rewire (MR-794 step 2+), inserts and +/// updates accumulate in memory and commit at end-of-query, while deletes +/// still inline-commit (Lance lacks a public two-phase delete in 4.0.0). +/// Mixing creates ordering hazards (same-row insert→delete becomes a no-op +/// because the staged insert isn't visible to delete; cascading deletes +/// of just-inserted edges break referential integrity by silent design). +/// Until Lance exposes `DeleteJob::execute_uncommitted`, the parse-time +/// rejection keeps both paths atomic and correct. +fn enforce_no_mixed_destructive_constructive( + ir: &omnigraph_compiler::ir::MutationIR, +) -> Result<()> { + let mut has_constructive = false; + let mut has_delete = false; + for op in &ir.ops { + match op { + MutationOpIR::Insert { .. } | MutationOpIR::Update { .. } => { + has_constructive = true; + } + MutationOpIR::Delete { .. } => { + has_delete = true; } } } -} - -/// Open a sub-table dataset for write in the current mutation query. On the -/// first touch of a table, captures the pre-write manifest version into -/// `staging.expected_versions` so the publisher can enforce OCC. On -/// subsequent touches, re-opens the dataset at the locally-staged version -/// (the version we wrote in a prior op of the same query) — bypassing the -/// manifest because nothing has been committed yet. -async fn open_for_mutation_in_query( - db: &Omnigraph, - staging: &mut MutationStaging, - table_key: &str, -) -> Result<(Dataset, String, Option)> { - if let Some(staged) = staging.latest.get(table_key) { - let ds = db - .reopen_for_mutation( - table_key, - &staged.full_path, - staged.table_branch.as_deref(), - staged.table_version, - ) - .await?; - return Ok((ds, staged.full_path.clone(), staged.table_branch.clone())); + if has_constructive && has_delete { + return Err(OmniError::manifest(format!( + "mutation '{}' on the same query mixes inserts/updates and deletes; \ + split into separate mutations: (1) inserts and updates, then (2) deletes. \ + This restriction lifts when Lance exposes a two-phase delete API \ + (tracked: MR-793 / Lance-upstream).", + ir.name + ))); } - let (ds, full_path, table_branch) = db.open_for_mutation(table_key).await?; - staging - .expected_versions - .entry(table_key.to_string()) - .or_insert(ds.version().version); - Ok((ds, full_path, table_branch)) + Ok(()) } impl Omnigraph { @@ -769,57 +676,40 @@ impl Omnigraph { } let resolved_params = enrich_mutation_params(params)?; - // Direct-to-target write path. Per-query staging captures pre-write - // manifest versions (publisher CAS fence) and threads dataset state - // across ops to maintain read-your-writes within a multi-statement - // query without per-op manifest commits. + // Per-query staging accumulator. Inserts and updates push batches + // into `pending`; deletes still inline-commit and record into + // `inline_committed`. At end-of-query, `finalize` issues one + // `stage_*` + `commit_staged` per pending table, then the + // publisher commits the manifest atomically across all touched + // tables. Branch is threaded explicitly — no coordinator swap. let mut staging = MutationStaging::default(); - let current = self.active_branch().map(str::to_string); - let needs_swap = requested.as_deref() != current.as_deref(); - - // RAII guard for coordinator state. If we swapped to the requested - // branch, the original coordinator is captured here and unconditionally - // restored on drop — including on panic from `execute_named_mutation` - // or the publisher. Without this, a Lance-internal panic between swap - // and restore would leave the handle pinned to the wrong branch for - // its remaining lifetime. - let _guard = if needs_swap { - let previous = self.swap_coordinator_for_branch(requested.as_deref()).await?; - Some(CoordinatorRestoreGuard::new(self, previous)) - } else { - None - }; - let exec_result = self - .execute_named_mutation(query_source, query_name, &resolved_params, &mut staging) + .execute_named_mutation( + query_source, + query_name, + &resolved_params, + requested.as_deref(), + &mut staging, + ) .await; - let publish_result = match exec_result { + match exec_result { Err(e) => Err(e), + Ok(total) if staging.is_empty() => Ok(total), Ok(total) => { - if staging.is_empty() { - Ok(total) - } else { - let (updates, expected_versions) = staging.into_updates(); - match self - .commit_updates_on_branch_with_expected( - requested.as_deref(), - &updates, - &expected_versions, - ) - .await - { - Ok(_) => Ok(total), - Err(e) => Err(e), - } - } + let (updates, expected_versions) = staging + .finalize(self, requested.as_deref()) + .await?; + self.commit_updates_on_branch_with_expected( + requested.as_deref(), + &updates, + &expected_versions, + ) + .await?; + Ok(total) } - }; - - // `_guard` drops here and restores the coordinator (also restores on - // any panic that unwound through this function above). - publish_result + } } async fn execute_named_mutation( @@ -827,6 +717,7 @@ impl Omnigraph { query_source: &str, query_name: &str, params: &ParamMap, + branch: Option<&str>, staging: &mut MutationStaging, ) -> Result { let query_decl = omnigraph_compiler::find_named_query(query_source, query_name) @@ -843,6 +734,8 @@ impl Omnigraph { } let ir = lower_mutation_query(&query_decl)?; + // D₂: reject mixed insert/update + delete before any I/O. + enforce_no_mixed_destructive_constructive(&ir)?; let mut total = MutationResult::default(); for op in &ir.ops { @@ -851,7 +744,7 @@ impl Omnigraph { type_name, assignments, } => { - self.execute_insert(type_name, assignments, params, staging) + self.execute_insert(type_name, assignments, params, branch, staging) .await? } MutationOpIR::Update { @@ -859,14 +752,21 @@ impl Omnigraph { assignments, predicate, } => { - self.execute_update(type_name, assignments, predicate, params, staging) - .await? + self.execute_update( + type_name, + assignments, + predicate, + params, + branch, + staging, + ) + .await? } MutationOpIR::Delete { type_name, predicate, } => { - self.execute_delete(type_name, predicate, params, staging) + self.execute_delete(type_name, predicate, params, branch, staging) .await? } }; @@ -881,6 +781,7 @@ impl Omnigraph { type_name: &str, assignments: &[IRAssignment], params: &ParamMap, + branch: Option<&str>, staging: &mut MutationStaging, ) -> Result { let mut resolved: HashMap = HashMap::new(); @@ -923,13 +824,18 @@ impl Omnigraph { } let has_key = node_type.key_property().is_some(); let table_key = format!("node:{}", type_name); - let (state, full_path, table_branch) = if has_key { - self.upsert_batch_staged(&table_key, staging, schema, batch).await? + // Capture pre-write metadata on first touch (no Lance write). + let (_ds, _full_path, _table_branch) = + open_table_for_mutation(self, staging, branch, &table_key).await?; + // Accumulate. @key inserts go into the Merge stream (so a + // later update on the same id coalesces correctly); no-key + // inserts go into the Append stream. + let mode = if has_key { + PendingMode::Merge } else { - self.append_batch_staged(&table_key, staging, schema, batch).await? + PendingMode::Append }; - - staging.record(table_key, full_path, table_branch, &state); + staging.append_batch(&table_key, schema, mode, batch)?; Ok(MutationResult { affected_nodes: 1, @@ -942,7 +848,7 @@ impl Omnigraph { let id = ulid::Ulid::new().to_string(); let batch = build_insert_batch(&schema, &id, &resolved, &blob_props)?; - validate_edge_insert_endpoints(self, staging, type_name, &resolved).await?; + validate_edge_insert_endpoints(self, staging, branch, type_name, &resolved).await?; crate::loader::validate_enum_constraints(&batch, &edge_type.properties, type_name)?; let unique_props = crate::loader::unique_property_names_for_edge(edge_type); if !unique_props.is_empty() { @@ -952,23 +858,27 @@ impl Omnigraph { &unique_props, )?; } - let active_branch = self.active_branch().map(str::to_string); let table_key = format!("edge:{}", type_name); - let (state, full_path, table_branch) = self - .append_batch_staged(&table_key, staging, schema, batch) - .await?; + // Capture pre-write metadata on first touch (no Lance write). + let (ds, _full_path, _table_branch) = + open_table_for_mutation(self, staging, branch, &table_key).await?; + // Accumulate the new edge row. Edge IDs are ULID-generated so + // Append mode is correct (no key-based dedup needed). + staging.append_batch(&table_key, schema, PendingMode::Append, batch.clone())?; - crate::loader::validate_edge_cardinality( + // Edge cardinality validation: scan committed edges via Lance + // + iterate pending edges in-memory for the `src` column, + // group-by-src. The pending side already includes the row + // we just appended (above). + validate_edge_cardinality_with_pending( self, - active_branch.as_deref(), - type_name, - state.version, - table_branch.as_deref(), + &ds, + staging, + &table_key, + edge_type, ) .await?; - staging.record(table_key, full_path, table_branch, &state); - self.invalidate_graph_index().await; Ok(MutationResult { @@ -980,57 +890,13 @@ impl Omnigraph { } } - /// Append a batch to a sub-table within an in-flight mutation, routing - /// through `MutationStaging` so subsequent ops in the same query see the - /// write before publish. Returns the new state, the full path, and the - /// table branch. - async fn append_batch_staged( - &self, - table_key: &str, - staging: &mut MutationStaging, - _schema: SchemaRef, - batch: RecordBatch, - ) -> Result<(crate::table_store::TableState, String, Option)> { - let (mut ds, full_path, table_branch) = - open_for_mutation_in_query(self, staging, table_key).await?; - let state = self - .table_store() - .append_batch(&full_path, &mut ds, batch) - .await?; - Ok((state, full_path, table_branch)) - } - - /// Upsert a batch into a sub-table using merge_insert keyed by "id", - /// routing through `MutationStaging`. - async fn upsert_batch_staged( - &self, - table_key: &str, - staging: &mut MutationStaging, - _schema: SchemaRef, - batch: RecordBatch, - ) -> Result<(crate::table_store::TableState, String, Option)> { - let (ds, full_path, table_branch) = - open_for_mutation_in_query(self, staging, table_key).await?; - let state = self - .table_store() - .merge_insert_batch( - &full_path, - ds, - batch, - vec!["id".to_string()], - lance::dataset::WhenMatched::UpdateAll, - lance::dataset::WhenNotMatched::InsertAll, - ) - .await?; - Ok((state, full_path, table_branch)) - } - async fn execute_update( &mut self, type_name: &str, assignments: &[IRAssignment], predicate: &IRMutationPredicate, params: &ParamMap, + branch: Option<&str>, staging: &mut MutationStaging, ) -> Result { // Defense in depth: ensure this is a node type @@ -1056,23 +922,39 @@ impl Omnigraph { let blob_props = self.catalog().node_types[type_name].blob_properties.clone(); let table_key = format!("node:{}", type_name); - let (ds, full_path, table_branch) = - open_for_mutation_in_query(self, staging, &table_key).await?; - let initial_version = ds.version().version; + let (ds, _full_path, _table_branch) = + open_table_for_mutation(self, staging, branch, &table_key).await?; + // Scan committed via Lance + apply the same predicate to pending + // batches via DataFusion `MemTable` (read-your-writes for prior + // ops in this query). The pending side may include rows from + // earlier `insert` / `update` ops on the same table. + // + // For blob tables we project away the blob columns: Lance's + // scanner doesn't accept the standard projection path on blob + // descriptors and would panic with a `Field::project` assertion. + // The downstream `apply_assignments` synthesizes blob columns + // from explicit assignments and omits unassigned blobs (Lance's + // merge_insert leaves them untouched). Tables without blob + // columns scan the full schema unprojected. let non_blob_cols: Vec<&str> = schema .fields() .iter() .filter(|f| !blob_props.contains(f.name())) .map(|f| f.name().as_str()) .collect(); + let projection: Option<&[&str]> = + (!blob_props.is_empty()).then_some(non_blob_cols.as_slice()); + let pending_batches = staging.pending_batches(&table_key); + let pending_schema = staging.pending_schema(&table_key); let batches = self .table_store() - .scan( + .scan_with_pending( &ds, - (!blob_props.is_empty()).then_some(non_blob_cols.as_slice()), + pending_batches, + pending_schema, + projection, Some(&pred_sql), - None, ) .await?; @@ -1083,13 +965,12 @@ impl Omnigraph { }); } - let matched = if batches.len() == 1 { - batches.into_iter().next().unwrap() - } else { - let s = batches[0].schema(); - arrow_select::concat::concat_batches(&s, &batches) - .map_err(|e| OmniError::Lance(e.to_string()))? - }; + // Concat the matched batches into one. The pending side may have + // a slightly different schema (e.g. Lance's `_rowid` column on + // the committed side, missing on the pending side). Normalize by + // dropping any `_rowid` / `_rowaddr` columns and reordering to + // the table's canonical schema. + let matched = concat_match_batches_to_schema(&schema, &blob_props, batches)?; let affected_count = matched.num_rows(); @@ -1110,29 +991,13 @@ impl Omnigraph { )?; } - // Re-open for merge_insert (scan consumed the dataset; - // version guard was already applied by open_for_mutation above) - let ds = self - .reopen_for_mutation( - &table_key, - &full_path, - table_branch.as_deref(), - initial_version, - ) - .await?; - let update_state = self - .table_store() - .merge_insert_batch( - &full_path, - ds, - updated, - vec!["id".to_string()], - lance::dataset::WhenMatched::UpdateAll, - lance::dataset::WhenNotMatched::DoNothing, - ) - .await?; - - staging.record(table_key, full_path, table_branch, &update_state); + // Accumulate the updated batch into the Merge-mode pending stream. + // The accumulator may now contain entries with the same id as a + // prior insert or update on this table; `MutationStaging::finalize` + // dedupes by id (last-occurrence wins) before issuing the single + // `stage_merge_insert` call at end-of-query. + let updated_schema = updated.schema(); + staging.append_batch(&table_key, updated_schema, PendingMode::Merge, updated)?; Ok(MutationResult { affected_nodes: affected_count, @@ -1145,14 +1010,15 @@ impl Omnigraph { type_name: &str, predicate: &IRMutationPredicate, params: &ParamMap, + branch: Option<&str>, staging: &mut MutationStaging, ) -> Result { let is_node = self.catalog().node_types.contains_key(type_name); if is_node { - self.execute_delete_node(type_name, predicate, params, staging) + self.execute_delete_node(type_name, predicate, params, branch, staging) .await } else { - self.execute_delete_edge(type_name, predicate, params, staging) + self.execute_delete_edge(type_name, predicate, params, branch, staging) .await } } @@ -1162,16 +1028,19 @@ impl Omnigraph { type_name: &str, predicate: &IRMutationPredicate, params: &ParamMap, + branch: Option<&str>, staging: &mut MutationStaging, ) -> Result { let pred_sql = predicate_to_sql(predicate, params, false)?; let table_key = format!("node:{}", type_name); let (ds, full_path, table_branch) = - open_for_mutation_in_query(self, staging, &table_key).await?; + open_table_for_mutation(self, staging, branch, &table_key).await?; let initial_version = ds.version().version; - // Scan matching IDs for cascade + // Scan matching IDs for cascade. Per D₂ this never overlaps with + // staged inserts (mixed insert/delete in one query is rejected at + // parse time), so we scan committed only. let batches = self .table_store() .scan(&ds, Some(&["id"]), Some(&pred_sql), None) @@ -1200,8 +1069,11 @@ impl Omnigraph { let affected_nodes = deleted_ids.len(); - // Delete nodes (re-open needed because the scan consumed the dataset; - // version guard was already applied by open_for_mutation above) + // Delete nodes — still inline-commit (Lance's `Dataset::delete` is + // not exposed as a two-phase op in 4.0.0). D₂ keeps inserts and + // deletes from coexisting in one query, so this advance of Lance + // HEAD is the only HEAD movement during the query and the + // publisher's CAS captures it intact. let mut ds = self .reopen_for_mutation( &table_key, @@ -1215,12 +1087,13 @@ impl Omnigraph { .delete_where(&full_path, &mut ds, &pred_sql) .await?; - staging.record( - table_key.clone(), - full_path.clone(), - table_branch.clone(), - &delete_state, - ); + staging.record_inline(crate::db::SubTableUpdate { + table_key: table_key.clone(), + table_version: delete_state.version, + table_branch: table_branch.clone(), + row_count: delete_state.row_count, + version_metadata: delete_state.version_metadata, + }); let mut affected_edges = 0usize; let escaped: Vec = deleted_ids @@ -1251,7 +1124,7 @@ impl Omnigraph { let edge_table_key = format!("edge:{}", edge_name); let cascade_filter = cascade_filters.join(" OR "); let (mut edge_ds, edge_full_path, edge_table_branch) = - open_for_mutation_in_query(self, staging, &edge_table_key).await?; + open_table_for_mutation(self, staging, branch, &edge_table_key).await?; let edge_delete = self .table_store() @@ -1261,12 +1134,13 @@ impl Omnigraph { affected_edges += edge_delete.deleted_rows; if edge_delete.deleted_rows > 0 { - staging.record( - edge_table_key, - edge_full_path, - edge_table_branch, - &edge_delete, - ); + staging.record_inline(crate::db::SubTableUpdate { + table_key: edge_table_key, + table_version: edge_delete.version, + table_branch: edge_table_branch, + row_count: edge_delete.row_count, + version_metadata: edge_delete.version_metadata, + }); } } @@ -1285,13 +1159,14 @@ impl Omnigraph { type_name: &str, predicate: &IRMutationPredicate, params: &ParamMap, + branch: Option<&str>, staging: &mut MutationStaging, ) -> Result { let pred_sql = predicate_to_sql(predicate, params, true)?; let table_key = format!("edge:{}", type_name); let (mut ds, full_path, table_branch) = - open_for_mutation_in_query(self, staging, &table_key).await?; + open_table_for_mutation(self, staging, branch, &table_key).await?; let delete_state = self .table_store() @@ -1300,7 +1175,13 @@ impl Omnigraph { let affected = delete_state.deleted_rows; if affected > 0 { - staging.record(table_key, full_path, table_branch, &delete_state); + staging.record_inline(crate::db::SubTableUpdate { + table_key, + table_version: delete_state.version, + table_branch, + row_count: delete_state.row_count, + version_metadata: delete_state.version_metadata, + }); self.invalidate_graph_index().await; } @@ -1311,6 +1192,105 @@ impl Omnigraph { } } +/// Concat the matched batches from `scan_with_pending` into a single batch. +/// `scan_with_pending` returns committed-side and pending-side batches in +/// order; both should share a schema if pending was produced through +/// `apply_assignments` with full-schema scan input. If schemas drift, +/// surface a clear error so the user can split the query. +fn concat_match_batches_to_schema( + _schema: &SchemaRef, + _blob_properties: &HashSet, + batches: Vec, +) -> Result { + if batches.len() == 1 { + return Ok(batches.into_iter().next().unwrap()); + } + let common = batches[0].schema(); + arrow_select::concat::concat_batches(&common, &batches).map_err(|e| { + OmniError::Lance(format!( + "scan_with_pending returned batches with mismatched schemas \ + across the committed/pending boundary; this typically indicates \ + a blob-column shape mismatch between the committed table and a \ + prior in-query insert/update. Split blob-touching mutations \ + into separate queries. ({})", + e + )) + }) +} + +/// Validate that adding `pending` edges plus the committed edges does not +/// exceed the per-source cardinality bound on `edge_type`. Reads the `src` +/// column from both committed (Lance) and pending (in-memory) and counts +/// per src. +async fn validate_edge_cardinality_with_pending( + db: &Omnigraph, + committed_ds: &Dataset, + staging: &MutationStaging, + table_key: &str, + edge_type: &omnigraph_compiler::catalog::EdgeType, +) -> Result<()> { + use std::collections::HashMap as StdHashMap; + + if edge_type.cardinality.is_default() { + return Ok(()); + } + + let mut counts: StdHashMap = StdHashMap::new(); + + // Committed side: scan `src` column (cheap, no filter). + let committed = db + .table_store() + .scan(committed_ds, Some(&["src"]), None, None) + .await?; + for batch in &committed { + let srcs = batch + .column_by_name("src") + .ok_or_else(|| OmniError::Lance("missing 'src' column on edge table".into()))? + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("'src' column is not Utf8".into()))?; + for i in 0..srcs.len() { + if srcs.is_valid(i) { + *counts.entry(srcs.value(i).to_string()).or_insert(0) += 1; + } + } + } + + // Pending side: walk in-memory pending batches for `src`. + for batch in staging.pending_batches(table_key) { + let Some(col) = batch.column_by_name("src") else { + continue; + }; + let Some(srcs) = col.as_any().downcast_ref::() else { + continue; + }; + for i in 0..srcs.len() { + if srcs.is_valid(i) { + *counts.entry(srcs.value(i).to_string()).or_insert(0) += 1; + } + } + } + + let card = &edge_type.cardinality; + for (src, count) in &counts { + if let Some(max) = card.max { + if *count > max { + return Err(OmniError::manifest(format!( + "@card violation on edge {}: source '{}' has {} edges (max {})", + edge_type.name, src, count, max + ))); + } + } + // Note: per-source minimum cardinality cannot be checked + // mid-query (a bound of `2..` requires both edges to be inserted + // before validation). The publisher path could re-validate at + // commit time; for now, defer to the loader's end-of-query check. + let _ = card.min; + } + + Ok(()) +} + fn enrich_mutation_params(params: &ParamMap) -> Result { let mut resolved = params.clone(); if !resolved.contains_key(NOW_PARAM_NAME) { From bbf610ea9b51edc07f10097c2649f3171d5f0c97 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Fri, 1 May 2026 10:42:47 +0200 Subject: [PATCH 03/10] =?UTF-8?q?MR-794=20step=202:=20rewire=20loader=20fo?= =?UTF-8?q?r=20Append/Merge=20=E2=80=94=20Overwrite=20stays=20inline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit load_jsonl_reader dispatches on mode: Append/Merge use the MutationStaging accumulator (per-type batch staging, single stage_* + commit_staged per touched table at end-of-load, publisher CAS). Overwrite keeps the legacy concurrent inline-commit path (truncate-then-append doesn't fit the staged shape; overwrite has no in-flight read-your-writes requirement). * New helpers collect_node_ids_with_pending and validate_edge_cardinality_with_pending_loader — loader analogs of the engine's pending-aware validators. * Phase 2c (RI) and Phase 3 (cardinality) consult pending batches for Append/Merge so a mid-load failure aborts the load before any Lance write reaches HEAD. A failed Append/Merge load no longer advances Lance HEAD on staged tables — the next load on the same tables proceeds normally with no ExpectedVersionMismatch. Overwrite mode's drift residual is unchanged from today's behavior; documented in docs/runs.md. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/omnigraph/src/loader/mod.rs | 369 ++++++++++++++++++++++++----- 1 file changed, 306 insertions(+), 63 deletions(-) diff --git a/crates/omnigraph/src/loader/mod.rs b/crates/omnigraph/src/loader/mod.rs index 405f560..d05a315 100644 --- a/crates/omnigraph/src/loader/mod.rs +++ b/crates/omnigraph/src/loader/mod.rs @@ -21,6 +21,7 @@ use serde_json::Value as JsonValue; use crate::db::Omnigraph; use crate::error::{OmniError, Result}; +use crate::exec::staging::{MutationStaging, PendingMode}; /// Result of a load operation. #[derive(Debug, Clone, Default)] @@ -304,21 +305,29 @@ async fn load_jsonl_reader( } } - // Phase 2: Build per-type RecordBatches and write to Lance. - // - // Writes to different tables are independent in Lance (each table has its - // own manifest + fragments), so we parallelize across types with a bounded - // concurrency limit. Serial writes against S3 were the dominant cost of - // load — batching and parallelizing per-type cuts wall time by roughly - // `LOAD_WRITE_CONCURRENCY`× for wide schemas (see MR-677). + // Phase 2: Build per-type RecordBatches and accumulate into the + // staging pipeline. For Append/Merge, batches go into an in-memory + // accumulator and a single `stage_*` + `commit_staged` per touched + // table runs at end-of-load — a mid-load failure (RI / cardinality + // violation) leaves Lance HEAD untouched. For Overwrite, the legacy + // inline-commit path is preserved (truncate+append doesn't fit the + // staged shape cleanly, and overwrite has no in-flight read-your-writes + // requirement). - let mut updates = Vec::new(); let mut result = LoadResult::default(); let snapshot = db.snapshot_for_branch(branch).await?; - // Capture per-table manifest versions before any write so the publisher - // CAS at commit-time can detect concurrent writers landing between our - // read snapshot and our publish. - let mut expected_table_versions: HashMap = HashMap::new(); + let use_staging = !matches!(mode, LoadMode::Overwrite); + let mut staging = MutationStaging::default(); + let mut overwrite_updates: Vec = Vec::new(); + let mut overwrite_expected: HashMap = HashMap::new(); + let pending_mode = match mode { + LoadMode::Merge => PendingMode::Merge, + // Append-mode loads accumulate as Append. Edge tables (no @key) + // and no-key node tables stay safe on the stage_append path. The + // Merge mode applies dedupe-by-id; Append assumes unique inputs. + LoadMode::Append => PendingMode::Append, + LoadMode::Overwrite => PendingMode::Append, // unused + }; // Phase 2a: build and validate every node batch up front. Cheap and // synchronous — surfaces validation errors before any S3 traffic. @@ -338,46 +347,89 @@ async fn load_jsonl_reader( let entry = snapshot .entry(&table_key) .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; - expected_table_versions.insert(table_key.clone(), entry.table_version); + if !use_staging { + overwrite_expected.insert(table_key.clone(), entry.table_version); + } prepared_nodes.push((type_name.clone(), table_key, batch, loaded_count)); } - // Phase 2b: write every node type concurrently, bounded. - let node_write_results = write_batches_concurrently(db, branch, mode, prepared_nodes).await?; - - for (type_name, table_key, loaded_count, state, table_branch) in node_write_results { - updates.push(crate::db::SubTableUpdate { - table_key, - table_version: state.version, - table_branch, - row_count: state.row_count, - version_metadata: state.version_metadata, - }); - result.nodes_loaded.insert(type_name, loaded_count); + // Phase 2b: write every node type. Append/Merge → in-memory + // accumulator. Overwrite → concurrent inline-commit (legacy path). + if use_staging { + for (type_name, table_key, batch, loaded_count) in prepared_nodes { + let (ds, full_path, table_branch) = db + .open_for_mutation_on_branch(branch, &table_key) + .await?; + let expected_version = ds.version().version; + staging.ensure_path( + &table_key, + full_path, + table_branch, + expected_version, + ); + let schema = batch.schema(); + staging.append_batch(&table_key, schema, pending_mode, batch)?; + result.nodes_loaded.insert(type_name, loaded_count); + } + } else { + let node_write_results = + write_batches_concurrently(db, branch, mode, prepared_nodes).await?; + for (type_name, table_key, loaded_count, state, table_branch) in node_write_results { + overwrite_updates.push(crate::db::SubTableUpdate { + table_key, + table_version: state.version, + table_branch, + row_count: state.row_count, + version_metadata: state.version_metadata, + }); + result.nodes_loaded.insert(type_name, loaded_count); + } } - // Phase 2b: Validate edge referential integrity — every src/dst must - // reference an existing node ID in the appropriate type. + // Phase 2c: Validate edge referential integrity — every src/dst must + // reference an existing node ID in the appropriate type. For staged + // loads, the lookup unions snapshot-committed IDs with the in-memory + // pending batches (which carry the just-staged node inserts). for (edge_name, rows) in &edge_rows { let edge_type = &catalog.edge_types[edge_name]; - let from_ids = collect_node_ids( - db, - branch, - &edge_type.from_type, - &node_rows, - &catalog, - &updates, - ) - .await?; - let to_ids = collect_node_ids( - db, - branch, - &edge_type.to_type, - &node_rows, - &catalog, - &updates, - ) - .await?; + let from_ids = if use_staging { + collect_node_ids_with_pending( + db, + branch, + &edge_type.from_type, + &staging, + ) + .await? + } else { + collect_node_ids( + db, + branch, + &edge_type.from_type, + &node_rows, + &catalog, + &overwrite_updates, + ) + .await? + }; + let to_ids = if use_staging { + collect_node_ids_with_pending( + db, + branch, + &edge_type.to_type, + &staging, + ) + .await? + } else { + collect_node_ids( + db, + branch, + &edge_type.to_type, + &node_rows, + &catalog, + &overwrite_updates, + ) + .await? + }; for (i, (src, dst, _)) in rows.iter().enumerate() { if !from_ids.contains(src.as_str()) { @@ -401,7 +453,7 @@ async fn load_jsonl_reader( } } - // Write edges (parallel per edge type, same pattern as nodes) + // Phase 2d: build edge batches. let mut prepared_edges: Vec<(String, String, RecordBatch, usize)> = Vec::with_capacity(edge_rows.len()); for (edge_name, rows) in &edge_rows { @@ -417,29 +469,61 @@ async fn load_jsonl_reader( let entry = snapshot .entry(&table_key) .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; - expected_table_versions.insert(table_key.clone(), entry.table_version); + if !use_staging { + overwrite_expected.insert(table_key.clone(), entry.table_version); + } prepared_edges.push((edge_name.clone(), table_key, batch, loaded_count)); } - let edge_write_results = write_batches_concurrently(db, branch, mode, prepared_edges).await?; - - for (edge_name, table_key, loaded_count, state, table_branch) in edge_write_results { - updates.push(crate::db::SubTableUpdate { - table_key, - table_version: state.version, - table_branch, - row_count: state.row_count, - version_metadata: state.version_metadata, - }); - result.edges_loaded.insert(edge_name, loaded_count); + // Phase 2e: write every edge type. Same dispatch as Phase 2b. + if use_staging { + for (edge_name, table_key, batch, loaded_count) in prepared_edges { + let (ds, full_path, table_branch) = db + .open_for_mutation_on_branch(branch, &table_key) + .await?; + let expected_version = ds.version().version; + staging.ensure_path( + &table_key, + full_path, + table_branch, + expected_version, + ); + let schema = batch.schema(); + staging.append_batch(&table_key, schema, pending_mode, batch)?; + result.edges_loaded.insert(edge_name, loaded_count); + } + } else { + let edge_write_results = + write_batches_concurrently(db, branch, mode, prepared_edges).await?; + for (edge_name, table_key, loaded_count, state, table_branch) in edge_write_results { + overwrite_updates.push(crate::db::SubTableUpdate { + table_key, + table_version: state.version, + table_branch, + row_count: state.row_count, + version_metadata: state.version_metadata, + }); + result.edges_loaded.insert(edge_name, loaded_count); + } } - // Phase 3: Validate edge cardinality constraints (before commit — invalid - // data must not be committed). Opens edge sub-tables at their just-written - // versions, not through the snapshot (which still pins to pre-write state). + // Phase 3: Validate edge cardinality constraints (before commit — + // invalid data must not be committed). Staged path scans committed + // edges via Lance + iterates pending edges in-memory. Overwrite path + // opens the just-written version (legacy behavior). for (edge_name, _) in &edge_rows { + let edge_type = &catalog.edge_types[edge_name]; let table_key = format!("edge:{}", edge_name); - if let Some(update) = updates.iter().find(|u| u.table_key == table_key) { + if use_staging { + validate_edge_cardinality_with_pending_loader( + db, + branch, + edge_type, + &table_key, + &staging, + ) + .await?; + } else if let Some(update) = overwrite_updates.iter().find(|u| u.table_key == table_key) { validate_edge_cardinality( db, branch, @@ -452,8 +536,18 @@ async fn load_jsonl_reader( } // Phase 4: Atomic manifest commit with publisher-level OCC. - db.commit_updates_on_branch_with_expected(branch, &updates, &expected_table_versions) + if use_staging { + let (updates, expected_versions) = staging.finalize(db, branch).await?; + db.commit_updates_on_branch_with_expected(branch, &updates, &expected_versions) + .await?; + } else { + db.commit_updates_on_branch_with_expected( + branch, + &overwrite_updates, + &overwrite_expected, + ) .await?; + } Ok(result) } @@ -1456,6 +1550,155 @@ pub(crate) async fn validate_edge_cardinality( Ok(()) } +/// Validate edge `@card` cardinality with in-memory pending edges visible. +/// +/// Loader-level analog to `exec::mutation::validate_edge_cardinality_with_pending`: +/// scans committed edges via Lance and unions counts with the pending +/// edge batches accumulated by the staged loader. Used by Append/Merge +/// loads (the Overwrite path uses `validate_edge_cardinality` which +/// opens the just-written Lance version). +async fn validate_edge_cardinality_with_pending_loader( + db: &Omnigraph, + branch: Option<&str>, + edge_type: &omnigraph_compiler::catalog::EdgeType, + table_key: &str, + staging: &MutationStaging, +) -> Result<()> { + if edge_type.cardinality.is_default() { + return Ok(()); + } + + let mut counts: HashMap = HashMap::new(); + + // Committed side: open at pre-write version (the snapshot pinned at + // load entry; pending writes haven't committed yet). + let snapshot = db.snapshot_for_branch(branch).await?; + if let Some(entry) = snapshot.entry(table_key) { + let ds = db + .open_dataset_at_state( + &entry.table_path, + entry.table_branch.as_deref(), + entry.table_version, + ) + .await?; + let batches = db + .table_store() + .scan(&ds, Some(&["src"]), None, None) + .await?; + for batch in &batches { + let srcs = batch + .column_by_name("src") + .ok_or_else(|| OmniError::Lance("missing 'src' column".into()))? + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("'src' column is not Utf8".into()))?; + for i in 0..srcs.len() { + if srcs.is_valid(i) { + *counts.entry(srcs.value(i).to_string()).or_insert(0) += 1; + } + } + } + } + + // Pending side: walk in-memory pending batches for `src`. + for batch in staging.pending_batches(table_key) { + let Some(col) = batch.column_by_name("src") else { + continue; + }; + let Some(srcs) = col.as_any().downcast_ref::() else { + continue; + }; + for i in 0..srcs.len() { + if srcs.is_valid(i) { + *counts.entry(srcs.value(i).to_string()).or_insert(0) += 1; + } + } + } + + let card = &edge_type.cardinality; + for (src, count) in &counts { + if let Some(max) = card.max { + if *count > max { + return Err(OmniError::manifest(format!( + "@card violation on edge {}: source '{}' has {} edges (max {})", + edge_type.name, src, count, max + ))); + } + } + if *count < card.min { + return Err(OmniError::manifest(format!( + "@card violation on edge {}: source '{}' has {} edges (min {})", + edge_type.name, src, count, card.min + ))); + } + } + + Ok(()) +} + +/// Collect all valid node IDs for a given type, with in-memory pending +/// node inserts visible. Used by the staged loader's Phase 2c +/// referential-integrity validation. +/// +/// Union of: +/// - IDs from the staged loader's pending batches (in-memory; just-staged +/// inserts of this type) +/// - IDs from the committed sub-table at the pre-load snapshot version +async fn collect_node_ids_with_pending( + db: &Omnigraph, + branch: Option<&str>, + type_name: &str, + staging: &MutationStaging, +) -> Result> { + let mut ids = HashSet::new(); + let table_key = format!("node:{}", type_name); + + // From staging.pending: walk the in-memory accumulator's id column. + for batch in staging.pending_batches(&table_key) { + if let Some(col) = batch.column_by_name("id") { + if let Some(arr) = col.as_any().downcast_ref::() { + for i in 0..arr.len() { + if arr.is_valid(i) { + ids.insert(arr.value(i).to_string()); + } + } + } + } + } + + // From the committed Lance sub-table at the pre-load snapshot version. + let snapshot = db.snapshot_for_branch(branch).await?; + let Some(entry) = snapshot.entry(&table_key) else { + return Ok(ids); + }; + let ds = db + .open_dataset_at_state( + &entry.table_path, + entry.table_branch.as_deref(), + entry.table_version, + ) + .await?; + + let batches = db + .table_store() + .scan(&ds, Some(&["id"]), None, None) + .await?; + + for batch in &batches { + let id_col = batch + .column_by_name("id") + .ok_or_else(|| OmniError::Lance("missing 'id' column".into()))? + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("'id' column is not Utf8".into()))?; + for i in 0..batch.num_rows() { + ids.insert(id_col.value(i).to_string()); + } + } + + Ok(ids) +} + /// Collect all valid node IDs for a given type. Union of: /// - IDs from the just-loaded batch (in memory, from node_rows) /// - IDs from the sub-table at the just-written version (if it was updated) From 82350bdc4a82debf6155b922bac2080b18eff39f Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Fri, 1 May 2026 10:43:00 +0200 Subject: [PATCH 04/10] =?UTF-8?q?MR-794=20step=202:=20tests=20=E2=80=94=20?= =?UTF-8?q?flip=20partial=5Ffailure=20+=20add=20coalesce/D=E2=82=82/load?= =?UTF-8?q?=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Flip partial_failure_observably_rolls_back_but_blocks_next_mutation_on_same_table → partial_failure_leaves_target_queryable_and_unblocks_next_mutation. Asserts the next mutation succeeds (no drift) instead of expecting the legacy ExpectedVersionMismatch. * New: mutation_rejects_mixed_insert_and_delete_at_parse_time — D₂ check fires before any I/O. * New: mixed_insert_and_update_on_same_person_coalesces_to_one_merge — verifies dedupe-by-id last-write-wins in MutationStaging::finalize. * New: multiple_appends_to_same_edge_coalesce_to_one_append — two-statement edge insert publishes exactly once. * New: multi_statement_inserts_publish_exactly_once — manifest version advances by exactly 1 per multi-statement query. * New: load_with_bad_edge_reference_unblocks_next_load — RI violation aborts before publish; next load succeeds. * New: load_with_cardinality_violation_unblocks_next_load — uses a custom WorksAt @card(0..1) schema; cardinality violation aborts before publish; next load succeeds. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/omnigraph/tests/runs.rs | 436 ++++++++++++++++++++++++++++++--- 1 file changed, 401 insertions(+), 35 deletions(-) diff --git a/crates/omnigraph/tests/runs.rs b/crates/omnigraph/tests/runs.rs index c83d914..2e9b480 100644 --- a/crates/omnigraph/tests/runs.rs +++ b/crates/omnigraph/tests/runs.rs @@ -14,6 +14,7 @@ mod helpers; +use arrow_array::Array; use omnigraph::db::commit_graph::CommitGraph; use omnigraph::db::{Omnigraph, ReadTarget}; use omnigraph::error::{ManifestConflictDetails, ManifestErrorKind, OmniError}; @@ -160,30 +161,27 @@ async fn multi_statement_mutation_is_atomic_with_read_your_writes() { assert_eq!(friends.num_rows(), 1); } -/// Mid-query partial failure: op-1 writes a Lance fragment, op-2 fails. -/// Documents the *current* observable behavior — not the desired one. The -/// publisher never publishes (good — no manifest commit, target state -/// unchanged), but Lance HEAD on the touched table is now ahead of the -/// manifest-recorded version. The next mutation against that table fails -/// loudly with `ExpectedVersionMismatch` (the engine's -/// `ensure_expected_version` strict-equality check refuses the drift). +/// Mid-query partial failure: op-1 stages a Person insert, op-2 fails on +/// referential integrity (validate_edge_insert_endpoints). Under the +/// MR-794 staged-write rewire, op-1's batch lives in the in-memory +/// accumulator and never reaches Lance — Lance HEAD on `node:Person` +/// stays at the pre-mutation version. The publisher never publishes, +/// the manifest never advances, and the next mutation against the same +/// table proceeds normally (no `ExpectedVersionMismatch`). /// -/// **Known limitation, MR-771 follow-up**: a proper rollback requires -/// per-table Lance branches (write to a transient branch, fast-forward -/// main on success, drop on failure). Lance's `restore()` is not a rewind -/// — it appends a new commit, advancing HEAD further. See `docs/runs.md` -/// for the workaround (rare in practice: most validation runs before any -/// Lance write, so this only fires on multi-statement queries where a -/// late op fails after an earlier op committed). +/// This test pins the post-MR-794 contract: +/// - Failed multi-statement mutation surfaces a clear error, no manifest +/// commit, no observable state change. +/// - The touched tables stay queryable and writable from the next +/// query — Lance HEAD has not drifted. #[tokio::test] -async fn partial_failure_observably_rolls_back_but_blocks_next_mutation_on_same_table() { +async fn partial_failure_leaves_target_queryable_and_unblocks_next_mutation() { let dir = tempfile::tempdir().unwrap(); let mut db = init_and_load(&dir).await; - // Op-1 inserts Person "Eve" successfully (advancing Lance HEAD on - // node:Person). Op-2 inserts Knows from Eve to "Missing" — fails at - // validate_edge_insert_endpoints because "Missing" doesn't exist. - // The query as a whole errors; the publisher never runs. + // Op-1 stages a Person 'Eve' insert. Op-2 attempts an edge to + // 'Missing' — fails at validate_edge_insert_endpoints because + // 'Missing' doesn't exist (and isn't pending). let err = db .mutate( "main", @@ -205,10 +203,9 @@ async fn partial_failure_observably_rolls_back_but_blocks_next_mutation_on_same_ manifest_err.message, ); - // Atomicity at the manifest level: Eve is *not* observable. The Lance - // fragment from op-1 exists on disk but is not referenced by the - // manifest; readers at the manifest's pinned version see the - // pre-mutation state. + // Atomicity at the manifest level: Eve is *not* observable. The + // staged batch never reached Lance, so neither the Lance HEAD nor + // the manifest moved. let eve = db .query( ReadTarget::branch("main"), @@ -218,11 +215,12 @@ async fn partial_failure_observably_rolls_back_but_blocks_next_mutation_on_same_ ) .await .unwrap(); - assert_eq!(eve.num_rows(), 0, "partial Lance write must not be visible"); + assert_eq!(eve.num_rows(), 0, "partial mutation must not be visible"); - // The next mutation against the *same* table fails loudly. Other - // tables are unaffected, and reads still work. - let blocked = db + // The next mutation against the same table SUCCEEDS — staged writes + // never advance Lance HEAD on a failed query, so there is no drift + // to trip the publisher's CAS. + let result = db .mutate( "main", MUTATION_QUERIES, @@ -230,14 +228,23 @@ async fn partial_failure_observably_rolls_back_but_blocks_next_mutation_on_same_ &mixed_params(&[("$name", "Frank")], &[("$age", 33)]), ) .await - .expect_err("next mutation on the touched table is blocked by orphan Lance HEAD"); - let OmniError::Manifest(blocked_err) = blocked else { - panic!("expected Manifest error, got {blocked:?}"); - }; - assert!(matches!( - blocked_err.details, - Some(omnigraph::error::ManifestConflictDetails::ExpectedVersionMismatch { .. }) - )); + .expect("next mutation on the touched table must succeed under MR-794"); + assert_eq!( + result.affected_nodes, 1, + "follow-up insert should report 1 affected node" + ); + + // And Frank is observable. + let frank = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Frank")]), + ) + .await + .unwrap(); + assert_eq!(frank.num_rows(), 1, "Frank must be visible after publish"); } /// Concurrent writers to the same `(table, branch)` produce exactly one @@ -472,3 +479,362 @@ async fn public_branch_apis_reject_internal_run_refs() { err.message ); } + +// ─── MR-794: staged-write rewire — additional contract tests ──────────────── + +/// Mutation queries used only by the MR-794 tests below. Kept in the test +/// file (not in helpers' shared `MUTATION_QUERIES`) to keep their scope +/// local to the staged-write coverage. +const STAGED_QUERIES: &str = r#" +query insert_two_persons($a_name: String, $a_age: I32, $b_name: String, $b_age: I32) { + insert Person { name: $a_name, age: $a_age } + insert Person { name: $b_name, age: $b_age } +} + +query insert_then_update_same_person( + $name: String, $insert_age: I32, $update_age: I32 +) { + insert Person { name: $name, age: $insert_age } + update Person set { age: $update_age } where name = $name +} + +query insert_two_friends($from: String, $a: String, $b: String) { + insert Knows { from: $from, to: $a } + insert Knows { from: $from, to: $b } +} + +query mixed_insert_and_delete($name: String, $age: I32, $victim: String) { + insert Person { name: $name, age: $age } + delete Person where name = $victim +} +"#; + +/// D₂: a query mixing inserts/updates with deletes is rejected at parse +/// time, BEFORE any I/O. The error shape directs the user to split the +/// query into two mutations. +#[tokio::test] +async fn mutation_rejects_mixed_insert_and_delete_at_parse_time() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // Capture pre-mutation state on touched tables to confirm no I/O. + let persons_before = count_rows(&db, "node:Person").await; + + let err = db + .mutate( + "main", + STAGED_QUERIES, + "mixed_insert_and_delete", + &mixed_params( + &[("$name", "Eve"), ("$victim", "Alice")], + &[("$age", 22)], + ), + ) + .await + .expect_err("D₂ must reject mixed insert+delete"); + let OmniError::Manifest(manifest_err) = err else { + panic!("expected Manifest error, got {err:?}"); + }; + assert!( + manifest_err.message.contains("inserts/updates and deletes"), + "unexpected error message: {}", + manifest_err.message, + ); + assert!( + manifest_err.message.contains("split into separate mutations"), + "error message should direct user to split: {}", + manifest_err.message, + ); + + // No I/O — counts unchanged, branches unchanged. + let persons_after = count_rows(&db, "node:Person").await; + assert_eq!( + persons_before, persons_after, + "D₂ rejection must fire before any write", + ); + assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]); +} + +/// `insert Person 'X'; update Person where name='X' set age=...` — both +/// ops produce content on `node:Person` and coalesce into one +/// `stage_merge_insert` at end-of-query. The accumulator's last-write-wins +/// dedupe (in `MutationStaging::finalize`) ensures the update's value +/// wins. Single Lance commit per table per query. +#[tokio::test] +async fn mixed_insert_and_update_on_same_person_coalesces_to_one_merge() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let pre_version = version_main(&db).await.unwrap(); + + let result = db + .mutate( + "main", + STAGED_QUERIES, + "insert_then_update_same_person", + &mixed_params( + &[("$name", "Yves")], + &[("$insert_age", 10), ("$update_age", 99)], + ), + ) + .await + .unwrap(); + assert_eq!(result.affected_nodes, 2, "1 insert + 1 update reported"); + + // The end-state row carries the update value (last-write-wins via + // dedupe in finalize), proving the staged merge_insert ran with the + // correct source dedupe. Read the underlying Person table directly + // and assert age=99 for the row we just inserted+updated. + let batches = read_table(&db, "node:Person").await; + let mut found_age: Option = None; + for batch in &batches { + let names = batch + .column_by_name("name") + .expect("Person table missing 'name' column") + .as_any() + .downcast_ref::() + .expect("'name' should be Utf8"); + let ages = batch + .column_by_name("age") + .expect("Person table missing 'age' column") + .as_any() + .downcast_ref::() + .expect("'age' should be I32"); + for i in 0..batch.num_rows() { + if names.is_valid(i) && names.value(i) == "Yves" { + if ages.is_valid(i) { + found_age = Some(ages.value(i)); + } + } + } + } + assert_eq!( + found_age, + Some(99), + "dedupe must keep the update's age value, not the insert's", + ); + + // One-publish guarantee: manifest version advanced by exactly 1. + let post_version = version_main(&db).await.unwrap(); + assert_eq!( + post_version, + pre_version + 1, + "insert+update query must publish exactly once", + ); +} + +/// `insert Knows from='Alice' to='Bob'; insert Knows from='Alice' to='Eve'` +/// — both append to `edge:Knows`. The accumulator coalesces them into one +/// `stage_append` at end-of-query. Edge IDs are ULID-generated so no +/// dedupe is needed (Append mode). +#[tokio::test] +async fn multiple_appends_to_same_edge_coalesce_to_one_append() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // Add Eve so the second edge has a valid endpoint. + db.mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let edges_before = count_rows(&db, "edge:Knows").await; + let pre_version = version_main(&db).await.unwrap(); + + let result = db + .mutate( + "main", + STAGED_QUERIES, + "insert_two_friends", + ¶ms(&[ + ("$from", "Alice"), + ("$a", "Bob"), + ("$b", "Eve"), + ]), + ) + .await + .unwrap(); + assert_eq!(result.affected_edges, 2); + + // Both edges visible. + let edges_after = count_rows(&db, "edge:Knows").await; + assert_eq!(edges_after, edges_before + 2); + + // One manifest version bump for the two-edge query (atomic publish). + let post_version = version_main(&db).await.unwrap(); + assert_eq!( + post_version, + pre_version + 1, + "two-statement edge insert must publish exactly once", + ); +} + +/// A multi-statement insert query touching two Person rows produces a +/// single `stage_*` + `commit_staged` per table — verified by checking +/// that the manifest version advances exactly once across the query. +#[tokio::test] +async fn multi_statement_inserts_publish_exactly_once() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let pre_version = version_main(&db).await.unwrap(); + + db.mutate( + "main", + STAGED_QUERIES, + "insert_two_persons", + &mixed_params( + &[("$a_name", "Owen"), ("$b_name", "Pat")], + &[("$a_age", 50), ("$b_age", 51)], + ), + ) + .await + .unwrap(); + + let post_version = version_main(&db).await.unwrap(); + assert_eq!( + post_version, + pre_version + 1, + "two-statement insert query must publish exactly once", + ); + + // Both rows visible. + let owen = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Owen")]), + ) + .await + .unwrap(); + assert_eq!(owen.num_rows(), 1); + let pat = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Pat")]), + ) + .await + .unwrap(); + assert_eq!(pat.num_rows(), 1); +} + +/// A load with a mid-input edge RI violation must leave Lance HEAD on +/// the touched node tables untouched (staged loader never commits any +/// fragment when the load fails). The next load on the same tables +/// succeeds — no `ExpectedVersionMismatch` from drift. +#[tokio::test] +async fn load_with_bad_edge_reference_unblocks_next_load() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + // Seed with the standard fixture so we're working from a non-empty + // baseline. + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + + let pre_persons = count_rows(&db, "node:Person").await; + let pre_edges = count_rows(&db, "edge:Knows").await; + + // First load: append a Person + an edge whose `to` points to a + // non-existent Person. RI fails AFTER the staged Person is in the + // accumulator but BEFORE the publish. + let bad = r#"{"type": "Person", "data": {"name": "Mallory", "age": 5}} +{"edge": "Knows", "from": "Mallory", "to": "Ghost"} +"#; + let err = load_jsonl(&mut db, bad, LoadMode::Append) + .await + .expect_err("RI violation must fail the load"); + let OmniError::Manifest(manifest_err) = err else { + panic!("expected Manifest error, got {err:?}"); + }; + assert!( + manifest_err.message.contains("not found"), + "unexpected error: {}", + manifest_err.message, + ); + + // No write made it to disk: counts unchanged. + let mid_persons = count_rows(&db, "node:Person").await; + let mid_edges = count_rows(&db, "edge:Knows").await; + assert_eq!(mid_persons, pre_persons, "failed load must not advance Person count"); + assert_eq!(mid_edges, pre_edges, "failed load must not advance Knows count"); + + // Second load against the same tables — succeeds (no HEAD drift). + let good = r#"{"type": "Person", "data": {"name": "Pat", "age": 55}}"#; + load_jsonl(&mut db, good, LoadMode::Append).await.unwrap(); + assert_eq!( + count_rows(&db, "node:Person").await, + pre_persons + 1, + "follow-up load must succeed (no drift)", + ); +} + +/// Same shape as the RI test above, but driven by a cardinality +/// violation (`@card(0..1)` on `WorksAt`). The staged loader's pending +/// edge accumulator drives the cardinality scan; a violation aborts +/// the load before publish; the next load on the same tables succeeds. +#[tokio::test] +async fn load_with_cardinality_violation_unblocks_next_load() { + // Use a custom schema where WorksAt has a strict 0..1 cardinality + // bound — the default test schema leaves WorksAt unbounded. Seed + // Alice + two companies, then attempt two WorksAt edges from Alice, + // which violates the bound. + const CARD_SCHEMA: &str = r#" +node Person { + name: String @key + age: I32? +} +node Company { + name: String @key +} +edge WorksAt: Person -> Company @card(0..1) +"#; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, CARD_SCHEMA).await.unwrap(); + + let seed = r#"{"type": "Person", "data": {"name": "Alice", "age": 30}} +{"type": "Company", "data": {"name": "Acme"}} +{"type": "Company", "data": {"name": "Bigco"}} +"#; + load_jsonl(&mut db, seed, LoadMode::Overwrite).await.unwrap(); + + let pre_works = count_rows(&db, "edge:WorksAt").await; + + // Two WorksAt edges from Alice — exceeds @card(0..1). + let bad = r#"{"edge": "WorksAt", "from": "Alice", "to": "Acme"} +{"edge": "WorksAt", "from": "Alice", "to": "Bigco"} +"#; + let err = load_jsonl(&mut db, bad, LoadMode::Append) + .await + .expect_err("cardinality violation must fail the load"); + let OmniError::Manifest(manifest_err) = err else { + panic!("expected Manifest error, got {err:?}"); + }; + assert!( + manifest_err.message.contains("@card violation"), + "unexpected error: {}", + manifest_err.message, + ); + + // No edges added; next load on the same edge table succeeds. + let mid_works = count_rows(&db, "edge:WorksAt").await; + assert_eq!(mid_works, pre_works); + + let good = r#"{"edge": "WorksAt", "from": "Alice", "to": "Acme"}"#; + load_jsonl(&mut db, good, LoadMode::Append).await.unwrap(); + assert_eq!( + count_rows(&db, "edge:WorksAt").await, + pre_works + 1, + "follow-up load must succeed (no drift on edge table)", + ); +} From a61e82f47acdd49bf75a7d21a5e837e58dfdeee4 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Fri, 1 May 2026 10:43:19 +0200 Subject: [PATCH 05/10] =?UTF-8?q?MR-794=20step=202:=20docs=20=E2=80=94=20r?= =?UTF-8?q?uns/invariants/architecture/execution=20+=20cleanup?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refresh user-facing and agent-facing docs for the staged-write rewire and clean up stale Run-state-machine references that survived MR-771. MR-794-specific updates: * docs/runs.md — remove "Known limitation: mid-query partial failure" section; document the in-memory accumulator + D₂ rule + the LoadMode::Overwrite residual. * docs/invariants.md §VI.25 — flip from aspirational/open to upheld for inserts/updates. Within-query read-your-writes is now load-bearing for the publisher CAS contract. * docs/architecture.md — add "Mutation atomicity — in-memory accumulator (MR-794)" subsection with per-op flow; refresh the engine + state diagrams to drop RunRegistry and add MutationStaging. * docs/execution.md — rewrite the mutation flow sequence diagram for the staged-write path; updated the LoadMode table to call out per-mode commit semantics; rewrote load vs ingest. * docs/query-language.md — document the D₂ parse-time rule. * docs/errors.md — add the D₂ BadRequest rejection path. * docs/testing.md — extend the runs.rs row to cover the new MR-794 contract tests; add the staged_writes.rs row. * docs/releases/v0.4.1.md (new) — release note covering the rewire, test additions, residuals, and files changed. * AGENTS.md (CLAUDE.md symlink) — update the atomic-per-query description and the L2 capability matrix row. Stale-reference cleanup (MR-771 leftovers): * docs/storage.md — drop live _graph_runs.lance / _graph_run_actors.lance from the layout diagram and prose; mark legacy. * docs/branches-commits.md — move __run__ to a legacy note; remove publish_run from the publish-trigger list. * docs/audit.md — refresh _as API list (drop begin_run_as / publish_run_as); legacy RunRecord.actor_id moved to a historical note. * docs/constants.md — mark run registry / branch-prefix rows as legacy. * docs/cli.md — replace the legacy omnigraph run * quickstart block with omnigraph commit list/show. Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 4 +- docs/architecture.md | 53 +++++++++++++-- docs/audit.md | 5 +- docs/branches-commits.md | 4 +- docs/cli.md | 8 +-- docs/constants.md | 4 +- docs/errors.md | 1 + docs/execution.md | 109 ++++++++++++++++--------------- docs/invariants.md | 2 +- docs/query-language.md | 8 +++ docs/releases/v0.4.1.md | 138 +++++++++++++++++++++++++++++++++++++++ docs/runs.md | 118 ++++++++++++++++++++------------- docs/storage.md | 6 +- docs/testing.md | 3 +- 14 files changed, 342 insertions(+), 121 deletions(-) create mode 100644 docs/releases/v0.4.1.md diff --git a/AGENTS.md b/AGENTS.md index c851128..d347343 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -32,7 +32,7 @@ OmniGraph is a typed property-graph engine built as a coordination layer over ma - **Languages**: a `.pg` schema language and a `.gq` query language, both Pest-based, with a typed IR. - **Multi-modal querying**: vector ANN (`nearest`), full-text (`search`/`fuzzy`/`match_text`/`bm25`), Reciprocal Rank Fusion (`rrf`), and graph traversal (`Expand`, anti-join `not { … }`) in one runtime. - **Branches and commits across the whole graph**: Git-style — every successful publish appends to a commit DAG; merges are three-way at the row level. -- **Atomic per-query writes**: `mutate_as` and `load` capture per-table `expected_table_versions` before writing and call `ManifestBatchPublisher::publish` once at the end. Cross-table OCC enforced inside the publisher's row-level CAS; no staging branches, no run state machine. +- **Atomic per-query writes**: `mutate_as` and `load` accumulate insert/update batches into an in-memory `MutationStaging.pending` per touched table; one `stage_*` + `commit_staged` per table runs at end-of-query, then `ManifestBatchPublisher::publish` commits the manifest atomically with per-table `expected_table_versions` CAS. A mid-query failure leaves Lance HEAD untouched on staged tables — no drift, no run state machine, no staging branches. Deletes still inline-commit; D₂ at parse time prevents inserts/updates and deletes from coexisting in one query. - **HTTP server**: Axum + utoipa OpenAPI, bearer auth (SHA-256 hashed, optional AWS Secrets Manager), Cedar policy gating. - **CLI** driven by a single `omnigraph.yaml`; multi-format output (json/jsonl/csv/kv/table). @@ -211,7 +211,7 @@ omnigraph policy explain --actor act-alice --action change --branch main | Query language | — | `.gq` + Pest grammar + IR + lowering + linter | | Schema migration planning | — | `plan_schema_migration` + `apply_schema` step types + `__schema_apply_lock__` | | Commit graph (DAG) across whole repo | — | `_graph_commits.lance` with linear + merge parents, ULID ids, actor map | -| Per-query atomic writes | — | `MutationStaging` accumulator + `commit_with_expected` publisher CAS, single commit per `mutate_as` / `load` | +| Per-query atomic writes | — | In-memory `MutationStaging.pending` accumulator + `stage_*` / `commit_staged` per touched table at end-of-query + publisher CAS via `commit_with_expected` (single manifest commit per `mutate_as` / `load`); D₂ parse-time rule keeps inserts/updates and deletes from mixing | | Three-way row-level merge | — | `OrderedTableCursor` + `StagedTableWriter`, structured `MergeConflictKind` | | Change feeds | — | `diff_between` / `diff_commits` with manifest fast path + ID streaming | | Cedar policy | — | 8 actions, branch / target_branch / protected scopes, validate/test/explain CLI | diff --git a/docs/architecture.md b/docs/architecture.md index 5bd252e..0357b5d 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -63,7 +63,7 @@ flowchart TB subgraph engine[omnigraph engine] plan[exec query and mutation]:::l2 gi[graph index CSR/CSC
RuntimeCache LRU 8]:::l2 - coord[coordinator
ManifestRepo · CommitGraph · RunRegistry]:::l2 + coord[coordinator
ManifestRepo · CommitGraph]:::l2 end subgraph storage[storage trait — wraps Lance] @@ -134,7 +134,7 @@ flowchart TB coord[GraphCoordinator]:::l2 mr[ManifestRepo
db/manifest.rs]:::l2 cg[CommitGraph
_graph_commits.lance]:::l2 - rr[RunRegistry
_graph_runs.lance]:::l2 + stg[MutationStaging
per-query in-memory accumulator
exec/staging.rs]:::l2 end subgraph idx[graph index] @@ -149,17 +149,18 @@ flowchart TB eq --> gi eq --> ts + em --> stg em --> ts + ld --> stg ld --> ts eq --> mr em --> mr coord --> mr coord --> cg - coord --> rr ts --> st ``` -The engine binds the compiler IR to Lance. It owns multi-dataset coordination, the graph topology index, the run registry, and the snapshot/manifest read path. +The engine binds the compiler IR to Lance. It owns multi-dataset coordination, the graph topology index, the per-query staging accumulator, and the snapshot/manifest read path. Code paths: @@ -169,6 +170,46 @@ Code paths: - Graph index: `crates/omnigraph/src/graph_index/` - Loader: `Omnigraph::ingest` at `crates/omnigraph/src/loader/mod.rs:74` +### Mutation atomicity — in-memory accumulator (MR-794) + +Inserts and updates inside `mutate_as` and the bulk loader's +Append/Merge modes go through `MutationStaging` +([`crates/omnigraph/src/exec/staging.rs`](../crates/omnigraph/src/exec/staging.rs)), +a per-query in-memory accumulator. No Lance HEAD advance happens during +op execution; one `stage_*` + `commit_staged` per touched table runs +at end-of-query, then the publisher commits the manifest atomically. + +``` +op-1 (insert/update) → push RecordBatch → MutationStaging.pending[table] +op-2 (insert/update) → read committed via Lance + pending via DataFusion + MemTable (read-your-writes) → push batch +op-N → push batch +─── end of query ─────────────────────────────────────── +finalize: per pending table: + concat batches → stage_append OR stage_merge_insert → commit_staged +publisher: ManifestBatchPublisher::publish (one cross-table CAS) +``` + +A failed op leaves Lance HEAD untouched on the staged tables: the next +mutation proceeds normally with no drift to reconcile. Concrete +contracts: + +- `D₂` parse-time rule: a query is either insert/update-only or + delete-only. Mixed → reject. Deletes still inline-commit (Lance + 4.0.0 has no public two-phase delete); D₂ keeps the inline path safe. +- `LoadMode::Overwrite` keeps the inline-commit path + (truncate-then-append doesn't fit the staged shape; overwrite has no + in-flight read-your-writes requirement). +- Read sites consume `TableStore::scan_with_pending`, which Lance-scans + the committed snapshot at the captured `expected_version` and unions + with a DataFusion `MemTable` over the pending batches. + +This pattern realizes [docs/invariants.md §VI.25](invariants.md) +(read-your-writes within a multi-statement mutation) and §VI.32 +(failure scope bounded) for inserts/updates by construction at the +writer layer. See [docs/runs.md](runs.md) for the publisher CAS +contract this builds on. + ### Storage trait — today vs. roadmap ```mermaid @@ -256,13 +297,13 @@ Throughout the docs, capabilities are split into: - **MVCC**: every Lance write bumps a per-dataset version; the OmniGraph manifest version coordinates which sub-table versions are visible together. - **Snapshot isolation**: a query holds one `Snapshot` for its lifetime; concurrent writes don't leak in. - **Cross-branch isolation**: copy-on-write means readers and writers on different branches don't block each other. -- **Run isolation**: each transactional run lives on its own `__run__` branch. +- **Per-query staging**: `mutate_as` and `load` (Append/Merge) accumulate insert/update batches in an in-memory `MutationStaging`; one `stage_*` + `commit_staged` per touched table runs at end-of-query, then the publisher commits the manifest atomically. A mid-query failure leaves Lance HEAD untouched on staged tables. (MR-794; pre-v0.4.0 used a `__run__` staging branch + Run state machine, removed in MR-771.) - **Schema-apply lock**: `__schema_apply_lock__` system branch serializes schema migrations. - **Fail-points** (`failpoints` cargo feature): `failpoints::maybe_fail("operation.step")?` in `branch_create`, publish, etc., for deterministic failure injection in tests. ## Workspace crates - `omnigraph-compiler` — schema and query grammars, catalog, IR, lowering, type checker, lint, migration planner, OpenAI-style embedding client. -- `omnigraph` (engine, published as `omnigraph-engine` on crates.io since v0.2.2) — the Lance-backed runtime: manifest, commit graph, run registry, snapshot, exec, merge, loader, Gemini embedding client. +- `omnigraph` (engine, published as `omnigraph-engine` on crates.io since v0.2.2) — the Lance-backed runtime: manifest, commit graph, snapshot, exec (incl. per-query `MutationStaging` accumulator), merge, loader, Gemini embedding client. - `omnigraph-cli` — the `omnigraph` binary. - `omnigraph-server` — the `omnigraph-server` binary (Axum HTTP server). diff --git a/docs/audit.md b/docs/audit.md index 591c016..80ac137 100644 --- a/docs/audit.md +++ b/docs/audit.md @@ -1,6 +1,7 @@ # Audit / Actor tracking - `Omnigraph::audit_actor_id: Option` is the actor in effect. -- `_as` variants of every write API let callers override the actor: `begin_run_as`, `publish_run_as`, `ingest_as`, `mutate_as`, `branch_merge_as`, etc. -- Actor IDs are persisted both on `RunRecord.actor_id` and on `GraphCommit.actor_id`, with optional split storage in `_graph_commit_actors.lance` and `_graph_run_actors.lance`. +- `_as` variants of every write API let callers override the actor: `mutate_as`, `ingest_as`, `branch_merge_as`, `apply_schema_as`, etc. +- Actor IDs are persisted on `GraphCommit.actor_id` with split storage in `_graph_commit_actors.lance` (the commit graph is split into `_graph_commits.lance` for the linkage and `_graph_commit_actors.lance` for the actor map). - HTTP server uses the bearer-token actor automatically; CLI uses the local user / explicit env (no implicit actor). +- Pre-v0.4.0 repos also stored actor IDs on `RunRecord.actor_id` in `_graph_runs.lance` / `_graph_run_actors.lance`. The Run state machine was removed in MR-771; those files are inert post-v0.4.0 and reclaimed by MR-770's production sweep. diff --git a/docs/branches-commits.md b/docs/branches-commits.md index 2e8cca8..4501822 100644 --- a/docs/branches-commits.md +++ b/docs/branches-commits.md @@ -37,7 +37,7 @@ Storage is split across two Lance datasets (both with stable row IDs): Notes: -- Every successful publish (load / change / merge / schema_apply / publish_run) appends one commit. +- Every successful publish (load / change / merge / schema_apply) appends one commit. - Merge commits have two parents; linear commits have one. - API: `list_commits(branch)`, `get_commit(id)`, `head_commit_id_for_branch(branch)`. @@ -53,5 +53,5 @@ Notes: Filtered from `branch_list()` but visible to internals: -- `__run__` — ephemeral isolation branch for a transactional run. - `__schema_apply_lock__` — serializes schema migrations. +- `__run__` — legacy from the pre-v0.4.0 Run state machine (removed in MR-771). The branch-name guard predicate `is_internal_run_branch` is kept as defense-in-depth so users cannot create a branch matching the legacy prefix; the filter will be removed once production legacy branches are swept (MR-770). diff --git a/docs/cli.md b/docs/cli.md index 0634bea..ae8c152 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -56,12 +56,12 @@ omnigraph policy validate --config ./omnigraph.yaml omnigraph policy test --config ./omnigraph.yaml omnigraph policy explain --config ./omnigraph.yaml --actor act-alice --action read --branch main -omnigraph run list ./repo.omni --json -omnigraph run show --uri ./repo.omni --json -omnigraph run publish --uri ./repo.omni --json -omnigraph run abort --uri ./repo.omni --json +omnigraph commit list ./repo.omni --json +omnigraph commit show --uri ./repo.omni --json ``` +(The legacy `omnigraph run list/show/publish/abort` subcommands were removed in MR-771; mutations and loads publish atomically and the commit graph (`omnigraph commit list`) is the audit surface.) + `query lint` and `query check` are the same command surface. In v1, repo-backed lint uses local or `s3://` repo URIs; HTTP targets are only supported when you also pass `--schema`. diff --git a/docs/constants.md b/docs/constants.md index 198b77e..527aaea 100644 --- a/docs/constants.md +++ b/docs/constants.md @@ -4,8 +4,8 @@ |---|---|---| | `MANIFEST_DIR` | `__manifest` | `db/manifest/layout.rs` | | Commit graph dir | `_graph_commits.lance` | `db/commit_graph.rs` | -| Run registry dir | `_graph_runs.lance` | `db/run_registry.rs` | -| Run branch prefix | `__run__` | `db/run_registry.rs` | +| Run registry dir (legacy, removed MR-771) | `_graph_runs.lance` | inert post-v0.4.0; reclaimed by MR-770 | +| Run branch prefix (legacy, removed MR-771) | `__run__` | filtered by `is_internal_run_branch` defense-in-depth | | Schema apply lock | `__schema_apply_lock__` | `db/mod.rs` | | Manifest publisher retry budget | `PUBLISHER_RETRY_BUDGET = 5` | `db/manifest/publisher.rs` | | Internal manifest schema version | `INTERNAL_MANIFEST_SCHEMA_VERSION = 2` | `db/manifest/migrations.rs` | diff --git a/docs/errors.md b/docs/errors.md index 257ae4c..ad79e66 100644 --- a/docs/errors.md +++ b/docs/errors.md @@ -9,6 +9,7 @@ - `Manifest(ManifestError { kind: BadRequest|NotFound|Conflict|Internal, details: Option, … })` - `ManifestConflictDetails::ExpectedVersionMismatch { table_key, expected, actual }` — caller's `expected_table_versions` did not match the manifest's current latest non-tombstoned version (set by `OmniError::manifest_expected_version_mismatch`). - `ManifestConflictDetails::RowLevelCasContention` — Lance row-level CAS rejected the publish because a concurrent writer landed the same `object_id`. Retried internally by the publisher; only surfaces if the retry budget exhausts. + - **D₂ parse-time rejection** (MR-794): a single mutation query that mixes inserts/updates with deletes errors out *before any I/O* with kind `BadRequest`. Message: `mutation '' on the same query mixes inserts/updates and deletes; split into separate mutations: (1) inserts and updates, then (2) deletes`. See [docs/query-language.md](query-language.md) for the rule and [docs/runs.md](runs.md) for the underlying staged-write rationale. - `MergeConflicts(Vec)` Compiler-side `NanoError` covers parse / catalog / type / storage / plan / execution / arrow / lance / IO / manifest / unique-constraint, each with structured spans (`SourceSpan { start, end }`) for ariadne-style diagnostics. diff --git a/docs/execution.md b/docs/execution.md index 6bf55a5..bd4842c 100644 --- a/docs/execution.md +++ b/docs/execution.md @@ -79,13 +79,16 @@ Hybrid example: `order { rrf(nearest($d.embedding, $q), bm25($d.body, $q_text)) ## Mutation execution (`exec/mutation.rs`) -Resolves expression values to literals, converts to typed Arrow arrays (`literal_to_typed_array(lit, DataType, num_rows)`), then writes: +Resolves expression values to literals, converts to typed Arrow arrays (`literal_to_typed_array(lit, DataType, num_rows)`), then writes via Lance's two-phase distributed-write API at end-of-query: -- `insert` → Lance `WriteMode::Append` -- `update` → Lance `merge_insert(WhenMatched::Update)` -- `delete` → Lance `merge_insert(WhenMatched::Delete)` (logical) or filtered overwrite. +- `insert` (no `@key`, edges) → accumulate into `MutationStaging.pending` (Append mode); finalize calls `stage_append` once per touched table. +- `insert` (`@key` node) → accumulate into `pending` (Merge mode); finalize calls `stage_merge_insert` once per touched table. +- `update` → scan committed via Lance + pending via DataFusion `MemTable` (read-your-writes), apply assignments, accumulate into `pending` (Merge mode). +- `delete` → still inline-commits via `delete_where` (Lance 4.0.0 has no public two-phase delete); recorded into `MutationStaging.inline_committed`. -Multi-statement mutations are atomic at the manifest commit boundary. +**D₂ parse-time rule.** A single mutation query is either insert/update-only or delete-only. Mixed → reject before any I/O. The check fires in `enforce_no_mixed_destructive_constructive(&ir)` inside `execute_named_mutation`. + +Multi-statement mutations are atomic at the publisher commit boundary: every insert/update batch lives in memory until end-of-query, then exactly one `stage_*` + `commit_staged` runs per touched table, then `ManifestBatchPublisher::publish` commits the manifest atomically with per-table `expected_table_versions` CAS. ### Mutation flow — sequence @@ -93,57 +96,58 @@ Multi-statement mutations are atomic at the manifest commit boundary. sequenceDiagram autonumber participant client as Client - participant og as Omnigraph::mutate
(mutation.rs:511) + participant og as Omnigraph::mutate_as
(mutation.rs) participant cmp as omnigraph-compiler - participant runs as RunRegistry + participant stg as MutationStaging
(exec/staging.rs) participant ts as table_store participant lance as Lance dataset - participant mr as ManifestRepo
(manifest.rs:280) + participant pub as ManifestBatchPublisher - client->>og: mutate(target, source, name, params) - og->>cmp: parse + typecheck_query - cmp-->>og: CheckedQuery (Mutation IR) - og->>runs: begin_run(target, op_hash)
fork __run__ from target head - runs-->>og: RunRecord - loop for each mutation statement (on __run__) - og->>og: resolve expression literals
literal_to_typed_array(lit, type, n) - alt insert - og->>ts: append RecordBatches - ts->>lance: WriteMode::Append → new fragment(s) - else update - og->>ts: merge_insert keyed by id - ts->>lance: merge_insert(WhenMatched::Update) - else delete - og->>ts: merge_insert with delete predicate - ts->>lance: merge_insert(WhenMatched::Delete) + client->>og: mutate_as(branch, source, name, params, actor_id) + og->>cmp: parse + typecheck + lower_mutation_query + cmp-->>og: MutationIR + og->>og: enforce_no_mixed_destructive_constructive (D₂) + loop for each mutation op + og->>og: resolve literals + build batch + alt insert / update (accumulate) + og->>ts: open dataset @ pre-write version (first touch) + og->>stg: ensure_path + append_batch (PendingMode) + opt update — scan committed + pending + og->>ts: scan_with_pending (Lance + DataFusion MemTable union) + ts-->>og: matched batches + end + else delete (inline-commit, D₂ keeps separate) + og->>ts: delete_where (advances Lance HEAD) + og->>stg: record_inline (SubTableUpdate) end - lance-->>ts: new dataset version - og->>mr: commit_updates(SubTableUpdate)
per-statement commit on __run__ - mr-->>og: ack end - og->>og: OCC: target head unchanged since begin_run? - og->>og: publish_run(run_id) - alt fast path (target hasn't moved) - og->>mr: commit_updates_on_branch(target, updates)
promote run snapshot - else merge path (target advanced) - og->>og: branch_merge_internal(__run__, target)
three-way merge + og->>stg: finalize(db, branch) + loop per pending table + stg->>ts: stage_append OR stage_merge_insert (one per table) + ts-->>stg: StagedWrite (transaction + fragments) + stg->>ts: commit_staged (advances Lance HEAD) + ts-->>stg: new Dataset end - mr-->>og: new target snapshot - og->>runs: terminate_run(Published) + stg-->>og: (updates: Vec, expected_versions) + og->>pub: commit_updates_on_branch_with_expected + pub->>pub: publisher CAS (cross-table OCC on __manifest) + pub-->>og: new manifest version og-->>client: MutationResult ``` **Code paths:** -- Entry: `Omnigraph::mutate` at `crates/omnigraph/src/exec/mutation.rs:511` -- Per-mutation orchestration: `mutate_with_current_actor` at `crates/omnigraph/src/exec/mutation.rs:539` -- Per-statement commit on the run-branch: `commit_updates` (called from `execute_insert` / `execute_update` / `execute_delete` in `crates/omnigraph/src/exec/mutation.rs`) -- Run publish: `Omnigraph::publish_run` at `crates/omnigraph/src/db/omnigraph.rs:858` -- Manifest commit primitive: `ManifestRepo::commit` at `crates/omnigraph/src/db/manifest.rs:280` (called from both per-statement `commit_updates` and the publish path) +- Entry: `Omnigraph::mutate_as` at `crates/omnigraph/src/exec/mutation.rs` +- Per-mutation orchestration: `mutate_with_current_actor` at `crates/omnigraph/src/exec/mutation.rs` +- D₂ check: `enforce_no_mixed_destructive_constructive` (in the same file) +- Per-op execution: `execute_insert`, `execute_update`, `execute_delete_node`, `execute_delete_edge` +- Pending-aware reads: `TableStore::scan_with_pending` / `count_rows_with_pending` at `crates/omnigraph/src/table_store.rs` +- Edge cardinality with pending: `validate_edge_cardinality_with_pending` at `crates/omnigraph/src/exec/mutation.rs` +- Per-query accumulator: `crates/omnigraph/src/exec/staging.rs` (`MutationStaging`, `PendingTable`, `PendingMode`, `finalize`) +- End-of-query Lance commit: `TableStore::stage_append`, `stage_merge_insert`, `commit_staged` at `crates/omnigraph/src/table_store.rs` +- Manifest commit primitive: `commit_updates_on_branch_with_expected` at `crates/omnigraph/src/db/omnigraph/table_ops.rs` -Multi-statement mutations don't get atomicity from a single final `commit` — they get it from the **run-branch + publish_run** pattern. By default a mutation forks a fresh `__run__` branch (`begin_run`); each statement individually commits its sub-table updates to that run-branch. After all statements complete, an OCC pre-check verifies the target hasn't moved since the run started, then `publish_run` atomically promotes the run-branch into the target — either via the fast path (direct promotion if the target hasn't moved) or a three-way merge. That final publish is what gives multi-statement mutations their atomicity guarantee (per [`docs/invariants.md`](invariants.md) §VI.26). If anything fails mid-run, the run is failed and the run-branch is dropped without affecting the target. - -One exception: if the caller already targets a `__run__` branch (mutation.rs:555), the mutation runs directly on that branch with no nested run wrapping — the assumption is the caller is managing the surrounding run lifecycle themselves. See [runs.md](runs.md) for the full run lifecycle. +Atomicity guarantee for multi-statement mutations: a mid-query failure leaves Lance HEAD untouched on staged tables (no inline commit happened during op execution), so the next mutation proceeds normally with no `ExpectedVersionMismatch`. The publisher CAS at the very end either succeeds (manifest advances atomically across all touched sub-tables) or fails with a typed `ManifestConflictDetails::ExpectedVersionMismatch` (no partial publish). See [docs/invariants.md §VI.25 / §VI.32](invariants.md) and [docs/runs.md](runs.md). ## Bulk loader (`loader/mod.rs`) @@ -156,19 +160,18 @@ One exception: if the caller already targets a `__run__` branch (mutation.rs ## Load modes (`LoadMode`) -| Mode | Semantics | -|---|---| -| `Overwrite` | Replace all data in the target tables on the branch | -| `Append` | Strict insert; duplicates error | -| `Merge` | Upsert by id (`merge_insert`) | +| Mode | Semantics | Path (post-MR-794) | +|---|---|---| +| `Overwrite` | Replace all data in the target tables on the branch | Inline-commit per type, then publisher CAS at end-of-load. Truncate-then-append doesn't fit the staged shape; documented residual. | +| `Append` | Strict insert; duplicates error | In-memory `MutationStaging` accumulator; one `stage_append` + `commit_staged` per touched table at end-of-load; publisher CAS. | +| `Merge` | Upsert by `id` (`merge_insert`) | Same accumulator; one `stage_merge_insert` per touched table at end-of-load (Merge mode dedupes by `id`, last-write-wins); publisher CAS. | + +For Append/Merge, a mid-load failure (RI / cardinality violation, validation error) leaves Lance HEAD untouched on the staged tables — the next load on the same tables proceeds normally with no `ExpectedVersionMismatch`. For Overwrite, a mid-load failure can still leave Lance HEAD on a partially-truncated table; the next overwrite replaces it. ## `load` vs `ingest` -- `load(branch, data, mode)` — direct load to a branch. -- `ingest(branch, from, data, mode)` — branch-creating, transactional load: - 1. If target advanced since the run started, fork a fresh run branch from `from`. - 2. Load into the run branch (Append). - 3. If target hasn't moved, fast-publish; otherwise abort. +- `load(branch, data, mode)` — direct load to a branch (single publisher commit per call). +- `ingest(branch, from, data, mode)` — branch-creating wrapper: if `branch` doesn't exist, fork it from `from` (default `main`) via `branch_create_from`, then call `load(branch, data, mode)`. - Returns `IngestResult { branch, base_branch, branch_created, mode, tables[] }`. - `ingest_as(actor_id)` records the actor on the resulting commit. diff --git a/docs/invariants.md b/docs/invariants.md index 96a99b4..3e3af86 100644 --- a/docs/invariants.md +++ b/docs/invariants.md @@ -110,7 +110,7 @@ Specific defaults (timeout values, memory caps, TTL windows) are *configuration* *Status: aspirational — referential integrity at scale requires SIP-backed cross-table validation; not yet implemented. Cross-batch / cross-version uniqueness tracked in MR-714.* 25. **Isolation: per-query snapshot; read-your-writes within and across queries in a session.** Each query reads from one consistent manifest version. Within a multi-statement mutation, the read subplan inside each write operator sees the writes from earlier statements. Across queries in a session, reads always resolve the latest manifest version — no reader pinning to older snapshots. - *Status: open — read-your-writes within a multi-statement mutation requires Kuzu-style local-uncommitted scan path; deferred per MR-737 §10.10.* + *Status: upheld for inserts/updates after MR-794 step 2+ — `MutationStaging`'s in-memory accumulator + `TableStore::scan_with_pending` (DataFusion `MemTable` union with the committed Lance scan) implements read-your-writes within a multi-statement mutation. Delete-touching mutations are limited to delete-only by parse-time D₂; closing the within-query RYW gap for deletes requires Lance's two-phase delete API (tracked: MR-793 / Lance-upstream).* 26. **Durability before acknowledgement.** Commit returns only after the substrate has confirmed durable persistence. No "fast" or "fire-and-forget" durability levels. diff --git a/docs/query-language.md b/docs/query-language.md index f90fe60..5c98959 100644 --- a/docs/query-language.md +++ b/docs/query-language.md @@ -64,6 +64,14 @@ Used inside MATCH or as expressions inside RETURN/ORDER: `` is a literal, `$param`, or `now()`. Multi-statement mutations execute atomically (added in v0.2.0). +### D₂ — mixed insert/update + delete is rejected at parse time + +A single mutation query must be **either insert/update-only or delete-only**. Mixed → rejected before any I/O with the message: + +> `mutation '' on the same query mixes inserts/updates and deletes; split into separate mutations: (1) inserts and updates, then (2) deletes. This restriction lifts when Lance exposes a two-phase delete API (tracked: MR-793 / Lance-upstream).` + +Reason: under the staged-write rewire (MR-794), inserts and updates accumulate in memory and commit at end-of-query, while deletes still inline-commit (Lance 4.0.0 has no public two-phase delete). Mixing creates ordering hazards (same-row insert→delete becomes a no-op because the staged insert isn't visible to delete; cascading deletes of just-inserted edges break referential integrity by silent design). Until Lance exposes `DeleteJob::execute_uncommitted`, the parse-time rejection keeps both paths atomic and correct. See [docs/runs.md](runs.md) and [docs/invariants.md §VI.25](invariants.md). + ## IR (Intermediate Representation) `QueryIR { name, params, pipeline: Vec, return_exprs, order_by, limit }` diff --git a/docs/releases/v0.4.1.md b/docs/releases/v0.4.1.md new file mode 100644 index 0000000..4c5d44a --- /dev/null +++ b/docs/releases/v0.4.1.md @@ -0,0 +1,138 @@ +# Omnigraph v0.4.1 + +Omnigraph v0.4.1 closes the multi-statement-mutation atomicity gap that +v0.4.0 documented as a known limitation. Inserts and updates now route +through an in-memory `MutationStaging` accumulator and commit via Lance's +two-phase distributed-write API at end-of-query. A failed mid-query op +no longer leaves Lance HEAD drifted on the touched table — the next +mutation proceeds normally. + +## Highlights + +- **Staged-write rewire (MR-794)**: `mutate_as` and `load` (Append / + Merge modes) accumulate insert/update batches into + `MutationStaging.pending` per touched table. No Lance HEAD advance + happens during op execution; one `stage_*` + `commit_staged` per + table runs at end-of-query, then `ManifestBatchPublisher::publish` + commits the manifest atomically. A mid-query failure leaves Lance + HEAD untouched on staged tables. +- **D₂ parse-time rule**: a single mutation query is either + insert/update-only or delete-only. Mixed → rejected with a clear + error directing the caller to split into two queries. Lance 4.0.0 + has no public two-phase delete; deletes still inline-commit, and D₂ + keeps that path safe. +- **Read-your-writes via DataFusion `MemTable`**: read sites in + multi-statement mutations consume `TableStore::scan_with_pending`, + which Lance-scans the committed snapshot at the captured + `expected_version` and unions with a DataFusion `MemTable` over the + pending batches. Replaces the previous "reopen at staged Lance + version" pattern. +- **Coordinator swap-restore eliminated** from `mutate_with_current_actor`. + Branch is threaded explicitly through the per-op execution path + (`execute_named_mutation`, `execute_insert`, `execute_update`, + `execute_delete*`, `validate_edge_insert_endpoints`, + `ensure_node_id_exists`). The `swap_coordinator_for_branch` / + `restore_coordinator` API and `CoordinatorRestoreGuard` are removed + from `mutation.rs`. (`merge.rs` keeps its own swap pattern; that's + a separate workflow tracked in MR-793.) +- **`docs/invariants.md` §VI.25** flips from `aspirational/open` to + `upheld for inserts/updates`. The within-query read-your-writes + guarantee is now load-bearing for the publisher CAS contract. + +## Behavior changes + +- A failed multi-statement mutation no longer surfaces + `ExpectedVersionMismatch` on the *next* mutation against the same + table. The next call proceeds normally — Lance HEAD on staged + tables is unchanged. +- Mixed insert/update + delete in one query is rejected at parse + time. Existing test queries that mixed both must be split. +- `MutationStaging`'s shape changed: `pending: HashMap` + + `inline_committed: HashMap` replaces the + previous `latest: HashMap`. This is an internal + type; no public API impact. + +## Residual / out of scope + +- **`LoadMode::Overwrite`** keeps the legacy inline-commit path + (truncate-then-append doesn't fit the staged shape). A mid-overwrite + failure can still drift Lance HEAD on a partially-truncated table; + the next overwrite replaces it. Operator-driven, rare. +- **Delete-only multi-statement mutations** still inline-commit per op. + D₂ keeps inserts/updates from coexisting with deletes, so the + inline path remains atomic per op but not per query for delete-only + cascades. Closing this requires Lance to expose + `DeleteJob::execute_uncommitted`; tracked in MR-793 / Lance-upstream. +- **`schema_apply`, `branch_merge_internal`, `ensure_indices`** still + use Lance's inline-commit APIs. The two-phase pattern is in + `mutate_as` and `load` only; hoisting it to a storage-trait + invariant covering all writers is MR-793. + +## Tests added + +- `tests/runs.rs::partial_failure_leaves_target_queryable_and_unblocks_next_mutation` + (replaces the old `partial_failure_observably_rolls_back_but_blocks_next_mutation_on_same_table`) +- `tests/runs.rs::mutation_rejects_mixed_insert_and_delete_at_parse_time` +- `tests/runs.rs::mixed_insert_and_update_on_same_person_coalesces_to_one_merge` +- `tests/runs.rs::multiple_appends_to_same_edge_coalesce_to_one_append` +- `tests/runs.rs::multi_statement_inserts_publish_exactly_once` +- `tests/runs.rs::load_with_bad_edge_reference_unblocks_next_load` +- `tests/runs.rs::load_with_cardinality_violation_unblocks_next_load` + +## Files changed + +- `crates/omnigraph/src/exec/staging.rs` (NEW) — `MutationStaging`, + `PendingTable`, `PendingMode`, `StagedTablePath`, + `dedupe_merge_batches_by_id`. +- `crates/omnigraph/src/exec/mutation.rs` — D₂ check; per-op + rewires (`execute_insert`, `execute_update`, `execute_delete*`); + branch threading; coordinator-swap removal; helper + `validate_edge_cardinality_with_pending`; helper + `concat_match_batches_to_schema`; `apply_assignments` updated to + copy unassigned blob columns from full-schema scans. +- `crates/omnigraph/src/loader/mod.rs` — `load_jsonl_reader` split: + staged path for Append/Merge, legacy inline-commit path for + Overwrite. Helpers `collect_node_ids_with_pending` and + `validate_edge_cardinality_with_pending_loader`. +- `crates/omnigraph/src/table_store.rs` — `scan_with_pending`, + `count_rows_with_pending` (DataFusion `MemTable`-backed union with + Lance scan). +- `Cargo.toml` (workspace) + `crates/omnigraph/Cargo.toml` — added + `datafusion = "52"` direct dep (transitively pulled by Lance + already; required for `MemTable`). +- `docs/runs.md` — removed "Known limitation" section; documented + the new accumulator + D₂ + LoadMode::Overwrite residual. +- `docs/invariants.md` — §VI.25 status flipped to `upheld for + inserts/updates`. +- `docs/architecture.md` — added "Mutation atomicity — in-memory + accumulator (MR-794)" subsection; refreshed the engine + state + diagrams to drop `RunRegistry` and add `MutationStaging`. +- `docs/execution.md` — rewrote the mutation flow sequence diagram + for the staged-write path; updated the `LoadMode` table to call + out per-mode commit semantics; rewrote `load` vs `ingest`. +- `docs/query-language.md` — documented the D₂ parse-time rule. +- `docs/errors.md` — added the D₂ `BadRequest` rejection path. +- `docs/storage.md` — dropped the live `_graph_runs.lance` reference + (legacy from MR-771) from the layout diagram and prose. +- `docs/branches-commits.md` — moved `__run__` to a legacy note; + removed `publish_run` from the publish-trigger list. +- `docs/audit.md` — current `_as` API list refreshed; legacy + `RunRecord.actor_id` moved to a historical note. +- `docs/constants.md` — marked the run registry / branch-prefix rows + as legacy. +- `docs/cli.md` — replaced the legacy `omnigraph run *` quickstart + block with `omnigraph commit list/show`. +- `docs/testing.md` — extended the `runs.rs` row to cover the new + MR-794 contract tests; added the `staged_writes.rs` row. +- `AGENTS.md` (CLAUDE.md symlink) — updated the atomic-per-query + description and the L2 capability matrix row. + +## Included Changes + +- MR-794 step 2+ — rewire `mutate_as` and `load` via in-memory + `MutationStaging` + `stage_*` / `commit_staged` per touched table at + end-of-query. +- (MR-794 step 1 shipped in v0.4.0's PR #67 — `StagedWrite`, + `stage_append`, `stage_merge_insert`, `commit_staged`, + `scan_with_staged`, `count_rows_with_staged` — and is the substrate + this release builds on.) diff --git a/docs/runs.md b/docs/runs.md index 789d91d..63295fd 100644 --- a/docs/runs.md +++ b/docs/runs.md @@ -20,22 +20,60 @@ publisher's row-level CAS on `__manifest` is the single fence. A `.gq` query with multiple ops (e.g. `insert Person … insert Knows …`) must observe earlier ops' writes when validating later ops (referential -integrity, edge cardinality). After demotion this is implemented via an -in-process `MutationStaging` accumulator in -`crates/omnigraph/src/exec/mutation.rs`: +integrity, edge cardinality). After MR-794 step 2+ this is implemented +via an in-memory `MutationStaging` accumulator in +[`crates/omnigraph/src/exec/staging.rs`](../crates/omnigraph/src/exec/staging.rs), +shared by both `mutate_as` and the bulk loader: - On the first touch of each table, the pre-write manifest version is - captured into `expected_versions[table_key]`. -- Subsequent ops on the same table re-open the dataset at the locally - staged Lance version (bypassing the manifest, which has not been - committed yet) so they see prior writes. -- One `commit_with_expected(updates, expected_versions)` at the end - publishes the lot atomically. Cross-table conflicts surface as + captured into `expected_versions[table_key]` (the publisher's CAS + fence at end-of-query). +- Each insert/update op pushes a `RecordBatch` into the per-table + pending accumulator. Lance HEAD does **not** advance during op + execution. +- Read sites (validation, predicate matching for `update`) consume + `TableStore::scan_with_pending`, which scans committed via Lance + and applies the same SQL filter to the pending batches via DataFusion + `MemTable`. Same-query writes are visible to subsequent reads. +- At end-of-query, `MutationStaging::finalize` issues exactly one + `stage_*` + `commit_staged` per touched table (concatenating + accumulated batches; merge-mode dedupes by `id`, last-write-wins), + and the publisher publishes the manifest atomically across all + touched sub-tables. Cross-table conflicts surface as `ManifestConflictDetails::ExpectedVersionMismatch`. +- **Deletes still inline-commit.** Lance's `Dataset::delete` is not + exposed as a two-phase op in 4.0.0; deletes go through `delete_where` + immediately and record their post-write state in + `MutationStaging.inline_committed`. The parse-time D₂ rule (below) + prevents inserts/updates from coexisting with deletes in one query, + so the inline path is safe for delete-only mutations. This upholds [docs/invariants.md §VI.23](invariants.md) (atomicity per -query) and §VI.25 (read-your-writes within a multi-statement mutation — -previously aspirational, now upheld). +query) and §VI.25 (read-your-writes within a multi-statement mutation, +upheld). + +### D₂ — parse-time mixed-mode rejection + +A single mutation query is either insert/update-only or delete-only. +Mixed → rejected at parse time with a clear error directing the user to +split the query. Reason: mixing creates ordering hazards +(insert→delete on the same row would silently no-op because the staged +insert isn't visible to delete; cascading deletes of just-inserted +edges break referential integrity). Until Lance exposes a two-phase +delete API, the parse-time rejection keeps both paths atomic and +correct. Tracked: MR-793, plus a Lance-upstream ticket. + +### `LoadMode::Overwrite` residual + +The bulk loader's Append and Merge modes use the staged-write path +described above. `LoadMode::Overwrite` keeps the legacy inline-commit +path: truncate-then-append doesn't fit the staged shape cleanly in +Lance 4.0.0, and overwrite has no in-flight read-your-writes +requirement (the prior data is being wiped). A mid-overwrite failure +can leave Lance HEAD on a partially-truncated table; the next overwrite +will replace it. Operator-driven (rare in agent workloads); document +permanently until Lance exposes `Operation::Overwrite { fragments }` as +a two-phase op. ## Conflict shape @@ -59,40 +97,32 @@ list`. `_graph_runs.lance` belongs in MR-770 (the production sweep) — this PR stops *creating* run state but does not destroy legacy bytes on disk. -## Known limitation: mid-query partial failure on the same table +## Mid-query partial failure: closed by MR-794 -A multi-statement `.gq` mutation where op-N writes a Lance fragment -successfully and op-N+1 then fails leaves the touched table at -`Lance HEAD = manifest_version + 1`. The query is atomic at the manifest -level (the publisher never publishes, so reads at the pinned manifest -version do *not* see op-N's data), but the *next* mutation against the -same table fails loudly with -`ManifestConflictDetails::ExpectedVersionMismatch` because -`ensure_expected_version` enforces strict equality between Lance HEAD and -the manifest's pinned version. +The pre-MR-794 design had a known limitation: a multi-statement `.gq` +mutation where op-N inline-committed a Lance fragment and op-N+1 then +failed left the touched table at `Lance HEAD = manifest_version + 1`, +blocking the next mutation with `ExpectedVersionMismatch`. -**Why the engine doesn't auto-rollback**: Lance's `Dataset::restore()` is -*not* a rewind — it appends a new commit (containing the desired -historical version's data) and advances HEAD further. There is no Lance -API to delete a committed version. A proper fix requires writing each -mutation's per-table fragments to a *transient Lance branch* on the -sub-table, then fast-forwarding main on success or dropping the branch -on failure. That work is tracked as a follow-up to MR-771; in the -meantime: +MR-794 (step 1 + step 2+) closed this for inserts/updates **by +construction at the writer layer**: insert and update batches accumulate +in memory; no Lance HEAD advance happens during op execution; one +`stage_*` + `commit_staged` per touched table runs at end-of-query, and +only after every op succeeded. A failed op leaves Lance HEAD untouched +on the staged tables, so the next mutation proceeds normally with no +drift to reconcile. -- **In practice this is rare.** Most schema-language validation - (`@key`, `@enum`, `@range`, intra-batch uniqueness, edge-endpoint - existence) runs *before* any Lance write inside the failing op, so - single-statement mutations never trip this. The narrow path is - multi-statement queries (`insert ... insert ...`, - `insert ... update ...`) where a late op fails on validation that - depends on earlier ops' staged data. -- **Workaround**: callers that hit this should refresh the handle and - retry the mutation; if Lance HEAD remains drifted the - `omnigraph cleanup` command will GC the orphan version once a later - successful commit on the same table moves HEAD past it. (`cleanup` - cannot reclaim an orphan that *is* the current Lance HEAD; that case - needs the per-table-branch follow-up to fully heal.) +The cancellation case (future drop mid-mutation) inherits the same +guarantee — the in-memory accumulator evaporates with the dropped task +and no Lance write was ever issued. -The cancellation case (future drop mid-mutation) has the same shape and -the same workaround. +For delete-touching mutations the legacy inline-commit shape is +preserved (Lance has no public two-phase delete in 4.0.0) — the same +narrow window remains. The parse-time D₂ rule prevents inserts/updates +from coexisting with deletes in one query, so a pure-delete failure +cannot drift any staged-table state. If a delete-only multi-table +mutation fails mid-cascade, the same workaround as before applies +(retry; rely on `omnigraph cleanup` once a later successful commit +moves HEAD past the orphan version). Closing this requires Lance to +expose `DeleteJob::execute_uncommitted`; tracked in MR-793 and a +Lance-upstream ticket. diff --git a/docs/storage.md b/docs/storage.md index 153b080..90806d0 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -22,7 +22,7 @@ OmniGraph is **not** a single Lance dataset; it is a *graph* of datasets coordin - `edges/{fnv1a64-hex(edge_type_name)}` — one Lance dataset per edge type - `__manifest/` — the catalog of all sub-tables and their published versions - `_graph_commits.lance` / `_graph_commit_actors.lance` — the commit graph and its actor map - - `_graph_runs.lance` / `_graph_run_actors.lance` — the run registry and its actor map + - (legacy `_graph_runs.lance` / `_graph_run_actors.lance` from pre-v0.4.0 repos are inert; the run state machine was removed in MR-771 and these files are cleaned up via MR-770's production sweep) - **Manifest row schema** (`object_id, object_type, location, metadata, base_objects, table_key, table_version, table_branch, row_count`): - `object_type` ∈ `table | table_version | table_tombstone` - `table_key` ∈ `node: | edge:` @@ -63,14 +63,12 @@ flowchart TB nodes["nodes/{fnv1a64-hex}/
one dataset per node type"]:::l2 edges["edges/{fnv1a64-hex}/
one dataset per edge type"]:::l2 cgraph["_graph_commits.lance/
_graph_commit_actors.lance/"]:::l2 - runs["_graph_runs.lance/
_graph_run_actors.lance/"]:::l2 refs["_refs/branches/{name}.json
graph-level branches"]:::l2 repo --> manifest repo --> nodes repo --> edges repo --> cgraph - repo --> runs repo --> refs subgraph dataset[Inside each Lance dataset — L1] @@ -91,7 +89,7 @@ flowchart TB - **Repo root** is one directory (or S3 prefix). Everything below is part of one OmniGraph repo. - **`__manifest/`** is a Lance dataset whose rows describe which sub-table version is published at which graph-branch. Reading a snapshot starts here. - **`nodes/`** and **`edges/`** are sibling directories holding one Lance dataset per declared type. Names are `fnv1a64-hex` of the type name to keep paths fixed-length and case-safe. -- **`_graph_commits.lance` / `_graph_runs.lance`** are L2 datasets that record the graph-level commit DAG and run registry respectively (each has a paired `*_actors.lance` for the actor map). +- **`_graph_commits.lance`** is an L2 dataset that records the graph-level commit DAG, with a paired `_graph_commit_actors.lance` for the actor map. (Pre-v0.4.0 repos also have inert `_graph_runs.lance` / `_graph_run_actors.lance` from the removed Run state machine; MR-770 sweeps these in production.) - **`_refs/branches/{name}.json`** is graph-level branch metadata — pointers from a branch name to the manifest version it heads. - **Inside each Lance dataset** (orange): the standard Lance directory layout. `_versions/{n}.manifest` records every commit; `data/` holds the actual Arrow fragments; `_indices/{uuid}/` holds index segments with their own `fragment_bitmap` for partial coverage; `_refs/` holds Lance-native per-dataset branches and tags. diff --git a/docs/testing.md b/docs/testing.md index f4330db..4ba3cbc 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -19,7 +19,8 @@ The engine's `tests/` is the principal coverage surface; most graph-shaped behav |---|---| | `end_to_end.rs` | Full init → load → query/mutate flow | | `branching.rs` | Branch create / list / delete, lazy fork | -| `runs.rs` | Transactional runs (begin/publish/abort), idempotency | +| `runs.rs` | Direct-publish writes: cancellation, concurrent-writer CAS, multi-statement atomicity, MR-794 staged-write rewire (D₂ rejection, insert+update coalesce, multi-append coalesce, partial-failure recovery, load RI/cardinality recovery) | +| `staged_writes.rs` | TableStore staged-write primitives (`stage_append`, `stage_merge_insert`, `commit_staged`, `scan_with_staged`, `count_rows_with_staged`) — primitive-level only; engine code uses the in-memory `MutationStaging` accumulator instead | | `lifecycle.rs` | Repo lifecycle, schema state | | `point_in_time.rs` | Snapshots, time travel (`snapshot_at_version`, `entity_at`) | | `changes.rs` | `diff_between` / `diff_commits` | From 3223b51cf130ad5a97df3af7c71e9cf673caeef4 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Fri, 1 May 2026 13:47:55 +0200 Subject: [PATCH 06/10] =?UTF-8?q?MR-794=20step=202:=20address=20PR=20#68?= =?UTF-8?q?=20review=20=E2=80=94=20merge=20semantics,=20cardinality,=20res?= =?UTF-8?q?idual?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five fixes from PR #68 review (Cursor Bugbot + Codex + Cubic): * **scan_with_pending gains merge-shadow semantics** (Codex P1, Cubic P1#1): new `key_column: Option<&str>` parameter. When set, committed rows whose key value appears in any pending batch are excluded from the scan — making `scan_with_pending` correctly merge-semantic for chained updates instead of naively unioning. execute_update calls with Some("id"). Without this, a chained `update where age > 30` could match a row whose pending value already moved out of range. * **Multi-delete on same table no longer trips ExpectedVersionMismatch** (Cursor Bugbot HIGH): open_table_for_mutation routes through reopen_for_mutation when staging.inline_committed has the table, using the post-inline-commit Lance version captured at record_inline time. The legacy open_for_mutation_on_branch fence (Lance HEAD == manifest pinned) is correct cross-writer but wrong intra-query when deletes have already advanced HEAD on this table. Branch goes away when Lance ships two-phase delete (lance-format/lance#6658). * **Cardinality validation consolidated** (Cursor LOW + Codex P2 + Cubic P1#2 + Cubic P2): new exec/staging::count_src_per_edge + enforce_cardinality_bounds shared by mutation and loader paths. Restores the missing min-cardinality check on the engine path. Loader Merge mode passes Some("id") to dedupe edges being updated by id (not double-count committed + pending). Loader Append mode and engine path pass None (ULID-generated ids never collide). * **Dead count_rows_with_pending removed** (Cursor LOW): never called. * **Misleading concat-helper comment fixed** (Cubic P3): claimed schema normalization the helper doesn't implement. Updated to match reality. * **Documentation honesty** (Cubic P1#3): MR-794 narrows but doesn't eliminate the "Lance HEAD ahead of __manifest" drift class. Drift is unreachable for op-execution failures (the partial_failure test pins this), but a residual remains at the finalize→publisher boundary because Lance has no multi-dataset commit primitive: per-table commit_staged calls run sequentially before manifest commit. Updated docs/runs.md, docs/invariants.md §VI.25, docs/releases/v0.4.1.md to scope the claim precisely. * **Failpoint test pinning the residual**: new mutation.post_finalize_pre_publisher failpoint + two tests in tests/failpoints.rs that confirm the documented residual behavior. Catches future regressions that widen the residual. Test additions on tests/runs.rs: * chained_updates_with_overlapping_predicate_respects_intermediate_value * multi_statement_delete_on_same_node_table * cascade_delete_node_then_explicit_delete_edge_on_same_table * mutation_insert_edge_enforces_min_cardinality * load_merge_mode_dedupes_edge_for_cardinality_count 113/113 engine integration tests pass (runs + end_to_end + consistency + staged_writes + validators). Failpoints feature build runs in CI. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/omnigraph/src/exec/mutation.rs | 158 ++++++++------- crates/omnigraph/src/exec/staging.rs | 147 ++++++++++++++ crates/omnigraph/src/loader/mod.rs | 101 +++------ crates/omnigraph/src/table_store.rs | 168 ++++++++++----- crates/omnigraph/tests/failpoints.rs | 124 +++++++++++ crates/omnigraph/tests/runs.rs | 282 ++++++++++++++++++++++++++ docs/invariants.md | 2 +- docs/releases/v0.4.1.md | 9 +- docs/runs.md | 36 ++++ 9 files changed, 828 insertions(+), 199 deletions(-) diff --git a/crates/omnigraph/src/exec/mutation.rs b/crates/omnigraph/src/exec/mutation.rs index 7397074..a588c68 100644 --- a/crates/omnigraph/src/exec/mutation.rs +++ b/crates/omnigraph/src/exec/mutation.rs @@ -565,19 +565,59 @@ use super::staging::{MutationStaging, PendingMode}; /// first touch. The captured version is the publisher's CAS fence at /// end-of-query (per-table OCC). /// -/// The dataset is always opened at HEAD on the requested branch. Under -/// the staged-write rewire (MR-794 step 2+), no per-op commit advances -/// HEAD, so subsequent opens within the same query observe the same -/// version. The exception is the inline-commit delete path -/// (`execute_delete_*`), which still advances HEAD per-op — but D₂ at -/// parse time prevents inserts/updates and deletes from coexisting in -/// one query, so this can't conflict with a pending-batch read. +/// On first touch, opens the dataset at HEAD on the requested branch +/// via `open_for_mutation_on_branch`, which compares Lance HEAD against +/// the manifest's pinned version — that fence is the engine's +/// publisher-style OCC catching cross-writer drift before we make any +/// changes. +/// +/// On subsequent touches *within the same query*, behavior depends on +/// whether the table has already been inline-committed by a delete op: +/// +/// - **Insert / update path (no inline commit between touches).** Lance +/// HEAD has not moved since first touch, so a fresh +/// `open_for_mutation_on_branch` would still match the manifest +/// pinned version. We just go through it again; `ensure_path` is a +/// no-op (idempotent on the captured `expected_version`). +/// - **Delete cascade or multi-delete on the same table.** A prior +/// `delete_where` on this table has already advanced Lance HEAD past +/// the manifest's pinned version (the manifest doesn't move until +/// end-of-query). Going through `open_for_mutation_on_branch` again +/// would trip its `ensure_expected_version` equality check +/// (`actual = pinned + 1` vs `expected = pinned`). Instead we route +/// through `reopen_for_mutation` at the post-inline-commit Lance +/// version captured in `staging.inline_committed[table_key]`, which +/// is the source of truth for "where is Lance HEAD right now on +/// this table within this query." +/// +/// The `inline_committed` reopen branch closes the cursor-bot HIGH +/// "multi-delete fails on same table" finding. The branch goes away +/// once Lance exposes a two-phase delete API +/// ([lance-format/lance#6658](https://github.com/lance-format/lance/issues/6658)) +/// and we can stage deletes on the same path as inserts/updates. async fn open_table_for_mutation( db: &Omnigraph, staging: &mut MutationStaging, branch: Option<&str>, table_key: &str, ) -> Result<(Dataset, String, Option)> { + if let Some(prior) = staging.inline_committed.get(table_key) { + let path = staging.paths.get(table_key).ok_or_else(|| { + OmniError::manifest_internal(format!( + "open_table_for_mutation: inline_committed[{}] without paths entry", + table_key + )) + })?; + let ds = db + .reopen_for_mutation( + table_key, + &path.full_path, + path.table_branch.as_deref(), + prior.table_version, + ) + .await?; + return Ok((ds, path.full_path.clone(), path.table_branch.clone())); + } let (ds, full_path, table_branch) = db.open_for_mutation_on_branch(branch, table_key).await?; let expected_version = ds.version().version; @@ -701,6 +741,16 @@ impl Omnigraph { let (updates, expected_versions) = staging .finalize(self, requested.as_deref()) .await?; + // Failpoint that wedges the documented finalize→publisher + // residual: per-table `commit_staged` calls already + // advanced Lance HEAD on every touched table; a failure + // injected here mirrors the production-rare case where + // the publisher's CAS pre-check rejects (or the manifest + // write throws) after staged commits succeeded. Used by + // `tests/failpoints.rs::finalize_publisher_residual_*` + // to pin the documented residual behavior. See + // `docs/runs.md` "Finalize → publisher residual". + crate::failpoints::maybe_fail("mutation.post_finalize_pre_publisher")?; self.commit_updates_on_branch_with_expected( requested.as_deref(), &updates, @@ -947,6 +997,12 @@ impl Omnigraph { (!blob_props.is_empty()).then_some(non_blob_cols.as_slice()); let pending_batches = staging.pending_batches(&table_key); let pending_schema = staging.pending_schema(&table_key); + // Use merge semantics on the union: a committed row whose `id` + // also appears in pending has been logically updated by an + // earlier op in this query and is shadowed from the scan, + // otherwise the predicate runs against stale committed values + // and a chained `update where ` can match a row whose + // pending value no longer satisfies . let batches = self .table_store() .scan_with_pending( @@ -955,6 +1011,7 @@ impl Omnigraph { pending_schema, projection, Some(&pred_sql), + Some("id"), ) .await?; @@ -965,11 +1022,14 @@ impl Omnigraph { }); } - // Concat the matched batches into one. The pending side may have - // a slightly different schema (e.g. Lance's `_rowid` column on - // the committed side, missing on the pending side). Normalize by - // dropping any `_rowid` / `_rowaddr` columns and reordering to - // the table's canonical schema. + // Concat the matched batches (committed + pending) into one. The + // helper trusts that both sides share a schema — Lance returns + // dataset-schema-ordered columns and DataFusion returns + // MemTable-schema-ordered columns; both should match the catalog's + // arrow_schema when the projection is consistent. If they + // diverge (typically a blob-table mid-schema-shift), the helper + // surfaces a clear error directing the caller to split the + // mutation. let matched = concat_match_batches_to_schema(&schema, &blob_props, batches)?; let affected_count = matched.num_rows(); @@ -1218,10 +1278,10 @@ fn concat_match_batches_to_schema( }) } -/// Validate that adding `pending` edges plus the committed edges does not -/// exceed the per-source cardinality bound on `edge_type`. Reads the `src` -/// column from both committed (Lance) and pending (in-memory) and counts -/// per src. +/// Validate `@card` bounds against committed (Lance) + pending (in-memory) +/// edges for one edge table. Engine path: each insert produces a fresh +/// ULID id, so committed and pending cannot share a primary key — no +/// dedup needed (`dedupe_key_column = None`). async fn validate_edge_cardinality_with_pending( db: &Omnigraph, committed_ds: &Dataset, @@ -1229,66 +1289,18 @@ async fn validate_edge_cardinality_with_pending( table_key: &str, edge_type: &omnigraph_compiler::catalog::EdgeType, ) -> Result<()> { - use std::collections::HashMap as StdHashMap; - if edge_type.cardinality.is_default() { return Ok(()); } - - let mut counts: StdHashMap = StdHashMap::new(); - - // Committed side: scan `src` column (cheap, no filter). - let committed = db - .table_store() - .scan(committed_ds, Some(&["src"]), None, None) - .await?; - for batch in &committed { - let srcs = batch - .column_by_name("src") - .ok_or_else(|| OmniError::Lance("missing 'src' column on edge table".into()))? - .as_any() - .downcast_ref::() - .ok_or_else(|| OmniError::Lance("'src' column is not Utf8".into()))?; - for i in 0..srcs.len() { - if srcs.is_valid(i) { - *counts.entry(srcs.value(i).to_string()).or_insert(0) += 1; - } - } - } - - // Pending side: walk in-memory pending batches for `src`. - for batch in staging.pending_batches(table_key) { - let Some(col) = batch.column_by_name("src") else { - continue; - }; - let Some(srcs) = col.as_any().downcast_ref::() else { - continue; - }; - for i in 0..srcs.len() { - if srcs.is_valid(i) { - *counts.entry(srcs.value(i).to_string()).or_insert(0) += 1; - } - } - } - - let card = &edge_type.cardinality; - for (src, count) in &counts { - if let Some(max) = card.max { - if *count > max { - return Err(OmniError::manifest(format!( - "@card violation on edge {}: source '{}' has {} edges (max {})", - edge_type.name, src, count, max - ))); - } - } - // Note: per-source minimum cardinality cannot be checked - // mid-query (a bound of `2..` requires both edges to be inserted - // before validation). The publisher path could re-validate at - // commit time; for now, defer to the loader's end-of-query check. - let _ = card.min; - } - - Ok(()) + let counts = super::staging::count_src_per_edge( + db, + committed_ds, + table_key, + staging, + None, + ) + .await?; + super::staging::enforce_cardinality_bounds(edge_type, &counts) } fn enrich_mutation_params(params: &ParamMap) -> Result { diff --git a/crates/omnigraph/src/exec/staging.rs b/crates/omnigraph/src/exec/staging.rs index 2b2c193..033cb99 100644 --- a/crates/omnigraph/src/exec/staging.rs +++ b/crates/omnigraph/src/exec/staging.rs @@ -23,6 +23,8 @@ use std::sync::Arc; use arrow_array::{Array, RecordBatch, StringArray, UInt32Array}; use arrow_schema::SchemaRef; +use lance::Dataset; +use omnigraph_compiler::catalog::EdgeType; use crate::db::SubTableUpdate; use crate::error::{OmniError, Result}; @@ -389,3 +391,148 @@ fn dedupe_merge_batches_by_id( arrow_select::concat::concat_batches(schema, &sliced) .map_err(|e| OmniError::Lance(e.to_string())) } + +// ─── Cardinality helpers (shared by mutation + loader paths) ──────────────── + +/// Count edges per `src` value across committed (Lance scan) + pending +/// (in-memory). Caller supplies an opened committed dataset so the +/// mutation path (which already has one) and the loader path (which +/// opens via snapshot) share the same body. +/// +/// `dedupe_key_column` controls whether committed rows are shadowed by +/// pending: +/// - `None` — every committed row counts, every pending row counts. +/// Correct when committed and pending cannot share a primary key +/// (engine inserts always use fresh ULID edge ids; loader Append +/// mode uses fresh ids too). +/// - `Some(col)` — committed rows whose `col` value also appears in any +/// pending batch are EXCLUDED from the committed count, so a Merge-mode +/// load that *updates* an existing edge (potentially changing its +/// `src`) counts the post-update row exactly once. Without this, +/// `LoadMode::Merge` double-counts. +pub(crate) async fn count_src_per_edge( + db: &crate::db::Omnigraph, + committed_ds: &Dataset, + table_key: &str, + staging: &MutationStaging, + dedupe_key_column: Option<&str>, +) -> Result> { + let mut counts: HashMap = HashMap::new(); + + let pending_batches = staging.pending_batches(table_key); + + // Collect pending key values (for shadow-on-merge dedupe). Only when + // dedupe is requested AND there's anything pending. + let pending_keys: Option> = match dedupe_key_column { + Some(col) if !pending_batches.is_empty() => { + let mut set = HashSet::new(); + for batch in pending_batches { + if let Some(arr) = batch + .column_by_name(col) + .and_then(|c| c.as_any().downcast_ref::()) + { + for i in 0..arr.len() { + if arr.is_valid(i) { + set.insert(arr.value(i).to_string()); + } + } + } + } + Some(set) + } + _ => None, + }; + + // Committed side: scan `src` plus the dedupe key column when set, so + // we can both count and shadow in one pass. + let projection: Vec<&str> = match dedupe_key_column { + Some(col) if pending_keys.as_ref().is_some_and(|s| !s.is_empty()) => vec!["src", col], + _ => vec!["src"], + }; + let committed = db + .table_store() + .scan(committed_ds, Some(&projection), None, None) + .await?; + for batch in &committed { + let srcs = batch + .column_by_name("src") + .ok_or_else(|| OmniError::Lance("missing 'src' column on edge table".into()))? + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("'src' column is not Utf8".into()))?; + // Optional shadow-key column (only present when dedupe is on). + let key_arr = match (&pending_keys, dedupe_key_column) { + (Some(set), Some(col)) if !set.is_empty() => batch + .column_by_name(col) + .and_then(|c| c.as_any().downcast_ref::()), + _ => None, + }; + for i in 0..srcs.len() { + if !srcs.is_valid(i) { + continue; + } + // Shadow this committed row if its key is in pending. + if let (Some(arr), Some(set)) = (key_arr, pending_keys.as_ref()) { + if arr.is_valid(i) && set.contains(arr.value(i)) { + continue; + } + } + *counts.entry(srcs.value(i).to_string()).or_insert(0) += 1; + } + } + + // Pending side: walk in-memory batches for `src`. (No dedupe needed — + // `dedupe_merge_batches_by_id` runs at finalize-time so any same-id + // duplicates within pending are already collapsed by the time the + // publisher commits, but cardinality runs before finalize. The + // engine's per-op edge insert produces one row per op with a fresh + // ULID, so within-pending duplicates are not a concern here.) + for batch in pending_batches { + let Some(col) = batch.column_by_name("src") else { + continue; + }; + let Some(srcs) = col.as_any().downcast_ref::() else { + continue; + }; + for i in 0..srcs.len() { + if srcs.is_valid(i) { + *counts.entry(srcs.value(i).to_string()).or_insert(0) += 1; + } + } + } + + Ok(counts) +} + +/// Apply `@card(min..max)` bounds to a per-source count map. +/// +/// Both bounds are checked. The `min` check produces a misleading error +/// during a per-op insert mid-query (a bound of `2..` requires both +/// edges to be inserted before validation passes), but the historical +/// behavior was to enforce min per-op anyway — keeping users from +/// accidentally publishing a graph that violates the schema. Consumers +/// that need end-of-query semantics call this from after all edge ops +/// are accumulated (the loader does, via Phase 3). +pub(crate) fn enforce_cardinality_bounds( + edge_type: &EdgeType, + counts: &HashMap, +) -> Result<()> { + let card = &edge_type.cardinality; + for (src, count) in counts { + if let Some(max) = card.max { + if *count > max { + return Err(OmniError::manifest(format!( + "@card violation on edge {}: source '{}' has {} edges (max {})", + edge_type.name, src, count, max + ))); + } + } + if *count < card.min { + return Err(OmniError::manifest(format!( + "@card violation on edge {}: source '{}' has {} edges (min {})", + edge_type.name, src, count, card.min + ))); + } + } + Ok(()) +} diff --git a/crates/omnigraph/src/loader/mod.rs b/crates/omnigraph/src/loader/mod.rs index d05a315..a349a64 100644 --- a/crates/omnigraph/src/loader/mod.rs +++ b/crates/omnigraph/src/loader/mod.rs @@ -521,6 +521,7 @@ async fn load_jsonl_reader( edge_type, &table_key, &staging, + mode, ) .await?; } else if let Some(update) = overwrite_updates.iter().find(|u| u.table_key == table_key) { @@ -1553,87 +1554,49 @@ pub(crate) async fn validate_edge_cardinality( /// Validate edge `@card` cardinality with in-memory pending edges visible. /// /// Loader-level analog to `exec::mutation::validate_edge_cardinality_with_pending`: -/// scans committed edges via Lance and unions counts with the pending -/// edge batches accumulated by the staged loader. Used by Append/Merge -/// loads (the Overwrite path uses `validate_edge_cardinality` which -/// opens the just-written Lance version). +/// opens the committed dataset at the pre-load snapshot version, then +/// delegates to the shared `count_src_per_edge` + `enforce_cardinality_bounds` +/// helpers in `exec::staging`. Used by Append/Merge loads (the Overwrite +/// path uses `validate_edge_cardinality` which opens the just-written +/// Lance version). +/// +/// `mode` controls dedup behavior. `LoadMode::Merge` passes `Some("id")` +/// so committed edges that the load is *updating* (same edge id, possibly +/// changed `src`) are not double-counted (Cubic P1 finding on PR #68). +/// `LoadMode::Append` passes `None` because each line generates a fresh +/// ULID id that never collides with committed. async fn validate_edge_cardinality_with_pending_loader( db: &Omnigraph, branch: Option<&str>, edge_type: &omnigraph_compiler::catalog::EdgeType, table_key: &str, staging: &MutationStaging, + mode: LoadMode, ) -> Result<()> { if edge_type.cardinality.is_default() { return Ok(()); } - - let mut counts: HashMap = HashMap::new(); - - // Committed side: open at pre-write version (the snapshot pinned at - // load entry; pending writes haven't committed yet). let snapshot = db.snapshot_for_branch(branch).await?; - if let Some(entry) = snapshot.entry(table_key) { - let ds = db - .open_dataset_at_state( - &entry.table_path, - entry.table_branch.as_deref(), - entry.table_version, - ) + let Some(entry) = snapshot.entry(table_key) else { + // No manifest entry — table doesn't exist yet. Pending-only is + // fine; the helper handles empty committed scans. + return Ok(()); + }; + let ds = db + .open_dataset_at_state( + &entry.table_path, + entry.table_branch.as_deref(), + entry.table_version, + ) + .await?; + let dedupe_key = match mode { + LoadMode::Merge => Some("id"), + LoadMode::Append | LoadMode::Overwrite => None, + }; + let counts = + crate::exec::staging::count_src_per_edge(db, &ds, table_key, staging, dedupe_key) .await?; - let batches = db - .table_store() - .scan(&ds, Some(&["src"]), None, None) - .await?; - for batch in &batches { - let srcs = batch - .column_by_name("src") - .ok_or_else(|| OmniError::Lance("missing 'src' column".into()))? - .as_any() - .downcast_ref::() - .ok_or_else(|| OmniError::Lance("'src' column is not Utf8".into()))?; - for i in 0..srcs.len() { - if srcs.is_valid(i) { - *counts.entry(srcs.value(i).to_string()).or_insert(0) += 1; - } - } - } - } - - // Pending side: walk in-memory pending batches for `src`. - for batch in staging.pending_batches(table_key) { - let Some(col) = batch.column_by_name("src") else { - continue; - }; - let Some(srcs) = col.as_any().downcast_ref::() else { - continue; - }; - for i in 0..srcs.len() { - if srcs.is_valid(i) { - *counts.entry(srcs.value(i).to_string()).or_insert(0) += 1; - } - } - } - - let card = &edge_type.cardinality; - for (src, count) in &counts { - if let Some(max) = card.max { - if *count > max { - return Err(OmniError::manifest(format!( - "@card violation on edge {}: source '{}' has {} edges (max {})", - edge_type.name, src, count, max - ))); - } - } - if *count < card.min { - return Err(OmniError::manifest(format!( - "@card violation on edge {}: source '{}' has {} edges (min {})", - edge_type.name, src, count, card.min - ))); - } - } - - Ok(()) + crate::exec::staging::enforce_cardinality_bounds(edge_type, &counts) } /// Collect all valid node IDs for a given type, with in-memory pending diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index 52deb2f..d87858d 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -841,6 +841,22 @@ impl TableStore { /// pending side runs it through a fresh DataFusion `SessionContext` /// with the batches registered as a `MemTable` named `pending`. /// + /// `key_column` controls how committed and pending are unioned: + /// - **`None` (union semantics)**: every committed row that matches + /// the filter and every pending row that matches the filter is + /// returned. Correct when committed and pending cannot share a + /// primary key — e.g., Append-mode loads with ULID-generated ids, + /// or any read where pending hasn't been used to update committed + /// rows. + /// - **`Some(col)` (merge / shadow semantics)**: committed rows whose + /// `col` value appears in any pending batch are EXCLUDED from the + /// result; only pending's view of those rows is returned. Required + /// for Merge-mode reads (e.g., `execute_update` on the engine path) + /// so a chained `update` doesn't see stale committed values that + /// a prior op already updated in pending. Without this, a predicate + /// like `where age > 30` can match a row that an earlier + /// `set age = 20` already moved out of range. + /// /// When `pending_batches` is empty this delegates to the regular /// scan path. pub async fn scan_with_pending( @@ -850,11 +866,30 @@ impl TableStore { pending_schema: Option, projection: Option<&[&str]>, filter: Option<&str>, + key_column: Option<&str>, ) -> Result> { - let mut out = self.scan(committed_ds, projection, filter, None).await?; + let committed = self.scan(committed_ds, projection, filter, None).await?; if pending_batches.is_empty() { - return Ok(out); + return Ok(committed); } + + // Shadow committed rows whose key value also appears in pending. + // This makes scan_with_pending implement merge semantics rather + // than naive union: any row that has a pending update is + // represented ONLY by its pending value, never by both its + // (stale) committed value and its (current) pending value. + let committed = match key_column { + Some(key_col) => { + let pending_keys = collect_string_column_values(pending_batches, key_col)?; + if pending_keys.is_empty() { + committed + } else { + filter_out_rows_where_string_in(committed, key_col, &pending_keys)? + } + } + None => committed, + }; + let pending = scan_pending_batches( pending_batches, pending_schema, @@ -862,62 +897,12 @@ impl TableStore { filter, ) .await?; + + let mut out = committed; out.extend(pending); Ok(out) } - /// `count_rows` variant that respects in-memory pending batches via - /// DataFusion `MemTable`. Used by edge-cardinality validation that - /// needs to see staged edges before commit. - /// - /// Cheaper than `scan_with_pending` for the count case because we - /// don't materialize columns on the pending side. - pub async fn count_rows_with_pending( - &self, - committed_ds: &Dataset, - pending_batches: &[RecordBatch], - pending_schema: Option, - filter: Option, - ) -> Result { - let committed = self.count_rows(committed_ds, filter.clone()).await?; - if pending_batches.is_empty() { - return Ok(committed); - } - // Count via DataFusion: COUNT(*) from pending [WHERE filter]. - let schema = - pending_schema.unwrap_or_else(|| pending_batches[0].schema()); - let ctx = datafusion::execution::context::SessionContext::new(); - let mem = datafusion::datasource::MemTable::try_new( - schema, - vec![pending_batches.to_vec()], - ) - .map_err(|e| OmniError::Lance(e.to_string()))?; - ctx.register_table("pending", Arc::new(mem)) - .map_err(|e| OmniError::Lance(e.to_string()))?; - let where_clause = filter - .map(|f| format!("WHERE {f}")) - .unwrap_or_default(); - let sql = format!("SELECT COUNT(*) AS c FROM pending {where_clause}"); - let df = ctx - .sql(&sql) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - let batches = df - .collect() - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - let pending_count = batches - .into_iter() - .filter(|b| b.num_rows() > 0) - .map(|b| { - use arrow_array::cast::AsArray; - let arr = b.column(0).as_primitive::(); - arr.value(0) as usize - }) - .sum::(); - Ok(committed + pending_count) - } - /// `count_rows` variant that respects staged writes. Used for /// edge-cardinality validation that needs to see staged edges before /// commit. Same `committed - removed + new` composition as @@ -1171,6 +1156,81 @@ fn assign_row_id_meta(fragments: &mut [Fragment], start_row_id: u64) -> Result<( /// /// `pending_batches` must be non-empty (the caller short-circuits on /// empty). +/// Collect the set of values in a Utf8 column across multiple batches. +/// Used by `scan_with_pending`'s merge-semantic path to identify +/// committed rows that are shadowed by pending writes. NULL values are +/// skipped. +fn collect_string_column_values( + batches: &[RecordBatch], + column: &str, +) -> Result> { + use arrow_array::{Array, StringArray}; + let mut out = std::collections::HashSet::new(); + for batch in batches { + let Some(col) = batch.column_by_name(column) else { + return Err(OmniError::Lance(format!( + "scan_with_pending: pending batch missing key column '{}'", + column + ))); + }; + let arr = col.as_any().downcast_ref::().ok_or_else(|| { + OmniError::Lance(format!( + "scan_with_pending: key column '{}' is not Utf8", + column + )) + })?; + for i in 0..arr.len() { + if arr.is_valid(i) { + out.insert(arr.value(i).to_string()); + } + } + } + Ok(out) +} + +/// Drop rows from `batches` whose Utf8 `column` value is in `excluded`. +/// Used by `scan_with_pending`'s merge-semantic path to shadow committed +/// rows that pending has already updated. Returns the surviving rows. +fn filter_out_rows_where_string_in( + batches: Vec, + column: &str, + excluded: &std::collections::HashSet, +) -> Result> { + use arrow_array::{Array, BooleanArray, StringArray}; + let mut out = Vec::with_capacity(batches.len()); + for batch in batches { + if batch.num_rows() == 0 { + out.push(batch); + continue; + } + let Some(col) = batch.column_by_name(column) else { + // The committed scan didn't project this column. We cannot + // shadow without it; pass the batch through unchanged. + out.push(batch); + continue; + }; + let arr = col.as_any().downcast_ref::().ok_or_else(|| { + OmniError::Lance(format!( + "scan_with_pending: committed column '{}' is not Utf8", + column + )) + })?; + let mask: BooleanArray = (0..arr.len()) + .map(|i| { + if arr.is_valid(i) { + Some(!excluded.contains(arr.value(i))) + } else { + Some(true) + } + }) + .collect(); + let filtered = arrow_select::filter::filter_record_batch(&batch, &mask) + .map_err(|e| OmniError::Lance(e.to_string()))?; + out.push(filtered); + } + Ok(out) +} + async fn scan_pending_batches( pending_batches: &[RecordBatch], pending_schema: Option, diff --git a/crates/omnigraph/tests/failpoints.rs b/crates/omnigraph/tests/failpoints.rs index bdc5f83..3adc15d 100644 --- a/crates/omnigraph/tests/failpoints.rs +++ b/crates/omnigraph/tests/failpoints.rs @@ -140,6 +140,130 @@ async fn schema_apply_recovers_partial_rename() { assert_no_staging_files(dir.path()); } +/// Pin the documented "finalize → publisher residual" from MR-794. +/// +/// `MutationStaging::finalize` runs `commit_staged` per touched table +/// sequentially before the publisher commits the manifest. Lance has no +/// multi-dataset atomic commit primitive, so a failure between the +/// per-table staged commits and the manifest commit leaves Lance HEAD +/// advanced on the touched tables with no manifest update — and the +/// next mutation surfaces `ExpectedVersionMismatch` on those tables. +/// +/// This isn't a code bug we can fix without an upstream Lance change; +/// it's the documented residual (see `docs/runs.md` "Finalize → +/// publisher residual"). The test pins the behavior so future code +/// changes catch any silent regression: if someone widens the residual +/// (e.g. failing earlier in finalize without rolling back), this test +/// will surface a different error than `ExpectedVersionMismatch`. If +/// someone narrows the residual (e.g. lance ships multi-dataset commit +/// and we plumb it), this test will start passing the next mutation +/// — and someone has to update the assertion + the docs. +#[tokio::test] +async fn finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers() { + use omnigraph::error::{ManifestConflictDetails, OmniError}; + + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let mut db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA) + .await + .unwrap(); + + { + let _failpoint = + ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + + // First mutation: finalize succeeds (commit_staged advances Lance + // HEAD on node:Person), then the failpoint kicks before the + // publisher's manifest commit. The caller sees the synthetic + // error. + let err = mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap_err(); + assert!( + err.to_string().contains( + "injected failpoint triggered: mutation.post_finalize_pre_publisher" + ), + "unexpected error: {err}" + ); + } + // Failpoint dropped — subsequent calls are not synthetic-failed. + + // Next mutation against the same table surfaces the documented + // residual: Lance HEAD on node:Person advanced (commit_staged ran), + // manifest didn't, so the publisher CAS at next-mutation time + // surfaces ExpectedVersionMismatch. + let err = mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Frank")], &[("$age", 33)]), + ) + .await + .unwrap_err(); + let OmniError::Manifest(manifest_err) = err else { + panic!("expected Manifest error, got {err:?}"); + }; + let Some(ManifestConflictDetails::ExpectedVersionMismatch { + ref table_key, + expected, + actual, + }) = manifest_err.details + else { + panic!( + "expected ExpectedVersionMismatch (the documented residual), got {:?}", + manifest_err.details + ); + }; + assert_eq!( + table_key, "node:Person", + "drift should be on the table the failed finalize touched" + ); + assert!( + actual > expected, + "Lance HEAD on the drifted table should be ahead of manifest pinned: actual={actual} expected={expected}", + ); +} + +/// Companion to the above — confirms that a finalize→publisher failure +/// on one table leaves OTHER tables untouched. Subsequent writes to +/// non-drifted tables proceed normally; the drift is contained. +#[tokio::test] +async fn finalize_publisher_residual_does_not_drift_untouched_tables() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let mut db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA) + .await + .unwrap(); + + { + let _failpoint = + ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _ = mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .expect_err("synthetic failpoint must fire"); + } + + // node:Person drifted. node:Company didn't — try a Company write. + use omnigraph::loader::{LoadMode, load_jsonl}; + load_jsonl( + &mut db, + r#"{"type": "Company", "data": {"name": "Acme"}}"#, + LoadMode::Append, + ) + .await + .expect("Company write on a non-drifted table should succeed"); +} + fn assert_no_staging_files(repo: &std::path::Path) { for name in [ "_schema.pg.staging", diff --git a/crates/omnigraph/tests/runs.rs b/crates/omnigraph/tests/runs.rs index 2e9b480..b9f18a8 100644 --- a/crates/omnigraph/tests/runs.rs +++ b/crates/omnigraph/tests/runs.rs @@ -507,6 +507,19 @@ query mixed_insert_and_delete($name: String, $age: I32, $victim: String) { insert Person { name: $name, age: $age } delete Person where name = $victim } + +query update_then_filter_by_old_value( + $first_name: String, $first_new_age: I32, + $second_threshold: I32, $second_new_age: I32 +) { + update Person set { age: $first_new_age } where name = $first_name + update Person set { age: $second_new_age } where age > $second_threshold +} + +query delete_two_persons($first: String, $second: String) { + delete Person where name = $first + delete Person where name = $second +} "#; /// D₂: a query mixing inserts/updates with deletes is rejected at parse @@ -838,3 +851,272 @@ edge WorksAt: Person -> Company @card(0..1) "follow-up load must succeed (no drift on edge table)", ); } + +// ─── PR #68 review-comment fixes — pinned coverage ────────────────────────── + +/// Codex P1 / Cubic P1 #1: chained `update` ops in one query must respect +/// each previous op's view of the rows. Without merge-shadow semantics on +/// `scan_with_pending`, the second update sees the stale committed value +/// (the first update's row still appears in the Lance scan because the +/// pending side hasn't committed), the predicate matches it, and the +/// dedupe-last-wins step at finalize ends up applying the second update +/// to a row whose pending value should have shielded it. +/// +/// Concretely: Alice starts at age=30 in TEST_DATA. Op-1 sets Alice to +/// age=99. Op-2 updates anyone with age > 50 to age=10. After op-1, +/// Alice's logical value is age=99 — within op-2's predicate. So op-2 +/// SHOULD update Alice to age=10. The interesting case is: op-2 must +/// see Alice at age=99 (op-1's pending value), not age=30 (committed). +/// If the helper unioned without shadowing, op-2 would also match the +/// stale committed Alice (age=30 doesn't trigger the predicate, but the +/// row would appear twice and dedupe could pick either). The test +/// asserts both ends: Alice ends at age=10, the publisher publishes +/// once. +#[tokio::test] +async fn chained_updates_with_overlapping_predicate_respects_intermediate_value() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let pre_version = version_main(&db).await.unwrap(); + + db.mutate( + "main", + STAGED_QUERIES, + "update_then_filter_by_old_value", + &mixed_params( + &[("$first_name", "Alice")], + &[ + ("$first_new_age", 99), + ("$second_threshold", 50), + ("$second_new_age", 10), + ], + ), + ) + .await + .unwrap(); + + // After op-1: Alice = 99. After op-2 (where age > 50): Alice + // matches (99 > 50) → set to 10. End state: Alice = 10. + let batches = read_table(&db, "node:Person").await; + let mut alice_age: Option = None; + for batch in &batches { + let names = batch + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let ages = batch + .column_by_name("age") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..batch.num_rows() { + if names.is_valid(i) && names.value(i) == "Alice" && ages.is_valid(i) { + alice_age = Some(ages.value(i)); + } + } + } + assert_eq!( + alice_age, + Some(10), + "chained-update final value must reflect the second update applied to op-1's pending value" + ); + + let post_version = version_main(&db).await.unwrap(); + assert_eq!( + post_version, + pre_version + 1, + "chained update must publish exactly once", + ); +} + +/// Cursor Bugbot HIGH: two `delete` ops on the same node table in one +/// query. Pre-fix, op-2's `open_table_for_mutation` went through +/// `open_for_mutation_on_branch` which trips `ensure_expected_version` +/// (Lance HEAD has advanced past the manifest's pinned version after +/// op-1's inline-commit, but the manifest hasn't moved). Post-fix, +/// `open_table_for_mutation` reopens via `inline_committed[table_key]` +/// at the post-delete Lance version. Test asserts both deletes succeed +/// in one query, both rows are gone, manifest version advances by 1. +#[tokio::test] +async fn multi_statement_delete_on_same_node_table() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let pre_persons = count_rows(&db, "node:Person").await; + let pre_version = version_main(&db).await.unwrap(); + + db.mutate( + "main", + STAGED_QUERIES, + "delete_two_persons", + ¶ms(&[("$first", "Alice"), ("$second", "Bob")]), + ) + .await + .expect("multi-delete on same table must succeed"); + + assert_eq!( + count_rows(&db, "node:Person").await, + pre_persons - 2, + "both deletes must land", + ); + let post_version = version_main(&db).await.unwrap(); + assert_eq!( + post_version, + pre_version + 1, + "multi-delete query publishes exactly once at end", + ); + + // Both rows actually gone: + for name in ["Alice", "Bob"] { + let qr = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", name)]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 0, "{name} should be deleted"); + } +} + +/// Cursor Bugbot HIGH (cascade variant): deleting a node cascades to its +/// edges, advancing Lance HEAD on the edge table. A subsequent +/// `delete ` op in the same query must reopen at the +/// post-cascade-commit version of the edge table — not trip +/// `ensure_expected_version` against the manifest's pinned version. +#[tokio::test] +async fn cascade_delete_node_then_explicit_delete_edge_on_same_table() { + const QUERY: &str = r#" +query cascade_then_explicit($name: String, $other: String) { + delete Person where name = $name + delete Knows where from = $other +} +"#; + + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let pre_knows = count_rows(&db, "edge:Knows").await; + + db.mutate( + "main", + QUERY, + "cascade_then_explicit", + ¶ms(&[("$name", "Alice"), ("$other", "Bob")]), + ) + .await + .expect("cascade-then-explicit-delete on same edge table must succeed"); + + let post_knows = count_rows(&db, "edge:Knows").await; + assert!( + post_knows < pre_knows, + "cascade + explicit delete should remove edges; pre={pre_knows} post={post_knows}", + ); +} + +/// Codex P2 / Cursor Bugbot LOW / Cubic P2: the engine cardinality path +/// must enforce `min` bounds. Pre-fix the engine path silently dropped +/// the min check (a `let _ = card.min;` line). The loader path always +/// enforced both. Post-fix, both paths route through +/// `enforce_cardinality_bounds` which checks both bounds. +/// +/// Build a custom schema with `Knows: Person -> Person @card(2..*)`. +/// Inserting a single Knows edge violates min=2. The mutation path must +/// reject. +#[tokio::test] +async fn mutation_insert_edge_enforces_min_cardinality() { + use omnigraph::loader::{LoadMode, load_jsonl}; + + const MIN_CARD_SCHEMA: &str = r#" +node Person { + name: String @key +} +edge Knows: Person -> Person @card(2..) +"#; + const MIN_CARD_QUERY: &str = r#" +query add_friend($from: String, $to: String) { + insert Knows { from: $from, to: $to } +} +"#; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, MIN_CARD_SCHEMA).await.unwrap(); + + let seed = r#"{"type": "Person", "data": {"name": "Alice"}} +{"type": "Person", "data": {"name": "Bob"}} +"#; + load_jsonl(&mut db, seed, LoadMode::Overwrite).await.unwrap(); + + // Single insert: count=1 < min=2 → reject with clear message. + let err = db + .mutate( + "main", + MIN_CARD_QUERY, + "add_friend", + ¶ms(&[("$from", "Alice"), ("$to", "Bob")]), + ) + .await + .expect_err("min cardinality must reject the engine path"); + let OmniError::Manifest(manifest_err) = err else { + panic!("expected Manifest error, got {err:?}"); + }; + assert!( + manifest_err.message.contains("@card violation") + && manifest_err.message.contains("min 2"), + "unexpected error: {}", + manifest_err.message, + ); +} + +/// Cubic P1 #2: `LoadMode::Merge` on edges must NOT double-count the +/// committed edge AND its updated pending replacement. Build a custom +/// schema where WorksAt has @card(0..1). Seed Alice with one WorksAt to +/// Acme. Then Merge-load the SAME edge id (so it's an update, not an +/// insert) pointing Alice's WorksAt at Bigco. Cardinality must count +/// Alice's edges as 1 (the post-merge count), not 2 (committed + pending). +#[tokio::test] +async fn load_merge_mode_dedupes_edge_for_cardinality_count() { + use omnigraph::loader::{LoadMode, load_jsonl}; + + const CARD_SCHEMA: &str = r#" +node Person { + name: String @key +} +node Company { + name: String @key +} +edge WorksAt: Person -> Company @card(0..1) +"#; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, CARD_SCHEMA).await.unwrap(); + + // Seed: Alice + Acme + Bigco + WorksAt(id=w1, Alice→Acme). Note the + // loader reads edge ids from the `data.id` field (not top-level), so + // we place the id inside `data` for both the seed and the update. + let seed = r#"{"type": "Person", "data": {"name": "Alice"}} +{"type": "Company", "data": {"name": "Acme"}} +{"type": "Company", "data": {"name": "Bigco"}} +{"edge": "WorksAt", "from": "Alice", "to": "Acme", "data": {"id": "w1"}} +"#; + load_jsonl(&mut db, seed, LoadMode::Overwrite).await.unwrap(); + + // Merge-update the same edge id w1 to point at Bigco. Counted naively + // as union, Alice has 2 WorksAt (committed Acme + pending Bigco) which + // would trip @card(0..1). With merge dedupe, Alice has 1 WorksAt. + let merge_data = r#"{"edge": "WorksAt", "from": "Alice", "to": "Bigco", "data": {"id": "w1"}} +"#; + load_jsonl(&mut db, merge_data, LoadMode::Merge) + .await + .expect("Merge update must dedupe the committed edge by id"); + + // Confirm there's exactly 1 WorksAt edge after merge. + assert_eq!(count_rows(&db, "edge:WorksAt").await, 1); +} diff --git a/docs/invariants.md b/docs/invariants.md index 3e3af86..996b5c0 100644 --- a/docs/invariants.md +++ b/docs/invariants.md @@ -110,7 +110,7 @@ Specific defaults (timeout values, memory caps, TTL windows) are *configuration* *Status: aspirational — referential integrity at scale requires SIP-backed cross-table validation; not yet implemented. Cross-batch / cross-version uniqueness tracked in MR-714.* 25. **Isolation: per-query snapshot; read-your-writes within and across queries in a session.** Each query reads from one consistent manifest version. Within a multi-statement mutation, the read subplan inside each write operator sees the writes from earlier statements. Across queries in a session, reads always resolve the latest manifest version — no reader pinning to older snapshots. - *Status: upheld for inserts/updates after MR-794 step 2+ — `MutationStaging`'s in-memory accumulator + `TableStore::scan_with_pending` (DataFusion `MemTable` union with the committed Lance scan) implements read-your-writes within a multi-statement mutation. Delete-touching mutations are limited to delete-only by parse-time D₂; closing the within-query RYW gap for deletes requires Lance's two-phase delete API (tracked: MR-793 / Lance-upstream).* + *Status: upheld for inserts/updates after MR-794 step 2+ — `MutationStaging`'s in-memory accumulator + `TableStore::scan_with_pending` (DataFusion `MemTable` union with the committed Lance scan, with merge-shadow semantics for chained updates) implements read-your-writes within a multi-statement mutation. Delete-touching mutations are limited to delete-only by parse-time D₂; closing the within-query RYW gap for deletes requires Lance's two-phase delete API (tracked: MR-793 / Lance-upstream lance-format/lance#6658). The "Lance HEAD ahead of `__manifest`" drift class is unreachable for op-execution failures (the partial-failure test pins this), but a narrowed residual remains at the finalize→publisher boundary because Lance has no multi-dataset commit primitive — see [docs/runs.md](runs.md) "Finalize → publisher residual".* 26. **Durability before acknowledgement.** Commit returns only after the substrate has confirmed durable persistence. No "fast" or "fire-and-forget" durability levels. diff --git a/docs/releases/v0.4.1.md b/docs/releases/v0.4.1.md index 4c5d44a..031b2e7 100644 --- a/docs/releases/v0.4.1.md +++ b/docs/releases/v0.4.1.md @@ -14,8 +14,13 @@ mutation proceeds normally. `MutationStaging.pending` per touched table. No Lance HEAD advance happens during op execution; one `stage_*` + `commit_staged` per table runs at end-of-query, then `ManifestBatchPublisher::publish` - commits the manifest atomically. A mid-query failure leaves Lance - HEAD untouched on staged tables. + commits the manifest atomically. **For op-execution failures** + (validation errors, missing endpoints, parse-time D₂ rejection), Lance + HEAD on every staged table is untouched and the next mutation + proceeds normally. A narrowed residual remains at the + finalize→publisher boundary (multi-table `commit_staged` is not + atomic with the manifest commit) — see [docs/runs.md](../runs.md) + "Finalize → publisher residual" for details. - **D₂ parse-time rule**: a single mutation query is either insert/update-only or delete-only. Mixed → rejected with a clear error directing the caller to split into two queries. Lance 4.0.0 diff --git a/docs/runs.md b/docs/runs.md index 63295fd..459b7ca 100644 --- a/docs/runs.md +++ b/docs/runs.md @@ -75,6 +75,42 @@ will replace it. Operator-driven (rare in agent workloads); document permanently until Lance exposes `Operation::Overwrite { fragments }` as a two-phase op. +### Finalize → publisher residual + +The staged-write rewire eliminates one drift class **by construction at +the writer layer**: an op that fails before pushing to the in-memory +accumulator (validation errors, missing endpoints, parse-time D₂ +rejection) leaves Lance HEAD untouched on every staged table. This is +the case the `partial_failure_leaves_target_queryable_and_unblocks_next_mutation` +test pins. + +A second, narrower drift class remains. `MutationStaging::finalize` +runs `stage_*` + `commit_staged` per touched table sequentially, then +the publisher commits the manifest. Lance has no multi-dataset atomic +commit, so the per-table `commit_staged` calls are independent +operations: if commit_staged on table N+1 fails *after* commit_staged +on tables 1..N succeeded, or if the publisher's CAS pre-check rejects +*after* every commit_staged succeeded, tables 1..N are left at +`Lance HEAD = manifest_pinned + 1`. The next mutation against those +tables surfaces `ManifestConflictDetails::ExpectedVersionMismatch` — +the same loud failure mode the rewire was designed to make rare, just +no longer "unreachable." + +Triggers: transient Lance write errors during finalize (object-store +retry budget exhaustion, disk full); persistent publisher contention +exceeding `PUBLISHER_RETRY_BUDGET = 5` retries. Closing this requires +either a Lance multi-dataset atomic-commit primitive (filed upstream +alongside the two-phase delete request) or a manifest-layer journal +that replays staged commits on next open. Both are heavyweight; the +v1 stance is "narrowed window, documented residual, surface the loud +error when it fires." + +The publisher-CAS contract is unchanged: a *concurrent writer* that +advances any of our touched tables between snapshot capture and +publisher commit produces exactly one winner. The residual above is +about *our* abandoned commits in the failure path, not about +concurrency races. + ## Conflict shape Concurrent writers to the same `(table, branch)` produce exactly one From 052b6e680fed8e6794f3df14a6307f0a9e3c731a Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Fri, 1 May 2026 20:47:45 +0200 Subject: [PATCH 07/10] =?UTF-8?q?MR-794=20step=202:=20address=20PR=20#68?= =?UTF-8?q?=20follow-up=20review=20(Cubic)=20=E2=80=94=20pending=20dedupe?= =?UTF-8?q?=20+=20projection=20guard=20+=20CI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three new findings from Cubic on commit 3223b51: * **Pending edge cardinality counted within-input duplicates** (P2): count_src_per_edge's pending walk added every row to the count, including duplicate rows that finalize will collapse via dedupe_merge_batches_by_id. A LoadMode::Merge with the same edge id twice would over-count → spurious @card violation. Fix: when dedupe_key_column is Some, walk pending in reverse, track seen keys via HashSet, count only the kept (last-occurrence) rows. Mirrors finalize-time dedupe so cardinality counts what stage_merge_insert actually publishes. * **scan_with_pending silently disabled merge-shadow when projection omitted key_column** (P2): if a caller passed Some("id") as key_column but their projection didn't include "id", the filter_out_rows_where_string_in helper passed batches through unchanged — silently degrading to union semantics. Fix: validate up front that projection contains key_column when both are Some; return a typed Lance error otherwise. Tightened the helper too: missing column is now an internal error (was a silent passthrough). * **Cascade-vs-explicit delete test was too weak** (P2): asserted only that edge count decreased after delete. The cascade alone could satisfy that even if the explicit second-delete silently no-op'd. Strengthened: assert post_knows == 0, which only holds when both ops landed (Bob→Diana would survive if op-2 no-op'd). CI gap: also added test_failpoints_feature job to .github/workflows/ci.yml. The workspace test runs without --features failpoints (the feature is behind a Cargo flag), so the failpoints test suite was never exercised by CI before now. The new job builds + runs `cargo test -p omnigraph-engine --features failpoints --test failpoints` on every full CI run, mirroring the test_aws_feature pattern. New tests on tests/runs.rs: * load_merge_mode_dedupes_within_pending_for_cardinality_count (Cubic P2 #2 — pending-vs-pending dedup, distinct from the load_merge_mode_dedupes_edge_for_cardinality_count test which covers committed-vs-pending dedup). * scan_with_pending_rejects_key_column_missing_from_projection (Cubic P2 #3 — verifies the up-front validation rejects bad callers and that the happy path still works correctly). Local test results: * tests/runs.rs: 23/23 passed * tests/failpoints.rs --features failpoints: 7/7 passed (includes the two new finalize→publisher residual tests landed in 3223b51). Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 42 +++++++ crates/omnigraph/src/exec/staging.rs | 84 ++++++++++++-- crates/omnigraph/src/table_store.rs | 36 +++++- crates/omnigraph/tests/runs.rs | 160 ++++++++++++++++++++++++++- 4 files changed, 306 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4cae31c..1d81f17 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -239,6 +239,48 @@ jobs: if: needs.classify_changes.outputs.run_full_ci == 'true' run: cargo test --locked -p omnigraph-server --features aws + test_failpoints_feature: + name: Test omnigraph-engine --features failpoints + needs: classify_changes + runs-on: ubuntu-latest + timeout-minutes: 30 + permissions: + contents: read + env: + CARGO_TERM_COLOR: always + steps: + - name: Skip for text-only changes + if: needs.classify_changes.outputs.run_full_ci != 'true' + run: echo "Text-only change detected; skipping failpoints feature build." + + - name: Checkout source + if: needs.classify_changes.outputs.run_full_ci == 'true' + uses: actions/checkout@v5.0.1 + + - name: Install system dependencies + if: needs.classify_changes.outputs.run_full_ci == 'true' + run: | + sudo apt-get update + sudo apt-get install -y protobuf-compiler libprotobuf-dev + + - name: Install Rust stable + if: needs.classify_changes.outputs.run_full_ci == 'true' + uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable + + - name: Cache Rust build data + if: needs.classify_changes.outputs.run_full_ci == 'true' + uses: Swatinem/rust-cache@v2 + with: + workspaces: | + . -> target + key: failpoints-feature + + - name: Run failpoints test suite + if: needs.classify_changes.outputs.run_full_ci == 'true' + run: cargo test --locked -p omnigraph-engine --features failpoints --test failpoints + rustfs_integration: name: RustFS S3 Integration needs: diff --git a/crates/omnigraph/src/exec/staging.rs b/crates/omnigraph/src/exec/staging.rs index 033cb99..7695906 100644 --- a/crates/omnigraph/src/exec/staging.rs +++ b/crates/omnigraph/src/exec/staging.rs @@ -481,12 +481,31 @@ pub(crate) async fn count_src_per_edge( } } - // Pending side: walk in-memory batches for `src`. (No dedupe needed — - // `dedupe_merge_batches_by_id` runs at finalize-time so any same-id - // duplicates within pending are already collapsed by the time the - // publisher commits, but cardinality runs before finalize. The - // engine's per-op edge insert produces one row per op with a fresh - // ULID, so within-pending duplicates are not a concern here.) + // Pending side: walk in-memory batches for `src`. When dedupe is on, + // collapse rows that share `dedupe_key_column` to their last occurrence + // — mirrors `dedupe_merge_batches_by_id`'s last-write-wins applied at + // finalize time, so cardinality counts what `commit_staged` will + // actually publish, not raw input duplicates. + // + // Without this, a Merge-mode load whose input JSONL has two rows with + // the same edge id would be double-counted here, even though the + // finalize-time dedupe would collapse them to one. The result: spurious + // `@card` violations on perfectly valid Merge inputs. + match dedupe_key_column { + Some(key_col) => count_pending_src_with_dedupe(pending_batches, key_col, &mut counts)?, + None => count_pending_src_naive(pending_batches, &mut counts), + } + + Ok(counts) +} + +/// Count pending edges per `src` with NO dedup. Correct when caller +/// guarantees pending rows have unique primary keys (engine inserts via +/// fresh ULID; loader Append mode). +fn count_pending_src_naive( + pending_batches: &[RecordBatch], + counts: &mut HashMap, +) { for batch in pending_batches { let Some(col) = batch.column_by_name("src") else { continue; @@ -500,8 +519,59 @@ pub(crate) async fn count_src_per_edge( } } } +} - Ok(counts) +/// Count pending edges per `src` after deduping rows that share +/// `dedupe_key_column`. Last occurrence wins (mirrors +/// `dedupe_merge_batches_by_id`'s walk-in-reverse contract). Required for +/// `LoadMode::Merge` where the same edge id may appear multiple times in +/// one load and finalize will collapse them to the last value. +fn count_pending_src_with_dedupe( + pending_batches: &[RecordBatch], + dedupe_key_column: &str, + counts: &mut HashMap, +) -> Result<()> { + // Walk in reverse, track seen keys, keep one (key, src) pair per key. + let mut seen: HashSet = HashSet::new(); + let mut kept_srcs: Vec = Vec::new(); + for batch in pending_batches.iter().rev() { + let Some(key_col) = batch.column_by_name(dedupe_key_column) else { + // Pending batch is missing the key column — fall back to naive + // counting for this batch (caller's contract was about merge + // semantics; if the column isn't there we can't dedupe). + continue; + }; + let key_arr = key_col.as_any().downcast_ref::().ok_or_else(|| { + OmniError::Lance(format!( + "count_src_per_edge: pending '{}' column is not Utf8", + dedupe_key_column + )) + })?; + let src_arr = batch + .column_by_name("src") + .and_then(|c| c.as_any().downcast_ref::()); + let Some(srcs) = src_arr else { + continue; + }; + for i in (0..batch.num_rows()).rev() { + if !srcs.is_valid(i) { + continue; + } + // NULL key: keep (NULL != NULL semantics — every NULL counts). + if !key_arr.is_valid(i) { + kept_srcs.push(srcs.value(i).to_string()); + continue; + } + let key = key_arr.value(i); + if seen.insert(key.to_string()) { + kept_srcs.push(srcs.value(i).to_string()); + } + } + } + for src in kept_srcs { + *counts.entry(src).or_insert(0) += 1; + } + Ok(()) } /// Apply `@card(min..max)` bounds to a per-source count map. diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index d87858d..c460e51 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -868,6 +868,24 @@ impl TableStore { filter: Option<&str>, key_column: Option<&str>, ) -> Result> { + // Contract: when merge-shadow semantics are requested via + // `key_column`, the committed-side projection MUST include that + // column so we can filter committed rows whose key appears in + // pending. Silently dropping the shadow when projection omits + // the key would re-introduce union semantics behind the + // caller's back. Reject up front with a clear error so callers + // either (a) include the key in projection or (b) drop + // `key_column` if union is what they wanted. + if let (Some(key_col), Some(cols)) = (key_column, projection) { + if !cols.iter().any(|c| *c == key_col) { + return Err(OmniError::Lance(format!( + "scan_with_pending: key_column '{}' must appear in projection \ + when merge-shadow semantics are requested (got projection = {:?})", + key_col, cols + ))); + } + } + let committed = self.scan(committed_ds, projection, filter, None).await?; if pending_batches.is_empty() { return Ok(committed); @@ -1191,6 +1209,11 @@ fn collect_string_column_values( /// Drop rows from `batches` whose Utf8 `column` value is in `excluded`. /// Used by `scan_with_pending`'s merge-semantic path to shadow committed /// rows that pending has already updated. Returns the surviving rows. +/// +/// `scan_with_pending` validates up front that the projection contains +/// `column`, so a missing column here is a programmer error — error +/// loudly instead of silently passing batches through (which would +/// re-introduce the union semantics the caller asked us to avoid). fn filter_out_rows_where_string_in( batches: Vec, column: &str, @@ -1203,12 +1226,13 @@ fn filter_out_rows_where_string_in( out.push(batch); continue; } - let Some(col) = batch.column_by_name(column) else { - // The committed scan didn't project this column. We cannot - // shadow without it; pass the batch through unchanged. - out.push(batch); - continue; - }; + let col = batch.column_by_name(column).ok_or_else(|| { + OmniError::manifest_internal(format!( + "scan_with_pending: committed batch missing key column '{}' \ + (the up-front projection check should have rejected this)", + column + )) + })?; let arr = col.as_any().downcast_ref::().ok_or_else(|| { OmniError::Lance(format!( "scan_with_pending: committed column '{}' is not Utf8", diff --git a/crates/omnigraph/tests/runs.rs b/crates/omnigraph/tests/runs.rs index b9f18a8..5353a2f 100644 --- a/crates/omnigraph/tests/runs.rs +++ b/crates/omnigraph/tests/runs.rs @@ -1001,7 +1001,15 @@ query cascade_then_explicit($name: String, $other: String) { let dir = tempfile::tempdir().unwrap(); let mut db = init_and_load(&dir).await; + // TEST_DATA seeds three Knows edges: + // Alice → Bob, Alice → Charlie (cascade target — should be deleted by op-1) + // Bob → Diana (explicit target — should be deleted by op-2) + // After both ops, all three edges must be gone. A weaker assertion + // (just "count decreased") would pass even if op-2 silently no-op'd + // — Bob→Diana would survive. The exact-count check makes both ops + // independently observable. let pre_knows = count_rows(&db, "edge:Knows").await; + assert_eq!(pre_knows, 3, "fixture invariant: TEST_DATA seeds 3 Knows edges"); db.mutate( "main", @@ -1012,10 +1020,13 @@ query cascade_then_explicit($name: String, $other: String) { .await .expect("cascade-then-explicit-delete on same edge table must succeed"); + // Both ops landed: cascade removed Alice→Bob and Alice→Charlie; + // explicit removed Bob→Diana. Anything > 0 means one op silently + // did nothing (the bug we're guarding against). let post_knows = count_rows(&db, "edge:Knows").await; - assert!( - post_knows < pre_knows, - "cascade + explicit delete should remove edges; pre={pre_knows} post={post_knows}", + assert_eq!( + post_knows, 0, + "both cascade + explicit delete must complete (Bob→Diana would survive if op-2 no-op'd)", ); } @@ -1120,3 +1131,146 @@ edge WorksAt: Person -> Company @card(0..1) // Confirm there's exactly 1 WorksAt edge after merge. assert_eq!(count_rows(&db, "edge:WorksAt").await, 1); } + +/// Cubic P2 (follow-up to PR #68 review of commit 3223b51): a Merge load +/// whose input has TWO rows with the same edge id must be deduped at +/// cardinality-count time, not just at finalize. Without dedup, two +/// pending rows count twice → spurious `@card` violation. With dedup +/// (last-occurrence-wins, mirroring `dedupe_merge_batches_by_id`), the +/// pending side counts once. +/// +/// This is a separate path from `load_merge_mode_dedupes_edge_for_cardinality_count` +/// (which dedupes committed-vs-pending). Here we verify pending-vs-pending +/// dedup. +#[tokio::test] +async fn load_merge_mode_dedupes_within_pending_for_cardinality_count() { + use omnigraph::loader::{LoadMode, load_jsonl}; + + const CARD_SCHEMA: &str = r#" +node Person { + name: String @key +} +node Company { + name: String @key +} +edge WorksAt: Person -> Company @card(0..1) +"#; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, CARD_SCHEMA).await.unwrap(); + + let seed = r#"{"type": "Person", "data": {"name": "Alice"}} +{"type": "Company", "data": {"name": "Acme"}} +{"type": "Company", "data": {"name": "Bigco"}} +"#; + load_jsonl(&mut db, seed, LoadMode::Overwrite).await.unwrap(); + + // Merge load with the SAME edge id twice — the second row supersedes + // the first in the finalize-time dedupe. If pending-counting doesn't + // dedupe, Alice has 2 pending edges → @card(0..1) trips → load + // fails. With dedupe, Alice has 1 → load succeeds. + let dup_data = r#"{"edge": "WorksAt", "from": "Alice", "to": "Acme", "data": {"id": "w1"}} +{"edge": "WorksAt", "from": "Alice", "to": "Bigco", "data": {"id": "w1"}} +"#; + load_jsonl(&mut db, dup_data, LoadMode::Merge) + .await + .expect("Merge load with within-input dup ids must dedupe pending count"); + + // Exactly one WorksAt edge after the dedup; the second row wins + // (last-occurrence) so dst should be Bigco. + assert_eq!(count_rows(&db, "edge:WorksAt").await, 1); +} + +/// Cubic P2 (follow-up): `scan_with_pending` must reject a call where +/// `key_column` is requested but the projection omits that column. +/// Without the up-front check, the helper silently degraded to union +/// semantics — letting a chained-update bug slip through unnoticed. +/// This test verifies the contract is enforced at the API boundary. +#[tokio::test] +async fn scan_with_pending_rejects_key_column_missing_from_projection() { + use arrow_array::{RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Schema}; + use omnigraph::table_store::TableStore; + use std::sync::Arc; + + let dir = tempfile::tempdir().unwrap(); + let uri = format!("{}/people.lance", dir.path().to_str().unwrap()); + let store = TableStore::new(dir.path().to_str().unwrap()); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Utf8, false), + Field::new("note", DataType::Utf8, true), + ])); + let seed = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["a", "b"])) as _, + Arc::new(StringArray::from(vec![Some("seed-a"), Some("seed-b")])) as _, + ], + ) + .unwrap(); + let ds = TableStore::write_dataset(&uri, seed).await.unwrap(); + + let pending = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["a"])) as _, + Arc::new(StringArray::from(vec![Some("pending-a")])) as _, + ], + ) + .unwrap(); + + // Bad call: key_column = "id" but projection doesn't include "id". + // Pre-fix this silently disabled merge-shadowing and returned both + // committed "a" and pending "a" rows. Now it must error. + let err = store + .scan_with_pending( + &ds, + std::slice::from_ref(&pending), + None, + Some(&["note"]), + None, + Some("id"), + ) + .await + .expect_err("scan_with_pending must reject merge-shadow with missing key in projection"); + let msg = err.to_string(); + assert!( + msg.contains("key_column 'id'") && msg.contains("must appear in projection"), + "unexpected error: {msg}" + ); + + // Good call: projection includes the key column. Shadow works: + // pending row 'a' shadows committed 'a', so the result has only + // committed 'b' + pending 'a'. + let batches = store + .scan_with_pending( + &ds, + std::slice::from_ref(&pending), + None, + Some(&["id", "note"]), + None, + Some("id"), + ) + .await + .expect("projection containing key_column must succeed"); + let mut ids: Vec = Vec::new(); + for b in &batches { + let arr = b + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..arr.len() { + ids.push(arr.value(i).to_string()); + } + } + ids.sort(); + assert_eq!( + ids, + vec!["a", "b"], + "merge-shadow should drop committed 'a' and surface pending 'a' + committed 'b'" + ); +} From 675568ce85eadd3d242c7b70c96a72ffc6221f99 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Fri, 1 May 2026 21:15:14 +0200 Subject: [PATCH 08/10] ci: fold failpoints test into Test Workspace job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The standalone test_failpoints_feature job took 21min on first run (cold cache; the omnigraph-engine crate has lance + datafusion deps that make any fresh build expensive). Folding into Test Workspace shares the warm cache so the failpoints invocation is incremental — ~30s vs 21min on subsequent runs, and within the workspace job's existing budget. The failpoints feature is gated behind a Cargo flag and only adds the small `fail` crate dep + a few feature-gated code paths; it doesn't change the dep tree of any other crate. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 52 ++++++++-------------------------------- 1 file changed, 10 insertions(+), 42 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1d81f17..f36b31e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -161,6 +161,16 @@ jobs: OMNIGRAPH_UPDATE_OPENAPI: ${{ (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) && '1' || '' }} run: cargo test --workspace --locked + - name: Run failpoints feature test + if: needs.classify_changes.outputs.run_full_ci == 'true' + # Run after the workspace test so the build cache is warm — + # enabling --features failpoints is just an incremental rebuild + # of omnigraph-engine + the small `fail` crate, not the full + # dep tree (lance, datafusion). A separate job with its own + # cache key would be a fresh ~20min build on first run; this + # is ~30s on a warm cache. + run: cargo test --locked -p omnigraph-engine --features failpoints --test failpoints + - name: Commit regenerated openapi.json to PR branch if: | needs.classify_changes.outputs.run_full_ci == 'true' && @@ -239,48 +249,6 @@ jobs: if: needs.classify_changes.outputs.run_full_ci == 'true' run: cargo test --locked -p omnigraph-server --features aws - test_failpoints_feature: - name: Test omnigraph-engine --features failpoints - needs: classify_changes - runs-on: ubuntu-latest - timeout-minutes: 30 - permissions: - contents: read - env: - CARGO_TERM_COLOR: always - steps: - - name: Skip for text-only changes - if: needs.classify_changes.outputs.run_full_ci != 'true' - run: echo "Text-only change detected; skipping failpoints feature build." - - - name: Checkout source - if: needs.classify_changes.outputs.run_full_ci == 'true' - uses: actions/checkout@v5.0.1 - - - name: Install system dependencies - if: needs.classify_changes.outputs.run_full_ci == 'true' - run: | - sudo apt-get update - sudo apt-get install -y protobuf-compiler libprotobuf-dev - - - name: Install Rust stable - if: needs.classify_changes.outputs.run_full_ci == 'true' - uses: dtolnay/rust-toolchain@stable - with: - toolchain: stable - - - name: Cache Rust build data - if: needs.classify_changes.outputs.run_full_ci == 'true' - uses: Swatinem/rust-cache@v2 - with: - workspaces: | - . -> target - key: failpoints-feature - - - name: Run failpoints test suite - if: needs.classify_changes.outputs.run_full_ci == 'true' - run: cargo test --locked -p omnigraph-engine --features failpoints --test failpoints - rustfs_integration: name: RustFS S3 Integration needs: From ea16c74329be48d641ecfb3d0341d35789556987 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Fri, 1 May 2026 21:50:13 +0200 Subject: [PATCH 09/10] MR-794 step 2: address Cursor Bugbot follow-ups on commits 3223b51 + 052b6e6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four code/doc fixes from the latest Cursor Bugbot pass: * **Misplaced doc comment in table_store.rs (Medium):** the doc block intended for `scan_pending_batches` was, after my earlier edit, attached to `collect_string_column_values` because the new helper was inserted between the original docblock and `scan_pending_batches`. Move the docblock back onto its function and add a note about the shared SQL-dialect contract with the Lance scanner (the predicate goes to both, which is fine for `predicate_to_sql`'s plain comparison shapes today; future Lance-specific scanner extensions in the filter would need translation). * **Missing null check on committed `id` column (Low):** the committed-side loop in `collect_node_ids_with_pending` (and the parallel non-pending `collect_node_ids`) read `id_col.value(i)` without `is_valid(i)` first. `id` is the @key column on every node type and non-nullable by schema, so this is unreachable today, but the inconsistency with the pending-side `is_valid` guard is worth closing for symmetry / defense. * **Misleading comment in count_pending_src_with_dedupe (Low):** the comment claimed "fall back to naive counting" but the code did `continue`. Fix: it's unreachable in practice (the pending-side schema always contains the key when the caller passes one), so failing loudly with a typed error if it ever does fire is correct — silently skipping the batch would let `@card` violations slip past validation. * **PendingTable.schema mismatch surfaces too late (Medium):** PendingTable captures the schema from the first batch and never updates it. On a blob-bearing table, `insert` produces a full-schema batch and `update` (without assigning every blob) produces a subset-schema batch. Pre-fix the mismatch surfaced inside finalize/MemTable construction — distant from the offending op. Post-fix `MutationStaging::append_batch` validates the new batch's schema against the existing accumulator's schema and returns a typed error directing the caller to split the mutation. Error fires at the offending op, not at end-of-query. New helper `schemas_compatible` compares field name + data_type pairs; nullability and field metadata differences stay tolerated (downstream concat already permits those). Cubic Cursor Bugbot finding #5 (cascade delete edge re-open) self-resolved in the bot's own analysis ("logic appears sound on re-examination") — no action. New test on tests/runs.rs: * append_batch_rejects_mismatched_schema_in_blob_table_at_offending_op — pins the early-error path. Builds a blob-bearing schema, runs an `insert + update` query where the update doesn't assign the blob, asserts the error fires at the second op with the "Split the mutation" message and the manifest is unchanged. Local: tests/runs.rs 24/24 passed. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/omnigraph/src/exec/staging.rs | 62 +++++++++++++++++-- crates/omnigraph/src/loader/mod.rs | 11 +++- crates/omnigraph/src/table_store.rs | 21 +++++-- crates/omnigraph/tests/runs.rs | 90 ++++++++++++++++++++++++++++ 4 files changed, 173 insertions(+), 11 deletions(-) diff --git a/crates/omnigraph/src/exec/staging.rs b/crates/omnigraph/src/exec/staging.rs index 7695906..92ddfac 100644 --- a/crates/omnigraph/src/exec/staging.rs +++ b/crates/omnigraph/src/exec/staging.rs @@ -133,6 +133,33 @@ impl MutationStaging { // be appended. return Ok(()); } + // If we've already accumulated a batch on this table, the new + // batch's schema MUST match the existing accumulator's schema. + // The mismatch case in practice is a blob-bearing table that + // sees an `insert` (full schema, blob columns included) and + // then an `update` whose `apply_assignments` output omits + // unassigned blob columns (subset schema). Concat-time and + // MemTable-construction errors would catch this later, but + // surfacing it at the offending `append_batch` call gives the + // caller a clearer point of failure attached to the specific + // op that introduced the drift. + if let Some(existing) = self.pending.get(table_key) { + if !schemas_compatible(&existing.schema, &batch.schema()) { + return Err(OmniError::manifest(format!( + "table '{}' accumulated mutation batches with mismatched schemas: \ + prior batches have {} columns, this batch has {}. \ + This typically happens on a blob-bearing table when one \ + op uses the full schema (e.g. an `insert`) and another \ + omits unassigned blob columns (e.g. an `update` that \ + doesn't set every blob property). Split the mutation \ + into two queries: one for the inserts, one for the \ + updates.", + table_key, + existing.schema.fields().len(), + batch.schema().fields().len(), + ))); + } + } let entry = self .pending .entry(table_key.to_string()) @@ -305,6 +332,24 @@ impl MutationStaging { /// arbitrary results on duplicates). /// /// `batches` must be non-empty and all share `schema` (caller enforces). +/// Compare two schemas for the purposes of `MutationStaging::append_batch`'s +/// accumulator-compatibility check. We treat schemas as compatible if +/// they have the same field names and data types in the same order. +/// Nullability and field metadata differences are tolerated — Lance and +/// Arrow round-trip these freely and the accumulator's downstream +/// `concat_batches` is also permissive on those. +fn schemas_compatible(a: &SchemaRef, b: &SchemaRef) -> bool { + if a.fields().len() != b.fields().len() { + return false; + } + for (af, bf) in a.fields().iter().zip(b.fields().iter()) { + if af.name() != bf.name() || af.data_type() != bf.data_type() { + return false; + } + } + true +} + fn dedupe_merge_batches_by_id( schema: &SchemaRef, batches: Vec, @@ -536,10 +581,19 @@ fn count_pending_src_with_dedupe( let mut kept_srcs: Vec = Vec::new(); for batch in pending_batches.iter().rev() { let Some(key_col) = batch.column_by_name(dedupe_key_column) else { - // Pending batch is missing the key column — fall back to naive - // counting for this batch (caller's contract was about merge - // semantics; if the column isn't there we can't dedupe). - continue; + // Pending batch is missing the key column. By construction + // this is unreachable: callers in dedupe mode always push + // batches whose schema contains the key (loader Merge mode + // builds via build_edge_batch which always emits `id`; the + // append_batch schema-compatibility check at the call site + // would also reject a heterogeneous mix). If it ever fires + // it's a programmer error — fail loudly rather than skip + // counting (which would let `@card` violations slip). + return Err(OmniError::manifest_internal(format!( + "count_pending_src_with_dedupe: pending batch missing dedup key column '{}' \ + (schema-compat check at append_batch should have rejected this)", + dedupe_key_column + ))); }; let key_arr = key_col.as_any().downcast_ref::().ok_or_else(|| { OmniError::Lance(format!( diff --git a/crates/omnigraph/src/loader/mod.rs b/crates/omnigraph/src/loader/mod.rs index a349a64..374adc9 100644 --- a/crates/omnigraph/src/loader/mod.rs +++ b/crates/omnigraph/src/loader/mod.rs @@ -1655,7 +1655,13 @@ async fn collect_node_ids_with_pending( .downcast_ref::() .ok_or_else(|| OmniError::Lance("'id' column is not Utf8".into()))?; for i in 0..batch.num_rows() { - ids.insert(id_col.value(i).to_string()); + // Defensive: `id` is the @key column on every node type and + // is non-nullable by schema, but a committed-row corruption + // (or future schema change) could surface a NULL. Skip + // rather than insert "" — pending-side does the same. + if id_col.is_valid(i) { + ids.insert(id_col.value(i).to_string()); + } } } @@ -1718,6 +1724,9 @@ async fn collect_node_ids( .downcast_ref::() .unwrap(); for i in 0..batch.num_rows() { + if !id_col.is_valid(i) { + continue; + } ids.insert(id_col.value(i).to_string()); } } diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index c460e51..b0ecc18 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -1168,12 +1168,6 @@ fn assign_row_id_meta(fragments: &mut [Fragment], start_row_id: u64) -> Result<( Ok(()) } -/// Apply `projection` and `filter` to in-memory pending batches via a -/// fresh DataFusion `SessionContext`. Used by `scan_with_pending` for -/// the read-your-writes side of MR-794's in-memory accumulator. -/// -/// `pending_batches` must be non-empty (the caller short-circuits on -/// empty). /// Collect the set of values in a Utf8 column across multiple batches. /// Used by `scan_with_pending`'s merge-semantic path to identify /// committed rows that are shadowed by pending writes. NULL values are @@ -1255,6 +1249,21 @@ fn filter_out_rows_where_string_in( Ok(out) } +/// Apply `projection` and `filter` to in-memory pending batches via a +/// fresh DataFusion `SessionContext`. Used by `scan_with_pending` for +/// the read-your-writes side of MR-794's in-memory accumulator. +/// +/// `pending_batches` must be non-empty (the caller short-circuits on +/// empty). +/// +/// **SQL dialect contract.** `filter` is also passed to Lance's scanner +/// on the committed side. Lance and DataFusion both accept standard +/// SQL comparison predicates (`col op literal`) and OmniGraph's +/// `predicate_to_sql` only emits those shapes today (`=`, `!=`, `>`, +/// `<`, `>=`, `<=`). If a future caller introduces a Lance-specific +/// scanner extension (vector search, FTS, `_rowid` references) into +/// the filter, this function will need explicit translation — DataFusion +/// won't recognize those operators against the in-memory `MemTable`. async fn scan_pending_batches( pending_batches: &[RecordBatch], pending_schema: Option, diff --git a/crates/omnigraph/tests/runs.rs b/crates/omnigraph/tests/runs.rs index 5353a2f..74bafc1 100644 --- a/crates/omnigraph/tests/runs.rs +++ b/crates/omnigraph/tests/runs.rs @@ -1274,3 +1274,93 @@ async fn scan_with_pending_rejects_key_column_missing_from_projection() { "merge-shadow should drop committed 'a' and surface pending 'a' + committed 'b'" ); } + +/// Cursor Bugbot Medium (follow-up on commit 052b6e6): the +/// `PendingTable.schema` field is captured from the first `append_batch` +/// call and never updated. On a blob-bearing table, an `insert` +/// produces a full-schema batch (blob columns included) and an `update` +/// that doesn't assign every blob produces a subset-schema batch. Mixed +/// in one query, the second `append_batch` would silently push an +/// incompatible batch — the mismatch surfaced eventually at +/// `concat_batches`/MemTable construction inside finalize, but the +/// failure point was distant from the offending op. +/// +/// Post-fix: `append_batch` validates the new batch's schema against +/// the existing accumulator's schema and returns a typed error +/// directing the caller to split the mutation. The error fires at the +/// second op (the update), not at end-of-query. +#[tokio::test] +async fn append_batch_rejects_mismatched_schema_in_blob_table_at_offending_op() { + use omnigraph::loader::{LoadMode, load_jsonl}; + + const BLOB_SCHEMA: &str = r#" +node Document { + title: String @key + content: Blob? + note: String? +} +"#; + const BLOB_QUERIES: &str = r#" +query insert_then_update_note( + $title: String, $blob: String, $note: String +) { + insert Document { title: $title, content: $blob } + update Document set { note: $note } where title = $title +} +"#; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + + // Seed with a Document so the update has something to match (the + // mid-query case is the chained-update scenario where the update's + // predicate matches the just-inserted row, exercising the in-memory + // pending union). + load_jsonl( + &mut db, + r#"{"type":"Document","data":{"title":"seed","content":"base64:AQID"}}"#, + LoadMode::Overwrite, + ) + .await + .unwrap(); + + let err = db + .mutate( + "main", + BLOB_QUERIES, + "insert_then_update_note", + ¶ms(&[ + ("$title", "letter"), + ("$blob", "base64:BAUG"), + ("$note", "draft 1"), + ]), + ) + .await + .expect_err("blob-table mixed insert+update with non-fully-assigned blob must error early"); + let OmniError::Manifest(manifest_err) = err else { + panic!("expected Manifest error, got {err:?}"); + }; + assert!( + manifest_err.message.contains("mismatched schemas") + && manifest_err.message.contains("Split the mutation"), + "error must direct user to split: {}", + manifest_err.message, + ); + + // Confirm the manifest didn't advance — early error must be + // before any commit. + let qr = db + .query( + ReadTarget::branch("main"), + r#"query get_doc($title: String) { + match { $d: Document { title: $title } } + return { $d.title } + }"#, + "get_doc", + ¶ms(&[("$title", "letter")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 0, "letter must not be visible after early error"); +} From 044ed4601964fab172cc1bb472de61e72f6d3150 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Fri, 1 May 2026 22:45:38 +0200 Subject: [PATCH 10/10] chore: scrub Linear ticket numbers and review-bot mentions from code comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OmniGraph is OSS; internal Linear ticket references and code-review-bot mentions in source-code comments don't help external readers and leak internal tooling. Replace ticket numbers (MR-XXX) with descriptive prose, drop linear.app URLs, and remove inline mentions of Cursor/Bugbot/Cubic/Codex review threads. Scope is limited to source-code comments (`crates/`). Docs under `docs/` keep their MR-XXX references — those are part of the established change-history narrative for in-repo docs and don't require a Linear account to find context for. No behavior changes; no public API changes. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/omnigraph-cli/tests/cli.rs | 2 +- crates/omnigraph-cli/tests/system_local.rs | 20 ++-- crates/omnigraph-cli/tests/system_remote.rs | 2 +- crates/omnigraph-server/tests/server.rs | 10 +- crates/omnigraph/src/db/omnigraph.rs | 14 +-- crates/omnigraph/src/db/omnigraph/optimize.rs | 4 +- .../src/db/omnigraph/schema_apply.rs | 6 +- crates/omnigraph/src/db/run_registry.rs | 16 +-- crates/omnigraph/src/exec/mutation.rs | 23 ++-- crates/omnigraph/src/exec/staging.rs | 4 +- crates/omnigraph/src/loader/mod.rs | 17 ++- crates/omnigraph/src/table_store.rs | 44 ++++---- crates/omnigraph/tests/consistency.rs | 11 +- crates/omnigraph/tests/failpoints.rs | 3 +- crates/omnigraph/tests/runs.rs | 102 +++++++++--------- crates/omnigraph/tests/s3_storage.rs | 4 +- crates/omnigraph/tests/staged_writes.rs | 44 ++++---- 17 files changed, 164 insertions(+), 162 deletions(-) diff --git a/crates/omnigraph-cli/tests/cli.rs b/crates/omnigraph-cli/tests/cli.rs index b1c1398..9dc7338 100644 --- a/crates/omnigraph-cli/tests/cli.rs +++ b/crates/omnigraph-cli/tests/cli.rs @@ -1905,7 +1905,7 @@ fn cli_fails_for_invalid_merge_requests() { ); } -// MR-771: `omnigraph run list/show/publish/abort` subcommands removed +// `omnigraph run list/show/publish/abort` subcommands removed // alongside the run state machine. Direct-to-target writes leave nothing // for these CLIs to manage. Audit history is now visible via // `omnigraph commit list` reading the commit graph. diff --git a/crates/omnigraph-cli/tests/system_local.rs b/crates/omnigraph-cli/tests/system_local.rs index f5e75eb..d890603 100644 --- a/crates/omnigraph-cli/tests/system_local.rs +++ b/crates/omnigraph-cli/tests/system_local.rs @@ -307,7 +307,7 @@ fn local_cli_end_to_end_branch_change_merge_flow() { assert_eq!(main_read["row_count"], 1); assert_eq!(main_read["rows"][0]["p.name"], "Zoe"); - // MR-771: `omnigraph run list` removed. Audit visible via commit list. + // `omnigraph run list` removed. Audit visible via commit list. let commits_payload = parse_stdout_json(&output_success( cli() .arg("commit") @@ -597,8 +597,8 @@ fn local_cli_failed_load_keeps_target_state_unchanged() { snapshot_table_row_count(&repo, "edge:Knows"), knows_rows_before ); - // MR-771: failed loads no longer leave a RunRecord. The atomicity - // guarantee is verified above (target tables are unchanged). + // Failed loads leave no run record (the run lifecycle has been + // removed); atomicity is verified above by the unchanged target. } #[test] @@ -631,9 +631,8 @@ fn local_cli_failed_change_keeps_target_state_unchanged() { .arg("--json"), )); assert_eq!(friends_payload["row_count"], 2); - // MR-771: failed mutations no longer leave a RunRecord. The atomicity - // guarantee is verified above (the friends_of read above shows main - // unchanged). + // Failed mutations leave no run record (the run lifecycle has been + // removed); atomicity is verified above by the unchanged target. } #[test] @@ -941,12 +940,13 @@ query vector_search($q: String) { assert_eq!(result["rows"][0]["d.slug"], "alpha-doc"); } -// MR-771: the publisher CAS conflict shape is verified end-to-end at the -// engine level in `crates/omnigraph/tests/runs.rs::concurrent_writers_one_succeeds_one_gets_expected_version_mismatch` +// The publisher CAS conflict shape is verified end-to-end at the engine +// level in +// `crates/omnigraph/tests/runs.rs::concurrent_writers_one_succeeds_one_gets_expected_version_mismatch` // and at the HTTP boundary in // `crates/omnigraph-server/tests/server.rs::change_conflict_returns_manifest_conflict_409`. -// The pre-MR-771 CLI-level race was timing-dependent; with direct-publish -// the surface is the same engine path the unit test already covers. +// A CLI-level race would be timing-dependent; with direct-publish the +// surface is the same engine path the unit test already covers. #[test] fn local_cli_policy_tooling_is_end_to_end_while_local_writes_stay_unenforced() { diff --git a/crates/omnigraph-cli/tests/system_remote.rs b/crates/omnigraph-cli/tests/system_remote.rs index cecd725..15f3a6f 100644 --- a/crates/omnigraph-cli/tests/system_remote.rs +++ b/crates/omnigraph-cli/tests/system_remote.rs @@ -192,7 +192,7 @@ query insert_person($name: String, $age: I32) { assert_eq!(local_verify["row_count"], 1); assert_eq!(local_verify["rows"][0]["p.name"], "Mina"); - // MR-771: `run publish` / `run list` removed. Direct-to-target writes + // `run publish` / `run list` removed. Direct-to-target writes // already landed via the change call above; the commit graph is now // the audit surface (verified separately by `commit list`). } diff --git a/crates/omnigraph-server/tests/server.rs b/crates/omnigraph-server/tests/server.rs index 60d4043..f8d6b8d 100644 --- a/crates/omnigraph-server/tests/server.rs +++ b/crates/omnigraph-server/tests/server.rs @@ -1346,7 +1346,7 @@ async fn policy_blocks_non_admin_merge_to_main_and_allows_admin() { #[tokio::test(flavor = "multi_thread")] async fn authenticated_change_stamps_actor_on_commits() { - // MR-771: with the Run state machine removed, actor_id is recorded + // With the Run state machine removed, actor_id is recorded // directly on the commit graph (no intermediate run record). let (_temp, app) = app_for_loaded_repo_with_auth_tokens(&[("act-andrew", "token-one")]).await; @@ -2108,10 +2108,10 @@ query vector_search_string($q: String) { #[tokio::test(flavor = "multi_thread")] async fn change_conflict_returns_manifest_conflict_409() { - // MR-771: a write that races with another writer surfaces as HTTP 409 - // with a structured `manifest_conflict` body — `table_key`, `expected`, - // and `actual` — so clients can detect-and-retry without parsing the - // message. (Replaces the old run-publish merge-conflict shape.) + // A write that races with another writer surfaces as HTTP 409 with + // a structured `manifest_conflict` body — `table_key`, `expected`, + // and `actual` — so clients can detect-and-retry without parsing + // the message. let temp = init_loaded_repo().await; let repo = repo_path(temp.path()); diff --git a/crates/omnigraph/src/db/omnigraph.rs b/crates/omnigraph/src/db/omnigraph.rs index 01cf551..a924b1f 100644 --- a/crates/omnigraph/src/db/omnigraph.rs +++ b/crates/omnigraph/src/db/omnigraph.rs @@ -1430,12 +1430,12 @@ edge WorksAt: Person -> Company #[tokio::test] async fn test_apply_schema_succeeds_after_load() { - // MR-670 + MR-674: schema apply used to be blocked by leftover - // __run__ branches. MR-670 added a defense-in-depth filter that - // skips internal system branches. MR-674 made run branches - // ephemeral on every terminal state, so in practice no __run__ - // branch survives publish — but the filter still guards the - // invariant. + // Historical: schema apply used to be blocked by leftover + // `__run__` branches. A defense-in-depth filter now skips + // internal system branches, and run branches were made + // ephemeral on every terminal state — so in practice no + // `__run__` branch survives publish. The filter still guards + // the invariant. let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); @@ -1451,7 +1451,7 @@ edge WorksAt: Person -> Company let all_branches = db.coordinator.all_branches().await.unwrap(); assert!( !all_branches.iter().any(|b| is_internal_run_branch(b)), - "MR-674: run branch should be deleted after publish, got: {:?}", + "run branch should be deleted after publish, got: {:?}", all_branches ); diff --git a/crates/omnigraph/src/db/omnigraph/optimize.rs b/crates/omnigraph/src/db/omnigraph/optimize.rs index 32889a0..21050c1 100644 --- a/crates/omnigraph/src/db/omnigraph/optimize.rs +++ b/crates/omnigraph/src/db/omnigraph/optimize.rs @@ -15,8 +15,8 @@ //! retention. Destructive to version history — callers should gate this //! behind an explicit confirm flag at the CLI layer. //! -//! Both walk every node + edge table on the `main` branch. Run branches are -//! ephemeral by design (MR-670 / MR-674) so we do not optimize them. +//! Both walk every node + edge table on the `main` branch. Run branches +//! are ephemeral by design so we do not optimize them. use std::time::Duration; diff --git a/crates/omnigraph/src/db/omnigraph/schema_apply.rs b/crates/omnigraph/src/db/omnigraph/schema_apply.rs index b294260..b096dbd 100644 --- a/crates/omnigraph/src/db/omnigraph/schema_apply.rs +++ b/crates/omnigraph/src/db/omnigraph/schema_apply.rs @@ -34,9 +34,9 @@ pub(super) async fn apply_schema_with_lock( let branches = db.coordinator.all_branches().await?; // Skip `main` and internal system branches. The schema-apply lock branch // is excluded because it is the cluster-wide schema-apply serializer. - // `__run__*` branches are no longer created (MR-771); the filter remains - // as defense-in-depth for legacy repos with leftover staging branches — - // MR-770 will sweep them and this guard can go. + // `__run__*` branches are no longer created; the filter remains as + // defense-in-depth for legacy repos with leftover staging branches. + // A future production sweep will let this guard go. let blocking_branches = branches .into_iter() .filter(|branch| branch != "main" && !is_internal_system_branch(branch)) diff --git a/crates/omnigraph/src/db/run_registry.rs b/crates/omnigraph/src/db/run_registry.rs index 9f1c376..ee3d336 100644 --- a/crates/omnigraph/src/db/run_registry.rs +++ b/crates/omnigraph/src/db/run_registry.rs @@ -1,12 +1,12 @@ -// Run state-machine has been removed (MR-771). Mutations now write directly -// to target tables and use the publisher's `expected_table_versions` CAS for -// cross-table OCC; the `__run__` staging branches and `_graph_runs.lance` -// state machine no longer exist. +// The Run state machine has been removed. Mutations now write directly +// to target tables and use the publisher's `expected_table_versions` +// CAS for cross-table OCC; `__run__` staging branches and the +// `_graph_runs.lance` state machine no longer exist. // -// What remains is the branch-name predicate, kept as a defense-in-depth guard -// against users naming a public branch `__run__*`. MR-770 owns the production -// sweep of legacy `_graph_runs.lance` rows and stale `__run__*` branches; once -// that lands the predicate (and this file) can go too. +// What remains is the branch-name predicate, kept as a defense-in-depth +// guard against users naming a public branch `__run__*`. A future +// production sweep of legacy `_graph_runs.lance` rows and stale +// `__run__*` branches will let this predicate (and this file) go too. pub(crate) const INTERNAL_RUN_BRANCH_PREFIX: &str = "__run__"; diff --git a/crates/omnigraph/src/exec/mutation.rs b/crates/omnigraph/src/exec/mutation.rs index a588c68..e996241 100644 --- a/crates/omnigraph/src/exec/mutation.rs +++ b/crates/omnigraph/src/exec/mutation.rs @@ -590,9 +590,9 @@ use super::staging::{MutationStaging, PendingMode}; /// is the source of truth for "where is Lance HEAD right now on /// this table within this query." /// -/// The `inline_committed` reopen branch closes the cursor-bot HIGH -/// "multi-delete fails on same table" finding. The branch goes away -/// once Lance exposes a two-phase delete API +/// The `inline_committed` reopen branch closes the multi-delete-on-same-table +/// failure path that pre-staged-write engines inherited. The branch goes +/// away once Lance exposes a two-phase delete API /// ([lance-format/lance#6658](https://github.com/lance-format/lance/issues/6658)) /// and we can stage deletes on the same path as inserts/updates. async fn open_table_for_mutation( @@ -633,9 +633,9 @@ async fn open_table_for_mutation( /// D₂ parse-time check: a single mutation query is either insert/update-only /// or delete-only. Mixed → reject before any I/O. /// -/// Reason: under the staged-write rewire (MR-794 step 2+), inserts and -/// updates accumulate in memory and commit at end-of-query, while deletes -/// still inline-commit (Lance lacks a public two-phase delete in 4.0.0). +/// Reason: under the staged-write writer, inserts and updates +/// accumulate in memory and commit at end-of-query, while deletes still +/// inline-commit (Lance lacks a public two-phase delete in 4.0.0). /// Mixing creates ordering hazards (same-row insert→delete becomes a no-op /// because the staged insert isn't visible to delete; cascading deletes /// of just-inserted edges break referential integrity by silent design). @@ -661,7 +661,7 @@ fn enforce_no_mixed_destructive_constructive( "mutation '{}' on the same query mixes inserts/updates and deletes; \ split into separate mutations: (1) inserts and updates, then (2) deletes. \ This restriction lifts when Lance exposes a two-phase delete API \ - (tracked: MR-793 / Lance-upstream).", + (tracked: lance-format/lance#6658).", ir.name ))); } @@ -706,11 +706,10 @@ impl Omnigraph { ) -> Result { self.ensure_schema_state_valid().await?; let requested = Self::normalize_branch_name(branch)?; - // Reject internal `__run__*` / system-prefixed branches at the public - // write boundary. The pre-MR-771 path got this guard transitively via - // `begin_run`'s `ensure_public_branch_ref` call; the direct-publish - // path needs to assert it explicitly so a caller can't write to - // legacy or system staging branches by passing the prefix verbatim. + // Reject internal `__run__*` / system-prefixed branches at the + // public write boundary. Direct-publish paths assert this + // explicitly so a caller can't write to legacy or system + // staging branches by passing the prefix verbatim. if let Some(name) = requested.as_deref() { crate::db::ensure_public_branch_ref(name, "mutate")?; } diff --git a/crates/omnigraph/src/exec/staging.rs b/crates/omnigraph/src/exec/staging.rs index 92ddfac..2c7eae3 100644 --- a/crates/omnigraph/src/exec/staging.rs +++ b/crates/omnigraph/src/exec/staging.rs @@ -1,4 +1,4 @@ -//! Per-query staging accumulator for direct-publish writes (MR-794 step 2+). +//! Per-query staging accumulator for direct-publish writes. //! //! `MutationStaging` accumulates per-table input batches in memory during a //! `mutate_as` or `load` query, then at end-of-query commits each touched @@ -74,7 +74,7 @@ pub(crate) struct StagedTablePath { /// Per-query staging state. /// -/// Replaces the post-MR-771 inline-commit `MutationStaging.latest` map with +/// Replaces the legacy inline-commit `MutationStaging.latest` map with /// an in-memory accumulator that defers all Lance HEAD advances to /// end-of-query. After this rewire the bug class "Lance HEAD drifts ahead /// of `__manifest`" is unreachable in `mutate_as` and `load` for inserts diff --git a/crates/omnigraph/src/loader/mod.rs b/crates/omnigraph/src/loader/mod.rs index 374adc9..f275938 100644 --- a/crates/omnigraph/src/loader/mod.rs +++ b/crates/omnigraph/src/loader/mod.rs @@ -155,11 +155,10 @@ impl Omnigraph { pub async fn load(&mut self, branch: &str, data: &str, mode: LoadMode) -> Result { self.ensure_schema_state_valid().await?; - // Reject internal `__run__*` / system-prefixed branches at the public - // write boundary. The pre-MR-771 path got this guard transitively via - // `begin_run`'s `ensure_public_branch_ref` call; the direct-publish - // path needs to assert it explicitly so a caller can't write to - // legacy or system staging branches by passing the prefix verbatim. + // Reject internal `__run__*` / system-prefixed branches at the + // public write boundary. Direct-publish paths assert this + // explicitly so a caller can't write to legacy or system + // staging branches by passing the prefix verbatim. crate::db::ensure_public_branch_ref(branch, "load")?; // Branch convention: `None` represents `main`. Re-normalizing to // `Some("main")` here would route the publisher commit through a @@ -1561,10 +1560,10 @@ pub(crate) async fn validate_edge_cardinality( /// Lance version). /// /// `mode` controls dedup behavior. `LoadMode::Merge` passes `Some("id")` -/// so committed edges that the load is *updating* (same edge id, possibly -/// changed `src`) are not double-counted (Cubic P1 finding on PR #68). -/// `LoadMode::Append` passes `None` because each line generates a fresh -/// ULID id that never collides with committed. +/// so committed edges that the load is *updating* (same edge id, +/// possibly changed `src`) are not double-counted. `LoadMode::Append` +/// passes `None` because each line generates a fresh ULID id that +/// never collides with committed. async fn validate_edge_cardinality_with_pending_loader( db: &Omnigraph, branch: Option<&str>, diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index b0ecc18..adc7c2b 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -40,12 +40,12 @@ pub struct DeleteState { /// A Lance write that has produced fragment files on object storage but is /// not yet committed to the dataset's manifest. The staged-write primitives -/// are defined here for later integration in `MutationStaging` -/// (`exec/mutation.rs`) and the loader (`loader/mod.rs`) — those rewires -/// land in [MR-794](https://linear.app/modernrelay/issue/MR-794) step 2+. -/// The intent: defer Lance commits to end-of-query so a mid-query failure -/// leaves the touched table at the pre-mutation HEAD instead of drifting -/// ahead. +/// are consumed by `MutationStaging` (`exec/staging.rs`, +/// `exec/mutation.rs`) and the bulk loader (`loader/mod.rs`). The +/// intent: defer Lance commits to end-of-query so a mid-query failure +/// leaves the touched table at the pre-mutation HEAD instead of +/// drifting ahead. See `docs/runs.md` for the publisher-CAS contract +/// this builds on. /// /// `transaction` is opaque from our side — Lance owns its semantics. We /// commit it via `CommitBuilder::execute(transaction)` (see @@ -545,7 +545,7 @@ impl TableStore { }) } - // ─── Staged-write API (MR-794) ─────────────────────────────────────────── + // ─── Staged-write API ──────────────────────────────────────────────────── // // These primitives wrap Lance's distributed-write API: each call writes // fragment files to object storage but does NOT advance the dataset's @@ -672,16 +672,16 @@ impl TableStore { /// /// This is intrinsic to the underlying Lance API: there is no public /// way to make `MergeInsertBuilder` see uncommitted fragments. The - /// engine's mutation path enforces the rule "per touched table: all - /// stage_append OR exactly one stage_merge_insert" at parse time - /// (the D₂′ check landing with [MR-794](https://linear.app/modernrelay/issue/MR-794) - /// step 2+ in `exec/mutation.rs`). Multi-table queries and append-chains - /// remain safe; only chained merges on a single table are rejected. + /// engine's `MutationStaging` accumulator works around this by + /// concatenating per-table batches in memory and issuing exactly + /// one `stage_merge_insert` per touched table at end-of-query (with + /// last-write-wins dedupe by id) — see `exec/staging.rs`. Direct + /// callers of this primitive must respect the contract themselves. /// /// Lift path: either a Lance API extension that lets /// `MergeInsertBuilder` accept additional staged fragments, or an /// in-memory pre-merge here that folds prior staged batches into the - /// input stream. See `docs/runs.md` and MR-793. + /// input stream. See `docs/runs.md`. pub async fn stage_merge_insert( &self, ds: Dataset, @@ -780,10 +780,10 @@ impl TableStore { /// filtered scan even when their data would match. Staged-fragment /// rows are silently absent from the result. `scanner.use_stats(false)` /// does not fix this in lance 4.0.0. Callers needing correct filtered - /// reads against staged data should use a different strategy (the - /// engine's MR-794 step 2+ design uses in-memory pending-batch - /// accumulation + DataFusion `MemTable` instead — see - /// `.context/mr-794-step2-design.md`). + /// reads against staged data should use a different strategy — the + /// engine's `MutationStaging` accumulator unions in-memory pending + /// batches with the committed scan via DataFusion `MemTable` (see + /// `scan_with_pending`). /// /// This method remains on the surface for primitive-level testing /// (basic stage + scan correctness without filters works) and for @@ -824,10 +824,10 @@ impl TableStore { /// Scan committed via Lance + apply the same filter to in-memory /// pending batches via DataFusion `MemTable`, concat the two result /// streams. The replacement for `scan_with_staged` in engine code: - /// MR-794 step 2+ accumulates input batches in memory and unions - /// them with the committed snapshot at read time, sidestepping the - /// `Scanner::with_fragments` filter-pushdown limitation documented - /// on `scan_with_staged`. + /// the staged-write writer accumulates input batches in memory and + /// unions them with the committed snapshot at read time, + /// sidestepping the `Scanner::with_fragments` filter-pushdown + /// limitation documented on `scan_with_staged`. /// /// `committed_ds` should be opened at the pre-mutation /// `expected_version` (the same version captured in `MutationStaging::expected_versions` @@ -1251,7 +1251,7 @@ fn filter_out_rows_where_string_in( /// Apply `projection` and `filter` to in-memory pending batches via a /// fresh DataFusion `SessionContext`. Used by `scan_with_pending` for -/// the read-your-writes side of MR-794's in-memory accumulator. +/// the read-your-writes side of the in-memory staging accumulator. /// /// `pending_batches` must be non-empty (the caller short-circuits on /// empty). diff --git a/crates/omnigraph/tests/consistency.rs b/crates/omnigraph/tests/consistency.rs index 205893c..63dc3f7 100644 --- a/crates/omnigraph/tests/consistency.rs +++ b/crates/omnigraph/tests/consistency.rs @@ -522,11 +522,12 @@ query high_value() { #[tokio::test] async fn stale_handle_public_mutation_must_refresh_then_retry() { - // MR-771: with the Run state machine removed, the engine no longer - // auto-rebases stale-handle mutations onto the latest target head. The - // publisher's `expected_table_versions` CAS makes the contract explicit - // — a stale writer fails loudly with `ExpectedVersionMismatch` and the - // client decides whether to refresh-and-retry. + // With the Run state machine removed, the engine no longer + // auto-rebases stale-handle mutations onto the latest target head. + // The publisher's `expected_table_versions` CAS makes the contract + // explicit — a stale writer fails loudly with + // `ExpectedVersionMismatch` and the client decides whether to + // refresh-and-retry. let dir = tempfile::tempdir().unwrap(); let _db = init_and_load(&dir).await; drop(_db); diff --git a/crates/omnigraph/tests/failpoints.rs b/crates/omnigraph/tests/failpoints.rs index 3adc15d..f16ed59 100644 --- a/crates/omnigraph/tests/failpoints.rs +++ b/crates/omnigraph/tests/failpoints.rs @@ -140,7 +140,8 @@ async fn schema_apply_recovers_partial_rename() { assert_no_staging_files(dir.path()); } -/// Pin the documented "finalize → publisher residual" from MR-794. +/// Pin the documented "finalize → publisher residual" of the +/// staged-write commit path. /// /// `MutationStaging::finalize` runs `commit_staged` per touched table /// sequentially before the publisher commits the manifest. Lance has no diff --git a/crates/omnigraph/tests/runs.rs b/crates/omnigraph/tests/runs.rs index 74bafc1..4e363bf 100644 --- a/crates/omnigraph/tests/runs.rs +++ b/crates/omnigraph/tests/runs.rs @@ -1,4 +1,4 @@ -//! Tests for the direct-to-target write path (MR-771: Run state machine +//! Tests for the direct-to-target write path (Run state machine //! removed). The Run/`__run__` staging branch / RunRecord state machine no //! longer exists; mutations and loads write directly to target tables and //! commit once via the publisher's `expected_table_versions` CAS. @@ -161,17 +161,17 @@ async fn multi_statement_mutation_is_atomic_with_read_your_writes() { assert_eq!(friends.num_rows(), 1); } -/// Mid-query partial failure: op-1 stages a Person insert, op-2 fails on -/// referential integrity (validate_edge_insert_endpoints). Under the -/// MR-794 staged-write rewire, op-1's batch lives in the in-memory +/// Mid-query partial failure: op-1 stages a Person insert, op-2 fails +/// on referential integrity (validate_edge_insert_endpoints). Under +/// the staged-write writer, op-1's batch lives in the in-memory /// accumulator and never reaches Lance — Lance HEAD on `node:Person` /// stays at the pre-mutation version. The publisher never publishes, /// the manifest never advances, and the next mutation against the same /// table proceeds normally (no `ExpectedVersionMismatch`). /// -/// This test pins the post-MR-794 contract: -/// - Failed multi-statement mutation surfaces a clear error, no manifest -/// commit, no observable state change. +/// Pins the staged-write contract: +/// - Failed multi-statement mutation surfaces a clear error, no +/// manifest commit, no observable state change. /// - The touched tables stay queryable and writable from the next /// query — Lance HEAD has not drifted. #[tokio::test] @@ -228,7 +228,7 @@ async fn partial_failure_leaves_target_queryable_and_unblocks_next_mutation() { &mixed_params(&[("$name", "Frank")], &[("$age", 33)]), ) .await - .expect("next mutation on the touched table must succeed under MR-794"); + .expect("next mutation on the touched table must succeed under the staged-write writer"); assert_eq!( result.affected_nodes, 1, "follow-up insert should report 1 affected node" @@ -251,7 +251,7 @@ async fn partial_failure_leaves_target_queryable_and_unblocks_next_mutation() { /// success and one `ExpectedVersionMismatch`. The replacement for the old /// `concurrent_conflicting_run_publish_fails_cleanly` test — the OCC fence /// has moved from a graph-level run-publish merge into the publisher's -/// per-table CAS (MR-766 + MR-771). +/// per-table CAS. /// /// Drives the race by interleaving two handles that captured the same /// pre-write manifest snapshot: A commits first; B's commit then sees @@ -323,7 +323,7 @@ async fn concurrent_writers_one_succeeds_one_gets_expected_version_mismatch() { ); } -/// The cancellation hole that motivated MR-771: dropping a mutation future +/// The cancellation hole that motivated removing the Run state machine: dropping a mutation future /// mid-flight must not leave any graph-level state behind. With the run /// state machine gone, only orphaned Lance fragments can remain — and those /// are reclaimed by `omnigraph cleanup`. @@ -381,7 +381,7 @@ async fn cancelled_mutation_future_leaves_no_state() { // `is_internal_system_branch`, so a runtime "no `__run__` branches" check // would be vacuous. The structural property that no `__run__` branches // can ever be created is enforced by deletion of `begin_run` etc. in - // MR-771 (verified by the build itself — those symbols no longer exist). + // (verified by the build itself — those symbols no longer exist). // // (1) The branch list is unchanged: cancellation/completion cannot // synthesize new public branches. @@ -448,9 +448,10 @@ async fn repeated_loads_do_not_accumulate_branches() { assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]); } -/// User code must not be able to write to internal `__run__*` names. The -/// branch-name guard predicate is kept as defense-in-depth (MR-770 will -/// remove it once production legacy branches are swept). +/// User code must not be able to write to internal `__run__*` names. +/// The branch-name guard predicate is kept as defense-in-depth; it +/// will be removed once a future production sweep retires the legacy +/// branches. #[tokio::test] async fn public_branch_apis_reject_internal_run_refs() { let dir = tempfile::tempdir().unwrap(); @@ -480,11 +481,11 @@ async fn public_branch_apis_reject_internal_run_refs() { ); } -// ─── MR-794: staged-write rewire — additional contract tests ──────────────── +// ─── Staged-write rewire — additional contract tests ─────────────────────── -/// Mutation queries used only by the MR-794 tests below. Kept in the test -/// file (not in helpers' shared `MUTATION_QUERIES`) to keep their scope -/// local to the staged-write coverage. +/// Mutation queries used only by the staged-write tests below. Kept in +/// the test file (not in helpers' shared `MUTATION_QUERIES`) to keep +/// their scope local to the staged-write coverage. const STAGED_QUERIES: &str = r#" query insert_two_persons($a_name: String, $a_age: I32, $b_name: String, $b_age: I32) { insert Person { name: $a_name, age: $a_age } @@ -852,10 +853,10 @@ edge WorksAt: Person -> Company @card(0..1) ); } -// ─── PR #68 review-comment fixes — pinned coverage ────────────────────────── +// ─── Chained-mutation correctness — pinned coverage ───────────────────────── -/// Codex P1 / Cubic P1 #1: chained `update` ops in one query must respect -/// each previous op's view of the rows. Without merge-shadow semantics on +/// Chained `update` ops in one query must respect each previous op's +/// view of the rows. Without merge-shadow semantics on /// `scan_with_pending`, the second update sees the stale committed value /// (the first update's row still appears in the Lance scan because the /// pending side hasn't committed), the predicate matches it, and the @@ -932,8 +933,8 @@ async fn chained_updates_with_overlapping_predicate_respects_intermediate_value( ); } -/// Cursor Bugbot HIGH: two `delete` ops on the same node table in one -/// query. Pre-fix, op-2's `open_table_for_mutation` went through +/// Two `delete` ops on the same node table in one query. Pre-fix, +/// op-2's `open_table_for_mutation` went through /// `open_for_mutation_on_branch` which trips `ensure_expected_version` /// (Lance HEAD has advanced past the manifest's pinned version after /// op-1's inline-commit, but the manifest hasn't moved). Post-fix, @@ -984,7 +985,7 @@ async fn multi_statement_delete_on_same_node_table() { } } -/// Cursor Bugbot HIGH (cascade variant): deleting a node cascades to its +/// Cascade-then-explicit variant: deleting a node cascades to its /// edges, advancing Lance HEAD on the edge table. A subsequent /// `delete ` op in the same query must reopen at the /// post-cascade-commit version of the edge table — not trip @@ -1030,11 +1031,10 @@ query cascade_then_explicit($name: String, $other: String) { ); } -/// Codex P2 / Cursor Bugbot LOW / Cubic P2: the engine cardinality path -/// must enforce `min` bounds. Pre-fix the engine path silently dropped -/// the min check (a `let _ = card.min;` line). The loader path always -/// enforced both. Post-fix, both paths route through -/// `enforce_cardinality_bounds` which checks both bounds. +/// The engine cardinality path must enforce `min` bounds. Pre-fix the +/// engine path silently dropped the min check (a `let _ = card.min;` +/// line). The loader path always enforced both. Post-fix, both paths +/// route through `enforce_cardinality_bounds` which checks both bounds. /// /// Build a custom schema with `Knows: Person -> Person @card(2..*)`. /// Inserting a single Knows edge violates min=2. The mutation path must @@ -1085,8 +1085,8 @@ query add_friend($from: String, $to: String) { ); } -/// Cubic P1 #2: `LoadMode::Merge` on edges must NOT double-count the -/// committed edge AND its updated pending replacement. Build a custom +/// `LoadMode::Merge` on edges must NOT double-count the committed +/// edge AND its updated pending replacement. Build a custom /// schema where WorksAt has @card(0..1). Seed Alice with one WorksAt to /// Acme. Then Merge-load the SAME edge id (so it's an update, not an /// insert) pointing Alice's WorksAt at Bigco. Cardinality must count @@ -1132,12 +1132,11 @@ edge WorksAt: Person -> Company @card(0..1) assert_eq!(count_rows(&db, "edge:WorksAt").await, 1); } -/// Cubic P2 (follow-up to PR #68 review of commit 3223b51): a Merge load -/// whose input has TWO rows with the same edge id must be deduped at -/// cardinality-count time, not just at finalize. Without dedup, two -/// pending rows count twice → spurious `@card` violation. With dedup -/// (last-occurrence-wins, mirroring `dedupe_merge_batches_by_id`), the -/// pending side counts once. +/// A Merge load whose input has TWO rows with the same edge id must be +/// deduped at cardinality-count time, not just at finalize. Without +/// dedup, two pending rows count twice → spurious `@card` violation. +/// With dedup (last-occurrence-wins, mirroring +/// `dedupe_merge_batches_by_id`), the pending side counts once. /// /// This is a separate path from `load_merge_mode_dedupes_edge_for_cardinality_count` /// (which dedupes committed-vs-pending). Here we verify pending-vs-pending @@ -1182,11 +1181,11 @@ edge WorksAt: Person -> Company @card(0..1) assert_eq!(count_rows(&db, "edge:WorksAt").await, 1); } -/// Cubic P2 (follow-up): `scan_with_pending` must reject a call where -/// `key_column` is requested but the projection omits that column. -/// Without the up-front check, the helper silently degraded to union -/// semantics — letting a chained-update bug slip through unnoticed. -/// This test verifies the contract is enforced at the API boundary. +/// `scan_with_pending` must reject a call where `key_column` is +/// requested but the projection omits that column. Without the +/// up-front check, the helper silently degraded to union semantics — +/// letting a chained-update bug slip through unnoticed. This test +/// verifies the contract is enforced at the API boundary. #[tokio::test] async fn scan_with_pending_rejects_key_column_missing_from_projection() { use arrow_array::{RecordBatch, StringArray}; @@ -1275,20 +1274,19 @@ async fn scan_with_pending_rejects_key_column_missing_from_projection() { ); } -/// Cursor Bugbot Medium (follow-up on commit 052b6e6): the -/// `PendingTable.schema` field is captured from the first `append_batch` -/// call and never updated. On a blob-bearing table, an `insert` -/// produces a full-schema batch (blob columns included) and an `update` -/// that doesn't assign every blob produces a subset-schema batch. Mixed -/// in one query, the second `append_batch` would silently push an +/// `PendingTable.schema` is captured from the first `append_batch` call +/// and never updated. On a blob-bearing table, an `insert` produces a +/// full-schema batch (blob columns included) and an `update` that +/// doesn't assign every blob produces a subset-schema batch. Mixed in +/// one query, the second `append_batch` would silently push an /// incompatible batch — the mismatch surfaced eventually at /// `concat_batches`/MemTable construction inside finalize, but the /// failure point was distant from the offending op. /// -/// Post-fix: `append_batch` validates the new batch's schema against -/// the existing accumulator's schema and returns a typed error -/// directing the caller to split the mutation. The error fires at the -/// second op (the update), not at end-of-query. +/// `append_batch` validates the new batch's schema against the existing +/// accumulator's schema and returns a typed error directing the caller +/// to split the mutation. The error fires at the second op (the +/// update), not at end-of-query. #[tokio::test] async fn append_batch_rejects_mismatched_schema_in_blob_table_at_offending_op() { use omnigraph::loader::{LoadMode, load_jsonl}; diff --git a/crates/omnigraph/tests/s3_storage.rs b/crates/omnigraph/tests/s3_storage.rs index 3a3d2be..5b90022 100644 --- a/crates/omnigraph/tests/s3_storage.rs +++ b/crates/omnigraph/tests/s3_storage.rs @@ -44,7 +44,7 @@ async fn s3_compatible_repo_lifecycle_works() { .await .unwrap(); - // Direct-to-target load (MR-771): no run lifecycle, single publisher + // Direct-to-target load: no run lifecycle, single publisher // commit lands the row. reopened .load( @@ -153,7 +153,7 @@ async fn s3_public_load_uses_hidden_run_and_publishes() { .await .unwrap(); - // Direct-to-target writes (MR-771): no run state machine, just the + // Direct-to-target writes: no run state machine, just the // published commit lands the row. Verify by reopening and reading. let mut reopened = Omnigraph::open(&uri).await.unwrap(); let loaded = query_main( diff --git a/crates/omnigraph/tests/staged_writes.rs b/crates/omnigraph/tests/staged_writes.rs index 811ccaf..1f8822d 100644 --- a/crates/omnigraph/tests/staged_writes.rs +++ b/crates/omnigraph/tests/staged_writes.rs @@ -1,20 +1,21 @@ -//! Primitive-level tests for `TableStore`'s staged-write API -//! (MR-794 step 1). These exercise `stage_append`, `stage_merge_insert`, -//! `scan_with_staged`, and `count_rows_with_staged` directly against a -//! Lance dataset — no Omnigraph engine involved. The engine-level rewire -//! (MR-794 step 2+) lives in `tests/runs.rs` once it lands. +//! Primitive-level tests for `TableStore`'s staged-write API. These +//! exercise `stage_append`, `stage_merge_insert`, `scan_with_staged`, +//! and `count_rows_with_staged` directly against a Lance dataset — no +//! Omnigraph engine involved. The engine-level use of these primitives +//! is exercised by `tests/runs.rs`. //! //! Test surface here: //! 1. `stage_append` + `scan_with_staged` shows committed + staged data //! without duplicates. //! 2. `stage_merge_insert` of a row that supersedes a committed fragment -//! surfaces only the rewritten row, not both — the -//! `removed_fragment_ids` dedup landed in PR #66's `730631c`. -//! 3. **Documented contract**: chained `stage_merge_insert` calls on the -//! same dataset whose source rows share keys produce duplicate rows in -//! `scan_with_staged`. The engine's parse-time D₂′ check (MR-794 step -//! 2+) prevents callers from triggering this; this test pins the -//! primitive's behavior so a future change either (a) preserves it or +//! surfaces only the rewritten row, not both, via the +//! `removed_fragment_ids` dedup contract. +//! 3. **Documented contract**: chained `stage_merge_insert` calls on +//! the same dataset whose source rows share keys produce duplicate +//! rows in `scan_with_staged`. The engine's accumulator dedupes by +//! id at finalize time so this primitive-level pitfall doesn't +//! surface in production paths; this test pins the primitive's +//! behavior so a future change either (a) preserves it or //! (b) consciously fixes it (and updates this test). use arrow_array::{Array, Int32Array, RecordBatch, StringArray, UInt64Array}; @@ -120,8 +121,9 @@ async fn stage_merge_insert_dedupes_superseded_committed_fragment() { assert!( !staged.removed_fragment_ids.is_empty(), "merge_insert that rewrites a committed row must set removed_fragment_ids \ - (this is the dedup invariant from PR #66 commit 730631c — its absence \ - was caught by Cubic/Cursor/Codex on PR #66)" + so the scan-with-staged composer can shadow the superseded committed \ + fragment — without it, the committed row and its rewrite both appear, \ + producing duplicates by key" ); // scan_with_staged: alice appears exactly once, with the new age. @@ -347,11 +349,11 @@ async fn stage_merge_insert_then_commit_persists_merged_view() { /// silently absent. `scanner.use_stats(false)` does not bypass this in /// lance 4.0.0. /// -/// This test pins the actual behavior so a future change either preserves -/// it (and updates the doc) or fixes it (and rewrites this test). The -/// engine's MR-794 step 2+ design uses in-memory pending-batch -/// accumulation + DataFusion `MemTable` for read-your-writes instead, so -/// production code is unaffected. +/// This test pins the actual behavior so a future change either +/// preserves it (and updates the doc) or fixes it (and rewrites this +/// test). The engine's `MutationStaging` accumulator unions in-memory +/// pending batches with the committed scan via DataFusion `MemTable` +/// for read-your-writes instead, so production code is unaffected. #[tokio::test] async fn scan_with_staged_with_filter_silently_drops_staged_rows() { let dir = tempfile::tempdir().unwrap(); @@ -485,6 +487,8 @@ async fn chained_stage_merge_insert_with_shared_key_documents_duplicate_behavior here because this assertion failed: either (a) the primitive was \ improved to dedupe across stages (good — update to assert == 1) \ or (b) something subtler broke (investigate before changing the \ - assertion). See PR #67 Codex P1 thread + .context/mr-794-step2-design.md §3.1." + assertion). The engine's MutationStaging accumulator dedupes by \ + id at finalize time so this primitive-level pitfall doesn't \ + surface in production paths — see exec/staging.rs." ); }