diff --git a/AGENTS.md b/AGENTS.md index 059c182..cd1adb6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -16,7 +16,7 @@ Tools that support `@`-imports (Claude Code) auto-include all three files via th `CLAUDE.md` is a symlink to this file — there is exactly one source of truth. Edit `AGENTS.md`. -**Version surveyed:** 0.7.1 +**Version surveyed:** 0.7.2 **Workspace crates:** `omnigraph-compiler`, `omnigraph` (engine), `omnigraph-policy`, `omnigraph-api-types` (shared HTTP wire DTOs), `omnigraph-cluster`, `omnigraph-cli`, `omnigraph-server` **Storage substrate:** Lance 7.x (columnar, versioned, branchable) **License:** MIT @@ -264,7 +264,7 @@ omnigraph policy explain --cluster ./company-brain --graph knowledge --actor act | Schema language | — | `.pg` + Pest grammar + catalog + interfaces + constraints + annotations | | Query language | — | `.gq` + Pest grammar + IR + lowering + linter | | Schema migration planning | — | `plan_schema_migration` + `apply_schema` step types + `__schema_apply_lock__` | -| Commit graph (DAG) across whole graph | — | `_graph_commits.lance` with linear + merge parents, ULID ids, actor map | +| Commit graph (DAG) across whole graph | — | Lineage (linear + merge parents, ULID ids, actor) stored as `graph_commit`/`graph_head` rows in `__manifest`, written in the same publish CAS as the table-version rows (RFC-013 Phase 7 — no separate `_graph_commits.lance` write; manifest→commit-graph atomicity gap closed); the in-memory commit graph is a projection of those rows | | Per-query atomic writes | — | In-memory `MutationStaging.pending` accumulator + `stage_*` / `commit_staged` per touched table at end-of-query + publisher CAS via `commit_with_expected` (single manifest commit per `mutate_as` / `load`); D₂ parse-time rule keeps inserts/updates and deletes from mixing | | Three-way row-level merge | — | `OrderedTableCursor` + `StagedTableWriter`, structured `MergeConflictKind` | | Change feeds | — | `diff_between` / `diff_commits` with manifest fast 
path + ID streaming | diff --git a/Cargo.lock b/Cargo.lock index 126c117..a28cd5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4851,7 +4851,7 @@ dependencies = [ [[package]] name = "omnigraph-api-types" -version = "0.7.1" +version = "0.7.2" dependencies = [ "omnigraph-compiler", "omnigraph-engine", @@ -4863,7 +4863,7 @@ dependencies = [ [[package]] name = "omnigraph-cli" -version = "0.7.1" +version = "0.7.2" dependencies = [ "assert_cmd", "clap", @@ -4887,7 +4887,7 @@ dependencies = [ [[package]] name = "omnigraph-cluster" -version = "0.7.1" +version = "0.7.2" dependencies = [ "fail", "omnigraph-compiler", @@ -4895,6 +4895,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", + "serial_test", "sha2 0.10.9", "tempfile", "thiserror", @@ -4905,7 +4906,7 @@ dependencies = [ [[package]] name = "omnigraph-compiler" -version = "0.7.1" +version = "0.7.2" dependencies = [ "ahash", "arrow-array", @@ -4924,7 +4925,7 @@ dependencies = [ [[package]] name = "omnigraph-engine" -version = "0.7.1" +version = "0.7.2" dependencies = [ "arc-swap", "arrow-array", @@ -4968,7 +4969,7 @@ dependencies = [ [[package]] name = "omnigraph-mcp" -version = "0.7.1" +version = "0.7.2" dependencies = [ "async-trait", "axum", @@ -4982,7 +4983,7 @@ dependencies = [ [[package]] name = "omnigraph-policy" -version = "0.7.1" +version = "0.7.2" dependencies = [ "cedar-policy", "clap", @@ -4995,7 +4996,7 @@ dependencies = [ [[package]] name = "omnigraph-server" -version = "0.7.1" +version = "0.7.2" dependencies = [ "arc-swap", "async-trait", diff --git a/crates/omnigraph-api-types/Cargo.toml b/crates/omnigraph-api-types/Cargo.toml index d7e57ef..29c7854 100644 --- a/crates/omnigraph-api-types/Cargo.toml +++ b/crates/omnigraph-api-types/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-api-types" -version = "0.7.1" +version = "0.7.2" edition = "2024" description = "Shared HTTP wire DTOs for Omnigraph — request/response types and engine-result → DTO mappings used by both omnigraph-server and 
omnigraph-cli (RFC-009). Plain serde/utoipa types; no transport or server internals." license = "MIT" @@ -9,8 +9,8 @@ homepage = "https://github.com/ModernRelay/omnigraph" documentation = "https://docs.rs/omnigraph-api-types" [dependencies] -omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.1" } -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" } +omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.2" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.2" } serde = { workspace = true } serde_json = { workspace = true } utoipa = { workspace = true } diff --git a/crates/omnigraph-cli/Cargo.toml b/crates/omnigraph-cli/Cargo.toml index 87c42aa..df4ac8d 100644 --- a/crates/omnigraph-cli/Cargo.toml +++ b/crates/omnigraph-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-cli" -version = "0.7.1" +version = "0.7.2" edition = "2024" description = "CLI for the Omnigraph graph database." 
license = "MIT" @@ -13,12 +13,12 @@ name = "omnigraph" path = "src/main.rs" [dependencies] -omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.1" } -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" } -omnigraph-api-types = { path = "../omnigraph-api-types", version = "0.7.1" } -omnigraph-cluster = { path = "../omnigraph-cluster", version = "0.7.1" } -omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.1" } -omnigraph-server = { path = "../omnigraph-server", version = "0.7.1" } +omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.2" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.2" } +omnigraph-api-types = { path = "../omnigraph-api-types", version = "0.7.2" } +omnigraph-cluster = { path = "../omnigraph-cluster", version = "0.7.2" } +omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.2" } +omnigraph-server = { path = "../omnigraph-server", version = "0.7.2" } clap = { workspace = true } color-eyre = { workspace = true } serde = { workspace = true } diff --git a/crates/omnigraph-cluster/Cargo.toml b/crates/omnigraph-cluster/Cargo.toml index 119545e..ad3cf24 100644 --- a/crates/omnigraph-cluster/Cargo.toml +++ b/crates/omnigraph-cluster/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-cluster" -version = "0.7.1" +version = "0.7.2" edition = "2024" description = "Cluster configuration validation, planning, and config-only apply for Omnigraph." 
license = "MIT" @@ -14,8 +14,8 @@ documentation = "https://docs.rs/omnigraph-cluster" failpoints = ["dep:fail", "fail/failpoints", "omnigraph/failpoints"] [dependencies] -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" } -omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.1" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.2" } +omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.2" } fail = { workspace = true, optional = true } serde = { workspace = true } serde_json = { workspace = true } @@ -30,5 +30,6 @@ tokio = { workspace = true } ulid = { workspace = true } [dev-dependencies] +serial_test = "3" tempfile = { workspace = true } tokio = { workspace = true } diff --git a/crates/omnigraph-cluster/src/config.rs b/crates/omnigraph-cluster/src/config.rs index d0e0edd..10621da 100644 --- a/crates/omnigraph-cluster/src/config.rs +++ b/crates/omnigraph-cluster/src/config.rs @@ -474,7 +474,7 @@ pub(crate) async fn preview_schema_migration( Ok(preview.plan) } -struct LiveGraphObservation { +pub(crate) struct LiveGraphObservation { manifest_version: u64, schema_digest: String, } @@ -494,7 +494,7 @@ pub(crate) async fn observe_live_graph(graph_uri: &str) -> Result { +pub(crate) struct GraphObservationJson<'a> { address: &'a str, graph_uri: &'a str, observed_at: &'a str, @@ -949,7 +949,7 @@ pub(crate) fn validate_id(kind: &str, path: &str, value: &str, diagnostics: &mut } } -enum PolicyTarget { +pub(crate) enum PolicyTarget { Cluster, Graph(String), WrongKind(String), diff --git a/crates/omnigraph-cluster/src/failpoints.rs b/crates/omnigraph-cluster/src/failpoints.rs index f1799d7..f5b2023 100644 --- a/crates/omnigraph-cluster/src/failpoints.rs +++ b/crates/omnigraph-cluster/src/failpoints.rs @@ -1,6 +1,13 @@ //! Fault-injection hooks for the cluster apply protocol, mirroring the //! engine's `omnigraph::failpoints` pattern. With the `failpoints` feature //! 
off, every call site compiles to `Ok(())`. +//! +//! Only `maybe_fail` lives here — it returns the cluster's [`Diagnostic`] +//! error type. The test-side configuration guard is shared: use +//! [`omnigraph::failpoints::ScopedFailPoint`], which is registry-only +//! (error-type agnostic) and reachable because the cluster's `failpoints` +//! feature enables `omnigraph/failpoints`. One `ScopedFailPoint`, in the +//! lowest crate, avoids a drifting duplicate. use crate::Diagnostic; @@ -19,38 +26,16 @@ pub(crate) fn maybe_fail(_name: &str) -> Result<(), Diagnostic> { Ok(()) } -#[cfg(feature = "failpoints")] -pub struct ScopedFailPoint { - name: String, -} - -#[cfg(feature = "failpoints")] -impl ScopedFailPoint { - pub fn new(name: &str, action: &str) -> Self { - fail::cfg(name, action).expect("configure failpoint"); - Self { - name: name.to_string(), - } - } - - /// Register a callback failpoint with the same Drop-based cleanup as - /// `new`. Without the guard, a panic while the point is active would - /// leak the callback into the process-global registry and fire it under - /// later tests in the same binary. - pub fn with_callback(name: &str, callback: F) -> Self - where - F: Fn() + Send + Sync + 'static, - { - fail::cfg_callback(name, callback).expect("configure callback failpoint"); - Self { - name: name.to_string(), - } - } -} - -#[cfg(feature = "failpoints")] -impl Drop for ScopedFailPoint { - fn drop(&mut self) { - fail::remove(&self.name); - } +/// Compile-checked catalog of this crate's apply-protocol failpoint names. +/// Engine-scoped names referenced from cluster tests live in +/// [`omnigraph::failpoints::names`]. 
+pub mod names { + pub const CLUSTER_APPLY_AFTER_GRAPH_CREATE: &str = "cluster_apply.after_graph_create"; + pub const CLUSTER_APPLY_AFTER_GRAPH_DELETE: &str = "cluster_apply.after_graph_delete"; + pub const CLUSTER_APPLY_AFTER_PAYLOAD_PHASE: &str = "cluster_apply.after_payload_phase"; + pub const CLUSTER_APPLY_AFTER_SCHEMA_APPLY: &str = "cluster_apply.after_schema_apply"; + pub const CLUSTER_APPLY_BEFORE_GRAPH_CREATE: &str = "cluster_apply.before_graph_create"; + pub const CLUSTER_APPLY_BEFORE_GRAPH_DELETE: &str = "cluster_apply.before_graph_delete"; + pub const CLUSTER_APPLY_BEFORE_SCHEMA_APPLY: &str = "cluster_apply.before_schema_apply"; + pub const CLUSTER_APPLY_BEFORE_STATE_WRITE: &str = "cluster_apply.before_state_write"; } diff --git a/crates/omnigraph-cluster/src/lib.rs b/crates/omnigraph-cluster/src/lib.rs index bed27c8..42735ae 100644 --- a/crates/omnigraph-cluster/src/lib.rs +++ b/crates/omnigraph-cluster/src/lib.rs @@ -1,8 +1,6 @@ use std::collections::{BTreeMap, BTreeSet}; -use std::fs::{self, OpenOptions}; -use std::io::{ErrorKind, Write}; +use std::fs::{self}; use std::path::{Path, PathBuf}; -use std::process; use omnigraph::db::{Omnigraph, ReadTarget, SchemaApplyOptions}; use omnigraph_compiler::SchemaMigrationPlan; @@ -26,11 +24,7 @@ mod store; mod sweep; mod types; use config::{ - QueriesDecl, future_field_diagnostics, graph_address, initial_import_state, load_desired, - normalize_policy_target, observe_declared_graphs, observe_live_graph, parse_cluster_config, - policy_address, preview_schema_migration, query_address, resolve_config_path, - resolve_query_decls, schema_address, state_resource_digests, validate_cluster_header, - validate_id, validate_query_source, + QueriesDecl, graph_address, initial_import_state, load_desired, observe_declared_graphs, parse_cluster_config, preview_schema_migration, schema_address, state_resource_digests, validate_cluster_header, }; use diff::{ FailedGraphOrigin, ResourceKind, append_embedding_profile_changes, @@ 
-42,13 +36,12 @@ pub use serve::{ cluster_root_for_graph_uri, read_serving_snapshot, read_serving_snapshot_from_storage, resolve_graph_storage_uri, }; -use store::{ClusterStore, StateLockGuard, StateSnapshot}; +use store::ClusterStore; use sweep::{ mark_approvals_consumed, record_approval_consumed, sweep_recovery_sidecars, tombstone_graph_subtree, warn_pending_recovery_sidecars, }; pub use types::*; -use types::*; pub const CLUSTER_CONFIG_FILE: &str = "cluster.yaml"; pub const CLUSTER_GRAPHS_DIR: &str = "graphs"; @@ -510,7 +503,7 @@ pub async fn apply_config_dir_with_options( continue; } }; - if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.before_graph_create") { + if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_BEFORE_GRAPH_CREATE) { // Simulated crash before the init: the sidecar stays for the // sweep (row 1: root absent -> intent removed next run). diagnostics.push(diagnostic); @@ -587,7 +580,7 @@ pub async fn apply_config_dir_with_options( // Crash point: the graph exists, the cluster state does not record it // yet. A failure here must acknowledge nothing; the next run's sweep // rolls the ledger forward (row 4). - if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.after_graph_create") { + if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_AFTER_GRAPH_CREATE) { diagnostics.push(diagnostic); return early_return( display_path(&desired.config_dir), @@ -727,7 +720,7 @@ pub async fn apply_config_dir_with_options( continue; } }; - if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.before_schema_apply") { + if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_BEFORE_SCHEMA_APPLY) { // Simulated crash before the engine call: the sidecar stays; the // sweep retires it next run (ledger still consistent with live). 
diagnostics.push(diagnostic); @@ -787,7 +780,7 @@ pub async fn apply_config_dir_with_options( } // Crash point: the manifest moved, the ledger does not record it yet. // A failure here acknowledges nothing; the sweep rolls forward. - if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.after_schema_apply") { + if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_AFTER_SCHEMA_APPLY) { diagnostics.push(diagnostic); return early_return( display_path(&desired.config_dir), @@ -872,7 +865,7 @@ pub async fn apply_config_dir_with_options( // Crash point: payloads are on disk, state has not moved. A failure here // must leave state.json byte-identical and acknowledge nothing; re-running // apply repairs via the skip-if-exists blob reuse. - if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.after_payload_phase") { + if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_AFTER_PAYLOAD_PHASE) { diagnostics.push(diagnostic); return early_return( display_path(&desired.config_dir), @@ -949,7 +942,7 @@ pub async fn apply_config_dir_with_options( continue; } }; - if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.before_graph_delete") { + if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_BEFORE_GRAPH_DELETE) { // Simulated crash before removal: row 8 retires the intent and // the still-valid approval lets a later run retry. diagnostics.push(diagnostic); @@ -974,7 +967,7 @@ pub async fn apply_config_dir_with_options( } // Crash point: the root is gone, the ledger does not record it yet. // The sweep rolls forward (row 7b) and consumes the approval. 
- if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.after_graph_delete") { + if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_AFTER_GRAPH_DELETE) { diagnostics.push(diagnostic); return early_return( display_path(&desired.config_dir), @@ -1080,7 +1073,7 @@ pub async fn apply_config_dir_with_options( // persisted-statuses revert contract below is exercised; a cfg_callback // on this point can mutate state.json to simulate a concurrent writer, // making write_state's CAS check fail organically. - let write_result = match failpoints::maybe_fail("cluster_apply.before_state_write") { + let write_result = match failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_BEFORE_STATE_WRITE) { Ok(()) => { backend .write_state(&new_state, expected_cas.as_deref(), &mut observations) diff --git a/crates/omnigraph-cluster/src/store.rs b/crates/omnigraph-cluster/src/store.rs index a156d78..9136f5e 100644 --- a/crates/omnigraph-cluster/src/store.rs +++ b/crates/omnigraph-cluster/src/store.rs @@ -408,10 +408,6 @@ impl ClusterStore { } } - pub(crate) fn payload_display(&self, kind: &ResourceKind, digest: &str) -> Option { - Self::payload_relative(kind, digest).map(|relative| self.display(&relative)) - } - pub(crate) async fn payload_exists(&self, kind: &ResourceKind, digest: &str) -> bool { let Some(relative) = Self::payload_relative(kind, digest) else { return false; diff --git a/crates/omnigraph-cluster/tests/failpoints.rs b/crates/omnigraph-cluster/tests/failpoints.rs index 51997ce..6b6d339 100644 --- a/crates/omnigraph-cluster/tests/failpoints.rs +++ b/crates/omnigraph-cluster/tests/failpoints.rs @@ -13,9 +13,11 @@ use std::fs; use std::path::{Path, PathBuf}; use fail::FailScenario; +use serial_test::serial; use omnigraph::db::Omnigraph; -use omnigraph::failpoints::ScopedFailPoint as EngineScopedFailPoint; -use omnigraph_cluster::failpoints::ScopedFailPoint; +// One ScopedFailPoint for both engine- and cluster-scoped 
failpoint names: +// it is registry-only (error-type agnostic) and lives in the lowest crate. +use omnigraph::failpoints::ScopedFailPoint; use omnigraph_cluster::{ ApplyOptions, apply_config_dir, apply_config_dir_with_options, approve_config_dir, validate_config_dir, @@ -105,12 +107,13 @@ fn query_blob(config_dir: &Path, digests: &BTreeMap) -> PathBuf } #[tokio::test] +#[serial] async fn failpoint_wiring_returns_injected_diagnostic() { let scenario = FailScenario::setup(); let dir = fixture(); seed_applyable_state(dir.path()); - let _failpoint = ScopedFailPoint::new("cluster_apply.after_payload_phase", "return"); + let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_AFTER_PAYLOAD_PHASE, "return"); let out = apply_config_dir(dir.path()).await; assert!(!out.ok); assert!(out.diagnostics.iter().any(|diagnostic| { @@ -127,6 +130,7 @@ async fn failpoint_wiring_returns_injected_diagnostic() { /// state.json is byte-identical, nothing is acknowledged — and a plain re-run /// repairs by trusting the existing content-addressed blobs. #[tokio::test] +#[serial] async fn apply_crash_after_payload_phase_leaves_state_unmoved_then_recovers() { let scenario = FailScenario::setup(); let dir = fixture(); @@ -134,7 +138,7 @@ async fn apply_crash_after_payload_phase_leaves_state_unmoved_then_recovers() { let state_before = fs::read(state_path(dir.path())).unwrap(); { - let _failpoint = ScopedFailPoint::new("cluster_apply.after_payload_phase", "return"); + let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_AFTER_PAYLOAD_PHASE, "return"); let out = apply_config_dir(dir.path()).await; assert!(!out.ok); assert!(!out.state_written); @@ -169,6 +173,7 @@ async fn apply_crash_after_payload_phase_leaves_state_unmoved_then_recovers() { /// (possible under `state.lock: false`) must surface `state_cas_mismatch`, /// acknowledge nothing, and leave the concurrent writer's state on disk. 
#[tokio::test] +#[serial] async fn apply_cas_race_surfaces_state_cas_mismatch() { let scenario = FailScenario::setup(); let dir = fixture(); @@ -179,7 +184,7 @@ async fn apply_cas_race_surfaces_state_cas_mismatch() { // after apply read it but before apply writes. RAII-guarded so a panic // inside apply cannot leak the callback into the global registry. let race_path = state_path(dir.path()); - let failpoint = ScopedFailPoint::with_callback("cluster_apply.before_state_write", move || { + let failpoint = ScopedFailPoint::with_callback(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_BEFORE_STATE_WRITE, move || { let mut state: serde_json::Value = serde_json::from_str(&fs::read_to_string(&race_path).unwrap()).unwrap(); state["state_revision"] = serde_json::json!(99); @@ -256,13 +261,14 @@ fn recovery_sidecars(config_dir: &Path) -> Vec { /// The next run's sweep removes the intent (row 1) and the same run creates /// the graph and converges. #[tokio::test] +#[serial] async fn create_crash_before_init_recovers_via_sweep() { let scenario = FailScenario::setup(); let dir = fixture(); seed_empty_state(dir.path()); { - let _failpoint = ScopedFailPoint::new("cluster_apply.before_graph_create", "return"); + let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_BEFORE_GRAPH_CREATE, "return"); let out = apply_config_dir(dir.path()).await; assert!(!out.ok); assert!(out.diagnostics.iter().any(|diagnostic| { @@ -298,6 +304,7 @@ async fn create_crash_before_init_recovers_via_sweep() { /// ledger is stale, nothing was acknowledged. The next run's sweep rolls the /// ledger forward (row 4) with an audit entry, and the run converges. 
#[tokio::test] +#[serial] async fn create_crash_after_init_rolls_state_forward() { let scenario = FailScenario::setup(); let dir = fixture(); @@ -305,7 +312,7 @@ async fn create_crash_after_init_rolls_state_forward() { let state_before = fs::read(dir.path().join("__cluster/state.json")).unwrap(); { - let _failpoint = ScopedFailPoint::new("cluster_apply.after_graph_create", "return"); + let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_AFTER_GRAPH_CREATE, "return"); let out = apply_config_dir(dir.path()).await; assert!(!out.ok); assert!(!out.state_written); @@ -385,6 +392,7 @@ async fn live_schema_digest(dir: &Path) -> String { /// live schema and ledger are untouched; the next run's sweep retires the /// stale intent and the same run applies and converges. #[tokio::test] +#[serial] async fn schema_crash_before_apply_recovers_via_sweep() { let scenario = FailScenario::setup(); let dir = fixture(); @@ -393,7 +401,7 @@ async fn schema_crash_before_apply_recovers_via_sweep() { fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap(); { - let _failpoint = ScopedFailPoint::new("cluster_apply.before_schema_apply", "return"); + let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_BEFORE_SCHEMA_APPLY, "return"); let out = apply_config_dir_with_options( dir.path(), ApplyOptions { @@ -425,6 +433,7 @@ async fn schema_crash_before_apply_recovers_via_sweep() { /// the graph manifest moves. The defensive cleanup proof should remove the /// cluster sidecar immediately so a pre-movement error cannot brick boot. 
#[tokio::test] +#[serial] async fn schema_apply_error_before_graph_movement_removes_sidecar() { let scenario = FailScenario::setup(); let dir = fixture(); @@ -433,7 +442,7 @@ async fn schema_apply_error_before_graph_movement_removes_sidecar() { fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap(); { - let _failpoint = EngineScopedFailPoint::new("schema_apply.before_staging_write", "return"); + let _failpoint = ScopedFailPoint::new(omnigraph::failpoints::names::SCHEMA_APPLY_BEFORE_STAGING_WRITE, "return"); let out = apply_config_dir(dir.path()).await; assert!(!out.ok); assert!( @@ -462,6 +471,7 @@ async fn schema_apply_error_before_graph_movement_removes_sidecar() { /// prove this is a pre-movement failure, so the sidecar must survive for /// explicit recovery/quarantine instead of being cleaned up defensively. #[tokio::test] +#[serial] async fn schema_apply_error_after_graph_movement_keeps_sidecar() { let scenario = FailScenario::setup(); let dir = fixture(); @@ -472,7 +482,7 @@ async fn schema_apply_error_after_graph_movement_keeps_sidecar() { let v2_digest = desired.resource_digests["schema.knowledge"].clone(); { - let _failpoint = EngineScopedFailPoint::new("schema_apply.after_manifest_commit", "return"); + let _failpoint = ScopedFailPoint::new(omnigraph::failpoints::names::SCHEMA_APPLY_AFTER_MANIFEST_COMMIT, "return"); let out = apply_config_dir(dir.path()).await; assert!(!out.ok); assert!( @@ -524,6 +534,7 @@ async fn schema_apply_error_after_graph_movement_keeps_sidecar() { /// moved, the ledger is stale, nothing acknowledged; the next run's sweep /// rolls the ledger forward with an audit entry and the run converges. 
#[tokio::test] +#[serial] async fn schema_crash_after_apply_rolls_state_forward() { let scenario = FailScenario::setup(); let dir = fixture(); @@ -534,7 +545,7 @@ async fn schema_crash_after_apply_rolls_state_forward() { let v2_digest = desired.resource_digests["schema.knowledge"].clone(); { - let _failpoint = ScopedFailPoint::new("cluster_apply.after_schema_apply", "return"); + let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_AFTER_SCHEMA_APPLY, "return"); let out = apply_config_dir(dir.path()).await; assert!(!out.ok); assert!(!out.state_written); @@ -608,13 +619,14 @@ async fn seed_approved_delete(dir: &Path) -> String { /// next run retires the stale intent (row 8) and the still-approved delete /// completes in the same run. #[tokio::test] +#[serial] async fn delete_crash_before_removal_reproposes() { let scenario = FailScenario::setup(); let dir = fixture(); let approval_id = seed_approved_delete(dir.path()).await; { - let _failpoint = ScopedFailPoint::new("cluster_apply.before_graph_delete", "return"); + let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_BEFORE_GRAPH_DELETE, "return"); let out = apply_config_dir(dir.path()).await; assert!(!out.ok); assert!(dir.path().join("graphs/old.omni").exists()); @@ -650,6 +662,7 @@ async fn delete_crash_before_removal_reproposes() { /// nothing acknowledged; the next run's sweep rolls the tombstone forward, /// consumes the approval the sidecar carries, and audits the recovery. 
#[tokio::test] +#[serial] async fn delete_crash_after_removal_rolls_forward() { let scenario = FailScenario::setup(); let dir = fixture(); @@ -657,7 +670,7 @@ async fn delete_crash_after_removal_rolls_forward() { let state_before = fs::read(state_path(dir.path())).unwrap(); { - let _failpoint = ScopedFailPoint::new("cluster_apply.after_graph_delete", "return"); + let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_AFTER_GRAPH_DELETE, "return"); let out = apply_config_dir(dir.path()).await; assert!(!out.ok); assert!(!out.state_written); diff --git a/crates/omnigraph-compiler/Cargo.toml b/crates/omnigraph-compiler/Cargo.toml index 13c3bbf..f885a9f 100644 --- a/crates/omnigraph-compiler/Cargo.toml +++ b/crates/omnigraph-compiler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-compiler" -version = "0.7.1" +version = "0.7.2" edition = "2024" description = "Schema/query compiler for Omnigraph. Zero Lance dependency." license = "MIT" diff --git a/crates/omnigraph-mcp/Cargo.toml b/crates/omnigraph-mcp/Cargo.toml index 7784b70..92b519a 100644 --- a/crates/omnigraph-mcp/Cargo.toml +++ b/crates/omnigraph-mcp/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-mcp" -version = "0.7.1" +version = "0.7.2" edition = "2024" description = "MCP (Model Context Protocol) Streamable-HTTP transport and backend seam for Omnigraph. Contains the rmcp dependency and defines the McpBackend trait the server implements; names no omnigraph engine/server type, so the dependency edge is server → mcp." license = "MIT" diff --git a/crates/omnigraph-policy/Cargo.toml b/crates/omnigraph-policy/Cargo.toml index 25bedd1..136df84 100644 --- a/crates/omnigraph-policy/Cargo.toml +++ b/crates/omnigraph-policy/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-policy" -version = "0.7.1" +version = "0.7.2" edition = "2024" description = "Policy / authorization layer for Omnigraph — Cedar-backed PolicyEngine, PolicyChecker trait, ResourceScope enum." 
license = "MIT" diff --git a/crates/omnigraph-server/Cargo.toml b/crates/omnigraph-server/Cargo.toml index 00daadb..7d0ca48 100644 --- a/crates/omnigraph-server/Cargo.toml +++ b/crates/omnigraph-server/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-server" -version = "0.7.1" +version = "0.7.2" edition = "2024" description = "HTTP server for the Omnigraph graph database." license = "MIT" @@ -19,14 +19,14 @@ default = [] aws = ["dep:aws-config", "dep:aws-sdk-secretsmanager"] [dependencies] -omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.1" } -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" } -omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.1" } -omnigraph-api-types = { path = "../omnigraph-api-types", version = "0.7.1" } -omnigraph-cluster = { path = "../omnigraph-cluster", version = "0.7.1" } +omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.2" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.2" } +omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.2" } +omnigraph-api-types = { path = "../omnigraph-api-types", version = "0.7.2" } +omnigraph-cluster = { path = "../omnigraph-cluster", version = "0.7.2" } # The MCP surface. rmcp is contained to omnigraph-mcp — the server carries NO # direct rmcp dependency (verify: `cargo tree -p omnigraph-server -e normal | grep rmcp`). -omnigraph-mcp = { path = "../omnigraph-mcp", version = "0.7.1" } +omnigraph-mcp = { path = "../omnigraph-mcp", version = "0.7.2" } axum = { workspace = true } http = "1" clap = { workspace = true } diff --git a/crates/omnigraph/Cargo.toml b/crates/omnigraph/Cargo.toml index c830367..5038fd1 100644 --- a/crates/omnigraph/Cargo.toml +++ b/crates/omnigraph/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-engine" -version = "0.7.1" +version = "0.7.2" edition = "2024" description = "Runtime engine for the Omnigraph graph database." 
license = "MIT" @@ -16,8 +16,8 @@ default = [] failpoints = ["dep:fail", "fail/failpoints"] [dependencies] -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" } -omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.1" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.2" } +omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.2" } lance = { workspace = true } lance-datafusion = { workspace = true } datafusion = { workspace = true } @@ -52,7 +52,7 @@ chrono = { workspace = true } arc-swap = { workspace = true } [dev-dependencies] -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.2" } tokio = { workspace = true } lance-namespace-impls = { workspace = true } lance-io = "7.0.0" diff --git a/crates/omnigraph/src/db/commit_graph.rs b/crates/omnigraph/src/db/commit_graph.rs index 181d1d8..fb61874 100644 --- a/crates/omnigraph/src/db/commit_graph.rs +++ b/crates/omnigraph/src/db/commit_graph.rs @@ -1,6 +1,5 @@ use std::collections::{HashMap, VecDeque}; use std::sync::Arc; -use std::time::{SystemTime, UNIX_EPOCH}; use arrow_array::{ Array, RecordBatch, RecordBatchIterator, StringArray, TimestampMicrosecondArray, UInt64Array, @@ -29,7 +28,16 @@ pub struct GraphCommit { pub struct CommitGraph { root_uri: String, - dataset: Dataset, + /// Handle on `_graph_commits.lance` at the active branch, held only for the + /// branch-management WRITES (`create_branch`, formerly `version`) and + /// `refresh`. It is a DERIVED artifact (RFC-013 Phase 7): graph lineage lives + /// in `__manifest`, and reads (`head_commit`/`load_commits`/`get_commit`/ + /// `merge_base`) never touch it. 
`None` means the branch's + /// `_graph_commits.lance` ref is missing (an interrupted fork-reclaim or a + /// `cleanup` race) while the manifest lineage is still authoritative — so the + /// READS stay correct and only a subsequent `create_branch` surfaces the loud + /// actionable error. Mirrors `actor_dataset`'s best-effort `Option`. + dataset: Option, actor_dataset: Option, active_branch: Option, actor_by_commit_id: HashMap, @@ -38,20 +46,19 @@ pub struct CommitGraph { } impl CommitGraph { - pub async fn init(root_uri: &str, manifest_version: u64) -> Result { + /// Create the commit-graph datasets for a fresh graph. The genesis + /// `graph_commit` + `graph_head` rows live in `__manifest` (folded into the + /// init write — RFC-013 Phase 7), so `_graph_commits.lance` is created EMPTY + /// here: it exists only to carry the Lance branch refs that `create_branch` / + /// `list_branches` / the `cleanup` orphan reconciler operate on. No commit + /// rows are ever written to it. The in-memory cache is sourced from the + /// manifest projection — the same path as [`open`], so genesis is seen + /// identically whether the graph was just initialized or reopened. 
+ pub async fn init(root_uri: &str) -> Result { let root = root_uri.trim_end_matches('/'); let uri = graph_commits_uri(root); - let genesis = GraphCommit { - graph_commit_id: ulid::Ulid::new().to_string(), - manifest_branch: None, - manifest_version, - parent_commit_id: None, - merged_parent_commit_id: None, - actor_id: None, - created_at: now_micros()?, - }; - let batch = commits_to_batch(&[genesis.clone()])?; + let batch = RecordBatch::new_empty(commit_graph_schema()); let reader = RecordBatchIterator::new(vec![Ok(batch)], commit_graph_schema()); let params = WriteParams { mode: WriteMode::Create, @@ -66,17 +73,30 @@ impl CommitGraph { .map_err(|e| OmniError::Lance(e.to_string()))?; let actor_dataset = create_commit_actor_dataset(root).await?; + let (commit_by_id, head_commit) = load_commit_cache_from_manifest(root, None).await?; Ok(Self { root_uri: root.to_string(), - dataset, + dataset: Some(dataset), actor_dataset: Some(actor_dataset), active_branch: None, actor_by_commit_id: HashMap::new(), - commit_by_id: HashMap::from([(genesis.graph_commit_id.clone(), genesis.clone())]), - head_commit: Some(genesis), + commit_by_id, + head_commit, }) } + /// Insert a just-published commit into the in-memory cache (RFC-013 Phase 7). + /// The durable write already happened in the manifest publish CAS; this only + /// keeps the cache consistent for same-handle reads, with no storage I/O. + /// Head selection matches the manifest-sourced load (`should_replace_head`). 
+ pub fn insert_committed(&mut self, commit: GraphCommit) { + if should_replace_head(self.head_commit.as_ref(), &commit) { + self.head_commit = Some(commit.clone()); + } + self.commit_by_id + .insert(commit.graph_commit_id.clone(), commit); + } + pub async fn open(root_uri: &str) -> Result { let root = root_uri.trim_end_matches('/'); let wrapper = crate::instrumentation::commit_graph_wrapper(); @@ -87,17 +107,24 @@ impl CommitGraph { crate::instrumentation::open_dataset_tracked(&graph_commit_actors_uri(root), wrapper) .await .ok(); - let actor_by_commit_id = match &actor_dataset { - Some(dataset) => load_commit_actor_cache(dataset).await?, - None => HashMap::new(), - }; - let (commit_by_id, head_commit) = load_commit_cache(&dataset, &actor_by_commit_id).await?; + // RFC-013 step 4: source the in-memory cache from the `__manifest` + // lineage projection (which carries the actor inline), not from + // `_graph_commits.lance`. The dataset handles above are retained for the + // branch-management ops (create/delete/list/version) that still target + // the commit-graph dataset; the actor dataset is only kept for the + // dual-write append path. The projection-equivalence gate proves this + // cache equals the prior `_graph_commits.lance` read. A pre-Phase-7 (v3) + // graph not yet migrated falls back to the legacy read — see + // `load_commit_cache_for_branch`. + let (commit_by_id, head_commit) = load_commit_cache_for_branch(root, None).await?; Ok(Self { root_uri: root.to_string(), - dataset, + // `open` targets main and never checks out a branch (main cannot be + // deleted/recreated), so the handle is always present here. 
+ dataset: Some(dataset), actor_dataset, active_branch: None, - actor_by_commit_id, + actor_by_commit_id: HashMap::new(), commit_by_id, head_commit, }) @@ -109,25 +136,33 @@ impl CommitGraph { let dataset = crate::instrumentation::open_dataset_tracked(&graph_commits_uri(root), wrapper.clone()) .await?; - let dataset = dataset - .checkout_branch(branch) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; + // Best-effort checkout of the DERIVED `_graph_commits.lance` branch ref. + // It is held only for `create_branch` (a write); the lineage READ below + // comes from `__manifest`. A missing ref (interrupted fork-reclaim / + // `cleanup` race) must not wedge the read, so a typed not-found yields a + // `None` handle — a subsequent `create_branch` then surfaces the loud + // error. Any OTHER open error (transient IO / corrupt) still propagates, + // matching the `force_delete_branch` / `read_legacy_commit_cache` idiom. + let dataset = match dataset.checkout_branch(branch).await { + Ok(ds) => Some(ds), + Err(lance::Error::RefNotFound { .. }) | Err(lance::Error::NotFound { .. }) => None, + Err(e) => return Err(OmniError::Lance(e.to_string())), + }; let actor_dataset = crate::instrumentation::open_dataset_tracked(&graph_commit_actors_uri(root), wrapper) .await .ok(); - let actor_by_commit_id = match &actor_dataset { - Some(dataset) => load_commit_actor_cache(dataset).await?, - None => HashMap::new(), - }; - let (commit_by_id, head_commit) = load_commit_cache(&dataset, &actor_by_commit_id).await?; + // Hard `?`: the manifest existence gate. `load_commit_cache_for_branch` + // opens the branch's `__manifest` (its own `checkout_branch` on the + // authoritative table), so a TRULY absent branch still fails loudly here — + // only the derived `_graph_commits.lance` ref is allowed to be missing. 
+ let (commit_by_id, head_commit) = load_commit_cache_for_branch(root, Some(branch)).await?; Ok(Self { root_uri: root.to_string(), dataset, actor_dataset, active_branch: Some(branch.to_string()), - actor_by_commit_id, + actor_by_commit_id: HashMap::new(), commit_by_id, head_commit, }) @@ -136,40 +171,49 @@ impl CommitGraph { pub async fn refresh(&mut self) -> Result<()> { let root = self.root_uri.clone(); let wrapper = crate::instrumentation::commit_graph_wrapper(); - self.dataset = crate::instrumentation::open_dataset_tracked( + let dataset = crate::instrumentation::open_dataset_tracked( &graph_commits_uri(&root), wrapper.clone(), ) .await?; - if let Some(branch) = &self.active_branch { - self.dataset = self - .dataset - .checkout_branch(branch) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - } + // Same best-effort checkout as `open_at_branch`: a missing DERIVED branch + // ref leaves the handle `None` (only `create_branch` then errors), while + // the in-memory cache re-syncs from the authoritative manifest below. + self.dataset = match &self.active_branch { + Some(branch) => match dataset.checkout_branch(branch).await { + Ok(ds) => Some(ds), + Err(lance::Error::RefNotFound { .. }) | Err(lance::Error::NotFound { .. 
}) => None, + Err(e) => return Err(OmniError::Lance(e.to_string())), + }, + None => Some(dataset), + }; self.actor_dataset = crate::instrumentation::open_dataset_tracked(&graph_commit_actors_uri(&root), wrapper) .await .ok(); - self.actor_by_commit_id = match &self.actor_dataset { - Some(dataset) => load_commit_actor_cache(dataset).await?, - None => HashMap::new(), - }; let (commit_by_id, head_commit) = - load_commit_cache(&self.dataset, &self.actor_by_commit_id).await?; + load_commit_cache_for_branch(&root, self.active_branch.as_deref()).await?; self.commit_by_id = commit_by_id; self.head_commit = head_commit; Ok(()) } - pub fn version(&self) -> u64 { - self.dataset.version().version - } - pub async fn create_branch(&mut self, name: &str) -> Result<()> { - let mut ds = self.dataset.clone(); - ds.create_branch(name, self.version(), None) + // The held `_graph_commits.lance` handle is the only thing that can fork a + // branch ref. If it is missing (an interrupted fork-reclaim or a `cleanup` + // race dropped the derived ref while manifest lineage stayed authoritative), + // fail loudly + actionably rather than silently. Repair is the existing + // `cleanup` orphan reconciler (`reconcile_commit_graph_orphans`), not an + // inline write on this path. + let Some(dataset) = &self.dataset else { + let branch = self.active_branch.as_deref().unwrap_or("main"); + return Err(OmniError::manifest_internal(format!( + "commit-graph branch ref for '{branch}' is missing; run `omnigraph cleanup` then retry" + ))); + }; + let version = dataset.version().version; + let mut ds = dataset.clone(); + ds.create_branch(name, version, None) .await .map_err(|e| OmniError::Lance(e.to_string()))?; Ok(()) @@ -216,7 +260,17 @@ impl CommitGraph { Ok(branches.into_keys().collect()) } - pub async fn append_commit( + // DEAD as of RFC-013 Phase 7: graph commits are recorded in `__manifest` + // (folded into the publish CAS), never appended to `_graph_commits.lance`. 
+ // These append helpers are retained only because the actor sidecar table they + // touch is still enumerated by `optimize` (internal-table compaction); they + // have no caller on any write path. The single-source invariant is guarded by + // `tests/lineage_projection.rs`, which fails if `_graph_commits.lance` ever + // gains a commit row. Do NOT call these to record a commit — use the + // coordinator's `commit_*_with_actor` / `commit_merge_with_actor`, which carry + // the lineage intent into the manifest publish. + #[allow(dead_code)] + async fn append_commit( &mut self, manifest_branch: Option<&str>, manifest_version: u64, @@ -233,7 +287,8 @@ impl CommitGraph { .await } - pub async fn append_merge_commit( + #[allow(dead_code)] + async fn append_merge_commit( &mut self, manifest_branch: Option<&str>, manifest_version: u64, @@ -251,6 +306,7 @@ impl CommitGraph { .await } + #[allow(dead_code)] async fn append_commit_with_parents( &mut self, manifest_branch: Option<&str>, @@ -267,16 +323,22 @@ impl CommitGraph { parent_commit_id: parent_commit_id.map(|s| s.to_string()), merged_parent_commit_id: merged_parent_commit_id.map(|s| s.to_string()), actor_id: actor_id.map(str::to_string), - created_at: now_micros()?, + created_at: crate::db::now_micros()?, }; let batch = commits_to_batch(&[commit.clone()])?; let reader = RecordBatchIterator::new(vec![Ok(batch)], commit_graph_schema()); - let mut ds = self.dataset.clone(); + // This helper is dead on every write path (RFC-013 Phase 7) — reached only + // by the transitional v3 fixtures, which always hold the commits dataset. + // A `None` here would be a fixture bug, so fail loudly rather than silently. 
+ let mut ds = self + .dataset + .clone() + .ok_or_else(|| OmniError::manifest_internal("commit-graph dataset is missing"))?; ds.append(reader, None) .await .map_err(|e| OmniError::Lance(e.to_string()))?; - self.dataset = ds; + self.dataset = Some(ds); if let Some(actor_id) = actor_id { self.append_actor(&graph_commit_id, actor_id).await?; } @@ -289,6 +351,7 @@ impl CommitGraph { Ok(graph_commit_id) } + #[allow(dead_code)] // RFC-013 Phase 7: dead — see `append_commit`. async fn append_actor(&mut self, graph_commit_id: &str, actor_id: &str) -> Result<()> { if self .actor_by_commit_id @@ -301,7 +364,7 @@ impl CommitGraph { let record = CommitActorRecord { graph_commit_id: graph_commit_id.to_string(), actor_id: actor_id.to_string(), - created_at: now_micros()?, + created_at: crate::db::now_micros()?, }; let batch = commit_actors_to_batch(&[record])?; let reader = RecordBatchIterator::new(vec![Ok(batch)], commit_actor_schema()); @@ -452,7 +515,12 @@ async fn create_commit_actor_dataset(root_uri: &str) -> Result { }; match Dataset::write(reader, &uri as &str, Some(params)).await { Ok(dataset) => Ok(dataset), - Err(err) if err.to_string().contains("Dataset already exists") => Dataset::open(&uri) + // Create-or-open idempotency: a concurrent/prior create raced us. Match + // the typed `DatasetAlreadyExists` variant, not the display string — the + // message is not a Lance API contract (a wording change would silently + // break this fallback). Pinned by + // `lance_surface_guards.rs::lance_error_dataset_already_exists_variant_exists`. + Err(lance::Error::DatasetAlreadyExists { .. 
}) => Dataset::open(&uri) .await .map_err(|open_err| OmniError::Lance(open_err.to_string())), Err(err) => Err(OmniError::Lance(err.to_string())), @@ -490,6 +558,156 @@ fn commits_to_batch(commits: &[GraphCommit]) -> Result { .map_err(|e| OmniError::Lance(e.to_string())) } +/// Build the in-memory commit cache for `branch`, choosing the source by the +/// branch manifest's internal-schema stamp (RFC-013 step 4 forward/back-compat): +/// +/// - stamp ≥ v4 (post-Phase-7, the normal case): the `__manifest` lineage +/// projection — `graph_commit`/`graph_head` rows folded into the publish CAS. +/// - stamp < v4 (a pre-Phase-7 graph not yet migrated): the legacy +/// `_graph_commits.lance` read. This is the **transitional v3 fallback** that +/// lets a READ-ONLY open of an un-migrated graph still see correct history — +/// a read-only open never runs the v3→v4 backfill (it must not write), so +/// without this gate it would read an empty DAG from `__manifest`. A +/// read-write open backfills `__manifest` on its first write and thereafter +/// takes the projection branch. +/// +/// Both sources pick the head with `should_replace_head`, so the cache is +/// identical regardless of which branch is taken. Remove the fallback once no +/// graph below internal-schema v4 remains. +async fn load_commit_cache_for_branch( + root_uri: &str, + branch: Option<&str>, +) -> Result<(HashMap, Option)> { + let stamp = crate::db::manifest::internal_schema_stamp_at(root_uri, branch).await?; + // Defense-in-depth: refuse a branch whose stamp this binary cannot serve — + // newer than CURRENT, or below MIN_SUPPORTED — for the same reason the main + // read path does (`refuse_if_internal_schema_unsupported`). A `> CURRENT` stamp + // means a newer binary wrote a shape we can't read, so the projection below + // would misread it; a `< MIN` stamp predates the legacy readers this binary + // still carries. 
Not a live hole today: migrations run main-first + // (`migrate_on_open` migrates main; each branch migrates on its own first + // write), so main's stamp bounds every branch's and the main read path already + // refuses first. The guard closes the gap if that ordering is ever weakened. + crate::db::manifest::refuse_if_stamp_unsupported(stamp)?; + if stamp < crate::db::manifest::INTERNAL_MANIFEST_SCHEMA_VERSION { + // Transitional: un-migrated v3 graph — read lineage from the legacy + // `_graph_commits.lance` so reads (incl. read-only opens) see history. + return read_legacy_commit_cache(root_uri, branch).await; + } + load_commit_cache_from_manifest(root_uri, branch).await +} + +/// Build the in-memory commit cache from the `__manifest` graph-lineage +/// projection (RFC-013 step 4) rather than `_graph_commits.lance`. The lineage +/// rows carry the actor inline, so no separate actor-table read is needed. Head +/// selection is identical to [`load_commit_cache`] (`should_replace_head`), so +/// the resulting cache is equivalent to the prior `_graph_commits.lance` read. 
+async fn load_commit_cache_from_manifest( + root_uri: &str, + branch: Option<&str>, +) -> Result<(HashMap, Option)> { + let (rows, _heads) = + crate::db::manifest::ManifestCoordinator::read_graph_lineage_at(root_uri, branch).await?; + let mut commit_by_id = HashMap::with_capacity(rows.len()); + let mut head_commit = None; + for row in rows { + let commit = GraphCommit { + graph_commit_id: row.graph_commit_id, + manifest_branch: row.manifest_branch, + manifest_version: row.manifest_version, + parent_commit_id: row.parent_commit_id, + merged_parent_commit_id: row.merged_parent_commit_id, + actor_id: row.actor_id, + created_at: row.created_at, + }; + if should_replace_head(head_commit.as_ref(), &commit) { + head_commit = Some(commit.clone()); + } + commit_by_id.insert(commit.graph_commit_id.clone(), commit); + } + Ok((commit_by_id, head_commit)) +} + +/// Read the legacy `_graph_commits.lance` (+ its actor sidecar) for `branch` +/// into an in-memory cache — the transitional source for graphs not yet +/// migrated to internal-schema v4 (RFC-013 step 4). Two callers, both +/// transitional: the v3→v4 migration backfill (which copies these rows into +/// `__manifest`) and the read-only v3 fallback in `CommitGraph::open*`. Returns +/// `(commit_by_id, head)`, with the head picked by `should_replace_head` — +/// identical to the manifest projection. A genuinely ABSENT (not-found) commit +/// dataset or actor sidecar yields an empty cache (no head); any OTHER open error +/// (transient IO / corrupt file) propagates loudly rather than being read as +/// "empty" — a swallow here would let the v3→v4 migration backfill nothing and +/// still stamp v4, orphaning the real lineage permanently. This keeps the legacy +/// readers alive while any v3 graph survives; once no graph is below v4 it can +/// retire. 
+pub(crate) async fn read_legacy_commit_cache( + root_uri: &str, + branch: Option<&str>, +) -> Result<(HashMap, Option)> { + let root = root_uri.trim_end_matches('/'); + let commits_uri = graph_commits_uri(root); + let commits_open = match crate::failpoints::maybe_fail_lance_open("migration.v3_to_v4.legacy_open") + { + Ok(()) => Dataset::open(&commits_uri).await, + Err(injected) => Err(injected), + }; + let mut dataset = match commits_open { + Ok(dataset) => dataset, + // An ABSENT commits dataset is the legitimate "no legacy data" signal — + // a graph with no `_graph_commits.lance` (or none on this branch) yields + // an empty cache. But ONLY a genuine not-found gets that treatment: a + // transient/corrupt open (IO / CorruptFile / …) must propagate, never be + // read as "empty". The v3→v4 migration calls this once before stamping + // v4; swallowing a non-not-found error here would backfill nothing and + // stamp v4 anyway, orphaning the real lineage permanently (the migration + // never re-runs, and the v3 fallback is then disabled). Lance maps an + // object-store NotFound to `DatasetNotFound`; the variant match (vs an + // existence probe) is exactly right and not over-strict — pinned by + // `lance_surface_guards.rs::dataset_open_missing_returns_not_found_variant`. + Err(lance::Error::DatasetNotFound { .. }) | Err(lance::Error::NotFound { .. }) => { + return Ok((HashMap::new(), None)); + } + Err(e) => return Err(OmniError::Lance(e.to_string())), + }; + if let Some(branch) = branch.filter(|b| *b != "main") { + dataset = dataset + .checkout_branch(branch) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + } + + // The actor sidecar may be absent (older graphs without authored commits); + // an empty actor map then leaves every commit's actor `None`. 
It is read + // FLAT (no branch checkout): the pre-Phase-7 commit graph never forked the + // actor dataset — actors are keyed by `graph_commit_id` globally — so a + // branch's commits resolve their actor from the same single actor table. + // This matches the live `CommitGraph::open_at_branch`, which also opens the + // actor dataset on main while checking out the branch only on the commits + // dataset. + let actors_open = + match crate::failpoints::maybe_fail_lance_open("migration.v3_to_v4.legacy_open") { + Ok(()) => Dataset::open(&graph_commit_actors_uri(root)).await, + Err(injected) => Err(injected), + }; + let actor_by_commit_id = match actors_open { + Ok(actor_dataset) => load_commit_actor_cache(&actor_dataset).await?, + // An ABSENT actor sidecar is benign (older graphs without authored + // commits) — every commit's actor stays `None`. A not-found is therefore + // the empty-map signal. But a CORRUPT/transient actor open must NOT be + // read as "no authors": silently wiping all authorship and then stamping + // v4 is the same permanent-loss hole as the commits arm, so anything + // other than not-found propagates. (Same variant contract, different + // rationale — absence is normal here, error is not.) + Err(lance::Error::DatasetNotFound { .. }) | Err(lance::Error::NotFound { .. }) => { + HashMap::new() + } + Err(e) => return Err(OmniError::Lance(e.to_string())), + }; + + load_commit_cache(&dataset, &actor_by_commit_id).await +} + async fn load_commit_cache( dataset: &Dataset, actor_by_commit_id: &HashMap, @@ -694,11 +912,170 @@ async fn open_for_branch(root_uri: &str, branch: Option<&str>) -> Result Result { - let duration = SystemTime::now() - .duration_since(UNIX_EPOCH) - .map_err(|e| OmniError::manifest(format!("system clock before UNIX_EPOCH: {}", e)))?; - Ok(duration.as_micros() as i64) +/// Identities of the commits written into a synthetic pre-Phase-7 (v3) graph by +/// [`seed_legacy_v3_lineage`], for assertions after migration. 
+// +// Gated on `test` OR the `failpoints` feature: the v3→v4 migration fault-injection +// test lives in the `failpoints` integration binary (the fail registry is +// process-global, so failpoint tests must not run in-source), and that binary +// compiles the crate without `cfg(test)` — so it needs this fixture under the +// feature too. Still excluded from release builds. +#[cfg(any(test, feature = "failpoints"))] +#[derive(Debug, Clone)] +pub struct V3LineageFixture { + /// The genesis (parentless) commit id. + pub genesis: String, + /// A direct, authored commit on main (actor `act-a`). + pub commit_a: String, + /// A commit tagged to the `feature` branch (actor `act-feature`). + pub feature_commit: String, + /// The merge commit on main: parent = `commit_a`, merged_parent = + /// `feature_commit`, actor `act-merger`. This is the head of main. + pub merge_commit: String, + /// Every commit id written, in append order (for count assertions). + pub all_ids: Vec, +} + +/// Build a synthetic pre-Phase-7 (internal-schema v3) graph at `root_uri`: graph +/// lineage lives ONLY in `_graph_commits.lance` (+ its actor sidecar), `__manifest` +/// carries NO `graph_commit`/`graph_head` rows, and the stamp is set to v3. This +/// reproduces exactly the on-disk shape a graph created by a pre-RFC-013-Phase-7 +/// binary would have, so the v3→v4 migration and the v3-read fallback can be +/// tested against it. +/// +/// The lineage is a realistic DAG with a branch + a real merge: genesis → A → +/// (feature commit, off to the side) → merge(A, feature) at the head of main, +/// with authored actors on the non-genesis commits. Reaches the dead-on-the- +/// write-path `append_commit_with_parents`/`append_actor` (still present for +/// exactly this transitional purpose) to write the legacy rows. +#[cfg(any(test, feature = "failpoints"))] +pub async fn seed_legacy_v3_lineage(root_uri: &str) -> Result { + let root = root_uri.trim_end_matches('/'); + + // 1. 
Create `__manifest` (Phase-7 folds genesis lineage into it) and the + // EMPTY legacy `_graph_commits.lance`. We then append the v3-style commit + // rows below — a real v3 graph carried its genesis in `_graph_commits`. + crate::db::manifest::seed_manifest_for_v3_fixture(root).await?; + let mut cg = CommitGraph::init(root).await?; + // Clear the cache that init seeded from the (genesis-bearing) manifest, so + // the appended rows below are the whole story and parents come out right. + cg.commit_by_id.clear(); + cg.head_commit = None; + + // 2. Append the legacy lineage to `_graph_commits.lance` on main. + let genesis = cg + .append_commit_with_parents(None, 1, None, None, None) + .await?; + let commit_a = cg + .append_commit_with_parents(None, 2, Some(&genesis), None, Some("act-a")) + .await?; + let feature_commit = cg + .append_commit_with_parents(Some("feature"), 3, Some(&commit_a), None, Some("act-feature")) + .await?; + let merge_commit = cg + .append_commit_with_parents( + None, + 4, + Some(&commit_a), + Some(&feature_commit), + Some("act-merger"), + ) + .await?; + + // 3. Strip the genesis lineage rows the Phase-7 init folded into `__manifest` + // and rewind the stamp to v3, so the manifest matches a true pre-Phase-7 + // graph (no lineage in `__manifest`, stamp v3). + crate::db::manifest::strip_lineage_and_set_v3_stamp_for_fixture(root).await?; + + Ok(V3LineageFixture { + genesis: genesis.clone(), + commit_a: commit_a.clone(), + feature_commit: feature_commit.clone(), + merge_commit: merge_commit.clone(), + all_ids: vec![genesis, commit_a, feature_commit, merge_commit], + }) +} + +/// Identities of a synthetic pre-Phase-7 (v3) graph that carries a REAL Lance +/// branch (built by [`seed_legacy_v3_lineage_with_branch`]). +#[cfg(test)] +#[derive(Debug, Clone)] +pub struct V3BranchedLineageFixture { + /// The genesis (parentless) commit on main. + pub genesis: String, + /// A direct authored commit on main (actor `act-a`). The head of main. 
+ pub commit_a: String, + /// A commit on the real `feature` Lance branch (actor `act-branch`), + /// parented off `commit_a`. The head of `feature`. + pub branch_commit: String, + /// The branch name forked on both `_graph_commits.lance` and `__manifest`. + pub branch: String, +} + +/// Build a synthetic pre-Phase-7 (internal-schema v3) graph at `root_uri` that +/// carries a REAL Lance branch `feature` on BOTH `_graph_commits.lance` and +/// `__manifest`, reproducing exactly the on-disk shape of a branched graph +/// created by a pre-RFC-013-Phase-7 binary: +/// +/// - `_graph_commits.lance`: main has `genesis → A`; the `feature` Lance branch +/// adds `branch_commit` (parent `A`). Authored actors land in the FLAT actor +/// sidecar (the pre-Phase-7 commit graph never forked the actor table). +/// - `__manifest`: main is stamped v3 with NO lineage rows; the `feature` branch +/// is forked from main's v3 state, so it too is v3 with NO lineage of its own. +/// +/// This is the fixture the per-branch v3→v4 migration runs against: it lets a +/// test prove that migrating the `feature` branch reads the branch's legacy +/// lineage, writes it into the BRANCH's `__manifest`, and leaves main untouched — +/// the case the main-only [`seed_legacy_v3_lineage`] cannot exercise. +#[cfg(test)] +pub async fn seed_legacy_v3_lineage_with_branch(root_uri: &str) -> Result { + let root = root_uri.trim_end_matches('/'); + + // 1. `__manifest` (genesis folded by Phase-7 init) + an empty legacy + // `_graph_commits.lance`. Clear the init-seeded cache so the rows we + // append below are the whole story. + crate::db::manifest::seed_manifest_for_v3_fixture(root).await?; + let mut cg = CommitGraph::init(root).await?; + cg.commit_by_id.clear(); + cg.head_commit = None; + + // 2. Main lineage on `_graph_commits.lance`: genesis → A (authored). 
+ let genesis = cg + .append_commit_with_parents(None, 1, None, None, None) + .await?; + let commit_a = cg + .append_commit_with_parents(None, 2, Some(&genesis), None, Some("act-a")) + .await?; + + // 3. Fork a real `feature` Lance branch on `_graph_commits.lance`, switch the + // handle to it, and append an authored branch commit (its actor lands in + // the flat main actor table — exactly the pre-Phase-7 shape). + cg.create_branch("feature").await?; + let commits_ds = cg + .dataset + .take() + .expect("commits dataset present after create_branch") + .checkout_branch("feature") + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + cg.dataset = Some(commits_ds); + cg.active_branch = Some("feature".to_string()); + let branch_commit = cg + .append_commit_with_parents(Some("feature"), 3, Some(&commit_a), None, Some("act-branch")) + .await?; + + // 4. Rewind main's `__manifest` to the v3 shape (strip the folded genesis + // lineage, set stamp 3) BEFORE forking — so the `feature` manifest branch + // inherits the stripped v3 state (no lineage, stamp 3). + crate::db::manifest::strip_lineage_and_set_v3_stamp_for_fixture(root).await?; + crate::db::manifest::fork_manifest_branch_for_v3_fixture(root, "feature").await?; + + Ok(V3BranchedLineageFixture { + genesis, + commit_a, + branch_commit, + branch: "feature".to_string(), + }) } #[cfg(test)] @@ -709,6 +1086,83 @@ mod tests { use super::*; + // RFC-013 step 4: the v3-read fallback / migration source reads a NAMED + // branch's lineage from a real Lance branch on `_graph_commits.lance`, while + // resolving actors from the FLAT actor table (the pre-Phase-7 commit graph + // forked only the commits dataset, never the actor sidecar). This guards + // both that branch-checkout path and the flat-actor resolution — the case + // the main-branch fixture (commits on main only) does not exercise. 
+ #[tokio::test] + async fn read_legacy_commit_cache_resolves_branch_commits_with_flat_actors() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + // A v3 graph needs `__manifest` to exist for `CommitGraph::init`'s + // genesis-cache seed; we clear that cache and write our own legacy rows. + crate::db::manifest::seed_manifest_for_v3_fixture(uri) + .await + .unwrap(); + let mut cg = CommitGraph::init(uri).await.unwrap(); + cg.commit_by_id.clear(); + cg.head_commit = None; + + // Main lineage: genesis → A (authored). The actor lands in the FLAT + // `_graph_commit_actors.lance` (never branched). + let genesis = cg + .append_commit_with_parents(None, 1, None, None, None) + .await + .unwrap(); + let commit_a = cg + .append_commit_with_parents(None, 2, Some(&genesis), None, Some("act-a")) + .await + .unwrap(); + + // Fork a real Lance branch on `_graph_commits.lance`, switch the handle + // to it, and append an authored branch commit (its actor also goes to + // the flat main actor table — exactly the pre-Phase-7 shape). + cg.create_branch("feature").await.unwrap(); + cg.dataset = Some( + cg.dataset + .take() + .unwrap() + .checkout_branch("feature") + .await + .unwrap(), + ); + cg.active_branch = Some("feature".to_string()); + let branch_commit = cg + .append_commit_with_parents( + Some("feature"), + 3, + Some(&commit_a), + None, + Some("act-branch"), + ) + .await + .unwrap(); + + // The legacy read at the branch sees the inherited main commits + the + // branch commit, the head is the branch commit, and the authored actors + // resolve from the flat table (no branch checkout on the actor dataset). 
+ let (commits, head) = read_legacy_commit_cache(uri, Some("feature")).await.unwrap(); + assert_eq!(commits.len(), 3, "branch inherits genesis + A + its own commit"); + assert_eq!( + head.as_ref().unwrap().graph_commit_id, + branch_commit, + "the branch commit is the head" + ); + assert_eq!( + commits.get(&commit_a).unwrap().actor_id.as_deref(), + Some("act-a"), + "main commit's actor resolves from the flat actor table", + ); + assert_eq!( + commits.get(&branch_commit).unwrap().actor_id.as_deref(), + Some("act-branch"), + "branch commit's actor resolves from the flat actor table", + ); + } + #[test] fn load_commits_from_batches_returns_error_for_bad_schema() { let batch = RecordBatch::try_new( diff --git a/crates/omnigraph/src/db/graph_coordinator.rs b/crates/omnigraph/src/db/graph_coordinator.rs index b9bcb11..aff791d 100644 --- a/crates/omnigraph/src/db/graph_coordinator.rs +++ b/crates/omnigraph/src/db/graph_coordinator.rs @@ -106,13 +106,17 @@ impl GraphCoordinator { storage: Arc, ) -> Result { let root = normalize_root_uri(root_uri)?; + // The genesis graph commit is folded into the manifest init write, so + // `__manifest` is the single source of graph lineage from version one + // (RFC-013 Phase 7). `CommitGraph::init` then creates the empty + // branch-ref dataset and seeds its cache from that manifest genesis. let manifest = ManifestCoordinator::init(&root, catalog).await?; - let commit_graph = Some(CommitGraph::init(&root, manifest.version()).await?); + let commit_graph = CommitGraph::init(&root).await?; Ok(Self { root_uri: root, storage, manifest, - commit_graph, + commit_graph: Some(commit_graph), bound_branch: None, }) } @@ -257,7 +261,7 @@ impl GraphCoordinator { /// fresh, so any existing commit-graph branch with this name is provably /// orphaned and is force-dropped before recreating. 
async fn create_commit_graph_branch(&mut self, branch: &str) -> Result<()> { - failpoints::maybe_fail("branch_create.after_manifest_branch_create")?; + failpoints::maybe_fail(crate::failpoints::names::BRANCH_CREATE_AFTER_MANIFEST_BRANCH_CREATE)?; let Some(commit_graph) = &mut self.commit_graph else { return Ok(()); }; @@ -306,7 +310,7 @@ impl GraphCoordinator { /// Best-effort, idempotent reclaim of the commit-graph branch `branch`. /// Tolerates an absent commit-graph dataset (a graph that never committed). async fn reclaim_commit_graph_branch(&mut self, branch: &str) -> Result<()> { - failpoints::maybe_fail("branch_delete.before_commit_graph_reclaim")?; + failpoints::maybe_fail(crate::failpoints::names::BRANCH_DELETE_BEFORE_COMMIT_GRAPH_RECLAIM)?; if let Some(commit_graph) = &mut self.commit_graph { commit_graph.force_delete_branch(branch).await } else if self @@ -438,7 +442,12 @@ impl GraphCoordinator { .exists(&graph_commits_uri(self.root_uri())) .await? { - let _ = CommitGraph::init(self.root_uri(), self.manifest.version()).await?; + // A graph opened without a commit-graph dataset gets the empty + // branch-ref dataset created lazily here. Graph lineage lives in + // `__manifest` (RFC-013 Phase 7) — a graph initialized by current + // code already carries its genesis there, and the commit graph + // sources its cache from it. No genesis is written here. 
+ CommitGraph::init(self.root_uri()).await?; } self.commit_graph = match self.current_branch() { Some(branch) => Some(CommitGraph::open_at_branch(self.root_uri(), branch).await?), @@ -452,12 +461,8 @@ impl GraphCoordinator { updates: &[SubTableUpdate], actor_id: Option<&str>, ) -> Result { - let manifest_version = self.commit_manifest_updates(updates).await?; - let snapshot_id = self.record_graph_commit(manifest_version, actor_id).await?; - Ok(PublishedSnapshot { - manifest_version, - _snapshot_id: snapshot_id, - }) + self.commit_updates_with_actor_with_expected(updates, &HashMap::new(), actor_id) + .await } /// Commit with publisher-level OCC fence. The `expected_table_versions` map @@ -471,45 +476,9 @@ impl GraphCoordinator { expected_table_versions: &HashMap, actor_id: Option<&str>, ) -> Result { - let manifest_version = self - .commit_manifest_updates_with_expected(updates, expected_table_versions) - .await?; - let snapshot_id = self.record_graph_commit(manifest_version, actor_id).await?; - Ok(PublishedSnapshot { - manifest_version, - _snapshot_id: snapshot_id, - }) - } - - pub(crate) async fn commit_manifest_updates( - &mut self, - updates: &[SubTableUpdate], - ) -> Result { - let manifest_version = self.manifest.commit(updates).await?; - failpoints::maybe_fail("graph_publish.after_manifest_commit")?; - Ok(manifest_version) - } - - pub(crate) async fn commit_manifest_updates_with_expected( - &mut self, - updates: &[SubTableUpdate], - expected_table_versions: &HashMap, - ) -> Result { - let manifest_version = self - .manifest - .commit_with_expected(updates, expected_table_versions) - .await?; - failpoints::maybe_fail("graph_publish.after_manifest_commit")?; - Ok(manifest_version) - } - - pub(crate) async fn commit_manifest_changes( - &mut self, - changes: &[ManifestChange], - ) -> Result { - let manifest_version = self.manifest.commit_changes(changes).await?; - failpoints::maybe_fail("graph_publish.after_manifest_commit")?; - Ok(manifest_version) + let changes 
= updates_to_changes(updates); + self.commit_changes_with_actor_with_expected(&changes, expected_table_versions, actor_id) + .await } pub(crate) async fn commit_changes_with_actor( @@ -517,71 +486,110 @@ impl GraphCoordinator { changes: &[ManifestChange], actor_id: Option<&str>, ) -> Result { - let manifest_version = self.commit_manifest_changes(changes).await?; - let snapshot_id = self.record_graph_commit(manifest_version, actor_id).await?; + self.commit_changes_with_actor_with_expected(changes, &HashMap::new(), actor_id) + .await + } + + /// Publish `changes` and record one graph commit in the SAME manifest CAS + /// (RFC-013 Phase 7). The lineage intent (a freshly minted commit id, the + /// branch, the actor) rides the publish so the `graph_commit` + `graph_head` + /// rows land atomically with the table-version rows — one manifest version, + /// no separate write, no `commit_graph.refresh()` to pick a parent (the + /// publisher resolves it under the CAS). The in-memory commit cache is then + /// updated from the intent + the resolved parent without a re-read. 
+ async fn commit_changes_with_actor_with_expected( + &mut self, + changes: &[ManifestChange], + expected_table_versions: &HashMap, + actor_id: Option<&str>, + ) -> Result { + self.ensure_commit_graph_initialized().await?; + let intent = self.new_lineage_intent(actor_id, None)?; + failpoints::maybe_fail(crate::failpoints::names::GRAPH_PUBLISH_BEFORE_COMMIT_APPEND)?; + let outcome = self + .manifest + .commit_changes_with_lineage(changes, expected_table_versions, Some(&intent)) + .await?; + failpoints::maybe_fail(crate::failpoints::names::GRAPH_PUBLISH_AFTER_MANIFEST_COMMIT)?; + let snapshot_id = self.apply_lineage_to_cache(intent, &outcome); Ok(PublishedSnapshot { - manifest_version, + manifest_version: outcome.version, _snapshot_id: snapshot_id, }) } - pub(crate) async fn record_graph_commit( + /// Publish a branch-merge: `updates` (the merged table versions) plus the + /// merge commit, in one manifest CAS (RFC-013 Phase 7). The merge commit's + /// merged-in parent is `merged_parent_commit_id` (the source head, stable); + /// its first parent is resolved by the publisher as the current target-branch + /// head — the live head, which is the post-merge correct parent even if the + /// target advanced since the merge began. + pub(crate) async fn commit_merge_with_actor( &mut self, - manifest_version: u64, - actor_id: Option<&str>, - ) -> Result { - self.ensure_commit_graph_initialized().await?; - let current_branch = self.current_branch().map(str::to_string); - let Some(commit_graph) = &mut self.commit_graph else { - return Ok(SnapshotId::synthetic( - current_branch.as_deref(), - manifest_version, - self.manifest_incarnation().e_tag.as_deref(), - )); - }; - failpoints::maybe_fail("graph_publish.before_commit_append")?; - // Refresh the commit-graph head from storage before selecting the - // parent. 
`append_commit` parents the new commit on the IN-MEMORY head - // (`head_commit_id`, zero storage read), but the manifest was just - // committed against a freshly rebased pin (`commit_all` opens a fresh - // coordinator) while THIS coordinator's cached head may be stale because - // an external writer advanced the branch. Without this refresh a - // same-branch write after an external commit appends off the stale head - // and FORKS the commit DAG (the new commit and the external commit - // sharing a parent). Refreshing makes the parent the true current head; - // the just-committed manifest version has no commit-graph row yet, so the - // fresh head is exactly the prior commit. (record_merge_commit is - // unaffected — it passes explicit parents, never the cached head.) - commit_graph.refresh().await?; - let graph_commit_id = commit_graph - .append_commit(current_branch.as_deref(), manifest_version, actor_id) - .await?; - Ok(SnapshotId::new(graph_commit_id)) - } - - pub(crate) async fn record_merge_commit( - &mut self, - manifest_version: u64, - parent_commit_id: &str, + updates: &[SubTableUpdate], merged_parent_commit_id: &str, actor_id: Option<&str>, ) -> Result { self.ensure_commit_graph_initialized().await?; - let current_branch = self.current_branch().map(str::to_string); - let commit_graph = self.commit_graph.as_mut().ok_or_else(|| { - OmniError::manifest("branch merge requires _graph_commits.lance".to_string()) - })?; - failpoints::maybe_fail("graph_publish.before_commit_append")?; - let graph_commit_id = commit_graph - .append_merge_commit( - current_branch.as_deref(), - manifest_version, - parent_commit_id, - merged_parent_commit_id, - actor_id, - ) + let intent = + self.new_lineage_intent(actor_id, Some(merged_parent_commit_id.to_string()))?; + failpoints::maybe_fail(crate::failpoints::names::GRAPH_PUBLISH_BEFORE_COMMIT_APPEND)?; + let changes = updates_to_changes(updates); + let outcome = self + .manifest + .commit_changes_with_lineage(&changes, 
&HashMap::new(), Some(&intent)) .await?; - Ok(SnapshotId::new(graph_commit_id)) + failpoints::maybe_fail(crate::failpoints::names::GRAPH_PUBLISH_AFTER_MANIFEST_COMMIT)?; + Ok(self.apply_lineage_to_cache(intent, &outcome)) + } + + /// Mint a [`LineageIntent`] for the next commit on the current branch: a + /// fresh ULID (stable across the publisher's CAS retries) and a timestamp. + /// The parent is NOT chosen here — the publisher resolves it per attempt + /// against the manifest it commits against. + fn new_lineage_intent( + &self, + actor_id: Option<&str>, + merged_parent_commit_id: Option, + ) -> Result { + Ok(crate::db::manifest::LineageIntent { + graph_commit_id: ulid::Ulid::new().to_string(), + branch: self.current_branch().map(str::to_string), + actor_id: actor_id.map(str::to_string), + merged_parent_commit_id, + created_at: crate::db::now_micros()?, + }) + } + + /// Insert the just-published commit into the in-memory commit cache from the + /// intent + the publisher-resolved parent + the new manifest version. No + /// storage I/O: the durable write already happened in the publish CAS, and + /// this keeps a same-handle read's `head_commit_id` consistent with the + /// snapshot it just advanced. Falls back to a synthetic id only when the + /// commit graph is somehow absent (never on a real write). 
+ fn apply_lineage_to_cache( + &mut self, + intent: crate::db::manifest::LineageIntent, + outcome: &crate::db::manifest::CommitOutcome, + ) -> SnapshotId { + let Some(commit_graph) = &mut self.commit_graph else { + return SnapshotId::synthetic( + self.bound_branch.as_deref(), + outcome.version, + self.manifest.incarnation().e_tag.as_deref(), + ); + }; + let commit = GraphCommit { + graph_commit_id: intent.graph_commit_id.clone(), + manifest_branch: intent.branch, + manifest_version: outcome.version, + parent_commit_id: outcome.parent_commit_id.clone(), + merged_parent_commit_id: intent.merged_parent_commit_id, + actor_id: intent.actor_id, + created_at: intent.created_at, + }; + commit_graph.insert_committed(commit); + SnapshotId::new(intent.graph_commit_id) } async fn open_commit_graph_for_branch( @@ -625,6 +633,15 @@ fn graph_commits_uri(root_uri: &str) -> String { join_uri(root_uri, GRAPH_COMMITS_DIR) } +/// Wrap each `SubTableUpdate` as a `ManifestChange::Update` for the publisher. +fn updates_to_changes(updates: &[SubTableUpdate]) -> Vec { + updates + .iter() + .cloned() + .map(ManifestChange::Update) + .collect() +} + fn normalize_branch_name(branch: &str) -> Result> { let branch = branch.trim(); if branch.is_empty() { diff --git a/crates/omnigraph/src/db/manifest.rs b/crates/omnigraph/src/db/manifest.rs index fa05b49..da22136 100644 --- a/crates/omnigraph/src/db/manifest.rs +++ b/crates/omnigraph/src/db/manifest.rs @@ -35,7 +35,9 @@ pub(crate) use metadata::TableVersionMetadata; use metadata::{OMNIGRAPH_ROW_COUNT_KEY, table_version_metadata_for_state}; #[cfg(test)] use namespace::{branch_manifest_namespace, staged_table_namespace}; -use publisher::{GraphNamespacePublisher, ManifestBatchPublisher}; +pub(crate) use migrations::refuse_if_stamp_unsupported; +pub(crate) use publisher::LineageIntent; +use publisher::{GraphNamespacePublisher, ManifestBatchPublisher, PublishOutcome}; pub(crate) use recovery::{ RecoveryMode, RecoverySidecar, RecoverySidecarHandle, 
SidecarKind, SidecarTablePin, SidecarTableRegistration, SidecarTombstone, confirm_sidecar_phase_b, delete_sidecar, @@ -43,6 +45,7 @@ pub(crate) use recovery::{ recover_manifest_drift, schema_apply_serial_queue_key, write_sidecar, }; pub use state::SubTableEntry; +pub(crate) use state::{GraphLineageRow, read_graph_lineage}; #[cfg(test)] use state::string_column; use state::{ManifestState, read_manifest_state}; @@ -50,8 +53,34 @@ use state::{ManifestState, read_manifest_state}; const OBJECT_TYPE_TABLE: &str = "table"; const OBJECT_TYPE_TABLE_VERSION: &str = "table_version"; const OBJECT_TYPE_TABLE_TOMBSTONE: &str = "table_tombstone"; +/// Immutable per-commit graph-lineage row (RFC-013 Phase 7). One row per graph +/// commit; the projected form reconstructs a [`GraphCommit`]. `__manifest` is +/// the single source — written in the same publish CAS as the table-version +/// rows (no `_graph_commits.lance` row). +const OBJECT_TYPE_GRAPH_COMMIT: &str = "graph_commit"; +/// Mutable per-branch head pointer for the graph lineage (RFC-013 Phase 7). +/// `object_id` is `graph_head:` (`graph_head:main` for the main branch). +const OBJECT_TYPE_GRAPH_HEAD: &str = "graph_head"; const TABLE_VERSION_MANAGEMENT_KEY: &str = "table_version_management"; +/// Stable head-key segment for the main branch in `graph_head:` rows. +/// `table_branch`/`manifest_branch` encode main as null, but `object_id` must be +/// non-null, so the head row needs a literal — matching the `"main"` sentinel +/// already used by `SnapshotId::synthetic` and `open_for_branch`. +pub(crate) const MAIN_BRANCH_HEAD_KEY: &str = "main"; + +/// The result of a manifest commit that may have folded in a graph commit +/// (RFC-013 Phase 7). +#[derive(Debug, Clone)] +pub(crate) struct CommitOutcome { + /// The new `__manifest` version after the publish. + pub version: u64, + /// The parent the publisher resolved for the recorded commit, or `None` when + /// no lineage was recorded or the commit is the genesis. 
Lets the caller + /// update its in-memory commit cache without re-reading the manifest. + pub parent_commit_id: Option, +} + /// Apply pending internal-schema migrations against `__manifest` on the /// open-for-write path, independent of a publish. /// @@ -65,7 +94,105 @@ const TABLE_VERSION_MANAGEMENT_KEY: &str = "table_version_management"; /// Idempotent: a no-op stamp read when the on-disk version already matches. pub(crate) async fn migrate_on_open(root_uri: &str) -> Result<()> { let mut dataset = open_manifest_dataset(root_uri, None).await?; - migrations::migrate_internal_schema(&mut dataset).await + // Main branch: the v3→v4 lineage backfill reads `_graph_commits.lance` at + // main. Named branches migrate on their own first write via the publisher. + migrations::migrate_internal_schema(&mut dataset, root_uri, None).await +} + +/// The on-disk internal-schema stamp of `__manifest` at `branch` (main when +/// `None`). The transitional v3-read fallback in `CommitGraph` uses this to +/// decide whether to source lineage from `__manifest` (stamp ≥ v4, post-Phase-7) +/// or from the legacy `_graph_commits.lance` (stamp < v4, not yet migrated). +pub(crate) async fn internal_schema_stamp_at(root_uri: &str, branch: Option<&str>) -> Result { + let dataset = open_manifest_dataset(root_uri, branch).await?; + Ok(migrations::read_stamp(&dataset)) +} + +/// Refuse to open a graph whose `__manifest` is stamped outside this binary's +/// supported internal-schema range (newer than CURRENT, or older than +/// MIN_SUPPORTED). The read-only open path calls this — it skips the write-path +/// migration where the refusal otherwise lives — so an old binary still refuses a +/// newer graph instead of silently misreading it, and a too-new binary refuses a +/// below-floor graph instead of opening an unmigrated one. 
+pub(crate) async fn refuse_if_internal_schema_unsupported(root_uri: &str) -> Result<()> { + let stamp = internal_schema_stamp_at(root_uri, None).await?; + migrations::refuse_if_stamp_unsupported(stamp) +} + +/// The internal-schema version this binary writes. Exposed so the v3-read +/// fallback can compare a branch's on-disk stamp against it. +pub(crate) const INTERNAL_MANIFEST_SCHEMA_VERSION: u32 = + migrations::INTERNAL_MANIFEST_SCHEMA_VERSION; + +/// Test-only: create a `__manifest` for a minimal catalog, the first half of a +/// synthetic pre-Phase-7 (v3) graph (see `commit_graph::seed_legacy_v3_lineage`). +/// A small two-type schema is enough — the v3→v4 migration touches only the +/// lineage rows, never the table-version rows. +#[cfg(any(test, feature = "failpoints"))] +pub(crate) async fn seed_manifest_for_v3_fixture(root_uri: &str) -> Result<()> { + let schema = omnigraph_compiler::schema::parser::parse_schema( + "node Person { name: String }\nedge Knows: Person -> Person { }\n", + ) + .map_err(|e| OmniError::manifest(e.to_string()))?; + let catalog = + omnigraph_compiler::catalog::build_catalog(&schema).map_err(|e| OmniError::manifest(e.to_string()))?; + ManifestCoordinator::init(root_uri, &catalog).await?; + Ok(()) +} + +/// Test-only: strip the `graph_commit`/`graph_head` rows that Phase-7 init folds +/// into `__manifest`, then rewind the internal-schema stamp to v3 — completing a +/// synthetic pre-Phase-7 graph whose lineage lives only in `_graph_commits.lance`. +#[cfg(any(test, feature = "failpoints"))] +pub(crate) async fn strip_lineage_and_set_v3_stamp_for_fixture(root_uri: &str) -> Result<()> { + let mut dataset = open_manifest_dataset(root_uri, None).await?; + dataset + .delete("object_type = 'graph_commit' OR object_type = 'graph_head'") + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + // Re-open so the stamp write lands on the post-delete HEAD. 
+ let mut dataset = open_manifest_dataset(root_uri, None).await?; + migrations::set_stamp_for_test(&mut dataset, 3).await +} + +/// Test-only: fork a real Lance branch `name` on `__manifest` from main's CURRENT +/// state. Call AFTER `strip_lineage_and_set_v3_stamp_for_fixture` so the forked +/// branch inherits the v3 stamp with no lineage rows — i.e. a faithful +/// pre-Phase-7 branch whose `__manifest` carries no lineage of its own. The +/// branch's commits live only on the `_graph_commits.lance` branch until the +/// per-branch v3→v4 migration runs against this branch's `__manifest`. +#[cfg(test)] +pub(crate) async fn fork_manifest_branch_for_v3_fixture(root_uri: &str, name: &str) -> Result<()> { + let mut dataset = open_manifest_dataset(root_uri, None).await?; + let version = dataset.version().version; + dataset + .create_branch(name, version, None) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + Ok(()) +} + +/// Test-support re-export of the read-write migration entry point for the +/// `failpoints` integration binary (which can't reach `pub(crate)` items). Gated +/// on `test` OR `failpoints`; never in a release build. +#[cfg(any(test, feature = "failpoints"))] +pub async fn migrate_on_open_for_test(root_uri: &str) -> Result<()> { + migrate_on_open(root_uri).await +} + +/// Test-support: the number of `graph_commit` lineage rows in `__manifest` at +/// `branch` (main when `None`), plus the on-disk internal-schema stamp. Lets the +/// `failpoints` integration binary assert the migration neither stamped nor +/// backfilled when a legacy-open fault fired. Gated on `test` OR `failpoints`. 
+#[cfg(any(test, feature = "failpoints"))] +pub async fn lineage_row_count_and_stamp_for_test( + root_uri: &str, + branch: Option<&str>, +) -> Result<(usize, u32)> { + let dataset = open_manifest_dataset(root_uri, branch).await?; + let stamp = migrations::read_stamp(&dataset); + let (rows, _heads) = read_graph_lineage(&dataset).await?; + Ok((rows.len(), stamp)) } /// Immutable point-in-time view of the database. @@ -313,6 +440,9 @@ impl ManifestCoordinator { /// Create a new graph at `root_uri` from a catalog. /// /// Creates per-type Lance datasets and the namespace `__manifest` table. + /// The genesis graph commit is folded into the init write, so `__manifest` + /// is the single source of graph lineage from version one — callers read it + /// back through the lineage projection rather than via a second write. pub async fn init(root_uri: &str, catalog: &Catalog) -> Result { let root = root_uri.trim_end_matches('/'); let (dataset, known_state) = init_manifest_graph(root, catalog).await?; @@ -419,17 +549,58 @@ impl ManifestCoordinator { changes: &[ManifestChange], expected_table_versions: &HashMap, ) -> Result { - if changes.is_empty() && expected_table_versions.is_empty() { - return Ok(self.version()); + Ok(self + .commit_changes_with_lineage(changes, expected_table_versions, None) + .await? + .version) + } + + /// Publish `changes` and, when `lineage` is present, record the graph commit + /// in the SAME merge-insert (RFC-013 Phase 7). `__manifest` is the single + /// source of graph lineage: the `graph_commit` + `graph_head:` rows + /// ride the table-version publish so the whole commit lands at one manifest + /// version — no separate write, no manifest→commit-graph atomicity gap, no + /// per-write commit-graph refresh. Returns the new version and the parent the + /// publisher resolved for the commit (so the caller can update its in-memory + /// commit cache without a re-read). 
+ pub(crate) async fn commit_changes_with_lineage( + &mut self, + changes: &[ManifestChange], + expected_table_versions: &HashMap, + lineage: Option<&LineageIntent>, + ) -> Result { + if changes.is_empty() && expected_table_versions.is_empty() && lineage.is_none() { + return Ok(CommitOutcome { + version: self.version(), + parent_commit_id: None, + }); } - self.dataset = self + let PublishOutcome { + dataset, + parent_commit_id, + } = self .publisher - .publish(changes, expected_table_versions) + .publish(changes, expected_table_versions, lineage) .await?; + self.dataset = dataset; self.known_state = read_manifest_state(&self.dataset).await?; - Ok(self.version()) + Ok(CommitOutcome { + version: self.version(), + parent_commit_id, + }) + } + + /// Project the graph-lineage rows out of `__manifest` at `branch` without an + /// open coordinator. Opens the manifest fresh; used by `CommitGraph` to + /// source its in-memory cache from the manifest projection. + pub(crate) async fn read_graph_lineage_at( + root_uri: &str, + branch: Option<&str>, + ) -> Result<(Vec, HashMap)> { + let dataset = open_manifest_dataset(root_uri, branch).await?; + read_graph_lineage(&dataset).await } /// Current manifest version. diff --git a/crates/omnigraph/src/db/manifest/graph.rs b/crates/omnigraph/src/db/manifest/graph.rs index da2c641..e0d3c85 100644 --- a/crates/omnigraph/src/db/manifest/graph.rs +++ b/crates/omnigraph/src/db/manifest/graph.rs @@ -14,9 +14,17 @@ use super::layout::{manifest_uri, open_manifest_dataset, type_name_hash}; use super::metadata::TableVersionMetadata; use super::migrations::stamp_current_version; use super::state::{ - ManifestState, SubTableEntry, entries_to_batch, manifest_schema, read_manifest_state, + GraphLineageRow, ManifestState, SubTableEntry, entries_to_batch, graph_lineage_row_parts, + manifest_schema, read_manifest_state, }; +/// The manifest version the init `Dataset::write` produces (Lance datasets start +/// at version one). 
The genesis graph commit pins this version — a snapshot at +/// it is the empty, freshly-initialized graph. The two config-only commits that +/// follow (`update_config`, `stamp_current_version`) advance the live manifest +/// version but add no table data, so genesis correctly stays pinned at one. +const GENESIS_MANIFEST_VERSION: u64 = 1; + pub(super) async fn init_manifest_graph( root_uri: &str, catalog: &Catalog, @@ -24,7 +32,21 @@ pub(super) async fn init_manifest_graph( let root = root_uri.trim_end_matches('/'); let (entries, version_metadata) = build_initial_entries(root, catalog).await?; - let manifest_batch = entries_to_batch(&entries, &version_metadata)?; + // Genesis graph commit: parentless, actorless, minted once and folded into + // the init write so `__manifest` is the single source of graph lineage from + // version one (no `_graph_commits.lance` row, no separate publish). + let genesis = GraphLineageRow { + graph_commit_id: ulid::Ulid::new().to_string(), + manifest_branch: None, + manifest_version: GENESIS_MANIFEST_VERSION, + parent_commit_id: None, + merged_parent_commit_id: None, + actor_id: None, + created_at: crate::db::now_micros()?, + }; + let genesis_lineage = graph_lineage_row_parts(&genesis, None)?; + + let manifest_batch = entries_to_batch(&entries, &version_metadata, &genesis_lineage)?; let schema = manifest_schema(); let reader = RecordBatchIterator::new(vec![Ok(manifest_batch)], schema); let params = WriteParams { diff --git a/crates/omnigraph/src/db/manifest/migrations.rs b/crates/omnigraph/src/db/manifest/migrations.rs index 2a65079..207def4 100644 --- a/crates/omnigraph/src/db/manifest/migrations.rs +++ b/crates/omnigraph/src/db/manifest/migrations.rs @@ -37,6 +37,9 @@ use lance::Dataset; use crate::error::{OmniError, Result}; +use crate::db::commit_graph::GraphCommit; +use super::state::{GraphLineageRow, graph_lineage_row_parts, merge_lineage_rows, read_graph_lineage}; + /// Current internal schema version this binary expects to 
find on disk. /// /// History: @@ -50,14 +53,62 @@ use crate::error::{OmniError, Result}; /// `__manifest` dataset by the pre-v0.4.0 Run state machine (removed in /// MR-771). Once swept, the `is_internal_run_branch` defense-in-depth guard /// is no longer needed (MR-770). -pub(super) const INTERNAL_MANIFEST_SCHEMA_VERSION: u32 = 3; +/// - v4 — RFC-013 Phase 7 folds graph lineage into `__manifest` as +/// `graph_commit`/`graph_head` rows written in the publish CAS. A pre-Phase-7 +/// (v3) graph has its lineage only in `_graph_commits.lance`, so the new +/// binary would read an empty commit DAG. This one-time per-branch backfill +/// copies the lineage from `_graph_commits.lance` into `__manifest` +/// (`migrate_v3_to_v4`). `_graph_commits.lance` is left in place as the +/// branch-ref carrier; no commit rows are ever written to it again. +pub(crate) const INTERNAL_MANIFEST_SCHEMA_VERSION: u32 = 4; + +/// The oldest on-disk internal-schema stamp this binary will open. A graph below +/// this floor is refused (`refuse_if_stamp_unsupported`) with a "migrate it +/// forward with an older release first" error, instead of obliging this binary to +/// carry that version's `migrate_vN_…` arm and the legacy readers it needs +/// forever. Raising the floor is how the migration chain sheds old code. +/// +/// **Retirement runbook** — turning "accumulates forever" into a sliding window: +/// 1. *Shed version N* once no graph below `N+1` remains in the fleet: bump this +/// floor AND `LOWEST_REGISTERED_MIGRATION_SOURCE` to `N+1`, then delete the +/// `N =>` arm in `migrate_internal_schema`, `migrate_vN_to_vN+1`, and its +/// helpers + tests. The tripwire test keeps the two consts in lockstep, so a +/// half-done shed fails CI. +/// 2. 
*Retire the v3 legacy readers entirely* once MIN ≥ 4: `git rm` the +/// `commit_graph/commit_graph_legacy_v3.rs` seam file and flip the single +/// `stamp < CURRENT` gate in `load_commit_cache_for_branch` to read the +/// manifest projection unconditionally. +/// +/// MIN = 1 today is a pure no-op: `read_stamp` floors an absent stamp at 1 and no +/// real graph carries 0, so nothing is refused. +pub(crate) const MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION: u32 = 1; + +/// The lowest `current` value the `migrate_internal_schema` dispatcher still has a +/// `match` arm for. Mirrors the lowest registered migration source so a floor bump +/// that forgets to delete the now-dead arm (or vice versa) is caught by the +/// compile-time tripwire below. Migration arms aren't an enumerable registry, so +/// this hand-mirrored const is the minimal enforced coupling — cheaper than +/// reshaping the dispatcher into a data-driven table. +const LOWEST_REGISTERED_MIGRATION_SOURCE: u32 = 1; + +/// Retirement tripwire (compile-time): the refusal floor and the lowest migration +/// arm must move together. Raising `MIN_SUPPORTED` without deleting the now-dead +/// below-floor arm — or vice versa — fails the build with this message, which is +/// stronger than a runtime test and impossible to skip. Migration arms can't be +/// enumerated, so this const-mirror is the check. +const _: () = assert!( + LOWEST_REGISTERED_MIGRATION_SOURCE == MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION, + "internal-schema floor drifted from the lowest registered migration arm: when raising \ + MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION, delete every below-floor `N =>` arm + migrate_vN_… \ + + its helpers/tests and bump LOWEST_REGISTERED_MIGRATION_SOURCE to match (or vice versa)", +); const INTERNAL_SCHEMA_VERSION_KEY: &str = "omnigraph:internal_schema_version"; const OBJECT_ID_PK_KEY: &str = "lance-schema:unenforced-primary-key"; /// Read the on-disk stamp from `__manifest`'s schema-level metadata. 
/// Absent ⇒ v1 (pre-stamp world). -pub(super) fn read_stamp(dataset: &Dataset) -> u32 { +pub(crate) fn read_stamp(dataset: &Dataset) -> u32 { dataset .schema() .metadata @@ -72,20 +123,52 @@ pub(super) async fn stamp_current_version(dataset: &mut Dataset) -> Result<()> { set_stamp(dataset, INTERNAL_MANIFEST_SCHEMA_VERSION).await } +/// Refuse to open a manifest whose stamp this binary cannot serve — in either +/// direction — with a clear upgrade path. Shared by every place a stamp is read +/// and enforced: the write-path migration dispatcher, the read-only open guard, +/// and the branch lineage-read path. Checking both bounds in one function means a +/// new stamp-reading caller gets the floor and the ceiling together and cannot +/// half-enforce. +/// +/// - `stamp > CURRENT`: the graph was written by a newer binary — upgrade omnigraph. +/// - `stamp < MIN_SUPPORTED`: the graph predates the oldest migration this binary +/// still carries — migrate it forward with an older release first, then reopen. +pub(crate) fn refuse_if_stamp_unsupported(stamp: u32) -> Result<()> { + if stamp > INTERNAL_MANIFEST_SCHEMA_VERSION { + return Err(OmniError::manifest(format!( + "__manifest is stamped at internal schema v{} but this binary expects v{} \ + — upgrade omnigraph before opening this graph", + stamp, INTERNAL_MANIFEST_SCHEMA_VERSION, + ))); + } + if stamp < MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION { + return Err(OmniError::manifest(format!( + "__manifest is stamped at internal schema v{} but this binary supports v{} or later \ + — open it with an older omnigraph release to migrate it forward first, then reopen", + stamp, MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION, + ))); + } + Ok(()) +} + /// Apply any pending internal-schema migrations to the manifest dataset. /// /// Idempotent: when the on-disk stamp matches the binary, this is a single /// metadata read with no writes. 
-pub(super) async fn migrate_internal_schema(dataset: &mut Dataset) -> Result<()> { +/// +/// `root_uri` + `branch` identify which graph + branch this `dataset` is a +/// manifest for. The v3→v4 lineage backfill needs them to read that branch's +/// `_graph_commits.lance`. `migrate_on_open` passes the main branch +/// (`branch = None`); the publisher's `load_publish_state` passes its own +/// branch, so each branch backfills on its first write. +pub(super) async fn migrate_internal_schema( + dataset: &mut Dataset, + root_uri: &str, + branch: Option<&str>, +) -> Result<()> { let mut current = read_stamp(dataset); - if current > INTERNAL_MANIFEST_SCHEMA_VERSION { - return Err(OmniError::manifest(format!( - "__manifest is stamped at internal schema v{} but this binary expects v{} \ - — upgrade omnigraph before opening this graph for writes", - current, INTERNAL_MANIFEST_SCHEMA_VERSION, - ))); - } + refuse_if_stamp_unsupported(current)?; while current < INTERNAL_MANIFEST_SCHEMA_VERSION { match current { @@ -97,6 +180,10 @@ pub(super) async fn migrate_internal_schema(dataset: &mut Dataset) -> Result<()> migrate_v2_to_v3(dataset).await?; current = 3; } + 3 => { + migrate_v3_to_v4(dataset, root_uri, branch).await?; + current = 4; + } other => { return Err(OmniError::manifest_internal(format!( "no internal-schema migration registered for v{} → v{}", @@ -202,6 +289,218 @@ async fn migrate_v2_to_v3(dataset: &mut Dataset) -> Result<()> { set_stamp(dataset, 3).await } +/// v3 → v4: backfill the graph lineage from `_graph_commits.lance` into +/// `__manifest`, then bump the stamp. +/// +/// RFC-013 Phase 7 made `__manifest` the single source of graph lineage +/// (`graph_commit` / `graph_head:` rows, written in the publish CAS). +/// A pre-Phase-7 (v3) graph has its lineage only in `_graph_commits.lance` and +/// none in `__manifest`, so the new binary would read an EMPTY commit DAG. 
This +/// one-time per-branch migration copies that branch's commits + the single head +/// into `__manifest` so reads see the real history. `_graph_commits.lance` +/// itself is left untouched as the branch-ref carrier (no commit row is ever +/// written to it again). +/// +/// `dataset` is the `__manifest` for `branch` (main when `branch` is `None`); +/// the migration runs per-branch on that branch's first write, so it reads +/// `_graph_commits.lance` at the SAME branch. +/// +/// Idempotency + crash recovery: the stamp bump is the LAST step, and the +/// lineage merge is keyed on `object_id` (re-inserting the same commit rows is a +/// no-op update). A crash after the merge but before the stamp bump re-enters +/// here at v3 and re-runs harmlessly. As a fast path, if `__manifest` already +/// carries `graph_commit` rows (a previous run completed the merge), we skip +/// straight to the stamp bump. +/// +/// Concurrent runners: two processes (or two open-for-write handles) can open the +/// same legacy graph at once and both reach the backfill merge. `merge_lineage_rows` +/// uses `conflict_retries(0)`, so the row-level CAS loser on `graph_head:` +/// must be re-driven here rather than failing the open — `migrate_v2_to_v3` is +/// concurrent-runner idempotent and this step must be too. The bounded loop +/// re-reads the fast path (a concurrent winner's merge is one atomic Lance commit, +/// so a re-read sees either zero or all of its rows, never partial), re-opens the +/// stale handle past the winner's commit, and retries. On budget exhaustion it +/// returns a `RowLevelCasContention`-typed error so the publisher's OUTER retry +/// loop (which only re-runs `is_retryable_publish_conflict` conflicts) completes +/// it on the next attempt — the same converge-on-next-attempt contract the +/// recovery sweep uses. 
+async fn migrate_v3_to_v4( + dataset: &mut Dataset, + root_uri: &str, + branch: Option<&str>, +) -> Result<()> { + // Mirror the publisher's budget (`publisher::PUBLISHER_RETRY_BUDGET = 5`); kept + // as a local const rather than re-exporting that private one — the two are the + // same shape (bounded row-level-CAS retries) but independent knobs. + const MIGRATION_MERGE_RETRY_BUDGET: u32 = 5; + + // Exclusive range + an unguarded retryable arm (see `commit_v4_stamp_idempotently` + // for the rationale): every retryable conflict re-opens and retries inside the + // loop, and the SINGLE reachable exhaustion path is the typed contention return + // below — so the retryable variant can never fall through to the `Err(err)` + // propagate arm on the last iteration. + for _ in 0..MIGRATION_MERGE_RETRY_BUDGET { + // Fast path / idempotency + concurrent-winner guard: if the backfill + // already landed (a previous run, OR a concurrent runner that won the CAS + // — its merge is atomic, so this is all-or-nothing), don't re-merge — just + // (re)stamp. `dataset` is re-opened past any winner's commit below, so this + // re-read sees the winner's rows on a retry. + let (existing_lineage, _heads) = read_graph_lineage(dataset).await?; + if !existing_lineage.is_empty() { + return commit_v4_stamp_idempotently(dataset, root_uri, branch).await; + } + + // Read this branch's legacy commit cache (commits + the head). An absent or + // empty `_graph_commits.lance` yields no commits — nothing to backfill. + let (commit_by_id, head) = + crate::db::commit_graph::read_legacy_commit_cache(root_uri, branch).await?; + if commit_by_id.is_empty() { + return commit_v4_stamp_idempotently(dataset, root_uri, branch).await; + } + + let parts = build_lineage_backfill_parts(&commit_by_id, head.as_ref(), branch)?; + + match merge_lineage_rows(dataset.clone(), &parts).await { + Ok(new_dataset) => { + *dataset = new_dataset; + // Stamp LAST. 
Crash window: a failure between the merge above and + // this stamp bump leaves stamp v3 + lineage present in `__manifest`. + // The next open re-enters at v3, the fast path at the top sees the + // lineage and skips straight to the stamp bump — completing the + // migration with no duplicate rows (the merge is keyed on + // `object_id`). Pinned by + // `crash_after_merge_before_stamp_completes_on_next_open`. + return commit_v4_stamp_idempotently(dataset, root_uri, branch).await; + } + // A concurrent runner won the `graph_head:` CAS. Our in-hand + // handle is stale at the pre-contention HEAD, so a re-open is required + // to see the winner's commit; then re-loop (the fast path will see the + // winner's lineage and stamp). Bounded by the budget. + Err(err) if super::publisher::is_retryable_publish_conflict(&err) => { + *dataset = super::layout::open_manifest_dataset(root_uri, branch).await?; + continue; + } + Err(err) => return Err(err), + } + } + + // Budget exhausted under sustained contention. Return a CAS-typed error (not a + // plain conflict) so the publisher's outer retry loop — which only re-runs + // `is_retryable_publish_conflict` — re-runs `load_publish_state` and completes + // the migration, rather than giving up. + Err(OmniError::manifest_row_level_cas_contention(format!( + "v3→v4 lineage backfill exhausted {} retries against concurrent runners", + MIGRATION_MERGE_RETRY_BUDGET + ))) +} + +/// Stamp the v3→v4 migration's terminal version idempotently under concurrent +/// runners. `set_stamp` issues an `UpdateConfig` Lance commit; once the merge CAS +/// loser is made to converge (above), BOTH runners reach this stamp bump and race +/// it — the loser gets `lance::Error::IncompatibleTransaction` (two `UpdateConfig` +/// commits touching the same metadata key), which is NOT a row-level CAS +/// contention and so is not caught by the merge loop. 
But both write the SAME +/// value, so the conflict is benign: re-open and, if the stamp already reached the +/// target (the concurrent runner finished it), succeed; otherwise re-apply. +/// Bounded; on exhaustion surface a CAS-typed error for the publisher's outer +/// retry, same as the merge loop. +async fn commit_v4_stamp_idempotently( + dataset: &mut Dataset, + root_uri: &str, + branch: Option<&str>, +) -> Result<()> { + const STAMP_RETRY_BUDGET: u32 = 5; + // Exclusive range + an UNGUARDED `IncompatibleTransaction` arm: the retryable + // variant is always handled inside the loop (re-open + same-value check + retry), + // so it can never fall through to the stringifying `Err(e)` catch-all, and the + // SINGLE reachable exhaustion path is the typed contention return below. (A + // `0..=BUDGET` range with an `attempt < BUDGET` guard let the last iteration's + // retryable conflict reach the catch-all and return a non-retryable + // `OmniError::Lance` — the publisher's outer retry would then give up.) + for _ in 0..STAMP_RETRY_BUDGET { + // Write via `stamp_internal_schema` (rather than `set_stamp`) so the + // raw Lance error variant is in hand — `set_stamp` pre-stringifies it. + let stamp_result = stamp_internal_schema(dataset).await; + match stamp_result { + Ok(_) => return Ok(()), + Err(lance::Error::IncompatibleTransaction { .. }) => { + // A concurrent runner's `UpdateConfig` preempted ours — the + // retryable case. Re-open past its commit; if it already stamped to + // the target we're done (the value is identical), else fall through + // to retry on the advanced handle. + *dataset = super::layout::open_manifest_dataset(root_uri, branch).await?; + if read_stamp(dataset) >= INTERNAL_MANIFEST_SCHEMA_VERSION { + return Ok(()); + } + } + Err(e) => return Err(OmniError::Lance(e.to_string())), + } + } + + // Exhausted the budget against sustained concurrent stampers. 
Return a + // CAS-typed (retryable) error so the publisher's OUTER retry — which only + // re-runs `is_retryable_publish_conflict` — completes it, rather than the + // stringified `OmniError::Lance` it would treat as fatal. + Err(OmniError::manifest_row_level_cas_contention(format!( + "v3→v4 stamp bump exhausted {} retries against concurrent runners", + STAMP_RETRY_BUDGET + ))) +} + +/// The single `update_schema_metadata` write that bumps the on-disk internal-schema +/// stamp to the current version. Extracted from `commit_v4_stamp_idempotently`'s +/// retry loop so a `failpoints` test can inject a concurrent-stamper +/// `IncompatibleTransaction` deterministically (the loop's exhaustion path is +/// otherwise near-unreachable). Returns the RAW `lance::Error` so the loop can match +/// the `IncompatibleTransaction` variant — `set_stamp` pre-stringifies it. +async fn stamp_internal_schema(dataset: &mut Dataset) -> std::result::Result<(), lance::Error> { + crate::failpoints::maybe_fail_lance_incompatible("migration.v4_stamp.force_incompatible")?; + dataset + .update_schema_metadata([( + INTERNAL_SCHEMA_VERSION_KEY.to_string(), + INTERNAL_MANIFEST_SCHEMA_VERSION.to_string(), + )]) + .await + .map(|_| ()) +} + +/// Build the `__manifest` rows for the v3→v4 backfill: one immutable +/// `graph_commit` row per commit, plus EXACTLY ONE `graph_head:` row for +/// the actual head. Each commit encodes to a `[graph_commit, graph_head]` pair, +/// but only the head commit's head row is kept — the others would be redundant +/// updates of the same `graph_head:` object_id (the head is per-branch, +/// not per-commit). 
+fn build_lineage_backfill_parts( + commit_by_id: &std::collections::HashMap, + head: Option<&GraphCommit>, + branch: Option<&str>, +) -> Result> { + let head_id = head.map(|h| h.graph_commit_id.as_str()); + // Deterministic iteration order (the source is a HashMap): merge-insert is + // keyed on `object_id` so the final manifest content is order-independent, + // but a stable order keeps the produced batch reproducible regardless. + let mut commits: Vec<&GraphCommit> = commit_by_id.values().collect(); + commits.sort_by(|a, b| a.graph_commit_id.cmp(&b.graph_commit_id)); + let mut parts = Vec::with_capacity(commits.len() + 1); + for commit in commits { + let row = GraphLineageRow { + graph_commit_id: commit.graph_commit_id.clone(), + manifest_branch: commit.manifest_branch.clone(), + manifest_version: commit.manifest_version, + parent_commit_id: commit.parent_commit_id.clone(), + merged_parent_commit_id: commit.merged_parent_commit_id.clone(), + actor_id: commit.actor_id.clone(), + created_at: commit.created_at, + }; + let [commit_part, head_part] = graph_lineage_row_parts(&row, branch)?; + parts.push(commit_part); + if Some(commit.graph_commit_id.as_str()) == head_id { + parts.push(head_part); + } + } + Ok(parts) +} + async fn set_stamp(dataset: &mut Dataset, version: u32) -> Result<()> { dataset .update_schema_metadata([(INTERNAL_SCHEMA_VERSION_KEY.to_string(), version.to_string())]) @@ -209,3 +508,42 @@ async fn set_stamp(dataset: &mut Dataset, version: u32) -> Result<()> { .map_err(|e| OmniError::Lance(e.to_string()))?; Ok(()) } + +/// Test-only: force the on-disk internal-schema stamp to `version`. Used to +/// synthesize a pre-migration graph (rewinding to v3) and to simulate a crash +/// that lost the final stamp bump. Gated on `test` OR `failpoints` so the +/// fault-injection migration test (in the `failpoints` integration binary, +/// compiled without `cfg(test)`) can reach it too. 
+#[cfg(any(test, feature = "failpoints"))] +pub(crate) async fn set_stamp_for_test(dataset: &mut Dataset, version: u32) -> Result<()> { + set_stamp(dataset, version).await +} + +#[cfg(test)] +mod tests { + use super::*; + + /// The floor never refuses any stamp the binary can actually serve — a graph + /// at MIN through CURRENT passes, only sub-MIN / super-CURRENT are rejected. + /// With MIN = 1 and CURRENT = 4 this proves the live range is exactly [1, 4] + /// and that the floor is a no-op for every real graph (lowest real stamp is 1). + #[test] + fn unsupported_guard_accepts_exactly_the_supported_range() { + for stamp in MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION..=INTERNAL_MANIFEST_SCHEMA_VERSION { + assert!( + refuse_if_stamp_unsupported(stamp).is_ok(), + "stamp v{stamp} is within [MIN, CURRENT] and must be accepted" + ); + } + if MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION > 0 { + assert!( + refuse_if_stamp_unsupported(MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION - 1).is_err(), + "a sub-floor stamp must be refused" + ); + } + assert!( + refuse_if_stamp_unsupported(INTERNAL_MANIFEST_SCHEMA_VERSION + 1).is_err(), + "a future stamp must be refused" + ); + } +} diff --git a/crates/omnigraph/src/db/manifest/publisher.rs b/crates/omnigraph/src/db/manifest/publisher.rs index ba1166d..382b51a 100644 --- a/crates/omnigraph/src/db/manifest/publisher.rs +++ b/crates/omnigraph/src/db/manifest/publisher.rs @@ -35,8 +35,8 @@ use super::layout::{open_manifest_dataset, tombstone_object_id, version_object_i use super::metadata::parse_namespace_version_request; use super::migrations::migrate_internal_schema; use super::state::{ - manifest_rows_batch, manifest_schema, read_manifest_entries, read_registered_table_locations, - read_tombstone_versions, + GraphLineageRow, GraphLineageRowPart, graph_lineage_row_parts, head_lineage_row, + manifest_rows_batch, manifest_schema, read_publish_scan, }; use super::{ ManifestChange, OBJECT_TYPE_TABLE, OBJECT_TYPE_TABLE_TOMBSTONE, OBJECT_TYPE_TABLE_VERSION, 
@@ -50,13 +50,48 @@ use super::{ /// iteration re-runs `load_publish_state` and the expected-version pre-check. const PUBLISHER_RETRY_BUDGET: u32 = 5; +/// The graph-lineage commit to record atomically with a manifest publish +/// (RFC-013 Phase 7). One logical commit per publish: the `graph_commit_id` is +/// minted once by the caller and stays stable across the publisher's CAS +/// retries; only the parent re-resolves per attempt (against the freshly loaded +/// `__manifest`), so a retry after a concurrent commit parents off the new head +/// — the TOCTOU the dual-write era's `commit_graph.refresh()` guarded is closed +/// by construction. +#[derive(Debug, Clone)] +pub(crate) struct LineageIntent { + /// ULID minted once before the publish loop; the graph commit's identity. + pub graph_commit_id: String, + /// The branch this commit lands on (`None` = main). Selects the + /// `graph_head:` pointer row that gets updated. + pub branch: Option, + /// Authoring actor, or `None` for unauthored / system writes. + pub actor_id: Option, + /// The merged-in source head — `Some` only for a branch-merge commit. + pub merged_parent_commit_id: Option, + /// Commit timestamp (microseconds since the UNIX epoch). + pub created_at: i64, +} + +/// The result of a manifest publish that may have folded in a graph commit. +#[derive(Debug)] +pub(super) struct PublishOutcome { + /// The advanced `__manifest` dataset (its version is the published version). + pub dataset: Dataset, + /// The parent the publisher resolved for the recorded commit, if a + /// [`LineageIntent`] was supplied. Returned so the caller can update its + /// in-memory commit cache without a re-read. `None` when no lineage was + /// recorded, or when the commit is the genesis (no parent). 
+ pub parent_commit_id: Option, +} + #[async_trait] pub(super) trait ManifestBatchPublisher: Send + Sync { async fn publish( &self, changes: &[ManifestChange], expected_table_versions: &HashMap, - ) -> Result; + lineage: Option<&LineageIntent>, + ) -> Result; } pub(super) struct GraphNamespacePublisher { @@ -76,6 +111,19 @@ struct PendingVersionRow { row_count: Option, } +/// Everything one CAS attempt needs out of a single `__manifest` scan +/// (RFC-013 P2): the open dataset, table state for the pre-check + pending-row +/// build, and the `graph_commit` lineage rows for parent resolution. Folding the +/// lineage into this struct is what lets `resolve_lineage_rows` skip its own +/// `read_graph_lineage` scan. +struct LoadedPublishState { + dataset: Dataset, + registered_tables: HashMap, + existing_versions: HashMap<(String, u64), SubTableEntry>, + existing_tombstones: HashMap<(String, u64), ()>, + lineage_rows: Vec, +} + impl GraphNamespacePublisher { pub(super) fn new(root_uri: &str, branch: Option<&str>) -> Self { Self { @@ -90,22 +138,31 @@ impl GraphNamespacePublisher { open_manifest_dataset(&self.root_uri, self.branch.as_deref()).await } - async fn load_publish_state( - &self, - ) -> Result<( - Dataset, - HashMap, - HashMap<(String, u64), SubTableEntry>, - HashMap<(String, u64), ()>, - )> { + async fn load_publish_state(&self) -> Result { + // Test seam: inject a retryable contention here to exercise the outer + // retry loop's re-run-on-retryable-load-error path (no-op without the + // `failpoints` feature). The migration surfaces the same typed error. + crate::failpoints::maybe_fail_retryable_contention( + crate::failpoints::names::PUBLISH_LOAD_STATE_RETRYABLE_CONTENTION, + )?; let mut dataset = self.dataset().await?; // Run pending internal-schema migrations exactly once per publish on // the open-for-write path; idempotent when the on-disk stamp already - // matches this binary. See `db/manifest/migrations.rs`. 
- migrate_internal_schema(&mut dataset).await?; - let registered_tables = read_registered_table_locations(&dataset).await?; - let existing_entries = read_manifest_entries(&dataset).await?; - let existing_versions = existing_entries + // matches this binary. Pass this publisher's branch so the v3→v4 lineage + // backfill reads `_graph_commits.lance` at the SAME branch it is + // publishing to (each branch backfills on its first write). See + // `db/manifest/migrations.rs`. + migrate_internal_schema(&mut dataset, &self.root_uri, self.branch.as_deref()).await?; + // ONE `__manifest` scan for everything the publish needs: table + // locations, version entries, tombstones, AND the `graph_commit` lineage + // rows for parent resolution (RFC-013 P2). The lineage extraction rides + // this pass instead of a second `read_graph_lineage` scan in + // `resolve_lineage_rows`; the per-attempt re-read is preserved because + // `load_publish_state` runs once per CAS attempt, so a retry sees the + // advanced head and re-parents correctly. + let scan = read_publish_scan(&dataset).await?; + let existing_versions = scan + .version_entries .iter() .map(|entry| { ( @@ -114,13 +171,14 @@ impl GraphNamespacePublisher { ) }) .collect(); - let existing_tombstones = read_tombstone_versions(&dataset).await?; - Ok(( + let existing_tombstones = scan.tombstones.into_iter().collect(); + Ok(LoadedPublishState { dataset, - registered_tables, + registered_tables: scan.table_locations, existing_versions, existing_tombstones, - )) + lineage_rows: scan.lineage_rows, + }) } fn build_pending_rows( @@ -266,6 +324,50 @@ impl GraphNamespacePublisher { Ok(rows) } + /// Resolve the parent for `intent` against the just-loaded `dataset` and + /// build the two lineage rows (`graph_commit` + `graph_head:`) to + /// fold into the publish batch. 
Runs INSIDE the CAS retry loop, so the + /// parent is read from the manifest state this attempt will commit against — + /// a retry after a concurrent commit re-reads the advanced head and parents + /// correctly (TOCTOU closed). `new_manifest_version` is the version this + /// publish produces (the recorded commit pins it). + /// + /// The parent is the current head of the branch's lineage — the + /// `should_replace_head` winner over the visible `graph_commit` rows, the + /// same selection the commit-graph cache uses. (The denormalized + /// `graph_head:` row is written for forward-compat but is not the + /// parent source here: a branch freshly forked from main inherits main's + /// commits but not yet a `graph_head:` row, and the head-over-rows + /// computation gives the correct fork-point parent in that case.) + /// + /// `lineage_rows` is the `graph_commit` set this attempt already parsed in + /// `load_publish_state`'s single scan (RFC-013 P2) — NOT a fresh + /// `read_graph_lineage` scan. The per-attempt re-read is still preserved: the + /// retry loop re-runs `load_publish_state`, so each attempt's `lineage_rows` + /// reflects the head as it stands for that attempt. 
+ fn resolve_lineage_rows( + lineage_rows: &[GraphLineageRow], + intent: &LineageIntent, + new_manifest_version: u64, + ) -> Result<(Vec, Option)> { + let parent_commit_id = head_lineage_row(lineage_rows).map(|h| h.graph_commit_id.clone()); + + let commit = GraphLineageRow { + graph_commit_id: intent.graph_commit_id.clone(), + manifest_branch: intent.branch.clone(), + manifest_version: new_manifest_version, + parent_commit_id: parent_commit_id.clone(), + merged_parent_commit_id: intent.merged_parent_commit_id.clone(), + actor_id: intent.actor_id.clone(), + created_at: intent.created_at, + }; + let parts = graph_lineage_row_parts(&commit, intent.branch.as_deref())?; + Ok(( + parts.into_iter().map(lineage_part_to_pending).collect(), + parent_commit_id, + )) + } + fn pending_rows_to_batch(rows: Vec) -> Result { let mut object_ids = Vec::with_capacity(rows.len()); let mut object_types = Vec::with_capacity(rows.len()); @@ -420,7 +522,25 @@ impl GraphNamespacePublisher { })) }) .collect::>>()?; - self.publish(&changes, &HashMap::new()).await + Ok(self.publish(&changes, &HashMap::new(), None).await?.dataset) + } +} + +/// Map a `state::GraphLineageRowPart` onto a `PendingVersionRow` so a graph +/// commit's two lineage rows ride the same publish batch as the table-version +/// rows (RFC-013 Phase 7). Lineage rows carry no table identity: `table_key` is +/// the empty string (never matched by a real key) and `location`/`row_count` +/// are null. +fn lineage_part_to_pending(part: GraphLineageRowPart) -> PendingVersionRow { + PendingVersionRow { + object_id: part.object_id, + object_type: part.object_type.to_string(), + location: None, + metadata: Some(part.metadata), + table_key: String::new(), + table_version: part.table_version, + table_branch: part.table_branch, + row_count: None, } } @@ -429,7 +549,17 @@ impl GraphNamespacePublisher { /// merge-insert join key, annotated as an unenforced primary key on /// `__manifest`). 
Translate it to a typed manifest conflict so callers can /// match without parsing strings; everything else is opaque storage. -fn map_lance_publish_error(err: LanceError) -> OmniError { +/// +/// Shared (`pub(crate)`) with the v3→v4 lineage backfill +/// (`state::merge_lineage_rows`), which issues its own `__manifest` merge-insert +/// outside the publisher and must surface the SAME typed +/// `RowLevelCasContention` so the migration's re-open retry loop can recognize a +/// CAS loss. This is the merge-insert (`execute_reader`) conflict vocabulary +/// only. It is deliberately NOT `optimize::is_retryable_lance_conflict`: that one +/// also matches `CommitConflict`/`RetryableCommitConflict` from the COMPACTION +/// commit path (`compact_files` -> `apply_commit`), which a row-level merge-insert +/// never emits — folding it in here would match impossible variants. +pub(crate) fn map_lance_publish_error(err: LanceError) -> OmniError { if matches!(err, LanceError::TooMuchWriteContention { .. }) { return OmniError::manifest_row_level_cas_contention(format!( "manifest publish lost a row-level CAS race: {}", @@ -445,14 +575,40 @@ impl ManifestBatchPublisher for GraphNamespacePublisher { &self, changes: &[ManifestChange], expected_table_versions: &HashMap, - ) -> Result { - if changes.is_empty() && expected_table_versions.is_empty() { - return self.dataset().await; + lineage: Option<&LineageIntent>, + ) -> Result { + if changes.is_empty() && expected_table_versions.is_empty() && lineage.is_none() { + return Ok(PublishOutcome { + dataset: self.dataset().await?, + parent_commit_id: None, + }); } for attempt in 0..=PUBLISHER_RETRY_BUDGET { - let (dataset, known_tables, existing_versions, existing_tombstones) = - self.load_publish_state().await?; + // `load_publish_state` runs the v3→v4 migration (`migrate_internal_schema`) + // on its first scan. 
The migration's bounded merge/stamp retries surface a + // retryable `RowLevelCasContention` on exhaustion EXPECTING this outer loop + // to re-run them — a re-run re-reads the manifest, by which point a + // concurrent winner has usually completed the migration (next scan is a + // no-op). Route a retryable load error through the SAME retry path as a + // retryable `merge_rows` conflict below, so that typed contention actually + // composes with the publisher retry instead of aborting the publish. + let loaded = match self.load_publish_state().await { + Ok(loaded) => loaded, + Err(err) + if attempt < PUBLISHER_RETRY_BUDGET && is_retryable_publish_conflict(&err) => + { + continue; + } + Err(err) => return Err(err), + }; + let LoadedPublishState { + dataset, + registered_tables: known_tables, + existing_versions, + existing_tombstones, + lineage_rows, + } = loaded; let latest_per_table = Self::latest_visible_per_table(&existing_versions, &existing_tombstones); @@ -461,19 +617,48 @@ impl ManifestBatchPublisher for GraphNamespacePublisher { // surfaced as `ExpectedVersionMismatch` rather than retried. Self::check_expected_table_versions(&latest_per_table, expected_table_versions)?; - if changes.is_empty() { - return Ok(dataset); - } - - let rows = Self::build_pending_rows( + let mut rows = Self::build_pending_rows( changes, &known_tables, &existing_versions, &existing_tombstones, )?; + // Fold the graph commit into the SAME batch so table-version rows + // and lineage rows land in one merge-insert (one Lance commit, one + // manifest version) — no separate write, no manifest→commit-graph + // atomicity gap. The merge-insert advances exactly one version on + // top of the loaded dataset, so the commit pins + // `current + 1`. The parent is resolved here, per attempt, from the + // lineage rows THIS attempt's scan loaded (TOCTOU closed on a CAS + // retry — a retry re-runs `load_publish_state` → fresh lineage). 
+ let parent_commit_id = match lineage { + Some(intent) => { + let new_manifest_version = dataset.version().version + 1; + let (commit_rows, parent) = + Self::resolve_lineage_rows(&lineage_rows, intent, new_manifest_version)?; + rows.extend(commit_rows); + parent + } + None => None, + }; + + if rows.is_empty() { + // Expected-version-only publish with no changes and no lineage: + // the precondition held, nothing to write. + return Ok(PublishOutcome { + dataset, + parent_commit_id, + }); + } + match self.merge_rows(dataset, rows).await { - Ok(new_dataset) => return Ok(new_dataset), + Ok(new_dataset) => { + return Ok(PublishOutcome { + dataset: new_dataset, + parent_commit_id, + }); + } Err(err) => { if attempt < PUBLISHER_RETRY_BUDGET && is_retryable_publish_conflict(&err) { continue; @@ -497,7 +682,12 @@ impl ManifestBatchPublisher for GraphNamespacePublisher { /// contention; if the caller's `expected_table_versions` still holds against /// the new manifest state, we re-attempt. Other conflict variants (notably /// `ExpectedVersionMismatch`) propagate so the caller learns immediately. -fn is_retryable_publish_conflict(err: &OmniError) -> bool { +/// +/// Shared (`pub(crate)`) with the v3→v4 lineage backfill's re-open retry loop +/// (`migrations::migrate_v3_to_v4`), so the migration's retry decision matches the +/// publisher's by construction — both retry exactly `RowLevelCasContention` and +/// propagate everything else. 
+pub(crate) fn is_retryable_publish_conflict(err: &OmniError) -> bool { matches!( err, OmniError::Manifest(m) diff --git a/crates/omnigraph/src/db/manifest/recovery.rs b/crates/omnigraph/src/db/manifest/recovery.rs index d21e0fd..c32c493 100644 --- a/crates/omnigraph/src/db/manifest/recovery.rs +++ b/crates/omnigraph/src/db/manifest/recovery.rs @@ -40,17 +40,14 @@ use lance::Dataset; use serde::{Deserialize, Serialize}; use tracing::warn; -use crate::db::commit_graph::CommitGraph; use crate::db::graph_coordinator::GraphCoordinator; -use crate::db::recovery_audit::{ - RecoveryAudit, RecoveryAuditRecord, RecoveryKind, TableOutcome, now_micros, -}; +use crate::db::recovery_audit::{RecoveryAudit, RecoveryAuditRecord, RecoveryKind, TableOutcome}; use crate::db::schema_state::SchemaStateRecovery; use crate::error::{OmniError, Result}; use crate::storage::StorageAdapter; use super::Snapshot; -use super::publisher::{GraphNamespacePublisher, ManifestBatchPublisher}; +use super::publisher::{GraphNamespacePublisher, LineageIntent, ManifestBatchPublisher}; use super::{ManifestChange, SubTableUpdate, TableRegistration, TableTombstone}; /// System actor identifier recorded on every recovery commit. Operators @@ -59,6 +56,44 @@ use super::{ManifestChange, SubTableUpdate, TableRegistration, TableTombstone}; /// into the audit row's `recovery_for_actor` field. pub(crate) const RECOVERY_ACTOR: &str = "omnigraph:recovery"; +/// Publish a recovery action's manifest `updates` AND its recovery commit in one +/// CAS (RFC-013 Phase 7). The recovery commit's lineage (`graph_commit` + +/// `graph_head`) rides the same merge-insert as the table-version re-pin — there +/// is no separate `_graph_commits.lance` write and no manifest→commit-graph gap. +/// `updates` is empty for the no-table-change recovery paths (all-NoMovement +/// roll-back, stale-sidecar cleanup, orphaned-branch discard); the lineage rows +/// still publish, so the recovery commit is always durable. 
+/// +/// The commit's first parent is resolved by the publisher (the live head of the +/// recovery's branch); its merged-in parent is the sidecar's recorded source +/// head for a rolled-forward branch merge, matching the pre-Phase-7 merge-commit +/// shape. Returns the new manifest version and the minted recovery commit id +/// (which the audit row references). +async fn publish_recovery_commit( + root_uri: &str, + sidecar: &RecoverySidecar, + kind: RecoveryKind, + updates: &[ManifestChange], + expected: &HashMap, +) -> Result<(u64, String)> { + let merged_parent_commit_id = match (sidecar.writer_kind, kind) { + (SidecarKind::BranchMerge, RecoveryKind::RolledForward) => { + sidecar.merge_source_commit_id.clone() + } + _ => None, + }; + let intent = LineageIntent { + graph_commit_id: ulid::Ulid::new().to_string(), + branch: sidecar.branch.clone(), + actor_id: Some(RECOVERY_ACTOR.to_string()), + merged_parent_commit_id, + created_at: crate::db::now_micros()?, + }; + let publisher = GraphNamespacePublisher::new(root_uri, sidecar.branch.as_deref()); + let outcome = publisher.publish(updates, expected, Some(&intent)).await?; + Ok((outcome.dataset.version().version, intent.graph_commit_id)) +} + /// Subdirectory under the graph root holding sidecar files. pub(crate) const RECOVERY_DIR_NAME: &str = "__recovery"; @@ -416,7 +451,7 @@ pub(crate) async fn write_sidecar( ) -> Result { // Failpoint: models a storage put failure (S3 PutObject / fs write) // in Phase A — every writer must abort before any HEAD advance. 
- crate::failpoints::maybe_fail("recovery.sidecar_write")?; + crate::failpoints::maybe_fail(crate::failpoints::names::RECOVERY_SIDECAR_WRITE)?; debug_assert_eq!(sidecar.schema_version, SIDECAR_SCHEMA_VERSION); let uri = sidecar_uri(root_uri, &sidecar.operation_id); let json = serde_json::to_string_pretty(sidecar).map_err(|err| { @@ -457,7 +492,7 @@ pub(crate) async fn confirm_sidecar_phase_b( ) -> Result<()> { // Failpoint: models a storage failure on the confirmation write — the // pre-confirm sidecar stays on disk, so recovery rolls the operation back. - crate::failpoints::maybe_fail("recovery.sidecar_confirm")?; + crate::failpoints::maybe_fail(crate::failpoints::names::RECOVERY_SIDECAR_CONFIRM)?; for pin in &mut sidecar.tables { // Every pinned table MUST have an achieved version. A miss means the // pin set and the publish `updates` diverged — fail loudly at the @@ -489,7 +524,7 @@ pub(crate) async fn delete_sidecar( // Failpoint: models a storage delete failure (S3 DeleteObject) in // Phase D — callers swallow it (the write already published) and the // stale sidecar is healed by the next write or open. - crate::failpoints::maybe_fail("recovery.sidecar_delete")?; + crate::failpoints::maybe_fail(crate::failpoints::names::RECOVERY_SIDECAR_DELETE)?; storage.delete(&handle.sidecar_uri).await } @@ -507,7 +542,7 @@ pub(crate) async fn list_sidecars( // Failpoint: models a storage list failure (S3 ListObjectsV2) — every // consumer (open-time sweep, write-entry heal) must fail loudly // rather than silently skipping recovery. - crate::failpoints::maybe_fail("recovery.sidecar_list")?; + crate::failpoints::maybe_fail(crate::failpoints::names::RECOVERY_SIDECAR_LIST)?; let dir = recovery_dir_uri(root_uri); let mut uris = storage.list_dir(&dir).await?; // Sort by URI so the sweep processes sidecars deterministically. 
@@ -831,20 +866,13 @@ pub(crate) async fn heal_pending_sidecars_roll_forward( // authority) BEFORE opening: a deferred sidecar whose // branch was deleted would otherwise wedge every write // on the dead-branch open. - let (branch_exists, main_version) = { + let branch_exists = { let mut coord = coordinator.write().await; coord.refresh().await?; - let exists = coord.all_branches().await?.iter().any(|name| name == b); - (exists, coord.snapshot().version()) + coord.all_branches().await?.iter().any(|name| name == b) }; if !branch_exists { - discard_orphaned_branch_sidecar( - root_uri, - storage.as_ref(), - &sidecar, - main_version, - ) - .await?; + discard_orphaned_branch_sidecar(root_uri, storage.as_ref(), &sidecar).await?; processed_any = true; continue; } @@ -862,7 +890,7 @@ pub(crate) async fn heal_pending_sidecars_roll_forward( }; if process_sidecar( root_uri, - storage.as_ref(), + &storage, &branch_snapshot, &sidecar, RecoveryMode::RollForwardOnly, @@ -893,7 +921,6 @@ async fn discard_orphaned_branch_sidecar( root_uri: &str, storage: &dyn StorageAdapter, sidecar: &RecoverySidecar, - manifest_version: u64, ) -> Result<()> { warn!( operation_id = sidecar.operation_id.as_str(), @@ -922,22 +949,31 @@ async fn discard_orphaned_branch_sidecar( && record.recovery_kind == RecoveryKind::OrphanedBranchDiscarded }); if !already_recorded { - let mut graph = CommitGraph::open(root_uri).await?; - let graph_commit_id = graph - .append_commit(None, manifest_version, Some(RECOVERY_ACTOR)) - .await?; - // Failpoint: the residual window above — commit appended, audit + // The orphan-discard commit is recorded on MAIN (the sidecar's own + // branch is gone), via a lineage-only publish into `__manifest` (RFC-013 + // Phase 7) — no `_graph_commits.lance` row. The publisher stamps the + // commit at the version it produces. 
+ let intent = LineageIntent { + graph_commit_id: ulid::Ulid::new().to_string(), + branch: None, + actor_id: Some(RECOVERY_ACTOR.to_string()), + merged_parent_commit_id: None, + created_at: crate::db::now_micros()?, + }; + let publisher = GraphNamespacePublisher::new(root_uri, None); + publisher.publish(&[], &HashMap::new(), Some(&intent)).await?; + // Failpoint: the residual window above — commit published, audit // not yet durable. - crate::failpoints::maybe_fail("recovery.orphan_discard_audit_append")?; + crate::failpoints::maybe_fail(crate::failpoints::names::RECOVERY_ORPHAN_DISCARD_AUDIT_APPEND)?; audit .append(RecoveryAuditRecord { - graph_commit_id, + graph_commit_id: intent.graph_commit_id, recovery_kind: RecoveryKind::OrphanedBranchDiscarded, recovery_for_actor: sidecar.actor_id.clone(), operation_id: sidecar.operation_id.clone(), sidecar_writer_kind: format!("{:?}", sidecar.writer_kind), per_table_outcomes: Vec::new(), - created_at: now_micros()?, + created_at: crate::db::now_micros()?, }) .await?; } @@ -1014,13 +1050,7 @@ pub(crate) async fn recover_manifest_drift( .iter() .any(|name| name == b) { - discard_orphaned_branch_sidecar( - root_uri, - storage.as_ref(), - &sidecar, - coordinator.snapshot().version(), - ) - .await?; + discard_orphaned_branch_sidecar(root_uri, storage.as_ref(), &sidecar).await?; continue; } let mut branch_coord = @@ -1036,7 +1066,7 @@ pub(crate) async fn recover_manifest_drift( }; process_sidecar( root_uri, - storage.as_ref(), + &storage, &branch_snapshot, &sidecar, mode, @@ -1051,7 +1081,7 @@ pub(crate) async fn recover_manifest_drift( async fn process_sidecar( root_uri: &str, - storage: &dyn StorageAdapter, + storage: &std::sync::Arc, snapshot: &Snapshot, sidecar: &RecoverySidecar, mode: RecoveryMode, @@ -1154,7 +1184,7 @@ async fn process_sidecar( ); } return record_audit_recovery_rollforward( - root_uri, storage, snapshot, sidecar, &states, + root_uri, storage.as_ref(), sidecar, &states, ) .await .map(|()| true); @@ -1176,7 
+1206,7 @@ async fn process_sidecar( writer_kind = ?sidecar.writer_kind, "recovery: rolling back sidecar (mixed or unexpected state)" ); - roll_back_sidecar(root_uri, storage, snapshot, sidecar, &states) + roll_back_sidecar(root_uri, storage.as_ref(), sidecar, &states) .await .map(|()| true) } @@ -1191,7 +1221,7 @@ async fn process_sidecar( "recovery: rolling back SchemaApply sidecar because schema staging \ files were not promoted in this recovery pass" ); - roll_back_sidecar(root_uri, storage, snapshot, sidecar, &states) + roll_back_sidecar(root_uri, storage.as_ref(), sidecar, &states) .await .map(|()| true) } @@ -1211,8 +1241,36 @@ async fn process_sidecar( "recovery: rolling forward sidecar (Phase B completed; \ Phase C did not land)" ); - let (new_manifest_version, published_versions) = - roll_forward_all(root_uri, sidecar, &states, snapshot).await?; + // TOCTOU window: between `classify_table` (which read the manifest + // pin) and the publish CAS below, a concurrent live writer can + // advance the manifest past our expected version. The failpoint + // lets a test force that interleave deterministically. + crate::failpoints::maybe_fail( + crate::failpoints::names::RECOVERY_BEFORE_ROLL_FORWARD_PUBLISH, + )?; + // RFC-013 Phase 7: `roll_forward_all` folds the recovery commit into the + // manifest publish CAS, so it also returns the minted `graph_commit_id` + // for the audit row below. + let (new_manifest_version, published_versions, graph_commit_id) = + match roll_forward_all(root_uri, sidecar, &states, snapshot).await { + Ok(published) => published, + // Convergence-idempotent (invariants 7 & 15): a roll-forward's + // postcondition is "the manifest reflects the sidecar's committed + // Lance state", NOT "this sweep personally won the CAS". 
A + // concurrent writer that advanced the manifest to/past that goal + // during the classify→publish window is convergence, not a logical + // conflict — so re-read and either record the already-achieved + // roll-forward or defer to the next pass; never fail the open. + // Any other error still propagates. + Err(err) if is_expected_version_mismatch(&err) => { + return converge_or_defer_roll_forward( + root_uri, storage, sidecar, &states, err, + ) + .await; + } + Err(err) => return Err(err), + }; + let _ = new_manifest_version; // `to_version` records the ACTUAL Lance HEAD published for // each table (not pin.post_commit_pin, which is a lower bound // for loose-match writers like SchemaApply / EnsureIndices / @@ -1242,17 +1300,214 @@ async fn process_sidecar( record_audit( root_uri, sidecar, - new_manifest_version, + graph_commit_id, RecoveryKind::RolledForward, outcomes, ) .await?; - delete_sidecar_by_operation_id(root_uri, storage, &sidecar.operation_id).await?; + delete_sidecar_by_operation_id(root_uri, storage.as_ref(), &sidecar.operation_id) + .await?; Ok(true) } } } +/// True if `err` is the publisher's per-table CAS precondition failure +/// (`ExpectedVersionMismatch`) — the signal that a concurrent writer advanced +/// the manifest past what this caller expected. +fn is_expected_version_mismatch(err: &OmniError) -> bool { + matches!( + err, + OmniError::Manifest(m) + if matches!( + m.details, + Some(crate::error::ManifestConflictDetails::ExpectedVersionMismatch { .. }) + ) + ) +} + +/// Whether the live manifest already reflects everything this sidecar intended +/// to publish. +/// +/// SOUNDNESS: the per-table test is `current_version >= observed lance_head`, a +/// *proxy* for "the sidecar's committed Lance commit is an ancestor of the +/// published HEAD" (so a higher version is a descendant that contains it). 
The +/// proxy is sound only because of the heal-first invariant: every writer that +/// can advance a table's manifest version first heals pending sidecars +/// (`heal_pending_recovery_sidecars` runs at the head of `load`/`mutate`/ +/// schema-apply/branch-merge) or refuses on an unrecovered graph (`optimize`). +/// So the only path past `expected_version` is one that first publishes THIS +/// sidecar's commit at `lance_head` — version ordering then implies lineage +/// containment. A future writer that advances a pinned table WITHOUT healing +/// first (e.g. a non-heal-first `Overwrite` that replaces rows) would void this +/// proxy and must be re-validated by row-id lineage, not version ordering. +/// Added tables must be registered; tombstoned tables must be gone. +fn sidecar_intent_satisfied( + snapshot: &Snapshot, + sidecar: &RecoverySidecar, + states: &[ClassifiedTable], +) -> bool { + for (pin, state) in sidecar.tables.iter().zip(states.iter()) { + let current = snapshot + .entry(&pin.table_key) + .map(|e| e.table_version) + .unwrap_or(0); + if current < state.lance_head { + return false; + } + } + for reg in &sidecar.additional_registrations { + if snapshot.entry(&reg.table_key).is_none() { + return false; + } + } + for tomb in &sidecar.tombstones { + if snapshot.entry(&tomb.table_key).is_some() { + return false; + } + } + true +} + +/// Re-read the live manifest snapshot for the sidecar's branch. +async fn fresh_snapshot_for_sidecar( + root_uri: &str, + storage: &std::sync::Arc, + sidecar: &RecoverySidecar, +) -> Result { + let mut coordinator = match sidecar.branch.as_deref() { + Some(branch) if branch != "main" => { + GraphCoordinator::open_branch(root_uri, branch, std::sync::Arc::clone(storage)).await?
+ } + _ => GraphCoordinator::open(root_uri, std::sync::Arc::clone(storage)).await?, + }; + coordinator.refresh().await?; + Ok(coordinator.snapshot()) +} + +/// Convergence-idempotent handling of a roll-forward publish CAS that lost to a +/// concurrent writer (`ExpectedVersionMismatch`). A roll-forward's postcondition +/// is "the manifest reflects the sidecar's committed Lance state", not "this +/// sweep won the CAS" (invariants 7 & 15). Re-read the live manifest: +/// +/// - if it already reached the sidecar's goal, the work is done (just not by us) +/// — record the `RolledForward` audit and delete the sidecar idempotently; +/// - otherwise the manifest is progressing but not yet at the goal — leave the +/// sidecar for the next open / the live writer's own Phase D. +/// +/// Either way the open does NOT fail. A genuine logical conflict (a table below +/// `expected_version`, i.e. data lost) is not satisfiable here and re-surfaces +/// loudly via the classifier's `InvariantViolation` on the next pass. +/// See iss-schema-apply-reopen-recovery-race. +async fn converge_or_defer_roll_forward( + root_uri: &str, + storage: &std::sync::Arc, + sidecar: &RecoverySidecar, + states: &[ClassifiedTable], + conflict: OmniError, +) -> Result { + let fresh = fresh_snapshot_for_sidecar(root_uri, storage, sidecar).await?; + if !sidecar_intent_satisfied(&fresh, sidecar, states) { + warn!( + operation_id = sidecar.operation_id.as_str(), + writer_kind = ?sidecar.writer_kind, + "recovery: roll-forward publish lost a CAS and the manifest has not \ + yet reached the sidecar's goal; deferring to the next pass \ + (conflict: {conflict})" + ); + return Ok(false); + } + // The manifest already reached the sidecar's goal — some other actor + // advanced it. Under the heal-first invariant, whoever advanced past + // `expected_version` first healed THIS sidecar (recorded its RolledForward + // audit and deleted it). 
So the audit row already exists; recording another + // here would put two RolledForward rows in `_graph_commit_recoveries` for + // one recovery event (visible in `commit list --filter actor=…recovery`). + // Only finish the bookkeeping if the sidecar is still on disk (the winner + // crashed between audit and delete); if it is already gone, the winner + // completed it — return success WITHOUT a duplicate audit, keeping the + // audit append-idempotent per operation_id across concurrent sweeps. + let sidecar_path = sidecar_uri(root_uri, &sidecar.operation_id); + if !storage.exists(&sidecar_path).await? { + warn!( + operation_id = sidecar.operation_id.as_str(), + writer_kind = ?sidecar.writer_kind, + "recovery: roll-forward publish lost a CAS; the winner already \ + converged and cleaned up this sidecar — nothing to do" + ); + return Ok(true); + } + warn!( + operation_id = sidecar.operation_id.as_str(), + writer_kind = ?sidecar.writer_kind, + "recovery: roll-forward publish lost a CAS to a concurrent writer that \ + already reached the goal; converging (RolledForward audit + delete)" + ); + let mut outcomes: Vec = sidecar + .tables + .iter() + .map(|pin| TableOutcome { + table_key: pin.table_key.clone(), + from_version: pin.expected_version, + to_version: fresh + .entry(&pin.table_key) + .map(|e| e.table_version) + .unwrap_or(pin.post_commit_pin), + }) + .collect(); + // Mirror the normal roll-forward audit shape: SchemaApply sidecars also + // register added tables, so the audit must list them too (else a converge + // audit row is incomplete vs the `roll_forward_all` path for the same + // recovery kind). 
+ for reg in &sidecar.additional_registrations { + outcomes.push(TableOutcome { + table_key: reg.table_key.clone(), + from_version: 0, + to_version: fresh + .entry(&reg.table_key) + .map(|e| e.table_version) + .unwrap_or(0), + }); + } + // RFC-013 Phase 7: the winning writer folded its recovery commit into the + // manifest CAS, so the converge audit references THAT commit. We lost the CAS + // and never minted it, but a recovery commit is distinguishable by its + // `RECOVERY_ACTOR` authorship (`publish_recovery_commit`), so the latest + // recovery-actored commit on this branch IS it. Do NOT use the branch head: + // a concurrent USER write can advance `graph_head` past the recovery commit + // between the winner's publish and this read, which would attribute the audit + // row to the wrong (later, user) commit. (We only reach here with the sidecar + // still on disk: the winner advanced the manifest but crashed before its own + // audit+delete, so we finish its bookkeeping.) + let cache = match sidecar.branch.as_deref() { + Some(branch) => { + crate::db::commit_graph::CommitGraph::open_at_branch(root_uri, branch).await? + } + None => crate::db::commit_graph::CommitGraph::open(root_uri).await?, + }; + let converged_commit_id = match cache + .load_commits() + .await? + .into_iter() + .rfind(|c| c.actor_id.as_deref() == Some(RECOVERY_ACTOR)) + { + Some(recovery_commit) => recovery_commit.graph_commit_id, + // No recovery commit visible — unexpected on this path (the winner just + // published one); fall back to the head rather than an empty id.
+ None => cache.head_commit_id().await?.unwrap_or_default(), + }; + record_audit( + root_uri, + sidecar, + converged_commit_id, + RecoveryKind::RolledForward, + outcomes, + ) + .await?; + delete_sidecar_by_operation_id(root_uri, storage.as_ref(), &sidecar.operation_id).await?; + Ok(true) +} + #[derive(Debug, Clone, Copy)] struct ClassifiedTable { classification: TableClassification, @@ -1268,7 +1523,6 @@ struct ClassifiedTable { async fn roll_back_sidecar( root_uri: &str, storage: &dyn StorageAdapter, - snapshot: &Snapshot, sidecar: &RecoverySidecar, states: &[ClassifiedTable], ) -> Result<()> { @@ -1328,23 +1582,18 @@ async fn roll_back_sidecar( }); } } - // Publish the restored HEADs so manifest == HEAD. A degenerate all-NoMovement - // roll-back restores nothing — there's nothing to publish, and the audit - // records the unchanged snapshot version. - let manifest_version = if updates.is_empty() { - snapshot.version() - } else { - let publisher = GraphNamespacePublisher::new(root_uri, sidecar.branch.as_deref()); - publisher - .publish(&updates, &expected) - .await? - .version() - .version - }; + // Publish the restored HEADs so manifest == HEAD AND record the recovery + // commit in the same CAS (RFC-013 Phase 7). A degenerate all-NoMovement + // roll-back restores no table — `updates` is empty — but the recovery commit + // lineage still publishes (a lineage-only merge), so the rollback is recorded + // in the commit history just like a roll-forward. 
+ let (_manifest_version, graph_commit_id) = + publish_recovery_commit(root_uri, sidecar, RecoveryKind::RolledBack, &updates, &expected) + .await?; record_audit( root_uri, sidecar, - manifest_version, + graph_commit_id, RecoveryKind::RolledBack, outcomes, ) @@ -1370,7 +1619,6 @@ async fn roll_back_sidecar( async fn record_audit_recovery_rollforward( root_uri: &str, storage: &dyn StorageAdapter, - snapshot: &Snapshot, sidecar: &RecoverySidecar, states: &[ClassifiedTable], ) -> Result<()> { @@ -1384,10 +1632,22 @@ async fn record_audit_recovery_rollforward( to_version: state.manifest_pinned, }) .collect(); + // The substrate is already in the post-roll-forward state (the prior pass's + // table re-pin landed), so there are no table `updates` — but a recovery + // commit is still recorded for this cleanup pass via a lineage-only publish + // (RFC-013 Phase 7), which the audit row references. + let (_manifest_version, graph_commit_id) = publish_recovery_commit( + root_uri, + sidecar, + RecoveryKind::RolledForward, + &[], + &HashMap::new(), + ) + .await?; record_audit( root_uri, sidecar, - snapshot.version(), + graph_commit_id, RecoveryKind::RolledForward, outcomes, ) @@ -1407,17 +1667,19 @@ async fn record_audit_recovery_rollforward( /// contention; persistent contention surfaces the typed conflict error to /// the recovery sweep, which leaves the sidecar in place for the next /// open's retry. -/// Returns `(new_manifest_version, per_table_published_versions)`. The -/// per-table map is what the audit row's `to_version` should record — -/// for loose-match writers the actual Lance HEAD can be higher than the -/// sidecar's `post_commit_pin` (which is a lower bound), so the pin is -/// the wrong source of truth for an operator-facing audit field. +/// Returns `(new_manifest_version, per_table_published_versions, +/// recovery_commit_id)`. 
The per-table map is what the audit row's `to_version` +/// should record — for loose-match writers the actual Lance HEAD can be higher +/// than the sidecar's `post_commit_pin` (which is a lower bound), so the pin is +/// the wrong source of truth for an operator-facing audit field. The recovery +/// commit id is the `graph_commit` folded into the publish CAS (RFC-013 +/// Phase 7), which the audit row references. async fn roll_forward_all( root_uri: &str, sidecar: &RecoverySidecar, states: &[ClassifiedTable], snapshot: &Snapshot, -) -> Result<(u64, HashMap)> { +) -> Result<(u64, HashMap, String)> { let total_changes = sidecar.tables.len() + sidecar.additional_registrations.len() + sidecar.tombstones.len(); let mut updates: Vec = Vec::with_capacity(total_changes); @@ -1528,9 +1790,10 @@ async fn roll_forward_all( ); } - let publisher = GraphNamespacePublisher::new(root_uri, sidecar.branch.as_deref()); - let new_dataset = publisher.publish(&updates, &expected).await?; - Ok((new_dataset.version().version, published_versions)) + let (new_manifest_version, graph_commit_id) = + publish_recovery_commit(root_uri, sidecar, RecoveryKind::RolledForward, &updates, &expected) + .await?; + Ok((new_manifest_version, published_versions, graph_commit_id)) } /// Open `table_path` at its branch HEAD, read the current Lance HEAD version, @@ -1600,62 +1863,27 @@ async fn push_table_update( Ok(published_version) } -/// Append the audit row describing this recovery action. +/// Append the audit row describing this recovery action (RFC-013 Phase 7). /// -/// Two-part write: (a) `_graph_commits.lance` row anchored on the recovery -/// actor (`omnigraph:recovery`); (b) `_graph_commit_recoveries.lance` row -/// linking back to (a) and naming the original actor + per-table outcomes. -/// Same not-atomic-pair-write shape as the existing `_graph_commits` -/// + `_graph_commit_actors` split — a crash between the two leaves an -/// orphan commit row with no audit row. 
The recovery sweep tolerates this: -/// on re-entry the classifier surfaces `NoMovement` for already-restored / -/// already-published tables, the action is a no-op, and the audit append -/// is retried. +/// The recovery COMMIT (`graph_commit` + `graph_head`) was already recorded +/// durably in `__manifest` by `publish_recovery_commit` (folded into the same +/// CAS as the table re-pin), so this only writes the `_graph_commit_recoveries` +/// row, referencing that commit by `graph_commit_id`. A crash between the +/// recovery publish and this audit append leaves a recovery commit with no audit +/// row — the same not-atomic-pair-write shape as before; the sweep tolerates it +/// (on re-entry the classifier surfaces `NoMovement`, the action is a no-op, and +/// the audit append is retried, minting a fresh recovery commit). async fn record_audit( root_uri: &str, sidecar: &RecoverySidecar, - manifest_version: u64, + graph_commit_id: String, kind: RecoveryKind, outcomes: Vec, ) -> Result<()> { // Failpoint: models an audit write failure after the roll-forward / - // roll-back publish already landed — the sweep aborts, the sidecar - // stays, and re-entry records the audit row (see the retry note in - // the doc comment above). - crate::failpoints::maybe_fail("recovery.record_audit")?; - // Non-main recovery commits must be appended on the sidecar branch's - // commit graph, otherwise parent_commit_id comes from the global - // main head. BranchMerge additionally records the source branch's - // HEAD as merged_parent_commit_id so future merges between the same - // pair recognize "already up-to-date". 
- let target_branch = sidecar.branch.as_deref(); - let mut graph = match target_branch { - Some(branch) => CommitGraph::open_at_branch(root_uri, branch).await?, - None => CommitGraph::open(root_uri).await?, - }; - let graph_commit_id = match ( - sidecar.writer_kind, - sidecar.merge_source_commit_id.as_deref(), - kind, - ) { - (SidecarKind::BranchMerge, Some(source_id), RecoveryKind::RolledForward) => { - let parent_commit_id = graph.head_commit_id().await?.unwrap_or_default(); - graph - .append_merge_commit( - target_branch, - manifest_version, - &parent_commit_id, - source_id, - Some(RECOVERY_ACTOR), - ) - .await? - } - _ => { - graph - .append_commit(target_branch, manifest_version, Some(RECOVERY_ACTOR)) - .await? - } - }; + // roll-back publish (with its folded-in recovery commit) already landed — + // the sweep aborts, the sidecar stays, and re-entry records the audit row. + crate::failpoints::maybe_fail(crate::failpoints::names::RECOVERY_RECORD_AUDIT)?; let mut audit = RecoveryAudit::open(root_uri).await?; audit .append(RecoveryAuditRecord { @@ -1665,7 +1893,7 @@ async fn record_audit( operation_id: sidecar.operation_id.clone(), sidecar_writer_kind: format!("{:?}", sidecar.writer_kind), per_table_outcomes: outcomes, - created_at: now_micros()?, + created_at: crate::db::now_micros()?, }) .await?; Ok(()) diff --git a/crates/omnigraph/src/db/manifest/state.rs b/crates/omnigraph/src/db/manifest/state.rs index e222ede..4fbbde3 100644 --- a/crates/omnigraph/src/db/manifest/state.rs +++ b/crates/omnigraph/src/db/manifest/state.rs @@ -10,7 +10,10 @@ use crate::error::{OmniError, Result}; use super::layout::version_object_id; use super::metadata::TableVersionMetadata; -use super::{OBJECT_TYPE_TABLE, OBJECT_TYPE_TABLE_TOMBSTONE, OBJECT_TYPE_TABLE_VERSION}; +use super::{ + MAIN_BRANCH_HEAD_KEY, OBJECT_TYPE_GRAPH_COMMIT, OBJECT_TYPE_GRAPH_HEAD, OBJECT_TYPE_TABLE, + OBJECT_TYPE_TABLE_TOMBSTONE, OBJECT_TYPE_TABLE_VERSION, +}; #[derive(Debug, Clone)] pub struct SubTableEntry 
{ @@ -34,11 +37,64 @@ struct TableTombstoneEntry { tombstone_version: u64, } +/// A graph-lineage commit projected out of the `__manifest` `graph_commit` +/// rows (RFC-013 step 4). Field-for-field identical to `commit_graph::GraphCommit` +/// so the commit-graph cache can be sourced from the manifest projection without +/// touching any reader above that boundary. Kept as a separate struct here to +/// keep `state.rs` free of the `commit_graph` module dependency. +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct GraphLineageRow { + pub(crate) graph_commit_id: String, + pub(crate) manifest_branch: Option, + pub(crate) manifest_version: u64, + pub(crate) parent_commit_id: Option, + pub(crate) merged_parent_commit_id: Option, + pub(crate) actor_id: Option, + pub(crate) created_at: i64, +} + +/// JSON payload of a `graph_commit` row's `metadata` column. The immutable +/// commit fields that have no dedicated manifest column live here; the mutable +/// ones (`graph_commit_id`, `manifest_branch`, `manifest_version`) reuse +/// `object_id` / `table_branch` / `table_version`. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +struct GraphCommitMetadata { + #[serde(default, skip_serializing_if = "Option::is_none")] + parent_commit_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + merged_parent_commit_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + actor_id: Option, + created_at: i64, +} + +/// JSON payload of a `graph_head` row's `metadata` column. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +struct GraphHeadMetadata { + head_commit_id: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + parent_commit_id: Option, +} + +/// The `object_id` for a branch's mutable head pointer row. Main encodes as +/// `graph_head:main`; named branches as `graph_head:<branch>`.
+pub(crate) fn graph_head_object_id(branch: Option<&str>) -> String { + format!("graph_head:{}", branch.unwrap_or(MAIN_BRANCH_HEAD_KEY)) +} + #[derive(Debug, Clone)] struct ManifestScan { table_locations: HashMap, version_entries: Vec, tombstones: Vec, + /// Graph-lineage `graph_commit` rows, collected in the SAME pass only when + /// the caller asked (`collect_lineage`). Empty on the table-state read hot + /// path so it never pays the O(commits) lineage JSON decode; populated on the + /// publish path, where `load_publish_state` already needs the parent and would + /// otherwise scan `__manifest` a second time via `read_graph_lineage`. `graph_head` + /// rows are not collected here — parent resolution uses the head-over-commits + /// computation, not the denormalized head pointer (see `resolve_lineage_rows`). + lineage_rows: Vec, } pub(super) fn manifest_schema() -> SchemaRef { @@ -73,7 +129,8 @@ pub(super) fn manifest_schema() -> SchemaRef { pub(super) async fn read_manifest_state(dataset: &Dataset) -> Result { let version = dataset.version().version; - let scan = read_manifest_scan(dataset).await?; + // The table-state hot path never needs lineage, so don't pay its JSON decode. + let scan = read_manifest_scan(dataset, false).await?; let mut latest_versions = HashMap::::new(); for entry in scan.version_entries { @@ -109,28 +166,85 @@ pub(super) async fn read_manifest_state(dataset: &Dataset) -> Result Result> { - Ok(read_manifest_scan(dataset).await?.version_entries) + Ok(read_manifest_scan(dataset, false).await?.version_entries) } -pub(super) async fn read_registered_table_locations( - dataset: &Dataset, -) -> Result> { - Ok(read_manifest_scan(dataset).await?.table_locations) +/// The full table state the publisher needs to build its CAS batch, plus the +/// `graph_commit` lineage rows for parent resolution — all from ONE `__manifest` +/// scan (RFC-013 P2). 
Replaces the prior four scans on the publish path (three +/// thin accessors + a separate `read_graph_lineage`): `load_publish_state` +/// projects every piece it needs out of this single result. +pub(super) struct PublishScan { + pub(super) table_locations: HashMap, + pub(super) version_entries: Vec, + pub(super) tombstones: Vec<((String, u64), ())>, + pub(super) lineage_rows: Vec, } -pub(super) async fn read_tombstone_versions( - dataset: &Dataset, -) -> Result> { - Ok(read_manifest_scan(dataset) - .await? - .tombstones - .into_iter() - .map(|tombstone| ((tombstone.table_key, tombstone.tombstone_version), ())) - .collect()) +/// One-scan read of everything the publish path needs. `collect_lineage` is +/// always on here (the publisher resolves a parent), so the lineage JSON decode +/// rides the same pass as the table-state assembly instead of a second scan. +pub(super) async fn read_publish_scan(dataset: &Dataset) -> Result { + let scan = read_manifest_scan(dataset, true).await?; + Ok(PublishScan { + table_locations: scan.table_locations, + version_entries: scan.version_entries, + tombstones: scan + .tombstones + .into_iter() + .map(|tombstone| ((tombstone.table_key, tombstone.tombstone_version), ())) + .collect(), + lineage_rows: scan.lineage_rows, + }) } -async fn read_manifest_scan(dataset: &Dataset) -> Result { +/// Decode one `graph_commit` row (`object_type == OBJECT_TYPE_GRAPH_COMMIT`) into +/// a [`GraphLineageRow`]. The single decode for both lineage readers — the +/// dedicated `read_graph_lineage` scan and the folded `collect_lineage` branch of +/// `read_manifest_scan` — so the two cannot drift. The caller has already matched +/// the object type; `row` indexes into the per-batch columns. 
+fn decode_graph_commit_row( + object_ids: &StringArray, + metadata: &StringArray, + versions: &UInt64Array, + branches: &StringArray, + row: usize, +) -> Result { + if metadata.is_null(row) { + return Err(OmniError::manifest_internal(format!( + "manifest graph_commit row missing metadata for {}", + object_ids.value(row) + ))); + } + let commit_meta: GraphCommitMetadata = + serde_json::from_str(metadata.value(row)).map_err(|e| { + OmniError::manifest_internal(format!("failed to decode graph_commit metadata: {e}")) + })?; + Ok(GraphLineageRow { + graph_commit_id: object_ids.value(row).to_string(), + manifest_branch: if branches.is_null(row) { + None + } else { + Some(branches.value(row).to_string()) + }, + manifest_version: required_u64(versions, row, "table_version")?, + parent_commit_id: commit_meta.parent_commit_id, + merged_parent_commit_id: commit_meta.merged_parent_commit_id, + actor_id: commit_meta.actor_id, + created_at: commit_meta.created_at, + }) +} + +async fn read_manifest_scan(dataset: &Dataset, collect_lineage: bool) -> Result { let batches: Vec = dataset .scan() .try_into_stream() @@ -143,6 +257,7 @@ async fn read_manifest_scan(dataset: &Dataset) -> Result { let mut table_locations = HashMap::new(); let mut version_entries = Vec::new(); let mut tombstones = Vec::new(); + let mut lineage_rows = Vec::new(); for batch in &batches { let object_types = string_column(batch, "object_type")?; @@ -152,6 +267,13 @@ async fn read_manifest_scan(dataset: &Dataset) -> Result { let versions = u64_column(batch, "table_version")?; let branches = string_column(batch, "table_branch")?; let row_counts = u64_column(batch, "row_count")?; + // `object_id` is only needed for lineage decoding; skip the lookup + // entirely on the table-state hot path (`collect_lineage == false`). + let object_ids = if collect_lineage { + Some(string_column(batch, "object_id")?) 
+ } else { + None + }; for row in 0..batch.num_rows() { let table_key = table_keys.value(row).to_string(); @@ -195,6 +317,21 @@ async fn read_manifest_scan(dataset: &Dataset) -> Result { tombstone_version, }); } + // `graph_commit` rows (RFC-013) are decoded into the scan ONLY + // when `collect_lineage` is set (the publish path, which resolves + // a parent). The table-state hot path leaves them — and + // `graph_head` + any future object type — in the `_` arm so it + // never pays the O(commits) lineage JSON decode. When NOT + // collecting, `object_ids` is `None`, so this arm is the same + // forward-compat skip as the `_` arm. + OBJECT_TYPE_GRAPH_COMMIT if collect_lineage => { + let object_ids = object_ids.expect("object_ids read when collect_lineage"); + lineage_rows.push(decode_graph_commit_row( + object_ids, metadata, versions, branches, row, + )?); + } + // Skipped on the table-state path (and for `graph_head` / unknown + // future object types on every path): no table snapshot needs them. _ => {} } } @@ -225,21 +362,167 @@ async fn read_manifest_scan(dataset: &Dataset) -> Result { table_locations, version_entries: entries, tombstones, + lineage_rows, }) } +/// Project the graph-lineage rows (`graph_commit` + `graph_head`) out of +/// `__manifest` (RFC-013 step 4). Returns every commit and the per-branch head +/// map (keyed by branch name, `"main"` for main). `__manifest` is the single +/// source of graph lineage: the commit-graph cache is sourced from here, and the +/// publisher resolves a new commit's parent from here inside its CAS loop. +/// +/// Dedicated scan (separate from `read_manifest_scan`): it decodes ONLY the two +/// lineage object types and builds no table snapshot, so the table-state hot +/// path never pays for lineage JSON and this path never pays for table-entry +/// assembly. 
+pub(crate) async fn read_graph_lineage( + dataset: &Dataset, +) -> Result<(Vec, HashMap)> { + let batches: Vec = dataset + .scan() + .try_into_stream() + .await + .map_err(|e| OmniError::Lance(e.to_string()))? + .try_collect() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + + let mut graph_commits = Vec::new(); + let mut graph_heads = HashMap::new(); + + for batch in &batches { + let object_ids = string_column(batch, "object_id")?; + let object_types = string_column(batch, "object_type")?; + let metadata = string_column(batch, "metadata")?; + let versions = u64_column(batch, "table_version")?; + let branches = string_column(batch, "table_branch")?; + + for row in 0..batch.num_rows() { + match object_types.value(row) { + OBJECT_TYPE_GRAPH_COMMIT => { + graph_commits.push(decode_graph_commit_row( + object_ids, metadata, versions, branches, row, + )?); + } + OBJECT_TYPE_GRAPH_HEAD => { + if metadata.is_null(row) { + return Err(OmniError::manifest_internal(format!( + "manifest graph_head row missing metadata for {}", + object_ids.value(row) + ))); + } + let head_meta: GraphHeadMetadata = serde_json::from_str(metadata.value(row)) + .map_err(|e| { + OmniError::manifest_internal(format!( + "failed to decode graph_head metadata: {e}" + )) + })?; + // `object_id` is `graph_head:`; the branch key after + // the prefix is the projection's map key (`main` for main). + let branch_key = object_ids + .value(row) + .strip_prefix("graph_head:") + .unwrap_or_default() + .to_string(); + graph_heads.insert(branch_key, head_meta.head_commit_id); + } + _ => {} + } + } + } + + Ok((graph_commits, graph_heads)) +} + +/// The current head of a branch's lineage: the [`GraphLineageRow`] with the +/// greatest `(manifest_version, created_at, graph_commit_id)`. This is the same +/// ordering the commit-graph cache uses to pick its head (`should_replace_head`) +/// — kept in one place so the publisher's per-attempt parent resolution and the +/// cache agree by construction. 
`None` only for a graph with no commits yet +/// (a parentless genesis). +pub(crate) fn head_lineage_row(rows: &[GraphLineageRow]) -> Option<&GraphLineageRow> { + rows.iter().max_by(|a, b| { + a.manifest_version + .cmp(&b.manifest_version) + .then_with(|| a.created_at.cmp(&b.created_at)) + .then_with(|| a.graph_commit_id.cmp(&b.graph_commit_id)) + }) +} + +/// One `__manifest` row materializing a piece of a graph commit's lineage. The +/// publisher maps these onto its `PendingVersionRow`s (folding lineage into the +/// table-version publish batch), and the genesis init path pushes them straight +/// into the init batch. +pub(crate) struct GraphLineageRowPart { + pub(crate) object_id: String, + pub(crate) object_type: &'static str, + pub(crate) metadata: String, + pub(crate) table_version: Option, + pub(crate) table_branch: Option, +} + +/// Encode one graph commit into its two `__manifest` rows: the immutable +/// `graph_commit` row plus the mutable `graph_head:` pointer (a +/// merge-insert on `object_id` updates the head in place). `branch` is `None` +/// for main. The immutable commit fields with no dedicated column live in the +/// `graph_commit` row's `metadata` JSON; the mutable head pointer payload lives +/// in the `graph_head` row's `metadata`. 
+pub(crate) fn graph_lineage_row_parts( + commit: &GraphLineageRow, + branch: Option<&str>, +) -> Result<[GraphLineageRowPart; 2]> { + let commit_metadata = serde_json::to_string(&GraphCommitMetadata { + parent_commit_id: commit.parent_commit_id.clone(), + merged_parent_commit_id: commit.merged_parent_commit_id.clone(), + actor_id: commit.actor_id.clone(), + created_at: commit.created_at, + }) + .map_err(|e| { + OmniError::manifest_internal(format!("failed to encode graph_commit metadata: {e}")) + })?; + let head_metadata = serde_json::to_string(&GraphHeadMetadata { + head_commit_id: commit.graph_commit_id.clone(), + parent_commit_id: commit.parent_commit_id.clone(), + }) + .map_err(|e| { + OmniError::manifest_internal(format!("failed to encode graph_head metadata: {e}")) + })?; + + Ok([ + // Only the immutable commit row carries the manifest version + branch. + GraphLineageRowPart { + object_id: commit.graph_commit_id.clone(), + object_type: OBJECT_TYPE_GRAPH_COMMIT, + metadata: commit_metadata, + table_version: Some(commit.manifest_version), + table_branch: commit.manifest_branch.clone(), + }, + // The head row reuses `metadata` for its pointer payload. 
+ GraphLineageRowPart { + object_id: graph_head_object_id(branch), + object_type: OBJECT_TYPE_GRAPH_HEAD, + metadata: head_metadata, + table_version: None, + table_branch: None, + }, + ]) +} + pub(super) fn entries_to_batch( entries: &[SubTableEntry], version_metadata: &HashMap, + genesis_lineage: &[GraphLineageRowPart], ) -> Result { - let mut object_ids = Vec::with_capacity(entries.len() * 2); - let mut object_types = Vec::with_capacity(entries.len() * 2); - let mut locations = Vec::with_capacity(entries.len() * 2); - let mut metadata = Vec::with_capacity(entries.len() * 2); - let mut table_keys = Vec::with_capacity(entries.len() * 2); - let mut table_versions = Vec::with_capacity(entries.len() * 2); - let mut table_branches = Vec::with_capacity(entries.len() * 2); - let mut row_counts = Vec::with_capacity(entries.len() * 2); + let cap = entries.len() * 2 + genesis_lineage.len(); + let mut object_ids = Vec::with_capacity(cap); + let mut object_types = Vec::with_capacity(cap); + let mut locations = Vec::with_capacity(cap); + let mut metadata = Vec::with_capacity(cap); + let mut table_keys = Vec::with_capacity(cap); + let mut table_versions = Vec::with_capacity(cap); + let mut table_branches = Vec::with_capacity(cap); + let mut row_counts = Vec::with_capacity(cap); for entry in entries { object_ids.push(entry.table_key.clone()); @@ -271,6 +554,22 @@ pub(super) fn entries_to_batch( row_counts.push(Some(entry.row_count)); } + // Genesis graph-lineage rows ride the init write so a fresh graph carries + // its `graph_commit` + `graph_head` in `__manifest` from version one (no + // separate lineage fragment, no second commit). `table_key` is non-nullable + // but lineage rows have no table identity, so the empty string stands in + // (never matched by a real key). 
+ for part in genesis_lineage { + object_ids.push(part.object_id.clone()); + object_types.push(part.object_type.to_string()); + locations.push(None); + metadata.push(Some(part.metadata.clone())); + table_keys.push(String::new()); + table_versions.push(part.table_version); + table_branches.push(part.table_branch.clone()); + row_counts.push(None); + } + manifest_rows_batch( object_ids, object_types, @@ -283,6 +582,72 @@ pub(super) fn entries_to_batch( ) } +/// Merge-insert a set of graph-lineage rows (`graph_commit` + `graph_head`) +/// straight into `__manifest`, keyed on `object_id`. Used only by the v3→v4 +/// internal-schema backfill (RFC-013 step 4): the normal write path folds +/// lineage into the publisher's batch, but the migration writes lineage with +/// no accompanying table-version change, so it issues its own merge. +/// +/// Mirrors the publisher's merge knobs (`use_index(false)`, `skip_auto_cleanup`, +/// `conflict_retries(0)`) so it has identical CAS / cleanup semantics. The +/// migration runs under the open-for-write path and is idempotent (re-inserting +/// the same `object_id` rows updates them in place), so it does not need the +/// publisher's retry loop. Returns the advanced dataset (its version is the +/// commit the lineage landed in). 
+pub(crate) async fn merge_lineage_rows( + dataset: Dataset, + parts: &[GraphLineageRowPart], +) -> Result { + let len = parts.len(); + let mut object_ids = Vec::with_capacity(len); + let mut object_types = Vec::with_capacity(len); + let mut metadata = Vec::with_capacity(len); + let mut table_versions = Vec::with_capacity(len); + let mut table_branches = Vec::with_capacity(len); + for part in parts { + object_ids.push(part.object_id.clone()); + object_types.push(part.object_type.to_string()); + metadata.push(Some(part.metadata.clone())); + table_versions.push(part.table_version); + table_branches.push(part.table_branch.clone()); + } + // Lineage rows carry no table identity: empty `table_key`, null location / + // row_count (matching `lineage_part_to_pending` in the publisher). + let batch = manifest_rows_batch( + object_ids, + object_types, + vec![None; len], + metadata, + vec![String::new(); len], + table_versions, + table_branches, + vec![None; len], + )?; + let reader = + arrow_array::RecordBatchIterator::new(vec![Ok(batch)], manifest_schema()); + let dataset = Arc::new(dataset); + let mut merge_builder = + lance::dataset::MergeInsertBuilder::try_new(dataset, vec!["object_id".to_string()]) + .map_err(|e| OmniError::Lance(e.to_string()))?; + merge_builder.when_matched(lance::dataset::WhenMatched::UpdateAll); + merge_builder.when_not_matched(lance::dataset::WhenNotMatched::InsertAll); + merge_builder.conflict_retries(0); + merge_builder.use_index(false); + merge_builder.skip_auto_cleanup(true); + let (new_dataset, _stats) = merge_builder + .try_build() + .map_err(|e| OmniError::Lance(e.to_string()))? + .execute_reader(Box::new(reader)) + // Route through the publisher's classifier (not a stringify) so a + // concurrent first-open's CAS loss on `__manifest` surfaces as the SAME + // typed `RowLevelCasContention` the publisher's retry consumes. The + // migration's re-open retry loop matches on that to converge instead of + // erroring out (FIX B). 
+ .await + .map_err(super::publisher::map_lance_publish_error)?; + Ok(Arc::try_unwrap(new_dataset).unwrap_or_else(|arc| (*arc).clone())) +} + pub(super) fn manifest_rows_batch( object_ids: Vec, object_types: Vec, diff --git a/crates/omnigraph/src/db/manifest/tests.rs b/crates/omnigraph/src/db/manifest/tests.rs index 3888bd4..31a77fe 100644 --- a/crates/omnigraph/src/db/manifest/tests.rs +++ b/crates/omnigraph/src/db/manifest/tests.rs @@ -12,7 +12,7 @@ use lance_namespace::models::{ use lance_namespace_impls::DirectoryNamespaceBuilder; use tokio::sync::Mutex; -use super::publisher::ManifestBatchPublisher; +use super::publisher::{LineageIntent, ManifestBatchPublisher, PublishOutcome}; use super::*; use omnigraph_compiler::catalog::build_catalog; use omnigraph_compiler::schema::parser::parse_schema; @@ -988,7 +988,8 @@ impl ManifestBatchPublisher for RecordingPublisher { &self, changes: &[ManifestChange], expected_table_versions: &HashMap, - ) -> Result { + lineage: Option<&LineageIntent>, + ) -> Result { let requests: Vec = changes .iter() .filter_map(|change| match change { @@ -997,7 +998,9 @@ impl ManifestBatchPublisher for RecordingPublisher { }) .collect(); self.requests.lock().await.extend_from_slice(&requests); - self.inner.publish(changes, expected_table_versions).await + self.inner + .publish(changes, expected_table_versions, lineage) + .await } } @@ -1009,7 +1012,8 @@ impl ManifestBatchPublisher for FailingPublisher { &self, _changes: &[ManifestChange], _expected_table_versions: &HashMap, - ) -> Result { + _lineage: Option<&LineageIntent>, + ) -> Result { Err(OmniError::manifest( "injected batch publisher failure".to_string(), )) @@ -1389,8 +1393,8 @@ async fn test_concurrent_publish_with_overlapping_expected_versions_one_succeeds let expected_b = expected; let (res_a, res_b) = tokio::join!( - async { publisher_a.publish(&changes_a, &expected_a).await }, - async { publisher_b.publish(&changes_b, &expected_b).await } + async { publisher_a.publish(&changes_a, 
&expected_a, None).await }, + async { publisher_b.publish(&changes_b, &expected_b, None).await } ); let (succeeded, err) = match (res_a, res_b) { @@ -1481,7 +1485,7 @@ async fn test_publish_migrates_pre_stamp_manifest_to_current_version() { let mut expected = HashMap::new(); expected.insert("node:Person".to_string(), 1); GraphNamespacePublisher::new(uri, None) - .publish(&[], &expected) + .publish(&[], &expected, None) .await .unwrap(); @@ -1542,7 +1546,7 @@ async fn test_v2_to_v3_sweeps_legacy_run_branches_on_write_open() { let mut expected = HashMap::new(); expected.insert("node:Person".to_string(), 1); GraphNamespacePublisher::new(uri, None) - .publish(&[], &expected) + .publish(&[], &expected, None) .await .unwrap(); @@ -1569,7 +1573,7 @@ async fn test_v2_to_v3_sweeps_legacy_run_branches_on_write_open() { // Idempotent: a second write-open finds the stamp at current and does not // re-run the sweep or error. GraphNamespacePublisher::new(uri, None) - .publish(&[], &expected) + .publish(&[], &expected, None) .await .unwrap(); let final_ds = open_manifest_dataset(uri, None).await.unwrap(); @@ -1601,7 +1605,7 @@ async fn test_publish_rejects_manifest_stamped_at_future_version() { let mut expected = HashMap::new(); expected.insert("node:Person".to_string(), 1); let err = GraphNamespacePublisher::new(uri, None) - .publish(&[], &expected) + .publish(&[], &expected, None) .await .expect_err("future-stamped manifest should reject open-for-write"); let msg = err.to_string(); @@ -1627,3 +1631,957 @@ fn manifest_column_helpers_return_error_for_bad_schema() { let err = string_column(&batch, "table_key").unwrap_err(); assert!(err.to_string().contains("table_key")); } + +// ── RFC-013 Phase 7 stage 4: existing-graph (v3 → v4) lineage migration ────── +// +// A graph created by a pre-Phase-7 binary (internal schema v3) keeps its +// lineage in `_graph_commits.lance`, with NONE in `__manifest`. 
The new binary +// reads lineage from the `__manifest` projection, so without a migration it +// would see an EMPTY commit DAG. These tests pin the backfill (`migrate_v3_to_v4`), +// its idempotency, the transitional v3-read fallback, the read-only refusal, and +// the crash-mid-migration recovery. + +use crate::db::commit_graph::{CommitGraph, seed_legacy_v3_lineage}; + +/// Number of `graph_commit` rows in `__manifest` at main. +async fn manifest_commit_row_count(uri: &str) -> usize { + let ds = open_manifest_dataset(uri, None).await.unwrap(); + let (rows, _heads) = read_graph_lineage(&ds).await.unwrap(); + rows.len() +} + +#[tokio::test] +async fn v3_graph_backfills_lineage_into_manifest_on_read_write_open() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + let fixture = seed_legacy_v3_lineage(uri).await.unwrap(); + + // Precondition: a true v3 graph — stamp 3, NO lineage rows in `__manifest`, + // and a NEW-binary projection therefore reads an empty DAG. + { + let ds = open_manifest_dataset(uri, None).await.unwrap(); + assert_eq!(super::migrations::read_stamp(&ds), 3, "fixture is stamped v3"); + } + assert_eq!( + manifest_commit_row_count(uri).await, + 0, + "precondition: __manifest carries no graph_commit rows in a v3 graph", + ); + + // Run the production read-write migration entry point (main branch). + super::migrate_on_open(uri).await.unwrap(); + + // The manifest now carries the lineage and is stamped at the current version. + { + let ds = open_manifest_dataset(uri, None).await.unwrap(); + assert_eq!( + super::migrations::read_stamp(&ds), + super::migrations::INTERNAL_MANIFEST_SCHEMA_VERSION, + "migration stamps the manifest at the current internal schema version", + ); + } + // 4 commits (genesis, A, feature, merge) → 4 `graph_commit` rows. 
+ assert_eq!( + manifest_commit_row_count(uri).await, + fixture.all_ids.len(), + "every legacy commit is backfilled into __manifest", + ); + + // The commit-graph projection (now sourced from __manifest) reconstructs the + // full DAG: every old id resolves, parents/merge parents are connected, the + // merge commit's actor + two parents survive, and the head is the merge. + let cg = CommitGraph::open(uri).await.unwrap(); + let commits = cg.load_commits().await.unwrap(); + assert_eq!(commits.len(), fixture.all_ids.len()); + for id in &fixture.all_ids { + assert!( + cg.get_commit(id).is_some(), + "old commit id {id} must still resolve after migration", + ); + } + + let genesis = cg.get_commit(&fixture.genesis).unwrap(); + assert!(genesis.parent_commit_id.is_none(), "genesis is parentless"); + assert!(genesis.actor_id.is_none(), "genesis is actorless"); + + let commit_a = cg.get_commit(&fixture.commit_a).unwrap(); + assert_eq!(commit_a.parent_commit_id.as_deref(), Some(fixture.genesis.as_str())); + assert_eq!(commit_a.actor_id.as_deref(), Some("act-a"), "actor backfilled inline"); + + let merge = cg.get_commit(&fixture.merge_commit).unwrap(); + assert_eq!(merge.parent_commit_id.as_deref(), Some(fixture.commit_a.as_str())); + assert_eq!( + merge.merged_parent_commit_id.as_deref(), + Some(fixture.feature_commit.as_str()), + "the merge commit keeps both parents", + ); + assert_eq!(merge.actor_id.as_deref(), Some("act-merger")); + + assert_eq!( + cg.head_commit_id().await.unwrap().as_deref(), + Some(fixture.merge_commit.as_str()), + "the merge commit is the head of main after migration", + ); + + // merge_base of main vs main is reflexively the head — a smoke check that the + // ancestor walk works over the backfilled DAG. 
+ let base = CommitGraph::merge_base(uri, None, None).await.unwrap(); + assert!(base.is_some(), "merge_base resolves over the backfilled DAG"); +} + +#[tokio::test] +async fn v3_to_v4_migration_is_idempotent() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let fixture = seed_legacy_v3_lineage(uri).await.unwrap(); + + super::migrate_on_open(uri).await.unwrap(); + let after_first = manifest_commit_row_count(uri).await; + // Re-running the migration must not duplicate any rows. + super::migrate_on_open(uri).await.unwrap(); + let after_second = manifest_commit_row_count(uri).await; + + assert_eq!(after_first, fixture.all_ids.len()); + assert_eq!( + after_first, after_second, + "a second migration pass adds no duplicate graph_commit rows", + ); +} + +#[tokio::test] +async fn v3_graph_reads_history_via_fallback_without_migrating() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let fixture = seed_legacy_v3_lineage(uri).await.unwrap(); + + // Open the commit-graph projection WITHOUT running the migration (this is the + // read-only path: `CommitGraph::open` reads, never writes). The stamp-gated + // fallback sources lineage from `_graph_commits.lance`, so history is correct. + let cg = CommitGraph::open(uri).await.unwrap(); + let commits = cg.load_commits().await.unwrap(); + assert_eq!( + commits.len(), + fixture.all_ids.len(), + "the v3 fallback reads the full legacy DAG with no migration", + ); + assert_eq!( + cg.head_commit_id().await.unwrap().as_deref(), + Some(fixture.merge_commit.as_str()), + ); + + // The fallback is read-only: stamp stays v3, __manifest still has no lineage. 
+ { + let ds = open_manifest_dataset(uri, None).await.unwrap(); + assert_eq!(super::migrations::read_stamp(&ds), 3, "fallback did not write"); + } + assert_eq!( + manifest_commit_row_count(uri).await, + 0, + "the read-only fallback writes nothing to __manifest", + ); +} + +#[tokio::test] +async fn future_stamp_is_refused_in_both_open_modes() { + use crate::db::{Omnigraph, OpenMode}; + use crate::storage::storage_for_uri; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + // A full graph (schema artifacts present) so `Omnigraph::open*` gets past its + // schema read to the stamp check. + Omnigraph::init(uri, "node Person { name: String }\n") + .await + .unwrap(); + + // Stamp past this binary's known version. + { + let mut ds = open_manifest_dataset(uri, None).await.unwrap(); + ds.update_schema_metadata([( + "omnigraph:internal_schema_version".to_string(), + Some("5".to_string()), + )]) + .await + .unwrap(); + } + + let storage = storage_for_uri(uri).unwrap(); + for mode in [OpenMode::ReadWrite, OpenMode::ReadOnly] { + // `Omnigraph` is not `Debug`, so match instead of `expect_err`. + let err = match Omnigraph::open_with_storage_and_mode(uri, Arc::clone(&storage), mode).await + { + Ok(_) => panic!("{mode:?}: a future-stamped graph must be refused"), + Err(err) => err, + }; + assert!( + err.to_string().contains("upgrade omnigraph"), + "{mode:?}: expected an upgrade-omnigraph refusal, got: {err}", + ); + } +} + +#[tokio::test] +async fn sub_floor_stamp_is_refused_in_both_open_modes() { + use crate::db::{Omnigraph, OpenMode}; + use crate::storage::storage_for_uri; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + Omnigraph::init(uri, "node Person { name: String }\n") + .await + .unwrap(); + + // Stamp below MIN_SUPPORTED (1 today). 
No real graph carries 0 — `read_stamp` + // floors an absent stamp at 1 — so this is the symmetric twin of + // `future_stamp_is_refused_in_both_open_modes`, exercising the floor the + // combined `refuse_if_stamp_unsupported` guard adds at every open mode + // (write-path migrate, read-only open, and the branch lineage-read path). The + // upper side — a graph at exactly MIN migrating to CURRENT — is covered by + // `test_publish_migrates_pre_stamp_manifest_to_current_version`, where an + // absent stamp reads as 1 = MIN. + { + let mut ds = open_manifest_dataset(uri, None).await.unwrap(); + ds.update_schema_metadata([( + "omnigraph:internal_schema_version".to_string(), + Some("0".to_string()), + )]) + .await + .unwrap(); + } + + let storage = storage_for_uri(uri).unwrap(); + for mode in [OpenMode::ReadWrite, OpenMode::ReadOnly] { + let err = match Omnigraph::open_with_storage_and_mode(uri, Arc::clone(&storage), mode).await + { + Ok(_) => panic!("{mode:?}: a sub-floor graph must be refused"), + Err(err) => err, + }; + assert!( + err.to_string().contains("migrate it forward"), + "{mode:?}: expected a migrate-forward floor refusal, got: {err}", + ); + } +} + +#[tokio::test] +async fn crash_after_merge_before_stamp_completes_on_next_open() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let fixture = seed_legacy_v3_lineage(uri).await.unwrap(); + + // Simulate a crash that landed the lineage merge but lost the stamp bump: + // run the full migration (lineage now in __manifest), then rewind the stamp + // to v3. This is exactly the on-disk state after a crash at the + // `migration.v3_to_v4.after_merge_before_stamp` window. 
+ super::migrate_on_open(uri).await.unwrap(); + { + let mut ds = open_manifest_dataset(uri, None).await.unwrap(); + super::migrations::set_stamp_for_test(&mut ds, 3).await.unwrap(); + } + assert_eq!( + manifest_commit_row_count(uri).await, + fixture.all_ids.len(), + "crash state: lineage present, stamp rewound to v3", + ); + + // The next open re-enters at v3; the idempotency guard sees the lineage and + // skips straight to the stamp bump — no duplicate rows, migration completes. + super::migrate_on_open(uri).await.unwrap(); + { + let ds = open_manifest_dataset(uri, None).await.unwrap(); + assert_eq!( + super::migrations::read_stamp(&ds), + super::migrations::INTERNAL_MANIFEST_SCHEMA_VERSION, + "the re-entered migration completes the stamp bump", + ); + } + assert_eq!( + manifest_commit_row_count(uri).await, + fixture.all_ids.len(), + "re-running over an already-merged manifest adds no duplicate rows", + ); +} + +/// Migrate the `__manifest` at `branch` (the per-branch v3→v4 entry shape: +/// `migrate_on_open` runs it for main; the publisher runs it for each branch's +/// first write). Returns the migrated branch lineage `(commit_by_id, heads)`. +async fn migrate_branch_and_read_lineage( + uri: &str, + branch: &str, +) -> ( + std::collections::HashMap, + std::collections::HashMap, +) { + let mut ds = open_manifest_dataset(uri, Some(branch)).await.unwrap(); + super::migrations::migrate_internal_schema(&mut ds, uri, Some(branch)) + .await + .unwrap(); + // Re-open at the branch so the read sees the migration's committed HEAD. + let ds = open_manifest_dataset(uri, Some(branch)).await.unwrap(); + let (rows, heads) = read_graph_lineage(&ds).await.unwrap(); + let by_id = rows + .into_iter() + .map(|r| (r.graph_commit_id.clone(), r)) + .collect(); + (by_id, heads) +} + +// FIX C — the per-branch v3→v4 migration against a REAL Lance branch. +// +// `seed_legacy_v3_lineage` writes every commit (incl. 
the "feature"-tagged one) +// to MAIN's `_graph_commits.lance` with `manifest_branch` as a mere field — it +// never exercises the production per-branch path (`read_legacy_commit_cache` → +// `checkout_branch`, and a branch-scoped `__manifest`). This test builds a graph +// with a REAL Lance branch on both `_graph_commits.lance` and `__manifest`, then +// migrates the BRANCH and asserts the branch's lineage lands in the BRANCH's +// `__manifest` with main untouched. +// +// It also EMPIRICALLY decides the open question behind FIX B: the fast-path +// `read_graph_lineage(dataset)` has no `manifest_branch` filter in its query, but +// `dataset` is branch-scoped (`__manifest` is Lance-branched per graph-branch), +// so a branch should read only its OWN lineage. If migrating the branch were to +// leak main's backfill (or vice versa), that would be a 5th bug needing a branch +// filter. The assertions below pin that it does not. +#[tokio::test] +async fn v3_branch_migration_backfills_branch_manifest_and_leaves_main_untouched() { + use crate::db::commit_graph::seed_legacy_v3_lineage_with_branch; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let fx = seed_legacy_v3_lineage_with_branch(uri).await.unwrap(); + + // Precondition: both main and the branch are v3 with no lineage in __manifest. + for branch in [None, Some(fx.branch.as_str())] { + let ds = open_manifest_dataset(uri, branch).await.unwrap(); + assert_eq!( + super::migrations::read_stamp(&ds), + 3, + "{branch:?}: fixture branch is stamped v3", + ); + let (rows, _heads) = read_graph_lineage(&ds).await.unwrap(); + assert!( + rows.is_empty(), + "{branch:?}: fixture branch has no lineage in __manifest", + ); + } + + // Migrate ONLY the branch. 
+ let (branch_by_id, branch_heads) = migrate_branch_and_read_lineage(uri, &fx.branch).await; + + // The branch's __manifest now carries the branch's full DAG: genesis, A, and + // the branch commit (3 rows), with the branch commit as `graph_head:feature`. + assert_eq!( + branch_by_id.len(), + 3, + "the branch backfill carries genesis + A + the branch commit", + ); + for id in [&fx.genesis, &fx.commit_a, &fx.branch_commit] { + assert!( + branch_by_id.contains_key(id), + "branch commit {id} must be backfilled into the branch __manifest", + ); + } + assert_eq!( + branch_heads.get(&fx.branch).map(String::as_str), + Some(fx.branch_commit.as_str()), + "graph_head:feature points at the branch commit", + ); + + // Parents + actors survived the backfill. + let branch_commit = &branch_by_id[&fx.branch_commit]; + assert_eq!( + branch_commit.parent_commit_id.as_deref(), + Some(fx.commit_a.as_str()), + "the branch commit keeps its parent", + ); + assert_eq!( + branch_commit.actor_id.as_deref(), + Some("act-branch"), + "the branch commit's authored actor survives", + ); + assert_eq!( + branch_by_id[&fx.commit_a].actor_id.as_deref(), + Some("act-a"), + "the inherited main commit's actor survives on the branch", + ); + + // Contingency check: migrating the branch left MAIN's __manifest untouched — + // still v3, still no lineage. The unfiltered fast-path read is branch-correct + // because `__manifest` is Lance-branched; no `manifest_branch` filter is + // needed (no 5th bug). + { + let main_ds = open_manifest_dataset(uri, None).await.unwrap(); + assert_eq!( + super::migrations::read_stamp(&main_ds), + 3, + "migrating the branch must not advance main's stamp", + ); + let (main_rows, _heads) = read_graph_lineage(&main_ds).await.unwrap(); + assert!( + main_rows.is_empty(), + "migrating the branch must not backfill main's __manifest", + ); + } +} + +// FIX D — the branch read path refuses a `> CURRENT` branch stamp. 
+// +// `load_commit_cache_for_branch` handled `< CURRENT` (the v3 fallback) and +// `>= CURRENT` (the manifest projection), but never a `> CURRENT` branch stamp — +// it would misread a future shape with the projection. The main read path already +// refuses (`refuse_if_internal_schema_unsupported`), and migrations run main-first so +// main's stamp ≥ every branch's — so this is not a live hole today. The guard is +// defense-in-depth against that ordering invariant ever weakening. Here we +// synthesize the unreachable state directly (force-stamp a branch past CURRENT) +// and assert the branch read refuses loudly instead of misreading. +#[tokio::test] +async fn branch_read_refuses_future_internal_schema_stamp() { + use crate::db::commit_graph::{CommitGraph, seed_legacy_v3_lineage_with_branch}; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + // A graph with a real `feature` Lance branch on both `_graph_commits.lance` + // and `__manifest` (so `open_at_branch` can check it out). + let fx = seed_legacy_v3_lineage_with_branch(uri).await.unwrap(); + + // Force the BRANCH's `__manifest` stamp past this binary's known version. + let future = super::migrations::INTERNAL_MANIFEST_SCHEMA_VERSION + 1; + { + let mut branch_ds = open_manifest_dataset(uri, Some(&fx.branch)).await.unwrap(); + super::migrations::set_stamp_for_test(&mut branch_ds, future) + .await + .unwrap(); + } + + // Reading the commit graph at that branch must refuse, not misread. + let err = match CommitGraph::open_at_branch(uri, &fx.branch).await { + Ok(_) => panic!("a branch stamped past CURRENT must be refused on read"), + Err(e) => e, + }; + assert!( + err.to_string().contains("upgrade omnigraph"), + "expected an upgrade-omnigraph refusal at the branch read, got: {err}", + ); +} + +// A v4 branch whose AUTHORITATIVE lineage lives in `__manifest` must stay +// readable even when its DERIVED `_graph_commits.lance` branch ref is gone. 
+//
+// On a v4 graph (RFC-013 Phase 7) `_graph_commits.lance` is no longer the source
+// of graph lineage — the `graph_commit`/`graph_head:<branch>` rows in
+// `__manifest` are. The Lance branch ref on `_graph_commits.lance` is a derived
+// artifact, kept only so `create_branch`/`cleanup` have something to operate on.
+// An interrupted fork-reclaim or a `cleanup` race can leave that ref missing
+// while the manifest lineage is fully intact. Per invariants 7 + 15 a missing
+// DERIVED ref must not fail a LOGICAL read of the lineage.
+//
+// The wedge: start from a real v4 `feature` branch (its `graph_head:feature` row
+// lives in `__manifest`), then `force_delete` ONLY the `_graph_commits.lance`
+// `feature` ref, leaving the manifest lineage authoritative. The contract:
+//   - reads at the wedged branch (`open_at_branch` / list-commits / `merge_base`)
+//     SUCCEED, sourcing the DAG from `__manifest`; and
+//   - a WRITE that needs the derived ref (`create_branch`) fails LOUDLY with the
+//     typed actionable error, deferring repair to `cleanup`'s orphan reconciler.
+//
+// RED before the fix: `open_at_branch` does a hard `checkout_branch(branch)?` on
+// the now-missing `_graph_commits.lance` ref and errors `OmniError::Lance`,
+// wedging the logical read.
+#[tokio::test]
+async fn open_at_branch_reads_manifest_lineage_when_commit_graph_ref_is_missing() {
+    use crate::db::commit_graph::{CommitGraph, seed_legacy_v3_lineage_with_branch};
+
+    let tmp = tempfile::tempdir().unwrap();
+    let uri = tmp.path().to_str().unwrap();
+
+    // 1. Seed a graph with a REAL `feature` Lance branch on both
+    //    `_graph_commits.lance` and `__manifest`, then migrate main AND the branch
+    //    to v4 so the branch lineage is authoritative in `__manifest` (not the
+    //    legacy fallback). Afterwards `graph_head:feature` resolves the branch
+    //    commit from `__manifest`, and the `_graph_commits.lance` `feature` ref
+    //    still exists (the v3→v4 migration leaves it in place).
+    let fixture = seed_legacy_v3_lineage_with_branch(uri).await.unwrap();
+    super::migrate_on_open(uri).await.unwrap();
+    let (_branch_by_id, branch_heads) =
+        migrate_branch_and_read_lineage(uri, &fixture.branch).await;
+    let expected_head = Some(fixture.branch_commit.as_str());
+    assert_eq!(
+        branch_heads.get(&fixture.branch).map(String::as_str),
+        expected_head,
+        "precondition: __manifest carries graph_head:feature (lineage is authoritative)",
+    );
+
+    // 2. Remove ONLY the derived `_graph_commits.lance` `feature` ref. The
+    //    `__manifest` `feature` branch and its lineage stay untouched — exactly
+    //    what an interrupted fork-reclaim / cleanup race leaves behind.
+    {
+        let mut main_graph = CommitGraph::open(uri).await.unwrap();
+        main_graph
+            .force_delete_branch(&fixture.branch)
+            .await
+            .unwrap();
+    }
+    // Sanity check: the derived ref really is gone from `_graph_commits.lance`.
+    {
+        let main_graph = CommitGraph::open(uri).await.unwrap();
+        let branches = main_graph.list_branches().await.unwrap();
+        assert!(
+            branches.iter().all(|name| name != &fixture.branch),
+            "the _graph_commits.lance feature ref must be deleted to build the wedge, got: {branches:?}",
+        );
+    }
+
+    // 3a. Logical READS at the branch are served from `__manifest` even though
+    //     the derived ref is missing. `open_at_branch` is the call that fails
+    //     before the fix.
+    let mut branch_graph = CommitGraph::open_at_branch(uri, &fixture.branch)
+        .await
+        .expect("open_at_branch must read manifest lineage when the commit-graph ref is missing");
+    let commits = branch_graph.load_commits().await.unwrap();
+    assert_eq!(
+        commits.len(),
+        3,
+        "the branch DAG (genesis + A + branch commit) is read from __manifest",
+    );
+    let resolved_head = branch_graph.head_commit_id().await.unwrap();
+    assert_eq!(
+        resolved_head.as_deref(),
+        Some(fixture.branch_commit.as_str()),
+        "the branch head resolves from __manifest's graph_head:feature",
+    );
+    let base = CommitGraph::merge_base(uri, Some(&fixture.branch), Some(&fixture.branch))
+        .await
+        .expect("merge_base must resolve over the manifest-sourced DAG");
+    assert_eq!(
+        base.map(|c| c.graph_commit_id),
+        Some(fixture.branch_commit.clone()),
+        "merge_base(feature, feature) is reflexively the branch head",
+    );
+
+    // 3b. A WRITE that needs the derived ref must fail loudly and actionably:
+    //     repair belongs to `cleanup`'s orphan reconciler, not to a read path.
+    let Err(err) = branch_graph.create_branch("derived").await else {
+        panic!("create_branch must fail when the commit-graph branch ref is missing")
+    };
+    let msg = err.to_string();
+    let required_fragments = ["commit-graph branch ref", "is missing"];
+    assert!(
+        required_fragments
+            .iter()
+            .all(|fragment| msg.contains(*fragment)),
+        "expected the typed missing-ref error, got: {msg}",
+    );
+}
+
+// FIX B — the v3→v4 lineage backfill must be concurrent-runner idempotent.
+//
+// `migrate_v2_to_v3` is explicitly safe under two processes opening the same
+// legacy graph at once (each re-enumerates branches; `force_delete_branch`
+// tolerates an already-gone branch). v3→v4 regressed that: `merge_lineage_rows`
+// uses `conflict_retries(0)` and the migration had no app-level retry, so a
+// concurrent first-open's CAS loser errored the whole open instead of converging.
+//
+// The test stages exactly two concurrent first-opens: two `__manifest` handles
+// are opened at the SAME pre-migration (v3, empty-lineage) HEAD, and their
+// `migrate_internal_schema` calls then run under `tokio::join!`. Both pass the
+// fast-path empty-lineage check and both attempt the backfill merge, so the
+// row-level CAS on `graph_head:main` is guaranteed to fire — deterministically
+// red against the pre-fix code (the loser errors). The contract: BOTH converge
+// to `Ok`, the manifest carries exactly the fixture's commit rows (merge keyed on
+// `object_id`, so a double-merge stays exact), and the stamp is v4.
+//
+// (Pre-opened handles are used instead of two `migrate_on_open(uri)` calls on
+// purpose: `migrate_on_open` opens fresh each call, so two of them can luckily
+// serialize — one finishes before the other reads the fast path — which would
+// skip the CAS path and pass even pre-fix. Pre-opening both at the
+// empty-lineage HEAD forces the contention every run, so the RED is real.)
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn concurrent_v3_to_v4_migrations_both_converge() {
+    use super::migrations::{
+        INTERNAL_MANIFEST_SCHEMA_VERSION, migrate_internal_schema, read_stamp,
+    };
+
+    let tmp = tempfile::tempdir().unwrap();
+    let uri = tmp.path().to_str().unwrap();
+    let fixture = seed_legacy_v3_lineage(uri).await.unwrap();
+
+    // Both handles observe stamp v3 and an empty lineage, so each runs the full
+    // backfill and the two collide on the merge.
+    let mut runner_a = open_manifest_dataset(uri, None).await.unwrap();
+    let mut runner_b = open_manifest_dataset(uri, None).await.unwrap();
+
+    let (outcome_a, outcome_b) = tokio::join!(
+        migrate_internal_schema(&mut runner_a, uri, None),
+        migrate_internal_schema(&mut runner_b, uri, None),
+    );
+
+    // The contract in one line: the CAS loser of a concurrent first-open
+    // converges instead of erroring, so BOTH outcomes are `Ok`.
+    outcome_a.expect("migration runner A must converge");
+    outcome_b.expect("migration runner B must converge");
+
+    // The manifest holds exactly the fixture's commits, with no duplicates (the
+    // merge is keyed on `object_id`, so even a double-merge under
+    // read-after-write lag stays exact).
+    let committed_rows = manifest_commit_row_count(uri).await;
+    assert_eq!(
+        committed_rows,
+        fixture.all_ids.len(),
+        "concurrent backfills converge to exactly the fixture's commit rows",
+    );
+    // The stamp ended up at v4.
+    {
+        let reopened = open_manifest_dataset(uri, None).await.unwrap();
+        assert_eq!(
+            read_stamp(&reopened),
+            INTERNAL_MANIFEST_SCHEMA_VERSION,
+            "both runners leave the manifest stamped at the current version",
+        );
+    }
+}
+
+// ── RFC-013 Phase 7 / step 5: the `graph_head` concurrency gate ──────────────
+//
+// Two (or N) writers committing DISJOINT tables on the same branch still share
+// one mutable `graph_head:main` row (one `object_id`, `WhenMatched::UpdateAll`).
+// Their table-version rows never collide (distinct `object_id`s), so the *only*
+// row-level CAS contention is on `graph_head:main`. The contract under test:
+// exactly one writer wins each CAS round; the loser retries, re-resolves its
+// parent off the freshly-advanced head (inside the publisher's retry loop), and
+// re-commits — so every writer commits and the resulting graph_commit DAG is a
+// single LINEAR chain (no fork), not a tree. This is the cross-process
+// disjoint-table fork closed by the shared head row (invariants.md §7.1).
+
+/// Current wall-clock time as microseconds since the UNIX epoch, for stamping a
+/// `LineageIntent` in the same unit as the genesis / commit-graph `created_at`.
+/// Panics if the system clock reads earlier than the epoch.
+fn lineage_now_micros() -> i64 {
+    let since_epoch = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap();
+    since_epoch.as_micros() as i64
+}
+
+/// Append one row to a two-column NODE table (`id`, `name`) and return the
+/// resulting `SubTableUpdate` at the new on-disk version.
+/// Generalizes `append_person_and_make_update` to any node table whose schema is
+/// `(id: String, name: String[, ...])`: every column after the first two (e.g.
+/// `Person.age`) is filled with a single null, so the same helper drives both
+/// `node:Person` and `node:Company`.
+///
+/// Opens the table at `{uri}/{entry.table_path}`, appends the row, and builds the
+/// update from the dataset version observed after the append. Panics (`unwrap`)
+/// on any storage or schema error — this is a test helper.
+async fn append_node_row_and_make_update(
+    uri: &str,
+    entry: &SubTableEntry,
+    id: &str,
+    name: &str,
+) -> SubTableUpdate {
+    let mut ds = Dataset::open(&format!("{}/{}", uri, entry.table_path))
+        .await
+        .unwrap();
+    // Convert the table schema once; the record batch and the reader share it.
+    let schema: Arc<Schema> = Arc::new(ds.schema().into());
+    // Columns 0/1 are (id, name); any further column is filled with one null.
+    // The element type is spelled out because the `vec!` below mixes concrete
+    // `Arc<StringArray>` values with the trait objects pushed by the loop.
+    let mut columns: Vec<Arc<dyn arrow_array::Array>> = vec![
+        Arc::new(StringArray::from(vec![id.to_string()])),
+        Arc::new(StringArray::from(vec![name.to_string()])),
+    ];
+    for field in schema.fields().iter().skip(2) {
+        columns.push(arrow_array::new_null_array(field.data_type(), 1));
+    }
+    let row = RecordBatch::try_new(Arc::clone(&schema), columns).unwrap();
+    let reader = RecordBatchIterator::new(vec![Ok(row)], schema);
+    ds.append(reader, None).await.unwrap();
+    // Read the version only after the append so the update points at the new row.
+    let new_version = ds.version().version;
+    let version_metadata =
+        table_version_metadata_for_state(uri, &entry.table_path, None, new_version)
+            .await
+            .unwrap();
+    SubTableUpdate {
+        table_key: entry.table_key.clone(),
+        table_version: new_version,
+        table_branch: None,
+        row_count: 1,
+        version_metadata,
+    }
+}
+
+/// Read the `graph_commit` lineage rows from `__manifest` at main and assert
+/// they form a single LINEAR chain of `expected_total` commits (one genesis +
+/// the rest), with no fork. Returns the head commit id.
+///
+/// "Linear, not a fork" is proven structurally: (1) exactly one parentless
+/// genesis; (2) no two commits share a `parent_commit_id` (a fork would have two
+/// children off one parent); (3) every commit except the unique head is the
+/// parent of exactly one other commit — so the parent pointers form one path
+/// that visits all commits.
+/// (1)+(2)+(3) over a connected set is a single chain.
+async fn assert_linear_chain(uri: &str, expected_total: usize) -> String {
+    use std::collections::HashSet;
+
+    let manifest = open_manifest_dataset(uri, None).await.unwrap();
+    let (rows, _heads) = read_graph_lineage(&manifest).await.unwrap();
+    assert_eq!(
+        rows.len(),
+        expected_total,
+        "expected {expected_total} graph_commit rows (genesis + the concurrent commits), got {}",
+        rows.len(),
+    );
+
+    // (1) Exactly one commit is parentless: the genesis.
+    let genesis_count = rows
+        .iter()
+        .filter(|row| row.parent_commit_id.is_none())
+        .count();
+    assert_eq!(
+        genesis_count,
+        1,
+        "exactly one parentless genesis commit in a linear chain, got {}",
+        genesis_count,
+    );
+
+    // (2) No parent id is referenced twice — a repeated parent would be a fork.
+    //     Collapsing the parent list into a set must not lose any entry.
+    let parent_refs: Vec<&str> = rows
+        .iter()
+        .filter_map(|row| row.parent_commit_id.as_deref())
+        .collect();
+    let parent_set: HashSet<&str> = parent_refs.iter().copied().collect();
+    assert_eq!(
+        parent_set.len(),
+        parent_refs.len(),
+        "two commits share a parent_commit_id — the DAG forked instead of forming a linear chain",
+    );
+
+    // (3) The head (the `should_replace_head` winner) is the one commit nobody
+    //     names as a parent; every other commit is some commit's parent.
+    let head = super::state::head_lineage_row(&rows).expect("a non-empty lineage has a head");
+    let ids: HashSet<&str> = rows
+        .iter()
+        .map(|row| row.graph_commit_id.as_str())
+        .collect();
+    let non_parents: Vec<&str> = ids.difference(&parent_set).copied().collect();
+    assert_eq!(
+        non_parents,
+        vec![head.graph_commit_id.as_str()],
+        "the only commit that is no one's parent must be the head — a fork or break leaves others",
+    );
+    // Connectedness: every referenced parent is itself a commit in the set.
+    for parent in &parent_set {
+        assert!(
+            ids.contains(parent),
+            "parent {parent} must be a known commit in the chain",
+        );
+    }
+
+    head.graph_commit_id.clone()
+}
+
+/// Test A (deterministic, the must-have): two writers publish two DISJOINT table
+/// updates with two distinct `LineageIntent`s under `tokio::join!`. BOTH commit
+/// (the loser retries on the `graph_head:main` CAS conflict and re-parents off
+/// the winner), and the on-disk graph_commit DAG is one linear chain
+/// genesis → c → c'.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn concurrent_disjoint_writes_share_head_and_form_linear_chain() {
+    let tmp = tempfile::tempdir().unwrap();
+    let uri = tmp.path().to_str().unwrap();
+    let catalog = build_test_catalog();
+    let mc = ManifestCoordinator::init(uri, &catalog).await.unwrap();
+    let snap = mc.snapshot();
+    let person_entry = snap.entry("node:Person").unwrap().clone();
+    let company_entry = snap.entry("node:Company").unwrap().clone();
+
+    // The two table-version rows (`node:Person@v=2`, `node:Company@v=2`) are
+    // DISJOINT: distinct `object_id`s, so neither hits the table-version CAS.
+    // The ONLY row both writers merge is `graph_head:main`.
+    let update_a = append_node_row_and_make_update(uri, &person_entry, "p1", "Alice").await;
+    let update_b = append_node_row_and_make_update(uri, &company_entry, "c1", "Acme").await;
+
+    let publisher_a = GraphNamespacePublisher::new(uri, None);
+    let publisher_b = GraphNamespacePublisher::new(uri, None);
+    let changes_a = vec![ManifestChange::Update(update_a)];
+    let changes_b = vec![ManifestChange::Update(update_b)];
+    // Every writer mints one stable commit id up front; the parent is re-resolved
+    // per attempt inside the publisher.
+    let mint_intent = |actor: &str| LineageIntent {
+        graph_commit_id: ulid::Ulid::new().to_string(),
+        branch: None,
+        actor_id: Some(actor.to_string()),
+        merged_parent_commit_id: None,
+        created_at: lineage_now_micros(),
+    };
+    let intent_a = mint_intent("act-a");
+    let intent_b = mint_intent("act-b");
+    // No expected versions: the writers are disjoint, so neither asserts a version
+    // on the other's table and the contention is purely the shared head row.
+    let empty = HashMap::new();
+    let writer_a = async { publisher_a.publish(&changes_a, &empty, Some(&intent_a)).await };
+    let writer_b = async { publisher_b.publish(&changes_b, &empty, Some(&intent_b)).await };
+    let (res_a, res_b) = tokio::join!(writer_a, writer_b);
+
+    // BOTH commit: with disjoint tables the head-row CAS loser retries within
+    // PUBLISHER_RETRY_BUDGET, re-resolves its parent off the winner, and lands.
+    res_a.expect("writer A must commit");
+    res_b.expect("writer B must commit");
+
+    // End state (the on-disk DAG is fixed once both committed): one linear chain
+    // genesis → first → second, no fork. Both minted ids are present and their
+    // parents chain (one off genesis, the other off the first), so no two commits
+    // share a parent.
+    let head = assert_linear_chain(uri, 3).await;
+    let minted_ids = [
+        intent_a.graph_commit_id.as_str(),
+        intent_b.graph_commit_id.as_str(),
+    ];
+    assert!(
+        minted_ids.contains(&head.as_str()),
+        "the head must be one of the two concurrent commits",
+    );
+    // Both table writes are visible after a reopen (Person and Company advanced).
+    let reopened = ManifestCoordinator::open(uri).await.unwrap();
+    let after = reopened.snapshot();
+    for table_key in ["node:Person", "node:Company"] {
+        assert_eq!(after.entry(table_key).unwrap().table_version, 2);
+    }
+}
+
+/// Test C (S3 variant, bucket-gated): the same two-disjoint-writers +
+/// `LineageIntent` race as Test A, run against a real object store so the
+/// one-winner behaviour exercises the genuine conditional-put CAS on `__manifest`
+/// rather than the local content-token emulation. Skips with a log when
+/// `OMNIGRAPH_S3_TEST_BUCKET` is unset (the `tests/s3_storage.rs` gate); the
+/// rustfs CI job sets it. Asserts the same end state: both commit, single linear
+/// chain.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn concurrent_disjoint_writes_form_linear_chain_on_s3() {
+    let bucket = match std::env::var("OMNIGRAPH_S3_TEST_BUCKET") {
+        Ok(bucket) => bucket,
+        Err(_) => {
+            eprintln!(
+                "SKIP concurrent_disjoint_writes_form_linear_chain_on_s3: \
+                 OMNIGRAPH_S3_TEST_BUCKET unset — the S3 lineage-CAS gate needs an object store"
+            );
+            return;
+        }
+    };
+    // A per-process, per-run prefix keeps repeated and parallel runs apart.
+    let uri = format!(
+        "s3://{bucket}/lineage-concurrency/{}-{}",
+        std::process::id(),
+        ulid::Ulid::new()
+    );
+
+    let catalog = build_test_catalog();
+    let mc = ManifestCoordinator::init(&uri, &catalog).await.unwrap();
+    let snap = mc.snapshot();
+    let person_entry = snap.entry("node:Person").unwrap().clone();
+    let company_entry = snap.entry("node:Company").unwrap().clone();
+
+    let update_a = append_node_row_and_make_update(&uri, &person_entry, "p1", "Alice").await;
+    let update_b = append_node_row_and_make_update(&uri, &company_entry, "c1", "Acme").await;
+
+    let publisher_a = GraphNamespacePublisher::new(&uri, None);
+    let publisher_b = GraphNamespacePublisher::new(&uri, None);
+    let changes_a = vec![ManifestChange::Update(update_a)];
+    let changes_b = vec![ManifestChange::Update(update_b)];
+    let mint_intent = |actor: &str| LineageIntent {
+        graph_commit_id: ulid::Ulid::new().to_string(),
+        branch: None,
+        actor_id: Some(actor.to_string()),
+        merged_parent_commit_id: None,
+        created_at: lineage_now_micros(),
+    };
+    let intent_a = mint_intent("act-a");
+    let intent_b = mint_intent("act-b");
+    let empty = HashMap::new();
+    let writer_a = async { publisher_a.publish(&changes_a, &empty, Some(&intent_a)).await };
+    let writer_b = async { publisher_b.publish(&changes_b, &empty, Some(&intent_b)).await };
+    let (res_a, res_b) = tokio::join!(writer_a, writer_b);
+    res_a.expect("writer A must commit on S3");
+    res_b.expect("writer B must commit on S3");
+
+    let head = assert_linear_chain(&uri, 3).await;
+    let minted_ids = [
+        intent_a.graph_commit_id.as_str(),
+        intent_b.graph_commit_id.as_str(),
+    ];
+    assert!(
+        minted_ids.contains(&head.as_str()),
+        "the head must be one of the two concurrent commits",
+    );
+}
+
+/// Test B (bounded-retry convergence, scaled): N=8 same-branch writers, each
+/// touching a DISJOINT table-version row + its own `LineageIntent`, each wrapped
+/// in an APP-LEVEL retry loop. `PUBLISHER_RETRY_BUDGET=5` means the later writers
+/// can exhaust the internal budget under contention, so the app loop re-submits
+/// on a typed `Conflict` / row-level-CAS-contention error. All 8 eventually
+/// commit and the final DAG is a single linear chain of 8 (+genesis), no fork,
+/// no lost commit.
+#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+async fn n_concurrent_disjoint_writers_converge_to_one_linear_chain() {
+    use crate::error::ManifestConflictDetails;
+    use crate::error::ManifestErrorKind;
+
+    const N: usize = 8;
+
+    let dir = tempfile::tempdir().unwrap();
+    let uri = dir.path().to_str().unwrap();
+    let catalog = build_test_catalog();
+    let mc = ManifestCoordinator::init(uri, &catalog).await.unwrap();
+    let snap = mc.snapshot();
+    let person_entry = snap.entry("node:Person").unwrap().clone();
+    let company_entry = snap.entry("node:Company").unwrap().clone();
+
+    // Synthesize N=8 DISJOINT table-version updates by sequentially advancing the
+    // two node tables four versions each (Person@v2..v5, Company@v2..v5). Each
+    // update is a distinct `object_id`, so the writers never collide on a
+    // table-version row — only on the shared `graph_head:main`. Built serially
+    // here (before the concurrent phase) so the on-disk versions exist.
+    // The element type is explicit: nothing before the first `push` pins it.
+    let mut updates: Vec<SubTableUpdate> = Vec::with_capacity(N);
+    for i in 0..(N / 2) {
+        updates.push(
+            append_node_row_and_make_update(uri, &person_entry, &format!("p{i}"), &format!("P{i}"))
+                .await,
+        );
+        updates.push(
+            append_node_row_and_make_update(uri, &company_entry, &format!("c{i}"), &format!("C{i}"))
+                .await,
+        );
+    }
+    assert_eq!(updates.len(), N);
+
+    // Each writer: its own publisher + its own commit id + an app-level retry loop
+    // re-submitting on a typed Conflict (the publisher's internal budget can be
+    // exhausted by the later contenders, so convergence relies on the app retry).
+    // The URI is cloned per task because `tokio::spawn` needs owned data.
+    let uri_owned = uri.to_string();
+    let mut handles = Vec::with_capacity(N);
+    for update in updates {
+        let uri = uri_owned.clone();
+        handles.push(tokio::spawn(async move {
+            // Minted once per writer and reused across retries, so a re-submit
+            // republishes the SAME commit id instead of inventing a new one.
+            let commit_id = ulid::Ulid::new().to_string();
+            let changes = vec![ManifestChange::Update(update)];
+            let empty = HashMap::new();
+            // Bounded app-level retry: re-submit on a Conflict-kind manifest error
+            // (the only retryable outcome here is losing the shared-head CAS).
+            for _attempt in 0..64 {
+                let intent = LineageIntent {
+                    graph_commit_id: commit_id.clone(),
+                    branch: None,
+                    actor_id: None,
+                    merged_parent_commit_id: None,
+                    created_at: lineage_now_micros(),
+                };
+                let publisher = GraphNamespacePublisher::new(&uri, None);
+                match publisher.publish(&changes, &empty, Some(&intent)).await {
+                    Ok(_) => return commit_id,
+                    Err(OmniError::Manifest(m))
+                        if matches!(m.kind, ManifestErrorKind::Conflict)
+                            && matches!(
+                                m.details,
+                                Some(ManifestConflictDetails::RowLevelCasContention)
+                            ) =>
+                    {
+                        // lost the shared-head CAS after exhausting the internal
+                        // budget — re-resolve parent + re-submit.
+                        continue;
+                    }
+                    // Any other error is a real failure, not contention.
+                    Err(other) => panic!("non-retryable publish error: {other:?}"),
+                }
+            }
+            panic!("writer for commit {commit_id} did not converge within the app-retry budget");
+        }));
+    }
+
+    let mut committed_ids = Vec::with_capacity(N);
+    for handle in handles {
+        committed_ids.push(handle.await.unwrap());
+    }
+    // All 8 distinct writer ids committed (no lost commit, no duplicate id).
+    committed_ids.sort();
+    committed_ids.dedup();
+    assert_eq!(committed_ids.len(), N, "every writer must commit exactly once");
+
+    // The final DAG is a single linear chain of genesis + 8 = 9, no fork.
+    assert_linear_chain(uri, N + 1).await;
+}
diff --git a/crates/omnigraph/src/db/mod.rs b/crates/omnigraph/src/db/mod.rs
index f382908..2ce3e29 100644
--- a/crates/omnigraph/src/db/mod.rs
+++ b/crates/omnigraph/src/db/mod.rs
@@ -10,12 +10,15 @@ pub use commit_graph::GraphCommit;
 pub use graph_coordinator::{GraphCoordinator, ReadTarget, ResolvedTarget, SnapshotId};
 pub use manifest::{Snapshot, SubTableEntry, SubTableUpdate};
 pub(crate) use omnigraph::ensure_public_branch_ref;
+pub(crate) use omnigraph::WriteTxn;
 pub use omnigraph::{
     CleanupPolicyOptions, InitOptions, MergeOutcome, Omnigraph, OpenMode, PendingIndex,
     RepairAction, RepairClassification, RepairOptions, RepairStats, SchemaApplyOptions,
     SchemaApplyResult, SkipReason, TableCleanupStats, TableOptimizeStats, TableRepairStats,
 };
 
+use crate::error::{OmniError, Result};
+
 pub(crate) const SCHEMA_APPLY_LOCK_BRANCH: &str = "__schema_apply_lock__";
 
 /// Mutation kind, threaded through the version-check call sites so the
@@ -73,3 +76,14 @@ pub(crate) fn is_internal_system_branch(name: &str) -> bool {
     // only internal branch the engine still creates is the schema-apply lock.
     is_schema_apply_lock_branch(name)
 }
+
+/// Microseconds since the UNIX epoch — the `created_at` stamp threaded through
+/// every graph-lineage / recovery-audit / commit-graph row. One canonical
+/// helper so the clock-error mapping (variant + message) cannot drift across
+/// the call sites that record those timestamps.
+pub(crate) fn now_micros() -> Result<i64> {
+    let duration = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH) // Err only if the clock reads before the epoch
+        .map_err(|e| OmniError::manifest(format!("system clock before UNIX_EPOCH: {e}")))?;
+    Ok(duration.as_micros() as i64) // u128 → i64: lossless until roughly year 294,000
+}
diff --git a/crates/omnigraph/src/db/omnigraph.rs b/crates/omnigraph/src/db/omnigraph.rs
index e1d7acf..4a770f0 100644
--- a/crates/omnigraph/src/db/omnigraph.rs
+++ b/crates/omnigraph/src/db/omnigraph.rs
@@ -41,6 +41,7 @@ pub use repair::{
 };
 pub use schema_apply::SchemaApplyOptions;
 pub use table_ops::PendingIndex;
+pub(crate) use table_ops::OpenedForMutation;
 
 use super::commit_graph::GraphCommit;
 use super::manifest::{
@@ -79,6 +80,35 @@ pub struct SchemaApplyPreview {
     pub catalog: Catalog,
 }
 
+/// A capture-once write transaction (RFC-013 step 3b). Pins the operation's read
+/// base ONCE so the per-table opens reuse the pinned version instead of
+/// re-resolving / re-validating per table. The schema contract is validated once
+/// (when `base` is captured). NOT a general "no re-resolution" handle — the
+/// commit-time OCC re-read, the live-HEAD drift probe, and the fork-authority reads
+/// stay fresh (correctness machinery). Step 5 (PublishPlan unification) makes this
+/// the non-optional publish carrier and adds session-aware base opens there, gated
+/// by an S3 cost test — the warm-session benefit on the single remaining open is an
+/// object-store phenomenon, so it earns its own gate rather than riding this PR.
+///
+/// Threaded as `Option<&WriteTxn>` through the mutate/load write chain
+/// (`open_for_mutation_on_branch`, `commit_all`, `commit_updates_on_branch_with_expected`)
+/// so a single write validates the schema contract EXACTLY ONCE — at capture. When
+/// present, the per-table resolves source the pinned `base` entry instead of calling
+/// `resolved_branch_target` / `snapshot_for_branch` / `fresh_snapshot_for_branch`
+/// (each of which re-runs `ensure_schema_state_valid`).
+/// When absent (`None` — every
+/// non-mutate/load caller), every threaded function behaves byte-identically to
+/// before. The carrier never removes a version guard or changes which dataset version
+/// the per-table open targets: strict ops keep `open_dataset_head_for_write` +
+/// `ensure_expected_version`, and the commit-time OCC re-read still opens a fresh
+/// manifest snapshot (via `fresh_snapshot_for_branch_unchecked`) — only the redundant
+/// schema re-validation is dropped.
+pub(crate) struct WriteTxn {
+    /// The resolved branch (`None` = main).
+    pub(crate) branch: Option<String>,
+    /// The pinned base snapshot (per-table location + version + e_tag), captured once.
+    pub(crate) base: Snapshot,
+}
+
 /// Top-level handle to an Omnigraph database.
 ///
 /// An Omnigraph is a Lance-native graph database with git-style branching.
@@ -93,12 +123,12 @@ pub struct Omnigraph {
     /// calls without a global write lock). Reads (`snapshot`, `version`,
     /// `current_branch`, `branch_list`, `resolve_*`, `head_commit_id`,
     /// `list_commits`, …) acquire `.read().await` and parallelize.
-    /// Writes (`refresh`, `branch_create`, `branch_delete`, `commit_*`,
-    /// `record_*`) acquire `.write().await` and serialize. The atomic
-    /// commit invariant — `commit_manifest_updates` followed by
-    /// `record_graph_commit` must be atomic — is preserved by the
-    /// single `.write()` covering both calls inside
-    /// `commit_updates_with_actor_with_expected`. PR 2 Phase 2
+    /// Writes (`refresh`, `branch_create`, `branch_delete`, `commit_*`)
+    /// acquire `.write().await` and serialize. The atomic commit invariant —
+    /// table-version rows and the graph commit are one unit — holds by
+    /// construction since RFC-013 Phase 7: both ride a SINGLE manifest publish
+    /// CAS (`commit_changes_with_lineage`), so there is no two-write window to
+    /// keep atomic.
PR 2 Phase 2 /// converted from `Mutex` to `RwLock` because the bench showed /// the Mutex was the dominant serializer for disjoint-table /// workloads. Lock acquisition order: always before `runtime_cache` @@ -287,7 +317,7 @@ impl Omnigraph { { return Err(OmniError::AlreadyInitialized { uri: root.clone() }); } - if let Err(err) = crate::failpoints::maybe_fail("init.after_schema_pg_written") { + if let Err(err) = crate::failpoints::maybe_fail(crate::failpoints::names::INIT_AFTER_SCHEMA_PG_WRITTEN) { best_effort_cleanup_init_artifacts(&root, storage.as_ref()).await; return Err(err); } @@ -387,6 +417,14 @@ impl Omnigraph { // first read-write open (an accepted, documented limitation). if matches!(mode, OpenMode::ReadWrite) { crate::db::manifest::migrate_on_open(&root).await?; + } else { + // A read-only open skips `migrate_on_open` (no object-store writes), + // which is where the version refusal otherwise lives. Still refuse a + // `__manifest` stamped outside this binary's supported range — newer + // than CURRENT (an old binary cannot silently misread a newer graph, + // e.g. one folded to internal-schema v4 lineage), or below + // MIN_SUPPORTED (predates the readers we carry). Read-only, no write. + crate::db::manifest::refuse_if_internal_schema_unsupported(&root).await?; } // Open the coordinator first so the schema-staging recovery sweep can // compare its snapshot against any leftover staging files. @@ -736,6 +774,29 @@ impl Omnigraph { *self.coordinator.write().await = coordinator; } + /// Open a capture-once write transaction (RFC-013 step 3b): validate the schema + /// contract ONCE and pin the base snapshot. The per-table opens take + /// `Option<&WriteTxn>` and, on the bound branch for the non-strict (Insert/Merge) + /// path, source the pinned base entry — instead of re-resolving (re-validating the + /// schema) per table. 
Strict ops, the fork path, and the commit-time OCC re-read + /// keep their fresh reads (those are correctness machinery — see the handoff doc). + /// + /// "Once" covers the table-touch hot path captured here (proven by the node-insert + /// gate `write_validates_schema_contract_once`); it does NOT yet cover edge endpoint + /// / cardinality RI validation (`ensure_node_id_exists`, the loader's RI/cardinality), + /// which still resolve through `snapshot_for_branch` and re-validate. Those reads must + /// observe LIVE committed state, so unifying them (validate-once + pinned + re-checked + /// read-set) is step 4's §7.1 work — threading `txn.base` there would re-introduce the + /// stale-read class the #298 cardinality fix removed. A session-aware base open is + /// likewise deferred to step 5 (handoff §1d). + pub(crate) async fn open_write_txn(&self, branch: Option<&str>) -> Result { + let resolved = self.resolved_branch_target(branch).await?; + Ok(WriteTxn { + branch: resolved.branch, + base: resolved.snapshot, + }) + } + pub(crate) async fn resolved_branch_target( &self, branch: Option<&str>, @@ -770,12 +831,39 @@ impl Omnigraph { pub(crate) async fn fresh_snapshot_for_branch(&self, branch: Option<&str>) -> Result { self.ensure_schema_state_valid().await?; - let requested = ReadTarget::Branch(branch.unwrap_or("main").to_string()); - let coord = self.coordinator.read().await; - coord - .resolve_target(&requested) - .await - .map(|resolved| resolved.snapshot) + self.fresh_snapshot_for_branch_unchecked(branch).await + } + + /// Fresh per-branch manifest snapshot WITHOUT the schema-contract + /// re-validation. Identical OCC freshness to [`fresh_snapshot_for_branch`] + /// — a fresh manifest re-read from storage, never the warm cache — only the + /// redundant `ensure_schema_state_valid` is dropped. 
Used inside a single + /// write once a `WriteTxn` has already validated the contract at capture: the + /// commit-time drift re-read needs the live manifest, not a second contract + /// read. Callers with no `WriteTxn` MUST use the checked variant. + /// + /// Reads the manifest directly via `ManifestCoordinator` rather than + /// `resolve_target`. The OCC re-read uses only the returned `Snapshot` + /// (per-table location + version), which `ManifestCoordinator::open().snapshot()` + /// produces identically to `GraphCoordinator::open(...).snapshot()` — but + /// `resolve_target` additionally opens the commit graph (an extra + /// `_graph_commits.lance` probe) the OCC read never consults. Skipping that + /// load is a pure read-cost reduction, not a freshness change. The checked + /// `fresh_snapshot_for_branch` delegates here, so its no-`txn` callers + /// (commit_all's None arm, optimize, repair, fork reclaim) get the same + /// identical `Snapshot` via this lighter manifest-only read; they consume + /// only the snapshot and never relied on the commit-graph side load. + pub(crate) async fn fresh_snapshot_for_branch_unchecked( + &self, + branch: Option<&str>, + ) -> Result { + let manifest = match branch { + Some(branch) => { + crate::db::manifest::ManifestCoordinator::open_at_branch(self.uri(), branch).await? 
+ } + None => crate::db::manifest::ManifestCoordinator::open(self.uri()).await?, + }; + Ok(manifest.snapshot()) } pub(crate) async fn version(&self) -> u64 { @@ -1367,7 +1455,7 @@ impl Omnigraph { for (table_key, table_path) in cleanup_targets { let dataset_uri = self.storage().dataset_uri(&table_path); - let outcome = match crate::failpoints::maybe_fail("branch_delete.before_table_cleanup") + let outcome = match crate::failpoints::maybe_fail(crate::failpoints::names::BRANCH_DELETE_BEFORE_TABLE_CLEANUP) { Ok(()) => { self.storage() @@ -1599,7 +1687,7 @@ impl Omnigraph { &self, table_key: &str, op_kind: crate::db::MutationOpKind, - ) -> Result<(SnapshotHandle, String, Option)> { + ) -> Result { table_ops::open_for_mutation(self, table_key, op_kind).await } @@ -1608,8 +1696,9 @@ impl Omnigraph { branch: Option<&str>, table_key: &str, op_kind: crate::db::MutationOpKind, - ) -> Result<(SnapshotHandle, String, Option)> { - table_ops::open_for_mutation_on_branch(self, branch, table_key, op_kind).await + txn: Option<&crate::db::WriteTxn>, + ) -> Result { + table_ops::open_for_mutation_on_branch(self, branch, table_key, op_kind, txn).await } /// Fork `table_key` onto `active_branch` from the given source state, @@ -1698,28 +1787,17 @@ impl Omnigraph { table_ops::commit_updates(self, updates).await } - pub(crate) async fn commit_manifest_updates( + /// Publish a branch merge: the merged table `updates` and the merge commit + /// in one manifest CAS (RFC-013 Phase 7). The merge commit's merged-in parent + /// is `merged_parent_commit_id` (the source head); its first parent is the + /// live target-branch head, resolved by the publisher. 
+ pub(crate) async fn commit_merge_with_actor( &self, updates: &[crate::db::SubTableUpdate], - ) -> Result { - table_ops::commit_manifest_updates(self, updates).await - } - - pub(crate) async fn record_merge_commit( - &self, - manifest_version: u64, - parent_commit_id: &str, merged_parent_commit_id: &str, actor_id: Option<&str>, ) -> Result { - table_ops::record_merge_commit( - self, - manifest_version, - parent_commit_id, - merged_parent_commit_id, - actor_id, - ) - .await + table_ops::commit_merge_with_actor(self, updates, merged_parent_commit_id, actor_id).await } pub(crate) async fn commit_updates_on_branch_with_expected( @@ -1728,6 +1806,8 @@ impl Omnigraph { updates: &[crate::db::SubTableUpdate], expected_table_versions: &std::collections::HashMap, actor_id: Option<&str>, + txn: Option<&crate::db::WriteTxn>, + committed_handles: std::collections::HashMap, ) -> Result { table_ops::commit_updates_on_branch_with_expected( self, @@ -1735,6 +1815,8 @@ impl Omnigraph { updates, expected_table_versions, actor_id, + txn, + committed_handles, ) .await } @@ -1939,14 +2021,14 @@ async fn init_storage_phase( if write_schema_pg { let schema_path = join_uri(root, SCHEMA_SOURCE_FILENAME); storage.write_text(&schema_path, schema_source).await?; - crate::failpoints::maybe_fail("init.after_schema_pg_written")?; + crate::failpoints::maybe_fail(crate::failpoints::names::INIT_AFTER_SCHEMA_PG_WRITTEN)?; } write_schema_contract(root, storage.as_ref(), schema_ir).await?; - crate::failpoints::maybe_fail("init.after_schema_contract_written")?; + crate::failpoints::maybe_fail(crate::failpoints::names::INIT_AFTER_SCHEMA_CONTRACT_WRITTEN)?; let coordinator = GraphCoordinator::init(root, catalog, Arc::clone(storage)).await?; - crate::failpoints::maybe_fail("init.after_coordinator_init")?; + crate::failpoints::maybe_fail(crate::failpoints::names::INIT_AFTER_COORDINATOR_INIT)?; Ok(coordinator) } @@ -2466,10 +2548,13 @@ edge WorksAt: Person -> Company } async fn seed_person_row(db: &mut 
Omnigraph, name: &str, age: Option) { + // No-txn entry, so the handle is always `Some` (collapse #1's skip is + // gated on `txn.is_some()`). let (ds, full_path, table_branch) = db .open_for_mutation("node:Person", crate::db::MutationOpKind::Insert) .await - .unwrap(); + .unwrap() + .require_handle("seed_person_row test"); let schema: Arc = Arc::new(ds.dataset().schema().into()); let columns: Vec> = schema .fields() diff --git a/crates/omnigraph/src/db/omnigraph/optimize.rs b/crates/omnigraph/src/db/omnigraph/optimize.rs index e3aed3d..bae0c88 100644 --- a/crates/omnigraph/src/db/omnigraph/optimize.rs +++ b/crates/omnigraph/src/db/omnigraph/optimize.rs @@ -512,7 +512,7 @@ async fn optimize_one_table( // Test seam: a concurrent (cross-process) writer can interleave here, before // any Phase-B commit lands, to exercise the reopen+replan path. - crate::failpoints::maybe_fail("optimize.before_compact")?; + crate::failpoints::maybe_fail(crate::failpoints::names::OPTIMIZE_BEFORE_COMPACT)?; // Phase B: scrub stale auto_cleanup (keeps optimize non-destructive on a // graph upgraded from a pre-v7 binary whose `compact_files`/`optimize_indices` @@ -549,7 +549,7 @@ async fn optimize_one_table( // committed (so HEAD is already ahead of the manifest from our own work), // exercising the own-HEAD (not external) drift classification on the next // reopened attempt. - if crate::failpoints::maybe_fail("optimize.inject_reindex_conflict").is_err() + if crate::failpoints::maybe_fail(crate::failpoints::names::OPTIMIZE_INJECT_REINDEX_CONFLICT).is_err() && attempt < COMPACTION_RETRY_BUDGET { continue; @@ -584,7 +584,7 @@ async fn optimize_one_table( // Pin the per-writer Phase B → Phase C residual: Lance HEAD has advanced but the // manifest publish below hasn't run. 
- crate::failpoints::maybe_fail("optimize.post_phase_b_pre_manifest_commit")?; + crate::failpoints::maybe_fail(crate::failpoints::names::OPTIMIZE_POST_PHASE_B_PRE_MANIFEST_COMMIT)?; // Phase C: monotonic fast-forward publish. The compaction is committed at Lance // HEAD `N`; publish a manifest pointer that includes it. If a concurrent writer @@ -921,7 +921,7 @@ pub async fn cleanup_all_tables( let results: Vec = futures::stream::iter(table_tasks.into_iter()) .map(|(table_key, full_path)| async move { let outcome: Result = async { - crate::failpoints::maybe_fail("cleanup.table_gc")?; + crate::failpoints::maybe_fail(crate::failpoints::names::CLEANUP_TABLE_GC)?; // `cleanup_old_versions` is a Lance-only maintenance API not // surfaced through `TableStorage` — see the optimize path // above for the same rationale. Unwrap via `into_dataset()`. @@ -1079,7 +1079,7 @@ pub async fn reconcile_orphaned_branches(db: &Omnigraph) -> Result db.snapshot_for_branch(Some(&branch)).await, Err(injected) => Err(injected), }; @@ -1158,7 +1158,7 @@ pub async fn reconcile_orphaned_branches(db: &Omnigraph) -> Result storage.force_delete_branch(&full_path, &branch).await, Err(injected) => Err(injected), }; @@ -1308,7 +1308,10 @@ mod tests { ds.create_branch("feature", base, None).await.unwrap(); } - let _fp = ScopedFailPoint::new("cleanup.resolve_branch_snapshot", "return"); + let _fp = ScopedFailPoint::new( + crate::failpoints::names::CLEANUP_RESOLVE_BRANCH_SNAPSHOT, + "return", + ); let stats = reconcile_orphaned_branches(&db).await.unwrap(); assert_eq!( diff --git a/crates/omnigraph/src/db/omnigraph/schema_apply.rs b/crates/omnigraph/src/db/omnigraph/schema_apply.rs index 3089641..364f5a4 100644 --- a/crates/omnigraph/src/db/omnigraph/schema_apply.rs +++ b/crates/omnigraph/src/db/omnigraph/schema_apply.rs @@ -648,7 +648,7 @@ where // `recover_schema_state_files`: // - crash before commit → manifest unchanged; staging deleted on open // - crash after commit → manifest advanced; staging 
renamed on open - crate::failpoints::maybe_fail("schema_apply.before_staging_write")?; + crate::failpoints::maybe_fail(crate::failpoints::names::SCHEMA_APPLY_BEFORE_STAGING_WRITE)?; let staging_pg_uri = schema_source_staging_uri(&db.root_uri); db.storage @@ -656,7 +656,7 @@ where .await?; write_schema_contract_staging(&db.root_uri, db.storage.as_ref(), &desired_ir).await?; - crate::failpoints::maybe_fail("schema_apply.after_staging_write")?; + crate::failpoints::maybe_fail(crate::failpoints::names::SCHEMA_APPLY_AFTER_STAGING_WRITE)?; // `apply_schema` doesn't currently take an actor; system-attributed. let PublishedSnapshot { @@ -669,7 +669,7 @@ where .commit_changes_with_actor(&manifest_changes, None) .await?; - crate::failpoints::maybe_fail("schema_apply.after_manifest_commit")?; + crate::failpoints::maybe_fail(crate::failpoints::names::SCHEMA_APPLY_AFTER_MANIFEST_COMMIT)?; db.storage .rename_text(&staging_pg_uri, &schema_source_uri(&db.root_uri)) diff --git a/crates/omnigraph/src/db/omnigraph/table_ops.rs b/crates/omnigraph/src/db/omnigraph/table_ops.rs index ed5d082..a917150 100644 --- a/crates/omnigraph/src/db/omnigraph/table_ops.rs +++ b/crates/omnigraph/src/db/omnigraph/table_ops.rs @@ -296,7 +296,7 @@ pub(super) async fn ensure_indices_for_branch( // (one commit_staged per index built) but the manifest publish below // hasn't run. Used by // `tests/failpoints.rs::ensure_indices_phase_b_failure_recovered_on_next_open`. - crate::failpoints::maybe_fail("ensure_indices.post_phase_b_pre_manifest_commit")?; + crate::failpoints::maybe_fail(crate::failpoints::names::ENSURE_INDICES_POST_PHASE_B_PRE_MANIFEST_COMMIT)?; if !updates.is_empty() { commit_prepared_updates_on_branch(db, branch, &updates, None).await?; @@ -488,18 +488,52 @@ pub(super) async fn needs_index_work_edge( || !db.storage().has_btree_index(&ds, "dst").await?) } +/// Result of opening a sub-table for mutation. 
`handle` is `None` only when a +/// non-strict (Insert/Merge) op on the WriteTxn's own branch skipped the +/// accumulation open (RFC-013 step 3b collapse #1) — there the caller needs just +/// `expected_version`. It is ALWAYS `Some` for strict ops, the fork path, and +/// every no-`txn` caller (branch merge), which use [`Self::require_handle`]. +#[derive(Debug)] +pub(crate) struct OpenedForMutation { + /// The opened dataset, or `None` on the non-strict-txn open-skip path. + pub(crate) handle: Option, + /// The publisher's CAS fence: the opened handle's version, or — when the open + /// was skipped — the pinned base entry's version (equal absent uncovered drift). + pub(crate) expected_version: u64, + pub(crate) full_path: String, + pub(crate) table_branch: Option, +} + +impl OpenedForMutation { + /// Destructure for a caller that REQUIRES the handle (strict ops, the fork + /// path, every no-`txn` caller). The `None` skip fires solely on the + /// non-strict `txn` path, which these callers are not — so a panic here means + /// a future change broke that contract, named by `ctx`. + pub(crate) fn require_handle(self, ctx: &str) -> (SnapshotHandle, String, Option) { + let handle = self.handle.unwrap_or_else(|| { + panic!("{ctx}: open_for_mutation returned no handle on a path that requires one") + }); + (handle, self.full_path, self.table_branch) + } +} + pub(super) async fn open_for_mutation( db: &Omnigraph, table_key: &str, op_kind: crate::db::MutationOpKind, -) -> Result<(SnapshotHandle, String, Option)> { +) -> Result { let current_branch = db .coordinator .read() .await .current_branch() .map(str::to_string); - open_for_mutation_on_branch(db, current_branch.as_deref(), table_key, op_kind).await + // `open_for_mutation` is the no-txn entry (branch merge). Passing `None` + // keeps the exact pre-WriteTxn code path (a fresh `resolved_branch_target` + // that re-validates the schema). 
With `txn = None` the non-strict early-skip + // in `open_for_mutation_on_branch` never fires, so this always returns a + // `Some(handle)` for its callers. + open_for_mutation_on_branch(db, current_branch.as_deref(), table_key, op_kind, None).await } /// Open a sub-table for mutation. The `op_kind` selects the strict-vs-relaxed @@ -513,15 +547,69 @@ pub(super) async fn open_for_mutation_on_branch( branch: Option<&str>, table_key: &str, op_kind: crate::db::MutationOpKind, -) -> Result<(SnapshotHandle, String, Option)> { + txn: Option<&crate::db::WriteTxn>, +) -> Result { db.ensure_schema_apply_not_locked("write").await?; - let resolved = db.resolved_branch_target(branch).await?; - let entry = resolved - .snapshot + // Source the resolved (snapshot, branch). With a `WriteTxn` the contract was + // validated once at capture, so use the pinned base + resolved branch instead + // of `resolved_branch_target` (which re-runs `ensure_schema_state_valid`). The + // base is the same fresh per-branch manifest read the no-txn path would have + // resolved — only the redundant schema re-validation is dropped. Without a txn + // this is byte-identical to the prior `resolved_branch_target` call. + let (snapshot, resolved_branch) = match txn { + Some(txn) => (txn.base.clone(), txn.branch.clone()), + None => { + let resolved = db.resolved_branch_target(branch).await?; + (resolved.snapshot, resolved.branch) + } + }; + let entry = snapshot .entry(table_key) .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; let full_path = format!("{}/{}", db.root_uri, entry.table_path); - match resolved.branch.as_deref() { + + // Collapse #1 (RFC-013 step 3b): a non-strict op (Insert/Merge) on the txn's + // own branch needs no dataset open for ACCUMULATION — the only thing the + // caller reads from this handle on the non-strict path is `.version()` (the + // publisher's CAS fence), which is exactly the pinned base version. 
The base + // already validated the schema contract once, and the staging reopen + // (`reopen_for_mutation`) plus the publisher CAS in `commit_all` are the real + // drift guards. So skip `open_dataset_head_for_write` entirely and source the + // expected version from the pinned entry. + // + // Gated on `txn.is_some()`: without a txn (branch merge's `open_for_mutation`) + // every arm below is byte-identical to before. STRICT ops (Update/Delete/ + // SchemaRewrite) always open live HEAD + run `ensure_expected_version` + // (read-modify-write SI), and any write that must FORK (the table isn't yet on + // the resolved branch) opens too (the fork is a real Lance state advance the + // manifest snapshot can't substitute for). + if txn.is_some() && !op_kind.strict_pre_stage_version_check() { + match resolved_branch.as_deref() { + // Non-strict, table already on the active branch → no open, no fork. + Some(active_branch) if entry.table_branch.as_deref() == Some(active_branch) => { + return Ok(OpenedForMutation { + handle: None, + expected_version: entry.table_version, + full_path, + table_branch: Some(active_branch.to_string()), + }); + } + // Main branch, non-strict → no open. (Main never forks.) + None => { + return Ok(OpenedForMutation { + handle: None, + expected_version: entry.table_version, + full_path, + table_branch: None, + }); + } + // Non-strict but the table isn't on the active branch yet — falls + // through to fork below. 
+ Some(_) => {} + } + } + + match resolved_branch.as_deref() { None => { let ds = db .storage() @@ -531,7 +619,13 @@ pub(super) async fn open_for_mutation_on_branch( db.storage() .ensure_expected_version(&ds, table_key, entry.table_version)?; } - Ok((ds, full_path, None)) + let version = ds.version(); + Ok(OpenedForMutation { + handle: Some(ds), + expected_version: version, + full_path, + table_branch: None, + }) } Some(active_branch) => { let (ds, table_branch) = open_owned_dataset_for_branch_write( @@ -544,7 +638,13 @@ pub(super) async fn open_for_mutation_on_branch( op_kind, ) .await?; - Ok((ds, full_path, table_branch)) + let version = ds.version(); + Ok(OpenedForMutation { + handle: Some(ds), + expected_version: version, + full_path, + table_branch, + }) } } } @@ -571,7 +671,7 @@ pub(super) async fn open_owned_dataset_for_branch_write( Ok((ds, Some(active_branch.to_string()))) } source_branch => { - crate::failpoints::maybe_fail("fork.before_classify")?; + crate::failpoints::maybe_fail(crate::failpoints::names::FORK_BEFORE_CLASSIFY)?; // Authority check before forking: re-read the live manifest. If this // table is already forked on active_branch, a concurrent first-write // won the race and our snapshot is stale — that is a retryable @@ -667,7 +767,7 @@ pub(crate) async fn classify_fork_ref( // fresh-authority read (no-op without the `failpoints` feature). Lets a // test exercise the Indeterminate path — a read failure on a live branch // must classify as Indeterminate (skip), never Orphan (destroy). 
- let fresh = match crate::failpoints::maybe_fail("classify.fresh_read") { + let fresh = match crate::failpoints::maybe_fail(crate::failpoints::names::CLASSIFY_FRESH_READ) { Ok(()) => db.fresh_snapshot_for_branch(Some(branch)).await, Err(injected) => Err(injected), }; @@ -751,7 +851,7 @@ pub(super) async fn reclaim_orphaned_fork_and_refork( } } - crate::failpoints::maybe_fail("fork.before_reclaim")?; + crate::failpoints::maybe_fail(crate::failpoints::names::FORK_BEFORE_RECLAIM)?; db.storage() .force_delete_branch(full_path, active_branch) .await @@ -1014,7 +1114,7 @@ async fn stage_and_commit_btree( // to demonstrate that a stage-step failure in the staged-index // path (`stage_create_btree_index` succeeded; `commit_staged` not // yet called) leaves no Lance-HEAD drift on the touched table. - crate::failpoints::maybe_fail("ensure_indices.post_stage_pre_commit_btree")?; + crate::failpoints::maybe_fail(crate::failpoints::names::ENSURE_INDICES_POST_STAGE_PRE_COMMIT_BTREE)?; let new_ds = db .storage() .commit_staged(ds.clone(), staged) @@ -1065,12 +1165,30 @@ async fn prepare_updates_for_commit( db: &Omnigraph, branch: Option<&str>, updates: &[crate::db::SubTableUpdate], + txn: Option<&crate::db::WriteTxn>, + // Post-`commit_staged` handles handed out by `StagedMutation::commit_all` + // (RFC-013 step 3b, collapse #4): table_key → the handle already open at + // its just-committed version. When a table's handle is present, the index + // build below reuses it and SKIPS the `reopen_for_mutation` open. Absent + // entries (other writers — schema apply, merge, ensure_indices, tests — + // pass `HashMap::new()`; inline-committed/delete tables are never staged) + // keep the byte-identical `reopen_for_mutation` path. 
+ mut committed_handles: std::collections::HashMap, ) -> Result> { if updates.is_empty() { return Ok(Vec::new()); } - let snapshot = db.snapshot_for_branch(branch).await?; + // With a `WriteTxn` the schema contract was validated once at capture, so + // reuse the pinned base entries (same per-branch manifest snapshot) instead + // of `snapshot_for_branch` (which re-runs `ensure_schema_state_valid`). Only + // the `entry(table_key).table_path` is read out of it here, identical to the + // no-txn path; the post-`commit_staged` index build below still reopens the + // dataset at its just-committed version. Without a txn, byte-identical. + let snapshot = match txn { + Some(txn) => txn.base.clone(), + None => db.snapshot_for_branch(branch).await?, + }; let mut prepared = Vec::with_capacity(updates.len()); for update in updates { @@ -1084,21 +1202,34 @@ async fn prepare_updates_for_commit( let mut prepared_update = update.clone(); if prepared_update.row_count > 0 { let full_path = format!("{}/{}", db.root_uri, entry.table_path); - // Strict version check is correct here: this runs INSIDE + // Reuse the post-`commit_staged` handle when the caller handed one + // out (collapse #4): it is already open at exactly + // `prepared_update.table_version`, so the defense-in-depth strict + // re-check `reopen_for_mutation` would run is trivially satisfied + // and the open is redundant. When no handle is present (other + // writers, or any non-staged table), fall back to the byte-identical + // `reopen_for_mutation` path. + // + // Strict version check is correct on the fallback: this runs INSIDE // the publisher commit path, after `commit_staged` already // advanced Lance HEAD to `prepared_update.table_version`. // The check is a defense-in-depth assertion that the // dataset state matches what we just committed; not the // pre-stage race the op-kind policy targets. 
- let mut ds = reopen_for_mutation( - db, - &prepared_update.table_key, - &full_path, - prepared_update.table_branch.as_deref(), - prepared_update.table_version, - crate::db::MutationOpKind::SchemaRewrite, - ) - .await?; + let mut ds = match committed_handles.remove(&prepared_update.table_key) { + Some(ds) => ds, + None => { + reopen_for_mutation( + db, + &prepared_update.table_key, + &full_path, + prepared_update.table_branch.as_deref(), + prepared_update.table_version, + crate::db::MutationOpKind::SchemaRewrite, + ) + .await? + } + }; // Any column not yet buildable (e.g. a vector column whose rows // have null embeddings) is deferred and logged inside // build_indices; a later ensure_indices/optimize materializes it. @@ -1237,37 +1368,27 @@ pub(super) async fn commit_updates( .await .current_branch() .map(str::to_string); - let prepared = prepare_updates_for_commit(db, current_branch.as_deref(), updates).await?; + let prepared = prepare_updates_for_commit( + db, + current_branch.as_deref(), + updates, + None, + std::collections::HashMap::new(), + ) + .await?; commit_prepared_updates(db, &prepared, None).await } -pub(super) async fn commit_manifest_updates( +pub(super) async fn commit_merge_with_actor( db: &Omnigraph, updates: &[crate::db::SubTableUpdate], -) -> Result { - db.coordinator - .write() - .await - .commit_manifest_updates(updates) - .await -} - -pub(super) async fn record_merge_commit( - db: &Omnigraph, - manifest_version: u64, - parent_commit_id: &str, merged_parent_commit_id: &str, actor_id: Option<&str>, ) -> Result { db.coordinator .write() .await - .record_merge_commit( - manifest_version, - parent_commit_id, - merged_parent_commit_id, - actor_id, - ) + .commit_merge_with_actor(updates, merged_parent_commit_id, actor_id) .await .map(|snapshot_id| snapshot_id.as_str().to_string()) } @@ -1281,9 +1402,12 @@ pub(super) async fn commit_updates_on_branch_with_expected( updates: &[crate::db::SubTableUpdate], expected_table_versions: 
&std::collections::HashMap, actor_id: Option<&str>, + txn: Option<&crate::db::WriteTxn>, + committed_handles: std::collections::HashMap, ) -> Result { db.ensure_schema_apply_not_locked("write commit").await?; - let prepared = prepare_updates_for_commit(db, branch, updates).await?; + let prepared = + prepare_updates_for_commit(db, branch, updates, txn, committed_handles).await?; commit_prepared_updates_on_branch_with_expected( db, branch, diff --git a/crates/omnigraph/src/db/recovery_audit.rs b/crates/omnigraph/src/db/recovery_audit.rs index 05d84b8..3444773 100644 --- a/crates/omnigraph/src/db/recovery_audit.rs +++ b/crates/omnigraph/src/db/recovery_audit.rs @@ -14,15 +14,14 @@ //! this change additive. //! //! Atomicity caveat: append to `_graph_commit_recoveries.lance` is -//! sequential w.r.t. the `CommitGraph::append_commit` write. A crash -//! between the two leaves an orphan commit-graph row with no audit row. -//! Same shape as the existing `_graph_commits` + `_graph_commit_actors` -//! split; the recovery sweep tolerates it the same way (re-entry sees -//! `NoMovement` for already-restored / already-published tables; the -//! audit append is retried). +//! sequential w.r.t. the recovery commit, which RFC-013 Phase 7 records in +//! `__manifest` (folded into the recovery publish CAS via `publish_recovery_commit`). +//! A crash between the publish and this audit append leaves a recovery commit +//! with no audit row. The recovery sweep tolerates it the same way (re-entry +//! sees `NoMovement` for already-restored / already-published tables; the audit +//! append is retried, minting a fresh recovery commit). 
use std::sync::Arc; -use std::time::{SystemTime, UNIX_EPOCH}; use arrow_array::{ Array, RecordBatch, RecordBatchIterator, StringArray, TimestampMicrosecondArray, @@ -195,7 +194,11 @@ async fn create_recoveries_dataset(root_uri: &str) -> Result { }; match Dataset::write(reader, &uri as &str, Some(params)).await { Ok(dataset) => Ok(dataset), - Err(err) if err.to_string().contains("Dataset already exists") => Dataset::open(&uri) + // Create-or-open idempotency — match the typed `DatasetAlreadyExists` + // variant, not the display string (not a Lance API contract). Same + // discipline as `commit_graph.rs`'s create-or-open; pinned by + // `lance_surface_guards.rs::lance_error_dataset_already_exists_variant_exists`. + Err(lance::Error::DatasetAlreadyExists { .. }) => Dataset::open(&uri) .await .map_err(|open_err| OmniError::Lance(open_err.to_string())), Err(err) => Err(OmniError::Lance(err.to_string())), @@ -276,13 +279,6 @@ fn decode_row(batch: &RecordBatch, row: usize) -> Result { }) } -pub(crate) fn now_micros() -> Result { - SystemTime::now() - .duration_since(UNIX_EPOCH) - .map(|d| d.as_micros() as i64) - .map_err(|e| OmniError::manifest_internal(format!("system clock before unix epoch: {}", e))) -} - #[cfg(test)] mod tests { use super::*; diff --git a/crates/omnigraph/src/exec/merge.rs b/crates/omnigraph/src/exec/merge.rs index 600fdf1..c846894 100644 --- a/crates/omnigraph/src/exec/merge.rs +++ b/crates/omnigraph/src/exec/merge.rs @@ -1068,10 +1068,13 @@ async fn publish_rewritten_merge_table( // source onto target). The inline `delete_where` later in this // function operates on rows the rewrite chose to remove, not // user-facing predicates, so Merge is the correct policy here. - let (ds, full_path, table_branch) = target_db + // `open_for_mutation` is the no-txn entry, so collapse #1's non-strict + // open-skip (gated on `txn.is_some()`) never fires here — the handle is + // always `Some`. 
+ let (mut current_ds, full_path, table_branch) = target_db .open_for_mutation(table_key, crate::db::MutationOpKind::Merge) - .await?; - let mut current_ds = ds; + .await? + .require_handle("branch merge"); // Phase 1: merge_insert changed/new rows (preserves _row_created_at_version for // existing rows, bumps _row_last_updated_at_version only for actually-changed rows). @@ -1125,7 +1128,7 @@ async fn publish_rewritten_merge_table( // rows are on Lance HEAD but the delete has not committed and the // achieved-version intent has not been recorded, so recovery must roll BACK. // See tests/failpoints.rs::branch_merge_rewrite_partial_after_merge_rolls_back. - crate::failpoints::maybe_fail("branch_merge.rewrite_after_merge_pre_delete")?; + crate::failpoints::maybe_fail(crate::failpoints::names::BRANCH_MERGE_REWRITE_AFTER_MERGE_PRE_DELETE)?; // Phase 2: delete removed rows via deletion vectors. // @@ -1156,7 +1159,7 @@ async fn publish_rewritten_merge_table( // recorded, so recovery must roll BACK (the index is reconciler-owned derived // state, but the merge itself never reached its commit boundary). See // tests/failpoints.rs::branch_merge_rewrite_partial_after_delete_rolls_back. - crate::failpoints::maybe_fail("branch_merge.rewrite_after_delete_pre_index")?; + crate::failpoints::maybe_fail(crate::failpoints::names::BRANCH_MERGE_REWRITE_AFTER_DELETE_PRE_INDEX)?; // Phase 3: rebuild indices. // @@ -1237,10 +1240,13 @@ async fn publish_adopted_delta( table_key: &str, delta: &AdoptDelta, ) -> Result { - let (ds, full_path, table_branch) = target_db + // `open_for_mutation` is the no-txn entry, so collapse #1's non-strict + // open-skip (gated on `txn.is_some()`) never fires here — the handle is + // always `Some`. + let (mut current_ds, full_path, table_branch) = target_db .open_for_mutation(table_key, crate::db::MutationOpKind::Merge) - .await?; - let mut current_ds = ds; + .await? + .require_handle("branch merge"); // Phase 1a: append the NEW rows. 
`stage_append_stream` is a streaming // `Operation::Append` — no hash join — so it never buffers the delta and @@ -1270,7 +1276,7 @@ async fn publish_adopted_delta( // have not committed and the achieved-version intent has not been recorded, so // recovery must roll BACK (not publish the appends-only state). See // tests/failpoints.rs::branch_merge_adopt_partial_after_append_rolls_back. - crate::failpoints::maybe_fail("branch_merge.adopt_after_append_pre_upsert")?; + crate::failpoints::maybe_fail(crate::failpoints::names::BRANCH_MERGE_ADOPT_AFTER_APPEND_PRE_UPSERT)?; // Phase 1b: upsert the CHANGED rows. The merge_insert hash join is now // bounded to the genuinely-changed set, not the whole delta. It runs against @@ -1302,7 +1308,7 @@ async fn publish_adopted_delta( // has not committed and the achieved-version intent has not been recorded, so // recovery must roll BACK. See // tests/failpoints.rs::branch_merge_adopt_partial_after_upsert_rolls_back. - crate::failpoints::maybe_fail("branch_merge.adopt_after_upsert_pre_delete")?; + crate::failpoints::maybe_fail(crate::failpoints::names::BRANCH_MERGE_ADOPT_AFTER_UPSERT_PRE_DELETE)?; // Phase 2: delete removed rows via deletion vectors (inline-commit residual, // same as the three-way path until Lance ships a public two-phase delete). @@ -1787,17 +1793,22 @@ impl Omnigraph { // (publish_*) AND the sidecar is confirmed, but the manifest publish // below hasn't run — so recovery rolls FORWARD. Used by // `tests/failpoints.rs::branch_merge_phase_b_failure_recovered_on_next_open`. - crate::failpoints::maybe_fail("branch_merge.post_phase_b_pre_manifest_commit")?; + crate::failpoints::maybe_fail(crate::failpoints::names::BRANCH_MERGE_POST_PHASE_B_PRE_MANIFEST_COMMIT)?; - let manifest_version = if updates.is_empty() { - self.version().await - } else { - self.commit_manifest_updates(&updates).await? 
- }; + // Publish the merged table versions AND the merge commit in one manifest + // CAS (RFC-013 Phase 7): `graph_commit` + `graph_head` rows ride the same + // merge-insert as the table-version rows. The merge commit's first parent + // is resolved by the publisher as the live target-branch head (the + // post-merge correct parent even if the target advanced); its merged-in + // parent is the source head. `target_head_commit_id` is no longer passed + // — it was the pre-merge target head, which the publisher reads live. + let _ = target_head_commit_id; + self.commit_merge_with_actor(&updates, source_head_commit_id, actor_id) + .await?; - // Recovery sidecar lifecycle: delete after manifest publish. - // Best-effort cleanup; the merge already landed durably so - // failing the user here is undesirable. + // Recovery sidecar lifecycle: delete after the manifest publish (Phase C). + // Best-effort cleanup; the merge already landed durably so failing the + // user here is undesirable. if let Some((_, handle)) = recovery { if let Err(err) = crate::db::manifest::delete_sidecar(&handle, self.storage_adapter()).await @@ -1809,13 +1820,6 @@ impl Omnigraph { ); } } - self.record_merge_commit( - manifest_version, - target_head_commit_id, - source_head_commit_id, - actor_id, - ) - .await?; if changed_edge_tables { self.invalidate_graph_index().await; diff --git a/crates/omnigraph/src/exec/mutation.rs b/crates/omnigraph/src/exec/mutation.rs index fbd0751..fe63a0c 100644 --- a/crates/omnigraph/src/exec/mutation.rs +++ b/crates/omnigraph/src/exec/mutation.rs @@ -601,13 +601,51 @@ use super::staging::{MutationStaging, PendingMode}; /// away once Lance exposes a two-phase delete API /// ([lance-format/lance#6658](https://github.com/lance-format/lance/issues/6658)) /// and we can stage deletes on the same path as inserts/updates. 
+impl Omnigraph { + /// Resolve a LIVE-HEAD read handle for an edge table's committed-state `@card` + /// scan when collapse #1 skipped the accumulation open. The edge-insert path no + /// longer opens the edge dataset (non-strict op + txn), but cardinality is + /// validated ONCE (never rechecked at commit), so the scan must observe the + /// freshest committed edges — NOT the pinned `txn.base`. A concurrent writer can + /// commit edges to this table after `txn` capture; counting against the stale + /// base undercounts and lets a violating insert through (invariant 9). The table + /// LOCATION is read from the pinned entry (stable across versions); the dataset is + /// opened at live HEAD via `open_dataset_head_for_write` (a read here despite the + /// name — no lock/stage), restoring the pre-3b image (the mutation's own open). + /// The residual validate→commit race (a writer committing between this scan and + /// the end-of-query commit) is the §7.1 gap, closed by RFC-013 step 4. + async fn edge_cardinality_read_handle( + &self, + txn: Option<&crate::db::WriteTxn>, + table_key: &str, + ) -> Result { + let branch = txn.and_then(|t| t.branch.as_deref()); + match txn.and_then(|t| t.base.entry(table_key)) { + Some(entry) => { + let full_path = self.storage().dataset_uri(&entry.table_path); + self.storage() + .open_dataset_head_for_write(table_key, &full_path, branch) + .await + } + // Unreachable today (the `None` handle only reaches here under a txn whose + // base contains the table). Defensive: resolve the table fresh (live) + // without the schema re-validation `snapshot_for_branch` would re-run. 
+ None => { + let snapshot = self.fresh_snapshot_for_branch_unchecked(branch).await?; + self.storage().open_snapshot_at_table(&snapshot, table_key).await + } + } + } +} + async fn open_table_for_mutation( db: &Omnigraph, staging: &mut MutationStaging, branch: Option<&str>, table_key: &str, op_kind: crate::db::MutationOpKind, -) -> Result<(SnapshotHandle, String, Option)> { + txn: Option<&crate::db::WriteTxn>, +) -> Result<(Option, String, Option)> { if let Some(prior) = staging.inline_committed.get(table_key) { let path = staging.paths.get(table_key).ok_or_else(|| { OmniError::manifest_internal(format!( @@ -615,6 +653,10 @@ async fn open_table_for_mutation( table_key )) })?; + // The inline-committed reopen does NOT validate the schema contract + // (it reopens at the post-inline-commit Lance version directly), so it + // takes no `txn` — threading it here would change nothing. Deletes are + // strict ops, so this always opens (returns `Some`). let ds = db .reopen_for_mutation( table_key, @@ -624,20 +666,32 @@ async fn open_table_for_mutation( op_kind, ) .await?; - return Ok((ds, path.full_path.clone(), path.table_branch.clone())); + return Ok((Some(ds), path.full_path.clone(), path.table_branch.clone())); } - let (ds, full_path, table_branch) = db - .open_for_mutation_on_branch(branch, table_key, op_kind) + // `open_for_mutation_on_branch` returns the expected version even when it + // skips the open (collapse #1, the non-strict insert/merge path): the version + // is the pinned base's, identical to the opened handle's `.version()`. Use it + // directly for `ensure_path` so the no-open path still captures the publisher + // CAS fence. + let opened = db + .open_for_mutation_on_branch(branch, table_key, op_kind, txn) .await?; - let expected_version = ds.version(); + // Pin the open-skip contract (collapse #1): a missing handle is legal ONLY on + // the non-strict `txn` path. A future change that returns `None` elsewhere + // (e.g. 
a new strict arm) trips this in debug builds rather than silently + // handing a `None` to a `require_handle` consumer. + debug_assert!( + opened.handle.is_some() || (txn.is_some() && !op_kind.strict_pre_stage_version_check()), + "open_for_mutation_on_branch returned no handle outside the non-strict txn open-skip path", + ); staging.ensure_path( table_key, - full_path.clone(), - table_branch.clone(), - expected_version, + opened.full_path.clone(), + opened.table_branch.clone(), + opened.expected_version, op_kind, ); - Ok((ds, full_path, table_branch)) + Ok((opened.handle, opened.full_path, opened.table_branch)) } /// D₂ parse-time check: a single mutation query is either insert/update-only @@ -720,14 +774,14 @@ impl Omnigraph { params: &ParamMap, actor_id: Option<&str>, ) -> Result { - self.ensure_schema_state_valid().await?; // Converge any pending recovery sidecar (a previously failed // writer's Phase B → Phase C residual) before executing: the // inline delete path advances Lance HEAD during execution and // the staged path's commit-time drift guard refuses // sidecar-covered drift, so a long-lived handle must heal here // — not at restart. One `list_dir` when no sidecars exist (the - // steady state). + // steady state). MUST run before `open_write_txn` below — the heal + // may advance the manifest, so the pinned base must be captured after. self.heal_pending_recovery_sidecars().await?; let requested = Self::normalize_branch_name(branch)?; // Reject internal `__run__*` / system-prefixed branches at the @@ -737,6 +791,16 @@ impl Omnigraph { if let Some(name) = requested.as_deref() { crate::db::ensure_public_branch_ref(name, "mutate")?; } + // Capture-once write transaction (RFC-013 step 3b). `open_write_txn` + // validates the schema contract ONCE (it resolves the branch target, + // whose first line is `ensure_schema_state_valid`) and pins the base + // snapshot for this write. 
Threaded as `Some(&txn)` through execution, + // staging commit, and the manifest publish so the per-table opens and + // the commit-time OCC re-read reuse the pinned base instead of + // re-validating the contract at every resolve point. Captured AFTER the + // recovery heal (which may advance the manifest) and AFTER `requested` + // is known so it pins the post-heal snapshot for the correct branch. + let txn = self.open_write_txn(requested.as_deref()).await?; let resolved_params = enrich_mutation_params(params)?; // Per-query staging accumulator. Inserts and updates push batches @@ -785,7 +849,13 @@ impl Omnigraph { }; let exec_result = self - .execute_named_mutation(&ir, &resolved_params, requested.as_deref(), &mut staging) + .execute_named_mutation( + &ir, + &resolved_params, + requested.as_deref(), + &mut staging, + Some(&txn), + ) .await; match exec_result { @@ -799,13 +869,20 @@ impl Omnigraph { // interleave between our commit_staged and our publish // (which would correctly fail our CAS but leave Lance // HEAD advanced — the residual class MR-870 recovers). - let (updates, expected_versions, sidecar_handle, _queue_guards) = staged + let super::staging::CommittedMutation { + updates, + expected_versions, + sidecar_handle, + guards: _queue_guards, + committed_handles, + } = staged .commit_all( self, requested.as_deref(), crate::db::manifest::SidecarKind::Mutation, actor_id, fork_queue_guards, + Some(&txn), ) .await?; // Failpoint that wedges the documented finalize→publisher @@ -818,12 +895,14 @@ impl Omnigraph { // across this failure so the next `Omnigraph::open`'s // recovery sweep can roll forward — see // `tests/failpoints.rs::recovery_rolls_forward_after_finalize_publisher_failure`. 
- crate::failpoints::maybe_fail("mutation.post_finalize_pre_publisher")?; + crate::failpoints::maybe_fail(crate::failpoints::names::MUTATION_POST_FINALIZE_PRE_PUBLISHER)?; self.commit_updates_on_branch_with_expected( requested.as_deref(), &updates, &expected_versions, actor_id, + Some(&txn), + committed_handles, ) .await?; // Phase C succeeded — sidecar can be deleted. If this @@ -938,6 +1017,7 @@ impl Omnigraph { params: &ParamMap, branch: Option<&str>, staging: &mut MutationStaging, + txn: Option<&crate::db::WriteTxn>, ) -> Result { let mut total = MutationResult::default(); for op in &ir.ops { @@ -946,7 +1026,7 @@ impl Omnigraph { type_name, assignments, } => { - self.execute_insert(type_name, assignments, params, branch, staging) + self.execute_insert(type_name, assignments, params, branch, staging, txn) .await? } MutationOpIR::Update { @@ -954,14 +1034,16 @@ impl Omnigraph { assignments, predicate, } => { - self.execute_update(type_name, assignments, predicate, params, branch, staging) - .await? + self.execute_update( + type_name, assignments, predicate, params, branch, staging, txn, + ) + .await? } MutationOpIR::Delete { type_name, predicate, } => { - self.execute_delete(type_name, predicate, params, branch, staging) + self.execute_delete(type_name, predicate, params, branch, staging, txn) .await? } }; @@ -978,6 +1060,7 @@ impl Omnigraph { params: &ParamMap, branch: Option<&str>, staging: &mut MutationStaging, + txn: Option<&crate::db::WriteTxn>, ) -> Result { let mut resolved: HashMap = HashMap::new(); for a in assignments { @@ -1025,8 +1108,12 @@ impl Omnigraph { } else { crate::db::MutationOpKind::Insert }; + // Node inserts are non-strict (Insert/Merge), so with a `WriteTxn` + // this opens NOTHING (collapse #1) — the handle is discarded anyway; + // only `ensure_path`'s captured version (read inside + // `open_table_for_mutation`) is used downstream. 
let (_ds, _full_path, _table_branch) = - open_table_for_mutation(self, staging, branch, &table_key, insert_kind).await?; + open_table_for_mutation(self, staging, branch, &table_key, insert_kind, txn).await?; // Accumulate. @key inserts go into the Merge stream (so a // later update on the same id coalesces correctly); no-key // inserts go into the Append stream. @@ -1059,13 +1146,16 @@ impl Omnigraph { )?; } let table_key = format!("edge:{}", type_name); - // Capture pre-write metadata on first touch (no Lance write). - let (ds, _full_path, _table_branch) = open_table_for_mutation( + // Capture pre-write metadata on first touch. Edge inserts are + // non-strict, so with a `WriteTxn` this opens NOTHING (collapse #1) + // and returns `None`. + let (handle, _full_path, _table_branch) = open_table_for_mutation( self, staging, branch, &table_key, crate::db::MutationOpKind::Insert, + txn, ) .await?; // Accumulate the new edge row. Edge IDs are ULID-generated so @@ -1075,9 +1165,27 @@ impl Omnigraph { // Edge cardinality validation: scan committed edges via Lance // + iterate pending edges in-memory for the `src` column, // group-by-src. The pending side already includes the row - // we just appended (above). - validate_edge_cardinality_with_pending(self, &ds, staging, &table_key, edge_type) + // we just appended (above). When the open was skipped (collapse + // #1), resolve a read handle for the committed scan at LIVE HEAD + // (`edge_cardinality_read_handle`, #298) — NOT the pinned txn.base, + // which would undercount edges a concurrent writer committed since + // capture. Only when cardinality is non-default, so the common + // default-cardinality edge keeps the open-free path. (The residual + // validate→commit race is the §7.1 gap — step 4.) 
+ if !edge_type.cardinality.is_default() { + let committed_ds = match handle { + Some(h) => h, + None => self.edge_cardinality_read_handle(txn, &table_key).await?, + }; + validate_edge_cardinality_with_pending( + self, + &committed_ds, + staging, + &table_key, + edge_type, + ) .await?; + } self.invalidate_graph_index().await; @@ -1098,6 +1206,7 @@ impl Omnigraph { params: &ParamMap, branch: Option<&str>, staging: &mut MutationStaging, + txn: Option<&crate::db::WriteTxn>, ) -> Result { // Defense in depth: ensure this is a node type if !self.catalog().node_types.contains_key(type_name) { @@ -1122,14 +1231,18 @@ impl Omnigraph { let blob_props = self.catalog().node_types[type_name].blob_properties.clone(); let table_key = format!("node:{}", type_name); - let (ds, _full_path, _table_branch) = open_table_for_mutation( + let (handle, _full_path, _table_branch) = open_table_for_mutation( self, staging, branch, &table_key, crate::db::MutationOpKind::Update, + txn, ) .await?; + // Update is a STRICT op, so collapse #1 never skips its open — the + // handle is always `Some` (and it's needed for the committed scan below). 
+ let ds = handle.expect("strict Update op always opens its dataset"); // Scan committed via Lance + apply the same predicate to pending // batches via DataFusion `MemTable` (read-your-writes for prior @@ -1228,13 +1341,14 @@ impl Omnigraph { params: &ParamMap, branch: Option<&str>, staging: &mut MutationStaging, + txn: Option<&crate::db::WriteTxn>, ) -> Result { let is_node = self.catalog().node_types.contains_key(type_name); if is_node { - self.execute_delete_node(type_name, predicate, params, branch, staging) + self.execute_delete_node(type_name, predicate, params, branch, staging, txn) .await } else { - self.execute_delete_edge(type_name, predicate, params, branch, staging) + self.execute_delete_edge(type_name, predicate, params, branch, staging, txn) .await } } @@ -1246,18 +1360,22 @@ impl Omnigraph { params: &ParamMap, branch: Option<&str>, staging: &mut MutationStaging, + txn: Option<&crate::db::WriteTxn>, ) -> Result { let pred_sql = predicate_to_sql(predicate, params, false)?; let table_key = format!("node:{}", type_name); - let (ds, full_path, table_branch) = open_table_for_mutation( + let (handle, full_path, table_branch) = open_table_for_mutation( self, staging, branch, &table_key, crate::db::MutationOpKind::Delete, + txn, ) .await?; + // Delete is a STRICT op, so collapse #1 never skips its open. + let ds = handle.expect("strict Delete op always opens its dataset"); let initial_version = ds.version(); // Scan matching IDs for cascade. 
Per D₂ this never overlaps with @@ -1305,7 +1423,7 @@ impl Omnigraph { crate::db::MutationOpKind::Delete, ) .await?; - crate::failpoints::maybe_fail("mutation.delete_node_pre_primary_delete")?; + crate::failpoints::maybe_fail(crate::failpoints::names::MUTATION_DELETE_NODE_PRE_PRIMARY_DELETE)?; let (_new_ds, delete_state) = self .storage_inline_residual() .delete_where(&full_path, ds, &pred_sql) @@ -1347,14 +1465,17 @@ impl Omnigraph { let edge_table_key = format!("edge:{}", edge_name); let cascade_filter = cascade_filters.join(" OR "); - let (edge_ds, edge_full_path, edge_table_branch) = open_table_for_mutation( + let (edge_handle, edge_full_path, edge_table_branch) = open_table_for_mutation( self, staging, branch, &edge_table_key, crate::db::MutationOpKind::Delete, + txn, ) .await?; + // Delete is a STRICT op, so collapse #1 never skips its open. + let edge_ds = edge_handle.expect("strict Delete op always opens its dataset"); let (_new_edge_ds, edge_delete) = self .storage_inline_residual() @@ -1391,18 +1512,22 @@ impl Omnigraph { params: &ParamMap, branch: Option<&str>, staging: &mut MutationStaging, + txn: Option<&crate::db::WriteTxn>, ) -> Result { let pred_sql = predicate_to_sql(predicate, params, true)?; let table_key = format!("edge:{}", type_name); - let (ds, full_path, table_branch) = open_table_for_mutation( + let (handle, full_path, table_branch) = open_table_for_mutation( self, staging, branch, &table_key, crate::db::MutationOpKind::Delete, + txn, ) .await?; + // Delete is a STRICT op, so collapse #1 never skips its open. 
+ let ds = handle.expect("strict Delete op always opens its dataset"); let (_new_ds, delete_state) = self .storage_inline_residual() diff --git a/crates/omnigraph/src/exec/staging.rs b/crates/omnigraph/src/exec/staging.rs index 31d5ce8..7760c95 100644 --- a/crates/omnigraph/src/exec/staging.rs +++ b/crates/omnigraph/src/exec/staging.rs @@ -440,6 +440,26 @@ struct StagedTableEntry { staged_write: StagedHandle, } +/// Output of [`StagedMutation::commit_all`] (Phase B): the publisher's input plus +/// the queue guards the caller must hold across the manifest publish. +pub(crate) struct CommittedMutation { + /// Per-table updates to publish to the manifest. + pub(crate) updates: Vec, + /// Per-table manifest pins refreshed under the write queue — the publisher's CAS fence. + pub(crate) expected_versions: HashMap, + /// Recovery sidecar to delete after Phase C succeeds (`None` when nothing staged). + pub(crate) sidecar_handle: Option, + /// Per-`(table, branch)` write-queue guards — the caller MUST hold these across + /// the manifest publish (see `commit_all`) so no writer interleaves between + /// `commit_staged` and the publish. + pub(crate) guards: Vec>, + /// Post-`commit_staged` handle per STAGED table (table_key → handle at the + /// just-committed version). Carried out (RFC-013 step 3b, collapse #4) so the + /// publish-prepare index build reuses it instead of a fresh `reopen_for_mutation` + /// at the same version. Inline-committed / delete tables are absent (no staged handle). 
+ pub(crate) committed_handles: HashMap, +} + impl StagedMutation { /// **Phase B** of the two-phase commit: acquire per-`(table_key, /// branch)` queues, revalidate manifest pins, write the recovery @@ -485,12 +505,8 @@ impl StagedMutation { Vec<(String, Option)>, Vec>, )>, - ) -> Result<( - Vec, - HashMap, - Option, - Vec>, - )> { + txn: Option<&crate::db::WriteTxn>, + ) -> Result { let StagedMutation { inline_committed, mut staged, @@ -585,7 +601,18 @@ impl StagedMutation { // Multi-coordinator deployments (§VI.27 aspirational) get // genuine cross-process drift detection from this read for // free. - let snapshot = db.fresh_snapshot_for_branch(branch).await?; + // + // This MUST be a FRESH per-branch manifest read (never the warm + // cache) for the OCC re-capture below — but with a `WriteTxn` the + // schema contract was already validated at capture, so use the + // `_unchecked` variant, which drops the redundant + // `ensure_schema_state_valid` AND the commit-graph load the OCC read + // never consults (a fresh manifest read yields the same `Snapshot`). + // Without a txn this is byte-identical to the prior checked call. + let snapshot = match txn { + Some(_) => db.fresh_snapshot_for_branch_unchecked(branch).await?, + None => db.fresh_snapshot_for_branch(branch).await?, + }; for entry in staged.iter_mut() { let current = snapshot .entry(&entry.table_key) @@ -619,15 +646,20 @@ impl StagedMutation { // live Lance HEAD still equals that manifest pin. If an external // raw Lance write or a pre-fix maintenance path moved HEAD without // publishing `__manifest`, this write must not silently fold it. - let head = db - .storage() - .open_dataset_head_for_write( - &entry.table_key, - &entry.path.full_path, - entry.path.table_branch.as_deref(), - ) - .await? 
- .version(); + // + // `latest_version_id` reads the latest manifest pointer off the + // already-open staged handle (the #2 staging open) WITHOUT a fresh + // `Dataset::open` — the same cheap live-HEAD probe + // `ManifestCoordinator::probe_latest_version` uses. This replaces a + // redundant `open_dataset_head_for_write` (RFC-013 step 3b, collapse + // #3): the drift comparison below is byte-identical; only how `head` + // is obtained changes (probe vs cold open). + let head = entry + .dataset + .dataset() + .latest_version_id() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; if head < current { return Err(OmniError::manifest_internal(format!( "table '{}' Lance HEAD version {} is behind manifest version {}", @@ -786,6 +818,12 @@ impl StagedMutation { let mut updates: Vec = inline_committed.into_values().collect(); + // Carry each staged table's post-`commit_staged` handle out so the + // publish-prepare index build reuses it (collapse #4) instead of + // re-opening the dataset at the same just-committed version. 
+ let mut committed_handles: HashMap = + HashMap::with_capacity(staged.len()); + for entry in staged { let StagedTableEntry { table_key, @@ -798,15 +836,22 @@ impl StagedMutation { let new_ds = db.storage().commit_staged(dataset, staged_write).await?; let state = db.storage().table_state(&path.full_path, &new_ds).await?; updates.push(SubTableUpdate { - table_key, + table_key: table_key.clone(), table_version: state.version, table_branch: path.table_branch.clone(), row_count: state.row_count, version_metadata: state.version_metadata, }); + committed_handles.insert(table_key, new_ds); } - Ok((updates, expected_versions, sidecar_handle, guards)) + Ok(CommittedMutation { + updates, + expected_versions, + sidecar_handle, + guards, + committed_handles, + }) } } diff --git a/crates/omnigraph/src/failpoints.rs b/crates/omnigraph/src/failpoints.rs index 461b73e..a353345 100644 --- a/crates/omnigraph/src/failpoints.rs +++ b/crates/omnigraph/src/failpoints.rs @@ -14,6 +14,115 @@ pub(crate) fn maybe_fail(_name: &str) -> Result<()> { Ok(()) } +/// Failpoint that injects a *Lance* error rather than an `OmniError`. Used to +/// stand in for a `Dataset::open` failing with a transient/corrupt (non-not-found) +/// error, so a test can drive the caller's lance-error classification — the +/// behavior FIX A (`read_legacy_commit_cache`) relies on: a not-found is benign +/// (empty), anything else propagates. A no-op without the `failpoints` feature +/// (the injected variant is therefore unreachable in release builds). +#[allow(unused_variables)] +pub(crate) fn maybe_fail_lance_open(name: &str) -> std::result::Result<(), lance::Error> { + #[cfg(feature = "failpoints")] + { + fail::fail_point!(name, |_| { + Err(lance::Error::io(format!( + "injected failpoint triggered: {name}" + ))) + }); + } + Ok(()) +} + +/// Failpoint that injects a Lance `IncompatibleTransaction` — the variant a +/// concurrent `UpdateConfig` stamp race produces. 
Lets a test drive the v3→v4 +/// stamp loop's exhaustion path (`commit_v4_stamp_idempotently`) deterministically; +/// it is otherwise near-unreachable, since a real concurrent winner stamps the SAME +/// value, so the loop's re-read returns `Ok` on the first retry. A no-op without the +/// `failpoints` feature. +#[allow(unused_variables)] +pub(crate) fn maybe_fail_lance_incompatible(name: &str) -> std::result::Result<(), lance::Error> { + #[cfg(feature = "failpoints")] + { + fail::fail_point!(name, |_| { + Err(lance::Error::incompatible_transaction_source( + format!("injected failpoint triggered: {name}").into(), + )) + }); + } + Ok(()) +} + +/// Failpoint that injects a *retryable* `RowLevelCasContention` `OmniError` — the +/// typed conflict the manifest publisher's outer retry treats as retryable +/// (`is_retryable_publish_conflict`). Used to drive the publisher's +/// retry-on-`load_publish_state`-error path deterministically: the v3→v4 migration +/// surfaces this same type on exhaustion EXPECTING the publisher to re-run the +/// load, a path otherwise reachable only under sustained multi-writer contention. +/// A no-op without the `failpoints` feature. +#[allow(unused_variables)] +pub(crate) fn maybe_fail_retryable_contention(name: &str) -> Result<()> { + #[cfg(feature = "failpoints")] + { + fail::fail_point!(name, |_| { + return Err(crate::error::OmniError::manifest_row_level_cas_contention( + format!("injected retryable contention failpoint: {name}"), + )); + }); + } + Ok(()) +} + +/// Compile-checked catalog of every failpoint name in this crate. Call sites +/// (`maybe_fail`) and tests (`ScopedFailPoint` / the test rendezvous helper) +/// reference these constants instead of bare string literals, so a typo is a +/// compile error rather than a silently-never-firing failpoint. 
+pub mod names { + pub const BRANCH_CREATE_AFTER_MANIFEST_BRANCH_CREATE: &str = "branch_create.after_manifest_branch_create"; + pub const BRANCH_DELETE_BEFORE_COMMIT_GRAPH_RECLAIM: &str = "branch_delete.before_commit_graph_reclaim"; + pub const BRANCH_DELETE_BEFORE_TABLE_CLEANUP: &str = "branch_delete.before_table_cleanup"; + pub const BRANCH_MERGE_ADOPT_AFTER_APPEND_PRE_UPSERT: &str = "branch_merge.adopt_after_append_pre_upsert"; + pub const BRANCH_MERGE_ADOPT_AFTER_UPSERT_PRE_DELETE: &str = "branch_merge.adopt_after_upsert_pre_delete"; + pub const BRANCH_MERGE_POST_PHASE_B_PRE_MANIFEST_COMMIT: &str = "branch_merge.post_phase_b_pre_manifest_commit"; + pub const BRANCH_MERGE_REWRITE_AFTER_DELETE_PRE_INDEX: &str = "branch_merge.rewrite_after_delete_pre_index"; + pub const BRANCH_MERGE_REWRITE_AFTER_MERGE_PRE_DELETE: &str = "branch_merge.rewrite_after_merge_pre_delete"; + pub const CLASSIFY_FRESH_READ: &str = "classify.fresh_read"; + pub const CLEANUP_RECONCILE_FORK: &str = "cleanup.reconcile_fork"; + pub const CLEANUP_RESOLVE_BRANCH_SNAPSHOT: &str = "cleanup.resolve_branch_snapshot"; + pub const CLEANUP_TABLE_GC: &str = "cleanup.table_gc"; + pub const ENSURE_INDICES_POST_PHASE_B_PRE_MANIFEST_COMMIT: &str = "ensure_indices.post_phase_b_pre_manifest_commit"; + pub const ENSURE_INDICES_POST_STAGE_PRE_COMMIT_BTREE: &str = "ensure_indices.post_stage_pre_commit_btree"; + pub const FORK_BEFORE_CLASSIFY: &str = "fork.before_classify"; + pub const FORK_BEFORE_RECLAIM: &str = "fork.before_reclaim"; + pub const GRAPH_PUBLISH_AFTER_MANIFEST_COMMIT: &str = "graph_publish.after_manifest_commit"; + pub const GRAPH_PUBLISH_BEFORE_COMMIT_APPEND: &str = "graph_publish.before_commit_append"; + pub const INIT_AFTER_COORDINATOR_INIT: &str = "init.after_coordinator_init"; + pub const INIT_AFTER_SCHEMA_CONTRACT_WRITTEN: &str = "init.after_schema_contract_written"; + pub const INIT_AFTER_SCHEMA_PG_WRITTEN: &str = "init.after_schema_pg_written"; + pub const 
MUTATION_DELETE_NODE_PRE_PRIMARY_DELETE: &str = "mutation.delete_node_pre_primary_delete"; + pub const MUTATION_POST_FINALIZE_PRE_PUBLISHER: &str = "mutation.post_finalize_pre_publisher"; + pub const OPTIMIZE_BEFORE_COMPACT: &str = "optimize.before_compact"; + pub const OPTIMIZE_INJECT_REINDEX_CONFLICT: &str = "optimize.inject_reindex_conflict"; + pub const OPTIMIZE_POST_PHASE_B_PRE_MANIFEST_COMMIT: &str = "optimize.post_phase_b_pre_manifest_commit"; + pub const RECOVERY_BEFORE_ROLL_FORWARD_PUBLISH: &str = "recovery.before_roll_forward_publish"; + pub const RECOVERY_ORPHAN_DISCARD_AUDIT_APPEND: &str = "recovery.orphan_discard_audit_append"; + pub const RECOVERY_RECORD_AUDIT: &str = "recovery.record_audit"; + pub const RECOVERY_SIDECAR_CONFIRM: &str = "recovery.sidecar_confirm"; + pub const RECOVERY_SIDECAR_DELETE: &str = "recovery.sidecar_delete"; + pub const RECOVERY_SIDECAR_LIST: &str = "recovery.sidecar_list"; + pub const RECOVERY_SIDECAR_WRITE: &str = "recovery.sidecar_write"; + pub const SCHEMA_APPLY_AFTER_MANIFEST_COMMIT: &str = "schema_apply.after_manifest_commit"; + pub const SCHEMA_APPLY_AFTER_STAGING_WRITE: &str = "schema_apply.after_staging_write"; + pub const SCHEMA_APPLY_BEFORE_STAGING_WRITE: &str = "schema_apply.before_staging_write"; + // RFC-013 Phase 7 migration failpoints (this branch). + pub const MIGRATION_V3_TO_V4_LEGACY_OPEN: &str = "migration.v3_to_v4.legacy_open"; + pub const MIGRATION_V4_STAMP_FORCE_INCOMPATIBLE: &str = "migration.v4_stamp.force_incompatible"; + /// Injects a retryable `RowLevelCasContention` from `load_publish_state` so a + /// test can prove the publisher's outer retry re-runs the load (the migration + /// surfaces this same typed error on exhaustion). 
+ pub const PUBLISH_LOAD_STATE_RETRYABLE_CONTENTION: &str = + "publish.load_state_retryable_contention"; +} + #[cfg(feature = "failpoints")] pub struct ScopedFailPoint { name: String, @@ -27,6 +136,20 @@ impl ScopedFailPoint { name: name.to_string(), } } + + /// Register a callback failpoint with the same Drop-based cleanup as + /// `new`. Without the guard, a panic while the point is active would + /// leak the callback into the process-global registry and fire it under + /// later tests in the same binary. + pub fn with_callback(name: &str, callback: F) -> Self + where + F: Fn() + Send + Sync + 'static, + { + fail::cfg_callback(name, callback).expect("configure callback failpoint"); + Self { + name: name.to_string(), + } + } } #[cfg(feature = "failpoints")] diff --git a/crates/omnigraph/src/instrumentation.rs b/crates/omnigraph/src/instrumentation.rs index de5b7d3..9718686 100644 --- a/crates/omnigraph/src/instrumentation.rs +++ b/crates/omnigraph/src/instrumentation.rs @@ -43,6 +43,23 @@ pub struct QueryIoProbes { /// handle cache (Fix 3) serves them. pub table_wrapper: Option>, pub probe_count: Arc, + /// Counts DATA-table open CALLS through the two instrumented chokepoints + /// (`open_dataset_tracked` / `open_table_dataset`), classified by URI so the + /// internal/system tables (`__manifest`, `_graph_commits*`) are EXCLUDED — the + /// publisher CAS and commit-graph append open those every write, and counting + /// them would make the `data_open_count <= |touched_tables|` write gate + /// (RFC-013 step 3b) unreachable by threading alone. Unlike the opener-read + /// term (which mixes with the merge-insert/RI scan on the write path), this is + /// an exact open-invocation count. `forbidden_apis` keeps engine code OUTSIDE the + /// storage layer (`exec/`, `db/omnigraph/`, `loader/`, `changes/`) from opening + /// datasets except through these chokepoints, so the count is complete for the + /// keyed-write data path the gate measures. 
(`table_store.rs` is allow-listed and + /// does hold direct `Dataset::open`s — but only for branch-management ops + /// (`delete_branch`/`list_branches`/`force_delete_branch`), never that hot path.) + pub data_open_count: Arc, + /// Internal/system-table (`__manifest`, `_graph_commits*`) open CALLS — the + /// complement of `data_open_count`, kept for symmetry and debugging. + pub internal_open_count: Arc, } tokio::task_local! { @@ -80,6 +97,39 @@ pub(crate) fn record_probe() { let _ = current(|p| p.probe_count.fetch_add(1, Ordering::Relaxed)); } +/// Internal/system table directory names. An open of one of these is a metadata +/// open (publisher CAS, commit-graph append, recovery audit), NOT a data-table +/// open. Kept in sync with the dir constants in `db/manifest/layout.rs`, +/// `db/commit_graph.rs`, and `db/recovery_audit.rs`. +const INTERNAL_TABLE_DIRS: [&str; 4] = [ + "__manifest", + "_graph_commits.lance", + "_graph_commit_actors.lance", + "_graph_commit_recoveries.lance", +]; + +/// True when `uri`'s last path segment names an internal/system table. +fn open_is_internal(uri: &str) -> bool { + let trimmed = uri.trim_end_matches('/'); + let last = trimmed.rsplit('/').next().unwrap_or(trimmed); + INTERNAL_TABLE_DIRS.contains(&last) +} + +/// Record one table-open call against the active per-query probes, classified by +/// table class (the URI's last segment) so the write gate counts DATA-table opens +/// only and ignores the publisher/commit-graph metadata opens. No-op in production +/// (the classification runs only inside the probe closure, which `current` skips +/// when no probes are installed). Called at both open chokepoints. +pub(crate) fn record_open(uri: &str) { + let _ = current(|p| { + if open_is_internal(uri) { + p.internal_open_count.fetch_add(1, Ordering::Relaxed); + } else { + p.data_open_count.fetch_add(1, Ordering::Relaxed); + } + }); +} + /// Per-operation staged-write counts, installed for a task via /// [`with_merge_write_probes`]. 
Lets a cost-budget test assert WHICH staged-write /// primitive an operation invokes — e.g. that an append-only fast-forward merge @@ -177,6 +227,7 @@ pub(crate) async fn open_dataset_tracked( uri: &str, wrapper: Option>, ) -> Result { + record_open(uri); let result = match wrapper { None => Dataset::open(uri).await, Some(wrapper) => { @@ -203,6 +254,7 @@ pub(crate) async fn open_table_dataset( version: u64, session: Option<&Arc>, ) -> Result { + record_open(location); let mut builder = DatasetBuilder::from_uri(location).with_version(version); if let Some(session) = session { builder = builder.with_session(session.clone()); diff --git a/crates/omnigraph/src/loader/mod.rs b/crates/omnigraph/src/loader/mod.rs index 2365243..075724d 100644 --- a/crates/omnigraph/src/loader/mod.rs +++ b/crates/omnigraph/src/loader/mod.rs @@ -187,7 +187,10 @@ impl Omnigraph { &omnigraph_policy::ResourceScope::Branch(branch.to_string()), actor_id, )?; - self.ensure_schema_state_valid().await?; + // Schema-contract validation is captured ONCE per write via the + // `WriteTxn` opened in `load_jsonl_reader` (after branch resolution). + // The redundant `ensure_schema_state_valid` that used to run here is + // subsumed by `open_write_txn`'s `resolved_branch_target` call. // Converge any pending recovery sidecar (a previously failed // writer's Phase B → Phase C residual) before staging anything: // without this, sidecar-covered drift wedges every load on the @@ -397,7 +400,16 @@ async fn load_jsonl_reader( // inline path. let mut result = LoadResult::default(); - let snapshot = db.snapshot_for_branch(branch).await?; + // Capture-once write transaction (RFC-013 step 3b). `open_write_txn` + // validates the schema contract ONCE and pins the base snapshot. Threaded + // as `Some(&txn)` through the per-table opens and the manifest publish so + // each resolve point reuses the pinned base instead of re-validating the + // contract. 
The branch already exists here (fork-if-missing ran in + // `load_as` before this), so this captures the post-fork snapshot. The + // load's own base read (`db.snapshot_for_branch` previously) is the same + // per-branch snapshot, so reuse `txn.base` for it — dropping a validation. + let txn = db.open_write_txn(branch).await?; + let snapshot = txn.base.clone(); let mut staging = MutationStaging::default(); let pending_mode = match mode { LoadMode::Merge => PendingMode::Merge, @@ -481,15 +493,18 @@ async fn load_jsonl_reader( // Phase 2b: accumulate every node type in memory. Fragment writes are // delayed until after all validation succeeds. for (type_name, table_key, batch, loaded_count) in prepared_nodes { - let (ds, full_path, table_branch) = db - .open_for_mutation_on_branch(branch, &table_key, load_op_kind) + // The loader only needs the captured expected version (the publisher's + // CAS fence) for `ensure_path` — it discards the handle. With a + // non-strict load op (Merge/Append) and a `WriteTxn`, collapse #1 skips + // the dataset open and returns the pinned base version directly. + let opened = db + .open_for_mutation_on_branch(branch, &table_key, load_op_kind, Some(&txn)) .await?; - let expected_version = ds.version(); staging.ensure_path( &table_key, - full_path, - table_branch, - expected_version, + opened.full_path, + opened.table_branch, + opened.expected_version, load_op_kind, ); let schema = batch.schema(); @@ -553,15 +568,16 @@ async fn load_jsonl_reader( // Phase 2e: accumulate every edge type. Same dispatch as Phase 2b. for (edge_name, table_key, batch, loaded_count) in prepared_edges { - let (ds, full_path, table_branch) = db - .open_for_mutation_on_branch(branch, &table_key, load_op_kind) + // Same as the node phase: only the captured expected version is used; + // collapse #1 skips the open for a non-strict load op under a `WriteTxn`. 
+ let opened = db + .open_for_mutation_on_branch(branch, &table_key, load_op_kind, Some(&txn)) .await?; - let expected_version = ds.version(); staging.ensure_path( &table_key, - full_path, - table_branch, - expected_version, + opened.full_path, + opened.table_branch, + opened.expected_version, load_op_kind, ); let schema = batch.schema(); @@ -589,22 +605,36 @@ async fn load_jsonl_reader( // `_queue_guards` holds per-(table_key, branch) write queues // across the manifest publish below — see exec/mutation.rs for // the rationale (interleaving prevention). - let (updates, expected_versions, sidecar_handle, _queue_guards) = staged + let crate::exec::staging::CommittedMutation { + updates, + expected_versions, + sidecar_handle, + guards: _queue_guards, + committed_handles, + } = staged .commit_all( db, branch, crate::db::manifest::SidecarKind::Load, actor_id, fork_queue_guards, + Some(&txn), ) .await?; // Same finalize → publisher residual as mutations: per-table // staged commits have advanced Lance HEAD, but the manifest // publish has not run yet. Reuse the mutation failpoint name so // one failpoint pins the shared `MutationStaging` boundary. - crate::failpoints::maybe_fail("mutation.post_finalize_pre_publisher")?; - db.commit_updates_on_branch_with_expected(branch, &updates, &expected_versions, actor_id) - .await?; + crate::failpoints::maybe_fail(crate::failpoints::names::MUTATION_POST_FINALIZE_PRE_PUBLISHER)?; + db.commit_updates_on_branch_with_expected( + branch, + &updates, + &expected_versions, + actor_id, + Some(&txn), + committed_handles, + ) + .await?; // The recovery sidecar protects the per-table commit_staged → // manifest publish window. 
Phase C succeeded — clean up // best-effort: failing the user here would error out a write @@ -1548,80 +1578,14 @@ fn literal_value_to_f64(v: &omnigraph_compiler::catalog::LiteralValue) -> f64 { // ─── Edge cardinality validation ───────────────────────────────────────────── -pub(crate) async fn validate_edge_cardinality( - db: &crate::db::Omnigraph, - branch: Option<&str>, - edge_name: &str, - written_version: u64, - written_branch: Option<&str>, -) -> Result<()> { - use arrow_array::Array; - let catalog = db.catalog(); - let edge_type = &catalog.edge_types[edge_name]; - if edge_type.cardinality.is_default() { - return Ok(()); - } - - // Open edge sub-table at the just-written version, not the snapshot's - // (the snapshot still pins to the pre-write version). - let snapshot = db.snapshot_for_branch(branch).await?; - let table_key = format!("edge:{}", edge_name); - let entry = snapshot - .entry(&table_key) - .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; - let ds = db - .open_dataset_at_state( - &entry.table_path, - written_branch.or(entry.table_branch.as_deref()), - written_version, - ) - .await?; - - // Scan src column, count per source - let batches = db.storage().scan(&ds, Some(&["src"]), None, None).await?; - - let mut counts: HashMap = HashMap::new(); - for batch in &batches { - let srcs = batch - .column_by_name("src") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - for i in 0..srcs.len() { - *counts.entry(srcs.value(i).to_string()).or_insert(0) += 1; - } - } - - let card = &edge_type.cardinality; - for (src, count) in &counts { - if let Some(max) = card.max { - if *count > max { - return Err(OmniError::manifest(format!( - "@card violation on edge {}: source '{}' has {} edges (max {})", - edge_name, src, count, max - ))); - } - } - if *count < card.min { - return Err(OmniError::manifest(format!( - "@card violation on edge {}: source '{}' has {} edges (min {})", - edge_name, src, count, card.min - ))); - } - 
} - - Ok(()) -} - /// Validate edge `@card` cardinality with in-memory pending edges visible. /// /// Loader-level analog to `exec::mutation::validate_edge_cardinality_with_pending`: /// opens the committed dataset at the pre-load snapshot version, then /// delegates to the shared `count_src_per_edge` + `enforce_cardinality_bounds` -/// helpers in `exec::staging`. Used by Append/Merge loads (the Overwrite -/// path uses `validate_edge_cardinality` which opens the just-written -/// Lance version). +/// helpers in `exec::staging`. Used by every load mode; for `LoadMode::Overwrite` +/// it treats the pending edge batches as the replacement table image (the +/// committed rows are being replaced, so only the pending set is counted). /// /// `mode` controls dedup behavior. `LoadMode::Merge` passes `Some("id")` /// so committed edges that the load is *updating* (same edge id, diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index 96e6196..da31848 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -812,10 +812,12 @@ impl TableStore { /// Legacy inline-commit append: writes fragments AND commits in one /// call, advancing Lance HEAD as a side effect. Not on the /// `TableStorage` trait surface — the staged primitive `stage_append` - /// + `commit_staged` is the engine write path. This inherent - /// `pub(crate)` method survives only for recovery test setup. Do not - /// add new engine call sites — they re-introduce the multi-phase - /// commit drift the trait surface was designed to eliminate. + /// + `commit_staged` is the engine write path. This inherent method + /// survives only for in-source recovery test setup, so it is + /// `#[cfg(test)]`-gated: engine code physically cannot call it (which + /// enforces "no new call sites" by construction and silences the + /// dead-code warning the non-test lib build would otherwise emit). 
+ #[cfg(test)] pub(crate) async fn append_batch( &self, dataset_uri: &str, diff --git a/crates/omnigraph/tests/failpoint_names_guard.rs b/crates/omnigraph/tests/failpoint_names_guard.rs new file mode 100644 index 0000000..df8fc1c --- /dev/null +++ b/crates/omnigraph/tests/failpoint_names_guard.rs @@ -0,0 +1,96 @@ +//! Guard: failpoint names must come from the compile-checked `names` catalog +//! (`omnigraph::failpoints::names` / `omnigraph_cluster::failpoints::names`), +//! never bare string literals. +//! +//! The `names` consts give compile-time typo protection only if every call +//! site uses them. A bare `maybe_fail("typo.literal")` still compiles (the +//! arg is `&str`), so a typo there would silently never fire. This +//! source-walk closes that gap by construction — the same defense-in-depth +//! shape as `forbidden_apis.rs`. Add a new failpoint by adding its const to +//! the catalog first; this guard then forces every call site to reference it. + +use std::path::{Path, PathBuf}; + +/// Call-site prefixes whose first argument must be a `names::` constant. The +/// check is whitespace/newline-tolerant (it skips past the open paren to the +/// first non-whitespace token), so wrapping the call across lines cannot hide +/// a literal — a per-line `contains` scan would miss +/// `park_first(\n "name",\n)`. +const CALL_PREFIXES: &[&str] = &[ + "maybe_fail(", + "ScopedFailPoint::new(", + "ScopedFailPoint::with_callback(", + "park_first(", +]; + +/// 1-based line number of `byte_off` within `contents`. +fn line_of(contents: &str, byte_off: usize) -> usize { + contents[..byte_off].bytes().filter(|&b| b == b'\n').count() + 1 +} + +fn manifest_dir() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) +} + +/// Production call sites live under each crate's `src`; test call sites live +/// in the two failpoint integration binaries. This guard file is deliberately +/// not in the set (it names the patterns as literals itself). 
+fn files_to_scan() -> Vec { + let engine = manifest_dir(); + let cluster = engine.join("../omnigraph-cluster"); + let mut out = Vec::new(); + collect_rs(&engine.join("src"), &mut out); + collect_rs(&cluster.join("src"), &mut out); + out.push(engine.join("tests/failpoints.rs")); + out.push(cluster.join("tests/failpoints.rs")); + out +} + +fn collect_rs(dir: &Path, out: &mut Vec) { + let Ok(entries) = std::fs::read_dir(dir) else { + return; + }; + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + collect_rs(&path, out); + } else if path.extension().is_some_and(|e| e == "rs") { + out.push(path); + } + } +} + +#[test] +fn failpoint_names_use_the_compile_checked_catalog() { + let mut violations = Vec::new(); + for file in files_to_scan() { + let Ok(contents) = std::fs::read_to_string(&file) else { + continue; + }; + for prefix in CALL_PREFIXES { + let mut from = 0; + while let Some(rel) = contents[from..].find(prefix) { + let after_open = from + rel + prefix.len(); + // Skip whitespace (incl. newlines) after the open paren. If the + // first argument token is a `"`, it's a literal failpoint name + // — across a line break or not. 
+ if contents[after_open..].trim_start().starts_with('"') { + violations.push(format!( + "{}:{}: literal failpoint name at `{}` — use a `names::` const", + file.display(), + line_of(&contents, from + rel), + prefix.trim_end_matches('('), + )); + } + from = after_open; + } + } + } + assert!( + violations.is_empty(), + "failpoint names must reference the compile-checked \ + `omnigraph::failpoints::names::*` (or `omnigraph_cluster::failpoints::names::*`) \ + constants, not string literals — a literal typo would silently never fire:\n{}", + violations.join("\n") + ); +} diff --git a/crates/omnigraph/tests/failpoints.rs b/crates/omnigraph/tests/failpoints.rs index 85c056d..cbd57be 100644 --- a/crates/omnigraph/tests/failpoints.rs +++ b/crates/omnigraph/tests/failpoints.rs @@ -3,10 +3,10 @@ mod helpers; use fail::FailScenario; -use futures::FutureExt; use omnigraph::db::Omnigraph; use omnigraph::error::{ManifestErrorKind, OmniError}; use omnigraph::failpoints::ScopedFailPoint; +use omnigraph::failpoints::names; use omnigraph::loader::LoadMode; use serial_test::serial; @@ -32,12 +32,13 @@ fn node_table_uri(root: &str, type_name: &str) -> String { } #[tokio::test] +#[serial] async fn branch_create_failpoint_triggers() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); let db = Omnigraph::init(uri, helpers::TEST_SCHEMA).await.unwrap(); - let _failpoint = ScopedFailPoint::new("branch_create.after_manifest_branch_create", "return"); + let _failpoint = ScopedFailPoint::new(names::BRANCH_CREATE_AFTER_MANIFEST_BRANCH_CREATE, "return"); let err = db.branch_create("feature").await.unwrap_err(); assert!( @@ -52,6 +53,7 @@ async fn branch_create_failpoint_triggers() { // object-store error) must NOT fail the call: the branch is already gone, and // `cleanup` reconciles the stranded fork. The branch name is reusable after. 
#[tokio::test] +#[serial] async fn branch_delete_partial_failure_converges_via_cleanup() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -83,7 +85,7 @@ async fn branch_delete_partial_failure_converges_via_cleanup() { // Inject a failure during per-table cleanup, AFTER the manifest authority // flip. branch_delete must still succeed (best-effort reclaim). { - let _fp = ScopedFailPoint::new("branch_delete.before_table_cleanup", "return"); + let _fp = ScopedFailPoint::new(names::BRANCH_DELETE_BEFORE_TABLE_CLEANUP, "return"); main.branch_delete("feature").await.expect( "branch_delete is best-effort after the manifest flip: a cleanup-step \ failure must not fail the call", @@ -137,11 +139,12 @@ async fn branch_delete_partial_failure_converges_via_cleanup() { // prior delete; run cleanup". (This test was the inverse before the fork-as- // idempotent-reconcile fix; its flip is the signal the bug class is closed.) #[tokio::test] +#[serial] async fn recreate_over_orphaned_fork_self_heals_without_cleanup() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap().to_string(); - let mut main = helpers::init_and_load(&dir).await; + let main = helpers::init_and_load(&dir).await; main.branch_create("feature").await.unwrap(); let mut feature = Omnigraph::open(&uri).await.unwrap(); @@ -158,7 +161,7 @@ async fn recreate_over_orphaned_fork_self_heals_without_cleanup() { // Partial delete: leaves the Person fork orphaned (cleanup not yet run). { - let _fp = ScopedFailPoint::new("branch_delete.before_table_cleanup", "return"); + let _fp = ScopedFailPoint::new(names::BRANCH_DELETE_BEFORE_TABLE_CLEANUP, "return"); main.branch_delete("feature").await.unwrap(); } @@ -195,6 +198,7 @@ async fn recreate_over_orphaned_fork_self_heals_without_cleanup() { // leave the ref in place. 
It must not squeeze the ambiguity through // ExpectedVersionMismatch with expected == actual, which lies about the cause. #[tokio::test] +#[serial] async fn recreate_over_orphaned_fork_reports_indeterminate_authority_read() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -211,7 +215,7 @@ async fn recreate_over_orphaned_fork_reports_indeterminate_authority_read() { let row = r#"{"type":"Person","data":{"name":"Grace","age":37}}"#; { - let _fp = ScopedFailPoint::new("classify.fresh_read", "return"); + let _fp = ScopedFailPoint::new(names::CLASSIFY_FRESH_READ, "return"); let err = db .load_as("feature", None, row, LoadMode::Merge, None) .await @@ -257,6 +261,7 @@ async fn recreate_over_orphaned_fork_reports_indeterminate_authority_read() { // surfaced per-table in the returned stats, and the independent reconcile pass // still reclaimed an orphan. #[tokio::test] +#[serial] async fn cleanup_isolates_single_table_failure() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -272,7 +277,7 @@ async fn cleanup_isolates_single_table_failure() { } // One table's version GC fails once; the sweep must isolate it. - let _fp = ScopedFailPoint::new("cleanup.table_gc", "1*return"); + let _fp = ScopedFailPoint::new(names::CLEANUP_TABLE_GC, "1*return"); let stats = db .cleanup(omnigraph::db::CleanupPolicyOptions { keep_versions: Some(1), @@ -306,6 +311,7 @@ async fn cleanup_isolates_single_table_failure() { // isolated (logged, not propagated) so the sweep continues, and a later // cleanup converges. This is the loop the Devin finding was about. #[tokio::test] +#[serial] async fn cleanup_isolates_reconcile_failure() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -323,7 +329,7 @@ async fn cleanup_isolates_reconcile_failure() { // Inject a one-shot failure into the reconcile force-delete. The sweep must // not abort. 
{ - let _fp = ScopedFailPoint::new("cleanup.reconcile_fork", "1*return"); + let _fp = ScopedFailPoint::new(names::CLEANUP_RECONCILE_FORK, "1*return"); db.cleanup(omnigraph::db::CleanupPolicyOptions { keep_versions: Some(1), older_than: None, @@ -359,6 +365,7 @@ async fn cleanup_isolates_reconcile_failure() { // per-table forks. A delete whose best-effort commit-graph reclaim fails leaves // a commit-graph orphan; the next cleanup must drop it. #[tokio::test] +#[serial] async fn cleanup_reclaims_orphaned_commit_graph_branch() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -369,7 +376,7 @@ async fn cleanup_reclaims_orphaned_commit_graph_branch() { // Delete, failing the commit-graph reclaim → commit-graph "feature" orphan // (manifest branch gone, commit-graph branch left behind). { - let _fp = ScopedFailPoint::new("branch_delete.before_commit_graph_reclaim", "return"); + let _fp = ScopedFailPoint::new(names::BRANCH_DELETE_BEFORE_COMMIT_GRAPH_RECLAIM, "return"); db.branch_delete("feature").await.unwrap(); } @@ -407,6 +414,7 @@ async fn cleanup_reclaims_orphaned_commit_graph_branch() { // the next run once the read succeeds. This pins the Indeterminate arm and the // don't-destroy-on-ambiguity rule end-to-end through cleanup. #[tokio::test] +#[serial] async fn reconcile_skips_fork_when_fresh_recheck_is_unavailable_then_converges() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -430,7 +438,7 @@ async fn reconcile_skips_fork_when_fresh_recheck_is_unavailable_then_converges() // With the fresh re-check failing, the fork's status is Indeterminate (the // branch is live but unreadable) → cleanup must SKIP it, not delete. 
{ - let _fp = ScopedFailPoint::new("classify.fresh_read", "return"); + let _fp = ScopedFailPoint::new(names::CLASSIFY_FRESH_READ, "return"); db.cleanup(omnigraph::db::CleanupPolicyOptions { keep_versions: Some(1), older_than: None, @@ -465,6 +473,7 @@ async fn reconcile_skips_fork_when_fresh_recheck_is_unavailable_then_converges() // succeed (branch_create force-deletes a stale commit-graph ref since the // manifest branch is created fresh), instead of dying on the leftover ref. #[tokio::test] +#[serial] async fn branch_create_recreates_over_commit_graph_zombie() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -476,7 +485,7 @@ async fn branch_create_recreates_over_commit_graph_zombie() { { // Fail the best-effort commit-graph reclaim → commit-graph "feature" // zombie survives the delete (manifest authority still flips). - let _fp = ScopedFailPoint::new("branch_delete.before_commit_graph_reclaim", "return"); + let _fp = ScopedFailPoint::new(names::BRANCH_DELETE_BEFORE_COMMIT_GRAPH_RECLAIM, "return"); db.branch_delete("feature").await.unwrap(); } assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]); @@ -497,6 +506,7 @@ async fn branch_create_recreates_over_commit_graph_zombie() { // the branch does not half-exist. The existing failpoint fires right after the // manifest create, standing in for any post-authority failure. 
#[tokio::test] +#[serial] async fn branch_create_rolls_back_manifest_on_commit_graph_failure() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -505,7 +515,7 @@ async fn branch_create_rolls_back_manifest_on_commit_graph_failure() { .unwrap(); let err = { - let _fp = ScopedFailPoint::new("branch_create.after_manifest_branch_create", "return"); + let _fp = ScopedFailPoint::new(names::BRANCH_CREATE_AFTER_MANIFEST_BRANCH_CREATE, "return"); db.branch_create("feature").await.unwrap_err() }; assert!( @@ -524,42 +534,21 @@ async fn branch_create_rolls_back_manifest_on_commit_graph_failure() { // an orphan, so it must be a retryable "refresh and retry", never a misleading // "run cleanup". // -// Ordering is made deterministic (no sleeps) via a callback at the fork point: -// `compare_exchange` lets only the FIRST arrival (writer A) record readiness and -// block until released; later arrivals (writer B) fall through. The test waits -// on the readiness flag, lets B win and commit the fork, then releases A. -static FORK_A_AT_POINT: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false); -static FORK_RELEASE_A: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false); - +// Ordering is made deterministic (no fixed sleeps) via the shared rendezvous: +// it parks the first arrival (writer A) at the fork point until released; later +// arrivals (writer B) fall through. The test waits on the reached condition, +// lets B win and commit the fork, then releases A. 
#[tokio::test(flavor = "multi_thread")] +#[serial] async fn fork_collision_with_live_concurrent_fork_is_retryable() { - use std::sync::atomic::Ordering::SeqCst; - let _scenario = FailScenario::setup(); - FORK_A_AT_POINT.store(false, SeqCst); - FORK_RELEASE_A.store(false, SeqCst); let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap().to_string(); let main = helpers::init_and_load(&dir).await; main.branch_create("feature").await.unwrap(); - // First arrival (A) records readiness and blocks until released; the rest - // (B) fall through immediately. Bounded spin so a mistake can't hang forever. - fail::cfg_callback("fork.before_classify", || { - if FORK_A_AT_POINT - .compare_exchange(false, true, SeqCst, SeqCst) - .is_ok() - { - for _ in 0..2000 { - if FORK_RELEASE_A.load(SeqCst) { - break; - } - std::thread::sleep(std::time::Duration::from_millis(5)); - } - } - }) - .unwrap(); + let rv = helpers::failpoint::Rendezvous::park_first(names::FORK_BEFORE_CLASSIFY); let uri_a = uri.clone(); let writer_a = tokio::spawn(async move { @@ -574,17 +563,8 @@ async fn fork_collision_with_live_concurrent_fork_is_retryable() { .await }); - // Wait (bounded) until A is parked at the fork point. - for _ in 0..600 { - if FORK_A_AT_POINT.load(SeqCst) { - break; - } - tokio::time::sleep(std::time::Duration::from_millis(5)).await; - } - assert!( - FORK_A_AT_POINT.load(SeqCst), - "writer A never reached the fork point" - ); + // Wait until A is parked at the fork point. + rv.wait_until_reached().await; // B wins the fork and commits it. let mut b = Omnigraph::open(&uri).await.unwrap(); @@ -599,12 +579,11 @@ async fn fork_collision_with_live_concurrent_fork_is_retryable() { .unwrap(); // Release A; it resumes, re-reads the manifest, and sees the fork is live. 
- FORK_RELEASE_A.store(true, SeqCst); + rv.release(); let err = writer_a .await .unwrap() .expect_err("A's stale-snapshot fork should be a retryable conflict"); - fail::remove("fork.before_classify"); let msg = err.to_string(); assert!( @@ -618,13 +597,14 @@ async fn fork_collision_with_live_concurrent_fork_is_retryable() { } #[tokio::test(flavor = "multi_thread")] +#[serial] async fn graph_publish_failpoint_triggers_before_commit_append() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); let mut db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA) .await .unwrap(); - let _failpoint = ScopedFailPoint::new("graph_publish.before_commit_append", "return"); + let _failpoint = ScopedFailPoint::new(names::GRAPH_PUBLISH_BEFORE_COMMIT_APPEND, "return"); let err = mutate_main( &mut db, @@ -646,6 +626,7 @@ async fn graph_publish_failpoint_triggers_before_commit_append() { // state. #[tokio::test] +#[serial] async fn schema_apply_pre_commit_crash_rolls_forward_via_sidecar() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -653,7 +634,7 @@ async fn schema_apply_pre_commit_crash_rolls_forward_via_sidecar() { { let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap(); - let _failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "return"); + let _failpoint = ScopedFailPoint::new(names::SCHEMA_APPLY_AFTER_STAGING_WRITE, "return"); let err = db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap_err(); assert!( err.to_string() @@ -689,6 +670,7 @@ async fn schema_apply_pre_commit_crash_rolls_forward_via_sidecar() { } #[tokio::test] +#[serial] async fn schema_apply_recovers_post_commit_crash() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -696,7 +678,7 @@ async fn schema_apply_recovers_post_commit_crash() { { let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap(); - let _failpoint = ScopedFailPoint::new("schema_apply.after_manifest_commit", 
"return"); + let _failpoint = ScopedFailPoint::new(names::SCHEMA_APPLY_AFTER_MANIFEST_COMMIT, "return"); let err = db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap_err(); assert!( err.to_string() @@ -714,6 +696,7 @@ async fn schema_apply_recovers_post_commit_crash() { } #[tokio::test] +#[serial] async fn schema_apply_recovers_partial_rename() { // Construct a partial-rename state: _schema.pg has been renamed in // (matching v2), but _schema.ir.json.staging and __schema_state.json.staging @@ -778,6 +761,7 @@ async fn schema_apply_recovers_partial_rename() { /// Continuous in-process recovery (no restart needed between failure /// and recovery) is the goal of a future background reconciler. #[tokio::test] +#[serial] async fn recovery_rolls_forward_after_finalize_publisher_failure() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -787,7 +771,7 @@ async fn recovery_rolls_forward_after_finalize_publisher_failure() { // Setup: trigger the residual. { let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap(); - let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _failpoint = ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); // The mutation's finalize completes (commit_staged advances Lance // HEAD on node:Person AND writes a `__recovery/{ulid}.json` @@ -864,12 +848,114 @@ async fn recovery_rolls_forward_after_finalize_publisher_failure() { ); } -#[tokio::test] -async fn inline_delete_conflict_writes_sidecar_before_rejecting() { +/// Regression for iss-schema-apply-reopen-recovery-race: the open-time +/// recovery sweep's roll-forward must CONVERGE (not fatally error the open) +/// when a concurrent writer advances the manifest past the sidecar's pin +/// during the classify→publish window. +/// +/// Two concurrent `Omnigraph::open` sweeps race the same pending sidecar. 
+/// One is parked at `recovery.before_roll_forward_publish` (after it has +/// classified `RolledPastExpected`, before its publish CAS); the other falls +/// through, rolls the sidecar forward (manifest v → v+1), and deletes it. The +/// parked sweep then loses its publish CAS at the now-stale `expected = v`. +/// The manifest already reached the sidecar's goal, so this is convergence, +/// not a logical conflict — the open must succeed, not panic with +/// `ExpectedVersionMismatch`. +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[serial] +async fn open_sweep_roll_forward_converges_when_manifest_advances_concurrently() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap().to_string(); - let db = helpers::init_and_load(&dir).await; + + // Setup: leave one pending sidecar (node:Person at Lance v+1, manifest v). + { + let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap(); + let _failpoint = + ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap_err(); + } + assert_eq!( + std::fs::read_dir(dir.path().join("__recovery")) + .unwrap() + .count(), + 1, + "exactly one pending sidecar must persist for the sweep to roll forward" + ); + + // Park the FIRST sweep to reach the publish window; later arrivals fall + // through. wait_until_reached gates the second open so it is guaranteed + // to be the one that converges the sidecar. 
+ let rv = helpers::failpoint::Rendezvous::park_first( + names::RECOVERY_BEFORE_ROLL_FORWARD_PUBLISH, + ); + + let uri_parked = uri.clone(); + let parked_open = tokio::spawn(async move { Omnigraph::open(&uri_parked).await }); + rv.wait_until_reached().await; + + // A concurrent open rolls the sidecar forward (manifest v → v+1) and + // deletes it, advancing the manifest past the parked sweep's pin. + let converging_open = Omnigraph::open(&uri) + .await + .expect("the second open's sweep should roll the sidecar forward and succeed"); + assert_eq!( + helpers::count_rows(&converging_open, "node:Person").await, + 1, + "the converging open must publish the rolled-forward Person row" + ); + + // Release the parked sweep: its publish CAS finds the manifest already at + // the goal. It must converge, not fail the open. + rv.release(); + parked_open + .await + .expect("the parked open task must not panic") + .expect( + "the open-time sweep must converge when the manifest already reached \ + the sidecar's goal, not fail the open with ExpectedVersionMismatch", + ); + + // The sidecar is gone and the graph is readable and consistent. + let recovery_dir = dir.path().join("__recovery"); + if recovery_dir.exists() { + assert_eq!( + std::fs::read_dir(&recovery_dir).unwrap().count(), + 0, + "the sidecar must be gone after both sweeps converge" + ); + } + let db = Omnigraph::open(&uri).await.unwrap(); + assert_eq!(helpers::count_rows(&db, "node:Person").await, 1); + + // Exactly one RolledForward audit row for this recovery event: the loser's + // convergence path must NOT append a duplicate once the winner already + // recorded the audit and deleted the sidecar (append-idempotent per + // operation_id). Two rows here would be the duplicate-audit regression. 
+ let kinds = helpers::recovery::recovery_audit_kinds(dir.path()).await; + assert_eq!( + kinds.len(), + 1, + "exactly one recovery audit row expected after concurrent convergence, got {kinds:?}" + ); +} + +#[tokio::test(flavor = "multi_thread")] +#[serial] +async fn inline_delete_conflict_writes_sidecar_before_rejecting() { + use std::sync::Arc; + + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let db = Arc::new(helpers::init_and_load(&dir).await); let pre_snapshot = db .snapshot_of(omnigraph::db::ReadTarget::branch("main")) @@ -879,39 +965,37 @@ async fn inline_delete_conflict_writes_sidecar_before_rejecting() { let person_uri = node_table_uri(&uri, "Person"); { - let _pause_delete = - ScopedFailPoint::new("mutation.delete_node_pre_primary_delete", "pause"); - let delete_params = helpers::params(&[("$name", "Alice")]); - let delete = db.mutate("main", MUTATION_QUERIES, "remove_person", &delete_params); - tokio::pin!(delete); - - let mut concurrent_update_succeeded = false; - for _ in 0..50 { - if delete.as_mut().now_or_never().is_some() { - panic!("delete mutation completed before primary-delete failpoint was released"); - } - let mut concurrent = Omnigraph::open_read_only(&uri).await.unwrap(); - if mutate_main( - &mut concurrent, - MUTATION_QUERIES, - "set_age", - &mixed_params(&[("$name", "Bob")], &[("$age", 26)]), - ) - .await - .is_ok() - { - concurrent_update_succeeded = true; - break; - } - tokio::time::sleep(std::time::Duration::from_millis(20)).await; - } - assert!( - concurrent_update_succeeded, - "concurrent update must land while delete is paused" + // Park the delete at the primary-delete point. The concurrent update + // then lands deterministically before the delete resumes, so the + // delete's manifest CAS is guaranteed stale — no retry loop, no sleep. 
+ let rv = helpers::failpoint::Rendezvous::park_first( + names::MUTATION_DELETE_NODE_PRE_PRIMARY_DELETE, ); - fail::remove("mutation.delete_node_pre_primary_delete"); - let err = delete.await.unwrap_err(); + let del_db = Arc::clone(&db); + let delete = tokio::spawn(async move { + let delete_params = helpers::params(&[("$name", "Alice")]); + del_db + .mutate("main", MUTATION_QUERIES, "remove_person", &delete_params) + .await + }); + + rv.wait_until_reached().await; + + // Concurrent update lands while the delete is parked. + let mut concurrent = Omnigraph::open_read_only(&uri).await.unwrap(); + mutate_main( + &mut concurrent, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 26)]), + ) + .await + .expect("concurrent update must land while delete is paused"); + + rv.release(); + + let err = delete.await.unwrap().unwrap_err(); assert!( err.to_string().contains("stale view of 'node:Person'") || err.to_string().contains("ExpectedVersionMismatch") @@ -943,6 +1027,7 @@ async fn inline_delete_conflict_writes_sidecar_before_rejecting() { } #[tokio::test] +#[serial] async fn recovery_rolls_forward_load_on_feature_branch() { use omnigraph::loader::LoadMode; @@ -973,7 +1058,7 @@ async fn recovery_rolls_forward_load_on_feature_branch() { .table_version; feature_parent_commit_id = branch_head_commit_id(dir.path(), "feature").await.unwrap(); - let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _failpoint = ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); let err = db .load( "feature", @@ -1038,6 +1123,7 @@ async fn recovery_rolls_forward_load_on_feature_branch() { } #[tokio::test] +#[serial] async fn recovery_rolls_forward_load_overwrite() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -1054,7 +1140,7 @@ async fn recovery_rolls_forward_load_overwrite() { .unwrap(); parent_commit_id = branch_head_commit_id(dir.path(), "main").await.unwrap(); - let _failpoint = 
ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _failpoint = ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); let err = db .load( "main", @@ -1108,6 +1194,7 @@ async fn recovery_rolls_forward_load_overwrite() { } #[tokio::test] +#[serial] async fn recovery_rolls_forward_ensure_indices_on_feature_branch() { use lance::index::DatasetIndexExt; use omnigraph::loader::{LoadMode, load_jsonl}; @@ -1182,7 +1269,7 @@ async fn recovery_rolls_forward_ensure_indices_on_feature_branch() { { let _failpoint = - ScopedFailPoint::new("ensure_indices.post_phase_b_pre_manifest_commit", "return"); + ScopedFailPoint::new(names::ENSURE_INDICES_POST_PHASE_B_PRE_MANIFEST_COMMIT, "return"); let err = db.ensure_indices_on("feature").await.unwrap_err(); assert!( err.to_string().contains( @@ -1250,6 +1337,7 @@ async fn recovery_rolls_forward_ensure_indices_on_feature_branch() { /// on the same handle succeeds without restart and without /// ExpectedVersionMismatch. #[tokio::test] +#[serial] async fn refresh_runs_roll_forward_recovery_in_process() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -1259,7 +1347,7 @@ async fn refresh_runs_roll_forward_recovery_in_process() { // Setup: trigger the residual (sidecar persists; manifest unchanged). { - let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _failpoint = ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); let err = mutate_main( &mut db, MUTATION_QUERIES, @@ -1329,6 +1417,7 @@ async fn refresh_runs_roll_forward_recovery_in_process() { /// drift in-process: no restart, no explicit `refresh()`, no /// `omnigraph repair`. 
#[tokio::test] +#[serial] async fn load_after_finalize_publisher_failure_heals_without_reopen() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -1342,7 +1431,7 @@ async fn load_after_finalize_publisher_failure_heals_without_reopen() { // commit_staged (Lance HEAD advances on three tables), then the // publisher is wedged before the manifest commit. { - let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _failpoint = ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); let err = load_jsonl( &mut db, r#"{"type":"Person","data":{"name":"Alice","age":30}} @@ -1405,6 +1494,7 @@ async fn load_after_finalize_publisher_failure_heals_without_reopen() { /// recover — and the same handle must write normally once the fault /// clears (a transient storage error never wedges the graph). #[tokio::test] +#[serial] async fn sidecar_write_failure_aborts_load_with_no_head_advance() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -1422,7 +1512,7 @@ async fn sidecar_write_failure_aborts_load_with_no_head_advance() { .version; { - let _failpoint = ScopedFailPoint::new("recovery.sidecar_write", "return"); + let _failpoint = ScopedFailPoint::new(names::RECOVERY_SIDECAR_WRITE, "return"); let err = load_jsonl( &mut db, r#"{"type":"Person","data":{"name":"Alice","age":30}} @@ -1490,6 +1580,7 @@ async fn sidecar_write_failure_aborts_load_with_no_head_advance() { /// local filesystem backend. Skips unless `OMNIGRAPH_S3_TEST_BUCKET` is set /// (same gate as `s3_storage.rs`); CI runs it against RustFS. #[tokio::test] +#[serial] async fn s3_load_recovers_after_publisher_failure_without_reopen() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -1507,7 +1598,7 @@ async fn s3_load_recovers_after_publisher_failure_without_reopen() { // Failed load: commit_staged lands on S3, manifest publish does not; // the sidecar PUT went through the S3 adapter. 
{ - let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _failpoint = ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); let err = load_jsonl( &mut db, r#"{"type":"Person","data":{"name":"Alice","age":30}} @@ -1554,6 +1645,7 @@ async fn s3_load_recovers_after_publisher_failure_without_reopen() { /// documented retry tolerance in `record_audit`'s contract, exercised /// end-to-end through a real injected failure. #[tokio::test] +#[serial] async fn record_audit_failure_after_roll_forward_converges_on_next_write() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -1565,7 +1657,7 @@ async fn record_audit_failure_after_roll_forward_converges_on_next_write() { // Pending sidecar with real drift. { - let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _failpoint = ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); load_jsonl( &mut db, r#"{"type":"Person","data":{"name":"Alice","age":30}} @@ -1581,7 +1673,7 @@ async fn record_audit_failure_after_roll_forward_converges_on_next_write() { // the audit write fails — the write must fail loudly and the sidecar // must survive for the retry. { - let _failpoint = ScopedFailPoint::new("recovery.record_audit", "return"); + let _failpoint = ScopedFailPoint::new(names::RECOVERY_RECORD_AUDIT, "return"); let err = load_jsonl( &mut db, r#"{"type":"Person","data":{"name":"Bob","age":25}} @@ -1645,6 +1737,7 @@ async fn record_audit_failure_after_roll_forward_converges_on_next_write() { /// (which would be consumer tolerance of drift). Once the fault clears, /// open recovers normally. #[tokio::test] +#[serial] async fn sidecar_list_failure_fails_write_and_open_loudly_then_clears() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -1656,7 +1749,7 @@ async fn sidecar_list_failure_fails_write_and_open_loudly_then_clears() { // Pending sidecar via the usual finalize → publisher failure. 
{ - let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _failpoint = ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); let err = load_jsonl( &mut db, r#"{"type":"Person","data":{"name":"Alice","age":30}} @@ -1674,7 +1767,7 @@ async fn sidecar_list_failure_fails_write_and_open_loudly_then_clears() { assert_eq!(std::fs::read_dir(&recovery_dir).unwrap().count(), 1); } - let _failpoint = ScopedFailPoint::new("recovery.sidecar_list", "return"); + let _failpoint = ScopedFailPoint::new(names::RECOVERY_SIDECAR_LIST, "return"); // Write-entry heal: the list failure surfaces as the write's error — // no silent skip that would proceed over the pending sidecar. @@ -1725,6 +1818,7 @@ async fn sidecar_list_failure_fails_write_and_open_loudly_then_clears() { /// consumed by the next write's entry heal (attributed `RolledForward` /// audit row), not by an operator. #[tokio::test] +#[serial] async fn sidecar_delete_failure_keeps_write_success_and_next_write_heals() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -1735,7 +1829,7 @@ async fn sidecar_delete_failure_keeps_write_success_and_next_write_heals() { let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap(); { - let _failpoint = ScopedFailPoint::new("recovery.sidecar_delete", "return"); + let _failpoint = ScopedFailPoint::new(names::RECOVERY_SIDECAR_DELETE, "return"); // The load itself must succeed: commit_staged + manifest publish // landed; only the Phase D cleanup failed (swallowed + logged). load_jsonl( @@ -1783,6 +1877,7 @@ async fn sidecar_delete_failure_keeps_write_success_and_next_write_heals() { /// PUT failure must abort the merge before any target-table HEAD moves; /// retrying after the fault clears merges cleanly. 
#[tokio::test] +#[serial] async fn sidecar_write_failure_aborts_branch_merge_with_no_head_advance() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -1830,7 +1925,7 @@ async fn sidecar_write_failure_aborts_branch_merge_with_no_head_advance() { .version; { - let _failpoint = ScopedFailPoint::new("recovery.sidecar_write", "return"); + let _failpoint = ScopedFailPoint::new(names::RECOVERY_SIDECAR_WRITE, "return"); let err = db.branch_merge("feature", "main").await.unwrap_err(); assert!( err.to_string() @@ -1873,6 +1968,7 @@ async fn sidecar_write_failure_aborts_branch_merge_with_no_head_advance() { /// `refresh()` (which `refresh_runs_roll_forward_recovery_in_process` /// covers), no reopen. #[tokio::test] +#[serial] async fn mutation_after_finalize_publisher_failure_heals_without_reopen() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -1881,7 +1977,7 @@ async fn mutation_after_finalize_publisher_failure_heals_without_reopen() { let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap(); { - let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _failpoint = ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); let err = mutate_main( &mut db, MUTATION_QUERIES, @@ -1936,6 +2032,7 @@ async fn mutation_after_finalize_publisher_failure_heals_without_reopen() { /// runs, so a long-lived handle can evolve the schema without a /// restart after a Phase B → Phase C failure. 
#[tokio::test] +#[serial] async fn schema_apply_after_finalize_publisher_failure_heals_without_reopen() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -1946,7 +2043,7 @@ async fn schema_apply_after_finalize_publisher_failure_heals_without_reopen() { let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap(); { - let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _failpoint = ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); let err = load_jsonl( &mut db, r#"{"type":"Person","data":{"name":"Alice","age":30}} @@ -1995,6 +2092,7 @@ async fn schema_apply_after_finalize_publisher_failure_heals_without_reopen() { /// (with its recovery audit row) before the merge reads its target /// snapshot — not silently folded into the merge's publish. #[tokio::test] +#[serial] async fn branch_merge_after_finalize_publisher_failure_heals_without_reopen() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -2027,7 +2125,7 @@ async fn branch_merge_after_finalize_publisher_failure_heals_without_reopen() { // Failed load on MAIN: Person drifts ahead of the manifest with a // sidecar covering it. { - let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _failpoint = ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); let err = load_jsonl( &mut db, r#"{"type":"Person","data":{"name":"Bob","age":25}} @@ -2079,6 +2177,7 @@ async fn branch_merge_after_finalize_publisher_failure_heals_without_reopen() { /// the audit already written — the retry must NOT append a second /// audit row for the same operation, only finish the delete. 
#[tokio::test] +#[serial] async fn orphaned_branch_discard_is_idempotent_across_delete_failure() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -2141,7 +2240,7 @@ async fn orphaned_branch_discard_is_idempotent_across_delete_failure() { // First write: the discard path writes its audit row, then the // sidecar delete fails (injected). The write fails loudly. { - let _failpoint = ScopedFailPoint::new("recovery.sidecar_delete", "return"); + let _failpoint = ScopedFailPoint::new(names::RECOVERY_SIDECAR_DELETE, "return"); let err = load_jsonl( &mut db, "{\"type\":\"Person\",\"data\":{\"name\":\"Bob\",\"age\":25}}\n", @@ -2185,6 +2284,7 @@ async fn orphaned_branch_discard_is_idempotent_across_delete_failure() { /// while a sidecar is pending. Sequenced failpoint: first list (entry /// heal) passes, second list (the guard) fails. #[tokio::test] +#[serial] async fn drift_guard_names_both_paths_when_sidecar_list_fails() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -2244,7 +2344,7 @@ async fn drift_guard_names_both_paths_when_sidecar_list_fails() { // First list (entry heal) passes and defers the sidecar; second // list (the guard's classification) fails. - let _failpoint = ScopedFailPoint::new("recovery.sidecar_list", "1*off->1*return"); + let _failpoint = ScopedFailPoint::new(names::RECOVERY_SIDECAR_LIST, "1*off->1*return"); let err = load_jsonl( &mut db, "{\"type\":\"Person\",\"data\":{\"name\":\"bob\",\"age\":25}}\n", @@ -2272,6 +2372,7 @@ async fn drift_guard_names_both_paths_when_sidecar_list_fails() { /// Gap): bounded commit-graph noise, never a lost or duplicated audit /// record under clean failures. #[tokio::test] +#[serial] async fn orphaned_branch_discard_converges_across_audit_append_failure() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -2332,7 +2433,7 @@ async fn orphaned_branch_discard_converges_across_audit_append_failure() { // fails (injected). 
The write fails loudly; the sidecar survives so // the discard is retried with the audit still owed. { - let _failpoint = ScopedFailPoint::new("recovery.orphan_discard_audit_append", "return"); + let _failpoint = ScopedFailPoint::new(names::RECOVERY_ORPHAN_DISCARD_AUDIT_APPEND, "return"); let err = load_jsonl( &mut db, "{\"type\":\"Person\",\"data\":{\"name\":\"Bob\",\"age\":25}}\n", @@ -2386,6 +2487,7 @@ async fn orphaned_branch_discard_converges_across_audit_append_failure() { /// writes against the stale catalog rejects rows of types the graph /// already has. #[tokio::test] +#[serial] async fn load_after_schema_apply_phase_b_failure_uses_recovered_catalog() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -2425,7 +2527,7 @@ edge Knows: Person -> Person { edge WorksAt: Person -> Company "#; { - let _failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "return"); + let _failpoint = ScopedFailPoint::new(names::SCHEMA_APPLY_AFTER_STAGING_WRITE, "return"); let err = db.apply_schema(v2_schema).await.unwrap_err(); assert!( err.to_string() @@ -2467,6 +2569,7 @@ edge WorksAt: Person -> Company /// the resumed apply's own renames then fail on the missing sources: /// an error (and a corrupted catalog) for an otherwise-healthy apply. #[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[serial] async fn heal_does_not_promote_live_schema_apply_staging() { use omnigraph::loader::LoadMode; use std::sync::Arc; @@ -2477,22 +2580,17 @@ async fn heal_does_not_promote_live_schema_apply_staging() { let db = Arc::new(Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap()); - // Pause the apply right after its staging files land (its sidecar is + // Park the apply right after its staging files land (its sidecar is // already on disk from Phase A; the manifest commit has not run). 
- let failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "pause"); + let rv = helpers::failpoint::Rendezvous::park_first(names::SCHEMA_APPLY_AFTER_STAGING_WRITE); let apply_db = Arc::clone(&db); let desired = format!("{}\nnode Tag {{ name: String @key }}\n", helpers::TEST_SCHEMA); let apply = tokio::spawn(async move { apply_db.apply_schema(&desired).await }); - // Wait until the apply is parked in the window: staging on disk. + // Wait until the apply is parked in the window (staging files written). + rv.wait_until_reached().await; let staging_pg = dir.path().join("_schema.pg.staging"); - for _ in 0..500 { - if staging_pg.exists() { - break; - } - tokio::time::sleep(std::time::Duration::from_millis(10)).await; - } assert!(staging_pg.exists(), "schema apply never reached the paused window"); // Concurrent load on the same handle: its entry heal runs while the @@ -2516,7 +2614,7 @@ async fn heal_does_not_promote_live_schema_apply_staging() { // stole the apply's commit); fixed code leaves the load blocked on // the schema-apply serialization key until the apply finishes. tokio::time::sleep(std::time::Duration::from_millis(500)).await; - drop(failpoint); + rv.release(); let apply_result = apply.await.unwrap(); let _ = tokio::time::timeout(std::time::Duration::from_secs(30), load) @@ -2546,6 +2644,7 @@ async fn heal_does_not_promote_live_schema_apply_staging() { /// sidecar still on disk, Lance HEAD unchanged (no restore commit). /// Then drop + open: full sweep handles it. #[tokio::test] +#[serial] async fn refresh_defers_rollback_eligible_sidecar_to_next_open() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -2718,6 +2817,7 @@ async fn refresh_defers_rollback_eligible_sidecar_to_next_open() { /// on one table leaves OTHER tables untouched. Subsequent writes to /// non-drifted tables proceed normally; the drift is contained. 
#[tokio::test] +#[serial] async fn finalize_publisher_residual_does_not_drift_untouched_tables() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -2726,7 +2826,7 @@ async fn finalize_publisher_residual_does_not_drift_untouched_tables() { .unwrap(); { - let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); + let _failpoint = ScopedFailPoint::new(names::MUTATION_POST_FINALIZE_PRE_PUBLISHER, "return"); let _ = mutate_main( &mut db, MUTATION_QUERIES, @@ -2764,6 +2864,7 @@ async fn finalize_publisher_residual_does_not_drift_untouched_tables() { /// a roll-forward-only heal and proceed — they do not refuse on a pending /// sidecar the way `optimize`/`repair` do — so the write succeeds with no drift. #[tokio::test] +#[serial] async fn ensure_indices_stage_btree_failure_leaves_existing_tables_writable() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -2791,7 +2892,7 @@ async fn ensure_indices_stage_btree_failure_leaves_existing_tables_writable() { // ensure_indices builds the deferred `age` BTREE on Person; the failpoint // fires between stage and commit, so Person's Lance HEAD does not move. let _failpoint = - ScopedFailPoint::new("ensure_indices.post_stage_pre_commit_btree", "return"); + ScopedFailPoint::new(names::ENSURE_INDICES_POST_STAGE_PRE_COMMIT_BTREE, "return"); let err = db.ensure_indices().await.unwrap_err(); assert!( err.to_string() @@ -2844,6 +2945,7 @@ fn assert_no_staging_files(graph: &std::path::Path) { // ExpectedVersionMismatch. 
#[tokio::test] +#[serial] async fn schema_apply_without_schema_staging_rolls_back_on_next_open() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -2871,7 +2973,7 @@ async fn schema_apply_without_schema_staging_rolls_back_on_next_open() { { let db = Omnigraph::open(&uri).await.unwrap(); - let _failpoint = ScopedFailPoint::new("schema_apply.before_staging_write", "return"); + let _failpoint = ScopedFailPoint::new(names::SCHEMA_APPLY_BEFORE_STAGING_WRITE, "return"); let v2_schema = r#"node Person { name: String @key age: I32? @@ -2941,6 +3043,7 @@ edge WorksAt: Person -> Company } #[tokio::test] +#[serial] async fn schema_apply_phase_b_failure_recovered_on_next_open() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -2976,7 +3079,7 @@ async fn schema_apply_phase_b_failure_recovered_on_next_open() { // written, but BEFORE the manifest publish. The recovery sidecar persists. { let db = Omnigraph::open(&uri).await.unwrap(); - let _failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "return"); + let _failpoint = ScopedFailPoint::new(names::SCHEMA_APPLY_AFTER_STAGING_WRITE, "return"); // v2 schema: add a `city` property to Person AND add a new // `Tag` node type. The new property triggers the rewritten_tables // path (Phase B sidecar coverage). 
The new type changes the @@ -3123,7 +3226,7 @@ async fn optimize_phase_b_failure_recovered_on_next_open() { { let db = Omnigraph::open(&uri).await.unwrap(); let _failpoint = - ScopedFailPoint::new("optimize.post_phase_b_pre_manifest_commit", "return"); + ScopedFailPoint::new(names::OPTIMIZE_POST_PHASE_B_PRE_MANIFEST_COMMIT, "return"); let err = db.optimize().await.unwrap_err(); assert!( err.to_string().contains( @@ -3212,7 +3315,7 @@ async fn optimize_survives_concurrent_insert_advancing_manifest() { // HEAD == manifest (no in-flight optimize drift for the writer to trip on); the // insert advances the manifest, then optimize compacts on top and must converge // its publish over the advanced manifest rather than fail the equality CAS. - let failpoint = ScopedFailPoint::new("optimize.before_compact", "pause"); + let failpoint = ScopedFailPoint::new(names::OPTIMIZE_BEFORE_COMPACT, "pause"); let uri_opt = uri.clone(); let optimize = tokio::spawn(async move { @@ -3289,7 +3392,7 @@ async fn optimize_survives_concurrent_delete_before_compaction() { } // Pause optimize BEFORE its compaction commits. - let failpoint = ScopedFailPoint::new("optimize.before_compact", "pause"); + let failpoint = ScopedFailPoint::new(names::OPTIMIZE_BEFORE_COMPACT, "pause"); let uri_opt = uri.clone(); let optimize = tokio::spawn(async move { @@ -3367,7 +3470,7 @@ async fn optimize_retry_does_not_misclassify_own_head_drift() { // Inject exactly one retryable reindex conflict: attempt 1 compacts (HEAD+1) then // "conflicts" on reindex → retry; attempt 2 reopens with HEAD ahead of the manifest // from our own compaction — the misclassification trigger. 
- let _failpoint = ScopedFailPoint::new("optimize.inject_reindex_conflict", "1*return"); + let _failpoint = ScopedFailPoint::new(names::OPTIMIZE_INJECT_REINDEX_CONFLICT, "1*return"); let db = Omnigraph::open(&uri).await.unwrap(); let stats = db @@ -3416,6 +3519,7 @@ async fn wait_for_sidecar(root: &std::path::Path) -> bool { } #[tokio::test] +#[serial] #[serial(branch_merge_phase_b)] async fn branch_merge_phase_b_failure_recovered_on_next_open() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -3472,7 +3576,7 @@ async fn branch_merge_phase_b_failure_recovered_on_next_open() { { let db = Omnigraph::open(&uri).await.unwrap(); let _failpoint = - ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return"); + ScopedFailPoint::new(names::BRANCH_MERGE_POST_PHASE_B_PRE_MANIFEST_COMMIT, "return"); let err = db.branch_merge("feature", "main").await.unwrap_err(); assert!( err.to_string().contains( @@ -3528,47 +3632,24 @@ async fn branch_merge_phase_b_failure_recovered_on_next_open() { ); // The recovered branch_merge must record a MERGE commit (with - // `merged_parent_commit_id` set), not a plain commit. Without - // this, future merges between the same pair lose - // already-up-to-date detection. We verify by reading - // `_graph_commits.lance` and asserting the most recent commit - // tagged with the recovery actor has a non-null - // `merged_parent_commit_id`. + // `merged_parent_commit_id` set), not a plain commit. Without this, future + // merges between the same pair lose already-up-to-date detection. RFC-013 + // Phase 7 records the recovery commit in `__manifest` (folded into the + // recovery publish CAS), so we read it through the commit-graph projection + // (`CommitGraph::load_commits`) and assert some commit carries a non-null + // `merged_parent_commit_id`. Only a recovered branch_merge can produce one + // here (we never completed a normal merge in this test). 
{ - use arrow_array::{Array, StringArray}; - use futures::TryStreamExt; - let commits_dir = dir.path().join("_graph_commits.lance"); - let ds = lance::Dataset::open(commits_dir.to_str().unwrap()) - .await - .unwrap(); - let batches: Vec = ds - .scan() - .try_into_stream() - .await - .unwrap() - .try_collect() - .await - .unwrap(); - let mut found_recovery_merge = false; - for batch in batches { - let merged = batch - .column_by_name("merged_parent_commit_id") - .expect("merged_parent_commit_id column present") - .as_any() - .downcast_ref::() - .expect("merged_parent_commit_id is Utf8"); - // The actor_id lives in _graph_commit_actors; cross-checking - // is heavier than necessary. Detecting any non-null - // merged_parent_commit_id in the post-recovery state is - // sufficient: only a recovered branch_merge can produce one - // here (we never completed a normal merge in this test). - for i in 0..merged.len() { - if !merged.is_null(i) { - found_recovery_merge = true; - break; - } - } - } + let commits = + omnigraph::db::commit_graph::CommitGraph::open(dir.path().to_str().unwrap()) + .await + .unwrap() + .load_commits() + .await + .unwrap(); + let found_recovery_merge = commits + .iter() + .any(|c| c.merged_parent_commit_id.is_some()); assert!( found_recovery_merge, "recovered branch_merge must record `merged_parent_commit_id` so future \ @@ -3585,6 +3666,7 @@ async fn branch_merge_phase_b_failure_recovered_on_next_open() { /// silently: the adopt path advanced Lance HEAD but was unpinned, so the sweep /// found no sidecar and the merge was lost. 
#[tokio::test] +#[serial] #[serial(branch_merge_phase_b)] async fn branch_merge_adopt_with_delta_phase_b_failure_recovered_on_next_open() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -3627,7 +3709,7 @@ async fn branch_merge_adopt_with_delta_phase_b_failure_recovered_on_next_open() { let db = Omnigraph::open(&uri).await.unwrap(); let _failpoint = - ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return"); + ScopedFailPoint::new(names::BRANCH_MERGE_POST_PHASE_B_PRE_MANIFEST_COMMIT, "return"); let err = db.branch_merge("feature", "main").await.unwrap_err(); assert!( err.to_string().contains( @@ -3799,6 +3881,7 @@ async fn assert_partial_merge_rolls_back(scenario: MergeScenario, failpoint: &st } #[tokio::test] +#[serial] #[serial(branch_merge_phase_b)] async fn branch_merge_adopt_partial_after_append_rolls_back() { assert_partial_merge_rolls_back( @@ -3809,6 +3892,7 @@ async fn branch_merge_adopt_partial_after_append_rolls_back() { } #[tokio::test] +#[serial] #[serial(branch_merge_phase_b)] async fn branch_merge_adopt_partial_after_upsert_rolls_back() { assert_partial_merge_rolls_back( @@ -3819,6 +3903,7 @@ async fn branch_merge_adopt_partial_after_upsert_rolls_back() { } #[tokio::test] +#[serial] #[serial(branch_merge_phase_b)] async fn branch_merge_rewrite_partial_after_merge_rolls_back() { assert_partial_merge_rolls_back( @@ -3829,6 +3914,7 @@ async fn branch_merge_rewrite_partial_after_merge_rolls_back() { } #[tokio::test] +#[serial] #[serial(branch_merge_phase_b)] async fn branch_merge_rewrite_partial_after_delete_rolls_back() { assert_partial_merge_rolls_back( @@ -3855,6 +3941,7 @@ async fn branch_merge_rewrite_partial_after_delete_rolls_back() { /// the version-aware classifier reads v1 as the old loose generation → rolls /// forward → `bob` preserved. 
#[tokio::test] +#[serial] #[serial(branch_merge_phase_b)] async fn pre_upgrade_v1_branch_merge_sidecar_rolls_forward_not_back() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -3889,7 +3976,7 @@ async fn pre_upgrade_v1_branch_merge_sidecar_rolls_forward_not_back() { // sidecar lands on disk. { let db = Omnigraph::open(&uri).await.unwrap(); - let _fp = ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return"); + let _fp = ScopedFailPoint::new(names::BRANCH_MERGE_POST_PHASE_B_PRE_MANIFEST_COMMIT, "return"); db.branch_merge("feature", "main").await.unwrap_err(); } @@ -3931,6 +4018,7 @@ async fn pre_upgrade_v1_branch_merge_sidecar_rolls_forward_not_back() { /// target, and future merges between the same pair would lose /// already-up-to-date detection. #[tokio::test] +#[serial] #[serial(branch_merge_phase_b)] async fn branch_merge_phase_b_failure_recovered_on_non_main_target() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -3995,7 +4083,7 @@ async fn branch_merge_phase_b_failure_recovered_on_non_main_target() { { let db = Omnigraph::open(&uri).await.unwrap(); let _failpoint = - ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return"); + ScopedFailPoint::new(names::BRANCH_MERGE_POST_PHASE_B_PRE_MANIFEST_COMMIT, "return"); let err = db .branch_merge("source_branch", "target_branch") .await @@ -4056,6 +4144,7 @@ async fn branch_merge_phase_b_failure_recovered_on_non_main_target() { /// keeps RewriteMerged tables on active_branch), the contract assertion /// catches a regression that reverts to `entry.table_branch.clone()`. 
#[tokio::test] +#[serial] #[serial(branch_merge_phase_b)] async fn branch_merge_sidecar_pins_table_branch_to_active_branch() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -4097,7 +4186,7 @@ async fn branch_merge_sidecar_pins_table_branch_to_active_branch() { { let db = Omnigraph::open(&uri).await.unwrap(); let _failpoint = - ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return"); + ScopedFailPoint::new(names::BRANCH_MERGE_POST_PHASE_B_PRE_MANIFEST_COMMIT, "return"); let _ = db .branch_merge("source_branch", "target_branch") .await @@ -4158,6 +4247,7 @@ async fn branch_merge_sidecar_pins_table_branch_to_active_branch() { /// `needs_index_work_*` code path and the /// `recovery_ensure_indices_handles_empty_tables` integration test. #[tokio::test] +#[serial] async fn ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_needed() { use omnigraph::loader::{LoadMode, load_jsonl}; @@ -4188,7 +4278,7 @@ async fn ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_neede { let db = Omnigraph::open(&uri).await.unwrap(); let _failpoint = - ScopedFailPoint::new("ensure_indices.post_phase_b_pre_manifest_commit", "return"); + ScopedFailPoint::new(names::ENSURE_INDICES_POST_PHASE_B_PRE_MANIFEST_COMMIT, "return"); let err = db.ensure_indices().await.unwrap_err(); assert!( err.to_string().contains( @@ -4258,11 +4348,12 @@ async fn ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_neede // limitation. 
#[tokio::test] +#[serial] async fn init_failpoint_after_schema_pg_written_cleans_up_schema_file() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); - let _failpoint = ScopedFailPoint::new("init.after_schema_pg_written", "return"); + let _failpoint = ScopedFailPoint::new(names::INIT_AFTER_SCHEMA_PG_WRITTEN, "return"); let err = match Omnigraph::init(uri, helpers::TEST_SCHEMA).await { Ok(_) => panic!("expected Omnigraph::init to fail at the configured failpoint"), @@ -4284,11 +4375,12 @@ async fn init_failpoint_after_schema_pg_written_cleans_up_schema_file() { } #[tokio::test] +#[serial] async fn init_failpoint_after_schema_contract_written_cleans_up_all_schema_files() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); - let _failpoint = ScopedFailPoint::new("init.after_schema_contract_written", "return"); + let _failpoint = ScopedFailPoint::new(names::INIT_AFTER_SCHEMA_CONTRACT_WRITTEN, "return"); let err = match Omnigraph::init(uri, helpers::TEST_SCHEMA).await { Ok(_) => panic!("expected Omnigraph::init to fail at the configured failpoint"), @@ -4315,11 +4407,12 @@ async fn init_failpoint_after_schema_contract_written_cleans_up_all_schema_files } #[tokio::test] +#[serial] async fn init_failpoint_after_coordinator_init_cleans_up_schema_files() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); - let _failpoint = ScopedFailPoint::new("init.after_coordinator_init", "return"); + let _failpoint = ScopedFailPoint::new(names::INIT_AFTER_COORDINATOR_INIT, "return"); let err = match Omnigraph::init(uri, helpers::TEST_SCHEMA).await { Ok(_) => panic!("expected Omnigraph::init to fail at the configured failpoint"), @@ -4355,6 +4448,7 @@ async fn init_failpoint_after_coordinator_init_cleans_up_schema_files() { } #[tokio::test] +#[serial] async fn 
init_failpoint_returns_original_error_not_cleanup_error() { // The cleanup is best-effort. If `storage.delete` fails (e.g. transient // network blip on S3), the original init failpoint error must still @@ -4366,7 +4460,7 @@ async fn init_failpoint_returns_original_error_not_cleanup_error() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); - let _failpoint = ScopedFailPoint::new("init.after_schema_pg_written", "return"); + let _failpoint = ScopedFailPoint::new(names::INIT_AFTER_SCHEMA_PG_WRITTEN, "return"); let err = match Omnigraph::init(uri, helpers::TEST_SCHEMA).await { Ok(_) => panic!("expected Omnigraph::init to fail at the configured failpoint"), @@ -4379,3 +4473,153 @@ async fn init_failpoint_returns_original_error_not_cleanup_error() { "init error must surface the failpoint cause, got: {msg}" ); } + +// ── RFC-013 Phase 7 / FIX A: a transient legacy-open failure must abort the ── +// v3→v4 migration loudly, not silently swallow the lineage and stamp v4. +// +// `migrate_v3_to_v4` backfills graph lineage from `_graph_commits.lance` into +// `__manifest`, then stamps internal-schema v4. The migration runs exactly once +// per graph (`migrate_internal_schema` is `while stamp < CURRENT`). If a +// transient or corrupt `Dataset::open` of the legacy commit dataset is treated +// as "no legacy data" (the pre-fix `Err(_) => empty` arm), the migration backfills +// NOTHING and stamps v4 — orphaning the real lineage permanently, since the v3 +// fallback is then disabled. The fix matches the not-found variants (benign: +// genuinely no legacy data) and propagates anything else. +// +// This test injects a non-not-found Lance error at the legacy open via the +// `migration.v3_to_v4.legacy_open` failpoint. 
The load-bearing assertion is the
+// last one: a once-transient failure leaves the graph RETRYABLE (stamp still v3,
+// no lineage), so a later open with the fault cleared completes the migration —
+// it was not a poison pill.
+//
+// `#[serial]`: this test arms a failpoint, so it is serialized like the other
+// failpoint tests in this file.
+#[tokio::test]
+#[serial]
+async fn transient_legacy_open_failure_aborts_migration_without_stamping_v4() {
+    let _scenario = FailScenario::setup();
+    let dir = tempfile::tempdir().unwrap();
+    let uri = dir.path().to_str().unwrap().to_string();
+
+    // A real pre-Phase-7 (v3) graph: lineage only in `_graph_commits.lance`,
+    // `__manifest` stamped v3 with no `graph_commit` rows.
+    let fixture = omnigraph::db::commit_graph::seed_legacy_v3_lineage(&uri)
+        .await
+        .unwrap();
+    let (rows_before, stamp_before) =
+        omnigraph::db::manifest::lineage_row_count_and_stamp_for_test(&uri, None)
+            .await
+            .unwrap();
+    assert_eq!(stamp_before, 3, "fixture is stamped v3");
+    assert_eq!(rows_before, 0, "fixture has no lineage in __manifest");
+
+    // Arm the legacy-open fault and run the read-write migration entry point.
+    {
+        let _fp = ScopedFailPoint::new(names::MIGRATION_V3_TO_V4_LEGACY_OPEN, "return");
+        let err = match omnigraph::db::manifest::migrate_on_open_for_test(&uri).await {
+            Ok(()) => panic!("migration must abort when the legacy open fails transiently"),
+            Err(e) => e,
+        };
+        // The injected (non-not-found) Lance error must surface, not be masked.
+        let msg = err.to_string();
+        assert!(
+            msg.contains("injected failpoint triggered: migration.v3_to_v4.legacy_open"),
+            "expected the injected legacy-open error to propagate, got: {msg}"
+        );
+    }
+
+    // The migration left NO drift: stamp still v3, still no lineage. (Pre-fix,
+    // the swallow would have stamped v4 with an empty backfill — permanent loss.)
+    let (rows_after_fault, stamp_after_fault) =
+        omnigraph::db::manifest::lineage_row_count_and_stamp_for_test(&uri, None)
+            .await
+            .unwrap();
+    assert_eq!(
+        stamp_after_fault, 3,
+        "a transient legacy-open failure must NOT stamp the manifest to v4",
+    );
+    assert_eq!(
+        rows_after_fault, 0,
+        "a transient legacy-open failure must NOT partially backfill lineage",
+    );
+
+    // The whole correctness claim: a once-transient failure is retryable. With the
+    // fault cleared, the next migration pass reads the legacy lineage and completes.
+    omnigraph::db::manifest::migrate_on_open_for_test(&uri)
+        .await
+        .unwrap();
+    let (rows_done, stamp_done) =
+        omnigraph::db::manifest::lineage_row_count_and_stamp_for_test(&uri, None)
+            .await
+            .unwrap();
+    assert_eq!(stamp_done, 4, "the retried migration stamps v4");
+    assert_eq!(
+        rows_done,
+        fixture.all_ids.len(),
+        "the retried migration backfills every legacy commit",
+    );
+}
+
+// ── RFC-013 Phase 7 / FIX B follow-up: the v3→v4 stamp-bump retry loop must ──
+// surface a RETRYABLE contention error on exhaustion, not a stringified Lance error.
+//
+// `commit_v4_stamp_idempotently` bumps the internal-schema stamp under concurrent
+// runners: the `UpdateConfig` CAS loser gets `IncompatibleTransaction`, re-opens,
+// confirms the winner stamped the same value, and is done. Genuine exhaustion (every
+// attempt loses) must return a `RowLevelCasContention` so the publisher's OUTER retry
+// completes the one-time open — an `OmniError::Lance` would be treated as fatal. The
+// `migration.v4_stamp.force_incompatible` failpoint forces every stamp attempt to lose,
+// driving the otherwise-near-unreachable exhaustion path deterministically. (Pre-fix —
+// `0..=BUDGET` + an `attempt < BUDGET` guard — the last iteration fell through to the
+// stringifying `Err(e)` arm and returned a non-retryable `OmniError::Lance`.)
+//
+// `#[serial]`: this test arms a failpoint, so it is serialized like the other
+// failpoint tests in this file.
+#[tokio::test]
+#[serial]
+async fn v4_stamp_exhaustion_returns_retryable_contention() {
+    let _scenario = FailScenario::setup();
+    let dir = tempfile::tempdir().unwrap();
+    let uri = dir.path().to_str().unwrap().to_string();
+
+    // A real v3 graph: the backfill merge succeeds; only the terminal stamp loop
+    // is forced to exhaust.
+    let _fixture = omnigraph::db::commit_graph::seed_legacy_v3_lineage(&uri)
+        .await
+        .unwrap();
+
+    let _fp = ScopedFailPoint::new(names::MIGRATION_V4_STAMP_FORCE_INCOMPATIBLE, "return");
+    let err = match omnigraph::db::manifest::migrate_on_open_for_test(&uri).await {
+        Ok(()) => panic!("migration must error when the stamp bump exhausts its retries"),
+        Err(e) => e,
+    };
+    assert!(
+        matches!(
+            &err,
+            omnigraph::error::OmniError::Manifest(m)
+                if matches!(
+                    m.details,
+                    Some(omnigraph::error::ManifestConflictDetails::RowLevelCasContention)
+                )
+        ),
+        "stamp-bump exhaustion must surface a RETRYABLE RowLevelCasContention so the \
+         publisher's outer retry completes the open, got: {err:?}",
+    );
+}
+
+// The publisher's outer retry must re-run `load_publish_state` on a RETRYABLE error,
+// not propagate it fatally. `load_publish_state` runs `migrate_internal_schema`, whose
+// bounded merge/stamp loops surface a `RowLevelCasContention` on exhaustion EXPECTING
+// this re-run (a clean second scan, by which point a concurrent winner has finished the
+// migration). Before the fix, `load_publish_state().await?` short-circuited the loop —
+// only `merge_rows` conflicts hit the retry — so the typed contention aborted the
+// publish. Inject a ONE-SHOT retryable contention into `load_publish_state`: the write
+// must still commit, because the publisher retries and the cleared second attempt wins.
+#[tokio::test]
+#[serial]
+async fn publisher_retries_retryable_load_publish_state_error() {
+    let _scenario = FailScenario::setup();
+    let dir = tempfile::tempdir().unwrap();
+    let db = helpers::init_and_load(&dir).await;
+
+    // `1*return`: fail only the FIRST `load_publish_state` of the next publish, so the
+    // retry's second call is clean. Set after `init_and_load` so its publishes are
+    // unaffected.
+    let _fp = ScopedFailPoint::new(names::PUBLISH_LOAD_STATE_RETRYABLE_CONTENTION, "1*return");
+    // One `Person` record, loaded into `main` in merge mode; the only assertion is
+    // that the load returns `Ok` despite the injected first-attempt failure.
+    let row = r#"{"type":"Person","data":{"name":"Grace","age":37}}"#;
+    db.load_as("main", None, row, LoadMode::Merge, None)
+        .await
+        .expect("publisher must retry the one-shot retryable load_publish_state error and commit");
+}
diff --git a/crates/omnigraph/tests/helpers/cost.rs b/crates/omnigraph/tests/helpers/cost.rs
index 2114f23..9c82229 100644
--- a/crates/omnigraph/tests/helpers/cost.rs
+++ b/crates/omnigraph/tests/helpers/cost.rs
@@ -58,6 +58,14 @@ pub struct IoCounts {
     pub commit_graph_reads: u64,
     /// Version-probe invocations (the cheap freshness check).
     pub version_probes: u64,
+    /// DATA-table open CALL count through the two instrumented chokepoints — an
+    /// exact open-invocation count (not the opener-read term), classified by URI so
+    /// internal/system-table opens are excluded. Step-3b target:
+    /// `data_open_count <= |touched_tables|` for a write.
+    pub data_open_count: u64,
+    /// Internal/system-table (`__manifest`, `_graph_commits*`) open CALL count —
+    /// the complement of `data_open_count` (publisher CAS + commit-graph append).
+    /// NOTE(review): the lineage tests in this change assert that nothing appends to
+    /// `_graph_commits.lance` any more, so "commit-graph append" may be stale — confirm.
+ pub internal_open_count: u64, } impl IoCounts { @@ -225,6 +233,8 @@ struct ProbeHandles { commit_graph: IOTracker, table: PrefixCounter, probe_count: Arc, + data_open_count: Arc, + internal_open_count: Arc, } impl ProbeHandles { @@ -234,6 +244,8 @@ impl ProbeHandles { commit_graph: IOTracker::default(), table: PrefixCounter::default(), probe_count: Arc::new(AtomicU64::new(0)), + data_open_count: Arc::new(AtomicU64::new(0)), + internal_open_count: Arc::new(AtomicU64::new(0)), }; let probes = QueryIoProbes { manifest_wrapper: Some(Arc::new(h.manifest.clone()) as Arc), @@ -242,6 +254,8 @@ impl ProbeHandles { ), table_wrapper: Some(Arc::new(h.table.clone()) as Arc), probe_count: Arc::clone(&h.probe_count), + data_open_count: Arc::clone(&h.data_open_count), + internal_open_count: Arc::clone(&h.internal_open_count), }; (probes, h) } @@ -256,6 +270,8 @@ impl ProbeHandles { manifest_reads: self.manifest.stats().read_iops, commit_graph_reads: self.commit_graph.stats().read_iops, version_probes: self.probe_count.load(Ordering::Relaxed), + data_open_count: self.data_open_count.load(Ordering::Relaxed), + internal_open_count: self.internal_open_count.load(Ordering::Relaxed), } } } diff --git a/crates/omnigraph/tests/helpers/failpoint.rs b/crates/omnigraph/tests/helpers/failpoint.rs new file mode 100644 index 0000000..0c93670 --- /dev/null +++ b/crates/omnigraph/tests/helpers/failpoint.rs @@ -0,0 +1,84 @@ +//! Deterministic rendezvous for concurrent failpoint tests. +//! +//! The pattern: park the FIRST thread that hits a failpoint until the test +//! explicitly releases it, while later arrivals fall through. This replaces +//! fixed "guess" `sleep`s for cross-thread coordination — the test waits on +//! the *condition* (the point was reached) with a bounded timeout that fails +//! loudly, instead of betting a fixed duration is long enough. +//! +//! Extracted from the open-coded `AtomicBool` + callback pattern that +//! 
`fork_collision_with_live_concurrent_fork_is_retryable` proved out.
+//!
+//! The `reached` flag also doubles as a fired-assertion: a point that is
+//! never hit makes [`Rendezvous::wait_until_reached`] panic, so a typo'd or
+//! misplaced failpoint cannot pass silently.
+
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering::SeqCst};
+use std::time::Duration;
+
+use omnigraph::failpoints::ScopedFailPoint;
+
+/// A parked-on-first-arrival rendezvous bound to a failpoint name. The
+/// underlying callback is RAII-cleaned when this guard drops.
+pub struct Rendezvous {
+    /// Failpoint name, kept for the `wait_until_reached` panic message.
+    name: String,
+    /// Set exactly once, by the first thread that hits the failpoint.
+    reached: Arc<AtomicBool>,
+    /// Set by `release` to let the parked thread resume.
+    release: Arc<AtomicBool>,
+    /// Holds the registered callback for as long as this guard lives.
+    _failpoint: ScopedFailPoint,
+}
+
+impl Rendezvous {
+    /// Register `name` so the FIRST thread to hit it records readiness and
+    /// blocks until [`release`](Self::release); later arrivals fall through
+    /// immediately. The park is bounded (~30s) so a test bug cannot hang the
+    /// suite forever.
+    ///
+    /// The park blocks the arriving OS thread with `std::thread::sleep`. If the
+    /// bound elapses without a release, that thread resumes silently (no panic).
+    /// NOTE(review): presumably the parked operation must not run on the same
+    /// thread that polls `wait_until_reached` (e.g. a current-thread runtime),
+    /// or the wait could not make progress — confirm against callers.
+    pub fn park_first(name: &str) -> Self {
+        let reached = Arc::new(AtomicBool::new(false));
+        let release = Arc::new(AtomicBool::new(false));
+        let (cb_reached, cb_release) = (Arc::clone(&reached), Arc::clone(&release));
+        let _failpoint = ScopedFailPoint::with_callback(name, move || {
+            // Only the thread that wins the false→true swap parks; everyone
+            // else sees `reached == true` and returns immediately.
+            if cb_reached
+                .compare_exchange(false, true, SeqCst, SeqCst)
+                .is_ok()
+            {
+                // ~30s bound (6000 * 5ms); released earlier on the common path.
+                for _ in 0..6000 {
+                    if cb_release.load(SeqCst) {
+                        return;
+                    }
+                    std::thread::sleep(Duration::from_millis(5));
+                }
+            }
+        });
+        Self {
+            name: name.to_string(),
+            reached,
+            release,
+            _failpoint,
+        }
+    }
+
+    /// Async-wait until the parked thread has reached the failpoint, polling
+    /// the readiness condition with a bounded (~12s) timeout. Panics if the
+    /// point is never hit — the fired-assertion.
+    pub async fn wait_until_reached(&self) {
+        // 2400 * 5ms ≈ 12s.
+        for _ in 0..2400 {
+            if self.reached.load(SeqCst) {
+                return;
+            }
+            tokio::time::sleep(Duration::from_millis(5)).await;
+        }
+        panic!("rendezvous: failpoint '{}' was never reached", self.name);
+    }
+
+    /// Whether the parked thread has reached the failpoint yet.
+    pub fn reached(&self) -> bool {
+        self.reached.load(SeqCst)
+    }
+
+    /// Release the parked thread so it resumes past the failpoint.
+    pub fn release(&self) {
+        self.release.store(true, SeqCst);
+    }
+}
diff --git a/crates/omnigraph/tests/helpers/mod.rs b/crates/omnigraph/tests/helpers/mod.rs
index d89227f..13127f2 100644
--- a/crates/omnigraph/tests/helpers/mod.rs
+++ b/crates/omnigraph/tests/helpers/mod.rs
@@ -1,6 +1,8 @@
 #![allow(dead_code)]
 
 pub mod cost;
+#[cfg(feature = "failpoints")]
+pub mod failpoint;
 pub mod recovery;
 
 use arrow_array::{Array, RecordBatch, StringArray};
diff --git a/crates/omnigraph/tests/lance_surface_guards.rs b/crates/omnigraph/tests/lance_surface_guards.rs
index 6e7b891..d34080b 100644
--- a/crates/omnigraph/tests/lance_surface_guards.rs
+++ b/crates/omnigraph/tests/lance_surface_guards.rs
@@ -86,6 +86,83 @@ async fn lance_error_too_much_write_contention_variant_exists() {
     );
 }
 
+// --- Guard 1a: LanceError::IncompatibleTransaction variant exists ----------
+//
+// `db/manifest/migrations.rs::commit_v4_stamp_idempotently` pattern-matches on
+// this variant: two concurrent v3→v4 runners both bump the internal-schema stamp
+// (an `UpdateConfig` commit on the same metadata key), and the loser gets
+// `IncompatibleTransaction`. Since both write the same value the conflict is
+// benign and is retried idempotently. If Lance renames the variant or removes the
+// builder, the match silently stops catching the conflict — this guard fails to
+// force an update.
+
+#[tokio::test]
+async fn lance_error_incompatible_transaction_variant_exists() {
+    // Build the error through Lance's own constructor so a renamed or removed
+    // builder/variant breaks this guard.
+    let err =
+        lance::Error::incompatible_transaction_source("concurrent UpdateConfig at version N".into());
+    assert!(
+        matches!(err, lance::Error::IncompatibleTransaction { .. }),
+        "Lance::Error::IncompatibleTransaction variant missing or renamed; \
+         update db/manifest/migrations.rs::commit_v4_stamp_idempotently and \
+         this guard, then re-pin docs/dev/lance.md."
+    );
+}
+
+// --- Guard 1c: LanceError::DatasetAlreadyExists variant exists --------------
+//
+// `db/commit_graph.rs` and `db/recovery_audit.rs` create internal Lance tables
+// with a create-or-open idempotency fallback: a concurrent/prior create races,
+// and the `DatasetAlreadyExists` arm falls back to `Dataset::open`. They match
+// the typed variant, NOT the display string ("Dataset already exists: ..."),
+// which is not a Lance API contract. If Lance renames the variant the match
+// silently stops catching the race and a re-create errors instead of opening —
+// this guard turns red to force an update.
+
+#[tokio::test]
+async fn lance_error_dataset_already_exists_variant_exists() {
+    // The argument is only a placeholder; the guard checks the variant.
+    let err = lance::Error::dataset_already_exists("guard");
+    assert!(
+        matches!(err, lance::Error::DatasetAlreadyExists { .. }),
+        "Lance::Error::DatasetAlreadyExists variant missing or renamed; update the \
+         db/commit_graph.rs + db/recovery_audit.rs create-or-open fallbacks and \
+         this guard, then re-pin docs/dev/lance.md."
+    );
+}
+
+// --- Guard 1b: Dataset::open on a missing path returns a not-found variant --
+//
+// `db/commit_graph.rs::read_legacy_commit_cache` (the v3→v4 lineage migration
+// source) classifies a legacy-open error: a genuine not-found is the benign
+// "no legacy data" signal (empty cache), and ANY OTHER error propagates loudly
+// rather than being read as "empty" — a swallow there would let the migration
+// stamp v4 over an empty backfill, orphaning real lineage permanently. That
+// classification relies on Lance mapping an object-store NotFound to
+// `DatasetNotFound` (or, for some paths, `NotFound`). If a Lance bump emits a
+// different variant for a missing dataset, the migration would propagate a
+// genuine "no legacy data" as a hard error — this guard turns red to force the
+// classifier (and this guard) to be updated together.
+
+#[tokio::test]
+async fn dataset_open_missing_returns_not_found_variant() {
+    let dir = tempfile::tempdir().unwrap();
+    // A path that was never written — nothing to open.
+    let missing = dir.path().join("does-not-exist.lance");
+    let err = match Dataset::open(missing.to_str().unwrap()).await {
+        Ok(_) => panic!("opening a never-written dataset path must error"),
+        Err(e) => e,
+    };
+    // Either not-found variant is accepted.
+    assert!(
+        matches!(
+            err,
+            lance::Error::DatasetNotFound { .. } | lance::Error::NotFound { .. }
+        ),
+        "Dataset::open on a missing path no longer returns DatasetNotFound/NotFound \
+         (got: {err:?}); update db/commit_graph.rs::read_legacy_commit_cache's \
+         legacy-open classification and this guard together, then re-pin \
+         docs/dev/lance.md."
+    );
+}
+
 // --- Guard 2: ManifestLocation field shape ---------------------------------
 //
 // `db/manifest/metadata.rs:84-88` reads `.path`, `.size`, `.e_tag`,
diff --git a/crates/omnigraph/tests/lineage_projection.rs b/crates/omnigraph/tests/lineage_projection.rs
new file mode 100644
index 0000000..e2a6762
--- /dev/null
+++ b/crates/omnigraph/tests/lineage_projection.rs
@@ -0,0 +1,235 @@
+//! RFC-013 Phase 7 acceptance gate: graph lineage lives ONLY in `__manifest`.
+//!
+//! The `graph_commit` + `graph_head` rows ride the same publish CAS as the
+//! table-version rows, so `_graph_commits.lance` carries NO commit rows. This
+//! gate proves two things over a realistic history (commits on main, a branch,
+//! a merge, all with actors):
+//!
+//! 1. The production commit-graph projection (`CommitGraph::open(...)`, which now
+//!
reads `__manifest`) reconstructs the full lineage correctly — commit set,
+//!    parents, the merge commit's two parents + merge actor, per-branch heads,
+//!    and the inline actors.
+//! 2. `_graph_commits.lance` (and its actor sidecar) hold ZERO commit rows: the
+//!    dual-write is gone and nothing appends to them. This is the load-bearing
+//!    "single source" assertion.
+
+mod helpers;
+
+use futures::TryStreamExt;
+use lance::Dataset;
+
+use omnigraph::db::commit_graph::CommitGraph;
+use omnigraph::db::{GraphCommit, Omnigraph};
+
+use helpers::*;
+
+/// Count rows in a Lance dataset directory under the graph root, or `0` if it
+/// does not exist.
+///
+/// Only a not-found open error maps to `0`. Any other open failure panics, so a
+/// caller asserting "zero rows" cannot pass vacuously on a dataset that merely
+/// failed to open.
+async fn row_count(root: &str, dir: &str) -> usize {
+    let uri = format!("{}/{}", root.trim_end_matches('/'), dir);
+    let dataset = match Dataset::open(&uri).await {
+        Ok(dataset) => dataset,
+        // A missing dataset surfaces as `DatasetNotFound` (or `NotFound`).
+        Err(lance::Error::DatasetNotFound { .. } | lance::Error::NotFound { .. }) => return 0,
+        Err(e) => panic!("opening {uri} failed with a non-not-found error: {e:?}"),
+    };
+    // Element type is inferred from the scan stream (record batches).
+    let batches: Vec<_> = dataset
+        .scan()
+        .try_into_stream()
+        .await
+        .unwrap()
+        .try_collect()
+        .await
+        .unwrap();
+    batches.iter().map(|b| b.num_rows()).sum()
+}
+
+/// The production commit-graph projection at `branch`, sourced from `__manifest`.
+async fn projected_commits(root: &str, branch: Option<&str>) -> Vec<GraphCommit> {
+    let graph = match branch {
+        Some(branch) => CommitGraph::open_at_branch(root, branch).await.unwrap(),
+        None => CommitGraph::open(root).await.unwrap(),
+    };
+    let mut commits = graph.load_commits().await.unwrap();
+    // Deterministic order: manifest version, then creation time, then commit id.
+    commits.sort_by(|a, b| {
+        a.manifest_version
+            .cmp(&b.manifest_version)
+            .then_with(|| a.created_at.cmp(&b.created_at))
+            .then_with(|| a.graph_commit_id.cmp(&b.graph_commit_id))
+    });
+    commits
+}
+
+/// The head commit id at `branch` (`None` opens via `CommitGraph::open`). Panics
+/// if the open or head lookup fails, or if there is no head commit.
+async fn head_id(root: &str, branch: Option<&str>) -> String {
+    let graph = match branch {
+        Some(branch) => CommitGraph::open_at_branch(root, branch).await.unwrap(),
+        None => CommitGraph::open(root).await.unwrap(),
+    };
+    graph
+        .head_commit()
+        .await
+        .unwrap()
+        .unwrap()
+        .graph_commit_id
+}
+
+#[tokio::test]
+async fn graph_lineage_lives_only_in_manifest() {
+    let dir = tempfile::tempdir().unwrap();
+    let uri = dir.path().to_str().unwrap().to_string();
+
+    // Build a realistic history: several authored commits on main, a branch with
+    // its own authored commits, then an authored merge back into main.
+    let main = init_and_load(&dir).await;
+
+    main.mutate_as(
+        "main",
+        MUTATION_QUERIES,
+        "insert_person",
+        &mixed_params(&[("$name", "Alice")], &[("$age", 30)]),
+        Some("act-alice"),
+    )
+    .await
+    .unwrap();
+    main.mutate_as(
+        "main",
+        MUTATION_QUERIES,
+        "insert_person",
+        &mixed_params(&[("$name", "Bob")], &[("$age", 41)]),
+        Some("act-bob"),
+    )
+    .await
+    .unwrap();
+
+    main.branch_create("feature").await.unwrap();
+
+    let feature = Omnigraph::open(&uri).await.unwrap();
+    feature
+        .mutate_as(
+            "feature",
+            MUTATION_QUERIES,
+            "insert_person",
+            &mixed_params(&[("$name", "Carol")], &[("$age", 27)]),
+            Some("act-carol"),
+        )
+        .await
+        .unwrap();
+    feature
+        .mutate_as(
+            "feature",
+            MUTATION_QUERIES,
+            "insert_person",
+            &mixed_params(&[("$name", "Dave")], &[("$age", 33)]),
+            Some("act-dave"),
+        )
+        .await
+        .unwrap();
+
+    // Advance main once more so the merge is a real (non-fast-forward) merge with
+    // two distinct parents.
+    main.mutate_as(
+        "main",
+        MUTATION_QUERIES,
+        "insert_person",
+        &mixed_params(&[("$name", "Erin")], &[("$age", 38)]),
+        Some("act-erin"),
+    )
+    .await
+    .unwrap();
+
+    let outcome = main
+        .branch_merge_as("feature", "main", Some("act-merger"))
+        .await
+        .unwrap();
+    // A genuine three-way merge (both sides advanced past the base).
+    assert_eq!(
+        outcome,
+        omnigraph::db::MergeOutcome::Merged,
+        "expected a real merge, not fast-forward/up-to-date"
+    );
+
+    // ── single source: nothing writes `_graph_commits.lance` ─────────────────
+    // RFC-013 Phase 7 folds lineage into `__manifest`; the commit-graph dataset
+    // exists only to carry branch refs, so it (and its actor sidecar) hold ZERO
+    // commit rows. If a stray `append_commit` reappears, this turns red.
+    assert_eq!(
+        row_count(&uri, "_graph_commits.lance").await,
+        0,
+        "_graph_commits.lance must carry no commit rows — lineage lives in __manifest"
+    );
+    assert_eq!(
+        row_count(&uri, "_graph_commit_actors.lance").await,
+        0,
+        "_graph_commit_actors.lance must carry no rows — actors live inline in __manifest"
+    );
+
+    // ── main lineage projected from `__manifest` ─────────────────────────────
+    let main_commits = projected_commits(&uri, None).await;
+    // genesis + Alice + Bob + Erin + the merge = 5 on main.
+    assert!(
+        main_commits.len() >= 5,
+        "expected a non-trivial main history, got {} commits",
+        main_commits.len()
+    );
+
+    // Genesis is the unique parentless commit and carries no actor.
+    let genesis: Vec<&GraphCommit> = main_commits
+        .iter()
+        .filter(|c| c.parent_commit_id.is_none())
+        .collect();
+    assert_eq!(genesis.len(), 1, "exactly one genesis (parentless) commit");
+    assert!(
+        genesis[0].actor_id.is_none(),
+        "genesis commit carries no actor"
+    );
+
+    // Every non-genesis commit's parent resolves to a known commit (a connected
+    // lineage — the publisher resolved each parent under the CAS).
+    for commit in &main_commits {
+        if let Some(parent) = &commit.parent_commit_id {
+            assert!(
+                main_commits.iter().any(|c| &c.graph_commit_id == parent),
+                "parent {parent} of {} must be a known commit",
+                commit.graph_commit_id
+            );
+        }
+    }
+
+    // The merge commit carries both parents and the merge actor.
+    let merge_commit = main_commits
+        .iter()
+        .find(|c| c.merged_parent_commit_id.is_some())
+        .expect("a merge commit with a merged parent must exist");
+    assert_eq!(merge_commit.actor_id.as_deref(), Some("act-merger"));
+    assert!(merge_commit.parent_commit_id.is_some());
+    // The merge is the head of main.
+    assert_eq!(
+        head_id(&uri, None).await,
+        merge_commit.graph_commit_id,
+        "the merge commit is the head of main"
+    );
+
+    // ── feature lineage projected from `__manifest` ──────────────────────────
+    let feature_commits = projected_commits(&uri, Some("feature")).await;
+    // The feature head is Dave's commit (the last authored on the branch).
+    let feature_head = head_id(&uri, Some("feature")).await;
+    let feature_head_commit = feature_commits
+        .iter()
+        .find(|c| c.graph_commit_id == feature_head)
+        .expect("feature head must be in the feature projection");
+    assert_eq!(
+        feature_head_commit.actor_id.as_deref(),
+        Some("act-dave"),
+        "feature head is Dave's authored commit"
+    );
+
+    // ── actors surface inline from the manifest metadata ─────────────────────
+    // main's authored commits: Alice, Bob, Erin (direct) + the merge (act-merger)
+    // = 4. Carol/Dave were authored on the feature branch, not main. Genesis has
+    // no actor.
+    let authored = main_commits
+        .iter()
+        .filter(|c| c.actor_id.is_some())
+        .count();
+    assert!(
+        authored >= 4,
+        "expected the authored commits to surface their actor in the projection, saw {authored}"
+    );
+}
diff --git a/crates/omnigraph/tests/maintenance.rs b/crates/omnigraph/tests/maintenance.rs
index ca9026d..8e7bfc9 100644
--- a/crates/omnigraph/tests/maintenance.rs
+++ b/crates/omnigraph/tests/maintenance.rs
@@ -97,7 +97,9 @@ async fn optimize_on_empty_graph_returns_stats_per_table_with_no_changes() {
     // Schema declares 2 nodes + 2 edges = 4 data tables, plus the 3 internal
     // system tables (`__manifest`, `_graph_commits`, `_graph_commit_actors`) optimize
     // also compacts (RFC-013 step 2) = 7. Compaction should run on each but find
-    // nothing to merge.
+    // nothing to merge. The genesis graph commit rides the SINGLE init
+    // `__manifest` write (RFC-013 Phase 7), so a fresh graph has one fragment per
+    // table — nothing to compact anywhere.
assert_eq!(stats.len(), 7); for s in &stats { assert_eq!(s.fragments_removed, 0, "{} should not remove", s.table_key); @@ -143,17 +145,20 @@ async fn optimize_after_load_then_again_is_idempotent() { } } -/// RFC-013 step 2: `optimize` compacts the internal system tables -/// (`__manifest`, `_graph_commits`), which accumulate one fragment per commit. -/// After compaction they shed fragments, write no recovery sidecar (a single -/// atomic Lance commit — no HEAD-before-publish gap), and the graph stays -/// coherent for subsequent reads + strict writes. +/// RFC-013 step 2 + Phase 7: `optimize` compacts `__manifest`, which now +/// accumulates one fragment per commit for BOTH the table-version rows and the +/// folded-in graph-lineage rows (`graph_commit` + `graph_head`). The +/// commit-graph datasets (`_graph_commits`, `_graph_commit_actors`) no longer +/// take a per-commit row (lineage lives in `__manifest`), so they stay flat — +/// nothing to compact. After compaction `__manifest` sheds fragments, writes no +/// recovery sidecar (a single atomic Lance commit — no HEAD-before-publish gap), +/// and the graph stays coherent for subsequent reads + strict writes. #[tokio::test] async fn optimize_compacts_internal_tables() { let dir = tempfile::tempdir().unwrap(); let mut db = init_and_load(&dir).await; - // Build version-history depth so the internal tables accumulate fragments. + // Build version-history depth so `__manifest` accumulates fragments. for i in 0..20 { mutate_main( &mut db, @@ -167,16 +172,32 @@ async fn optimize_compacts_internal_tables() { let stats = db.optimize().await.unwrap(); - for key in ["__manifest", "_graph_commits"] { + // `__manifest` carries every per-commit fragment (table versions + lineage) + // and compacts. 
+ let manifest_stats = stats + .iter() + .find(|s| s.table_key == "__manifest") + .expect("optimize stats missing internal table __manifest"); + assert!( + manifest_stats.committed, + "__manifest should compact after 20 commits" + ); + assert!( + manifest_stats.fragments_removed > 0, + "__manifest should shed fragments, removed {}", + manifest_stats.fragments_removed + ); + + // The commit-graph datasets take no per-commit row anymore (RFC-013 Phase 7 + // folds lineage into `__manifest`), so they stay at one fragment — no-ops. + for key in ["_graph_commits", "_graph_commit_actors"] { let s = stats .iter() .find(|s| s.table_key == key) .unwrap_or_else(|| panic!("optimize stats missing internal table {key}")); - assert!(s.committed, "{key} should compact after 20 commits"); assert!( - s.fragments_removed > 0, - "{key} should shed fragments, removed {}", - s.fragments_removed + !s.committed, + "{key} carries no per-commit rows after Phase 7 — nothing to compact" ); } diff --git a/crates/omnigraph/tests/recovery.rs b/crates/omnigraph/tests/recovery.rs index ed47811..9658c58 100644 --- a/crates/omnigraph/tests/recovery.rs +++ b/crates/omnigraph/tests/recovery.rs @@ -685,38 +685,21 @@ async fn list_recovery_audit_kinds(graph_root: &Path) -> Vec { out } -/// Helper: count `_graph_commits.lance` rows tagged with the recovery actor. +/// Helper: count graph commits authored by the recovery actor. RFC-013 Phase 7 +/// records the recovery commit in `__manifest` (folded into the recovery publish +/// CAS), not `_graph_commits.lance`, so this counts through the production +/// commit-graph projection (`load_commits`), filtering on the inline actor. 
async fn count_recovery_actor_commits(graph_root: &Path) -> usize { - let actors_dir = graph_root.join("_graph_commit_actors.lance"); - if !actors_dir.exists() { - return 0; - } - let ds = Dataset::open(actors_dir.to_str().unwrap()).await.unwrap(); - use arrow_array::{Array, StringArray}; - use futures::TryStreamExt; - let batches: Vec = ds - .scan() - .try_into_stream() + let commits = omnigraph::db::commit_graph::CommitGraph::open(graph_root.to_str().unwrap()) .await .unwrap() - .try_collect() + .load_commits() .await .unwrap(); - let mut count = 0; - for batch in &batches { - let actors = batch - .column_by_name("actor_id") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - for i in 0..actors.len() { - if actors.value(i) == "omnigraph:recovery" { - count += 1; - } - } - } - count + commits + .iter() + .filter(|c| c.actor_id.as_deref() == Some("omnigraph:recovery")) + .count() } #[tokio::test] diff --git a/crates/omnigraph/tests/validators.rs b/crates/omnigraph/tests/validators.rs index 4c7a2f3..ce8525d 100644 --- a/crates/omnigraph/tests/validators.rs +++ b/crates/omnigraph/tests/validators.rs @@ -237,6 +237,58 @@ async fn cardinality_rejected_on_mutation_insert_edge() { ); } +/// RFC-013 step 3b regression guard (cursor High / codex P1 on #298): edge `@card` +/// validation must scan LIVE committed HEAD, not the pinned `txn.base`. Collapse #1 +/// skips the edge accumulation open, so a non-strict edge insert under a `WriteTxn` +/// reopens for the cardinality scan — and that scan must observe edges a concurrent +/// writer committed after this mutation captured its base, or a `@card` max is +/// silently exceeded (invariant 9). The residual validate→commit TOCTOU is the §7.1 +/// gap (step 4); this only un-widens what 3b widened (live HEAD vs mutation-start base). +/// +/// Deterministic — no failpoint: handle B's coordinator is stale by construction +/// (the write path does not probe the manifest version, unlike the read path). 
B MUST +/// NOT read between A's commit and B's insert — a read refreshes B's coordinator and +/// masks the bug (the same caveat as the served stale-view repro in `writes.rs`). +#[tokio::test] +async fn cardinality_rejected_for_stale_handle_after_concurrent_edge_commit() { + let (dir, mut db_a) = init_with(CARDINALITY_SCHEMA, CARDINALITY_SEED).await; + let uri = dir.path().to_str().unwrap(); + + // Handle B opens the same graph at the seed version (no edges yet); it then + // never reads again, so its in-memory coordinator stays pinned at the seed. + let mut db_b = Omnigraph::open(uri).await.unwrap(); + + // Handle A commits WorksAt(Alice -> Acme): Alice is now at the @card(0..1) max. + // This advances the on-disk manifest; B's coordinator is now stale. + mutate_main( + &mut db_a, + CARDINALITY_MUTATIONS, + "add_employment", + &params(&[("$person", "Alice"), ("$company", "Acme")]), + ) + .await + .unwrap(); + + // Handle B (stale, never read since A committed) inserts a second WorksAt for + // Alice. B is non-strict + under a WriteTxn, so collapse #1 skips the open and the + // cardinality scan reopens: it MUST read live HEAD (Alice has 1) → reject (1+1 > 1), + // not the stale base (Alice has 0) → which would wrongly pass and commit a 2nd edge. 
+ let err = mutate_main( + &mut db_b, + CARDINALITY_MUTATIONS, + "add_employment", + &params(&[("$person", "Alice"), ("$company", "Beta")]), + ) + .await + .unwrap_err(); + assert!( + err.to_string().to_lowercase().contains("cardinality") + || err.to_string().to_lowercase().contains("@card"), + "a stale-handle edge insert must be rejected by @card against live HEAD, got: {}", + err + ); +} + #[tokio::test] async fn cardinality_rejected_on_jsonl_load() { // Already covered by existing loader Phase 3 logic but assert the diff --git a/crates/omnigraph/tests/write_cost.rs b/crates/omnigraph/tests/write_cost.rs index c7e8528..6cbf763 100644 --- a/crates/omnigraph/tests/write_cost.rs +++ b/crates/omnigraph/tests/write_cost.rs @@ -24,10 +24,10 @@ mod helpers; use helpers::cost::{ - IoCounts, assert_flat, assert_grows, local_graph, measure_insert, measure_insert_as, + IoCounts, assert_flat, assert_grows, local_graph, measure, measure_insert, measure_insert_as, measure_with_staged, }; -use helpers::{MUTATION_QUERIES, commit_many, commit_many_as, mixed_params}; +use helpers::{MUTATION_QUERIES, commit_many, commit_many_as, init_and_load, mixed_params}; // ── (A) The internal-table LOCK — the acceptance test for step 2 (compaction) ── // @@ -130,7 +130,16 @@ async fn single_insert_data_write_is_bounded() { /// At a fixed shallow depth, the per-write object-store read count is below a /// documented ceiling. Fails the moment a change *adds* a round-trip on the write -/// path — the "no new round-trip" guard (calibrated: ~50 at depth ~5). 
+/// +/// Two folds keep the count low: RFC-013 Phase 7 put the `graph_commit` + +/// `graph_head` rows in the same publish merge-insert (no extra `__manifest` +/// write/scan per commit), and RFC-013 P2 collapsed the publish path's FOUR +/// `__manifest` scans (table locations + version entries + tombstones + a +/// separate `read_graph_lineage` for the parent) into ONE — the +/// `manifest_reads` sub-ceiling below would trip if any of those scans crept +/// back. Calibrated at depth ~5: ~26 `__manifest` reads / ~36 total after the +/// P2 fold (was ~44 / ~54 with the four separate scans). #[tokio::test] async fn write_op_count_ceiling_at_shallow_depth() { let dir = tempfile::tempdir().unwrap(); @@ -141,6 +150,16 @@ async fn write_op_count_ceiling_at_shallow_depth() { "depth~5: data={} __manifest={} _graph_commits={} total_reads={}", io.data_reads, io.manifest_reads, io.commit_graph_reads, io.total_reads() ); + // Sub-ceiling on `__manifest` reads specifically: the publish path does one + // scan, not four. ~26 measured at this depth; a re-added scan would push it + // well past this. (Deterministic on local FS.) + const MANIFEST_CEILING: u64 = 34; + assert!( + io.manifest_reads <= MANIFEST_CEILING, + "per-write __manifest reads {} exceeded ceiling {MANIFEST_CEILING} — a publish-path \ + scan was re-added (RFC-013 P2 folds them into one)", + io.manifest_reads, + ); const CEILING: u64 = 80; assert!( io.total_reads() <= CEILING, @@ -169,3 +188,86 @@ async fn keyed_insert_routes_through_merge_insert_only() { assert_eq!(staged.stage_append, 0, "keyed insert must not stage_append"); assert_eq!(staged.create_vector_index, 0, "no inline vector-index build on a plain insert"); } + +// ── (D) Step-3b capture-once fitness asserts (RED today → GREEN after WriteTxn) ── + +/// A write must validate the schema contract EXACTLY ONCE (3 `read_text` + 2 `exists`). 
+/// Today the write path re-validates at every resolve point (entry, per-table +/// `resolved_branch_target`, commit-time `fresh_snapshot_for_branch`), so the delta is +/// a multiple of that. Step 3b's `WriteTxn` validates once and threads it. The shape is +/// the write twin of `warm_read_cost.rs::warm_query_validates_schema_contract_once`, +/// built with ZERO production change via the counting storage adapter. +#[tokio::test] +async fn write_validates_schema_contract_once() { + use omnigraph::instrumentation::CountingStorageAdapter; + use omnigraph::storage::storage_for_uri; + + let dir = tempfile::tempdir().unwrap(); + let _ = init_and_load(&dir).await; + let uri = dir.path().to_str().unwrap(); + let (adapter, counts) = CountingStorageAdapter::new(storage_for_uri(uri).unwrap()); + let db = omnigraph::db::Omnigraph::open_with_storage(uri, adapter) + .await + .unwrap(); + + let before_read_text = counts.read_text(); + let before_exists = counts.exists(); + db.mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "schema_once")], &[("$age", 30)]), + ) + .await + .unwrap(); + + let read_text_delta = counts.read_text() - before_read_text; + let exists_delta = counts.exists() - before_exists; + eprintln!("schema-contract reads on one write: read_text={read_text_delta} exists={exists_delta}"); + assert_eq!( + read_text_delta, 3, + "a write must validate the schema contract once (3 reads), not N times", + ); + assert_eq!( + exists_delta, 2, + "a write must probe contract-file existence once (2 probes), not N times", + ); +} + +/// A keyed single-table write must open its DATA table AT MOST ONCE. Today it opens +/// ~4× (accumulation, staging, commit drift-guard, publish-prepare/index-build), each +/// a fresh cold `Dataset::open`. 
Step 3b opens the base once (a *session-aware* base +/// open is deferred to step 5), threads the commit-return handle, and replaces the +/// drift-guard open with a cheap `latest_version_id` probe — collapsing to 1 open. +/// Counted by `data_open_count`, the +/// table-class-scoped chokepoint probe: the internal-table opens (publisher CAS + +/// commit-graph append) are EXCLUDED, since they are unrelated to data-table reuse and +/// would otherwise keep this count >1 regardless of threading. (`forbidden_apis` keeps +/// engine code outside the storage layer from opening datasets except through the +/// instrumented chokepoints — `table_store.rs`'s own direct opens are branch-management +/// ops, not this keyed-write path.) +#[tokio::test] +async fn keyed_insert_opens_table_at_most_once() { + let dir = tempfile::tempdir().unwrap(); + let mut db = local_graph(&dir).await; + let io = { + let (res, io) = measure(db.mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "opens")], &[("$age", 30)]), + )) + .await; + res.unwrap(); + io + }; + eprintln!( + "data_open_count={} internal_open_count={} for a single-table keyed insert", + io.data_open_count, io.internal_open_count + ); + assert!( + io.data_open_count <= 1, + "a keyed single-table write must open its data table at most once, got {}", + io.data_open_count, + ); +} diff --git a/crates/omnigraph/tests/writes.rs b/crates/omnigraph/tests/writes.rs index 9cb8689..b57d8fd 100644 --- a/crates/omnigraph/tests/writes.rs +++ b/crates/omnigraph/tests/writes.rs @@ -613,7 +613,10 @@ async fn mixed_insert_and_update_on_same_person_coalesces_to_one_merge() { "dedupe must keep the update's age value, not the insert's", ); - // One-publish guarantee: manifest version advanced by exactly 1. + // One-publish guarantee: manifest version advanced by exactly 1. 
The graph + // commit (`graph_commit` + `graph_head` rows) rides the SAME publish CAS as + // the table-version rows (RFC-013 Phase 7), so one graph commit is exactly + // one manifest version bump. let post_version = version_main(&db).await.unwrap(); assert_eq!( post_version, @@ -659,7 +662,9 @@ async fn multiple_appends_to_same_edge_coalesce_to_one_append() { let edges_after = count_rows(&db, "edge:Knows").await; assert_eq!(edges_after, edges_before + 2); - // One manifest version bump for the two-edge query (atomic publish). + // One manifest version bump for the two-edge query (atomic publish): the + // graph commit rides the same publish CAS as the table-version rows + // (RFC-013 Phase 7). let post_version = version_main(&db).await.unwrap(); assert_eq!( post_version, @@ -690,6 +695,8 @@ async fn multi_statement_inserts_publish_exactly_once() { .await .unwrap(); + // One manifest version bump: the graph commit rides the same publish CAS + // as the table-version rows (RFC-013 Phase 7). let post_version = version_main(&db).await.unwrap(); assert_eq!( post_version, @@ -1005,6 +1012,8 @@ async fn chained_updates_with_overlapping_predicate_respects_intermediate_value( "chained-update final value must reflect the second update applied to op-1's pending value" ); + // One manifest version bump: the graph commit rides the same publish CAS + // as the table-version rows (RFC-013 Phase 7). let post_version = version_main(&db).await.unwrap(); assert_eq!( post_version, @@ -1043,6 +1052,9 @@ async fn multi_statement_delete_on_same_node_table() { pre_persons - 2, "both deletes must land", ); + // One manifest version bump: the graph commit (delete-only queries record + // one too) rides the same publish CAS as the table-version rows + // (RFC-013 Phase 7). 
let post_version = version_main(&db).await.unwrap(); assert_eq!( post_version, diff --git a/docs/dev/architecture.md b/docs/dev/architecture.md index 004a98a..972157b 100644 --- a/docs/dev/architecture.md +++ b/docs/dev/architecture.md @@ -133,7 +133,7 @@ flowchart TB subgraph state[graph state] coord[GraphCoordinator]:::l2 mr[ManifestCoordinator
db/manifest.rs]:::l2 - cg[CommitGraph
_graph_commits.lance]:::l2 + cg[CommitGraph
projection of __manifest graph_commit/graph_head rows]:::l2 stg[MutationStaging
per-query in-memory accumulator
exec/staging.rs]:::l2 end diff --git a/docs/dev/handoff-rfc-013-write-path.md b/docs/dev/handoff-rfc-013-write-path.md new file mode 100644 index 0000000..3706012 --- /dev/null +++ b/docs/dev/handoff-rfc-013-write-path.md @@ -0,0 +1,460 @@ +# Handoff: finishing RFC-013 (write-path latency + correctness) + +**Status:** living handoff. **Source of truth is [`rfc-013-write-path-latency.md`](rfc-013-write-path-latency.md)** — +this doc is the *current-state map + the decisions/validation from the latest work cycle ++ the concrete next actions*. When they disagree, the RFC wins (and fix this doc). + +**Audience:** the engineer/agent who picks up RFC-013 next. + +--- + +## 0. TL;DR — where we are and what's next + +RFC-013 makes the write path fast **and** correct on object storage (217 Lance tables +under one `__manifest` catalog, on R2/S3). It is sequenced as steps; read §9 of the RFC +for the canonical list. Current reality: + +**Landed on `main`:** +- **Step 1** — Tier-1 cost gate + the shared `helpers::cost` harness (#288). +- **Step 3a** — opener bypass: write opens go direct (`Dataset::open` by URI + version) + instead of the Lance-namespace builder (#288). **This already banked the dominant + depth win** — see §2 below; it reframes everything. +- **Step 2a** — internal-table compaction: `optimize` now compacts `__manifest` / + `_graph_commits` / `_graph_commit_actors` (#291). Plus the RFC latency-model + correction (#292). +- **Optimize-vs-write race** — optimize survives a cross-process write race on the + same table (#297, **LANDED** — origin/main `6d4606a8`; see §6 for why it's not + redundant with Design A). Step 3b stacks on top of this. + +**Open PRs (land these; relationships in §7):** +- **#296** `correctness-by-design-fix` — recovery roll-forward converges on a concurrent + manifest advance (the fix for the flaky `iss-schema-apply-reopen-recovery-race`). 
+ **MERGED to main and integrated into this branch** — the converge helper now threads + Phase-7's manifest-CAS recovery `graph_commit_id` (see `converge_or_defer_roll_forward`). +- **#295** `docs/rfc-013-step-3b` — the step-3b RFC doc. +- **#254** `ragnorc/bug-4-schema-apply-occ` — schema-apply vs optimize false-fail + (same op-class family as #297, logical side). + +**Step 3b is DONE** (capture-once `WriteTxn`, schema-once + open-collapse; see §4) on +`rfc-013-step-3b-writetxn-v2`. **Next: Phase 7 (step 4), then the big one — Design A / +`PublishPlan` unification (step 5)** — see §5, the convergent fix for the bug *class* this +area keeps generating, which also absorbs 3b's deferred session-aware write opens. + +--- + +## 1. The corrected mental model (read this before touching anything) + +Three reframes from the latest cycle that the older RFC prose may not fully reflect: + +### 1a. 3a already won the depth fight → the residual is constant-factor + RTT +Before 3a, the write re-opened each table through the lance-namespace builder ~13×, and +that path was **O(depth)** (it re-opened `__manifest` + `list_table_versions` per open — +**not** a Lance back-walk; the root cause was OmniGraph's own namespace round-trips, not +Lance — validated against Lance source). 3a swapped it for the direct opener, which is +**O(1)** (`from_uri(loc).with_version(N)` = arithmetic path + one HEAD). So: + +- The dominant **O(depth) data-table** term is **gone**. +- Step 2a flattened the secondary **internal-table** scan term. +- What remains is the **~110-hop serial backbone × RTT + compute** — a constant in + depth. The latency model is **`wall = (serial_hops + ops/effective_concurrency)·RTT + + compute`**; on a capped store (R2) the op-count term re-enters wall-clock, on an + unlimited store it parallelizes away. Measured: prod one-row write 27→15.76s after + 2a; the remaining 15.76s is the serial backbone — **step 3b's target**, not step 2's. 
+- Step 3b's win is therefore the **call-count/RTT collapse** (redundant opens, the + flat-46 schema reads), NOT a depth slope. Don't expect a depth-slope improvement from + 3b; gate it on the constant-factor (S3 round-trips), not a curve. + +### 1b. Two op classes, two commit models (the §6.6 principle) +Every concurrency bug in this area is **one op class using the other's commit model**: + +| class | examples | commutes? | correct commit model | +|---|---|---|---| +| **maintenance** | compaction (`Rewrite`), `optimize_indices` | yes (content-preserving) | Lance native rebase + app reopen/replan on real overlap + **monotonic manifest fast-forward** — no epoch, no read-set | +| **logical mutation** | load / mutate / merge / delete | no (lost-update, write-skew) | strict cross-process OCC: read-set + write-set CAS under the `writer_epoch` fence | + +Applying strict OCC + equality-CAS uniformly is the mistake: too strong for maintenance +(false conflicts — #297's bug), too weak for logical cross-process (§6.5 corruption). + +### 1c. The root liability (what keeps generating these bugs) +Lance gives **per-table atomic commits** but **no cross-table/cross-step atomicity**, so +every multi-commit op advances per-table Lance HEAD **before** the manifest references it +(the "A-before-B window"). The resulting `HEAD vs manifest` delta is **ambiguous** +(external drift? my own in-flight work? a crashed writer?), and **many uncoordinated code +paths each re-interpret it** (4 writers + the maintenance path + recovery + the write-path +drift guard). Each interpreter is a fresh chance to misclassify. That is the bug class: +- §6.5 cross-process logical corruption, +- #297's own-HEAD-drift misclassification, +- the flaky write-path "HEAD ahead of manifest, run repair" guard, +- the recovery classifier edges. + +**The convergent fix is Design A (one publish authority — step 5); Lance MTT eventually +retires the window entirely.** See §5. + +### 1d. 
The second facet: the write base is a stale pin (no probe) +The READ path resolves its base behind a freshness probe (`resolve_target_inner` +omnigraph.rs:~1072 → `probe_latest_incarnation` → `refresh_manifest_only`); the WRITE path +does NOT (`resolved_branch_target` omnigraph.rs:~778 returns the warm `coord.snapshot()` for +the bound branch, no probe). So a long-lived server's write base lags the live manifest. That +single staleness feeds **two distinct failure modes**, both surfaced this cycle: + +1. **Stale validation *reads* → integrity under-enforced.** Write-path RI checks read + committed state off the stale base. 3b's collapse #1 made it worse for edge `@card`: + `edge_cardinality_read_handle` (mutation.rs:~614) scans the pinned `txn.base` instead of + live HEAD (was live HEAD pre-3b), so a concurrent edge committed after `txn` capture is + uncounted → a `@card` max can be exceeded (cursor **High** / codex **P1** on #298, + **VALID**). **#298 fix: restore the live-HEAD read for that scan** (un-regress; gate-safe — + the `data_open_count` gate is a node insert) + a deterministic regression test (commit A's + edge, then B validates → must see A) + correct the wrong "pinned base == live HEAD" doc + comment (mutation.rs:~605-613, which assumes a single writer). The *structural* liability + underneath: there is **no unified write-validation read-set** — endpoint + (`ensure_node_id_exists`, warm `snapshot_for_branch`), cardinality (mutation: pinned + `txn.base`; loader: warm `snapshot_for_branch` — the SAME check forks per write path), + commit drift guard (live `fresh_snapshot_for_branch`), and uniqueness + (`enforce_unique_constraints_intra_batch`, intra-batch only — cross-version uniqueness is a + documented gap). Three freshness levels chosen ad hoc, none re-validated at commit → the + §7.1 TOCTOU class, and each new constraint forks the pattern again. + +2. 
**Stale OCC *pin* → false-fail on a maintenance advance.** A served strict update/delete + pins the stale base version, then false-fails `ExpectedVersionMismatch` after an external + `optimize` advanced `__manifest` — even though the advance was content-preserving + compaction the logical write should fast-forward past (invariant 7). It's the **write-side + mirror of #297/§6.6** (#297 made optimize fast-forward past a logical write; this is a + logical write that must fast-forward past optimize). A served read clears it (the read + probes the shared coordinator). Validated repro on prod (omnigraph.ragnor.co) + + `writes.rs::served_strict_delete_after_external_optimize_advance_auto_refreshes` + (`#[ignore]` on branch `fix/write-path-stale-view-probe`). **The naive "just probe" fix is + proven wrong** — a blanket probe silently refreshes past *logical* advances too, breaking + `consistency::stale_handle_public_mutation_must_refresh_then_retry` (the deliberate + cross-process lost-update OCC primitive). The fix must **discriminate by op class**. + +**Both fold into Design A (step 5), same as §1c.** `open_txn`'s one warm probe makes the base +fresh (absorbs maintenance advances cheaply); the **op-class-aware strict precondition** — +derive from Lance's per-version transaction metadata (all `Rewrite`/`ReserveFragments` = +maintenance → fast-forward the pin; any `Append`/`Update`/`Delete`/`Merge` = logical → fail +loudly; NO parallel marker, invariant 1/15) — is the correctness fence for anything that lands +after. And the §7.1 read-set-in-CAS unifies the validation read-set + re-validates it under the +`graph_head` contention. So **the stale-view false-fail, the cardinality/validation-read-set +liability, and #297's mirror are one bug** (the write base is a stale, un-probed, un-classified +pin) with **one home: the single PublishPlan delta-interpreter** (§1c + §5). Strong corroboration +of Design A — three symptoms, one fix. + +--- + +## 2. 
Validated facts — do NOT re-derive these + +Established this cycle against **Lance 7.0.0 source** +(`~/.cargo/registry/src/index.crates.io-*/lance-7.0.0`) and current engine code. Cited so +you can trust them without re-investigating. + +**Lance (upstream):** +- `from_uri(loc).with_version(N).load()` and `checkout_version(N)` are **O(1)** (computed + V2 path `_versions/{u64::MAX-N:020}.manifest` + one HEAD; no listing/back-walk). + (`lance-table/src/io/commit.rs` `default_resolve_version`.) +- A shared `Arc<Session>` (`DatasetBuilder::with_session`) warms metadata/index caches + keyed by `(URI, version, e_tag)`. Caveat: the *first* manifest read on open is uncached + — the Session warms the *scan/index* metadata, not the first open. **`WriteParams` *does* + carry a `session` field** (`lance/src/dataset/write.rs`), but it only matters on the + `WriteDestination::Uri` arm; OmniGraph's staged path always drives off an **already-open + `Dataset`**, and Lance takes the store/session from that handle. So to attach the shared + Session to a write base, open read-style (`open_table_dataset` → `from_uri().with_version() + .with_session()`) and drive the staged write off that handle. +- A held `Arc<Dataset>` at a pinned version is `Send + Sync`, immutable, safe to reuse for + many scans/count/staged-write base in one txn (OmniGraph's `TableHandleCache` already + relies on this). +- **No compaction `RetryExecutor`** (only Delete/MergeInsert/Update have one). + `commit_compaction` commits a fixed `Rewrite` via `apply_commit` direct. In + `commit_transaction`, a semantic `RetryableCommitConflict` **escapes the retry loop** + via `?` at `io/commit.rs:979`; the loop only retries the OCC `CommitConflict` + (`:1096`), and even that re-rebases the *same* transaction (never re-plans). 
⇒ + **compaction needs app-level reopen+REPLAN; you cannot "set conflict_retries" and let + Lance own it.** +- `check_rewrite_txn`: a `Rewrite` rebases **cleanly** past a concurrent `Append`/disjoint + `Update`/`Delete` (preserving both); only a same-fragment overlap yields a retryable + conflict. ⇒ the common concurrent insert/update/delete is rebased for free; the app + retry fires only on real overlap. + +**Engine (internal):** +- Read path (post-#268) already has the capture-once machinery: `Snapshot` (`db/manifest.rs`), + warm `GraphCoordinator` behind a `latest_version_id`/incarnation probe, a held + `TableHandleCache` keyed `(table,branch,version,e_tag)`, **one shared `Session` per + graph** (`read_caches.session`). **Writes bypass all of it by construction** + (`resolved_branch_target` returns `read_caches: None`; the 3a write opener attaches no + session and opens by latest, not pinned version). +- A single write opens each table **3–4×** (accumulation → staging reopen → commit + drift-guard → publish prepare), each a fresh cold open. `validate_schema_contract` + (`db/schema_state.rs`, via `ensure_schema_state_valid`) runs uncached (~3 `read_text` + + 2 `exists`) at every resolve point (~the flat-46). Both are constant-factor, flat in + depth — 3b's targets. +- Strict-op guards are the lost-update floor (3 layers: pre-stage `ensure_expected_version` + `table_store.rs`; commit-time strict drift `exec/staging.rs`; publisher CAS + `publisher.rs`). Capture-once **supplies** the pinned operand — never remove a guard. +- Fork-on-first-write authority reads (`classify_fork_ref` → `fresh_snapshot_for_branch`) + must stay **fresh** (not served from a pinned base). +- Cost harness: `helpers::cost` (`measure`/`measure_with_staged`/`IoCounts`/`assert_flat`/ + `local_graph`/`s3_graph`). 
The schema-once assert can reuse `CountingStorageAdapter` + (`warm_read_cost.rs::warm_query_validates_schema_contract_once`) with **zero** prod + change; an open-count assert wants a small `open_count` AtomicU64 in `QueryIoProbes` + (copy the `probe_count`/`record_probe` pattern). The forbidden-API guard + (`tests/forbidden_apis.rs`) makes an instrumentation-level counter complete. + +--- + +## 3. The #297 cycle (this branch) — what it is, and the lesson + +`fix-optimize-concurrency-race` (5 commits): a CLI `optimize` racing a served write on the +same table failed (Lance Rewrite lost, or the equality-CAS publish lost). Fix: unify both +compaction paths on the internal path's **reopen+replan** shape, with a **two-level retry** +— outer loop reopens+replans on a real Lance overlap; inner Phase-C loop makes the manifest +publish a **monotonic fast-forward** (advance to compacted version `N`, or no-op when the +manifest already moved to `≥ N`), never the strict equality CAS. Sidecar written once; +in-process queue kept as a contention reducer (not the cross-process guard); no `writer_epoch`. + +**Two review rounds surfaced two follow-on bugs I introduced with the retry loop** — both +fixed, both regression-tested (own-HEAD-drift via negative control): +1. **Own-HEAD-drift misclassification** (`56d004e0`): the drift guard re-ran every + iteration and, after a partial Phase-B commit (auto_cleanup strip or compact, then a + later op conflicts), saw `HEAD > manifest` from *our own* covered work and deleted the + sidecar + returned `skipped_for_drift` (stranding uncovered drift). Fix: track + `head_advanced`; the drift guard fires only when `!head_advanced`. +2. **Publish exhaustion spurious error** (`e9d16a2c`): the publish loop returned `Err` on + its final retry even if the conflict meant a concurrent writer already published `≥ N` + (postcondition met). Fix: re-check `current >= state.version` on exhaustion. 
+ +**The lesson (write it on the wall):** *wrapping a sequence of side-effecting commits in a +retry silently converts every "checked once, before any side effect" precondition into +"re-checked after partial side effects."* That's a distinct bug class; it needs +fault-injection tests **at each commit boundary**, not just end-to-end concurrency tests. +(The `optimize.before_compact` / `optimize.inject_reindex_conflict` failpoints exist for +exactly this.) + +**Temporary mechanism flag:** `head_advanced` is an in-memory proxy for "is this HEAD +movement mine." Under Design A the authority answers that from the plan/sidecar **identity** +— so `head_advanced` is the part that gets *replaced*, while the monotonic-publish + +reopen/replan **semantics** are permanent. (Noted in RFC §6.6.) + +--- + +## 4. DONE: Step 3b — capture-once `WriteTxn` (shipped on `rfc-013-step-3b-writetxn-v2`) + +**Delivered:** on the **table-touch hot path**, a single `mutate`/`load` validates the schema +contract **once** and opens each touched data table **at most once** — a constant-factor/RTT +win (not a depth-slope win; 1a). Two cost gates in `write_cost.rs` lock it (both on a node +insert): `write_validates_schema_contract_once` (3 `read_text` / 2 `exists`, was 12/9) and +`keyed_insert_opens_table_at_most_once` (`data_open_count <= 1`, was 4). The carrier is the +minimal `WriteTxn { branch, base }`, threaded as `Option<&WriteTxn>` (`Some` on the hot +mutate/load path, `None` byte-identical everywhere else); it **converges into** step 5's +`PublishPlan`. + +**Not "once" everywhere (scope, not regression):** edge endpoint / cardinality RI validation +(`ensure_node_id_exists`, the loader's RI + cardinality) still resolves through +`snapshot_for_branch` and re-validates the schema — and reads **warm**, not live. Threading +`txn.base` there to make it "once" would re-introduce the stale-read class the #298 cardinality +fix removed (it now reads live HEAD). 
Doing schema-once *and* fresh reads for those validations +needs the unified, re-checked read-set — **step 4 §7.1** (§1d). So #298 **un-regresses +cardinality only; it does not close write-validation freshness.** No edge-insert/load schema-once +gate yet (only the node gates above). + +Commits (off merged-#297 main): +- **Stage 0** — scope `open_count` → `data_open_count`/`internal_open_count` by URI class + (the review fix: `open_dataset_tracked` also opens `__manifest`/`_graph_commits`, so the + raw counter conflated them and the gate was unreachable). Re-baselined RED 4. +- **Commit A (schema-once)** — capture `txn` once at entry (the single validation); the 4 + validation sites collapse: S1 (entry `ensure_schema_state_valid`) removed; S3a + (`open_for_mutation_on_branch`) + S3b (`prepare_updates_for_commit`) source `txn.base`; + S4 (`commit_all`) uses new `fresh_snapshot_for_branch_unchecked` (the OCC manifest re-read + minus the schema re-validation). `fresh_snapshot_for_branch{,_unchecked}` now read the + manifest directly via `ManifestCoordinator` (drops a spurious commit-graph `exists` probe; + same `Snapshot`). +- **Commit B (open collapse 4→1)** — #1 accumulation open ELIMINATED (the node path discarded + the handle; read `txn.base.entry().table_version`); #2 staging open KEPT (the one open); + #3 commit drift-guard reads live HEAD via `entry.dataset.dataset().latest_version_id()` (a + cheap manifest-pointer probe off the staged handle, not a fresh open); #4 index build reuses + the `commit_staged` handle threaded through `CommittedMutation`/`prepare_updates_for_commit`. +- **Commit B.1 + cleanup** — named the two positional returns (`OpenedForMutation`, + `CommittedMutation`) + a `debug_assert` pinning the open-skip contract; **removed the + unearned `WriteTxn.session` field** (the collapse uses skip/probe/reuse, not a session). + +**RFC §4.1 corrections — how they resolved:** +1. 
*Thread the evolving handle, not a version-keyed cache* → realized as collapse #4 (carry + the `commit_staged` handle forward into the index build). +2. *Don't forbid re-resolution* → honored: the commit-time OCC re-read + (`fresh_snapshot_for_branch_unchecked` — fresh manifest, only schema-revalidation dropped) + and the fork-authority reads stay fresh. +3. *Minimal carrier* → `WriteTxn { branch, base }` (even the `session` from the original + sketch was dropped as unearned). + +**Deferred to step 5 (NOT in this PR):** session-aware write base opens. The one remaining +open (#2) stays a HEAD open; warming the shared `Session` across writes is an object-store +(S3) phenomenon invisible on local FS, so it earns its own `write_cost_s3.rs` gate in step 5, +where `txn` becomes the non-optional publish carrier. No new concurrency test was needed here: +#2 stays a HEAD open (no pinned+session base introduced), so the publisher CAS + #3 live-HEAD +probe fences are unchanged (covered by the green `writes.rs`/`consistency.rs`). + +**Guardrails (don't regress):** schema validation is deliberately uncached for drift +detection — collapse to 1 *per write*, never cache across writes on a long-lived handle +(`lifecycle::long_lived_handle_rejects_schema_*`). The commit-time fresh read is OCC +machinery, not redundancy. Keep all 3 strict-op guards. Keep fork-authority reads fresh. +Pin the *correct* branch (server-bound-to-main writing a feature branch falls to a fresh +open). A branch `rfc-013-step-3b-writetxn` exists off an earlier main; rebase onto the +post-#297 main before starting. + +--- + +## 5. Design A — the `PublishPlan` unification (step 5) = the convergent fix + +**This is the real fix for the bug class in §1c.** Collapse the four hand-rolled writers + +the maintenance path into **one `publish(txn, plan)` authority** where the CAS + bounded +retry is **unconditional and unbypassable** (no caller can "hold the queue → skip the CAS"). 
+Properties: +- **One interpreter of the `HEAD vs manifest` delta** — and "is this my work?" is answered + by the plan/sidecar **identity**, not a re-derived comparison. The own-HEAD-drift bug, the + §6.5 writers, the write-path guard — all close *by construction*. +- **Recovery = the same `PublishPlan` re-applied** — the crash-recovery interpreter and the + live interpreter become the same code (`iss-merge-recovery-partial-rollforward` gone). +- Each `TableAction` commits by its **class** (§1b): `Rewrite` = maintenance (Lance rebase + + reopen/replan + monotonic fast-forward, **no epoch**); load/mutate = logical (strict OCC + + `writer_epoch`). + +**Why it composes with Lance MTT (don't over-build):** +- The **unification itself is convergent** — when MTT lands, it slots *underneath* the same + authority; nothing wasted. Build this. +- The **`writer_epoch`** is the one MTT-redundant piece (MTT's commit-handler lease subsumes + a cross-process fence). Build it *last and minimally*, gated on actually deploying + multi-writer topologies. Per the deny-list, don't reimplement what the substrate will own. + +**Sequencing judgment (this cycle's strongest signal):** the bug density here (this PR alone += 3 review rounds, all "a writer re-interprets the delta") means the current N-writers interim +is high integrated-over-time liability. **Consider pulling the *convergent half* of step 5 +(the single authority + recovery-as-plan) forward — possibly ahead of 3b** — because it stops +the bug class rather than patching instances. #297 + #254 are the *de-risking inputs*: they +validate the maintenance-class and logical-class commit models in isolation first, so Design +A implements a known spec rather than designing under refactor pressure. Do NOT build more +substrate-shaped scaffolding (custom WAL / job queue / second coordination table) to paper +over the window — strictly higher liability than either Design A or waiting for MTT. 
+ +**Deeper-than-A (post-MTT or as Lance exposes uncommitted variants):** all-uncommitted-fragments ++ one manifest commit would shrink the A-before-B window itself, blocked today by Lance not +exposing uncommitted variants for `compact_files` / `optimize_indices` / vector index (#6666 +open; delete #6658 shipped). Track, don't build yet. + +### 5.1 Step-5 design constraints inherited from the #295 spec review +3b shipped a **minimal** `WriteTxn { branch, base }` (schema-once + open-collapse via +eliminate/probe/thread) and **deferred** the full §4.1 opener-unification — the pinned-base +opener, the shared-`Session` open, the write-local **handle cache**, and the strict-op +conflict-timing move — to step 5. So the greptile-bot comments on the #295 *spec* were **moot +for #298** (which built none of those constructs) but are **load-bearing constraints for step +5** when it builds them. Bank them: +1. **Handle cache must be `Send + Sync`** (`Mutex<HashMap<…>>`, not `RefCell`) if + `WriteTxn::open(&self)` is shared across concurrent stage futures — a `RefCell` compiles + but panics when two stages poll. Or make it `&mut self` (no parallel-stage sharing). This + is the deny-list "in-process-only `Dataset` impls — `Send + Sync`" item. +2. **The strict-op timing move needs an explicit retry contract.** If step 5 moves + strict-op conflict detection from open-time `ensure_expected_version` to commit-time CAS + (the §4.1 pinned-base design), it MUST specify: the txn is **discarded after any commit** + (success or conflict — the handle cache is commit-invalidated), and the retry **re-opens a + fresh `WriteTxn` at the new HEAD** (never re-stages against the stale pinned base — that + reproduces the lost-update). **This is the same retry/refresh contract as the stale-view + false-fail (§1d.2)** — the op-class-aware precondition + "fresh base on retry" are one + design point. 
Today (#298) strict ops keep open-at-HEAD + `ensure_expected_version`, so the + contract is unchanged; step 5 owns it the moment it pins strict reads to the base. +3. **The opener-equivalence test must be non-trivial.** A differential test that only passes + when `HEAD == base` proves nothing about pinning. To actually prove "`WriteTxn::open` + returns the pinned base, not HEAD," the test must **advance the branch HEAD externally + (direct Lance write), then assert the txn open still reads the base version** — and that a + strict write then fails `ExpectedVersionMismatch` at commit (verifying the timing move). + +--- + +## 6. Why #297 is still needed even if you do Design A +- Design A **relocates** #297's maintenance-class commit logic into the authority's + `TableAction::Rewrite` path; it does not eliminate it. #297 is the *validated spec + tests*. +- The two regression tests + §6.6 are the **contract** Design A must keep green. +- The prod bug is **live**; Design A is the largest write-path change in the RFC. Don't hold a + correctness fix hostage to a big refactor, and don't do a big refactor under bug-fix urgency. +- Genuinely throwaway under Design A: only the loop's *location* + the `head_advanced` proxy + (~a dozen lines). Everything else relocates or persists. **#297 LANDED.** + +--- + +## 7. Open PRs and their relationships +- **#297** — maintenance-class fix (optimize vs write). **LANDED** (origin/main `6d4606a8`); + step 3b stacks on it. +- **#254** — logical-class fix (schema-apply vs optimize false-fail). Same op-class family; + both are de-risking inputs for Design A's per-class commit models. +- **#296** — recovery roll-forward converges on concurrent manifest advance. The fix + for the flaky `iss-schema-apply-reopen-recovery-race`. It touches `recovery.rs` and is + *aligned* with #297's "postcondition is the state, not winning the CAS" principle. 
**#296 + landed on main first and is merged into this branch:** the converge helper + (`converge_or_defer_roll_forward`) was reconciled with Phase-7's manifest-CAS roll-forward — + on convergence the audit references the winner's folded `graph_commit_id` (the current + `graph_head`), not a freshly minted one. +- **#295** — the step-3b RFC doc (apply §4's three corrections to it). + +--- + +## 8. Remaining RFC steps after 3b (RFC §9 is canonical) +- **#298 follow-up (do on the 3b PR, before merge): the edge-`@card` stale-read regression** + (§1d.1). Restore the live-HEAD cardinality scan, add the deterministic regression test, fix + the wrong doc comment. Small, gate-safe, un-regresses an integrity check (invariant 9). The + residual concurrent TOCTOU is the §7.1 gap (step 4) — un-widen here, don't over-reach. +- **Step 4 / Phase 7** (`iss-991`): lineage into `__manifest` (publish `graph_commit` + + mutable `graph_head:<branch>` in the same merge-insert; `_graph_commits` becomes a + projection). Removes the per-write `commit_graph.refresh`; closes the manifest→commit-graph + atomicity + commit-graph-parent-under-concurrency gaps. **Hard prereq: step 2 (done).** + Carries the §7.1 *concurrent* write-skew fix (needs the `graph_head` contention row) — + **frame §7.1 as "unify the entire write-validation read-set" (endpoint + cardinality + + cross-version uniqueness), not merely "add `graph_head`"** (§1d.1): the bespoke + `edge_cardinality_read_handle` and the mutation-vs-loader freshness fork dissolve into one + pinned read-set re-validated under the `graph_head` contention, or the liability survives as + a second special-case. +- **Step 5 / Design A** — §5 above. **Acceptance item: the served-strict-write stale-view + false-fail** (§1d.2) — the op-class-aware precondition + `open_txn` probe. 
The contract is + two tests passing *together*: un-ignore + `writes.rs::served_strict_delete_after_external_optimize_advance_auto_refreshes` (goes green) + *while* `consistency::stale_handle_public_mutation_must_refresh_then_retry` stays green + (maintenance fast-forwards; logical fails loudly). Self-contained enough to ship standalone + like #297 if prod pain is acute; otherwise fold into the single PublishPlan delta-interpreter. +- **Step 2b** — internal-table cleanup + the Q8 monotonic watermark (a Lance boundary tag). + Deferred: only the secondary version-count/space term, touches the read/open path, and is + MTT-redundant. Land when version-count cost bites. +- **§7.1 sequential write-skew** (`iss-overwrite-orphans-committed-edges`) — inbound-RI + validation on node removal; independent, ships anytime. +- **#20** — the prod per-write `storage.ops` span metric (RFC §5.3), still owed. +- Branch ops: Lance `Clone` for create (`iss-691`). + +--- + +## 9. Gotchas / traps (learned the hard way) +- **In-process queue ≠ cross-process lock.** Any "I hold the queue → skip the retry/CAS" + reasoning is a bug across processes. This is the recurring trap. +- **Monotonic publish must be `≥`-conditional, never "no assertion."** The `__manifest` + merge-insert is unconditional `UpdateAll` keyed on `object_id` (`publisher.rs:379`), so + the equality (or monotonic) pre-check is the *only* guard — dropping it lets `UpdateAll` + regress a newer version = lost write. +- **The drift guard interprets an ambiguous delta.** Re-evaluating it in a retry over + self-mutated state is how #297's follow-on bug happened. Gate any HEAD-vs-manifest + interpretation on "have *we* committed yet." +- **`compact_files` fires Lance's auto_cleanup GC hook** (commits with + `skip_auto_cleanup=false`, no override) — optimize strips stale `lance.auto_cleanup.*` + config before compacting to stay non-destructive on upgraded graphs. 
The strip is a + separate commit (relevant to the partial-commit retry trap). +- **Lance rebases the common concurrent case for free** — so the data-table conflict usually + surfaces as the manifest fast-forward, not a Lance error. The Lance-Rewrite-overlap path is + rare and needs failpoint injection to test. + +--- + +## 10. Verification (the gate) +- `cargo test --workspace --locked` — the canonical gate (matches CI). +- `cargo test -p omnigraph-engine --features failpoints --test failpoints optimize` — + the optimize concurrency/recovery tests. +- `cargo test -p omnigraph-engine --test write_cost` / `write_cost_s3` (bucket-gated) — + cost gates (3b adds the schema-once + open-count asserts here). +- `cargo test -p omnigraph-engine --test maintenance` — optimize/repair/cleanup. +- Re-read [`invariants.md`](invariants.md), [`lance.md`](lance.md), [`testing.md`](testing.md) + before each change (always-on requirement). + +Lance source for re-validation: +`/Users/ragnor/.cargo/registry/src/index.crates.io-*/lance-7.0.0` (key files: `io/commit.rs`, +`io/commit/conflict_resolver.rs`, `dataset/optimize.rs`, `dataset/write/retry.rs`, +`dataset/builder.rs`). diff --git a/docs/dev/index.md b/docs/dev/index.md index 23f0610..be98602 100644 --- a/docs/dev/index.md +++ b/docs/dev/index.md @@ -93,6 +93,7 @@ Working documents for in-flight feature work. Removed when the work lands. 
| CLI refactoring — one addressing & config model post-`omnigraph.yaml`: scope + `--graph` + derived access path, served-default / privileged-direct, profiles, named queries, capability classifier (completes RFC-008) | [rfc-011-cli-refactoring.md](rfc-011-cli-refactoring.md) | | Provider-independent embedding configuration — one resolved `EmbeddingConfig` + sealed provider enum (Gemini/OpenAI/Mock), identity recorded in the schema IR, query-time same-space validation, NFR floor | [rfc-012-embedding-provider-config.md](rfc-012-embedding-provider-config.md) | | Write-path latency — capture-once `WriteTxn`, version-pinned opens, one `GraphPublishAuthority` fed declarative `PublishPlan`s, manifest-authoritative lineage, epoch fence, bounded history (compaction + cleanup), and an IO-counted cost contract (`iss-write-s3-roundtrip-amplification`, `iss-991`) | [rfc-013-write-path-latency.md](rfc-013-write-path-latency.md) | +| RFC-013 handoff — current-state map, latest validation, and concrete next actions for finishing write-path latency and correctness work | [handoff-rfc-013-write-path.md](handoff-rfc-013-write-path.md) | ## Boundary diff --git a/docs/dev/invariants.md b/docs/dev/invariants.md index 3195bd0..9bb6dbd 100644 --- a/docs/dev/invariants.md +++ b/docs/dev/invariants.md @@ -211,10 +211,21 @@ them explicit. sweep has the same exposure, and always has): it may roll a live foreign writer's sidecar forward, which degrades to publisher-CAS contention for data writes but can race the schema-staging promotion for a foreign live - schema apply. Multi-process writers on one graph are already documented - one-winner-CAS territory; closing this fully needs a cross-process - serialization primitive (e.g. lease-based use of the schema-apply lock - branch) — design it before promoting multi-process write topologies. + schema apply. 
The roll-**forward** CAS contention is now + convergence-idempotent: when the publish loses the CAS to a concurrent + writer that already reached the sidecar's goal, the sweep treats it as + convergence (record the `RolledForward` audit + delete) rather than a fatal + `ExpectedVersionMismatch`, and defers when the manifest is only partway + (`converge_or_defer_roll_forward` in `db/manifest/recovery.rs`; + iss-schema-apply-reopen-recovery-race). So a concurrent advance no longer + fails the open. The schema-staging promotion race and the destructive + roll-**back** path (Lance `Restore` "trumps" a concurrent commit, so it + cannot be made idempotent — iss-recovery-sweep-live-writer-rollback) still + need the cross-process primitive. Multi-process writers on one graph are + already documented one-winner-CAS territory; closing this fully needs a + cross-process serialization primitive (e.g. lease-based use of the + schema-apply lock branch) — design it before promoting multi-process write + topologies. - **Fork reclaim is in-process-safe only:** the first write to a table on a branch forks it (a Lance `create_branch` that advances state before the manifest publish). An interrupted fork (crash, or a cancelled request @@ -242,20 +253,43 @@ them explicit. acknowledged-before-visible bug this branch fixed. Close it (local CAS primitive, or a trait-level lock requirement) before admitting any lock-free `if_match` caller. -- **Manifest→commit-graph publish atomicity:** a graph commit advances - `__manifest` (the visibility authority) and then appends `_graph_commits` as - two separate writes (`commit_updates_with_actor_with_expected`, failpoint - `graph_publish.before_commit_append`). A crash between them leaves the manifest - at version N with no commit-graph row for N. 
Live reads and durability are - unaffected — the live version resolves via the manifest - (`GraphCoordinator::version()`), not the commit-graph head — and the open-time - recovery sweep does NOT repair it (`lance_head == manifest_pinned` classifies - `NoMovement`; a recovery sidecar would not change this). Impact is bounded to - commit history: `commit list` misses N, time-travel by commit id to N fails, - and merge-base loses a node (a likely-benign off-by-one re-merge). This affects - every publish, not a specific maintenance command. Eventual fix: make the - commit graph reconcilable from the manifest (or the two writes atomic) — not a - recovery-sidecar concern. +- **Manifest→commit-graph publish atomicity — CLOSED (RFC-013 Phase 7):** graph + lineage now lives ONLY in `__manifest`, as `graph_commit` + `graph_head:<branch>` + rows written in the SAME `MergeInsertBuilder` commit as the table-version rows + (`commit_changes_with_lineage` → `GraphNamespacePublisher::publish` with a + `LineageIntent`). There is no second write to fail between — a graph commit and + its lineage land at one manifest version atomically, so a crash after the publish + leaves no gap. The commit-graph cache is a derived projection of those manifest + rows; nothing writes `_graph_commits.lance` (it persists only to carry branch + refs). The prior two-write gap (manifest at N with no `_graph_commits` row for N) + is gone by construction. A graph created before Phase 7 (internal schema v3) + carries its lineage only in `_graph_commits.lance`; the `migrate_v3_to_v4` + internal-schema step (`db/manifest/migrations.rs`) backfills it into `__manifest` + per-branch on the first read-write open (idempotent, crash-safe, data-preserving), + and a read-only open of an un-migrated v3 graph sources the DAG from + `_graph_commits.lance` via a stamp-gated transitional fallback so reads stay + correct until the first write migrates it. 
An old binary refuses a v4-stamped + graph (read-write and read-only) with the standard upgrade error. The migration + is **loud on failure and concurrent-runner idempotent**: the legacy-open read + (`read_legacy_commit_cache`) treats only a genuine not-found as "no legacy data" + and propagates any other open error (so a transient/corrupt open can never stamp + v4 over an empty backfill — orphaning lineage permanently), and the backfill + converges all-or-nothing when two runners open the same legacy graph at once — a + bounded re-open retry on the `graph_head:<branch>` row-level CAS plus an + idempotent terminal stamp bump (both runners write the same value, so a concurrent + `UpdateConfig`/`IncompatibleTransaction` loss re-opens and no-ops if the stamp + already landed). The branch read path (`load_commit_cache_for_branch`) also + refuses an out-of-range branch stamp (`> CURRENT` or `< MIN_SUPPORTED`; + defense-in-depth; not a live hole because migrations run main-first, so main + refuses first). The migration chain is **floor-bounded**: + `MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION` (migrations.rs; 1 today, a pure no-op) is + the oldest stamp this binary opens, enforced symmetrically with the ceiling by the + single `refuse_if_stamp_unsupported` guard at all three stamp-read sites + (write-path migrate, read-only open, branch lineage-read). Raising MIN sheds the + now-dead `migrate_vN_…` arms and (at MIN ≥ 4) the `commit_graph_legacy_v3` legacy + readers; a compile-time tripwire (`LOWEST_REGISTERED_MIGRATION_SOURCE`) fails the + build if the floor and the lowest registered arm drift. Retirement runbook lives on + the `MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION` doc-comment. - **Planner capability/stat surfaces:** cost-aware planning, complete capability advertisement, and explain-with-cost are roadmap. Do not describe them as implemented. @@ -291,19 +325,23 @@ them explicit. 
in history; but they are not yet brought into `cleanup` (version GC), so the `_versions/` chain still grows until an explicit cleanup (the cleanup half is deferred — it needs the Q8 cleanup-resurrection watermark first). The commit - graph is not yet reconcilable from the manifest; and the traversal id-map is + graph IS now reconcilable from the manifest (RFC-013 Phase 7 — it is a pure + projection of the `graph_commit`/`graph_head` rows); the traversal id-map is still rebuilt. -- **Commit-graph parent under concurrency:** `record_graph_commit` now refreshes - the commit-graph head from storage before appending, so a same-branch write - after an external commit no longer forks the commit DAG by parenting off a - stale cached head (the single-process fork, pre-existing for non-strict - inserts and widened to strict ops by Fix 1's `refresh_manifest_only`, is now - closed). Residual: two processes writing disjoint tables can still pass their - per-table manifest CAS and append off the same parent (a refresh-then-append - TOCTOU). The convergent fix is reconcile-from-manifest (parent = the commit at - the manifest version the publisher CAS'd against; `manifest_version` is on - every commit row), composing with the manifest-to-commit-graph atomicity gap; - it needs commit-graph append ordering or a Lance append-CAS to fully close. +- **Commit-graph parent under concurrency — CLOSED (RFC-013 Phase 7):** the graph + commit is now recorded in the manifest publish CAS, and the publisher resolves + the new commit's parent INSIDE its retry loop, per attempt, from the just-loaded + `__manifest` (the `should_replace_head` winner over the visible `graph_commit` + rows). A CAS-conflict retry re-reads the advanced head and parents correctly, so + the refresh-then-append TOCTOU is gone. 
Two processes writing disjoint tables on + the same branch now also contend on the shared `graph_head:<branch>` row (one + `object_id`, `WhenMatched::UpdateAll`): one wins, the other retries and re-parents + — so the cross-process disjoint-table fork is closed too. This is the intended + §7.1 contention point, pinned by + `manifest::tests::concurrent_disjoint_writes_share_head_and_form_linear_chain` + (two disjoint writers → both commit, single linear chain) and + `manifest::tests::n_concurrent_disjoint_writers_converge_to_one_linear_chain` + (N=8 disjoint writers with app-level retry → one linear chain of 8, no fork). ## Deny-list diff --git a/docs/dev/lance.md b/docs/dev/lance.md index 4c624b3..d0a5c31 100644 --- a/docs/dev/lance.md +++ b/docs/dev/lance.md @@ -170,6 +170,7 @@ Migration from Lance 6.0.1 → 7.0.0 landed in this cycle. **Arrow stayed 58, Da - **Native `DirectoryNamespace` no longer recognizes omnigraph's manifest-tracked tables** (`lance-namespace-impls` dir.rs ~L1310): `list/describe/create_table_version` route through `check_table_status`, which reports an omnigraph table absent → `TableNotFound`. The decoupling is *contingent on omnigraph's legacy boolean PK key*, not an unconditional v7 property: v7's namespace eagerly adds the new `lance-schema:unenforced-primary-key:position` key to any `__manifest` lacking it; that write hits the immutable-PK rule above (the boolean key already set the PK), so `ensure_manifest_table_up_to_date` errors and the namespace silently falls back to directory listing. omnigraph keeps the boolean key deliberately — Lance honors it permanently (maps to PK position 0), and one uniform on-disk format beats a new-vs-old split (existing graphs can't be re-keyed to the position key under that same immutability rule). 
omnigraph production never uses Lance's native namespace (its publisher writes `__manifest` directly via merge_insert; its own `namespace.rs` impls are custom), so this is test-only — the `test_directory_namespace_direct_publish_cannot_replace_native_omnigraph_write_path` surface guard was realigned to the v7 behavior (it now asserts the native namespace is fully decoupled, which only strengthens the guard's thesis). - **Still NOT fixed in 7.0.0:** vector-index two-phase (Lance #6666 open) — `create_vector_index` inline residual retained; blob-column compaction — `compact_files_still_fails_on_blob_columns` guard still red on a fix, `optimize` still skips blob tables behind `LANCE_SUPPORTS_BLOB_COMPACTION`. - **No Lance API surface omnigraph uses changed at *compile* time** (the only compile break was object_store) — but **two runtime behaviors did** (the unenforced-PK immutability and the native-namespace `TableNotFound`, above), each caught by the full engine test suite rather than the build. `CleanupPolicy`, `WriteParams` (apart from the `auto_cleanup` default), `CompactionOptions`, the namespace models (resolved via `lance-namespace-reqwest-client` 0.7.7, unchanged across the bump), `Operation`, `ManifestLocation`, and `MergeInsertBuilder` shapes are all stable. Lesson: a clean build is not a clean alignment — run `cargo test --workspace` before declaring a Lance bump done. 
+- **Two surface guards added by the v3→v4 migration-robustness follow-up** (not a Lance bump, but they pin Lance error surfaces the migration now classifies on): `dataset_open_missing_returns_not_found_variant` (a missing `Dataset::open` returns `DatasetNotFound`/`NotFound` — the legacy-open read in `db/commit_graph.rs::read_legacy_commit_cache` treats only those as "no legacy data" and propagates everything else) and `lance_error_incompatible_transaction_variant_exists` (a concurrent `UpdateConfig` stamp-bump loses with `IncompatibleTransaction` — `db/manifest/migrations.rs::commit_v4_stamp_idempotently` matches it to retry the benign same-value race). Re-run on a Lance bump like the others. Bump this date stanza on the next alignment pass. diff --git a/docs/dev/rfc-013-write-path-latency.md b/docs/dev/rfc-013-write-path-latency.md index 1954b01..53f6430 100644 --- a/docs/dev/rfc-013-write-path-latency.md +++ b/docs/dev/rfc-013-write-path-latency.md @@ -523,7 +523,10 @@ struct WriteTxn { branch: BranchRef, base: PinnedSnapshot, // {manifest_version, per-table (loc,version,e_tag), schema_hash, writer_epoch} session: Arc<Session>, // shared per-graph; warms metadata/index caches across opens - handles: HandleCache, // open-by-version; each table opened once, reused across stages + handles: HandleMap, // open the base once WITH session; thread the handle each + // commit RETURNS forward (HEAD walks N→N+1→N+2). NOT a + // version-keyed cache — HEAD moves, so a (table,version) key + // misses; reuse = forward the commit-return handle. [3b-validated] } // A typed, declarative publish plan — the COMPLETE "what", built before any HEAD moves. @@ -546,8 +549,17 @@ impl GraphPublishAuthority { Properties that make it optimal: -- **Stages take `&WriteTxn`/`&PublishPlan`, never storage** — re-resolution and - open-latest are *unrepresentable*. Invariants 2/3/15 hold by construction. 
+- **Stages take `&WriteTxn`/`&PublishPlan` for the BASE** — re-resolving the pinned + read base / open-latest for the pre-commit phase is unrepresentable; invariants 2/3/15 + hold for the base by construction. **Caveat [3b-validated]:** this is NOT "no + re-resolution anywhere." Three commit-boundary reads are irreducible correctness + machinery and MUST stay fresh: the commit-time `fresh_snapshot_for_branch` (cross-process + OCC), the live-HEAD drift probe (a concurrent writer may have moved HEAD since staging), + and the fork-authority reads (`classify_fork_ref` deliberately bypasses the cached base — + a pinned base there re-opens the "force-delete a live fork" bug). Model "pinned base for + the pre-commit phase + named fresh re-reads at the commit/fork boundary." The achievable + open count is **1 base open (with session) + 1 cheap `latest_version_id` probe + threaded + commit handles**, not literally one open. - **The recovery sidecar *is* the serialized `PublishPlan`.** Phase C and recovery both call `plan.apply()` — a merge that bumps tables A+B can never roll A forward and silently drop B. 
The diff --git a/docs/dev/testing.md b/docs/dev/testing.md index 6d2dab2..0c130da 100644 --- a/docs/dev/testing.md +++ b/docs/dev/testing.md @@ -47,7 +47,7 @@ The engine's `tests/` is the principal coverage surface; most graph-shaped behav | `validators.rs` | Schema constraint enforcement (enum, range, unique, cardinality) across JSONL, insert, update paths | | `policy_engine_chassis.rs` | Engine-layer Cedar enforcement (MR-722): allow + deny through every `_as` writer via the SDK directly — no HTTP — proving embedded and CLI callers hit the same gate as the server, with action × scope shapes matching `authorize_request` | | `maintenance.rs` | `optimize` (compaction), `repair` (explicit uncovered-drift publish), and `cleanup` (version GC): empty/idempotent/no-op edges, policy validation, head preservation; `optimize` publishes its own compaction (`optimize_publishes_compaction_to_manifest_so_schema_apply_succeeds`), skips pre-existing uncovered drift (`optimize_skips_preexisting_manifest_head_drift`), and refuses to run while a `__recovery` sidecar is pending (`optimize_defers_when_recovery_sidecar_is_pending`); `repair` previews/heals verified maintenance drift, refuses raw semantic drift without `--force`, and forced repair publishes only by explicit operator choice; the index reconciler (iss-848): `index_build_tolerates_null_vector_rows` (an untrainable Vector column defers instead of aborting the build, sibling indexes still build) and `optimize_materializes_index_declared_but_unbuilt` (optimize creates a declared-but-deferred index) | -| `failpoints.rs` | Failure-injection coverage (gated on `failpoints` feature). 
Includes the five per-writer Phase B → recovery integration tests (`recovery_rolls_forward_after_finalize_publisher_failure`, `schema_apply_phase_b_failure_recovered_on_next_open`, `branch_merge_phase_b_failure_recovered_on_next_open`, `ensure_indices_phase_b_failure_recovered_on_next_open`, `optimize_phase_b_failure_recovered_on_next_open`) and the write-entry in-process heal contract (the four `*_after_finalize_publisher_failure_heals_without_reopen` tests — load, mutation, schema apply, branch merge: a follow-up write on the same handle rolls a sidecar-covered residual forward without reopen/refresh) and the storage-fault matrix for the sidecar lifecycle (`recovery.sidecar_{write,delete,list}` / `recovery.record_audit` failpoints: Phase A put failure aborts with zero drift, Phase D delete failure is swallowed and healed by the next write, list failures are loud at heal and open, audit-append failures are retried to exactly one audit row; plus the bucket-gated `s3_load_recovers_after_publisher_failure_without_reopen`). | +| `failpoints.rs` | Failure-injection coverage (gated on `failpoints` feature). 
Includes the five per-writer Phase B → recovery integration tests (`recovery_rolls_forward_after_finalize_publisher_failure`, `schema_apply_phase_b_failure_recovered_on_next_open`, `branch_merge_phase_b_failure_recovered_on_next_open`, `ensure_indices_phase_b_failure_recovered_on_next_open`, `optimize_phase_b_failure_recovered_on_next_open`) and the write-entry in-process heal contract (the four `*_after_finalize_publisher_failure_heals_without_reopen` tests — load, mutation, schema apply, branch merge: a follow-up write on the same handle rolls a sidecar-covered residual forward without reopen/refresh) and the storage-fault matrix for the sidecar lifecycle (`recovery.sidecar_{write,delete,list}` / `recovery.record_audit` failpoints: Phase A put failure aborts with zero drift, Phase D delete failure is swallowed and healed by the next write, list failures are loud at heal and open, audit-append failures are retried to exactly one audit row; plus the bucket-gated `s3_load_recovers_after_publisher_failure_without_reopen`). Also the v3→v4 migration fault-injection test (`transient_legacy_open_failure_aborts_migration_without_stamping_v4`, `migration.v3_to_v4.legacy_open` failpoint): a transient legacy-open failure aborts the migration loudly and leaves it retryable (stamp stays v3, no partial backfill), never stamping v4 over an empty backfill. Also the v4 stamp-bump exhaustion regression (`v4_stamp_exhaustion_returns_retryable_contention`, `migration.v4_stamp.force_incompatible` failpoint): the stamp retry loop surfaces a retryable `RowLevelCasContention` on exhaustion, not a stringified `Lance`. And the convergence-idempotent roll-forward regression (`open_sweep_roll_forward_converges_when_manifest_advances_concurrently`: two concurrent open-sweeps race one sidecar at the `recovery.before_roll_forward_publish` rendezvous; the CAS loser must converge, not fail the open — iss-schema-apply-reopen-recovery-race). 
| | `recovery.rs` | Open-time recovery sweep — sidecar I/O, classifier dispatch (NoMovement / RolledPastExpected / UnexpectedAtP1 / UnexpectedMultistep / InvariantViolation), all-or-nothing decision, roll-forward via `ManifestBatchPublisher::publish`, roll-back via `Dataset::restore`, audit row in `_graph_commit_recoveries.lance`, `OpenMode::ReadOnly` skip path | | `composite_flow.rs` | Compositional/narrative end-to-end stories — multi-step flows that compose mechanics covered by other test files. Catches integration regressions where individual operations all pass their unit tests but their composition breaks (sequential merges, post-merge main writes, time-travel through merge DAG, reopen consistency over multi-merge histories, post-optimize and post-cleanup strict writes). | @@ -65,10 +65,12 @@ The engine's `tests/` is the principal coverage surface; most graph-shaped behav ## Failpoints (fault injection) -- Cargo feature: `failpoints = ["dep:fail", "fail/failpoints"]` (in `crates/omnigraph/Cargo.toml` **and** `crates/omnigraph-cluster/Cargo.toml`; the cluster feature does not enable the engine's). -- Wrappers: `crates/omnigraph/src/failpoints.rs` and `crates/omnigraph-cluster/src/failpoints.rs` expose `maybe_fail("name")` and `ScopedFailPoint` for tests. -- Call sites are inserted at sensitive transaction boundaries (branch create, graph publish commit, cluster apply's payload→state-write window, etc.). -- Activated tests: `crates/omnigraph/tests/failpoints.rs` and `crates/omnigraph-cluster/tests/failpoints.rs` (crash-mid-apply + state CAS race via `fail::cfg_callback`; integration binaries, never in-source — the fail registry is process-global). Run with `cargo test -p omnigraph-engine --features failpoints --test failpoints` / `cargo test -p omnigraph-cluster --features failpoints --test failpoints`. 
+- Cargo feature: `failpoints = ["dep:fail", "fail/failpoints"]` in `crates/omnigraph/Cargo.toml`; the cluster's `failpoints` feature additionally enables `omnigraph/failpoints` (`crates/omnigraph-cluster/Cargo.toml`), so the shared test guard is available to cluster tests. +- Wrappers: `crates/omnigraph/src/failpoints.rs` and `crates/omnigraph-cluster/src/failpoints.rs` each expose `maybe_fail("name")` (per-crate error type). The test-side config guard `ScopedFailPoint` (`new` for action strings, `with_callback` for callbacks; RAII `Drop` removes the point) lives **once** in the engine and is reused by both test binaries. +- **Names are compile-checked.** Every failpoint name is a `pub const` in `omnigraph::failpoints::names` (engine) / `omnigraph_cluster::failpoints::names` (cluster). Call sites and tests reference the constant, never a bare literal — a typo is a compile error, not a silently-never-firing point. Add a new failpoint by adding its const first. +- Call sites are inserted at sensitive transaction boundaries (branch create, graph publish commit, the recovery sweep's classify→roll-forward-publish window, cluster apply's payload→state-write window, etc.). +- **Serialize and rendezvous, never sleep.** The `fail` registry is process-global, so every failpoint test carries `#[serial]` (`serial_test`). For concurrent tests, use `helpers::failpoint::Rendezvous` (`tests/helpers/failpoint.rs`): `park_first(name)` parks the first thread to hit the point until `release()`, and `wait_until_reached().await` blocks on that condition (it doubles as a fired-assertion). Do not coordinate threads with fixed `sleep`s. +- Activated tests: `crates/omnigraph/tests/failpoints.rs` and `crates/omnigraph-cluster/tests/failpoints.rs` (integration binaries, never in-source — the fail registry is process-global). Run with `cargo test -p omnigraph-engine --features failpoints --test failpoints` / `cargo test -p omnigraph-cluster --features failpoints --test failpoints`. 
## RustFS / S3 integration diff --git a/docs/dev/writes.md b/docs/dev/writes.md index c4e174c..7239742 100644 --- a/docs/dev/writes.md +++ b/docs/dev/writes.md @@ -230,8 +230,9 @@ recovery sweep in `crates/omnigraph/src/db/manifest/recovery.rs`: rolled-back-to version (`manifest_pinned`); the manifest is published at the restore commit (`manifest_pinned + 1`, same content). - After a successful roll-forward or roll-back, an audit row is - recorded — `_graph_commits.lance` carries - a commit tagged `actor_id = "omnigraph:recovery"`, and a sibling + recorded — the graph commit lineage (the `graph_commit` rows in `__manifest` + since RFC-013 Phase 7) carries a commit tagged + `actor_id = "omnigraph:recovery"`, and a sibling `_graph_commit_recoveries.lance` row carries `recovery_kind`, `recovery_for_actor` (the original sidecar's actor), `operation_id`, per-table outcomes. Operators run `omnigraph commit list --filter @@ -336,20 +337,40 @@ actual }`. The HTTP server maps this to **409 Conflict** with body ## Audit -`actor_id` lands in `_graph_commits.lance` via `record_graph_commit` (no -intermediate run record). Audit history is queried via `omnigraph commit -list`. +`actor_id` lands in the graph commit lineage — the `graph_commit` rows in +`__manifest`, written in the publish CAS (RFC-013 Phase 7; previously +`_graph_commits.lance`). Audit history is queried via `omnigraph commit list`. ## Migration code -`db/manifest/migrations.rs` carries the v2→v3 internal-schema step (MR-770): -a one-time sweep that deletes legacy `__run__*` staging branches off -`__manifest`. It runs in `Omnigraph::open(ReadWrite)` (via -`manifest::migrate_on_open`, before the coordinator reads branch state) and -again on the publisher's write path; both are idempotent once the stamp is at -v3. 
Deleting the inert `_graph_runs.lance` / `_graph_run_actors.lance` dataset -*bytes* is still deferred — it needs a `StorageAdapter::delete_prefix` -primitive — but those bytes are invisible to graph-level state. +`db/manifest/migrations.rs` is the single place on-disk `__manifest` shape is +reconciled with what the binary expects, stepping the +`omnigraph:internal_schema_version` stamp forward one `match`-arm at a time. It +runs in `Omnigraph::open(ReadWrite)` (via `manifest::migrate_on_open`, before the +coordinator reads branch state) and again on the publisher's write path, so each +branch migrates on its first write; every step is idempotent under crash-retry +(work first, stamp bump last). + +- **v2→v3** (MR-770): a one-time sweep that deletes legacy `__run__*` staging + branches off `__manifest`. Deleting the inert `_graph_runs.lance` / + `_graph_run_actors.lance` dataset *bytes* is still deferred — it needs a + `StorageAdapter::delete_prefix` primitive — but those bytes are invisible to + graph-level state. +- **v3→v4** (RFC-013 Phase 7, `migrate_v3_to_v4`): backfills the graph lineage + from `_graph_commits.lance` into `__manifest` as `graph_commit` / `graph_head` + rows. A graph created before Phase 7 has its lineage only in + `_graph_commits.lance`; the new binary reads lineage from the `__manifest` + projection, so without this backfill it would see an empty commit DAG. The + backfill is per-branch (each branch migrates on its first write), idempotent + (keyed on `object_id`; a fast-path guard skips when `__manifest` already + carries `graph_commit` rows), and writes exactly one `graph_head:<branch>` row + for the actual head. `_graph_commits.lance` is left in place as the branch-ref + carrier — no commit row is written to it again.
While a graph is below v4, a + **read-only** open (which never writes, so never migrates) sources the commit + DAG from `_graph_commits.lance` via the stamp-gated transitional fallback in + `CommitGraph::open*`, so reads see correct history before the first write + migrates the graph. An old binary opening a v4-stamped graph is refused with an + "upgrade omnigraph" error in both read-write and read-only modes. ## Mid-query partial failure: closed by MR-794 diff --git a/docs/releases/v0.7.2.md b/docs/releases/v0.7.2.md new file mode 100644 index 0000000..ecf0acf --- /dev/null +++ b/docs/releases/v0.7.2.md @@ -0,0 +1,60 @@ +# Omnigraph v0.7.2 + +A patch release over v0.7.1: write-path latency reductions plus three +correctness fixes on the maintenance and recovery paths. No breaking changes, no +on-disk format change, and no migration — drop-in over v0.7.1. + +## Performance + +- **Write opens go direct, schema validates once (#288, #298).** Write opens + used to route through the per-table Lance namespace catalog, which re-opened + the dataset just to read its location and re-resolved the latest version on + every table open — an O(commit-depth) double resolution that dominated write + latency on object stores (~70%). Writes now open each touched data table + directly by its manifest-recorded location (Lance's O(1) version-hint path), + validate the schema contract once per write instead of ~4×, and open each + touched table once instead of 4×. + +- **`optimize` compacts the internal metadata tables (#291).** `optimize` + previously iterated only node/edge tables, so the internal `__manifest`, + `_graph_commits`, and `_graph_commit_actors` tables accumulated one fragment + per commit and were never compacted — making every write's metadata scan grow + with commit history. `optimize` now compacts all three, so a periodically + optimized long-lived graph keeps its per-write metadata scan flat in history. 
+ +## Fixes + +- **`optimize` survives a cross-process write race (#297).** A CLI `optimize` + racing a served write on the same table could fail: the in-process write queue + doesn't serialize across processes, so a concurrent insert/delete advancing the + manifest between optimize's compaction and its publish broke the strict + equality CAS. Optimize now reopens-and-replans on a genuine Lance conflict and + fast-forwards its publish monotonically, so a maintenance compaction never + fails a live write. Bounded retry; sustained contention surfaces a loud + conflict rather than dropping work. + +- **`optimize` is non-destructive on upgraded graphs (#291).** A graph created by + a pre-0.7.0 binary carries an on-by-default Lance auto-cleanup config; under it, + optimize's compaction commit could fire Lance's version-GC hook and prune + `__manifest`-pinned versions (breaking snapshots and time travel). Optimize now + strips any stale `lance.auto_cleanup.*` config off every table — data and + internal — before its HEAD-advancing commits, so compaction can never GC pinned + versions. + +- **Recovery converges instead of failing `open` under a concurrent manifest + advance (#296).** The open-time recovery sweep published its roll-forward at the + sidecar's pinned expected version; if another writer advanced the manifest + during the classify→publish window, the CAS failed and aborted the whole + `Omnigraph::open`. The sweep now treats roll-forward as "the manifest reflects + the sidecar's committed state," not "this sweep won the CAS": on a CAS loss it + re-reads the live manifest and, when the sidecar's intent is already satisfied, + records the recovery and deletes the sidecar idempotently — so a concurrent + advance no longer fails the open. (The destructive roll-back twin still defers + to a cross-process lease, as documented.) + +## Upgrade notes + +Drop-in over v0.7.1 — no configuration, schema, or data changes. Upgrade the +server and CLI together as usual. 
Graphs created on v0.7.1 read and write +identically on v0.7.2; the optimize non-destructive fix additionally protects +graphs created by pre-0.7.0 binaries from version GC during compaction. diff --git a/docs/releases/v0.8.0.md b/docs/releases/v0.8.0.md index 0153bc1..7a661b6 100644 --- a/docs/releases/v0.8.0.md +++ b/docs/releases/v0.8.0.md @@ -1,16 +1,23 @@ # Omnigraph v0.8.0 -v0.8.0 makes every served graph an **MCP (Model Context Protocol) server**. An -MCP-capable agent — Claude Code/Desktop, Cursor, the OpenAI Responses `mcp` tool, -and others — can connect to a graph and operate it directly: run reads and -mutations, load data, manage branches, browse commits, read the schema, and -invoke the graph's curated stored queries. The surface adds no new capability and -no new business logic; every tool delegates to the same engine/handler path the -REST routes use and is gated by the same Cedar policy. +v0.8.0 has two headline changes: -## Highlights +1. **Every served graph becomes an MCP (Model Context Protocol) server** — an + MCP-capable agent (Claude Code/Desktop, Cursor, the OpenAI Responses `mcp` + tool, and others) can connect to a graph and operate it directly. The surface + adds no new capability and no new business logic; every tool delegates to the + same engine/handler path the REST routes use and is gated by the same Cedar + policy. It is **additive**. +2. **Graph commit lineage moves into `__manifest`** (RFC-013 Phase 7), folded + into the publish CAS, via a one-time on-disk migration (internal schema + **v3 → v4**). This is the first internal-schema change since v0.4.0 and carries + an **upgrade-order requirement** — read the upgrade notes before rolling it out. -### MCP surface (`POST /graphs/{id}/mcp`) +## MCP surface (`POST /graphs/{id}/mcp`) + +An MCP-capable agent can connect to a graph and run reads and mutations, load +data, manage branches, browse commits, read the schema, and invoke the graph's +curated stored queries. 
- **One MCP endpoint per served graph**, mounted automatically by the cluster server — no separate flag. It is a stateless Streamable-HTTP transport: a @@ -78,8 +85,56 @@ carried in the query source: unsupported version is a `400`); `initialize` negotiates the version in its body and is exempt by design. +## Graph lineage now lives in `__manifest` (internal schema v4) + +The graph commit DAG (commits, parents, merge parents, per-branch heads, and the +authoring actor) is now stored in `__manifest` as `graph_commit` / `graph_head` +rows, written in the **same commit (CAS)** as the table-version rows of a graph +publish. Previously the lineage lived in a separate `_graph_commits.lance` +dataset written after the manifest commit, leaving a narrow window where a crash +could land a manifest version with no matching lineage row. Folding the lineage +into the publish closes that gap by construction: a graph commit and its lineage +now land atomically at one manifest version. The in-memory commit graph is a +projection of those manifest rows; `_graph_commits.lance` is retained only as a +carrier for Lance branch refs and no longer receives commit rows. + +This bumps the `__manifest` internal schema stamp from **v3 to v4**. + +### Existing graphs migrate seamlessly on first write + +A graph created by an earlier binary (internal schema v3) keeps its lineage in +`_graph_commits.lance` with none in `__manifest`. On the **first read-write +open**, Omnigraph backfills that lineage into `__manifest` (the `migrate_v3_to_v4` +internal-schema step) and bumps the stamp to v4. 
The migration: + +- is **per-branch** — each branch backfills on its first write; +- is **idempotent and crash-safe** — the stamp bump is the last step, and the + backfill is keyed on the commit id, so a crash mid-migration re-runs harmlessly + on the next open; +- **preserves all data** — every commit, parent, merge parent, actor, and head is + carried over; commit ids are stable, so existing references still resolve. + +No data is lost and no operator action is required beyond upgrading the binary. + +Before its first write migrates the graph, a **read-only** open of a v3 graph +(e.g. `omnigraph commit list`, NDJSON export) still reads correct history via a +transitional fallback that sources the commit DAG from `_graph_commits.lance` — +read-only opens never write, so they never migrate, but they never show an empty +history either. + ## Upgrade notes +- **Breaking: internal schema v4 — upgrade writer (and reader) binaries first.** + Internal schema v4 is a hard version gate. Once a graph has been opened for + write by a v0.8.0 binary, its `__manifest` is stamped v4, and an **older binary + will refuse to open it** — read-write *and* read-only — with an + `upgrade omnigraph before opening this graph` error rather than silently + misreading the new lineage. This is the standard forward-version protection + (same shape as the v1→v2 / v2→v3 steps), now enforced on the read-only path + too. Upgrade every writer (and reader) binary that touches a graph to v0.8.0 + before, or together with, the first write under the new version. A mixed fleet + where an old binary still writes the same graph is unsupported, as with any + internal-schema bump. 
- **`GET /graphs/{id}/queries` is now `invoke_query`-gated (was `read`).** The stored-query catalog uses the same authority as invocation and the MCP `tools/list` surface, so discovery and invocation agree ("see the menu iff you @@ -87,8 +142,9 @@ carried in the query source: `403` instead of a listing; in default-deny mode the endpoint returns `403` until an `invoke_query` rule is configured. This is the one observable REST behavior change in this release. -- Otherwise no breaking changes: the rest of the REST surface, CLI, cluster - config, and on-disk format are unchanged. The MCP endpoint is additive. +- **The MCP endpoint is additive.** Apart from the `GET /queries` gate change and + the v4 on-disk migration above, the REST surface, CLI, and cluster config are + unchanged. - **Pointing an agent at a graph:** configure your MCP client with the URL `https://<host>/graphs/<id>/mcp` and the same bearer token you use for REST. See [docs/user/operations/mcp.md](../user/operations/mcp.md) for the connect diff --git a/docs/user/concepts/storage.md b/docs/user/concepts/storage.md index 68bfbcc..e3d9ef1 100644 --- a/docs/user/concepts/storage.md +++ b/docs/user/concepts/storage.md @@ -20,13 +20,14 @@ OmniGraph is **not** a single Lance dataset; it is a *graph* of datasets coordin - **Layout**: - `nodes/{fnv1a64-hex(type_name)}` — one Lance dataset per node type - `edges/{fnv1a64-hex(edge_type_name)}` — one Lance dataset per edge type - - `__manifest/` — the catalog of all sub-tables and their published versions - - `_graph_commits.lance` / `_graph_commit_actors.lance` — the commit graph and its actor map + - `__manifest/` — the catalog of all sub-tables and their published versions, **and** the graph commit lineage (RFC-013 Phase 7) + - `_graph_commits.lance` / `_graph_commit_actors.lance` — legacy / branch-ref carriers.
Since RFC-013 Phase 7 the graph lineage lives in `__manifest` (`graph_commit` / `graph_head` rows, written in the publish CAS); `_graph_commits.lance` no longer receives commit rows, but is retained to carry the Lance branch refs that `create_branch` / `list_branches` / the `cleanup` orphan reconciler operate on. A graph created before Phase 7 (internal schema v3) keeps its lineage here until its first read-write open, which migrates it into `__manifest` via `migrate_v3_to_v4`. - (legacy `_graph_runs.lance` / `_graph_run_actors.lance` from pre-v0.4.0 graphs are inert; the run state machine was removed. The internal schema migration sweeps stale `__run__*` branches on first write-open; the inert dataset bytes themselves remain until a prefix-delete storage primitive lands) - **Manifest row schema** (`object_id, object_type, location, metadata, base_objects, table_key, table_version, table_branch, row_count`): - - `object_type` ∈ `table | table_version | table_tombstone` - - `table_key` ∈ `node:<type_name> | edge:<edge_type_name>` + - `object_type` ∈ `table | table_version | table_tombstone | graph_commit | graph_head` + - `table_key` ∈ `node:<type_name> | edge:<edge_type_name>` (empty for `graph_commit` / `graph_head` lineage rows) - `table_branch` is `null` for the main lineage and the branch name otherwise + - **Graph lineage rows** (RFC-013 Phase 7): one immutable `graph_commit` row per commit (`object_id` = the commit ULID; `metadata` JSON carries parent / merged-parent / actor / timestamp) plus one mutable `graph_head:<branch>` pointer per branch (`graph_head:main` for main). The in-memory commit DAG is a projection of these rows. - **Snapshot reconstruction**: latest visible `table_version` per `(table_key, table_branch)` minus tombstones — rows where `object_type = table_tombstone`, whose own `table_version` (acting as the tombstone version) is `>= the entry's table_version`. - **Atomic publish**: multi-dataset commits publish so that a single write to `__manifest` flips all the new sub-table versions visible at once.
- **Row-level CAS on the merge-insert join key**: `object_id` carries an unenforced-primary-key annotation so Lance's bloom-filter conflict resolver rejects two concurrent commits that land the same `object_id` row. Without this annotation, Lance's transparent rebase would admit silent duplicates from racing publishers. @@ -90,8 +91,8 @@ flowchart TB - **Graph root** is one directory (or S3 prefix). Everything below is part of one OmniGraph graph. - **`__manifest/`** is a Lance dataset whose rows describe which sub-table version is published at which graph-branch. Reading a snapshot starts here. - **`nodes/`** and **`edges/`** are sibling directories holding one Lance dataset per declared type. Names are `fnv1a64-hex` of the type name to keep paths fixed-length and case-safe. -- **`_graph_commits.lance`** is an L2 dataset that records the graph-level commit DAG, with a paired `_graph_commit_actors.lance` for the actor map. (Pre-v0.4.0 graphs also have inert `_graph_runs.lance` / `_graph_run_actors.lance` from the removed Run state machine; the internal schema migration sweeps their stale `__run__*` branches, and the dataset bytes are reclaimed once a prefix-delete primitive lands.) -- **`_graph_commit_recoveries.lance`** — one row per crash-recovery action. Joined to `_graph_commits.lance` by `graph_commit_id`; the linked commit row carries `actor_id=omnigraph:recovery`. Operators correlate recoveries with the original mutations they rolled forward / back via this join. +- **`_graph_commits.lance`** is an L2 dataset retained only as a branch-ref carrier (and, on a pre-Phase-7 graph, the migration source). Since RFC-013 Phase 7 the graph commit DAG lives in `__manifest` as `graph_commit` / `graph_head` rows written in the publish CAS — `_graph_commits.lance` and its paired `_graph_commit_actors.lance` no longer receive commit rows. 
A graph created before Phase 7 (internal schema v3) backfills its lineage into `__manifest` on its first read-write open (`migrate_v3_to_v4`). (Pre-v0.4.0 graphs also have inert `_graph_runs.lance` / `_graph_run_actors.lance` from the removed Run state machine; the internal schema migration sweeps their stale `__run__*` branches, and the dataset bytes are reclaimed once a prefix-delete primitive lands.) +- **`_graph_commit_recoveries.lance`** — one row per crash-recovery action. Joined by `graph_commit_id` to the graph commit lineage (the `graph_commit` rows in `__manifest` since RFC-013 Phase 7); the linked commit carries `actor_id=omnigraph:recovery`. Operators correlate recoveries with the original mutations they rolled forward / back via this join. - **`__recovery/{ulid}.json`** — transient sidecar files written by a writer before it advances the underlying dataset, deleted once the matching manifest publish succeeds. A sidecar persisting after process exit means the writer crashed mid-commit; the next read-write open processes it. Steady-state directory is empty. - **`_refs/branches/{name}.json`** is graph-level branch metadata — pointers from a branch name to the manifest version it heads. - **Inside each Lance dataset** (orange): the standard Lance directory layout. `_versions/{n}.manifest` records every commit; `data/` holds the actual Arrow fragments; `_indices/{uuid}/` holds index segments with their own `fragment_bitmap` for partial coverage; `_refs/` holds Lance-native per-dataset branches and tags. 
diff --git a/docs/user/reference/constants.md b/docs/user/reference/constants.md index 3da9a2b..0e9ee22 100644 --- a/docs/user/reference/constants.md +++ b/docs/user/reference/constants.md @@ -3,12 +3,12 @@ | Name | Value | Area | |---|---|---| | `MANIFEST_DIR` | `__manifest` | manifest layout | -| Commit graph dir | `_graph_commits.lance` | commit graph | +| Commit graph dir | `_graph_commits.lance` | branch-ref carrier + pre-v4 lineage source (lineage lives in `__manifest` since RFC-013 Phase 7) | | Run registry dir (legacy, removed) | `_graph_runs.lance` | inert post-v0.4.0; bytes remain until a prefix-delete primitive lands | | Run branch prefix (legacy, removed) | `__run__` | swept off `__manifest` by the internal schema migration; no longer a reserved name | | Schema apply lock | `__schema_apply_lock__` | schema apply | | Manifest publisher retry budget | `PUBLISHER_RETRY_BUDGET = 5` | manifest publish | -| Internal manifest schema version | `INTERNAL_MANIFEST_SCHEMA_VERSION = 3` | manifest migrations | +| Internal manifest schema version | `INTERNAL_MANIFEST_SCHEMA_VERSION = 4` | manifest migrations (v4 = graph lineage in `__manifest`, RFC-013 Phase 7) | | Merge stage batch | `MERGE_STAGE_BATCH_ROWS = 8192` | merge execution | | Maintenance concurrency | `OMNIGRAPH_MAINTENANCE_CONCURRENCY=8` | optimize/cleanup | | Lance blob compaction support | `LANCE_SUPPORTS_BLOB_COMPACTION = false` | optimize | diff --git a/openapi.json b/openapi.json index d624856..fbb1119 100644 --- a/openapi.json +++ b/openapi.json @@ -7,7 +7,7 @@ "name": "MIT", "identifier": "MIT" }, - "version": "0.7.1" + "version": "0.7.2" }, "paths": { "/graphs": {