Merge branch 'main' into ragnorc/omnigraph-mcp-crate

Folds in v0.7.2 (release #301) + RFC-013 Phase 7 (graph lineage in __manifest,
internal schema v3→v4 migration #299; WriteTxn #298; recovery convergence #296)
under the MCP branch.

Conflict resolutions (2 files):
- crates/omnigraph-server/Cargo.toml: take main's 0.7.2 path-dep constraints;
  keep our omnigraph-mcp dep (bumped to 0.7.2).
- docs/releases/v0.8.0.md (add/add): both branches drafted v0.8.0 notes for the
  same next minor — combined them. v0.8.0 now documents BOTH the MCP surface
  (ours) and main's __manifest lineage fold + the breaking internal-schema-v4
  upgrade-order requirement (kept prominent under Upgrade notes). Corrected our
  'no breaking changes / on-disk format unchanged' line, which the v4 migration
  makes false.

Coherence: omnigraph-mcp [package] + Cargo.lock bumped 0.7.1→0.7.2; openapi.json
auto-merged to info.version 0.7.2 (no API-surface drift from the incoming
engine-internal commits). Verification deferred to CI (no local rebuild).
This commit is contained in:
Ragnor Comerford 2026-06-25 15:53:53 +02:00
commit 4d4c2164de
No known key found for this signature in database
62 changed files with 5898 additions and 1053 deletions

View file

@ -1,6 +1,6 @@
[package]
name = "omnigraph-api-types"
version = "0.7.1"
version = "0.7.2"
edition = "2024"
description = "Shared HTTP wire DTOs for Omnigraph — request/response types and engine-result → DTO mappings used by both omnigraph-server and omnigraph-cli (RFC-009). Plain serde/utoipa types; no transport or server internals."
license = "MIT"
@ -9,8 +9,8 @@ homepage = "https://github.com/ModernRelay/omnigraph"
documentation = "https://docs.rs/omnigraph-api-types"
[dependencies]
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.1" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" }
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.2" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.2" }
serde = { workspace = true }
serde_json = { workspace = true }
utoipa = { workspace = true }

View file

@ -1,6 +1,6 @@
[package]
name = "omnigraph-cli"
version = "0.7.1"
version = "0.7.2"
edition = "2024"
description = "CLI for the Omnigraph graph database."
license = "MIT"
@ -13,12 +13,12 @@ name = "omnigraph"
path = "src/main.rs"
[dependencies]
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.1" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" }
omnigraph-api-types = { path = "../omnigraph-api-types", version = "0.7.1" }
omnigraph-cluster = { path = "../omnigraph-cluster", version = "0.7.1" }
omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.1" }
omnigraph-server = { path = "../omnigraph-server", version = "0.7.1" }
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.2" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.2" }
omnigraph-api-types = { path = "../omnigraph-api-types", version = "0.7.2" }
omnigraph-cluster = { path = "../omnigraph-cluster", version = "0.7.2" }
omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.2" }
omnigraph-server = { path = "../omnigraph-server", version = "0.7.2" }
clap = { workspace = true }
color-eyre = { workspace = true }
serde = { workspace = true }

View file

@ -1,6 +1,6 @@
[package]
name = "omnigraph-cluster"
version = "0.7.1"
version = "0.7.2"
edition = "2024"
description = "Cluster configuration validation, planning, and config-only apply for Omnigraph."
license = "MIT"
@ -14,8 +14,8 @@ documentation = "https://docs.rs/omnigraph-cluster"
failpoints = ["dep:fail", "fail/failpoints", "omnigraph/failpoints"]
[dependencies]
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" }
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.1" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.2" }
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.2" }
fail = { workspace = true, optional = true }
serde = { workspace = true }
serde_json = { workspace = true }
@ -30,5 +30,6 @@ tokio = { workspace = true }
ulid = { workspace = true }
[dev-dependencies]
serial_test = "3"
tempfile = { workspace = true }
tokio = { workspace = true }

View file

@ -474,7 +474,7 @@ pub(crate) async fn preview_schema_migration(
Ok(preview.plan)
}
struct LiveGraphObservation {
pub(crate) struct LiveGraphObservation {
manifest_version: u64,
schema_digest: String,
}
@ -494,7 +494,7 @@ pub(crate) async fn observe_live_graph(graph_uri: &str) -> Result<LiveGraphObser
})
}
struct GraphObservationJson<'a> {
pub(crate) struct GraphObservationJson<'a> {
address: &'a str,
graph_uri: &'a str,
observed_at: &'a str,
@ -949,7 +949,7 @@ pub(crate) fn validate_id(kind: &str, path: &str, value: &str, diagnostics: &mut
}
}
enum PolicyTarget {
pub(crate) enum PolicyTarget {
Cluster,
Graph(String),
WrongKind(String),

View file

@ -1,6 +1,13 @@
//! Fault-injection hooks for the cluster apply protocol, mirroring the
//! engine's `omnigraph::failpoints` pattern. With the `failpoints` feature
//! off, every call site compiles to `Ok(())`.
//!
//! Only `maybe_fail` and the [`names`] catalog live here — `maybe_fail`
//! returns the cluster's [`Diagnostic`] error type. The test-side
//! configuration guard is shared: use
//! [`omnigraph::failpoints::ScopedFailPoint`], which is registry-only
//! (error-type agnostic) and reachable because the cluster's `failpoints`
//! feature enables `omnigraph/failpoints`. One `ScopedFailPoint`, in the
//! lowest crate, avoids a drifting duplicate.
use crate::Diagnostic;
@ -19,38 +26,16 @@ pub(crate) fn maybe_fail(_name: &str) -> Result<(), Diagnostic> {
Ok(())
}
#[cfg(feature = "failpoints")]
pub struct ScopedFailPoint {
name: String,
}
#[cfg(feature = "failpoints")]
impl ScopedFailPoint {
pub fn new(name: &str, action: &str) -> Self {
fail::cfg(name, action).expect("configure failpoint");
Self {
name: name.to_string(),
}
}
/// Register a callback failpoint with the same Drop-based cleanup as
/// `new`. Without the guard, a panic while the point is active would
/// leak the callback into the process-global registry and fire it under
/// later tests in the same binary.
pub fn with_callback<F>(name: &str, callback: F) -> Self
where
F: Fn() + Send + Sync + 'static,
{
fail::cfg_callback(name, callback).expect("configure callback failpoint");
Self {
name: name.to_string(),
}
}
}
#[cfg(feature = "failpoints")]
impl Drop for ScopedFailPoint {
fn drop(&mut self) {
fail::remove(&self.name);
}
/// Compile-checked catalog of this crate's apply-protocol failpoint names.
/// Engine-scoped names referenced from cluster tests live in
/// [`omnigraph::failpoints::names`].
///
/// Each constant is the registry key handed to `maybe_fail` at the matching
/// crash point in `apply_config_dir_with_options`; tests arm a point by
/// passing the same constant to `ScopedFailPoint`, so a typo in a name is a
/// compile error rather than a failpoint that silently never fires.
pub mod names {
/// Crash point: the graph exists, the cluster state does not record it yet.
pub const CLUSTER_APPLY_AFTER_GRAPH_CREATE: &str = "cluster_apply.after_graph_create";
/// Crash point: the graph root is gone, the ledger does not record it yet.
pub const CLUSTER_APPLY_AFTER_GRAPH_DELETE: &str = "cluster_apply.after_graph_delete";
/// Crash point: payloads are on disk, state has not moved.
pub const CLUSTER_APPLY_AFTER_PAYLOAD_PHASE: &str = "cluster_apply.after_payload_phase";
/// Crash point: the manifest moved, the ledger does not record it yet.
pub const CLUSTER_APPLY_AFTER_SCHEMA_APPLY: &str = "cluster_apply.after_schema_apply";
/// Simulated crash before the graph init; the recovery sidecar stays for the sweep.
pub const CLUSTER_APPLY_BEFORE_GRAPH_CREATE: &str = "cluster_apply.before_graph_create";
/// Simulated crash before the graph root is removed.
pub const CLUSTER_APPLY_BEFORE_GRAPH_DELETE: &str = "cluster_apply.before_graph_delete";
/// Simulated crash before the engine schema-apply call; the sidecar stays.
pub const CLUSTER_APPLY_BEFORE_SCHEMA_APPLY: &str = "cluster_apply.before_schema_apply";
/// Checked immediately before `write_state`; a callback registered on this
/// point can mutate `state.json` to simulate a concurrent writer.
pub const CLUSTER_APPLY_BEFORE_STATE_WRITE: &str = "cluster_apply.before_state_write";
}

View file

@ -1,8 +1,6 @@
use std::collections::{BTreeMap, BTreeSet};
use std::fs::{self, OpenOptions};
use std::io::{ErrorKind, Write};
use std::fs::{self};
use std::path::{Path, PathBuf};
use std::process;
use omnigraph::db::{Omnigraph, ReadTarget, SchemaApplyOptions};
use omnigraph_compiler::SchemaMigrationPlan;
@ -26,11 +24,7 @@ mod store;
mod sweep;
mod types;
use config::{
QueriesDecl, future_field_diagnostics, graph_address, initial_import_state, load_desired,
normalize_policy_target, observe_declared_graphs, observe_live_graph, parse_cluster_config,
policy_address, preview_schema_migration, query_address, resolve_config_path,
resolve_query_decls, schema_address, state_resource_digests, validate_cluster_header,
validate_id, validate_query_source,
QueriesDecl, graph_address, initial_import_state, load_desired, observe_declared_graphs, parse_cluster_config, preview_schema_migration, schema_address, state_resource_digests, validate_cluster_header,
};
use diff::{
FailedGraphOrigin, ResourceKind, append_embedding_profile_changes,
@ -42,13 +36,12 @@ pub use serve::{
cluster_root_for_graph_uri, read_serving_snapshot, read_serving_snapshot_from_storage,
resolve_graph_storage_uri,
};
use store::{ClusterStore, StateLockGuard, StateSnapshot};
use store::ClusterStore;
use sweep::{
mark_approvals_consumed, record_approval_consumed, sweep_recovery_sidecars,
tombstone_graph_subtree, warn_pending_recovery_sidecars,
};
pub use types::*;
use types::*;
pub const CLUSTER_CONFIG_FILE: &str = "cluster.yaml";
pub const CLUSTER_GRAPHS_DIR: &str = "graphs";
@ -510,7 +503,7 @@ pub async fn apply_config_dir_with_options(
continue;
}
};
if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.before_graph_create") {
if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_BEFORE_GRAPH_CREATE) {
// Simulated crash before the init: the sidecar stays for the
// sweep (row 1: root absent -> intent removed next run).
diagnostics.push(diagnostic);
@ -587,7 +580,7 @@ pub async fn apply_config_dir_with_options(
// Crash point: the graph exists, the cluster state does not record it
// yet. A failure here must acknowledge nothing; the next run's sweep
// rolls the ledger forward (row 4).
if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.after_graph_create") {
if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_AFTER_GRAPH_CREATE) {
diagnostics.push(diagnostic);
return early_return(
display_path(&desired.config_dir),
@ -727,7 +720,7 @@ pub async fn apply_config_dir_with_options(
continue;
}
};
if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.before_schema_apply") {
if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_BEFORE_SCHEMA_APPLY) {
// Simulated crash before the engine call: the sidecar stays; the
// sweep retires it next run (ledger still consistent with live).
diagnostics.push(diagnostic);
@ -787,7 +780,7 @@ pub async fn apply_config_dir_with_options(
}
// Crash point: the manifest moved, the ledger does not record it yet.
// A failure here acknowledges nothing; the sweep rolls forward.
if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.after_schema_apply") {
if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_AFTER_SCHEMA_APPLY) {
diagnostics.push(diagnostic);
return early_return(
display_path(&desired.config_dir),
@ -872,7 +865,7 @@ pub async fn apply_config_dir_with_options(
// Crash point: payloads are on disk, state has not moved. A failure here
// must leave state.json byte-identical and acknowledge nothing; re-running
// apply repairs via the skip-if-exists blob reuse.
if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.after_payload_phase") {
if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_AFTER_PAYLOAD_PHASE) {
diagnostics.push(diagnostic);
return early_return(
display_path(&desired.config_dir),
@ -949,7 +942,7 @@ pub async fn apply_config_dir_with_options(
continue;
}
};
if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.before_graph_delete") {
if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_BEFORE_GRAPH_DELETE) {
// Simulated crash before removal: row 8 retires the intent and
// the still-valid approval lets a later run retry.
diagnostics.push(diagnostic);
@ -974,7 +967,7 @@ pub async fn apply_config_dir_with_options(
}
// Crash point: the root is gone, the ledger does not record it yet.
// The sweep rolls forward (row 7b) and consumes the approval.
if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.after_graph_delete") {
if let Err(diagnostic) = failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_AFTER_GRAPH_DELETE) {
diagnostics.push(diagnostic);
return early_return(
display_path(&desired.config_dir),
@ -1080,7 +1073,7 @@ pub async fn apply_config_dir_with_options(
// persisted-statuses revert contract below is exercised; a cfg_callback
// on this point can mutate state.json to simulate a concurrent writer,
// making write_state's CAS check fail organically.
let write_result = match failpoints::maybe_fail("cluster_apply.before_state_write") {
let write_result = match failpoints::maybe_fail(crate::failpoints::names::CLUSTER_APPLY_BEFORE_STATE_WRITE) {
Ok(()) => {
backend
.write_state(&new_state, expected_cas.as_deref(), &mut observations)

View file

@ -408,10 +408,6 @@ impl ClusterStore {
}
}
pub(crate) fn payload_display(&self, kind: &ResourceKind, digest: &str) -> Option<String> {
Self::payload_relative(kind, digest).map(|relative| self.display(&relative))
}
pub(crate) async fn payload_exists(&self, kind: &ResourceKind, digest: &str) -> bool {
let Some(relative) = Self::payload_relative(kind, digest) else {
return false;

View file

@ -13,9 +13,11 @@ use std::fs;
use std::path::{Path, PathBuf};
use fail::FailScenario;
use serial_test::serial;
use omnigraph::db::Omnigraph;
use omnigraph::failpoints::ScopedFailPoint as EngineScopedFailPoint;
use omnigraph_cluster::failpoints::ScopedFailPoint;
// One ScopedFailPoint for both engine- and cluster-scoped failpoint names:
// it is registry-only (error-type agnostic) and lives in the lowest crate.
use omnigraph::failpoints::ScopedFailPoint;
use omnigraph_cluster::{
ApplyOptions, apply_config_dir, apply_config_dir_with_options, approve_config_dir,
validate_config_dir,
@ -105,12 +107,13 @@ fn query_blob(config_dir: &Path, digests: &BTreeMap<String, String>) -> PathBuf
}
#[tokio::test]
#[serial]
async fn failpoint_wiring_returns_injected_diagnostic() {
let scenario = FailScenario::setup();
let dir = fixture();
seed_applyable_state(dir.path());
let _failpoint = ScopedFailPoint::new("cluster_apply.after_payload_phase", "return");
let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_AFTER_PAYLOAD_PHASE, "return");
let out = apply_config_dir(dir.path()).await;
assert!(!out.ok);
assert!(out.diagnostics.iter().any(|diagnostic| {
@ -127,6 +130,7 @@ async fn failpoint_wiring_returns_injected_diagnostic() {
/// state.json is byte-identical, nothing is acknowledged — and a plain re-run
/// repairs by trusting the existing content-addressed blobs.
#[tokio::test]
#[serial]
async fn apply_crash_after_payload_phase_leaves_state_unmoved_then_recovers() {
let scenario = FailScenario::setup();
let dir = fixture();
@ -134,7 +138,7 @@ async fn apply_crash_after_payload_phase_leaves_state_unmoved_then_recovers() {
let state_before = fs::read(state_path(dir.path())).unwrap();
{
let _failpoint = ScopedFailPoint::new("cluster_apply.after_payload_phase", "return");
let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_AFTER_PAYLOAD_PHASE, "return");
let out = apply_config_dir(dir.path()).await;
assert!(!out.ok);
assert!(!out.state_written);
@ -169,6 +173,7 @@ async fn apply_crash_after_payload_phase_leaves_state_unmoved_then_recovers() {
/// (possible under `state.lock: false`) must surface `state_cas_mismatch`,
/// acknowledge nothing, and leave the concurrent writer's state on disk.
#[tokio::test]
#[serial]
async fn apply_cas_race_surfaces_state_cas_mismatch() {
let scenario = FailScenario::setup();
let dir = fixture();
@ -179,7 +184,7 @@ async fn apply_cas_race_surfaces_state_cas_mismatch() {
// after apply read it but before apply writes. RAII-guarded so a panic
// inside apply cannot leak the callback into the global registry.
let race_path = state_path(dir.path());
let failpoint = ScopedFailPoint::with_callback("cluster_apply.before_state_write", move || {
let failpoint = ScopedFailPoint::with_callback(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_BEFORE_STATE_WRITE, move || {
let mut state: serde_json::Value =
serde_json::from_str(&fs::read_to_string(&race_path).unwrap()).unwrap();
state["state_revision"] = serde_json::json!(99);
@ -256,13 +261,14 @@ fn recovery_sidecars(config_dir: &Path) -> Vec<PathBuf> {
/// The next run's sweep removes the intent (row 1) and the same run creates
/// the graph and converges.
#[tokio::test]
#[serial]
async fn create_crash_before_init_recovers_via_sweep() {
let scenario = FailScenario::setup();
let dir = fixture();
seed_empty_state(dir.path());
{
let _failpoint = ScopedFailPoint::new("cluster_apply.before_graph_create", "return");
let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_BEFORE_GRAPH_CREATE, "return");
let out = apply_config_dir(dir.path()).await;
assert!(!out.ok);
assert!(out.diagnostics.iter().any(|diagnostic| {
@ -298,6 +304,7 @@ async fn create_crash_before_init_recovers_via_sweep() {
/// ledger is stale, nothing was acknowledged. The next run's sweep rolls the
/// ledger forward (row 4) with an audit entry, and the run converges.
#[tokio::test]
#[serial]
async fn create_crash_after_init_rolls_state_forward() {
let scenario = FailScenario::setup();
let dir = fixture();
@ -305,7 +312,7 @@ async fn create_crash_after_init_rolls_state_forward() {
let state_before = fs::read(dir.path().join("__cluster/state.json")).unwrap();
{
let _failpoint = ScopedFailPoint::new("cluster_apply.after_graph_create", "return");
let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_AFTER_GRAPH_CREATE, "return");
let out = apply_config_dir(dir.path()).await;
assert!(!out.ok);
assert!(!out.state_written);
@ -385,6 +392,7 @@ async fn live_schema_digest(dir: &Path) -> String {
/// live schema and ledger are untouched; the next run's sweep retires the
/// stale intent and the same run applies and converges.
#[tokio::test]
#[serial]
async fn schema_crash_before_apply_recovers_via_sweep() {
let scenario = FailScenario::setup();
let dir = fixture();
@ -393,7 +401,7 @@ async fn schema_crash_before_apply_recovers_via_sweep() {
fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap();
{
let _failpoint = ScopedFailPoint::new("cluster_apply.before_schema_apply", "return");
let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_BEFORE_SCHEMA_APPLY, "return");
let out = apply_config_dir_with_options(
dir.path(),
ApplyOptions {
@ -425,6 +433,7 @@ async fn schema_crash_before_apply_recovers_via_sweep() {
/// the graph manifest moves. The defensive cleanup proof should remove the
/// cluster sidecar immediately so a pre-movement error cannot brick boot.
#[tokio::test]
#[serial]
async fn schema_apply_error_before_graph_movement_removes_sidecar() {
let scenario = FailScenario::setup();
let dir = fixture();
@ -433,7 +442,7 @@ async fn schema_apply_error_before_graph_movement_removes_sidecar() {
fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap();
{
let _failpoint = EngineScopedFailPoint::new("schema_apply.before_staging_write", "return");
let _failpoint = ScopedFailPoint::new(omnigraph::failpoints::names::SCHEMA_APPLY_BEFORE_STAGING_WRITE, "return");
let out = apply_config_dir(dir.path()).await;
assert!(!out.ok);
assert!(
@ -462,6 +471,7 @@ async fn schema_apply_error_before_graph_movement_removes_sidecar() {
/// prove this is a pre-movement failure, so the sidecar must survive for
/// explicit recovery/quarantine instead of being cleaned up defensively.
#[tokio::test]
#[serial]
async fn schema_apply_error_after_graph_movement_keeps_sidecar() {
let scenario = FailScenario::setup();
let dir = fixture();
@ -472,7 +482,7 @@ async fn schema_apply_error_after_graph_movement_keeps_sidecar() {
let v2_digest = desired.resource_digests["schema.knowledge"].clone();
{
let _failpoint = EngineScopedFailPoint::new("schema_apply.after_manifest_commit", "return");
let _failpoint = ScopedFailPoint::new(omnigraph::failpoints::names::SCHEMA_APPLY_AFTER_MANIFEST_COMMIT, "return");
let out = apply_config_dir(dir.path()).await;
assert!(!out.ok);
assert!(
@ -524,6 +534,7 @@ async fn schema_apply_error_after_graph_movement_keeps_sidecar() {
/// moved, the ledger is stale, nothing acknowledged; the next run's sweep
/// rolls the ledger forward with an audit entry and the run converges.
#[tokio::test]
#[serial]
async fn schema_crash_after_apply_rolls_state_forward() {
let scenario = FailScenario::setup();
let dir = fixture();
@ -534,7 +545,7 @@ async fn schema_crash_after_apply_rolls_state_forward() {
let v2_digest = desired.resource_digests["schema.knowledge"].clone();
{
let _failpoint = ScopedFailPoint::new("cluster_apply.after_schema_apply", "return");
let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_AFTER_SCHEMA_APPLY, "return");
let out = apply_config_dir(dir.path()).await;
assert!(!out.ok);
assert!(!out.state_written);
@ -608,13 +619,14 @@ async fn seed_approved_delete(dir: &Path) -> String {
/// next run retires the stale intent (row 8) and the still-approved delete
/// completes in the same run.
#[tokio::test]
#[serial]
async fn delete_crash_before_removal_reproposes() {
let scenario = FailScenario::setup();
let dir = fixture();
let approval_id = seed_approved_delete(dir.path()).await;
{
let _failpoint = ScopedFailPoint::new("cluster_apply.before_graph_delete", "return");
let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_BEFORE_GRAPH_DELETE, "return");
let out = apply_config_dir(dir.path()).await;
assert!(!out.ok);
assert!(dir.path().join("graphs/old.omni").exists());
@ -650,6 +662,7 @@ async fn delete_crash_before_removal_reproposes() {
/// nothing acknowledged; the next run's sweep rolls the tombstone forward,
/// consumes the approval the sidecar carries, and audits the recovery.
#[tokio::test]
#[serial]
async fn delete_crash_after_removal_rolls_forward() {
let scenario = FailScenario::setup();
let dir = fixture();
@ -657,7 +670,7 @@ async fn delete_crash_after_removal_rolls_forward() {
let state_before = fs::read(state_path(dir.path())).unwrap();
{
let _failpoint = ScopedFailPoint::new("cluster_apply.after_graph_delete", "return");
let _failpoint = ScopedFailPoint::new(omnigraph_cluster::failpoints::names::CLUSTER_APPLY_AFTER_GRAPH_DELETE, "return");
let out = apply_config_dir(dir.path()).await;
assert!(!out.ok);
assert!(!out.state_written);

View file

@ -1,6 +1,6 @@
[package]
name = "omnigraph-compiler"
version = "0.7.1"
version = "0.7.2"
edition = "2024"
description = "Schema/query compiler for Omnigraph. Zero Lance dependency."
license = "MIT"

View file

@ -1,6 +1,6 @@
[package]
name = "omnigraph-mcp"
version = "0.7.1"
version = "0.7.2"
edition = "2024"
description = "MCP (Model Context Protocol) Streamable-HTTP transport and backend seam for Omnigraph. Contains the rmcp dependency and defines the McpBackend trait the server implements; names no omnigraph engine/server type, so the dependency edge is server → mcp."
license = "MIT"

View file

@ -1,6 +1,6 @@
[package]
name = "omnigraph-policy"
version = "0.7.1"
version = "0.7.2"
edition = "2024"
description = "Policy / authorization layer for Omnigraph — Cedar-backed PolicyEngine, PolicyChecker trait, ResourceScope enum."
license = "MIT"

View file

@ -1,6 +1,6 @@
[package]
name = "omnigraph-server"
version = "0.7.1"
version = "0.7.2"
edition = "2024"
description = "HTTP server for the Omnigraph graph database."
license = "MIT"
@ -19,14 +19,14 @@ default = []
aws = ["dep:aws-config", "dep:aws-sdk-secretsmanager"]
[dependencies]
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.1" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" }
omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.1" }
omnigraph-api-types = { path = "../omnigraph-api-types", version = "0.7.1" }
omnigraph-cluster = { path = "../omnigraph-cluster", version = "0.7.1" }
omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.2" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.2" }
omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.2" }
omnigraph-api-types = { path = "../omnigraph-api-types", version = "0.7.2" }
omnigraph-cluster = { path = "../omnigraph-cluster", version = "0.7.2" }
# The MCP surface. rmcp is contained to omnigraph-mcp — the server carries NO
# direct rmcp dependency (verify: `cargo tree -p omnigraph-server -e normal | grep rmcp`).
omnigraph-mcp = { path = "../omnigraph-mcp", version = "0.7.1" }
omnigraph-mcp = { path = "../omnigraph-mcp", version = "0.7.2" }
axum = { workspace = true }
http = "1"
clap = { workspace = true }

View file

@ -1,6 +1,6 @@
[package]
name = "omnigraph-engine"
version = "0.7.1"
version = "0.7.2"
edition = "2024"
description = "Runtime engine for the Omnigraph graph database."
license = "MIT"
@ -16,8 +16,8 @@ default = []
failpoints = ["dep:fail", "fail/failpoints"]
[dependencies]
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" }
omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.1" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.2" }
omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.2" }
lance = { workspace = true }
lance-datafusion = { workspace = true }
datafusion = { workspace = true }
@ -52,7 +52,7 @@ chrono = { workspace = true }
arc-swap = { workspace = true }
[dev-dependencies]
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.2" }
tokio = { workspace = true }
lance-namespace-impls = { workspace = true }
lance-io = "7.0.0"

View file

@ -1,6 +1,5 @@
use std::collections::{HashMap, VecDeque};
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use arrow_array::{
Array, RecordBatch, RecordBatchIterator, StringArray, TimestampMicrosecondArray, UInt64Array,
@ -29,7 +28,16 @@ pub struct GraphCommit {
pub struct CommitGraph {
root_uri: String,
dataset: Dataset,
/// Handle on `_graph_commits.lance` at the active branch, held only for the
/// branch-management WRITES (`create_branch`, formerly `version`) and
/// `refresh`. It is a DERIVED artifact (RFC-013 Phase 7): graph lineage lives
/// in `__manifest`, and reads (`head_commit`/`load_commits`/`get_commit`/
/// `merge_base`) never touch it. `None` means the branch's
/// `_graph_commits.lance` ref is missing (an interrupted fork-reclaim or a
/// `cleanup` race) while the manifest lineage is still authoritative — so the
/// READS stay correct and only a subsequent `create_branch` surfaces the loud
/// actionable error. Mirrors `actor_dataset`'s best-effort `Option`.
dataset: Option<Dataset>,
actor_dataset: Option<Dataset>,
active_branch: Option<String>,
actor_by_commit_id: HashMap<String, String>,
@ -38,20 +46,19 @@ pub struct CommitGraph {
}
impl CommitGraph {
pub async fn init(root_uri: &str, manifest_version: u64) -> Result<Self> {
/// Create the commit-graph datasets for a fresh graph. The genesis
/// `graph_commit` + `graph_head` rows live in `__manifest` (folded into the
/// init write — RFC-013 Phase 7), so `_graph_commits.lance` is created EMPTY
/// here: it exists only to carry the Lance branch refs that `create_branch` /
/// `list_branches` / the `cleanup` orphan reconciler operate on. No commit
/// rows are ever written to it. The in-memory cache is sourced from the
/// manifest projection — the same path as [`open`], so genesis is seen
/// identically whether the graph was just initialized or reopened.
pub async fn init(root_uri: &str) -> Result<Self> {
let root = root_uri.trim_end_matches('/');
let uri = graph_commits_uri(root);
let genesis = GraphCommit {
graph_commit_id: ulid::Ulid::new().to_string(),
manifest_branch: None,
manifest_version,
parent_commit_id: None,
merged_parent_commit_id: None,
actor_id: None,
created_at: now_micros()?,
};
let batch = commits_to_batch(&[genesis.clone()])?;
let batch = RecordBatch::new_empty(commit_graph_schema());
let reader = RecordBatchIterator::new(vec![Ok(batch)], commit_graph_schema());
let params = WriteParams {
mode: WriteMode::Create,
@ -66,17 +73,30 @@ impl CommitGraph {
.map_err(|e| OmniError::Lance(e.to_string()))?;
let actor_dataset = create_commit_actor_dataset(root).await?;
let (commit_by_id, head_commit) = load_commit_cache_from_manifest(root, None).await?;
Ok(Self {
root_uri: root.to_string(),
dataset,
dataset: Some(dataset),
actor_dataset: Some(actor_dataset),
active_branch: None,
actor_by_commit_id: HashMap::new(),
commit_by_id: HashMap::from([(genesis.graph_commit_id.clone(), genesis.clone())]),
head_commit: Some(genesis),
commit_by_id,
head_commit,
})
}
/// Record a just-published commit in this handle's in-memory cache
/// (RFC-013 Phase 7).
///
/// This is cache maintenance only and performs no storage I/O: the durable
/// write already happened in the manifest publish CAS. The commit becomes the
/// cached head when `should_replace_head` prefers it over the current head,
/// which is the same head-selection rule the manifest-sourced load applies,
/// so same-handle reads agree with a fresh reopen.
pub fn insert_committed(&mut self, commit: GraphCommit) {
    // Decide head promotion against the current head before touching the map.
    let promote = should_replace_head(self.head_commit.as_ref(), &commit);
    let commit_id = commit.graph_commit_id.clone();
    if promote {
        self.head_commit = Some(commit.clone());
    }
    // The map takes ownership of the commit; an existing entry with the same
    // id is overwritten.
    self.commit_by_id.insert(commit_id, commit);
}
pub async fn open(root_uri: &str) -> Result<Self> {
let root = root_uri.trim_end_matches('/');
let wrapper = crate::instrumentation::commit_graph_wrapper();
@ -87,17 +107,24 @@ impl CommitGraph {
crate::instrumentation::open_dataset_tracked(&graph_commit_actors_uri(root), wrapper)
.await
.ok();
let actor_by_commit_id = match &actor_dataset {
Some(dataset) => load_commit_actor_cache(dataset).await?,
None => HashMap::new(),
};
let (commit_by_id, head_commit) = load_commit_cache(&dataset, &actor_by_commit_id).await?;
// RFC-013 step 4: source the in-memory cache from the `__manifest`
// lineage projection (which carries the actor inline), not from
// `_graph_commits.lance`. The dataset handles above are retained for the
// branch-management ops (create/delete/list/version) that still target
// the commit-graph dataset; the actor dataset is only kept for the
// dual-write append path. The projection-equivalence gate proves this
// cache equals the prior `_graph_commits.lance` read. A pre-Phase-7 (v3)
// graph not yet migrated falls back to the legacy read — see
// `load_commit_cache_for_branch`.
let (commit_by_id, head_commit) = load_commit_cache_for_branch(root, None).await?;
Ok(Self {
root_uri: root.to_string(),
dataset,
// `open` targets main and never checks out a branch (main cannot be
// deleted/recreated), so the handle is always present here.
dataset: Some(dataset),
actor_dataset,
active_branch: None,
actor_by_commit_id,
actor_by_commit_id: HashMap::new(),
commit_by_id,
head_commit,
})
@ -109,25 +136,33 @@ impl CommitGraph {
let dataset =
crate::instrumentation::open_dataset_tracked(&graph_commits_uri(root), wrapper.clone())
.await?;
let dataset = dataset
.checkout_branch(branch)
.await
.map_err(|e| OmniError::Lance(e.to_string()))?;
// Best-effort checkout of the DERIVED `_graph_commits.lance` branch ref.
// It is held only for `create_branch` (a write); the lineage READ below
// comes from `__manifest`. A missing ref (interrupted fork-reclaim /
// `cleanup` race) must not wedge the read, so a typed not-found yields a
// `None` handle — a subsequent `create_branch` then surfaces the loud
// error. Any OTHER open error (transient IO / corrupt) still propagates,
// matching the `force_delete_branch` / `read_legacy_commit_cache` idiom.
let dataset = match dataset.checkout_branch(branch).await {
Ok(ds) => Some(ds),
Err(lance::Error::RefNotFound { .. }) | Err(lance::Error::NotFound { .. }) => None,
Err(e) => return Err(OmniError::Lance(e.to_string())),
};
let actor_dataset =
crate::instrumentation::open_dataset_tracked(&graph_commit_actors_uri(root), wrapper)
.await
.ok();
let actor_by_commit_id = match &actor_dataset {
Some(dataset) => load_commit_actor_cache(dataset).await?,
None => HashMap::new(),
};
let (commit_by_id, head_commit) = load_commit_cache(&dataset, &actor_by_commit_id).await?;
// Hard `?`: the manifest existence gate. `load_commit_cache_for_branch`
// opens the branch's `__manifest` (its own `checkout_branch` on the
// authoritative table), so a TRULY absent branch still fails loudly here —
// only the derived `_graph_commits.lance` ref is allowed to be missing.
let (commit_by_id, head_commit) = load_commit_cache_for_branch(root, Some(branch)).await?;
Ok(Self {
root_uri: root.to_string(),
dataset,
actor_dataset,
active_branch: Some(branch.to_string()),
actor_by_commit_id,
actor_by_commit_id: HashMap::new(),
commit_by_id,
head_commit,
})
@ -136,40 +171,49 @@ impl CommitGraph {
pub async fn refresh(&mut self) -> Result<()> {
let root = self.root_uri.clone();
let wrapper = crate::instrumentation::commit_graph_wrapper();
self.dataset = crate::instrumentation::open_dataset_tracked(
let dataset = crate::instrumentation::open_dataset_tracked(
&graph_commits_uri(&root),
wrapper.clone(),
)
.await?;
if let Some(branch) = &self.active_branch {
self.dataset = self
.dataset
.checkout_branch(branch)
.await
.map_err(|e| OmniError::Lance(e.to_string()))?;
}
// Same best-effort checkout as `open_at_branch`: a missing DERIVED branch
// ref leaves the handle `None` (only `create_branch` then errors), while
// the in-memory cache re-syncs from the authoritative manifest below.
self.dataset = match &self.active_branch {
Some(branch) => match dataset.checkout_branch(branch).await {
Ok(ds) => Some(ds),
Err(lance::Error::RefNotFound { .. }) | Err(lance::Error::NotFound { .. }) => None,
Err(e) => return Err(OmniError::Lance(e.to_string())),
},
None => Some(dataset),
};
self.actor_dataset =
crate::instrumentation::open_dataset_tracked(&graph_commit_actors_uri(&root), wrapper)
.await
.ok();
self.actor_by_commit_id = match &self.actor_dataset {
Some(dataset) => load_commit_actor_cache(dataset).await?,
None => HashMap::new(),
};
let (commit_by_id, head_commit) =
load_commit_cache(&self.dataset, &self.actor_by_commit_id).await?;
load_commit_cache_for_branch(&root, self.active_branch.as_deref()).await?;
self.commit_by_id = commit_by_id;
self.head_commit = head_commit;
Ok(())
}
pub fn version(&self) -> u64 {
self.dataset.version().version
}
pub async fn create_branch(&mut self, name: &str) -> Result<()> {
let mut ds = self.dataset.clone();
ds.create_branch(name, self.version(), None)
// The held `_graph_commits.lance` handle is the only thing that can fork a
// branch ref. If it is missing (an interrupted fork-reclaim or a `cleanup`
// race dropped the derived ref while manifest lineage stayed authoritative),
// fail loudly + actionably rather than silently. Repair is the existing
// `cleanup` orphan reconciler (`reconcile_commit_graph_orphans`), not an
// inline write on this path.
let Some(dataset) = &self.dataset else {
let branch = self.active_branch.as_deref().unwrap_or("main");
return Err(OmniError::manifest_internal(format!(
"commit-graph branch ref for '{branch}' is missing; run `omnigraph cleanup` then retry"
)));
};
let version = dataset.version().version;
let mut ds = dataset.clone();
ds.create_branch(name, version, None)
.await
.map_err(|e| OmniError::Lance(e.to_string()))?;
Ok(())
@ -216,7 +260,17 @@ impl CommitGraph {
Ok(branches.into_keys().collect())
}
pub async fn append_commit(
// DEAD as of RFC-013 Phase 7: graph commits are recorded in `__manifest`
// (folded into the publish CAS), never appended to `_graph_commits.lance`.
// These append helpers have no caller on any write path. They are retained for
// two reasons: the transitional v3 fixtures (`seed_legacy_v3_lineage*`) use
// them to write legacy rows, and the actor sidecar table they touch is still
// enumerated by `optimize` (internal-table compaction). The single-source
// invariant is guarded by `tests/lineage_projection.rs`, which fails if
// `_graph_commits.lance` ever gains a commit row. Do NOT call these to record a
// commit — use the coordinator's `commit_*_with_actor` /
// `commit_merge_with_actor`, which carry the lineage intent into the manifest
// publish.
#[allow(dead_code)]
async fn append_commit(
&mut self,
manifest_branch: Option<&str>,
manifest_version: u64,
@ -233,7 +287,8 @@ impl CommitGraph {
.await
}
pub async fn append_merge_commit(
#[allow(dead_code)]
async fn append_merge_commit(
&mut self,
manifest_branch: Option<&str>,
manifest_version: u64,
@ -251,6 +306,7 @@ impl CommitGraph {
.await
}
#[allow(dead_code)]
async fn append_commit_with_parents(
&mut self,
manifest_branch: Option<&str>,
@ -267,16 +323,22 @@ impl CommitGraph {
parent_commit_id: parent_commit_id.map(|s| s.to_string()),
merged_parent_commit_id: merged_parent_commit_id.map(|s| s.to_string()),
actor_id: actor_id.map(str::to_string),
created_at: now_micros()?,
created_at: crate::db::now_micros()?,
};
let batch = commits_to_batch(&[commit.clone()])?;
let reader = RecordBatchIterator::new(vec![Ok(batch)], commit_graph_schema());
let mut ds = self.dataset.clone();
// This helper is dead on every write path (RFC-013 Phase 7) — reached only
// by the transitional v3 fixtures, which always hold the commits dataset.
// A `None` here would be a fixture bug, so fail loudly rather than silently.
let mut ds = self
.dataset
.clone()
.ok_or_else(|| OmniError::manifest_internal("commit-graph dataset is missing"))?;
ds.append(reader, None)
.await
.map_err(|e| OmniError::Lance(e.to_string()))?;
self.dataset = ds;
self.dataset = Some(ds);
if let Some(actor_id) = actor_id {
self.append_actor(&graph_commit_id, actor_id).await?;
}
@ -289,6 +351,7 @@ impl CommitGraph {
Ok(graph_commit_id)
}
#[allow(dead_code)] // RFC-013 Phase 7: dead — see `append_commit`.
async fn append_actor(&mut self, graph_commit_id: &str, actor_id: &str) -> Result<()> {
if self
.actor_by_commit_id
@ -301,7 +364,7 @@ impl CommitGraph {
let record = CommitActorRecord {
graph_commit_id: graph_commit_id.to_string(),
actor_id: actor_id.to_string(),
created_at: now_micros()?,
created_at: crate::db::now_micros()?,
};
let batch = commit_actors_to_batch(&[record])?;
let reader = RecordBatchIterator::new(vec![Ok(batch)], commit_actor_schema());
@ -452,7 +515,12 @@ async fn create_commit_actor_dataset(root_uri: &str) -> Result<Dataset> {
};
match Dataset::write(reader, &uri as &str, Some(params)).await {
Ok(dataset) => Ok(dataset),
Err(err) if err.to_string().contains("Dataset already exists") => Dataset::open(&uri)
// Create-or-open idempotency: a concurrent/prior create raced us. Match
// the typed `DatasetAlreadyExists` variant, not the display string — the
// message is not a Lance API contract (a wording change would silently
// break this fallback). Pinned by
// `lance_surface_guards.rs::lance_error_dataset_already_exists_variant_exists`.
Err(lance::Error::DatasetAlreadyExists { .. }) => Dataset::open(&uri)
.await
.map_err(|open_err| OmniError::Lance(open_err.to_string())),
Err(err) => Err(OmniError::Lance(err.to_string())),
@ -490,6 +558,156 @@ fn commits_to_batch(commits: &[GraphCommit]) -> Result<RecordBatch> {
.map_err(|e| OmniError::Lance(e.to_string()))
}
/// Load the in-memory commit cache for `branch`, picking the lineage source from
/// the branch manifest's internal-schema stamp (RFC-013 step 4 forward/back-compat).
///
/// - Stamp at the current internal schema version (v4, post-Phase-7 — the normal
///   case): read the `__manifest` lineage projection, i.e. the
///   `graph_commit`/`graph_head` rows folded into the publish CAS.
/// - Stamp below it (a pre-Phase-7 graph that has not been migrated): read the
///   legacy `_graph_commits.lance`. This **transitional v3 fallback** exists so a
///   READ-ONLY open of an un-migrated graph still sees its history: a read-only
///   open must not write, so it never runs the v3→v4 backfill, and `__manifest`
///   would otherwise yield an empty DAG. A read-write open backfills `__manifest`
///   on its first write and takes the projection path from then on.
///
/// Both sources choose the head via `should_replace_head`, so the resulting cache
/// is the same whichever path runs. The fallback can be deleted once no graph
/// below internal-schema v4 remains.
///
/// # Errors
///
/// Fails if the stamp cannot be read, if the stamp is one this binary refuses to
/// serve, or if the selected reader fails.
async fn load_commit_cache_for_branch(
    root_uri: &str,
    branch: Option<&str>,
) -> Result<(HashMap<String, GraphCommit>, Option<GraphCommit>)> {
    let schema_stamp = crate::db::manifest::internal_schema_stamp_at(root_uri, branch).await?;

    // Defense-in-depth, mirroring the main read path
    // (`refuse_if_internal_schema_unsupported`): reject a stamp newer than
    // CURRENT (a newer binary wrote a shape the projection below would misread)
    // or older than MIN_SUPPORTED (it predates the legacy readers this binary
    // still ships). This is not reachable today, because migrations run
    // main-first — `migrate_on_open` migrates main and each branch migrates on
    // its own first write — so main's stamp bounds every branch's and the main
    // read path refuses first. The check keeps the gap closed should that
    // ordering ever be relaxed.
    crate::db::manifest::refuse_if_stamp_unsupported(schema_stamp)?;

    let is_unmigrated = schema_stamp < crate::db::manifest::INTERNAL_MANIFEST_SCHEMA_VERSION;
    if is_unmigrated {
        // Transitional: un-migrated v3 graph. Lineage still lives in the legacy
        // `_graph_commits.lance`, so read it there (read-only opens included).
        read_legacy_commit_cache(root_uri, branch).await
    } else {
        load_commit_cache_from_manifest(root_uri, branch).await
    }
}
/// Populate the in-memory commit cache from the `__manifest` graph-lineage
/// projection (RFC-013 step 4) instead of `_graph_commits.lance`.
///
/// Each lineage row already carries its actor, so no actor-table read happens
/// here. The head is chosen with `should_replace_head`, exactly as
/// [`load_commit_cache`] does, which keeps the result equivalent to the earlier
/// `_graph_commits.lance` read.
///
/// Returns `(commit_by_id, head_commit)`; the head is `None` when the projection
/// has no rows. The head rows returned alongside the lineage are not consulted.
async fn load_commit_cache_from_manifest(
    root_uri: &str,
    branch: Option<&str>,
) -> Result<(HashMap<String, GraphCommit>, Option<GraphCommit>)> {
    let (lineage_rows, _graph_heads) =
        crate::db::manifest::ManifestCoordinator::read_graph_lineage_at(root_uri, branch).await?;

    let mut cache = HashMap::with_capacity(lineage_rows.len());
    let mut head = None;

    lineage_rows
        .into_iter()
        .map(|row| GraphCommit {
            graph_commit_id: row.graph_commit_id,
            manifest_branch: row.manifest_branch,
            manifest_version: row.manifest_version,
            parent_commit_id: row.parent_commit_id,
            merged_parent_commit_id: row.merged_parent_commit_id,
            actor_id: row.actor_id,
            created_at: row.created_at,
        })
        .for_each(|commit| {
            // Rows are visited in projection order; the head candidate is
            // compared before the commit is moved into the map.
            if should_replace_head(head.as_ref(), &commit) {
                head = Some(commit.clone());
            }
            cache.insert(commit.graph_commit_id.clone(), commit);
        });

    Ok((cache, head))
}
/// Load `branch`'s lineage from the legacy `_graph_commits.lance` (plus its actor
/// sidecar) into an in-memory cache. This is the transitional source for graphs
/// that have not reached internal-schema v4 (RFC-013 step 4).
///
/// There are two callers, both transitional: the v3→v4 migration backfill, which
/// copies these rows into `__manifest`, and the read-only v3 fallback used by
/// `CommitGraph::open*`.
///
/// Returns `(commit_by_id, head)`. The head is chosen by `should_replace_head`,
/// the same rule the manifest projection uses.
///
/// # Errors
///
/// A commits dataset or actor sidecar that is genuinely ABSENT (not-found) is not
/// an error: it produces an empty cache / empty actor map. Every OTHER open
/// failure (transient IO, corrupt file, …) is returned. Treating such a failure
/// as "empty" would let the v3→v4 migration backfill nothing and still stamp v4,
/// permanently orphaning the real lineage.
///
/// The legacy readers must stay while any v3 graph survives; they can retire once
/// none remains below v4.
pub(crate) async fn read_legacy_commit_cache(
    root_uri: &str,
    branch: Option<&str>,
) -> Result<(HashMap<String, GraphCommit>, Option<GraphCommit>)> {
    let base = root_uri.trim_end_matches('/');
    let legacy_commits_uri = graph_commits_uri(base);

    // The failpoint is consulted first; the real open only happens when it does
    // not inject an error.
    let commits_attempt =
        match crate::failpoints::maybe_fail_lance_open("migration.v3_to_v4.legacy_open") {
            Err(injected) => Err(injected),
            Ok(()) => Dataset::open(&legacy_commits_uri).await,
        };
    let commits_on_main = match commits_attempt {
        // Not-found is the legitimate "no legacy data" signal: a graph with no
        // `_graph_commits.lance` yields an empty cache. Lance maps an
        // object-store NotFound to `DatasetNotFound`, so matching the variant
        // (rather than probing for existence) is exact — pinned by
        // `lance_surface_guards.rs::dataset_open_missing_returns_not_found_variant`.
        Err(lance::Error::DatasetNotFound { .. }) | Err(lance::Error::NotFound { .. }) => {
            return Ok((HashMap::new(), None));
        }
        // Anything else must surface. The v3→v4 migration calls this once before
        // stamping v4 and never re-runs; swallowing the error would backfill
        // nothing, stamp v4 anyway, and disable the v3 fallback for good.
        Err(other) => return Err(OmniError::Lance(other.to_string())),
        Ok(opened) => opened,
    };

    // Only a named, non-main branch needs a checkout on the commits dataset.
    let commits = match branch {
        Some(name) if name != "main" => commits_on_main
            .checkout_branch(name)
            .await
            .map_err(|e| OmniError::Lance(e.to_string()))?,
        _ => commits_on_main,
    };

    // The actor sidecar is opened FLAT — never checked out to a branch. The
    // pre-Phase-7 commit graph did not fork the actor dataset; actors are keyed
    // globally by `graph_commit_id`, so branch commits resolve their actor from
    // the one shared table. The live `CommitGraph::open_at_branch` does the same:
    // it opens the actor dataset on main and checks out only the commits dataset.
    let actors_attempt =
        match crate::failpoints::maybe_fail_lance_open("migration.v3_to_v4.legacy_open") {
            Err(injected) => Err(injected),
            Ok(()) => Dataset::open(&graph_commit_actors_uri(base)).await,
        };
    let actors = match actors_attempt {
        // A missing sidecar is normal (older graphs without authored commits);
        // every commit then keeps a `None` actor.
        Err(lance::Error::DatasetNotFound { .. }) | Err(lance::Error::NotFound { .. }) => {
            HashMap::new()
        }
        // A corrupt/transient open is NOT "no authors": silently dropping all
        // authorship and then stamping v4 is the same permanent-loss hole as the
        // commits arm above, so it propagates.
        Err(other) => return Err(OmniError::Lance(other.to_string())),
        Ok(sidecar) => load_commit_actor_cache(&sidecar).await?,
    };

    load_commit_cache(&commits, &actors).await
}
async fn load_commit_cache(
dataset: &Dataset,
actor_by_commit_id: &HashMap<String, String>,
@ -694,11 +912,170 @@ async fn open_for_branch(root_uri: &str, branch: Option<&str>) -> Result<CommitG
}
}
fn now_micros() -> Result<i64> {
let duration = SystemTime::now()
.duration_since(UNIX_EPOCH)
.map_err(|e| OmniError::manifest(format!("system clock before UNIX_EPOCH: {}", e)))?;
Ok(duration.as_micros() as i64)
/// Identities of the commits written into a synthetic pre-Phase-7 (v3) graph by
/// [`seed_legacy_v3_lineage`], for assertions after migration.
///
/// All ids are the values returned by the legacy append helper while seeding;
/// the field docs below describe how each commit was appended.
//
// Gated on `test` OR the `failpoints` feature: the v3→v4 migration fault-injection
// test lives in the `failpoints` integration binary (the fail registry is
// process-global, so failpoint tests must not run in-source), and that binary
// compiles the crate without `cfg(test)` — so it needs this fixture under the
// feature too. Still excluded from release builds.
#[cfg(any(test, feature = "failpoints"))]
#[derive(Debug, Clone)]
pub struct V3LineageFixture {
/// The genesis (parentless) commit id. Appended on main at manifest
/// version 1 with no actor.
pub genesis: String,
/// A direct, authored commit on main (actor `act-a`), parented on
/// `genesis` at manifest version 2.
pub commit_a: String,
/// A commit tagged to the `feature` branch (actor `act-feature`), parented
/// on `commit_a` at manifest version 3.
pub feature_commit: String,
/// The merge commit on main: parent = `commit_a`, merged_parent =
/// `feature_commit`, actor `act-merger`, manifest version 4. This is the
/// head of main.
pub merge_commit: String,
/// Every commit id written, in append order (for count assertions):
/// `genesis`, `commit_a`, `feature_commit`, `merge_commit`.
pub all_ids: Vec<String>,
}
/// Lay down a synthetic pre-Phase-7 (internal-schema v3) graph at `root_uri`.
///
/// The resulting on-disk shape is the one a pre-RFC-013-Phase-7 binary would
/// have produced: graph lineage exists ONLY in `_graph_commits.lance` (and its
/// actor sidecar), `__manifest` holds NO `graph_commit`/`graph_head` rows, and
/// the stamp reads v3. The v3→v4 migration and the v3-read fallback are tested
/// against it.
///
/// The lineage is a small but realistic DAG containing a branch and a real
/// merge: genesis → A → (a feature commit off to the side) → merge(A, feature)
/// as the head of main. Every non-genesis commit is authored. The legacy rows
/// are written through `append_commit_with_parents`/`append_actor`, which are
/// dead on the write path and kept for exactly this transitional purpose.
///
/// Returns the ids of the four commits written.
#[cfg(any(test, feature = "failpoints"))]
pub async fn seed_legacy_v3_lineage(root_uri: &str) -> Result<V3LineageFixture> {
    let base = root_uri.trim_end_matches('/');

    // Step 1 — create `__manifest` (Phase-7 init folds the genesis lineage into
    // it) together with the EMPTY legacy `_graph_commits.lance`. A real v3 graph
    // kept its genesis in `_graph_commits`, so the v3-style rows are appended
    // below.
    crate::db::manifest::seed_manifest_for_v3_fixture(base).await?;
    let mut graph = CommitGraph::init(base).await?;

    // `init` seeded the cache from the genesis-bearing manifest. Drop that so the
    // rows appended here are the complete history and parents resolve correctly.
    graph.commit_by_id.clear();
    graph.head_commit = None;

    // Step 2 — write the legacy lineage into `_graph_commits.lance` on main.
    let genesis = graph
        .append_commit_with_parents(None, 1, None, None, None)
        .await?;
    let commit_a = graph
        .append_commit_with_parents(None, 2, Some(&genesis), None, Some("act-a"))
        .await?;
    let feature_commit = graph
        .append_commit_with_parents(
            Some("feature"),
            3,
            Some(&commit_a),
            None,
            Some("act-feature"),
        )
        .await?;
    let merge_commit = graph
        .append_commit_with_parents(
            None,
            4,
            Some(&commit_a),
            Some(&feature_commit),
            Some("act-merger"),
        )
        .await?;

    // Step 3 — remove the genesis lineage rows that Phase-7 init folded into
    // `__manifest` and rewind the stamp to v3, leaving the manifest in true
    // pre-Phase-7 form (no lineage in `__manifest`, stamp v3).
    crate::db::manifest::strip_lineage_and_set_v3_stamp_for_fixture(base).await?;

    // Ids in append order, captured before the originals move into the fixture.
    let all_ids = vec![
        genesis.clone(),
        commit_a.clone(),
        feature_commit.clone(),
        merge_commit.clone(),
    ];
    Ok(V3LineageFixture {
        genesis,
        commit_a,
        feature_commit,
        merge_commit,
        all_ids,
    })
}
/// Identities of a synthetic pre-Phase-7 (v3) graph that carries a REAL Lance
/// branch (built by [`seed_legacy_v3_lineage_with_branch`]).
///
/// Test-only: unlike [`V3LineageFixture`], this type is gated on `cfg(test)`
/// alone.
#[cfg(test)]
#[derive(Debug, Clone)]
pub struct V3BranchedLineageFixture {
/// The genesis (parentless) commit on main, appended at manifest version 1
/// with no actor.
pub genesis: String,
/// A direct authored commit on main (actor `act-a`), parented on `genesis`
/// at manifest version 2. The head of main.
pub commit_a: String,
/// A commit on the real `feature` Lance branch (actor `act-branch`),
/// parented off `commit_a` at manifest version 3. The head of `feature`.
pub branch_commit: String,
/// The branch name forked on both `_graph_commits.lance` and `__manifest`.
/// The seeding function always sets this to `"feature"`.
pub branch: String,
}
/// Lay down a synthetic pre-Phase-7 (internal-schema v3) graph at `root_uri` that
/// has a REAL Lance branch named `feature` on BOTH `_graph_commits.lance` and
/// `__manifest` — the on-disk shape a branched graph created by a
/// pre-RFC-013-Phase-7 binary would have:
///
/// - `_graph_commits.lance`: main holds `genesis → A`; the `feature` Lance branch
///   adds `branch_commit` with parent `A`. Authored actors go to the FLAT actor
///   sidecar, because the pre-Phase-7 commit graph never forked the actor table.
/// - `__manifest`: main is stamped v3 and has NO lineage rows; `feature` is forked
///   from that v3 state, so it is also v3 with NO lineage of its own.
///
/// The per-branch v3→v4 migration runs against this fixture. It lets a test show
/// that migrating `feature` reads the branch's legacy lineage, writes it into the
/// BRANCH's `__manifest`, and leaves main alone — something the main-only
/// [`seed_legacy_v3_lineage`] cannot exercise.
///
/// # Panics
///
/// Panics if the commits dataset handle is absent after `create_branch`.
#[cfg(test)]
pub async fn seed_legacy_v3_lineage_with_branch(root_uri: &str) -> Result<V3BranchedLineageFixture> {
    let base = root_uri.trim_end_matches('/');
    let branch_name = "feature";

    // Step 1 — `__manifest` (genesis folded in by Phase-7 init) plus an empty
    // legacy `_graph_commits.lance`. The init-seeded cache is cleared so the rows
    // appended below are the complete history.
    crate::db::manifest::seed_manifest_for_v3_fixture(base).await?;
    let mut graph = CommitGraph::init(base).await?;
    graph.commit_by_id.clear();
    graph.head_commit = None;

    // Step 2 — main lineage on `_graph_commits.lance`: genesis → A (authored).
    let genesis = graph
        .append_commit_with_parents(None, 1, None, None, None)
        .await?;
    let commit_a = graph
        .append_commit_with_parents(None, 2, Some(&genesis), None, Some("act-a"))
        .await?;

    // Step 3 — fork a real `feature` Lance branch on `_graph_commits.lance`,
    // point the handle at it, then append an authored branch commit. Its actor
    // lands in the flat main actor table, which is the pre-Phase-7 shape.
    graph.create_branch(branch_name).await?;
    let main_handle = graph
        .dataset
        .take()
        .expect("commits dataset present after create_branch");
    let feature_handle = main_handle
        .checkout_branch(branch_name)
        .await
        .map_err(|e| OmniError::Lance(e.to_string()))?;
    graph.dataset = Some(feature_handle);
    graph.active_branch = Some(branch_name.to_string());
    let branch_commit = graph
        .append_commit_with_parents(
            Some(branch_name),
            3,
            Some(&commit_a),
            None,
            Some("act-branch"),
        )
        .await?;

    // Step 4 — rewind main's `__manifest` to the v3 shape (strip the folded
    // genesis lineage, set stamp 3) BEFORE forking, so the `feature` manifest
    // branch inherits the stripped v3 state (no lineage, stamp 3).
    crate::db::manifest::strip_lineage_and_set_v3_stamp_for_fixture(base).await?;
    crate::db::manifest::fork_manifest_branch_for_v3_fixture(base, branch_name).await?;

    Ok(V3BranchedLineageFixture {
        genesis,
        commit_a,
        branch_commit,
        branch: branch_name.to_string(),
    })
}
#[cfg(test)]
@ -709,6 +1086,83 @@ mod tests {
use super::*;
// RFC-013 step 4: the v3-read fallback / migration source must read a NAMED
// branch's lineage from a real Lance branch on `_graph_commits.lance` while
// resolving actors from the FLAT actor table (the pre-Phase-7 commit graph
// forked the commits dataset only, never the actor sidecar). This test covers
// both the branch-checkout path and the flat-actor resolution, which the
// main-branch fixture (commits on main only) does not reach.
#[tokio::test]
async fn read_legacy_commit_cache_resolves_branch_commits_with_flat_actors() {
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path().to_str().unwrap();

    // `CommitGraph::init` seeds its cache from the manifest genesis, so
    // `__manifest` must exist first. That cache is then cleared and the legacy
    // rows are written by hand.
    crate::db::manifest::seed_manifest_for_v3_fixture(root)
        .await
        .unwrap();
    let mut graph = CommitGraph::init(root).await.unwrap();
    graph.commit_by_id.clear();
    graph.head_commit = None;

    // Main lineage: genesis → A (authored). The actor is written to the FLAT
    // `_graph_commit_actors.lance`, which is never branched.
    let genesis = graph
        .append_commit_with_parents(None, 1, None, None, None)
        .await
        .unwrap();
    let commit_a = graph
        .append_commit_with_parents(None, 2, Some(&genesis), None, Some("act-a"))
        .await
        .unwrap();

    // Fork a real Lance branch on `_graph_commits.lance`, repoint the handle at
    // it, and append an authored branch commit. Its actor also goes to the flat
    // main actor table — the pre-Phase-7 shape.
    graph.create_branch("feature").await.unwrap();
    let main_handle = graph.dataset.take().unwrap();
    let feature_handle = main_handle.checkout_branch("feature").await.unwrap();
    graph.dataset = Some(feature_handle);
    graph.active_branch = Some("feature".to_string());
    let branch_commit = graph
        .append_commit_with_parents(Some("feature"), 3, Some(&commit_a), None, Some("act-branch"))
        .await
        .unwrap();

    // Reading at the branch must return the inherited main commits plus the
    // branch commit, with the branch commit as head.
    let (commits, head) = read_legacy_commit_cache(root, Some("feature")).await.unwrap();
    assert_eq!(commits.len(), 3, "branch inherits genesis + A + its own commit");
    assert_eq!(
        head.as_ref().unwrap().graph_commit_id,
        branch_commit,
        "the branch commit is the head"
    );

    // Authored actors resolve from the flat table (no branch checkout on the
    // actor dataset), for the main commit and the branch commit alike.
    let expectations = [
        (
            &commit_a,
            "act-a",
            "main commit's actor resolves from the flat actor table",
        ),
        (
            &branch_commit,
            "act-branch",
            "branch commit's actor resolves from the flat actor table",
        ),
    ];
    for (commit_id, expected_actor, why) in expectations {
        assert_eq!(
            commits.get(commit_id).unwrap().actor_id.as_deref(),
            Some(expected_actor),
            "{}",
            why,
        );
    }
}
#[test]
fn load_commits_from_batches_returns_error_for_bad_schema() {
let batch = RecordBatch::try_new(

View file

@ -106,13 +106,17 @@ impl GraphCoordinator {
storage: Arc<dyn StorageAdapter>,
) -> Result<Self> {
let root = normalize_root_uri(root_uri)?;
// The genesis graph commit is folded into the manifest init write, so
// `__manifest` is the single source of graph lineage from version one
// (RFC-013 Phase 7). `CommitGraph::init` then creates the empty
// branch-ref dataset and seeds its cache from that manifest genesis.
let manifest = ManifestCoordinator::init(&root, catalog).await?;
let commit_graph = Some(CommitGraph::init(&root, manifest.version()).await?);
let commit_graph = CommitGraph::init(&root).await?;
Ok(Self {
root_uri: root,
storage,
manifest,
commit_graph,
commit_graph: Some(commit_graph),
bound_branch: None,
})
}
@ -257,7 +261,7 @@ impl GraphCoordinator {
/// fresh, so any existing commit-graph branch with this name is provably
/// orphaned and is force-dropped before recreating.
async fn create_commit_graph_branch(&mut self, branch: &str) -> Result<()> {
failpoints::maybe_fail("branch_create.after_manifest_branch_create")?;
failpoints::maybe_fail(crate::failpoints::names::BRANCH_CREATE_AFTER_MANIFEST_BRANCH_CREATE)?;
let Some(commit_graph) = &mut self.commit_graph else {
return Ok(());
};
@ -306,7 +310,7 @@ impl GraphCoordinator {
/// Best-effort, idempotent reclaim of the commit-graph branch `branch`.
/// Tolerates an absent commit-graph dataset (a graph that never committed).
async fn reclaim_commit_graph_branch(&mut self, branch: &str) -> Result<()> {
failpoints::maybe_fail("branch_delete.before_commit_graph_reclaim")?;
failpoints::maybe_fail(crate::failpoints::names::BRANCH_DELETE_BEFORE_COMMIT_GRAPH_RECLAIM)?;
if let Some(commit_graph) = &mut self.commit_graph {
commit_graph.force_delete_branch(branch).await
} else if self
@ -438,7 +442,12 @@ impl GraphCoordinator {
.exists(&graph_commits_uri(self.root_uri()))
.await?
{
let _ = CommitGraph::init(self.root_uri(), self.manifest.version()).await?;
// A graph opened without a commit-graph dataset gets the empty
// branch-ref dataset created lazily here. Graph lineage lives in
// `__manifest` (RFC-013 Phase 7) — a graph initialized by current
// code already carries its genesis there, and the commit graph
// sources its cache from it. No genesis is written here.
CommitGraph::init(self.root_uri()).await?;
}
self.commit_graph = match self.current_branch() {
Some(branch) => Some(CommitGraph::open_at_branch(self.root_uri(), branch).await?),
@ -452,12 +461,8 @@ impl GraphCoordinator {
updates: &[SubTableUpdate],
actor_id: Option<&str>,
) -> Result<PublishedSnapshot> {
let manifest_version = self.commit_manifest_updates(updates).await?;
let snapshot_id = self.record_graph_commit(manifest_version, actor_id).await?;
Ok(PublishedSnapshot {
manifest_version,
_snapshot_id: snapshot_id,
})
self.commit_updates_with_actor_with_expected(updates, &HashMap::new(), actor_id)
.await
}
/// Commit with publisher-level OCC fence. The `expected_table_versions` map
@ -471,45 +476,9 @@ impl GraphCoordinator {
expected_table_versions: &HashMap<String, u64>,
actor_id: Option<&str>,
) -> Result<PublishedSnapshot> {
let manifest_version = self
.commit_manifest_updates_with_expected(updates, expected_table_versions)
.await?;
let snapshot_id = self.record_graph_commit(manifest_version, actor_id).await?;
Ok(PublishedSnapshot {
manifest_version,
_snapshot_id: snapshot_id,
})
}
pub(crate) async fn commit_manifest_updates(
&mut self,
updates: &[SubTableUpdate],
) -> Result<u64> {
let manifest_version = self.manifest.commit(updates).await?;
failpoints::maybe_fail("graph_publish.after_manifest_commit")?;
Ok(manifest_version)
}
pub(crate) async fn commit_manifest_updates_with_expected(
&mut self,
updates: &[SubTableUpdate],
expected_table_versions: &HashMap<String, u64>,
) -> Result<u64> {
let manifest_version = self
.manifest
.commit_with_expected(updates, expected_table_versions)
.await?;
failpoints::maybe_fail("graph_publish.after_manifest_commit")?;
Ok(manifest_version)
}
pub(crate) async fn commit_manifest_changes(
&mut self,
changes: &[ManifestChange],
) -> Result<u64> {
let manifest_version = self.manifest.commit_changes(changes).await?;
failpoints::maybe_fail("graph_publish.after_manifest_commit")?;
Ok(manifest_version)
let changes = updates_to_changes(updates);
self.commit_changes_with_actor_with_expected(&changes, expected_table_versions, actor_id)
.await
}
pub(crate) async fn commit_changes_with_actor(
@ -517,71 +486,110 @@ impl GraphCoordinator {
changes: &[ManifestChange],
actor_id: Option<&str>,
) -> Result<PublishedSnapshot> {
let manifest_version = self.commit_manifest_changes(changes).await?;
let snapshot_id = self.record_graph_commit(manifest_version, actor_id).await?;
self.commit_changes_with_actor_with_expected(changes, &HashMap::new(), actor_id)
.await
}
/// Publish `changes` and record one graph commit in the SAME manifest CAS
/// (RFC-013 Phase 7). The lineage intent (a freshly minted commit id, the
/// branch, the actor) rides the publish so the `graph_commit` + `graph_head`
/// rows land atomically with the table-version rows — one manifest version,
/// no separate write, no `commit_graph.refresh()` to pick a parent (the
/// publisher resolves it under the CAS). The in-memory commit cache is then
/// updated from the intent + the resolved parent without a re-read.
async fn commit_changes_with_actor_with_expected(
&mut self,
changes: &[ManifestChange],
expected_table_versions: &HashMap<String, u64>,
actor_id: Option<&str>,
) -> Result<PublishedSnapshot> {
self.ensure_commit_graph_initialized().await?;
let intent = self.new_lineage_intent(actor_id, None)?;
failpoints::maybe_fail(crate::failpoints::names::GRAPH_PUBLISH_BEFORE_COMMIT_APPEND)?;
let outcome = self
.manifest
.commit_changes_with_lineage(changes, expected_table_versions, Some(&intent))
.await?;
failpoints::maybe_fail(crate::failpoints::names::GRAPH_PUBLISH_AFTER_MANIFEST_COMMIT)?;
let snapshot_id = self.apply_lineage_to_cache(intent, &outcome);
Ok(PublishedSnapshot {
manifest_version,
manifest_version: outcome.version,
_snapshot_id: snapshot_id,
})
}
pub(crate) async fn record_graph_commit(
/// Publish a branch-merge: `updates` (the merged table versions) plus the
/// merge commit, in one manifest CAS (RFC-013 Phase 7). The merge commit's
/// merged-in parent is `merged_parent_commit_id` (the source head, stable);
/// its first parent is resolved by the publisher as the current target-branch
/// head — the live head, which is the post-merge correct parent even if the
/// target advanced since the merge began.
pub(crate) async fn commit_merge_with_actor(
&mut self,
manifest_version: u64,
actor_id: Option<&str>,
) -> Result<SnapshotId> {
self.ensure_commit_graph_initialized().await?;
let current_branch = self.current_branch().map(str::to_string);
let Some(commit_graph) = &mut self.commit_graph else {
return Ok(SnapshotId::synthetic(
current_branch.as_deref(),
manifest_version,
self.manifest_incarnation().e_tag.as_deref(),
));
};
failpoints::maybe_fail("graph_publish.before_commit_append")?;
// Refresh the commit-graph head from storage before selecting the
// parent. `append_commit` parents the new commit on the IN-MEMORY head
// (`head_commit_id`, zero storage read), but the manifest was just
// committed against a freshly rebased pin (`commit_all` opens a fresh
// coordinator) while THIS coordinator's cached head may be stale because
// an external writer advanced the branch. Without this refresh a
// same-branch write after an external commit appends off the stale head
// and FORKS the commit DAG (the new commit and the external commit
// sharing a parent). Refreshing makes the parent the true current head;
// the just-committed manifest version has no commit-graph row yet, so the
// fresh head is exactly the prior commit. (record_merge_commit is
// unaffected — it passes explicit parents, never the cached head.)
commit_graph.refresh().await?;
let graph_commit_id = commit_graph
.append_commit(current_branch.as_deref(), manifest_version, actor_id)
.await?;
Ok(SnapshotId::new(graph_commit_id))
}
pub(crate) async fn record_merge_commit(
&mut self,
manifest_version: u64,
parent_commit_id: &str,
updates: &[SubTableUpdate],
merged_parent_commit_id: &str,
actor_id: Option<&str>,
) -> Result<SnapshotId> {
self.ensure_commit_graph_initialized().await?;
let current_branch = self.current_branch().map(str::to_string);
let commit_graph = self.commit_graph.as_mut().ok_or_else(|| {
OmniError::manifest("branch merge requires _graph_commits.lance".to_string())
})?;
failpoints::maybe_fail("graph_publish.before_commit_append")?;
let graph_commit_id = commit_graph
.append_merge_commit(
current_branch.as_deref(),
manifest_version,
parent_commit_id,
merged_parent_commit_id,
actor_id,
)
let intent =
self.new_lineage_intent(actor_id, Some(merged_parent_commit_id.to_string()))?;
failpoints::maybe_fail(crate::failpoints::names::GRAPH_PUBLISH_BEFORE_COMMIT_APPEND)?;
let changes = updates_to_changes(updates);
let outcome = self
.manifest
.commit_changes_with_lineage(&changes, &HashMap::new(), Some(&intent))
.await?;
Ok(SnapshotId::new(graph_commit_id))
failpoints::maybe_fail(crate::failpoints::names::GRAPH_PUBLISH_AFTER_MANIFEST_COMMIT)?;
Ok(self.apply_lineage_to_cache(intent, &outcome))
}
/// Build the [`LineageIntent`] describing the next commit on the current
/// branch. The commit id is a newly generated ULID and the timestamp comes
/// from `now_micros`; both are fixed in the intent, so they stay the same
/// however many times the publisher retries its CAS with it. No parent is
/// recorded here — the publisher resolves it per attempt against the manifest
/// it actually commits against.
///
/// `merged_parent_commit_id` is `Some` for a merge commit (the merged-in
/// head) and `None` for an ordinary commit. The only fallible step is the
/// `now_micros` call.
fn new_lineage_intent(
    &self,
    actor_id: Option<&str>,
    merged_parent_commit_id: Option<String>,
) -> Result<crate::db::manifest::LineageIntent> {
    let graph_commit_id = ulid::Ulid::new().to_string();
    let branch = self.current_branch().map(str::to_owned);
    let actor_id = actor_id.map(str::to_owned);
    let created_at = crate::db::now_micros()?;
    Ok(crate::db::manifest::LineageIntent {
        graph_commit_id,
        branch,
        actor_id,
        merged_parent_commit_id,
        created_at,
    })
}
/// Mirror a just-published commit into the in-memory commit cache and return
/// its [`SnapshotId`]. The cached [`GraphCommit`] is assembled from three
/// sources: the intent (id, branch, actor, merged parent, timestamp), the
/// parent the publisher resolved (`outcome.parent_commit_id`), and the new
/// manifest version (`outcome.version`). This function performs no storage
/// I/O itself — the durable write is the publish CAS that produced `outcome`
/// — and it keeps a same-handle read's `head_commit_id` consistent with the
/// snapshot that was just advanced.
///
/// When no commit graph is loaded, a synthetic id derived from the bound
/// branch, the manifest version and the manifest incarnation's e-tag is
/// returned instead (not expected on a real write).
fn apply_lineage_to_cache(
    &mut self,
    intent: crate::db::manifest::LineageIntent,
    outcome: &crate::db::manifest::CommitOutcome,
) -> SnapshotId {
    let graph = match self.commit_graph.as_mut() {
        Some(graph) => graph,
        None => {
            return SnapshotId::synthetic(
                self.bound_branch.as_deref(),
                outcome.version,
                self.manifest.incarnation().e_tag.as_deref(),
            );
        }
    };

    // Take the intent apart once; every field is moved into the cached row
    // except the id, which is also needed for the returned snapshot id.
    let crate::db::manifest::LineageIntent {
        graph_commit_id,
        branch,
        actor_id,
        merged_parent_commit_id,
        created_at,
    } = intent;

    graph.insert_committed(GraphCommit {
        graph_commit_id: graph_commit_id.clone(),
        manifest_branch: branch,
        manifest_version: outcome.version,
        parent_commit_id: outcome.parent_commit_id.clone(),
        merged_parent_commit_id,
        actor_id,
        created_at,
    });
    SnapshotId::new(graph_commit_id)
}
async fn open_commit_graph_for_branch(
@ -625,6 +633,15 @@ fn graph_commits_uri(root_uri: &str) -> String {
join_uri(root_uri, GRAPH_COMMITS_DIR)
}
/// Convert a slice of `SubTableUpdate`s into the publisher's change list: each
/// update is cloned and wrapped in `ManifestChange::Update`, preserving order.
fn updates_to_changes(updates: &[SubTableUpdate]) -> Vec<ManifestChange> {
    let mut changes = Vec::with_capacity(updates.len());
    for update in updates {
        changes.push(ManifestChange::Update(update.clone()));
    }
    changes
}
fn normalize_branch_name(branch: &str) -> Result<Option<String>> {
let branch = branch.trim();
if branch.is_empty() {

View file

@ -35,7 +35,9 @@ pub(crate) use metadata::TableVersionMetadata;
use metadata::{OMNIGRAPH_ROW_COUNT_KEY, table_version_metadata_for_state};
#[cfg(test)]
use namespace::{branch_manifest_namespace, staged_table_namespace};
use publisher::{GraphNamespacePublisher, ManifestBatchPublisher};
pub(crate) use migrations::refuse_if_stamp_unsupported;
pub(crate) use publisher::LineageIntent;
use publisher::{GraphNamespacePublisher, ManifestBatchPublisher, PublishOutcome};
pub(crate) use recovery::{
RecoveryMode, RecoverySidecar, RecoverySidecarHandle, SidecarKind, SidecarTablePin,
SidecarTableRegistration, SidecarTombstone, confirm_sidecar_phase_b, delete_sidecar,
@ -43,6 +45,7 @@ pub(crate) use recovery::{
recover_manifest_drift, schema_apply_serial_queue_key, write_sidecar,
};
pub use state::SubTableEntry;
pub(crate) use state::{GraphLineageRow, read_graph_lineage};
#[cfg(test)]
use state::string_column;
use state::{ManifestState, read_manifest_state};
@ -50,8 +53,34 @@ use state::{ManifestState, read_manifest_state};
const OBJECT_TYPE_TABLE: &str = "table";
const OBJECT_TYPE_TABLE_VERSION: &str = "table_version";
const OBJECT_TYPE_TABLE_TOMBSTONE: &str = "table_tombstone";
/// Immutable per-commit graph-lineage row (RFC-013 Phase 7). One row per graph
/// commit; the projected form reconstructs a [`GraphCommit`]. `__manifest` is
/// the single source — written in the same publish CAS as the table-version
/// rows (no `_graph_commits.lance` row).
const OBJECT_TYPE_GRAPH_COMMIT: &str = "graph_commit";
/// Mutable per-branch head pointer for the graph lineage (RFC-013 Phase 7).
/// `object_id` is `graph_head:<branch>` (`graph_head:main` for the main branch).
const OBJECT_TYPE_GRAPH_HEAD: &str = "graph_head";
const TABLE_VERSION_MANAGEMENT_KEY: &str = "table_version_management";
/// Stable head-key segment for the main branch in `graph_head:<branch>` rows.
/// `table_branch`/`manifest_branch` encode main as null, but `object_id` must be
/// non-null, so the head row needs a literal — matching the `"main"` sentinel
/// already used by `SnapshotId::synthetic` and `open_for_branch`.
pub(crate) const MAIN_BRANCH_HEAD_KEY: &str = "main";
/// The result of a manifest commit that may have folded in a graph commit
/// (RFC-013 Phase 7). Returned by `commit_changes_with_lineage`; when that
/// call has nothing to publish and no lineage to record, it reports the
/// current version with `parent_commit_id: None`.
#[derive(Debug, Clone)]
pub(crate) struct CommitOutcome {
/// The new `__manifest` version after the publish.
pub version: u64,
/// The parent the publisher resolved for the recorded commit, or `None` when
/// no lineage was recorded or the commit is the genesis. Lets the caller
/// update its in-memory commit cache without re-reading the manifest.
pub parent_commit_id: Option<String>,
}
/// Apply pending internal-schema migrations against `__manifest` on the
/// open-for-write path, independent of a publish.
///
@ -65,7 +94,105 @@ const TABLE_VERSION_MANAGEMENT_KEY: &str = "table_version_management";
/// Idempotent: a no-op stamp read when the on-disk version already matches.
pub(crate) async fn migrate_on_open(root_uri: &str) -> Result<()> {
let mut dataset = open_manifest_dataset(root_uri, None).await?;
migrations::migrate_internal_schema(&mut dataset).await
// Main branch: the v3→v4 lineage backfill reads `_graph_commits.lance` at
// main. Named branches migrate on their own first write via the publisher.
migrations::migrate_internal_schema(&mut dataset, root_uri, None).await
}
/// Read the internal-schema stamp stored on `__manifest` for `branch`
/// (`None` = main). Opens the manifest dataset fresh for the read. The
/// transitional v3-read fallback in `CommitGraph` uses the result to choose
/// its lineage source: `__manifest` when the stamp is ≥ v4 (post-Phase-7), the
/// legacy `_graph_commits.lance` when it is < v4 (not yet migrated).
pub(crate) async fn internal_schema_stamp_at(root_uri: &str, branch: Option<&str>) -> Result<u32> {
    let manifest = open_manifest_dataset(root_uri, branch).await?;
    let stamp = migrations::read_stamp(&manifest);
    Ok(stamp)
}
/// Guard for the read-only open path: reject a graph whose main-branch
/// `__manifest` stamp lies outside the internal-schema range this binary
/// supports (above CURRENT or below MIN_SUPPORTED). The read-only path skips
/// the write-path migration, which is where this refusal otherwise happens, so
/// without this check an old binary could silently misread a newer graph and a
/// too-new binary could open a below-floor graph unmigrated.
pub(crate) async fn refuse_if_internal_schema_unsupported(root_uri: &str) -> Result<()> {
    // `None` selects the main branch.
    internal_schema_stamp_at(root_uri, None)
        .await
        .and_then(migrations::refuse_if_stamp_unsupported)
}
/// The internal-schema version this binary writes. Exposed so the v3-read
/// fallback can compare a branch's on-disk stamp against it.
pub(crate) const INTERNAL_MANIFEST_SCHEMA_VERSION: u32 =
migrations::INTERNAL_MANIFEST_SCHEMA_VERSION;
/// Test/failpoints-only: initialise a `__manifest` for a minimal catalog — the
/// first half of building a synthetic pre-Phase-7 (v3) graph (see
/// `commit_graph::seed_legacy_v3_lineage`). A two-type schema (one node, one
/// edge) suffices because the v3→v4 migration touches only the lineage rows,
/// never the table-version rows.
///
/// Schema-parse and catalog-build failures are surfaced as manifest errors
/// carrying the underlying message.
#[cfg(any(test, feature = "failpoints"))]
pub(crate) async fn seed_manifest_for_v3_fixture(root_uri: &str) -> Result<()> {
    const FIXTURE_SCHEMA: &str = "node Person { name: String }\nedge Knows: Person -> Person { }\n";

    let parsed = omnigraph_compiler::schema::parser::parse_schema(FIXTURE_SCHEMA)
        .map_err(|parse_err| OmniError::manifest(parse_err.to_string()))?;
    let catalog = omnigraph_compiler::catalog::build_catalog(&parsed)
        .map_err(|catalog_err| OmniError::manifest(catalog_err.to_string()))?;

    // Only the on-disk effect matters; the coordinator handle is discarded.
    ManifestCoordinator::init(root_uri, &catalog).await.map(drop)
}
/// Test/failpoints-only: finish a synthetic pre-Phase-7 graph. Deletes the
/// `graph_commit` / `graph_head` rows that Phase-7 init folds into
/// `__manifest`, then rewinds the internal-schema stamp to v3, leaving a graph
/// whose lineage lives only in `_graph_commits.lance`.
#[cfg(any(test, feature = "failpoints"))]
pub(crate) async fn strip_lineage_and_set_v3_stamp_for_fixture(root_uri: &str) -> Result<()> {
    const LINEAGE_ROWS: &str = "object_type = 'graph_commit' OR object_type = 'graph_head'";

    let mut with_lineage = open_manifest_dataset(root_uri, None).await?;
    if let Err(delete_err) = with_lineage.delete(LINEAGE_ROWS).await {
        return Err(OmniError::Lance(delete_err.to_string()));
    }

    // Open a fresh handle so the stamp write is applied on top of the
    // post-delete HEAD rather than the pre-delete one.
    let mut stripped = open_manifest_dataset(root_uri, None).await?;
    migrations::set_stamp_for_test(&mut stripped, 3).await
}
/// Test-only: create a real Lance branch `name` on `__manifest`, forked from
/// main's CURRENT version. Intended to run AFTER
/// `strip_lineage_and_set_v3_stamp_for_fixture`, so the new branch inherits the
/// v3 stamp and no lineage rows — a faithful pre-Phase-7 branch. Its commits
/// then exist only on the `_graph_commits.lance` branch until the per-branch
/// v3→v4 migration runs against this branch's `__manifest`.
#[cfg(test)]
pub(crate) async fn fork_manifest_branch_for_v3_fixture(root_uri: &str, name: &str) -> Result<()> {
    let mut main_manifest = open_manifest_dataset(root_uri, None).await?;
    let fork_point = main_manifest.version().version;
    main_manifest
        .create_branch(name, fork_point, None)
        .await
        .map(drop)
        .map_err(|branch_err| OmniError::Lance(branch_err.to_string()))
}
/// Public test-support wrapper around `migrate_on_open`, the open-for-write
/// migration entry point. It exists because the `failpoints` integration binary
/// cannot reach `pub(crate)` items. Compiled only under `test` or the
/// `failpoints` feature, never in a release build.
#[cfg(any(test, feature = "failpoints"))]
pub async fn migrate_on_open_for_test(root_uri: &str) -> Result<()> {
    migrate_on_open(root_uri).await?;
    Ok(())
}
/// Test-support probe: returns `(graph_commit row count, internal-schema
/// stamp)` for `__manifest` at `branch` (`None` = main). The `failpoints`
/// integration binary uses it to assert that a migration neither stamped nor
/// backfilled when a legacy-open fault fired. Compiled only under `test` or
/// the `failpoints` feature.
#[cfg(any(test, feature = "failpoints"))]
pub async fn lineage_row_count_and_stamp_for_test(
    root_uri: &str,
    branch: Option<&str>,
) -> Result<(usize, u32)> {
    let manifest = open_manifest_dataset(root_uri, branch).await?;
    let stamp = migrations::read_stamp(&manifest);
    // Only the commit rows are counted; the head map is not needed here.
    read_graph_lineage(&manifest)
        .await
        .map(|(commit_rows, _heads)| (commit_rows.len(), stamp))
}
/// Immutable point-in-time view of the database.
@ -313,6 +440,9 @@ impl ManifestCoordinator {
/// Create a new graph at `root_uri` from a catalog.
///
/// Creates per-type Lance datasets and the namespace `__manifest` table.
/// The genesis graph commit is folded into the init write, so `__manifest`
/// is the single source of graph lineage from version one — callers read it
/// back through the lineage projection rather than via a second write.
pub async fn init(root_uri: &str, catalog: &Catalog) -> Result<Self> {
let root = root_uri.trim_end_matches('/');
let (dataset, known_state) = init_manifest_graph(root, catalog).await?;
@ -419,17 +549,58 @@ impl ManifestCoordinator {
changes: &[ManifestChange],
expected_table_versions: &HashMap<String, u64>,
) -> Result<u64> {
if changes.is_empty() && expected_table_versions.is_empty() {
return Ok(self.version());
Ok(self
.commit_changes_with_lineage(changes, expected_table_versions, None)
.await?
.version)
}
/// Publish `changes` and, when `lineage` is present, record the graph commit
/// in the SAME merge-insert (RFC-013 Phase 7). `__manifest` is the single
/// source of graph lineage: the `graph_commit` + `graph_head:<branch>` rows
/// ride the table-version publish so the whole commit lands at one manifest
/// version — no separate write, no manifest→commit-graph atomicity gap, no
/// per-write commit-graph refresh. Returns the new version and the parent the
/// publisher resolved for the commit (so the caller can update its in-memory
/// commit cache without a re-read).
///
/// A call with no changes, no expected versions and no lineage is a no-op: it
/// returns the current version with no parent and never reaches the publisher.
pub(crate) async fn commit_changes_with_lineage(
&mut self,
changes: &[ManifestChange],
expected_table_versions: &HashMap<String, u64>,
lineage: Option<&LineageIntent>,
) -> Result<CommitOutcome> {
// Nothing to publish and nothing to record: short-circuit without a write.
if changes.is_empty() && expected_table_versions.is_empty() && lineage.is_none() {
return Ok(CommitOutcome {
version: self.version(),
parent_commit_id: None,
});
}
self.dataset = self
let PublishOutcome {
dataset,
parent_commit_id,
} = self
.publisher
.publish(changes, expected_table_versions)
.publish(changes, expected_table_versions, lineage)
.await?;
// Adopt the post-publish dataset handle, then re-derive the cached manifest
// state from it so `version()` below reflects the publish.
self.dataset = dataset;
self.known_state = read_manifest_state(&self.dataset).await?;
Ok(self.version())
Ok(CommitOutcome {
version: self.version(),
parent_commit_id,
})
}
/// Read the graph-lineage projection of `__manifest` at `branch` (`None` =
/// main) without needing an open coordinator: the manifest dataset is opened
/// fresh for this one read. `CommitGraph` uses it to populate its in-memory
/// cache. The result is whatever `read_graph_lineage` yields for that dataset:
/// the lineage rows plus a string-to-string map.
pub(crate) async fn read_graph_lineage_at(
    root_uri: &str,
    branch: Option<&str>,
) -> Result<(Vec<GraphLineageRow>, HashMap<String, String>)> {
    let manifest_at_branch = open_manifest_dataset(root_uri, branch).await?;
    let projection = read_graph_lineage(&manifest_at_branch).await?;
    Ok(projection)
}
/// Current manifest version.

View file

@ -14,9 +14,17 @@ use super::layout::{manifest_uri, open_manifest_dataset, type_name_hash};
use super::metadata::TableVersionMetadata;
use super::migrations::stamp_current_version;
use super::state::{
ManifestState, SubTableEntry, entries_to_batch, manifest_schema, read_manifest_state,
GraphLineageRow, ManifestState, SubTableEntry, entries_to_batch, graph_lineage_row_parts,
manifest_schema, read_manifest_state,
};
/// The manifest version the init `Dataset::write` produces (Lance datasets start
/// at version one). The genesis graph commit pins this version — a snapshot at
/// it is the empty, freshly-initialized graph. The two config-only commits that
/// follow (`update_config`, `stamp_current_version`) advance the live manifest
/// version but add no table data, so genesis correctly stays pinned at one.
const GENESIS_MANIFEST_VERSION: u64 = 1;
pub(super) async fn init_manifest_graph(
root_uri: &str,
catalog: &Catalog,
@ -24,7 +32,21 @@ pub(super) async fn init_manifest_graph(
let root = root_uri.trim_end_matches('/');
let (entries, version_metadata) = build_initial_entries(root, catalog).await?;
let manifest_batch = entries_to_batch(&entries, &version_metadata)?;
// Genesis graph commit: parentless, actorless, minted once and folded into
// the init write so `__manifest` is the single source of graph lineage from
// version one (no `_graph_commits.lance` row, no separate publish).
let genesis = GraphLineageRow {
graph_commit_id: ulid::Ulid::new().to_string(),
manifest_branch: None,
manifest_version: GENESIS_MANIFEST_VERSION,
parent_commit_id: None,
merged_parent_commit_id: None,
actor_id: None,
created_at: crate::db::now_micros()?,
};
let genesis_lineage = graph_lineage_row_parts(&genesis, None)?;
let manifest_batch = entries_to_batch(&entries, &version_metadata, &genesis_lineage)?;
let schema = manifest_schema();
let reader = RecordBatchIterator::new(vec![Ok(manifest_batch)], schema);
let params = WriteParams {

View file

@ -37,6 +37,9 @@ use lance::Dataset;
use crate::error::{OmniError, Result};
use crate::db::commit_graph::GraphCommit;
use super::state::{GraphLineageRow, graph_lineage_row_parts, merge_lineage_rows, read_graph_lineage};
/// Current internal schema version this binary expects to find on disk.
///
/// History:
@ -50,14 +53,62 @@ use crate::error::{OmniError, Result};
/// `__manifest` dataset by the pre-v0.4.0 Run state machine (removed in
/// MR-771). Once swept, the `is_internal_run_branch` defense-in-depth guard
/// is no longer needed (MR-770).
pub(super) const INTERNAL_MANIFEST_SCHEMA_VERSION: u32 = 3;
/// - v4 — RFC-013 Phase 7 folds graph lineage into `__manifest` as
/// `graph_commit`/`graph_head` rows written in the publish CAS. A pre-Phase-7
/// (v3) graph has its lineage only in `_graph_commits.lance`, so the new
/// binary would read an empty commit DAG. This one-time per-branch backfill
/// copies the lineage from `_graph_commits.lance` into `__manifest`
/// (`migrate_v3_to_v4`). `_graph_commits.lance` is left in place as the
/// branch-ref carrier; no commit rows are ever written to it again.
pub(crate) const INTERNAL_MANIFEST_SCHEMA_VERSION: u32 = 4;
/// The oldest on-disk internal-schema stamp this binary will open. A graph below
/// this floor is refused (`refuse_if_stamp_unsupported`) with a "migrate it
/// forward with an older release first" error, instead of obliging this binary to
/// carry that version's `migrate_vN_…` arm and the legacy readers it needs
/// forever. Raising the floor is how the migration chain sheds old code.
///
/// **Retirement runbook** — turning "accumulates forever" into a sliding window:
/// 1. *Shed version N* once no graph below `N+1` remains in the fleet: bump this
/// floor AND `LOWEST_REGISTERED_MIGRATION_SOURCE` to `N+1`, then delete the
/// `N =>` arm in `migrate_internal_schema`, `migrate_vN_to_vN+1`, and its
/// helpers + tests. The compile-time tripwire (the `const _` assert below)
/// keeps the two consts in lockstep, so a half-done shed fails the build.
/// 2. *Retire the v3 legacy readers entirely* once MIN ≥ 4: `git rm` the
/// `commit_graph/commit_graph_legacy_v3.rs` seam file and flip the single
/// `stamp < CURRENT` gate in `load_commit_cache_for_branch` to read the
/// manifest projection unconditionally.
///
/// MIN = 1 today is a pure no-op: `read_stamp` floors an absent stamp at 1 and no
/// real graph carries 0, so nothing is refused.
pub(crate) const MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION: u32 = 1;
/// The lowest `current` value the `migrate_internal_schema` dispatcher still has a
/// `match` arm for. Mirrors the lowest registered migration source so a floor bump
/// that forgets to delete the now-dead arm (or vice versa) is caught by the
/// compile-time tripwire below. Migration arms aren't an enumerable registry, so
/// this hand-mirrored const is the minimal enforced coupling — cheaper than
/// reshaping the dispatcher into a data-driven table.
const LOWEST_REGISTERED_MIGRATION_SOURCE: u32 = 1;
/// Retirement tripwire (compile-time): the refusal floor and the lowest migration
/// arm must move together. Raising `MIN_SUPPORTED` without deleting the now-dead
/// below-floor arm — or vice versa — fails the build with this message, which is
/// stronger than a runtime test and impossible to skip. Migration arms can't be
/// enumerated, so this const-mirror is the check.
const _: () = assert!(
LOWEST_REGISTERED_MIGRATION_SOURCE == MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION,
"internal-schema floor drifted from the lowest registered migration arm: when raising \
MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION, delete every below-floor `N =>` arm + migrate_vN_ \
+ its helpers/tests and bump LOWEST_REGISTERED_MIGRATION_SOURCE to match (or vice versa)",
);
const INTERNAL_SCHEMA_VERSION_KEY: &str = "omnigraph:internal_schema_version";
const OBJECT_ID_PK_KEY: &str = "lance-schema:unenforced-primary-key";
/// Read the on-disk stamp from `__manifest`'s schema-level metadata.
/// Absent ⇒ v1 (pre-stamp world).
pub(super) fn read_stamp(dataset: &Dataset) -> u32 {
pub(crate) fn read_stamp(dataset: &Dataset) -> u32 {
dataset
.schema()
.metadata
@ -72,20 +123,52 @@ pub(super) async fn stamp_current_version(dataset: &mut Dataset) -> Result<()> {
set_stamp(dataset, INTERNAL_MANIFEST_SCHEMA_VERSION).await
}
/// Reject an internal-schema `stamp` this binary cannot serve, in either
/// direction, with an error that names the way forward. Every stamp-enforcing
/// call site shares it — the write-path migration dispatcher, the read-only
/// open guard, and the branch lineage-read path — so a new caller always gets
/// the ceiling and the floor together and cannot enforce only one of them.
///
/// - `stamp > CURRENT`: written by a newer binary — upgrade omnigraph.
/// - `stamp < MIN_SUPPORTED`: older than the oldest migration still carried —
///   migrate the graph forward with an older release first, then reopen.
///
/// Returns `Ok(())` for any stamp inside the supported range; otherwise a
/// manifest error.
pub(crate) fn refuse_if_stamp_unsupported(stamp: u32) -> Result<()> {
    let refusal = if stamp > INTERNAL_MANIFEST_SCHEMA_VERSION {
        format!(
            "__manifest is stamped at internal schema v{} but this binary expects v{} \
             upgrade omnigraph before opening this graph",
            stamp, INTERNAL_MANIFEST_SCHEMA_VERSION,
        )
    } else if stamp < MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION {
        format!(
            "__manifest is stamped at internal schema v{} but this binary supports v{} or later \
             open it with an older omnigraph release to migrate it forward first, then reopen",
            stamp, MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION,
        )
    } else {
        return Ok(());
    };
    Err(OmniError::manifest(refusal))
}
/// Apply any pending internal-schema migrations to the manifest dataset.
///
/// Idempotent: when the on-disk stamp matches the binary, this is a single
/// metadata read with no writes.
pub(super) async fn migrate_internal_schema(dataset: &mut Dataset) -> Result<()> {
///
/// `root_uri` + `branch` identify which graph + branch this `dataset` is a
/// manifest for. The v3→v4 lineage backfill needs them to read that branch's
/// `_graph_commits.lance`. `migrate_on_open` passes the main branch
/// (`branch = None`); the publisher's `load_publish_state` passes its own
/// branch, so each branch backfills on its first write.
pub(super) async fn migrate_internal_schema(
dataset: &mut Dataset,
root_uri: &str,
branch: Option<&str>,
) -> Result<()> {
let mut current = read_stamp(dataset);
if current > INTERNAL_MANIFEST_SCHEMA_VERSION {
return Err(OmniError::manifest(format!(
"__manifest is stamped at internal schema v{} but this binary expects v{} \
upgrade omnigraph before opening this graph for writes",
current, INTERNAL_MANIFEST_SCHEMA_VERSION,
)));
}
refuse_if_stamp_unsupported(current)?;
while current < INTERNAL_MANIFEST_SCHEMA_VERSION {
match current {
@ -97,6 +180,10 @@ pub(super) async fn migrate_internal_schema(dataset: &mut Dataset) -> Result<()>
migrate_v2_to_v3(dataset).await?;
current = 3;
}
3 => {
migrate_v3_to_v4(dataset, root_uri, branch).await?;
current = 4;
}
other => {
return Err(OmniError::manifest_internal(format!(
"no internal-schema migration registered for v{} → v{}",
@ -202,6 +289,218 @@ async fn migrate_v2_to_v3(dataset: &mut Dataset) -> Result<()> {
set_stamp(dataset, 3).await
}
/// v3 → v4: copy a branch's graph lineage out of `_graph_commits.lance` into
/// `__manifest`, then bump the stamp.
///
/// Since RFC-013 Phase 7, `__manifest` is the single source of graph lineage
/// (`graph_commit` / `graph_head:<branch>` rows written in the publish CAS). A
/// v3 graph keeps its lineage only in `_graph_commits.lance`, so without this
/// backfill the new binary would read an EMPTY commit DAG. The migration is
/// one-time and per-branch: `dataset` is the `__manifest` for `branch` (main
/// when `None`) and the legacy commits are read at the SAME branch.
/// `_graph_commits.lance` is not modified here.
///
/// **Idempotency and crash recovery.** The stamp bump is always the last step
/// and the lineage merge is keyed on `object_id`. A crash between merge and
/// stamp leaves stamp v3 with lineage present; the next run sees the lineage
/// via the fast path and goes straight to the stamp bump, without duplicating
/// rows (pinned by `crash_after_merge_before_stamp_completes_on_next_open`).
///
/// **Concurrent runners.** Two writers can reach the merge at once, and
/// `merge_lineage_rows` does not retry conflicts itself, so the loser of the
/// row-level CAS is re-driven here: re-open the stale handle past the winner's
/// commit and loop. The winner's merge is a single atomic commit, so the
/// re-read sees all of its rows or none. The retryable arm is not gated on the
/// attempt number, so a retryable conflict never reaches the propagate arm.
/// The one exhaustion path is the typed contention error at the bottom, which
/// the publisher's outer retry loop (it only re-runs
/// `is_retryable_publish_conflict` errors) can pick up on its next attempt.
async fn migrate_v3_to_v4(
    dataset: &mut Dataset,
    root_uri: &str,
    branch: Option<&str>,
) -> Result<()> {
    // Same size as the publisher's retry budget (5), kept as an independent
    // local knob rather than re-exporting the publisher's private constant.
    const MIGRATION_MERGE_RETRY_BUDGET: u32 = 5;

    let mut attempts_left = MIGRATION_MERGE_RETRY_BUDGET;
    while attempts_left > 0 {
        attempts_left -= 1;

        // Fast path: lineage already in `__manifest` (an earlier run, or a
        // concurrent winner seen after the re-open below) means there is
        // nothing to merge — only the stamp remains.
        let (present_lineage, _heads) = read_graph_lineage(dataset).await?;
        if present_lineage.is_empty() {
            // An absent or empty `_graph_commits.lance` yields no commits, in
            // which case there is nothing to backfill either.
            let (legacy_commits, legacy_head) =
                crate::db::commit_graph::read_legacy_commit_cache(root_uri, branch).await?;
            if !legacy_commits.is_empty() {
                let backfill_rows =
                    build_lineage_backfill_parts(&legacy_commits, legacy_head.as_ref(), branch)?;
                match merge_lineage_rows(dataset.clone(), &backfill_rows).await {
                    Ok(merged) => *dataset = merged,
                    // Lost the `graph_head:<branch>` CAS: the handle in hand
                    // predates the winner's commit, so re-open before retrying.
                    Err(conflict) if super::publisher::is_retryable_publish_conflict(&conflict) => {
                        *dataset = super::layout::open_manifest_dataset(root_uri, branch).await?;
                        continue;
                    }
                    Err(fatal) => return Err(fatal),
                }
            }
        }

        // Stamp LAST, whichever way we got here (already backfilled, nothing
        // to backfill, or merge just landed).
        return commit_v4_stamp_idempotently(dataset, root_uri, branch).await;
    }

    // Retry budget spent under sustained contention: surface a CAS-typed error
    // rather than a plain conflict so the outer publish retry re-runs the
    // migration instead of giving up.
    Err(OmniError::manifest_row_level_cas_contention(format!(
        "v3→v4 lineage backfill exhausted {} retries against concurrent runners",
        MIGRATION_MERGE_RETRY_BUDGET
    )))
}
/// Stamp the v3→v4 migration's terminal version idempotently under concurrent
/// runners. `set_stamp` issues an `UpdateConfig` Lance commit; once the merge CAS
/// loser is made to converge (above), BOTH runners reach this stamp bump and race
/// it — the loser gets `lance::Error::IncompatibleTransaction` (two `UpdateConfig`
/// commits touching the same metadata key), which is NOT a row-level CAS
/// contention and so is not caught by the merge loop. But both write the SAME
/// value, so the conflict is benign: re-open and, if the stamp already reached the
/// target (the concurrent runner finished it), succeed; otherwise re-apply.
/// Bounded; on exhaustion surface a CAS-typed error for the publisher's outer
/// retry, same as the merge loop.
async fn commit_v4_stamp_idempotently(
dataset: &mut Dataset,
root_uri: &str,
branch: Option<&str>,
) -> Result<()> {
const STAMP_RETRY_BUDGET: u32 = 5;
// Exclusive range + an UNGUARDED `IncompatibleTransaction` arm: the retryable
// variant is always handled inside the loop (re-open + same-value check + retry),
// so it can never fall through to the stringifying `Err(e)` catch-all, and the
// SINGLE reachable exhaustion path is the typed contention return below. (A
// `0..=BUDGET` range with an `attempt < BUDGET` guard let the last iteration's
// retryable conflict reach the catch-all and return a non-retryable
// `OmniError::Lance` — the publisher's outer retry would then give up.)
for _ in 0..STAMP_RETRY_BUDGET {
// Inline the `update_schema_metadata` write (rather than `set_stamp`) so the
// raw Lance error variant is in hand — `set_stamp` pre-stringifies it.
let stamp_result = stamp_internal_schema(dataset).await;
match stamp_result {
Ok(_) => return Ok(()),
Err(lance::Error::IncompatibleTransaction { .. }) => {
// A concurrent runner's `UpdateConfig` preempted ours — the
// retryable case. Re-open past its commit; if it already stamped to
// the target we're done (the value is identical), else fall through
// to retry on the advanced handle.
*dataset = super::layout::open_manifest_dataset(root_uri, branch).await?;
if read_stamp(dataset) >= INTERNAL_MANIFEST_SCHEMA_VERSION {
return Ok(());
}
}
Err(e) => return Err(OmniError::Lance(e.to_string())),
}
}
// Exhausted the budget against sustained concurrent stampers. Return a
// CAS-typed (retryable) error so the publisher's OUTER retry — which only
// re-runs `is_retryable_publish_conflict` — completes it, rather than the
// stringified `OmniError::Lance` it would treat as fatal.
Err(OmniError::manifest_row_level_cas_contention(format!(
"v3→v4 stamp bump exhausted {} retries against concurrent runners",
STAMP_RETRY_BUDGET
)))
}
/// Perform the one `update_schema_metadata` write that sets the on-disk
/// internal-schema stamp to this binary's current version.
///
/// Kept separate from `commit_v4_stamp_idempotently`'s retry loop so a
/// `failpoints` test can deterministically inject a concurrent-stamper
/// `IncompatibleTransaction` (the loop's exhaustion path is otherwise
/// near-unreachable).
///
/// Returns the RAW `lance::Error` — unlike `set_stamp`, which stringifies it —
/// so the caller can match on the `IncompatibleTransaction` variant.
async fn stamp_internal_schema(dataset: &mut Dataset) -> std::result::Result<(), lance::Error> {
    // Test seam: no-op unless the named failpoint is armed.
    crate::failpoints::maybe_fail_lance_incompatible("migration.v4_stamp.force_incompatible")?;

    let stamp_entry = (
        INTERNAL_SCHEMA_VERSION_KEY.to_string(),
        INTERNAL_MANIFEST_SCHEMA_VERSION.to_string(),
    );
    // The write's success payload is not needed; only the error matters.
    dataset.update_schema_metadata([stamp_entry]).await?;
    Ok(())
}
/// Assemble the `__manifest` rows written by the v3→v4 lineage backfill.
///
/// Every commit in `commit_by_id` contributes one immutable `graph_commit`
/// row. `graph_lineage_row_parts` always yields a `[graph_commit, graph_head]`
/// pair, but the `graph_head:<branch>` half is kept only for the commit whose
/// id equals `head`'s — the head pointer is per-branch, not per-commit, so
/// every other head part would be a redundant update of the same `object_id`.
/// With `head == None`, no head row is emitted at all.
///
/// Commits are emitted in ascending `graph_commit_id` order. The merge-insert
/// is keyed on `object_id`, so the final manifest does not depend on order, but
/// sorting keeps the produced batch reproducible despite the `HashMap` source.
///
/// # Errors
/// Propagates any failure from `graph_lineage_row_parts`.
fn build_lineage_backfill_parts(
    commit_by_id: &std::collections::HashMap<String, GraphCommit>,
    head: Option<&GraphCommit>,
    branch: Option<&str>,
) -> Result<Vec<super::state::GraphLineageRowPart>> {
    let mut ordered: Vec<&GraphCommit> = commit_by_id.values().collect();
    ordered.sort_by(|lhs, rhs| lhs.graph_commit_id.cmp(&rhs.graph_commit_id));

    // One commit row each, plus at most one head row.
    let mut parts = Vec::with_capacity(ordered.len() + 1);
    for commit in ordered {
        let is_head = head.is_some_and(|h| h.graph_commit_id == commit.graph_commit_id);
        let [commit_part, head_part] = graph_lineage_row_parts(
            &GraphLineageRow {
                graph_commit_id: commit.graph_commit_id.clone(),
                manifest_branch: commit.manifest_branch.clone(),
                manifest_version: commit.manifest_version,
                parent_commit_id: commit.parent_commit_id.clone(),
                merged_parent_commit_id: commit.merged_parent_commit_id.clone(),
                actor_id: commit.actor_id.clone(),
                created_at: commit.created_at,
            },
            branch,
        )?;
        parts.push(commit_part);
        // Only the actual head keeps its `graph_head:<branch>` row.
        parts.extend(is_head.then_some(head_part));
    }
    Ok(parts)
}
async fn set_stamp(dataset: &mut Dataset, version: u32) -> Result<()> {
dataset
.update_schema_metadata([(INTERNAL_SCHEMA_VERSION_KEY.to_string(), version.to_string())])
@ -209,3 +508,42 @@ async fn set_stamp(dataset: &mut Dataset, version: u32) -> Result<()> {
.map_err(|e| OmniError::Lance(e.to_string()))?;
Ok(())
}
/// Test-only: force the on-disk internal-schema stamp to `version`.
///
/// A thin, crate-visible wrapper that delegates straight to `set_stamp`. Used
/// to synthesize a pre-migration graph (rewinding to v3) and to simulate a
/// crash that lost the final stamp bump.
///
/// Gated on `test` OR `failpoints` so the fault-injection migration test (in
/// the `failpoints` integration binary, compiled without `cfg(test)`) can reach
/// it too.
///
/// # Errors
/// Returns whatever `set_stamp` returns for the underlying metadata write.
#[cfg(any(test, feature = "failpoints"))]
pub(crate) async fn set_stamp_for_test(dataset: &mut Dataset, version: u32) -> Result<()> {
set_stamp(dataset, version).await
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The supported-stamp guard admits every stamp in the closed range
    /// `[MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION, INTERNAL_MANIFEST_SCHEMA_VERSION]`
    /// and rejects the values immediately outside it: one below the floor (only
    /// checked when the floor is above zero, so the subtraction cannot
    /// underflow) and one above the current version.
    #[test]
    fn unsupported_guard_accepts_exactly_the_supported_range() {
        let floor = MIN_SUPPORTED_INTERNAL_SCHEMA_VERSION;
        let ceiling = INTERNAL_MANIFEST_SCHEMA_VERSION;

        // Everything the binary can serve must pass.
        for stamp in floor..=ceiling {
            let verdict = refuse_if_stamp_unsupported(stamp);
            assert!(
                verdict.is_ok(),
                "stamp v{stamp} is within [MIN, CURRENT] and must be accepted"
            );
        }

        // Just below the floor.
        if floor > 0 {
            let below_floor = floor - 1;
            assert!(
                refuse_if_stamp_unsupported(below_floor).is_err(),
                "a sub-floor stamp must be refused"
            );
        }

        // Just above the current version.
        let above_ceiling = ceiling + 1;
        assert!(
            refuse_if_stamp_unsupported(above_ceiling).is_err(),
            "a future stamp must be refused"
        );
    }
}

View file

@ -35,8 +35,8 @@ use super::layout::{open_manifest_dataset, tombstone_object_id, version_object_i
use super::metadata::parse_namespace_version_request;
use super::migrations::migrate_internal_schema;
use super::state::{
manifest_rows_batch, manifest_schema, read_manifest_entries, read_registered_table_locations,
read_tombstone_versions,
GraphLineageRow, GraphLineageRowPart, graph_lineage_row_parts, head_lineage_row,
manifest_rows_batch, manifest_schema, read_publish_scan,
};
use super::{
ManifestChange, OBJECT_TYPE_TABLE, OBJECT_TYPE_TABLE_TOMBSTONE, OBJECT_TYPE_TABLE_VERSION,
@ -50,13 +50,48 @@ use super::{
/// iteration re-runs `load_publish_state` and the expected-version pre-check.
const PUBLISHER_RETRY_BUDGET: u32 = 5;
/// The graph-lineage commit to record atomically with a manifest publish
/// (RFC-013 Phase 7).
///
/// One logical commit per publish: the `graph_commit_id` is minted once by the
/// caller and stays stable across the publisher's CAS retries. Only the parent
/// re-resolves per attempt (against the freshly loaded `__manifest`), so a
/// retry after a concurrent commit parents off the new head — the TOCTOU the
/// dual-write era's `commit_graph.refresh()` guarded is closed by construction.
///
/// Note that the parent is deliberately NOT a field here: it is resolved by the
/// publisher on each attempt and reported back to the caller in the publish
/// result.
#[derive(Debug, Clone)]
pub(crate) struct LineageIntent {
/// ULID minted once before the publish loop; the graph commit's identity.
pub graph_commit_id: String,
/// The branch this commit lands on (`None` = main). Selects the
/// `graph_head:<branch>` pointer row that gets updated.
pub branch: Option<String>,
/// Authoring actor, or `None` for unauthored / system writes.
pub actor_id: Option<String>,
/// The merged-in source head — `Some` only for a branch-merge commit.
pub merged_parent_commit_id: Option<String>,
/// Commit timestamp (microseconds since the UNIX epoch).
pub created_at: i64,
}
/// The result of a manifest publish that may have folded in a graph commit.
///
/// Returned by the publisher in place of a bare `Dataset` so the caller gets
/// both the advanced manifest handle and the lineage parent that was resolved
/// for the attempt that actually committed.
#[derive(Debug)]
pub(super) struct PublishOutcome {
/// The advanced `__manifest` dataset (its version is the published version).
/// When the publish had nothing to write, this is the dataset as opened /
/// loaded, not a newly committed version.
pub dataset: Dataset,
/// The parent the publisher resolved for the recorded commit, if a
/// [`LineageIntent`] was supplied. Returned so the caller can update its
/// in-memory commit cache without a re-read. `None` when no lineage was
/// recorded, or when the commit is the genesis (no parent).
pub parent_commit_id: Option<String>,
}
#[async_trait]
pub(super) trait ManifestBatchPublisher: Send + Sync {
async fn publish(
&self,
changes: &[ManifestChange],
expected_table_versions: &HashMap<String, u64>,
) -> Result<Dataset>;
lineage: Option<&LineageIntent>,
) -> Result<PublishOutcome>;
}
pub(super) struct GraphNamespacePublisher {
@ -76,6 +111,19 @@ struct PendingVersionRow {
row_count: Option<u64>,
}
/// Everything one CAS attempt needs out of a single `__manifest` scan
/// (RFC-013 P2): the open dataset, table state for the pre-check + pending-row
/// build, and the `graph_commit` lineage rows for parent resolution. Folding the
/// lineage into this struct is what lets `resolve_lineage_rows` skip its own
/// `read_graph_lineage` scan.
///
/// A fresh value is produced by every `load_publish_state` call, i.e. once per
/// CAS attempt, so no field outlives the attempt it was loaded for.
struct LoadedPublishState {
/// The opened (and, if needed, migrated) `__manifest` dataset this attempt
/// commits against.
dataset: Dataset,
/// The scan's table locations; consumed as the "known tables" input when
/// building pending rows.
registered_tables: HashMap<String, String>,
/// Version entries found by the scan, keyed by a `(String, u64)` pair —
/// presumably `(table key, table version)`; confirm against the map
/// construction in `load_publish_state`.
existing_versions: HashMap<(String, u64), SubTableEntry>,
/// Tombstones found by the scan. The value type is `()`, so this map is
/// used purely as a set of `(String, u64)` keys.
existing_tombstones: HashMap<(String, u64), ()>,
/// The `graph_commit` lineage rows parsed by the same scan; the input from
/// which the new commit's parent is resolved.
lineage_rows: Vec<GraphLineageRow>,
}
impl GraphNamespacePublisher {
pub(super) fn new(root_uri: &str, branch: Option<&str>) -> Self {
Self {
@ -90,22 +138,31 @@ impl GraphNamespacePublisher {
open_manifest_dataset(&self.root_uri, self.branch.as_deref()).await
}
async fn load_publish_state(
&self,
) -> Result<(
Dataset,
HashMap<String, String>,
HashMap<(String, u64), SubTableEntry>,
HashMap<(String, u64), ()>,
)> {
async fn load_publish_state(&self) -> Result<LoadedPublishState> {
// Test seam: inject a retryable contention here to exercise the outer
// retry loop's re-run-on-retryable-load-error path (no-op without the
// `failpoints` feature). The migration surfaces the same typed error.
crate::failpoints::maybe_fail_retryable_contention(
crate::failpoints::names::PUBLISH_LOAD_STATE_RETRYABLE_CONTENTION,
)?;
let mut dataset = self.dataset().await?;
// Run pending internal-schema migrations exactly once per publish on
// the open-for-write path; idempotent when the on-disk stamp already
// matches this binary. See `db/manifest/migrations.rs`.
migrate_internal_schema(&mut dataset).await?;
let registered_tables = read_registered_table_locations(&dataset).await?;
let existing_entries = read_manifest_entries(&dataset).await?;
let existing_versions = existing_entries
// matches this binary. Pass this publisher's branch so the v3→v4 lineage
// backfill reads `_graph_commits.lance` at the SAME branch it is
// publishing to (each branch backfills on its first write). See
// `db/manifest/migrations.rs`.
migrate_internal_schema(&mut dataset, &self.root_uri, self.branch.as_deref()).await?;
// ONE `__manifest` scan for everything the publish needs: table
// locations, version entries, tombstones, AND the `graph_commit` lineage
// rows for parent resolution (RFC-013 P2). The lineage extraction rides
// this pass instead of a second `read_graph_lineage` scan in
// `resolve_lineage_rows`; the per-attempt re-read is preserved because
// `load_publish_state` runs once per CAS attempt, so a retry sees the
// advanced head and re-parents correctly.
let scan = read_publish_scan(&dataset).await?;
let existing_versions = scan
.version_entries
.iter()
.map(|entry| {
(
@ -114,13 +171,14 @@ impl GraphNamespacePublisher {
)
})
.collect();
let existing_tombstones = read_tombstone_versions(&dataset).await?;
Ok((
let existing_tombstones = scan.tombstones.into_iter().collect();
Ok(LoadedPublishState {
dataset,
registered_tables,
registered_tables: scan.table_locations,
existing_versions,
existing_tombstones,
))
lineage_rows: scan.lineage_rows,
})
}
fn build_pending_rows(
@ -266,6 +324,50 @@ impl GraphNamespacePublisher {
Ok(rows)
}
/// Resolve the parent for `intent` against the just-loaded `dataset` and
/// build the two lineage rows (`graph_commit` + `graph_head:<branch>`) to
/// fold into the publish batch. Runs INSIDE the CAS retry loop, so the
/// parent is read from the manifest state this attempt will commit against —
/// a retry after a concurrent commit re-reads the advanced head and parents
/// correctly (TOCTOU closed). `new_manifest_version` is the version this
/// publish produces (the recorded commit pins it).
///
/// The parent is the current head of the branch's lineage — the
/// `should_replace_head` winner over the visible `graph_commit` rows, the
/// same selection the commit-graph cache uses. (The denormalized
/// `graph_head:<branch>` row is written for forward-compat but is not the
/// parent source here: a branch freshly forked from main inherits main's
/// commits but not yet a `graph_head:<its-name>` row, and the head-over-rows
/// computation gives the correct fork-point parent in that case.)
///
/// `lineage_rows` is the `graph_commit` set this attempt already parsed in
/// `load_publish_state`'s single scan (RFC-013 P2) — NOT a fresh
/// `read_graph_lineage` scan. The per-attempt re-read is still preserved: the
/// retry loop re-runs `load_publish_state`, so each attempt's `lineage_rows`
/// reflects the head as it stands for that attempt.
fn resolve_lineage_rows(
lineage_rows: &[GraphLineageRow],
intent: &LineageIntent,
new_manifest_version: u64,
) -> Result<(Vec<PendingVersionRow>, Option<String>)> {
let parent_commit_id = head_lineage_row(lineage_rows).map(|h| h.graph_commit_id.clone());
let commit = GraphLineageRow {
graph_commit_id: intent.graph_commit_id.clone(),
manifest_branch: intent.branch.clone(),
manifest_version: new_manifest_version,
parent_commit_id: parent_commit_id.clone(),
merged_parent_commit_id: intent.merged_parent_commit_id.clone(),
actor_id: intent.actor_id.clone(),
created_at: intent.created_at,
};
let parts = graph_lineage_row_parts(&commit, intent.branch.as_deref())?;
Ok((
parts.into_iter().map(lineage_part_to_pending).collect(),
parent_commit_id,
))
}
fn pending_rows_to_batch(rows: Vec<PendingVersionRow>) -> Result<arrow_array::RecordBatch> {
let mut object_ids = Vec::with_capacity(rows.len());
let mut object_types = Vec::with_capacity(rows.len());
@ -420,7 +522,25 @@ impl GraphNamespacePublisher {
}))
})
.collect::<Result<Vec<_>>>()?;
self.publish(&changes, &HashMap::new()).await
Ok(self.publish(&changes, &HashMap::new(), None).await?.dataset)
}
}
/// Map a `state::GraphLineageRowPart` onto a `PendingVersionRow` so a graph
/// commit's two lineage rows ride the same publish batch as the table-version
/// rows (RFC-013 Phase 7). Lineage rows carry no table identity: `table_key` is
/// the empty string (never matched by a real key) and `location`/`row_count`
/// are null.
fn lineage_part_to_pending(part: GraphLineageRowPart) -> PendingVersionRow {
PendingVersionRow {
object_id: part.object_id,
object_type: part.object_type.to_string(),
location: None,
metadata: Some(part.metadata),
table_key: String::new(),
table_version: part.table_version,
table_branch: part.table_branch,
row_count: None,
}
}
@ -429,7 +549,17 @@ impl GraphNamespacePublisher {
/// merge-insert join key, annotated as an unenforced primary key on
/// `__manifest`). Translate it to a typed manifest conflict so callers can
/// match without parsing strings; everything else is opaque storage.
fn map_lance_publish_error(err: LanceError) -> OmniError {
///
/// Shared (`pub(crate)`) with the v3→v4 lineage backfill
/// (`state::merge_lineage_rows`), which issues its own `__manifest` merge-insert
/// outside the publisher and must surface the SAME typed
/// `RowLevelCasContention` so the migration's re-open retry loop can recognize a
/// CAS loss. This is the merge-insert (`execute_reader`) conflict vocabulary
/// only. It is deliberately NOT `optimize::is_retryable_lance_conflict`: that one
/// also matches `CommitConflict`/`RetryableCommitConflict` from the COMPACTION
/// commit path (`compact_files` -> `apply_commit`), which a row-level merge-insert
/// never emits — folding it in here would match impossible variants.
pub(crate) fn map_lance_publish_error(err: LanceError) -> OmniError {
if matches!(err, LanceError::TooMuchWriteContention { .. }) {
return OmniError::manifest_row_level_cas_contention(format!(
"manifest publish lost a row-level CAS race: {}",
@ -445,14 +575,40 @@ impl ManifestBatchPublisher for GraphNamespacePublisher {
&self,
changes: &[ManifestChange],
expected_table_versions: &HashMap<String, u64>,
) -> Result<Dataset> {
if changes.is_empty() && expected_table_versions.is_empty() {
return self.dataset().await;
lineage: Option<&LineageIntent>,
) -> Result<PublishOutcome> {
if changes.is_empty() && expected_table_versions.is_empty() && lineage.is_none() {
return Ok(PublishOutcome {
dataset: self.dataset().await?,
parent_commit_id: None,
});
}
for attempt in 0..=PUBLISHER_RETRY_BUDGET {
let (dataset, known_tables, existing_versions, existing_tombstones) =
self.load_publish_state().await?;
// `load_publish_state` runs the v3→v4 migration (`migrate_internal_schema`)
// on its first scan. The migration's bounded merge/stamp retries surface a
// retryable `RowLevelCasContention` on exhaustion EXPECTING this outer loop
// to re-run them — a re-run re-reads the manifest, by which point a
// concurrent winner has usually completed the migration (next scan is a
// no-op). Route a retryable load error through the SAME retry path as a
// retryable `merge_rows` conflict below, so that typed contention actually
// composes with the publisher retry instead of aborting the publish.
let loaded = match self.load_publish_state().await {
Ok(loaded) => loaded,
Err(err)
if attempt < PUBLISHER_RETRY_BUDGET && is_retryable_publish_conflict(&err) =>
{
continue;
}
Err(err) => return Err(err),
};
let LoadedPublishState {
dataset,
registered_tables: known_tables,
existing_versions,
existing_tombstones,
lineage_rows,
} = loaded;
let latest_per_table =
Self::latest_visible_per_table(&existing_versions, &existing_tombstones);
@ -461,19 +617,48 @@ impl ManifestBatchPublisher for GraphNamespacePublisher {
// surfaced as `ExpectedVersionMismatch` rather than retried.
Self::check_expected_table_versions(&latest_per_table, expected_table_versions)?;
if changes.is_empty() {
return Ok(dataset);
}
let rows = Self::build_pending_rows(
let mut rows = Self::build_pending_rows(
changes,
&known_tables,
&existing_versions,
&existing_tombstones,
)?;
// Fold the graph commit into the SAME batch so table-version rows
// and lineage rows land in one merge-insert (one Lance commit, one
// manifest version) — no separate write, no manifest→commit-graph
// atomicity gap. The merge-insert advances exactly one version on
// top of the loaded dataset, so the commit pins
// `current + 1`. The parent is resolved here, per attempt, from the
// lineage rows THIS attempt's scan loaded (TOCTOU closed on a CAS
// retry — a retry re-runs `load_publish_state` → fresh lineage).
let parent_commit_id = match lineage {
Some(intent) => {
let new_manifest_version = dataset.version().version + 1;
let (commit_rows, parent) =
Self::resolve_lineage_rows(&lineage_rows, intent, new_manifest_version)?;
rows.extend(commit_rows);
parent
}
None => None,
};
if rows.is_empty() {
// Expected-version-only publish with no changes and no lineage:
// the precondition held, nothing to write.
return Ok(PublishOutcome {
dataset,
parent_commit_id,
});
}
match self.merge_rows(dataset, rows).await {
Ok(new_dataset) => return Ok(new_dataset),
Ok(new_dataset) => {
return Ok(PublishOutcome {
dataset: new_dataset,
parent_commit_id,
});
}
Err(err) => {
if attempt < PUBLISHER_RETRY_BUDGET && is_retryable_publish_conflict(&err) {
continue;
@ -497,7 +682,12 @@ impl ManifestBatchPublisher for GraphNamespacePublisher {
/// contention; if the caller's `expected_table_versions` still holds against
/// the new manifest state, we re-attempt. Other conflict variants (notably
/// `ExpectedVersionMismatch`) propagate so the caller learns immediately.
fn is_retryable_publish_conflict(err: &OmniError) -> bool {
///
/// Shared (`pub(crate)`) with the v3→v4 lineage backfill's re-open retry loop
/// (`migrations::migrate_v3_to_v4`), so the migration's retry decision matches the
/// publisher's by construction — both retry exactly `RowLevelCasContention` and
/// propagate everything else.
pub(crate) fn is_retryable_publish_conflict(err: &OmniError) -> bool {
matches!(
err,
OmniError::Manifest(m)

View file

@ -40,17 +40,14 @@ use lance::Dataset;
use serde::{Deserialize, Serialize};
use tracing::warn;
use crate::db::commit_graph::CommitGraph;
use crate::db::graph_coordinator::GraphCoordinator;
use crate::db::recovery_audit::{
RecoveryAudit, RecoveryAuditRecord, RecoveryKind, TableOutcome, now_micros,
};
use crate::db::recovery_audit::{RecoveryAudit, RecoveryAuditRecord, RecoveryKind, TableOutcome};
use crate::db::schema_state::SchemaStateRecovery;
use crate::error::{OmniError, Result};
use crate::storage::StorageAdapter;
use super::Snapshot;
use super::publisher::{GraphNamespacePublisher, ManifestBatchPublisher};
use super::publisher::{GraphNamespacePublisher, LineageIntent, ManifestBatchPublisher};
use super::{ManifestChange, SubTableUpdate, TableRegistration, TableTombstone};
/// System actor identifier recorded on every recovery commit. Operators
@ -59,6 +56,44 @@ use super::{ManifestChange, SubTableUpdate, TableRegistration, TableTombstone};
/// into the audit row's `recovery_for_actor` field.
pub(crate) const RECOVERY_ACTOR: &str = "omnigraph:recovery";
/// Publish a recovery action's manifest `updates` together with its recovery
/// commit in a single CAS (RFC-013 Phase 7).
///
/// The recovery commit's lineage rows (`graph_commit` + `graph_head`) ride the
/// same merge-insert as the table-version re-pin, so there is no separate
/// `_graph_commits.lance` write and no manifest→commit-graph gap. `updates` may
/// be empty — the no-table-change recovery paths (all-NoMovement roll-back,
/// stale-sidecar cleanup, orphaned-branch discard) — and the lineage rows still
/// publish, so the recovery commit is always durable.
///
/// Parents: the first parent is resolved by the publisher (the live head of the
/// recovery's branch). The merged-in parent is the sidecar's recorded source
/// head, but only for a rolled-forward branch merge — matching the pre-Phase-7
/// merge-commit shape; every other combination records none.
///
/// The commit is attributed to `RECOVERY_ACTOR` and lands on the sidecar's
/// branch.
///
/// # Returns
/// `(new manifest version, minted recovery commit id)`; the audit row
/// references the commit id.
///
/// # Errors
/// Propagates a clock failure from `now_micros` and any publisher error.
async fn publish_recovery_commit(
    root_uri: &str,
    sidecar: &RecoverySidecar,
    kind: RecoveryKind,
    updates: &[ManifestChange],
    expected: &HashMap<String, u64>,
) -> Result<(u64, String)> {
    let is_rolled_forward_merge = matches!(
        (sidecar.writer_kind, kind),
        (SidecarKind::BranchMerge, RecoveryKind::RolledForward)
    );
    let merged_parent_commit_id = if is_rolled_forward_merge {
        sidecar.merge_source_commit_id.clone()
    } else {
        None
    };

    // Mint the commit identity first, then read the clock.
    let graph_commit_id = ulid::Ulid::new().to_string();
    let created_at = crate::db::now_micros()?;
    let intent = LineageIntent {
        graph_commit_id,
        branch: sidecar.branch.clone(),
        actor_id: Some(RECOVERY_ACTOR.to_string()),
        merged_parent_commit_id,
        created_at,
    };

    let outcome = GraphNamespacePublisher::new(root_uri, sidecar.branch.as_deref())
        .publish(updates, expected, Some(&intent))
        .await?;
    let published_version = outcome.dataset.version().version;
    Ok((published_version, intent.graph_commit_id))
}
/// Subdirectory under the graph root holding sidecar files.
pub(crate) const RECOVERY_DIR_NAME: &str = "__recovery";
@ -416,7 +451,7 @@ pub(crate) async fn write_sidecar(
) -> Result<RecoverySidecarHandle> {
// Failpoint: models a storage put failure (S3 PutObject / fs write)
// in Phase A — every writer must abort before any HEAD advance.
crate::failpoints::maybe_fail("recovery.sidecar_write")?;
crate::failpoints::maybe_fail(crate::failpoints::names::RECOVERY_SIDECAR_WRITE)?;
debug_assert_eq!(sidecar.schema_version, SIDECAR_SCHEMA_VERSION);
let uri = sidecar_uri(root_uri, &sidecar.operation_id);
let json = serde_json::to_string_pretty(sidecar).map_err(|err| {
@ -457,7 +492,7 @@ pub(crate) async fn confirm_sidecar_phase_b(
) -> Result<()> {
// Failpoint: models a storage failure on the confirmation write — the
// pre-confirm sidecar stays on disk, so recovery rolls the operation back.
crate::failpoints::maybe_fail("recovery.sidecar_confirm")?;
crate::failpoints::maybe_fail(crate::failpoints::names::RECOVERY_SIDECAR_CONFIRM)?;
for pin in &mut sidecar.tables {
// Every pinned table MUST have an achieved version. A miss means the
// pin set and the publish `updates` diverged — fail loudly at the
@ -489,7 +524,7 @@ pub(crate) async fn delete_sidecar(
// Failpoint: models a storage delete failure (S3 DeleteObject) in
// Phase D — callers swallow it (the write already published) and the
// stale sidecar is healed by the next write or open.
crate::failpoints::maybe_fail("recovery.sidecar_delete")?;
crate::failpoints::maybe_fail(crate::failpoints::names::RECOVERY_SIDECAR_DELETE)?;
storage.delete(&handle.sidecar_uri).await
}
@ -507,7 +542,7 @@ pub(crate) async fn list_sidecars(
// Failpoint: models a storage list failure (S3 ListObjectsV2) — every
// consumer (open-time sweep, write-entry heal) must fail loudly
// rather than silently skipping recovery.
crate::failpoints::maybe_fail("recovery.sidecar_list")?;
crate::failpoints::maybe_fail(crate::failpoints::names::RECOVERY_SIDECAR_LIST)?;
let dir = recovery_dir_uri(root_uri);
let mut uris = storage.list_dir(&dir).await?;
// Sort by URI so the sweep processes sidecars deterministically.
@ -831,20 +866,13 @@ pub(crate) async fn heal_pending_sidecars_roll_forward(
// authority) BEFORE opening: a deferred sidecar whose
// branch was deleted would otherwise wedge every write
// on the dead-branch open.
let (branch_exists, main_version) = {
let branch_exists = {
let mut coord = coordinator.write().await;
coord.refresh().await?;
let exists = coord.all_branches().await?.iter().any(|name| name == b);
(exists, coord.snapshot().version())
coord.all_branches().await?.iter().any(|name| name == b)
};
if !branch_exists {
discard_orphaned_branch_sidecar(
root_uri,
storage.as_ref(),
&sidecar,
main_version,
)
.await?;
discard_orphaned_branch_sidecar(root_uri, storage.as_ref(), &sidecar).await?;
processed_any = true;
continue;
}
@ -862,7 +890,7 @@ pub(crate) async fn heal_pending_sidecars_roll_forward(
};
if process_sidecar(
root_uri,
storage.as_ref(),
&storage,
&branch_snapshot,
&sidecar,
RecoveryMode::RollForwardOnly,
@ -893,7 +921,6 @@ async fn discard_orphaned_branch_sidecar(
root_uri: &str,
storage: &dyn StorageAdapter,
sidecar: &RecoverySidecar,
manifest_version: u64,
) -> Result<()> {
warn!(
operation_id = sidecar.operation_id.as_str(),
@ -922,22 +949,31 @@ async fn discard_orphaned_branch_sidecar(
&& record.recovery_kind == RecoveryKind::OrphanedBranchDiscarded
});
if !already_recorded {
let mut graph = CommitGraph::open(root_uri).await?;
let graph_commit_id = graph
.append_commit(None, manifest_version, Some(RECOVERY_ACTOR))
.await?;
// Failpoint: the residual window above — commit appended, audit
// The orphan-discard commit is recorded on MAIN (the sidecar's own
// branch is gone), via a lineage-only publish into `__manifest` (RFC-013
// Phase 7) — no `_graph_commits.lance` row. The publisher stamps the
// commit at the version it produces.
let intent = LineageIntent {
graph_commit_id: ulid::Ulid::new().to_string(),
branch: None,
actor_id: Some(RECOVERY_ACTOR.to_string()),
merged_parent_commit_id: None,
created_at: crate::db::now_micros()?,
};
let publisher = GraphNamespacePublisher::new(root_uri, None);
publisher.publish(&[], &HashMap::new(), Some(&intent)).await?;
// Failpoint: the residual window above — commit published, audit
// not yet durable.
crate::failpoints::maybe_fail("recovery.orphan_discard_audit_append")?;
crate::failpoints::maybe_fail(crate::failpoints::names::RECOVERY_ORPHAN_DISCARD_AUDIT_APPEND)?;
audit
.append(RecoveryAuditRecord {
graph_commit_id,
graph_commit_id: intent.graph_commit_id,
recovery_kind: RecoveryKind::OrphanedBranchDiscarded,
recovery_for_actor: sidecar.actor_id.clone(),
operation_id: sidecar.operation_id.clone(),
sidecar_writer_kind: format!("{:?}", sidecar.writer_kind),
per_table_outcomes: Vec::new(),
created_at: now_micros()?,
created_at: crate::db::now_micros()?,
})
.await?;
}
@ -1014,13 +1050,7 @@ pub(crate) async fn recover_manifest_drift(
.iter()
.any(|name| name == b)
{
discard_orphaned_branch_sidecar(
root_uri,
storage.as_ref(),
&sidecar,
coordinator.snapshot().version(),
)
.await?;
discard_orphaned_branch_sidecar(root_uri, storage.as_ref(), &sidecar).await?;
continue;
}
let mut branch_coord =
@ -1036,7 +1066,7 @@ pub(crate) async fn recover_manifest_drift(
};
process_sidecar(
root_uri,
storage.as_ref(),
&storage,
&branch_snapshot,
&sidecar,
mode,
@ -1051,7 +1081,7 @@ pub(crate) async fn recover_manifest_drift(
async fn process_sidecar(
root_uri: &str,
storage: &dyn StorageAdapter,
storage: &std::sync::Arc<dyn StorageAdapter>,
snapshot: &Snapshot,
sidecar: &RecoverySidecar,
mode: RecoveryMode,
@ -1154,7 +1184,7 @@ async fn process_sidecar(
);
}
return record_audit_recovery_rollforward(
root_uri, storage, snapshot, sidecar, &states,
root_uri, storage.as_ref(), sidecar, &states,
)
.await
.map(|()| true);
@ -1176,7 +1206,7 @@ async fn process_sidecar(
writer_kind = ?sidecar.writer_kind,
"recovery: rolling back sidecar (mixed or unexpected state)"
);
roll_back_sidecar(root_uri, storage, snapshot, sidecar, &states)
roll_back_sidecar(root_uri, storage.as_ref(), sidecar, &states)
.await
.map(|()| true)
}
@ -1191,7 +1221,7 @@ async fn process_sidecar(
"recovery: rolling back SchemaApply sidecar because schema staging \
files were not promoted in this recovery pass"
);
roll_back_sidecar(root_uri, storage, snapshot, sidecar, &states)
roll_back_sidecar(root_uri, storage.as_ref(), sidecar, &states)
.await
.map(|()| true)
}
@ -1211,8 +1241,36 @@ async fn process_sidecar(
"recovery: rolling forward sidecar (Phase B completed; \
Phase C did not land)"
);
let (new_manifest_version, published_versions) =
roll_forward_all(root_uri, sidecar, &states, snapshot).await?;
// TOCTOU window: between `classify_table` (which read the manifest
// pin) and the publish CAS below, a concurrent live writer can
// advance the manifest past our expected version. The failpoint
// lets a test force that interleave deterministically.
crate::failpoints::maybe_fail(
crate::failpoints::names::RECOVERY_BEFORE_ROLL_FORWARD_PUBLISH,
)?;
// RFC-013 Phase 7: `roll_forward_all` folds the recovery commit into the
// manifest publish CAS, so it also returns the minted `graph_commit_id`
// for the audit row below.
let (new_manifest_version, published_versions, graph_commit_id) =
match roll_forward_all(root_uri, sidecar, &states, snapshot).await {
Ok(published) => published,
// Convergence-idempotent (invariants 7 & 15): a roll-forward's
// postcondition is "the manifest reflects the sidecar's committed
// Lance state", NOT "this sweep personally won the CAS". A
// concurrent writer that advanced the manifest to/past that goal
// during the classify→publish window is convergence, not a logical
// conflict — so re-read and either record the already-achieved
// roll-forward or defer to the next pass; never fail the open.
// Any other error still propagates.
Err(err) if is_expected_version_mismatch(&err) => {
return converge_or_defer_roll_forward(
root_uri, storage, sidecar, &states, err,
)
.await;
}
Err(err) => return Err(err),
};
let _ = new_manifest_version;
// `to_version` records the ACTUAL Lance HEAD published for
// each table (not pin.post_commit_pin, which is a lower bound
// for loose-match writers like SchemaApply / EnsureIndices /
@ -1242,17 +1300,214 @@ async fn process_sidecar(
record_audit(
root_uri,
sidecar,
new_manifest_version,
graph_commit_id,
RecoveryKind::RolledForward,
outcomes,
)
.await?;
delete_sidecar_by_operation_id(root_uri, storage, &sidecar.operation_id).await?;
delete_sidecar_by_operation_id(root_uri, storage.as_ref(), &sidecar.operation_id)
.await?;
Ok(true)
}
}
}
/// Reports whether `err` is the publisher's per-table CAS precondition failure:
/// an `OmniError::Manifest` whose `details` are
/// `ManifestConflictDetails::ExpectedVersionMismatch`.
///
/// That error is the signal that a concurrent writer advanced the manifest past
/// what this caller expected. Every other error — including manifest errors
/// with different or absent details — yields `false`.
fn is_expected_version_mismatch(err: &OmniError) -> bool {
    let OmniError::Manifest(conflict) = err else {
        return false;
    };
    matches!(
        conflict.details,
        Some(crate::error::ManifestConflictDetails::ExpectedVersionMismatch { .. })
    )
}
/// Decide whether the live manifest (`snapshot`) already reflects everything
/// this sidecar set out to publish.
///
/// Three conditions must all hold:
/// 1. each pinned table's current manifest version (0 if the table has no
///    entry) is at least the Lance head observed for it in `states` — pins and
///    states are paired positionally;
/// 2. every additional registration is present in the snapshot;
/// 3. every tombstoned table is absent from the snapshot.
///
/// SOUNDNESS: condition 1 tests `current_version >= observed lance_head`, which
/// is only a *proxy* for "the sidecar's committed Lance commit is an ancestor of
/// the published HEAD" (a higher version being a descendant that contains it).
/// The proxy holds solely because of the heal-first invariant: every writer
/// able to advance a table's manifest version first heals pending sidecars
/// (`heal_pending_recovery_sidecars` runs at the head of `load`/`mutate`/
/// schema-apply/branch-merge) or refuses on an unrecovered graph (`optimize`).
/// The only route past `expected_version` therefore first publishes THIS
/// sidecar's commit at `lance_head`, and version ordering then implies lineage
/// containment. A future writer that advances a pinned table WITHOUT healing
/// first (e.g. a non-heal-first `Overwrite` that replaces rows) would void the
/// proxy and would have to be re-validated by row-id lineage instead of
/// version ordering.
fn sidecar_intent_satisfied(
    snapshot: &Snapshot,
    sidecar: &RecoverySidecar,
    states: &[ClassifiedTable],
) -> bool {
    // The three checks are chained with `&&` so evaluation stops at the first
    // unmet condition, in pin → registration → tombstone order.
    sidecar.tables.iter().zip(states).all(|(pin, state)| {
        let published = snapshot
            .entry(&pin.table_key)
            .map_or(0, |entry| entry.table_version);
        published >= state.lance_head
    }) && sidecar
        .additional_registrations
        .iter()
        .all(|reg| snapshot.entry(&reg.table_key).is_some())
        && sidecar
            .tombstones
            .iter()
            .all(|tomb| snapshot.entry(&tomb.table_key).is_none())
}
/// Opens a fresh coordinator for the sidecar's branch, refreshes it, and
/// returns the resulting manifest snapshot.
///
/// A sidecar with no branch, or with the branch name `"main"`, is opened via
/// `GraphCoordinator::open`; any other branch name goes through
/// `GraphCoordinator::open_branch`. Errors from opening or refreshing the
/// coordinator are propagated.
async fn fresh_snapshot_for_sidecar(
    root_uri: &str,
    storage: &std::sync::Arc<dyn StorageAdapter>,
    sidecar: &RecoverySidecar,
) -> Result<Snapshot> {
    let adapter = std::sync::Arc::clone(storage);
    // `Some("main")` is treated exactly like `None`.
    let named_branch = sidecar.branch.as_deref().filter(|name| *name != "main");
    let mut coordinator = if let Some(branch) = named_branch {
        GraphCoordinator::open_branch(root_uri, branch, adapter).await?
    } else {
        GraphCoordinator::open(root_uri, adapter).await?
    };
    coordinator.refresh().await?;
    let snapshot = coordinator.snapshot();
    Ok(snapshot)
}
/// Convergence-idempotent handling of a roll-forward publish CAS that lost to a
/// concurrent writer (`ExpectedVersionMismatch`). A roll-forward's postcondition
/// is "the manifest reflects the sidecar's committed Lance state", not "this
/// sweep won the CAS" (invariants 7 & 15). Re-read the live manifest:
///
/// - if it already reached the sidecar's goal, the work is done (just not by us)
///   — record the `RolledForward` audit and delete the sidecar idempotently;
/// - otherwise the manifest is progressing but not yet at the goal — leave the
///   sidecar for the next open / the live writer's own Phase D.
///
/// Either way the open does NOT fail. A genuine logical conflict (a table below
/// `expected_version`, i.e. data lost) is not satisfiable here and re-surfaces
/// loudly via the classifier's `InvariantViolation` on the next pass.
/// See iss-schema-apply-reopen-recovery-race.
///
/// # Parameters
/// - `states`: the classifier output, paired positionally with `sidecar.tables`.
/// - `conflict`: the CAS error that brought us here. It is only interpolated
///   into the "deferring" log line and is never returned.
///
/// # Returns
/// - `Ok(false)` — deferred: the fresh snapshot does not yet satisfy the
///   sidecar's intent, so the sidecar is left in place.
/// - `Ok(true)` — converged: either the sidecar file was already gone (nothing
///   is written), or this call appended the `RolledForward` audit row and
///   deleted the sidecar.
///
/// # Errors
/// Propagates failures from re-reading the snapshot, probing for the sidecar
/// file, loading the commit graph, appending the audit row, and deleting the
/// sidecar.
async fn converge_or_defer_roll_forward(
    root_uri: &str,
    storage: &std::sync::Arc<dyn StorageAdapter>,
    sidecar: &RecoverySidecar,
    states: &[ClassifiedTable],
    conflict: OmniError,
) -> Result<bool> {
    let fresh = fresh_snapshot_for_sidecar(root_uri, storage, sidecar).await?;
    if !sidecar_intent_satisfied(&fresh, sidecar, states) {
        warn!(
            operation_id = sidecar.operation_id.as_str(),
            writer_kind = ?sidecar.writer_kind,
            "recovery: roll-forward publish lost a CAS and the manifest has not \
            yet reached the sidecar's goal; deferring to the next pass \
            (conflict: {conflict})"
        );
        return Ok(false);
    }
    // The manifest already reached the sidecar's goal — some other actor
    // advanced it. Under the heal-first invariant, whoever advanced past
    // `expected_version` first healed THIS sidecar (recorded its RolledForward
    // audit and deleted it). So the audit row already exists; recording another
    // here would put two RolledForward rows in `_graph_commit_recoveries` for
    // one recovery event (visible in `commit list --filter actor=…recovery`).
    // Only finish the bookkeeping if the sidecar is still on disk (the winner
    // crashed between audit and delete); if it is already gone, the winner
    // completed it — return success WITHOUT a duplicate audit, keeping the
    // audit append-idempotent per operation_id across concurrent sweeps.
    let sidecar_path = sidecar_uri(root_uri, &sidecar.operation_id);
    if !storage.exists(&sidecar_path).await? {
        warn!(
            operation_id = sidecar.operation_id.as_str(),
            writer_kind = ?sidecar.writer_kind,
            "recovery: roll-forward publish lost a CAS; the winner already \
            converged and cleaned up this sidecar; nothing to do"
        );
        return Ok(true);
    }
    warn!(
        operation_id = sidecar.operation_id.as_str(),
        writer_kind = ?sidecar.writer_kind,
        "recovery: roll-forward publish lost a CAS to a concurrent writer that \
        already reached the goal; converging (RolledForward audit + delete)"
    );
    // One outcome per pinned table: `to_version` is what the fresh snapshot
    // actually publishes, falling back to the sidecar's pin when the table has
    // no entry in the snapshot.
    let mut outcomes: Vec<TableOutcome> = sidecar
        .tables
        .iter()
        .map(|pin| TableOutcome {
            table_key: pin.table_key.clone(),
            from_version: pin.expected_version,
            to_version: fresh
                .entry(&pin.table_key)
                .map(|e| e.table_version)
                .unwrap_or(pin.post_commit_pin),
        })
        .collect();
    // Mirror the normal roll-forward audit shape: SchemaApply sidecars also
    // register added tables, so the audit must list them too (else a converge
    // audit row is incomplete vs the `roll_forward_all` path for the same
    // recovery kind).
    for reg in &sidecar.additional_registrations {
        outcomes.push(TableOutcome {
            table_key: reg.table_key.clone(),
            from_version: 0,
            to_version: fresh
                .entry(&reg.table_key)
                .map(|e| e.table_version)
                .unwrap_or(0),
        });
    }
    // RFC-013 Phase 7: the winning writer folded its recovery commit into the
    // manifest CAS, so the converge audit references THAT commit. We lost the CAS
    // and never minted it, but a recovery commit is distinguishable by its
    // `RECOVERY_ACTOR` authorship (`publish_recovery_commit`), so the latest
    // recovery-actored commit on this branch IS it. Do NOT use the branch head:
    // a concurrent USER write can advance `graph_head` past the recovery commit
    // between the winner's publish and this read, which would attribute the audit
    // row to the wrong (later, user) commit. (We only reach here with the sidecar
    // still on disk: the winner advanced the manifest but crashed before its own
    // audit+delete, so we finish its bookkeeping.)
    let cache = match sidecar.branch.as_deref() {
        Some(branch) => {
            crate::db::commit_graph::CommitGraph::open_at_branch(root_uri, branch).await?
        }
        None => crate::db::commit_graph::CommitGraph::open(root_uri).await?,
    };
    // `rfind` walks the loaded commits from the back, so this selects the last
    // recovery-actored commit in `load_commits` order.
    let converged_commit_id = match cache
        .load_commits()
        .await?
        .into_iter()
        .rfind(|c| c.actor_id.as_deref() == Some(RECOVERY_ACTOR))
    {
        Some(recovery_commit) => recovery_commit.graph_commit_id,
        // No recovery commit visible — unexpected on this path (the winner just
        // published one); fall back to the head (or the default id when the
        // graph reports no head at all).
        None => cache.head_commit_id().await?.unwrap_or_default(),
    };
    record_audit(
        root_uri,
        sidecar,
        converged_commit_id,
        RecoveryKind::RolledForward,
        outcomes,
    )
    .await?;
    delete_sidecar_by_operation_id(root_uri, storage.as_ref(), &sidecar.operation_id).await?;
    Ok(true)
}
#[derive(Debug, Clone, Copy)]
struct ClassifiedTable {
classification: TableClassification,
@ -1268,7 +1523,6 @@ struct ClassifiedTable {
async fn roll_back_sidecar(
root_uri: &str,
storage: &dyn StorageAdapter,
snapshot: &Snapshot,
sidecar: &RecoverySidecar,
states: &[ClassifiedTable],
) -> Result<()> {
@ -1328,23 +1582,18 @@ async fn roll_back_sidecar(
});
}
}
// Publish the restored HEADs so manifest == HEAD. A degenerate all-NoMovement
// roll-back restores nothing — there's nothing to publish, and the audit
// records the unchanged snapshot version.
let manifest_version = if updates.is_empty() {
snapshot.version()
} else {
let publisher = GraphNamespacePublisher::new(root_uri, sidecar.branch.as_deref());
publisher
.publish(&updates, &expected)
.await?
.version()
.version
};
// Publish the restored HEADs so manifest == HEAD AND record the recovery
// commit in the same CAS (RFC-013 Phase 7). A degenerate all-NoMovement
// roll-back restores no table — `updates` is empty — but the recovery commit
// lineage still publishes (a lineage-only merge), so the rollback is recorded
// in the commit history just like a roll-forward.
let (_manifest_version, graph_commit_id) =
publish_recovery_commit(root_uri, sidecar, RecoveryKind::RolledBack, &updates, &expected)
.await?;
record_audit(
root_uri,
sidecar,
manifest_version,
graph_commit_id,
RecoveryKind::RolledBack,
outcomes,
)
@ -1370,7 +1619,6 @@ async fn roll_back_sidecar(
async fn record_audit_recovery_rollforward(
root_uri: &str,
storage: &dyn StorageAdapter,
snapshot: &Snapshot,
sidecar: &RecoverySidecar,
states: &[ClassifiedTable],
) -> Result<()> {
@ -1384,10 +1632,22 @@ async fn record_audit_recovery_rollforward(
to_version: state.manifest_pinned,
})
.collect();
// The substrate is already in the post-roll-forward state (the prior pass's
// table re-pin landed), so there are no table `updates` — but a recovery
// commit is still recorded for this cleanup pass via a lineage-only publish
// (RFC-013 Phase 7), which the audit row references.
let (_manifest_version, graph_commit_id) = publish_recovery_commit(
root_uri,
sidecar,
RecoveryKind::RolledForward,
&[],
&HashMap::new(),
)
.await?;
record_audit(
root_uri,
sidecar,
snapshot.version(),
graph_commit_id,
RecoveryKind::RolledForward,
outcomes,
)
@ -1407,17 +1667,19 @@ async fn record_audit_recovery_rollforward(
/// contention; persistent contention surfaces the typed conflict error to
/// the recovery sweep, which leaves the sidecar in place for the next
/// open's retry.
/// Returns `(new_manifest_version, per_table_published_versions)`. The
/// per-table map is what the audit row's `to_version` should record —
/// for loose-match writers the actual Lance HEAD can be higher than the
/// sidecar's `post_commit_pin` (which is a lower bound), so the pin is
/// the wrong source of truth for an operator-facing audit field.
/// Returns `(new_manifest_version, per_table_published_versions,
/// recovery_commit_id)`. The per-table map is what the audit row's `to_version`
/// should record — for loose-match writers the actual Lance HEAD can be higher
/// than the sidecar's `post_commit_pin` (which is a lower bound), so the pin is
/// the wrong source of truth for an operator-facing audit field. The recovery
/// commit id is the `graph_commit` folded into the publish CAS (RFC-013
/// Phase 7), which the audit row references.
async fn roll_forward_all(
root_uri: &str,
sidecar: &RecoverySidecar,
states: &[ClassifiedTable],
snapshot: &Snapshot,
) -> Result<(u64, HashMap<String, u64>)> {
) -> Result<(u64, HashMap<String, u64>, String)> {
let total_changes =
sidecar.tables.len() + sidecar.additional_registrations.len() + sidecar.tombstones.len();
let mut updates: Vec<ManifestChange> = Vec::with_capacity(total_changes);
@ -1528,9 +1790,10 @@ async fn roll_forward_all(
);
}
let publisher = GraphNamespacePublisher::new(root_uri, sidecar.branch.as_deref());
let new_dataset = publisher.publish(&updates, &expected).await?;
Ok((new_dataset.version().version, published_versions))
let (new_manifest_version, graph_commit_id) =
publish_recovery_commit(root_uri, sidecar, RecoveryKind::RolledForward, &updates, &expected)
.await?;
Ok((new_manifest_version, published_versions, graph_commit_id))
}
/// Open `table_path` at its branch HEAD, read the current Lance HEAD version,
@ -1600,62 +1863,27 @@ async fn push_table_update(
Ok(published_version)
}
/// Append the audit row describing this recovery action.
/// Append the audit row describing this recovery action (RFC-013 Phase 7).
///
/// Two-part write: (a) `_graph_commits.lance` row anchored on the recovery
/// actor (`omnigraph:recovery`); (b) `_graph_commit_recoveries.lance` row
/// linking back to (a) and naming the original actor + per-table outcomes.
/// Same not-atomic-pair-write shape as the existing `_graph_commits`
/// + `_graph_commit_actors` split — a crash between the two leaves an
/// orphan commit row with no audit row. The recovery sweep tolerates this:
/// on re-entry the classifier surfaces `NoMovement` for already-restored /
/// already-published tables, the action is a no-op, and the audit append
/// is retried.
/// The recovery COMMIT (`graph_commit` + `graph_head`) was already recorded
/// durably in `__manifest` by `publish_recovery_commit` (folded into the same
/// CAS as the table re-pin), so this only writes the `_graph_commit_recoveries`
/// row, referencing that commit by `graph_commit_id`. A crash between the
/// recovery publish and this audit append leaves a recovery commit with no audit
/// row — the same not-atomic-pair-write shape as before; the sweep tolerates it
/// (on re-entry the classifier surfaces `NoMovement`, the action is a no-op, and
/// the audit append is retried, minting a fresh recovery commit).
async fn record_audit(
root_uri: &str,
sidecar: &RecoverySidecar,
manifest_version: u64,
graph_commit_id: String,
kind: RecoveryKind,
outcomes: Vec<TableOutcome>,
) -> Result<()> {
// Failpoint: models an audit write failure after the roll-forward /
// roll-back publish already landed — the sweep aborts, the sidecar
// stays, and re-entry records the audit row (see the retry note in
// the doc comment above).
crate::failpoints::maybe_fail("recovery.record_audit")?;
// Non-main recovery commits must be appended on the sidecar branch's
// commit graph, otherwise parent_commit_id comes from the global
// main head. BranchMerge additionally records the source branch's
// HEAD as merged_parent_commit_id so future merges between the same
// pair recognize "already up-to-date".
let target_branch = sidecar.branch.as_deref();
let mut graph = match target_branch {
Some(branch) => CommitGraph::open_at_branch(root_uri, branch).await?,
None => CommitGraph::open(root_uri).await?,
};
let graph_commit_id = match (
sidecar.writer_kind,
sidecar.merge_source_commit_id.as_deref(),
kind,
) {
(SidecarKind::BranchMerge, Some(source_id), RecoveryKind::RolledForward) => {
let parent_commit_id = graph.head_commit_id().await?.unwrap_or_default();
graph
.append_merge_commit(
target_branch,
manifest_version,
&parent_commit_id,
source_id,
Some(RECOVERY_ACTOR),
)
.await?
}
_ => {
graph
.append_commit(target_branch, manifest_version, Some(RECOVERY_ACTOR))
.await?
}
};
// roll-back publish (with its folded-in recovery commit) already landed —
// the sweep aborts, the sidecar stays, and re-entry records the audit row.
crate::failpoints::maybe_fail(crate::failpoints::names::RECOVERY_RECORD_AUDIT)?;
let mut audit = RecoveryAudit::open(root_uri).await?;
audit
.append(RecoveryAuditRecord {
@ -1665,7 +1893,7 @@ async fn record_audit(
operation_id: sidecar.operation_id.clone(),
sidecar_writer_kind: format!("{:?}", sidecar.writer_kind),
per_table_outcomes: outcomes,
created_at: now_micros()?,
created_at: crate::db::now_micros()?,
})
.await?;
Ok(())

View file

@ -10,7 +10,10 @@ use crate::error::{OmniError, Result};
use super::layout::version_object_id;
use super::metadata::TableVersionMetadata;
use super::{OBJECT_TYPE_TABLE, OBJECT_TYPE_TABLE_TOMBSTONE, OBJECT_TYPE_TABLE_VERSION};
use super::{
MAIN_BRANCH_HEAD_KEY, OBJECT_TYPE_GRAPH_COMMIT, OBJECT_TYPE_GRAPH_HEAD, OBJECT_TYPE_TABLE,
OBJECT_TYPE_TABLE_TOMBSTONE, OBJECT_TYPE_TABLE_VERSION,
};
#[derive(Debug, Clone)]
pub struct SubTableEntry {
@ -34,11 +37,64 @@ struct TableTombstoneEntry {
tombstone_version: u64,
}
/// A graph-lineage commit projected out of the `__manifest` `graph_commit`
/// rows (RFC-013 step 4). Field-for-field identical to `commit_graph::GraphCommit`
/// so the commit-graph cache can be sourced from the manifest projection without
/// touching any reader above that boundary. Kept as a separate struct here to
/// keep `state.rs` free of the `commit_graph` module dependency.
///
/// The first three fields come from dedicated manifest columns; the remaining
/// four are carried in the row's `metadata` JSON.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct GraphLineageRow {
    /// Commit identifier; stored as the row's `object_id`.
    pub(crate) graph_commit_id: String,
    /// Branch stored in the `table_branch` column; `None` when that column is null.
    pub(crate) manifest_branch: Option<String>,
    /// Manifest version stored in the row's `table_version` column.
    pub(crate) manifest_version: u64,
    /// Parent commit id from the metadata JSON, if any.
    pub(crate) parent_commit_id: Option<String>,
    /// Second (merged-in) parent commit id from the metadata JSON, if any.
    pub(crate) merged_parent_commit_id: Option<String>,
    /// Actor that authored the commit, from the metadata JSON, if recorded.
    pub(crate) actor_id: Option<String>,
    /// Creation timestamp from the metadata JSON.
    /// NOTE(review): presumably microseconds since the epoch — confirm against the writer.
    pub(crate) created_at: i64,
}
/// JSON payload of a `graph_commit` row's `metadata` column. The commit fields
/// that have no dedicated manifest column live here; the remaining ones
/// (`graph_commit_id`, `manifest_branch`, `manifest_version`) are stored in the
/// `object_id` / `table_branch` / `table_version` columns respectively.
///
/// The three optional fields are omitted from the JSON when `None` and default
/// to `None` when absent on decode; `created_at` is always written and is
/// required when decoding.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
struct GraphCommitMetadata {
    /// Parent commit id, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    parent_commit_id: Option<String>,
    /// Second (merged-in) parent commit id, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    merged_parent_commit_id: Option<String>,
    /// Actor that authored the commit, if recorded.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    actor_id: Option<String>,
    /// Commit creation timestamp.
    created_at: i64,
}
/// JSON payload of a `graph_head` row's `metadata` column: the pointer from a
/// branch to its current head commit.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
struct GraphHeadMetadata {
    /// `graph_commit_id` of the commit the branch head points at.
    head_commit_id: String,
    /// Parent of the head commit; omitted from the JSON when `None` and
    /// defaulted to `None` when absent on decode.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    parent_commit_id: Option<String>,
}
/// Builds the `object_id` of a branch's mutable head-pointer row.
///
/// The id is the literal prefix `graph_head:` followed by the branch name.
/// `None` (main) uses `MAIN_BRANCH_HEAD_KEY` as the name; a named branch uses
/// its own name verbatim.
pub(crate) fn graph_head_object_id(branch: Option<&str>) -> String {
    let branch_key = match branch {
        Some(name) => name,
        None => MAIN_BRANCH_HEAD_KEY,
    };
    ["graph_head:", branch_key].concat()
}
/// Raw result of a single pass over `__manifest`, before any per-caller
/// projection. Produced by `read_manifest_scan`.
#[derive(Debug, Clone)]
struct ManifestScan {
    /// Registered table locations.
    /// NOTE(review): presumably keyed by table key with the dataset location as value — confirm.
    table_locations: HashMap<String, String>,
    /// Table-version entries found by the scan.
    version_entries: Vec<SubTableEntry>,
    /// Tombstone entries found by the scan (each names a table key and the
    /// tombstone version).
    tombstones: Vec<TableTombstoneEntry>,
    /// Graph-lineage `graph_commit` rows, collected in the SAME pass only when
    /// the caller asked (`collect_lineage`). Empty on the table-state read hot
    /// path so it never pays the O(commits) lineage JSON decode; populated on the
    /// publish path, where `load_publish_state` already needs the parent and would
    /// otherwise scan `__manifest` a second time via `read_graph_lineage`. `graph_head`
    /// rows are not collected here — parent resolution uses the head-over-commits
    /// computation, not the denormalized head pointer (see `resolve_lineage_rows`).
    lineage_rows: Vec<GraphLineageRow>,
}
pub(super) fn manifest_schema() -> SchemaRef {
@ -73,7 +129,8 @@ pub(super) fn manifest_schema() -> SchemaRef {
pub(super) async fn read_manifest_state(dataset: &Dataset) -> Result<ManifestState> {
let version = dataset.version().version;
let scan = read_manifest_scan(dataset).await?;
// The table-state hot path never needs lineage, so don't pay its JSON decode.
let scan = read_manifest_scan(dataset, false).await?;
let mut latest_versions = HashMap::<String, SubTableEntry>::new();
for entry in scan.version_entries {
@ -109,28 +166,85 @@ pub(super) async fn read_manifest_state(dataset: &Dataset) -> Result<ManifestSta
Ok(ManifestState { version, entries })
}
// After RFC-013 P2 folded the publish path off this accessor (it now projects
// version entries out of `read_publish_scan`'s single scan), the only remaining
// caller is `BranchManifestNamespace::version_entries`. That namespace module is
// `#[cfg(test)]` (see `db/manifest.rs`: "nothing in production routes through it;
// the `LanceNamespace` impls are retained only to validate the contract in unit
// tests"), so this stays `#[cfg(test)]` too — otherwise it is dead code in
// non-test builds.
#[cfg(test)]
pub(super) async fn read_manifest_entries(dataset: &Dataset) -> Result<Vec<SubTableEntry>> {
Ok(read_manifest_scan(dataset).await?.version_entries)
Ok(read_manifest_scan(dataset, false).await?.version_entries)
}
pub(super) async fn read_registered_table_locations(
dataset: &Dataset,
) -> Result<HashMap<String, String>> {
Ok(read_manifest_scan(dataset).await?.table_locations)
/// The full table state the publisher needs to build its CAS batch, plus the
/// `graph_commit` lineage rows for parent resolution — all from ONE `__manifest`
/// scan (RFC-013 P2). Replaces the prior four scans on the publish path (three
/// thin accessors + a separate `read_graph_lineage`): `load_publish_state`
/// projects every piece it needs out of this single result.
pub(super) struct PublishScan {
    /// Registered table locations, passed through unchanged from the scan.
    pub(super) table_locations: HashMap<String, String>,
    /// Table-version entries, passed through unchanged from the scan.
    pub(super) version_entries: Vec<SubTableEntry>,
    /// One `((table_key, tombstone_version), ())` pair per tombstone. The unit
    /// value carries no information; the pair shape lets the caller collect
    /// straight into a map keyed by `(table_key, tombstone_version)`.
    pub(super) tombstones: Vec<((String, u64), ())>,
    /// Decoded `graph_commit` lineage rows from the same scan.
    pub(super) lineage_rows: Vec<GraphLineageRow>,
}
pub(super) async fn read_tombstone_versions(
dataset: &Dataset,
) -> Result<HashMap<(String, u64), ()>> {
Ok(read_manifest_scan(dataset)
.await?
.tombstones
.into_iter()
.map(|tombstone| ((tombstone.table_key, tombstone.tombstone_version), ()))
.collect())
/// One-scan read of everything the publish path needs. `collect_lineage` is
/// always on here (the publisher resolves a parent), so the lineage JSON decode
/// rides the same pass as the table-state assembly instead of a second scan.
pub(super) async fn read_publish_scan(dataset: &Dataset) -> Result<PublishScan> {
let scan = read_manifest_scan(dataset, true).await?;
Ok(PublishScan {
table_locations: scan.table_locations,
version_entries: scan.version_entries,
tombstones: scan
.tombstones
.into_iter()
.map(|tombstone| ((tombstone.table_key, tombstone.tombstone_version), ()))
.collect(),
lineage_rows: scan.lineage_rows,
})
}
async fn read_manifest_scan(dataset: &Dataset) -> Result<ManifestScan> {
/// Decode one `graph_commit` row (`object_type == OBJECT_TYPE_GRAPH_COMMIT`) into
/// a [`GraphLineageRow`]. The single decode for both lineage readers — the
/// dedicated `read_graph_lineage` scan and the folded `collect_lineage` branch of
/// `read_manifest_scan` — so the two cannot drift. The caller has already matched
/// the object type; `row` indexes into the per-batch columns.
fn decode_graph_commit_row(
object_ids: &StringArray,
metadata: &StringArray,
versions: &UInt64Array,
branches: &StringArray,
row: usize,
) -> Result<GraphLineageRow> {
if metadata.is_null(row) {
return Err(OmniError::manifest_internal(format!(
"manifest graph_commit row missing metadata for {}",
object_ids.value(row)
)));
}
let commit_meta: GraphCommitMetadata =
serde_json::from_str(metadata.value(row)).map_err(|e| {
OmniError::manifest_internal(format!("failed to decode graph_commit metadata: {e}"))
})?;
Ok(GraphLineageRow {
graph_commit_id: object_ids.value(row).to_string(),
manifest_branch: if branches.is_null(row) {
None
} else {
Some(branches.value(row).to_string())
},
manifest_version: required_u64(versions, row, "table_version")?,
parent_commit_id: commit_meta.parent_commit_id,
merged_parent_commit_id: commit_meta.merged_parent_commit_id,
actor_id: commit_meta.actor_id,
created_at: commit_meta.created_at,
})
}
async fn read_manifest_scan(dataset: &Dataset, collect_lineage: bool) -> Result<ManifestScan> {
let batches: Vec<RecordBatch> = dataset
.scan()
.try_into_stream()
@ -143,6 +257,7 @@ async fn read_manifest_scan(dataset: &Dataset) -> Result<ManifestScan> {
let mut table_locations = HashMap::new();
let mut version_entries = Vec::new();
let mut tombstones = Vec::new();
let mut lineage_rows = Vec::new();
for batch in &batches {
let object_types = string_column(batch, "object_type")?;
@ -152,6 +267,13 @@ async fn read_manifest_scan(dataset: &Dataset) -> Result<ManifestScan> {
let versions = u64_column(batch, "table_version")?;
let branches = string_column(batch, "table_branch")?;
let row_counts = u64_column(batch, "row_count")?;
// `object_id` is only needed for lineage decoding; skip the lookup
// entirely on the table-state hot path (`collect_lineage == false`).
let object_ids = if collect_lineage {
Some(string_column(batch, "object_id")?)
} else {
None
};
for row in 0..batch.num_rows() {
let table_key = table_keys.value(row).to_string();
@ -195,6 +317,21 @@ async fn read_manifest_scan(dataset: &Dataset) -> Result<ManifestScan> {
tombstone_version,
});
}
// `graph_commit` rows (RFC-013) are decoded into the scan ONLY
// when `collect_lineage` is set (the publish path, which resolves
// a parent). The table-state hot path leaves them — and
// `graph_head` + any future object type — in the `_` arm so it
// never pays the O(commits) lineage JSON decode. When NOT
// collecting, `object_ids` is `None`, so this arm is the same
// forward-compat skip as the `_` arm.
OBJECT_TYPE_GRAPH_COMMIT if collect_lineage => {
let object_ids = object_ids.expect("object_ids read when collect_lineage");
lineage_rows.push(decode_graph_commit_row(
object_ids, metadata, versions, branches, row,
)?);
}
// Skipped on the table-state path (and for `graph_head` / unknown
// future object types on every path): no table snapshot needs them.
_ => {}
}
}
@ -225,21 +362,167 @@ async fn read_manifest_scan(dataset: &Dataset) -> Result<ManifestScan> {
table_locations,
version_entries: entries,
tombstones,
lineage_rows,
})
}
/// Project the graph-lineage rows (`graph_commit` + `graph_head`) out of
/// `__manifest` (RFC-013 step 4). Returns every commit and the per-branch head
/// map (keyed by branch name, `"main"` for main). `__manifest` is the single
/// source of graph lineage: the commit-graph cache is sourced from here, and the
/// publisher resolves a new commit's parent from here inside its CAS loop.
///
/// Dedicated scan (separate from `read_manifest_scan`): it decodes ONLY the two
/// lineage object types and builds no table snapshot, so the table-state hot
/// path never pays for lineage JSON and this path never pays for table-entry
/// assembly.
pub(crate) async fn read_graph_lineage(
dataset: &Dataset,
) -> Result<(Vec<GraphLineageRow>, HashMap<String, String>)> {
let batches: Vec<RecordBatch> = dataset
.scan()
.try_into_stream()
.await
.map_err(|e| OmniError::Lance(e.to_string()))?
.try_collect()
.await
.map_err(|e| OmniError::Lance(e.to_string()))?;
let mut graph_commits = Vec::new();
let mut graph_heads = HashMap::new();
for batch in &batches {
let object_ids = string_column(batch, "object_id")?;
let object_types = string_column(batch, "object_type")?;
let metadata = string_column(batch, "metadata")?;
let versions = u64_column(batch, "table_version")?;
let branches = string_column(batch, "table_branch")?;
for row in 0..batch.num_rows() {
match object_types.value(row) {
OBJECT_TYPE_GRAPH_COMMIT => {
graph_commits.push(decode_graph_commit_row(
object_ids, metadata, versions, branches, row,
)?);
}
OBJECT_TYPE_GRAPH_HEAD => {
if metadata.is_null(row) {
return Err(OmniError::manifest_internal(format!(
"manifest graph_head row missing metadata for {}",
object_ids.value(row)
)));
}
let head_meta: GraphHeadMetadata = serde_json::from_str(metadata.value(row))
.map_err(|e| {
OmniError::manifest_internal(format!(
"failed to decode graph_head metadata: {e}"
))
})?;
// `object_id` is `graph_head:<branch>`; the branch key after
// the prefix is the projection's map key (`main` for main).
let branch_key = object_ids
.value(row)
.strip_prefix("graph_head:")
.unwrap_or_default()
.to_string();
graph_heads.insert(branch_key, head_meta.head_commit_id);
}
_ => {}
}
}
}
Ok((graph_commits, graph_heads))
}
/// Picks the head of a branch's lineage: the row that is greatest under the
/// lexicographic ordering `(manifest_version, created_at, graph_commit_id)`.
///
/// This is the same ordering the commit-graph cache uses to choose its head
/// (`should_replace_head`); keeping it in one place means the publisher's
/// per-attempt parent resolution and the cache agree by construction. Returns
/// `None` only when `rows` is empty, i.e. a graph with no commits yet (a
/// parentless genesis).
pub(crate) fn head_lineage_row(rows: &[GraphLineageRow]) -> Option<&GraphLineageRow> {
    /// Ordering key; tuples compare field by field, left to right.
    fn ordering_key(row: &GraphLineageRow) -> (u64, i64, &str) {
        (
            row.manifest_version,
            row.created_at,
            row.graph_commit_id.as_str(),
        )
    }

    rows.iter()
        .max_by(|lhs, rhs| ordering_key(lhs).cmp(&ordering_key(rhs)))
}
/// One `__manifest` row materializing a piece of a graph commit's lineage. The
/// publisher maps these onto its `PendingVersionRow`s (folding lineage into the
/// table-version publish batch), and the genesis init path pushes them straight
/// into the init batch.
pub(crate) struct GraphLineageRowPart {
    /// Value for the `object_id` column: the commit id for a `graph_commit`
    /// row, or the `graph_head:<branch>` id for a head row.
    pub(crate) object_id: String,
    /// Value for the `object_type` column (`OBJECT_TYPE_GRAPH_COMMIT` or
    /// `OBJECT_TYPE_GRAPH_HEAD` when built by `graph_lineage_row_parts`).
    pub(crate) object_type: &'static str,
    /// JSON payload for the `metadata` column.
    pub(crate) metadata: String,
    /// Value for the `table_version` column: the commit's manifest version on a
    /// `graph_commit` row, `None` on a head row.
    pub(crate) table_version: Option<u64>,
    /// Value for the `table_branch` column: the commit's manifest branch on a
    /// `graph_commit` row, `None` on a head row.
    pub(crate) table_branch: Option<String>,
}
/// Encodes a graph commit as the two `__manifest` rows that represent it.
///
/// The result is always `[commit_row, head_row]`:
/// - the immutable `graph_commit` row, keyed by the commit id, carrying the
///   manifest version and branch in their own columns and the remaining commit
///   fields as `metadata` JSON;
/// - the mutable `graph_head:<branch>` pointer row, whose `metadata` JSON names
///   this commit as the head (a merge-insert on `object_id` updates it in
///   place). It carries no version or branch column value.
///
/// `branch` is `None` for main.
///
/// # Errors
/// Fails if either metadata payload cannot be serialized to JSON.
pub(crate) fn graph_lineage_row_parts(
    commit: &GraphLineageRow,
    branch: Option<&str>,
) -> Result<[GraphLineageRowPart; 2]> {
    let commit_payload = GraphCommitMetadata {
        parent_commit_id: commit.parent_commit_id.clone(),
        merged_parent_commit_id: commit.merged_parent_commit_id.clone(),
        actor_id: commit.actor_id.clone(),
        created_at: commit.created_at,
    };
    let commit_json = serde_json::to_string(&commit_payload).map_err(|e| {
        OmniError::manifest_internal(format!("failed to encode graph_commit metadata: {e}"))
    })?;

    let head_payload = GraphHeadMetadata {
        head_commit_id: commit.graph_commit_id.clone(),
        parent_commit_id: commit.parent_commit_id.clone(),
    };
    let head_json = serde_json::to_string(&head_payload).map_err(|e| {
        OmniError::manifest_internal(format!("failed to encode graph_head metadata: {e}"))
    })?;

    // Only the immutable commit row carries the manifest version + branch.
    let commit_row = GraphLineageRowPart {
        object_id: commit.graph_commit_id.clone(),
        object_type: OBJECT_TYPE_GRAPH_COMMIT,
        metadata: commit_json,
        table_version: Some(commit.manifest_version),
        table_branch: commit.manifest_branch.clone(),
    };
    // The head row reuses `metadata` for its pointer payload.
    let head_row = GraphLineageRowPart {
        object_id: graph_head_object_id(branch),
        object_type: OBJECT_TYPE_GRAPH_HEAD,
        metadata: head_json,
        table_version: None,
        table_branch: None,
    };

    Ok([commit_row, head_row])
}
pub(super) fn entries_to_batch(
entries: &[SubTableEntry],
version_metadata: &HashMap<String, String>,
genesis_lineage: &[GraphLineageRowPart],
) -> Result<RecordBatch> {
let mut object_ids = Vec::with_capacity(entries.len() * 2);
let mut object_types = Vec::with_capacity(entries.len() * 2);
let mut locations = Vec::with_capacity(entries.len() * 2);
let mut metadata = Vec::with_capacity(entries.len() * 2);
let mut table_keys = Vec::with_capacity(entries.len() * 2);
let mut table_versions = Vec::with_capacity(entries.len() * 2);
let mut table_branches = Vec::with_capacity(entries.len() * 2);
let mut row_counts = Vec::with_capacity(entries.len() * 2);
let cap = entries.len() * 2 + genesis_lineage.len();
let mut object_ids = Vec::with_capacity(cap);
let mut object_types = Vec::with_capacity(cap);
let mut locations = Vec::with_capacity(cap);
let mut metadata = Vec::with_capacity(cap);
let mut table_keys = Vec::with_capacity(cap);
let mut table_versions = Vec::with_capacity(cap);
let mut table_branches = Vec::with_capacity(cap);
let mut row_counts = Vec::with_capacity(cap);
for entry in entries {
object_ids.push(entry.table_key.clone());
@ -271,6 +554,22 @@ pub(super) fn entries_to_batch(
row_counts.push(Some(entry.row_count));
}
// Genesis graph-lineage rows ride the init write so a fresh graph carries
// its `graph_commit` + `graph_head` in `__manifest` from version one (no
// separate lineage fragment, no second commit). `table_key` is non-nullable
// but lineage rows have no table identity, so the empty string stands in
// (never matched by a real key).
for part in genesis_lineage {
object_ids.push(part.object_id.clone());
object_types.push(part.object_type.to_string());
locations.push(None);
metadata.push(Some(part.metadata.clone()));
table_keys.push(String::new());
table_versions.push(part.table_version);
table_branches.push(part.table_branch.clone());
row_counts.push(None);
}
manifest_rows_batch(
object_ids,
object_types,
@ -283,6 +582,72 @@ pub(super) fn entries_to_batch(
)
}
/// Upsert graph-lineage rows (`graph_commit` + `graph_head`) directly into
/// `__manifest`, matching existing rows on `object_id`.
///
/// Per the RFC-013 step 4 notes, this exists solely for the v3→v4
/// internal-schema backfill: ordinary writes fold lineage into the publisher's
/// batch, whereas the migration records lineage without any table-version
/// change and therefore performs its own merge.
///
/// The merge is configured with the same knobs the publisher uses
/// (`use_index(false)`, `skip_auto_cleanup`, `conflict_retries(0)`) so CAS and
/// cleanup semantics are identical. There is no retry loop here: the migration
/// runs under the open-for-write path and is idempotent, because re-inserting
/// rows with the same `object_id` updates them in place.
///
/// Returns the advanced dataset; its version is the commit the lineage rows
/// landed in.
///
/// # Errors
///
/// Builder failures are reported as `OmniError::Lance`. Execution failures go
/// through `publisher::map_lance_publish_error`, so a lost CAS on `__manifest`
/// surfaces as the typed contention error the migration's re-open retry loop
/// matches on.
pub(crate) async fn merge_lineage_rows(
    dataset: Dataset,
    parts: &[GraphLineageRowPart],
) -> Result<Dataset> {
    let row_count = parts.len();

    // One column vector per manifest field that a lineage row actually carries.
    let object_ids: Vec<_> = parts.iter().map(|part| part.object_id.clone()).collect();
    let object_types: Vec<_> = parts
        .iter()
        .map(|part| part.object_type.to_string())
        .collect();
    let metadata: Vec<_> = parts
        .iter()
        .map(|part| Some(part.metadata.clone()))
        .collect();
    let table_versions: Vec<_> = parts.iter().map(|part| part.table_version).collect();
    let table_branches: Vec<_> = parts
        .iter()
        .map(|part| part.table_branch.clone())
        .collect();

    // Lineage rows carry no table identity: empty `table_key`, null location /
    // row_count (matching `lineage_part_to_pending` in the publisher).
    let locations = vec![None; row_count];
    let table_keys = vec![String::new(); row_count];
    let row_counts = vec![None; row_count];
    let batch = manifest_rows_batch(
        object_ids,
        object_types,
        locations,
        metadata,
        table_keys,
        table_versions,
        table_branches,
        row_counts,
    )?;
    let reader = arrow_array::RecordBatchIterator::new(vec![Ok(batch)], manifest_schema());

    let mut merge_builder = lance::dataset::MergeInsertBuilder::try_new(
        Arc::new(dataset),
        vec!["object_id".to_string()],
    )
    .map_err(|e| OmniError::Lance(e.to_string()))?;
    merge_builder.when_matched(lance::dataset::WhenMatched::UpdateAll);
    merge_builder.when_not_matched(lance::dataset::WhenNotMatched::InsertAll);
    merge_builder.conflict_retries(0);
    merge_builder.use_index(false);
    merge_builder.skip_auto_cleanup(true);
    let job = merge_builder
        .try_build()
        .map_err(|e| OmniError::Lance(e.to_string()))?;

    // Route through the publisher's classifier (not a stringify) so a
    // concurrent first-open's CAS loss on `__manifest` surfaces as the SAME
    // typed `RowLevelCasContention` the publisher's retry consumes. The
    // migration's re-open retry loop matches on that to converge instead of
    // erroring out (FIX B).
    let (merged, _stats) = job
        .execute_reader(Box::new(reader))
        .await
        .map_err(super::publisher::map_lance_publish_error)?;

    // Take the dataset out of the `Arc` when we hold the only reference,
    // otherwise clone it.
    Ok(Arc::unwrap_or_clone(merged))
}
pub(super) fn manifest_rows_batch(
object_ids: Vec<String>,
object_types: Vec<String>,

File diff suppressed because it is too large Load diff

View file

@ -10,12 +10,15 @@ pub use commit_graph::GraphCommit;
pub use graph_coordinator::{GraphCoordinator, ReadTarget, ResolvedTarget, SnapshotId};
pub use manifest::{Snapshot, SubTableEntry, SubTableUpdate};
pub(crate) use omnigraph::ensure_public_branch_ref;
pub(crate) use omnigraph::WriteTxn;
pub use omnigraph::{
CleanupPolicyOptions, InitOptions, MergeOutcome, Omnigraph, OpenMode, PendingIndex,
RepairAction, RepairClassification, RepairOptions, RepairStats, SchemaApplyOptions,
SchemaApplyResult, SkipReason, TableCleanupStats, TableOptimizeStats, TableRepairStats,
};
use crate::error::{OmniError, Result};
pub(crate) const SCHEMA_APPLY_LOCK_BRANCH: &str = "__schema_apply_lock__";
/// Mutation kind, threaded through the version-check call sites so the
@ -73,3 +76,14 @@ pub(crate) fn is_internal_system_branch(name: &str) -> bool {
// only internal branch the engine still creates is the schema-apply lock.
is_schema_apply_lock_branch(name)
}
/// Microseconds since the UNIX epoch — the `created_at` stamp threaded through
/// every graph-lineage / recovery-audit / commit-graph row. One canonical
/// helper so the clock-error mapping (variant + message) cannot drift across
/// the call sites that record those timestamps.
///
/// # Errors
///
/// Returns a manifest error when the system clock reads earlier than
/// `UNIX_EPOCH`, or when the elapsed microsecond count does not fit in an
/// `i64`.
pub(crate) fn now_micros() -> Result<i64> {
    let elapsed = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_err(|e| OmniError::manifest(format!("system clock before UNIX_EPOCH: {e}")))?;
    // `Duration::as_micros` yields a `u128`; a bare `as i64` cast would wrap
    // silently on overflow, so narrow with a checked conversion and report the
    // failure instead of recording a corrupt timestamp.
    i64::try_from(elapsed.as_micros()).map_err(|e| {
        OmniError::manifest(format!(
            "system clock microseconds since UNIX_EPOCH overflow i64: {e}"
        ))
    })
}

View file

@ -41,6 +41,7 @@ pub use repair::{
};
pub use schema_apply::SchemaApplyOptions;
pub use table_ops::PendingIndex;
pub(crate) use table_ops::OpenedForMutation;
use super::commit_graph::GraphCommit;
use super::manifest::{
@ -79,6 +80,35 @@ pub struct SchemaApplyPreview {
pub catalog: Catalog,
}
/// A capture-once write transaction (RFC-013 step 3b).
///
/// Pins the operation's read base ONCE so the per-table opens reuse the pinned
/// version instead of re-resolving / re-validating per table. The schema
/// contract is validated once (when `base` is captured).
///
/// This is NOT a general "no re-resolution" handle — the commit-time OCC
/// re-read, the live-HEAD drift probe, and the fork-authority reads stay fresh
/// (correctness machinery). Step 5 (PublishPlan unification) makes this the
/// non-optional publish carrier and adds session-aware base opens there, gated
/// by an S3 cost test — the warm-session benefit on the single remaining open
/// is an object-store phenomenon, so it earns its own gate rather than riding
/// this PR.
///
/// Threaded as `Option<&WriteTxn>` through the mutate/load write chain
/// (`open_for_mutation_on_branch`, `commit_all`, `commit_updates_on_branch_with_expected`)
/// so a single write validates the schema contract EXACTLY ONCE — at capture. When
/// present, the per-table resolves source the pinned `base` entry instead of calling
/// `resolved_branch_target` / `snapshot_for_branch` / `fresh_snapshot_for_branch`
/// (each of which re-runs `ensure_schema_state_valid`). When absent (`None` — every
/// non-mutate/load caller), every threaded function behaves byte-identically to
/// before.
///
/// The carrier never removes a version guard or changes which dataset version
/// the per-table open targets: strict ops keep `open_dataset_head_for_write` +
/// `ensure_expected_version`, and the commit-time OCC re-read still opens a fresh
/// manifest snapshot (via `fresh_snapshot_for_branch_unchecked`) — only the redundant
/// schema re-validation is dropped.
pub(crate) struct WriteTxn {
/// The resolved branch (`None` = main). Populated by `open_write_txn` from
/// the `branch` field of the target returned by `resolved_branch_target`.
pub(crate) branch: Option<String>,
/// The pinned base snapshot (per-table location + version + e_tag), captured once.
/// Populated by `open_write_txn` from the resolved target's `snapshot`; consumers
/// clone it (`txn.base.clone()`) rather than resolving a new snapshot.
pub(crate) base: Snapshot,
}
/// Top-level handle to an Omnigraph database.
///
/// An Omnigraph is a Lance-native graph database with git-style branching.
@ -93,12 +123,12 @@ pub struct Omnigraph {
/// calls without a global write lock). Reads (`snapshot`, `version`,
/// `current_branch`, `branch_list`, `resolve_*`, `head_commit_id`,
/// `list_commits`, …) acquire `.read().await` and parallelize.
/// Writes (`refresh`, `branch_create`, `branch_delete`, `commit_*`,
/// `record_*`) acquire `.write().await` and serialize. The atomic
/// commit invariant — `commit_manifest_updates` followed by
/// `record_graph_commit` must be atomic — is preserved by the
/// single `.write()` covering both calls inside
/// `commit_updates_with_actor_with_expected`. PR 2 Phase 2
/// Writes (`refresh`, `branch_create`, `branch_delete`, `commit_*`)
/// acquire `.write().await` and serialize. The atomic commit invariant —
/// table-version rows and the graph commit are one unit — holds by
/// construction since RFC-013 Phase 7: both ride a SINGLE manifest publish
/// CAS (`commit_changes_with_lineage`), so there is no two-write window to
/// keep atomic. PR 2 Phase 2
/// converted from `Mutex` to `RwLock` because the bench showed
/// the Mutex was the dominant serializer for disjoint-table
/// workloads. Lock acquisition order: always before `runtime_cache`
@ -287,7 +317,7 @@ impl Omnigraph {
{
return Err(OmniError::AlreadyInitialized { uri: root.clone() });
}
if let Err(err) = crate::failpoints::maybe_fail("init.after_schema_pg_written") {
if let Err(err) = crate::failpoints::maybe_fail(crate::failpoints::names::INIT_AFTER_SCHEMA_PG_WRITTEN) {
best_effort_cleanup_init_artifacts(&root, storage.as_ref()).await;
return Err(err);
}
@ -387,6 +417,14 @@ impl Omnigraph {
// first read-write open (an accepted, documented limitation).
if matches!(mode, OpenMode::ReadWrite) {
crate::db::manifest::migrate_on_open(&root).await?;
} else {
// A read-only open skips `migrate_on_open` (no object-store writes),
// which is where the version refusal otherwise lives. Still refuse a
// `__manifest` stamped outside this binary's supported range — newer
// than CURRENT (an old binary cannot silently misread a newer graph,
// e.g. one folded to internal-schema v4 lineage), or below
// MIN_SUPPORTED (predates the readers we carry). Read-only, no write.
crate::db::manifest::refuse_if_internal_schema_unsupported(&root).await?;
}
// Open the coordinator first so the schema-staging recovery sweep can
// compare its snapshot against any leftover staging files.
@ -736,6 +774,29 @@ impl Omnigraph {
*self.coordinator.write().await = coordinator;
}
/// Begin a capture-once write transaction (RFC-013 step 3b).
///
/// Resolves `branch` through `resolved_branch_target` a single time and pins
/// the resulting branch + snapshot in the returned [`WriteTxn`]. Per-table
/// opens accept `Option<&WriteTxn>`; for non-strict (Insert/Merge) ops on the
/// bound branch they read the pinned base entry instead of re-resolving (and
/// re-validating the schema) once per table. Strict ops, the fork path, and
/// the commit-time OCC re-read keep their fresh reads — those are correctness
/// machinery (see the handoff doc).
///
/// Scope of "once": it covers the table-touch hot path captured here (proven
/// by the node-insert gate `write_validates_schema_contract_once`). It does
/// NOT yet cover edge endpoint / cardinality RI validation
/// (`ensure_node_id_exists`, the loader's RI/cardinality), which still resolve
/// through `snapshot_for_branch` and re-validate. Those reads must observe
/// LIVE committed state, so unifying them (validate-once + pinned + re-checked
/// read-set) is step 4's §7.1 work — threading `txn.base` there would
/// re-introduce the stale-read class the #298 cardinality fix removed. A
/// session-aware base open is likewise deferred to step 5 (handoff §1d).
///
/// # Errors
///
/// Propagates any error from `resolved_branch_target`.
pub(crate) async fn open_write_txn(&self, branch: Option<&str>) -> Result<WriteTxn> {
    let target = self.resolved_branch_target(branch).await?;
    let (branch, base) = (target.branch, target.snapshot);
    Ok(WriteTxn { branch, base })
}
pub(crate) async fn resolved_branch_target(
&self,
branch: Option<&str>,
@ -770,12 +831,39 @@ impl Omnigraph {
pub(crate) async fn fresh_snapshot_for_branch(&self, branch: Option<&str>) -> Result<Snapshot> {
self.ensure_schema_state_valid().await?;
let requested = ReadTarget::Branch(branch.unwrap_or("main").to_string());
let coord = self.coordinator.read().await;
coord
.resolve_target(&requested)
.await
.map(|resolved| resolved.snapshot)
self.fresh_snapshot_for_branch_unchecked(branch).await
}
/// Read a fresh per-branch manifest snapshot, skipping the schema-contract
/// re-validation.
///
/// OCC freshness is the same as [`fresh_snapshot_for_branch`] — the manifest
/// is re-read from storage, never served from the warm cache — and only the
/// redundant `ensure_schema_state_valid` call is omitted. Intended for use
/// inside a single write after a `WriteTxn` has already validated the contract
/// at capture: the commit-time drift re-read needs the live manifest, not a
/// second contract read. Callers with no `WriteTxn` MUST use the checked
/// variant.
///
/// The manifest is opened directly through `ManifestCoordinator` instead of
/// `resolve_target`. The OCC re-read consumes only the returned `Snapshot`
/// (per-table location + version), which `ManifestCoordinator::open().snapshot()`
/// produces identically to `GraphCoordinator::open(...).snapshot()`;
/// `resolve_target` would additionally open the commit graph (an extra
/// `_graph_commits.lance` probe) that the OCC read never consults. Dropping
/// that load is a pure read-cost reduction, not a freshness change. The
/// checked `fresh_snapshot_for_branch` delegates here, so its no-`txn` callers
/// (commit_all's None arm, optimize, repair, fork reclaim) receive the same
/// `Snapshot` via this lighter manifest-only read; they consume only the
/// snapshot and never relied on the commit-graph side load.
///
/// `branch = None` opens the default manifest; `Some(name)` opens it at that
/// branch.
pub(crate) async fn fresh_snapshot_for_branch_unchecked(
    &self,
    branch: Option<&str>,
) -> Result<Snapshot> {
    use crate::db::manifest::ManifestCoordinator;

    let coordinator = if let Some(name) = branch {
        ManifestCoordinator::open_at_branch(self.uri(), name).await?
    } else {
        ManifestCoordinator::open(self.uri()).await?
    };
    Ok(coordinator.snapshot())
}
pub(crate) async fn version(&self) -> u64 {
@ -1367,7 +1455,7 @@ impl Omnigraph {
for (table_key, table_path) in cleanup_targets {
let dataset_uri = self.storage().dataset_uri(&table_path);
let outcome = match crate::failpoints::maybe_fail("branch_delete.before_table_cleanup")
let outcome = match crate::failpoints::maybe_fail(crate::failpoints::names::BRANCH_DELETE_BEFORE_TABLE_CLEANUP)
{
Ok(()) => {
self.storage()
@ -1599,7 +1687,7 @@ impl Omnigraph {
&self,
table_key: &str,
op_kind: crate::db::MutationOpKind,
) -> Result<(SnapshotHandle, String, Option<String>)> {
) -> Result<OpenedForMutation> {
table_ops::open_for_mutation(self, table_key, op_kind).await
}
@ -1608,8 +1696,9 @@ impl Omnigraph {
branch: Option<&str>,
table_key: &str,
op_kind: crate::db::MutationOpKind,
) -> Result<(SnapshotHandle, String, Option<String>)> {
table_ops::open_for_mutation_on_branch(self, branch, table_key, op_kind).await
txn: Option<&crate::db::WriteTxn>,
) -> Result<OpenedForMutation> {
table_ops::open_for_mutation_on_branch(self, branch, table_key, op_kind, txn).await
}
/// Fork `table_key` onto `active_branch` from the given source state,
@ -1698,28 +1787,17 @@ impl Omnigraph {
table_ops::commit_updates(self, updates).await
}
pub(crate) async fn commit_manifest_updates(
/// Publish a branch merge: the merged table `updates` and the merge commit
/// in one manifest CAS (RFC-013 Phase 7). The merge commit's merged-in parent
/// is `merged_parent_commit_id` (the source head); its first parent is the
/// live target-branch head, resolved by the publisher.
pub(crate) async fn commit_merge_with_actor(
&self,
updates: &[crate::db::SubTableUpdate],
) -> Result<u64> {
table_ops::commit_manifest_updates(self, updates).await
}
pub(crate) async fn record_merge_commit(
&self,
manifest_version: u64,
parent_commit_id: &str,
merged_parent_commit_id: &str,
actor_id: Option<&str>,
) -> Result<String> {
table_ops::record_merge_commit(
self,
manifest_version,
parent_commit_id,
merged_parent_commit_id,
actor_id,
)
.await
table_ops::commit_merge_with_actor(self, updates, merged_parent_commit_id, actor_id).await
}
pub(crate) async fn commit_updates_on_branch_with_expected(
@ -1728,6 +1806,8 @@ impl Omnigraph {
updates: &[crate::db::SubTableUpdate],
expected_table_versions: &std::collections::HashMap<String, u64>,
actor_id: Option<&str>,
txn: Option<&crate::db::WriteTxn>,
committed_handles: std::collections::HashMap<String, crate::storage_layer::SnapshotHandle>,
) -> Result<u64> {
table_ops::commit_updates_on_branch_with_expected(
self,
@ -1735,6 +1815,8 @@ impl Omnigraph {
updates,
expected_table_versions,
actor_id,
txn,
committed_handles,
)
.await
}
@ -1939,14 +2021,14 @@ async fn init_storage_phase(
if write_schema_pg {
let schema_path = join_uri(root, SCHEMA_SOURCE_FILENAME);
storage.write_text(&schema_path, schema_source).await?;
crate::failpoints::maybe_fail("init.after_schema_pg_written")?;
crate::failpoints::maybe_fail(crate::failpoints::names::INIT_AFTER_SCHEMA_PG_WRITTEN)?;
}
write_schema_contract(root, storage.as_ref(), schema_ir).await?;
crate::failpoints::maybe_fail("init.after_schema_contract_written")?;
crate::failpoints::maybe_fail(crate::failpoints::names::INIT_AFTER_SCHEMA_CONTRACT_WRITTEN)?;
let coordinator = GraphCoordinator::init(root, catalog, Arc::clone(storage)).await?;
crate::failpoints::maybe_fail("init.after_coordinator_init")?;
crate::failpoints::maybe_fail(crate::failpoints::names::INIT_AFTER_COORDINATOR_INIT)?;
Ok(coordinator)
}
@ -2466,10 +2548,13 @@ edge WorksAt: Person -> Company
}
async fn seed_person_row(db: &mut Omnigraph, name: &str, age: Option<i32>) {
// No-txn entry, so the handle is always `Some` (collapse #1's skip is
// gated on `txn.is_some()`).
let (ds, full_path, table_branch) = db
.open_for_mutation("node:Person", crate::db::MutationOpKind::Insert)
.await
.unwrap();
.unwrap()
.require_handle("seed_person_row test");
let schema: Arc<Schema> = Arc::new(ds.dataset().schema().into());
let columns: Vec<Arc<dyn Array>> = schema
.fields()

View file

@ -512,7 +512,7 @@ async fn optimize_one_table(
// Test seam: a concurrent (cross-process) writer can interleave here, before
// any Phase-B commit lands, to exercise the reopen+replan path.
crate::failpoints::maybe_fail("optimize.before_compact")?;
crate::failpoints::maybe_fail(crate::failpoints::names::OPTIMIZE_BEFORE_COMPACT)?;
// Phase B: scrub stale auto_cleanup (keeps optimize non-destructive on a
// graph upgraded from a pre-v7 binary whose `compact_files`/`optimize_indices`
@ -549,7 +549,7 @@ async fn optimize_one_table(
// committed (so HEAD is already ahead of the manifest from our own work),
// exercising the own-HEAD (not external) drift classification on the next
// reopened attempt.
if crate::failpoints::maybe_fail("optimize.inject_reindex_conflict").is_err()
if crate::failpoints::maybe_fail(crate::failpoints::names::OPTIMIZE_INJECT_REINDEX_CONFLICT).is_err()
&& attempt < COMPACTION_RETRY_BUDGET
{
continue;
@ -584,7 +584,7 @@ async fn optimize_one_table(
// Pin the per-writer Phase B → Phase C residual: Lance HEAD has advanced but the
// manifest publish below hasn't run.
crate::failpoints::maybe_fail("optimize.post_phase_b_pre_manifest_commit")?;
crate::failpoints::maybe_fail(crate::failpoints::names::OPTIMIZE_POST_PHASE_B_PRE_MANIFEST_COMMIT)?;
// Phase C: monotonic fast-forward publish. The compaction is committed at Lance
// HEAD `N`; publish a manifest pointer that includes it. If a concurrent writer
@ -921,7 +921,7 @@ pub async fn cleanup_all_tables(
let results: Vec<TableCleanupStats> = futures::stream::iter(table_tasks.into_iter())
.map(|(table_key, full_path)| async move {
let outcome: Result<RemovalStats> = async {
crate::failpoints::maybe_fail("cleanup.table_gc")?;
crate::failpoints::maybe_fail(crate::failpoints::names::CLEANUP_TABLE_GC)?;
// `cleanup_old_versions` is a Lance-only maintenance API not
// surfaced through `TableStorage` — see the optimize path
// above for the same rationale. Unwrap via `into_dataset()`.
@ -1079,7 +1079,7 @@ pub async fn reconcile_orphaned_branches(db: &Omnigraph) -> Result<BranchReconci
}
if !branch_snapshots.contains_key(&branch) {
let branch_snapshot =
match crate::failpoints::maybe_fail("cleanup.resolve_branch_snapshot") {
match crate::failpoints::maybe_fail(crate::failpoints::names::CLEANUP_RESOLVE_BRANCH_SNAPSHOT) {
Ok(()) => db.snapshot_for_branch(Some(&branch)).await,
Err(injected) => Err(injected),
};
@ -1158,7 +1158,7 @@ pub async fn reconcile_orphaned_branches(db: &Omnigraph) -> Result<BranchReconci
continue;
}
}
let outcome = match crate::failpoints::maybe_fail("cleanup.reconcile_fork") {
let outcome = match crate::failpoints::maybe_fail(crate::failpoints::names::CLEANUP_RECONCILE_FORK) {
Ok(()) => storage.force_delete_branch(&full_path, &branch).await,
Err(injected) => Err(injected),
};
@ -1308,7 +1308,10 @@ mod tests {
ds.create_branch("feature", base, None).await.unwrap();
}
let _fp = ScopedFailPoint::new("cleanup.resolve_branch_snapshot", "return");
let _fp = ScopedFailPoint::new(
crate::failpoints::names::CLEANUP_RESOLVE_BRANCH_SNAPSHOT,
"return",
);
let stats = reconcile_orphaned_branches(&db).await.unwrap();
assert_eq!(

View file

@ -648,7 +648,7 @@ where
// `recover_schema_state_files`:
// - crash before commit → manifest unchanged; staging deleted on open
// - crash after commit → manifest advanced; staging renamed on open
crate::failpoints::maybe_fail("schema_apply.before_staging_write")?;
crate::failpoints::maybe_fail(crate::failpoints::names::SCHEMA_APPLY_BEFORE_STAGING_WRITE)?;
let staging_pg_uri = schema_source_staging_uri(&db.root_uri);
db.storage
@ -656,7 +656,7 @@ where
.await?;
write_schema_contract_staging(&db.root_uri, db.storage.as_ref(), &desired_ir).await?;
crate::failpoints::maybe_fail("schema_apply.after_staging_write")?;
crate::failpoints::maybe_fail(crate::failpoints::names::SCHEMA_APPLY_AFTER_STAGING_WRITE)?;
// `apply_schema` doesn't currently take an actor; system-attributed.
let PublishedSnapshot {
@ -669,7 +669,7 @@ where
.commit_changes_with_actor(&manifest_changes, None)
.await?;
crate::failpoints::maybe_fail("schema_apply.after_manifest_commit")?;
crate::failpoints::maybe_fail(crate::failpoints::names::SCHEMA_APPLY_AFTER_MANIFEST_COMMIT)?;
db.storage
.rename_text(&staging_pg_uri, &schema_source_uri(&db.root_uri))

View file

@ -296,7 +296,7 @@ pub(super) async fn ensure_indices_for_branch(
// (one commit_staged per index built) but the manifest publish below
// hasn't run. Used by
// `tests/failpoints.rs::ensure_indices_phase_b_failure_recovered_on_next_open`.
crate::failpoints::maybe_fail("ensure_indices.post_phase_b_pre_manifest_commit")?;
crate::failpoints::maybe_fail(crate::failpoints::names::ENSURE_INDICES_POST_PHASE_B_PRE_MANIFEST_COMMIT)?;
if !updates.is_empty() {
commit_prepared_updates_on_branch(db, branch, &updates, None).await?;
@ -488,18 +488,52 @@ pub(super) async fn needs_index_work_edge(
|| !db.storage().has_btree_index(&ds, "dst").await?)
}
/// Result of opening a sub-table for mutation. `handle` is `None` only when a
/// non-strict (Insert/Merge) op on the WriteTxn's own branch skipped the
/// accumulation open (RFC-013 step 3b collapse #1) — there the caller needs just
/// `expected_version`. It is ALWAYS `Some` for strict ops, the fork path, and
/// every no-`txn` caller (branch merge), which use [`Self::require_handle`].
#[derive(Debug)]
pub(crate) struct OpenedForMutation {
/// The opened dataset, or `None` on the non-strict-txn open-skip path.
pub(crate) handle: Option<SnapshotHandle>,
/// The publisher's CAS fence: the opened handle's version, or — when the open
/// was skipped — the pinned base entry's version (equal absent uncovered drift).
pub(crate) expected_version: u64,
/// Location of the table's dataset: `open_for_mutation_on_branch` builds it as
/// the database root URI joined with the manifest entry's `table_path`.
pub(crate) full_path: String,
/// Branch the table is written on. `None` when the resolved branch is main;
/// on the skip path for a named branch it is that active branch, otherwise it
/// is whatever `open_owned_dataset_for_branch_write` reports.
pub(crate) table_branch: Option<String>,
}
impl OpenedForMutation {
    /// Unpack into `(handle, full_path, table_branch)` for callers that cannot
    /// proceed without an opened dataset (strict ops, the fork path, every
    /// no-`txn` caller). `expected_version` is discarded.
    ///
    /// # Panics
    ///
    /// Panics when `handle` is `None`. That skip only happens on the non-strict
    /// `txn` path, which these callers are not on, so hitting it means a later
    /// change broke that contract; `ctx` names the offending call site in the
    /// panic message.
    pub(crate) fn require_handle(self, ctx: &str) -> (SnapshotHandle, String, Option<String>) {
        match self.handle {
            Some(handle) => (handle, self.full_path, self.table_branch),
            None => {
                panic!("{ctx}: open_for_mutation returned no handle on a path that requires one")
            }
        }
    }
}
pub(super) async fn open_for_mutation(
db: &Omnigraph,
table_key: &str,
op_kind: crate::db::MutationOpKind,
) -> Result<(SnapshotHandle, String, Option<String>)> {
) -> Result<OpenedForMutation> {
let current_branch = db
.coordinator
.read()
.await
.current_branch()
.map(str::to_string);
open_for_mutation_on_branch(db, current_branch.as_deref(), table_key, op_kind).await
// `open_for_mutation` is the no-txn entry (branch merge). Passing `None`
// keeps the exact pre-WriteTxn code path (a fresh `resolved_branch_target`
// that re-validates the schema). With `txn = None` the non-strict early-skip
// in `open_for_mutation_on_branch` never fires, so this always returns a
// `Some(handle)` for its callers.
open_for_mutation_on_branch(db, current_branch.as_deref(), table_key, op_kind, None).await
}
/// Open a sub-table for mutation. The `op_kind` selects the strict-vs-relaxed
@ -513,15 +547,69 @@ pub(super) async fn open_for_mutation_on_branch(
branch: Option<&str>,
table_key: &str,
op_kind: crate::db::MutationOpKind,
) -> Result<(SnapshotHandle, String, Option<String>)> {
txn: Option<&crate::db::WriteTxn>,
) -> Result<OpenedForMutation> {
db.ensure_schema_apply_not_locked("write").await?;
let resolved = db.resolved_branch_target(branch).await?;
let entry = resolved
.snapshot
// Source the resolved (snapshot, branch). With a `WriteTxn` the contract was
// validated once at capture, so use the pinned base + resolved branch instead
// of `resolved_branch_target` (which re-runs `ensure_schema_state_valid`). The
// base is the same fresh per-branch manifest read the no-txn path would have
// resolved — only the redundant schema re-validation is dropped. Without a txn
// this is byte-identical to the prior `resolved_branch_target` call.
let (snapshot, resolved_branch) = match txn {
Some(txn) => (txn.base.clone(), txn.branch.clone()),
None => {
let resolved = db.resolved_branch_target(branch).await?;
(resolved.snapshot, resolved.branch)
}
};
let entry = snapshot
.entry(table_key)
.ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?;
let full_path = format!("{}/{}", db.root_uri, entry.table_path);
match resolved.branch.as_deref() {
// Collapse #1 (RFC-013 step 3b): a non-strict op (Insert/Merge) on the txn's
// own branch needs no dataset open for ACCUMULATION — the only thing the
// caller reads from this handle on the non-strict path is `.version()` (the
// publisher's CAS fence), which is exactly the pinned base version. The base
// already validated the schema contract once, and the staging reopen
// (`reopen_for_mutation`) plus the publisher CAS in `commit_all` are the real
// drift guards. So skip `open_dataset_head_for_write` entirely and source the
// expected version from the pinned entry.
//
// Gated on `txn.is_some()`: without a txn (branch merge's `open_for_mutation`)
// every arm below is byte-identical to before. STRICT ops (Update/Delete/
// SchemaRewrite) always open live HEAD + run `ensure_expected_version`
// (read-modify-write SI), and any write that must FORK (the table isn't yet on
// the resolved branch) opens too (the fork is a real Lance state advance the
// manifest snapshot can't substitute for).
if txn.is_some() && !op_kind.strict_pre_stage_version_check() {
match resolved_branch.as_deref() {
// Non-strict, table already on the active branch → no open, no fork.
Some(active_branch) if entry.table_branch.as_deref() == Some(active_branch) => {
return Ok(OpenedForMutation {
handle: None,
expected_version: entry.table_version,
full_path,
table_branch: Some(active_branch.to_string()),
});
}
// Main branch, non-strict → no open. (Main never forks.)
None => {
return Ok(OpenedForMutation {
handle: None,
expected_version: entry.table_version,
full_path,
table_branch: None,
});
}
// Non-strict but the table isn't on the active branch yet — falls
// through to fork below.
Some(_) => {}
}
}
match resolved_branch.as_deref() {
None => {
let ds = db
.storage()
@ -531,7 +619,13 @@ pub(super) async fn open_for_mutation_on_branch(
db.storage()
.ensure_expected_version(&ds, table_key, entry.table_version)?;
}
Ok((ds, full_path, None))
let version = ds.version();
Ok(OpenedForMutation {
handle: Some(ds),
expected_version: version,
full_path,
table_branch: None,
})
}
Some(active_branch) => {
let (ds, table_branch) = open_owned_dataset_for_branch_write(
@ -544,7 +638,13 @@ pub(super) async fn open_for_mutation_on_branch(
op_kind,
)
.await?;
Ok((ds, full_path, table_branch))
let version = ds.version();
Ok(OpenedForMutation {
handle: Some(ds),
expected_version: version,
full_path,
table_branch,
})
}
}
}
@ -571,7 +671,7 @@ pub(super) async fn open_owned_dataset_for_branch_write(
Ok((ds, Some(active_branch.to_string())))
}
source_branch => {
crate::failpoints::maybe_fail("fork.before_classify")?;
crate::failpoints::maybe_fail(crate::failpoints::names::FORK_BEFORE_CLASSIFY)?;
// Authority check before forking: re-read the live manifest. If this
// table is already forked on active_branch, a concurrent first-write
// won the race and our snapshot is stale — that is a retryable
@ -667,7 +767,7 @@ pub(crate) async fn classify_fork_ref(
// fresh-authority read (no-op without the `failpoints` feature). Lets a
// test exercise the Indeterminate path — a read failure on a live branch
// must classify as Indeterminate (skip), never Orphan (destroy).
let fresh = match crate::failpoints::maybe_fail("classify.fresh_read") {
let fresh = match crate::failpoints::maybe_fail(crate::failpoints::names::CLASSIFY_FRESH_READ) {
Ok(()) => db.fresh_snapshot_for_branch(Some(branch)).await,
Err(injected) => Err(injected),
};
@ -751,7 +851,7 @@ pub(super) async fn reclaim_orphaned_fork_and_refork(
}
}
crate::failpoints::maybe_fail("fork.before_reclaim")?;
crate::failpoints::maybe_fail(crate::failpoints::names::FORK_BEFORE_RECLAIM)?;
db.storage()
.force_delete_branch(full_path, active_branch)
.await
@ -1014,7 +1114,7 @@ async fn stage_and_commit_btree(
// to demonstrate that a stage-step failure in the staged-index
// path (`stage_create_btree_index` succeeded; `commit_staged` not
// yet called) leaves no Lance-HEAD drift on the touched table.
crate::failpoints::maybe_fail("ensure_indices.post_stage_pre_commit_btree")?;
crate::failpoints::maybe_fail(crate::failpoints::names::ENSURE_INDICES_POST_STAGE_PRE_COMMIT_BTREE)?;
let new_ds = db
.storage()
.commit_staged(ds.clone(), staged)
@ -1065,12 +1165,30 @@ async fn prepare_updates_for_commit(
db: &Omnigraph,
branch: Option<&str>,
updates: &[crate::db::SubTableUpdate],
txn: Option<&crate::db::WriteTxn>,
// Post-`commit_staged` handles handed out by `StagedMutation::commit_all`
// (RFC-013 step 3b, collapse #4): table_key → the handle already open at
// its just-committed version. When a table's handle is present, the index
// build below reuses it and SKIPS the `reopen_for_mutation` open. Absent
// entries (other writers — schema apply, merge, ensure_indices, tests —
// pass `HashMap::new()`; inline-committed/delete tables are never staged)
// keep the byte-identical `reopen_for_mutation` path.
mut committed_handles: std::collections::HashMap<String, SnapshotHandle>,
) -> Result<Vec<crate::db::SubTableUpdate>> {
if updates.is_empty() {
return Ok(Vec::new());
}
let snapshot = db.snapshot_for_branch(branch).await?;
// With a `WriteTxn` the schema contract was validated once at capture, so
// reuse the pinned base entries (same per-branch manifest snapshot) instead
// of `snapshot_for_branch` (which re-runs `ensure_schema_state_valid`). Only
// the `entry(table_key).table_path` is read out of it here, identical to the
// no-txn path; the post-`commit_staged` index build below still reopens the
// dataset at its just-committed version. Without a txn, byte-identical.
let snapshot = match txn {
Some(txn) => txn.base.clone(),
None => db.snapshot_for_branch(branch).await?,
};
let mut prepared = Vec::with_capacity(updates.len());
for update in updates {
@ -1084,21 +1202,34 @@ async fn prepare_updates_for_commit(
let mut prepared_update = update.clone();
if prepared_update.row_count > 0 {
let full_path = format!("{}/{}", db.root_uri, entry.table_path);
// Strict version check is correct here: this runs INSIDE
// Reuse the post-`commit_staged` handle when the caller handed one
// out (collapse #4): it is already open at exactly
// `prepared_update.table_version`, so the defense-in-depth strict
// re-check `reopen_for_mutation` would run is trivially satisfied
// and the open is redundant. When no handle is present (other
// writers, or any non-staged table), fall back to the byte-identical
// `reopen_for_mutation` path.
//
// Strict version check is correct on the fallback: this runs INSIDE
// the publisher commit path, after `commit_staged` already
// advanced Lance HEAD to `prepared_update.table_version`.
// The check is a defense-in-depth assertion that the
// dataset state matches what we just committed; not the
// pre-stage race the op-kind policy targets.
let mut ds = reopen_for_mutation(
db,
&prepared_update.table_key,
&full_path,
prepared_update.table_branch.as_deref(),
prepared_update.table_version,
crate::db::MutationOpKind::SchemaRewrite,
)
.await?;
let mut ds = match committed_handles.remove(&prepared_update.table_key) {
Some(ds) => ds,
None => {
reopen_for_mutation(
db,
&prepared_update.table_key,
&full_path,
prepared_update.table_branch.as_deref(),
prepared_update.table_version,
crate::db::MutationOpKind::SchemaRewrite,
)
.await?
}
};
// Any column not yet buildable (e.g. a vector column whose rows
// have null embeddings) is deferred and logged inside
// build_indices; a later ensure_indices/optimize materializes it.
@ -1237,37 +1368,27 @@ pub(super) async fn commit_updates(
.await
.current_branch()
.map(str::to_string);
let prepared = prepare_updates_for_commit(db, current_branch.as_deref(), updates).await?;
let prepared = prepare_updates_for_commit(
db,
current_branch.as_deref(),
updates,
None,
std::collections::HashMap::new(),
)
.await?;
commit_prepared_updates(db, &prepared, None).await
}
pub(super) async fn commit_manifest_updates(
pub(super) async fn commit_merge_with_actor(
db: &Omnigraph,
updates: &[crate::db::SubTableUpdate],
) -> Result<u64> {
db.coordinator
.write()
.await
.commit_manifest_updates(updates)
.await
}
pub(super) async fn record_merge_commit(
db: &Omnigraph,
manifest_version: u64,
parent_commit_id: &str,
merged_parent_commit_id: &str,
actor_id: Option<&str>,
) -> Result<String> {
db.coordinator
.write()
.await
.record_merge_commit(
manifest_version,
parent_commit_id,
merged_parent_commit_id,
actor_id,
)
.commit_merge_with_actor(updates, merged_parent_commit_id, actor_id)
.await
.map(|snapshot_id| snapshot_id.as_str().to_string())
}
@ -1281,9 +1402,12 @@ pub(super) async fn commit_updates_on_branch_with_expected(
updates: &[crate::db::SubTableUpdate],
expected_table_versions: &std::collections::HashMap<String, u64>,
actor_id: Option<&str>,
txn: Option<&crate::db::WriteTxn>,
committed_handles: std::collections::HashMap<String, SnapshotHandle>,
) -> Result<u64> {
db.ensure_schema_apply_not_locked("write commit").await?;
let prepared = prepare_updates_for_commit(db, branch, updates).await?;
let prepared =
prepare_updates_for_commit(db, branch, updates, txn, committed_handles).await?;
commit_prepared_updates_on_branch_with_expected(
db,
branch,

View file

@ -14,15 +14,14 @@
//! this change additive.
//!
//! Atomicity caveat: append to `_graph_commit_recoveries.lance` is
//! sequential w.r.t. the `CommitGraph::append_commit` write. A crash
//! between the two leaves an orphan commit-graph row with no audit row.
//! Same shape as the existing `_graph_commits` + `_graph_commit_actors`
//! split; the recovery sweep tolerates it the same way (re-entry sees
//! `NoMovement` for already-restored / already-published tables; the
//! audit append is retried).
//! sequential w.r.t. the recovery commit, which RFC-013 Phase 7 records in
//! `__manifest` (folded into the recovery publish CAS via `publish_recovery_commit`).
//! A crash between the publish and this audit append leaves a recovery commit
//! with no audit row. The recovery sweep tolerates it the same way (re-entry
//! sees `NoMovement` for already-restored / already-published tables; the audit
//! append is retried, minting a fresh recovery commit).
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use arrow_array::{
Array, RecordBatch, RecordBatchIterator, StringArray, TimestampMicrosecondArray,
@ -195,7 +194,11 @@ async fn create_recoveries_dataset(root_uri: &str) -> Result<Dataset> {
};
match Dataset::write(reader, &uri as &str, Some(params)).await {
Ok(dataset) => Ok(dataset),
Err(err) if err.to_string().contains("Dataset already exists") => Dataset::open(&uri)
// Create-or-open idempotency — match the typed `DatasetAlreadyExists`
// variant, not the display string (not a Lance API contract). Same
// discipline as `commit_graph.rs`'s create-or-open; pinned by
// `lance_surface_guards.rs::lance_error_dataset_already_exists_variant_exists`.
Err(lance::Error::DatasetAlreadyExists { .. }) => Dataset::open(&uri)
.await
.map_err(|open_err| OmniError::Lance(open_err.to_string())),
Err(err) => Err(OmniError::Lance(err.to_string())),
@ -276,13 +279,6 @@ fn decode_row(batch: &RecordBatch, row: usize) -> Result<RecoveryAuditRecord> {
})
}
/// Returns the current wall-clock time as microseconds since the Unix epoch.
///
/// # Errors
///
/// Returns an `OmniError::manifest_internal` error when the system clock reads
/// earlier than the Unix epoch, or when the microsecond count does not fit in
/// an `i64` (a plain `as` cast would silently truncate the `u128` instead).
pub(crate) fn now_micros() -> Result<i64> {
    let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH).map_err(|e| {
        OmniError::manifest_internal(format!("system clock before unix epoch: {}", e))
    })?;
    // `Duration::as_micros` yields a `u128`; narrow it with a checked conversion
    // so an out-of-range value is reported rather than wrapped.
    i64::try_from(since_epoch.as_micros()).map_err(|e| {
        OmniError::manifest_internal(format!(
            "system clock microseconds do not fit in i64: {}",
            e
        ))
    })
}
#[cfg(test)]
mod tests {
use super::*;

View file

@ -1068,10 +1068,13 @@ async fn publish_rewritten_merge_table(
// source onto target). The inline `delete_where` later in this
// function operates on rows the rewrite chose to remove, not
// user-facing predicates, so Merge is the correct policy here.
let (ds, full_path, table_branch) = target_db
// `open_for_mutation` is the no-txn entry, so collapse #1's non-strict
// open-skip (gated on `txn.is_some()`) never fires here — the handle is
// always `Some`.
let (mut current_ds, full_path, table_branch) = target_db
.open_for_mutation(table_key, crate::db::MutationOpKind::Merge)
.await?;
let mut current_ds = ds;
.await?
.require_handle("branch merge");
// Phase 1: merge_insert changed/new rows (preserves _row_created_at_version for
// existing rows, bumps _row_last_updated_at_version only for actually-changed rows).
@ -1125,7 +1128,7 @@ async fn publish_rewritten_merge_table(
// rows are on Lance HEAD but the delete has not committed and the
// achieved-version intent has not been recorded, so recovery must roll BACK.
// See tests/failpoints.rs::branch_merge_rewrite_partial_after_merge_rolls_back.
crate::failpoints::maybe_fail("branch_merge.rewrite_after_merge_pre_delete")?;
crate::failpoints::maybe_fail(crate::failpoints::names::BRANCH_MERGE_REWRITE_AFTER_MERGE_PRE_DELETE)?;
// Phase 2: delete removed rows via deletion vectors.
//
@ -1156,7 +1159,7 @@ async fn publish_rewritten_merge_table(
// recorded, so recovery must roll BACK (the index is reconciler-owned derived
// state, but the merge itself never reached its commit boundary). See
// tests/failpoints.rs::branch_merge_rewrite_partial_after_delete_rolls_back.
crate::failpoints::maybe_fail("branch_merge.rewrite_after_delete_pre_index")?;
crate::failpoints::maybe_fail(crate::failpoints::names::BRANCH_MERGE_REWRITE_AFTER_DELETE_PRE_INDEX)?;
// Phase 3: rebuild indices.
//
@ -1237,10 +1240,13 @@ async fn publish_adopted_delta(
table_key: &str,
delta: &AdoptDelta,
) -> Result<crate::db::SubTableUpdate> {
let (ds, full_path, table_branch) = target_db
// `open_for_mutation` is the no-txn entry, so collapse #1's non-strict
// open-skip (gated on `txn.is_some()`) never fires here — the handle is
// always `Some`.
let (mut current_ds, full_path, table_branch) = target_db
.open_for_mutation(table_key, crate::db::MutationOpKind::Merge)
.await?;
let mut current_ds = ds;
.await?
.require_handle("branch merge");
// Phase 1a: append the NEW rows. `stage_append_stream` is a streaming
// `Operation::Append` — no hash join — so it never buffers the delta and
@ -1270,7 +1276,7 @@ async fn publish_adopted_delta(
// have not committed and the achieved-version intent has not been recorded, so
// recovery must roll BACK (not publish the appends-only state). See
// tests/failpoints.rs::branch_merge_adopt_partial_after_append_rolls_back.
crate::failpoints::maybe_fail("branch_merge.adopt_after_append_pre_upsert")?;
crate::failpoints::maybe_fail(crate::failpoints::names::BRANCH_MERGE_ADOPT_AFTER_APPEND_PRE_UPSERT)?;
// Phase 1b: upsert the CHANGED rows. The merge_insert hash join is now
// bounded to the genuinely-changed set, not the whole delta. It runs against
@ -1302,7 +1308,7 @@ async fn publish_adopted_delta(
// has not committed and the achieved-version intent has not been recorded, so
// recovery must roll BACK. See
// tests/failpoints.rs::branch_merge_adopt_partial_after_upsert_rolls_back.
crate::failpoints::maybe_fail("branch_merge.adopt_after_upsert_pre_delete")?;
crate::failpoints::maybe_fail(crate::failpoints::names::BRANCH_MERGE_ADOPT_AFTER_UPSERT_PRE_DELETE)?;
// Phase 2: delete removed rows via deletion vectors (inline-commit residual,
// same as the three-way path until Lance ships a public two-phase delete).
@ -1787,17 +1793,22 @@ impl Omnigraph {
// (publish_*) AND the sidecar is confirmed, but the manifest publish
// below hasn't run — so recovery rolls FORWARD. Used by
// `tests/failpoints.rs::branch_merge_phase_b_failure_recovered_on_next_open`.
crate::failpoints::maybe_fail("branch_merge.post_phase_b_pre_manifest_commit")?;
crate::failpoints::maybe_fail(crate::failpoints::names::BRANCH_MERGE_POST_PHASE_B_PRE_MANIFEST_COMMIT)?;
let manifest_version = if updates.is_empty() {
self.version().await
} else {
self.commit_manifest_updates(&updates).await?
};
// Publish the merged table versions AND the merge commit in one manifest
// CAS (RFC-013 Phase 7): `graph_commit` + `graph_head` rows ride the same
// merge-insert as the table-version rows. The merge commit's first parent
// is resolved by the publisher as the live target-branch head (the
// post-merge correct parent even if the target advanced); its merged-in
// parent is the source head. `target_head_commit_id` is no longer passed
// — it was the pre-merge target head, which the publisher reads live.
let _ = target_head_commit_id;
self.commit_merge_with_actor(&updates, source_head_commit_id, actor_id)
.await?;
// Recovery sidecar lifecycle: delete after manifest publish.
// Best-effort cleanup; the merge already landed durably so
// failing the user here is undesirable.
// Recovery sidecar lifecycle: delete after the manifest publish (Phase C).
// Best-effort cleanup; the merge already landed durably so failing the
// user here is undesirable.
if let Some((_, handle)) = recovery {
if let Err(err) =
crate::db::manifest::delete_sidecar(&handle, self.storage_adapter()).await
@ -1809,13 +1820,6 @@ impl Omnigraph {
);
}
}
self.record_merge_commit(
manifest_version,
target_head_commit_id,
source_head_commit_id,
actor_id,
)
.await?;
if changed_edge_tables {
self.invalidate_graph_index().await;

View file

@ -601,13 +601,51 @@ use super::staging::{MutationStaging, PendingMode};
/// away once Lance exposes a two-phase delete API
/// ([lance-format/lance#6658](https://github.com/lance-format/lance/issues/6658))
/// and we can stage deletes on the same path as inserts/updates.
impl Omnigraph {
    /// Produce the read handle for an edge table's committed-state `@card` scan
    /// when collapse #1 skipped the accumulation open (non-strict op + txn, so the
    /// edge-insert path never opened the edge dataset).
    ///
    /// The handle is opened at LIVE HEAD, never at the pinned `txn.base`:
    /// cardinality is validated ONCE and not rechecked at commit, so the scan has
    /// to see the freshest committed edges. A concurrent writer may commit edges
    /// to this table after `txn` capture; counting against the stale base would
    /// undercount and admit a violating insert (invariant 9).
    ///
    /// Only the table LOCATION comes from the pinned entry (stable across
    /// versions). The dataset itself is opened through
    /// `open_dataset_head_for_write` — used purely as a read here despite the
    /// name (no lock/stage) — which restores the pre-3b image (the mutation's own
    /// open).
    ///
    /// The remaining validate→commit window (a writer committing between this
    /// scan and the end-of-query commit) is the §7.1 gap, closed by RFC-013
    /// step 4.
    async fn edge_cardinality_read_handle(
        &self,
        txn: Option<&crate::db::WriteTxn>,
        table_key: &str,
    ) -> Result<SnapshotHandle> {
        let branch = txn.and_then(|t| t.branch.as_deref());
        let pinned_entry = txn.and_then(|t| t.base.entry(table_key));

        let Some(entry) = pinned_entry else {
            // Unreachable today: a `None` handle only arrives here under a txn
            // whose base contains the table. Defensive path — resolve the table
            // fresh (live) without the schema re-validation that
            // `snapshot_for_branch` would re-run.
            let live_snapshot = self.fresh_snapshot_for_branch_unchecked(branch).await?;
            return self
                .storage()
                .open_snapshot_at_table(&live_snapshot, table_key)
                .await;
        };

        // Pinned location, live version.
        let dataset_uri = self.storage().dataset_uri(&entry.table_path);
        self.storage()
            .open_dataset_head_for_write(table_key, &dataset_uri, branch)
            .await
    }
}
async fn open_table_for_mutation(
db: &Omnigraph,
staging: &mut MutationStaging,
branch: Option<&str>,
table_key: &str,
op_kind: crate::db::MutationOpKind,
) -> Result<(SnapshotHandle, String, Option<String>)> {
txn: Option<&crate::db::WriteTxn>,
) -> Result<(Option<SnapshotHandle>, String, Option<String>)> {
if let Some(prior) = staging.inline_committed.get(table_key) {
let path = staging.paths.get(table_key).ok_or_else(|| {
OmniError::manifest_internal(format!(
@ -615,6 +653,10 @@ async fn open_table_for_mutation(
table_key
))
})?;
// The inline-committed reopen does NOT validate the schema contract
// (it reopens at the post-inline-commit Lance version directly), so it
// takes no `txn` — threading it here would change nothing. Deletes are
// strict ops, so this always opens (returns `Some`).
let ds = db
.reopen_for_mutation(
table_key,
@ -624,20 +666,32 @@ async fn open_table_for_mutation(
op_kind,
)
.await?;
return Ok((ds, path.full_path.clone(), path.table_branch.clone()));
return Ok((Some(ds), path.full_path.clone(), path.table_branch.clone()));
}
let (ds, full_path, table_branch) = db
.open_for_mutation_on_branch(branch, table_key, op_kind)
// `open_for_mutation_on_branch` returns the expected version even when it
// skips the open (collapse #1, the non-strict insert/merge path): the version
// is the pinned base's, identical to the opened handle's `.version()`. Use it
// directly for `ensure_path` so the no-open path still captures the publisher
// CAS fence.
let opened = db
.open_for_mutation_on_branch(branch, table_key, op_kind, txn)
.await?;
let expected_version = ds.version();
// Pin the open-skip contract (collapse #1): a missing handle is legal ONLY on
// the non-strict `txn` path. A future change that returns `None` elsewhere
// (e.g. a new strict arm) trips this in debug builds rather than silently
// handing a `None` to a `require_handle` consumer.
debug_assert!(
opened.handle.is_some() || (txn.is_some() && !op_kind.strict_pre_stage_version_check()),
"open_for_mutation_on_branch returned no handle outside the non-strict txn open-skip path",
);
staging.ensure_path(
table_key,
full_path.clone(),
table_branch.clone(),
expected_version,
opened.full_path.clone(),
opened.table_branch.clone(),
opened.expected_version,
op_kind,
);
Ok((ds, full_path, table_branch))
Ok((opened.handle, opened.full_path, opened.table_branch))
}
/// D₂ parse-time check: a single mutation query is either insert/update-only
@ -720,14 +774,14 @@ impl Omnigraph {
params: &ParamMap,
actor_id: Option<&str>,
) -> Result<MutationResult> {
self.ensure_schema_state_valid().await?;
// Converge any pending recovery sidecar (a previously failed
// writer's Phase B → Phase C residual) before executing: the
// inline delete path advances Lance HEAD during execution and
// the staged path's commit-time drift guard refuses
// sidecar-covered drift, so a long-lived handle must heal here
// — not at restart. One `list_dir` when no sidecars exist (the
// steady state).
// steady state). MUST run before `open_write_txn` below — the heal
// may advance the manifest, so the pinned base must be captured after.
self.heal_pending_recovery_sidecars().await?;
let requested = Self::normalize_branch_name(branch)?;
// Reject internal `__run__*` / system-prefixed branches at the
@ -737,6 +791,16 @@ impl Omnigraph {
if let Some(name) = requested.as_deref() {
crate::db::ensure_public_branch_ref(name, "mutate")?;
}
// Capture-once write transaction (RFC-013 step 3b). `open_write_txn`
// validates the schema contract ONCE (it resolves the branch target,
// whose first line is `ensure_schema_state_valid`) and pins the base
// snapshot for this write. Threaded as `Some(&txn)` through execution,
// staging commit, and the manifest publish so the per-table opens and
// the commit-time OCC re-read reuse the pinned base instead of
// re-validating the contract at every resolve point. Captured AFTER the
// recovery heal (which may advance the manifest) and AFTER `requested`
// is known so it pins the post-heal snapshot for the correct branch.
let txn = self.open_write_txn(requested.as_deref()).await?;
let resolved_params = enrich_mutation_params(params)?;
// Per-query staging accumulator. Inserts and updates push batches
@ -785,7 +849,13 @@ impl Omnigraph {
};
let exec_result = self
.execute_named_mutation(&ir, &resolved_params, requested.as_deref(), &mut staging)
.execute_named_mutation(
&ir,
&resolved_params,
requested.as_deref(),
&mut staging,
Some(&txn),
)
.await;
match exec_result {
@ -799,13 +869,20 @@ impl Omnigraph {
// interleave between our commit_staged and our publish
// (which would correctly fail our CAS but leave Lance
// HEAD advanced — the residual class MR-870 recovers).
let (updates, expected_versions, sidecar_handle, _queue_guards) = staged
let super::staging::CommittedMutation {
updates,
expected_versions,
sidecar_handle,
guards: _queue_guards,
committed_handles,
} = staged
.commit_all(
self,
requested.as_deref(),
crate::db::manifest::SidecarKind::Mutation,
actor_id,
fork_queue_guards,
Some(&txn),
)
.await?;
// Failpoint that wedges the documented finalize→publisher
@ -818,12 +895,14 @@ impl Omnigraph {
// across this failure so the next `Omnigraph::open`'s
// recovery sweep can roll forward — see
// `tests/failpoints.rs::recovery_rolls_forward_after_finalize_publisher_failure`.
crate::failpoints::maybe_fail("mutation.post_finalize_pre_publisher")?;
crate::failpoints::maybe_fail(crate::failpoints::names::MUTATION_POST_FINALIZE_PRE_PUBLISHER)?;
self.commit_updates_on_branch_with_expected(
requested.as_deref(),
&updates,
&expected_versions,
actor_id,
Some(&txn),
committed_handles,
)
.await?;
// Phase C succeeded — sidecar can be deleted. If this
@ -938,6 +1017,7 @@ impl Omnigraph {
params: &ParamMap,
branch: Option<&str>,
staging: &mut MutationStaging,
txn: Option<&crate::db::WriteTxn>,
) -> Result<MutationResult> {
let mut total = MutationResult::default();
for op in &ir.ops {
@ -946,7 +1026,7 @@ impl Omnigraph {
type_name,
assignments,
} => {
self.execute_insert(type_name, assignments, params, branch, staging)
self.execute_insert(type_name, assignments, params, branch, staging, txn)
.await?
}
MutationOpIR::Update {
@ -954,14 +1034,16 @@ impl Omnigraph {
assignments,
predicate,
} => {
self.execute_update(type_name, assignments, predicate, params, branch, staging)
.await?
self.execute_update(
type_name, assignments, predicate, params, branch, staging, txn,
)
.await?
}
MutationOpIR::Delete {
type_name,
predicate,
} => {
self.execute_delete(type_name, predicate, params, branch, staging)
self.execute_delete(type_name, predicate, params, branch, staging, txn)
.await?
}
};
@ -978,6 +1060,7 @@ impl Omnigraph {
params: &ParamMap,
branch: Option<&str>,
staging: &mut MutationStaging,
txn: Option<&crate::db::WriteTxn>,
) -> Result<MutationResult> {
let mut resolved: HashMap<String, Literal> = HashMap::new();
for a in assignments {
@ -1025,8 +1108,12 @@ impl Omnigraph {
} else {
crate::db::MutationOpKind::Insert
};
// Node inserts are non-strict (Insert/Merge), so with a `WriteTxn`
// this opens NOTHING (collapse #1) — the handle is discarded anyway;
// only `ensure_path`'s captured version (read inside
// `open_table_for_mutation`) is used downstream.
let (_ds, _full_path, _table_branch) =
open_table_for_mutation(self, staging, branch, &table_key, insert_kind).await?;
open_table_for_mutation(self, staging, branch, &table_key, insert_kind, txn).await?;
// Accumulate. @key inserts go into the Merge stream (so a
// later update on the same id coalesces correctly); no-key
// inserts go into the Append stream.
@ -1059,13 +1146,16 @@ impl Omnigraph {
)?;
}
let table_key = format!("edge:{}", type_name);
// Capture pre-write metadata on first touch (no Lance write).
let (ds, _full_path, _table_branch) = open_table_for_mutation(
// Capture pre-write metadata on first touch. Edge inserts are
// non-strict, so with a `WriteTxn` this opens NOTHING (collapse #1)
// and returns `None`.
let (handle, _full_path, _table_branch) = open_table_for_mutation(
self,
staging,
branch,
&table_key,
crate::db::MutationOpKind::Insert,
txn,
)
.await?;
// Accumulate the new edge row. Edge IDs are ULID-generated so
@ -1075,9 +1165,27 @@ impl Omnigraph {
// Edge cardinality validation: scan committed edges via Lance
// + iterate pending edges in-memory for the `src` column,
// group-by-src. The pending side already includes the row
// we just appended (above).
validate_edge_cardinality_with_pending(self, &ds, staging, &table_key, edge_type)
// we just appended (above). When the open was skipped (collapse
// #1), resolve a read handle for the committed scan at LIVE HEAD
// (`edge_cardinality_read_handle`, #298) — NOT the pinned txn.base,
// which would undercount edges a concurrent writer committed since
// capture. Only when cardinality is non-default, so the common
// default-cardinality edge keeps the open-free path. (The residual
// validate→commit race is the §7.1 gap — step 4.)
if !edge_type.cardinality.is_default() {
let committed_ds = match handle {
Some(h) => h,
None => self.edge_cardinality_read_handle(txn, &table_key).await?,
};
validate_edge_cardinality_with_pending(
self,
&committed_ds,
staging,
&table_key,
edge_type,
)
.await?;
}
self.invalidate_graph_index().await;
@ -1098,6 +1206,7 @@ impl Omnigraph {
params: &ParamMap,
branch: Option<&str>,
staging: &mut MutationStaging,
txn: Option<&crate::db::WriteTxn>,
) -> Result<MutationResult> {
// Defense in depth: ensure this is a node type
if !self.catalog().node_types.contains_key(type_name) {
@ -1122,14 +1231,18 @@ impl Omnigraph {
let blob_props = self.catalog().node_types[type_name].blob_properties.clone();
let table_key = format!("node:{}", type_name);
let (ds, _full_path, _table_branch) = open_table_for_mutation(
let (handle, _full_path, _table_branch) = open_table_for_mutation(
self,
staging,
branch,
&table_key,
crate::db::MutationOpKind::Update,
txn,
)
.await?;
// Update is a STRICT op, so collapse #1 never skips its open — the
// handle is always `Some` (and it's needed for the committed scan below).
let ds = handle.expect("strict Update op always opens its dataset");
// Scan committed via Lance + apply the same predicate to pending
// batches via DataFusion `MemTable` (read-your-writes for prior
@ -1228,13 +1341,14 @@ impl Omnigraph {
params: &ParamMap,
branch: Option<&str>,
staging: &mut MutationStaging,
txn: Option<&crate::db::WriteTxn>,
) -> Result<MutationResult> {
let is_node = self.catalog().node_types.contains_key(type_name);
if is_node {
self.execute_delete_node(type_name, predicate, params, branch, staging)
self.execute_delete_node(type_name, predicate, params, branch, staging, txn)
.await
} else {
self.execute_delete_edge(type_name, predicate, params, branch, staging)
self.execute_delete_edge(type_name, predicate, params, branch, staging, txn)
.await
}
}
@ -1246,18 +1360,22 @@ impl Omnigraph {
params: &ParamMap,
branch: Option<&str>,
staging: &mut MutationStaging,
txn: Option<&crate::db::WriteTxn>,
) -> Result<MutationResult> {
let pred_sql = predicate_to_sql(predicate, params, false)?;
let table_key = format!("node:{}", type_name);
let (ds, full_path, table_branch) = open_table_for_mutation(
let (handle, full_path, table_branch) = open_table_for_mutation(
self,
staging,
branch,
&table_key,
crate::db::MutationOpKind::Delete,
txn,
)
.await?;
// Delete is a STRICT op, so collapse #1 never skips its open.
let ds = handle.expect("strict Delete op always opens its dataset");
let initial_version = ds.version();
// Scan matching IDs for cascade. Per D₂ this never overlaps with
@ -1305,7 +1423,7 @@ impl Omnigraph {
crate::db::MutationOpKind::Delete,
)
.await?;
crate::failpoints::maybe_fail("mutation.delete_node_pre_primary_delete")?;
crate::failpoints::maybe_fail(crate::failpoints::names::MUTATION_DELETE_NODE_PRE_PRIMARY_DELETE)?;
let (_new_ds, delete_state) = self
.storage_inline_residual()
.delete_where(&full_path, ds, &pred_sql)
@ -1347,14 +1465,17 @@ impl Omnigraph {
let edge_table_key = format!("edge:{}", edge_name);
let cascade_filter = cascade_filters.join(" OR ");
let (edge_ds, edge_full_path, edge_table_branch) = open_table_for_mutation(
let (edge_handle, edge_full_path, edge_table_branch) = open_table_for_mutation(
self,
staging,
branch,
&edge_table_key,
crate::db::MutationOpKind::Delete,
txn,
)
.await?;
// Delete is a STRICT op, so collapse #1 never skips its open.
let edge_ds = edge_handle.expect("strict Delete op always opens its dataset");
let (_new_edge_ds, edge_delete) = self
.storage_inline_residual()
@ -1391,18 +1512,22 @@ impl Omnigraph {
params: &ParamMap,
branch: Option<&str>,
staging: &mut MutationStaging,
txn: Option<&crate::db::WriteTxn>,
) -> Result<MutationResult> {
let pred_sql = predicate_to_sql(predicate, params, true)?;
let table_key = format!("edge:{}", type_name);
let (ds, full_path, table_branch) = open_table_for_mutation(
let (handle, full_path, table_branch) = open_table_for_mutation(
self,
staging,
branch,
&table_key,
crate::db::MutationOpKind::Delete,
txn,
)
.await?;
// Delete is a STRICT op, so collapse #1 never skips its open.
let ds = handle.expect("strict Delete op always opens its dataset");
let (_new_ds, delete_state) = self
.storage_inline_residual()

View file

@ -440,6 +440,26 @@ struct StagedTableEntry {
staged_write: StagedHandle,
}
/// Output of [`StagedMutation::commit_all`] (Phase B): the publisher's input plus
/// the queue guards the caller must hold across the manifest publish.
///
/// `commit_all` builds this as its final `Ok(..)` value. The mutation path
/// destructures it by field and forwards `updates`, `expected_versions` and
/// `committed_handles` to `commit_updates_on_branch_with_expected`.
pub(crate) struct CommittedMutation {
/// Per-table updates to publish to the manifest.
/// Seeded from the inline-committed updates, then extended with one entry per
/// staged table once its `commit_staged` has returned.
pub(crate) updates: Vec<SubTableUpdate>,
/// Per-table manifest pins refreshed under the write queue — the publisher's CAS fence.
pub(crate) expected_versions: HashMap<String, u64>,
/// Recovery sidecar to delete after Phase C succeeds (`None` when nothing staged).
pub(crate) sidecar_handle: Option<RecoverySidecarHandle>,
/// Per-`(table, branch)` write-queue guards — the caller MUST hold these across
/// the manifest publish (see `commit_all`) so no writer interleaves between
/// `commit_staged` and the publish.
/// The mutation path binds this field to a named local (`_queue_guards`) so the
/// guards stay alive for the rest of that scope instead of dropping immediately.
pub(crate) guards: Vec<tokio::sync::OwnedMutexGuard<()>>,
/// Post-`commit_staged` handle per STAGED table (table_key → handle at the
/// just-committed version). Carried out (RFC-013 step 3b, collapse #4) so the
/// publish-prepare index build reuses it instead of a fresh `reopen_for_mutation`
/// at the same version. Inline-committed / delete tables are absent (no staged handle).
/// The publish-prepare step takes a table's handle out of this map by key and
/// falls back to `reopen_for_mutation` when no entry is present.
pub(crate) committed_handles: HashMap<String, SnapshotHandle>,
}
impl StagedMutation {
/// **Phase B** of the two-phase commit: acquire per-`(table_key,
/// branch)` queues, revalidate manifest pins, write the recovery
@ -485,12 +505,8 @@ impl StagedMutation {
Vec<(String, Option<String>)>,
Vec<tokio::sync::OwnedMutexGuard<()>>,
)>,
) -> Result<(
Vec<SubTableUpdate>,
HashMap<String, u64>,
Option<RecoverySidecarHandle>,
Vec<tokio::sync::OwnedMutexGuard<()>>,
)> {
txn: Option<&crate::db::WriteTxn>,
) -> Result<CommittedMutation> {
let StagedMutation {
inline_committed,
mut staged,
@ -585,7 +601,18 @@ impl StagedMutation {
// Multi-coordinator deployments (§VI.27 aspirational) get
// genuine cross-process drift detection from this read for
// free.
let snapshot = db.fresh_snapshot_for_branch(branch).await?;
//
// This MUST be a FRESH per-branch manifest read (never the warm
// cache) for the OCC re-capture below — but with a `WriteTxn` the
// schema contract was already validated at capture, so use the
// `_unchecked` variant, which drops the redundant
// `ensure_schema_state_valid` AND the commit-graph load the OCC read
// never consults (a fresh manifest read yields the same `Snapshot`).
// Without a txn this is byte-identical to the prior checked call.
let snapshot = match txn {
Some(_) => db.fresh_snapshot_for_branch_unchecked(branch).await?,
None => db.fresh_snapshot_for_branch(branch).await?,
};
for entry in staged.iter_mut() {
let current = snapshot
.entry(&entry.table_key)
@ -619,15 +646,20 @@ impl StagedMutation {
// live Lance HEAD still equals that manifest pin. If an external
// raw Lance write or a pre-fix maintenance path moved HEAD without
// publishing `__manifest`, this write must not silently fold it.
let head = db
.storage()
.open_dataset_head_for_write(
&entry.table_key,
&entry.path.full_path,
entry.path.table_branch.as_deref(),
)
.await?
.version();
//
// `latest_version_id` reads the latest manifest pointer off the
// already-open staged handle (the #2 staging open) WITHOUT a fresh
// `Dataset::open` — the same cheap live-HEAD probe
// `ManifestCoordinator::probe_latest_version` uses. This replaces a
// redundant `open_dataset_head_for_write` (RFC-013 step 3b, collapse
// #3): the drift comparison below is byte-identical; only how `head`
// is obtained changes (probe vs cold open).
let head = entry
.dataset
.dataset()
.latest_version_id()
.await
.map_err(|e| OmniError::Lance(e.to_string()))?;
if head < current {
return Err(OmniError::manifest_internal(format!(
"table '{}' Lance HEAD version {} is behind manifest version {}",
@ -786,6 +818,12 @@ impl StagedMutation {
let mut updates: Vec<SubTableUpdate> = inline_committed.into_values().collect();
// Carry each staged table's post-`commit_staged` handle out so the
// publish-prepare index build reuses it (collapse #4) instead of
// re-opening the dataset at the same just-committed version.
let mut committed_handles: HashMap<String, SnapshotHandle> =
HashMap::with_capacity(staged.len());
for entry in staged {
let StagedTableEntry {
table_key,
@ -798,15 +836,22 @@ impl StagedMutation {
let new_ds = db.storage().commit_staged(dataset, staged_write).await?;
let state = db.storage().table_state(&path.full_path, &new_ds).await?;
updates.push(SubTableUpdate {
table_key,
table_key: table_key.clone(),
table_version: state.version,
table_branch: path.table_branch.clone(),
row_count: state.row_count,
version_metadata: state.version_metadata,
});
committed_handles.insert(table_key, new_ds);
}
Ok((updates, expected_versions, sidecar_handle, guards))
Ok(CommittedMutation {
updates,
expected_versions,
sidecar_handle,
guards,
committed_handles,
})
}
}

View file

@ -14,6 +14,115 @@ pub(crate) fn maybe_fail(_name: &str) -> Result<()> {
Ok(())
}
/// Inject a *Lance* I/O error (rather than an `OmniError`) at the failpoint `name`.
///
/// Stands in for a `Dataset::open` that fails with a transient/corrupt
/// (non-not-found) error, letting a test drive the caller's lance-error
/// classification — the behavior FIX A (`read_legacy_commit_cache`) relies on:
/// a not-found is benign (empty), anything else propagates.
///
/// Without the `failpoints` feature this is a no-op that returns `Ok(())`, so
/// the injected variant is unreachable in release builds.
// `name` is only read inside the feature-gated block below, so it is unused
// when the `failpoints` feature is disabled.
#[allow(unused_variables)]
pub(crate) fn maybe_fail_lance_open(name: &str) -> std::result::Result<(), lance::Error> {
    #[cfg(feature = "failpoints")]
    {
        fail::fail_point!(name, |_| {
            let message = format!("injected failpoint triggered: {name}");
            Err(lance::Error::io(message))
        });
    }
    Ok(())
}
/// Inject a Lance `IncompatibleTransaction` error at the failpoint `name` — the
/// variant a concurrent `UpdateConfig` stamp race produces.
///
/// Gives tests a deterministic way into the v3→v4 stamp loop's exhaustion path
/// (`commit_v4_stamp_idempotently`). That path is otherwise near-unreachable: a
/// real concurrent winner stamps the SAME value, so the loop's re-read returns
/// `Ok` on the first retry.
///
/// Without the `failpoints` feature this is a no-op that returns `Ok(())`.
// `name` is only read inside the feature-gated block below, so it is unused
// when the `failpoints` feature is disabled.
#[allow(unused_variables)]
pub(crate) fn maybe_fail_lance_incompatible(name: &str) -> std::result::Result<(), lance::Error> {
    #[cfg(feature = "failpoints")]
    {
        fail::fail_point!(name, |_| {
            let message = format!("injected failpoint triggered: {name}");
            Err(lance::Error::incompatible_transaction_source(
                message.into(),
            ))
        });
    }
    Ok(())
}
/// Inject a *retryable* `RowLevelCasContention` `OmniError` at the failpoint
/// `name` — the typed conflict the manifest publisher's outer retry treats as
/// retryable (`is_retryable_publish_conflict`).
///
/// Drives the publisher's retry-on-`load_publish_state`-error path
/// deterministically: the v3→v4 migration surfaces this same type on exhaustion
/// EXPECTING the publisher to re-run the load, a path otherwise reachable only
/// under sustained multi-writer contention.
///
/// Without the `failpoints` feature this is a no-op that returns `Ok(())`.
// `name` is only read inside the feature-gated block below, so it is unused
// when the `failpoints` feature is disabled.
#[allow(unused_variables)]
pub(crate) fn maybe_fail_retryable_contention(name: &str) -> Result<()> {
    #[cfg(feature = "failpoints")]
    {
        fail::fail_point!(name, |_| {
            let message = format!("injected retryable contention failpoint: {name}");
            Err(crate::error::OmniError::manifest_row_level_cas_contention(message))
        });
    }
    Ok(())
}
/// Compile-checked catalog of every failpoint name in this crate.
///
/// Call sites (`maybe_fail`) and tests (`ScopedFailPoint` / the test
/// rendezvous helper) reference these constants instead of bare string
/// literals, so a typo is a compile error rather than a
/// silently-never-firing failpoint. Constants are grouped by the prefix of
/// their string value.
pub mod names {
    // `branch_create.*` / `branch_delete.*` / `branch_merge.*`
    pub const BRANCH_CREATE_AFTER_MANIFEST_BRANCH_CREATE: &str =
        "branch_create.after_manifest_branch_create";
    pub const BRANCH_DELETE_BEFORE_COMMIT_GRAPH_RECLAIM: &str =
        "branch_delete.before_commit_graph_reclaim";
    pub const BRANCH_DELETE_BEFORE_TABLE_CLEANUP: &str = "branch_delete.before_table_cleanup";
    pub const BRANCH_MERGE_ADOPT_AFTER_APPEND_PRE_UPSERT: &str =
        "branch_merge.adopt_after_append_pre_upsert";
    pub const BRANCH_MERGE_ADOPT_AFTER_UPSERT_PRE_DELETE: &str =
        "branch_merge.adopt_after_upsert_pre_delete";
    pub const BRANCH_MERGE_POST_PHASE_B_PRE_MANIFEST_COMMIT: &str =
        "branch_merge.post_phase_b_pre_manifest_commit";
    pub const BRANCH_MERGE_REWRITE_AFTER_DELETE_PRE_INDEX: &str =
        "branch_merge.rewrite_after_delete_pre_index";
    pub const BRANCH_MERGE_REWRITE_AFTER_MERGE_PRE_DELETE: &str =
        "branch_merge.rewrite_after_merge_pre_delete";

    // `classify.*` / `cleanup.*`
    pub const CLASSIFY_FRESH_READ: &str = "classify.fresh_read";
    pub const CLEANUP_RECONCILE_FORK: &str = "cleanup.reconcile_fork";
    pub const CLEANUP_RESOLVE_BRANCH_SNAPSHOT: &str = "cleanup.resolve_branch_snapshot";
    pub const CLEANUP_TABLE_GC: &str = "cleanup.table_gc";

    // `ensure_indices.*`
    pub const ENSURE_INDICES_POST_PHASE_B_PRE_MANIFEST_COMMIT: &str =
        "ensure_indices.post_phase_b_pre_manifest_commit";
    pub const ENSURE_INDICES_POST_STAGE_PRE_COMMIT_BTREE: &str =
        "ensure_indices.post_stage_pre_commit_btree";

    // `fork.*` / `graph_publish.*`
    pub const FORK_BEFORE_CLASSIFY: &str = "fork.before_classify";
    pub const FORK_BEFORE_RECLAIM: &str = "fork.before_reclaim";
    pub const GRAPH_PUBLISH_AFTER_MANIFEST_COMMIT: &str = "graph_publish.after_manifest_commit";
    pub const GRAPH_PUBLISH_BEFORE_COMMIT_APPEND: &str = "graph_publish.before_commit_append";

    // `init.*`
    pub const INIT_AFTER_COORDINATOR_INIT: &str = "init.after_coordinator_init";
    pub const INIT_AFTER_SCHEMA_CONTRACT_WRITTEN: &str = "init.after_schema_contract_written";
    pub const INIT_AFTER_SCHEMA_PG_WRITTEN: &str = "init.after_schema_pg_written";

    // `mutation.*`
    pub const MUTATION_DELETE_NODE_PRE_PRIMARY_DELETE: &str =
        "mutation.delete_node_pre_primary_delete";
    pub const MUTATION_POST_FINALIZE_PRE_PUBLISHER: &str = "mutation.post_finalize_pre_publisher";

    // `optimize.*`
    pub const OPTIMIZE_BEFORE_COMPACT: &str = "optimize.before_compact";
    pub const OPTIMIZE_INJECT_REINDEX_CONFLICT: &str = "optimize.inject_reindex_conflict";
    pub const OPTIMIZE_POST_PHASE_B_PRE_MANIFEST_COMMIT: &str =
        "optimize.post_phase_b_pre_manifest_commit";

    // `recovery.*`
    pub const RECOVERY_BEFORE_ROLL_FORWARD_PUBLISH: &str = "recovery.before_roll_forward_publish";
    pub const RECOVERY_ORPHAN_DISCARD_AUDIT_APPEND: &str = "recovery.orphan_discard_audit_append";
    pub const RECOVERY_RECORD_AUDIT: &str = "recovery.record_audit";
    pub const RECOVERY_SIDECAR_CONFIRM: &str = "recovery.sidecar_confirm";
    pub const RECOVERY_SIDECAR_DELETE: &str = "recovery.sidecar_delete";
    pub const RECOVERY_SIDECAR_LIST: &str = "recovery.sidecar_list";
    pub const RECOVERY_SIDECAR_WRITE: &str = "recovery.sidecar_write";

    // `schema_apply.*`
    pub const SCHEMA_APPLY_AFTER_MANIFEST_COMMIT: &str = "schema_apply.after_manifest_commit";
    pub const SCHEMA_APPLY_AFTER_STAGING_WRITE: &str = "schema_apply.after_staging_write";
    pub const SCHEMA_APPLY_BEFORE_STAGING_WRITE: &str = "schema_apply.before_staging_write";

    // RFC-013 Phase 7 migration failpoints (this branch).
    pub const MIGRATION_V3_TO_V4_LEGACY_OPEN: &str = "migration.v3_to_v4.legacy_open";
    pub const MIGRATION_V4_STAMP_FORCE_INCOMPATIBLE: &str = "migration.v4_stamp.force_incompatible";
    /// Injects a retryable `RowLevelCasContention` from `load_publish_state` so a
    /// test can prove the publisher's outer retry re-runs the load (the migration
    /// surfaces this same typed error on exhaustion).
    pub const PUBLISH_LOAD_STATE_RETRYABLE_CONTENTION: &str =
        "publish.load_state_retryable_contention";
}
#[cfg(feature = "failpoints")]
pub struct ScopedFailPoint {
name: String,
@ -27,6 +136,20 @@ impl ScopedFailPoint {
name: name.to_string(),
}
}
/// Arm `name` with a callback failpoint and return a guard with the same
/// Drop-based cleanup as `new`.
///
/// Without the guard, a panic while the point is active would leak the
/// callback into the process-global registry and fire it under later tests
/// in the same binary.
///
/// # Panics
/// Panics with "configure callback failpoint" if `fail::cfg_callback`
/// returns an error.
pub fn with_callback<F>(name: &str, callback: F) -> Self
where
    F: Fn() + Send + Sync + 'static,
{
    let registration = fail::cfg_callback(name, callback);
    registration.expect("configure callback failpoint");
    let name = name.to_string();
    Self { name }
}
}
#[cfg(feature = "failpoints")]

View file

@ -43,6 +43,23 @@ pub struct QueryIoProbes {
/// handle cache (Fix 3) serves them.
pub table_wrapper: Option<Arc<dyn WrappingObjectStore>>,
pub probe_count: Arc<AtomicU64>,
/// Counts DATA-table open CALLS through the two instrumented chokepoints
/// (`open_dataset_tracked` / `open_table_dataset`), classified by URI so the
/// internal/system tables (`__manifest`, `_graph_commits*`) are EXCLUDED — the
/// publisher CAS and commit-graph append open those every write, and counting
/// them would make the `data_open_count <= |touched_tables|` write gate
/// (RFC-013 step 3b) unreachable by threading alone. Unlike the opener-read
/// term (which mixes with the merge-insert/RI scan on the write path), this is
/// an exact open-invocation count. `forbidden_apis` keeps engine code OUTSIDE the
/// storage layer (`exec/`, `db/omnigraph/`, `loader/`, `changes/`) from opening
/// datasets except through these chokepoints, so the count is complete for the
/// keyed-write data path the gate measures. (`table_store.rs` is allow-listed and
/// does hold direct `Dataset::open`s — but only for branch-management ops
/// (`delete_branch`/`list_branches`/`force_delete_branch`), never that hot path.)
pub data_open_count: Arc<AtomicU64>,
/// Internal/system-table (`__manifest`, `_graph_commits*`) open CALLS — the
/// complement of `data_open_count`, kept for symmetry and debugging.
pub internal_open_count: Arc<AtomicU64>,
}
tokio::task_local! {
@ -80,6 +97,39 @@ pub(crate) fn record_probe() {
let _ = current(|p| p.probe_count.fetch_add(1, Ordering::Relaxed));
}
/// Directory names of the internal/system tables. Opening one of these is a
/// metadata open (publisher CAS, commit-graph append, recovery audit), NOT a
/// data-table open. Must be kept in sync with the dir constants in
/// `db/manifest/layout.rs`, `db/commit_graph.rs`, and `db/recovery_audit.rs`.
const INTERNAL_TABLE_DIRS: [&str; 4] = [
    "__manifest",
    "_graph_commits.lance",
    "_graph_commit_actors.lance",
    "_graph_commit_recoveries.lance",
];

/// Returns `true` when the last `/`-separated segment of `uri` (ignoring any
/// trailing slashes) is exactly one of [`INTERNAL_TABLE_DIRS`].
fn open_is_internal(uri: &str) -> bool {
    let without_trailing_slash = uri.trim_end_matches('/');
    // A URI with no separator at all is treated as a bare directory name.
    let final_segment = without_trailing_slash
        .rsplit_once('/')
        .map_or(without_trailing_slash, |(_, tail)| tail);
    INTERNAL_TABLE_DIRS
        .iter()
        .any(|internal| *internal == final_segment)
}
/// Count one table-open call against the active per-query probes.
///
/// The open is classified by table class (the URI's last segment, via
/// `open_is_internal`): internal/system opens bump `internal_open_count`,
/// everything else bumps `data_open_count`, so the write gate counts
/// DATA-table opens only and ignores the publisher/commit-graph metadata
/// opens. The classification runs inside the closure handed to `current`,
/// which is skipped when no probes are installed, making this a no-op in
/// production. Called at both open chokepoints.
pub(crate) fn record_open(uri: &str) {
    let _ = current(|probes| {
        let counter = if open_is_internal(uri) {
            &probes.internal_open_count
        } else {
            &probes.data_open_count
        };
        counter.fetch_add(1, Ordering::Relaxed);
    });
}
/// Per-operation staged-write counts, installed for a task via
/// [`with_merge_write_probes`]. Lets a cost-budget test assert WHICH staged-write
/// primitive an operation invokes — e.g. that an append-only fast-forward merge
@ -177,6 +227,7 @@ pub(crate) async fn open_dataset_tracked(
uri: &str,
wrapper: Option<Arc<dyn WrappingObjectStore>>,
) -> Result<Dataset> {
record_open(uri);
let result = match wrapper {
None => Dataset::open(uri).await,
Some(wrapper) => {
@ -203,6 +254,7 @@ pub(crate) async fn open_table_dataset(
version: u64,
session: Option<&Arc<lance::session::Session>>,
) -> Result<Dataset> {
record_open(location);
let mut builder = DatasetBuilder::from_uri(location).with_version(version);
if let Some(session) = session {
builder = builder.with_session(session.clone());

View file

@ -187,7 +187,10 @@ impl Omnigraph {
&omnigraph_policy::ResourceScope::Branch(branch.to_string()),
actor_id,
)?;
self.ensure_schema_state_valid().await?;
// Schema-contract validation is captured ONCE per write via the
// `WriteTxn` opened in `load_jsonl_reader` (after branch resolution).
// The redundant `ensure_schema_state_valid` that used to run here is
// subsumed by `open_write_txn`'s `resolved_branch_target` call.
// Converge any pending recovery sidecar (a previously failed
// writer's Phase B → Phase C residual) before staging anything:
// without this, sidecar-covered drift wedges every load on the
@ -397,7 +400,16 @@ async fn load_jsonl_reader<R: BufRead>(
// inline path.
let mut result = LoadResult::default();
let snapshot = db.snapshot_for_branch(branch).await?;
// Capture-once write transaction (RFC-013 step 3b). `open_write_txn`
// validates the schema contract ONCE and pins the base snapshot. Threaded
// as `Some(&txn)` through the per-table opens and the manifest publish so
// each resolve point reuses the pinned base instead of re-validating the
// contract. The branch already exists here (fork-if-missing ran in
// `load_as` before this), so this captures the post-fork snapshot. The
// load's own base read (`db.snapshot_for_branch` previously) is the same
// per-branch snapshot, so reuse `txn.base` for it — dropping a validation.
let txn = db.open_write_txn(branch).await?;
let snapshot = txn.base.clone();
let mut staging = MutationStaging::default();
let pending_mode = match mode {
LoadMode::Merge => PendingMode::Merge,
@ -481,15 +493,18 @@ async fn load_jsonl_reader<R: BufRead>(
// Phase 2b: accumulate every node type in memory. Fragment writes are
// delayed until after all validation succeeds.
for (type_name, table_key, batch, loaded_count) in prepared_nodes {
let (ds, full_path, table_branch) = db
.open_for_mutation_on_branch(branch, &table_key, load_op_kind)
// The loader only needs the captured expected version (the publisher's
// CAS fence) for `ensure_path` — it discards the handle. With a
// non-strict load op (Merge/Append) and a `WriteTxn`, collapse #1 skips
// the dataset open and returns the pinned base version directly.
let opened = db
.open_for_mutation_on_branch(branch, &table_key, load_op_kind, Some(&txn))
.await?;
let expected_version = ds.version();
staging.ensure_path(
&table_key,
full_path,
table_branch,
expected_version,
opened.full_path,
opened.table_branch,
opened.expected_version,
load_op_kind,
);
let schema = batch.schema();
@ -553,15 +568,16 @@ async fn load_jsonl_reader<R: BufRead>(
// Phase 2e: accumulate every edge type. Same dispatch as Phase 2b.
for (edge_name, table_key, batch, loaded_count) in prepared_edges {
let (ds, full_path, table_branch) = db
.open_for_mutation_on_branch(branch, &table_key, load_op_kind)
// Same as the node phase: only the captured expected version is used;
// collapse #1 skips the open for a non-strict load op under a `WriteTxn`.
let opened = db
.open_for_mutation_on_branch(branch, &table_key, load_op_kind, Some(&txn))
.await?;
let expected_version = ds.version();
staging.ensure_path(
&table_key,
full_path,
table_branch,
expected_version,
opened.full_path,
opened.table_branch,
opened.expected_version,
load_op_kind,
);
let schema = batch.schema();
@ -589,22 +605,36 @@ async fn load_jsonl_reader<R: BufRead>(
// `_queue_guards` holds per-(table_key, branch) write queues
// across the manifest publish below — see exec/mutation.rs for
// the rationale (interleaving prevention).
let (updates, expected_versions, sidecar_handle, _queue_guards) = staged
let crate::exec::staging::CommittedMutation {
updates,
expected_versions,
sidecar_handle,
guards: _queue_guards,
committed_handles,
} = staged
.commit_all(
db,
branch,
crate::db::manifest::SidecarKind::Load,
actor_id,
fork_queue_guards,
Some(&txn),
)
.await?;
// Same finalize → publisher residual as mutations: per-table
// staged commits have advanced Lance HEAD, but the manifest
// publish has not run yet. Reuse the mutation failpoint name so
// one failpoint pins the shared `MutationStaging` boundary.
crate::failpoints::maybe_fail("mutation.post_finalize_pre_publisher")?;
db.commit_updates_on_branch_with_expected(branch, &updates, &expected_versions, actor_id)
.await?;
crate::failpoints::maybe_fail(crate::failpoints::names::MUTATION_POST_FINALIZE_PRE_PUBLISHER)?;
db.commit_updates_on_branch_with_expected(
branch,
&updates,
&expected_versions,
actor_id,
Some(&txn),
committed_handles,
)
.await?;
// The recovery sidecar protects the per-table commit_staged →
// manifest publish window. Phase C succeeded — clean up
// best-effort: failing the user here would error out a write
@ -1548,80 +1578,14 @@ fn literal_value_to_f64(v: &omnigraph_compiler::catalog::LiteralValue) -> f64 {
// ─── Edge cardinality validation ─────────────────────────────────────────────
/// Validate an edge type's `@card` bounds against a just-written table version.
///
/// Opens the `edge:{edge_name}` table at `written_version` — on
/// `written_branch` when given, otherwise on the branch recorded in the
/// manifest entry — scans its `src` column, counts edges per source, and
/// rejects any source whose count exceeds `max` or falls below `min`.
/// Returns `Ok(())` immediately when the edge type uses the default
/// cardinality.
///
/// Only sources that appear in at least one scanned row are checked.
///
/// # Errors
/// Returns a manifest error when the branch snapshot has no entry for the
/// edge table, when a scanned batch lacks a string `src` column, or when a
/// source violates the bounds; snapshot/open/scan errors are propagated.
///
/// # Panics
/// Indexing `catalog.edge_types[edge_name]` panics if `edge_name` is not in
/// the catalog — presumably callers only pass catalog edge names; verify.
pub(crate) async fn validate_edge_cardinality(
    db: &crate::db::Omnigraph,
    branch: Option<&str>,
    edge_name: &str,
    written_version: u64,
    written_branch: Option<&str>,
) -> Result<()> {
    use arrow_array::Array;

    let catalog = db.catalog();
    let edge_type = &catalog.edge_types[edge_name];
    if edge_type.cardinality.is_default() {
        return Ok(());
    }

    // Open edge sub-table at the just-written version, not the snapshot's
    // (the snapshot still pins to the pre-write version).
    let snapshot = db.snapshot_for_branch(branch).await?;
    let table_key = format!("edge:{}", edge_name);
    let entry = snapshot
        .entry(&table_key)
        .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?;
    let ds = db
        .open_dataset_at_state(
            &entry.table_path,
            written_branch.or(entry.table_branch.as_deref()),
            written_version,
        )
        .await?;

    // Scan src column, count per source. Keys borrow from the scanned
    // batches, so no `String` is allocated per row.
    let batches = db.storage().scan(&ds, Some(&["src"]), None, None).await?;
    let mut counts: HashMap<&str, u32> = HashMap::new();
    for batch in &batches {
        // A missing or non-string `src` column is reported as an error
        // instead of panicking inside the write path.
        let srcs = batch
            .column_by_name("src")
            .and_then(|column| column.as_any().downcast_ref::<StringArray>())
            .ok_or_else(|| {
                OmniError::manifest(format!(
                    "edge table {} scan did not return a string 'src' column",
                    table_key
                ))
            })?;
        for i in 0..srcs.len() {
            *counts.entry(srcs.value(i)).or_insert(0) += 1;
        }
    }

    let card = &edge_type.cardinality;
    for (src, count) in &counts {
        if let Some(max) = card.max {
            if *count > max {
                return Err(OmniError::manifest(format!(
                    "@card violation on edge {}: source '{}' has {} edges (max {})",
                    edge_name, src, count, max
                )));
            }
        }
        if *count < card.min {
            return Err(OmniError::manifest(format!(
                "@card violation on edge {}: source '{}' has {} edges (min {})",
                edge_name, src, count, card.min
            )));
        }
    }
    Ok(())
}
/// Validate edge `@card` cardinality with in-memory pending edges visible.
///
/// Loader-level analog to `exec::mutation::validate_edge_cardinality_with_pending`:
/// opens the committed dataset at the pre-load snapshot version, then
/// delegates to the shared `count_src_per_edge` + `enforce_cardinality_bounds`
/// helpers in `exec::staging`. Used by Append/Merge loads (the Overwrite
/// path uses `validate_edge_cardinality` which opens the just-written
/// Lance version).
/// helpers in `exec::staging`. Used by every load mode; for `LoadMode::Overwrite`
/// it treats the pending edge batches as the replacement table image (the
/// committed rows are being replaced, so only the pending set is counted).
///
/// `mode` controls dedup behavior. `LoadMode::Merge` passes `Some("id")`
/// so committed edges that the load is *updating* (same edge id,

View file

@ -812,10 +812,12 @@ impl TableStore {
/// Legacy inline-commit append: writes fragments AND commits in one
/// call, advancing Lance HEAD as a side effect. Not on the
/// `TableStorage` trait surface — the staged primitive `stage_append`
/// + `commit_staged` is the engine write path. This inherent
/// `pub(crate)` method survives only for recovery test setup. Do not
/// add new engine call sites — they re-introduce the multi-phase
/// commit drift the trait surface was designed to eliminate.
/// + `commit_staged` is the engine write path. This inherent method
/// survives only for in-source recovery test setup, so it is
/// `#[cfg(test)]`-gated: engine code physically cannot call it (which
/// enforces "no new call sites" by construction and silences the
/// dead-code warning the non-test lib build would otherwise emit).
#[cfg(test)]
pub(crate) async fn append_batch(
&self,
dataset_uri: &str,

View file

@ -0,0 +1,96 @@
//! Guard: failpoint names must come from the compile-checked `names` catalog
//! (`omnigraph::failpoints::names` / `omnigraph_cluster::failpoints::names`),
//! never bare string literals.
//!
//! The `names` consts give compile-time typo protection only if every call
//! site uses them. A bare `maybe_fail("typo.literal")` still compiles (the
//! arg is `&str`), so a typo there would silently never fire. This
//! source-walk closes that gap by construction — the same defense-in-depth
//! shape as `forbidden_apis.rs`. Add a new failpoint by adding its const to
//! the catalog first; this guard then forces every call site to reference it.
use std::path::{Path, PathBuf};
/// Call-site prefixes whose first argument must be a `names::` constant.
///
/// Every failpoint entry point that takes the name as its first argument is
/// listed, including the typed-error variants of `maybe_fail`
/// (`maybe_fail_lance_open`, `maybe_fail_lance_incompatible`,
/// `maybe_fail_retryable_contention`): the plain `maybe_fail(` prefix does
/// not match them, so without their own entries a literal name passed to
/// one of them would slip past this guard.
///
/// The check is whitespace/newline-tolerant (it skips past the open paren to
/// the first non-whitespace token), so wrapping the call across lines cannot
/// hide a literal — a per-line `contains` scan would miss
/// `park_first(\n "name",\n)`.
const CALL_PREFIXES: &[&str] = &[
    "maybe_fail(",
    "maybe_fail_lance_open(",
    "maybe_fail_lance_incompatible(",
    "maybe_fail_retryable_contention(",
    "ScopedFailPoint::new(",
    "ScopedFailPoint::with_callback(",
    "park_first(",
];
/// Returns the 1-based line number that byte offset `byte_off` falls on in
/// `contents`: one plus the number of newlines before the offset.
fn line_of(contents: &str, byte_off: usize) -> usize {
    let preceding = &contents[..byte_off];
    1 + preceding.matches('\n').count()
}
fn manifest_dir() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
}
/// Lists every file the guard inspects.
///
/// Production call sites live under each crate's `src`; test call sites live
/// in the two failpoint integration binaries. This guard file is deliberately
/// not in the set (it names the patterns as literals itself).
fn files_to_scan() -> Vec<PathBuf> {
    let engine_root = manifest_dir();
    let cluster_root = engine_root.join("../omnigraph-cluster");
    let crate_roots = [&engine_root, &cluster_root];

    let mut files = Vec::new();
    // Every `.rs` file under both `src` trees first…
    for root in crate_roots {
        collect_rs(&root.join("src"), &mut files);
    }
    // …then the two failpoint integration binaries, in the same crate order.
    for root in crate_roots {
        files.push(root.join("tests/failpoints.rs"));
    }
    files
}
/// Recursively appends every `.rs` file under `dir` to `out`.
///
/// Best-effort: a directory that cannot be read contributes nothing, and
/// entries that fail to enumerate are skipped.
fn collect_rs(dir: &Path, out: &mut Vec<PathBuf>) {
    let entries = match std::fs::read_dir(dir) {
        Ok(entries) => entries,
        Err(_) => return,
    };
    for path in entries.filter_map(|entry| entry.ok()).map(|entry| entry.path()) {
        if path.is_dir() {
            collect_rs(&path, out);
            continue;
        }
        if matches!(path.extension(), Some(ext) if ext == "rs") {
            out.push(path);
        }
    }
}
/// Source-walk guard: fails if any scanned file passes a string literal as
/// the first argument of a call listed in `CALL_PREFIXES`.
///
/// For every occurrence of a prefix, the text after the open paren is
/// inspected with leading whitespace (including newlines) skipped; a leading
/// `"` means a literal failpoint name. Each hit is reported as
/// `file:line`, and the assertion lists all of them together.
#[test]
fn failpoint_names_use_the_compile_checked_catalog() {
    let mut violations = Vec::new();
    for file in files_to_scan() {
        // Best-effort: a listed file that cannot be read is skipped.
        let Ok(contents) = std::fs::read_to_string(&file) else {
            continue;
        };
        for &prefix in CALL_PREFIXES {
            // `match_indices` yields non-overlapping matches left to right,
            // i.e. the same positions as resuming the search after each hit.
            for (at, _) in contents.match_indices(prefix) {
                let after_open = at + prefix.len();
                // Skip whitespace (incl. newlines) after the open paren. If the
                // first argument token is a `"`, it's a literal failpoint name
                // — across a line break or not.
                if contents[after_open..].trim_start().starts_with('"') {
                    violations.push(format!(
                        "{}:{}: literal failpoint name at `{}` — use a `names::` const",
                        file.display(),
                        line_of(&contents, at),
                        prefix.trim_end_matches('('),
                    ));
                }
            }
        }
    }
    assert!(
        violations.is_empty(),
        "failpoint names must reference the compile-checked \
         `omnigraph::failpoints::names::*` (or `omnigraph_cluster::failpoints::names::*`) \
         constants, not string literals — a literal typo would silently never fire:\n{}",
        violations.join("\n")
    );
}

File diff suppressed because it is too large Load diff

View file

@ -58,6 +58,14 @@ pub struct IoCounts {
pub commit_graph_reads: u64,
/// Version-probe invocations (the cheap freshness check).
pub version_probes: u64,
/// DATA-table open CALL count through the two instrumented chokepoints — an
/// exact open-invocation count (not the opener-read term), classified by URI so
/// internal/system-table opens are excluded. Step-3b target:
/// `data_open_count <= |touched_tables|` for a write.
pub data_open_count: u64,
/// Internal/system-table (`__manifest`, `_graph_commits*`) open CALL count —
/// the complement of `data_open_count` (publisher CAS + commit-graph append).
pub internal_open_count: u64,
}
impl IoCounts {
@ -225,6 +233,8 @@ struct ProbeHandles {
commit_graph: IOTracker,
table: PrefixCounter,
probe_count: Arc<AtomicU64>,
data_open_count: Arc<AtomicU64>,
internal_open_count: Arc<AtomicU64>,
}
impl ProbeHandles {
@ -234,6 +244,8 @@ impl ProbeHandles {
commit_graph: IOTracker::default(),
table: PrefixCounter::default(),
probe_count: Arc::new(AtomicU64::new(0)),
data_open_count: Arc::new(AtomicU64::new(0)),
internal_open_count: Arc::new(AtomicU64::new(0)),
};
let probes = QueryIoProbes {
manifest_wrapper: Some(Arc::new(h.manifest.clone()) as Arc<dyn WrappingObjectStore>),
@ -242,6 +254,8 @@ impl ProbeHandles {
),
table_wrapper: Some(Arc::new(h.table.clone()) as Arc<dyn WrappingObjectStore>),
probe_count: Arc::clone(&h.probe_count),
data_open_count: Arc::clone(&h.data_open_count),
internal_open_count: Arc::clone(&h.internal_open_count),
};
(probes, h)
}
@ -256,6 +270,8 @@ impl ProbeHandles {
manifest_reads: self.manifest.stats().read_iops,
commit_graph_reads: self.commit_graph.stats().read_iops,
version_probes: self.probe_count.load(Ordering::Relaxed),
data_open_count: self.data_open_count.load(Ordering::Relaxed),
internal_open_count: self.internal_open_count.load(Ordering::Relaxed),
}
}
}

View file

@ -0,0 +1,84 @@
//! Deterministic rendezvous for concurrent failpoint tests.
//!
//! The pattern: park the FIRST thread that hits a failpoint until the test
//! explicitly releases it, while later arrivals fall through. This replaces
//! fixed "guess" `sleep`s for cross-thread coordination — the test waits on
//! the *condition* (the point was reached) with a bounded timeout that fails
//! loudly, instead of betting a fixed duration is long enough.
//!
//! Extracted from the open-coded `AtomicBool` + callback pattern that
//! `fork_collision_with_live_concurrent_fork_is_retryable` proved out.
//!
//! The `reached` flag also doubles as a fired-assertion: a point that is
//! never hit makes [`Rendezvous::wait_until_reached`] panic, so a typo'd or
//! misplaced failpoint cannot pass silently.
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering::SeqCst};
use std::time::Duration;
use omnigraph::failpoints::ScopedFailPoint;
/// A parked-on-first-arrival rendezvous bound to a failpoint name. The
/// underlying callback is RAII-cleaned when this guard drops.
///
/// Built by `park_first`; the test then observes `reached` (via
/// `wait_until_reached` / `reached()`) and lets the parked thread continue
/// with `release()`.
pub struct Rendezvous {
// Failpoint name, kept for the "never reached" panic message.
name: String,
// Set to `true` by the failpoint callback when the first thread arrives.
reached: Arc<AtomicBool>,
// Set to `true` by `release()`; the parked callback polls it to resume.
release: Arc<AtomicBool>,
// Guard holding the registered callback failpoint for this rendezvous.
_failpoint: ScopedFailPoint,
}
impl Rendezvous {
    /// Arm `name` so that the FIRST thread to hit it flags readiness and then
    /// blocks until [`release`](Self::release) is called; every later arrival
    /// falls through immediately. The park is bounded (~30s) so a test bug
    /// cannot hang the suite forever.
    pub fn park_first(name: &str) -> Self {
        let reached = Arc::new(AtomicBool::new(false));
        let release = Arc::new(AtomicBool::new(false));

        let arrival_flag = Arc::clone(&reached);
        let go_signal = Arc::clone(&release);
        let _failpoint = ScopedFailPoint::with_callback(name, move || {
            // `swap` returns the previous value: only the first arrival sees
            // `false` and parks; later arrivals return straight away.
            if arrival_flag.swap(true, SeqCst) {
                return;
            }
            // ~30s bound (6000 * 5ms); released earlier on the common path.
            let mut polls_left = 6000;
            while polls_left > 0 && !go_signal.load(SeqCst) {
                std::thread::sleep(Duration::from_millis(5));
                polls_left -= 1;
            }
        });

        Self {
            name: name.to_string(),
            reached,
            release,
            _failpoint,
        }
    }

    /// Asynchronously wait for the parked thread to arrive at the failpoint,
    /// polling the readiness flag with a bounded (~12s) timeout. Panics if
    /// the point is never hit — the fired-assertion.
    pub async fn wait_until_reached(&self) {
        let mut polls = 0;
        while polls < 2400 {
            if self.reached() {
                return;
            }
            tokio::time::sleep(Duration::from_millis(5)).await;
            polls += 1;
        }
        panic!("rendezvous: failpoint '{}' was never reached", self.name);
    }

    /// `true` once a thread has arrived at the failpoint.
    pub fn reached(&self) -> bool {
        self.reached.load(SeqCst)
    }

    /// Let the parked thread resume past the failpoint.
    pub fn release(&self) {
        self.release.store(true, SeqCst);
    }
}

View file

@ -1,6 +1,8 @@
#![allow(dead_code)]
pub mod cost;
#[cfg(feature = "failpoints")]
pub mod failpoint;
pub mod recovery;
use arrow_array::{Array, RecordBatch, StringArray};

View file

@ -86,6 +86,83 @@ async fn lance_error_too_much_write_contention_variant_exists() {
);
}
// --- Guard 1a: LanceError::IncompatibleTransaction variant exists ----------
//
// `db/manifest/migrations.rs::commit_v4_stamp_idempotently` pattern-matches on
// this variant: two concurrent v3→v4 runners both bump the internal-schema stamp
// (an `UpdateConfig` commit on the same metadata key), and the loser gets
// `IncompatibleTransaction`. Since both write the same value the conflict is
// benign and is retried idempotently. If Lance renames the variant or removes the
// builder, the match silently stops catching the conflict — this guard fails to
// force an update.
#[tokio::test]
async fn lance_error_incompatible_transaction_variant_exists() {
let err =
lance::Error::incompatible_transaction_source("concurrent UpdateConfig at version N".into());
assert!(
matches!(err, lance::Error::IncompatibleTransaction { .. }),
"Lance::Error::IncompatibleTransaction variant missing or renamed; \
update db/manifest/migrations.rs::commit_v4_stamp_idempotently and \
this guard, then re-pin docs/dev/lance.md."
);
}
// --- Guard 1c: LanceError::DatasetAlreadyExists variant exists --------------
//
// `db/commit_graph.rs` and `db/recovery_audit.rs` create internal Lance tables
// with a create-or-open idempotency fallback: a concurrent/prior create races,
// and the `DatasetAlreadyExists` arm falls back to `Dataset::open`. They match
// the typed variant, NOT the display string ("Dataset already exists: ..."),
// which is not a Lance API contract. If Lance renames the variant the match
// silently stops catching the race and a re-create errors instead of opening —
// this guard turns red to force an update.
#[tokio::test]
async fn lance_error_dataset_already_exists_variant_exists() {
let err = lance::Error::dataset_already_exists("guard");
assert!(
matches!(err, lance::Error::DatasetAlreadyExists { .. }),
"Lance::Error::DatasetAlreadyExists variant missing or renamed; update the \
db/commit_graph.rs + db/recovery_audit.rs create-or-open fallbacks and \
this guard, then re-pin docs/dev/lance.md."
);
}
// --- Guard 1b: Dataset::open on a missing path returns a not-found variant --
//
// `db/commit_graph.rs::read_legacy_commit_cache` (the v3→v4 lineage migration
// source) classifies a legacy-open error: a genuine not-found is the benign
// "no legacy data" signal (empty cache), and ANY OTHER error propagates loudly
// rather than being read as "empty" — a swallow there would let the migration
// stamp v4 over an empty backfill, orphaning real lineage permanently. That
// classification relies on Lance mapping an object-store NotFound to
// `DatasetNotFound` (or, for some paths, `NotFound`). If a Lance bump emits a
// different variant for a missing dataset, the migration would propagate a
// genuine "no legacy data" as a hard error — this guard turns red to force the
// classifier (and this guard) to be updated together.
/// Opens a path inside a fresh temp dir that was never written and checks
/// that the resulting error is one of the two not-found variants.
#[tokio::test]
async fn dataset_open_missing_returns_not_found_variant() {
    let dir = tempfile::tempdir().unwrap();
    // A path that was never written — nothing to open.
    let missing = dir.path().join("does-not-exist.lance");
    let Err(err) = Dataset::open(missing.to_str().unwrap()).await else {
        panic!("opening a never-written dataset path must error");
    };
    let is_not_found = matches!(
        err,
        lance::Error::DatasetNotFound { .. } | lance::Error::NotFound { .. }
    );
    assert!(
        is_not_found,
        "Dataset::open on a missing path no longer returns DatasetNotFound/NotFound \
         (got: {err:?}); update db/commit_graph.rs::read_legacy_commit_cache's \
         legacy-open classification and this guard together, then re-pin \
         docs/dev/lance.md."
    );
}
// --- Guard 2: ManifestLocation field shape ---------------------------------
//
// `db/manifest/metadata.rs:84-88` reads `.path`, `.size`, `.e_tag`,

View file

@ -0,0 +1,235 @@
//! RFC-013 Phase 7 acceptance gate: graph lineage lives ONLY in `__manifest`.
//!
//! The `graph_commit` + `graph_head` rows ride the same publish CAS as the
//! table-version rows, so `_graph_commits.lance` carries NO commit rows. This
//! gate proves two things over a realistic history (commits on main, a branch,
//! a merge, all with actors):
//!
//! 1. The production commit-graph projection (`CommitGraph::open(...)`, which now
//! reads `__manifest`) reconstructs the full lineage correctly — commit set,
//! parents, the merge commit's two parents + merge actor, per-branch heads,
//! and the inline actors.
//! 2. `_graph_commits.lance` (and its actor sidecar) hold ZERO commit rows: the
//! dual-write is gone and nothing appends to them. This is the load-bearing
//! "single source" assertion.
mod helpers;
use futures::TryStreamExt;
use lance::Dataset;
use omnigraph::db::commit_graph::CommitGraph;
use omnigraph::db::{GraphCommit, Omnigraph};
use helpers::*;
/// Count rows in a Lance dataset directory under the graph root, or `0` if it
/// does not exist.
async fn row_count(root: &str, dir: &str) -> usize {
let uri = format!("{}/{}", root.trim_end_matches('/'), dir);
let Ok(dataset) = Dataset::open(&uri).await else {
return 0;
};
let batches: Vec<arrow_array::RecordBatch> = dataset
.scan()
.try_into_stream()
.await
.unwrap()
.try_collect()
.await
.unwrap();
batches.iter().map(|b| b.num_rows()).sum()
}
/// Loads the production commit-graph projection at `branch` (the default
/// graph when `branch` is `None`), sourced from `__manifest`, and returns its
/// commits ordered by manifest version, then creation time, then commit id.
async fn projected_commits(root: &str, branch: Option<&str>) -> Vec<GraphCommit> {
    let graph = if let Some(name) = branch {
        CommitGraph::open_at_branch(root, name).await.unwrap()
    } else {
        CommitGraph::open(root).await.unwrap()
    };
    let mut commits = graph.load_commits().await.unwrap();
    // Tuple comparison is lexicographic, i.e. the same three-level ordering.
    commits.sort_by(|a, b| {
        let left = (&a.manifest_version, &a.created_at, &a.graph_commit_id);
        let right = (&b.manifest_version, &b.created_at, &b.graph_commit_id);
        left.cmp(&right)
    });
    commits
}
/// Returns the `graph_commit_id` of the head commit at `branch` (the default
/// graph when `branch` is `None`). Panics if the graph cannot be opened, the
/// head lookup fails, or there is no head commit.
async fn head_id(root: &str, branch: Option<&str>) -> String {
    let graph = if let Some(name) = branch {
        CommitGraph::open_at_branch(root, name).await.unwrap()
    } else {
        CommitGraph::open(root).await.unwrap()
    };
    let head = graph.head_commit().await.unwrap();
    head.unwrap().graph_commit_id
}
/// Builds a history with authored commits on `main`, authored commits on a
/// `feature` branch and a merge back into `main`, then checks that the
/// commit-graph datasets hold no commit rows while the projection read through
/// `CommitGraph` exposes the expected parents, head commits and actors.
#[tokio::test]
async fn graph_lineage_lives_only_in_manifest() {
    let dir = tempfile::tempdir().unwrap();
    let uri = dir.path().to_str().unwrap().to_string();

    // Realistic history: authored commits on main, a branch with its own
    // authored commits, then an authored merge back into main.
    let main = init_and_load(&dir).await;
    for (name, age, actor) in [("Alice", 30, "act-alice"), ("Bob", 41, "act-bob")] {
        main.mutate_as(
            "main",
            MUTATION_QUERIES,
            "insert_person",
            &mixed_params(&[("$name", name)], &[("$age", age)]),
            Some(actor),
        )
        .await
        .unwrap();
    }

    main.branch_create("feature").await.unwrap();
    let feature = Omnigraph::open(&uri).await.unwrap();
    for (name, age, actor) in [("Carol", 27, "act-carol"), ("Dave", 33, "act-dave")] {
        feature
            .mutate_as(
                "feature",
                MUTATION_QUERIES,
                "insert_person",
                &mixed_params(&[("$name", name)], &[("$age", age)]),
                Some(actor),
            )
            .await
            .unwrap();
    }

    // One more commit on main, so the merge below has two distinct parents
    // instead of being a fast-forward.
    main.mutate_as(
        "main",
        MUTATION_QUERIES,
        "insert_person",
        &mixed_params(&[("$name", "Erin")], &[("$age", 38)]),
        Some("act-erin"),
    )
    .await
    .unwrap();

    let outcome = main
        .branch_merge_as("feature", "main", Some("act-merger"))
        .await
        .unwrap();
    // Both sides advanced past the base, so this must be a genuine merge.
    assert_eq!(
        outcome,
        omnigraph::db::MergeOutcome::Merged,
        "expected a real merge, not fast-forward/up-to-date"
    );

    // ── single source: nothing writes `_graph_commits.lance` ─────────────────
    // RFC-013 Phase 7 folds lineage into `__manifest`, so the commit-graph
    // dataset and its actor sidecar must hold zero rows. A reintroduced
    // `append_commit` would turn these assertions red.
    let commit_rows = row_count(&uri, "_graph_commits.lance").await;
    assert_eq!(
        commit_rows,
        0,
        "_graph_commits.lance must carry no commit rows — lineage lives in __manifest"
    );
    let actor_rows = row_count(&uri, "_graph_commit_actors.lance").await;
    assert_eq!(
        actor_rows,
        0,
        "_graph_commit_actors.lance must carry no rows — actors live inline in __manifest"
    );

    // ── main lineage projected from `__manifest` ─────────────────────────────
    let main_commits = projected_commits(&uri, None).await;
    // genesis + Alice + Bob + Erin + the merge = at least 5 on main.
    let main_len = main_commits.len();
    assert!(
        main_len >= 5,
        "expected a non-trivial main history, got {} commits",
        main_len
    );

    // Genesis is the only commit without a parent, and it has no actor.
    let genesis: Vec<&GraphCommit> = main_commits
        .iter()
        .filter(|candidate| candidate.parent_commit_id.is_none())
        .collect();
    assert_eq!(genesis.len(), 1, "exactly one genesis (parentless) commit");
    let genesis_commit = genesis[0];
    assert!(
        genesis_commit.actor_id.is_none(),
        "genesis commit carries no actor"
    );

    // Connected lineage: every recorded parent is itself a commit in the
    // projection.
    for commit in &main_commits {
        let Some(parent) = &commit.parent_commit_id else {
            continue;
        };
        let parent_is_known = main_commits
            .iter()
            .any(|candidate| &candidate.graph_commit_id == parent);
        assert!(
            parent_is_known,
            "parent {parent} of {} must be a known commit",
            commit.graph_commit_id
        );
    }

    // The merge commit records both parents and the merging actor.
    let merge_commit = main_commits
        .iter()
        .find(|candidate| candidate.merged_parent_commit_id.is_some())
        .expect("a merge commit with a merged parent must exist");
    assert_eq!(merge_commit.actor_id.as_deref(), Some("act-merger"));
    assert!(merge_commit.parent_commit_id.is_some());

    // The merge commit is also main's head.
    let main_head = head_id(&uri, None).await;
    assert_eq!(
        main_head,
        merge_commit.graph_commit_id,
        "the merge commit is the head of main"
    );

    // ── feature lineage projected from `__manifest` ──────────────────────────
    let feature_commits = projected_commits(&uri, Some("feature")).await;
    // Dave authored the last commit on the branch, so his commit is its head.
    let feature_head = head_id(&uri, Some("feature")).await;
    let feature_head_commit = feature_commits
        .iter()
        .find(|candidate| candidate.graph_commit_id == feature_head)
        .expect("feature head must be in the feature projection");
    assert_eq!(
        feature_head_commit.actor_id.as_deref(),
        Some("act-dave"),
        "feature head is Dave's authored commit"
    );

    // ── actors surface inline from the manifest metadata ─────────────────────
    // Authored on main: Alice, Bob, Erin plus the merge (act-merger) = 4.
    // Carol and Dave were authored on the feature branch; genesis has no actor.
    let authored = main_commits
        .iter()
        .filter(|candidate| candidate.actor_id.is_some())
        .count();
    assert!(
        authored >= 4,
        "expected the authored commits to surface their actor in the projection, saw {authored}"
    );
}

View file

@ -97,7 +97,9 @@ async fn optimize_on_empty_graph_returns_stats_per_table_with_no_changes() {
// Schema declares 2 nodes + 2 edges = 4 data tables, plus the 3 internal
// system tables (`__manifest`, `_graph_commits`, `_graph_commit_actors`) optimize
// also compacts (RFC-013 step 2) = 7. Compaction should run on each but find
// nothing to merge.
// nothing to merge. The genesis graph commit rides the SINGLE init
// `__manifest` write (RFC-013 Phase 7), so a fresh graph has one fragment per
// table — nothing to compact anywhere.
assert_eq!(stats.len(), 7);
for s in &stats {
assert_eq!(s.fragments_removed, 0, "{} should not remove", s.table_key);
@ -143,17 +145,20 @@ async fn optimize_after_load_then_again_is_idempotent() {
}
}
/// RFC-013 step 2: `optimize` compacts the internal system tables
/// (`__manifest`, `_graph_commits`), which accumulate one fragment per commit.
/// After compaction they shed fragments, write no recovery sidecar (a single
/// atomic Lance commit — no HEAD-before-publish gap), and the graph stays
/// coherent for subsequent reads + strict writes.
/// RFC-013 step 2 + Phase 7: `optimize` compacts `__manifest`, which now
/// accumulates one fragment per commit for BOTH the table-version rows and the
/// folded-in graph-lineage rows (`graph_commit` + `graph_head`). The
/// commit-graph datasets (`_graph_commits`, `_graph_commit_actors`) no longer
/// take a per-commit row (lineage lives in `__manifest`), so they stay flat —
/// nothing to compact. After compaction `__manifest` sheds fragments, writes no
/// recovery sidecar (a single atomic Lance commit — no HEAD-before-publish gap),
/// and the graph stays coherent for subsequent reads + strict writes.
#[tokio::test]
async fn optimize_compacts_internal_tables() {
let dir = tempfile::tempdir().unwrap();
let mut db = init_and_load(&dir).await;
// Build version-history depth so the internal tables accumulate fragments.
// Build version-history depth so `__manifest` accumulates fragments.
for i in 0..20 {
mutate_main(
&mut db,
@ -167,16 +172,32 @@ async fn optimize_compacts_internal_tables() {
let stats = db.optimize().await.unwrap();
for key in ["__manifest", "_graph_commits"] {
// `__manifest` carries every per-commit fragment (table versions + lineage)
// and compacts.
let manifest_stats = stats
.iter()
.find(|s| s.table_key == "__manifest")
.expect("optimize stats missing internal table __manifest");
assert!(
manifest_stats.committed,
"__manifest should compact after 20 commits"
);
assert!(
manifest_stats.fragments_removed > 0,
"__manifest should shed fragments, removed {}",
manifest_stats.fragments_removed
);
// The commit-graph datasets take no per-commit row anymore (RFC-013 Phase 7
// folds lineage into `__manifest`), so they stay at one fragment — no-ops.
for key in ["_graph_commits", "_graph_commit_actors"] {
let s = stats
.iter()
.find(|s| s.table_key == key)
.unwrap_or_else(|| panic!("optimize stats missing internal table {key}"));
assert!(s.committed, "{key} should compact after 20 commits");
assert!(
s.fragments_removed > 0,
"{key} should shed fragments, removed {}",
s.fragments_removed
!s.committed,
"{key} carries no per-commit rows after Phase 7 — nothing to compact"
);
}

View file

@ -685,38 +685,21 @@ async fn list_recovery_audit_kinds(graph_root: &Path) -> Vec<String> {
out
}
/// Helper: count `_graph_commits.lance` rows tagged with the recovery actor.
/// Helper: count graph commits authored by the recovery actor. RFC-013 Phase 7
/// records the recovery commit in `__manifest` (folded into the recovery publish
/// CAS), not `_graph_commits.lance`, so this counts through the production
/// commit-graph projection (`load_commits`), filtering on the inline actor.
async fn count_recovery_actor_commits(graph_root: &Path) -> usize {
let actors_dir = graph_root.join("_graph_commit_actors.lance");
if !actors_dir.exists() {
return 0;
}
let ds = Dataset::open(actors_dir.to_str().unwrap()).await.unwrap();
use arrow_array::{Array, StringArray};
use futures::TryStreamExt;
let batches: Vec<arrow_array::RecordBatch> = ds
.scan()
.try_into_stream()
let commits = omnigraph::db::commit_graph::CommitGraph::open(graph_root.to_str().unwrap())
.await
.unwrap()
.try_collect()
.load_commits()
.await
.unwrap();
let mut count = 0;
for batch in &batches {
let actors = batch
.column_by_name("actor_id")
.unwrap()
.as_any()
.downcast_ref::<StringArray>()
.unwrap();
for i in 0..actors.len() {
if actors.value(i) == "omnigraph:recovery" {
count += 1;
}
}
}
count
commits
.iter()
.filter(|c| c.actor_id.as_deref() == Some("omnigraph:recovery"))
.count()
}
#[tokio::test]

View file

@ -237,6 +237,58 @@ async fn cardinality_rejected_on_mutation_insert_edge() {
);
}
/// Regression guard for RFC-013 step 3b (cursor High / codex P1 on #298): edge
/// `@card` validation has to scan the LIVE committed HEAD rather than the
/// pinned `txn.base`. Collapse #1 skips the edge accumulation open, so a
/// non-strict edge insert under a `WriteTxn` reopens for the cardinality scan.
/// That scan must see edges a concurrent writer committed after this mutation
/// captured its base; otherwise a `@card` max is silently exceeded
/// (invariant 9). The residual validate→commit TOCTOU is the §7.1 gap
/// (step 4); this test only covers what 3b widened (live HEAD vs
/// mutation-start base).
///
/// The scenario is deterministic and needs no failpoint: the second handle is
/// stale by construction, because the write path does not probe the manifest
/// version the way the read path does. The stale handle MUST NOT read between
/// the first handle's commit and its own insert — a read would refresh its
/// coordinator and mask the bug (the same caveat as the served stale-view
/// repro in `writes.rs`).
#[tokio::test]
async fn cardinality_rejected_for_stale_handle_after_concurrent_edge_commit() {
    let (dir, mut writer) = init_with(CARDINALITY_SCHEMA, CARDINALITY_SEED).await;
    let uri = dir.path().to_str().unwrap();

    // The second handle opens the graph at the seed version (no edges yet) and
    // never reads again, so its in-memory coordinator stays pinned there.
    let mut stale = Omnigraph::open(uri).await.unwrap();

    // First handle commits WorksAt(Alice -> Acme), putting Alice at the
    // @card(0..1) max and advancing the on-disk manifest past the stale handle.
    mutate_main(
        &mut writer,
        CARDINALITY_MUTATIONS,
        "add_employment",
        &params(&[("$person", "Alice"), ("$company", "Acme")]),
    )
    .await
    .unwrap();

    // The stale handle now tries a second WorksAt for Alice. Validated against
    // live HEAD (Alice has 1) this must be rejected (1+1 > 1); validated
    // against the stale base (Alice has 0) it would wrongly commit.
    let err = mutate_main(
        &mut stale,
        CARDINALITY_MUTATIONS,
        "add_employment",
        &params(&[("$person", "Alice"), ("$company", "Beta")]),
    )
    .await
    .unwrap_err();

    let rendered = err.to_string().to_lowercase();
    let mentions_cardinality = rendered.contains("cardinality") || rendered.contains("@card");
    assert!(
        mentions_cardinality,
        "a stale-handle edge insert must be rejected by @card against live HEAD, got: {}",
        err
    );
}
#[tokio::test]
async fn cardinality_rejected_on_jsonl_load() {
// Already covered by existing loader Phase 3 logic but assert the

View file

@ -24,10 +24,10 @@
mod helpers;
use helpers::cost::{
IoCounts, assert_flat, assert_grows, local_graph, measure_insert, measure_insert_as,
IoCounts, assert_flat, assert_grows, local_graph, measure, measure_insert, measure_insert_as,
measure_with_staged,
};
use helpers::{MUTATION_QUERIES, commit_many, commit_many_as, mixed_params};
use helpers::{MUTATION_QUERIES, commit_many, commit_many_as, init_and_load, mixed_params};
// ── (A) The internal-table LOCK — the acceptance test for step 2 (compaction) ──
//
@ -130,7 +130,16 @@ async fn single_insert_data_write_is_bounded() {
/// At a fixed shallow depth, the per-write object-store read count is below a
/// documented ceiling. Fails the moment a change *adds* a round-trip on the write
/// path — the "no new round-trip" guard (calibrated: ~50 at depth ~5).
/// path — the "no new round-trip" guard.
///
/// Two folds keep the count low: RFC-013 Phase 7 put the `graph_commit` +
/// `graph_head` rows in the same publish merge-insert (no extra `__manifest`
/// write/scan per commit), and RFC-013 P2 collapsed the publish path's FOUR
/// `__manifest` scans (table locations + version entries + tombstones + a
/// separate `read_graph_lineage` for the parent) into ONE — the
/// `manifest_reads` sub-ceiling below would trip if any of those scans crept
/// back. Calibrated at depth ~5: ~26 `__manifest` reads / ~36 total after the
/// P2 fold (was ~44 / ~54 with the four separate scans).
#[tokio::test]
async fn write_op_count_ceiling_at_shallow_depth() {
let dir = tempfile::tempdir().unwrap();
@ -141,6 +150,16 @@ async fn write_op_count_ceiling_at_shallow_depth() {
"depth~5: data={} __manifest={} _graph_commits={} total_reads={}",
io.data_reads, io.manifest_reads, io.commit_graph_reads, io.total_reads()
);
// Sub-ceiling on `__manifest` reads specifically: the publish path does one
// scan, not four. ~26 measured at this depth; a re-added scan would push it
// well past this. (Deterministic on local FS.)
const MANIFEST_CEILING: u64 = 34;
assert!(
io.manifest_reads <= MANIFEST_CEILING,
"per-write __manifest reads {} exceeded ceiling {MANIFEST_CEILING} — a publish-path \
scan was re-added (RFC-013 P2 folds them into one)",
io.manifest_reads,
);
const CEILING: u64 = 80;
assert!(
io.total_reads() <= CEILING,
@ -169,3 +188,86 @@ async fn keyed_insert_routes_through_merge_insert_only() {
assert_eq!(staged.stage_append, 0, "keyed insert must not stage_append");
assert_eq!(staged.create_vector_index, 0, "no inline vector-index build on a plain insert");
}
// ── (D) Step-3b capture-once fitness asserts (RED today → GREEN after WriteTxn) ──
/// One write must validate the schema contract EXACTLY ONCE, which shows up as
/// 3 `read_text` calls and 2 `exists` probes on the storage adapter.
///
/// Re-validating at every resolve point (entry, per-table
/// `resolved_branch_target`, commit-time `fresh_snapshot_for_branch`) would
/// make the observed deltas a multiple of that; step 3b's `WriteTxn` validates
/// once and threads the result. This is the write-side twin of
/// `warm_read_cost.rs::warm_query_validates_schema_contract_once`, and it needs
/// no production change because the counting happens in the storage adapter.
#[tokio::test]
async fn write_validates_schema_contract_once() {
    use omnigraph::instrumentation::CountingStorageAdapter;
    use omnigraph::storage::storage_for_uri;

    let dir = tempfile::tempdir().unwrap();
    // Seed the graph, then release that handle; the measured handle is opened
    // separately on top of the counting adapter.
    drop(init_and_load(&dir).await);
    let uri = dir.path().to_str().unwrap();

    let backing = storage_for_uri(uri).unwrap();
    let (adapter, counts) = CountingStorageAdapter::new(backing);
    let db = omnigraph::db::Omnigraph::open_with_storage(uri, adapter)
        .await
        .unwrap();

    // Baseline taken after open, so only the write itself is measured.
    let read_text_baseline = counts.read_text();
    let exists_baseline = counts.exists();

    db.mutate(
        "main",
        MUTATION_QUERIES,
        "insert_person",
        &mixed_params(&[("$name", "schema_once")], &[("$age", 30)]),
    )
    .await
    .unwrap();

    let read_text_delta = counts.read_text() - read_text_baseline;
    let exists_delta = counts.exists() - exists_baseline;
    eprintln!("schema-contract reads on one write: read_text={read_text_delta} exists={exists_delta}");

    assert_eq!(
        read_text_delta, 3,
        "a write must validate the schema contract once (3 reads), not N times",
    );
    assert_eq!(
        exists_delta, 2,
        "a write must probe contract-file existence once (2 probes), not N times",
    );
}
/// A keyed single-table write may open its DATA table AT MOST ONCE.
///
/// Before step 3b the same write opened the table roughly four times
/// (accumulation, staging, commit drift-guard, publish-prepare/index-build),
/// each a fresh cold `Dataset::open`. Step 3b opens the base once (a
/// *session-aware* base open is deferred to step 5), threads the commit-return
/// handle, and swaps the drift-guard open for a cheap `latest_version_id`
/// probe, which collapses the count to 1.
///
/// The assertion reads `data_open_count`, the table-class-scoped chokepoint
/// probe. Internal-table opens (publisher CAS + commit-graph append) are
/// EXCLUDED: they are unrelated to data-table reuse and would otherwise keep
/// the count above 1 regardless of threading. (`forbidden_apis` keeps engine
/// code outside the storage layer from opening datasets except through the
/// instrumented chokepoints; `table_store.rs`'s own direct opens are
/// branch-management ops, not this keyed-write path.)
#[tokio::test]
async fn keyed_insert_opens_table_at_most_once() {
    let dir = tempfile::tempdir().unwrap();
    let mut db = local_graph(&dir).await;

    // `measure` hands back the mutation's own result alongside the I/O counters.
    let (outcome, io) = measure(db.mutate(
        "main",
        MUTATION_QUERIES,
        "insert_person",
        &mixed_params(&[("$name", "opens")], &[("$age", 30)]),
    ))
    .await;
    outcome.unwrap();

    eprintln!(
        "data_open_count={} internal_open_count={} for a single-table keyed insert",
        io.data_open_count, io.internal_open_count
    );
    assert!(
        io.data_open_count <= 1,
        "a keyed single-table write must open its data table at most once, got {}",
        io.data_open_count,
    );
}

View file

@ -613,7 +613,10 @@ async fn mixed_insert_and_update_on_same_person_coalesces_to_one_merge() {
"dedupe must keep the update's age value, not the insert's",
);
// One-publish guarantee: manifest version advanced by exactly 1.
// One-publish guarantee: manifest version advanced by exactly 1. The graph
// commit (`graph_commit` + `graph_head` rows) rides the SAME publish CAS as
// the table-version rows (RFC-013 Phase 7), so one graph commit is exactly
// one manifest version bump.
let post_version = version_main(&db).await.unwrap();
assert_eq!(
post_version,
@ -659,7 +662,9 @@ async fn multiple_appends_to_same_edge_coalesce_to_one_append() {
let edges_after = count_rows(&db, "edge:Knows").await;
assert_eq!(edges_after, edges_before + 2);
// One manifest version bump for the two-edge query (atomic publish).
// One manifest version bump for the two-edge query (atomic publish): the
// graph commit rides the same publish CAS as the table-version rows
// (RFC-013 Phase 7).
let post_version = version_main(&db).await.unwrap();
assert_eq!(
post_version,
@ -690,6 +695,8 @@ async fn multi_statement_inserts_publish_exactly_once() {
.await
.unwrap();
// One manifest version bump: the graph commit rides the same publish CAS
// as the table-version rows (RFC-013 Phase 7).
let post_version = version_main(&db).await.unwrap();
assert_eq!(
post_version,
@ -1005,6 +1012,8 @@ async fn chained_updates_with_overlapping_predicate_respects_intermediate_value(
"chained-update final value must reflect the second update applied to op-1's pending value"
);
// One manifest version bump: the graph commit rides the same publish CAS
// as the table-version rows (RFC-013 Phase 7).
let post_version = version_main(&db).await.unwrap();
assert_eq!(
post_version,
@ -1043,6 +1052,9 @@ async fn multi_statement_delete_on_same_node_table() {
pre_persons - 2,
"both deletes must land",
);
// One manifest version bump: the graph commit (delete-only queries record
// one too) rides the same publish CAS as the table-version rows
// (RFC-013 Phase 7).
let post_version = version_main(&db).await.unwrap();
assert_eq!(
post_version,