2026-04-10 20:49:41 +03:00
|
|
|
#![cfg(feature = "failpoints")]
|
|
|
|
|
|
|
|
|
|
mod helpers;
|
|
|
|
|
|
|
|
|
|
use fail::FailScenario;
|
2026-05-10 10:37:46 +00:00
|
|
|
use futures::FutureExt;
|
2026-04-10 20:49:41 +03:00
|
|
|
use omnigraph::db::Omnigraph;
|
|
|
|
|
use omnigraph::failpoints::ScopedFailPoint;
|
|
|
|
|
|
2026-05-05 16:04:48 +02:00
|
|
|
use helpers::recovery::{
|
|
|
|
|
FollowUpMutation, RecoveryExpectation, TableExpectation, assert_post_recovery_invariants,
|
|
|
|
|
branch_head_commit_id, single_sidecar_operation_id,
|
|
|
|
|
};
|
2026-05-03 15:09:58 +02:00
|
|
|
use helpers::{MUTATION_QUERIES, mixed_params, mutate_main, version_main};
|
2026-04-27 16:21:00 +03:00
|
|
|
|
|
|
|
|
// Baseline schema: a single `Person` node type keyed by `name`.
const SCHEMA_V1: &str = "node Person { name: String @key }\n";

// Baseline schema plus one additional `Company` node type (also keyed by
// `name`); one declaration per line.
const SCHEMA_V2_ADDED_TYPE: &str = concat!(
    "node Person { name: String @key }\n",
    "node Company { name: String @key }\n",
);
|
2026-04-10 20:49:41 +03:00
|
|
|
|
2026-05-05 16:04:48 +02:00
|
|
|
// Builds `<root>/nodes/<hash>` for the node type `type_name`.
//
// Trailing `/` characters are stripped from `root`, and `<hash>` is the 64-bit
// FNV-1a hash of the type name's bytes rendered as 16 zero-padded lowercase
// hex digits. The tests in this file use the result to open a node table
// directly as a Lance dataset.
// NOTE(review): presumably this mirrors the engine's own table-path layout —
// confirm against the storage code if that layout changes.
fn node_table_uri(root: &str, type_name: &str) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x100_0000_01b3;

    // FNV-1a: xor the byte in first, then multiply by the prime (wrapping).
    let digest = type_name.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
        (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    });
    let base = root.trim_end_matches('/');
    format!("{base}/nodes/{digest:016x}")
}
|
|
|
|
|
|
2026-04-10 20:49:41 +03:00
|
|
|
#[tokio::test]
|
|
|
|
|
async fn branch_create_failpoint_triggers() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap();
|
2026-05-08 16:26:23 +02:00
|
|
|
let db = Omnigraph::init(uri, helpers::TEST_SCHEMA).await.unwrap();
|
2026-04-10 20:49:41 +03:00
|
|
|
let _failpoint = ScopedFailPoint::new("branch_create.after_manifest_branch_create", "return");
|
|
|
|
|
|
|
|
|
|
let err = db.branch_create("feature").await.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: branch_create.after_manifest_branch_create")
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
fix(branch): make branch delete correct under partial failure (#137)
* test(lance): pin force_delete_branch surface guard
Pin the Lance 6.0.1 force_delete_branch behavior the branch-delete
single-authority redesign relies on: plain delete_branch errors on a
missing ref, force_delete_branch removes an existing forked branch, and
the local-store quirk where force_delete on a fully-absent branch still
errors (worked around by the upcoming TableStore::force_delete_branch).
Re-pin the docs/dev/lance.md alignment stanza (9 guards; 4 runtime).
* feat(storage): add force branch-delete to TableStore + CommitGraph
Add TableStore::force_delete_branch and CommitGraph::force_delete_branch
(idempotent: tolerate an already-absent branch via Lance RefNotFound /
NotFound), plus CommitGraph::list_branches for the cleanup reconciler to
diff against the manifest authority. RefConflict (referencing
descendants) is still surfaced. Unused until the branch-delete rewire.
* test(maintenance): red — cleanup reconciles orphaned branch forks
Forge a Lance branch on the Person table that the manifest never
references (a zombie fork from an incomplete prior delete) and assert
cleanup reclaims it while leaving main intact. Fails today: cleanup does
not yet reconcile orphaned forks. Goes green with the next commit.
* fix(maintenance): reconcile orphaned branch forks in cleanup
Add reconcile_orphaned_branches: force_delete_branch every per-table and
commit-graph Lance branch absent from the manifest branch set (the
authority), children-before-parents. Folded into cleanup_all_tables,
runs before version GC. Idempotent and authority-derived; no-ops once
nothing is orphaned, and would harmlessly find nothing if a future Lance
atomic multi-dataset branch op prevented orphans. Adds TableStore::list_branches
and exposes graph_commits_uri(pub crate). Turns the maintenance red test green.
* test(failpoints): red — branch_delete partial failure converges
Add the branch_delete.before_table_cleanup failpoint hook (inert without
the feature) and a regression test: a cleanup-step failure after the
manifest authority flip must leave branch_delete returning Ok, the branch
gone, the orphan stranded, then reclaimed by cleanup, and the name
reusable. Fails today: cleanup_deleted_branch_tables propagates the error
as a hard failure. Goes green with the next commit.
* fix(branch): best-effort fork reclaim after the manifest flip
Make branch_delete treat per-table forks and the commit-graph branch as
derived state reclaimed best-effort with force_delete_branch after the
manifest authority flip. A reclaim failure (transient error, or the
branch_delete.before_table_cleanup failpoint) is logged via tracing::warn
and swallowed: the branch is already gone and the cleanup reconciler
converges the orphan. cleanup_deleted_branch_tables no longer returns an
error or blocks the call. Turns the partial-failure recovery test green.
* test(failpoints): red — recreate over orphaned fork is actionable
After a partial-failure delete leaves a fork orphaned, recreating the
branch name and writing to the previously-forked table before cleanup
runs currently surfaces the opaque ExpectedVersionMismatch ("stale view
... expected manifest table version N"). Assert instead a clear error
pointing the user at cleanup. Goes green with the next commit.
* fix(branch): actionable orphan-collision error in fork_branch_from_state
When a fork's create_branch collides with an existing target ref, reuse
it only if its head matches source_version (a legitimate concurrent
first-write). A version mismatch means a zombie fork from an incomplete
prior delete: return a manifest_conflict pointing the user at
`omnigraph cleanup`, instead of the opaque ExpectedVersionMismatch.
Turns the recreate-over-orphan red test green.
* docs(invariants): single-authority branch-lifecycle + Lance forward-compat
Record branch delete in the Current Truth Matrix: manifest is the single
authority flipped atomically first, per-table forks + commit-graph branch
are derived state reclaimed best-effort with the cleanup reconciler as
backstop, and reusing a name whose reclaim failed surfaces an actionable
error. Note the reconciler is authority-derived and degrades to a no-op
under a future Lance atomic multi-dataset branch op, the same shape as
invariant 7.
* test(failpoints): red — cleanup isolates a single-table failure
Add the cleanup.table_gc failpoint hook (inert without the feature) and
an error: Option<String> field on TableCleanupStats (mechanical, always
None for now). Regression test: a one-shot version-GC failure for one
table must not abort the whole cleanup — assert cleanup still succeeds,
surfaces the failure per-table in stats, and the independent reconcile
pass still reclaimed an orphan. Fails today: the version-GC collect
aborts on the first table error. Goes green with the next commit.
* fix(maintenance): fault-isolate cleanup per table
Make the cleanup sweep do as much as it can and converge on re-run
instead of aborting wholesale on one table's transient error
(invariant 13). The version-GC loop now records a per-table failure on
its stats row (error: Some) and logs it rather than collecting into a
Result that aborts; reconcile_orphaned_branches isolates per-table and
commit-graph failures into BranchReconcileStats.failures. The CLI reports
any failed tables and tells the user to rerun cleanup. Addresses the
Devin review finding. Turns the single-table-failure test green.
* test(failpoints): red — branch_create heals commit-graph zombie + is atomic
Add the branch_delete.before_commit_graph_reclaim failpoint hook and two
regression tests: (a) recreating a name whose delete left a commit-graph
zombie must succeed (today it dies on Lance's internal Clone error), and
(b) branch_create must roll back the manifest branch when the derived
commit-graph branch fails (today it leaves the manifest branch created
while returning Err). Both fail now; green with the next commit. The
existing branch_create_failpoint_triggers test still passes.
* fix(branch): make branch_create atomic + heal commit-graph zombie
branch_create now flips the manifest authority first, then creates the
derived commit-graph branch in create_commit_graph_branch, force-dropping
any orphaned commit-graph ref left by an incomplete prior delete (the
manifest branch is fresh, so a same-named commit-graph branch is provably
a zombie). If commit-graph creation fails, the manifest branch is rolled
back so the name never half-exists. Addresses the Codex review finding.
Turns the two branch_create red tests green; existing tests unaffected.
* test(failpoints): red — fork collision misclassifies live concurrent fork
Add the fork.before_classify failpoint hook and a concurrency test: when
a concurrent first-write legitimately wins the fork race, the loser must
get a retryable refresh-and-retry, not the misleading run-cleanup orphan
error. Today the version-comparison misclassifies the live fork as an
orphan (the Cursor finding). Goes green with the next commit.
* fix(branch): manifest-arbitrated fork-collision classification
Classify a fork collision by the manifest authority instead of comparing
Lance branch versions. Before forking, open_owned_dataset_for_branch_write
re-reads the live manifest: if the table is already forked on the active
branch, a concurrent first-write won and the loser gets a retryable
refresh-and-retry (not a misleading orphan error). fork_branch_from_state
no longer guesses from versions — a create collision past that check is
an orphan, so it returns the actionable cleanup error. Addresses the
Cursor finding; turns the live-concurrent-fork test green, zombie path
unchanged.
* test(failpoints): close branch-lifecycle test gaps
Three coverage additions for the branch-delete work (behavior already
correct; these lock it in and catch regressions):
- cleanup_isolates_reconcile_failure: inject a force-delete failure into
the reconcile loop (new cleanup.reconcile_fork hook) and assert the
sweep continues + converges on re-run. Directly covers the reconcile
loop the Devin finding was about (previously only version-GC was).
- cleanup_reclaims_orphaned_commit_graph_branch: forge a commit-graph
orphan via the delete reclaim failpoint and assert cleanup's
reconcile_commit_graph_orphans drops it (previously untested).
- fork_collision_with_live_concurrent_fork_is_retryable: replace the
fixed 300ms sleep with a deterministic readiness signal (cfg_callback +
compare_exchange atomics) so the two-writer ordering can't flake.
Full failpoints suite 31/0.
2026-06-01 13:28:38 +02:00
|
|
|
// Branch delete flips the manifest authority first, then reclaims the per-table
|
|
|
|
|
// forks best-effort. A failure during that reclaim (here, the
|
|
|
|
|
// `branch_delete.before_table_cleanup` failpoint, standing in for a transient
|
|
|
|
|
// object-store error) must NOT fail the call: the branch is already gone, and
|
|
|
|
|
// `cleanup` reconciles the stranded fork. The branch name is reusable after.
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn branch_delete_partial_failure_converges_via_cleanup() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
let mut main = helpers::init_and_load(&dir).await;
|
|
|
|
|
|
|
|
|
|
main.branch_create("feature").await.unwrap();
|
|
|
|
|
let mut feature = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
helpers::mutate_branch(
|
|
|
|
|
&mut feature,
|
|
|
|
|
"feature",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
drop(feature);
|
|
|
|
|
|
|
|
|
|
let person_uri = node_table_uri(&uri, "Person");
|
|
|
|
|
{
|
|
|
|
|
let ds = lance::Dataset::open(&person_uri).await.unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
ds.list_branches().await.unwrap().contains_key("feature"),
|
|
|
|
|
"precondition: the owned table fork exists before delete"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Inject a failure during per-table cleanup, AFTER the manifest authority
|
|
|
|
|
// flip. branch_delete must still succeed (best-effort reclaim).
|
|
|
|
|
{
|
|
|
|
|
let _fp = ScopedFailPoint::new("branch_delete.before_table_cleanup", "return");
|
|
|
|
|
main.branch_delete("feature").await.expect(
|
|
|
|
|
"branch_delete is best-effort after the manifest flip: a cleanup-step \
|
|
|
|
|
failure must not fail the call",
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Authority flipped: the branch is gone.
|
|
|
|
|
assert_eq!(main.branch_list().await.unwrap(), vec!["main".to_string()]);
|
|
|
|
|
|
|
|
|
|
// The eager reclaim failed, so the orphan is stranded until cleanup.
|
|
|
|
|
{
|
|
|
|
|
let ds = lance::Dataset::open(&person_uri).await.unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
ds.list_branches().await.unwrap().contains_key("feature"),
|
|
|
|
|
"failed eager reclaim should leave the orphan for cleanup to reconcile"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// cleanup converges: the orphan is reclaimed.
|
|
|
|
|
main.cleanup(omnigraph::db::CleanupPolicyOptions {
|
|
|
|
|
keep_versions: Some(1),
|
|
|
|
|
older_than: None,
|
|
|
|
|
})
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
{
|
|
|
|
|
let ds = lance::Dataset::open(&person_uri).await.unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
!ds.list_branches().await.unwrap().contains_key("feature"),
|
|
|
|
|
"cleanup should reconcile the orphaned fork away"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// The name is reusable after cleanup reclaims the orphan.
|
|
|
|
|
main.branch_create("feature").await.unwrap();
|
|
|
|
|
let mut feature2 = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
helpers::mutate_branch(
|
|
|
|
|
&mut feature2,
|
|
|
|
|
"feature",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Frank")], &[("$age", 41)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Reusing a branch name whose delete left an orphaned fork (before `cleanup`
|
|
|
|
|
// reconciles it) must fail with a clear, actionable error pointing at
|
|
|
|
|
// `cleanup`, not the opaque `ExpectedVersionMismatch` that leaks from the fork
|
|
|
|
|
// path. The recreate itself succeeds; the first write to the previously-forked
|
|
|
|
|
// table is where the stale orphan collides.
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn recreate_over_orphaned_fork_before_cleanup_is_actionable() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
let mut main = helpers::init_and_load(&dir).await;
|
|
|
|
|
|
|
|
|
|
main.branch_create("feature").await.unwrap();
|
|
|
|
|
let mut feature = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
helpers::mutate_branch(
|
|
|
|
|
&mut feature,
|
|
|
|
|
"feature",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
drop(feature);
|
|
|
|
|
|
|
|
|
|
// Partial delete: leaves the Person fork orphaned (cleanup not yet run).
|
|
|
|
|
{
|
|
|
|
|
let _fp = ScopedFailPoint::new("branch_delete.before_table_cleanup", "return");
|
|
|
|
|
main.branch_delete("feature").await.unwrap();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Recreate the name and write to the previously-forked table WITHOUT a
|
|
|
|
|
// cleanup in between.
|
|
|
|
|
main.branch_create("feature").await.unwrap();
|
|
|
|
|
let mut feature2 = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
let err = helpers::mutate_branch(
|
|
|
|
|
&mut feature2,
|
|
|
|
|
"feature",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Frank")], &[("$age", 41)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.expect_err("write should collide with the stale orphaned fork");
|
|
|
|
|
|
|
|
|
|
let msg = err.to_string();
|
|
|
|
|
assert!(
|
|
|
|
|
msg.contains("cleanup")
|
|
|
|
|
&& (msg.contains("orphan") || msg.contains("incomplete prior delete")),
|
|
|
|
|
"expected an actionable orphaned-fork error pointing at cleanup, got: {msg}"
|
|
|
|
|
);
|
|
|
|
|
assert!(
|
|
|
|
|
!msg.contains("expected manifest table version"),
|
|
|
|
|
"should not surface the opaque ExpectedVersionMismatch, got: {msg}"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// cleanup is the guaranteed convergence backstop, so one table's transient
|
|
|
|
|
// failure must not abort the whole sweep. Inject a one-shot version-GC failure
|
|
|
|
|
// for a single table and assert: cleanup still succeeds, the failure is
|
|
|
|
|
// surfaced per-table in the returned stats, and the independent reconcile pass
|
|
|
|
|
// still reclaimed an orphan.
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn cleanup_isolates_single_table_failure() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
let mut db = helpers::init_and_load(&dir).await;
|
|
|
|
|
|
|
|
|
|
// Forge an orphaned fork on the Person table (a reconcile target).
|
|
|
|
|
let person_uri = node_table_uri(&uri, "Person");
|
|
|
|
|
{
|
|
|
|
|
let mut ds = lance::Dataset::open(&person_uri).await.unwrap();
|
|
|
|
|
let base = ds.version().version;
|
|
|
|
|
ds.create_branch("ghost", base, None).await.unwrap();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// One table's version GC fails once; the sweep must isolate it.
|
|
|
|
|
let _fp = ScopedFailPoint::new("cleanup.table_gc", "1*return");
|
|
|
|
|
let stats = db
|
|
|
|
|
.cleanup(omnigraph::db::CleanupPolicyOptions {
|
|
|
|
|
keep_versions: Some(1),
|
|
|
|
|
older_than: None,
|
|
|
|
|
})
|
|
|
|
|
.await
|
|
|
|
|
.expect("a single table's GC failure must not abort cleanup");
|
|
|
|
|
|
|
|
|
|
let errored = stats.iter().filter(|s| s.error.is_some()).count();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
errored, 1,
|
|
|
|
|
"exactly one table's GC failure should be surfaced in stats, got {errored}"
|
|
|
|
|
);
|
|
|
|
|
assert!(
|
|
|
|
|
stats.len() >= 4,
|
|
|
|
|
"every node+edge table should still appear in the stats"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// The reconcile pass is independent of the GC failure, so the orphan is gone.
|
|
|
|
|
{
|
|
|
|
|
let ds = lance::Dataset::open(&person_uri).await.unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
!ds.list_branches().await.unwrap().contains_key("ghost"),
|
|
|
|
|
"reconcile should reclaim the orphan despite the GC failure"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Companion to the version-GC isolation test, exercising the OTHER cleanup
|
|
|
|
|
// loop: a force-delete failure while reconciling one orphaned fork must be
|
|
|
|
|
// isolated (logged, not propagated) so the sweep continues, and a later
|
|
|
|
|
// cleanup converges. This is the loop the Devin finding was about.
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn cleanup_isolates_reconcile_failure() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
let mut db = helpers::init_and_load(&dir).await;
|
|
|
|
|
|
|
|
|
|
// Forge an orphaned fork the reconcile pass will try to reclaim.
|
|
|
|
|
let person_uri = node_table_uri(&uri, "Person");
|
|
|
|
|
{
|
|
|
|
|
let mut ds = lance::Dataset::open(&person_uri).await.unwrap();
|
|
|
|
|
let base = ds.version().version;
|
|
|
|
|
ds.create_branch("ghost", base, None).await.unwrap();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Inject a one-shot failure into the reconcile force-delete. The sweep must
|
|
|
|
|
// not abort.
|
|
|
|
|
{
|
|
|
|
|
let _fp = ScopedFailPoint::new("cleanup.reconcile_fork", "1*return");
|
|
|
|
|
db.cleanup(omnigraph::db::CleanupPolicyOptions {
|
|
|
|
|
keep_versions: Some(1),
|
|
|
|
|
older_than: None,
|
|
|
|
|
})
|
|
|
|
|
.await
|
|
|
|
|
.expect("a reconcile force-delete failure must not abort cleanup");
|
|
|
|
|
}
|
|
|
|
|
// The blocked orphan is still present (the failure was isolated, not retried).
|
|
|
|
|
{
|
|
|
|
|
let ds = lance::Dataset::open(&person_uri).await.unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
ds.list_branches().await.unwrap().contains_key("ghost"),
|
|
|
|
|
"the orphan whose reclaim was injected-to-fail should remain"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
// A second cleanup with no injected failure converges.
|
|
|
|
|
db.cleanup(omnigraph::db::CleanupPolicyOptions {
|
|
|
|
|
keep_versions: Some(1),
|
|
|
|
|
older_than: None,
|
|
|
|
|
})
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
{
|
|
|
|
|
let ds = lance::Dataset::open(&person_uri).await.unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
!ds.list_branches().await.unwrap().contains_key("ghost"),
|
|
|
|
|
"the second cleanup should reconcile the orphan"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// The cleanup reconciler must reclaim orphaned commit-graph branches, not just
|
|
|
|
|
// per-table forks. A delete whose best-effort commit-graph reclaim fails leaves
|
|
|
|
|
// a commit-graph orphan; the next cleanup must drop it.
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn cleanup_reclaims_orphaned_commit_graph_branch() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
let mut db = helpers::init_and_load(&dir).await;
|
|
|
|
|
|
|
|
|
|
db.branch_create("feature").await.unwrap();
|
|
|
|
|
// Delete, failing the commit-graph reclaim → commit-graph "feature" orphan
|
|
|
|
|
// (manifest branch gone, commit-graph branch left behind).
|
|
|
|
|
{
|
|
|
|
|
let _fp = ScopedFailPoint::new("branch_delete.before_commit_graph_reclaim", "return");
|
|
|
|
|
db.branch_delete("feature").await.unwrap();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let commits_uri = format!("{}/_graph_commits.lance", uri.trim_end_matches('/'));
|
|
|
|
|
{
|
|
|
|
|
let ds = lance::Dataset::open(&commits_uri).await.unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
ds.list_branches().await.unwrap().contains_key("feature"),
|
|
|
|
|
"precondition: the commit-graph branch should be orphaned after the failed reclaim"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
db.cleanup(omnigraph::db::CleanupPolicyOptions {
|
|
|
|
|
keep_versions: Some(1),
|
|
|
|
|
older_than: None,
|
|
|
|
|
})
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
let ds = lance::Dataset::open(&commits_uri).await.unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
!ds.list_branches().await.unwrap().contains_key("feature"),
|
|
|
|
|
"cleanup should reclaim the orphaned commit-graph branch"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// A branch_delete whose best-effort commit-graph reclaim fails leaves a
|
|
|
|
|
// commit-graph "zombie" branch. Recreating that name must heal the zombie and
|
|
|
|
|
// succeed (branch_create force-deletes a stale commit-graph ref since the
|
|
|
|
|
// manifest branch is created fresh), instead of dying on the leftover ref.
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn branch_create_recreates_over_commit_graph_zombie() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
db.branch_create("feature").await.unwrap();
|
|
|
|
|
{
|
|
|
|
|
// Fail the best-effort commit-graph reclaim → commit-graph "feature"
|
|
|
|
|
// zombie survives the delete (manifest authority still flips).
|
|
|
|
|
let _fp = ScopedFailPoint::new("branch_delete.before_commit_graph_reclaim", "return");
|
|
|
|
|
db.branch_delete("feature").await.unwrap();
|
|
|
|
|
}
|
|
|
|
|
assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]);
|
|
|
|
|
|
|
|
|
|
db.branch_create("feature")
|
|
|
|
|
.await
|
|
|
|
|
.expect("branch_create should heal the zombie commit-graph branch and succeed");
|
|
|
|
|
assert!(
|
|
|
|
|
db.branch_list()
|
|
|
|
|
.await
|
|
|
|
|
.unwrap()
|
|
|
|
|
.contains(&"feature".to_string())
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// branch_create is authority-then-derived: if the derived commit-graph branch
|
|
|
|
|
// cannot be created, the manifest branch (the authority) must be rolled back so
|
|
|
|
|
// the branch does not half-exist. The existing failpoint fires right after the
|
|
|
|
|
// manifest create, standing in for any post-authority failure.
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn branch_create_rolls_back_manifest_on_commit_graph_failure() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
let err = {
|
|
|
|
|
let _fp = ScopedFailPoint::new("branch_create.after_manifest_branch_create", "return");
|
|
|
|
|
db.branch_create("feature").await.unwrap_err()
|
|
|
|
|
};
|
|
|
|
|
assert!(
|
|
|
|
|
!db.branch_list()
|
|
|
|
|
.await
|
|
|
|
|
.unwrap()
|
|
|
|
|
.contains(&"feature".to_string()),
|
|
|
|
|
"branch_create must roll back the manifest branch when the derived \
|
|
|
|
|
commit-graph branch fails, got error: {err}"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// A fork collision must be classified by the manifest authority, not by Lance
|
|
|
|
|
// branch versions. When a concurrent first-write legitimately wins the fork
|
|
|
|
|
// race, the loser sees a version mismatch — but that is a stale snapshot, not
|
|
|
|
|
// an orphan, so it must be a retryable "refresh and retry", never a misleading
|
|
|
|
|
// "run cleanup".
|
|
|
|
|
//
|
|
|
|
|
// Ordering is made deterministic (no sleeps) via a callback at the fork point:
|
|
|
|
|
// `compare_exchange` lets only the FIRST arrival (writer A) record readiness and
|
|
|
|
|
// block until released; later arrivals (writer B) fall through. The test waits
|
|
|
|
|
// on the readiness flag, lets B win and commit the fork, then releases A.
|
|
|
|
|
static FORK_A_AT_POINT: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false);
|
|
|
|
|
static FORK_RELEASE_A: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false);
|
|
|
|
|
|
|
|
|
|
#[tokio::test(flavor = "multi_thread")]
|
|
|
|
|
async fn fork_collision_with_live_concurrent_fork_is_retryable() {
|
|
|
|
|
use std::sync::atomic::Ordering::SeqCst;
|
|
|
|
|
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
FORK_A_AT_POINT.store(false, SeqCst);
|
|
|
|
|
FORK_RELEASE_A.store(false, SeqCst);
|
|
|
|
|
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
let main = helpers::init_and_load(&dir).await;
|
|
|
|
|
main.branch_create("feature").await.unwrap();
|
|
|
|
|
|
|
|
|
|
// First arrival (A) records readiness and blocks until released; the rest
|
|
|
|
|
// (B) fall through immediately. Bounded spin so a mistake can't hang forever.
|
|
|
|
|
fail::cfg_callback("fork.before_classify", || {
|
|
|
|
|
if FORK_A_AT_POINT
|
|
|
|
|
.compare_exchange(false, true, SeqCst, SeqCst)
|
|
|
|
|
.is_ok()
|
|
|
|
|
{
|
|
|
|
|
for _ in 0..2000 {
|
|
|
|
|
if FORK_RELEASE_A.load(SeqCst) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
std::thread::sleep(std::time::Duration::from_millis(5));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
let uri_a = uri.clone();
|
|
|
|
|
let writer_a = tokio::spawn(async move {
|
|
|
|
|
let mut a = Omnigraph::open(&uri_a).await.unwrap();
|
|
|
|
|
helpers::mutate_branch(
|
|
|
|
|
&mut a,
|
|
|
|
|
"feature",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
// Wait (bounded) until A is parked at the fork point.
|
|
|
|
|
for _ in 0..600 {
|
|
|
|
|
if FORK_A_AT_POINT.load(SeqCst) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
tokio::time::sleep(std::time::Duration::from_millis(5)).await;
|
|
|
|
|
}
|
|
|
|
|
assert!(
|
|
|
|
|
FORK_A_AT_POINT.load(SeqCst),
|
|
|
|
|
"writer A never reached the fork point"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// B wins the fork and commits it.
|
|
|
|
|
let mut b = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
helpers::mutate_branch(
|
|
|
|
|
&mut b,
|
|
|
|
|
"feature",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Frank")], &[("$age", 41)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
// Release A; it resumes, re-reads the manifest, and sees the fork is live.
|
|
|
|
|
FORK_RELEASE_A.store(true, SeqCst);
|
|
|
|
|
let err = writer_a
|
|
|
|
|
.await
|
|
|
|
|
.unwrap()
|
|
|
|
|
.expect_err("A's stale-snapshot fork should be a retryable conflict");
|
|
|
|
|
fail::remove("fork.before_classify");
|
|
|
|
|
|
|
|
|
|
let msg = err.to_string();
|
|
|
|
|
assert!(
|
|
|
|
|
!msg.contains("cleanup"),
|
|
|
|
|
"a live concurrent fork must not be misclassified as an orphan, got: {msg}"
|
|
|
|
|
);
|
|
|
|
|
assert!(
|
|
|
|
|
msg.contains("refresh and retry") || msg.contains("expected manifest table version"),
|
|
|
|
|
"expected a retryable stale-view error, got: {msg}"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-10 10:37:46 +00:00
|
|
|
#[tokio::test(flavor = "multi_thread")]
|
2026-04-10 20:49:41 +03:00
|
|
|
async fn graph_publish_failpoint_triggers_before_commit_append() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let mut db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
let _failpoint = ScopedFailPoint::new("graph_publish.before_commit_append", "return");
|
|
|
|
|
|
|
|
|
|
let err = mutate_main(
|
|
|
|
|
&mut db,
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: graph_publish.before_commit_append")
|
|
|
|
|
);
|
|
|
|
|
}
|
2026-04-27 16:21:00 +03:00
|
|
|
|
|
|
|
|
// Atomic schema apply: schema apply writes staging files first, then commits
|
|
|
|
|
// the manifest, then renames staging → final. Tests below inject crashes at
|
2026-05-24 16:46:00 +01:00
|
|
|
// the two boundaries and assert that reopening the graph yields a consistent
|
2026-04-27 16:21:00 +03:00
|
|
|
// state.
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
recovery: register added tables + tombstones in SchemaApply roll-forward
Cursor flagged that SchemaApply sidecars only captured `Update` pins
(via `snapshot.entry()?` in schema_apply.rs:166), so recovery's
`roll_forward_all` only published `ManifestChange::Update` for the
rewritten/indexed tables. Added types (`added_tables`) and tombstones
(`renamed_tables` sources) were silently dropped during recovery.
Reproducer: in `schema_apply_phase_b_failure_recovered_on_next_open`,
the v2 schema added a `Tag` node type. Pre-fix, `node:Tag` ended up as
an orphan dataset on disk while the manifest never received a
`RegisterTable` entry — the live `_schema.pg` declared a type the
manifest didn't know about, and `count_rows(node:Tag)` panicked with
`no manifest entry for node:Tag`. The existing test passed only
because it never queried Tag.
Fix:
1. Extend `RecoverySidecar` with `additional_registrations` and
`tombstones` fields (optional, serde-default for backward compat
with existing on-disk sidecars). Both are SchemaApply-only.
2. Populate them in `apply_schema_with_lock` from the migration plan's
upfront diff (`added_tables` + `renamed_tables` keys for
registrations; `renamed_tables` values for tombstones, version-
pinned at `source_entry.table_version + 1`).
3. Update `roll_forward_all` to:
- emit `RegisterTable` + `Update` for each `additional_registrations`
entry (read the dataset's current Lance HEAD for the version
metadata + row_count)
- emit `Tombstone` for each `tombstones` entry
- filter against `snapshot` so previously-published registrations /
tombstones are skipped (handles the post-Phase-C-success-but-
sidecar-not-yet-deleted case — without filtering, the publisher's
CAS pre-check would error with `expected=0, actual=N` on the
redundant Register)
4. Extend the audit-row outcomes to include published registrations.
Test changes:
- `schema_apply_phase_b_failure_recovered_on_next_open` now asserts
`count_rows(node:Tag) == 0` (no panic), proving the new manifest
entry exists.
- `schema_apply_recovers_pre_commit_crash` renamed to
`schema_apply_pre_commit_crash_rolls_forward_via_sidecar` and
rewritten — pre-fix it expected pre-commit crashes to roll BACK
(delete staging, keep V1, leave Company as orphan); the sidecar
protocol's "complete the writer's intent" semantic now rolls
FORWARD (rename staging -> final, register Company atomically). The
new assertions verify schema = V2 and `node:Company` is queryable.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 22:15:50 +02:00
|
|
|
async fn schema_apply_pre_commit_crash_rolls_forward_via_sidecar() {
|
2026-04-27 16:21:00 +03:00
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
|
|
|
|
|
{
|
2026-05-08 16:26:23 +02:00
|
|
|
let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
|
2026-04-27 16:21:00 +03:00
|
|
|
let _failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "return");
|
|
|
|
|
let err = db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: schema_apply.after_staging_write"),
|
|
|
|
|
"got: {}",
|
|
|
|
|
err
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
recovery: register added tables + tombstones in SchemaApply roll-forward
Cursor flagged that SchemaApply sidecars only captured `Update` pins
(via `snapshot.entry()?` in schema_apply.rs:166), so recovery's
`roll_forward_all` only published `ManifestChange::Update` for the
rewritten/indexed tables. Added types (`added_tables`) and tombstones
(`renamed_tables` sources) were silently dropped during recovery.
Reproducer: in `schema_apply_phase_b_failure_recovered_on_next_open`,
the v2 schema added a `Tag` node type. Pre-fix, `node:Tag` ended up as
an orphan dataset on disk while the manifest never received a
`RegisterTable` entry — the live `_schema.pg` declared a type the
manifest didn't know about, and `count_rows(node:Tag)` panicked with
`no manifest entry for node:Tag`. The existing test passed only
because it never queried Tag.
Fix:
1. Extend `RecoverySidecar` with `additional_registrations` and
`tombstones` fields (optional, serde-default for backward compat
with existing on-disk sidecars). Both are SchemaApply-only.
2. Populate them in `apply_schema_with_lock` from the migration plan's
upfront diff (`added_tables` + `renamed_tables` keys for
registrations; `renamed_tables` values for tombstones, version-
pinned at `source_entry.table_version + 1`).
3. Update `roll_forward_all` to:
- emit `RegisterTable` + `Update` for each `additional_registrations`
entry (read the dataset's current Lance HEAD for the version
metadata + row_count)
- emit `Tombstone` for each `tombstones` entry
- filter against `snapshot` so previously-published registrations /
tombstones are skipped (handles the post-Phase-C-success-but-
sidecar-not-yet-deleted case — without filtering, the publisher's
CAS pre-check would error with `expected=0, actual=N` on the
redundant Register)
4. Extend the audit-row outcomes to include published registrations.
Test changes:
- `schema_apply_phase_b_failure_recovered_on_next_open` now asserts
`count_rows(node:Tag) == 0` (no panic), proving the new manifest
entry exists.
- `schema_apply_recovers_pre_commit_crash` renamed to
`schema_apply_pre_commit_crash_rolls_forward_via_sidecar` and
rewritten — pre-fix it expected pre-commit crashes to roll BACK
(delete staging, keep V1, leave Company as orphan); the sidecar
protocol's "complete the writer's intent" semantic now rolls
FORWARD (rename staging -> final, register Company atomically). The
new assertions verify schema = V2 and `node:Company` is queryable.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 22:15:50 +02:00
|
|
|
// Reopen. With the sidecar protocol, a Phase B → Phase C crash
|
|
|
|
|
// (per-table commit_staged done; manifest publish not yet) is
|
|
|
|
|
// recoverable: the sidecar's `additional_registrations` carries the
|
|
|
|
|
// intent to register `node:Company`, schema-state recovery promotes
|
|
|
|
|
// the staging files, and the manifest-drift sweep publishes the
|
|
|
|
|
// RegisterTable + Update so the manifest catches up to the schema
|
|
|
|
|
// the writer already declared. The orphan-dataset-on-disk-with-no-
|
|
|
|
|
// manifest-entry corruption that pre-this-protocol recoveries left
|
|
|
|
|
// behind is closed.
|
2026-04-27 16:21:00 +03:00
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
recovery: register added tables + tombstones in SchemaApply roll-forward
Cursor flagged that SchemaApply sidecars only captured `Update` pins
(via `snapshot.entry()?` in schema_apply.rs:166), so recovery's
`roll_forward_all` only published `ManifestChange::Update` for the
rewritten/indexed tables. Added types (`added_tables`) and tombstones
(`renamed_tables` sources) were silently dropped during recovery.
Reproducer: in `schema_apply_phase_b_failure_recovered_on_next_open`,
the v2 schema added a `Tag` node type. Pre-fix, `node:Tag` ended up as
an orphan dataset on disk while the manifest never received a
`RegisterTable` entry — the live `_schema.pg` declared a type the
manifest didn't know about, and `count_rows(node:Tag)` panicked with
`no manifest entry for node:Tag`. The existing test passed only
because it never queried Tag.
Fix:
1. Extend `RecoverySidecar` with `additional_registrations` and
`tombstones` fields (optional, serde-default for backward compat
with existing on-disk sidecars). Both are SchemaApply-only.
2. Populate them in `apply_schema_with_lock` from the migration plan's
upfront diff (`added_tables` + `renamed_tables` keys for
registrations; `renamed_tables` values for tombstones, version-
pinned at `source_entry.table_version + 1`).
3. Update `roll_forward_all` to:
- emit `RegisterTable` + `Update` for each `additional_registrations`
entry (read the dataset's current Lance HEAD for the version
metadata + row_count)
- emit `Tombstone` for each `tombstones` entry
- filter against `snapshot` so previously-published registrations /
tombstones are skipped (handles the post-Phase-C-success-but-
sidecar-not-yet-deleted case — without filtering, the publisher's
CAS pre-check would error with `expected=0, actual=N` on the
redundant Register)
4. Extend the audit-row outcomes to include published registrations.
Test changes:
- `schema_apply_phase_b_failure_recovered_on_next_open` now asserts
`count_rows(node:Tag) == 0` (no panic), proving the new manifest
entry exists.
- `schema_apply_recovers_pre_commit_crash` renamed to
`schema_apply_pre_commit_crash_rolls_forward_via_sidecar` and
rewritten — pre-fix it expected pre-commit crashes to roll BACK
(delete staging, keep V1, leave Company as orphan); the sidecar
protocol's "complete the writer's intent" semantic now rolls
FORWARD (rename staging -> final, register Company atomically). The
new assertions verify schema = V2 and `node:Company` is queryable.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 22:15:50 +02:00
|
|
|
assert_eq!(
|
2026-05-08 16:26:23 +02:00
|
|
|
db.schema_source().as_str(),
|
recovery: register added tables + tombstones in SchemaApply roll-forward
Cursor flagged that SchemaApply sidecars only captured `Update` pins
(via `snapshot.entry()?` in schema_apply.rs:166), so recovery's
`roll_forward_all` only published `ManifestChange::Update` for the
rewritten/indexed tables. Added types (`added_tables`) and tombstones
(`renamed_tables` sources) were silently dropped during recovery.
Reproducer: in `schema_apply_phase_b_failure_recovered_on_next_open`,
the v2 schema added a `Tag` node type. Pre-fix, `node:Tag` ended up as
an orphan dataset on disk while the manifest never received a
`RegisterTable` entry — the live `_schema.pg` declared a type the
manifest didn't know about, and `count_rows(node:Tag)` panicked with
`no manifest entry for node:Tag`. The existing test passed only
because it never queried Tag.
Fix:
1. Extend `RecoverySidecar` with `additional_registrations` and
`tombstones` fields (optional, serde-default for backward compat
with existing on-disk sidecars). Both are SchemaApply-only.
2. Populate them in `apply_schema_with_lock` from the migration plan's
upfront diff (`added_tables` + `renamed_tables` keys for
registrations; `renamed_tables` values for tombstones, version-
pinned at `source_entry.table_version + 1`).
3. Update `roll_forward_all` to:
- emit `RegisterTable` + `Update` for each `additional_registrations`
entry (read the dataset's current Lance HEAD for the version
metadata + row_count)
- emit `Tombstone` for each `tombstones` entry
- filter against `snapshot` so previously-published registrations /
tombstones are skipped (handles the post-Phase-C-success-but-
sidecar-not-yet-deleted case — without filtering, the publisher's
CAS pre-check would error with `expected=0, actual=N` on the
redundant Register)
4. Extend the audit-row outcomes to include published registrations.
Test changes:
- `schema_apply_phase_b_failure_recovered_on_next_open` now asserts
`count_rows(node:Tag) == 0` (no panic), proving the new manifest
entry exists.
- `schema_apply_recovers_pre_commit_crash` renamed to
`schema_apply_pre_commit_crash_rolls_forward_via_sidecar` and
rewritten — pre-fix it expected pre-commit crashes to roll BACK
(delete staging, keep V1, leave Company as orphan); the sidecar
protocol's "complete the writer's intent" semantic now rolls
FORWARD (rename staging -> final, register Company atomically). The
new assertions verify schema = V2 and `node:Company` is queryable.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 22:15:50 +02:00
|
|
|
SCHEMA_V2_ADDED_TYPE,
|
|
|
|
|
"live schema must reflect the rolled-forward apply (Company added)"
|
|
|
|
|
);
|
2026-04-27 16:21:00 +03:00
|
|
|
assert_no_staging_files(dir.path());
|
recovery: register added tables + tombstones in SchemaApply roll-forward
Cursor flagged that SchemaApply sidecars only captured `Update` pins
(via `snapshot.entry()?` in schema_apply.rs:166), so recovery's
`roll_forward_all` only published `ManifestChange::Update` for the
rewritten/indexed tables. Added types (`added_tables`) and tombstones
(`renamed_tables` sources) were silently dropped during recovery.
Reproducer: in `schema_apply_phase_b_failure_recovered_on_next_open`,
the v2 schema added a `Tag` node type. Pre-fix, `node:Tag` ended up as
an orphan dataset on disk while the manifest never received a
`RegisterTable` entry — the live `_schema.pg` declared a type the
manifest didn't know about, and `count_rows(node:Tag)` panicked with
`no manifest entry for node:Tag`. The existing test passed only
because it never queried Tag.
Fix:
1. Extend `RecoverySidecar` with `additional_registrations` and
`tombstones` fields (optional, serde-default for backward compat
with existing on-disk sidecars). Both are SchemaApply-only.
2. Populate them in `apply_schema_with_lock` from the migration plan's
upfront diff (`added_tables` + `renamed_tables` keys for
registrations; `renamed_tables` values for tombstones, version-
pinned at `source_entry.table_version + 1`).
3. Update `roll_forward_all` to:
- emit `RegisterTable` + `Update` for each `additional_registrations`
entry (read the dataset's current Lance HEAD for the version
metadata + row_count)
- emit `Tombstone` for each `tombstones` entry
- filter against `snapshot` so previously-published registrations /
tombstones are skipped (handles the post-Phase-C-success-but-
sidecar-not-yet-deleted case — without filtering, the publisher's
CAS pre-check would error with `expected=0, actual=N` on the
redundant Register)
4. Extend the audit-row outcomes to include published registrations.
Test changes:
- `schema_apply_phase_b_failure_recovered_on_next_open` now asserts
`count_rows(node:Tag) == 0` (no panic), proving the new manifest
entry exists.
- `schema_apply_recovers_pre_commit_crash` renamed to
`schema_apply_pre_commit_crash_rolls_forward_via_sidecar` and
rewritten — pre-fix it expected pre-commit crashes to roll BACK
(delete staging, keep V1, leave Company as orphan); the sidecar
protocol's "complete the writer's intent" semantic now rolls
FORWARD (rename staging -> final, register Company atomically). The
new assertions verify schema = V2 and `node:Company` is queryable.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 22:15:50 +02:00
|
|
|
// node:Company must be registered in the manifest (queryable);
|
|
|
|
|
// pre-protocol recoveries left it as an orphan dataset on disk.
|
|
|
|
|
let company_rows = helpers::count_rows(&db, "node:Company").await;
|
|
|
|
|
assert_eq!(
|
|
|
|
|
company_rows, 0,
|
|
|
|
|
"node:Company must have a manifest entry post-recovery"
|
|
|
|
|
);
|
2026-04-27 16:21:00 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn schema_apply_recovers_post_commit_crash() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
|
|
|
|
|
{
|
2026-05-08 16:26:23 +02:00
|
|
|
let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
|
2026-04-27 16:21:00 +03:00
|
|
|
let _failpoint = ScopedFailPoint::new("schema_apply.after_manifest_commit", "return");
|
|
|
|
|
let err = db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: schema_apply.after_manifest_commit"),
|
|
|
|
|
"got: {}",
|
|
|
|
|
err
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Reopen — manifest is at the new version, so recovery sweep should
|
|
|
|
|
// complete the rename and the live schema matches v2.
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
2026-05-08 16:26:23 +02:00
|
|
|
assert_eq!(db.schema_source().as_str(), SCHEMA_V2_ADDED_TYPE);
|
2026-04-27 16:21:00 +03:00
|
|
|
assert_no_staging_files(dir.path());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn schema_apply_recovers_partial_rename() {
|
|
|
|
|
// Construct a partial-rename state: _schema.pg has been renamed in
|
|
|
|
|
// (matching v2), but _schema.ir.json.staging and __schema_state.json.staging
|
|
|
|
|
// were never renamed. Recovery should detect that the live source matches
|
|
|
|
|
// the staging state's hash and complete the remaining renames.
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
|
|
|
|
|
{
|
2026-05-08 16:26:23 +02:00
|
|
|
let db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
|
2026-04-27 16:21:00 +03:00
|
|
|
db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Simulate: one of the renames (the IR or state file) didn't complete by
|
|
|
|
|
// copying the live ir/state files back to their staging names.
|
|
|
|
|
std::fs::copy(
|
|
|
|
|
dir.path().join("_schema.ir.json"),
|
|
|
|
|
dir.path().join("_schema.ir.json.staging"),
|
|
|
|
|
)
|
|
|
|
|
.unwrap();
|
|
|
|
|
std::fs::copy(
|
|
|
|
|
dir.path().join("__schema_state.json"),
|
|
|
|
|
dir.path().join("__schema_state.json.staging"),
|
|
|
|
|
)
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
// Reopen — recovery should complete the rename (overwriting final files
|
|
|
|
|
// with identical staging content) and remove the staging files.
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
2026-05-08 16:26:23 +02:00
|
|
|
assert_eq!(db.schema_source().as_str(), SCHEMA_V2_ADDED_TYPE);
|
2026-04-27 16:21:00 +03:00
|
|
|
assert_no_staging_files(dir.path());
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-03 13:56:36 +02:00
|
|
|
/// Prove the recovery sweep closes the "finalize → publisher residual"
|
|
|
|
|
/// across one open cycle.
|
2026-05-01 13:47:55 +02:00
|
|
|
///
|
|
|
|
|
/// `MutationStaging::finalize` runs `commit_staged` per touched table
|
|
|
|
|
/// sequentially before the publisher commits the manifest. Lance has no
|
|
|
|
|
/// multi-dataset atomic commit primitive, so a failure between the
|
|
|
|
|
/// per-table staged commits and the manifest commit leaves Lance HEAD
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
/// advanced on the touched tables with no manifest update.
|
2026-05-01 13:47:55 +02:00
|
|
|
///
|
2026-05-03 13:56:36 +02:00
|
|
|
/// Closing the residual: finalize writes a sidecar at
|
|
|
|
|
/// `__recovery/{ulid}.json` BEFORE Phase B, the failpoint fires AFTER
|
|
|
|
|
/// finalize but BEFORE the publisher, the engine handle is dropped, and
|
|
|
|
|
/// the next `Omnigraph::open` runs the recovery sweep. The sweep
|
|
|
|
|
/// classifies every table in the sidecar as `RolledPastExpected` (Lance
|
|
|
|
|
/// HEAD == expected + 1, post_commit_pin matches), decides RollForward,
|
|
|
|
|
/// atomically extends every manifest pin via
|
|
|
|
|
/// `ManifestBatchPublisher::publish`, records an audit row, and deletes
|
|
|
|
|
/// the sidecar.
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
///
|
|
|
|
|
/// After this test passes:
|
|
|
|
|
/// - The originally-attempted insert ("Eve") is visible via a normal
|
|
|
|
|
/// query.
|
|
|
|
|
/// - The next mutation succeeds without `ExpectedVersionMismatch`.
|
|
|
|
|
/// - `_graph_commit_recoveries.lance` carries an audit row with
|
|
|
|
|
/// `recovery_kind=RolledForward` and the original sidecar's
|
|
|
|
|
/// `actor_id` in `recovery_for_actor`.
|
|
|
|
|
///
|
|
|
|
|
/// Continuous in-process recovery (no restart needed between failure
|
2026-05-03 13:56:36 +02:00
|
|
|
/// and recovery) is the goal of a future background reconciler.
|
2026-05-01 13:47:55 +02:00
|
|
|
#[tokio::test]
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
async fn recovery_rolls_forward_after_finalize_publisher_failure() {
|
2026-05-01 13:47:55 +02:00
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
2026-05-05 16:04:48 +02:00
|
|
|
let operation_id;
|
2026-05-01 13:47:55 +02:00
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
// Setup: trigger the residual.
|
2026-05-01 13:47:55 +02:00
|
|
|
{
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
2026-05-05 16:04:48 +02:00
|
|
|
let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return");
|
2026-05-01 13:47:55 +02:00
|
|
|
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
// The mutation's finalize completes (commit_staged advances Lance
|
|
|
|
|
// HEAD on node:Person AND writes a `__recovery/{ulid}.json`
|
|
|
|
|
// sidecar). Then the failpoint kicks in before the publisher's
|
|
|
|
|
// manifest commit, so the manifest pin stays at the pre-write
|
|
|
|
|
// version. The sidecar persists for the next-open recovery sweep.
|
2026-05-01 13:47:55 +02:00
|
|
|
let err = mutate_main(
|
|
|
|
|
&mut db,
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap_err();
|
|
|
|
|
assert!(
|
2026-05-05 16:04:48 +02:00
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: mutation.post_finalize_pre_publisher"),
|
2026-05-01 13:47:55 +02:00
|
|
|
"unexpected error: {err}"
|
|
|
|
|
);
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
|
|
|
|
|
// Sidecar must still exist on disk for the recovery sweep to find.
|
|
|
|
|
let recovery_dir = dir.path().join("__recovery");
|
|
|
|
|
let sidecars: Vec<_> = std::fs::read_dir(&recovery_dir)
|
|
|
|
|
.unwrap()
|
|
|
|
|
.filter_map(|e| e.ok())
|
|
|
|
|
.collect();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
sidecars.len(),
|
|
|
|
|
1,
|
|
|
|
|
"exactly one sidecar should persist after the finalize failure"
|
|
|
|
|
);
|
2026-05-05 16:04:48 +02:00
|
|
|
operation_id = single_sidecar_operation_id(dir.path());
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
|
|
|
|
|
// Drop the failpoint scope and the engine handle.
|
2026-05-01 13:47:55 +02:00
|
|
|
}
|
|
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
// Recovery: reopen runs the recovery sweep. The sweep finds the
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
// sidecar, classifies node:Person as RolledPastExpected, decides
|
|
|
|
|
// RollForward, publishes the manifest update, records the audit
|
|
|
|
|
// row, deletes the sidecar.
|
2026-05-05 16:04:48 +02:00
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
|
|
|
|
|
// The originally-attempted "Eve" insert is now visible — the recovery
|
|
|
|
|
// sweep extended the manifest pin to include the staged commit.
|
|
|
|
|
let person_count = helpers::count_rows(&db, "node:Person").await;
|
|
|
|
|
assert_eq!(
|
|
|
|
|
person_count, 1,
|
|
|
|
|
"exactly one person (Eve) must be visible after roll-forward"
|
|
|
|
|
);
|
2026-05-05 16:04:48 +02:00
|
|
|
drop(db);
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
|
2026-05-05 16:04:48 +02:00
|
|
|
assert_post_recovery_invariants(
|
|
|
|
|
dir.path(),
|
|
|
|
|
&operation_id,
|
|
|
|
|
RecoveryExpectation::RolledForward {
|
|
|
|
|
tables: vec![TableExpectation::main("node:Person").follow_up_mutation(
|
|
|
|
|
FollowUpMutation::new(
|
|
|
|
|
"main",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
mixed_params(&[("$name", "Frank")], &[("$age", 33)]),
|
|
|
|
|
),
|
|
|
|
|
)],
|
|
|
|
|
},
|
2026-05-01 13:47:55 +02:00
|
|
|
)
|
|
|
|
|
.await
|
2026-05-05 16:04:48 +02:00
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
let person_count = helpers::count_rows(&db, "node:Person").await;
|
2026-05-01 13:47:55 +02:00
|
|
|
assert_eq!(
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
person_count, 2,
|
|
|
|
|
"Frank's insert must land normally after recovery"
|
2026-05-01 13:47:55 +02:00
|
|
|
);
|
2026-05-05 16:04:48 +02:00
|
|
|
}
|
recovery: wire sidecar into MutationStaging::finalize + flip headline test (Phase 5)
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:15:37 +02:00
|
|
|
|
2026-05-10 10:37:46 +00:00
|
|
|
#[tokio::test]
|
|
|
|
|
async fn inline_delete_conflict_writes_sidecar_before_rejecting() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
let db = helpers::init_and_load(&dir).await;
|
|
|
|
|
|
|
|
|
|
let pre_snapshot = db
|
|
|
|
|
.snapshot_of(omnigraph::db::ReadTarget::branch("main"))
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
let pre_person_pin = pre_snapshot.entry("node:Person").unwrap().table_version;
|
|
|
|
|
let person_uri = node_table_uri(&uri, "Person");
|
|
|
|
|
|
|
|
|
|
{
|
2026-05-24 16:46:00 +01:00
|
|
|
let _pause_delete =
|
|
|
|
|
ScopedFailPoint::new("mutation.delete_node_pre_primary_delete", "pause");
|
2026-05-10 10:37:46 +00:00
|
|
|
let delete_params = helpers::params(&[("$name", "Alice")]);
|
2026-05-24 16:46:00 +01:00
|
|
|
let delete = db.mutate("main", MUTATION_QUERIES, "remove_person", &delete_params);
|
2026-05-10 10:37:46 +00:00
|
|
|
tokio::pin!(delete);
|
|
|
|
|
|
|
|
|
|
let mut concurrent_update_succeeded = false;
|
|
|
|
|
for _ in 0..50 {
|
|
|
|
|
if delete.as_mut().now_or_never().is_some() {
|
|
|
|
|
panic!("delete mutation completed before primary-delete failpoint was released");
|
|
|
|
|
}
|
|
|
|
|
let mut concurrent = Omnigraph::open_read_only(&uri).await.unwrap();
|
|
|
|
|
if mutate_main(
|
|
|
|
|
&mut concurrent,
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"set_age",
|
|
|
|
|
&mixed_params(&[("$name", "Bob")], &[("$age", 26)]),
|
|
|
|
|
)
|
2026-05-24 16:46:00 +01:00
|
|
|
.await
|
|
|
|
|
.is_ok()
|
2026-05-10 10:37:46 +00:00
|
|
|
{
|
|
|
|
|
concurrent_update_succeeded = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
tokio::time::sleep(std::time::Duration::from_millis(20)).await;
|
|
|
|
|
}
|
2026-05-24 16:46:00 +01:00
|
|
|
assert!(
|
|
|
|
|
concurrent_update_succeeded,
|
|
|
|
|
"concurrent update must land while delete is paused"
|
|
|
|
|
);
|
2026-05-10 10:37:46 +00:00
|
|
|
fail::remove("mutation.delete_node_pre_primary_delete");
|
|
|
|
|
|
|
|
|
|
let err = delete.await.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string().contains("stale view of 'node:Person'")
|
|
|
|
|
|| err.to_string().contains("ExpectedVersionMismatch")
|
|
|
|
|
|| err.to_string().contains("expected version mismatch"),
|
|
|
|
|
"unexpected error: {err}",
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let person_head = lance::Dataset::open(&person_uri)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap()
|
|
|
|
|
.version()
|
|
|
|
|
.version;
|
|
|
|
|
assert!(
|
|
|
|
|
person_head > pre_person_pin,
|
|
|
|
|
"primary inline delete must have advanced node:Person before rejecting"
|
|
|
|
|
);
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows(&db, "node:Person").await,
|
|
|
|
|
4,
|
|
|
|
|
"manifest-conflicted delete must not remove net Person rows after recovery"
|
|
|
|
|
);
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows(&db, "edge:Knows").await,
|
|
|
|
|
3,
|
|
|
|
|
"manifest-conflicted delete must not remove net Knows rows after recovery"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-05 16:04:48 +02:00
|
|
|
#[tokio::test]
|
|
|
|
|
async fn recovery_rolls_forward_load_on_feature_branch() {
|
|
|
|
|
use omnigraph::loader::LoadMode;
|
|
|
|
|
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
let operation_id;
|
|
|
|
|
let main_person_pin;
|
|
|
|
|
let feature_parent_commit_id;
|
|
|
|
|
|
|
|
|
|
{
|
2026-05-08 16:26:23 +02:00
|
|
|
let db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
2026-05-05 16:04:48 +02:00
|
|
|
db.branch_create("feature").await.unwrap();
|
|
|
|
|
db.mutate(
|
|
|
|
|
"feature",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "BeforeLoad")], &[("$age", 40)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
main_person_pin = db
|
|
|
|
|
.snapshot_of(omnigraph::db::ReadTarget::branch("main"))
|
|
|
|
|
.await
|
|
|
|
|
.unwrap()
|
|
|
|
|
.entry("node:Person")
|
|
|
|
|
.expect("main must have Person")
|
|
|
|
|
.table_version;
|
|
|
|
|
feature_parent_commit_id = branch_head_commit_id(dir.path(), "feature").await.unwrap();
|
|
|
|
|
|
|
|
|
|
let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return");
|
|
|
|
|
let err = db
|
|
|
|
|
.load(
|
|
|
|
|
"feature",
|
|
|
|
|
r#"{"type":"Person","data":{"name":"FeatureLoad","age":41}}
|
|
|
|
|
"#,
|
|
|
|
|
LoadMode::Append,
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: mutation.post_finalize_pre_publisher"),
|
|
|
|
|
"unexpected error: {err}"
|
|
|
|
|
);
|
|
|
|
|
operation_id = single_sidecar_operation_id(dir.path());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows_branch(&db, "feature", "node:Person").await,
|
|
|
|
|
2,
|
|
|
|
|
"feature branch load row must be visible after recovery"
|
|
|
|
|
);
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows(&db, "node:Person").await,
|
|
|
|
|
0,
|
|
|
|
|
"feature branch load recovery must not publish the row to main"
|
|
|
|
|
);
|
|
|
|
|
drop(db);
|
|
|
|
|
|
|
|
|
|
assert_post_recovery_invariants(
|
|
|
|
|
dir.path(),
|
|
|
|
|
&operation_id,
|
|
|
|
|
RecoveryExpectation::RolledForward {
|
|
|
|
|
tables: vec![
|
|
|
|
|
TableExpectation::branch("node:Person", "feature")
|
|
|
|
|
.expected_main_manifest_pin(main_person_pin)
|
|
|
|
|
.expected_recovery_parent_commit_id(feature_parent_commit_id)
|
|
|
|
|
.follow_up_mutation(FollowUpMutation::new(
|
|
|
|
|
"feature",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
mixed_params(&[("$name", "AfterLoad")], &[("$age", 42)]),
|
|
|
|
|
)),
|
|
|
|
|
],
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows_branch(&db, "feature", "node:Person").await,
|
|
|
|
|
3,
|
|
|
|
|
"follow-up feature mutation must succeed after load recovery"
|
|
|
|
|
);
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows(&db, "node:Person").await,
|
|
|
|
|
0,
|
|
|
|
|
"follow-up feature mutation must not move main"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
(feat) convert engine call sites to &dyn TableStorage; demote legacy TableStore methods to pub(crate) (#86)
* MR-854: convert engine call sites to &dyn TableStorage; demote legacy methods
Phase 1b: every db.table_store.X(...) call site converts to
db.storage().X(...), reaching the storage layer through the sealed
TableStorage trait (returns &dyn TableStorage). Opaque SnapshotHandle
and StagedHandle replace bare lance::Dataset and Transaction in the
threaded values.
Phase 9: the inherent inline-commit methods on TableStore
(append_batch, merge_insert_batch{,es}, overwrite_batch,
create_btree_index, create_inverted_index) demote from pub to
pub(crate). Their only remaining direct users are table_store.rs
itself and the bulk loader's LoadMode::{Append, Overwrite, Merge}
concurrent fast-paths in loader::write_batch_to_dataset (no
two-phase shape in Lance 4.0.0 — closes after lance#6658 and #6666).
Docs:
- invariants.md §VI.23: drop "at the writer-trait surface"
qualifier; staged primitives are now the only engine surface.
- runs.md: residual matrix shrinks to delete_where and
create_vector_index (the two upstream-blocked residuals).
- forbidden_apis.rs: replace transitional language with the
current allow-list shape (table_store.rs + loader concurrent
fast-path only).
Files touched:
- changes/mod.rs, db/omnigraph.rs (+export/optimize/schema_apply/
table_ops.rs), exec/{merge,mod,mutation,staging}.rs,
loader/mod.rs, storage_layer.rs, table_store.rs,
tests/forbidden_apis.rs, docs/{invariants,runs}.md.
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
* MR-854: replace test-only inline-commit append callers with local Lance helpers
After demoting TableStore::append_batch from pub to pub(crate), the
integration tests in tests/recovery.rs and tests/staged_writes.rs
that previously called store.append_batch(...) directly to simulate
HEAD-ahead-of-manifest drift can no longer access the inherent
method. Replace those calls with small in-test helpers that do a raw
Dataset::append (the same body the inherent method runs).
- tests/helpers/mod.rs gains lance_append_inline (shared helper).
- tests/staged_writes.rs gets a file-local lance_append_inline_local
(staged_writes.rs does not import helpers::).
- tests/recovery.rs drops the unused TableStore import in the one
function whose store binding became unused after the conversion.
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
* MR-854: retrigger CI for flaky Test Workspace job
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
* MR-854: convert remaining table_store call sites in export.rs / read_blob
Two leftover `self.table_store.X` / `db.table_store.X` call sites were
missed in the initial sweep — flagged by Devin Review on PR #86. Both
now go through the trait surface:
- `entity_from_snapshot` (db/omnigraph/export.rs): switch from
`db.table_store.open_snapshot_table` + `db.table_store.scan` to
`db.storage().open_snapshot_at_table` + `db.storage().scan`.
- `read_blob` (db/omnigraph.rs): replace
`snapshot.open(table_key)` + `self.table_store.first_row_id_for_filter`
with `self.storage().open_snapshot_at_table` +
`self.storage().first_row_id_for_filter`. The follow-up
`take_blobs` call still needs an `Arc<Dataset>` (it's a Lance blob
accessor not surfaced through the trait), so we hand off via
`SnapshotHandle::into_arc()` with a comment.
After this commit, no engine code outside `table_store.rs` reaches the
inherent `TableStore` API — the docs/runs.md and docs/invariants.md
claim is now uniformly true.
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
* MR-854: post-rebase doc fixes (Lance 6.0.1, MR-A framing, into_dataset note)
Reviewer feedback on the rebased PR:
* docs/dev/writes.md residuals matrix: drop demoted methods from the trait-surface table (now `pub(crate)`); keep only the two genuine trait-surface residuals (`delete_where`, `create_vector_index`); reframe under MR-A (Lance v7.x bump) per docs/dev/lance.md.
* tests/forbidden_apis.rs: update transitional allow-list header to (a) drop the truncate_table mislabel (truncate_table is a Lance Dataset method, not a TableStore method — overwrite_batch's internal call), (b) reframe trait-surface residuals under MR-A / Lance #6666.
* crates/omnigraph/src/storage_layer.rs::SnapshotHandle::{into_arc, into_dataset}: add single-ref invariant doc — both consume Arc via try_unwrap-or-clone; sibling SnapshotHandle clones across an await point force a deep Dataset clone.
* Replace lance-4.0.0 version refs with lance-6.0.1 in active source/test/dev-doc comments (storage_layer.rs, table_store.rs, table_ops.rs, schema_apply.rs, merge.rs, recovery.rs, staged_writes.rs, consistency.rs, docs/dev/execution.md, docs/user/query-language.md). Historical refs in docs/releases/v0.4.1.md and the canonical "Lance 4.0.0 → 6.0.1 migration" line in docs/dev/lance.md left intact.
No engine code changes.
* MR-854: update docs/dev/invariants.md Storage trait row + gap entry
Reviewer feedback: the docs reorg landed; the invariant row now lives in
docs/dev/invariants.md with stable headings (no more numbered §VI.23).
Update two pieces to reflect MR-854 completion:
* Status table 'Storage trait' row: was 'full call-site migration ... incomplete';
now 'engine call sites all route through db.storage() (MR-854); inline-commit
inherent methods are pub(crate)-demoted; capability/stat surfaces are roadmap'.
* 'Known Gaps' 'Storage abstraction' entry: was 'older inherent TableStore call
sites and inline residuals remain'; now names the closed scope (MR-854 — call
sites migrated, methods demoted, loader fast-paths) and the remaining
trait-surface residuals under MR-A (Lance v7.x bump) and Lance #6666.
Cross-links to docs/dev/lance.md and docs/dev/writes.md so the framing stays
co-located with the canonical Lance surface tracking.
* MR-854: remove dead inline-commit methods from the storage surface
The loader concurrent fast-path (write_batch_to_dataset) is only reached
for LoadMode::Overwrite — Append/Merge route through MutationStaging — so
its Append/Merge arms were unreachable. Collapse it to overwrite-only and
drop the now-unused mode params, which removes the only callers of:
- TableStorage::append_batch + TableStorage::merge_insert_batches (trait)
- TableStore::merge_insert_batch + merge_insert_batches (inherent)
create_btree_index / create_inverted_index had zero callers anywhere
(scalar index builds use the stage_* primitives). Remove both from the
trait and the inherent impl.
Inherent append_batch stays pub(crate): overwrite_batch and recovery
tests use it. Migrate the one trait-append_batch test caller
(seed_person_row) to stage_append + commit_staged. The merge_insert
FirstSeen-workaround rationale moves from the deleted merge_insert_batch
into stage_merge_insert (now the sole merge path). No behavior change.
Also corrects the inaccurate loader residual comment (the prior text
blamed Lance #6658/#6666, which are the delete and vector-index issues,
for keeping overwrite inline; a stage_overwrite primitive already exists
and schema_apply uses it).
* MR-854: seal db.storage() to staged-only; move residuals to InlineCommitResidual
Split the three remaining inline-commit writes (overwrite_batch,
delete_where, create_vector_index) off the TableStorage trait onto a new
sealed InlineCommitResidual trait, reachable only via the explicit
Omnigraph::storage_inline_residual() accessor. db.storage() now exposes
only staged primitives + reads, so engine code cannot couple a write
with a Lance HEAD advance through the default surface — MR-793 acceptance
§1 ("no public method commits as a side effect of writing") now holds by
construction, not by review + naming.
Call sites moved to storage_inline_residual(): loader overwrite
fast-path, the three mutation delete_where paths, the branch-merge
delete, and the vector-index build. Impl bodies are unchanged (same
delegation to the pub(crate) inherent methods); this is a pure surface
reshape with no behavior change.
The residual trait holds two genuinely upstream-blocked methods
(delete_where -> Lance #6658/v7.x, create_vector_index -> Lance #6666)
plus overwrite_batch, kept for the loader's cross-table bulk-overwrite
concurrency until its staged migration lands (tracked follow-up).
* MR-854 docs: describe the staged-only seal; fix stale Lance index URLs
- writes.md / invariants.md / AGENTS.md: the inline-commit residuals now
live on InlineCommitResidual behind db.storage_inline_residual(), so
acceptance §1 holds by construction rather than 'option (b)' per-method
enumeration. Drop the inaccurate 'until Lance exposes
Operation::Overwrite { fragments }' claim (that op exists; stage_overwrite
already builds it) and reframe overwrite_batch as a removable legacy
residual gated on the loader's bulk-overwrite concurrency.
- forbidden_apis.rs: rewrite the allow-list doc for the split surface.
- lance.md: the index spec pages moved from /format/table/index/ to
/format/index/ in Lance 6.x (the old paths 404). Fix all 13 URLs.
* MR-854: fix stale lance-4.0.0 comment refs flagged in review
Addresses greptile (exec/merge.rs) and aaltshuler's stale-version blocker:
update lance-4.0.0 -> 6.0.1 in the comment/doc refs within this PR's
footprint (exec/merge.rs, exec/mutation.rs, docs/dev/writes.md). Also
corrects exec/merge.rs to cite lance#6666 (not #6658) for
build_index_metadata_from_segments — that is the vector-index segment-commit
API; #6658 is the two-phase delete. (Pre-existing 4.0.0 refs in untouched
files like architecture.md/storage.md are main's incomplete migration
cleanup, left out of scope.)
* fix(storage): stage loader overwrites
* fix(storage): stage empty schema rewrites
---------
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Ragnor Comerford <ragnor.comerford@gmail.com>
Co-authored-by: Ragnor Comerford <hello@ragnor.co>
2026-06-09 23:03:08 +02:00
|
|
|
#[tokio::test]
|
|
|
|
|
async fn recovery_rolls_forward_load_overwrite() {
|
|
|
|
|
use omnigraph::loader::{LoadMode, load_jsonl};
|
|
|
|
|
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
let operation_id;
|
|
|
|
|
let parent_commit_id;
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
|
load_jsonl(&mut db, helpers::TEST_DATA, LoadMode::Overwrite)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
parent_commit_id = branch_head_commit_id(dir.path(), "main").await.unwrap();
|
|
|
|
|
|
|
|
|
|
let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return");
|
|
|
|
|
let err = db
|
|
|
|
|
.load(
|
|
|
|
|
"main",
|
|
|
|
|
r#"{"type":"Person","data":{"name":"OverwriteLoad","age":41}}
|
|
|
|
|
"#,
|
|
|
|
|
LoadMode::Overwrite,
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: mutation.post_finalize_pre_publisher"),
|
|
|
|
|
"unexpected error: {err}"
|
|
|
|
|
);
|
|
|
|
|
operation_id = single_sidecar_operation_id(dir.path());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows(&db, "node:Person").await,
|
|
|
|
|
1,
|
|
|
|
|
"overwrite row must be visible after recovery rolls the load forward"
|
|
|
|
|
);
|
|
|
|
|
drop(db);
|
|
|
|
|
|
|
|
|
|
assert_post_recovery_invariants(
|
|
|
|
|
dir.path(),
|
|
|
|
|
&operation_id,
|
|
|
|
|
RecoveryExpectation::RolledForward {
|
|
|
|
|
tables: vec![
|
|
|
|
|
TableExpectation::main("node:Person")
|
|
|
|
|
.expected_recovery_parent_commit_id(parent_commit_id)
|
|
|
|
|
.follow_up_mutation(FollowUpMutation::new(
|
|
|
|
|
"main",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
mixed_params(&[("$name", "AfterOverwriteLoad")], &[("$age", 42)]),
|
|
|
|
|
)),
|
|
|
|
|
],
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows(&db, "node:Person").await,
|
|
|
|
|
2,
|
|
|
|
|
"follow-up mutation must succeed after overwrite load recovery"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-05 16:04:48 +02:00
|
|
|
#[tokio::test]
|
|
|
|
|
async fn recovery_rolls_forward_ensure_indices_on_feature_branch() {
|
chore(lance): bump 4.0.0 → 6.0.1 (DataFusion 52→53, Arrow 57→58) (#111)
* tests: add lance_surface_guards pre-flight pins for the v6 bump
Land 8 named guards in a new test file that pin Lance API surfaces
OmniGraph relies on. Each guard turns a silent-break risk (variant
rename, struct restructure, async-flip) into a red CI bar instead of
runtime drift.
Guards (mapped to the silent-break inventory from the v6 migration plan):
Runtime (#[tokio::test]):
1. lance_error_too_much_write_contention_variant_exists — pins the
variant referenced by db/manifest/publisher.rs::map_lance_publish_error.
2. manifest_location_field_shape — pins .path/.size/.e_tag/.naming_scheme
types and ManifestLocation accessor returning &Self (the access
pattern at db/manifest/metadata.rs:84-88).
6. write_params_default_does_not_set_storage_version — confirms our
explicit V2_2 pin remains load-bearing (blob v2 requirement).
Compile-only async fns (#[allow(...)] + unimplemented!() placeholders;
never run, but cargo build --tests enforces the API shape):
3. checkout_version + restore chain — pins the recovery rollback hammer
at db/manifest/recovery.rs:505-522.
4. DatasetBuilder::from_namespace().with_branch().with_version().load()
— pins the namespace builder chain at db/manifest/namespace.rs:162-174.
5. MergeInsertBuilder fluent chain — pins the manifest CAS at
db/manifest/publisher.rs:370-391, including the return shape
(Arc<Dataset>, MergeStats).
7. compact_files(&mut ds, CompactionOptions, None) — pins
db/omnigraph/optimize.rs:107.
8. DeleteResult { new_dataset, num_deleted_rows } — pins the inline
delete result shape (MR-A will repurpose this guard to the staged
two-phase variant once Lance #6658 migration lands).
This is commit 1 of the chore/lance-6.0.1 migration. Cargo bump
follows in commit 2 (will trigger the guards under v6 if any surface
drifted).
Per the migration plan at ~/.claude/plans/shimmering-percolating-duckling.md
(written this session). Two guards from the plan deferred to follow-up:
- manifest_cas_returns_row_level_contention_variant (full publisher
race integration test — needs harness scaffolding)
- table_version_metadata_byte_compatible_with_v4 (TableVersionMetadata
is pub(crate); requires test reach extension).
Verified on v4: cargo test -p omnigraph-engine --test lance_surface_guards
passes 3/3 runtime tests; cargo build -p omnigraph-engine --tests
compiles all 5 compile-only guards clean.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* chore(deps): bump Lance 4.0.0 → 6.0.1, DataFusion 52 → 53, Arrow 57 → 58
The Cargo bump itself. Source is intentionally untouched — this commit
will not compile. The compile errors are the work-list for subsequent
commits on this branch.
Lance updates: lance + 7 sub-crates 4.0.0 → 6.0.1. Transitive churn:
+ lance-tokenizer v6.0.1 (vendored tokenizer per Lance PR #6512)
+ object_store 0.13.x (Lance 6 brings it transitively; our explicit
pin stays at 0.12.5 for now — revisit in stages if diamond bites)
- tantivy* crates (replaced by lance-tokenizer)
Compile error landscape on this commit (11 errors):
• 1× E0432: `lance_index::DatasetIndexExt` import (Lance PR #6280
moved it to lance::index). Sites: table_store.rs:20,
db/manifest.rs:37 (the second site was missed by the pre-flight
inventory).
• 8× E0599: `create_index_builder` / `load_indices` missing on
`lance::Dataset` — all downstream of the DatasetIndexExt move.
Once the import is corrected on table_store.rs and db/manifest.rs,
these resolve automatically.
• 2× E0063: missing field `is_only_declared` in `DescribeTableResponse`
initializer at db/manifest/namespace.rs:221, 364. New Lance
namespace field per the v5 namespace restructure (PR #6186).
Surface guards (lance_surface_guards.rs, commit d571fa8) all still
compile + the 3 runtime ones pass on v6 — none of the silent-break
surfaces drifted. That's the load-bearing observation: the publisher
CAS chain, ManifestLocation field shape, checkout_version/restore,
DatasetBuilder fluent chain, MergeInsertBuilder return shape,
WriteParams::default, compact_files signature, and DeleteResult
fields are all v6-stable.
Next commits address the 11 errors per the migration plan stages
3-8.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* imports: move DatasetIndexExt to lance::index (Lance PR #6280)
Lance 5.0 (PR #6280) moved `DatasetIndexExt` out of `lance-index` into
`lance::index`. `is_system_index` and `IndexType` stayed in `lance-index`.
Mechanical update of 6 import sites:
crates/omnigraph/src/table_store.rs:20 — split into two `use` lines
crates/omnigraph-server/tests/server.rs:10 — was traits::DatasetIndexExt
crates/omnigraph/tests/search.rs:6
crates/omnigraph/tests/branching.rs:7
crates/omnigraph/tests/failpoints.rs:467
crates/omnigraph-cli/tests/cli.rs:3 — was traits::DatasetIndexExt
All 9 E0599 cascading errors on .create_index_builder / .load_indices
resolve once the trait is back in scope.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* namespace: add is_only_declared field to DescribeTableResponse
Lance namespace 6.0.0 added `is_only_declared: Option<bool>` to
`DescribeTableResponse` (lance-namespace-reqwest-client 0.7+ via the
v5.0 namespace API restructure, Lance PR #6186). Set to `Some(false)`
because every table BranchManifestNamespace returns from describe_table
is materialized — the manifest snapshot only includes entries for
tables we've already opened via Dataset::open.
Two sites in db/manifest/namespace.rs (BranchManifestNamespace +
StagedTableNamespace impls of LanceNamespace::describe_table).
Closes the last two compile errors from the v6 bump in the engine lib.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* cargo: add lance to omnigraph-cli + omnigraph-server dev-deps
Stage 3 moved DatasetIndexExt imports from `lance-index` to `lance::index`
in the cli and server test crates. Both crates only had `lance-index`
in their dev-dependencies; add `lance` alongside so the new path
resolves.
This is the last compile-error fix from the v6 bump — `cargo build
--workspace --tests` is now green.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* docs: refresh Lance alignment audit for v6.0.1; bump surveyed version
Per CLAUDE.md maintenance rule 2 (same-PR docs):
- docs/dev/lance.md: replace the v4.0.1 alignment audit stanza with
the v6.0.1 audit. Captures every v5/v6 finding from this PR (the
DatasetIndexExt move, DescribeTableResponse.is_only_declared,
MergeInsertBuilder return shape, ManifestLocation field shape,
LanceFileVersion::default flip, file-reader async, tokenizer
vendor, Lance #6658/#6666/#6877 status). Cross-references each
guard in tests/lance_surface_guards.rs.
- AGENTS.md: bump "Storage substrate: Lance 4.x" → "Lance 6.x".
Note: surveyed crate version stays at 0.4.2 — substrate version
bumps are independent of OmniGraph's release version.
- crates/omnigraph/src/storage_layer.rs: update the trait module-level
doc-comment to reflect that Lance #6658 closed 2026-05-14 and
delete_where two-phase migration is MR-A (the next follow-up).
#6666 stays open; create_vector_index inline residual stays.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* tests: silence clippy::diverging_sub_expression on compile-only guards
The five `_compile_*` async fns in lance_surface_guards.rs use
`let ds: Dataset = unimplemented!()` as a placeholder so type inference
can chase the method chain we want to pin, without ever running the
function. Clippy's `diverging_sub_expression` lint flags this pattern
because the RHS diverges; that's the entire point. Added to the
per-fn `#[allow(...)]` list, alongside dead_code / unreachable_code /
unused_variables / unused_mut already there.
No behavior change. cargo test -p omnigraph-engine --test
lance_surface_guards still 3/3 green.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* docs: correct #6658 status — closed but API ships in Lance v7.x, not v6.0.1
The audit stanza in docs/dev/lance.md and the storage_layer.rs trait
doc-comment both implied the public DeleteBuilder::execute_uncommitted
API shipped with Lance 6.0.1. It did not. Issue #6658 closed
2026-05-14, but binary search across the release stream confirms:
v6.0.1 ❌ no pub async fn execute_uncommitted on DeleteBuilder
v6.1.0-rc.1 ❌
v7.0.0-beta.5 ❌
v7.0.0-beta.10 ✅ first appearance
v7.0.0-rc.1 ✅
So MR-A (delete two-phase migration) is gated on the Lance v7.x bump,
not on this PR. v7.0.0-rc.1 dropped 2026-05-21; GA likely within a
week.
No behavior change. Doc-only correction.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* ci(lib): bump recursion_limit to 256 — Lance 6 trait depth on Linux
Lance 6's heavier trait surface around futures/streams in storage_layer.rs's
staged-write API pushes the rustc trait-resolution recursion limit past
the default 128 on Linux builds. CI on PR #111 surfaced this in both
`Test Workspace` and `Test omnigraph-server --features aws`:
error: queries overflow the depth limit!
= help: consider increasing the recursion limit by adding a
`#![recursion_limit = "256"]` attribute to your crate (`omnigraph`)
= note: query depth increased by 130 when computing layout of
`{async block@crates/omnigraph/src/storage_layer.rs:697:5: 697:10}`
(The async block is `stage_create_btree_index`'s body — its return type
is several layers of `impl Future<Output=Result<StagedHandle>>` deep on
top of Lance's own builder return types.)
Local macOS builds happened to short-circuit before tripping the limit,
which is why this didn't surface during the v6 bump sequence. The fix
rustc itself suggests is one line at the crate root.
No behavior change. Revisit if a future Lance bump stops needing it.
Verified: `cargo build --locked -p omnigraph-server --features aws`
compiles clean.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---------
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 00:42:29 +01:00
|
|
|
use lance::index::DatasetIndexExt;
|
2026-05-05 16:04:48 +02:00
|
|
|
use omnigraph::loader::{LoadMode, load_jsonl};
|
|
|
|
|
use omnigraph::table_store::TableStore;
|
|
|
|
|
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
let operation_id;
|
|
|
|
|
let feature_parent_commit_id;
|
|
|
|
|
let main_person_pin;
|
|
|
|
|
|
|
|
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
|
load_jsonl(
|
|
|
|
|
&mut db,
|
|
|
|
|
r#"{"type":"Person","data":{"name":"alice","age":30}}
|
|
|
|
|
"#,
|
|
|
|
|
LoadMode::Append,
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
db.branch_create("feature").await.unwrap();
|
|
|
|
|
db.mutate(
|
|
|
|
|
"feature",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "BeforeEnsure")], &[("$age", 42)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
main_person_pin = db
|
|
|
|
|
.snapshot_of(omnigraph::db::ReadTarget::branch("main"))
|
|
|
|
|
.await
|
|
|
|
|
.unwrap()
|
|
|
|
|
.entry("node:Person")
|
|
|
|
|
.expect("main must have Person")
|
|
|
|
|
.table_version;
|
|
|
|
|
|
|
|
|
|
// Make the feature branch's Person table genuinely need index work
|
|
|
|
|
// while keeping the manifest internally consistent. The test-only
|
|
|
|
|
// publisher deliberately skips the normal index-rebuild preparation;
|
|
|
|
|
// the failed writer below is still the real `ensure_indices_on`.
|
|
|
|
|
let person_uri = node_table_uri(&uri, "Person");
|
|
|
|
|
let store = TableStore::new(&uri);
|
|
|
|
|
let mut ds = store
|
|
|
|
|
.open_dataset_head(&person_uri, Some("feature"))
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
ds.drop_index("id_idx").await.unwrap();
|
|
|
|
|
let dropped_index_head = ds.version().version;
|
|
|
|
|
db.failpoint_publish_table_head_without_index_rebuild_for_test(
|
|
|
|
|
"feature",
|
|
|
|
|
"node:Person",
|
|
|
|
|
Some("feature"),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
let feature_snapshot = db
|
|
|
|
|
.snapshot_of(omnigraph::db::ReadTarget::branch("feature"))
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
feature_snapshot
|
|
|
|
|
.entry("node:Person")
|
|
|
|
|
.expect("feature must have Person")
|
|
|
|
|
.table_version,
|
|
|
|
|
dropped_index_head,
|
|
|
|
|
"test setup must publish the dropped-index table head before ensure_indices runs",
|
|
|
|
|
);
|
|
|
|
|
feature_parent_commit_id = branch_head_commit_id(dir.path(), "feature").await.unwrap();
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
let _failpoint =
|
|
|
|
|
ScopedFailPoint::new("ensure_indices.post_phase_b_pre_manifest_commit", "return");
|
|
|
|
|
let err = db.ensure_indices_on("feature").await.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string().contains(
|
|
|
|
|
"injected failpoint triggered: ensure_indices.post_phase_b_pre_manifest_commit"
|
|
|
|
|
),
|
|
|
|
|
"unexpected error: {err}"
|
|
|
|
|
);
|
|
|
|
|
operation_id = single_sidecar_operation_id(dir.path());
|
|
|
|
|
}
|
|
|
|
|
drop(db);
|
|
|
|
|
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows_branch(&db, "feature", "node:Person").await,
|
|
|
|
|
2,
|
|
|
|
|
"feature should see inherited alice plus recovered branch-local row"
|
|
|
|
|
);
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows(&db, "node:Person").await,
|
|
|
|
|
1,
|
|
|
|
|
"ensure_indices branch recovery must not move main"
|
|
|
|
|
);
|
|
|
|
|
drop(db);
|
|
|
|
|
|
|
|
|
|
assert_post_recovery_invariants(
|
|
|
|
|
dir.path(),
|
|
|
|
|
&operation_id,
|
|
|
|
|
RecoveryExpectation::RolledForward {
|
|
|
|
|
tables: vec![
|
|
|
|
|
TableExpectation::branch("node:Person", "feature")
|
|
|
|
|
.expected_main_manifest_pin(main_person_pin)
|
|
|
|
|
.expected_recovery_parent_commit_id(feature_parent_commit_id)
|
|
|
|
|
.follow_up_mutation(FollowUpMutation::new(
|
|
|
|
|
"feature",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
mixed_params(&[("$name", "AfterEnsure")], &[("$age", 44)]),
|
|
|
|
|
)),
|
|
|
|
|
],
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows_branch(&db, "feature", "node:Person").await,
|
|
|
|
|
3,
|
|
|
|
|
"follow-up feature mutation must succeed after ensure_indices recovery"
|
|
|
|
|
);
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows(&db, "node:Person").await,
|
|
|
|
|
1,
|
|
|
|
|
"follow-up feature mutation must not move main"
|
2026-05-01 13:47:55 +02:00
|
|
|
);
|
2026-05-04 00:15:42 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Refresh-time recovery (Option B): the in-process `Omnigraph::refresh`
|
|
|
|
|
/// runs roll-forward-only recovery, closing the long-running-server
|
|
|
|
|
/// residual without restart.
|
|
|
|
|
///
|
|
|
|
|
/// Setup: trigger `mutation.post_finalize_pre_publisher` once. The
|
|
|
|
|
/// sidecar persists. Without dropping the engine, call `db.refresh()`.
|
|
|
|
|
/// The post-condition: sidecar gone; Eve visible; subsequent mutation
|
|
|
|
|
/// on the same handle succeeds without restart and without
|
|
|
|
|
/// ExpectedVersionMismatch.
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn refresh_runs_roll_forward_recovery_in_process() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
|
|
|
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
// Setup: trigger the residual (sidecar persists; manifest unchanged).
|
2026-05-04 00:15:42 +02:00
|
|
|
{
|
2026-05-05 16:04:48 +02:00
|
|
|
let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return");
|
2026-05-04 00:15:42 +02:00
|
|
|
let err = mutate_main(
|
|
|
|
|
&mut db,
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap_err();
|
|
|
|
|
assert!(
|
2026-05-05 16:04:48 +02:00
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: mutation.post_finalize_pre_publisher"),
|
2026-05-04 00:15:42 +02:00
|
|
|
"unexpected error: {err}"
|
|
|
|
|
);
|
|
|
|
|
let recovery_dir = dir.path().join("__recovery");
|
|
|
|
|
assert_eq!(
|
|
|
|
|
std::fs::read_dir(&recovery_dir).unwrap().count(),
|
|
|
|
|
1,
|
|
|
|
|
"exactly one sidecar must persist after the finalize failure"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
// Recovery: explicit refresh runs roll-forward-only recovery
|
2026-05-04 00:15:42 +02:00
|
|
|
// in-process — no restart needed. Sidecar finds the Person drift,
|
|
|
|
|
// classifies RolledPastExpected, rolls forward via publisher CAS,
|
|
|
|
|
// and deletes the sidecar.
|
|
|
|
|
db.refresh().await.expect("refresh must succeed");
|
|
|
|
|
|
|
|
|
|
// Sidecar must be gone — refresh-time recovery rolled it forward.
|
|
|
|
|
let recovery_dir = dir.path().join("__recovery");
|
|
|
|
|
if recovery_dir.exists() {
|
|
|
|
|
let remaining: Vec<_> = std::fs::read_dir(&recovery_dir)
|
|
|
|
|
.unwrap()
|
|
|
|
|
.filter_map(|e| e.ok())
|
|
|
|
|
.collect();
|
|
|
|
|
assert!(
|
|
|
|
|
remaining.is_empty(),
|
|
|
|
|
"sidecar must be deleted by refresh-time roll-forward; remaining: {:?}",
|
|
|
|
|
remaining,
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Eve (the originally-attempted insert) is visible without restart.
|
|
|
|
|
let person_count = helpers::count_rows(&db, "node:Person").await;
|
|
|
|
|
assert_eq!(
|
|
|
|
|
person_count, 1,
|
|
|
|
|
"Eve must be visible after refresh-time roll-forward"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// A direct Person mutation also succeeds without ExpectedVersionMismatch.
|
|
|
|
|
mutate_main(
|
|
|
|
|
&mut db,
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Frank")], &[("$age", 33)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.expect("Person insert must succeed after refresh-time recovery");
|
|
|
|
|
assert_eq!(helpers::count_rows(&db, "node:Person").await, 2);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Refresh-time recovery must NOT call `Dataset::restore` — it can
|
|
|
|
|
/// silently orphan a concurrent writer's commit. Sidecars that would
|
|
|
|
|
/// require rollback must be left on disk for the next ReadWrite open.
|
|
|
|
|
///
|
|
|
|
|
/// Setup: synthesize a sidecar that would classify as `UnexpectedAtP1`
|
|
|
|
|
/// (rollback territory) — strict-match Mutation kind with
|
|
|
|
|
/// expected_version != manifest_pinned. Trigger refresh and assert:
|
|
|
|
|
/// sidecar still on disk, Lance HEAD unchanged (no restore commit).
|
|
|
|
|
/// Then drop + open: full sweep handles it.
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn refresh_defers_rollback_eligible_sidecar_to_next_open() {
|
|
|
|
|
use omnigraph::loader::{LoadMode, load_jsonl};
|
|
|
|
|
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
|
|
|
|
|
// Bootstrap.
|
|
|
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
|
load_jsonl(
|
|
|
|
|
&mut db,
|
|
|
|
|
r#"{"type":"Person","data":{"name":"alice","age":30}}
|
|
|
|
|
"#,
|
|
|
|
|
LoadMode::Append,
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
// Capture Person's full URI and manifest pin.
|
|
|
|
|
let snapshot = db
|
|
|
|
|
.snapshot_of(omnigraph::db::ReadTarget::branch("main"))
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
let entry = snapshot.entry("node:Person").unwrap();
|
|
|
|
|
let person_uri = format!("{}/{}", uri.trim_end_matches('/'), entry.table_path);
|
|
|
|
|
let manifest_pin = entry.table_version;
|
|
|
|
|
|
|
|
|
|
// Drift Person's Lance HEAD ahead of the manifest pin (without
|
|
|
|
|
// touching the manifest) so the classifier can reach UnexpectedAtP1
|
|
|
|
|
// / UnexpectedMultistep / RolledPastExpected paths that require
|
|
|
|
|
// a real restore on rollback.
|
|
|
|
|
let mut ds = lance::Dataset::open(&person_uri).await.unwrap();
|
(feat) convert engine call sites to &dyn TableStorage; demote legacy TableStore methods to pub(crate) (#86)
* MR-854: convert engine call sites to &dyn TableStorage; demote legacy methods
Phase 1b: every db.table_store.X(...) call site converts to
db.storage().X(...), reaching the storage layer through the sealed
TableStorage trait (returns &dyn TableStorage). Opaque SnapshotHandle
and StagedHandle replace bare lance::Dataset and Transaction in the
threaded values.
Phase 9: the inherent inline-commit methods on TableStore
(append_batch, merge_insert_batch{,es}, overwrite_batch,
create_btree_index, create_inverted_index) demote from pub to
pub(crate). Their only remaining direct users are table_store.rs
itself and the bulk loader's LoadMode::{Append, Overwrite, Merge}
concurrent fast-paths in loader::write_batch_to_dataset (no
two-phase shape in Lance 4.0.0 — closes after lance#6658 and #6666).
Docs:
- invariants.md \u00a7VI.23: drop "at the writer-trait surface"
qualifier; staged primitives are now the only engine surface.
- runs.md: residual matrix shrinks to delete_where and
create_vector_index (the two upstream-blocked residuals).
- forbidden_apis.rs: replace transitional language with the
current allow-list shape (table_store.rs + loader concurrent
fast-path only).
Files touched:
- changes/mod.rs, db/omnigraph.rs (+export/optimize/schema_apply/
table_ops.rs), exec/{merge,mod,mutation,staging}.rs,
loader/mod.rs, storage_layer.rs, table_store.rs,
tests/forbidden_apis.rs, docs/{invariants,runs}.md.
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
* MR-854: replace test-only inline-commit append callers with local Lance helpers
After demoting TableStore::append_batch from pub to pub(crate), the
integration tests in tests/recovery.rs and tests/staged_writes.rs
that previously called store.append_batch(...) directly to simulate
HEAD-ahead-of-manifest drift can no longer access the inherent
method. Replace those calls with small in-test helpers that do a raw
Dataset::append (the same body the inherent method runs).
- tests/helpers/mod.rs gains lance_append_inline (shared helper).
- tests/staged_writes.rs gets a file-local lance_append_inline_local
(staged_writes.rs does not import helpers::).
- tests/recovery.rs drops the unused TableStore import in the one
function whose store binding became unused after the conversion.
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
* MR-854: retrigger CI for flaky Test Workspace job
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
* MR-854: convert remaining table_store call sites in export.rs / read_blob
Two leftover `self.table_store.X` / `db.table_store.X` call sites were
missed in the initial sweep — flagged by Devin Review on PR #86. Both
now go through the trait surface:
- `entity_from_snapshot` (db/omnigraph/export.rs): switch from
`db.table_store.open_snapshot_table` + `db.table_store.scan` to
`db.storage().open_snapshot_at_table` + `db.storage().scan`.
- `read_blob` (db/omnigraph.rs): replace
`snapshot.open(table_key)` + `self.table_store.first_row_id_for_filter`
with `self.storage().open_snapshot_at_table` +
`self.storage().first_row_id_for_filter`. The follow-up
`take_blobs` call still needs an `Arc<Dataset>` (it's a Lance blob
accessor not surfaced through the trait), so we hand off via
`SnapshotHandle::into_arc()` with a comment.
After this commit, no engine code outside `table_store.rs` reaches the
inherent `TableStore` API — the docs/runs.md and docs/invariants.md
claim is now uniformly true.
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
* MR-854: post-rebase doc fixes (Lance 6.0.1, MR-A framing, into_dataset note)
Reviewer feedback on the rebased PR:
* docs/dev/writes.md residuals matrix: drop demoted methods from the trait-surface table (now `pub(crate)`); keep only the two genuine trait-surface residuals (`delete_where`, `create_vector_index`); reframe under MR-A (Lance v7.x bump) per docs/dev/lance.md.
* tests/forbidden_apis.rs: update transitional allow-list header to (a) drop the truncate_table mislabel (truncate_table is a Lance Dataset method, not a TableStore method — overwrite_batch's internal call), (b) reframe trait-surface residuals under MR-A / Lance #6666.
* crates/omnigraph/src/storage_layer.rs::SnapshotHandle::{into_arc, into_dataset}: add single-ref invariant doc — both consume Arc via try_unwrap-or-clone; sibling SnapshotHandle clones across an await point force a deep Dataset clone.
* Replace lance-4.0.0 version refs with lance-6.0.1 in active source/test/dev-doc comments (storage_layer.rs, table_store.rs, table_ops.rs, schema_apply.rs, merge.rs, recovery.rs, staged_writes.rs, consistency.rs, docs/dev/execution.md, docs/user/query-language.md). Historical refs in docs/releases/v0.4.1.md and the canonical "Lance 4.0.0 → 6.0.1 migration" line in docs/dev/lance.md left intact.
No engine code changes.
* MR-854: update docs/dev/invariants.md Storage trait row + gap entry
Reviewer feedback: the docs reorg landed; the invariant row now lives in
docs/dev/invariants.md with stable headings (no more numbered §VI.23).
Update two pieces to reflect MR-854 completion:
* Status table 'Storage trait' row: was 'full call-site migration ... incomplete';
now 'engine call sites all route through db.storage() (MR-854); inline-commit
inherent methods are pub(crate)-demoted; capability/stat surfaces are roadmap'.
* 'Known Gaps' 'Storage abstraction' entry: was 'older inherent TableStore call
sites and inline residuals remain'; now names the closed scope (MR-854 — call
sites migrated, methods demoted, loader fast-paths) and the remaining
trait-surface residuals under MR-A (Lance v7.x bump) and Lance #6666.
Cross-links to docs/dev/lance.md and docs/dev/writes.md so the framing stays
co-located with the canonical Lance surface tracking.
* MR-854: remove dead inline-commit methods from the storage surface
The loader concurrent fast-path (write_batch_to_dataset) is only reached
for LoadMode::Overwrite — Append/Merge route through MutationStaging — so
its Append/Merge arms were unreachable. Collapse it to overwrite-only and
drop the now-unused mode params, which removes the only callers of:
- TableStorage::append_batch + TableStorage::merge_insert_batches (trait)
- TableStore::merge_insert_batch + merge_insert_batches (inherent)
create_btree_index / create_inverted_index had zero callers anywhere
(scalar index builds use the stage_* primitives). Remove both from the
trait and the inherent impl.
Inherent append_batch stays pub(crate): overwrite_batch and recovery
tests use it. Migrate the one trait-append_batch test caller
(seed_person_row) to stage_append + commit_staged. The merge_insert
FirstSeen-workaround rationale moves from the deleted merge_insert_batch
into stage_merge_insert (now the sole merge path). No behavior change.
Also corrects the inaccurate loader residual comment (the prior text
blamed Lance #6658/#6666, which are the delete and vector-index issues,
for keeping overwrite inline; a stage_overwrite primitive already exists
and schema_apply uses it).
* MR-854: seal db.storage() to staged-only; move residuals to InlineCommitResidual
Split the three remaining inline-commit writes (overwrite_batch,
delete_where, create_vector_index) off the TableStorage trait onto a new
sealed InlineCommitResidual trait, reachable only via the explicit
Omnigraph::storage_inline_residual() accessor. db.storage() now exposes
only staged primitives + reads, so engine code cannot couple a write
with a Lance HEAD advance through the default surface — MR-793 acceptance
§1 ("no public method commits as a side effect of writing") now holds by
construction, not by review + naming.
Call sites moved to storage_inline_residual(): loader overwrite
fast-path, the three mutation delete_where paths, the branch-merge
delete, and the vector-index build. Impl bodies are unchanged (same
delegation to the pub(crate) inherent methods); this is a pure surface
reshape with no behavior change.
The residual trait holds two genuinely upstream-blocked methods
(delete_where -> Lance #6658/v7.x, create_vector_index -> Lance #6666)
plus overwrite_batch, kept for the loader's cross-table bulk-overwrite
concurrency until its staged migration lands (tracked follow-up).
* MR-854 docs: describe the staged-only seal; fix stale Lance index URLs
- writes.md / invariants.md / AGENTS.md: the inline-commit residuals now
live on InlineCommitResidual behind db.storage_inline_residual(), so
acceptance §1 holds by construction rather than 'option (b)' per-method
enumeration. Drop the inaccurate 'until Lance exposes
Operation::Overwrite { fragments }' claim (that op exists; stage_overwrite
already builds it) and reframe overwrite_batch as a removable legacy
residual gated on the loader's bulk-overwrite concurrency.
- forbidden_apis.rs: rewrite the allow-list doc for the split surface.
- lance.md: the index spec pages moved from /format/table/index/ to
/format/index/ in Lance 6.x (the old paths 404). Fix all 13 URLs.
* MR-854: fix stale lance-4.0.0 comment refs flagged in review
Addresses greptile (exec/merge.rs) and aaltshuler's stale-version blocker:
update lance-4.0.0 -> 6.0.1 in the comment/doc refs within this PR's
footprint (exec/merge.rs, exec/mutation.rs, docs/dev/writes.md). Also
corrects exec/merge.rs to cite lance#6666 (not #6658) for
build_index_metadata_from_segments — that is the vector-index segment-commit
API; #6658 is the two-phase delete. (Pre-existing 4.0.0 refs in untouched
files like architecture.md/storage.md are main's incomplete migration
cleanup, left out of scope.)
* fix(storage): stage loader overwrites
* fix(storage): stage empty schema rewrites
---------
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Ragnor Comerford <ragnor.comerford@gmail.com>
Co-authored-by: Ragnor Comerford <hello@ragnor.co>
2026-06-09 23:03:08 +02:00
|
|
|
helpers::lance_delete_inline(&mut ds, "1 = 2").await;
|
2026-05-04 00:15:42 +02:00
|
|
|
let head_after_drift = ds.version().version;
|
|
|
|
|
assert_eq!(head_after_drift, manifest_pin + 1);
|
|
|
|
|
|
|
|
|
|
// Synthesize a sidecar with expected_version that DOES NOT match
|
|
|
|
|
// the current manifest pin AND post_commit_pin == lance_head →
|
|
|
|
|
// strict Mutation classifier sees lance_head == manifest_pinned + 1
|
|
|
|
|
// but expected != manifest_pinned → UnexpectedAtP1. decide → RollBack.
|
|
|
|
|
//
|
|
|
|
|
// expected_version must be a REAL Lance version (`restore_table_to_version`
|
|
|
|
|
// calls `checkout_version` on it, and an unknown version errors). Use
|
|
|
|
|
// manifest_pin - 1 which exists from the bootstrap commit chain.
|
|
|
|
|
let bogus_expected = manifest_pin - 1;
|
|
|
|
|
let bogus_post = head_after_drift;
|
|
|
|
|
let sidecar_json = format!(
|
|
|
|
|
r#"{{
|
|
|
|
|
"schema_version": 1,
|
|
|
|
|
"operation_id": "01H0000000000000000000RBCK",
|
|
|
|
|
"started_at": "0",
|
|
|
|
|
"branch": null,
|
|
|
|
|
"actor_id": "act-rollback",
|
|
|
|
|
"writer_kind": "Mutation",
|
|
|
|
|
"tables": [
|
|
|
|
|
{{
|
|
|
|
|
"table_key":"node:Person",
|
|
|
|
|
"table_path":"{}",
|
|
|
|
|
"expected_version":{},
|
|
|
|
|
"post_commit_pin":{}
|
|
|
|
|
}}
|
|
|
|
|
]
|
|
|
|
|
}}"#,
|
|
|
|
|
person_uri, bogus_expected, bogus_post,
|
|
|
|
|
);
|
|
|
|
|
let recovery_dir = dir.path().join("__recovery");
|
|
|
|
|
std::fs::create_dir_all(&recovery_dir).unwrap();
|
|
|
|
|
std::fs::write(
|
|
|
|
|
recovery_dir.join("01H0000000000000000000RBCK.json"),
|
|
|
|
|
&sidecar_json,
|
|
|
|
|
)
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
// Capture pre-refresh Lance HEAD on Person.
|
|
|
|
|
let pre_head = lance::Dataset::open(&person_uri)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap()
|
|
|
|
|
.version()
|
|
|
|
|
.version;
|
|
|
|
|
|
|
|
|
|
// Trigger refresh-time recovery directly. Sidecar is rollback-
|
|
|
|
|
// eligible (UnexpectedAtP1); RollForwardOnly mode defers it,
|
|
|
|
|
// leaving the sidecar on disk and Lance HEAD unchanged on Person.
|
2026-05-05 16:04:48 +02:00
|
|
|
db.refresh()
|
|
|
|
|
.await
|
|
|
|
|
.expect("refresh must succeed (deferring rollback)");
|
2026-05-04 00:15:42 +02:00
|
|
|
|
|
|
|
|
// Sidecar still on disk.
|
|
|
|
|
assert_eq!(
|
|
|
|
|
std::fs::read_dir(&recovery_dir).unwrap().count(),
|
|
|
|
|
1,
|
|
|
|
|
"rollback-eligible sidecar must be deferred to next ReadWrite open",
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Lance HEAD on Person unchanged — no restore ran.
|
|
|
|
|
let post_head = lance::Dataset::open(&person_uri)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap()
|
|
|
|
|
.version()
|
|
|
|
|
.version;
|
|
|
|
|
assert_eq!(
|
|
|
|
|
pre_head, post_head,
|
|
|
|
|
"refresh-time recovery must NOT call Dataset::restore on Person; \
|
|
|
|
|
pre_head={pre_head}, post_head={post_head}",
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Cross-check: drop the engine and reopen — full sweep handles
|
|
|
|
|
// the rollback (will use Dataset::restore safely; no concurrent
|
|
|
|
|
// writers at open time).
|
|
|
|
|
drop(db);
|
fix: optimize publishes compaction; recovery roll-back converges manifest (#141)
* test(optimize): cover manifest publish + HEAD-drift reconcile
Red against the pre-fix optimize, which ran compact_files without
publishing the compacted version to __manifest:
- maintenance: optimize must publish so the manifest table_version
tracks the compacted Lance HEAD and a later schema apply succeeds;
and must reconcile a pre-existing manifest-behind-HEAD drift (forged
via raw Lance compaction) so strict writes commit again.
- end_to_end + composite_flow: post-optimize query / strict update /
reopen in the full lifecycle (the canonical flow previously omitted
post-optimize writes as a documented "known limitation").
- failpoints: a crash between compaction and the manifest publish rolls
forward on next open.
* fix(optimize): publish compaction to manifest and reconcile HEAD drift
optimize ran Lance compact_files without publishing the new version to
__manifest, so the manifest table_version lagged the Lance HEAD: reads
stayed pinned to the pre-compaction version, and the next schema apply or
strict update/delete failed its HEAD-vs-manifest precondition with
"stale view ... refresh and retry" (open-time recovery rollback inflated
the gap on retry).
optimize now publishes each compacted table's version under the
per-(table, main) write queue, guarded by a manifest CAS and a
SidecarKind::Optimize recovery sidecar (loose-match; roll-forward is safe
because compaction is content-preserving). When a table has nothing left
to compact but its Lance HEAD is already ahead of the manifest pin
(pre-fix drift, or a recovery restore commit), optimize reconciles the
manifest forward to HEAD (metadata-only, no sidecar). Caches and the
CSR/CSC graph index are invalidated after a publish.
Docs updated (maintenance, storage, branches-commits, writes, testing).
* test(recovery): rollback convergence + optimize-defer regressions
Red against the current code, landed before the fix:
- recovery: after the open-time sweep rolls a sidecar back, the manifest
must track Lance HEAD (no residual drift) so a follow-up schema apply
succeeds — the original "+1 per retry" loop. Today roll-back restores
without publishing, so the manifest lags HEAD and the apply fails its
HEAD-vs-manifest precondition.
- maintenance: optimize must refuse while a recovery sidecar is pending —
operating on an unrecovered graph could publish a partial write the
sweep would roll back.
Also removes optimize_reconciles_preexisting_manifest_head_drift: the
ad-hoc drift reconcile it covered is replaced by recovery-side convergence.
* fix(recovery): converge manifest on roll-back; optimize defers on pending recovery
Root of PR #141's review findings and the original "+1 per retry" loop:
a Lance HEAD ahead of the manifest was ambiguous (benign content-preserving
drift vs. a partial write a sidecar will roll back), and optimize's reconcile
guessed it benign. Close the class instead of guessing:
- Recovery roll-back now PUBLISHES the restored version (via a
push_table_update_at_head helper shared with roll-forward), so the manifest
tracks the Lance HEAD after recovery — symmetric with roll-forward. This
fixes the +1 loop (after one roll-back the retry's HEAD-vs-manifest
precondition passes) and removes the only remaining source of orphaned
drift. The audit still records the logical rolled-back-to version; the
manifest is published at the restore commit (identical content).
- optimize drops the ad-hoc drift reconcile and instead REFUSES when a
__recovery sidecar is pending, so it only ever operates on a recovered
graph (manifest == HEAD); its compaction publish can no longer commit a
partial write. With the reconcile gone, the blob-skip-vs-reconcile gap is
moot.
Updates the rollback recovery-test helper (manifest == HEAD after roll-back),
the failpoints assertions, and the user/dev docs.
* test(recovery): fix rollback assertion for manifest convergence
The roll-back-publishes change makes the manifest version advance after a
SchemaApply roll-back (to the old-schema content), so the
schema_apply_without_schema_staging_rolls_back_on_next_open assertion must
be `version > pre`, not `version == pre`. This update was dropped during
the commit churn and surfaced as a CI Test Workspace failure; the
old-schema-preserved intent stays covered by count_rows + _schema.pg + the
RolledBack convergence invariant.
2026-06-08 01:50:12 +02:00
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
2026-05-04 00:15:42 +02:00
|
|
|
// After full-sweep recovery, the sidecar should be processed
|
|
|
|
|
// (deleted). Sidecar's tables are eligible for rollback (UnexpectedAtP1):
|
|
|
|
|
// restore happens on Person (HEAD advances by 1).
|
|
|
|
|
let remaining = if recovery_dir.exists() {
|
|
|
|
|
std::fs::read_dir(&recovery_dir).unwrap().count()
|
|
|
|
|
} else {
|
|
|
|
|
0
|
|
|
|
|
};
|
|
|
|
|
assert_eq!(
|
|
|
|
|
remaining, 0,
|
|
|
|
|
"full sweep at next open must process the deferred sidecar",
|
|
|
|
|
);
|
|
|
|
|
let final_head = lance::Dataset::open(&person_uri)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap()
|
|
|
|
|
.version()
|
|
|
|
|
.version;
|
|
|
|
|
assert!(
|
|
|
|
|
final_head > post_head,
|
|
|
|
|
"full sweep must run Dataset::restore (head advances); \
|
|
|
|
|
post_head={post_head}, final_head={final_head}",
|
|
|
|
|
);
|
fix: optimize publishes compaction; recovery roll-back converges manifest (#141)
* test(optimize): cover manifest publish + HEAD-drift reconcile
Red against the pre-fix optimize, which ran compact_files without
publishing the compacted version to __manifest:
- maintenance: optimize must publish so the manifest table_version
tracks the compacted Lance HEAD and a later schema apply succeeds;
and must reconcile a pre-existing manifest-behind-HEAD drift (forged
via raw Lance compaction) so strict writes commit again.
- end_to_end + composite_flow: post-optimize query / strict update /
reopen in the full lifecycle (the canonical flow previously omitted
post-optimize writes as a documented "known limitation").
- failpoints: a crash between compaction and the manifest publish rolls
forward on next open.
* fix(optimize): publish compaction to manifest and reconcile HEAD drift
optimize ran Lance compact_files without publishing the new version to
__manifest, so the manifest table_version lagged the Lance HEAD: reads
stayed pinned to the pre-compaction version, and the next schema apply or
strict update/delete failed its HEAD-vs-manifest precondition with
"stale view ... refresh and retry" (open-time recovery rollback inflated
the gap on retry).
optimize now publishes each compacted table's version under the
per-(table, main) write queue, guarded by a manifest CAS and a
SidecarKind::Optimize recovery sidecar (loose-match; roll-forward is safe
because compaction is content-preserving). When a table has nothing left
to compact but its Lance HEAD is already ahead of the manifest pin
(pre-fix drift, or a recovery restore commit), optimize reconciles the
manifest forward to HEAD (metadata-only, no sidecar). Caches and the
CSR/CSC graph index are invalidated after a publish.
Docs updated (maintenance, storage, branches-commits, writes, testing).
* test(recovery): rollback convergence + optimize-defer regressions
Red against the current code, landed before the fix:
- recovery: after the open-time sweep rolls a sidecar back, the manifest
must track Lance HEAD (no residual drift) so a follow-up schema apply
succeeds — the original "+1 per retry" loop. Today roll-back restores
without publishing, so the manifest lags HEAD and the apply fails its
HEAD-vs-manifest precondition.
- maintenance: optimize must refuse while a recovery sidecar is pending —
operating on an unrecovered graph could publish a partial write the
sweep would roll back.
Also removes optimize_reconciles_preexisting_manifest_head_drift: the
ad-hoc drift reconcile it covered is replaced by recovery-side convergence.
* fix(recovery): converge manifest on roll-back; optimize defers on pending recovery
Root of PR #141's review findings and the original "+1 per retry" loop:
a Lance HEAD ahead of the manifest was ambiguous (benign content-preserving
drift vs. a partial write a sidecar will roll back), and optimize's reconcile
guessed it benign. Close the class instead of guessing:
- Recovery roll-back now PUBLISHES the restored version (via a
push_table_update_at_head helper shared with roll-forward), so the manifest
tracks the Lance HEAD after recovery — symmetric with roll-forward. This
fixes the +1 loop (after one roll-back the retry's HEAD-vs-manifest
precondition passes) and removes the only remaining source of orphaned
drift. The audit still records the logical rolled-back-to version; the
manifest is published at the restore commit (identical content).
- optimize drops the ad-hoc drift reconcile and instead REFUSES when a
__recovery sidecar is pending, so it only ever operates on a recovered
graph (manifest == HEAD); its compaction publish can no longer commit a
partial write. With the reconcile gone, the blob-skip-vs-reconcile gap is
moot.
Updates the rollback recovery-test helper (manifest == HEAD after roll-back),
the failpoints assertions, and the user/dev docs.
* test(recovery): fix rollback assertion for manifest convergence
The roll-back-publishes change makes the manifest version advance after a
SchemaApply roll-back (to the old-schema content), so the
schema_apply_without_schema_staging_rolls_back_on_next_open assertion must
be `version > pre`, not `version == pre`. This update was dropped during
the commit churn and surfaced as a CI Test Workspace failure; the
old-schema-preserved intent stays covered by count_rows + _schema.pg + the
RolledBack convergence invariant.
2026-06-08 01:50:12 +02:00
|
|
|
// Convergence: roll-back published the restored HEAD, so the manifest pin
|
|
|
|
|
// tracks Lance HEAD afterward (no residual drift).
|
|
|
|
|
let entry_version = db
|
|
|
|
|
.snapshot_of(omnigraph::db::ReadTarget::branch("main"))
|
|
|
|
|
.await
|
|
|
|
|
.unwrap()
|
|
|
|
|
.entry("node:Person")
|
|
|
|
|
.unwrap()
|
|
|
|
|
.table_version;
|
|
|
|
|
assert_eq!(
|
|
|
|
|
entry_version, final_head,
|
|
|
|
|
"full-sweep roll-back must publish so manifest pin ({entry_version}) == Lance HEAD ({final_head})",
|
|
|
|
|
);
|
2026-05-01 13:47:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Companion to the above — confirms that a finalize→publisher failure
|
|
|
|
|
/// on one table leaves OTHER tables untouched. Subsequent writes to
|
|
|
|
|
/// non-drifted tables proceed normally; the drift is contained.
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn finalize_publisher_residual_does_not_drift_untouched_tables() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let mut db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
{
|
2026-05-05 16:04:48 +02:00
|
|
|
let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return");
|
2026-05-01 13:47:55 +02:00
|
|
|
let _ = mutate_main(
|
|
|
|
|
&mut db,
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.expect_err("synthetic failpoint must fire");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// node:Person drifted. node:Company didn't — try a Company write.
|
|
|
|
|
use omnigraph::loader::{LoadMode, load_jsonl};
|
|
|
|
|
load_jsonl(
|
|
|
|
|
&mut db,
|
|
|
|
|
r#"{"type": "Company", "data": {"name": "Acme"}}"#,
|
|
|
|
|
LoadMode::Append,
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.expect("Company write on a non-drifted table should succeed");
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
/// Acceptance test: a stage-step failure in the staged-index path
|
2026-05-03 13:56:36 +02:00
|
|
|
/// (`stage_create_btree_index` succeeded; `commit_staged` not yet
|
|
|
|
|
/// called) leaves NO Lance-HEAD drift on the existing tables.
|
|
|
|
|
/// Subsequent operations against those tables succeed without
|
|
|
|
|
/// `ExpectedVersionMismatch`.
|
2026-05-02 18:47:07 +02:00
|
|
|
///
|
|
|
|
|
/// Path: `apply_schema(v1 → v2)` adds a new node type. The
|
|
|
|
|
/// `added_tables` loop in `schema_apply` creates the empty dataset and
|
|
|
|
|
/// then calls `build_indices_on_dataset_for_catalog` →
|
|
|
|
|
/// `stage_and_commit_btree(..., &["id"])`. The failpoint fires
|
|
|
|
|
/// between `stage_create_btree_index` and `commit_staged`, so the
|
|
|
|
|
/// staged segments are written under `_indices/<uuid>/` but Lance HEAD
|
|
|
|
|
/// on the new dataset is unchanged at v=1. The schema-apply lock
|
|
|
|
|
/// branch is released by `apply_schema`'s outer match. Existing
|
|
|
|
|
/// tables (e.g. `node:Person`) are completely untouched by the new
|
|
|
|
|
/// node's added_tables iteration — they're outside the failed apply
|
|
|
|
|
/// path entirely — and we assert that mutations against them continue
|
|
|
|
|
/// to work.
|
|
|
|
|
///
|
|
|
|
|
/// The orphan empty dataset from the failed apply is acceptable
|
|
|
|
|
/// residual: it's unreferenced by `__manifest` and will be reclaimed
|
|
|
|
|
/// by `cleanup_old_versions` (or removed when a future apply at the
|
|
|
|
|
/// same target path resolves the rename).
|
|
|
|
|
#[tokio::test]
|
2026-05-05 22:46:03 +02:00
|
|
|
async fn ensure_indices_stage_btree_failure_leaves_existing_tables_writable() {
|
2026-05-02 18:47:07 +02:00
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
|
|
|
|
|
// Init with TEST_SCHEMA which declares Person + Knows. Indices on
|
|
|
|
|
// those tables get built during init.
|
|
|
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
|
|
|
|
|
|
// Apply a schema that adds a new node type. The added_tables loop
|
|
|
|
|
// will hit the failpoint between stage and commit on the new
|
|
|
|
|
// node:Project table's btree-on-id build. (TEST_SCHEMA already
|
|
|
|
|
// has Person + Company + Knows + WorksAt — pick a name that isn't
|
|
|
|
|
// already declared.)
|
2026-05-05 16:04:48 +02:00
|
|
|
let extended_schema = format!(
|
|
|
|
|
"{}\nnode Project {{ name: String @key }}\n",
|
|
|
|
|
helpers::TEST_SCHEMA
|
|
|
|
|
);
|
2026-05-02 18:47:07 +02:00
|
|
|
|
|
|
|
|
{
|
2026-05-05 16:04:48 +02:00
|
|
|
let _failpoint =
|
|
|
|
|
ScopedFailPoint::new("ensure_indices.post_stage_pre_commit_btree", "return");
|
2026-05-02 18:47:07 +02:00
|
|
|
let err = db.apply_schema(&extended_schema).await.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string()
|
|
|
|
|
.contains("ensure_indices.post_stage_pre_commit_btree"),
|
|
|
|
|
"schema apply should fail with the synthetic failpoint error, got: {err}"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Existing tables stayed at their pre-apply versions; subsequent
|
|
|
|
|
// mutations against them succeed (no Lance-HEAD drift).
|
|
|
|
|
mutate_main(
|
|
|
|
|
&mut db,
|
|
|
|
|
helpers::MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.expect("Person mutation must succeed after the failed schema apply — existing tables are not drifted");
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-24 16:46:00 +01:00
|
|
|
/// Panics if any schema staging file is still present directly under
/// `graph`.
///
/// The three staging names are probed in a fixed order and the first one
/// found on disk is reported in the panic message; if none exists the
/// function returns normally.
fn assert_no_staging_files(graph: &std::path::Path) {
    const STAGING_FILE_NAMES: [&str; 3] = [
        "_schema.pg.staging",
        "_schema.ir.json.staging",
        "__schema_state.json.staging",
    ];

    // `find` is lazy, so probing stops at the first leftover file.
    let leftover = STAGING_FILE_NAMES
        .iter()
        .map(|file_name| graph.join(file_name))
        .find(|candidate| candidate.exists());

    if let Some(path) = leftover {
        panic!(
            "staging file {} still exists after recovery",
            path.display()
        );
    }
}
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
|
|
|
|
|
// =====================================================================
|
2026-05-03 13:56:36 +02:00
|
|
|
// Per-writer Phase B → Phase C recovery integration
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
// =====================================================================
|
|
|
|
|
//
|
2026-05-03 13:56:36 +02:00
|
|
|
// Each of the four migrated writers writes a sidecar BEFORE its
|
|
|
|
|
// per-table commit_staged loop and deletes it AFTER the manifest
|
|
|
|
|
// publish. The `recovery_rolls_forward_after_finalize_publisher_failure`
|
|
|
|
|
// test above covers MutationStaging::finalize. The three tests below
|
|
|
|
|
// cover the other three writers: schema_apply, branch_merge,
|
|
|
|
|
// ensure_indices.
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
//
|
|
|
|
|
// Each follows the same shape: trigger the writer with a failpoint
|
|
|
|
|
// active in the Phase B → Phase C window, drop the engine, reopen,
|
|
|
|
|
// assert recovery rolled forward (manifest pin advanced, audit row
|
|
|
|
|
// recorded, sidecar deleted) and a follow-up operation succeeds without
|
|
|
|
|
// ExpectedVersionMismatch.
|
|
|
|
|
|
2026-05-05 16:04:48 +02:00
|
|
|
#[tokio::test]
|
|
|
|
|
async fn schema_apply_without_schema_staging_rolls_back_on_next_open() {
|
|
|
|
|
use omnigraph::loader::{LoadMode, load_jsonl};
|
|
|
|
|
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
let operation_id;
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
|
load_jsonl(
|
|
|
|
|
&mut db,
|
|
|
|
|
r#"{"type":"Person","data":{"name":"alice","age":30}}
|
|
|
|
|
"#,
|
|
|
|
|
LoadMode::Append,
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let pre_failure_version = {
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
version_main(&db).await.unwrap()
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
{
|
2026-05-08 16:26:23 +02:00
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
2026-05-05 16:04:48 +02:00
|
|
|
let _failpoint = ScopedFailPoint::new("schema_apply.before_staging_write", "return");
|
|
|
|
|
let v2_schema = r#"node Person {
|
|
|
|
|
name: String @key
|
|
|
|
|
age: I32?
|
|
|
|
|
city: String?
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
node Company {
|
|
|
|
|
name: String @key
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
node Tag {
|
|
|
|
|
label: String @key
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
edge Knows: Person -> Person {
|
|
|
|
|
since: Date?
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
edge WorksAt: Person -> Company
|
|
|
|
|
"#;
|
|
|
|
|
let err = db.apply_schema(v2_schema).await.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: schema_apply.before_staging_write"),
|
|
|
|
|
"unexpected error: {err}"
|
|
|
|
|
);
|
|
|
|
|
operation_id = single_sidecar_operation_id(dir.path());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
fix: optimize publishes compaction; recovery roll-back converges manifest (#141)
* test(optimize): cover manifest publish + HEAD-drift reconcile
Red against the pre-fix optimize, which ran compact_files without
publishing the compacted version to __manifest:
- maintenance: optimize must publish so the manifest table_version
tracks the compacted Lance HEAD and a later schema apply succeeds;
and must reconcile a pre-existing manifest-behind-HEAD drift (forged
via raw Lance compaction) so strict writes commit again.
- end_to_end + composite_flow: post-optimize query / strict update /
reopen in the full lifecycle (the canonical flow previously omitted
post-optimize writes as a documented "known limitation").
- failpoints: a crash between compaction and the manifest publish rolls
forward on next open.
* fix(optimize): publish compaction to manifest and reconcile HEAD drift
optimize ran Lance compact_files without publishing the new version to
__manifest, so the manifest table_version lagged the Lance HEAD: reads
stayed pinned to the pre-compaction version, and the next schema apply or
strict update/delete failed its HEAD-vs-manifest precondition with
"stale view ... refresh and retry" (open-time recovery rollback inflated
the gap on retry).
optimize now publishes each compacted table's version under the
per-(table, main) write queue, guarded by a manifest CAS and a
SidecarKind::Optimize recovery sidecar (loose-match; roll-forward is safe
because compaction is content-preserving). When a table has nothing left
to compact but its Lance HEAD is already ahead of the manifest pin
(pre-fix drift, or a recovery restore commit), optimize reconciles the
manifest forward to HEAD (metadata-only, no sidecar). Caches and the
CSR/CSC graph index are invalidated after a publish.
Docs updated (maintenance, storage, branches-commits, writes, testing).
* test(recovery): rollback convergence + optimize-defer regressions
Red against the current code, landed before the fix:
- recovery: after the open-time sweep rolls a sidecar back, the manifest
must track Lance HEAD (no residual drift) so a follow-up schema apply
succeeds — the original "+1 per retry" loop. Today roll-back restores
without publishing, so the manifest lags HEAD and the apply fails its
HEAD-vs-manifest precondition.
- maintenance: optimize must refuse while a recovery sidecar is pending —
operating on an unrecovered graph could publish a partial write the
sweep would roll back.
Also removes optimize_reconciles_preexisting_manifest_head_drift: the
ad-hoc drift reconcile it covered is replaced by recovery-side convergence.
* fix(recovery): converge manifest on roll-back; optimize defers on pending recovery
Root of PR #141's review findings and the original "+1 per retry" loop:
a Lance HEAD ahead of the manifest was ambiguous (benign content-preserving
drift vs. a partial write a sidecar will roll back), and optimize's reconcile
guessed it benign. Close the class instead of guessing:
- Recovery roll-back now PUBLISHES the restored version (via a
push_table_update_at_head helper shared with roll-forward), so the manifest
tracks the Lance HEAD after recovery — symmetric with roll-forward. This
fixes the +1 loop (after one roll-back the retry's HEAD-vs-manifest
precondition passes) and removes the only remaining source of orphaned
drift. The audit still records the logical rolled-back-to version; the
manifest is published at the restore commit (identical content).
- optimize drops the ad-hoc drift reconcile and instead REFUSES when a
__recovery sidecar is pending, so it only ever operates on a recovered
graph (manifest == HEAD); its compaction publish can no longer commit a
partial write. With the reconcile gone, the blob-skip-vs-reconcile gap is
moot.
Updates the rollback recovery-test helper (manifest == HEAD after roll-back),
the failpoints assertions, and the user/dev docs.
* test(recovery): fix rollback assertion for manifest convergence
The roll-back-publishes change makes the manifest version advance after a
SchemaApply roll-back (to the old-schema content), so the
schema_apply_without_schema_staging_rolls_back_on_next_open assertion must
be `version > pre`, not `version == pre`. This update was dropped during
the commit churn and surfaced as a CI Test Workspace failure; the
old-schema-preserved intent stays covered by count_rows + _schema.pg + the
RolledBack convergence invariant.
2026-06-08 01:50:12 +02:00
|
|
|
// Roll-back now publishes the restored version, so the manifest version
|
|
|
|
|
// advances — but to the OLD-schema content: the migration never applied
|
|
|
|
|
// (asserted by count_rows + the `_schema.pg` checks below), and the sweep
|
|
|
|
|
// converges (`manifest == Lance HEAD`, asserted by
|
|
|
|
|
// assert_post_recovery_invariants's RolledBack arm).
|
|
|
|
|
assert!(
|
|
|
|
|
version_main(&db).await.unwrap() > pre_failure_version,
|
|
|
|
|
"roll-back publishes the restored (old-schema) version, advancing the manifest; \
|
|
|
|
|
pre={pre_failure_version}",
|
2026-05-05 16:04:48 +02:00
|
|
|
);
|
|
|
|
|
assert_eq!(
|
|
|
|
|
helpers::count_rows(&db, "node:Person").await,
|
|
|
|
|
1,
|
|
|
|
|
"old-schema data must remain readable after rollback"
|
|
|
|
|
);
|
|
|
|
|
drop(db);
|
|
|
|
|
|
|
|
|
|
assert_post_recovery_invariants(
|
|
|
|
|
dir.path(),
|
|
|
|
|
&operation_id,
|
|
|
|
|
RecoveryExpectation::RolledBack {
|
|
|
|
|
tables: vec![TableExpectation::main("node:Person")],
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
let live_schema = std::fs::read_to_string(dir.path().join("_schema.pg")).unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
!live_schema.contains("city: String?"),
|
|
|
|
|
"_schema.pg must keep the OLD schema when staging files never existed; got:\n{live_schema}",
|
|
|
|
|
);
|
|
|
|
|
assert!(
|
|
|
|
|
!live_schema.contains("node Tag"),
|
|
|
|
|
"_schema.pg must keep the OLD schema when staging files never existed; got:\n{live_schema}",
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
#[tokio::test]
|
|
|
|
|
async fn schema_apply_phase_b_failure_recovered_on_next_open() {
|
|
|
|
|
use omnigraph::loader::{LoadMode, load_jsonl};
|
|
|
|
|
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
2026-05-05 16:04:48 +02:00
|
|
|
let operation_id;
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
|
|
|
|
|
// Seed: a Person table with one row so the schema-apply rewritten_tables
|
|
|
|
|
// loop has actual work to do.
|
|
|
|
|
{
|
|
|
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
|
load_jsonl(
|
|
|
|
|
&mut db,
|
|
|
|
|
r#"{"type":"Person","data":{"name":"alice","age":30}}
|
|
|
|
|
"#,
|
|
|
|
|
LoadMode::Append,
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-03 15:09:58 +02:00
|
|
|
// Capture pre-failure manifest version so we can assert the recovery
|
|
|
|
|
// sweep advances it.
|
|
|
|
|
let pre_failure_version = {
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
version_main(&db).await.unwrap()
|
|
|
|
|
};
|
|
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
// Setup: trigger the residual via `schema_apply.after_staging_write`.
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
// This failpoint fires AFTER the rewritten_tables/indexed_tables loops
|
|
|
|
|
// (Lance HEAD advanced) AND AFTER the schema-state staging files are
|
2026-05-03 13:56:36 +02:00
|
|
|
// written, but BEFORE the manifest publish. The recovery sidecar persists.
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
{
|
2026-05-08 16:26:23 +02:00
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
let _failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "return");
|
|
|
|
|
// v2 schema: add a `city` property to Person AND add a new
|
|
|
|
|
// `Tag` node type. The new property triggers the rewritten_tables
|
|
|
|
|
// path (Phase B sidecar coverage). The new type changes the
|
|
|
|
|
// overall table set — required to keep `recover_schema_state_files`
|
|
|
|
|
// (which runs BEFORE recover_manifest_drift) happy: it can't
|
|
|
|
|
// disambiguate property-only migrations and would reject the
|
2026-05-03 13:56:36 +02:00
|
|
|
// open before the recovery sweep ever ran.
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
let v2_schema = r#"node Person {
|
|
|
|
|
name: String @key
|
|
|
|
|
age: I32?
|
|
|
|
|
city: String?
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
node Company {
|
|
|
|
|
name: String @key
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
node Tag {
|
|
|
|
|
label: String @key
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
edge Knows: Person -> Person {
|
|
|
|
|
since: Date?
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
edge WorksAt: Person -> Company
|
|
|
|
|
"#;
|
|
|
|
|
let err = db.apply_schema(v2_schema).await.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: schema_apply.after_staging_write"),
|
|
|
|
|
"unexpected error: {err}"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Sidecar must still exist.
|
|
|
|
|
let recovery_dir = dir.path().join("__recovery");
|
|
|
|
|
let sidecars: Vec<_> = std::fs::read_dir(&recovery_dir)
|
|
|
|
|
.unwrap()
|
|
|
|
|
.filter_map(|e| e.ok())
|
|
|
|
|
.collect();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
sidecars.len(),
|
|
|
|
|
1,
|
|
|
|
|
"exactly one sidecar must persist after schema_apply failure"
|
|
|
|
|
);
|
2026-05-05 16:04:48 +02:00
|
|
|
operation_id = single_sidecar_operation_id(dir.path());
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
}
|
|
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
// Recovery: reopen runs the recovery sweep. Sidecar's writer_kind is
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
// SchemaApply (loose-match) — classifier accepts the multi-commit
|
|
|
|
|
// drift on Person, decision is RollForward, manifest extends to the
|
|
|
|
|
// current Lance HEAD.
|
2026-05-03 15:09:58 +02:00
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
|
2026-05-03 15:09:58 +02:00
|
|
|
// Recovery sweep must have advanced the manifest pin on the rewritten
|
|
|
|
|
// table: roll-forward published the post-failure Lance HEAD.
|
|
|
|
|
let post_recovery_version = version_main(&db).await.unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
post_recovery_version > pre_failure_version,
|
|
|
|
|
"manifest version must advance post-recovery; pre={pre_failure_version}, \
|
|
|
|
|
post={post_recovery_version}",
|
|
|
|
|
);
|
2026-05-05 16:04:48 +02:00
|
|
|
drop(db);
|
|
|
|
|
|
|
|
|
|
assert_post_recovery_invariants(
|
|
|
|
|
dir.path(),
|
|
|
|
|
&operation_id,
|
|
|
|
|
RecoveryExpectation::RolledForward {
|
|
|
|
|
tables: vec![TableExpectation::main("node:Person")],
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
recovery: close four correctness gaps (schema-apply, branch-aware, restore short-circuit, merge parent)
B1. Schema-apply atomicity. Before this commit, a failure between
`_schema.pg.staging` write and the manifest publish left the repo
corrupt: Lance HEADs advanced under the new schema, manifest stayed
at old pins, and on reopen schema-state recovery deleted the staging
files (manifest's table set still matched the live schema), then
manifest-drift recovery rolled the table versions forward — leaving
new-schema data on disk with the old `_schema.pg` live.
Fix: a SchemaApply sidecar is the marker that Phase B completed but
Phase C didn't. New helper `has_schema_apply_sidecar` is consulted
by `recover_schema_state_files` BEFORE its disambiguation logic;
when present, it completes the staging→final rename so the
subsequent manifest-drift roll-forward sees the new catalog.
B2. Branch-aware recovery. Sidecars from feature-branch writers were
being classified against main's snapshot and main's Lance HEAD,
silently no-op'ing or rolling back the wrong table version (the
classifier saw NoMovement; the writer's drift on the feature branch
persisted; subsequent feature writers surfaced
ExpectedVersionMismatch).
Fix: SidecarTablePin gets an optional `table_branch` field;
`recover_manifest_drift` opens a per-branch coordinator
(`GraphCoordinator::open_branch`) per sidecar; `open_lance_head`,
`restore_table_to_version`, and `roll_forward_all` honor the pin's
branch via `Dataset::checkout_branch`.
B3. Remove fragment-id short-circuit in `restore_table_to_version`.
Equal fragment IDs do NOT imply equal content: Lance index commits
and deletion-vector updates change the manifest without touching
fragment IDs. Skipping restore in those cases would leave Lance HEAD
ahead of the manifest with no recovery artifact left. Restore is
now unconditional; pile-up under repeated mid-rollback crashes is
bounded and reclaimed by `omnigraph cleanup`.
B4. Recovered branch_merge records merge parent. `record_audit` always
called `append_commit`, dropping `merged_parent_commit_id`. Future
`branch_merge feature -> main` between the same pair lost
already-up-to-date detection. RecoverySidecar gets an optional
`merge_source_commit_id`; `branch_merge_on_current_target`
populates it from `source_head_commit_id`; `record_audit`
dispatches to `append_merge_commit` when present.
New tests: feature-branch sidecar classification (B2); B1 deepens the
existing schema_apply test with live-`_schema.pg` and new-type
assertions; B4 deepens the existing branch_merge test by reading
`_graph_commits.lance` and asserting a non-null `merged_parent_commit_id`.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 23:39:41 +02:00
|
|
|
|
|
|
|
|
// Schema-apply atomicity: the live `_schema.pg` must reflect the
|
|
|
|
|
// NEW schema (city column on Person, Tag node type) — not the old.
|
|
|
|
|
// Without the schema-staging coordination, the schema-state
|
|
|
|
|
// recovery would have deleted the staging files (because manifest
|
2026-05-24 16:46:00 +01:00
|
|
|
// hadn't advanced when it ran), leaving a corrupt graph with new-
|
recovery: close four correctness gaps (schema-apply, branch-aware, restore short-circuit, merge parent)
B1. Schema-apply atomicity. Before this commit, a failure between
`_schema.pg.staging` write and the manifest publish left the repo
corrupt: Lance HEADs advanced under the new schema, manifest stayed
at old pins, and on reopen schema-state recovery deleted the staging
files (manifest's table set still matched the live schema), then
manifest-drift recovery rolled the table versions forward — leaving
new-schema data on disk with the old `_schema.pg` live.
Fix: a SchemaApply sidecar is the marker that Phase B completed but
Phase C didn't. New helper `has_schema_apply_sidecar` is consulted
by `recover_schema_state_files` BEFORE its disambiguation logic;
when present, it completes the staging→final rename so the
subsequent manifest-drift roll-forward sees the new catalog.
B2. Branch-aware recovery. Sidecars from feature-branch writers were
being classified against main's snapshot and main's Lance HEAD,
silently no-op'ing or rolling back the wrong table version (the
classifier saw NoMovement; the writer's drift on the feature branch
persisted; subsequent feature writers surfaced
ExpectedVersionMismatch).
Fix: SidecarTablePin gets an optional `table_branch` field;
`recover_manifest_drift` opens a per-branch coordinator
(`GraphCoordinator::open_branch`) per sidecar; `open_lance_head`,
`restore_table_to_version`, and `roll_forward_all` honor the pin's
branch via `Dataset::checkout_branch`.
B3. Remove fragment-id short-circuit in `restore_table_to_version`.
Equal fragment IDs do NOT imply equal content: Lance index commits
and deletion-vector updates change the manifest without touching
fragment IDs. Skipping restore in those cases would leave Lance HEAD
ahead of the manifest with no recovery artifact left. Restore is
now unconditional; pile-up under repeated mid-rollback crashes is
bounded and reclaimed by `omnigraph cleanup`.
B4. Recovered branch_merge records merge parent. `record_audit` always
called `append_commit`, dropping `merged_parent_commit_id`. Future
`branch_merge feature -> main` between the same pair lost
already-up-to-date detection. RecoverySidecar gets an optional
`merge_source_commit_id`; `branch_merge_on_current_target`
populates it from `source_head_commit_id`; `record_audit`
dispatches to `append_merge_commit` when present.
New tests: feature-branch sidecar classification (B2); B1 deepens the
existing schema_apply test with live-`_schema.pg` and new-type
assertions; B4 deepens the existing branch_merge test by reading
`_graph_commits.lance` and asserting a non-null `merged_parent_commit_id`.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 23:39:41 +02:00
|
|
|
// schema data on disk but old-schema catalog.
|
|
|
|
|
let live_schema = std::fs::read_to_string(dir.path().join("_schema.pg")).unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
live_schema.contains("city: String?"),
|
|
|
|
|
"_schema.pg must reflect the NEW schema (city column added); got:\n{live_schema}",
|
|
|
|
|
);
|
|
|
|
|
assert!(
|
|
|
|
|
live_schema.contains("node Tag"),
|
|
|
|
|
"_schema.pg must reflect the NEW schema (Tag type added); got:\n{live_schema}",
|
|
|
|
|
);
|
recovery: register added tables + tombstones in SchemaApply roll-forward
Cursor flagged that SchemaApply sidecars only captured `Update` pins
(via `snapshot.entry()?` in schema_apply.rs:166), so recovery's
`roll_forward_all` only published `ManifestChange::Update` for the
rewritten/indexed tables. Added types (`added_tables`) and tombstones
(`renamed_tables` sources) were silently dropped during recovery.
Reproducer: in `schema_apply_phase_b_failure_recovered_on_next_open`,
the v2 schema added a `Tag` node type. Pre-fix, `node:Tag` ended up as
an orphan dataset on disk while the manifest never received a
`RegisterTable` entry — the live `_schema.pg` declared a type the
manifest didn't know about, and `count_rows(node:Tag)` panicked with
`no manifest entry for node:Tag`. The existing test passed only
because it never queried Tag.
Fix:
1. Extend `RecoverySidecar` with `additional_registrations` and
`tombstones` fields (optional, serde-default for backward compat
with existing on-disk sidecars). Both are SchemaApply-only.
2. Populate them in `apply_schema_with_lock` from the migration plan's
upfront diff (`added_tables` + `renamed_tables` keys for
registrations; `renamed_tables` values for tombstones, version-
pinned at `source_entry.table_version + 1`).
3. Update `roll_forward_all` to:
- emit `RegisterTable` + `Update` for each `additional_registrations`
entry (read the dataset's current Lance HEAD for the version
metadata + row_count)
- emit `Tombstone` for each `tombstones` entry
- filter against `snapshot` so previously-published registrations /
tombstones are skipped (handles the post-Phase-C-success-but-
sidecar-not-yet-deleted case — without filtering, the publisher's
CAS pre-check would error with `expected=0, actual=N` on the
redundant Register)
4. Extend the audit-row outcomes to include published registrations.
Test changes:
- `schema_apply_phase_b_failure_recovered_on_next_open` now asserts
`count_rows(node:Tag) == 0` (no panic), proving the new manifest
entry exists.
- `schema_apply_recovers_pre_commit_crash` renamed to
`schema_apply_pre_commit_crash_rolls_forward_via_sidecar` and
rewritten — pre-fix it expected pre-commit crashes to roll BACK
(delete staging, keep V1, leave Company as orphan); the sidecar
protocol's "complete the writer's intent" semantic now rolls
FORWARD (rename staging -> final, register Company atomically). The
new assertions verify schema = V2 and `node:Company` is queryable.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 22:15:50 +02:00
|
|
|
|
|
|
|
|
// Catalog ↔ manifest agreement: the new `node:Tag` type the schema
|
|
|
|
|
// declares must have a manifest entry the engine can read against.
|
|
|
|
|
// Without registrations / tombstones in the sidecar, recovery's
|
|
|
|
|
// `roll_forward_all` only publishes Updates for rewritten tables;
|
|
|
|
|
// added tables (Tag) end up as orphan datasets on disk with no
|
|
|
|
|
// manifest entry, and the live schema declares a type the manifest
|
|
|
|
|
// doesn't know about.
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
let tag_rows = helpers::count_rows(&db, "node:Tag").await;
|
|
|
|
|
assert_eq!(
|
|
|
|
|
tag_rows, 0,
|
|
|
|
|
"node:Tag must have a manifest entry (with 0 rows) post-recovery; \
|
|
|
|
|
a panic here means recovery failed to register the added table"
|
|
|
|
|
);
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
}
|
|
|
|
|
|
fix: optimize publishes compaction; recovery roll-back converges manifest (#141)
* test(optimize): cover manifest publish + HEAD-drift reconcile
Red against the pre-fix optimize, which ran compact_files without
publishing the compacted version to __manifest:
- maintenance: optimize must publish so the manifest table_version
tracks the compacted Lance HEAD and a later schema apply succeeds;
and must reconcile a pre-existing manifest-behind-HEAD drift (forged
via raw Lance compaction) so strict writes commit again.
- end_to_end + composite_flow: post-optimize query / strict update /
reopen in the full lifecycle (the canonical flow previously omitted
post-optimize writes as a documented "known limitation").
- failpoints: a crash between compaction and the manifest publish rolls
forward on next open.
* fix(optimize): publish compaction to manifest and reconcile HEAD drift
optimize ran Lance compact_files without publishing the new version to
__manifest, so the manifest table_version lagged the Lance HEAD: reads
stayed pinned to the pre-compaction version, and the next schema apply or
strict update/delete failed its HEAD-vs-manifest precondition with
"stale view ... refresh and retry" (open-time recovery rollback inflated
the gap on retry).
optimize now publishes each compacted table's version under the
per-(table, main) write queue, guarded by a manifest CAS and a
SidecarKind::Optimize recovery sidecar (loose-match; roll-forward is safe
because compaction is content-preserving). When a table has nothing left
to compact but its Lance HEAD is already ahead of the manifest pin
(pre-fix drift, or a recovery restore commit), optimize reconciles the
manifest forward to HEAD (metadata-only, no sidecar). Caches and the
CSR/CSC graph index are invalidated after a publish.
Docs updated (maintenance, storage, branches-commits, writes, testing).
* test(recovery): rollback convergence + optimize-defer regressions
Red against the current code, landed before the fix:
- recovery: after the open-time sweep rolls a sidecar back, the manifest
must track Lance HEAD (no residual drift) so a follow-up schema apply
succeeds — the original "+1 per retry" loop. Today roll-back restores
without publishing, so the manifest lags HEAD and the apply fails its
HEAD-vs-manifest precondition.
- maintenance: optimize must refuse while a recovery sidecar is pending —
operating on an unrecovered graph could publish a partial write the
sweep would roll back.
Also removes optimize_reconciles_preexisting_manifest_head_drift: the
ad-hoc drift reconcile it covered is replaced by recovery-side convergence.
* fix(recovery): converge manifest on roll-back; optimize defers on pending recovery
Root of PR #141's review findings and the original "+1 per retry" loop:
a Lance HEAD ahead of the manifest was ambiguous (benign content-preserving
drift vs. a partial write a sidecar will roll back), and optimize's reconcile
guessed it benign. Close the class instead of guessing:
- Recovery roll-back now PUBLISHES the restored version (via a
push_table_update_at_head helper shared with roll-forward), so the manifest
tracks the Lance HEAD after recovery — symmetric with roll-forward. This
fixes the +1 loop (after one roll-back the retry's HEAD-vs-manifest
precondition passes) and removes the only remaining source of orphaned
drift. The audit still records the logical rolled-back-to version; the
manifest is published at the restore commit (identical content).
- optimize drops the ad-hoc drift reconcile and instead REFUSES when a
__recovery sidecar is pending, so it only ever operates on a recovered
graph (manifest == HEAD); its compaction publish can no longer commit a
partial write. With the reconcile gone, the blob-skip-vs-reconcile gap is
moot.
Updates the rollback recovery-test helper (manifest == HEAD after roll-back),
the failpoints assertions, and the user/dev docs.
* test(recovery): fix rollback assertion for manifest convergence
The roll-back-publishes change makes the manifest version advance after a
SchemaApply roll-back (to the old-schema content), so the
schema_apply_without_schema_staging_rolls_back_on_next_open assertion must
be `version > pre`, not `version == pre`. This update was dropped during
the commit churn and surfaced as a CI Test Workspace failure; the
old-schema-preserved intent stays covered by count_rows + _schema.pg + the
RolledBack convergence invariant.
2026-06-08 01:50:12 +02:00
|
|
|
/// `optimize` Phase B → Phase C residual: `compact_files` advanced the Lance
|
|
|
|
|
/// HEAD but the manifest publish hasn't run. The `Optimize` recovery sidecar
|
|
|
|
|
/// (loose-match, like SchemaApply/EnsureIndices) must roll the compacted version
|
|
|
|
|
/// forward on next open so the manifest tracks the Lance HEAD — and the healed
|
|
|
|
|
/// table must then accept a schema apply (the original bug's victim).
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn optimize_phase_b_failure_recovered_on_next_open() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
let operation_id;
|
|
|
|
|
|
|
|
|
|
// Seed: several separate Person inserts → multiple fragments, so compaction
|
|
|
|
|
// has real work and advances the Lance HEAD.
|
|
|
|
|
{
|
|
|
|
|
let db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
|
for (name, age) in [("alice", 30), ("bob", 31), ("carol", 32), ("dave", 33)] {
|
|
|
|
|
db.mutate(
|
|
|
|
|
"main",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", name)], &[("$age", age)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let pre_failure_version = {
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
version_main(&db).await.unwrap()
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Failpoint fires AFTER compact_files advanced the Lance HEAD but BEFORE the
|
|
|
|
|
// manifest publish. The Optimize sidecar persists (only node:Person has
|
|
|
|
|
// compactable fragments, so exactly one sidecar is written).
|
|
|
|
|
{
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
let _failpoint =
|
|
|
|
|
ScopedFailPoint::new("optimize.post_phase_b_pre_manifest_commit", "return");
|
|
|
|
|
let err = db.optimize().await.unwrap_err();
|
|
|
|
|
assert!(
|
(feat) convert engine call sites to &dyn TableStorage; demote legacy TableStore methods to pub(crate) (#86)
* MR-854: convert engine call sites to &dyn TableStorage; demote legacy methods
Phase 1b: every db.table_store.X(...) call site converts to
db.storage().X(...), reaching the storage layer through the sealed
TableStorage trait (returns &dyn TableStorage). Opaque SnapshotHandle
and StagedHandle replace bare lance::Dataset and Transaction in the
threaded values.
Phase 9: the inherent inline-commit methods on TableStore
(append_batch, merge_insert_batch{,es}, overwrite_batch,
create_btree_index, create_inverted_index) demote from pub to
pub(crate). Their only remaining direct users are table_store.rs
itself and the bulk loader's LoadMode::{Append, Overwrite, Merge}
concurrent fast-paths in loader::write_batch_to_dataset (no
two-phase shape in Lance 4.0.0 — closes after lance#6658 and #6666).
Docs:
- invariants.md \u00a7VI.23: drop "at the writer-trait surface"
qualifier; staged primitives are now the only engine surface.
- runs.md: residual matrix shrinks to delete_where and
create_vector_index (the two upstream-blocked residuals).
- forbidden_apis.rs: replace transitional language with the
current allow-list shape (table_store.rs + loader concurrent
fast-path only).
Files touched:
- changes/mod.rs, db/omnigraph.rs (+export/optimize/schema_apply/
table_ops.rs), exec/{merge,mod,mutation,staging}.rs,
loader/mod.rs, storage_layer.rs, table_store.rs,
tests/forbidden_apis.rs, docs/{invariants,runs}.md.
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
* MR-854: replace test-only inline-commit append callers with local Lance helpers
After demoting TableStore::append_batch from pub to pub(crate), the
integration tests in tests/recovery.rs and tests/staged_writes.rs
that previously called store.append_batch(...) directly to simulate
HEAD-ahead-of-manifest drift can no longer access the inherent
method. Replace those calls with small in-test helpers that do a raw
Dataset::append (the same body the inherent method runs).
- tests/helpers/mod.rs gains lance_append_inline (shared helper).
- tests/staged_writes.rs gets a file-local lance_append_inline_local
(staged_writes.rs does not import helpers::).
- tests/recovery.rs drops the unused TableStore import in the one
function whose store binding became unused after the conversion.
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
* MR-854: retrigger CI for flaky Test Workspace job
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
* MR-854: convert remaining table_store call sites in export.rs / read_blob
Two leftover `self.table_store.X` / `db.table_store.X` call sites were
missed in the initial sweep — flagged by Devin Review on PR #86. Both
now go through the trait surface:
- `entity_from_snapshot` (db/omnigraph/export.rs): switch from
`db.table_store.open_snapshot_table` + `db.table_store.scan` to
`db.storage().open_snapshot_at_table` + `db.storage().scan`.
- `read_blob` (db/omnigraph.rs): replace
`snapshot.open(table_key)` + `self.table_store.first_row_id_for_filter`
with `self.storage().open_snapshot_at_table` +
`self.storage().first_row_id_for_filter`. The follow-up
`take_blobs` call still needs an `Arc<Dataset>` (it's a Lance blob
accessor not surfaced through the trait), so we hand off via
`SnapshotHandle::into_arc()` with a comment.
After this commit, no engine code outside `table_store.rs` reaches the
inherent `TableStore` API — the docs/runs.md and docs/invariants.md
claim is now uniformly true.
Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
* MR-854: post-rebase doc fixes (Lance 6.0.1, MR-A framing, into_dataset note)
Reviewer feedback on the rebased PR:
* docs/dev/writes.md residuals matrix: drop demoted methods from the trait-surface table (now `pub(crate)`); keep only the two genuine trait-surface residuals (`delete_where`, `create_vector_index`); reframe under MR-A (Lance v7.x bump) per docs/dev/lance.md.
* tests/forbidden_apis.rs: update transitional allow-list header to (a) drop the truncate_table mislabel (truncate_table is a Lance Dataset method, not a TableStore method — overwrite_batch's internal call), (b) reframe trait-surface residuals under MR-A / Lance #6666.
* crates/omnigraph/src/storage_layer.rs::SnapshotHandle::{into_arc, into_dataset}: add single-ref invariant doc — both consume Arc via try_unwrap-or-clone; sibling SnapshotHandle clones across an await point force a deep Dataset clone.
* Replace lance-4.0.0 version refs with lance-6.0.1 in active source/test/dev-doc comments (storage_layer.rs, table_store.rs, table_ops.rs, schema_apply.rs, merge.rs, recovery.rs, staged_writes.rs, consistency.rs, docs/dev/execution.md, docs/user/query-language.md). Historical refs in docs/releases/v0.4.1.md and the canonical "Lance 4.0.0 → 6.0.1 migration" line in docs/dev/lance.md left intact.
No engine code changes.
* MR-854: update docs/dev/invariants.md Storage trait row + gap entry
Reviewer feedback: the docs reorg landed; the invariant row now lives in
docs/dev/invariants.md with stable headings (no more numbered §VI.23).
Update two pieces to reflect MR-854 completion:
* Status table 'Storage trait' row: was 'full call-site migration ... incomplete';
now 'engine call sites all route through db.storage() (MR-854); inline-commit
inherent methods are pub(crate)-demoted; capability/stat surfaces are roadmap'.
* 'Known Gaps' 'Storage abstraction' entry: was 'older inherent TableStore call
sites and inline residuals remain'; now names the closed scope (MR-854 — call
sites migrated, methods demoted, loader fast-paths) and the remaining
trait-surface residuals under MR-A (Lance v7.x bump) and Lance #6666.
Cross-links to docs/dev/lance.md and docs/dev/writes.md so the framing stays
co-located with the canonical Lance surface tracking.
* MR-854: remove dead inline-commit methods from the storage surface
The loader concurrent fast-path (write_batch_to_dataset) is only reached
for LoadMode::Overwrite — Append/Merge route through MutationStaging — so
its Append/Merge arms were unreachable. Collapse it to overwrite-only and
drop the now-unused mode params, which removes the only callers of:
- TableStorage::append_batch + TableStorage::merge_insert_batches (trait)
- TableStore::merge_insert_batch + merge_insert_batches (inherent)
create_btree_index / create_inverted_index had zero callers anywhere
(scalar index builds use the stage_* primitives). Remove both from the
trait and the inherent impl.
Inherent append_batch stays pub(crate): overwrite_batch and recovery
tests use it. Migrate the one trait-append_batch test caller
(seed_person_row) to stage_append + commit_staged. The merge_insert
FirstSeen-workaround rationale moves from the deleted merge_insert_batch
into stage_merge_insert (now the sole merge path). No behavior change.
Also corrects the inaccurate loader residual comment (the prior text
blamed Lance #6658/#6666, which are the delete and vector-index issues,
for keeping overwrite inline; a stage_overwrite primitive already exists
and schema_apply uses it).
* MR-854: seal db.storage() to staged-only; move residuals to InlineCommitResidual
Split the three remaining inline-commit writes (overwrite_batch,
delete_where, create_vector_index) off the TableStorage trait onto a new
sealed InlineCommitResidual trait, reachable only via the explicit
Omnigraph::storage_inline_residual() accessor. db.storage() now exposes
only staged primitives + reads, so engine code cannot couple a write
with a Lance HEAD advance through the default surface — MR-793 acceptance
§1 ("no public method commits as a side effect of writing") now holds by
construction, not by review + naming.
Call sites moved to storage_inline_residual(): loader overwrite
fast-path, the three mutation delete_where paths, the branch-merge
delete, and the vector-index build. Impl bodies are unchanged (same
delegation to the pub(crate) inherent methods); this is a pure surface
reshape with no behavior change.
The residual trait holds two genuinely upstream-blocked methods
(delete_where -> Lance #6658/v7.x, create_vector_index -> Lance #6666)
plus overwrite_batch, kept for the loader's cross-table bulk-overwrite
concurrency until its staged migration lands (tracked follow-up).
* MR-854 docs: describe the staged-only seal; fix stale Lance index URLs
- writes.md / invariants.md / AGENTS.md: the inline-commit residuals now
live on InlineCommitResidual behind db.storage_inline_residual(), so
acceptance §1 holds by construction rather than 'option (b)' per-method
enumeration. Drop the inaccurate 'until Lance exposes
Operation::Overwrite { fragments }' claim (that op exists; stage_overwrite
already builds it) and reframe overwrite_batch as a removable legacy
residual gated on the loader's bulk-overwrite concurrency.
- forbidden_apis.rs: rewrite the allow-list doc for the split surface.
- lance.md: the index spec pages moved from /format/table/index/ to
/format/index/ in Lance 6.x (the old paths 404). Fix all 13 URLs.
* MR-854: fix stale lance-4.0.0 comment refs flagged in review
Addresses greptile (exec/merge.rs) and aaltshuler's stale-version blocker:
update lance-4.0.0 -> 6.0.1 in the comment/doc refs within this PR's
footprint (exec/merge.rs, exec/mutation.rs, docs/dev/writes.md). Also
corrects exec/merge.rs to cite lance#6666 (not #6658) for
build_index_metadata_from_segments — that is the vector-index segment-commit
API; #6658 is the two-phase delete. (Pre-existing 4.0.0 refs in untouched
files like architecture.md/storage.md are main's incomplete migration
cleanup, left out of scope.)
* fix(storage): stage loader overwrites
* fix(storage): stage empty schema rewrites
---------
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Ragnor Comerford <ragnor.comerford@gmail.com>
Co-authored-by: Ragnor Comerford <hello@ragnor.co>
2026-06-09 23:03:08 +02:00
|
|
|
err.to_string().contains(
|
|
|
|
|
"injected failpoint triggered: optimize.post_phase_b_pre_manifest_commit"
|
|
|
|
|
),
|
fix: optimize publishes compaction; recovery roll-back converges manifest (#141)
* test(optimize): cover manifest publish + HEAD-drift reconcile
Red against the pre-fix optimize, which ran compact_files without
publishing the compacted version to __manifest:
- maintenance: optimize must publish so the manifest table_version
tracks the compacted Lance HEAD and a later schema apply succeeds;
and must reconcile a pre-existing manifest-behind-HEAD drift (forged
via raw Lance compaction) so strict writes commit again.
- end_to_end + composite_flow: post-optimize query / strict update /
reopen in the full lifecycle (the canonical flow previously omitted
post-optimize writes as a documented "known limitation").
- failpoints: a crash between compaction and the manifest publish rolls
forward on next open.
* fix(optimize): publish compaction to manifest and reconcile HEAD drift
optimize ran Lance compact_files without publishing the new version to
__manifest, so the manifest table_version lagged the Lance HEAD: reads
stayed pinned to the pre-compaction version, and the next schema apply or
strict update/delete failed its HEAD-vs-manifest precondition with
"stale view ... refresh and retry" (open-time recovery rollback inflated
the gap on retry).
optimize now publishes each compacted table's version under the
per-(table, main) write queue, guarded by a manifest CAS and a
SidecarKind::Optimize recovery sidecar (loose-match; roll-forward is safe
because compaction is content-preserving). When a table has nothing left
to compact but its Lance HEAD is already ahead of the manifest pin
(pre-fix drift, or a recovery restore commit), optimize reconciles the
manifest forward to HEAD (metadata-only, no sidecar). Caches and the
CSR/CSC graph index are invalidated after a publish.
Docs updated (maintenance, storage, branches-commits, writes, testing).
* test(recovery): rollback convergence + optimize-defer regressions
Red against the current code, landed before the fix:
- recovery: after the open-time sweep rolls a sidecar back, the manifest
must track Lance HEAD (no residual drift) so a follow-up schema apply
succeeds — the original "+1 per retry" loop. Today roll-back restores
without publishing, so the manifest lags HEAD and the apply fails its
HEAD-vs-manifest precondition.
- maintenance: optimize must refuse while a recovery sidecar is pending —
operating on an unrecovered graph could publish a partial write the
sweep would roll back.
Also removes optimize_reconciles_preexisting_manifest_head_drift: the
ad-hoc drift reconcile it covered is replaced by recovery-side convergence.
* fix(recovery): converge manifest on roll-back; optimize defers on pending recovery
Root of PR #141's review findings and the original "+1 per retry" loop:
a Lance HEAD ahead of the manifest was ambiguous (benign content-preserving
drift vs. a partial write a sidecar will roll back), and optimize's reconcile
guessed it benign. Close the class instead of guessing:
- Recovery roll-back now PUBLISHES the restored version (via a
push_table_update_at_head helper shared with roll-forward), so the manifest
tracks the Lance HEAD after recovery — symmetric with roll-forward. This
fixes the +1 loop (after one roll-back the retry's HEAD-vs-manifest
precondition passes) and removes the only remaining source of orphaned
drift. The audit still records the logical rolled-back-to version; the
manifest is published at the restore commit (identical content).
- optimize drops the ad-hoc drift reconcile and instead REFUSES when a
__recovery sidecar is pending, so it only ever operates on a recovered
graph (manifest == HEAD); its compaction publish can no longer commit a
partial write. With the reconcile gone, the blob-skip-vs-reconcile gap is
moot.
Updates the rollback recovery-test helper (manifest == HEAD after roll-back),
the failpoints assertions, and the user/dev docs.
* test(recovery): fix rollback assertion for manifest convergence
The roll-back-publishes change makes the manifest version advance after a
SchemaApply roll-back (to the old-schema content), so the
schema_apply_without_schema_staging_rolls_back_on_next_open assertion must
be `version > pre`, not `version == pre`. This update was dropped during
the commit churn and surfaced as a CI Test Workspace failure; the
old-schema-preserved intent stays covered by count_rows + _schema.pg + the
RolledBack convergence invariant.
2026-06-08 01:50:12 +02:00
|
|
|
"unexpected error: {err}"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let recovery_dir = dir.path().join("__recovery");
|
|
|
|
|
let sidecars: Vec<_> = std::fs::read_dir(&recovery_dir)
|
|
|
|
|
.unwrap()
|
|
|
|
|
.filter_map(|e| e.ok())
|
|
|
|
|
.collect();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
sidecars.len(),
|
|
|
|
|
1,
|
|
|
|
|
"exactly one Optimize sidecar must persist after optimize failure"
|
|
|
|
|
);
|
|
|
|
|
operation_id = single_sidecar_operation_id(dir.path());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Recovery: reopen runs the sweep. The Optimize sidecar classifies
|
|
|
|
|
// RolledPastExpected (loose-match) → RollForward → manifest extends to the
|
|
|
|
|
// compacted Lance HEAD.
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
let post_recovery_version = version_main(&db).await.unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
post_recovery_version > pre_failure_version,
|
|
|
|
|
"manifest version must advance post-recovery (compaction rolled forward); \
|
|
|
|
|
pre={pre_failure_version}, post={post_recovery_version}",
|
|
|
|
|
);
|
|
|
|
|
drop(db);
|
|
|
|
|
|
|
|
|
|
assert_post_recovery_invariants(
|
|
|
|
|
dir.path(),
|
|
|
|
|
&operation_id,
|
|
|
|
|
RecoveryExpectation::RolledForward {
|
|
|
|
|
tables: vec![TableExpectation::main("node:Person")],
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
|
|
|
|
|
// The healed table accepts an additive schema apply — its HEAD-vs-manifest
|
|
|
|
|
// precondition is satisfied because recovery published the compacted version.
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
let desired = helpers::TEST_SCHEMA.replace(
|
|
|
|
|
" age: I32?\n}",
|
|
|
|
|
" age: I32?\n nickname: String?\n}",
|
|
|
|
|
);
|
|
|
|
|
db.apply_schema(&desired)
|
|
|
|
|
.await
|
|
|
|
|
.expect("schema apply after optimize recovery must succeed");
|
|
|
|
|
}
|
|
|
|
|
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
#[tokio::test]
|
|
|
|
|
async fn branch_merge_phase_b_failure_recovered_on_next_open() {
|
|
|
|
|
use omnigraph::loader::{LoadMode, load_jsonl};
|
|
|
|
|
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
|
2026-05-03 15:09:58 +02:00
|
|
|
// Seed main with a row, branch off, mutate BOTH sides so the merge
|
|
|
|
|
// produces at least one `RewriteMerged` candidate (target moved past
|
|
|
|
|
// base too — required for the recovery sidecar to pin anything; the
|
|
|
|
|
// sidecar only pins RewriteMerged candidates because they're the
|
|
|
|
|
// only path that always advances Lance HEAD via
|
|
|
|
|
// `publish_rewritten_merge_table`).
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
{
|
|
|
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
|
load_jsonl(
|
|
|
|
|
&mut db,
|
|
|
|
|
r#"{"type":"Person","data":{"name":"alice","age":30}}
|
|
|
|
|
"#,
|
|
|
|
|
LoadMode::Append,
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
db.branch_create("feature").await.unwrap();
|
|
|
|
|
db.mutate(
|
|
|
|
|
"feature",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Bob")], &[("$age", 40)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
2026-05-03 15:09:58 +02:00
|
|
|
// Mutate main too so the merge sees target ≠ base for Person —
|
|
|
|
|
// forces RewriteMerged classification.
|
|
|
|
|
mutate_main(
|
|
|
|
|
&mut db,
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Carol")], &[("$age", 50)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
}
|
|
|
|
|
|
2026-05-03 15:09:58 +02:00
|
|
|
// Capture pre-failure state on main for post-recovery comparison.
|
|
|
|
|
let pre_failure_version = {
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
version_main(&db).await.unwrap()
|
|
|
|
|
};
|
|
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
// Setup: failpoint fires after the per-table publish loop completes
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
// but before commit_manifest_updates. Sidecar persists.
|
|
|
|
|
{
|
2026-05-08 16:26:23 +02:00
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
let _failpoint =
|
|
|
|
|
ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return");
|
|
|
|
|
let err = db.branch_merge("feature", "main").await.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string().contains(
|
|
|
|
|
"injected failpoint triggered: branch_merge.post_phase_b_pre_manifest_commit"
|
|
|
|
|
),
|
|
|
|
|
"unexpected error: {err}"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let recovery_dir = dir.path().join("__recovery");
|
|
|
|
|
let sidecars: Vec<_> = std::fs::read_dir(&recovery_dir)
|
|
|
|
|
.unwrap()
|
|
|
|
|
.filter_map(|e| e.ok())
|
|
|
|
|
.collect();
|
|
|
|
|
assert_eq!(
|
|
|
|
|
sidecars.len(),
|
|
|
|
|
1,
|
|
|
|
|
"exactly one sidecar must persist after branch_merge failure"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
// Recovery: reopen runs the sweep. BranchMerge uses LOOSE
|
2026-05-03 15:09:58 +02:00
|
|
|
// classification — `publish_rewritten_merge_table` runs multiple
|
|
|
|
|
// commit_staged calls per table (stage_merge_insert + delete_where +
|
|
|
|
|
// index rebuilds), so post_commit_pin in the sidecar is a lower
|
|
|
|
|
// bound; the loose-match classifier accepts any HEAD > expected_version
|
|
|
|
|
// when expected_version == manifest_pinned.
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
|
|
|
|
|
let recovery_dir = dir.path().join("__recovery");
|
|
|
|
|
if recovery_dir.exists() {
|
|
|
|
|
let remaining: Vec<_> = std::fs::read_dir(&recovery_dir)
|
|
|
|
|
.unwrap()
|
|
|
|
|
.filter_map(|e| e.ok())
|
|
|
|
|
.collect();
|
|
|
|
|
assert!(
|
|
|
|
|
remaining.is_empty(),
|
|
|
|
|
"sidecar must be deleted; remaining: {:?}",
|
|
|
|
|
remaining,
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
let audit_dir = dir.path().join("_graph_commit_recoveries.lance");
|
|
|
|
|
assert!(
|
|
|
|
|
audit_dir.exists(),
|
|
|
|
|
"_graph_commit_recoveries.lance must exist after branch_merge recovery"
|
|
|
|
|
);
|
2026-05-03 15:09:58 +02:00
|
|
|
|
|
|
|
|
// Recovery must have advanced main's manifest pin (the merge published).
|
|
|
|
|
let post_recovery_version = version_main(&db).await.unwrap();
|
|
|
|
|
assert!(
|
|
|
|
|
post_recovery_version > pre_failure_version,
|
|
|
|
|
"manifest version must advance post-recovery; pre={pre_failure_version}, \
|
|
|
|
|
post={post_recovery_version}",
|
|
|
|
|
);
|
recovery: close four correctness gaps (schema-apply, branch-aware, restore short-circuit, merge parent)
B1. Schema-apply atomicity. Before this commit, a failure between
`_schema.pg.staging` write and the manifest publish left the repo
corrupt: Lance HEADs advanced under the new schema, manifest stayed
at old pins, and on reopen schema-state recovery deleted the staging
files (manifest's table set still matched the live schema), then
manifest-drift recovery rolled the table versions forward — leaving
new-schema data on disk with the old `_schema.pg` live.
Fix: a SchemaApply sidecar is the marker that Phase B completed but
Phase C didn't. New helper `has_schema_apply_sidecar` is consulted
by `recover_schema_state_files` BEFORE its disambiguation logic;
when present, it completes the staging→final rename so the
subsequent manifest-drift roll-forward sees the new catalog.
B2. Branch-aware recovery. Sidecars from feature-branch writers were
being classified against main's snapshot and main's Lance HEAD,
silently no-op'ing or rolling back the wrong table version (the
classifier saw NoMovement; the writer's drift on the feature branch
persisted; subsequent feature writers surfaced
ExpectedVersionMismatch).
Fix: SidecarTablePin gets an optional `table_branch` field;
`recover_manifest_drift` opens a per-branch coordinator
(`GraphCoordinator::open_branch`) per sidecar; `open_lance_head`,
`restore_table_to_version`, and `roll_forward_all` honor the pin's
branch via `Dataset::checkout_branch`.
B3. Remove fragment-id short-circuit in `restore_table_to_version`.
Equal fragment IDs do NOT imply equal content: Lance index commits
and deletion-vector updates change the manifest without touching
fragment IDs. Skipping restore in those cases would leave Lance HEAD
ahead of the manifest with no recovery artifact left. Restore is
now unconditional; pile-up under repeated mid-rollback crashes
bounded and reclaimed by `omnigraph cleanup`.
B4. Recovered branch_merge records merge parent. `record_audit` always
called `append_commit`, dropping `merged_parent_commit_id`. Future
`branch_merge feature -> main` between the same pair lost
already-up-to-date detection. RecoverySidecar gets an optional
`merge_source_commit_id`; `branch_merge_on_current_target`
populates it from `source_head_commit_id`; `record_audit`
dispatches to `append_merge_commit` when present.
New tests: feature-branch sidecar classification (B2); B1 deepens the
existing schema_apply test with live-`_schema.pg` and new-type
assertions; B4 deepens the existing branch_merge test by reading
`_graph_commits.lance` and asserting a non-null `merged_parent_commit_id`.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 23:39:41 +02:00
|
|
|
|
|
|
|
|
// The recovered branch_merge must record a MERGE commit (with
|
|
|
|
|
// `merged_parent_commit_id` set), not a plain commit. Without
|
|
|
|
|
// this, future merges between the same pair lose
|
|
|
|
|
// already-up-to-date detection. We verify by reading
|
|
|
|
|
// `_graph_commits.lance` and asserting the most recent commit
|
|
|
|
|
// tagged with the recovery actor has a non-null
|
|
|
|
|
// `merged_parent_commit_id`.
|
|
|
|
|
{
|
|
|
|
|
use arrow_array::{Array, StringArray};
|
|
|
|
|
use futures::TryStreamExt;
|
|
|
|
|
let commits_dir = dir.path().join("_graph_commits.lance");
|
|
|
|
|
let ds = lance::Dataset::open(commits_dir.to_str().unwrap())
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
let batches: Vec<arrow_array::RecordBatch> = ds
|
|
|
|
|
.scan()
|
|
|
|
|
.try_into_stream()
|
|
|
|
|
.await
|
|
|
|
|
.unwrap()
|
|
|
|
|
.try_collect()
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
let mut found_recovery_merge = false;
|
|
|
|
|
for batch in batches {
|
|
|
|
|
let merged = batch
|
|
|
|
|
.column_by_name("merged_parent_commit_id")
|
|
|
|
|
.expect("merged_parent_commit_id column present")
|
|
|
|
|
.as_any()
|
|
|
|
|
.downcast_ref::<StringArray>()
|
|
|
|
|
.expect("merged_parent_commit_id is Utf8");
|
|
|
|
|
// The actor_id lives in _graph_commit_actors; cross-checking
|
|
|
|
|
// is heavier than necessary. Detecting any non-null
|
|
|
|
|
// merged_parent_commit_id in the post-recovery state is
|
|
|
|
|
// sufficient: only a recovered branch_merge can produce one
|
|
|
|
|
// here (we never completed a normal merge in this test).
|
|
|
|
|
for i in 0..merged.len() {
|
|
|
|
|
if !merged.is_null(i) {
|
|
|
|
|
found_recovery_merge = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
assert!(
|
|
|
|
|
found_recovery_merge,
|
|
|
|
|
"recovered branch_merge must record `merged_parent_commit_id` so future \
|
|
|
|
|
merges detect already-up-to-date — no merge-parent-tagged commit found",
|
|
|
|
|
);
|
|
|
|
|
}
|
2026-05-03 15:09:58 +02:00
|
|
|
drop(db);
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
}
|
|
|
|
|
|
2026-05-04 11:34:18 +02:00
|
|
|
/// Branch-axis variant of the branch_merge recovery test: target is a
|
|
|
|
|
/// non-main branch. Catches the branch-specific commit-graph head bug
|
|
|
|
|
/// (D2) — without `CommitGraph::open_at_branch`, the recovery sweep
|
|
|
|
|
/// would record the global head as the merge parent on a non-main
|
|
|
|
|
/// target, and future merges between the same pair would lose
|
|
|
|
|
/// already-up-to-date detection.
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn branch_merge_phase_b_failure_recovered_on_non_main_target() {
|
|
|
|
|
use omnigraph::loader::{LoadMode, load_jsonl};
|
|
|
|
|
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
2026-05-05 16:04:48 +02:00
|
|
|
let operation_id;
|
|
|
|
|
let target_parent_commit_id;
|
2026-05-04 11:34:18 +02:00
|
|
|
|
|
|
|
|
// Setup:
|
|
|
|
|
// main: alice
|
|
|
|
|
// target_branch (off main): + bob (target moved past base)
|
|
|
|
|
// source_branch (off main): + carol (source moved past base)
|
|
|
|
|
// Merge: source_branch → target_branch
|
|
|
|
|
{
|
|
|
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
|
load_jsonl(
|
|
|
|
|
&mut db,
|
|
|
|
|
r#"{"type":"Person","data":{"name":"alice","age":30}}
|
|
|
|
|
"#,
|
|
|
|
|
LoadMode::Append,
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
db.branch_create("target_branch").await.unwrap();
|
|
|
|
|
db.mutate(
|
|
|
|
|
"target_branch",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Bob")], &[("$age", 40)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
db.branch_create("source_branch").await.unwrap();
|
|
|
|
|
db.mutate(
|
|
|
|
|
"source_branch",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Carol")], &[("$age", 50)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-05 16:04:48 +02:00
|
|
|
let main_person_pin = {
|
|
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
db.snapshot_of(omnigraph::db::ReadTarget::branch("main"))
|
recovery: refresh reloads schema after staging recovery; non-main merge test pins parent_commit_id
E1. After D3 added recover_schema_state_files to refresh(), the
in-memory `self.schema_source` and `self.catalog` were left stale:
a SchemaApply sidecar processed via refresh would rename the
staging files (`_schema.pg`, IR contract) into place but the
handle continued operating against the old catalog. Subsequent
operations would surface schema mismatches against post-migration
data on disk.
Fix: after recover_manifest_drift completes, refresh() now mirrors
open_with_storage_and_mode's schema-load sequence — re-reads
`_schema.pg`, parses IR via load_or_bootstrap_schema_contract,
rebuilds the catalog with fixup_blob_schemas, and assigns into
self.schema_source / self.catalog. Steady-state cost: one read +
one parse per refresh; only mutates handle state when the on-disk
schema actually changed.
E2. The non-main branch_merge recovery test
(`branch_merge_phase_b_failure_recovered_on_non_main_target`)
asserted only `merged_parent_commit_id` was non-null — but
`merged_parent_commit_id` is independently populated from
sidecar.merge_source_commit_id (the SOURCE branch's tip), so the
assertion would pass even if D2's per-branch CommitGraph fix
regressed (the bug was about `parent_commit_id`, the TARGET
branch's tip).
Fix: capture target_branch's commit-graph head BEFORE the failed
merge by scanning target_branch's Lance ref on _graph_commits.lance
and picking the latest commit by created_at. After recovery, find
the recovery merge commit (the one with non-null
merged_parent_commit_id) and assert its `parent_commit_id` ==
captured pre-failure head. Without D2, recovery would record the
GLOBAL head (the source_branch's insert-Carol commit on this test)
instead, and the assertion fails.
Also fixes the column-type cast: created_at is stored as
TimestampMicrosecondArray, not Int64Array.
All workspace tests pass with --features failpoints.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 12:06:17 +02:00
|
|
|
.await
|
|
|
|
|
.unwrap()
|
2026-05-05 16:04:48 +02:00
|
|
|
.entry("node:Person")
|
|
|
|
|
.expect("main must have Person")
|
|
|
|
|
.table_version
|
recovery: refresh reloads schema after staging recovery; non-main merge test pins parent_commit_id
E1. After D3 added recover_schema_state_files to refresh(), the
in-memory `self.schema_source` and `self.catalog` were left stale:
a SchemaApply sidecar processed via refresh would rename the
staging files (`_schema.pg`, IR contract) into place but the
handle continued operating against the old catalog. Subsequent
operations would surface schema mismatches against post-migration
data on disk.
Fix: after recover_manifest_drift completes, refresh() now mirrors
open_with_storage_and_mode's schema-load sequence — re-reads
`_schema.pg`, parses IR via load_or_bootstrap_schema_contract,
rebuilds the catalog with fixup_blob_schemas, and assigns into
self.schema_source / self.catalog. Steady-state cost: one read +
one parse per refresh; only mutates handle state when the on-disk
schema actually changed.
E2. The non-main branch_merge recovery test
(`branch_merge_phase_b_failure_recovered_on_non_main_target`)
asserted only `merged_parent_commit_id` was non-null — but
`merged_parent_commit_id` is independently populated from
sidecar.merge_source_commit_id (the SOURCE branch's tip), so the
assertion would pass even if D2's per-branch CommitGraph fix
regressed (the bug was about `parent_commit_id`, the TARGET
branch's tip).
Fix: capture target_branch's commit-graph head BEFORE the failed
merge by scanning target_branch's Lance ref on _graph_commits.lance
and picking the latest commit by created_at. After recovery, find
the recovery merge commit (the one with non-null
merged_parent_commit_id) and assert its `parent_commit_id` ==
captured pre-failure head. Without D2, recovery would record the
GLOBAL head (the source_branch's insert-Carol commit on this test)
instead, and the assertion fails.
Also fixes the column-type cast: created_at is stored as
TimestampMicrosecondArray, not Int64Array.
All workspace tests pass with --features failpoints.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 12:06:17 +02:00
|
|
|
};
|
2026-05-05 16:04:48 +02:00
|
|
|
target_parent_commit_id = branch_head_commit_id(dir.path(), "target_branch")
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
recovery: refresh reloads schema after staging recovery; non-main merge test pins parent_commit_id
E1. After D3 added recover_schema_state_files to refresh(), the
in-memory `self.schema_source` and `self.catalog` were left stale:
a SchemaApply sidecar processed via refresh would rename the
staging files (`_schema.pg`, IR contract) into place but the
handle continued operating against the old catalog. Subsequent
operations would surface schema mismatches against post-migration
data on disk.
Fix: after recover_manifest_drift completes, refresh() now mirrors
open_with_storage_and_mode's schema-load sequence — re-reads
`_schema.pg`, parses IR via load_or_bootstrap_schema_contract,
rebuilds the catalog with fixup_blob_schemas, and assigns into
self.schema_source / self.catalog. Steady-state cost: one read +
one parse per refresh; only mutates handle state when the on-disk
schema actually changed.
E2. The non-main branch_merge recovery test
(`branch_merge_phase_b_failure_recovered_on_non_main_target`)
asserted only `merged_parent_commit_id` was non-null — but
`merged_parent_commit_id` is independently populated from
sidecar.merge_source_commit_id (the SOURCE branch's tip), so the
assertion would pass even if D2's per-branch CommitGraph fix
regressed (the bug was about `parent_commit_id`, the TARGET
branch's tip).
Fix: capture target_branch's commit-graph head BEFORE the failed
merge by scanning target_branch's Lance ref on _graph_commits.lance
and picking the latest commit by created_at. After recovery, find
the recovery merge commit (the one with non-null
merged_parent_commit_id) and assert its `parent_commit_id` ==
captured pre-failure head. Without D2, recovery would record the
GLOBAL head (the source_branch's insert-Carol commit on this test)
instead, and the assertion fails.
Also fixes the column-type cast: created_at is stored as
TimestampMicrosecondArray, not Int64Array.
All workspace tests pass with --features failpoints.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 12:06:17 +02:00
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
// Setup: failpoint fires after the per-table publish loop completes
|
2026-05-04 11:34:18 +02:00
|
|
|
// but before commit_manifest_updates. Sidecar persists with
|
|
|
|
|
// branch=Some("target_branch").
|
|
|
|
|
{
|
2026-05-08 16:26:23 +02:00
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
2026-05-04 11:34:18 +02:00
|
|
|
let _failpoint =
|
|
|
|
|
ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return");
|
|
|
|
|
let err = db
|
|
|
|
|
.branch_merge("source_branch", "target_branch")
|
|
|
|
|
.await
|
|
|
|
|
.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string().contains(
|
|
|
|
|
"injected failpoint triggered: branch_merge.post_phase_b_pre_manifest_commit"
|
|
|
|
|
),
|
|
|
|
|
"unexpected error: {err}"
|
|
|
|
|
);
|
|
|
|
|
let recovery_dir = dir.path().join("__recovery");
|
|
|
|
|
let sidecar_count = std::fs::read_dir(&recovery_dir).unwrap().count();
|
|
|
|
|
assert_eq!(
|
2026-05-05 16:04:48 +02:00
|
|
|
sidecar_count, 1,
|
2026-05-04 11:34:18 +02:00
|
|
|
"exactly one sidecar must persist after non-main branch_merge failure"
|
|
|
|
|
);
|
2026-05-05 16:04:48 +02:00
|
|
|
operation_id = single_sidecar_operation_id(dir.path());
|
2026-05-04 11:34:18 +02:00
|
|
|
}
|
|
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
// Recovery: reopen runs full sweep. The BranchMerge sidecar's branch
|
2026-05-04 11:34:18 +02:00
|
|
|
// = Some("target_branch"); D2 fix opens a per-branch CommitGraph
|
|
|
|
|
// for the audit append so the merge-parent linkage is correct.
|
2026-05-05 16:04:48 +02:00
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
drop(db);
|
2026-05-04 11:34:18 +02:00
|
|
|
|
2026-05-05 16:04:48 +02:00
|
|
|
assert_post_recovery_invariants(
|
|
|
|
|
dir.path(),
|
|
|
|
|
&operation_id,
|
|
|
|
|
RecoveryExpectation::RolledForward {
|
|
|
|
|
tables: vec![
|
|
|
|
|
TableExpectation::branch("node:Person", "target_branch")
|
|
|
|
|
.expected_main_manifest_pin(main_person_pin)
|
|
|
|
|
.expected_recovery_parent_commit_id(target_parent_commit_id),
|
|
|
|
|
],
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
2026-05-04 11:34:18 +02:00
|
|
|
}
|
|
|
|
|
|
2026-05-05 19:33:32 +02:00
|
|
|
/// Contract: the BranchMerge sidecar's per-table `table_branch` MUST be
|
|
|
|
|
/// the merge target branch (where commits land via
|
|
|
|
|
/// `publish_rewritten_merge_table` → `open_for_mutation` → potentially
|
|
|
|
|
/// `fork_dataset_from_entry_state`), NOT `entry.table_branch` (where
|
|
|
|
|
/// the table currently lives in the target's manifest snapshot).
|
|
|
|
|
///
|
|
|
|
|
/// `ensure_indices_for_branch` already has this invariant pinned by an
|
|
|
|
|
/// explicit comment at `table_ops.rs:115-120`. Without the same fix in
|
|
|
|
|
/// `merge.rs`, a future change to candidate selection or the publish
|
|
|
|
|
/// path that produces a `RewriteMerged` whose entry.table_branch
|
|
|
|
|
/// diverges from active_branch would silently drift Lance HEAD on the
|
|
|
|
|
/// target ref while recovery checks the wrong ref and no-ops the
|
|
|
|
|
/// rollback.
|
|
|
|
|
///
|
|
|
|
|
/// This test reads the sidecar JSON directly and asserts every per-pin
|
|
|
|
|
/// `table_branch` equals the active (target) branch. Even when the
|
|
|
|
|
/// values happen to coincide in practice (the strict candidate logic
|
|
|
|
|
/// keeps RewriteMerged tables on active_branch), the contract assertion
|
|
|
|
|
/// catches a regression that reverts to `entry.table_branch.clone()`.
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn branch_merge_sidecar_pins_table_branch_to_active_branch() {
|
|
|
|
|
use omnigraph::loader::{LoadMode, load_jsonl};
|
|
|
|
|
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
|
load_jsonl(
|
|
|
|
|
&mut db,
|
|
|
|
|
r#"{"type":"Person","data":{"name":"alice","age":30}}
|
|
|
|
|
"#,
|
|
|
|
|
LoadMode::Append,
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
db.branch_create("target_branch").await.unwrap();
|
|
|
|
|
db.mutate(
|
|
|
|
|
"target_branch",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Bob")], &[("$age", 40)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
db.branch_create("source_branch").await.unwrap();
|
|
|
|
|
db.mutate(
|
|
|
|
|
"source_branch",
|
|
|
|
|
MUTATION_QUERIES,
|
|
|
|
|
"insert_person",
|
|
|
|
|
&mixed_params(&[("$name", "Carol")], &[("$age", 50)]),
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
{
|
2026-05-08 16:26:23 +02:00
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
2026-05-05 19:33:32 +02:00
|
|
|
let _failpoint =
|
|
|
|
|
ScopedFailPoint::new("branch_merge.post_phase_b_pre_manifest_commit", "return");
|
|
|
|
|
let _ = db
|
|
|
|
|
.branch_merge("source_branch", "target_branch")
|
|
|
|
|
.await
|
|
|
|
|
.expect_err("failpoint must fire");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let operation_id = single_sidecar_operation_id(dir.path());
|
|
|
|
|
let sidecar_path = dir
|
|
|
|
|
.path()
|
|
|
|
|
.join("__recovery")
|
|
|
|
|
.join(format!("{operation_id}.json"));
|
|
|
|
|
let sidecar_json = std::fs::read_to_string(&sidecar_path).unwrap();
|
|
|
|
|
let sidecar: serde_json::Value = serde_json::from_str(&sidecar_json).unwrap();
|
|
|
|
|
|
|
|
|
|
let tables = sidecar["tables"]
|
|
|
|
|
.as_array()
|
|
|
|
|
.expect("sidecar tables must be an array");
|
|
|
|
|
assert!(
|
|
|
|
|
!tables.is_empty(),
|
|
|
|
|
"sidecar must pin at least one RewriteMerged table — both branches mutated Person"
|
|
|
|
|
);
|
|
|
|
|
for pin in tables {
|
|
|
|
|
let table_branch = pin
|
|
|
|
|
.get("table_branch")
|
|
|
|
|
.and_then(|v| v.as_str())
|
|
|
|
|
.unwrap_or_else(|| {
|
|
|
|
|
panic!(
|
|
|
|
|
"sidecar pin must record table_branch as the merge target (active_branch); \
|
|
|
|
|
got pin {pin:?}"
|
|
|
|
|
)
|
|
|
|
|
});
|
|
|
|
|
assert_eq!(
|
|
|
|
|
table_branch, "target_branch",
|
|
|
|
|
"sidecar pin must record `table_branch` as the merge target branch (where \
|
|
|
|
|
commits actually land via publish_rewritten_merge_table → open_for_mutation), \
|
|
|
|
|
NOT entry.table_branch from the target snapshot. See merge.rs filter_map and \
|
|
|
|
|
the rationale comment at table_ops.rs:115-120. Got pin: {pin:?}"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-03 13:56:36 +02:00
|
|
|
/// `ensure_indices` only writes a sidecar when at least one table
|
|
|
|
|
/// genuinely needs index work (per `needs_index_work_*` helpers in
|
|
|
|
|
/// `db/omnigraph/table_ops.rs`). When all tables are steady-state
|
|
|
|
|
/// (every declared index already built, or empty tables that the loop
|
|
|
|
|
/// skips), the sidecar is omitted entirely.
|
recovery: address PR #72 round-2 review findings
Bot reviewers (cubic + cursor) flagged 5 follow-on issues after the
first fix push. Three are real bugs in the Phase 6-8 ensure_indices
sidecar wiring; two are AI-slop flags on shallow tests. One cursor
finding is a false positive on intentional node/edge index asymmetry.
Real bugs fixed:
- needs_index_work_node and needs_index_work_edge now skip empty
tables (count_rows == 0). The ensure_indices_for_branch loop has
`if row_count > 0 { build_indices(...) }`, so empty tables produce
zero commit_staged calls. Pinning them in the sidecar would force
NoMovement classification on recovery and trigger the all-or-nothing
rollback of any sibling table's legitimate index work (cubic #1).
- needs_index_work_node and needs_index_work_edge now respect the
table_branch parameter from the snapshot entry, instead of always
passing None (== main). For branch writes, opening the wrong HEAD
could miss recoverable Phase B commits (cubic #2).
- needs_index_work_edge documented as intentionally BTree-only (mirrors
the build_indices_on_dataset_for_catalog edge branch which only
builds id/src/dst BTrees). Cursor flagged FTS/vector omission as
inconsistency with the node helper; confirmed intentional via
inline comment so future readers know the asymmetry is on purpose
(cursor finding, false positive marked).
Test improvements:
- recovery_multi_sidecar_requires_fresh_snapshot_for_correctness — new
integration test that uses TWO sidecars on the SAME table where
sidecar B's expected_version equals sidecar A's post_commit_pin.
Sidecar B's classification only succeeds if the recovery sweep
refreshes the snapshot between iterations to see A's manifest
update. Without the refresh fix from the prior commit, B would be
classified against stale pins (cubic #4 follow-up).
- recovery_ensure_indices_handles_empty_tables — new integration test
that runs ensure_indices on an all-empty repo. With the round-2 fix,
both initial and steady-state runs leave no sidecar (zero pins ⇒
zero sidecar I/O). Without the empty-table fix, the sidecar would
pin Company (zero rows but missing indices) and force a NoMovement
rollback (cubic #1 verification).
- ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_needed —
renamed/rewrote the prior `_recovered_on_next_open` test to assert
the post-fix invariant: when load_jsonl auto-built every catalog
index via prepare_updates_for_commit, ensure_indices's needs_work
helpers correctly report zero pins and produce no sidecar. The old
assertion ("exactly one sidecar must persist") was wrong for the
scoped behavior.
Test surface (post-round-2):
- 25 unit tests in db::manifest::recovery (BranchMerge classifier,
sort order, primitives — unchanged).
- 12 integration tests in tests/recovery.rs (+2 from this commit).
- 11 failpoint tests including the four per-writer Phase B → recovery
tests (one renamed to reflect the scoped behavior).
- ~672 workspace tests pass with --features failpoints.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 12:50:33 +02:00
|
|
|
///
|
|
|
|
|
/// Test setup: `load_jsonl` auto-builds indices via
|
|
|
|
|
/// `prepare_updates_for_commit`. So after the load, every Person/Knows
|
|
|
|
|
/// index is built and Company is empty. `ensure_indices` correctly
|
|
|
|
|
/// produces zero pins → no sidecar. The failpoint still fires (it sits
|
|
|
|
|
/// after the loops), so the call returns Err — but no recovery state
|
|
|
|
|
/// persists. Reopen is a clean no-op.
|
|
|
|
|
///
|
2026-05-03 13:56:36 +02:00
|
|
|
/// Triggering an actual sidecar persistence requires bypassing
|
recovery: address PR #72 round-2 review findings
Bot reviewers (cubic + cursor) flagged 5 follow-on issues after the
first fix push. Three are real bugs in the Phase 6-8 ensure_indices
sidecar wiring; two are AI-slop flags on shallow tests. One cursor
finding is a false positive on intentional node/edge index asymmetry.
Real bugs fixed:
- needs_index_work_node and needs_index_work_edge now skip empty
tables (count_rows == 0). The ensure_indices_for_branch loop has
`if row_count > 0 { build_indices(...) }`, so empty tables produce
zero commit_staged calls. Pinning them in the sidecar would force
NoMovement classification on recovery and trigger the all-or-nothing
rollback of any sibling table's legitimate index work (cubic #1).
- needs_index_work_node and needs_index_work_edge now respect the
table_branch parameter from the snapshot entry, instead of always
passing None (== main). For branch writes, opening the wrong HEAD
could miss recoverable Phase B commits (cubic #2).
- needs_index_work_edge documented as intentionally BTree-only (mirrors
the build_indices_on_dataset_for_catalog edge branch which only
builds id/src/dst BTrees). Cursor flagged FTS/vector omission as
inconsistency with the node helper; confirmed intentional via
inline comment so future readers know the asymmetry is on purpose
(cursor finding, false positive marked).
Test improvements:
- recovery_multi_sidecar_requires_fresh_snapshot_for_correctness — new
integration test that uses TWO sidecars on the SAME table where
sidecar B's expected_version equals sidecar A's post_commit_pin.
Sidecar B's classification only succeeds if the recovery sweep
refreshes the snapshot between iterations to see A's manifest
update. Without the refresh fix from the prior commit, B would be
classified against stale pins (cubic #4 follow-up).
- recovery_ensure_indices_handles_empty_tables — new integration test
that runs ensure_indices on an all-empty repo. With the round-2 fix,
both initial and steady-state runs leave no sidecar (zero pins ⇒
zero sidecar I/O). Without the empty-table fix, the sidecar would
pin Company (zero rows but missing indices) and force a NoMovement
rollback (cubic #1 verification).
- ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_needed —
renamed/rewrote the prior `_recovered_on_next_open` test to assert
the post-fix invariant: when load_jsonl auto-built every catalog
index via prepare_updates_for_commit, ensure_indices's needs_work
helpers correctly report zero pins and produce no sidecar. The old
assertion ("exactly one sidecar must persist") was wrong for the
scoped behavior.
Test surface (post-round-2):
- 25 unit tests in db::manifest::recovery (BranchMerge classifier,
sort order, primitives — unchanged).
- 12 integration tests in tests/recovery.rs (+2 from this commit).
- 11 failpoint tests including the four per-writer Phase B → recovery
tests (one renamed to reflect the scoped behavior).
- ~672 workspace tests pass with --features failpoints.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 12:50:33 +02:00
|
|
|
/// `load_jsonl`'s auto-build via raw `TableStore::append_batch` — the
|
|
|
|
|
/// helper-direct path. That's covered structurally by the
|
2026-05-03 13:56:36 +02:00
|
|
|
/// `needs_index_work_*` code path and the
|
|
|
|
|
/// `recovery_ensure_indices_handles_empty_tables` integration test.
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
#[tokio::test]
|
recovery: address PR #72 round-2 review findings
Bot reviewers (cubic + cursor) flagged 5 follow-on issues after the
first fix push. Three are real bugs in the Phase 6-8 ensure_indices
sidecar wiring; two are AI-slop flags on shallow tests. One cursor
finding is a false positive on intentional node/edge index asymmetry.
Real bugs fixed:
- needs_index_work_node and needs_index_work_edge now skip empty
tables (count_rows == 0). The ensure_indices_for_branch loop has
`if row_count > 0 { build_indices(...) }`, so empty tables produce
zero commit_staged calls. Pinning them in the sidecar would force
NoMovement classification on recovery and trigger the all-or-nothing
rollback of any sibling table's legitimate index work (cubic #1).
- needs_index_work_node and needs_index_work_edge now respect the
table_branch parameter from the snapshot entry, instead of always
passing None (== main). For branch writes, opening the wrong HEAD
could miss recoverable Phase B commits (cubic #2).
- needs_index_work_edge documented as intentionally BTree-only (mirrors
the build_indices_on_dataset_for_catalog edge branch which only
builds id/src/dst BTrees). Cursor flagged FTS/vector omission as
inconsistency with the node helper; confirmed intentional via
inline comment so future readers know the asymmetry is on purpose
(cursor finding, false positive marked).
Test improvements:
- recovery_multi_sidecar_requires_fresh_snapshot_for_correctness — new
integration test that uses TWO sidecars on the SAME table where
sidecar B's expected_version equals sidecar A's post_commit_pin.
Sidecar B's classification only succeeds if the recovery sweep
refreshes the snapshot between iterations to see A's manifest
update. Without the refresh fix from the prior commit, B would be
classified against stale pins (cubic #4 follow-up).
- recovery_ensure_indices_handles_empty_tables — new integration test
that runs ensure_indices on an all-empty repo. With the round-2 fix,
both initial and steady-state runs leave no sidecar (zero pins ⇒
zero sidecar I/O). Without the empty-table fix, the sidecar would
pin Company (zero rows but missing indices) and force a NoMovement
rollback (cubic #1 verification).
- ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_needed —
renamed/rewrote the prior `_recovered_on_next_open` test to assert
the post-fix invariant: when load_jsonl auto-built every catalog
index via prepare_updates_for_commit, ensure_indices's needs_work
helpers correctly report zero pins and produce no sidecar. The old
assertion ("exactly one sidecar must persist") was wrong for the
scoped behavior.
Test surface (post-round-2):
- 25 unit tests in db::manifest::recovery (BranchMerge classifier,
sort order, primitives — unchanged).
- 12 integration tests in tests/recovery.rs (+2 from this commit).
- 11 failpoint tests including the four per-writer Phase B → recovery
tests (one renamed to reflect the scoped behavior).
- ~672 workspace tests pass with --features failpoints.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 12:50:33 +02:00
|
|
|
async fn ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_needed() {
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
use omnigraph::loader::{LoadMode, load_jsonl};
|
|
|
|
|
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
|
|
recovery: address PR #72 round-2 review findings
Bot reviewers (cubic + cursor) flagged 5 follow-on issues after the
first fix push. Three are real bugs in the Phase 6-8 ensure_indices
sidecar wiring; two are AI-slop flags on shallow tests. One cursor
finding is a false positive on intentional node/edge index asymmetry.
Real bugs fixed:
- needs_index_work_node and needs_index_work_edge now skip empty
tables (count_rows == 0). The ensure_indices_for_branch loop has
`if row_count > 0 { build_indices(...) }`, so empty tables produce
zero commit_staged calls. Pinning them in the sidecar would force
NoMovement classification on recovery and trigger the all-or-nothing
rollback of any sibling table's legitimate index work (cubic #1).
- needs_index_work_node and needs_index_work_edge now respect the
table_branch parameter from the snapshot entry, instead of always
passing None (== main). For branch writes, opening the wrong HEAD
could miss recoverable Phase B commits (cubic #2).
- needs_index_work_edge documented as intentionally BTree-only (mirrors
the build_indices_on_dataset_for_catalog edge branch which only
builds id/src/dst BTrees). Cursor flagged FTS/vector omission as
inconsistency with the node helper; confirmed intentional via
inline comment so future readers know the asymmetry is on purpose
(cursor finding, false positive marked).
Test improvements:
- recovery_multi_sidecar_requires_fresh_snapshot_for_correctness — new
integration test that uses TWO sidecars on the SAME table where
sidecar B's expected_version equals sidecar A's post_commit_pin.
Sidecar B's classification only succeeds if the recovery sweep
refreshes the snapshot between iterations to see A's manifest
update. Without the refresh fix from the prior commit, B would be
classified against stale pins (cubic #4 follow-up).
- recovery_ensure_indices_handles_empty_tables — new integration test
that runs ensure_indices on an all-empty repo. With the round-2 fix,
both initial and steady-state runs leave no sidecar (zero pins ⇒
zero sidecar I/O). Without the empty-table fix, the sidecar would
pin Company (zero rows but missing indices) and force a NoMovement
rollback (cubic #1 verification).
- ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_needed —
renamed/rewrote the prior `_recovered_on_next_open` test to assert
the post-fix invariant: when load_jsonl auto-built every catalog
index via prepare_updates_for_commit, ensure_indices's needs_work
helpers correctly report zero pins and produce no sidecar. The old
assertion ("exactly one sidecar must persist") was wrong for the
scoped behavior.
Test surface (post-round-2):
- 25 unit tests in db::manifest::recovery (BranchMerge classifier,
sort order, primitives — unchanged).
- 12 integration tests in tests/recovery.rs (+2 from this commit).
- 11 failpoint tests including the four per-writer Phase B → recovery
tests (one renamed to reflect the scoped behavior).
- ~672 workspace tests pass with --features failpoints.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 12:50:33 +02:00
|
|
|
// Seed: load_jsonl auto-builds Person's indices via
|
|
|
|
|
// prepare_updates_for_commit. After this, ensure_indices has no
|
|
|
|
|
// work to do (steady state).
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
{
|
|
|
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
|
load_jsonl(
|
|
|
|
|
&mut db,
|
|
|
|
|
r#"{"type":"Person","data":{"name":"alice","age":30}}
|
|
|
|
|
{"type":"Person","data":{"name":"bob","age":25}}
|
|
|
|
|
"#,
|
|
|
|
|
LoadMode::Append,
|
|
|
|
|
)
|
|
|
|
|
.await
|
|
|
|
|
.unwrap();
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
// Setup: trigger the failpoint. Steady-state ensure_indices
|
2026-05-03 13:56:36 +02:00
|
|
|
// produces zero sidecar pins (the helpers scope pins to tables
|
|
|
|
|
// that genuinely need work); no sidecar is written. The failpoint
|
|
|
|
|
// still fires, surfacing the Err.
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
{
|
2026-05-08 16:26:23 +02:00
|
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
2026-05-05 16:04:48 +02:00
|
|
|
let _failpoint =
|
|
|
|
|
ScopedFailPoint::new("ensure_indices.post_phase_b_pre_manifest_commit", "return");
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
let err = db.ensure_indices().await.unwrap_err();
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string().contains(
|
|
|
|
|
"injected failpoint triggered: ensure_indices.post_phase_b_pre_manifest_commit"
|
|
|
|
|
),
|
|
|
|
|
"unexpected error: {err}"
|
|
|
|
|
);
|
|
|
|
|
|
2026-05-03 13:56:36 +02:00
|
|
|
// KEY ASSERTION: no sidecar persists, because the helpers
|
|
|
|
|
// scope pins to tables that genuinely need work. Steady-state
|
recovery: address PR #72 round-2 review findings
Bot reviewers (cubic + cursor) flagged 5 follow-on issues after the
first fix push. Three are real bugs in the Phase 6-8 ensure_indices
sidecar wiring; two are AI-slop flags on shallow tests. One cursor
finding is a false positive on intentional node/edge index asymmetry.
Real bugs fixed:
- needs_index_work_node and needs_index_work_edge now skip empty
tables (count_rows == 0). The ensure_indices_for_branch loop has
`if row_count > 0 { build_indices(...) }`, so empty tables produce
zero commit_staged calls. Pinning them in the sidecar would force
NoMovement classification on recovery and trigger the all-or-nothing
rollback of any sibling table's legitimate index work (cubic #1).
- needs_index_work_node and needs_index_work_edge now respect the
table_branch parameter from the snapshot entry, instead of always
passing None (== main). For branch writes, opening the wrong HEAD
could miss recoverable Phase B commits (cubic #2).
- needs_index_work_edge documented as intentionally BTree-only (mirrors
the build_indices_on_dataset_for_catalog edge branch which only
builds id/src/dst BTrees). Cursor flagged FTS/vector omission as
inconsistency with the node helper; confirmed intentional via
inline comment so future readers know the asymmetry is on purpose
(cursor finding, false positive marked).
Test improvements:
- recovery_multi_sidecar_requires_fresh_snapshot_for_correctness — new
integration test that uses TWO sidecars on the SAME table where
sidecar B's expected_version equals sidecar A's post_commit_pin.
Sidecar B's classification only succeeds if the recovery sweep
refreshes the snapshot between iterations to see A's manifest
update. Without the refresh fix from the prior commit, B would be
classified against stale pins (cubic #4 follow-up).
- recovery_ensure_indices_handles_empty_tables — new integration test
that runs ensure_indices on an all-empty repo. With the round-2 fix,
both initial and steady-state runs leave no sidecar (zero pins ⇒
zero sidecar I/O). Without the empty-table fix, the sidecar would
pin Company (zero rows but missing indices) and force a NoMovement
rollback (cubic #1 verification).
- ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_needed —
renamed/rewrote the prior `_recovered_on_next_open` test to assert
the post-fix invariant: when load_jsonl auto-built every catalog
index via prepare_updates_for_commit, ensure_indices's needs_work
helpers correctly report zero pins and produce no sidecar. The old
assertion ("exactly one sidecar must persist") was wrong for the
scoped behavior.
Test surface (post-round-2):
- 25 unit tests in db::manifest::recovery (BranchMerge classifier,
sort order, primitives — unchanged).
- 12 integration tests in tests/recovery.rs (+2 from this commit).
- 11 failpoint tests including the four per-writer Phase B → recovery
tests (one renamed to reflect the scoped behavior).
- ~672 workspace tests pass with --features failpoints.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 12:50:33 +02:00
|
|
|
// = no pins = no sidecar = no recovery state = zero open-time
|
|
|
|
|
// overhead.
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
let recovery_dir = dir.path().join("__recovery");
|
recovery: address PR #72 round-2 review findings
Bot reviewers (cubic + cursor) flagged 5 follow-on issues after the
first fix push. Three are real bugs in the Phase 6-8 ensure_indices
sidecar wiring; two are AI-slop flags on shallow tests. One cursor
finding is a false positive on intentional node/edge index asymmetry.
Real bugs fixed:
- needs_index_work_node and needs_index_work_edge now skip empty
tables (count_rows == 0). The ensure_indices_for_branch loop has
`if row_count > 0 { build_indices(...) }`, so empty tables produce
zero commit_staged calls. Pinning them in the sidecar would force
NoMovement classification on recovery and trigger the all-or-nothing
rollback of any sibling table's legitimate index work (cubic #1).
- needs_index_work_node and needs_index_work_edge now respect the
table_branch parameter from the snapshot entry, instead of always
passing None (== main). For branch writes, opening the wrong HEAD
could miss recoverable Phase B commits (cubic #2).
- needs_index_work_edge documented as intentionally BTree-only (mirrors
the build_indices_on_dataset_for_catalog edge branch which only
builds id/src/dst BTrees). Cursor flagged FTS/vector omission as
inconsistency with the node helper; confirmed intentional via
inline comment so future readers know the asymmetry is on purpose
(cursor finding, false positive marked).
Test improvements:
- recovery_multi_sidecar_requires_fresh_snapshot_for_correctness — new
integration test that uses TWO sidecars on the SAME table where
sidecar B's expected_version equals sidecar A's post_commit_pin.
Sidecar B's classification only succeeds if the recovery sweep
refreshes the snapshot between iterations to see A's manifest
update. Without the refresh fix from the prior commit, B would be
classified against stale pins (cubic #4 follow-up).
- recovery_ensure_indices_handles_empty_tables — new integration test
that runs ensure_indices on an all-empty repo. With the round-2 fix,
both initial and steady-state runs leave no sidecar (zero pins ⇒
zero sidecar I/O). Without the empty-table fix, the sidecar would
pin Company (zero rows but missing indices) and force a NoMovement
rollback (cubic #1 verification).
- ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_needed —
renamed/rewrote the prior `_recovered_on_next_open` test to assert
the post-fix invariant: when load_jsonl auto-built every catalog
index via prepare_updates_for_commit, ensure_indices's needs_work
helpers correctly report zero pins and produce no sidecar. The old
assertion ("exactly one sidecar must persist") was wrong for the
scoped behavior.
Test surface (post-round-2):
- 25 unit tests in db::manifest::recovery (BranchMerge classifier,
sort order, primitives — unchanged).
- 12 integration tests in tests/recovery.rs (+2 from this commit).
- 11 failpoint tests including the four per-writer Phase B → recovery
tests (one renamed to reflect the scoped behavior).
- ~672 workspace tests pass with --features failpoints.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 12:50:33 +02:00
|
|
|
let sidecars: Vec<_> = if recovery_dir.exists() {
|
|
|
|
|
std::fs::read_dir(&recovery_dir)
|
|
|
|
|
.unwrap()
|
|
|
|
|
.filter_map(|e| e.ok())
|
|
|
|
|
.collect()
|
|
|
|
|
} else {
|
|
|
|
|
Vec::new()
|
|
|
|
|
};
|
|
|
|
|
assert!(
|
|
|
|
|
sidecars.is_empty(),
|
|
|
|
|
"steady-state ensure_indices must not leave a sidecar; got {:?}",
|
|
|
|
|
sidecars,
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2026-05-05 22:46:03 +02:00
|
|
|
// Recovery: reopen is a clean no-op (no sidecar to recover).
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
let _db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
|
|
|
|
|
|
let recovery_dir = dir.path().join("__recovery");
|
|
|
|
|
if recovery_dir.exists() {
|
|
|
|
|
let remaining: Vec<_> = std::fs::read_dir(&recovery_dir)
|
|
|
|
|
.unwrap()
|
|
|
|
|
.filter_map(|e| e.ok())
|
|
|
|
|
.collect();
|
|
|
|
|
assert!(
|
|
|
|
|
remaining.is_empty(),
|
recovery: address PR #72 round-2 review findings
Bot reviewers (cubic + cursor) flagged 5 follow-on issues after the
first fix push. Three are real bugs in the Phase 6-8 ensure_indices
sidecar wiring; two are AI-slop flags on shallow tests. One cursor
finding is a false positive on intentional node/edge index asymmetry.
Real bugs fixed:
- needs_index_work_node and needs_index_work_edge now skip empty
tables (count_rows == 0). The ensure_indices_for_branch loop has
`if row_count > 0 { build_indices(...) }`, so empty tables produce
zero commit_staged calls. Pinning them in the sidecar would force
NoMovement classification on recovery and trigger the all-or-nothing
rollback of any sibling table's legitimate index work (cubic #1).
- needs_index_work_node and needs_index_work_edge now respect the
table_branch parameter from the snapshot entry, instead of always
passing None (== main). For branch writes, opening the wrong HEAD
could miss recoverable Phase B commits (cubic #2).
- needs_index_work_edge documented as intentionally BTree-only (mirrors
the build_indices_on_dataset_for_catalog edge branch which only
builds id/src/dst BTrees). Cursor flagged FTS/vector omission as
inconsistency with the node helper; confirmed intentional via
inline comment so future readers know the asymmetry is on purpose
(cursor finding, false positive marked).
Test improvements:
- recovery_multi_sidecar_requires_fresh_snapshot_for_correctness — new
integration test that uses TWO sidecars on the SAME table where
sidecar B's expected_version equals sidecar A's post_commit_pin.
Sidecar B's classification only succeeds if the recovery sweep
refreshes the snapshot between iterations to see A's manifest
update. Without the refresh fix from the prior commit, B would be
classified against stale pins (cubic #4 follow-up).
- recovery_ensure_indices_handles_empty_tables — new integration test
that runs ensure_indices on an all-empty repo. With the round-2 fix,
both initial and steady-state runs leave no sidecar (zero pins ⇒
zero sidecar I/O). Without the empty-table fix, the sidecar would
pin Company (zero rows but missing indices) and force a NoMovement
rollback (cubic #1 verification).
- ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_needed —
renamed/rewrote the prior `_recovered_on_next_open` test to assert
the post-fix invariant: when load_jsonl auto-built every catalog
index via prepare_updates_for_commit, ensure_indices's needs_work
helpers correctly report zero pins and produce no sidecar. The old
assertion ("exactly one sidecar must persist") was wrong for the
scoped behavior.
Test surface (post-round-2):
- 25 unit tests in db::manifest::recovery (BranchMerge classifier,
sort order, primitives — unchanged).
- 12 integration tests in tests/recovery.rs (+2 from this commit).
- 11 failpoint tests including the four per-writer Phase B → recovery
tests (one renamed to reflect the scoped behavior).
- ~672 workspace tests pass with --features failpoints.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 12:50:33 +02:00
|
|
|
"sidecar must remain deleted; remaining: {:?}",
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
remaining,
|
|
|
|
|
);
|
|
|
|
|
}
|
recovery: address PR #72 round-2 review findings
Bot reviewers (cubic + cursor) flagged 5 follow-on issues after the
first fix push. Three are real bugs in the Phase 6-8 ensure_indices
sidecar wiring; two are AI-slop flags on shallow tests. One cursor
finding is a false positive on intentional node/edge index asymmetry.
Real bugs fixed:
- needs_index_work_node and needs_index_work_edge now skip empty
tables (count_rows == 0). The ensure_indices_for_branch loop has
`if row_count > 0 { build_indices(...) }`, so empty tables produce
zero commit_staged calls. Pinning them in the sidecar would force
NoMovement classification on recovery and trigger the all-or-nothing
rollback of any sibling table's legitimate index work (cubic #1).
- needs_index_work_node and needs_index_work_edge now respect the
table_branch parameter from the snapshot entry, instead of always
passing None (== main). For branch writes, opening the wrong HEAD
could miss recoverable Phase B commits (cubic #2).
- needs_index_work_edge documented as intentionally BTree-only (mirrors
the build_indices_on_dataset_for_catalog edge branch which only
builds id/src/dst BTrees). Cursor flagged FTS/vector omission as
inconsistency with the node helper; confirmed intentional via
inline comment so future readers know the asymmetry is on purpose
(cursor finding, false positive marked).
Test improvements:
- recovery_multi_sidecar_requires_fresh_snapshot_for_correctness — new
integration test that uses TWO sidecars on the SAME table where
sidecar B's expected_version equals sidecar A's post_commit_pin.
Sidecar B's classification only succeeds if the recovery sweep
refreshes the snapshot between iterations to see A's manifest
update. Without the refresh fix from the prior commit, B would be
classified against stale pins (cubic #4 follow-up).
- recovery_ensure_indices_handles_empty_tables — new integration test
that runs ensure_indices on an all-empty repo. With the round-2 fix,
both initial and steady-state runs leave no sidecar (zero pins ⇒
zero sidecar I/O). Without the empty-table fix, the sidecar would
pin Company (zero rows but missing indices) and force a NoMovement
rollback (cubic #1 verification).
- ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_needed —
renamed/rewrote the prior `_recovered_on_next_open` test to assert
the post-fix invariant: when load_jsonl auto-built every catalog
index via prepare_updates_for_commit, ensure_indices's needs_work
helpers correctly report zero pins and produce no sidecar. The old
assertion ("exactly one sidecar must persist") was wrong for the
scoped behavior.
Test surface (post-round-2):
- 25 unit tests in db::manifest::recovery (BranchMerge classifier,
sort order, primitives — unchanged).
- 12 integration tests in tests/recovery.rs (+2 from this commit).
- 11 failpoint tests including the four per-writer Phase B → recovery
tests (one renamed to reflect the scoped behavior).
- ~672 workspace tests pass with --features failpoints.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 12:50:33 +02:00
|
|
|
// No audit row expected — no sidecar was processed.
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
let audit_dir = dir.path().join("_graph_commit_recoveries.lance");
|
|
|
|
|
assert!(
|
recovery: address PR #72 round-2 review findings
Bot reviewers (cubic + cursor) flagged 5 follow-on issues after the
first fix push. Three are real bugs in the Phase 6-8 ensure_indices
sidecar wiring; two are AI-slop flags on shallow tests. One cursor
finding is a false positive on intentional node/edge index asymmetry.
Real bugs fixed:
- needs_index_work_node and needs_index_work_edge now skip empty
tables (count_rows == 0). The ensure_indices_for_branch loop has
`if row_count > 0 { build_indices(...) }`, so empty tables produce
zero commit_staged calls. Pinning them in the sidecar would force
NoMovement classification on recovery and trigger the all-or-nothing
rollback of any sibling table's legitimate index work (cubic #1).
- needs_index_work_node and needs_index_work_edge now respect the
table_branch parameter from the snapshot entry, instead of always
passing None (== main). For branch writes, opening the wrong HEAD
could miss recoverable Phase B commits (cubic #2).
- needs_index_work_edge documented as intentionally BTree-only (mirrors
the build_indices_on_dataset_for_catalog edge branch which only
builds id/src/dst BTrees). Cursor flagged FTS/vector omission as
inconsistency with the node helper; confirmed intentional via
inline comment so future readers know the asymmetry is on purpose
(cursor finding, false positive marked).
Test improvements:
- recovery_multi_sidecar_requires_fresh_snapshot_for_correctness — new
integration test that uses TWO sidecars on the SAME table where
sidecar B's expected_version equals sidecar A's post_commit_pin.
Sidecar B's classification only succeeds if the recovery sweep
refreshes the snapshot between iterations to see A's manifest
update. Without the refresh fix from the prior commit, B would be
classified against stale pins (cubic #4 follow-up).
- recovery_ensure_indices_handles_empty_tables — new integration test
that runs ensure_indices on an all-empty repo. With the round-2 fix,
both initial and steady-state runs leave no sidecar (zero pins ⇒
zero sidecar I/O). Without the empty-table fix, the sidecar would
pin Company (zero rows but missing indices) and force a NoMovement
rollback (cubic #1 verification).
- ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_needed —
renamed/rewrote the prior `_recovered_on_next_open` test to assert
the post-fix invariant: when load_jsonl auto-built every catalog
index via prepare_updates_for_commit, ensure_indices's needs_work
helpers correctly report zero pins and produce no sidecar. The old
assertion ("exactly one sidecar must persist") was wrong for the
scoped behavior.
Test surface (post-round-2):
- 25 unit tests in db::manifest::recovery (BranchMerge classifier,
sort order, primitives — unchanged).
- 12 integration tests in tests/recovery.rs (+2 from this commit).
- 11 failpoint tests including the four per-writer Phase B → recovery
tests (one renamed to reflect the scoped behavior).
- ~672 workspace tests pass with --features failpoints.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 12:50:33 +02:00
|
|
|
!audit_dir.exists(),
|
|
|
|
|
"_graph_commit_recoveries.lance must NOT exist when no sidecar was processed"
|
recovery: per-writer Phase B failure → recovery integration tests (Phase 9)
Add the three paired per-writer tests required by MR-847's acceptance
criteria — "All four migrated writers ... have paired Phase B → recovery
integration tests."
Production additions (~10 LOC):
- New failpoint `branch_merge.post_phase_b_pre_manifest_commit` in
`exec/merge.rs::branch_merge_on_current_target` between the per-table
publish loop and `commit_manifest_updates`.
- New failpoint `ensure_indices.post_phase_b_pre_manifest_commit` in
`db/omnigraph/table_ops.rs::ensure_indices_for_branch` between the
per-table loops and `commit_prepared_updates_on_branch`.
- For schema_apply, the existing `schema_apply.after_staging_write`
failpoint already fires in the right window (after the per-table
rewrites + index builds, before the manifest publish).
Sidecar tweak:
- `schema_apply` sidecar's `branch` is now `None` (was
`Some("__schema_apply_lock__")`). The lock branch is purely a
serialization sentinel; `coordinator.commit_changes_with_actor`
publishes against the coordinator's pre-lock branch (main). After
the failpoint fires, `release_schema_apply_lock` removes the lock
branch — if the sidecar referenced it, the recovery sweep would try
to publish to a branch that no longer exists and fail. Fix: record
the actual publish target.
Tests added in `tests/failpoints.rs` (~280 LOC):
- `schema_apply_phase_b_failure_recovered_on_next_open` — seeds a row,
opens, attempts a schema apply that adds a new node type + a new
property (the new type ensures the table set differs so
`recover_schema_state_files` doesn't trip on property-only
ambiguity), failpoint fires, drops engine, reopens, asserts sidecar
deleted + audit row recorded.
- `branch_merge_phase_b_failure_recovered_on_next_open` — seeds main,
branches off, mutates the branch, attempts merge with the
`branch_merge.post_phase_b_pre_manifest_commit` failpoint active.
Same recovery shape.
- `ensure_indices_phase_b_failure_recovered_on_next_open` — seeds
rows, attempts ensure_indices with the
`ensure_indices.post_phase_b_pre_manifest_commit` failpoint active.
After this commit, all four migrated writers have paired
Phase B → recovery tests:
- mutate_as / load: `recovery_rolls_forward_after_finalize_publisher_failure` (Phase 5)
- schema_apply: `schema_apply_phase_b_failure_recovered_on_next_open`
- branch_merge: `branch_merge_phase_b_failure_recovered_on_next_open`
- ensure_indices: `ensure_indices_phase_b_failure_recovered_on_next_open`
11 failpoint tests pass; full workspace lib + integration tests pass
(350+ tests across 20 binaries).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 00:38:02 +02:00
|
|
|
);
|
|
|
|
|
}
|
(feat): multi-graph server mode (#119)
* mr-668: add GraphId newtype + Cloud-mode forward identity stubs (PR 1/10)
PR 1 of the MR-668 multi-graph server work. Pure types, no runtime
behavior changes yet.
Ships the validated identity vocabulary that the rest of the implementation
will consume:
- `GraphId(String)` — `^[a-zA-Z0-9-]{1,64}$`, leading underscore rejected
(engine reserves every `_*` filename), reserved route names rejected
(`policies`, `healthz`, `openapi`, `openapi.json`, `graphs`). Validation
lives in `try_from` only; serde `Deserialize` re-runs it so JSON payloads
cannot bypass.
- `TenantId(String)` — same regex shape as GraphId. `None` in Cluster
mode; reserved for Cloud mode (RFC 0003) where it carries the OAuth
`org_id` claim.
- `GraphKey { tenant_id: Option<TenantId>, graph_id }` — the registry
HashMap key. `cluster()` constructor for the Cluster-mode default.
- `Scope` enum with `Full` variant — Cluster mode default; RFC 0004 will
extend with OAuth scopes (`graph:read`/`write`/`admin`/`*`).
- `AuthSource` enum with `Static` variant — Cluster mode default; RFC
0001 step 1 will add `Oidc`.
- `ResolvedActor { actor_id, tenant_id, scopes, source }` — will replace
`AuthenticatedActor(Arc<str>)` in the upcoming PR 4a refactor.
Per MR-668 design decision 13: ship the Cloud-mode forward type shapes
now (no `TokenVerifier` trait yet — that's RFC 0001 step 1) so handler
signatures stay stable across the Cluster → Cloud trajectory. `Scope`
and `AuthSource` use `#[non_exhaustive]` so future variants don't break
caller matches.
Tests: 26 new (15 graph_id + 11 identity), all passing. No regression
in the existing 36 server library tests.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: Omnigraph::init error-path cleanup + three failpoints (PR 2a/10)
PR 2a of the MR-668 multi-graph server work. Bug fix: a partially-failed
`Omnigraph::init` previously left orphan schema files at the graph URI,
making the URI unusable for a retry (the next `init` would refuse because
`_schema.pg` already exists).
Changes:
1. `init_with_storage` now wraps the I/O phase. On any error from
`init_storage_phase`, calls `best_effort_cleanup_init_artifacts` to
remove the three schema files before returning the original error:
- `_schema.pg`
- `_schema.ir.json`
- `__schema_state.json`
Cleanup is best-effort: a failure to delete is logged via
`tracing::warn` but does NOT mask the init error.
2. Three failpoints added at the init phase boundaries:
- `init.after_schema_pg_written`
- `init.after_schema_contract_written`
- `init.after_coordinator_init`
3. Four new failpoint tests in `tests/failpoints.rs` pin the cleanup
behavior at each boundary plus the "original error wins over cleanup
error" contract. All 23 failpoint tests pass.
Coverage gap (documented in code comments):
Lance per-type datasets and `__manifest/` directory created by
`GraphCoordinator::init` are NOT cleaned up after a coordinator-init-phase
failure. Recursive directory deletion requires `StorageAdapter::delete_prefix`,
which was deferred along with `DELETE /graphs/{id}` (originally PR 2b). When
that primitive lands, the third failpoint test can be tightened to assert
the graph root is fully empty.
Tests: 4 new (init_failpoint_*), all 23 failpoint tests green. No
regression in the 105 engine library tests or 64 end_to_end tests.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: add GraphHandle + GraphRegistry data structure (PR 3/10)
PR 3 of the MR-668 multi-graph server work. Pure data structure — no
routing changes yet (that's PR 4a).
New file: `crates/omnigraph-server/src/registry.rs`
- `GraphHandle { key: GraphKey, uri: String, engine: Arc<Omnigraph>,
policy: Option<Arc<PolicyEngine>> }` — the per-graph state that the
routing middleware (PR 4a) will inject as a request extension.
- `RegistrySnapshot { graphs: HashMap<GraphKey, Arc<GraphHandle>> }` —
immutable snapshot; replaced atomically via `ArcSwap`.
- `GraphRegistry { snapshot: ArcSwap<_>, mutate: Mutex<()> }` — lock-free
reads, mutex-serialized mutations.
- `RegistryLookup { Ready(Arc<GraphHandle>) | Gone }` — two-valued, no
`Tombstoned` variant since DELETE is deferred in v0.7.0 scope.
- `InsertError { DuplicateKey | DuplicateUri }` — both rejection cases
for create-graph (maps to HTTP 409 in PR 7).
- Methods: `new`, `from_handles` (bulk startup-time init), `get`, `list`,
`len`, `insert`.
Race semantics pinned by three multi-thread tests:
- `concurrent_insert_same_key_exactly_one_succeeds` — N=8 spawned
inserts with the same key; exactly 1 returns Ok, 7 return DuplicateKey.
- `concurrent_insert_distinct_keys_all_succeed` — N=8 spawned inserts
with distinct keys; all succeed.
- `concurrent_reads_during_inserts_see_consistent_snapshots` — reader
loop concurrent with sequential writes; every listed handle's key
resolves via `get()` (no torn state).
Why no tombstones field: `DELETE /graphs/{id}` is deferred to bound
the scope of v0.7.0. Without a delete endpoint, there's no use for
tombstones — every key in the registry is `Ready`, and every key
not in the registry is `Gone`. When DELETE lands later, the
`Tombstoned` variant + `tombstones: HashSet<GraphKey>` slot in
additively without breaking caller signatures (the `Gone` variant
remains the "not currently active" case).
Why `tokio::sync::Mutex`: insert is async because PR 7's flow holds
this mutex across the atomic YAML rewrite step (file I/O). Holding a
`std::sync::Mutex` guard across `.await` would be a footgun.
Dependency additions: `arc-swap = { workspace = true }`,
`thiserror = { workspace = true }` (used by InsertError).
Tests: 12 new (12 passing). 74 server lib tests total green
(62 from PR 1 + 12 new). Clippy clean on server crate.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: router restructure + handler refactor for multi-graph (PR 4a/10)
PR 4a of the MR-668 multi-graph server work. The heaviest single PR —
rewires every handler to extract `Arc<GraphHandle>` from a routing
middleware, replaces `AuthenticatedActor(Arc<str>)` with `ResolvedActor`
everywhere, and adds the `ServerMode` discriminator.
Behavior changes:
- **Single mode** (legacy `omnigraph-server <URI>`): flat routes
(`/snapshot`, `/read`, `/branches`, …) continue to work exactly as
v0.6.0. Internally, the registry holds a single handle keyed by the
sentinel `SINGLE_GRAPH_KEY_ID = "default"`; routing middleware injects
that handle on every request. No HTTP-visible change.
- **Multi mode** (new): routes nest under `/graphs/{graph_id}/...`.
Routing middleware extracts the graph id from the path, looks it up
in the registry, and injects the handle. 404 if not found.
(Multi-mode startup itself lands in PR 5; this PR provides the
router-side wiring.)
AppState refactor:
- `engine: Arc<Omnigraph>` and `policy_engine: Option<Arc<PolicyEngine>>`
fields removed — both now live inside `GraphHandle` in the registry.
- `mode: ServerMode { Single { uri } | Multi { config_path } }` added.
- `registry: Arc<GraphRegistry>` added.
- `server_policy: Option<Arc<PolicyEngine>>` added (placeholder for
management endpoints in PR 6b; unused today).
- Existing constructors (`new`, `new_with_bearer_token{s,_and_policy}`,
`new_with_workload`, `open*`) build a single-mode AppState
internally and remain source-compatible. Tests that constructed
AppState via these constructors continue to work.
- `with_policy_engine` post-construction setter — rebuilds the
single-mode handle with the policy attached. Engine-layer
enforcement is NOT reinstalled (matches the old single-field
semantics; `open_with_bearer_tokens_and_policy` is the path that
installs both layers).
- `new_multi` constructor added for PR 5's startup loop.
- `uri()` now returns `Option<&str>` (Some in single, None in multi).
Routing middleware:
- `resolve_graph_handle` injects `Arc<GraphHandle>` as a request
extension. Mode-aware: single returns the only handle; multi parses
`/graphs/{graph_id}/...` from the URI. Returns 404 in multi mode
when the graph id is unregistered. Records `graph_id` on the
current tracing span.
- `require_bearer_auth` updated to insert `ResolvedActor` (was
`AuthenticatedActor`).
Handler refactor — every protected handler:
- Gains `Extension(handle): Extension<Arc<GraphHandle>>` param.
- Replaces `state.engine` → `handle.engine`.
- Replaces `state.policy_engine()` → `handle.policy.as_deref()`.
- Replaces `state.uri()` → `handle.uri.as_str()` (or `.clone()`
where String is needed).
- Replaces `Arc::clone(&state.engine)` → `Arc::clone(&handle.engine)`
(the spawn-and-clone pattern in `server_export` — proof that a
long-running export survives the registry being mutated later).
authorize_request signature:
- Was: `(state: &AppState, actor: Option<&AuthenticatedActor>, request: PolicyRequest)`.
- Now: `(actor: Option<&ResolvedActor>, policy: Option<&PolicyEngine>, request: PolicyRequest)`.
- Per-graph callers pass `handle.policy.as_deref()`. The (future PR 6b)
management endpoints will pass `state.server_policy.as_deref()`.
MR-731 invariant preserved:
- The single chokepoint `request.actor_id = actor.actor_id.as_ref().to_string()`
inside `authorize_request` still overwrites any client-supplied
actor identity. Regression test
`actor_id_resolves_from_bearer_token_ignoring_client_supplied_headers`
at `tests/server.rs:1114-1216` passes unchanged.
Tests: 0 new (the registry race tests in PR 3 already cover the
data structure; this PR exercises them indirectly via the existing
test suite). 74 lib + 57 server integration + 60 openapi = 191 tests
green. Clippy clean.
LOC: +397 insertions, -153 deletions in `crates/omnigraph-server/src/lib.rs`.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: OpenAPI multi-mode cluster filter (PR 4b/10)
PR 4b of the MR-668 multi-graph server work. In multi mode, the served
`/openapi.json` reports cluster routes (`/graphs/{graph_id}/...`) instead
of the legacy flat protected paths — matching what `build_app` actually
mounts (PR 4a's `Router::nest`). Single mode is unchanged.
Implementation:
- New `server_openapi` branch: when `state.mode()` is `Multi`, call
`nest_paths_under_cluster_prefix(&mut doc)` after `ApiDoc::openapi()`.
- The rewrite consumes `doc.paths.paths`, then for every path-item:
- If the path is in `ALWAYS_FLAT_PATHS` (`/healthz` for now), keep
it flat.
- Otherwise, prefix every operation_id with `cluster_` and reinsert
the item at `/graphs/{graph_id}<original_path>`.
- Single mode incurs no extra work — the path map is untouched.
- The static `ApiDoc::openapi()` still emits the flat surface, so
in-process callers (the existing `openapi_json()` helper in tests)
see the unmodified spec.
Why cluster_ prefix on operation IDs: OpenAPI specs require unique
operation_ids across the document. Should the flat (single-mode) and
cluster (multi-mode) surfaces ever co-exist in a generated SDK,
the prefix prevents collisions. The currently served doc only carries
dual-surface generation.
Tests: 6 new in `tests/openapi.rs`, all via the `/openapi.json` route
(not the static `ApiDoc::openapi()` helper):
- `multi_mode_openapi_lists_cluster_paths` — every protected path
appears as a cluster variant.
- `multi_mode_openapi_drops_flat_protected_paths` — flat protected
paths are absent.
- `multi_mode_openapi_keeps_healthz_flat` — `/healthz` survives.
- `multi_mode_openapi_prefixes_operation_ids_with_cluster` — every
cluster operation_id starts with `cluster_`.
- `multi_mode_operation_ids_are_unique` — no operation_id collisions.
- `single_mode_openapi_unchanged_by_cluster_filter` — single mode
still emits the legacy flat surface (regression).
New test helper `app_for_multi_mode(graph_ids)` exercises the new
`AppState::new_multi` constructor from PR 4a — first user of multi-mode
construction outside of unit tests.
Result: 66 openapi tests + 57 server integration tests + 74 lib tests
= 197 green. No regression in the existing OpenAPI drift check
(`openapi_spec_is_up_to_date` still validates the static flat surface
matches the committed openapi.json).
LOC: +67 in lib.rs (rewrite logic), +219 in tests/openapi.rs (test
suite + helper).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: multi-graph startup + mode inference (PR 5/10)
PR 5 of the MR-668 multi-graph server work. This is the first PR that
makes multi mode actually usable end-to-end: operators invoking
`omnigraph-server --config omnigraph.yaml` with a non-empty `graphs:`
map and no single-mode selector now get a running multi-graph server.
Mode inference (MR-668 decision 2, four-rule matrix plus an error
fallback as rule 5, in `load_server_settings`):
1. CLI `<URI>` positional → Single
2. CLI `--target <name>` → Single (URI from graphs.<name>)
3. `server.graph` in config → Single (URI from graphs.<name>)
4. `--config` + non-empty `graphs:` + no single-mode selector
→ Multi (all entries in `graphs:`)
5. otherwise → error with migration hint
Rule 5's error message names every escape hatch so operators can fix
their invocation without grepping docs.
Config schema extensions:
- `TargetConfig.policy: PolicySettings` (per-graph Cedar policy file).
`#[serde(default)]` so existing single-graph YAMLs keep parsing.
- `ServerDefaults.policy: PolicySettings` (server-level Cedar policy
for management endpoints — loaded in PR 5, wired into `GET /graphs`
in PR 6b).
- `OmnigraphConfig::resolve_target_policy_file(name)` and
`resolve_server_policy_file()` helpers — both resolve relative to
the config file's `base_dir`.
Public types added to `omnigraph-server`:
- `ServerConfigMode { Single { uri, policy_file } | Multi { graphs,
config_path, server_policy_file } }`.
- `GraphStartupConfig { graph_id, uri, policy_file }` — one entry
per graph in multi mode.
`ServerConfig` shape change:
- WAS: `{ uri: String, bind, policy_file, allow_unauthenticated }`.
- NOW: `{ mode: ServerConfigMode, bind, allow_unauthenticated }`.
- Breaking for any code that constructs `ServerConfig` directly.
`main.rs` is unaffected (uses `load_server_settings`).
`serve()` now forks on `ServerConfig.mode`:
- Single: existing flow via `AppState::open_with_bearer_tokens_and_policy`.
- Multi: parallel open via `futures::stream::iter(graphs)
.map(open_single_graph).buffer_unordered(4).collect()`. Bound 4 is
a rule-of-thumb for I/O-bound work — at N≤10 this trades startup
latency for a small amount of concurrent S3/Lance open pressure.
Fail-fast: first open error aborts startup; in-flight opens drop
their engine via Arc (Lance datasets close cleanly).
New helper `open_single_graph(GraphStartupConfig)`:
- Validates `GraphId` per the regex in PR 1.
- `Omnigraph::open(uri).await` with descriptive error context.
- Loads per-graph policy file and re-applies it via
`Omnigraph::with_policy` (engine-layer enforcement, MR-722).
- Returns `Arc<GraphHandle>` ready for the registry.
Routing middleware bug fix:
- `Router::nest("/graphs/{graph_id}", inner)` rewrites
`request.uri().path()` to the inner suffix (e.g. `/snapshot`).
The previous middleware tried to parse `{graph_id}` from
`request.uri().path()` and got 400 instead of 200. Fixed by reading
from `axum::extract::OriginalUri` request extension, which preserves
the pre-rewrite URI.
- Caught by the two new tests
`cluster_routes_dispatch_per_graph_handle` and
`cluster_route_for_unknown_graph_returns_404`.
Tests (14 new, all passing):
- Four-rule matrix: one test per branch + the joint case
`mode_inference_cli_uri_overrides_graphs_map` + the empty-graphs-map
error case.
- Per-graph + server-level policy file path resolution.
- Reserved `GraphId` rejection at startup.
- End-to-end multi-graph routing: two graphs side by side, each
cluster route hits the right engine.
- Unknown graph id under cluster prefix → 404.
- Flat routes 404 in multi mode.
Inline `ServerConfig` test (`serve_refuses_to_start_in_state_1_without_unauthenticated`)
and three `server_settings_*` tests updated to the new `mode` shape.
Result: 211 server tests green (74 lib + 71 integration + 66 openapi),
MR-731 regression test still pinned and passing.
LOC: +45 config.rs, +281 lib.rs (net), +395 tests/server.rs.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: Cedar resource-model refactor (PR 6a/10)
PR 6a of the MR-668 multi-graph server work. Policy-crate-only refactor —
no HTTP handler changes, no operator-supplied policy.yaml changes. Sets
up the chassis that PR 6b's `GET /graphs` consumes.
Two new `PolicyAction` variants:
- `GraphCreate` — gates `POST /graphs` (deferred behavioral PR).
- `GraphList` — gates `GET /graphs` (lands in PR 6b).
Note: `GraphDelete` is intentionally NOT added in this PR. `DELETE
/graphs/{id}` is deferred from MR-668's v0.7.0 scope to bound complexity
(no `delete_prefix`, no tombstone, no `RegistryLookup::Tombstoned`).
Adding the Cedar action without a consumer would be the same kind of
"dead vocabulary" trap the `Admin` variant already documents.
New `PolicyResourceKind { Graph, Server }` enum, plus a
`PolicyAction::resource_kind()` method that classifies every action.
Per-graph actions (Read, Change, BranchCreate, …) bind to
`Omnigraph::Graph::"<graph_label>"`; server-scoped actions
(GraphCreate, GraphList) bind to the singleton
`Omnigraph::Server::"root"`. `Admin` stays classified as per-graph for
now — MR-724 will pick the final shape when the first consumer surface
ships.
Cedar schema string additions:
- `entity Server;`
- `action "graph_create" appliesTo { principal: Actor, resource: Server, ... }`
- `action "graph_list" appliesTo { principal: Actor, resource: Server, ... }`
Compiler updates:
- `compile_policy_source` picks the resource literal based on the
action's `resource_kind`. Existing graph-only policies generate
the same Cedar source as before — pinned by
`per_graph_rules_continue_to_work_alongside_server_rules`.
- `compile_entities` includes the `Server::"root"` entity only when
a rule references a server-scoped action. Keeps test assertions
for graph-only policies tight.
- `PolicyEngine::authorize` builds the right resource UID at
request time based on `request.action.resource_kind()`.
Validation rules added to `PolicyConfig::validate`:
- A rule may not mix server-scoped and per-graph actions (different
resource kinds need different `permit` clauses).
- Server-scoped actions cannot have `branch_scope` or
`target_branch_scope` — there's no branch context at the server
level.
Operator impact: zero. The Cedar schema `Omnigraph::Server` entity is
internally referenced by `compile_policy_source`; operator policy.yaml
files only declare actions in `rules[].allow.actions` and never
reference the resource entity directly. Decision 6's "internal rename
only; operator policies unaffected" contract is preserved and pinned
by `per_graph_rules_continue_to_work_alongside_server_rules`.
Tests: 5 new (11 policy tests total, up from 6):
- `graph_list_action_authorizes_against_server_resource`
- `graph_create_action_authorizes_against_server_resource`
- `server_scoped_rule_cannot_use_branch_scope`
- `rule_mixing_server_and_per_graph_actions_is_rejected`
- `per_graph_rules_continue_to_work_alongside_server_rules`
No regression: 145 server tests (74 lib + 71 integration) still green.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: GET /graphs endpoint + per-graph policy wire-up (PR 6b/10)
PR 6b of the MR-668 multi-graph server work. First management endpoint —
`GET /graphs` lists every graph registered with the server, gated by the
server-level Cedar policy from PR 6a.
New API shapes (in `omnigraph-server::api`):
- `GraphInfo { graph_id, uri }` — one entry per registered graph.
- `GraphListResponse { graphs: Vec<GraphInfo> }` — sorted alphabetically
by `graph_id` for deterministic output.
Handler `server_graphs_list`:
- Mounted at `GET /graphs` in both modes.
- Single mode: returns 405 (resource exists in the API surface, just
not operational without a `graphs:` map). 405 chosen over 404 so
clients see "resource exists, wrong context" rather than "no such
resource".
- Multi mode: requires bearer auth (when configured); Cedar-gated by
`PolicyAction::GraphList` against `Omnigraph::Server::"root"`
(PR 6a's chassis). Returns the sorted registry list.
Cedar gate composition:
- When no `server.policy.file` is configured, the MR-723 default-deny
falls through: `GraphList` is not `Read`, so an authenticated actor
without a server policy gets 403. This is the right default — don't
expose the registry until the operator explicitly authorizes it.
- When a server policy is configured, Cedar evaluates the rule. The
test `get_graphs_with_server_policy_authorizes_per_cedar` pins the
admin-allow / viewer-deny split.
Routing:
- New `management` sub-router holding `/graphs` (auth-required, no
`resolve_graph_handle` middleware — operates on the registry, not
a single graph).
- Single mode merges flat protected routes + management.
- Multi mode merges nested `/graphs/{graph_id}/...` + management.
OpenAPI:
- `server_graphs_list` registered in `ApiDoc::paths(...)`.
- `EXPECTED_PATHS` in `tests/openapi.rs` gains `/graphs`.
- `openapi.json` regenerated (auto-tracked by
`openapi_spec_is_up_to_date` in CI).
Tests: 4 new in `tests/server.rs::multi_graph_startup`:
- `get_graphs_lists_registered_graphs_in_multi_mode`
- `get_graphs_returns_405_in_single_mode`
- `get_graphs_requires_bearer_auth_when_configured`
- `get_graphs_with_server_policy_authorizes_per_cedar`
What's NOT in this PR (deferred):
- Per-graph policy enforcement is wired through `handle.policy`
(PR 4a already did this); PR 6b doesn't add new per-graph
behavior beyond making sure the server policy lookup composes
cleanly alongside it.
- `POST /graphs` (PR 7) and `DELETE /graphs/{id}` (out of scope
for v0.7.0).
- CLI `omnigraph graphs list` (PR 8 will add).
Result: 215 server tests green (74 lib + 66 openapi + 75 integration),
11 policy tests green. MR-731 spoof regression preserved across all
this work.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: POST /graphs runtime create endpoint (PR 7/10)
PR 7 of the MR-668 multi-graph server work. Operators can now add a
graph to a running multi-graph server without restarting:
curl -X POST http://server/graphs \
-H "Content-Type: application/json" \
-d '{
"graph_id": "beta",
"uri": "/data/beta.omni",
"schema": { "source": "node Person { name: String @key }\n" },
"policy": { "file": "./policies/beta.yaml" }
}'
DELETE remains deferred (out of v0.7.0 scope per the trimmed plan —
no `delete_prefix`, no tombstones).
Body shape (decision 7):
- Nested `schema: { source: "..." }` (mirrors the `policy: { file }`
pattern; leaves room for future fields without breakage).
- Optional nested `policy: { file: "..." }` for per-graph Cedar.
- 32 MiB body limit (reuses `INGEST_REQUEST_BODY_LIMIT_BYTES`).
- Asymmetric with `SchemaApplyRequest` which keeps flat
`schema_source: String` — documented in api.rs.
Atomic YAML rewrite + drift detection:
- New `config::rewrite_atomic(path, new_config, expected_hash)`:
flock → re-read + hash check → serialize → write `.tmp` → fsync
→ rename → fsync parent dir. Returns the new hash for the caller
to update its in-memory baseline.
- New `config::hash_config_file(path)` — SHA-256 of the on-disk
bytes, used at startup and after each rewrite.
- New `RewriteAtomicError { Drift | Io | Serialize }` enum.
- `AppState.config_hash: Option<Arc<Mutex<[u8;32]>>>` carries the
in-memory baseline. Updated after every successful rewrite so
subsequent POSTs don't false-trigger drift.
- The mutex is `std::sync::Mutex` (brief critical section, no .await
inside). The flock itself serializes file access process-wide
AND across multiple server instances (defense in depth).
- All sync I/O runs inside `tokio::task::spawn_blocking` — flock
is sync.
Handler ordering (the load-bearing sequence):
1. Mode check: 405 in single mode.
2. Cedar authorize: `GraphCreate` against `Omnigraph::Server::"root"`.
3. Validate body: `GraphId::try_from` (regex + reserved-name), empty
schema/uri checks, per-graph policy file parse.
4. Pre-check registry for duplicate graph_id / duplicate uri (409).
5. `Omnigraph::init` the new engine.
6. Atomic YAML rewrite (drift detection inside).
7. Publish in registry (atomic re-check via `GraphRegistry::insert`).
Failure modes (documented in handler rustdoc):
- Init fails → orphan storage at `req.uri` (PR 2a cleans up schema
files; Lance datasets remain orphans until `delete_prefix` lands).
- YAML rewrite fails (drift, IO) → orphan storage; YAML unchanged.
- Registry insert fails (race) → YAML has entry but registry doesn't;
next restart opens it cleanly.
New dependency: `fs2 = "0.4"` (workspace + omnigraph-server). POSIX-only
file locking. Linux/macOS deployment supported; Windows out of scope.
Tests (10 new in `tests/server.rs::multi_graph_startup`):
- `post_graphs_creates_a_new_graph_end_to_end` — happy path, includes
YAML inspection to confirm the rewrite landed.
- `post_graphs_baseline_hash_updates_between_rewrites` — two POSTs in
a row both succeed (drift baseline updates correctly).
- `post_graphs_duplicate_graph_id_returns_409`
- `post_graphs_duplicate_uri_returns_409`
- `post_graphs_invalid_graph_id_returns_400` (reserved name)
- `post_graphs_empty_schema_source_returns_400`
- `post_graphs_returns_405_in_single_mode`
- `post_graphs_yaml_drift_detection_returns_503` — operator hand-edits
omnigraph.yaml; server refuses to clobber.
- `hash_config_file_is_deterministic_and_detects_changes`
- `rewrite_atomic_refuses_when_hash_drifts`
OpenAPI: `server_graphs_create` registered in `ApiDoc::paths(...)`;
openapi.json regenerated.
Result: 225 server tests green (74 lib + 66 openapi + 85 integration),
all MR-731 regressions still pinned.
LOC: ~580 lib.rs net (handler + helpers), ~120 config.rs (rewrite
machinery), +71 api.rs (request/response shapes), +332 tests/server.rs.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: CLI omnigraph graphs list/create (PR 8/10)
PR 8 of the MR-668 multi-graph server work. CLI parity for the
v0.7.0 management surface: operators can now manage graphs from
the command line against a running multi-graph server.
omnigraph graphs list --target dev --json
omnigraph graphs create \
--target dev \
--graph-id beta \
--graph-uri /data/beta.omni \
--schema schema.pg
DELETE is intentionally absent — server-side DELETE was deferred from
v0.7.0 scope, and shipping a client subcommand for a server endpoint
that doesn't exist would be dead vocabulary. The help output, the
subcommand enum, and the test that pins it
(`graphs_subcommand_help_lists_list_and_create`) all agree.
CLI architecture (modeled on `BranchCommand`):
- New `Command::Graphs { command: GraphsCommand }` top-level variant.
- `GraphsCommand { List, Create }` enum.
- List: GET `<base>/graphs`. Stdout is `<graph_id>\t<uri>` per line,
or JSON via `--json`.
- Create: reads `--schema <path>` from local disk, inlines as
`schema: { source: <file> }` in the POST body (nested per
MR-668 decision 7). Optional `--policy-file <path>` becomes
`policy: { file: <path> }`. Returns 201 → "created graph X at Y"
or JSON via `--json`.
- Both subcommands reject local URI targets with a clear
"remote multi-graph server URL" error.
New API type imports in the CLI: `GraphCreateRequest`,
`GraphCreateResponse`, `GraphListResponse`, `GraphSchemaSpec`,
`GraphPolicySpec` — all from `omnigraph-server::api`.
Tests:
- cli.rs (4 new, non-network):
* `graphs_subcommand_help_lists_list_and_create` — pins the
deferral of `delete` (catches scope creep).
* `graphs_list_against_local_uri_errors_with_remote_only_message`
* `graphs_create_against_local_uri_errors_with_remote_only_message`
* `graphs_create_with_missing_schema_file_errors` — pins the
IO context in the schema-read error path.
- system_remote.rs (1 new, `#[ignore]` like its peers):
* `graphs_list_and_create_against_multi_graph_server` — spawns a
multi-mode server, calls `graphs list` (sees `alpha`),
`graphs create` (adds `beta`), `graphs list` again (sees both),
and confirms the new graph is reachable via its cluster route.
CLI suite: 62 tests green (58 existing + 4 new). The new ignored
end-to-end test runs locally with `cargo test --ignored`.
LOC: +159 main.rs (enum + handlers), +88 cli.rs (unit tests),
+131 system_remote.rs (integration test).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: composite e2e tests, race fix, v0.7.0 release (PR 9/10)
PR 9 — the final integration PR for MR-668 multi-graph server work.
Closes the v0.7.0 release.
Composite lifecycle tests (closes gaps flagged in PR 7's coverage
review):
- `multi_graph_lifecycle_post_query_restart_persistence` — POST a
graph, query it via cluster route, reload the config from disk
and confirm `load_server_settings` sees the rewritten YAML.
Validates the "restart resolves orphans" failure-mode story.
- `per_graph_policy_enforced_on_post_created_graph` — POST a graph
with a per-graph policy attached, then send authenticated read
and change requests. Per-graph Cedar enforcement fires correctly
on a POST-created graph (engine-layer policy reinstalled via
`Omnigraph::with_policy` inside the create flow).
- `concurrent_post_graphs_distinct_ids_all_succeed` — 4 concurrent
POSTs with distinct graph_ids all return 201. Caught a real
race in `rewrite_atomic` (see below).
Race fix — `rewrite_atomic_with_modify`:
The concurrent-POST composite test surfaced a real bug. The old POST
flow captured the baseline hash OUTSIDE the flock, then called
`rewrite_atomic(path, new_config, expected_hash)`, which re-read the
on-disk bytes and compared their hash to that stale baseline only
once it held the flock. Under concurrent writers:
- POST A: captures baseline H0, calls rewrite_atomic.
- POST B: captures baseline H0 too (before A's update lands).
- A: acquires flock, on-disk == H0, writes H1, releases.
- A: updates baseline H0 → H1.
- B: tries to acquire flock — waits.
- B: acquires flock. On-disk is now H1. Expected (captured
before A finished) is H0. MISMATCH → spurious Drift error.
Worse: even if the timing happens to align, B's `updated` config
was constructed from BYTES read before the flock. B writes a config
that doesn't include A's new graph — silent data loss.
The fix: new `config::rewrite_atomic_with_modify(path, baseline,
modify)` takes a closure. Inside the flock + baseline mutex:
1. Read on-disk bytes, hash, compare to baseline.
2. Parse on-disk YAML.
3. Call `modify(parsed)` to produce the new config — receives
fresh on-disk state, returns the modification.
4. Serialize + write + fsync + rename + update baseline.
Everything is read-modify-write under the same critical section.
Concurrent writers serialize cleanly. Test confirmed this is no
longer a race.
The old `rewrite_atomic(path, new_config, expected_hash)` API stays
for tests that don't need the read-modify-write shape; the POST
handler switches to the new shape.
Version bump v0.6.0 → v0.7.0:
- All 5 `crates/*/Cargo.toml` (compiler, engine, policy, cli, server)
plus their inter-crate `path` dep version constraints.
- `Cargo.lock` regenerated by `cargo build --workspace`.
- `AGENTS.md` "Version surveyed" line, capability matrix HTTP-server
row updated to mention multi-graph + cluster routes + atomic YAML
rewrite.
- `openapi.json` regenerated.
Docs:
- `docs/releases/v0.7.0.md` (new) — release notes with breaking
changes, new features, deferred items (DELETE, `delete_prefix`,
actor forwarding), and the single→multi migration recipe.
- `docs/user/server.md` — substantial section additions for the
two modes, mode inference, cluster endpoint table, management
endpoints, `omnigraph.yaml` ownership contract, `POST /graphs`
body shape + status codes.
- `docs/user/cli.md` — `omnigraph graphs list/create` section,
deferred-DELETE note.
- `docs/user/policy.md` — server-scoped Cedar actions
(`graph_create`, `graph_list`), per-graph vs server-level policy
composition, example server-level policy.
Workspace test pass: 573 tests green across all crates. Zero
failures. MR-731 spoof regression still pinned and passing across
the entire 10-PR series.
This commit closes MR-668. v0.7.0 is ready for tagging.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: remove POST /graphs and CLI graphs create (defer runtime graph mgmt)
The POST /graphs runtime-create endpoint shipped in PR 7/10 has three
unresolved high-severity bugs:
- flock-on-renamed-inode race: the YAML flock is taken on
omnigraph.yaml itself, then a temp file is renamed over it.
Cross-process writers end up locking different inodes — both
believing they hold exclusive access.
- duplicate-check outside the file lock: precheck runs against
the in-memory registry only; the locked closure does
config.graphs.insert(...) unconditionally. Concurrent same-id
POSTs can persist the loser in YAML while the in-memory registry
keeps the winner — they disagree after restart.
- best_effort_cleanup_init_artifacts deletes _schema.pg /
_schema.ir.json / __schema_state.json on any init failure. An
accidental re-init against an existing graph's URI destroys its
schema; subsequent open() fails at read_text(_schema.pg).
The correct fix is a Lance-style cluster catalog (reserve → init →
publish with recovery sidecars), parallel to the engine's existing
__manifest discipline. That work is out of scope for v0.7.0.
For now, disable runtime add/remove from the network and CLI surface.
Operators add graphs by editing omnigraph.yaml and restarting. The
GET /graphs read-only enumeration stays.
Removed:
- POST /graphs handler + router fragment + utoipa registration
- 13 post_graphs_* server tests + 3 composite POST tests +
multi_mode_app_with_real_config / post_graph helpers
- CLI omnigraph graphs create subcommand + its handler + cli.rs tests
- system_remote.rs combined list+create test trimmed to list-only
- YAML rewrite infra: rewrite_atomic[_with_modify], RewriteAtomicError,
staging_path, hash_config_file, AppState::config_hash field +
threading through new_multi and open_multi_graph_state
- fs2 dependency (verified absent from cargo tree)
- sha2/fs2 imports in config.rs (only the rewrite path used them)
- Cedar PolicyAction::GraphCreate variant + "graph_create" match arms
+ action def in Cedar schema + graph_create_action_authorizes_against_server_resource test
- GraphCreateRequest / GraphCreateResponse / GraphSchemaSpec /
GraphPolicySpec API types (only the POST handler / CLI imported them)
Kept:
- GET /graphs (read-only enumeration) and graph_list Cedar action
- omnigraph graphs list CLI subcommand
- All multi-graph startup, mode inference, cluster routes,
per-graph + server-level Cedar policies
- server_settings_drive_multi_graph_startup_end_to_end (the test
that covers operator-authored YAML + restart — the path that
survives)
- best_effort_cleanup_init_artifacts and the three init failpoints
(still reachable from CLI `omnigraph init`; preflight fix deferred
as a follow-up)
- GraphRegistry::insert and its concurrency tests — production
callers gone, but the method is the natural seam for the future
cluster-catalog work
Also fixed (transcript issue 4):
- ALWAYS_FLAT_PATHS now includes /graphs so multi-mode OpenAPI
advertises the management route correctly (was previously rewritten
to /graphs/{graph_id}/graphs)
- multi_mode_openapi_keeps_healthz_flat → renamed to
multi_mode_openapi_keeps_management_paths_flat, asserts both
/healthz and /graphs stay flat
- multi_mode_openapi_prefixes_operation_ids_with_cluster skips
/graphs in addition to /healthz
Doc fixes:
- docs/user/cli.md: graphs list example was --target http://...,
but --target is a config-graph-name lookup; corrected to --uri.
Removed the graphs create example.
- docs/user/server.md: dropped POST /graphs row, "omnigraph.yaml
ownership", and "POST /graphs body shape" sections. Added a
paragraph stating runtime add/remove is not exposed in v0.7.0.
- docs/user/policy.md: dropped graph_create action; reworded the
"Configuration" line to clarify that server-scoped rules (graph_list)
take neither branch_scope nor target_branch_scope.
- docs/releases/v0.7.0.md: rewrote release narrative — multi-graph
mode ships; runtime add/remove deferred.
- AGENTS.md: HTTP server bullet and capability matrix row updated to
reflect read-only GET /graphs and the operator-edit workflow.
- openapi.json regenerated; /graphs has only .get, no .post.
Diff: 17 files, +123 −1525 LOC.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: comment cleanup and policy format style
Strip "PR Na/Nb" sub-PR references throughout MR-668 surfaces — they
were useful during the 10-PR delivery sequence but rot now that the
work is in the tree. Keep the MR-668 umbrella references.
Also:
- Add explicit `when = when` and `resource_literal = resource_literal`
named args in `compile_policy_source`'s outer `format!` to match the
surrounding crate style (already explicit for `group` and `action`).
- Rename the best-effort cleanup tracing target from
"omnigraph::init" to "omnigraph::init::cleanup" so operators can
filter init-failure cleanup events separately from init's other
log lines.
No behavior change.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: drop actor_id from PolicyRequest; pass actor as separate arg
The MR-731 "server-authoritative actor identity" invariant was enforced
by an in-function chokepoint (`request.actor_id = actor.actor_id...`
overwrite inside `authorize_request`). That worked but relied on every
caller passing in a `PolicyRequest` and trusting the overwrite — a
comment-enforced invariant.
Move the invariant into the type system:
* `PolicyRequest` no longer carries `actor_id`. The struct now models
what a caller wants to do, not who they are.
* `PolicyEngine::authorize(actor_id: &str, request: &PolicyRequest)`
and `validate_request(actor_id, request)` take identity as a
separate argument. The same shape `PolicyChecker::check` already had
for the engine layer.
* `authorize_request` in the HTTP layer extracts `actor_id` from the
bearer-resolved `ResolvedActor` and passes it positionally — no
overwrite step that could be skipped.
* CLI `omnigraph policy explain` updated (the only other consumer
that built a `PolicyRequest`).
Public API break for the `omnigraph-policy` crate. Worth it: handlers
can no longer accidentally populate `actor_id` from a request body
field, and external consumers are forced by the compiler to source
actor identity from a trusted path.
The MR-731 chokepoint test
`actor_id_resolves_from_bearer_token_ignoring_client_supplied_headers`
still passes — the bearer-resolved actor is what reaches the engine.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: consolidate AppState single-mode constructors; delete with_policy_engine
The prior `with_policy_engine` constructor reused the engine `Arc`
from the existing handle (`engine: Arc::clone(&existing.engine)`)
without re-applying `Omnigraph::with_policy`. Combined with
`new_with_workload`, the documented composition pattern was
`AppState::new_with_workload(...).with_policy_engine(p)` — which
produced an `AppState` whose HTTP layer enforced Cedar but whose
underlying engine had no `PolicyChecker` installed. Any caller
reaching the engine via `state.registry().list()[i].engine` could
bypass policy entirely. The doc comment named this gap; the type
system didn't.
Make composition impossible to get wrong:
* Add `AppState::new_single(uri, db, tokens, Option<PolicyEngine>,
WorkloadController)` — canonical single-mode constructor that
takes every option together and routes through `build_single_mode`
(which applies `db.with_policy(checker)` to the engine itself).
* `new`, `new_with_bearer_token`, `new_with_bearer_tokens`,
`new_with_bearer_tokens_and_policy`, `new_with_workload` all become
thin wrappers around `new_single`.
* Delete `with_policy_engine`. There is no post-construction policy
install path any more; the single linear construction forces
HTTP-layer and engine-layer policy to install together or not at all.
Regression test `engine_layer_policy_fires_via_direct_arc_omnigraph_from_new_single`
constructs an `AppState::new_single` with a deny-all policy, pulls
the `Arc<Omnigraph>` from the registry handle (the same path an
embedded SDK consumer would take), and asserts a direct `mutate_as`
call returns `OmniError::Policy`. Pre-fix, the mutation in this test
would have succeeded.
Test caller in `ingest_per_actor_admission_cap_returns_429`
migrates from `.with_policy_engine(...)` to `new_single(...,
Some(policy_engine), workload)`.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: derive any_per_graph_policy on RegistrySnapshot; simplify dup check
`AppState::requires_bearer_auth` walked the entire registry per
request (cloning Arcs into a `Vec`, then `.iter().any(|h| h.policy
.is_some())`) to decide whether the auth middleware should challenge.
The walk is unnecessary — the answer only changes when the registry
mutates, which is exactly the moment a new snapshot is constructed.
Move the flag onto the snapshot itself:
* `RegistrySnapshot { graphs, any_per_graph_policy: bool }`.
* `RegistrySnapshot::new(graphs)` is the only construction path —
it derives the flag from `graphs.values().any(|h| h.policy
.is_some())` so the cached value can't drift from the source data.
* `Default` delegates to `new(HashMap::new())`.
* `GraphRegistry::from_handles` and `insert` build snapshots via
`RegistrySnapshot::new(...)`.
* `GraphRegistry::snapshot_ref()` exposes the current snapshot
through an `arc_swap::Guard`; callers that need cached derived
state go through this accessor (callers that only want `graphs`
still use `list` / `get`).
`requires_bearer_auth` becomes one `ArcSwap::load` + bool read.
Also (drive-by, same file, same hunk): replace the dead
`if let Some(other) = seen_uris.get(...)` + `let _ = other;` pattern
in `from_handles` with a plain `seen_uris.contains_key(...)`.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: fail-fast multi-graph startup with try_collect
The `open_multi_graph_state` doc comment claims "Fail-fast — the
first open error aborts startup; other in-flight opens are dropped"
but the code did
.buffer_unordered(4)
.collect::<Vec<_>>()
.await
.into_iter()
.collect::<Result<Vec<_>>>()?;
which drains every future in the stream before propagating the first
`Err`. With N S3-backed graphs and graph #2 failing fast, the caller
still waits for #1, #3, #4, … to either succeed or fail before
seeing the error.
Replace the four-line dance with `futures::TryStreamExt::try_collect`,
which short-circuits on the first `Err` and drops the rest. The
doc comment now matches behavior.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: drop unused State extractor from 7 read-only handlers
After the routing-middleware refactor moved the engine into the per-graph
`GraphHandle` (extracted via `Extension<Arc<GraphHandle>>`), seven
read-only handlers — `server_snapshot`, `server_read`, `server_export`,
`server_schema_get`, `server_branch_list`, `server_commit_list`,
`server_commit_show` — kept an unused `State(_state): State<AppState>`
extractor. Drop it. Each request avoids one `FromRequestParts` clone
of `AppState`'s Arcs.
Handlers that actually use state (workload admission for write paths,
`server_policy` for management endpoints) keep theirs.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: emit info! for graph routing decision
`tracing::Span::current().record("graph_id", ...)` in the routing
middleware silently no-ops here: no upstream `#[tracing::instrument]`
on the handlers declares a `graph_id` field, and `TraceLayer::new_for_http`
doesn't either. The recorded value never lands anywhere visible.
Replace with an explicit `info!(graph_id = %handle.key.graph_id,
"graph routed")` event so operators can grep logs and correlate
requests with the active graph. In single mode the value is the
sentinel `"default"`.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: align GET /graphs 405 body code with HTTP status
The single-mode `GET /graphs` handler returned an `ApiError` built
via struct literal with `status: METHOD_NOT_ALLOWED, code: BadRequest`.
The body code disagreed with the HTTP status — clients deserializing
on `code` saw `bad_request`, clients deserializing on `status` saw
405. Same bug class as the earlier 503+Conflict mismatch on the
removed YAML drift path.
Close the class for this one remaining instance:
* Add `ErrorCode::MethodNotAllowed` to the API enum.
* Add `ApiError::method_not_allowed(msg)` — pairs the 405 status
with the matching code.
* Replace the struct literal in `server_graphs_list` with the
constructor.
* Regenerate `openapi.json` (adds `method_not_allowed` to the
ErrorCode schema enum).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: drop unused axum::handler::Handler import
The import landed in earlier work but no current call site uses it.
Emitted an `unused_imports` warning on every server build.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: drop unused fs2 workspace dependency
`fs2 = "0.4"` lingered in [workspace.dependencies] after the
POST /graphs flock-on-rename design was pulled. `cargo tree -i fs2`
reports no consumers in the workspace and the dep is not in
Cargo.lock. Removing the declaration closes the "phantom dep" class.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: AGENTS.md Cedar row no longer hardcodes action count
The "8 actions" claim drifted as soon as MR-668 added `graph_list`.
Bumping the count would just push the drift one PR forward; the
correct-by-design fix is to defer to the canonical list in
docs/user/policy.md and stop maintaining a duplicate count.
Closes the "doc hardcodes a count that drifts from the enum" class.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: cfg(test)-gate GraphRegistry::insert and its mutex
`insert` and the `mutate: Mutex<()>` that serializes it had no
runtime consumer in v0.7.0 — the only insertion path at startup
is `from_handles`, and runtime add/remove is deferred until a
managed cluster catalog ships. Leaving both `pub` and live made
them a "looks like API, isn't" footgun: a future change could
build on `insert` without re-establishing the concurrency contract
with an actual consumer in scope.
Gate both together (`#[cfg(test)]` on the method, the field, and
the `tokio::sync::Mutex` import) so the race-pinning tests still
compile but production cannot reach them. When a real consumer
ships, ungate both — they're a unit. Closes the "public API with
no runtime consumer drifts toward incorrect" class.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: drop vestigial PolicyEngine surface
* `validate_request` had zero callsites — pure surface for nothing.
* `deny`'s `_actor_id` and `_request` parameters were both unused
(the underscore prefix gave it away); the message is built by the
caller before `deny` ever sees the request. Trim both.
Closes the "public API that the type system can't justify" class
for the policy engine. No behavior change; every existing test
stays green because the deletions never had a runtime effect.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: regression test for init re-init footgun (red)
A second `Omnigraph::init` against an existing graph URI today
destroys the existing graph's schema artifacts. `init_storage_phase`
overwrites `_schema.pg` before any preflight, and on the inner
`GraphCoordinator::init` failure that follows,
`best_effort_cleanup_init_artifacts` deletes all three schema files.
The existing Lance datasets and `__manifest/` survive but the
schema metadata is gone — unrecoverable without operator surgery.
This test exercises that path and currently fails with
"_schema.pg must not be deleted by a failed re-init", confirming
the destructive cleanup branch fires. The fix in the next commit
makes the test pass by preflighting with `storage.exists()` and
returning a typed error before any write touches disk.
Per AGENTS.md rule 12, the test commit lands just before the fix
commit so the red → green pair is visible in `git log` and a
reviewer can check out this commit alone to reproduce.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: close init re-init footgun via InitOptions preflight (green)
`Omnigraph::init` is "create a new graph"; existing graphs need
an explicit overwrite. Today's behavior — silently overwrite
schema files, then on inner failure delete them via best-effort
cleanup — is destructive against an existing graph regardless of
which branch fires.
Correct-by-design fix:
* New `InitOptions { force: bool }` struct (default `force: false`).
* New `Omnigraph::init_with_options(uri, schema, options)`. The
old `Omnigraph::init(uri, schema)` is a thin shortcut that
passes `InitOptions::default()`.
* `init_with_storage` runs a `storage.exists()` preflight on the
three schema URIs BEFORE any parse, write, or coordinator call.
Any hit → typed `OmniError::AlreadyInitialized { uri }`. The
destructive code paths (the `write_text` overwrite and the
best-effort cleanup) are now unreachable in strict mode against
an existing graph.
* `force: true` skips the preflight; existing operators who
actually mean to overwrite opt in explicitly.
* CLI: `omnigraph init --force` maps to `InitOptions { force: true }`.
* HTTP: `OmniError::AlreadyInitialized` maps to 409 via
`ApiError::from_omni`. Not currently HTTP-reachable (POST /graphs
was pulled), but the wiring lands here so a future runtime
create endpoint has one canonical translation.
Closes the "init is destructive against existing state" class.
The regression test added in the previous commit
(`init_on_existing_graph_uri_does_not_destroy_existing_schema`)
turns green: the original schema files now survive a second
init attempt byte-for-byte, and the call errors cleanly with
`AlreadyInitialized`. The four existing
`init_failpoint_*` tests stay green — strict
mode's preflight passes on a fresh tempdir, and cleanup still
runs as before when a failpoint fires mid-write.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: split PolicyEngine::load into kind-typed loaders
Pre-fix, every caller of `PolicyEngine::load(path, graph_id)`
passed *some* `graph_id` argument — even when the policy was
server-scoped and Cedar's resolution would never touch a Graph
entity. The server-level loader at lib.rs passed the meaningless
sentinel `"server"`. A graph policy file containing a `graph_list`
rule compiled fine; a server policy file containing a `read` rule
compiled fine. Both silently no-op'd at request time because the
engine kind and the rule's resource kind disagreed.
Correct-by-design fix: replace `load` with two kind-typed loaders.
* `PolicyEngine::load_graph(path, graph_id)` — for per-graph
policy files. Rejects any rule whose action `resource_kind()`
is `Server`.
* `PolicyEngine::load_server(path)` — for server-level policy
files. Takes no `graph_id`: server-scoped actions resolve against
the singleton `Omnigraph::Server::"root"` entity, never a Graph.
Rejects any rule whose action `resource_kind()` is `Graph`.
The old `load` is hard-deleted in the same commit because every
in-tree consumer migrates here (no semver promise on the workspace
crate, no external pinners). New `PolicyEngineKind` enum types
the loader's intent; `validate_kind_alignment` is the load-time
check that closes the "wrong action, wrong file, silent no-op"
class — operators get a load-time error instead of confused-and-
silent behavior at request time.
Callsites migrated:
* server lib.rs:374 (single-mode per-graph) → load_graph
* server lib.rs:1065 (multi-mode server) → load_server
* server lib.rs:1103 (multi-mode per-graph) → load_graph
* CLI main.rs:732 (resolve_policy_engine) → load_graph
* tests/server.rs ×5 (4 graph, 1 server) → load_graph/load_server
* policy_engine_chassis.rs → load_graph
Four new in-source tests pin the contract: both rejection paths
and both positive paths.
Closes the "operator puts an action in the wrong file and the
rule silently never matches" class.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: introduce GraphRouting, retire single_mode_handle
Pre-fix, `AppState` always carried `Arc<GraphRegistry>` even when
serving one graph. Single mode populated the registry with one
handle keyed by the `SINGLE_GRAPH_KEY_ID = "default"` sentinel;
`single_mode_handle` walked the registry, asserted `len == 1`,
and returned the single element with a 500-class "programmer
error" branch on mismatch. Three smells in a row — magic key,
walk-and-assert, programmer-error guard — all because the
single-mode runtime was forced through a multi-mode abstraction.
Correct-by-design fix: type the routing.
* New `pub enum GraphRouting { Single { handle }, Multi { registry,
config_path } }` on `AppState`. The `Single` arm carries the handle
directly — no registry, no key, no walk.
* `resolve_graph_handle` middleware matches on `routing`. Single mode
returns the handle in O(1); multi mode does the same path-extract +
registry lookup as before. The 500-class programmer-error branch
is gone — the type system now makes the violated invariant
("single mode has exactly one handle") unrepresentable.
* `requires_bearer_auth` reads `handle.policy.is_some()` directly
in the Single arm; Multi arm still uses the cached
`any_per_graph_policy` flag.
`ServerMode` and the legacy `registry` field on `AppState` are still
populated for now — C-3 removes both once every reader is migrated.
The `SINGLE_GRAPH_KEY_ID` sentinel and `ServerMode` will also go
away in C-3.
Closes the "single mode forced through a multi-mode abstraction"
class. All 76 server integration tests stay green: handlers still
extract `Extension<Arc<GraphHandle>>` from the request, so the
middleware's internal change is invisible to them.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: remove ServerMode, registry field, and the SINGLE_GRAPH sentinel
C-1/C-2 introduced `GraphRouting` and pointed the middleware at it.
This commit removes the legacy shape that's now dead:
* `ServerMode` enum — deleted. Single mode's `uri` lives on
`handle.uri`; multi mode's `config_path` lives on the
`GraphRouting::Multi` arm.
* `AppState.mode: ServerMode` field — deleted.
* `AppState.registry: Arc<GraphRegistry>` field — deleted. Multi
mode's registry is on `GraphRouting::Multi { registry, .. }`;
single mode has no registry at all.
* `AppState::mode()`, `AppState::uri()`, `AppState::registry()`
accessors — deleted. New `AppState::routing() -> &GraphRouting`
is the single public entry point.
* `SINGLE_GRAPH_KEY_ID` constant — deleted. `GraphHandle.key` is
still required by the struct, but in single mode the key is now
only a tracing label (`"default"`, inlined with a comment naming
its sole remaining purpose). Single-mode flat routes never carry
a `{graph_id}` parameter, so the key is never compared against
user input, and there is no registry where it could be a map
key. C-1/C-2 already removed the registry walk that the sentinel
was named for.
Callers migrated:
* `build_app` (lib.rs:944) — matches on `state.routing()` instead
of `state.mode()`.
* `server_graphs_list` (lib.rs:1162) — destructures the Multi arm
to get the registry; Single arm short-circuits to 405.
* `server_openapi` (lib.rs:1217) — matches the Multi arm for the
cluster-prefix rewrite.
* `tests/server.rs:3735` — the B2 footgun regression test now
matches on `state.routing()` to extract the single-mode handle
(the test's earlier `state.registry().list().next()` shape was
the closest pre-fix analog to "embedded consumer reaches the
engine"; the new shape is more direct).
Closes the entire "single mode forced through a multi-mode
abstraction" class. After this commit:
* No magic sentinel as a routing key.
* No `single_mode_handle` walk-and-assert helper.
* No 500-class "programmer error" branch in the middleware.
* No two-field discriminant on `AppState` where one would do.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: regression test for nested-route path extraction (red)
`server_branch_delete` and `server_commit_show` use bare
`Path<String>` extractors. In single-mode flat routes
(`/branches/{branch}`, `/commits/{commit_id}`) this works — one
capture, one value. In multi-graph cluster routes
(`/graphs/{graph_id}/branches/{branch}`,
`/graphs/{graph_id}/commits/{commit_id}`) axum 0.8 propagates the
outer `{graph_id}` capture into the inner handler, so the
extractor sees two captures and 500s with
"Wrong number of path arguments. Expected 1 but got 2."
`cluster_routes_dispatch_per_graph_handle` only exercises
`/snapshot` (no Path extractor), so the regression slipped through.
This test closes that gap structurally: every cluster route with
an inner path param gets exercised here.
Currently fails with the exact symptom above. Fix in the next
commit makes it pass.
Per AGENTS.md rule 12, the red test commit lands just before the
fix so the pair is visible in `git log` and a reviewer can check
out this commit alone to reproduce.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: named-field path-param structs for nested cluster routes (green)
`Path<String>` deserializes one path-param value positionally.
Single-mode flat routes (`/branches/{branch}`,
`/commits/{commit_id}`) have one capture; multi-mode nested routes
(`/graphs/{graph_id}/branches/{branch}`,
`/graphs/{graph_id}/commits/{commit_id}`) have two — axum 0.8
propagates the outer capture into nested handlers. Same handler,
two different shapes; the multi-mode shape 500s with
"Wrong number of path arguments. Expected 1 but got 2."
Symptomatic fix: change to `Path<(String, String)>` and ignore the
first element. Breaks again the moment we add another nest layer
(e.g. tenant in Cloud mode).
Correct-by-design fix: named-field structs deserialized by name
from axum's path-param map. Each handler picks only the fields it
needs. Stable across single / multi / future-cloud nest depths
because deserialization is by field name, not position.
* New `BranchPath { branch: String }` (file-local to lib.rs)
* New `CommitPath { commit_id: String }`
* `server_branch_delete` extractor → `Path<BranchPath>`
* `server_commit_show` extractor → `Path<CommitPath>`
Closes the "handler path-extractor type is positional and breaks
when route nesting changes" class. Red test from the previous
commit turns green. All 77 server tests pass (single-mode branch
delete + commit show, plus new multi-mode coverage).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: centralize policy-requires-tokens check in the runtime classifier
Single-mode `open_with_bearer_tokens_and_policy` bailed at lib.rs:380
when policy was installed and no tokens. Multi-mode
`open_multi_graph_state` had no equivalent: the server started, every
request 401'd because no token could ever match, and the operator
spent time debugging a misconfiguration the single-mode path would
have caught at startup.
The doc/code contradiction made the gap easy to miss: the
`ServerRuntimeState::PolicyEnabled` docstring said tokens-or-not
was "unusual but valid — every request fails 401 without a bearer,
which is effectively 'locked'." The single-mode bail contradicted
that. In practice, silent-401-on-every-request is bug-shaped, not
feature-shaped (operators wanting deny-all should configure tokens
plus a deny-all Cedar rule to get meaningful 403s with
policy-decision logging).
Symptomatic fix: add a copy of the bail to multi-mode. Two copies
that can drift again the next time a startup path is added.
Correct-by-design fix: hoist the check into
`classify_server_runtime_state` so both modes get the same
enforcement from one source of truth. The classifier becomes the
single source of truth for "should we start?" and adding a startup
invariant there is now the natural extension point for any future
mode.
Classifier matrix is now complete:
| has_tokens | has_policy | allow_unauthenticated | Result |
|---|---|---|---|
| F | F | F | bail (existing) |
| F | F | T | Open (existing) |
| T | F | * | DefaultDeny (existing) |
| F | T | * | bail (NEW — closes the gap) |
| T | T | * | PolicyEnabled (existing) |
Changes:
* `classify_server_runtime_state` (lib.rs:870-890) gains the
`(false, true, _) => bail!(…)` arm with a clear message naming
the failure mode and the two valid resolutions.
* `open_with_bearer_tokens_and_policy` (lib.rs:369+) drops its
redundant local bail — the classifier rejected the invalid case
before construction was reached.
* `ServerRuntimeState::PolicyEnabled` docstring is rewritten:
drops the "(unusual but valid)" carve-out and states plainly
that PolicyEnabled requires tokens. Names the explicit
alternative (tokens + deny-all Cedar rule) for operators who
want the all-requests-denied behavior.
* `classify_policy_enabled_always_wins` test is renamed to
`classify_policy_enabled_requires_tokens` and the now-invalid
`(false, true, _)` assertion is removed (covered by the new
rejection test).
* New `classify_policy_without_tokens_is_rejected` test covers the
new arm.
* New `serve_refuses_to_start_with_policy_but_no_tokens_multi_mode`
integration test pins the multi-mode propagation path —
symmetric with the existing single-mode
`serve_refuses_to_start_in_state_1_without_unauthenticated`.
Closes the "single mode and multi mode startup branches can drift
on safety invariants" class.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: close coverage gaps surfaced by the test-coverage audit
The bot-review pass and the subsequent coverage audit surfaced two
material gaps in PR #119's test surface — both easy to close, both
worth closing before merge.
* **Gap 1 — cluster-route sweep.** The Bug-1 path-extractor
regression slipped through because
`cluster_routes_dispatch_per_graph_handle` only exercised
`/snapshot`. The other seven protected cluster routes (`/read`,
`/change`, `/export`, `/schema`, `/schema/apply`, `/ingest`,
`/branches/merge`) were implicitly trusted to work without any
multi-mode integration test.
Add `all_protected_cluster_routes_resolve_to_their_handler`
(`tests/server.rs`) that hits each protected cluster route with
a minimal request and asserts the response is consistent with
the handler being reached — no 404 (router didn't match), no 500
with "Wrong number of path arguments" (Bug-1 class), no 500 with
"missing extension" (routing middleware didn't inject the handle).
Status code is a negative assertion because each handler's
happy-path inputs differ; what matters is "the request reached
the handler," not "the handler returned 200" — that's already
pinned by the single-mode tests.
* **Gap 2 — `--force` happy path.** The strict re-init regression
test (`init_on_existing_graph_uri_does_not_destroy_existing_schema`)
pins the error path; nothing pinned the `force: true` escape
hatch actually doing what its docstring claims.
Add `init_with_force_recovers_from_orphan_schema_files`
(`tests/lifecycle.rs`). Writes a bare `_schema.pg` to simulate
orphan files from a failed prior init, confirms strict mode
bails as expected, then confirms `init_with_options(force: true)`
succeeds and produces a functional graph.
Note: the test follows the documented semantics — force skips
the preflight only, it does NOT purge existing Lance state. An
earlier draft of the test (against full overwrite of an existing
populated graph) failed because `GraphCoordinator::init` errored
on the existing `__manifest`, which is exactly the limitation
the `InitOptions::force` docstring already calls out. Recursive
purge needs `StorageAdapter::delete_prefix` (tracked separately).
Coverage is now fully aligned with the PR's claims.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: regression test for GraphList open-mode bypass (red)
Cursor bot's review at commit 4120448 surfaced that
`server_graphs_list` returns 200 in Open mode (`--unauthenticated`,
no tokens, no policy), exposing the full graph registry — graph
IDs and URIs that may contain S3 bucket paths or internal
hostnames — to any unauthenticated caller.
Root cause: `authorize_request`'s no-policy fallback only denies
when `actor.is_some()`. In Open mode `actor: None`, so the
denial branch never fires and the call returns `Ok(())`. The
docstring on `server_graphs_list` claims the endpoint is
"Cedar-gated" and that we "don't leak the registry until the
operator explicitly authorizes it" — but Open mode has no Cedar
at all, so the docstring intent and the code disagree.
This commit renames the existing
`get_graphs_lists_registered_graphs_in_multi_mode` test to
`get_graphs_denied_in_open_mode_without_server_policy` and flips
the assertion from 200 → 403. Today this fails (server returns
200) — exactly the symptom the bot named. The fix in the next
commit tightens the no-policy fallback to deny server-scoped
actions unconditionally, regardless of mode.
Per AGENTS.md rule 12, the red test commit lands just before
the fix so the red → green pair is visible in `git log` and a
reviewer can check out this commit alone to reproduce.
Sort-order coverage that previously lived in the renamed test
moves to `get_graphs_with_server_policy_authorizes_per_cedar`
in the next commit, where the admin-200 response is operator-
authorized and a non-empty body is asserted.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: server-scoped actions always require explicit policy (green)
`server_graphs_list` returned 200 in Open mode (`--unauthenticated`,
no tokens, no policy) because `authorize_request`'s no-policy
fallback only denied when `actor.is_some()` AND action != Read.
In Open mode `actor: None`, so the denial branch never fired and
the call returned `Ok(())` — leaking the registry (graph IDs +
URIs that may contain S3 bucket paths or internal hostnames) to
any unauthenticated caller. The docstring on `server_graphs_list`
claimed it was "Cedar-gated" and that the server should "not leak
the registry until the operator explicitly authorizes it" —
docstring intent and code disagreed.
Symptomatic fix: special-case GraphList. Breaks the moment
another server-scoped action (`graph_create`, `graph_delete`) is
added.
Correct-by-design fix: tie authorization to the action's
`resource_kind()`. Server-scoped actions
(`PolicyResourceKind::Server`) always require explicit policy
authorization — there is no runtime state where they're served
by default. Per-graph actions keep the existing default-deny
logic (DefaultDeny denies non-Read for authenticated actors;
Open mode allows everything per the operator's `--unauthenticated`
opt-in for graph DATA, but not for server topology).
The fix uses the existing `PolicyResourceKind` enum that #119
already added — no new abstraction. Future server-scoped actions
(runtime `graph_create`/`graph_delete` when the cluster catalog
ships) automatically pick up the same enforcement without any
per-action handler change.
Changes:
* `crates/omnigraph-server/src/lib.rs:51` — re-export
`PolicyResourceKind` (the kind discriminator was already public
on the omnigraph-policy crate; needed in scope here).
* `crates/omnigraph-server/src/lib.rs:1457` — `authorize_request`'s
no-policy fallback gains a server-scoped-action check that fires
before the actor-based default-deny logic. Error message names
the failure mode and points at `server.policy.file`.
* `crates/omnigraph-server/tests/server.rs:5037` —
`get_graphs_with_server_policy_authorizes_per_cedar` extended
to register two graphs in non-alphabetical order and assert
the admin-200 response is sorted alphabetically. Restores the
sort-order coverage that lived in
`get_graphs_lists_registered_graphs_in_multi_mode` before the
red commit renamed it to assert denial.
Also bundles a small adjacent cleanup that the bot-review flagged:
* `crates/omnigraph-server/src/graph_id.rs:124` — drop the
unreachable `"openapi.json"` entry from `is_reserved`. The
regex `^[a-zA-Z0-9-]{1,64}$` rejects every dot-containing name
before `is_reserved` can run, so dotted entries in this list
were dead code that misled readers into thinking the list
needed to cover them. Comment now names the structural
exclusion. The `rejects_reserved_route_names` test loses its
`openapi.json` row (covered by `rejects_dots` via the regex).
Closes the "server-scoped management actions silently leak in
Open mode" class. Red test from the previous commit
(`get_graphs_denied_in_open_mode_without_server_policy`) turns
green; all 78 server integration tests + 76 lib tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: fold multi-graph work into v0.6.0 (no separate v0.7.0 release)
The branch had bumped workspace versions to 0.7.0 and added a
dedicated `docs/releases/v0.7.0.md` for the multi-graph work.
Per scope decision: ship the graph-rename and the multi-graph
mode in one v0.6.0 release.
Changes:
* Workspace versions bumped 0.7.0 → 0.6.0 in every crate manifest
(`omnigraph`, `omnigraph-compiler`, `omnigraph-policy`,
`omnigraph-server`, `omnigraph-cli`) and their internal
`path = ..., version = "..."` dependency constraints.
* `docs/releases/v0.7.0.md` content merged into
`docs/releases/v0.6.0.md`, retargeted to a single coherent
v0.6.0 release note covering both the graph terminology rename
and the multi-graph server mode. The original v0.7.0.md is
deleted.
* All `v0.7.0` / `0.7.0` doc and comment references throughout
`crates/`, `docs/`, `AGENTS.md`, and `openapi.json` retargeted
to `v0.6.0` / `0.6.0`. `Cargo.lock` regenerated to match.
* OpenAPI spec regenerated via `OMNIGRAPH_UPDATE_OPENAPI=1
cargo test -p omnigraph-server --test openapi
openapi_spec_is_up_to_date` — `"version": "0.6.0"` now.
Verification:
* `cargo build --workspace` — clean (6 pre-existing engine
warnings only).
* `cargo test --workspace --locked` — zero failures across all
39 test result groups.
* `bash scripts/check-agents-md.sh` — passes (34 links / 33 docs).
* `grep -rn "0\.7\.0\|v0\.7\.0" --include='*.rs' --include='*.md'
--include='*.json' --include='*.toml' .` returns no workspace
hits. The three remaining `0.7.0` strings in `Cargo.lock`
belong to unrelated 3rd-party crates (`pem-rfc7468`, `radium`,
`rand_xoshiro`).
The git tag and crates.io publish happen later — this commit
just consolidates the surface so the eventual release is one
coherent v0.6.0 covering all the work since v0.5.0.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* mr-668: sanitize internal refs from v0.6.0 release notes
cubic-dev-ai P2 comments flagged that the release notes carried
internal Linear ticket and RFC references (MR-668, MR-731,
MR-723, RFC 0003, RFC 0004). Per AGENTS.md maintenance rule 5,
"Release docs are public project history. Describe capabilities,
behavior changes, breaking changes, upgrade notes, and user
impact; do not reference private ticket systems, internal
codenames, or planning shorthand that an outside contributor
cannot inspect." The bot's comments are correct against our own
published contract — they were a docs-quality regression
introduced when I drafted these notes.
Replaced each internal reference with the public-facing concept
it stood for. The substantive content (capabilities, behavior,
guarantees) was already present alongside the refs; sanitization
just trimmed the bracketed ticket labels:
* Line 6: dropped `(MR-668)` from the multi-graph mode summary —
the descriptive name was already self-sufficient.
* Line 24: `MR-731 spoof defense` → `the bearer-derived-actor-
identity guarantee`; `Forward-compat for Cloud mode (RFC 0003)
and OAuth provider (RFC 0004)` → "forward-compat seams for
future multi-tenant and OAuth deployments; they're inert in
this release" — describes what the operator sees instead of
pointing at planning docs.
* Line 26: `MR-731's server-authoritative-actor invariant` →
"the server-authoritative-actor invariant: actor identity is
always sourced from the bearer-token match resolved at the
auth boundary" — the public-facing statement of the guarantee.
* Line 36: `(MR-723 default-deny otherwise rejects …)` →
"without a server policy the default-deny posture rejects …"
— same content, no ticket label.
* Line 121: `MR-731 spoof regression test` → "The bearer-auth-
derived-actor-identity regression test (client-supplied
identity headers are ignored; the server-resolved actor is the
only identity Cedar sees)" — describes what the test guards
instead of naming the originating ticket.
Verified: `grep -E 'MR-\d+|RFC[ -]?\d+' docs/releases/v0.6.0.md`
returns no matches; the rest of `docs/releases/` is also clean.
`scripts/check-agents-md.sh` passes.
Note: cubic-dev-ai also flagged `crates/omnigraph-cli/src/main.rs:276`
("doc comment incorrectly references v0.6.0 for a command that
only exists in v0.7.0"). That comment is based on a stale model
of the release surface — after folding v0.7.0 into v0.6.0 in
the previous commit, the multi-graph CLI surface IS in v0.6.0
and the comment is correct as written. No change needed.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
* fix: close validated init and multi-graph gaps
* chore: address review cleanup comments
---------
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 16:19:31 +02:00
|
|
|
|
|
|
|
|
// ─── MR-668 PR 2a: Omnigraph::init cleanup on partial failure ──────────────
|
|
|
|
|
//
|
|
|
|
|
// `init_with_storage` writes three schema artifacts before invoking
|
|
|
|
|
// `GraphCoordinator::init`. Without cleanup, a failure between any of those
|
|
|
|
|
// steps left orphan files behind, making the URI unusable for a retry of
|
|
|
|
|
// `init` (it would refuse because `_schema.pg` already exists). The tests
|
|
|
|
|
// below pin: on failpoint trigger at each of the three phase boundaries,
|
|
|
|
|
// the three schema files are removed before the error is returned.
|
|
|
|
|
//
|
|
|
|
|
// Coverage note: the third boundary (`init.after_coordinator_init`) only
|
|
|
|
|
// asserts cleanup of the schema files. Lance per-type directories and
|
|
|
|
|
// `__manifest/` are NOT cleaned up — that requires a recursive
|
|
|
|
|
// `StorageAdapter::delete_prefix` primitive deferred along with
|
|
|
|
|
// `DELETE /graphs/{id}` (MR-668 PR 2b). The orphan Lance directories
|
|
|
|
|
// after a coordinator-init-phase failure are documented as a known
|
|
|
|
|
// limitation.
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn init_failpoint_after_schema_pg_written_cleans_up_schema_file() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap();
|
|
|
|
|
let _failpoint = ScopedFailPoint::new("init.after_schema_pg_written", "return");
|
|
|
|
|
|
|
|
|
|
let err = match Omnigraph::init(uri, helpers::TEST_SCHEMA).await {
|
|
|
|
|
Ok(_) => panic!("expected Omnigraph::init to fail at the configured failpoint"),
|
|
|
|
|
Err(e) => e,
|
|
|
|
|
};
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: init.after_schema_pg_written"),
|
|
|
|
|
"got: {err}"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Only `_schema.pg` was written at this phase boundary, but the
|
|
|
|
|
// cleanup attempts all three — `delete` treats not-found as Ok,
|
|
|
|
|
// so the other two deletes are no-ops.
|
|
|
|
|
assert!(
|
|
|
|
|
!dir.path().join("_schema.pg").exists(),
|
|
|
|
|
"_schema.pg must be cleaned up after init failure"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn init_failpoint_after_schema_contract_written_cleans_up_all_schema_files() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap();
|
|
|
|
|
let _failpoint = ScopedFailPoint::new("init.after_schema_contract_written", "return");
|
|
|
|
|
|
|
|
|
|
let err = match Omnigraph::init(uri, helpers::TEST_SCHEMA).await {
|
|
|
|
|
Ok(_) => panic!("expected Omnigraph::init to fail at the configured failpoint"),
|
|
|
|
|
Err(e) => e,
|
|
|
|
|
};
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: init.after_schema_contract_written"),
|
|
|
|
|
"got: {err}"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
assert!(
|
|
|
|
|
!dir.path().join("_schema.pg").exists(),
|
|
|
|
|
"_schema.pg must be cleaned up"
|
|
|
|
|
);
|
|
|
|
|
assert!(
|
|
|
|
|
!dir.path().join("_schema.ir.json").exists(),
|
|
|
|
|
"_schema.ir.json must be cleaned up"
|
|
|
|
|
);
|
|
|
|
|
assert!(
|
|
|
|
|
!dir.path().join("__schema_state.json").exists(),
|
|
|
|
|
"__schema_state.json must be cleaned up"
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn init_failpoint_after_coordinator_init_cleans_up_schema_files() {
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap();
|
|
|
|
|
let _failpoint = ScopedFailPoint::new("init.after_coordinator_init", "return");
|
|
|
|
|
|
|
|
|
|
let err = match Omnigraph::init(uri, helpers::TEST_SCHEMA).await {
|
|
|
|
|
Ok(_) => panic!("expected Omnigraph::init to fail at the configured failpoint"),
|
|
|
|
|
Err(e) => e,
|
|
|
|
|
};
|
|
|
|
|
assert!(
|
|
|
|
|
err.to_string()
|
|
|
|
|
.contains("injected failpoint triggered: init.after_coordinator_init"),
|
|
|
|
|
"got: {err}"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Schema files are cleaned up by `best_effort_cleanup_init_artifacts`.
|
|
|
|
|
assert!(
|
|
|
|
|
!dir.path().join("_schema.pg").exists(),
|
|
|
|
|
"_schema.pg must be cleaned up after late-phase init failure"
|
|
|
|
|
);
|
|
|
|
|
assert!(
|
|
|
|
|
!dir.path().join("_schema.ir.json").exists(),
|
|
|
|
|
"_schema.ir.json must be cleaned up after late-phase init failure"
|
|
|
|
|
);
|
|
|
|
|
assert!(
|
|
|
|
|
!dir.path().join("__schema_state.json").exists(),
|
|
|
|
|
"__schema_state.json must be cleaned up after late-phase init failure"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// Documented limitation: Lance per-type datasets and `__manifest/`
|
|
|
|
|
// created by `GraphCoordinator::init` are NOT cleaned up — recursive
|
|
|
|
|
// deletion requires the deferred `delete_prefix` primitive. This
|
|
|
|
|
// assertion does NOT check for their absence; it merely documents
|
|
|
|
|
// the boundary by noting we don't validate orphan directories here.
|
|
|
|
|
// When PR 2b lands, this test can be tightened to assert the graph
|
|
|
|
|
// root is fully empty.
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[tokio::test]
|
|
|
|
|
async fn init_failpoint_returns_original_error_not_cleanup_error() {
|
|
|
|
|
// The cleanup is best-effort. If `storage.delete` fails (e.g. transient
|
|
|
|
|
// network blip on S3), the original init failpoint error must still
|
|
|
|
|
// surface — not be masked by a cleanup failure. This test triggers the
|
|
|
|
|
// failpoint and asserts the returned error references the failpoint,
|
|
|
|
|
// not the cleanup. (The cleanup currently logs via `tracing::warn`;
|
|
|
|
|
// we can't easily fault-inject delete failures without another seam,
|
|
|
|
|
// so this is a smoke test for the precedence contract.)
|
|
|
|
|
let _scenario = FailScenario::setup();
|
|
|
|
|
let dir = tempfile::tempdir().unwrap();
|
|
|
|
|
let uri = dir.path().to_str().unwrap();
|
|
|
|
|
let _failpoint = ScopedFailPoint::new("init.after_schema_pg_written", "return");
|
|
|
|
|
|
|
|
|
|
let err = match Omnigraph::init(uri, helpers::TEST_SCHEMA).await {
|
|
|
|
|
Ok(_) => panic!("expected Omnigraph::init to fail at the configured failpoint"),
|
|
|
|
|
Err(e) => e,
|
|
|
|
|
};
|
|
|
|
|
// Failpoint message wins; no "cleanup" substring expected.
|
|
|
|
|
let msg = err.to_string();
|
|
|
|
|
assert!(
|
|
|
|
|
msg.contains("init.after_schema_pg_written"),
|
|
|
|
|
"init error must surface the failpoint cause, got: {msg}"
|
|
|
|
|
);
|
|
|
|
|
}
|