diff --git a/crates/omnigraph/src/db/omnigraph/table_ops.rs b/crates/omnigraph/src/db/omnigraph/table_ops.rs index ab3757c..1e48d03 100644 --- a/crates/omnigraph/src/db/omnigraph/table_ops.rs +++ b/crates/omnigraph/src/db/omnigraph/table_ops.rs @@ -607,8 +607,9 @@ async fn stage_and_commit_btree( )) })?; // Failpoint between stage and commit. Used by `tests/failpoints.rs` - // to demonstrate that a Phase A failure in the staged-index path - // leaves no Lance-HEAD drift on the touched table. + // to demonstrate that a stage-step failure in the staged-index + // path (`stage_create_btree_index` succeeded; `commit_staged` not + // yet called) leaves no Lance-HEAD drift on the touched table. crate::failpoints::maybe_fail("ensure_indices.post_stage_pre_commit_btree")?; let new_ds = db .table_store diff --git a/crates/omnigraph/tests/composite_flow.rs b/crates/omnigraph/tests/composite_flow.rs index a87187f..00f4d49 100644 --- a/crates/omnigraph/tests/composite_flow.rs +++ b/crates/omnigraph/tests/composite_flow.rs @@ -51,7 +51,7 @@ const TEST_DATA: &str = include_str!("fixtures/test.jsonl"); const TEST_QUERIES: &str = include_str!("fixtures/test.gq"); #[tokio::test] -async fn composite_flow_init_load_branch_merge_time_travel_optimize_cleanup() { +async fn composite_flow_canonical_lifecycle() { let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); diff --git a/crates/omnigraph/tests/failpoints.rs b/crates/omnigraph/tests/failpoints.rs index 8366b9f..0d8a20a 100644 --- a/crates/omnigraph/tests/failpoints.rs +++ b/crates/omnigraph/tests/failpoints.rs @@ -232,7 +232,7 @@ async fn recovery_rolls_forward_after_finalize_publisher_failure() { let uri = dir.path().to_str().unwrap().to_string(); let operation_id; - // Phase A: trigger the residual. + // Setup: trigger the residual. { let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap(); let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); @@ -272,7 +272,7 @@ async fn recovery_rolls_forward_after_finalize_publisher_failure() { // Drop the failpoint scope and the engine handle. } - // Phase B: reopen runs the recovery sweep. The sweep finds the + // Recovery: reopen runs the recovery sweep. The sweep finds the // sidecar, classifies node:Person as RolledPastExpected, decides // RollForward, publishes the manifest update, records the audit // row, deletes the sidecar. @@ -557,7 +557,7 @@ async fn refresh_runs_roll_forward_recovery_in_process() { let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap(); - // Phase A: trigger the residual (sidecar persists; manifest unchanged). + // Setup: trigger the residual (sidecar persists; manifest unchanged). { let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return"); let err = mutate_main( @@ -581,7 +581,7 @@ async fn refresh_runs_roll_forward_recovery_in_process() { ); } - // Phase B: explicit refresh runs roll-forward-only recovery + // Recovery: explicit refresh runs roll-forward-only recovery // in-process — no restart needed. Sidecar finds the Person drift, // classifies RolledPastExpected, rolls forward via publisher CAS, // and deletes the sidecar. @@ -804,7 +804,7 @@ async fn finalize_publisher_residual_does_not_drift_untouched_tables() { .expect("Company write on a non-drifted table should succeed"); } -/// Acceptance test: a Phase A failure in the staged-index path +/// Acceptance test: a stage-step failure in the staged-index path /// (`stage_create_btree_index` succeeded; `commit_staged` not yet /// called) leaves NO Lance-HEAD drift on the existing tables. /// Subsequent operations against those tables succeed without @@ -828,7 +828,7 @@ async fn finalize_publisher_residual_does_not_drift_untouched_tables() { /// by `cleanup_old_versions` (or removed when a future apply at the /// same target path resolves the rename). #[tokio::test] -async fn ensure_indices_phase_a_btree_failure_leaves_existing_tables_writable() { +async fn ensure_indices_stage_btree_failure_leaves_existing_tables_writable() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap().to_string(); @@ -1024,7 +1024,7 @@ async fn schema_apply_phase_b_failure_recovered_on_next_open() { version_main(&db).await.unwrap() }; - // Phase A: trigger the residual via `schema_apply.after_staging_write`. + // Setup: trigger the residual via `schema_apply.after_staging_write`. // This failpoint fires AFTER the rewritten_tables/indexed_tables loops // (Lance HEAD advanced) AND AFTER the schema-state staging files are // written, but BEFORE the manifest publish. The recovery sidecar persists. @@ -1079,7 +1079,7 @@ edge WorksAt: Person -> Company operation_id = single_sidecar_operation_id(dir.path()); } - // Phase B: reopen runs the recovery sweep. Sidecar's writer_kind is + // Recovery: reopen runs the recovery sweep. Sidecar's writer_kind is // SchemaApply (loose-match) — classifier accepts the multi-commit // drift on Person, decision is RollForward, manifest extends to the // current Lance HEAD. @@ -1188,7 +1188,7 @@ async fn branch_merge_phase_b_failure_recovered_on_next_open() { version_main(&db).await.unwrap() }; - // Phase A: failpoint fires after the per-table publish loop completes + // Setup: failpoint fires after the per-table publish loop completes // but before commit_manifest_updates. Sidecar persists. { let mut db = Omnigraph::open(&uri).await.unwrap(); @@ -1214,7 +1214,7 @@ async fn branch_merge_phase_b_failure_recovered_on_next_open() { ); } - // Phase B: reopen runs the sweep. BranchMerge uses LOOSE + // Recovery: reopen runs the sweep. BranchMerge uses LOOSE // classification — `publish_rewritten_merge_table` runs multiple // commit_staged calls per table (stage_merge_insert + delete_where + // index rebuilds), so post_commit_pin in the sidecar is a lower @@ -1363,7 +1363,7 @@ async fn branch_merge_phase_b_failure_recovered_on_non_main_target() { .await .unwrap(); - // Phase A: failpoint fires after the per-table publish loop completes + // Setup: failpoint fires after the per-table publish loop completes // but before commit_manifest_updates. Sidecar persists with // branch=Some("target_branch"). { @@ -1389,7 +1389,7 @@ async fn branch_merge_phase_b_failure_recovered_on_non_main_target() { operation_id = single_sidecar_operation_id(dir.path()); } - // Phase B: reopen runs full sweep. The BranchMerge sidecar's branch + // Recovery: reopen runs full sweep. The BranchMerge sidecar's branch // = Some("target_branch"); D2 fix opens a per-branch CommitGraph // for the audit append so the merge-parent linkage is correct. let db = Omnigraph::open(&uri).await.unwrap(); @@ -1554,7 +1554,7 @@ async fn ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_neede .unwrap(); } - // Phase A: trigger the failpoint. Steady-state ensure_indices + // Setup: trigger the failpoint. Steady-state ensure_indices // produces zero sidecar pins (the helpers scope pins to tables // that genuinely need work); no sidecar is written. The failpoint // still fires, surfacing the Err. @@ -1590,7 +1590,7 @@ async fn ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_neede ); } - // Phase B: reopen is a clean no-op (no sidecar to recover). + // Recovery: reopen is a clean no-op (no sidecar to recover). let _db = Omnigraph::open(&uri).await.unwrap(); let recovery_dir = dir.path().join("__recovery"); diff --git a/docs/runs.md b/docs/runs.md index 5e4d6b5..5c8dfd8 100644 --- a/docs/runs.md +++ b/docs/runs.md @@ -163,6 +163,15 @@ are left at `Lance HEAD = manifest_pinned + 1`. 3. **Phase C**: publisher commits the manifest. 4. **Phase D**: writer deletes the sidecar. +> **Phase letter convention.** Throughout the recovery code, log +> messages, failpoint names (e.g. `branch_merge.post_phase_b_pre_manifest_commit`), +> and the per-writer integration tests, "Phase A/B/C/D" refers +> exclusively to the four-step lifecycle above. The per-table +> staged-write contract (`stage_*` then `commit_staged`, two steps) +> is referred to by those API verbs — never by phase letters — so a +> reader of `recovery.rs`, `failpoints.rs`, or this document only +> encounters phase letters in the per-writer context. + A failure between Phase A and Phase D leaves the sidecar on disk. The next `Omnigraph::open` (gated on `OpenMode::ReadWrite`) runs the recovery sweep in `crates/omnigraph/src/db/manifest/recovery.rs`: