mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-06-21 02:28:07 +02:00
docs/tests: reserve Phase A/B/C/D for the per-writer recovery flow
Three terminologies were calling themselves Phase A/B in PR #72: 1. Per-writer recovery (canonical, four phases A/B/C/D — sidecar / commit_staged loop / manifest publish / sidecar delete in `docs/runs.md:157`). 2. Per-table staged-write contract from MR-793 (two phases — `stage_*` then `commit_staged`). 3. Test-narrative scaffolding (Phase A = setup the failure, Phase B = verify recovery — used as section dividers in failpoints.rs). Same letters, three meanings; three reviewers including the bots have already misread the code in the resulting fog. This change keeps "Phase A/B/C/D" exclusively for #1 and rewrites the other two: - `ensure_indices_phase_a_btree_failure_leaves_existing_tables_writable` → `ensure_indices_stage_btree_failure_leaves_existing_tables_writable` (matches the `stage_create_btree_index` API verb). - Comment at `table_ops.rs:610` and the test docstring at `failpoints.rs:807` rewrite "a Phase A failure in the staged-index path" → "a stage-step failure in the staged-index path". - Twelve `// Phase A:` / `// Phase B:` test scaffolding comment headers in `failpoints.rs` (across six test fns) become `// Setup:` / `// Recovery:`. - A "Phase letter convention" note added near `docs/runs.md:165` spells the rule out for future readers. Also bundled: rename `composite_flow_init_load_branch_merge_time_travel_optimize_cleanup` → `composite_flow_canonical_lifecycle` so it pairs as a story name with `composite_flow_multi_branch_sequential_merges` (the previously- deferred symmetry rename). No behaviour change. Both renamed tests pass; full failpoints (18) + composite_flow (2) suites pass; workspace baseline + clippy clean. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
fb0f024652
commit
a30666bc38
4 changed files with 27 additions and 17 deletions
|
|
@ -607,8 +607,9 @@ async fn stage_and_commit_btree(
|
||||||
))
|
))
|
||||||
})?;
|
})?;
|
||||||
// Failpoint between stage and commit. Used by `tests/failpoints.rs`
|
// Failpoint between stage and commit. Used by `tests/failpoints.rs`
|
||||||
// to demonstrate that a Phase A failure in the staged-index path
|
// to demonstrate that a stage-step failure in the staged-index
|
||||||
// leaves no Lance-HEAD drift on the touched table.
|
// path (`stage_create_btree_index` succeeded; `commit_staged` not
|
||||||
|
// yet called) leaves no Lance-HEAD drift on the touched table.
|
||||||
crate::failpoints::maybe_fail("ensure_indices.post_stage_pre_commit_btree")?;
|
crate::failpoints::maybe_fail("ensure_indices.post_stage_pre_commit_btree")?;
|
||||||
let new_ds = db
|
let new_ds = db
|
||||||
.table_store
|
.table_store
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,7 @@ const TEST_DATA: &str = include_str!("fixtures/test.jsonl");
|
||||||
const TEST_QUERIES: &str = include_str!("fixtures/test.gq");
|
const TEST_QUERIES: &str = include_str!("fixtures/test.gq");
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn composite_flow_init_load_branch_merge_time_travel_optimize_cleanup() {
|
async fn composite_flow_canonical_lifecycle() {
|
||||||
let dir = tempfile::tempdir().unwrap();
|
let dir = tempfile::tempdir().unwrap();
|
||||||
let uri = dir.path().to_str().unwrap();
|
let uri = dir.path().to_str().unwrap();
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -232,7 +232,7 @@ async fn recovery_rolls_forward_after_finalize_publisher_failure() {
|
||||||
let uri = dir.path().to_str().unwrap().to_string();
|
let uri = dir.path().to_str().unwrap().to_string();
|
||||||
let operation_id;
|
let operation_id;
|
||||||
|
|
||||||
// Phase A: trigger the residual.
|
// Setup: trigger the residual.
|
||||||
{
|
{
|
||||||
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
||||||
let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return");
|
let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return");
|
||||||
|
|
@ -272,7 +272,7 @@ async fn recovery_rolls_forward_after_finalize_publisher_failure() {
|
||||||
// Drop the failpoint scope and the engine handle.
|
// Drop the failpoint scope and the engine handle.
|
||||||
}
|
}
|
||||||
|
|
||||||
// Phase B: reopen runs the recovery sweep. The sweep finds the
|
// Recovery: reopen runs the recovery sweep. The sweep finds the
|
||||||
// sidecar, classifies node:Person as RolledPastExpected, decides
|
// sidecar, classifies node:Person as RolledPastExpected, decides
|
||||||
// RollForward, publishes the manifest update, records the audit
|
// RollForward, publishes the manifest update, records the audit
|
||||||
// row, deletes the sidecar.
|
// row, deletes the sidecar.
|
||||||
|
|
@ -557,7 +557,7 @@ async fn refresh_runs_roll_forward_recovery_in_process() {
|
||||||
|
|
||||||
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
||||||
|
|
||||||
// Phase A: trigger the residual (sidecar persists; manifest unchanged).
|
// Setup: trigger the residual (sidecar persists; manifest unchanged).
|
||||||
{
|
{
|
||||||
let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return");
|
let _failpoint = ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return");
|
||||||
let err = mutate_main(
|
let err = mutate_main(
|
||||||
|
|
@ -581,7 +581,7 @@ async fn refresh_runs_roll_forward_recovery_in_process() {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Phase B: explicit refresh runs roll-forward-only recovery
|
// Recovery: explicit refresh runs roll-forward-only recovery
|
||||||
// in-process — no restart needed. Sidecar finds the Person drift,
|
// in-process — no restart needed. Sidecar finds the Person drift,
|
||||||
// classifies RolledPastExpected, rolls forward via publisher CAS,
|
// classifies RolledPastExpected, rolls forward via publisher CAS,
|
||||||
// and deletes the sidecar.
|
// and deletes the sidecar.
|
||||||
|
|
@ -804,7 +804,7 @@ async fn finalize_publisher_residual_does_not_drift_untouched_tables() {
|
||||||
.expect("Company write on a non-drifted table should succeed");
|
.expect("Company write on a non-drifted table should succeed");
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Acceptance test: a Phase A failure in the staged-index path
|
/// Acceptance test: a stage-step failure in the staged-index path
|
||||||
/// (`stage_create_btree_index` succeeded; `commit_staged` not yet
|
/// (`stage_create_btree_index` succeeded; `commit_staged` not yet
|
||||||
/// called) leaves NO Lance-HEAD drift on the existing tables.
|
/// called) leaves NO Lance-HEAD drift on the existing tables.
|
||||||
/// Subsequent operations against those tables succeed without
|
/// Subsequent operations against those tables succeed without
|
||||||
|
|
@ -828,7 +828,7 @@ async fn finalize_publisher_residual_does_not_drift_untouched_tables() {
|
||||||
/// by `cleanup_old_versions` (or removed when a future apply at the
|
/// by `cleanup_old_versions` (or removed when a future apply at the
|
||||||
/// same target path resolves the rename).
|
/// same target path resolves the rename).
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn ensure_indices_phase_a_btree_failure_leaves_existing_tables_writable() {
|
async fn ensure_indices_stage_btree_failure_leaves_existing_tables_writable() {
|
||||||
let _scenario = FailScenario::setup();
|
let _scenario = FailScenario::setup();
|
||||||
let dir = tempfile::tempdir().unwrap();
|
let dir = tempfile::tempdir().unwrap();
|
||||||
let uri = dir.path().to_str().unwrap().to_string();
|
let uri = dir.path().to_str().unwrap().to_string();
|
||||||
|
|
@ -1024,7 +1024,7 @@ async fn schema_apply_phase_b_failure_recovered_on_next_open() {
|
||||||
version_main(&db).await.unwrap()
|
version_main(&db).await.unwrap()
|
||||||
};
|
};
|
||||||
|
|
||||||
// Phase A: trigger the residual via `schema_apply.after_staging_write`.
|
// Setup: trigger the residual via `schema_apply.after_staging_write`.
|
||||||
// This failpoint fires AFTER the rewritten_tables/indexed_tables loops
|
// This failpoint fires AFTER the rewritten_tables/indexed_tables loops
|
||||||
// (Lance HEAD advanced) AND AFTER the schema-state staging files are
|
// (Lance HEAD advanced) AND AFTER the schema-state staging files are
|
||||||
// written, but BEFORE the manifest publish. The recovery sidecar persists.
|
// written, but BEFORE the manifest publish. The recovery sidecar persists.
|
||||||
|
|
@ -1079,7 +1079,7 @@ edge WorksAt: Person -> Company
|
||||||
operation_id = single_sidecar_operation_id(dir.path());
|
operation_id = single_sidecar_operation_id(dir.path());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Phase B: reopen runs the recovery sweep. Sidecar's writer_kind is
|
// Recovery: reopen runs the recovery sweep. Sidecar's writer_kind is
|
||||||
// SchemaApply (loose-match) — classifier accepts the multi-commit
|
// SchemaApply (loose-match) — classifier accepts the multi-commit
|
||||||
// drift on Person, decision is RollForward, manifest extends to the
|
// drift on Person, decision is RollForward, manifest extends to the
|
||||||
// current Lance HEAD.
|
// current Lance HEAD.
|
||||||
|
|
@ -1188,7 +1188,7 @@ async fn branch_merge_phase_b_failure_recovered_on_next_open() {
|
||||||
version_main(&db).await.unwrap()
|
version_main(&db).await.unwrap()
|
||||||
};
|
};
|
||||||
|
|
||||||
// Phase A: failpoint fires after the per-table publish loop completes
|
// Setup: failpoint fires after the per-table publish loop completes
|
||||||
// but before commit_manifest_updates. Sidecar persists.
|
// but before commit_manifest_updates. Sidecar persists.
|
||||||
{
|
{
|
||||||
let mut db = Omnigraph::open(&uri).await.unwrap();
|
let mut db = Omnigraph::open(&uri).await.unwrap();
|
||||||
|
|
@ -1214,7 +1214,7 @@ async fn branch_merge_phase_b_failure_recovered_on_next_open() {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Phase B: reopen runs the sweep. BranchMerge uses LOOSE
|
// Recovery: reopen runs the sweep. BranchMerge uses LOOSE
|
||||||
// classification — `publish_rewritten_merge_table` runs multiple
|
// classification — `publish_rewritten_merge_table` runs multiple
|
||||||
// commit_staged calls per table (stage_merge_insert + delete_where +
|
// commit_staged calls per table (stage_merge_insert + delete_where +
|
||||||
// index rebuilds), so post_commit_pin in the sidecar is a lower
|
// index rebuilds), so post_commit_pin in the sidecar is a lower
|
||||||
|
|
@ -1363,7 +1363,7 @@ async fn branch_merge_phase_b_failure_recovered_on_non_main_target() {
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
// Phase A: failpoint fires after the per-table publish loop completes
|
// Setup: failpoint fires after the per-table publish loop completes
|
||||||
// but before commit_manifest_updates. Sidecar persists with
|
// but before commit_manifest_updates. Sidecar persists with
|
||||||
// branch=Some("target_branch").
|
// branch=Some("target_branch").
|
||||||
{
|
{
|
||||||
|
|
@ -1389,7 +1389,7 @@ async fn branch_merge_phase_b_failure_recovered_on_non_main_target() {
|
||||||
operation_id = single_sidecar_operation_id(dir.path());
|
operation_id = single_sidecar_operation_id(dir.path());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Phase B: reopen runs full sweep. The BranchMerge sidecar's branch
|
// Recovery: reopen runs full sweep. The BranchMerge sidecar's branch
|
||||||
// = Some("target_branch"); D2 fix opens a per-branch CommitGraph
|
// = Some("target_branch"); D2 fix opens a per-branch CommitGraph
|
||||||
// for the audit append so the merge-parent linkage is correct.
|
// for the audit append so the merge-parent linkage is correct.
|
||||||
let db = Omnigraph::open(&uri).await.unwrap();
|
let db = Omnigraph::open(&uri).await.unwrap();
|
||||||
|
|
@ -1554,7 +1554,7 @@ async fn ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_neede
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Phase A: trigger the failpoint. Steady-state ensure_indices
|
// Setup: trigger the failpoint. Steady-state ensure_indices
|
||||||
// produces zero sidecar pins (the helpers scope pins to tables
|
// produces zero sidecar pins (the helpers scope pins to tables
|
||||||
// that genuinely need work); no sidecar is written. The failpoint
|
// that genuinely need work); no sidecar is written. The failpoint
|
||||||
// still fires, surfacing the Err.
|
// still fires, surfacing the Err.
|
||||||
|
|
@ -1590,7 +1590,7 @@ async fn ensure_indices_phase_b_failure_does_not_leak_sidecar_when_no_work_neede
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Phase B: reopen is a clean no-op (no sidecar to recover).
|
// Recovery: reopen is a clean no-op (no sidecar to recover).
|
||||||
let _db = Omnigraph::open(&uri).await.unwrap();
|
let _db = Omnigraph::open(&uri).await.unwrap();
|
||||||
|
|
||||||
let recovery_dir = dir.path().join("__recovery");
|
let recovery_dir = dir.path().join("__recovery");
|
||||||
|
|
|
||||||
|
|
@ -163,6 +163,15 @@ are left at `Lance HEAD = manifest_pinned + 1`.
|
||||||
3. **Phase C**: publisher commits the manifest.
|
3. **Phase C**: publisher commits the manifest.
|
||||||
4. **Phase D**: writer deletes the sidecar.
|
4. **Phase D**: writer deletes the sidecar.
|
||||||
|
|
||||||
|
> **Phase letter convention.** Throughout the recovery code, log
|
||||||
|
> messages, failpoint names (e.g. `branch_merge.post_phase_b_pre_manifest_commit`),
|
||||||
|
> and the per-writer integration tests, "Phase A/B/C/D" refers
|
||||||
|
> exclusively to the four-step lifecycle above. The per-table
|
||||||
|
> staged-write contract (`stage_*` then `commit_staged`, two steps)
|
||||||
|
> is referred to by those API verbs — never by phase letters — so a
|
||||||
|
> reader of `recovery.rs`, `failpoints.rs`, or this document only
|
||||||
|
> encounters phase letters in the per-writer context.
|
||||||
|
|
||||||
A failure between Phase A and Phase D leaves the sidecar on disk. The
|
A failure between Phase A and Phase D leaves the sidecar on disk. The
|
||||||
next `Omnigraph::open` (gated on `OpenMode::ReadWrite`) runs the
|
next `Omnigraph::open` (gated on `OpenMode::ReadWrite`) runs the
|
||||||
recovery sweep in `crates/omnigraph/src/db/manifest/recovery.rs`:
|
recovery sweep in `crates/omnigraph/src/db/manifest/recovery.rs`:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue