mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-06-12 01:45:14 +02:00
Production wiring (~120 LOC):
- `MutationStaging::finalize` now takes a `SidecarKind` parameter and
returns an additional `Option<RecoverySidecarHandle>`. Builds a
Vec<SidecarTablePin> from `pending` BEFORE the per-table commit_staged
loop and writes the sidecar via `recovery::write_sidecar`. Skips the
sidecar when `pending` is empty (delete-only mutation; D₂ keeps these
out of the staged-write path so the option is just a clean signal,
not a code path users hit).
- `exec/mutation.rs::execute_mutation_as` (around line 740): destructure
the new third element, pass `SidecarKind::Mutation`, delete the
sidecar after `commit_updates_on_branch_with_expected` succeeds.
- `loader/mod.rs::ingest_loaded` (around line 540): same shape, with
`SidecarKind::Load`. The Overwrite path stays inline-commit (legacy
residual; out of MR-847 scope per docs/runs.md).
- New engine accessors `Omnigraph::storage_adapter()` and
`Omnigraph::root_uri()` for the sidecar I/O. The pre-existing
`db.storage` field stays private; no other engine code reaches around
the accessor.
- Re-exports from `db::manifest`: `new_sidecar`, `write_sidecar`,
`delete_sidecar`, plus the `RecoverySidecar*` types and `SidecarKind`,
so consumers in `exec/` can use them via `crate::db::manifest::...`.
Bugfix folded in (~5 LOC): make `coordinator` mutable in
`Omnigraph::open_with_storage_and_mode` and call `coordinator.refresh()`
after the recovery sweep returns. Roll-forward advances the manifest
pin on disk; without the refresh the returned engine carried a stale
in-memory snapshot. The Phase 4 tests passed only because they
opened Lance datasets directly rather than going through `db.snapshot()`.
Storage adapter (~15 LOC): `LocalStorageAdapter::write_text` now ensures
the parent directory exists via `tokio::fs::create_dir_all`. Required
because the sidecar protocol writes into `__recovery/` which doesn't
pre-exist after `Omnigraph::init`. S3 has no equivalent; PutObject is
path-agnostic.
Headline test flip (~150 LOC):
- `tests/failpoints.rs::finalize_publisher_residual_drifts_lance_head_until_next_writer_recovers`
is replaced by `recovery_rolls_forward_after_finalize_publisher_failure`.
Same setup (failpoint at `mutation.post_finalize_pre_publisher`) but
after the synthetic failure the test:
1. Asserts the sidecar persists in `__recovery/` for the recovery
sweep to find.
2. Drops the engine handle.
3. Reopens via `Omnigraph::open` — recovery sweep classifies
RolledPastExpected, decides RollForward, publishes the manifest
update, records the audit row, deletes the sidecar.
4. Asserts the sidecar is gone.
5. Asserts the originally-attempted Eve insert is now visible
(Person count = 1).
6. Asserts a subsequent insert succeeds without
ExpectedVersionMismatch (Person count = 2).
7. Asserts the audit dataset `_graph_commit_recoveries.lance` exists.
This is the headline contract the MR-847 acceptance criteria require.
All other failpoint and runs tests continue to pass (8 + 24 unchanged).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
384 lines
15 KiB
Rust
384 lines
15 KiB
Rust
#![cfg(feature = "failpoints")]
|
|
|
|
mod helpers;
|
|
|
|
use fail::FailScenario;
|
|
use omnigraph::db::Omnigraph;
|
|
use omnigraph::failpoints::ScopedFailPoint;
|
|
|
|
use helpers::{MUTATION_QUERIES, mixed_params, mutate_main};
|
|
|
|
const SCHEMA_V1: &str = "node Person { name: String @key }\n";
|
|
const SCHEMA_V2_ADDED_TYPE: &str =
|
|
"node Person { name: String @key }\nnode Company { name: String @key }\n";
|
|
|
|
#[tokio::test]
|
|
async fn branch_create_failpoint_triggers() {
|
|
let _scenario = FailScenario::setup();
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let uri = dir.path().to_str().unwrap();
|
|
let mut db = Omnigraph::init(uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
let _failpoint = ScopedFailPoint::new("branch_create.after_manifest_branch_create", "return");
|
|
|
|
let err = db.branch_create("feature").await.unwrap_err();
|
|
assert!(
|
|
err.to_string()
|
|
.contains("injected failpoint triggered: branch_create.after_manifest_branch_create")
|
|
);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn graph_publish_failpoint_triggers_before_commit_append() {
|
|
let _scenario = FailScenario::setup();
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let mut db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA)
|
|
.await
|
|
.unwrap();
|
|
let _failpoint = ScopedFailPoint::new("graph_publish.before_commit_append", "return");
|
|
|
|
let err = mutate_main(
|
|
&mut db,
|
|
MUTATION_QUERIES,
|
|
"insert_person",
|
|
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]),
|
|
)
|
|
.await
|
|
.unwrap_err();
|
|
assert!(
|
|
err.to_string()
|
|
.contains("injected failpoint triggered: graph_publish.before_commit_append")
|
|
);
|
|
}
|
|
|
|
// Atomic schema apply: schema apply writes staging files first, then commits
|
|
// the manifest, then renames staging → final. Tests below inject crashes at
|
|
// the two boundaries and assert that reopening the repo yields a consistent
|
|
// state.
|
|
|
|
#[tokio::test]
|
|
async fn schema_apply_recovers_pre_commit_crash() {
|
|
let _scenario = FailScenario::setup();
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
{
|
|
let mut db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
|
|
let _failpoint = ScopedFailPoint::new("schema_apply.after_staging_write", "return");
|
|
let err = db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap_err();
|
|
assert!(
|
|
err.to_string()
|
|
.contains("injected failpoint triggered: schema_apply.after_staging_write"),
|
|
"got: {}",
|
|
err
|
|
);
|
|
}
|
|
|
|
// Reopen — recovery sweep should delete staging files and keep the
|
|
// original schema, since the manifest commit never happened.
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
assert_eq!(db.schema_source(), SCHEMA_V1);
|
|
assert_no_staging_files(dir.path());
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn schema_apply_recovers_post_commit_crash() {
|
|
let _scenario = FailScenario::setup();
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
{
|
|
let mut db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
|
|
let _failpoint = ScopedFailPoint::new("schema_apply.after_manifest_commit", "return");
|
|
let err = db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap_err();
|
|
assert!(
|
|
err.to_string()
|
|
.contains("injected failpoint triggered: schema_apply.after_manifest_commit"),
|
|
"got: {}",
|
|
err
|
|
);
|
|
}
|
|
|
|
// Reopen — manifest is at the new version, so recovery sweep should
|
|
// complete the rename and the live schema matches v2.
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
assert_eq!(db.schema_source(), SCHEMA_V2_ADDED_TYPE);
|
|
assert_no_staging_files(dir.path());
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn schema_apply_recovers_partial_rename() {
|
|
// Construct a partial-rename state: _schema.pg has been renamed in
|
|
// (matching v2), but _schema.ir.json.staging and __schema_state.json.staging
|
|
// were never renamed. Recovery should detect that the live source matches
|
|
// the staging state's hash and complete the remaining renames.
|
|
let _scenario = FailScenario::setup();
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
{
|
|
let mut db = Omnigraph::init(&uri, SCHEMA_V1).await.unwrap();
|
|
db.apply_schema(SCHEMA_V2_ADDED_TYPE).await.unwrap();
|
|
}
|
|
|
|
// Simulate: one of the renames (the IR or state file) didn't complete by
|
|
// copying the live ir/state files back to their staging names.
|
|
std::fs::copy(
|
|
dir.path().join("_schema.ir.json"),
|
|
dir.path().join("_schema.ir.json.staging"),
|
|
)
|
|
.unwrap();
|
|
std::fs::copy(
|
|
dir.path().join("__schema_state.json"),
|
|
dir.path().join("__schema_state.json.staging"),
|
|
)
|
|
.unwrap();
|
|
|
|
// Reopen — recovery should complete the rename (overwriting final files
|
|
// with identical staging content) and remove the staging files.
|
|
let db = Omnigraph::open(&uri).await.unwrap();
|
|
assert_eq!(db.schema_source(), SCHEMA_V2_ADDED_TYPE);
|
|
assert_no_staging_files(dir.path());
|
|
}
|
|
|
|
/// Prove the MR-847 recovery sweep closes the "finalize → publisher
|
|
/// residual" across one open cycle — the post-MR-847 contract.
|
|
///
|
|
/// `MutationStaging::finalize` runs `commit_staged` per touched table
|
|
/// sequentially before the publisher commits the manifest. Lance has no
|
|
/// multi-dataset atomic commit primitive, so a failure between the
|
|
/// per-table staged commits and the manifest commit leaves Lance HEAD
|
|
/// advanced on the touched tables with no manifest update.
|
|
///
|
|
/// Pre-MR-847: the next mutation surfaced `ExpectedVersionMismatch` and
|
|
/// the residual persisted until process restart. Post-MR-847: the
|
|
/// finalize writes a sidecar at `__recovery/{ulid}.json` BEFORE Phase B,
|
|
/// the failpoint fires AFTER finalize but BEFORE the publisher, the
|
|
/// engine handle is dropped, and the next `Omnigraph::open` runs the
|
|
/// recovery sweep. The sweep classifies every table in the sidecar as
|
|
/// `RolledPastExpected` (Lance HEAD == expected + 1, post_commit_pin
|
|
/// matches), decides RollForward, atomically extends every manifest pin
|
|
/// via `ManifestBatchPublisher::publish`, records an audit row, and
|
|
/// deletes the sidecar.
|
|
///
|
|
/// After this test passes:
|
|
/// - The originally-attempted insert ("Eve") is visible via a normal
|
|
/// query.
|
|
/// - The next mutation succeeds without `ExpectedVersionMismatch`.
|
|
/// - `_graph_commit_recoveries.lance` carries an audit row with
|
|
/// `recovery_kind=RolledForward` and the original sidecar's
|
|
/// `actor_id` in `recovery_for_actor`.
|
|
///
|
|
/// Continuous in-process recovery (no restart needed between failure
|
|
/// and recovery) is MR-856 (background reconciler).
|
|
#[tokio::test]
|
|
async fn recovery_rolls_forward_after_finalize_publisher_failure() {
|
|
let _scenario = FailScenario::setup();
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
// Phase A: trigger the residual.
|
|
{
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
let _failpoint =
|
|
ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return");
|
|
|
|
// The mutation's finalize completes (commit_staged advances Lance
|
|
// HEAD on node:Person AND writes a `__recovery/{ulid}.json`
|
|
// sidecar). Then the failpoint kicks in before the publisher's
|
|
// manifest commit, so the manifest pin stays at the pre-write
|
|
// version. The sidecar persists for the next-open recovery sweep.
|
|
let err = mutate_main(
|
|
&mut db,
|
|
MUTATION_QUERIES,
|
|
"insert_person",
|
|
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]),
|
|
)
|
|
.await
|
|
.unwrap_err();
|
|
assert!(
|
|
err.to_string().contains(
|
|
"injected failpoint triggered: mutation.post_finalize_pre_publisher"
|
|
),
|
|
"unexpected error: {err}"
|
|
);
|
|
|
|
// Sidecar must still exist on disk for the recovery sweep to find.
|
|
let recovery_dir = dir.path().join("__recovery");
|
|
let sidecars: Vec<_> = std::fs::read_dir(&recovery_dir)
|
|
.unwrap()
|
|
.filter_map(|e| e.ok())
|
|
.collect();
|
|
assert_eq!(
|
|
sidecars.len(),
|
|
1,
|
|
"exactly one sidecar should persist after the finalize failure"
|
|
);
|
|
|
|
// Drop the failpoint scope and the engine handle.
|
|
}
|
|
|
|
// Phase B: reopen runs the recovery sweep. The sweep finds the
|
|
// sidecar, classifies node:Person as RolledPastExpected, decides
|
|
// RollForward, publishes the manifest update, records the audit
|
|
// row, deletes the sidecar.
|
|
let mut db = Omnigraph::open(&uri).await.unwrap();
|
|
|
|
// Sidecar gone — sweep completed end to end.
|
|
let recovery_dir = dir.path().join("__recovery");
|
|
if recovery_dir.exists() {
|
|
let remaining: Vec<_> = std::fs::read_dir(&recovery_dir)
|
|
.unwrap()
|
|
.filter_map(|e| e.ok())
|
|
.collect();
|
|
assert!(
|
|
remaining.is_empty(),
|
|
"sidecar must be deleted after successful roll-forward; remaining: {:?}",
|
|
remaining,
|
|
);
|
|
}
|
|
|
|
// The originally-attempted "Eve" insert is now visible — the recovery
|
|
// sweep extended the manifest pin to include the staged commit.
|
|
let person_count = helpers::count_rows(&db, "node:Person").await;
|
|
assert_eq!(
|
|
person_count, 1,
|
|
"exactly one person (Eve) must be visible after roll-forward"
|
|
);
|
|
|
|
// The next mutation on the same table succeeds — no ExpectedVersionMismatch.
|
|
mutate_main(
|
|
&mut db,
|
|
MUTATION_QUERIES,
|
|
"insert_person",
|
|
&mixed_params(&[("$name", "Frank")], &[("$age", 33)]),
|
|
)
|
|
.await
|
|
.expect("next mutation must succeed after recovery rolled forward");
|
|
let person_count = helpers::count_rows(&db, "node:Person").await;
|
|
assert_eq!(
|
|
person_count, 2,
|
|
"Frank's insert must land normally after recovery"
|
|
);
|
|
|
|
// Audit row recorded.
|
|
let audit_dir = dir.path().join("_graph_commit_recoveries.lance");
|
|
assert!(
|
|
audit_dir.exists(),
|
|
"_graph_commit_recoveries.lance must exist after a successful recovery"
|
|
);
|
|
}
|
|
|
|
/// Companion to the above — confirms that a finalize→publisher failure
|
|
/// on one table leaves OTHER tables untouched. Subsequent writes to
|
|
/// non-drifted tables proceed normally; the drift is contained.
|
|
#[tokio::test]
|
|
async fn finalize_publisher_residual_does_not_drift_untouched_tables() {
|
|
let _scenario = FailScenario::setup();
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let mut db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA)
|
|
.await
|
|
.unwrap();
|
|
|
|
{
|
|
let _failpoint =
|
|
ScopedFailPoint::new("mutation.post_finalize_pre_publisher", "return");
|
|
let _ = mutate_main(
|
|
&mut db,
|
|
MUTATION_QUERIES,
|
|
"insert_person",
|
|
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]),
|
|
)
|
|
.await
|
|
.expect_err("synthetic failpoint must fire");
|
|
}
|
|
|
|
// node:Person drifted. node:Company didn't — try a Company write.
|
|
use omnigraph::loader::{LoadMode, load_jsonl};
|
|
load_jsonl(
|
|
&mut db,
|
|
r#"{"type": "Company", "data": {"name": "Acme"}}"#,
|
|
LoadMode::Append,
|
|
)
|
|
.await
|
|
.expect("Company write on a non-drifted table should succeed");
|
|
}
|
|
|
|
/// MR-793 Phase 4 acceptance bar — proves that a Phase A failure in
|
|
/// the staged-index path (`stage_create_btree_index` succeeded;
|
|
/// `commit_staged` not yet called) leaves NO Lance-HEAD drift on the
|
|
/// existing tables. Subsequent operations against those tables succeed
|
|
/// without `ExpectedVersionMismatch`.
|
|
///
|
|
/// Path: `apply_schema(v1 → v2)` adds a new node type. The
|
|
/// `added_tables` loop in `schema_apply` creates the empty dataset and
|
|
/// then calls `build_indices_on_dataset_for_catalog` →
|
|
/// `stage_and_commit_btree(..., &["id"])`. The failpoint fires
|
|
/// between `stage_create_btree_index` and `commit_staged`, so the
|
|
/// staged segments are written under `_indices/<uuid>/` but Lance HEAD
|
|
/// on the new dataset is unchanged at v=1. The schema-apply lock
|
|
/// branch is released by `apply_schema`'s outer match. Existing
|
|
/// tables (e.g. `node:Person`) are completely untouched by the new
|
|
/// node's added_tables iteration — they're outside the failed apply
|
|
/// path entirely — and we assert that mutations against them continue
|
|
/// to work.
|
|
///
|
|
/// The orphan empty dataset from the failed apply is acceptable
|
|
/// residual: it's unreferenced by `__manifest` and will be reclaimed
|
|
/// by `cleanup_old_versions` (or removed when a future apply at the
|
|
/// same target path resolves the rename).
|
|
#[tokio::test]
|
|
async fn ensure_indices_phase_a_btree_failure_leaves_existing_tables_writable() {
|
|
let _scenario = FailScenario::setup();
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let uri = dir.path().to_str().unwrap().to_string();
|
|
|
|
// Init with TEST_SCHEMA which declares Person + Knows. Indices on
|
|
// those tables get built during init.
|
|
let mut db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap();
|
|
|
|
// Apply a schema that adds a new node type. The added_tables loop
|
|
// will hit the failpoint between stage and commit on the new
|
|
// node:Project table's btree-on-id build. (TEST_SCHEMA already
|
|
// has Person + Company + Knows + WorksAt — pick a name that isn't
|
|
// already declared.)
|
|
let extended_schema = format!("{}\nnode Project {{ name: String @key }}\n", helpers::TEST_SCHEMA);
|
|
|
|
{
|
|
let _failpoint = ScopedFailPoint::new(
|
|
"ensure_indices.post_stage_pre_commit_btree",
|
|
"return",
|
|
);
|
|
let err = db.apply_schema(&extended_schema).await.unwrap_err();
|
|
assert!(
|
|
err.to_string()
|
|
.contains("ensure_indices.post_stage_pre_commit_btree"),
|
|
"schema apply should fail with the synthetic failpoint error, got: {err}"
|
|
);
|
|
}
|
|
|
|
// Existing tables stayed at their pre-apply versions; subsequent
|
|
// mutations against them succeed (no Lance-HEAD drift).
|
|
mutate_main(
|
|
&mut db,
|
|
helpers::MUTATION_QUERIES,
|
|
"insert_person",
|
|
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]),
|
|
)
|
|
.await
|
|
.expect("Person mutation must succeed after the failed schema apply — existing tables are not drifted");
|
|
}
|
|
|
|
fn assert_no_staging_files(repo: &std::path::Path) {
|
|
for name in [
|
|
"_schema.pg.staging",
|
|
"_schema.ir.json.staging",
|
|
"__schema_state.json.staging",
|
|
] {
|
|
let path = repo.join(name);
|
|
assert!(
|
|
!path.exists(),
|
|
"staging file {} still exists after recovery",
|
|
path.display()
|
|
);
|
|
}
|
|
}
|