mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-06-09 01:35:18 +02:00
Fix MR-923: refresh restored coordinator on merge Err path
branch_merge_impl swaps the coordinator for the merge target, runs the merge body, then restores the original coordinator. A concurrent /change on the same target during this window publishes against the swapped coord, advancing on-disk manifest state that the restored coord doesn't see. The post-restore refresh was previously gated on merge_result.is_ok(), so the clean-409 path (merge body's post_queue_snapshot drift check returning a recoverable conflict) left the restored coord's cached snapshot stale relative to disk. The next sequential /change seeded its publisher expected_versions from that stale cache and 409'd with ExpectedVersionMismatch — a non-retryable conflict surfaced to a caller with no concurrent writer of their own. Refresh on both Ok and Err paths so cached state cannot diverge from the manifest across the swap-restore window. Add a focused regression test (concurrent_merge_clean_409_does_not_poison_next_change_on_target) that loops the cell-d scenario until the clean-409 branch fires and asserts the follow-up sentinel succeeds in that branch specifically. Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
This commit is contained in:
parent
19e9292ec0
commit
e91d5615c6
2 changed files with 99 additions and 1 deletions
|
|
@ -3207,6 +3207,89 @@ async fn concurrent_branch_ops_morphological_matrix() {
|
|||
}
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
|
||||
async fn concurrent_merge_clean_409_does_not_poison_next_change_on_target() {
|
||||
// MR-923 regression. Pins the path that the
|
||||
// `[d:merge×change:into-target]` cell of
|
||||
// `concurrent_branch_ops_morphological_matrix` exercises only when
|
||||
// timing happens to hit it: the merge body's `post_queue_snapshot`
|
||||
// drift check returns a clean 409 because a concurrent /change on
|
||||
// the merge target landed first.
|
||||
//
|
||||
// Pre-fix: `branch_merge_impl` only refreshed the restored
|
||||
// coordinator when `merge_result.is_ok()`. The clean-409 path
|
||||
// skipped refresh, leaving the restored coord's cached manifest
|
||||
// snapshot stale relative to disk (the concurrent /change had
|
||||
// published against the *swapped* coord during the merge body).
|
||||
// The next sequential `/change` on the target seeded its publisher
|
||||
// `expected_versions` from the stale cache and 409'd with
|
||||
// `ExpectedVersionMismatch` — a non-retryable conflict surfaced to
|
||||
// a caller who had no concurrent writer of their own.
|
||||
//
|
||||
// The matrix cell catches this incidentally (it asserts the
|
||||
// sentinel on every cell-d run regardless of merge outcome), but
|
||||
// its single-iteration shape means the flake reproduces only when
|
||||
// the clean-409 path happens to fire. This test loops the cell-d
|
||||
// scenario until the clean-409 branch fires, then sharpens the
|
||||
// post-condition specifically for that branch: with the fix, the
|
||||
// sentinel must succeed.
|
||||
const MAX_ITERATIONS: usize = 20;
|
||||
let mut hit_clean_409 = false;
|
||||
for iter in 0..MAX_ITERATIONS {
|
||||
let h = matrix::Harness::new().await;
|
||||
let feature = format!("feature-mr923-{}", iter);
|
||||
let frank = format!("Frank-mr923-{}", iter);
|
||||
let eve = format!("Eve-mr923-{}", iter);
|
||||
h.create_branch("main", &feature).await;
|
||||
h.insert_person(&feature, &eve, 22).await;
|
||||
|
||||
let (sa, sb) = h
|
||||
.run_pair(
|
||||
matrix::op_merge(feature.clone(), "main".to_string()),
|
||||
matrix::op_change_insert("main".to_string(), frank.clone(), 33),
|
||||
)
|
||||
.await;
|
||||
assert_eq!(
|
||||
sb.status,
|
||||
StatusCode::OK,
|
||||
"iter {}: concurrent /change must always succeed",
|
||||
iter,
|
||||
);
|
||||
assert!(
|
||||
sa.status == StatusCode::OK || sa.status == StatusCode::CONFLICT,
|
||||
"iter {}: merge must be 200 or clean 409, got {}",
|
||||
iter,
|
||||
sa.status,
|
||||
);
|
||||
|
||||
// The sentinel must succeed regardless of which branch the
|
||||
// racing pair took. The matrix cell asserts this too; we
|
||||
// re-assert here per iteration so a regression that breaks
|
||||
// only the 409 branch is surfaced directly.
|
||||
let sentinel = format!("sentinel-mr923-{}", iter);
|
||||
h.assert_post_op_sentinel("mr923-regression", &sentinel).await;
|
||||
|
||||
if sa.status == StatusCode::CONFLICT {
|
||||
// Confirmed the conflict carries the structured
|
||||
// `manifest_conflict` payload Cell d documents, so we know
|
||||
// this was the clean-409 path (not a malformed error).
|
||||
let error: ErrorOutput = serde_json::from_slice(&sa.body).unwrap();
|
||||
error
|
||||
.manifest_conflict
|
||||
.expect("clean 409 must include manifest_conflict payload");
|
||||
hit_clean_409 = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// If MAX_ITERATIONS runs never hit the clean-409 path, the timing
|
||||
// window simply didn't open on this hardware. The per-iteration
|
||||
// sentinel assertion above still covered every 200-merge run, and
|
||||
// the matrix cell covers the same scenario from a different angle.
|
||||
// Don't fail the test — that would substitute hardware-timing
|
||||
// sensitivity for the regression signal.
|
||||
let _ = hit_clean_409;
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
|
||||
async fn change_disjoint_table_concurrency_succeeds_at_http_level() {
|
||||
// HTTP-level pin for MR-686's disjoint-table promise: concurrent /change
|
||||
|
|
|
|||
|
|
@ -1109,7 +1109,22 @@ impl Omnigraph {
|
|||
.await;
|
||||
self.restore_coordinator(previous).await;
|
||||
|
||||
if merge_result.is_ok() && previous_branch == target_branch {
|
||||
// Refresh the restored coordinator on both Ok and Err paths.
|
||||
// During the swap window above, `self.coordinator` was a freshly
|
||||
// opened coord for the merge target; any concurrent writer on
|
||||
// that target (e.g. a `/change` on `main` racing a
|
||||
// `merge into=main`) publishes against the swapped coord and
|
||||
// never touches the original. Without this refresh, the
|
||||
// restored coord's cached manifest snapshot would diverge from
|
||||
// disk and seed a stale `expected_versions` into the next op's
|
||||
// publisher CAS fence — a non-retryable
|
||||
// `ExpectedVersionMismatch` for a user with no concurrent
|
||||
// writer of their own. Pinned by
|
||||
// `concurrent_merge_clean_409_does_not_poison_next_change_on_target`
|
||||
// in `crates/omnigraph-server/tests/server.rs` and by the
|
||||
// `[d:merge×change:into-target]` cell of
|
||||
// `concurrent_branch_ops_morphological_matrix`.
|
||||
if previous_branch == target_branch {
|
||||
self.refresh().await?;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue