Address review feedback

merge.rs: best-effort refresh on the Err path so a refresh-time storage error doesn't replace the merge body's structured error (typically the manifest_conflict that the HTTP layer maps to a 409 with a structured payload) with a less informative one. Ok-path behavior is unchanged — there a refresh failure is propagated so the caller knows the coord's cache is unsynced. server.rs: bump MAX_ITERATIONS to 50 and assert at the end that the named clean-409 path actually fired at least once. With ~20% per-iter rate on shared-CPU CI (per the original MR-923 repro), P(no hit in 50) is < 0.002%. Without this assertion the test silently degraded to exercising only the 200-merge path — covered already by the matrix cell. Both changes per Devin Review + cubic comments on PR #80. Co-Authored-By: Ragnor Comerford <ragnor.comerford@gmail.com>
2026-06-09 01:35:18 +02:00 · 2026-05-11 21:35:18 +00:00 · 2026-05-11 21:35:18 +00:00 · 7d1a40102c
commit 7d1a40102c
parent b7353e1dc7
2 changed files with 39 additions and 9 deletions
--- a/crates/omnigraph-server/tests/server.rs
+++ b/crates/omnigraph-server/tests/server.rs
@ -3233,7 +3233,16 @@ async fn concurrent_merge_clean_409_does_not_poison_next_change_on_target() {
    // scenario until the clean-409 branch fires, then sharpens the
    // post-condition specifically for that branch: with the fix, the
    // sentinel must succeed.
-    const MAX_ITERATIONS: usize = 20;
+    //
+    // The clean-409 path is timing-dependent (~20% per iteration on
+    // shared-CPU CI per the original MR-923 repro). With 50 iterations
+    // the probability of never hitting it under that rate is < 0.002%.
+    // The test asserts at the end that the named path was actually
+    // exercised at least once; if a future change makes that path
+    // un-reachable in this scenario, the assertion surfaces the
+    // change rather than letting the test silently degrade to
+    // exercising only the 200-merge path.
+    const MAX_ITERATIONS: usize = 50;
    let mut hit_clean_409 = false;
    for iter in 0..MAX_ITERATIONS {
        let h = matrix::Harness::new().await;
@ -3281,13 +3290,16 @@ async fn concurrent_merge_clean_409_does_not_poison_next_change_on_target() {
            break;
        }
    }
-    // If MAX_ITERATIONS runs never hit the clean-409 path, the timing
-    // window simply didn't open on this hardware. The per-iteration
-    // sentinel assertion above still covered every 200-merge run, and
-    // the matrix cell covers the same scenario from a different angle.
-    // Don't fail the test — that would substitute hardware-timing
-    // sensitivity for the regression signal.
-    let _ = hit_clean_409;
+    assert!(
+        hit_clean_409,
+        "clean-409 path never fired in {} iterations — the named regression \
+         scenario is no longer reachable through the merge×change race. \
+         Either the race window has closed (good — but this test no longer \
+         pins the bug) or something is preventing the merge body's \
+         post_queue_snapshot drift check from firing. Investigate before \
+         dropping this test.",
+        MAX_ITERATIONS,
+    );
 }

 #[tokio::test(flavor = "multi_thread", worker_threads = 4)]
--- a/crates/omnigraph/src/exec/merge.rs
+++ b/crates/omnigraph/src/exec/merge.rs
@ -1136,8 +1136,26 @@ impl Omnigraph {
        // sweep emits. Pinned by
        // `branch_merge_phase_b_failure_recovered_on_next_open` in
        // `crates/omnigraph/tests/failpoints.rs`.
+        //
+        // Err-path refresh is best-effort: the merge body's error
+        // (typically the structured `manifest_conflict` from the
+        // post_queue_snapshot drift check) is the value the caller
+        // needs to see. A refresh-time storage error would replace
+        // that with a less informative error; the next op or the next
+        // `Omnigraph::open` will re-sync the coord anyway.
        if previous_branch == target_branch {
-            self.refresh_coordinator_only().await?;
+            match merge_result {
+                Ok(_) => self.refresh_coordinator_only().await?,
+                Err(_) => {
+                    if let Err(err) = self.refresh_coordinator_only().await {
+                        tracing::warn!(
+                            error = %err,
+                            "post-merge coordinator refresh failed on the error path; \
+                             the next op or open will re-sync"
+                        );
+                    }
+                }
+            }
        }

        merge_result