From 7fc00142a481688b5c8ec1b45715be91820c75fb Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Fri, 8 May 2026 17:47:08 +0200 Subject: [PATCH] engine: scope refresh() write guard to recovery; release before schema reload Closes the HIGH-severity deadlock flagged by Cursor Bugbot on PR #75 review of commit b09a097. Pre-fix: `Omnigraph::refresh()` held `coordinator.write().await` from omnigraph.rs:468 through function exit, including across the call to `reload_schema_if_source_changed()` at line 484. That helper's `self.coordinator.read().await` (only reached when on-disk schema source differs from in-memory cache) deadlocked against the outer write guard because tokio's RwLock is non-reentrant. Reachable from `branch_delete` (omnigraph.rs:910) and `branch_merge` (post-merge refresh at merge.rs:1100). Cross-handle scenario: handle A calls apply_schema, handle B's stale cache hits the reload path on its next refresh. Why correct by design (AGENTS.md rule 9): the write guard's purpose is to serialize the recovery sweep's mutation of GraphCoordinator; the schema reload reads coord.branch_list() and stores into the ArcSwap'd schema_source / catalog without touching the coord. The two operations have disjoint lock requirements; coupling them was over-locking. Scoping the guard matches the natural data-flow: snapshot recovery state under the write, release, then reload schema using a fresh read on the same lock. Pinned by `composite_flow_schema_apply_then_branch_ops_no_deadlock_in_refresh` (previous commit). Pre-fix: 15s timeout fires. Post-fix: completes in 0.25s. Both other composite_flow tests still pass: canonical_lifecycle and multi_branch_sequential_merges. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/omnigraph/src/db/omnigraph.rs | 40 +++++++++++++++++----------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/crates/omnigraph/src/db/omnigraph.rs b/crates/omnigraph/src/db/omnigraph.rs index ba0e866..f2082c4 100644 --- a/crates/omnigraph/src/db/omnigraph.rs +++ b/crates/omnigraph/src/db/omnigraph.rs @@ -465,22 +465,30 @@ impl Omnigraph { /// [`refresh_coordinator_only`](Self::refresh_coordinator_only) to /// avoid the recovery sweep racing their own sidecar. pub async fn refresh(&self) -> Result<()> { - let mut coord = self.coordinator.write().await; - coord.refresh().await?; - let schema_state_recovery = recover_schema_state_files( - &self.root_uri, - Arc::clone(&self.storage), - &coord.snapshot(), - ) - .await?; - crate::db::manifest::recover_manifest_drift( - &self.root_uri, - Arc::clone(&self.storage), - &mut *coord, - crate::db::manifest::RecoveryMode::RollForwardOnly, - schema_state_recovery, - ) - .await?; + // Scope the coord write guard to the recovery section only. + // `reload_schema_if_source_changed` (below) acquires + // `self.coordinator.read().await` when the on-disk schema source + // has drifted from the cached `schema_source`. Tokio's RwLock is + // not reentrant, so holding the write across that call deadlocks. + // Pinned by `composite_flow_schema_apply_then_branch_ops_no_deadlock_in_refresh`. + { + let mut coord = self.coordinator.write().await; + coord.refresh().await?; + let schema_state_recovery = recover_schema_state_files( + &self.root_uri, + Arc::clone(&self.storage), + &coord.snapshot(), + ) + .await?; + crate::db::manifest::recover_manifest_drift( + &self.root_uri, + Arc::clone(&self.storage), + &mut *coord, + crate::db::manifest::RecoveryMode::RollForwardOnly, + schema_state_recovery, + ) + .await?; + } // ← write guard released before reload's read acquisition self.reload_schema_if_source_changed().await?; self.runtime_cache.invalidate_all().await; Ok(())