From 1c83377d77850e448afe8eb03212023ffcfa3883 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Sun, 31 May 2026 16:20:45 +0200 Subject: [PATCH] fix(engine): force-delete in __run__ sweep for concurrency safety `migrate_v2_to_v3` ran `Dataset::delete_branch` (= `branches().delete(.., false)`), which errors "BranchContents not found" if the branch is already gone. Since the sweep now runs in `Omnigraph::open(ReadWrite)`, two processes opening the same legacy v2 graph concurrently would race: one wins each delete, the other's open fails. The migration only claimed idempotency under *sequential* retry. Switch to `Dataset::force_delete_branch` (= `delete(.., true)`), Lance's documented path for cleaning up zombie branches, which tolerates an already-absent branch. The sweep is now idempotent under concurrent runners and robust to partial/zombie state. Found in self-review; no behavior change for the common single-open path. --- crates/omnigraph/src/db/manifest/migrations.rs | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/crates/omnigraph/src/db/manifest/migrations.rs b/crates/omnigraph/src/db/manifest/migrations.rs index 8457e2f..e2801fe 100644 --- a/crates/omnigraph/src/db/manifest/migrations.rs +++ b/crates/omnigraph/src/db/manifest/migrations.rs @@ -144,9 +144,10 @@ async fn migrate_v1_to_v2(dataset: &mut Dataset) -> Result<()> { /// working after the `run_registry` module (the guard) is deleted, so it does /// not depend on it. /// -/// Idempotent under retry: each run re-enumerates `list_branches` fresh, so a -/// crash before the stamp bump simply re-deletes whatever `__run__*` branches -/// still remain on the next open. +/// Idempotent under both sequential retry and concurrent runners: each run +/// re-enumerates `list_branches` fresh, and `force_delete_branch` tolerates a +/// branch that is already gone — so a crash before the stamp bump, or a second +/// process opening the same legacy graph at the same time, never errors out. async fn migrate_v2_to_v3(dataset: &mut Dataset) -> Result<()> { const LEGACY_RUN_BRANCH_PREFIX: &str = "__run__"; let branches = dataset @@ -161,8 +162,13 @@ async fn migrate_v2_to_v3(dataset: &mut Dataset) -> Result<()> { }) .collect(); for name in run_branches { + // `force_delete_branch` deletes even when the `BranchContents` is + // already gone. Plain `delete_branch` errors "BranchContents not + // found", which would fail a second concurrent open (or a retry that + // raced another runner) after the first one swept the branch. Force is + // exactly Lance's documented path for cleaning up zombie branches. dataset - .delete_branch(&name) + .force_delete_branch(&name) .await .map_err(|e| OmniError::Lance(e.to_string()))?; }