Initial release: iai-mcp v0.1.0

Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
2026-05-06 01:04:47 -07:00 · 2026-05-06 01:04:47 -07:00 · f6b876fbe7
commit f6b876fbe7
332 changed files with 97258 additions and 0 deletions
--- a/tests/test_doctor_multi_binder.py
+++ b/tests/test_doctor_multi_binder.py
@ -0,0 +1,622 @@
+"""Phase 7.1 R6 / D7.1-05 — doctor.py multi-binder detection + repair.
+
+Test matrix (8 tests):
+  A. _extract_binder_pids parses lsof -F pn output → set[int]
+  B. _extract_binder_pids skips PIDs bound to UNRELATED sockets
+  C. _extract_binder_pids handles empty input → empty set
+  D. check_g_no_dup_binders skips when socket file absent (PASS-with-skip)
+  E. check_g_no_dup_binders PASSes with single binder (multiprocessing worker)
+  F. check_g_no_dup_binders FAILs with two binders (regression-trap centerpiece)
+  G. _kill_dup_binders keeps oldest, kills the rest (real subprocess daemons)
+  H. iai-mcp doctor --apply --yes recovers from dup-binder scenario (e2e)
+
+A-D: pure unit tests, no daemon, fast (<1s combined).
+E-F: in-process multiprocessing workers — distinct PIDs, lsof-visible.
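+G-H: real iai_mcp.daemon subprocesses — required because _kill_dup_binders
+     filters by 'iai_mcp.daemon' substring in psutil cmdline (wrong-PID-kill
+     mitigation). Isolated by HIGH-4 LOCK env propagation pattern from
+     test_doctor_apply_recovery.py:isolated_daemon_paths. Both tests are
+     currently marked @pytest.mark.skip: the single-machine LifecycleLock
+     stops a second daemon before it binds, so the dup-binder state can no
+     longer be constructed with real daemons.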
+
+Skip on non-POSIX (AF_UNIX requirement).
+"""
+from __future__ import annotations
+
+import argparse
+import multiprocessing as mp
+import os
+import platform
+import signal
+import socket
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+import psutil
+import pytest
+
+
+# Module-level marker: skips this file's tests when platform.system()
+# reports "Windows", since the tests bind AF_UNIX sockets and shell out to lsof.
+pytestmark = pytest.mark.skipif(
+    platform.system() == "Windows",
+    reason="POSIX AF_UNIX required (lsof -U + multiprocessing socket binders)",
+)
+
+
+# ---------------------------------------------------------------------------
+# Section 1 — pure unit tests for _extract_binder_pids (A, B, C)
+# ---------------------------------------------------------------------------
+
+
+def test_extract_binder_pids_parses_lsof_output():
+    """A: hand-crafted lsof -F pn output → expected PID set.
+
+    lsof -F pn format alternates lines `p<pid>` and `n<filename>`. Each
+    PID is followed by 0+ name entries until the next `p<pid>`.
+    """
+    from iai_mcp.doctor import _extract_binder_pids
+
+    target = Path("/tmp/iai-test/d.sock")
+    lsof_output = "\n".join([
+        "p12345",
+        f"n{target}",
+        "p67890",
+        f"n{target}",
+        "p99999",
+        "n/tmp/other-app/socket",
+    ])
+
+    pids = _extract_binder_pids(lsof_output, target)
+
+    assert pids == {12345, 67890}, f"expected {{12345, 67890}}, got {pids}"
+
+
+def test_extract_binder_pids_skips_unrelated_sockets():
+    """B: lsof output with multiple sockets; only PIDs holding OUR path are returned."""
+    from iai_mcp.doctor import _extract_binder_pids
+
+    target = Path("/tmp/iai-test/d.sock")
+    lsof_output = "\n".join([
+        "p1001",
+        "n/var/run/some-other-daemon.sock",
+        "p2002",
+        f"n{target}",
+        "p3003",
+        "n/tmp/X11-unix/X0",
+        "p4004",
+        f"n{target}",
+        "n/some/extra/name/for/p4004",  # PID 4004 holds multiple fds
+    ])
+
+    pids = _extract_binder_pids(lsof_output, target)
+
+    assert pids == {2002, 4004}, f"expected {{2002, 4004}}, got {pids}"
+
+
+def test_extract_binder_pids_handles_empty_output():
+    """C: empty input → empty set (defensive corner case)."""
+    from iai_mcp.doctor import _extract_binder_pids
+
+    target = Path("/tmp/anywhere.sock")
+    assert _extract_binder_pids("", target) == set()
+    assert _extract_binder_pids("\n\n\n", target) == set()
+    # Malformed: PID line without name line; name line without preceding PID.
+    assert _extract_binder_pids("p123\nXgarbage\np\n", target) == set()
+
+
+# ---------------------------------------------------------------------------
+# Section 2 — check_g_no_dup_binders (D, E, F) using monkeypatched socket path
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def short_socket_path(tmp_path, monkeypatch):
+    """Yield a short socket path under /tmp (AF_UNIX 104-byte cap on macOS).
+
+    Honors the IAI_DAEMON_SOCKET_PATH env override that doctor._resolve_socket_path
+    consults. Cleans up the socket file on teardown.
+    """
+    sock_dir = Path(f"/tmp/iai-mb-{os.getpid()}-{id(tmp_path)}")
+    sock_dir.mkdir(parents=True, exist_ok=True)
+    sock_path = sock_dir / "d.sock"
+    monkeypatch.setenv("IAI_DAEMON_SOCKET_PATH", str(sock_path))
+    try:
+        yield sock_path
+    finally:
+        try:
+            if sock_path.exists():
+                sock_path.unlink()
+        except OSError:
+            pass
+        try:
+            sock_dir.rmdir()
+        except OSError:
+            pass
+
+
+def test_check_g_no_socket_skips(short_socket_path, monkeypatch):
+    """D: socket file absent → PASS-with-skip detail "no socket file (skip)".
+
+    Mirrors check_d_no_orphan_core's skip pattern when the resource isn't
+    present (no false-positive on a clean machine).
+    """
+    from iai_mcp.doctor import check_g_no_dup_binders
+
+    # Fixture set the env var; ensure no file exists.
+    assert not short_socket_path.exists()
+
+    result = check_g_no_dup_binders()
+
+    assert result.passed is True
+    assert "no socket file" in result.detail
+
+
+# --- Multiprocessing worker for Tests E and F (distinct PIDs) ---------------
+
+
+def _bind_socket_worker(sock_path_str: str, ready_event: mp.Event, exit_event: mp.Event) -> None:
+    """Subprocess worker: bind an AF_UNIX socket to sock_path, signal ready,
+    block until exit_event is set.
+
+    Each multiprocessing.Process child has a distinct PID and lsof reports
+    its socket fd. Used by Tests E (1 binder) and F (2 binders) to construct
+    deterministic dup-binder scenarios without a real iai_mcp.daemon (whose
+    boot cost is ~3-10s).
+    """
+    s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+    try:
+        # Each worker handles its own bind; for the 2-binder scenario, the
+        # parent unlinks the path between worker spawns so each worker
+        # successfully bind()s a fresh inode at the same name.
+        s.bind(sock_path_str)
+        s.listen(5)
+        ready_event.set()
+        # Block until parent signals shutdown.
+        exit_event.wait(timeout=30)
+    finally:
+        try:
+            s.close()
+        except OSError:
+            pass
+
+
+def test_check_g_single_binder_passes(short_socket_path):
+    """E: ONE binder bound to the socket → check_g returns PASS with "1 binder(s)".
+
+    Uses a multiprocessing.Process worker (distinct PID from the pytest
+    process) so lsof has something to enumerate.
+    """
+    from iai_mcp.doctor import check_g_no_dup_binders
+
+    # NOTE: use 'spawn' (not 'fork') even on Darwin — lancedb is not fork-safe
+    # (UserWarning surfaces with fork on macOS). Workers don't touch lancedb,
+    # but the parent test process has it imported transitively; spawn isolates.
+    ctx = mp.get_context("spawn")
+    ready = ctx.Event()
+    exit_signal = ctx.Event()
+    worker = ctx.Process(
+        target=_bind_socket_worker,
+        args=(str(short_socket_path), ready, exit_signal),
+    )
+    worker.start()
+    try:
+        assert ready.wait(timeout=10), "binder worker never signaled ready"
+        # Tiny settle so lsof's cache reflects the bind.
+        time.sleep(0.2)
+
+        result = check_g_no_dup_binders()
+
+        assert result.passed is True, (
+            f"single-binder scenario should PASS; got detail={result.detail!r}"
+        )
+        assert "1 binder" in result.detail, f"unexpected detail: {result.detail!r}"
+    finally:
+        exit_signal.set()
+        worker.join(timeout=5)
+        if worker.is_alive():
+            worker.terminate()
+            worker.join(timeout=2)
+
+
+def test_check_g_two_binders_fails(short_socket_path):
+    """F: TWO binders bound to the same socket path → check_g returns FAIL.
+
+    REGRESSION-TRAP CENTERPIECE. Spawns 2 multiprocessing workers, each
+    binding to the same socket path with an unlink between them so both
+    bind() calls succeed at the OS level. lsof reports both PIDs as
+    holding the path; check_g detects the singleton-invariant violation.
+
+    This is exactly the failure mode Phase 7.1's launchd architecture
+    structurally prevents in production — the test bypasses launchd by
+    hand-binding sockets in worker processes. On post-Phase 7.1 production,
+    this scenario can only occur if a user manually bypasses launchd.
+    """
+    from iai_mcp.doctor import _extract_binder_pids, check_g_no_dup_binders
+
+    # NOTE: use 'spawn' (not 'fork') even on Darwin — lancedb is not fork-safe
+    # (UserWarning surfaces with fork on macOS). Workers don't touch lancedb,
+    # but the parent test process has it imported transitively; spawn isolates.
+    ctx = mp.get_context("spawn")
+
+    # Worker 1
+    ready1 = ctx.Event()
+    exit1 = ctx.Event()
+    w1 = ctx.Process(
+        target=_bind_socket_worker,
+        args=(str(short_socket_path), ready1, exit1),
+    )
+    w1.start()
+
+    # Worker 2 — race-window simulation: unlink the path so worker 2's bind()
+    # creates a fresh inode at the same name. Worker 1's fd still holds the
+    # ORIGINAL inode (unlinked but kept alive by the open fd); worker 2 holds
+    # the NEW inode at the same path. lsof reports both PIDs.
+    ready2 = ctx.Event()
+    exit2 = ctx.Event()
+    # w2 stays None until it is created inside the try block, so the cleanup
+    # in `finally` can tell whether a second worker was ever started.
+    w2 = None
+    try:
+        assert ready1.wait(timeout=10), "worker 1 never signaled ready"
+        # Unlink so the second bind doesn't EADDRINUSE.
+        try:
+            short_socket_path.unlink()
+        except OSError:
+            pass
+        w2 = ctx.Process(
+            target=_bind_socket_worker,
+            args=(str(short_socket_path), ready2, exit2),
+        )
+        w2.start()
+        assert ready2.wait(timeout=10), "worker 2 never signaled ready"
+        time.sleep(0.3)  # let lsof catch up
+
+        # Belt-and-suspenders: confirm via the parser directly that lsof sees both.
+        # check=False: only stdout is consumed here; lsof's exit status is ignored.
+        lsof_out = subprocess.run(
+            ["lsof", "-U", "-F", "pn"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+            check=False,
+        ).stdout
+        binder_pids = _extract_binder_pids(lsof_out, short_socket_path)
+        assert {w1.pid, w2.pid}.issubset(binder_pids), (
+            f"lsof should report both worker PIDs as binders; got {binder_pids} "
+            f"(workers: {w1.pid}, {w2.pid})"
+        )
+
+        # Centerpiece assertion: check_g detects the dup-binder scenario.
+        result = check_g_no_dup_binders()
+
+        assert result.passed is False, (
+            f"two-binder scenario should FAIL; got detail={result.detail!r}"
+        )
+        # Detail mentions both PIDs.
+        assert str(w1.pid) in result.detail, f"detail missing PID {w1.pid}: {result.detail!r}"
+        assert str(w2.pid) in result.detail, f"detail missing PID {w2.pid}: {result.detail!r}"
+    finally:
+        # Signal every started worker to exit, then join each one; a worker
+        # still alive after the 5s join is terminated and joined again.
+        exit1.set()
+        if w2 is not None:
+            exit2.set()
+        for proc in (w1, w2):
+            if proc is None:
+                continue
+            proc.join(timeout=5)
+            if proc.is_alive():
+                proc.terminate()
+                proc.join(timeout=2)
+
+
+# ---------------------------------------------------------------------------
+# Section 3 — _kill_dup_binders + e2e doctor --apply (G, H)
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def isolated_daemon_paths(tmp_path, monkeypatch):
+    """HOME + socket + store + crypto env propagation for real-daemon tests.
+
+    Mirrors test_doctor_apply_recovery.py:isolated_daemon_paths verbatim
+    (HIGH-4 LOCK precedent, Plan 07-04). Required because _kill_dup_binders
+    filters by 'iai_mcp.daemon' substring in psutil cmdline — only real
+    iai_mcp.daemon subprocesses are killable, so multiprocessing workers
+    cannot serve Tests G/H.
+
+    Yields:
+        A tuple ``(sock_path, state_path, store_dir, lock_path)``.
+
+    Teardown terminates daemons matching this fixture's socket path (via
+    _kill_test_daemons), removes the socket file and its directory on a
+    best-effort basis, and resets keyring's cached backend again.
+    """
+    iai_dir = tmp_path / ".iai-mcp"
+    iai_dir.mkdir(parents=True, exist_ok=True)
+
+    state_path = iai_dir / ".daemon-state.json"
+    lock_path = iai_dir / ".lock"
+    store_dir = iai_dir / "store"
+    store_dir.mkdir(parents=True, exist_ok=True)
+
+    # The socket lives under /tmp rather than tmp_path.
+    # NOTE(review): presumably to stay within the AF_UNIX path-length cap, as
+    # in short_socket_path — confirm.
+    sock_dir = Path(f"/tmp/iai-mb2-{os.getpid()}-{id(tmp_path)}")
+    sock_dir.mkdir(parents=True, exist_ok=True)
+    sock_path = sock_dir / "d.sock"
+
+    # Resolve the HuggingFace cache location BEFORE HOME is repointed at
+    # tmp_path below, so HF_HOME keeps referring to the caller's real home.
+    real_hf_home = Path.home() / ".cache" / "huggingface"
+
+    monkeypatch.setenv("HOME", str(tmp_path))
+    monkeypatch.setenv("HF_HOME", str(real_hf_home))
+    monkeypatch.setenv("IAI_DAEMON_SOCKET_PATH", str(sock_path))
+    monkeypatch.setenv("IAI_MCP_STORE", str(store_dir))
+    monkeypatch.setenv("IAI_DAEMON_IDLE_SHUTDOWN_SECS", "99999")
+    monkeypatch.setenv(
+        "PYTHON_KEYRING_BACKEND", "keyring.backends.fail.Keyring"
+    )
+    monkeypatch.setenv("IAI_MCP_CRYPTO_PASSPHRASE", "test-mb-passphrase")
+    import keyring.core
+
+    # Drop keyring's cached backend object.
+    # NOTE(review): presumably so the PYTHON_KEYRING_BACKEND override set
+    # above is honored on next use — confirm against keyring internals.
+    keyring.core._keyring_backend = None
+
+    from iai_mcp import cli, daemon_state
+
+    # Module-level path constants are redirected to the sandbox too.
+    monkeypatch.setattr(daemon_state, "STATE_PATH", state_path)
+    monkeypatch.setattr(cli, "LOCK_PATH", lock_path)
+    monkeypatch.setattr(cli, "SOCKET_PATH", sock_path)
+
+    try:
+        yield sock_path, state_path, store_dir, lock_path
+    finally:
+        # Stop any daemon still tied to this fixture's socket path before
+        # removing the socket file and its directory (both best-effort).
+        _kill_test_daemons(sock_path)
+        try:
+            if sock_path.exists():
+                sock_path.unlink()
+        except OSError:
+            pass
+        try:
+            sock_dir.rmdir()
+        except OSError:
+            pass
+        # Clear the cached backend again on the way out.
+        keyring.core._keyring_backend = None
+
+
+def _spawn_daemon(sock_path: Path, store_dir: Path, home: Path) -> subprocess.Popen:
+    """Spawn `python -m iai_mcp.daemon` with the test's env propagated."""
+    env = os.environ.copy()
+    env["HOME"] = str(home)
+    env["IAI_DAEMON_SOCKET_PATH"] = str(sock_path)
+    env["IAI_MCP_STORE"] = str(store_dir)
+    env["IAI_DAEMON_IDLE_SHUTDOWN_SECS"] = "99999"
+    env["PYTHON_KEYRING_BACKEND"] = "keyring.backends.fail.Keyring"
+    env["IAI_MCP_CRYPTO_PASSPHRASE"] = "test-mb-passphrase"
+    return subprocess.Popen(
+        [sys.executable, "-m", "iai_mcp.daemon"],
+        env=env,
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+
+
+def _wait_for_socket(sock_path: Path, timeout_sec: float = 30.0) -> bool:
+    deadline = time.monotonic() + timeout_sec
+    while time.monotonic() < deadline:
+        if sock_path.exists():
+            return True
+        time.sleep(0.1)
+    return False
+
+
+def _kill_test_daemons(sock_path: Path) -> None:
+    """Match-by-env cleanup: SIGTERM iai_mcp.daemon subprocesses whose
+    psutil environ has our IAI_DAEMON_SOCKET_PATH value. Avoids touching
+    the user's real production daemon.
+    """
+    target = str(sock_path)
+    for p in psutil.process_iter(["pid", "cmdline"]):
+        try:
+            cl = " ".join(p.info.get("cmdline") or [])
+            if "iai_mcp.daemon" not in cl:
+                continue
+            try:
+                env = p.environ()
+            except (psutil.AccessDenied, psutil.NoSuchProcess):
+                continue
+            if env.get("IAI_DAEMON_SOCKET_PATH") == target:
+                try:
+                    p.send_signal(signal.SIGTERM)
+                    p.wait(timeout=3)
+                except (psutil.NoSuchProcess, psutil.TimeoutExpired):
+                    try:
+                        p.send_signal(signal.SIGKILL)
+                    except psutil.NoSuchProcess:
+                        pass
+        except (psutil.NoSuchProcess, psutil.AccessDenied):
+            continue
+
+
+def _spawn_dup_daemons(
+    sock_path: Path, store_dir: Path, home: Path
+) -> tuple[subprocess.Popen, subprocess.Popen]:
+    """Spawn 2 real iai_mcp.daemon subprocesses both bound to sock_path.
+
+    Race-window simulation per CONTEXT.md hint: spawn daemon #1, wait for
+    socket, unlink (so daemon #2 can bind a fresh inode at the same path),
+    spawn daemon #2, wait for socket. Daemon #1's listening fd still holds
+    the original (now unlinked) inode; daemon #2 holds the new inode. lsof
+    reports both PIDs as binders of the same path.
+    """
+    p1 = _spawn_daemon(sock_path, store_dir, home)
+    if not _wait_for_socket(sock_path, timeout_sec=30):
+        try:
+            p1.kill()
+        except ProcessLookupError:
+            pass
+        raise AssertionError("daemon #1 never bound socket within 30s")
+
+    # Race-window: unlink so daemon #2's bind() succeeds without EADDRINUSE.
+    try:
+        sock_path.unlink()
+    except OSError:
+        pass
+
+    p2 = _spawn_daemon(sock_path, store_dir, home)
+    if not _wait_for_socket(sock_path, timeout_sec=30):
+        try:
+            p2.kill()
+        except ProcessLookupError:
+            pass
+        try:
+            p1.kill()
+        except ProcessLookupError:
+            pass
+        raise AssertionError("daemon #2 never bound socket within 30s")
+
+    # Settle so lsof reflects both binders.
+    time.sleep(0.5)
+    return p1, p2
+
+
+@pytest.mark.skip(
+    reason=(
+        "Phase 10.6 Plan 10.6-01 Task 1.5: single-machine "
+        "LifecycleLock prevents two daemons from both binding the same "
+        "IAI_MCP_STORE. Daemon #2 raises LifecycleLockConflict and exits "
+        "1 before bind. The dup-binder integration scenario is now "
+        "impossible by design. The unit tests in this file "
+        "(test_extract_binder_pids_*, test_check_g_*) still cover "
+        "check_g's detection logic without spawning two real daemons."
+    )
+)
+def test_kill_dup_binders_keeps_oldest(isolated_daemon_paths):
+    """G: 2 real daemons → _kill_dup_binders kills younger, keeps oldest.
+
+    Re-running check_g afterward returns PASS (1 binder remaining).
+
+    Currently skipped unconditionally; see the skip reason on the decorator.
+    """
+    from iai_mcp.doctor import (
+        _extract_binder_pids,
+        _kill_dup_binders,
+        check_g_no_dup_binders,
+    )
+
+    sock_path, _, store_dir, _ = isolated_daemon_paths
+    # The fixture has already repointed HOME at its sandbox directory.
+    home = Path(os.environ["HOME"])
+
+    p1, p2 = _spawn_dup_daemons(sock_path, store_dir, home)
+    try:
+        # Pre-condition: both daemons must show up as binders for our socket.
+        lsof_out = subprocess.run(
+            ["lsof", "-U", "-F", "pn"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+            check=False,
+        ).stdout
+        binders = _extract_binder_pids(lsof_out, sock_path)
+        assert {p1.pid, p2.pid}.issubset(binders), (
+            f"expected both daemon PIDs in binders; got {binders} "
+            f"(daemons: {p1.pid}, {p2.pid})"
+        )
+        pre_check = check_g_no_dup_binders()
+        assert pre_check.passed is False, (
+            f"pre-condition: dup-binder scenario should FAIL check_g; "
+            f"got {pre_check.detail!r}"
+        )
+
+        # Kill the younger daemon. p1 was spawned first → has greater etime →
+        # is the keep_pid; p2 should be killed.
+        ok, msg, ms = _kill_dup_binders()
+
+        assert ok is True, f"_kill_dup_binders returned ok=False: {msg}"
+        assert "kept PID" in msg, f"msg missing 'kept PID': {msg!r}"
+        assert "killed" in msg, f"msg missing 'killed': {msg!r}"
+        assert ms < 10_000, f"_kill_dup_binders took {ms}ms (>10s); too slow"
+
+        # After kill, a follow-up check_g should report 1 (or 0 — race) binder.
+        post_check = check_g_no_dup_binders()
+        assert post_check.passed is True, (
+            f"post-kill check_g should PASS; got {post_check.detail!r}"
+        )
+
+        # The kept daemon (p1) should still be alive; the other should be dead
+        # within a generous timeout (kill is SIGKILL, instant on macOS).
+        assert p1.poll() is None, "expected oldest daemon (p1) to survive"
+        # Allow up to 5s for SIGKILL signal delivery + reap, polling every 0.1s.
+        deadline = time.monotonic() + 5.0
+        while time.monotonic() < deadline and p2.poll() is None:
+            time.sleep(0.1)
+        assert p2.poll() is not None, "expected younger daemon (p2) to be dead"
+    finally:
+        # Unconditional cleanup: SIGKILL and reap whichever daemon is still
+        # running, regardless of how the assertions above turned out.
+        for proc in (p1, p2):
+            if proc.poll() is None:
+                try:
+                    proc.send_signal(signal.SIGKILL)
+                    proc.wait(timeout=3)
+                except (subprocess.TimeoutExpired, ProcessLookupError):
+                    pass
+
+
+@pytest.mark.skip(
+    reason=(
+        "Phase 10.6 Plan 10.6-01 Task 1.5: single-machine "
+        "LifecycleLock prevents two daemons from both binding the same "
+        "IAI_MCP_STORE. Daemon #2 raises LifecycleLockConflict and exits "
+        "1 before bind. End-to-end recovery from dup-binders cannot run "
+        "because the dup-binders state is now impossible to construct."
+    )
+)
+def test_doctor_apply_yes_recovers_from_dup_binders(isolated_daemon_paths):
+    """H: end-to-end. 2 dup-binder daemons → cmd_doctor(apply=True, yes=True)
+    drives the kill_dup_binders repair → re-check returns 0 OR exit 2 only
+    if a non-related check (e.g., (a) state desync) FAILs.
+
+    NB: spawning two real daemons against the same socket inevitably leaves
+    daemon-state.json pointing at one of the two PIDs (whichever wrote last).
+    After kill_dup_binders, if the survivor is the one daemon-state recorded,
+    check_a passes; if the survivor is the OTHER daemon, check_a FAILs and the
+    respawn action triggers, which (because the surviving daemon already binds
+    the socket) yields a launchd-react-noop OR a benign respawn-timeout. The
+    relevant assertion for THIS test is the dup-binder repair specifically:
+    after recovery, lsof reports at most 1 binder for our socket path. The
+    overall rc and check_a status are looser assertions because they depend
+    on the state-file-vs-survivor coincidence.
+
+    Currently skipped unconditionally; see the skip reason on the decorator.
+    """
+    from iai_mcp.doctor import (
+        _extract_binder_pids,
+        check_g_no_dup_binders,
+        cmd_doctor,
+    )
+
+    sock_path, _, store_dir, _ = isolated_daemon_paths
+    # The fixture has already repointed HOME at its sandbox directory.
+    home = Path(os.environ["HOME"])
+
+    p1, p2 = _spawn_dup_daemons(sock_path, store_dir, home)
+    try:
+        # Sanity: dup-binder is detectable.
+        pre = check_g_no_dup_binders()
+        assert pre.passed is False, f"pre: dup-binder should FAIL; got {pre.detail!r}"
+
+        # cmd_doctor is called in-process with a hand-built Namespace that
+        # carries the apply and yes flags.
+        args = argparse.Namespace(apply=True, yes=True)
+        rc = cmd_doctor(args)
+
+        # The critical observable: dup-binders cleared.
+        post_check = check_g_no_dup_binders()
+        assert post_check.passed is True, (
+            f"post-recovery: check_g should PASS; got {post_check.detail!r}"
+        )
+        # rc may be 0 (everything green) or 2 (only check_a survived as FAIL
+        # because the state-file PID points at the killed daemon rather than
+        # the survivor); both prove the dup-binder repair mechanism worked.
+        # rc=1 would mean --apply never ran the repair (regression).
+        assert rc in (0, 2), (
+            f"cmd_doctor rc={rc} unexpected; allowed 0 (full recovery) or 2 "
+            f"(dup-binders fixed but state-file desync persists)."
+        )
+
+        # Belt-and-suspenders: lsof confirms at most 1 binder remains (0 is
+        # tolerated as well — the assertion below is `<= 1`).
+        lsof_out = subprocess.run(
+            ["lsof", "-U", "-F", "pn"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+            check=False,
+        ).stdout
+        binders = _extract_binder_pids(lsof_out, sock_path)
+        assert len(binders) <= 1, (
+            f"after recovery, expected ≤1 binder for {sock_path}; got {binders}"
+        )
+    finally:
+        # Unconditional cleanup: SIGKILL and reap whichever daemon is still
+        # running, regardless of how the assertions above turned out.
+        for proc in (p1, p2):
+            if proc.poll() is None:
+                try:
+                    proc.send_signal(signal.SIGKILL)
+                    proc.wait(timeout=3)
+                except (subprocess.TimeoutExpired, ProcessLookupError):
+                    pass