iai-mcp-opencode/tests/test_doctor_multi_binder.py

"""Phase 7.1 R6 / D7.1-05 — doctor.py multi-binder detection + repair.

Test matrix (8 tests):
  A. _extract_binder_pids parses lsof -F pn output → set[int]
  B. _extract_binder_pids skips PIDs bound to UNRELATED sockets
  C. _extract_binder_pids handles empty input → empty set
  D. check_g_no_dup_binders skips when socket file absent (PASS-with-skip)
  E. check_g_no_dup_binders PASSes with single binder (multiprocessing worker)
  F. check_g_no_dup_binders FAILs with two binders (regression-trap centerpiece)
  G. _kill_dup_binders keeps oldest, kills the rest (real subprocess daemons)
  H. iai-mcp doctor --apply --yes recovers from dup-binder scenario (e2e)

A-D: pure unit tests, no daemon, fast (<1s combined).
E-F: in-process multiprocessing workers — distinct PIDs, lsof-visible.
G-H: real iai_mcp.daemon subprocesses — required because _kill_dup_binders
     filters by 'iai_mcp.daemon' substring in psutil cmdline (wrong-PID-kill
     mitigation). Isolated by HIGH-4 LOCK env propagation pattern from
     test_doctor_apply_recovery.py:isolated_daemon_paths.

Skip on non-POSIX (AF_UNIX requirement).
"""
from __future__ import annotations

import argparse
import multiprocessing as mp
import os
import platform
import signal
import socket
import subprocess
import sys
import time
from pathlib import Path

import psutil
import pytest


pytestmark = pytest.mark.skipif(
    platform.system() == "Windows",
    reason="POSIX AF_UNIX required (lsof -U + multiprocessing socket binders)",
)


# ---------------------------------------------------------------------------
# Section 1 — pure unit tests for _extract_binder_pids (A, B, C)
# ---------------------------------------------------------------------------


def test_extract_binder_pids_parses_lsof_output():
    """A: hand-crafted lsof -F pn output → expected PID set.

    lsof -F pn format alternates lines `p<pid>` and `n<filename>`. Each
    PID is followed by 0+ name entries until the next `p<pid>`.
    """
    from iai_mcp.doctor import _extract_binder_pids

    target = Path("/tmp/iai-test/d.sock")
    lsof_output = "\n".join([
        "p12345",
        f"n{target}",
        "p67890",
        f"n{target}",
        "p99999",
        "n/tmp/other-app/socket",
    ])

    pids = _extract_binder_pids(lsof_output, target)

    assert pids == {12345, 67890}, f"expected {{12345, 67890}}, got {pids}"


def test_extract_binder_pids_skips_unrelated_sockets():
    """B: lsof output with multiple sockets; only PIDs holding OUR path are returned."""
    from iai_mcp.doctor import _extract_binder_pids

    target = Path("/tmp/iai-test/d.sock")
    lsof_output = "\n".join([
        "p1001",
        "n/var/run/some-other-daemon.sock",
        "p2002",
        f"n{target}",
        "p3003",
        "n/tmp/X11-unix/X0",
        "p4004",
        f"n{target}",
        "n/some/extra/name/for/p4004",  # PID 4004 holds multiple fds
    ])

    pids = _extract_binder_pids(lsof_output, target)

    assert pids == {2002, 4004}, f"expected {{2002, 4004}}, got {pids}"


def test_extract_binder_pids_handles_empty_output():
    """C: empty input → empty set (defensive corner case)."""
    from iai_mcp.doctor import _extract_binder_pids

    target = Path("/tmp/anywhere.sock")
    assert _extract_binder_pids("", target) == set()
    assert _extract_binder_pids("\n\n\n", target) == set()
    # Malformed: PID line without name line; name line without preceding PID.
    assert _extract_binder_pids("p123\nXgarbage\np\n", target) == set()


# ---------------------------------------------------------------------------
# Section 2 — check_g_no_dup_binders (D, E, F) using monkeypatched socket path
# ---------------------------------------------------------------------------


@pytest.fixture
def short_socket_path(tmp_path, monkeypatch):
    """Yield a short socket path under /tmp (AF_UNIX 104-byte cap on macOS).

    Honors the IAI_DAEMON_SOCKET_PATH env override that doctor._resolve_socket_path
    consults. Cleans up the socket file on teardown.
    """
    sock_dir = Path(f"/tmp/iai-mb-{os.getpid()}-{id(tmp_path)}")
    sock_dir.mkdir(parents=True, exist_ok=True)
    sock_path = sock_dir / "d.sock"
    monkeypatch.setenv("IAI_DAEMON_SOCKET_PATH", str(sock_path))
    try:
        yield sock_path
    finally:
        try:
            if sock_path.exists():
                sock_path.unlink()
        except OSError:
            pass
        try:
            sock_dir.rmdir()
        except OSError:
            pass


def test_check_g_no_socket_skips(short_socket_path, monkeypatch):
    """D: socket file absent → PASS-with-skip detail "no socket file (skip)".

    Mirrors check_d_no_orphan_core's skip pattern when the resource isn't
    present (no false-positive on a clean machine).
    """
    from iai_mcp.doctor import check_g_no_dup_binders

    # Fixture set the env var; ensure no file exists.
    assert not short_socket_path.exists()

    result = check_g_no_dup_binders()

    assert result.passed is True
    assert "no socket file" in result.detail


# --- Multiprocessing worker for Tests E and F (distinct PIDs) ---------------


def _bind_socket_worker(sock_path_str: str, ready_event: mp.Event, exit_event: mp.Event) -> None:
    """Subprocess worker: bind an AF_UNIX socket to sock_path, signal ready,
    block until exit_event is set.

    Each multiprocessing.Process child has a distinct PID and lsof reports
    its socket fd. Used by Tests E (1 binder) and F (2 binders) to construct
    deterministic dup-binder scenarios without a real iai_mcp.daemon (whose
    boot cost is ~3-10s).
    """
    s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    try:
        # Each worker handles its own bind; for the 2-binder scenario, the
        # parent unlinks the path between worker spawns so each worker
        # successfully bind()s a fresh inode at the same name.
        s.bind(sock_path_str)
        s.listen(5)
        ready_event.set()
        # Block until parent signals shutdown.
        exit_event.wait(timeout=30)
    finally:
        try:
            s.close()
        except OSError:
            pass


def test_check_g_single_binder_passes(short_socket_path):
    """E: ONE binder bound to the socket → check_g returns PASS with "1 binder(s)".

    Uses a multiprocessing.Process worker (distinct PID from the pytest
    process) so lsof has something to enumerate.
    """
    from iai_mcp.doctor import check_g_no_dup_binders

    # NOTE: use 'spawn' (not 'fork') even on Darwin — lancedb is not fork-safe
    # (UserWarning surfaces with fork on macOS). Workers don't touch lancedb,
    # but the parent test process has it imported transitively; spawn isolates.
    ctx = mp.get_context("spawn")
    ready = ctx.Event()
    exit_signal = ctx.Event()
    worker = ctx.Process(
        target=_bind_socket_worker,
        args=(str(short_socket_path), ready, exit_signal),
    )
    worker.start()
    try:
        assert ready.wait(timeout=10), "binder worker never signaled ready"
        # Tiny settle so lsof's cache reflects the bind.
        time.sleep(0.2)

        result = check_g_no_dup_binders()

        assert result.passed is True, (
            f"single-binder scenario should PASS; got detail={result.detail!r}"
        )
        assert "1 binder" in result.detail, f"unexpected detail: {result.detail!r}"
    finally:
        exit_signal.set()
        worker.join(timeout=5)
        if worker.is_alive():
            worker.terminate()
            worker.join(timeout=2)


def test_check_g_two_binders_fails(short_socket_path):
    """F: TWO binders bound to the same socket path → check_g returns FAIL.

    REGRESSION-TRAP CENTERPIECE. Spawns 2 multiprocessing workers, each
    binding to the same socket path with an unlink between them so both
    bind() calls succeed at the OS level. lsof reports both PIDs as
    holding the path; check_g detects the singleton-invariant violation.

    This is exactly the failure mode Phase 7.1's launchd architecture
    structurally prevents in production — the test bypasses launchd by
    hand-binding sockets in worker processes. On post-Phase 7.1 production,
    this scenario can only occur if a user manually bypasses launchd.
    """
    from iai_mcp.doctor import _extract_binder_pids, check_g_no_dup_binders

    # NOTE: use 'spawn' (not 'fork') even on Darwin — lancedb is not fork-safe
    # (UserWarning surfaces with fork on macOS). Workers don't touch lancedb,
    # but the parent test process has it imported transitively; spawn isolates.
    ctx = mp.get_context("spawn")

    # Worker 1
    ready1 = ctx.Event()
    exit1 = ctx.Event()
    w1 = ctx.Process(
        target=_bind_socket_worker,
        args=(str(short_socket_path), ready1, exit1),
    )
    w1.start()

    # Worker 2 — race-window simulation: unlink the path so worker 2's bind()
    # creates a fresh inode at the same name. Worker 1's fd still holds the
    # ORIGINAL inode (unlinked but kept alive by the open fd); worker 2 holds
    # the NEW inode at the same path. lsof reports both PIDs.
    ready2 = ctx.Event()
    exit2 = ctx.Event()
    w2 = None
    try:
        assert ready1.wait(timeout=10), "worker 1 never signaled ready"
        # Unlink so the second bind doesn't EADDRINUSE.
        try:
            short_socket_path.unlink()
        except OSError:
            pass
        w2 = ctx.Process(
            target=_bind_socket_worker,
            args=(str(short_socket_path), ready2, exit2),
        )
        w2.start()
        assert ready2.wait(timeout=10), "worker 2 never signaled ready"
        time.sleep(0.3)  # let lsof catch up

        # Belt-and-suspenders: confirm via the parser directly that lsof sees both.
        lsof_out = subprocess.run(
            ["lsof", "-U", "-F", "pn"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        ).stdout
        binder_pids = _extract_binder_pids(lsof_out, short_socket_path)
        assert {w1.pid, w2.pid}.issubset(binder_pids), (
            f"lsof should report both worker PIDs as binders; got {binder_pids} "
            f"(workers: {w1.pid}, {w2.pid})"
        )

        # Centerpiece assertion: check_g detects the dup-binder scenario.
        result = check_g_no_dup_binders()

        assert result.passed is False, (
            f"two-binder scenario should FAIL; got detail={result.detail!r}"
        )
        # Detail mentions both PIDs.
        assert str(w1.pid) in result.detail, f"detail missing PID {w1.pid}: {result.detail!r}"
        assert str(w2.pid) in result.detail, f"detail missing PID {w2.pid}: {result.detail!r}"
    finally:
        exit1.set()
        if w2 is not None:
            exit2.set()
        for proc in (w1, w2):
            if proc is None:
                continue
            proc.join(timeout=5)
            if proc.is_alive():
                proc.terminate()
                proc.join(timeout=2)


# ---------------------------------------------------------------------------
# Section 3 — _kill_dup_binders + e2e doctor --apply (G, H)
# ---------------------------------------------------------------------------


@pytest.fixture
def isolated_daemon_paths(tmp_path, monkeypatch):
    """HOME + socket + store + crypto env propagation for real-daemon tests.

    Mirrors test_doctor_apply_recovery.py:isolated_daemon_paths verbatim
    (HIGH-4 LOCK precedent, Plan 07-04). Required because _kill_dup_binders
    filters by 'iai_mcp.daemon' substring in psutil cmdline — only real
    iai_mcp.daemon subprocesses are killable, so multiprocessing workers
    cannot serve Tests G/H.
    """
    iai_dir = tmp_path / ".iai-mcp"
    iai_dir.mkdir(parents=True, exist_ok=True)

    state_path = iai_dir / ".daemon-state.json"
    lock_path = iai_dir / ".lock"
    store_dir = iai_dir / "store"
    store_dir.mkdir(parents=True, exist_ok=True)

    sock_dir = Path(f"/tmp/iai-mb2-{os.getpid()}-{id(tmp_path)}")
    sock_dir.mkdir(parents=True, exist_ok=True)
    sock_path = sock_dir / "d.sock"

    real_hf_home = Path.home() / ".cache" / "huggingface"

    monkeypatch.setenv("HOME", str(tmp_path))
    monkeypatch.setenv("HF_HOME", str(real_hf_home))
    monkeypatch.setenv("IAI_DAEMON_SOCKET_PATH", str(sock_path))
    monkeypatch.setenv("IAI_MCP_STORE", str(store_dir))
    monkeypatch.setenv("IAI_DAEMON_IDLE_SHUTDOWN_SECS", "99999")
    monkeypatch.setenv(
        "PYTHON_KEYRING_BACKEND", "keyring.backends.fail.Keyring"
    )
    monkeypatch.setenv("IAI_MCP_CRYPTO_PASSPHRASE", "test-mb-passphrase")
    import keyring.core

    keyring.core._keyring_backend = None

    from iai_mcp import cli, daemon_state

    monkeypatch.setattr(daemon_state, "STATE_PATH", state_path)
    monkeypatch.setattr(cli, "LOCK_PATH", lock_path)
    monkeypatch.setattr(cli, "SOCKET_PATH", sock_path)

    try:
        yield sock_path, state_path, store_dir, lock_path
    finally:
        _kill_test_daemons(sock_path)
        try:
            if sock_path.exists():
                sock_path.unlink()
        except OSError:
            pass
        try:
            sock_dir.rmdir()
        except OSError:
            pass
        keyring.core._keyring_backend = None


def _spawn_daemon(sock_path: Path, store_dir: Path, home: Path) -> subprocess.Popen:
    """Spawn `python -m iai_mcp.daemon` with the test's env propagated."""
    env = os.environ.copy()
    env["HOME"] = str(home)
    env["IAI_DAEMON_SOCKET_PATH"] = str(sock_path)
    env["IAI_MCP_STORE"] = str(store_dir)
    env["IAI_DAEMON_IDLE_SHUTDOWN_SECS"] = "99999"
    env["PYTHON_KEYRING_BACKEND"] = "keyring.backends.fail.Keyring"
    env["IAI_MCP_CRYPTO_PASSPHRASE"] = "test-mb-passphrase"
    return subprocess.Popen(
        [sys.executable, "-m", "iai_mcp.daemon"],
        env=env,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )


def _wait_for_socket(sock_path: Path, timeout_sec: float = 30.0) -> bool:
    deadline = time.monotonic() + timeout_sec
    while time.monotonic() < deadline:
        if sock_path.exists():
            return True
        time.sleep(0.1)
    return False


def _kill_test_daemons(sock_path: Path) -> None:
    """Match-by-env cleanup: SIGTERM iai_mcp.daemon subprocesses whose
    psutil environ has our IAI_DAEMON_SOCKET_PATH value. Avoids touching
    the user's real production daemon.
    """
    target = str(sock_path)
    for p in psutil.process_iter(["pid", "cmdline"]):
        try:
            cl = " ".join(p.info.get("cmdline") or [])
            if "iai_mcp.daemon" not in cl:
                continue
            try:
                env = p.environ()
            except (psutil.AccessDenied, psutil.NoSuchProcess):
                continue
            if env.get("IAI_DAEMON_SOCKET_PATH") == target:
                try:
                    p.send_signal(signal.SIGTERM)
                    p.wait(timeout=3)
                except (psutil.NoSuchProcess, psutil.TimeoutExpired):
                    try:
                        p.send_signal(signal.SIGKILL)
                    except psutil.NoSuchProcess:
                        pass
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue


def _spawn_dup_daemons(
    sock_path: Path, store_dir: Path, home: Path
) -> tuple[subprocess.Popen, subprocess.Popen]:
    """Spawn 2 real iai_mcp.daemon subprocesses both bound to sock_path.

    Race-window simulation per CONTEXT.md hint: spawn daemon #1, wait for
    socket, unlink (so daemon #2 can bind a fresh inode at the same path),
    spawn daemon #2, wait for socket. Daemon #1's listening fd still holds
    the original (now unlinked) inode; daemon #2 holds the new inode. lsof
    reports both PIDs as binders of the same path.
    """
    p1 = _spawn_daemon(sock_path, store_dir, home)
    if not _wait_for_socket(sock_path, timeout_sec=30):
        try:
            p1.kill()
        except ProcessLookupError:
            pass
        raise AssertionError("daemon #1 never bound socket within 30s")

    # Race-window: unlink so daemon #2's bind() succeeds without EADDRINUSE.
    try:
        sock_path.unlink()
    except OSError:
        pass

    p2 = _spawn_daemon(sock_path, store_dir, home)
    if not _wait_for_socket(sock_path, timeout_sec=30):
        try:
            p2.kill()
        except ProcessLookupError:
            pass
        try:
            p1.kill()
        except ProcessLookupError:
            pass
        raise AssertionError("daemon #2 never bound socket within 30s")

    # Settle so lsof reflects both binders.
    time.sleep(0.5)
    return p1, p2


@pytest.mark.skip(
    reason=(
        "Phase 10.6 Plan 10.6-01 Task 1.5: single-machine "
        "LifecycleLock prevents two daemons from both binding the same "
        "IAI_MCP_STORE. Daemon #2 raises LifecycleLockConflict and exits "
        "1 before bind. The dup-binder integration scenario is now "
        "impossible by design. The unit tests in this file "
        "(test_extract_binder_pids_*, test_check_g_*) still cover "
        "check_g's detection logic without spawning two real daemons."
    )
)
def test_kill_dup_binders_keeps_oldest(isolated_daemon_paths):
    """G: 2 real daemons → _kill_dup_binders kills younger, keeps oldest.

    Re-running check_g afterward returns PASS (1 binder remaining).
    """
    from iai_mcp.doctor import (
        _extract_binder_pids,
        _kill_dup_binders,
        check_g_no_dup_binders,
    )

    sock_path, _, store_dir, _ = isolated_daemon_paths
    home = Path(os.environ["HOME"])

    p1, p2 = _spawn_dup_daemons(sock_path, store_dir, home)
    try:
        # Pre-condition: both daemons must show up as binders for our socket.
        lsof_out = subprocess.run(
            ["lsof", "-U", "-F", "pn"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        ).stdout
        binders = _extract_binder_pids(lsof_out, sock_path)
        assert {p1.pid, p2.pid}.issubset(binders), (
            f"expected both daemon PIDs in binders; got {binders} "
            f"(daemons: {p1.pid}, {p2.pid})"
        )
        pre_check = check_g_no_dup_binders()
        assert pre_check.passed is False, (
            f"pre-condition: dup-binder scenario should FAIL check_g; "
            f"got {pre_check.detail!r}"
        )

        # Kill the younger daemon. p1 was spawned first → has greater etime →
        # is the keep_pid; p2 should be killed.
        ok, msg, ms = _kill_dup_binders()

        assert ok is True, f"_kill_dup_binders returned ok=False: {msg}"
        assert "kept PID" in msg, f"msg missing 'kept PID': {msg!r}"
        assert "killed" in msg, f"msg missing 'killed': {msg!r}"
        assert ms < 10_000, f"_kill_dup_binders took {ms}ms (>10s); too slow"

        # After kill, a follow-up check_g should report 1 (or 0 — race) binder.
        post_check = check_g_no_dup_binders()
        assert post_check.passed is True, (
            f"post-kill check_g should PASS; got {post_check.detail!r}"
        )

        # The kept daemon (p1) should still be alive; the other should be dead
        # within a generous timeout (kill is SIGKILL, instant on macOS).
        assert p1.poll() is None, "expected oldest daemon (p1) to survive"
        # Allow up to 2s for SIGKILL signal delivery + reap.
        deadline = time.monotonic() + 5.0
        while time.monotonic() < deadline and p2.poll() is None:
            time.sleep(0.1)
        assert p2.poll() is not None, "expected younger daemon (p2) to be dead"
    finally:
        for proc in (p1, p2):
            if proc.poll() is None:
                try:
                    proc.send_signal(signal.SIGKILL)
                    proc.wait(timeout=3)
                except (subprocess.TimeoutExpired, ProcessLookupError):
                    pass


@pytest.mark.skip(
    reason=(
        "Phase 10.6 Plan 10.6-01 Task 1.5: single-machine "
        "LifecycleLock prevents two daemons from both binding the same "
        "IAI_MCP_STORE. Daemon #2 raises LifecycleLockConflict and exits "
        "1 before bind. End-to-end recovery from dup-binders cannot run "
        "because the dup-binders state is now impossible to construct."
    )
)
def test_doctor_apply_yes_recovers_from_dup_binders(isolated_daemon_paths):
    """H: end-to-end. 2 dup-binder daemons → cmd_doctor(apply=True, yes=True)
    drives the kill_dup_binders repair → re-check returns 0 OR exit 2 only
    if a non-related check (e.g., (a) state desync) FAILs.

    NB: spawning two real daemons against the same socket inevitably leaves
    daemon-state.json pointing at one of the two PIDs (whichever wrote last).
    After kill_dup_binders, if the survivor is the one daemon-state recorded,
    check_a passes; if the survivor is the OTHER daemon, check_a FAILs and the
    respawn action triggers, which (because the surviving daemon already binds
    the socket) yields a launchd-react-noop OR a benign respawn-timeout. The
    relevant assertion for THIS test is the dup-binder repair specifically:
    after recovery, lsof reports exactly 1 binder for our socket path. The
    overall rc and check_a status are looser assertions because they depend
    on the state-file-vs-survivor coincidence.
    """
    from iai_mcp.doctor import (
        _extract_binder_pids,
        check_g_no_dup_binders,
        cmd_doctor,
    )

    sock_path, _, store_dir, _ = isolated_daemon_paths
    home = Path(os.environ["HOME"])

    p1, p2 = _spawn_dup_daemons(sock_path, store_dir, home)
    try:
        # Sanity: dup-binder is detectable.
        pre = check_g_no_dup_binders()
        assert pre.passed is False, f"pre: dup-binder should FAIL; got {pre.detail!r}"

        args = argparse.Namespace(apply=True, yes=True)
        rc = cmd_doctor(args)

        # The critical observable: dup-binders cleared.
        post_check = check_g_no_dup_binders()
        assert post_check.passed is True, (
            f"post-recovery: check_g should PASS; got {post_check.detail!r}"
        )
        # rc may be 0 (everything green) or 2 (only check_a survived as FAIL
        # because state-file PID points at the killed survivor); both prove
        # the dup-binder repair mechanism worked. rc=1 would mean --apply
        # never ran the repair (regression).
        assert rc in (0, 2), (
            f"cmd_doctor rc={rc} unexpected; allowed 0 (full recovery) or 2 "
            f"(dup-binders fixed but state-file desync persists)."
        )

        # Belt-and-suspenders: lsof confirms exactly 1 binder remains.
        lsof_out = subprocess.run(
            ["lsof", "-U", "-F", "pn"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        ).stdout
        binders = _extract_binder_pids(lsof_out, sock_path)
        assert len(binders) <= 1, (
            f"after recovery, expected ≤1 binder for {sock_path}; got {binders}"
        )
    finally:
        for proc in (p1, p2):
            if proc.poll() is None:
                try:
                    proc.send_signal(signal.SIGKILL)
                    proc.wait(timeout=3)
                except (subprocess.TimeoutExpired, ProcessLookupError):
                    pass