"""Phase 7.1 R6 / D7.1-05 — doctor.py multi-binder detection + repair. Test matrix (8 tests): A. _extract_binder_pids parses lsof -F pn output → set[int] B. _extract_binder_pids skips PIDs bound to UNRELATED sockets C. _extract_binder_pids handles empty input → empty set D. check_g_no_dup_binders skips when socket file absent (PASS-with-skip) E. check_g_no_dup_binders PASSes with single binder (multiprocessing worker) F. check_g_no_dup_binders FAILs with two binders (regression-trap centerpiece) G. _kill_dup_binders keeps oldest, kills the rest (real subprocess daemons) H. iai-mcp doctor --apply --yes recovers from dup-binder scenario (e2e) A-D: pure unit tests, no daemon, fast (<1s combined). E-F: in-process multiprocessing workers — distinct PIDs, lsof-visible. G-H: real iai_mcp.daemon subprocesses — required because _kill_dup_binders filters by 'iai_mcp.daemon' substring in psutil cmdline (wrong-PID-kill mitigation). Isolated by HIGH-4 LOCK env propagation pattern from test_doctor_apply_recovery.py:isolated_daemon_paths. Skip on non-POSIX (AF_UNIX requirement). """ from __future__ import annotations import argparse import multiprocessing as mp import os import platform import signal import socket import subprocess import sys import time from pathlib import Path import psutil import pytest pytestmark = pytest.mark.skipif( platform.system() == "Windows", reason="POSIX AF_UNIX required (lsof -U + multiprocessing socket binders)", ) # --------------------------------------------------------------------------- # Section 1 — pure unit tests for _extract_binder_pids (A, B, C) # --------------------------------------------------------------------------- def test_extract_binder_pids_parses_lsof_output(): """A: hand-crafted lsof -F pn output → expected PID set. lsof -F pn format alternates lines `p` and `n`. Each PID is followed by 0+ name entries until the next `p`. """ from iai_mcp.doctor import _extract_binder_pids target = Path("/tmp/iai-test/d.sock") lsof_output = "\n".join([ "p12345", f"n{target}", "p67890", f"n{target}", "p99999", "n/tmp/other-app/socket", ]) pids = _extract_binder_pids(lsof_output, target) assert pids == {12345, 67890}, f"expected {{12345, 67890}}, got {pids}" def test_extract_binder_pids_skips_unrelated_sockets(): """B: lsof output with multiple sockets; only PIDs holding OUR path are returned.""" from iai_mcp.doctor import _extract_binder_pids target = Path("/tmp/iai-test/d.sock") lsof_output = "\n".join([ "p1001", "n/var/run/some-other-daemon.sock", "p2002", f"n{target}", "p3003", "n/tmp/X11-unix/X0", "p4004", f"n{target}", "n/some/extra/name/for/p4004", # PID 4004 holds multiple fds ]) pids = _extract_binder_pids(lsof_output, target) assert pids == {2002, 4004}, f"expected {{2002, 4004}}, got {pids}" def test_extract_binder_pids_handles_empty_output(): """C: empty input → empty set (defensive corner case).""" from iai_mcp.doctor import _extract_binder_pids target = Path("/tmp/anywhere.sock") assert _extract_binder_pids("", target) == set() assert _extract_binder_pids("\n\n\n", target) == set() # Malformed: PID line without name line; name line without preceding PID. assert _extract_binder_pids("p123\nXgarbage\np\n", target) == set() # --------------------------------------------------------------------------- # Section 2 — check_g_no_dup_binders (D, E, F) using monkeypatched socket path # --------------------------------------------------------------------------- @pytest.fixture def short_socket_path(tmp_path, monkeypatch): """Yield a short socket path under /tmp (AF_UNIX 104-byte cap on macOS). Honors the IAI_DAEMON_SOCKET_PATH env override that doctor._resolve_socket_path consults. Cleans up the socket file on teardown. """ sock_dir = Path(f"/tmp/iai-mb-{os.getpid()}-{id(tmp_path)}") sock_dir.mkdir(parents=True, exist_ok=True) sock_path = sock_dir / "d.sock" monkeypatch.setenv("IAI_DAEMON_SOCKET_PATH", str(sock_path)) try: yield sock_path finally: try: if sock_path.exists(): sock_path.unlink() except OSError: pass try: sock_dir.rmdir() except OSError: pass def test_check_g_no_socket_skips(short_socket_path, monkeypatch): """D: socket file absent → PASS-with-skip detail "no socket file (skip)". Mirrors check_d_no_orphan_core's skip pattern when the resource isn't present (no false-positive on a clean machine). """ from iai_mcp.doctor import check_g_no_dup_binders # Fixture set the env var; ensure no file exists. assert not short_socket_path.exists() result = check_g_no_dup_binders() assert result.passed is True assert "no socket file" in result.detail # --- Multiprocessing worker for Tests E and F (distinct PIDs) --------------- def _bind_socket_worker(sock_path_str: str, ready_event: mp.Event, exit_event: mp.Event) -> None: """Subprocess worker: bind an AF_UNIX socket to sock_path, signal ready, block until exit_event is set. Each multiprocessing.Process child has a distinct PID and lsof reports its socket fd. Used by Tests E (1 binder) and F (2 binders) to construct deterministic dup-binder scenarios without a real iai_mcp.daemon (whose boot cost is ~3-10s). """ s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) try: # Each worker handles its own bind; for the 2-binder scenario, the # parent unlinks the path between worker spawns so each worker # successfully bind()s a fresh inode at the same name. s.bind(sock_path_str) s.listen(5) ready_event.set() # Block until parent signals shutdown. exit_event.wait(timeout=30) finally: try: s.close() except OSError: pass def test_check_g_single_binder_passes(short_socket_path): """E: ONE binder bound to the socket → check_g returns PASS with "1 binder(s)". Uses a multiprocessing.Process worker (distinct PID from the pytest process) so lsof has something to enumerate. """ from iai_mcp.doctor import check_g_no_dup_binders # NOTE: use 'spawn' (not 'fork') even on Darwin — lancedb is not fork-safe # (UserWarning surfaces with fork on macOS). Workers don't touch lancedb, # but the parent test process has it imported transitively; spawn isolates. ctx = mp.get_context("spawn") ready = ctx.Event() exit_signal = ctx.Event() worker = ctx.Process( target=_bind_socket_worker, args=(str(short_socket_path), ready, exit_signal), ) worker.start() try: assert ready.wait(timeout=10), "binder worker never signaled ready" # Tiny settle so lsof's cache reflects the bind. time.sleep(0.2) result = check_g_no_dup_binders() assert result.passed is True, ( f"single-binder scenario should PASS; got detail={result.detail!r}" ) assert "1 binder" in result.detail, f"unexpected detail: {result.detail!r}" finally: exit_signal.set() worker.join(timeout=5) if worker.is_alive(): worker.terminate() worker.join(timeout=2) def test_check_g_two_binders_fails(short_socket_path): """F: TWO binders bound to the same socket path → check_g returns FAIL. REGRESSION-TRAP CENTERPIECE. Spawns 2 multiprocessing workers, each binding to the same socket path with an unlink between them so both bind() calls succeed at the OS level. lsof reports both PIDs as holding the path; check_g detects the singleton-invariant violation. This is exactly the failure mode Phase 7.1's launchd architecture structurally prevents in production — the test bypasses launchd by hand-binding sockets in worker processes. On post-Phase 7.1 production, this scenario can only occur if a user manually bypasses launchd. """ from iai_mcp.doctor import _extract_binder_pids, check_g_no_dup_binders # NOTE: use 'spawn' (not 'fork') even on Darwin — lancedb is not fork-safe # (UserWarning surfaces with fork on macOS). Workers don't touch lancedb, # but the parent test process has it imported transitively; spawn isolates. ctx = mp.get_context("spawn") # Worker 1 ready1 = ctx.Event() exit1 = ctx.Event() w1 = ctx.Process( target=_bind_socket_worker, args=(str(short_socket_path), ready1, exit1), ) w1.start() # Worker 2 — race-window simulation: unlink the path so worker 2's bind() # creates a fresh inode at the same name. Worker 1's fd still holds the # ORIGINAL inode (unlinked but kept alive by the open fd); worker 2 holds # the NEW inode at the same path. lsof reports both PIDs. ready2 = ctx.Event() exit2 = ctx.Event() w2 = None try: assert ready1.wait(timeout=10), "worker 1 never signaled ready" # Unlink so the second bind doesn't EADDRINUSE. try: short_socket_path.unlink() except OSError: pass w2 = ctx.Process( target=_bind_socket_worker, args=(str(short_socket_path), ready2, exit2), ) w2.start() assert ready2.wait(timeout=10), "worker 2 never signaled ready" time.sleep(0.3) # let lsof catch up # Belt-and-suspenders: confirm via the parser directly that lsof sees both. lsof_out = subprocess.run( ["lsof", "-U", "-F", "pn"], capture_output=True, text=True, timeout=5, check=False, ).stdout binder_pids = _extract_binder_pids(lsof_out, short_socket_path) assert {w1.pid, w2.pid}.issubset(binder_pids), ( f"lsof should report both worker PIDs as binders; got {binder_pids} " f"(workers: {w1.pid}, {w2.pid})" ) # Centerpiece assertion: check_g detects the dup-binder scenario. result = check_g_no_dup_binders() assert result.passed is False, ( f"two-binder scenario should FAIL; got detail={result.detail!r}" ) # Detail mentions both PIDs. assert str(w1.pid) in result.detail, f"detail missing PID {w1.pid}: {result.detail!r}" assert str(w2.pid) in result.detail, f"detail missing PID {w2.pid}: {result.detail!r}" finally: exit1.set() if w2 is not None: exit2.set() for proc in (w1, w2): if proc is None: continue proc.join(timeout=5) if proc.is_alive(): proc.terminate() proc.join(timeout=2) # --------------------------------------------------------------------------- # Section 3 — _kill_dup_binders + e2e doctor --apply (G, H) # --------------------------------------------------------------------------- @pytest.fixture def isolated_daemon_paths(tmp_path, monkeypatch): """HOME + socket + store + crypto env propagation for real-daemon tests. Mirrors test_doctor_apply_recovery.py:isolated_daemon_paths verbatim (HIGH-4 LOCK precedent, Plan 07-04). Required because _kill_dup_binders filters by 'iai_mcp.daemon' substring in psutil cmdline — only real iai_mcp.daemon subprocesses are killable, so multiprocessing workers cannot serve Tests G/H. """ iai_dir = tmp_path / ".iai-mcp" iai_dir.mkdir(parents=True, exist_ok=True) state_path = iai_dir / ".daemon-state.json" lock_path = iai_dir / ".lock" store_dir = iai_dir / "store" store_dir.mkdir(parents=True, exist_ok=True) sock_dir = Path(f"/tmp/iai-mb2-{os.getpid()}-{id(tmp_path)}") sock_dir.mkdir(parents=True, exist_ok=True) sock_path = sock_dir / "d.sock" real_hf_home = Path.home() / ".cache" / "huggingface" monkeypatch.setenv("HOME", str(tmp_path)) monkeypatch.setenv("HF_HOME", str(real_hf_home)) monkeypatch.setenv("IAI_DAEMON_SOCKET_PATH", str(sock_path)) monkeypatch.setenv("IAI_MCP_STORE", str(store_dir)) monkeypatch.setenv("IAI_DAEMON_IDLE_SHUTDOWN_SECS", "99999") monkeypatch.setenv( "PYTHON_KEYRING_BACKEND", "keyring.backends.fail.Keyring" ) monkeypatch.setenv("IAI_MCP_CRYPTO_PASSPHRASE", "test-mb-passphrase") import keyring.core keyring.core._keyring_backend = None from iai_mcp import cli, daemon_state monkeypatch.setattr(daemon_state, "STATE_PATH", state_path) monkeypatch.setattr(cli, "LOCK_PATH", lock_path) monkeypatch.setattr(cli, "SOCKET_PATH", sock_path) try: yield sock_path, state_path, store_dir, lock_path finally: _kill_test_daemons(sock_path) try: if sock_path.exists(): sock_path.unlink() except OSError: pass try: sock_dir.rmdir() except OSError: pass keyring.core._keyring_backend = None def _spawn_daemon(sock_path: Path, store_dir: Path, home: Path) -> subprocess.Popen: """Spawn `python -m iai_mcp.daemon` with the test's env propagated.""" env = os.environ.copy() env["HOME"] = str(home) env["IAI_DAEMON_SOCKET_PATH"] = str(sock_path) env["IAI_MCP_STORE"] = str(store_dir) env["IAI_DAEMON_IDLE_SHUTDOWN_SECS"] = "99999" env["PYTHON_KEYRING_BACKEND"] = "keyring.backends.fail.Keyring" env["IAI_MCP_CRYPTO_PASSPHRASE"] = "test-mb-passphrase" return subprocess.Popen( [sys.executable, "-m", "iai_mcp.daemon"], env=env, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) def _wait_for_socket(sock_path: Path, timeout_sec: float = 30.0) -> bool: deadline = time.monotonic() + timeout_sec while time.monotonic() < deadline: if sock_path.exists(): return True time.sleep(0.1) return False def _kill_test_daemons(sock_path: Path) -> None: """Match-by-env cleanup: SIGTERM iai_mcp.daemon subprocesses whose psutil environ has our IAI_DAEMON_SOCKET_PATH value. Avoids touching the user's real production daemon. """ target = str(sock_path) for p in psutil.process_iter(["pid", "cmdline"]): try: cl = " ".join(p.info.get("cmdline") or []) if "iai_mcp.daemon" not in cl: continue try: env = p.environ() except (psutil.AccessDenied, psutil.NoSuchProcess): continue if env.get("IAI_DAEMON_SOCKET_PATH") == target: try: p.send_signal(signal.SIGTERM) p.wait(timeout=3) except (psutil.NoSuchProcess, psutil.TimeoutExpired): try: p.send_signal(signal.SIGKILL) except psutil.NoSuchProcess: pass except (psutil.NoSuchProcess, psutil.AccessDenied): continue def _spawn_dup_daemons( sock_path: Path, store_dir: Path, home: Path ) -> tuple[subprocess.Popen, subprocess.Popen]: """Spawn 2 real iai_mcp.daemon subprocesses both bound to sock_path. Race-window simulation per CONTEXT.md hint: spawn daemon #1, wait for socket, unlink (so daemon #2 can bind a fresh inode at the same path), spawn daemon #2, wait for socket. Daemon #1's listening fd still holds the original (now unlinked) inode; daemon #2 holds the new inode. lsof reports both PIDs as binders of the same path. """ p1 = _spawn_daemon(sock_path, store_dir, home) if not _wait_for_socket(sock_path, timeout_sec=30): try: p1.kill() except ProcessLookupError: pass raise AssertionError("daemon #1 never bound socket within 30s") # Race-window: unlink so daemon #2's bind() succeeds without EADDRINUSE. try: sock_path.unlink() except OSError: pass p2 = _spawn_daemon(sock_path, store_dir, home) if not _wait_for_socket(sock_path, timeout_sec=30): try: p2.kill() except ProcessLookupError: pass try: p1.kill() except ProcessLookupError: pass raise AssertionError("daemon #2 never bound socket within 30s") # Settle so lsof reflects both binders. time.sleep(0.5) return p1, p2 @pytest.mark.skip( reason=( "Phase 10.6 Plan 10.6-01 Task 1.5: single-machine " "LifecycleLock prevents two daemons from both binding the same " "IAI_MCP_STORE. Daemon #2 raises LifecycleLockConflict and exits " "1 before bind. The dup-binder integration scenario is now " "impossible by design. The unit tests in this file " "(test_extract_binder_pids_*, test_check_g_*) still cover " "check_g's detection logic without spawning two real daemons." ) ) def test_kill_dup_binders_keeps_oldest(isolated_daemon_paths): """G: 2 real daemons → _kill_dup_binders kills younger, keeps oldest. Re-running check_g afterward returns PASS (1 binder remaining). """ from iai_mcp.doctor import ( _extract_binder_pids, _kill_dup_binders, check_g_no_dup_binders, ) sock_path, _, store_dir, _ = isolated_daemon_paths home = Path(os.environ["HOME"]) p1, p2 = _spawn_dup_daemons(sock_path, store_dir, home) try: # Pre-condition: both daemons must show up as binders for our socket. lsof_out = subprocess.run( ["lsof", "-U", "-F", "pn"], capture_output=True, text=True, timeout=5, check=False, ).stdout binders = _extract_binder_pids(lsof_out, sock_path) assert {p1.pid, p2.pid}.issubset(binders), ( f"expected both daemon PIDs in binders; got {binders} " f"(daemons: {p1.pid}, {p2.pid})" ) pre_check = check_g_no_dup_binders() assert pre_check.passed is False, ( f"pre-condition: dup-binder scenario should FAIL check_g; " f"got {pre_check.detail!r}" ) # Kill the younger daemon. p1 was spawned first → has greater etime → # is the keep_pid; p2 should be killed. ok, msg, ms = _kill_dup_binders() assert ok is True, f"_kill_dup_binders returned ok=False: {msg}" assert "kept PID" in msg, f"msg missing 'kept PID': {msg!r}" assert "killed" in msg, f"msg missing 'killed': {msg!r}" assert ms < 10_000, f"_kill_dup_binders took {ms}ms (>10s); too slow" # After kill, a follow-up check_g should report 1 (or 0 — race) binder. post_check = check_g_no_dup_binders() assert post_check.passed is True, ( f"post-kill check_g should PASS; got {post_check.detail!r}" ) # The kept daemon (p1) should still be alive; the other should be dead # within a generous timeout (kill is SIGKILL, instant on macOS). assert p1.poll() is None, "expected oldest daemon (p1) to survive" # Allow up to 2s for SIGKILL signal delivery + reap. deadline = time.monotonic() + 5.0 while time.monotonic() < deadline and p2.poll() is None: time.sleep(0.1) assert p2.poll() is not None, "expected younger daemon (p2) to be dead" finally: for proc in (p1, p2): if proc.poll() is None: try: proc.send_signal(signal.SIGKILL) proc.wait(timeout=3) except (subprocess.TimeoutExpired, ProcessLookupError): pass @pytest.mark.skip( reason=( "Phase 10.6 Plan 10.6-01 Task 1.5: single-machine " "LifecycleLock prevents two daemons from both binding the same " "IAI_MCP_STORE. Daemon #2 raises LifecycleLockConflict and exits " "1 before bind. End-to-end recovery from dup-binders cannot run " "because the dup-binders state is now impossible to construct." ) ) def test_doctor_apply_yes_recovers_from_dup_binders(isolated_daemon_paths): """H: end-to-end. 2 dup-binder daemons → cmd_doctor(apply=True, yes=True) drives the kill_dup_binders repair → re-check returns 0 OR exit 2 only if a non-related check (e.g., (a) state desync) FAILs. NB: spawning two real daemons against the same socket inevitably leaves daemon-state.json pointing at one of the two PIDs (whichever wrote last). After kill_dup_binders, if the survivor is the one daemon-state recorded, check_a passes; if the survivor is the OTHER daemon, check_a FAILs and the respawn action triggers, which (because the surviving daemon already binds the socket) yields a launchd-react-noop OR a benign respawn-timeout. The relevant assertion for THIS test is the dup-binder repair specifically: after recovery, lsof reports exactly 1 binder for our socket path. The overall rc and check_a status are looser assertions because they depend on the state-file-vs-survivor coincidence. """ from iai_mcp.doctor import ( _extract_binder_pids, check_g_no_dup_binders, cmd_doctor, ) sock_path, _, store_dir, _ = isolated_daemon_paths home = Path(os.environ["HOME"]) p1, p2 = _spawn_dup_daemons(sock_path, store_dir, home) try: # Sanity: dup-binder is detectable. pre = check_g_no_dup_binders() assert pre.passed is False, f"pre: dup-binder should FAIL; got {pre.detail!r}" args = argparse.Namespace(apply=True, yes=True) rc = cmd_doctor(args) # The critical observable: dup-binders cleared. post_check = check_g_no_dup_binders() assert post_check.passed is True, ( f"post-recovery: check_g should PASS; got {post_check.detail!r}" ) # rc may be 0 (everything green) or 2 (only check_a survived as FAIL # because state-file PID points at the killed survivor); both prove # the dup-binder repair mechanism worked. rc=1 would mean --apply # never ran the repair (regression). assert rc in (0, 2), ( f"cmd_doctor rc={rc} unexpected; allowed 0 (full recovery) or 2 " f"(dup-binders fixed but state-file desync persists)." ) # Belt-and-suspenders: lsof confirms exactly 1 binder remains. lsof_out = subprocess.run( ["lsof", "-U", "-F", "pn"], capture_output=True, text=True, timeout=5, check=False, ).stdout binders = _extract_binder_pids(lsof_out, sock_path) assert len(binders) <= 1, ( f"after recovery, expected ≤1 binder for {sock_path}; got {binders}" ) finally: for proc in (p1, p2): if proc.poll() is None: try: proc.send_signal(signal.SIGKILL) proc.wait(timeout=3) except (subprocess.TimeoutExpired, ProcessLookupError): pass