Initial release: iai-mcp v0.1.0
Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
This commit is contained in:
commit
f6b876fbe7
332 changed files with 97258 additions and 0 deletions
622
tests/test_doctor_multi_binder.py
Normal file
622
tests/test_doctor_multi_binder.py
Normal file
|
|
@ -0,0 +1,622 @@
|
|||
"""Phase 7.1 R6 / D7.1-05 — doctor.py multi-binder detection + repair.
|
||||
|
||||
Test matrix (8 tests):
|
||||
A. _extract_binder_pids parses lsof -F pn output → set[int]
|
||||
B. _extract_binder_pids skips PIDs bound to UNRELATED sockets
|
||||
C. _extract_binder_pids handles empty input → empty set
|
||||
D. check_g_no_dup_binders skips when socket file absent (PASS-with-skip)
|
||||
E. check_g_no_dup_binders PASSes with single binder (multiprocessing worker)
|
||||
F. check_g_no_dup_binders FAILs with two binders (regression-trap centerpiece)
|
||||
G. _kill_dup_binders keeps oldest, kills the rest (real subprocess daemons)
|
||||
H. iai-mcp doctor --apply --yes recovers from dup-binder scenario (e2e)
|
||||
|
||||
A-D: pure unit tests, no daemon, fast (<1s combined).
|
||||
E-F: in-process multiprocessing workers — distinct PIDs, lsof-visible.
|
||||
G-H: real iai_mcp.daemon subprocesses — required because _kill_dup_binders
|
||||
filters by 'iai_mcp.daemon' substring in psutil cmdline (wrong-PID-kill
|
||||
mitigation). Isolated by HIGH-4 LOCK env propagation pattern from
|
||||
test_doctor_apply_recovery.py:isolated_daemon_paths.
|
||||
|
||||
Skip on non-POSIX (AF_UNIX requirement).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import platform
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import psutil
|
||||
import pytest
|
||||
|
||||
|
||||
# Module-wide marker: every test in this file is skipped on Windows.
pytestmark = pytest.mark.skipif(
    platform.system() == "Windows",
    reason="POSIX AF_UNIX required (lsof -U + multiprocessing socket binders)",
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Section 1 — pure unit tests for _extract_binder_pids (A, B, C)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_extract_binder_pids_parses_lsof_output():
    """A: a hand-written ``lsof -F pn`` dump yields the expected PID set.

    In ``-F pn`` output a ``p<pid>`` line opens a process record; the
    ``n<filename>`` lines that follow (zero or more) belong to that PID
    until the next ``p<pid>`` line.
    """
    from iai_mcp.doctor import _extract_binder_pids

    target = Path("/tmp/iai-test/d.sock")

    # (pid, socket name) pairs: two processes hold the target path, one
    # holds an unrelated socket and must be ignored.
    records = [
        (12345, str(target)),
        (67890, str(target)),
        (99999, "/tmp/other-app/socket"),
    ]
    lines = []
    for pid, name in records:
        lines.append(f"p{pid}")
        lines.append(f"n{name}")
    lsof_output = "\n".join(lines)

    pids = _extract_binder_pids(lsof_output, target)

    assert pids == {12345, 67890}, f"expected {{12345, 67890}}, got {pids}"
|
||||
|
||||
|
||||
def test_extract_binder_pids_skips_unrelated_sockets():
    """B: with several sockets listed, only PIDs holding OUR path come back."""
    from iai_mcp.doctor import _extract_binder_pids

    target = Path("/tmp/iai-test/d.sock")

    # Each entry is (pid, names reported for that pid).
    records = [
        ("1001", ["/var/run/some-other-daemon.sock"]),
        ("2002", [str(target)]),
        ("3003", ["/tmp/X11-unix/X0"]),
        # PID 4004 holds multiple fds: the target plus an extra name.
        ("4004", [str(target), "/some/extra/name/for/p4004"]),
    ]
    lines = []
    for pid, names in records:
        lines.append(f"p{pid}")
        lines.extend(f"n{name}" for name in names)
    lsof_output = "\n".join(lines)

    pids = _extract_binder_pids(lsof_output, target)

    assert pids == {2002, 4004}, f"expected {{2002, 4004}}, got {pids}"
|
||||
|
||||
|
||||
def test_extract_binder_pids_handles_empty_output():
    """C: empty or degenerate input yields an empty set (defensive corner case)."""
    from iai_mcp.doctor import _extract_binder_pids

    target = Path("/tmp/anywhere.sock")
    degenerate_inputs = (
        # Nothing at all.
        "",
        # Only blank lines.
        "\n\n\n",
        # Malformed: a PID line with no name line after it, a line with an
        # unknown field prefix, and a bare `p` with no digits.
        "p123\nXgarbage\np\n",
    )
    for raw in degenerate_inputs:
        assert _extract_binder_pids(raw, target) == set()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Section 2 — check_g_no_dup_binders (D, E, F) using monkeypatched socket path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
def short_socket_path(tmp_path, monkeypatch):
    """Yield a short socket path under /tmp (AF_UNIX 104-byte cap on macOS).

    Exports the path through the IAI_DAEMON_SOCKET_PATH env override that
    doctor._resolve_socket_path consults. On teardown the socket file and
    its directory are removed on a best-effort basis.
    """
    # tmp_path is used only to make the directory name unique per test.
    base_dir = Path(f"/tmp/iai-mb-{os.getpid()}-{id(tmp_path)}")
    base_dir.mkdir(parents=True, exist_ok=True)
    sock_path = base_dir / "d.sock"
    monkeypatch.setenv("IAI_DAEMON_SOCKET_PATH", str(sock_path))

    def _remove_socket_file():
        if sock_path.exists():
            sock_path.unlink()

    try:
        yield sock_path
    finally:
        # Each cleanup step is independent; an OSError in one must not stop
        # the other or fail the test.
        for cleanup_step in (_remove_socket_file, base_dir.rmdir):
            try:
                cleanup_step()
            except OSError:
                pass
|
||||
|
||||
|
||||
def test_check_g_no_socket_skips(short_socket_path, monkeypatch):
    """D: socket file absent → PASS-with-skip detail "no socket file (skip)".

    Same skip pattern as check_d_no_orphan_core when the resource is not
    present, so a clean machine produces no false positive.
    """
    from iai_mcp.doctor import check_g_no_dup_binders

    # The fixture only exported the env var; nothing has bound the path, so
    # the file must not exist.
    assert not short_socket_path.exists()

    outcome = check_g_no_dup_binders()

    assert outcome.passed is True
    assert "no socket file" in outcome.detail
|
||||
|
||||
|
||||
# --- Multiprocessing worker for Tests E and F (distinct PIDs) ---------------
|
||||
|
||||
|
||||
def _bind_socket_worker(sock_path_str: str, ready_event: mp.Event, exit_event: mp.Event) -> None:
    """Subprocess worker: bind an AF_UNIX socket, signal ready, then block.

    Each multiprocessing.Process child has a distinct PID and lsof reports
    its socket fd. Used by Tests E (1 binder) and F (2 binders) to construct
    deterministic dup-binder scenarios without a real iai_mcp.daemon (whose
    boot cost is ~3-10s).

    Args:
        sock_path_str: Filesystem path to bind the listening socket to.
        ready_event: Set once bind() and listen() have both succeeded.
        exit_event: The worker blocks on this (for at most 30 seconds) and
            closes its socket when it is set or the wait times out.

    Raises:
        OSError: If bind() or listen() fails (e.g. the path is already in
            use). ready_event is not set in that case.
    """
    # The context manager closes the socket on every exit path, replacing the
    # manual try/finally + close().
    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as listener:
        # Each worker handles its own bind; for the 2-binder scenario, the
        # parent unlinks the path between worker spawns so each worker
        # successfully bind()s a fresh inode at the same name.
        listener.bind(sock_path_str)
        listener.listen(5)
        ready_event.set()
        # Block until parent signals shutdown.
        exit_event.wait(timeout=30)
|
||||
|
||||
|
||||
def test_check_g_single_binder_passes(short_socket_path):
    """E: ONE binder bound to the socket → check_g returns PASS with "1 binder(s)".

    The binder is a multiprocessing.Process worker, so it has a PID distinct
    from the pytest process and lsof has something to enumerate.
    """
    from iai_mcp.doctor import check_g_no_dup_binders

    # NOTE: use 'spawn' (not 'fork') even on Darwin — lancedb is not fork-safe
    # (UserWarning surfaces with fork on macOS). Workers don't touch lancedb,
    # but the parent test process has it imported transitively; spawn isolates.
    spawn_ctx = mp.get_context("spawn")
    bound = spawn_ctx.Event()
    release = spawn_ctx.Event()
    binder = spawn_ctx.Process(
        target=_bind_socket_worker,
        args=(str(short_socket_path), bound, release),
    )
    binder.start()
    try:
        became_ready = bound.wait(timeout=10)
        assert became_ready, "binder worker never signaled ready"
        # Brief settle before querying lsof through check_g.
        time.sleep(0.2)

        outcome = check_g_no_dup_binders()

        assert outcome.passed is True, (
            f"single-binder scenario should PASS; got detail={outcome.detail!r}"
        )
        assert "1 binder" in outcome.detail, f"unexpected detail: {outcome.detail!r}"
    finally:
        # Ask the worker to exit; escalate to terminate() if it lingers.
        release.set()
        binder.join(timeout=5)
        if binder.is_alive():
            binder.terminate()
            binder.join(timeout=2)
|
||||
|
||||
|
||||
def test_check_g_two_binders_fails(short_socket_path):
    """F: TWO binders bound to the same socket path → check_g returns FAIL.

    REGRESSION-TRAP CENTERPIECE. Two multiprocessing workers bind the same
    socket path, with an unlink between them so both bind() calls succeed at
    the OS level. lsof reports both PIDs as holding the path; check_g detects
    the singleton-invariant violation.

    This is exactly the failure mode Phase 7.1's launchd architecture
    structurally prevents in production — the test bypasses launchd by
    hand-binding sockets in worker processes. On post-Phase 7.1 production,
    this scenario can only occur if a user manually bypasses launchd.
    """
    from iai_mcp.doctor import _extract_binder_pids, check_g_no_dup_binders

    # NOTE: use 'spawn' (not 'fork') even on Darwin — lancedb is not fork-safe
    # (UserWarning surfaces with fork on macOS). Workers don't touch lancedb,
    # but the parent test process has it imported transitively; spawn isolates.
    spawn_ctx = mp.get_context("spawn")

    # First binder.
    first_ready = spawn_ctx.Event()
    first_exit = spawn_ctx.Event()
    first = spawn_ctx.Process(
        target=_bind_socket_worker,
        args=(str(short_socket_path), first_ready, first_exit),
    )
    first.start()

    # Second binder — race-window simulation: the path is unlinked so the
    # second bind() creates a fresh inode at the same name. The first worker's
    # fd still holds the ORIGINAL inode (unlinked but kept alive by the open
    # fd); the second holds the NEW inode at the same path. lsof reports both.
    second_ready = spawn_ctx.Event()
    second_exit = spawn_ctx.Event()
    second = None
    try:
        assert first_ready.wait(timeout=10), "worker 1 never signaled ready"
        # Unlink so the second bind doesn't EADDRINUSE.
        try:
            short_socket_path.unlink()
        except OSError:
            pass
        second = spawn_ctx.Process(
            target=_bind_socket_worker,
            args=(str(short_socket_path), second_ready, second_exit),
        )
        second.start()
        assert second_ready.wait(timeout=10), "worker 2 never signaled ready"
        time.sleep(0.3)  # let lsof catch up

        # Belt-and-suspenders: confirm via the parser directly that lsof sees both.
        lsof_run = subprocess.run(
            ["lsof", "-U", "-F", "pn"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
        binder_pids = _extract_binder_pids(lsof_run.stdout, short_socket_path)
        assert {first.pid, second.pid}.issubset(binder_pids), (
            f"lsof should report both worker PIDs as binders; got {binder_pids} "
            f"(workers: {first.pid}, {second.pid})"
        )

        # Centerpiece assertion: check_g detects the dup-binder scenario.
        outcome = check_g_no_dup_binders()

        assert outcome.passed is False, (
            f"two-binder scenario should FAIL; got detail={outcome.detail!r}"
        )
        # Detail mentions both PIDs.
        for worker_pid in (first.pid, second.pid):
            assert str(worker_pid) in outcome.detail, (
                f"detail missing PID {worker_pid}: {outcome.detail!r}"
            )
    finally:
        # Release whichever workers were started, then reap them.
        first_exit.set()
        if second is not None:
            second_exit.set()
        started = [proc for proc in (first, second) if proc is not None]
        for proc in started:
            proc.join(timeout=5)
            if proc.is_alive():
                proc.terminate()
                proc.join(timeout=2)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Section 3 — _kill_dup_binders + e2e doctor --apply (G, H)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
def isolated_daemon_paths(tmp_path, monkeypatch):
    """HOME + socket + store + crypto env propagation for real-daemon tests.

    Mirrors test_doctor_apply_recovery.py:isolated_daemon_paths verbatim
    (HIGH-4 LOCK precedent, Plan 07-04). Required because _kill_dup_binders
    filters by 'iai_mcp.daemon' substring in psutil cmdline — only real
    iai_mcp.daemon subprocesses are killable, so multiprocessing workers
    cannot serve Tests G/H.

    Yields:
        A ``(sock_path, state_path, store_dir, lock_path)`` tuple.
    """
    iai_home = tmp_path / ".iai-mcp"
    iai_home.mkdir(parents=True, exist_ok=True)

    state_path = iai_home / ".daemon-state.json"
    lock_path = iai_home / ".lock"
    store_dir = iai_home / "store"
    store_dir.mkdir(parents=True, exist_ok=True)

    # The socket lives in a short /tmp directory rather than under tmp_path.
    sock_dir = Path(f"/tmp/iai-mb2-{os.getpid()}-{id(tmp_path)}")
    sock_dir.mkdir(parents=True, exist_ok=True)
    sock_path = sock_dir / "d.sock"

    # Must be resolved BEFORE HOME is repointed below, otherwise Path.home()
    # would already return tmp_path.
    real_hf_home = Path.home() / ".cache" / "huggingface"

    env_overrides = {
        "HOME": str(tmp_path),
        "HF_HOME": str(real_hf_home),
        "IAI_DAEMON_SOCKET_PATH": str(sock_path),
        "IAI_MCP_STORE": str(store_dir),
        "IAI_DAEMON_IDLE_SHUTDOWN_SECS": "99999",
        "PYTHON_KEYRING_BACKEND": "keyring.backends.fail.Keyring",
        "IAI_MCP_CRYPTO_PASSPHRASE": "test-mb-passphrase",
    }
    for env_name, env_value in env_overrides.items():
        monkeypatch.setenv(env_name, env_value)

    import keyring.core

    # NOTE(review): presumably clears keyring's cached backend so the
    # PYTHON_KEYRING_BACKEND override above takes effect — confirm.
    keyring.core._keyring_backend = None

    from iai_mcp import cli, daemon_state

    monkeypatch.setattr(daemon_state, "STATE_PATH", state_path)
    monkeypatch.setattr(cli, "LOCK_PATH", lock_path)
    monkeypatch.setattr(cli, "SOCKET_PATH", sock_path)

    try:
        yield sock_path, state_path, store_dir, lock_path
    finally:
        # Stop any daemons the test left behind before removing their socket.
        _kill_test_daemons(sock_path)
        try:
            if sock_path.exists():
                sock_path.unlink()
        except OSError:
            pass
        try:
            sock_dir.rmdir()
        except OSError:
            pass
        keyring.core._keyring_backend = None
|
||||
|
||||
|
||||
def _spawn_daemon(sock_path: Path, store_dir: Path, home: Path) -> subprocess.Popen:
    """Spawn `python -m iai_mcp.daemon` with the test's env propagated.

    The child inherits the current environment with the test-specific
    overrides applied on top; its stdout and stderr are discarded.
    """
    overrides = {
        "HOME": str(home),
        "IAI_DAEMON_SOCKET_PATH": str(sock_path),
        "IAI_MCP_STORE": str(store_dir),
        "IAI_DAEMON_IDLE_SHUTDOWN_SECS": "99999",
        "PYTHON_KEYRING_BACKEND": "keyring.backends.fail.Keyring",
        "IAI_MCP_CRYPTO_PASSPHRASE": "test-mb-passphrase",
    }
    child_env = {**os.environ, **overrides}
    command = [sys.executable, "-m", "iai_mcp.daemon"]
    return subprocess.Popen(
        command,
        env=child_env,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
|
||||
|
||||
|
||||
def _wait_for_socket(sock_path: Path, timeout_sec: float = 30.0) -> bool:
    """Poll until ``sock_path`` exists or ``timeout_sec`` elapses.

    Args:
        sock_path: Filesystem path expected to appear.
        timeout_sec: Maximum time to keep polling, in seconds. A value of
            zero or less still performs a single existence check.

    Returns:
        True if the path exists by the time polling ends, False otherwise.
    """
    deadline = time.monotonic() + timeout_sec
    while time.monotonic() < deadline:
        if sock_path.exists():
            return True
        time.sleep(0.1)
    # Final probe: covers a non-positive timeout (the loop body never ran) and
    # a path that appeared during the last sleep before the deadline.
    return sock_path.exists()
|
||||
|
||||
|
||||
def _kill_test_daemons(sock_path: Path) -> None:
    """Match-by-env cleanup for daemons started by these tests.

    Sends SIGTERM to every process whose command line contains
    'iai_mcp.daemon' AND whose environment carries our
    IAI_DAEMON_SOCKET_PATH value, escalating to SIGKILL if it has not exited
    within 3 seconds. Matching on the env value avoids touching the user's
    real production daemon.
    """
    wanted_socket = str(sock_path)
    for proc in psutil.process_iter(["pid", "cmdline"]):
        try:
            cmdline = " ".join(proc.info.get("cmdline") or [])
            if "iai_mcp.daemon" not in cmdline:
                continue

            try:
                proc_env = proc.environ()
            except (psutil.AccessDenied, psutil.NoSuchProcess):
                continue

            # Not one of ours: leave it alone.
            if proc_env.get("IAI_DAEMON_SOCKET_PATH") != wanted_socket:
                continue

            try:
                proc.send_signal(signal.SIGTERM)
                proc.wait(timeout=3)
            except (psutil.NoSuchProcess, psutil.TimeoutExpired):
                # Either already gone or ignoring SIGTERM: force it.
                try:
                    proc.send_signal(signal.SIGKILL)
                except psutil.NoSuchProcess:
                    pass
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
|
||||
|
||||
|
||||
def _spawn_dup_daemons(
    sock_path: Path, store_dir: Path, home: Path
) -> tuple[subprocess.Popen, subprocess.Popen]:
    """Spawn 2 real iai_mcp.daemon subprocesses both bound to sock_path.

    Race-window simulation per CONTEXT.md hint: spawn daemon #1, wait for
    socket, unlink (so daemon #2 can bind a fresh inode at the same path),
    spawn daemon #2, wait for socket. Daemon #1's listening fd still holds
    the original (now unlinked) inode; daemon #2 holds the new inode. lsof
    reports both PIDs as binders of the same path.

    Raises:
        AssertionError: If either daemon's socket does not appear within 30s.
            Daemons already started are killed before raising.
    """

    def _kill_quietly(proc: subprocess.Popen) -> None:
        # Best-effort kill; the process may already be gone.
        try:
            proc.kill()
        except ProcessLookupError:
            pass

    first = _spawn_daemon(sock_path, store_dir, home)
    if not _wait_for_socket(sock_path, timeout_sec=30):
        _kill_quietly(first)
        raise AssertionError("daemon #1 never bound socket within 30s")

    # Race-window: unlink so daemon #2's bind() succeeds without EADDRINUSE.
    try:
        sock_path.unlink()
    except OSError:
        pass

    second = _spawn_daemon(sock_path, store_dir, home)
    if not _wait_for_socket(sock_path, timeout_sec=30):
        for proc in (second, first):
            _kill_quietly(proc)
        raise AssertionError("daemon #2 never bound socket within 30s")

    # Settle so lsof reflects both binders.
    time.sleep(0.5)
    return first, second
|
||||
|
||||
|
||||
@pytest.mark.skip(
    reason=(
        "Phase 10.6 Plan 10.6-01 Task 1.5: single-machine "
        "LifecycleLock prevents two daemons from both binding the same "
        "IAI_MCP_STORE. Daemon #2 raises LifecycleLockConflict and exits "
        "1 before bind. The dup-binder integration scenario is now "
        "impossible by design. The unit tests in this file "
        "(test_extract_binder_pids_*, test_check_g_*) still cover "
        "check_g's detection logic without spawning two real daemons."
    )
)
def test_kill_dup_binders_keeps_oldest(isolated_daemon_paths):
    """G: 2 real daemons → _kill_dup_binders kills younger, keeps oldest.

    Re-running check_g afterward returns PASS (1 binder remaining).
    """
    from iai_mcp.doctor import (
        _extract_binder_pids,
        _kill_dup_binders,
        check_g_no_dup_binders,
    )

    sock_path, _, store_dir, _ = isolated_daemon_paths
    home = Path(os.environ["HOME"])

    p1, p2 = _spawn_dup_daemons(sock_path, store_dir, home)
    try:
        # Pre-condition: both daemons must show up as binders for our socket.
        lsof_run = subprocess.run(
            ["lsof", "-U", "-F", "pn"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
        binders = _extract_binder_pids(lsof_run.stdout, sock_path)
        assert {p1.pid, p2.pid}.issubset(binders), (
            f"expected both daemon PIDs in binders; got {binders} "
            f"(daemons: {p1.pid}, {p2.pid})"
        )
        pre_check = check_g_no_dup_binders()
        assert pre_check.passed is False, (
            f"pre-condition: dup-binder scenario should FAIL check_g; "
            f"got {pre_check.detail!r}"
        )

        # Kill the younger daemon. p1 was spawned first → has greater etime →
        # is the keep_pid; p2 should be killed.
        ok, msg, ms = _kill_dup_binders()

        assert ok is True, f"_kill_dup_binders returned ok=False: {msg}"
        for expected_fragment in ("kept PID", "killed"):
            assert expected_fragment in msg, (
                f"msg missing {expected_fragment!r}: {msg!r}"
            )
        assert ms < 10_000, f"_kill_dup_binders took {ms}ms (>10s); too slow"

        # After kill, a follow-up check_g should report 1 (or 0 — race) binder.
        post_check = check_g_no_dup_binders()
        assert post_check.passed is True, (
            f"post-kill check_g should PASS; got {post_check.detail!r}"
        )

        # The kept daemon (p1) should still be alive; the other should be dead
        # within a generous timeout (kill is SIGKILL, instant on macOS).
        assert p1.poll() is None, "expected oldest daemon (p1) to survive"
        # Allow up to 5s for SIGKILL signal delivery + reap.
        deadline = time.monotonic() + 5.0
        while time.monotonic() < deadline and p2.poll() is None:
            time.sleep(0.1)
        assert p2.poll() is not None, "expected younger daemon (p2) to be dead"
    finally:
        # Force-kill whichever daemons are still running.
        for proc in (p1, p2):
            if proc.poll() is not None:
                continue
            try:
                proc.send_signal(signal.SIGKILL)
                proc.wait(timeout=3)
            except (subprocess.TimeoutExpired, ProcessLookupError):
                pass
|
||||
|
||||
|
||||
@pytest.mark.skip(
    reason=(
        "Phase 10.6 Plan 10.6-01 Task 1.5: single-machine "
        "LifecycleLock prevents two daemons from both binding the same "
        "IAI_MCP_STORE. Daemon #2 raises LifecycleLockConflict and exits "
        "1 before bind. End-to-end recovery from dup-binders cannot run "
        "because the dup-binders state is now impossible to construct."
    )
)
def test_doctor_apply_yes_recovers_from_dup_binders(isolated_daemon_paths):
    """H: end-to-end. 2 dup-binder daemons → cmd_doctor(apply=True, yes=True)
    drives the kill_dup_binders repair → the re-check returns 0, or exit 2
    only if an unrelated check (e.g., (a) state desync) FAILs.

    NB: spawning two real daemons against the same socket inevitably leaves
    daemon-state.json pointing at one of the two PIDs (whichever wrote last).
    After kill_dup_binders, if the survivor is the one daemon-state recorded,
    check_a passes; if the survivor is the OTHER daemon, check_a FAILs and the
    respawn action triggers, which (because the surviving daemon already binds
    the socket) yields a launchd-react-noop OR a benign respawn-timeout. The
    relevant assertion for THIS test is the dup-binder repair specifically:
    after recovery, lsof reports at most 1 binder for our socket path. The
    overall rc and check_a status are looser assertions because they depend
    on the state-file-vs-survivor coincidence.
    """
    from iai_mcp.doctor import (
        _extract_binder_pids,
        check_g_no_dup_binders,
        cmd_doctor,
    )

    sock_path, _, store_dir, _ = isolated_daemon_paths
    home = Path(os.environ["HOME"])

    p1, p2 = _spawn_dup_daemons(sock_path, store_dir, home)
    try:
        # Sanity: dup-binder is detectable.
        pre = check_g_no_dup_binders()
        assert pre.passed is False, f"pre: dup-binder should FAIL; got {pre.detail!r}"

        doctor_args = argparse.Namespace(apply=True, yes=True)
        rc = cmd_doctor(doctor_args)

        # The critical observable: dup-binders cleared.
        post_check = check_g_no_dup_binders()
        assert post_check.passed is True, (
            f"post-recovery: check_g should PASS; got {post_check.detail!r}"
        )
        # rc may be 0 (everything green) or 2 (only check_a still FAILs
        # because the state-file PID points at the daemon that was killed);
        # both prove the dup-binder repair mechanism worked. rc=1 would mean
        # --apply never ran the repair (regression).
        assert rc in (0, 2), (
            f"cmd_doctor rc={rc} unexpected; allowed 0 (full recovery) or 2 "
            f"(dup-binders fixed but state-file desync persists)."
        )

        # Belt-and-suspenders: lsof confirms at most 1 binder remains.
        lsof_run = subprocess.run(
            ["lsof", "-U", "-F", "pn"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
        binders = _extract_binder_pids(lsof_run.stdout, sock_path)
        assert len(binders) <= 1, (
            f"after recovery, expected ≤1 binder for {sock_path}; got {binders}"
        )
    finally:
        # Force-kill whichever daemons are still running.
        for proc in (p1, p2):
            if proc.poll() is not None:
                continue
            try:
                proc.send_signal(signal.SIGKILL)
                proc.wait(timeout=3)
            except (subprocess.TimeoutExpired, ProcessLookupError):
                pass
|
||||
Loading…
Add table
Add a link
Reference in a new issue