337 lines
13 KiB
Python
337 lines
13 KiB
Python
|
|
"""Plan 07-03 Wave 3 R5 daemon-side fail-loud + HIGH-3 yield acceptance tests.
|
||
|
|
|
||
|
|
R5 daemon-side semantics
|
||
|
|
------------------------
|
||
|
|
|
||
|
|
Killing the live daemon (`kill -9` or `kill -TERM`) mid-call MUST leave NO
|
||
|
|
orphan `iai_mcp.core` processes anywhere on the system (post-Phase-7 there
|
||
|
|
should be ZERO `iai_mcp.core` processes under any circumstance — the
|
||
|
|
singleton invariant), AND the next connect attempt to the socket MUST
|
||
|
|
surface as ECONNREFUSED or ENOENT (which Wave 4's `bridge.ts` will
|
||
|
|
translate to the wrapper-side `daemon_unreachable` rejection).
|
||
|
|
|
||
|
|
HIGH-3 yield acceptance (D7-09 LOCKED) -- REMOVED
-------------------------------------------------

The in-process C1 HUMAN-FIRST yield helper `_should_yield_to_mcp` deferred
REM cycles when EITHER `mcp_socket.active_connections > 0` OR
`(time.monotonic() - mcp_socket.last_activity_ts) < 30`. This file used to
exercise the helper directly with mocked `time.monotonic`, but both the
helper (Plan 10.6-01 Task 1.4) and its tests (Task 1.8) have been removed;
see the note at the end of this file. Only the R5 kill-daemon tests remain.
|
||
|
|
"""
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import asyncio
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import signal
|
||
|
|
import socket as sk
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import psutil
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Fixture: tmp socket path (mirrors test_socket_server_dispatch.py:short_socket_paths
|
||
|
|
# but does NOT redirect concurrency.SOCKET_PATH because the daemon subprocess
|
||
|
|
# reads IAI_DAEMON_SOCKET_PATH directly via SocketServer.serve()).
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.fixture
def short_socket_paths(tmp_path):
    """Provide ``(lock_path, sock_path, state_path)`` for a single test.

    The lock and state files are placed under pytest's ``tmp_path``.  The
    socket gets its own short ``/tmp/iai-fl-<pid>-<n>/`` directory instead:
    AF_UNIX on macOS caps socket paths at ~104 bytes and ``tmp_path`` can be
    too long under xdist.

    On teardown the socket file and its directory are removed on a
    best-effort basis; ``OSError`` during cleanup is ignored so that a
    cleanup hiccup never masks the real test outcome.
    """
    short_dir = Path(f"/tmp/iai-fl-{os.getpid()}-{id(tmp_path)}")
    short_dir.mkdir(parents=True, exist_ok=True)

    socket_file = short_dir / "d.sock"
    paths = (
        tmp_path / ".lock",
        socket_file,
        tmp_path / ".daemon-state.json",
    )

    try:
        yield paths
    finally:
        # Remove the socket first so the directory is empty for rmdir().
        try:
            if socket_file.exists():
                socket_file.unlink()
        except OSError:
            pass
        # rmdir() fails if anything else was left behind; that is tolerated.
        try:
            short_dir.rmdir()
        except OSError:
            pass
|
||
|
|
|
||
|
|
|
||
|
|
def _count_iai_mcp_processes() -> dict[str, int]:
    """Count running processes whose command line mentions iai_mcp modules.

    Returns a dict with two keys:

    * ``"core"``   -- processes whose joined command line contains
      ``iai_mcp.core``
    * ``"daemon"`` -- processes whose joined command line contains
      ``iai_mcp.daemon``

    Invariant the callers assert on: the ``iai_mcp.core`` count must not grow
    because of the daemon under test.  The daemon is the singleton; wrappers
    no longer spawn their own Python core processes (Wave 4 bridge.ts
    refactor).

    Processes that vanish or deny access while being inspected are skipped.
    """
    markers = {"core": "iai_mcp.core", "daemon": "iai_mcp.daemon"}
    tally = dict.fromkeys(markers, 0)

    for proc in psutil.process_iter(["cmdline"]):
        try:
            argv = proc.info.get("cmdline") or []
            if argv:
                # Individual argv entries may be None/empty; normalise them.
                command = " ".join(arg or "" for arg in argv)
                for label, marker in markers.items():
                    if marker in command:
                        tally[label] += 1
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue

    return tally
|
||
|
|
|
||
|
|
|
||
|
|
def _spawn_daemon_for_test(sock_path: Path, store_root: Path) -> subprocess.Popen:
    """Launch ``python -m iai_mcp.daemon`` against an isolated socket and store.

    The child inherits the current environment plus three overrides:

    * ``IAI_DAEMON_SOCKET_PATH`` -- ``sock_path``, so the subprocess never
      touches the user's real ``~/.iai-mcp/.daemon.sock``.
    * ``IAI_MCP_STORE`` -- ``store_root``, an isolated store directory.
    * ``IAI_DAEMON_IDLE_SHUTDOWN_SECS`` -- ``"99999"``, meant to keep idle
      shutdown from ending the daemon while the test is still running.

    stdout and stderr of the child are discarded.  The caller owns the
    returned ``Popen`` and is responsible for terminating it.
    """
    overrides = {
        "IAI_DAEMON_SOCKET_PATH": str(sock_path),
        "IAI_MCP_STORE": str(store_root),
        "IAI_DAEMON_IDLE_SHUTDOWN_SECS": "99999",
    }
    child_env = {**os.environ, **overrides}
    command = [sys.executable, "-m", "iai_mcp.daemon"]
    return subprocess.Popen(
        command,
        env=child_env,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
|
||
|
|
|
||
|
|
|
||
|
|
def _wait_for_socket(sock_path: Path, timeout_sec: float = 30.0) -> bool:
    """Poll until ``sock_path`` exists, checking at a 0.1 s cadence.

    Args:
        sock_path: Filesystem path the daemon is expected to create.
        timeout_sec: Maximum time to keep polling, in seconds.  A value of
            zero (or less) still performs a single immediate check.

    Returns:
        True as soon as the path exists, False if it still does not exist
        once the deadline has passed.

    Only path existence is checked; this does not prove a listener is
    accepting connections yet.
    """
    deadline = time.monotonic() + timeout_sec
    while True:
        # Check first so a zero timeout still sees an existing socket, and so
        # the path is tested one last time after the final sleep.
        if sock_path.exists():
            return True
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return False
        # Never sleep past the deadline.
        time.sleep(min(0.1, remaining))
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Test 1: kill -9 daemon mid-call → no orphan iai_mcp.core, ECONNREFUSED on retry
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
def test_kill_daemon_midcall_no_orphan_core_spawn(short_socket_paths, tmp_path):
    """R5/A8 daemon-side: SIGKILL the daemon and verify it spawned no iai_mcp.core.

    Wrapper-side semantics (Promise rejection with daemon_unreachable, single
    retry) live in mcp-wrapper/src/bridge.ts and are tested in Wave 4.

    The invariant is checked as a DELTA against a pre-spawn snapshot: the
    daemon under test must not add any ``iai_mcp.core`` process, neither
    while alive nor after a hard kill.  ``iai_mcp.core`` processes that
    already existed on the host (other MCP wrappers, live Claude Code
    sessions, ...) belong to the user's running stack and are filtered out
    by subtracting the baseline.

    Finally, a fresh connect to the socket path must fail.
    """
    sock_path = short_socket_paths[1]
    store_root = tmp_path / "store"
    store_root.mkdir(parents=True, exist_ok=True)

    # Snapshot taken BEFORE spawning: anything extra later on is our fault.
    baseline = _count_iai_mcp_processes()

    proc = _spawn_daemon_for_test(sock_path, store_root)
    try:
        assert _wait_for_socket(sock_path, timeout_sec=30), (
            "daemon never bound socket within 30s"
        )

        alive = _count_iai_mcp_processes()
        assert alive["daemon"] >= baseline["daemon"] + 1, (
            f"our daemon not visible in process list: baseline={baseline}, before={alive}"
        )
        # Pre-existing cores stay constant, so a non-zero delta is ours.
        before_delta = alive["core"] - baseline["core"]
        assert before_delta == 0, (
            f"our daemon spawned {before_delta} iai_mcp.core processes BEFORE kill "
            f"(baseline={baseline}, before={alive}) — post-Phase-7 singleton invariant violated"
        )

        # SIGKILL simulates hard daemon death (the threat R5 defends against).
        proc.send_signal(signal.SIGKILL)
        proc.wait(timeout=5)

        # Give psutil a moment to reflect the death in the next scan.
        time.sleep(0.5)

        post_kill = _count_iai_mcp_processes()
        # Any core present now must already have been in the baseline.
        after_delta = post_kill["core"] - baseline["core"]
        assert after_delta <= 0, (
            f"FAIL-LOUD VIOLATION: our daemon spawned {after_delta} new "
            f"iai_mcp.core processes after kill (baseline={baseline}, after={post_kill}) "
            "— R5 + A8 invariant: post-Phase-7 daemon must never spawn a core."
        )

        # A new connect attempt MUST fail.  Accepted outcomes, by exception
        # class name:
        #   - ConnectionRefusedError: socket file present, no listener bound
        #   - FileNotFoundError: socket file removed
        #   - OSError: platform-dependent generic refusal
        # Every one of these is an OSError, so a single handler suffices.
        probe = sk.socket(sk.AF_UNIX, sk.SOCK_STREAM)
        probe.settimeout(0.5)
        try:
            probe.connect(str(sock_path))
        except OSError as exc:
            outcome = type(exc).__name__
        else:
            outcome = "no_error"  # unexpected: the daemon should be gone
        finally:
            try:
                probe.close()
            except OSError:
                pass
        assert outcome in (
            "ConnectionRefusedError", "FileNotFoundError", "OSError",
        ), f"unexpected post-kill connect outcome: {outcome}"
    finally:
        # Make sure no daemon outlives the test, whatever happened above.
        if proc.poll() is None:
            proc.send_signal(signal.SIGKILL)
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                pass
        try:
            if sock_path.exists():
                sock_path.unlink()
        except OSError:
            pass
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Test 2: kill daemon during active connection → wrapper sees EOF on next read
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
|
||
|
|
def test_kill_daemon_during_active_connection(short_socket_paths, tmp_path):
    """R5: kill the daemon while a client holds an open socket.

    The client must observe the death as EOF (``b""``) or a connection-level
    ``OSError`` on its next read.  The Wave 4 bridge.ts will translate that
    into a ``daemon_unreachable`` rejection (which then triggers the
    single retry per D7-04).  This test only confirms the daemon-side
    surface: an open connection is broken cleanly when the daemon dies, with
    no half-open zombie socket.

    A read that merely times out is treated as a FAILURE: ``socket.timeout``
    is an ``OSError`` subclass, so it has to be handled before the generic
    ``OSError`` branch or a hanging socket would be mistaken for a clean
    break.
    """
    _, sock_path, _ = short_socket_paths
    store_root = tmp_path / "store"
    store_root.mkdir(parents=True, exist_ok=True)

    proc = _spawn_daemon_for_test(sock_path, store_root)
    try:
        assert _wait_for_socket(sock_path, timeout_sec=30), (
            "daemon never bound socket within 30s"
        )

        # Open a persistent connection.  The outer try/finally guarantees the
        # socket is closed even if an assertion or recv() below fails.
        s = sk.socket(sk.AF_UNIX, sk.SOCK_STREAM)
        eof_or_error = False
        try:
            s.settimeout(15)
            s.connect(str(sock_path))

            # Send a short control message first to confirm the connection
            # is live BEFORE the daemon is killed.
            msg = (json.dumps({"type": "status"}) + "\n").encode("utf-8")
            s.sendall(msg)

            # Read one newline-terminated response.
            first_response = b""
            while not first_response.endswith(b"\n"):
                chunk = s.recv(4096)
                if not chunk:
                    break
                first_response += chunk
            assert first_response, "daemon never replied to initial status"
            decoded = json.loads(first_response.decode("utf-8"))
            assert decoded.get("ok") is True, decoded

            # Kill the daemon HARD with the connection still open.
            proc.send_signal(signal.SIGKILL)
            proc.wait(timeout=5)

            # The next read must surface as EOF (b'') OR raise a
            # connection-level error.
            s.settimeout(2.0)
            try:
                chunk = s.recv(4096)
                if chunk == b"":
                    eof_or_error = True  # clean EOF
            except sk.timeout:
                # The read hung: exactly the silent-hang case this test
                # exists to catch, so leave eof_or_error False.
                pass
            except OSError:
                eof_or_error = True  # OS surfaced the death
        finally:
            try:
                s.close()
            except OSError:
                pass
        assert eof_or_error, (
            "daemon kill did not surface as EOF / OSError on open connection — "
            "wrapper-side daemon_unreachable translation would silently hang"
        )

        # A subsequent connect attempt must fail as well.  Every accepted
        # exception class is an OSError, so one handler covers them; the
        # class name is what gets asserted on.
        s2 = sk.socket(sk.AF_UNIX, sk.SOCK_STREAM)
        s2.settimeout(0.5)
        err_kind = None
        try:
            s2.connect(str(sock_path))
            err_kind = "no_error"
        except OSError as e:
            err_kind = type(e).__name__
        finally:
            try:
                s2.close()
            except OSError:
                pass
        assert err_kind in (
            "ConnectionRefusedError", "FileNotFoundError", "OSError",
        ), f"unexpected post-kill connect outcome: {err_kind}"
    finally:
        # Make sure no daemon outlives the test, whatever happened above.
        if proc.poll() is None:
            proc.send_signal(signal.SIGKILL)
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                pass
        try:
            if sock_path.exists():
                sock_path.unlink()
        except OSError:
            pass
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# Plan 10.6-01 Task 1.8: REMOVED the HIGH-3 yield-acceptance
|
||
|
|
# tests (test_scheduler_yields_to_mcp_within_35s and
|
||
|
|
# test_should_yield_called_in_loop_returns_true_every_5s).
|
||
|
|
#
|
||
|
|
# The D7-09 in-process C1 HUMAN-FIRST yield helper
|
||
|
|
# `_should_yield_to_mcp(socket)` was removed in Task 1.4. The lifecycle
|
||
|
|
# state machine + sleep_pipeline + heartbeat scanner supersede this
|
||
|
|
# design: SLEEP-state coexistence with active MCP traffic is provided
|
||
|
|
# by the bounded-deferral interrupt_check inside lifecycle_tick (each
|
||
|
|
# sleep_pipeline chunk re-checks `mcp_socket.active_connections > 0
|
||
|
|
# OR (now - last_activity_ts) < 30s` and defers if true).
|
||
|
|
#
|
||
|
|
# The kill-daemon-midcall tests above (test 1, test 2) cover the R5
|
||
|
|
# fail-loud contract and stay green; they do not reference the
|
||
|
|
# removed yield helper.
|
||
|
|
# ---------------------------------------------------------------------------
|