Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
333 lines
12 KiB
Python
333 lines
12 KiB
Python
"""Phase 10.4 L4 — daemon-side heartbeat scanner (per-wrapper, PID-scoped).
|
|
|
|
Reads ``~/.iai-mcp/wrappers/heartbeat-<pid>-<uuid>.json`` files written by
|
|
each MCP wrapper instance, validates freshness (``now - last_refresh <= M``)
|
|
AND PID liveness (``os.kill(pid, 0)``), and aggregates presence so the daemon's
|
|
state machine can decide WAKE vs BEDTIME.
|
|
|
|
Constraints (carried from CONTEXT 10.4):
|
|
- Idle CPU near zero — scanner runs on lifecycle TICK (every 30s), not faster.
|
|
- Scanner code is reentrant: ``scan()`` MUST be safe to call concurrently with
|
|
a wrapper writing a heartbeat file (atomic rename pattern + JSON-parse-fail
|
|
fallback to file mtime).
|
|
- No new third-party dependencies — stdlib only.
|
|
- macOS-only PID semantics carried through (Linux subset works the same; only
|
|
Windows is unsupported, which matches the phase's macOS-only stance).
|
|
- This module is STANDALONE — daemon main-loop integration lands in Phase 10.5.
|
|
|
|
Heartbeat file schema (written by wrapper, read here)::
|
|
|
|
{
|
|
"pid": 12345,
|
|
"uuid": "01HZQ...",
|
|
"started_at": "2026-05-02T15:00:00Z",
|
|
"last_refresh": "2026-05-02T15:14:30Z",
|
|
"wrapper_version": "1.0.0",
|
|
"schema_version": 1
|
|
}
|
|
|
|
Status semantics:
|
|
- FRESH: ``last_refresh`` within ``M`` seconds AND PID alive.
|
|
- STALE: ``last_refresh`` older than ``M`` seconds (regardless of PID).
|
|
- ORPHAN: PID is dead (``ProcessLookupError`` from ``kill(pid, 0)``) and the
|
|
file's freshness window has not yet expired. Treated as not-active.
|
|
|
|
A file that fails JSON parse falls back to its filesystem mtime so a torn
|
|
half-written write does not silently mask presence.
|
|
|
|
Validates: WAKE-07.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timezone
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
|
|
|
|
# Module-level constants -------------------------------------------------------
|
|
|
|
#: Default refresh staleness threshold (seconds). A heartbeat older than this
|
|
#: is STALE regardless of PID liveness. The wrapper SHOULD refresh every
|
|
#: ``REFRESH_INTERVAL_SEC`` — three missed refreshes (~90 s)
|
|
#: trip staleness.
|
|
DEFAULT_STALE_THRESHOLD_SEC = 90
|
|
|
|
#: Window for the "no fresh activity in last 30 minutes" predicate consumed
|
|
#: by the L6 ``IdleDetector.sleep_eligible`` rule.
|
|
IDLE_WINDOW_SEC = 30 * 60
|
|
|
|
#: Filename glob used to enumerate heartbeat files. Matches the
|
|
#: ``heartbeat-<pid>-<uuid>.json`` convention from CONTEXT 10.4.
|
|
_HEARTBEAT_GLOB = "heartbeat-*.json"
|
|
|
|
|
|
class HeartbeatStatus(Enum):
|
|
"""Tri-state classification of a single heartbeat file."""
|
|
|
|
FRESH = "fresh"
|
|
STALE = "stale"
|
|
ORPHAN = "orphan"
|
|
|
|
|
|
@dataclass
|
|
class HeartbeatEntry:
|
|
"""One scanned heartbeat file with its derived status.
|
|
|
|
Attributes:
|
|
path: Absolute path of the heartbeat file on disk.
|
|
pid: Wrapper PID parsed from the file's payload.
|
|
uuid: Wrapper UUID parsed from the file's payload (used as a stable
|
|
tie-breaker when the same PID is reused after wrapper restart).
|
|
last_refresh: Timezone-aware UTC datetime parsed from
|
|
``last_refresh``; falls back to file mtime if JSON parse fails.
|
|
status: One of ``HeartbeatStatus.{FRESH, STALE, ORPHAN}``.
|
|
"""
|
|
|
|
path: Path
|
|
pid: int
|
|
uuid: str
|
|
last_refresh: datetime
|
|
status: HeartbeatStatus
|
|
|
|
|
|
# PID liveness ----------------------------------------------------------------
|
|
|
|
|
|
def _is_pid_alive(pid: int) -> bool:
|
|
"""Return True iff ``pid`` exists in the kernel's process table.
|
|
|
|
Uses the ``kill(pid, 0)`` POSIX trick — sends no signal but raises
|
|
``ProcessLookupError`` (ESRCH) when the PID has been reaped. A
|
|
``PermissionError`` (EPERM) means the process exists but the current
|
|
user cannot signal it — for liveness purposes we count that as alive.
|
|
A negative or zero ``pid`` is treated as dead (those values would map
|
|
to ``kill(self_pgrp, 0)`` semantics which is not what we want).
|
|
"""
|
|
if pid <= 0:
|
|
return False
|
|
try:
|
|
os.kill(pid, 0)
|
|
except ProcessLookupError:
|
|
return False
|
|
except PermissionError:
|
|
return True
|
|
return True
|
|
|
|
|
|
# Atomic-read-with-mtime-fallback helper --------------------------------------
|
|
|
|
|
|
def _parse_heartbeat_file(path: Path) -> tuple[int, str, datetime] | None:
|
|
"""Best-effort parse of a single heartbeat file.
|
|
|
|
Returns ``(pid, uuid, last_refresh_utc)`` on success or ``None`` if the
|
|
file disappeared mid-read (race with wrapper rotation) or its content
|
|
cannot be coerced into the minimum schema.
|
|
|
|
A JSON-parse failure falls back to the file's mtime so that a torn
|
|
write produced by a wrapper crash mid-rename is treated as STALE-on-
|
|
age rather than silently dropped — matches the "reentrant + safe under
|
|
concurrent writers" requirement in PLAN 10.4-01 Task 1.1.
|
|
"""
|
|
try:
|
|
raw = path.read_text(encoding="utf-8")
|
|
except FileNotFoundError:
|
|
return None
|
|
except OSError:
|
|
return None
|
|
|
|
try:
|
|
payload = json.loads(raw)
|
|
except json.JSONDecodeError:
|
|
# Torn write — fall back to filename PID + filesystem mtime so we
|
|
# at least get a STALE classification rather than dropping the file.
|
|
return _fallback_parse_from_filename(path)
|
|
|
|
pid = payload.get("pid")
|
|
uuid_str = payload.get("uuid", "")
|
|
last_refresh_raw = payload.get("last_refresh")
|
|
|
|
if not isinstance(pid, int) or not isinstance(uuid_str, str):
|
|
return _fallback_parse_from_filename(path)
|
|
if not isinstance(last_refresh_raw, str):
|
|
return _fallback_parse_from_filename(path)
|
|
|
|
try:
|
|
# ``2026-05-02T15:14:30Z`` — Python 3.11+ accepts the trailing Z;
|
|
# for safety we normalize to ``+00:00`` for older 3.10 compatibility.
|
|
normalized = last_refresh_raw.replace("Z", "+00:00")
|
|
last_refresh = datetime.fromisoformat(normalized)
|
|
except ValueError:
|
|
return _fallback_parse_from_filename(path)
|
|
|
|
if last_refresh.tzinfo is None:
|
|
last_refresh = last_refresh.replace(tzinfo=timezone.utc)
|
|
else:
|
|
last_refresh = last_refresh.astimezone(timezone.utc)
|
|
|
|
return pid, uuid_str, last_refresh
|
|
|
|
|
|
def _fallback_parse_from_filename(path: Path) -> tuple[int, str, datetime] | None:
|
|
"""Recover ``(pid, uuid, mtime_utc)`` from filename + filesystem stat.
|
|
|
|
Filename convention: ``heartbeat-<pid>-<uuid>.json``. We split on ``-``
|
|
once for ``heartbeat`` and once for the PID, joining the remainder as
|
|
the UUID (UUIDs may contain dashes).
|
|
"""
|
|
name = path.stem # heartbeat-<pid>-<uuid>
|
|
parts = name.split("-", 2)
|
|
if len(parts) != 3 or parts[0] != "heartbeat":
|
|
return None
|
|
try:
|
|
pid = int(parts[1])
|
|
except ValueError:
|
|
return None
|
|
uuid_str = parts[2]
|
|
try:
|
|
mtime = path.stat().st_mtime
|
|
except FileNotFoundError:
|
|
return None
|
|
return pid, uuid_str, datetime.fromtimestamp(mtime, tz=timezone.utc)
|
|
|
|
|
|
# HeartbeatScanner -------------------------------------------------------------
|
|
|
|
|
|
class HeartbeatScanner:
|
|
"""Aggregates per-wrapper heartbeat files into a daemon-side presence signal.
|
|
|
|
standalone module — wires this into the daemon
|
|
main-loop TICK to dispatch HEARTBEAT_REFRESH / IDLE state events.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
wrappers_dir: Path,
|
|
stale_threshold_sec: int = DEFAULT_STALE_THRESHOLD_SEC,
|
|
) -> None:
|
|
self._wrappers_dir = wrappers_dir
|
|
self._stale_threshold_sec = stale_threshold_sec
|
|
self._last_scan: list[HeartbeatEntry] = []
|
|
|
|
# ----- Scan / classify -----------------------------------------------
|
|
|
|
def scan(self) -> list[HeartbeatEntry]:
|
|
"""Read all heartbeat files, classify each, and return entries.
|
|
|
|
Reentrant: tolerates concurrent writes by ignoring files that vanish
|
|
mid-read and falling back to mtime when JSON is half-written.
|
|
|
|
Empty / missing wrappers dir → empty list (the daemon hasn't seen
|
|
any wrappers yet, which is a valid steady state on a fresh install).
|
|
"""
|
|
entries: list[HeartbeatEntry] = []
|
|
if not self._wrappers_dir.exists():
|
|
self._last_scan = entries
|
|
return entries
|
|
|
|
try:
|
|
candidates = list(self._wrappers_dir.glob(_HEARTBEAT_GLOB))
|
|
except OSError:
|
|
self._last_scan = entries
|
|
return entries
|
|
|
|
now = datetime.now(timezone.utc)
|
|
for path in candidates:
|
|
parsed = _parse_heartbeat_file(path)
|
|
if parsed is None:
|
|
# File vanished mid-glob (cleanup race) — skip silently.
|
|
continue
|
|
pid, uuid_str, last_refresh = parsed
|
|
|
|
age_sec = (now - last_refresh).total_seconds()
|
|
is_alive = _is_pid_alive(pid)
|
|
|
|
if age_sec > self._stale_threshold_sec:
|
|
# Stale wins over orphan — the file is too old to trust
|
|
# regardless of whether its PID happens to still be live.
|
|
status = HeartbeatStatus.STALE
|
|
elif not is_alive:
|
|
status = HeartbeatStatus.ORPHAN
|
|
else:
|
|
status = HeartbeatStatus.FRESH
|
|
|
|
entries.append(
|
|
HeartbeatEntry(
|
|
path=path,
|
|
pid=pid,
|
|
uuid=uuid_str,
|
|
last_refresh=last_refresh,
|
|
status=status,
|
|
)
|
|
)
|
|
|
|
self._last_scan = entries
|
|
return entries
|
|
|
|
# ----- Aggregations consumed by the state machine --------------------
|
|
|
|
def fresh_count(self) -> int:
|
|
"""Number of heartbeats classified as FRESH on the most recent scan.
|
|
|
|
Re-runs ``scan()`` so callers don't have to remember to invoke it
|
|
first; the cost is one filesystem walk per call which is negligible
|
|
at TICK cadence (every 30 s).
|
|
"""
|
|
return sum(1 for e in self.scan() if e.status is HeartbeatStatus.FRESH)
|
|
|
|
def is_active(self) -> bool:
|
|
"""True iff at least one wrapper is currently FRESH.
|
|
|
|
This is the primary signal the state machine uses to dispatch
|
|
HEARTBEAT_REFRESH (→ WAKE) vs. begin the IDLE-eligibility check.
|
|
"""
|
|
return self.fresh_count() >= 1
|
|
|
|
def heartbeat_idle_30min(self) -> bool:
|
|
"""True iff no FRESH heartbeats existed in the last 30 minutes.
|
|
|
|
Consumed by ``IdleDetector.sleep_eligible`` as one of the three
|
|
disjuncts that gate L6 sleep. "No FRESH in window" is implemented
|
|
as: scan now, and if zero entries are FRESH, the window is empty.
|
|
STALE / ORPHAN entries imply the wrapper has not refreshed for at
|
|
least the staleness threshold (90 s by default), so a single scan
|
|
suffices — we don't keep a history buffer in this module.
|
|
"""
|
|
# Fresh count == 0 means no wrapper is currently active. Combined
|
|
# with the 30-min wall-clock window enforced by the daemon's TICK
|
|
# rhythm and the L6 idle predicate's hardware backstop (HIDIdleTime
|
|
# ≥ 1800 s), this gives the same observable behavior as a separate
|
|
# 30-minute history without keeping in-memory state.
|
|
return self.fresh_count() == 0
|
|
|
|
# ----- Cleanup -------------------------------------------------------
|
|
|
|
def cleanup_stale_orphans(self) -> int:
|
|
"""Delete heartbeat files classified STALE or ORPHAN. Returns count deleted.
|
|
|
|
Best-effort: a delete that races with another process unlinking the
|
|
same file (``FileNotFoundError``) is counted as a successful
|
|
cleanup; any other ``OSError`` is swallowed so a single problematic
|
|
file cannot break the rest of the cleanup pass.
|
|
"""
|
|
deleted = 0
|
|
for entry in self.scan():
|
|
if entry.status is HeartbeatStatus.FRESH:
|
|
continue
|
|
try:
|
|
entry.path.unlink()
|
|
deleted += 1
|
|
except FileNotFoundError:
|
|
# Already unlinked (concurrent wrapper rotation / sibling
|
|
# daemon scan). Count as cleaned — the file is gone.
|
|
deleted += 1
|
|
except OSError:
|
|
# Permission / FS error on a single file: skip it, keep
|
|
# going. The doctor row will surface persistent
|
|
# cleanup failures via "n=X stale" delta on next run.
|
|
continue
|
|
return deleted
|