Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
294 lines
11 KiB
Python
294 lines
11 KiB
Python
"""Phase 4 -- atomic daemon state persistence (DAEMON-01 / D-24).
|
|
|
|
State file at ~/.iai-mcp/.daemon-state.json holds:
|
|
- fsm_state -- WAKE / TRANSITIONING / SLEEP / DREAMING
|
|
- daemon_started_at -- ISO8601 UTC
|
|
- last_digest_shown_at -- ISO8601 UTC, used by morning digest gate
|
|
- pending_digest -- dict ready to surface in next memory_recall
|
|
- last_learned_at -- last quiet-window learn timestamp
|
|
- last_session_ts -- last observed session_started event ts
|
|
|
|
All writes via tempfile + os.replace (POSIX atomic rename). Crash-mid-write
|
|
leaves the old file intact; readers either see old complete or new complete,
|
|
never partial.
|
|
|
|
T-04-01 mitigation: atomic rename precludes torn writes.
|
|
T-04-07 mitigation: file mode 0o600 user-only.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import tempfile
|
|
from datetime import datetime, timedelta, timezone
|
|
from pathlib import Path
|
|
|
|
# Location of the persisted daemon state file, under the user's home directory.
STATE_PATH: Path = Path.home().joinpath(".iai-mcp", ".daemon-state.json")

# Morning-digest gate: a pending digest is surfaced only when at least this
# many hours have elapsed since it was last shown, or when it was never shown.
DIGEST_SHOW_THRESHOLD_HOURS: int = 18

# Eviction guards for ``first_turn_pending``.
#
# An entry that has sat in the dict for longer than FIRST_TURN_TTL_HOURS is
# considered stale: typically the client died before consuming the flag, so
# ``consume_first_turn`` will never pop it. MAX_FIRST_TURN_ENTRIES is a
# secondary safety net that caps the dict when many sessions open in a short
# window.
FIRST_TURN_TTL_HOURS: int = 24
MAX_FIRST_TURN_ENTRIES: int = 100
|
|
|
|
|
|
def load_state() -> dict:
    """Read the persisted daemon state, self-healing on any unusable file.

    Returns:
        The decoded state mapping, or ``{}`` when the state file is missing,
        unreadable, not valid UTF-8, not valid JSON, or holds a JSON value
        that is not an object. Returning ``{}`` lets the next ``save_state``
        write a fresh file over the damaged one.
    """
    try:
        # Read directly rather than checking exists() first: a missing file
        # raises FileNotFoundError (an OSError), and there is no window in
        # which the file can vanish between the check and the read.
        loaded = json.loads(STATE_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError (bad JSON) and
        # UnicodeDecodeError (bytes that are not valid UTF-8).
        return {}
    # Valid JSON that is not an object (null, list, number, ...) would break
    # every caller that does state.get(...); treat it as corrupt too.
    return loaded if isinstance(loaded, dict) else {}
|
|
|
|
|
|
def save_state(state: dict) -> None:
    """Atomically persist ``state`` as JSON via a sibling temp file + os.replace.

    Semantics:
    - Creates the parent directory of STATE_PATH if it is missing.
    - Writes to a temp file in the same directory as STATE_PATH (required so
      os.replace can do an atomic rename on the same filesystem).
    - Flushes and fsyncs the file contents before the rename so the data is
      on disk.
    - chmod 0o600 before the swap so the visible file is never world-readable.
    - On any failure -- including KeyboardInterrupt / SystemExit -- the temp
      file is unlinked so the state directory does not accumulate orphans,
      and the original exception is re-raised.

    Args:
        state: JSON-serialisable mapping to persist.

    Raises:
        OSError: if the directory, temp file, or rename cannot be completed.
        TypeError / ValueError: if ``state`` is not JSON-serialisable.
    """
    state_dir = STATE_PATH.parent
    state_dir.mkdir(parents=True, exist_ok=True)
    fd, tmp = tempfile.mkstemp(
        prefix=".daemon-state.",
        suffix=".tmp",
        dir=str(state_dir),
    )
    try:
        # Explicit encoding keeps the on-disk bytes independent of the locale.
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(state, f, indent=2)
            f.flush()
            os.fsync(f.fileno())
        os.chmod(tmp, 0o600)
        os.replace(tmp, STATE_PATH)
    except BaseException:
        # BaseException (not Exception) so an interrupt in the middle of the
        # write still cleans up; the exception is always re-raised below.
        try:
            os.unlink(tmp)
        except OSError:
            pass
        raise
|
|
|
|
|
|
def prune_stale_first_turn(
    state: dict,
    now: datetime | None = None,
    ttl_hours: int = FIRST_TURN_TTL_HOURS,
    max_entries: int = MAX_FIRST_TURN_ENTRIES,
) -> int:
    """Evict stale ``first_turn_pending`` entries and cap the dict size.

    Two passes, both mutating ``state["first_turn_pending"]`` in place so any
    reference a caller already holds to that dict stays valid:

    1. TTL pass: drop every entry whose timestamp is strictly older than
       ``ttl_hours`` before ``now``. Values that carry no usable timestamp
       (legacy ``True`` / ``False``, other non-strings, malformed ISO
       strings) are dropped immediately: they cannot be aged sensibly, and
       keeping them would stop the dict from ever draining when clients die
       before writing ISO timestamps.
    2. Cap pass: if more than ``max_entries`` entries survive, keep only the
       newest ``max_entries`` by timestamp.

    Args:
        state: daemon state mapping; a missing, empty, or non-dict
            ``first_turn_pending`` value is a no-op.
        now: reference time; defaults to the current UTC time. A naive value
            is assumed to be UTC.
        ttl_hours: maximum age of an entry, in hours.
        max_entries: maximum number of entries kept; negative values are
            treated as 0.

    Returns:
        The number of entries removed. Idempotent for a fixed ``now``; safe
        to call on every save.
    """
    pending = state.get("first_turn_pending")
    if not isinstance(pending, dict) or not pending:
        return 0

    current = now if now is not None else datetime.now(timezone.utc)
    if current.tzinfo is None:
        current = current.replace(tzinfo=timezone.utc)
    cutoff = current - timedelta(hours=ttl_hours)

    def _parse(value: object) -> datetime | None:
        """Return the stored value as an aware datetime, or None if unusable."""
        if not isinstance(value, str):
            return None
        try:
            parsed = datetime.fromisoformat(value)
        except ValueError:
            return None
        if parsed.tzinfo is None:
            # Naive timestamps are assumed to be UTC.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    removed = 0
    # Parsed timestamps of the survivors, kept so the cap pass does not have
    # to parse every value a second time.
    surviving: dict = {}
    for sid, value in list(pending.items()):
        opened_at = _parse(value)
        if opened_at is None or opened_at < cutoff:
            del pending[sid]
            removed += 1
        else:
            surviving[sid] = opened_at

    # Secondary cap -- keep the newest ``limit`` entries by timestamp. The
    # clamp matters: a negative slice bound would keep almost everything.
    limit = max(0, max_entries)
    if len(surviving) > limit:
        newest_first = sorted(
            surviving, key=surviving.__getitem__, reverse=True
        )
        for sid in newest_first[limit:]:
            del pending[sid]
            removed += 1

    return removed
|
|
|
|
|
|
def mark_session_opened(state: dict, session_id: str) -> None:
    """Plan 05-03 TOK-12 / D5-03: mark first_turn_pending for a session.

    Stores the current UTC time (ISO8601) as the dict value so
    ``prune_stale_first_turn`` can evict entries whose client died before
    consuming the flag. An opportunistic prune on every mark keeps the dict
    bounded without a dedicated reaper.

    Calling this again for the same session overwrites the stored timestamp.
    An empty or non-string ``session_id`` is ignored. Persistence is the
    caller's responsibility (typical callers: concurrency socket handler;
    tests directly).

    Args:
        state: daemon state mapping, mutated in place.
        session_id: identifier of the session that was just opened.
    """
    if not isinstance(session_id, str) or not session_id:
        return
    pending = state.get("first_turn_pending")
    if not isinstance(pending, dict):
        # Missing key, or a legacy / corrupt value (bool, null, list, ...):
        # start from a fresh dict instead of failing on item assignment.
        pending = {}
        state["first_turn_pending"] = pending
    pending[session_id] = datetime.now(timezone.utc).isoformat()
    prune_stale_first_turn(state)
|
|
|
|
|
|
def consume_first_turn(state: dict, session_id: str) -> bool:
    """Return True iff this is the first call for the session; pop + save.

    D5-03: the first memory_recall in a session consumes the flag so
    subsequent recalls bypass the first-turn hook.

    Args:
        state: daemon state mapping; its ``first_turn_pending`` dict is
            mutated in place when the flag is consumed.
        session_id: identifier of the session issuing the recall.

    Returns:
        True when a truthy ``first_turn_pending`` entry existed for
        ``session_id`` and was removed; False otherwise, including when the
        state is malformed or ``session_id`` cannot be used as a dict key.
        This function never raises.
    """
    try:
        pending = state.get("first_turn_pending")
        if not isinstance(pending, dict):
            return False
        was_pending = bool(pending.pop(session_id, False))
    except Exception:
        # Deliberately broad: a malformed state object or an unhashable
        # session_id must read as "not the first turn", never as a crash.
        return False

    if not was_pending:
        return False

    try:
        save_state(state)
    except Exception:
        # Save failure is non-fatal -- returning True still triggers the hook
        # exactly once in-process; cross-process atomicity is best-effort.
        # Log it so a persistent failure is visible rather than silent.
        import logging

        logging.getLogger(__name__).warning(
            "consume_first_turn: could not persist daemon state",
            exc_info=True,
        )
    return True
|
|
|
|
|
|
# R3 (per D7.2-07 / D7.2-08 / D7.2-10): a per-tick + startup
# reaper for stale `first_turn_pending` entries with a 1-hour TTL and a
# tuple return shape (updated_state, dropped_session_ids).
#
# Distinct from `prune_stale_first_turn` above, which has a 24h ceiling and
# is opportunistically invoked from `mark_session_opened`. Both helpers
# coexist by design (researcher finding #1 + advisor recommendation):
#   - `prune_stale_first_turn` keeps its 24h opportunistic path on session-open;
#   - `prune_first_turn_pending` is the per-tick + startup reaper that needs
#     the dropped IDs back so the caller can emit
#     `kind=first_turn_pending_expired` events (D7.2-10).
#
# No I/O: the helper only replaces `state["first_turn_pending"]` on the dict
# it is given. The caller is responsible for `save_state(state)` and for the
# event emit. Idempotent; safe on empty/missing input.
|
|
|
|
FIRST_TURN_PENDING_TTL_SEC_DEFAULT: float = 3600.0  # D7.2-08 1h default


def prune_first_turn_pending(
    state: dict,
    now: datetime | None = None,
    ttl_sec: float = FIRST_TURN_PENDING_TTL_SEC_DEFAULT,
) -> tuple[dict, list[str]]:
    """Phase 7.2 R3: drain stale `first_turn_pending` entries.

    Returns ``(state, dropped_session_ids)`` where ``state`` is the same
    object that was passed in, with ``state["first_turn_pending"]`` replaced
    by a dict holding only the surviving entries. Does NOT call save_state
    and does NOT emit events; the caller decides persistence + event
    emission. A missing, empty, or non-dict ``first_turn_pending`` is
    returned untouched with an empty dropped list.

    Eviction rules:
    - A string value is parsed as an ISO timestamp; the entry is evicted when
      the timestamp is strictly older than ``ttl_sec`` seconds before ``now``
      (an entry exactly ``ttl_sec`` old is kept).
    - A naive timestamp (and a naive ``now``) is assumed to be UTC.
    - A malformed ISO string is evicted (defensive against corruption).
    - A non-string value (legacy bool / dict / None / number) is evicted: it
      carries no recoverable timestamp.

    ``now`` defaults to the current UTC time. Dropped ids are reported in the
    iteration order of the original dict (D7.2-10 event payload needs the
    session_ids list).
    """
    entries = state.get("first_turn_pending")
    if not (isinstance(entries, dict) and entries):
        return state, []

    if now is None:
        reference = datetime.now(timezone.utc)
    elif now.tzinfo is None:
        reference = now.replace(tzinfo=timezone.utc)
    else:
        reference = now
    oldest_allowed = reference - timedelta(seconds=ttl_sec)

    def _is_live(raw: object) -> bool:
        """True when ``raw`` is a parseable timestamp not older than the TTL."""
        if not isinstance(raw, str):
            return False
        try:
            stamp = datetime.fromisoformat(raw)
        except ValueError:
            return False
        if stamp.tzinfo is None:
            stamp = stamp.replace(tzinfo=timezone.utc)
        return stamp >= oldest_allowed

    kept = {sid: raw for sid, raw in entries.items() if _is_live(raw)}
    expired = [sid for sid in entries if sid not in kept]

    state["first_turn_pending"] = kept
    return state, expired
|
|
|
|
|
|
def get_pending_digest(state: dict, now: datetime) -> dict | None:
    """D-24 / DAEMON-11: return pending morning digest if eligible, else None.

    Eligibility gate: >= DIGEST_SHOW_THRESHOLD_HOURS since
    last_digest_shown_at, OR never shown, OR the stored timestamp is
    malformed (treated as never shown). When a digest is returned it is
    consumed from ``state``, last_digest_shown_at is advanced to ``now``, and
    the state is persisted via save_state so the same digest never appears
    twice inside the threshold window.

    Consumption is all-or-nothing: if save_state raises, the in-memory
    ``state`` is rolled back to its previous contents before the exception
    propagates, so the digest is not lost and a later call can retry.

    Args:
        state: daemon state mapping, mutated in place on success.
        now: reference time; a naive value is assumed to be UTC.

    Returns:
        The pending digest, or None when there is nothing to show or the
        gate is still closed.

    Raises:
        Whatever save_state raises; ``state`` is unchanged in that case.
    """
    digest = state.get("pending_digest")
    if not digest:
        # Nothing to surface, so the time gate is irrelevant.
        return None

    now_utc = now if now.tzinfo is not None else now.replace(tzinfo=timezone.utc)

    last_shown = state.get("last_digest_shown_at")
    if last_shown:
        try:
            last_dt = datetime.fromisoformat(last_shown)
            if last_dt.tzinfo is None:
                last_dt = last_dt.replace(tzinfo=timezone.utc)
            if now_utc - last_dt < timedelta(hours=DIGEST_SHOW_THRESHOLD_HOURS):
                return None
        except (TypeError, ValueError):
            # Malformed timestamp -- treat as never shown, fall through.
            pass

    had_last_shown = "last_digest_shown_at" in state
    state["last_digest_shown_at"] = now_utc.isoformat()
    state.pop("pending_digest", None)
    try:
        save_state(state)
    except BaseException:
        # Undo the in-memory consumption so memory matches what is on disk;
        # otherwise the digest would be gone without ever being returned.
        state["pending_digest"] = digest
        if had_last_shown:
            state["last_digest_shown_at"] = last_shown
        else:
            state.pop("last_digest_shown_at", None)
        raise
    return digest
|