#!/bin/bash
# IAI-MCP daemon monitor — runs doctor + topology + sleep cycle status
# Usage: ./monitor.sh [--cron] [--json]
#
# Environment:
#   VENV  virtualenv that provides the iai_mcp package
#         (default: $HOME/.venv/iai-mcp)

set -euo pipefail

#######################################
# Load Qdrant env from systemd service file (needed for QdrantStore in this
# script). Every line of the form Environment="NAME=value" is exported.
# Arguments: $1 - unit file to read (optional; defaults to the user unit of
#                 iai-mcp-daemon.service)
# Returns:   0, also when the file does not exist
#######################################
load_service_env() {
  local unit_file="${1:-$HOME/.config/systemd/user/iai-mcp-daemon.service}"
  # Kept in a variable so the double quotes are matched literally by =~.
  local env_re='^Environment="([A-Z_][A-Z0-9_]*)=(.*)"$'
  local line

  [[ -f "$unit_file" ]] || return 0

  # "|| [[ -n $line ]]" keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $env_re ]]; then
      export "${BASH_REMATCH[1]}=${BASH_REMATCH[2]}"
    fi
  done <"$unit_file"
}

# Must run before VENV is resolved so the unit file can influence it,
# exactly as in the original statement order.
load_service_env

VENV="${VENV:-$HOME/.venv/iai-mcp}"
PYTHON="$VENV/bin/python3"
STATE_DIR="$HOME/.iai-mcp"
LOGS_DIR="$STATE_DIR/logs"

# One date call so DATE and HOUR cannot straddle midnight.
NOW=$(date '+%Y-%m-%d %H:%M')
DATE=${NOW% *}
HOUR=${NOW#* }

# Colors
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m'

# --- Helper functions ---
pass_tag() { echo -e "${GREEN}[PASS]${NC}"; }
fail_tag() { echo -e "${RED}[FAIL]${NC}"; }
warn_tag() { echo -e "${YELLOW}[WARN]${NC}"; }

# --- 1. Systemd status ---
#######################################
# Print the first "Active:" line of the daemon's systemctl status output.
# Outputs: that line on stdout, or nothing when there is none
# Returns: always 0. systemctl exits non-zero for a unit that is not
#          running; under "set -e -o pipefail" that status would abort the
#          whole script before the failure could be reported.
#######################################
get_systemd_status() {
  local status_text line
  status_text=$(systemctl --user status iai-mcp-daemon.service 2>&1) || true

  while IFS= read -r line; do
    if [[ "$line" == *"Active:"* ]]; then
      printf '%s\n' "$line"
      return 0
    fi
  done <<<"$status_text"
  return 0
}

# --- 2. Doctor output ---
# Run the doctor subcommand; stderr is merged into stdout and a non-zero
# exit is tolerated so the caller can still count the result tags.
run_doctor() {
  "$PYTHON" -m iai_mcp.cli doctor 2>&1 || true
}

# --- 3. Topology output ---
# Run the topology subcommand; stderr is merged into stdout and a non-zero
# exit is tolerated.
run_topology() {
  "$PYTHON" -m iai_mcp.cli topology 2>&1 || true
}

# --- 4. Sleep cycle results for today ---
#######################################
# Summarise the sleep_step_completed events in today's lifecycle log.
# Globals: LOGS_DIR, DATE, PYTHON (read)
# Outputs: one line per known step (last event per step wins), or a notice
#          when the log is missing, unreadable or has no such events
#######################################
run_sleep_cycles() {
  local logfile="$LOGS_DIR/lifecycle-events-${DATE}.jsonl"
  if [[ ! -f "$logfile" ]]; then
    echo " No sleep cycle log for ${DATE}"
    return
  fi

  # Quoted heredoc: the Python source is passed verbatim, no shell expansion.
  "$PYTHON" - "$logfile" <<'PY'
import json
import sys

log_path = sys.argv[1]
try:
    with open(log_path) as fh:
        lines = fh.readlines()
except Exception:
    print(' Empty or unreadable log')
    sys.exit(0)

steps = {}
for raw in lines:
    try:
        ev = json.loads(raw.strip())
    except ValueError:
        continue
    # Valid JSON that is not an object (list, number, ...) has no .get().
    if not isinstance(ev, dict):
        continue
    if ev.get('event') != 'sleep_step_completed':
        continue
    step = ev.get('step', '')
    err = ev.get('error', '')
    ts = ev.get('timestamp', '')
    steps[step] = {'ok': not err, 'error': err, 'ts': ts}

if not steps:
    print(' No sleep steps completed today')
    sys.exit(0)

order = ['SCHEMA_MINE', 'KNOB_TUNE', 'DREAM_DECAY', 'OPTIMIZE_LANCE', 'COMPACT_RECORDS']
for s in order:
    if s in steps:
        st = steps[s]
        status = 'ok' if st['ok'] else 'FAIL'
        # str() so a non-string error payload can still be truncated.
        err_str = f' {str(st["error"])[:60]}' if st['error'] else ''
        print(f' [{s}] {status}{err_str}')
PY
}

# --- 5. Memory / RSS ---
#######################################
# Print the daemon's resident set size.
# Outputs: "<n> MB" (VmRSS of the main PID, integer-divided by 1024), or
#          "N/A" when the PID or its /proc entry cannot be read
#######################################
get_rss() {
  local status_text pid rss_kb
  local pid_re='Main PID: ([0-9]+)'

  # Get PID from systemctl output; a failing systemctl just means "N/A".
  status_text=$(systemctl --user status iai-mcp-daemon.service 2>/dev/null) || true

  if [[ "$status_text" =~ $pid_re ]]; then
    pid=${BASH_REMATCH[1]}
    if [[ -d "/proc/$pid" ]]; then
      rss_kb=$(awk '/VmRSS/ {print $2; exit}' "/proc/$pid/status" 2>/dev/null) || rss_kb=""
      if [[ "$rss_kb" =~ ^[0-9]+$ ]]; then
        echo "$(( rss_kb / 1024 )) MB"
        return
      fi
    fi
  fi
  echo "N/A"
}

# --- 6. Duplicate edge count ---
#######################################
# Count edges that share the same (src, dst, edge_type) triple.
# Outputs: the duplicate count; "ERR:<message>" when the query raises;
#          "ERR:unreachable" when the interpreter itself exits non-zero
#######################################
get_dup_edges() {
  # stderr is dropped on purpose: only the count or an ERR: marker matters.
  "$PYTHON" - <<'PY' 2>/dev/null || echo "ERR:unreachable"
import sys
from iai_mcp.qdrant_store import QdrantStore

store = QdrantStore()
try:
    df = store.edges_as_dataframe()
    if df.empty:
        print(0)
        sys.exit(0)
    keys = df.apply(lambda r: (str(r['src']), str(r['dst']), str(r.get('edge_type',''))), axis=1)
    dups = len(keys) - len(keys.unique())
    print(dups)
except Exception as ex:
    print(f'ERR:{ex}')
PY
}

#######################################
# JSON output for cron/automation.
# Globals: PYTHON, HOUR, DATE (read)
# Outputs: one JSON object on stdout
#######################################
print_json_report() {
  # Values travel as argv instead of being spliced into the Python source,
  # so quotes or backslashes in a path cannot break or alter the program.
  python3 - "$PYTHON" "$HOUR" "$DATE" <<'PY'
import json
import subprocess
import sys

python_bin, hour, date = sys.argv[1:4]


def run(cmd):
    return subprocess.run(cmd, capture_output=True, text=True)


doctor = run([python_bin, '-m', 'iai_mcp.cli', 'doctor'])
topo = run([python_bin, '-m', 'iai_mcp.cli', 'topology'])
sysd = run(['systemctl', '--user', 'status', 'iai-mcp-daemon.service'])

result = {
    'timestamp': hour,
    'date': date,
    'daemon': 'running' if 'active (running)' in sysd.stdout else 'DOWN',
    'doctor_exit': doctor.returncode,
    'topology_output': topo.stdout.strip(),
}

# Parse topology; the fixed fields above must not be overwritten by a
# topology line that happens to use the same key.
reserved = set(result)
for line in topo.stdout.strip().split('\n'):
    if ':' in line:
        k, v = line.split(':', 1)
        k = k.strip()
        if k not in reserved:
            result[k] = v.strip()

print(json.dumps(result, indent=2))
PY
}

#######################################
# Entry point: parse flags, then print the JSON or human-readable report.
# Arguments: [--cron] [--json]
#######################################
main() {
  # --cron is accepted for compatibility; nothing reads it at present.
  # shellcheck disable=SC2034
  local is_cron=false
  local is_json=false
  local arg
  for arg in "$@"; do
    case "$arg" in
      --cron) is_cron=true ;;
      --json) is_json=true ;;
    esac
  done

  if [[ "$is_json" == true ]]; then
    print_json_report
    exit 0
  fi

  # --- Human-readable output ---
  echo -e "${CYAN}╔══════════════════════════════════════════════╗${NC}"
  echo -e "${CYAN}║ IAI-MCP Daemon Monitor — ${HOUR} | ${DATE} ║${NC}"
  echo -e "${CYAN}╚══════════════════════════════════════════════╝${NC}"
  echo ""

  # Systemd
  echo -e "${CYAN}── Systemd ──${NC}"
  local systemd_line systemd_trimmed
  local daemon_down=0
  systemd_line=$(get_systemd_status)
  # Strip leading whitespace without spawning sed.
  systemd_trimmed="${systemd_line#"${systemd_line%%[![:space:]]*}"}"
  if [[ "$systemd_line" == *"active (running)"* ]]; then
    pass_tag
  else
    daemon_down=1
    fail_tag
  fi
  echo " $systemd_trimmed"

  # Memory
  echo -e "${CYAN}── Memory ──${NC}"
  echo " RSS: $(get_rss)"

  # Doctor
  echo -e "${CYAN}── Doctor ──${NC}"
  local doctor_output pass_count fail_count warn_count
  doctor_output=$(run_doctor)
  # grep -c prints 0 but exits 1 when nothing matches, hence "|| true".
  pass_count=$(grep -c -F '[PASS]' <<<"$doctor_output" || true)
  fail_count=$(grep -c -F '[FAIL]' <<<"$doctor_output" || true)
  warn_count=$(grep -c -F '[WARN]' <<<"$doctor_output" || true)
  echo " $pass_count pass / $fail_count fail / $warn_count warn"
  if [[ "$fail_count" -gt 0 ]]; then
    grep -F '[FAIL]' <<<"$doctor_output" | sed 's/^/ /'
  fi

  # Topology
  echo -e "${CYAN}── Topology ──${NC}"
  local topo_output
  topo_output=$(run_topology)
  sed 's/^/ /' <<<"$topo_output"

  # Sleep cycles
  echo -e "${CYAN}── Sleep Cycles (today) ──${NC}"
  run_sleep_cycles

  # Duplicate edges
  echo -e "${CYAN}── Edge Health ──${NC}"
  local dup_count
  dup_count=$(get_dup_edges)
  if [[ "$dup_count" == "ERR:"* ]]; then
    warn_tag
    echo " Qdrant unreachable: $dup_count"
  elif [[ ! "$dup_count" =~ ^[0-9]+$ ]]; then
    # Anything else on stdout (e.g. log noise) must not reach the numeric test.
    warn_tag
    echo " Unexpected duplicate-edge output: $dup_count"
  elif [[ "$dup_count" -gt 0 ]]; then
    warn_tag
    echo " $dup_count duplicate edge(s) detected"
  else
    pass_tag
    echo " No duplicates"
  fi

  echo ""
  # A stopped daemon is an issue even when doctor reports no [FAIL] line.
  local issue_count=$(( fail_count + daemon_down ))
  if [[ "$issue_count" -gt 0 ]]; then
    echo -e "${RED}⚠ $issue_count issue(s) detected — check output above${NC}"
  else
    echo -e "${GREEN}✓ All checks clean${NC}"
  fi
}

main "$@"