#!/bin/bash
# IAI-MCP daemon monitor — runs doctor + topology + sleep cycle status
# Usage: ./monitor.sh [--cron] [--json]
#
#   --json   print a machine-readable JSON report and exit 0
#   --cron   accepted for compatibility; currently does not change the output
#
# Environment:
#   VENV     virtualenv providing the iai_mcp package
#            (default: $HOME/.venv/iai-mcp)
set -euo pipefail

# Lowercase on purpose: load_service_env only exports UPPER_CASE names, so a
# unit file can never clobber this variable.
service_unit="iai-mcp-daemon.service"

#######################################
# Export variables declared as Environment="NAME=value" in a systemd unit
# file (needed for QdrantStore in this script). Only that exact quoted,
# one-assignment-per-line form is recognised; other lines are ignored.
# Arguments: $1 - path to the unit file (a missing/unreadable file is a no-op)
# Returns:   0
#######################################
load_service_env() {
  local unit_file=$1
  local line
  local -r env_re='^Environment="([A-Z_][A-Z0-9_]*)=(.*)"$'

  [[ -r "$unit_file" ]] || return 0
  # '|| [[ -n $line ]]' keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $env_re ]]; then
      export "${BASH_REMATCH[1]}=${BASH_REMATCH[2]}"
    fi
  done < "$unit_file"
}

load_service_env "$HOME/.config/systemd/user/$service_unit"

VENV="${VENV:-$HOME/.venv/iai-mcp}"
PYTHON="$VENV/bin/python3"
STATE_DIR="$HOME/.iai-mcp"
LOGS_DIR="$STATE_DIR/logs"
# One date call so DATE and HOUR always describe the same instant.
now=$(date '+%Y-%m-%d %H:%M')
DATE=${now% *}
HOUR=${now#* }

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'

# shellcheck disable=SC2034  # --cron is parsed but nothing reads it yet
is_cron=false
is_json=false
for arg in "$@"; do
  case "$arg" in
    --cron) is_cron=true ;;
    --json) is_json=true ;;
    *) ;; # unknown arguments are ignored, as before
  esac
done

# --- Helper functions ---
pass_tag() { printf '%b\n' "${GREEN}[PASS]${NC}"; }
fail_tag() { printf '%b\n' "${RED}[FAIL]${NC}"; }
warn_tag() { printf '%b\n' "${YELLOW}[WARN]${NC}"; }

# Print a cyan section heading. Arguments: $1 - section title.
section() { printf '%b\n' "${CYAN}── $1 ──${NC}"; }

# --- 1. Systemd status ---
#######################################
# Print the first "Active:" line of `systemctl --user status` for the daemon.
# Outputs: that line (untrimmed), or nothing when no such line is reported.
# Returns: 0 always. `systemctl status` exits non-zero for a stopped unit;
#          under `set -eo pipefail` that must not abort the monitor, because
#          a stopped daemon is exactly what it has to report.
#######################################
get_systemd_status() {
  local status_out
  status_out=$(systemctl --user status "$service_unit" 2>&1) || true
  awk '/Active:/ { print; exit }' <<<"$status_out"
}

# --- 2. Doctor output ---
# Run the iai_mcp "doctor" command; stdout+stderr are merged, failures ignored.
run_doctor() {
  "$PYTHON" -m iai_mcp.cli doctor 2>&1 || true
}

# --- 3. Topology output ---
# Run the iai_mcp "topology" command; stdout+stderr are merged, failures ignored.
run_topology() {
  "$PYTHON" -m iai_mcp.cli topology 2>&1 || true
}

# --- 4. Sleep cycle results for today ---
#######################################
# Summarise today's `sleep_step_completed` events from the lifecycle log.
# For each step the last event in the file wins. Known steps are printed in
# pipeline order, any other recorded step afterwards in sorted order.
# Globals: LOGS_DIR, DATE, PYTHON (read)
# Outputs: one line per step, or a short notice when nothing can be shown
# Returns: 0 (a parser failure is reported on stdout, not propagated)
#######################################
run_sleep_cycles() {
  local logfile="$LOGS_DIR/lifecycle-events-${DATE}.jsonl"
  if [[ ! -f "$logfile" ]]; then
    echo " No sleep cycle log for ${DATE}"
    return
  fi
  # Quoted heredoc: the Python source is passed verbatim (no shell expansion);
  # the log path travels via argv.
  "$PYTHON" - "$logfile" <<'PY' || echo " Unable to parse sleep cycle log"
import json
import sys

KNOWN_ORDER = ['SCHEMA_MINE', 'KNOB_TUNE', 'DREAM_DECAY', 'OPTIMIZE_LANCE', 'COMPACT_RECORDS']

try:
    with open(sys.argv[1], encoding='utf-8', errors='replace') as fh:
        lines = fh.readlines()
except OSError:
    print(' Empty or unreadable log')
    sys.exit(0)

steps = {}
for raw in lines:
    try:
        ev = json.loads(raw)
    except ValueError:
        continue  # blank or malformed line
    if not isinstance(ev, dict) or ev.get('event') != 'sleep_step_completed':
        continue
    err = ev.get('error') or ''
    steps[str(ev.get('step', ''))] = {'ok': not err, 'error': str(err)}

if not steps:
    print(' No sleep steps completed today')
    sys.exit(0)

extra = sorted(name for name in steps if name not in KNOWN_ORDER)
for name in KNOWN_ORDER + extra:
    if name not in steps:
        continue
    st = steps[name]
    status = 'ok' if st['ok'] else 'FAIL'
    err_str = f' {st["error"][:60]}' if st['error'] else ''
    print(f' [{name}] {status}{err_str}')
PY
}

# --- 5. Memory / RSS ---
#######################################
# Print the daemon's resident set size as "<n> MB", or "N/A" when the main
# PID or its /proc status entry is not available.
# Returns: 0
#######################################
get_rss() {
  local pid rss_kb
  # MainPID is reported as 0 when the unit has no running main process.
  pid=$(systemctl --user show --property=MainPID --value "$service_unit" 2>/dev/null) || pid=""
  if [[ "$pid" =~ ^[1-9][0-9]*$ && -r "/proc/$pid/status" ]]; then
    rss_kb=$(awk '/^VmRSS:/ { print $2; exit }' "/proc/$pid/status" 2>/dev/null) || rss_kb=""
    if [[ "$rss_kb" =~ ^[0-9]+$ ]]; then
      echo "$(( rss_kb / 1024 )) MB"
      return
    fi
  fi
  echo "N/A"
}

# --- 6. Duplicate edge count ---
#######################################
# Count edges sharing the same (src, dst, edge_type) triple in the store.
# Outputs: an integer, "ERR:<message>" when the query raises, or
#          "ERR:unreachable" when the Python process itself fails.
# Returns: 0
#######################################
get_dup_edges() {
  # stderr is dropped on purpose: a failing import/connection is reported
  # through the ERR: sentinel instead of a traceback.
  "$PYTHON" - <<'PY' 2>/dev/null || echo "ERR:unreachable"
import sys
from iai_mcp.qdrant_store import QdrantStore
store = QdrantStore()
try:
    df = store.edges_as_dataframe()
    if df.empty:
        print(0)
        sys.exit(0)
    keys = df.apply(lambda r: (str(r['src']), str(r['dst']), str(r.get('edge_type', ''))), axis=1)
    dups = len(keys) - len(keys.unique())
    print(dups)
except Exception as ex:
    print(f'ERR:{ex}')
PY
}

#######################################
# JSON output for cron/automation.
# Values are handed to Python through argv rather than spliced into the
# source text, so paths containing quotes or spaces cannot break the program.
# Outputs: a JSON object on stdout
#######################################
emit_json() {
  python3 - "$PYTHON" "$service_unit" "$DATE" "$HOUR" <<'PY'
import json
import subprocess
import sys

python_bin, unit, date, hour = sys.argv[1:5]


def run(cmd):
    """Run cmd; a missing executable yields exit code 127 instead of raising."""
    try:
        return subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        return subprocess.CompletedProcess(cmd, 127, stdout='', stderr=str(exc))


doctor = run([python_bin, '-m', 'iai_mcp.cli', 'doctor'])
topo = run([python_bin, '-m', 'iai_mcp.cli', 'topology'])
sysd = run(['systemctl', '--user', 'status', unit])

result = {
    'timestamp': hour,
    'date': date,
    'daemon': 'running' if 'active (running)' in sysd.stdout else 'DOWN',
    'doctor_exit': doctor.returncode,
    'topology_output': topo.stdout.strip(),
}

# Parse topology ("key: value" lines); never let it overwrite the fields above.
reserved = set(result)
for line in topo.stdout.strip().split('\n'):
    if ':' in line:
        key, value = line.split(':', 1)
        key = key.strip()
        if key and key not in reserved:
            result[key] = value.strip()

print(json.dumps(result, indent=2))
PY
}

#######################################
# Human-readable report: systemd state, memory, doctor, topology, sleep
# cycles and edge health, followed by a one-line verdict.
#######################################
print_report() {
  local systemd_line doctor_output topo_output dup_count
  local pass_count fail_count warn_count
  local daemon_down=0 issues

  printf '%b\n' "${CYAN}╔══════════════════════════════════════════════╗${NC}"
  printf '%b\n' "${CYAN}║ IAI-MCP Daemon Monitor — ${HOUR} | ${DATE} ║${NC}"
  printf '%b\n' "${CYAN}╚══════════════════════════════════════════════╝${NC}"
  echo ""

  # Systemd
  section "Systemd"
  systemd_line=$(get_systemd_status)
  # Strip leading whitespace without spawning sed.
  systemd_line=${systemd_line#"${systemd_line%%[![:space:]]*}"}
  if [[ "$systemd_line" == *"active (running)"* ]]; then
    pass_tag
  else
    fail_tag
    daemon_down=1
  fi
  echo " $systemd_line"

  # Memory
  section "Memory"
  echo " RSS: $(get_rss)"

  # Doctor
  section "Doctor"
  doctor_output=$(run_doctor)
  # grep -c exits 1 on zero matches while still printing "0".
  pass_count=$(grep -c '\[PASS\]' <<<"$doctor_output" || true)
  fail_count=$(grep -c '\[FAIL\]' <<<"$doctor_output" || true)
  warn_count=$(grep -c '\[WARN\]' <<<"$doctor_output" || true)
  echo " $pass_count pass / $fail_count fail / $warn_count warn"
  if [[ "$fail_count" -gt 0 ]]; then
    grep '\[FAIL\]' <<<"$doctor_output" | sed 's/^/ /'
  fi

  # Topology
  section "Topology"
  topo_output=$(run_topology)
  sed 's/^/ /' <<<"$topo_output"

  # Sleep cycles
  section "Sleep Cycles (today)"
  run_sleep_cycles

  # Duplicate edges
  section "Edge Health"
  dup_count=$(get_dup_edges)
  if [[ "$dup_count" == "ERR:"* ]]; then
    warn_tag
    echo " Qdrant unreachable: $dup_count"
  elif [[ ! "$dup_count" =~ ^[0-9]+$ ]]; then
    # A non-numeric value in an arithmetic test would be evaluated as a
    # variable name and abort the script under `set -u`.
    warn_tag
    echo " Unexpected duplicate-edge output: $dup_count"
  elif [[ "$dup_count" -gt 0 ]]; then
    warn_tag
    echo " $dup_count duplicate edge(s) detected"
  else
    pass_tag
    echo " No duplicates"
  fi

  echo ""
  # A stopped daemon counts as an issue too, so the verdict cannot contradict
  # the [FAIL] printed in the Systemd section.
  issues=$(( fail_count + daemon_down ))
  if [[ "$issues" -gt 0 ]]; then
    printf '%b\n' "${RED}⚠ $issues issue(s) detected — check output above${NC}"
  else
    printf '%b\n' "${GREEN}✓ All checks clean${NC}"
  fi
}

# --- Main ---
main() {
  if [[ "$is_json" == true ]]; then
    emit_json
    exit 0
  fi
  print_report
}

main "$@"