feat: add monitor.sh — daemon health, topology, sleep cycles, edge health

This commit is contained in:
Apunkt 2026-05-21 10:35:07 +02:00
parent 7f01b8824e
commit 46a2de0787
No known key found for this signature in database

227
scripts/monitor.sh Executable file
View file

@ -0,0 +1,227 @@
#!/bin/bash
# IAI-MCP daemon monitor — runs doctor + topology + sleep cycle status
#
# Prints a health report for the iai-mcp-daemon systemd user service:
# unit state, daemon RSS, `doctor` results, `topology` output, today's
# sleep-cycle steps and the duplicate-edge count.
#
# Usage: ./monitor.sh [--cron] [--json]
#   --json  print a JSON report (doctor exit code, topology output, daemon
#           state) and exit instead of printing the human-readable report
#   --cron  accepted, but nothing in this script reads the flag
#           NOTE(review): confirm whether cron runs were meant to behave differently
#
# Environment:
#   VENV    virtualenv that provides bin/python3 (default: $HOME/.venv/iai-mcp)
#
# Strict mode: abort on a failing command, an unset variable or a failing pipeline stage.
set -euo pipefail
# Load Qdrant env from systemd service file (needed for QdrantStore in this script)
#
# load_service_env UNIT_FILE
#   Exports every single-assignment Environment= line of UNIT_FILE into this
#   shell. Both spellings are accepted:
#     Environment="KEY=value with spaces"
#     Environment=KEY=value
#   Lines carrying several assignments, or values containing a double quote,
#   are skipped rather than exported with a mangled value.
#   Returns 0 when UNIT_FILE does not exist (nothing to load).
load_service_env() {
  local unit_file=$1
  local line
  # Regexes live in variables so the quote characters need no escaping in [[ =~ ]].
  local quoted_re='^Environment="([A-Za-z_][A-Za-z0-9_]*)=([^"]*)"$'
  local bare_re='^Environment=([A-Za-z_][A-Za-z0-9_]*)=([^[:space:]"]*)$'

  [[ -f "$unit_file" ]] || return 0

  # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $quoted_re || "$line" =~ $bare_re ]]; then
      export "${BASH_REMATCH[1]}=${BASH_REMATCH[2]}"
    fi
  done < "$unit_file"
}

load_service_env "$HOME/.config/systemd/user/iai-mcp-daemon.service"
# Interpreter and state locations; VENV can be overridden from the environment.
VENV="${VENV:-$HOME/.venv/iai-mcp}"
PYTHON="${VENV}/bin/python3"
STATE_DIR="${HOME}/.iai-mcp"
LOGS_DIR="${STATE_DIR}/logs"

# Date and time used for the report header and for today's log file name.
DATE="$(date '+%Y-%m-%d')"
HOUR="$(date '+%H:%M')"

# ANSI colour escapes, kept as literal backslash sequences for `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'

# Command-line flags; anything other than --cron / --json is ignored.
is_cron=false
is_json=false
for flag; do
  if [[ "$flag" == "--cron" ]]; then
    is_cron=true
  elif [[ "$flag" == "--json" ]]; then
    is_json=true
  fi
done
# --- Helper functions ---
# _status_tag COLOUR LABEL — print "[LABEL]" in COLOUR followed by a colour reset.
_status_tag() { printf '%b\n' "${1}[${2}]${NC}"; }
# Coloured result markers used by the human-readable report.
pass_tag() { _status_tag "${GREEN}" "PASS"; }
fail_tag() { _status_tag "${RED}" "FAIL"; }
warn_tag() { _status_tag "${YELLOW}" "WARN"; }
# --- 1. Systemd status ---
# Print the first "Active:" line of `systemctl --user status` for the daemon
# unit (empty output when there is no such line).
# Returns: always 0. `systemctl status` exits non-zero for a unit that is not
# running; with `pipefail` + `set -e` that status would abort the caller's
# `var=$(get_systemd_status)` before it could report the failure.
get_systemd_status() {
  local status_output
  status_output=$(systemctl --user status iai-mcp-daemon.service 2>&1) || true
  # -m1 stops at the first match; `|| true` covers the no-match case.
  grep -m1 "Active:" <<<"$status_output" || true
}
# --- 2. Doctor output ---
# Run `iai_mcp.cli doctor` with the virtualenv interpreter.
# Globals: PYTHON (read)
# Outputs: the command's stdout and stderr, merged, on stdout
# Returns: always 0, so a failing doctor run does not abort the script under `set -e`
run_doctor() {
  # Quoted so an interpreter path containing whitespace is not word-split.
  "$PYTHON" -m iai_mcp.cli doctor 2>&1 || true
}
# --- 3. Topology output ---
# Run `iai_mcp.cli topology` with the virtualenv interpreter.
# Globals: PYTHON (read)
# Outputs: the command's stdout and stderr, merged, on stdout
# Returns: always 0, so a failing topology run does not abort the script under `set -e`
run_topology() {
  # Quoted so an interpreter path containing whitespace is not word-split.
  "$PYTHON" -m iai_mcp.cli topology 2>&1 || true
}
# --- 4. Sleep cycle results for today ---
# Summarise today's `sleep_step_completed` events from the lifecycle log.
# Globals: LOGS_DIR, DATE, PYTHON (read)
# Outputs: one " [STEP] ok" / " [STEP] FAIL <error, max 60 chars>" line per
#          step, or a one-line notice when there is nothing to show.
#          Steps from the known pipeline order come first; any other step
#          names follow in the order they first appear in the log, so an
#          unknown failing step is not silently hidden.
#          When a step occurs more than once, the last event wins.
run_sleep_cycles() {
  local logfile="$LOGS_DIR/lifecycle-events-${DATE}.jsonl"
  if [[ ! -f "$logfile" ]]; then
    echo " No sleep cycle log for ${DATE}"
    return
  fi
  # Quoted heredoc: the Python source reaches the interpreter verbatim, with no
  # shell expansion and no quote escaping. The fallback keeps a broken
  # interpreter from aborting the rest of the report under `set -e`.
  "$PYTHON" - "$logfile" <<'PY' || echo " Sleep cycle summary unavailable"
import json
import sys

KNOWN_ORDER = ['SCHEMA_MINE', 'KNOB_TUNE', 'DREAM_DECAY', 'OPTIMIZE_LANCE', 'COMPACT_RECORDS']

try:
    # errors='replace' keeps one undecodable byte from discarding the whole log.
    with open(sys.argv[1], errors='replace') as f:
        lines = f.readlines()
except OSError:
    print(' Empty or unreadable log')
    sys.exit(0)

# step name -> error value of the most recent completion event ('' when clean)
steps = {}
for line in lines:
    try:
        ev = json.loads(line)
    except ValueError:
        continue  # blank or malformed line
    # Valid JSON that is not an object (list, number, ...) carries no event.
    if not isinstance(ev, dict) or ev.get('event') != 'sleep_step_completed':
        continue
    steps[str(ev.get('step', ''))] = ev.get('error', '')

if not steps:
    print(' No sleep steps completed today')
    sys.exit(0)

ordered = [s for s in KNOWN_ORDER if s in steps]
ordered += [s for s in steps if s not in KNOWN_ORDER]
for s in ordered:
    err = steps[s]
    status = 'FAIL' if err else 'ok'
    # str() guards against a non-string error payload before slicing.
    err_str = f' {str(err)[:60]}' if err else ''
    print(f' [{s}] {status}{err_str}')
PY
}
# --- 5. Memory / RSS ---
# Print the resident set size of the daemon's main process as "<n> MB",
# or "N/A" when the PID or its VmRSS value cannot be determined.
# Returns: always 0 — every lookup is best-effort, so the function is safe to
# call under `set -e` whether or not it runs inside a command substitution.
get_rss() {
  local pid rss_kb
  # `systemctl show` is the machine-readable interface (the `status` layout is
  # meant for humans). Output is "MainPID=<n>".
  pid=$(systemctl --user show --property=MainPID iai-mcp-daemon.service 2>/dev/null) || true
  pid=${pid#MainPID=}
  if [[ "$pid" =~ ^[0-9]+$ && -d "/proc/$pid" ]]; then
    # The process may exit between the -d test and the read; treat that as N/A.
    rss_kb=$(awk '/^VmRSS:/ { print $2; exit }' "/proc/$pid/status" 2>/dev/null) || true
    if [[ "$rss_kb" =~ ^[0-9]+$ ]]; then
      echo "$(( rss_kb / 1024 )) MB"
      return
    fi
  fi
  echo "N/A"
}
# --- 6. Duplicate edge count ---
# Print the number of duplicate edges, where an edge is identified by its
# (src, dst, edge_type) triple, as reported by QdrantStore.edges_as_dataframe().
# Globals: PYTHON (read)
# Outputs: an integer count; "ERR:<message>" when reading or counting the edges
#          raises; "ERR:unreachable" when the interpreter itself exits non-zero
#          (for example when the import or the QdrantStore() constructor fails).
# NOTE(review): assumes edges_as_dataframe() returns a pandas DataFrame with
# 'src' and 'dst' columns — confirm against iai_mcp.qdrant_store.
get_dup_edges() {
  # Quoted heredoc keeps the Python source free of shell expansion. stderr is
  # dropped on purpose so tracebacks do not leak into the report.
  "$PYTHON" - 2>/dev/null <<'PY' || echo "ERR:unreachable"
from iai_mcp.qdrant_store import QdrantStore

store = QdrantStore()
try:
    df = store.edges_as_dataframe()
    if df.empty:
        print(0)
    else:
        # Values are stringified so differently typed ids still compare equal.
        keys = df.apply(
            lambda r: (str(r['src']), str(r['dst']), str(r.get('edge_type', ''))),
            axis=1,
        )
        print(len(keys) - len(keys.unique()))
except Exception as ex:
    print(f'ERR:{ex}')
PY
}
# --- Main ---
# Print the JSON report used by --json.
# Globals: PYTHON, HOUR, DATE (read)
# Outputs: a JSON object with timestamp, date, daemon ("running"/"DOWN"),
#          doctor_exit, topology_output, plus one key per "key: value" line
#          of the topology output.
emit_json_report() {
  # Values are handed over as arguments instead of being pasted into the
  # Python source, so quotes or backslashes in a path cannot break the script.
  python3 - "$PYTHON" "$HOUR" "$DATE" <<'PY'
import json
import subprocess
import sys

python, hour, date = sys.argv[1:4]


def run(cmd):
    """Run cmd; return (exit_code, stdout). A command that cannot be started yields (127, '')."""
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True)
    except OSError:
        return 127, ''
    return proc.returncode, proc.stdout


doctor_rc, _ = run([python, '-m', 'iai_mcp.cli', 'doctor'])
_, topo_out = run([python, '-m', 'iai_mcp.cli', 'topology'])
_, sysd_out = run(['systemctl', '--user', 'status', 'iai-mcp-daemon.service'])

result = {
    'timestamp': hour,
    'date': date,
    'daemon': 'running' if 'active (running)' in sysd_out else 'DOWN',
    'doctor_exit': doctor_rc,
    'topology_output': topo_out.strip(),
}
# Parse topology: flatten "key: value" lines into the report. setdefault keeps
# a topology line from overwriting one of the core fields above.
for line in topo_out.strip().split('\n'):
    if ':' in line:
        k, v = line.split(':', 1)
        result.setdefault(k.strip(), v.strip())
print(json.dumps(result, indent=2))
PY
}

if [[ "$is_json" == true ]]; then
  # JSON output for cron/automation
  emit_json_report
  exit 0
fi
# --- Human-readable output ---
# Number of findings (FAIL or WARN) across all sections. The closing summary
# is based on it, so "All checks clean" is printed only when every section
# passed, not merely when doctor reported no [FAIL] lines.
issue_count=0

echo -e "${CYAN}╔══════════════════════════════════════════════╗${NC}"
echo -e "${CYAN}║ IAI-MCP Daemon Monitor — ${HOUR} | ${DATE}${NC}"
echo -e "${CYAN}╚══════════════════════════════════════════════╝${NC}"
echo ""

# Systemd
echo -e "${CYAN}── Systemd ──${NC}"
# `|| true`: a non-zero status from the helper (unit stopped or missing) must
# reach the FAIL branch below instead of aborting the script under `set -e`.
systemd_line=$(get_systemd_status) || true
systemd_line=$(sed 's/^[[:space:]]*//' <<<"$systemd_line")
if [[ "$systemd_line" == *"active (running)"* ]]; then
  pass_tag
else
  fail_tag
  issue_count=$(( issue_count + 1 ))
fi
echo " $systemd_line"

# Memory
echo -e "${CYAN}── Memory ──${NC}"
echo " RSS: $(get_rss)"

# Doctor
echo -e "${CYAN}── Doctor ──${NC}"
doctor_output=$(run_doctor) || true
# grep -c exits 1 when the count is 0; `|| true` keeps `set -e` quiet.
pass_count=$(grep -c '\[PASS\]' <<<"$doctor_output" || true)
fail_count=$(grep -c '\[FAIL\]' <<<"$doctor_output" || true)
warn_count=$(grep -c '\[WARN\]' <<<"$doctor_output" || true)
pass_count=${pass_count:-0}
fail_count=${fail_count:-0}
warn_count=${warn_count:-0}
echo " $pass_count pass / $fail_count fail / $warn_count warn"
if [[ "$fail_count" -gt 0 ]]; then
  grep '\[FAIL\]' <<<"$doctor_output" | sed 's/^/ /'
  issue_count=$(( issue_count + fail_count ))
fi
# No tagged line at all means doctor did not run its checks (for example the
# interpreter is missing); do not let that pass as a clean result.
if (( pass_count + fail_count + warn_count == 0 )); then
  warn_tag
  echo " doctor produced no check results"
  issue_count=$(( issue_count + 1 ))
fi

# Topology
echo -e "${CYAN}── Topology ──${NC}"
topo_output=$(run_topology) || true
sed 's/^/ /' <<<"$topo_output"

# Sleep cycles
echo -e "${CYAN}── Sleep Cycles (today) ──${NC}"
run_sleep_cycles

# Duplicate edges
echo -e "${CYAN}── Edge Health ──${NC}"
dup_count=$(get_dup_edges) || true
if [[ "$dup_count" == "ERR:"* ]]; then
  warn_tag
  echo " Qdrant unreachable: $dup_count"
  issue_count=$(( issue_count + 1 ))
elif [[ ! "$dup_count" =~ ^[0-9]+$ ]]; then
  # Empty or polluted output (e.g. library log lines on stdout) must not be
  # mistaken for "0 duplicates".
  warn_tag
  echo " Unexpected edge check output: $dup_count"
  issue_count=$(( issue_count + 1 ))
elif (( 10#$dup_count > 0 )); then
  warn_tag
  echo " $dup_count duplicate edge(s) detected"
  issue_count=$(( issue_count + 1 ))
else
  pass_tag
  echo " No duplicates"
fi

echo ""
if (( issue_count > 0 )); then
  echo -e "${RED}$issue_count issue(s) detected — check output above${NC}"
else
  echo -e "${GREEN}✓ All checks clean${NC}"
fi