feat: add monitor.sh — daemon health, topology, sleep cycles, edge health
This commit is contained in:
parent
7f01b8824e
commit
46a2de0787
1 changed files with 227 additions and 0 deletions
227
scripts/monitor.sh
Executable file
227
scripts/monitor.sh
Executable file
|
|
@ -0,0 +1,227 @@
|
|||
#!/bin/bash
|
||||
# IAI-MCP daemon monitor — runs doctor + topology + sleep cycle status
|
||||
# Usage: ./monitor.sh [--cron] [--json]
|
||||
set -euo pipefail
|
||||
|
||||
# Load Qdrant env from systemd service file (needed for QdrantStore in this script)
|
||||
# Export the variables declared by Environment="NAME=value" lines of a
# systemd unit file into this shell's environment.
# Arguments: $1 - path to the unit file
# Returns:   0 always; a missing file is treated as "nothing to load"
# Only the double-quoted, one-assignment-per-line form is recognised; any
# other line (including unquoted Environment= entries) is skipped.
load_service_env() {
  local unit_file=$1
  local line
  # Kept in a variable so the quotes and parentheses need no escaping inside
  # [[ =~ ]]. The name part is a valid shell identifier, so names containing
  # digits or lowercase letters are accepted and `export` cannot be handed
  # an invalid name.
  local -r assignment='^Environment="([A-Za-z_][A-Za-z0-9_]*)=(.*)"$'

  [[ -f "$unit_file" ]] || return 0

  # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line%$'\r'}   # tolerate CRLF line endings
    if [[ "$line" =~ $assignment ]]; then
      export "${BASH_REMATCH[1]}=${BASH_REMATCH[2]}"
    fi
  done < "$unit_file"
}

load_service_env "$HOME/.config/systemd/user/iai-mcp-daemon.service"
|
||||
|
||||
# Locations. VENV may be overridden from the environment; everything else is
# derived from it or from $HOME.
VENV="${VENV:-$HOME/.venv/iai-mcp}"
PYTHON="$VENV/bin/python3"
STATE_DIR="$HOME/.iai-mcp"
LOGS_DIR="$STATE_DIR/logs"

# Take date and time from a single `date` call so the two values always
# describe the same instant (two separate calls can straddle midnight).
_now=$(date '+%Y-%m-%d %H:%M')
DATE=${_now% *}
HOUR=${_now#* }

# Colors — stored as literal backslash escapes; they only become terminal
# control sequences when printed through `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'
|
||||
|
||||
# Command-line switches. Both default to false and are flipped when the
# matching flag appears anywhere in the arguments; anything else is ignored.
# NOTE(review): is_cron is recorded here but nothing in the visible script
# reads it — confirm whether --cron is meant to change the output.
is_cron=false
is_json=false
for arg; do
  if [[ "$arg" == "--cron" ]]; then
    is_cron=true
  elif [[ "$arg" == "--json" ]]; then
    is_json=true
  fi
done
|
||||
|
||||
# --- Helper functions ---
|
||||
# Print the green "[PASS]" marker followed by a newline (reads GREEN and NC).
pass_tag() { printf '%b\n' "${GREEN}[PASS]${NC}"; }
|
||||
# Print the red "[FAIL]" marker followed by a newline (reads RED and NC).
fail_tag() { printf '%b\n' "${RED}[FAIL]${NC}"; }
|
||||
# Print the yellow "[WARN]" marker followed by a newline (reads YELLOW and NC).
warn_tag() { printf '%b\n' "${YELLOW}[WARN]${NC}"; }
|
||||
|
||||
# --- 1. Systemd status ---
|
||||
# Print the first "Active:" line of `systemctl --user status` for the daemon
# unit, or nothing when no such line exists.
# Outputs: the matching line, leading whitespace preserved
# Returns: 0 always
get_systemd_status() {
  local status_out
  # `systemctl status` exits non-zero when the unit is stopped, failed or
  # unknown. That is precisely the state this monitor must report, so the
  # exit code is ignored here; otherwise errexit + pipefail would make the
  # caller's `var=$(get_systemd_status)` terminate the whole script.
  status_out=$(systemctl --user status iai-mcp-daemon.service 2>&1) || true
  # grep exits 1 when there is no match; an empty result is a valid answer.
  grep -m 1 "Active:" <<<"$status_out" || true
}
|
||||
|
||||
# --- 2. Doctor output ---
|
||||
# Run `iai_mcp.cli doctor` with the venv interpreter and print its combined
# stdout/stderr.
# Globals: PYTHON (read)
# Returns: 0 always — the caller inspects the text, not the exit code.
run_doctor() {
  # PYTHON is quoted so an interpreter path containing spaces is not split
  # into several words.
  "$PYTHON" -m iai_mcp.cli doctor 2>&1 || true
}
|
||||
|
||||
# --- 3. Topology output ---
|
||||
# Run `iai_mcp.cli topology` with the venv interpreter and print its combined
# stdout/stderr.
# Globals: PYTHON (read)
# Returns: 0 always — a failing CLI still yields whatever text it produced.
run_topology() {
  # PYTHON is quoted so an interpreter path containing spaces is not split
  # into several words.
  "$PYTHON" -m iai_mcp.cli topology 2>&1 || true
}
|
||||
|
||||
# --- 4. Sleep cycle results for today ---
|
||||
# Summarise today's sleep-cycle steps from the lifecycle event log.
# Globals: LOGS_DIR, DATE, PYTHON (read)
# Outputs: one " [STEP] ok" / " [STEP] FAIL <error>" line per known step that
#          has a sleep_step_completed event, or a one-line notice when the
#          log is missing, unreadable or holds no such events.
# When a step appears more than once, its last event in the file wins.
# Steps that are not in the fixed ORDER list are not printed.
run_sleep_cycles() {
  local logfile="$LOGS_DIR/lifecycle-events-${DATE}.jsonl"
  if [[ ! -f "$logfile" ]]; then
    echo " No sleep cycle log for ${DATE}"
    return
  fi
  # The Python source is single-quoted so the shell expands nothing inside
  # it; the log path travels as an argument instead.
  "$PYTHON" -c '
import json
import sys

ORDER = ["SCHEMA_MINE", "KNOB_TUNE", "DREAM_DECAY", "OPTIMIZE_LANCE", "COMPACT_RECORDS"]

try:
    with open(sys.argv[1], encoding="utf-8", errors="replace") as fh:
        lines = fh.readlines()
except OSError:
    print(" Empty or unreadable log")
    sys.exit(0)

# step name -> error text ("" means the step succeeded)
steps = {}
for raw in lines:
    try:
        ev = json.loads(raw)
    except ValueError:
        continue
    # A line can be valid JSON without being an object (list, number, null);
    # such lines carry no event and are skipped instead of crashing.
    if not isinstance(ev, dict) or ev.get("event") != "sleep_step_completed":
        continue
    step = ev.get("step", "")
    if not isinstance(step, str):
        continue
    err = ev.get("error") or ""
    # Errors may be structured (dict/list); stringify so they can be truncated.
    steps[step] = err if isinstance(err, str) else str(err)

if not steps:
    print(" No sleep steps completed today")
    sys.exit(0)

for name in ORDER:
    if name in steps:
        err = steps[name]
        status = "FAIL" if err else "ok"
        detail = f" {err[:60]}" if err else ""
        print(f" [{name}] {status}{detail}")
' "$logfile"
}
|
||||
|
||||
# --- 5. Memory / RSS ---
|
||||
# Print the resident set size of the daemon's main process.
# Outputs: "<n> MB" (VmRSS from /proc/<pid>/status, kB integer-divided by
#          1024), or "N/A" when the PID or the VmRSS field cannot be found.
# Returns: 0 always
get_rss() {
  local status_out pid="" rss_kb=""

  # A stopped unit makes `systemctl status` exit non-zero; that must lead to
  # "N/A", not to an errexit abort, so the exit code is ignored.
  status_out=$(systemctl --user status iai-mcp-daemon.service 2>/dev/null) || true

  # First "Main PID: <digits>" occurrence in the status text.
  if [[ "$status_out" =~ Main\ PID:\ ([0-9]+) ]]; then
    pid=${BASH_REMATCH[1]}
  fi

  if [[ -n "$pid" && -d "/proc/$pid" ]]; then
    # awk reads the file directly; `exit` stops after the first VmRSS line.
    # The process may vanish between the checks, hence the tolerant `|| true`.
    rss_kb=$(awk '/^VmRSS:/ { print $2; exit }' "/proc/$pid/status" 2>/dev/null) || true
    if [[ "$rss_kb" =~ ^[0-9]+$ ]]; then
      echo "$(( rss_kb / 1024 )) MB"
      return
    fi
  fi
  echo "N/A"
}
|
||||
|
||||
# --- 6. Duplicate edge count ---
|
||||
# Count duplicate edges in the store.
# Globals: PYTHON (read)
# Outputs: the number of rows whose (src, dst, edge_type) triple repeats an
#          earlier row; "ERR:<message>" when reading the edges raises; or
#          "ERR:unreachable" when the interpreter itself exits non-zero
#          (for example the import or the QdrantStore() constructor failing).
# Returns: 0 always
# NOTE(review): edges_as_dataframe() is presumably a pandas DataFrame, judging
# by the .empty/.apply usage — confirm against iai_mcp.qdrant_store.
get_dup_edges() {
  # PYTHON is quoted so a path with spaces stays one word. stderr is dropped
  # deliberately: callers parse stdout and any failure is already reported
  # through the ERR: prefix.
  "$PYTHON" -c '
import sys
from iai_mcp.qdrant_store import QdrantStore

store = QdrantStore()
try:
    df = store.edges_as_dataframe()
    if df.empty:
        print(0)
        sys.exit(0)
    keys = df.apply(
        lambda r: (str(r["src"]), str(r["dst"]), str(r.get("edge_type", ""))),
        axis=1,
    )
    dups = len(keys) - len(keys.unique())
    print(dups)
except Exception as ex:
    print(f"ERR:{ex}")
' 2>/dev/null || echo "ERR:unreachable"
}
|
||||
|
||||
# --- Main ---
|
||||
|
||||
# Print the monitor report as one JSON document on stdout.
# Globals: PYTHON, HOUR, DATE (read)
# Fields:  timestamp, date, daemon ("running"/"DOWN"), doctor_exit,
#          topology_output, plus one entry per "key: value" line of the
#          topology output.
emit_json_report() {
  # Values are handed to Python through argv rather than spliced into the
  # source text, so a quote or backslash in a path cannot break or alter the
  # program. The source is single-quoted: the shell expands nothing in it.
  python3 -c '
import json
import subprocess
import sys

venv_python, hour, date = sys.argv[1:4]


def run(*cmd):
    # A missing executable is reported as exit code 127 instead of a
    # traceback, so the caller still receives a JSON document.
    try:
        return subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        return subprocess.CompletedProcess(cmd, 127, "", str(exc))


doctor = run(venv_python, "-m", "iai_mcp.cli", "doctor")
topo = run(venv_python, "-m", "iai_mcp.cli", "topology")
sysd = run("systemctl", "--user", "status", "iai-mcp-daemon.service")

result = {
    "timestamp": hour,
    "date": date,
    "daemon": "running" if "active (running)" in sysd.stdout else "DOWN",
    "doctor_exit": doctor.returncode,
    "topology_output": topo.stdout.strip(),
}

# Parse topology: "key: value" lines become top-level fields, but they must
# never overwrite the status fields set above.
reserved = set(result)
for line in topo.stdout.strip().split("\n"):
    if ":" in line:
        key, value = line.split(":", 1)
        key = key.strip()
        if key not in reserved:
            result[key] = value.strip()

print(json.dumps(result, indent=2))
' "$PYTHON" "$HOUR" "$DATE"
}

if [[ "$is_json" == true ]]; then
  # JSON output for cron/automation
  emit_json_report
  exit 0
fi
|
||||
|
||||
# --- Human-readable output ---
|
||||
# Human-readable report: banner, then one section per check, then a summary.
echo -e "${CYAN}╔══════════════════════════════════════════════╗${NC}"
echo -e "${CYAN}║ IAI-MCP Daemon Monitor — ${HOUR} | ${DATE} ║${NC}"
echo -e "${CYAN}╚══════════════════════════════════════════════╝${NC}"
echo ""

# Systemd
echo -e "${CYAN}── Systemd ──${NC}"
# The status lookup can return non-zero when the unit is not running; that
# is the case this section exists to report, so it must not trip errexit.
systemd_line=$(get_systemd_status) || true
daemon_down=0
if [[ "$systemd_line" == *"active (running)"* ]]; then
  pass_tag
else
  fail_tag
  daemon_down=1
fi
# Strip leading whitespace: remove the longest prefix that precedes the
# first non-space character.
systemd_line=${systemd_line#"${systemd_line%%[![:space:]]*}"}
echo " $systemd_line"

# Memory
echo -e "${CYAN}── Memory ──${NC}"
echo " RSS: $(get_rss)"

# Doctor
echo -e "${CYAN}── Doctor ──${NC}"
doctor_output=$(run_doctor)
# grep -c prints 0 but exits 1 when nothing matches, hence `|| true`.
pass_count=$(grep -c '\[PASS\]' <<<"$doctor_output" || true)
fail_count=$(grep -c '\[FAIL\]' <<<"$doctor_output" || true)
warn_count=$(grep -c '\[WARN\]' <<<"$doctor_output" || true)
echo " $pass_count pass / $fail_count fail / $warn_count warn"
if (( fail_count > 0 )); then
  grep '\[FAIL\]' <<<"$doctor_output" | sed 's/^/ /'
fi

# Topology
echo -e "${CYAN}── Topology ──${NC}"
topo_output=$(run_topology)
sed 's/^/ /' <<<"$topo_output"

# Sleep cycles
echo -e "${CYAN}── Sleep Cycles (today) ──${NC}"
run_sleep_cycles

# Duplicate edges
echo -e "${CYAN}── Edge Health ──${NC}"
dup_count=$(get_dup_edges)
if [[ "$dup_count" == "ERR:"* ]]; then
  warn_tag
  echo " Qdrant unreachable: $dup_count"
elif [[ ! "$dup_count" =~ ^[0-9]+$ ]]; then
  # Anything that is neither ERR:... nor a plain integer (e.g. stray library
  # output) would make the numeric comparison below abort the script.
  warn_tag
  echo " Unexpected edge-count output: $dup_count"
elif (( 10#$dup_count > 0 )); then
  warn_tag
  echo " $dup_count duplicate edge(s) detected"
else
  pass_tag
  echo " No duplicates"
fi

echo ""
# A daemon that is not running counts as an issue alongside doctor failures,
# so the summary cannot claim a clean run while the Systemd section failed.
issue_count=$(( fail_count + daemon_down ))
if (( issue_count > 0 )); then
  echo -e "${RED}⚠ $issue_count issue(s) detected — check output above${NC}"
else
  echo -e "${GREEN}✓ All checks clean${NC}"
fi
|
||||
Loading…
Add table
Add a link
Reference in a new issue