mirror of https://github.com/syntrex-lab/gomcp.git
synced 2026-05-09 19:42:37 +02:00
331 lines · 8.5 KiB · Go
// Package watchdog implements the SEC-004 Watchdog Mesh Framework.
//
// Mutual monitoring between SOC agents (immune, sidecar, shield)
// with automatic restart escalation:
//
// 1. Heartbeat check every 30s
// 2. 3 missed heartbeats → attempt systemd restart
// 3. 3 failed restarts → eBPF isolation + CRITICAL alert
// 4. Architect notification via webhook
//
// Each agent registers as a peer and monitors all others.
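//
// A minimal wiring sketch (agent names, ports, and the /healthz paths are
// illustrative, not fixed by the framework):
//
//	m := watchdog.NewMonitor("immune")
//	m.RegisterPeer("sidecar", "http://127.0.0.1:9102/healthz")
//	m.RegisterPeer("shield", "http://127.0.0.1:9103/healthz")
//	m.OnEscalation(func(a watchdog.EscalationAction) {
//		// restart / isolate / notify, see the handler sketches below
//	})
//	go m.Start(ctx) // ctx controls shutdown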
package watchdog

import (
	"context"
	"encoding/json"
	"fmt"
	"log/slog"
	"net/http"
	"sync"
	"time"
)

// PeerStatus defines the health state of a peer.
type PeerStatus string

const (
	StatusHealthy  PeerStatus = "HEALTHY"
	StatusDegraded PeerStatus = "DEGRADED"
	StatusOffline  PeerStatus = "OFFLINE"
	StatusIsolated PeerStatus = "ISOLATED"

	// DefaultHeartbeatInterval is the check interval.
	DefaultHeartbeatInterval = 30 * time.Second

	// MaxMissedBeforeRestart triggers auto-restart.
	MaxMissedBeforeRestart = 3

	// MaxRestartsBeforeIsolate triggers eBPF isolation.
	MaxRestartsBeforeIsolate = 3
)
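
// With these defaults, a peer that stops responding is restarted for the
// first time after ~90s (3 checks × 30s) and reaches eBPF isolation after
// roughly 6 minutes: three restart cycles of ~90s each, then a final three
// missed beats, assuming the peer never recovers in between.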

// PeerHealth tracks the health state of a single peer agent.
type PeerHealth struct {
	Name           string     `json:"name"`
	Endpoint       string     `json:"endpoint"` // HTTP health endpoint
	Status         PeerStatus `json:"status"`
	LastSeen       time.Time  `json:"last_seen"`
	MissedCount    int        `json:"missed_count"`
	RestartCount   int        `json:"restart_count"`
	LastRestart    time.Time  `json:"last_restart,omitempty"`
	ResponseTimeMs int64      `json:"response_time_ms"`
}

// EscalationHandler is called when a peer requires an escalation action.
type EscalationHandler func(action EscalationAction)

// EscalationAction describes what the mesh decided to do.
type EscalationAction struct {
	Timestamp time.Time `json:"timestamp"`
	PeerName  string    `json:"peer_name"`
	Action    string    `json:"action"` // restart, isolate, alert_architect
	Reason    string    `json:"reason"`
	Severity  string    `json:"severity"`
}

// Monitor is the watchdog mesh peer monitor.
type Monitor struct {
	mu         sync.RWMutex
	selfName   string
	peers      map[string]*PeerHealth
	interval   time.Duration
	handlers   []EscalationHandler
	httpClient *http.Client
	logger     *slog.Logger
	stats      MonitorStats
}

// MonitorStats tracks mesh health metrics.
type MonitorStats struct {
	mu              sync.Mutex
	TotalChecks     int64     `json:"total_checks"`
	TotalMisses     int64     `json:"total_misses"`
	TotalRestarts   int64     `json:"total_restarts"`
	TotalIsolations int64     `json:"total_isolations"`
	StartedAt       time.Time `json:"started_at"`
	PeerCount       int       `json:"peer_count"`
}

// NewMonitor creates a new watchdog mesh monitor.
func NewMonitor(selfName string) *Monitor {
	return &Monitor{
		selfName: selfName,
		peers:    make(map[string]*PeerHealth),
		interval: DefaultHeartbeatInterval,
		httpClient: &http.Client{
			Timeout: 5 * time.Second,
		},
		logger: slog.Default().With("component", "sec-004-watchdog", "self", selfName),
		stats: MonitorStats{
			StartedAt: time.Now(),
		},
	}
}

// RegisterPeer adds a peer agent to the monitoring mesh.
func (m *Monitor) RegisterPeer(name, endpoint string) {
	m.mu.Lock()
	defer m.mu.Unlock()

	m.peers[name] = &PeerHealth{
		Name:     name,
		Endpoint: endpoint,
		Status:   StatusHealthy,
		LastSeen: time.Now(),
	}
	// PeerCount is read under stats.mu in Stats(), so update it under the
	// same lock to avoid a data race.
	m.stats.mu.Lock()
	m.stats.PeerCount = len(m.peers)
	m.stats.mu.Unlock()
	m.logger.Info("peer registered", "peer", name, "endpoint", endpoint)
}

// OnEscalation registers a handler for escalation events.
func (m *Monitor) OnEscalation(h EscalationHandler) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.handlers = append(m.handlers, h)
}
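
// A handler sketch for step 4 of the ladder, architect notification via
// webhook. The URL and the CRITICAL-only filter are illustrative policy
// choices, not framework behavior:
//
//	m.OnEscalation(func(a watchdog.EscalationAction) {
//		if a.Severity != "CRITICAL" {
//			return
//		}
//		go func() { // handlers run inside the check loop; don't block it
//			payload, _ := json.Marshal(a)
//			resp, err := http.Post("https://architect.example/hook", "application/json", bytes.NewReader(payload))
//			if err == nil {
//				resp.Body.Close()
//			}
//		}()
//	})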

// Start begins the heartbeat monitoring loop. It blocks until ctx is
// cancelled, so it is typically run in its own goroutine.
func (m *Monitor) Start(ctx context.Context) {
	m.logger.Info("watchdog mesh started",
		"interval", m.interval,
		"peers", m.peerNames(),
	)

	ticker := time.NewTicker(m.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			m.logger.Info("watchdog mesh stopped")
			return
		case <-ticker.C:
			m.checkAllPeers(ctx)
		}
	}
}

// checkAllPeers performs a health check on every registered peer.
func (m *Monitor) checkAllPeers(ctx context.Context) {
	m.mu.RLock()
	peers := make([]*PeerHealth, 0, len(m.peers))
	for _, p := range m.peers {
		peers = append(peers, p)
	}
	m.mu.RUnlock()

	for _, peer := range peers {
		m.checkPeer(ctx, peer)
	}
}
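
// Note: the sweep above is sequential. With the 5s HTTP client timeout, a
// pass over N unresponsive peers can block for up to ~5*N seconds, which
// still fits comfortably inside the 30s interval for a three-agent mesh.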

// checkPeer performs a single health check on a peer.
func (m *Monitor) checkPeer(ctx context.Context, peer *PeerHealth) {
	m.stats.mu.Lock()
	m.stats.TotalChecks++
	m.stats.mu.Unlock()

	start := time.Now()
	healthy := m.pingPeer(ctx, peer.Endpoint)
	elapsed := time.Since(start)

	m.mu.Lock()
	defer m.mu.Unlock()

	if healthy {
		peer.Status = StatusHealthy
		peer.LastSeen = time.Now()
		peer.MissedCount = 0
		peer.ResponseTimeMs = elapsed.Milliseconds()
		return
	}

	// Missed heartbeat.
	peer.MissedCount++
	m.stats.mu.Lock()
	m.stats.TotalMisses++
	m.stats.mu.Unlock()

	m.logger.Warn("peer missed heartbeat",
		"peer", peer.Name,
		"missed", peer.MissedCount,
		"last_seen", peer.LastSeen,
	)

	// Escalation ladder.
	switch {
	case peer.MissedCount >= MaxMissedBeforeRestart && peer.RestartCount >= MaxRestartsBeforeIsolate:
		// Level 3: Isolate via eBPF + alert architect.
		peer.Status = StatusIsolated
		m.stats.mu.Lock()
		m.stats.TotalIsolations++
		m.stats.mu.Unlock()

		m.escalate(EscalationAction{
			Timestamp: time.Now(),
			PeerName:  peer.Name,
			Action:    "isolate",
			Reason:    fmt.Sprintf("peer %s offline after %d restarts — eBPF isolation engaged", peer.Name, peer.RestartCount),
			Severity:  "CRITICAL",
		})

	case peer.MissedCount >= MaxMissedBeforeRestart:
		// Level 2: Attempt restart.
		peer.Status = StatusOffline
		peer.RestartCount++
		peer.LastRestart = time.Now()
		m.stats.mu.Lock()
		m.stats.TotalRestarts++
		m.stats.mu.Unlock()

		m.escalate(EscalationAction{
			Timestamp: time.Now(),
			PeerName:  peer.Name,
			Action:    "restart",
			Reason:    fmt.Sprintf("peer %s missed %d heartbeats — restart attempt %d", peer.Name, peer.MissedCount, peer.RestartCount),
			Severity:  "HIGH",
		})
		peer.MissedCount = 0 // Reset after restart attempt.

	default:
		// Level 1: Mark degraded.
		peer.Status = StatusDegraded
		m.escalate(EscalationAction{
			Timestamp: time.Now(),
			PeerName:  peer.Name,
			Action:    "alert",
			Reason:    fmt.Sprintf("peer %s missed %d heartbeat(s)", peer.Name, peer.MissedCount),
			Severity:  "MEDIUM",
		})
	}
}

// pingPeer sends an HTTP GET to the peer's health endpoint.
func (m *Monitor) pingPeer(ctx context.Context, endpoint string) bool {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return false
	}

	resp, err := m.httpClient.Do(req)
	if err != nil {
		return false
	}
	defer resp.Body.Close()

	return resp.StatusCode == http.StatusOK
}

// escalate notifies all registered handlers and logs the action.
func (m *Monitor) escalate(action EscalationAction) {
	m.logger.Warn("WATCHDOG ESCALATION",
		"peer", action.PeerName,
		"action", action.Action,
		"severity", action.Severity,
		"reason", action.Reason,
	)

	// escalate is called from checkPeer with m.mu already held, so reading
	// m.handlers here is safe. Handlers run synchronously under that lock
	// and must not call back into methods that take m.mu.
	handlers := m.handlers
	for _, h := range handlers {
		h(action)
	}
}
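
// A handler sketch for step 2 of the ladder, the systemd restart. The unit
// naming scheme (peer name + ".service") is an assumption; real unit names
// are deployment-specific:
//
//	m.OnEscalation(func(a watchdog.EscalationAction) {
//		if a.Action != "restart" {
//			return
//		}
//		go func() { // don't block the check loop
//			if err := exec.Command("systemctl", "restart", a.PeerName+".service").Run(); err != nil {
//				slog.Error("restart failed", "peer", a.PeerName, "err", err)
//			}
//		}()
//	})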

// GetPeerStatus returns the current status of a specific peer.
func (m *Monitor) GetPeerStatus(name string) (*PeerHealth, bool) {
	m.mu.RLock()
	defer m.mu.RUnlock()
	p, ok := m.peers[name]
	if !ok {
		return nil, false
	}
	cp := *p // Return a copy.
	return &cp, true
}

// AllPeers returns a snapshot of all peer health states.
func (m *Monitor) AllPeers() []PeerHealth {
	m.mu.RLock()
	defer m.mu.RUnlock()

	result := make([]PeerHealth, 0, len(m.peers))
	for _, p := range m.peers {
		result = append(result, *p)
	}
	return result
}

// Stats returns current watchdog metrics.
func (m *Monitor) Stats() MonitorStats {
	m.stats.mu.Lock()
	defer m.stats.mu.Unlock()
	return MonitorStats{
		TotalChecks:     m.stats.TotalChecks,
		TotalMisses:     m.stats.TotalMisses,
		TotalRestarts:   m.stats.TotalRestarts,
		TotalIsolations: m.stats.TotalIsolations,
		StartedAt:       m.stats.StartedAt,
		PeerCount:       m.stats.PeerCount,
	}
}

// ServeHTTP provides the mesh status as JSON (for embedding in other servers).
func (m *Monitor) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(map[string]any{
		"self":  m.selfName,
		"peers": m.AllPeers(),
		"stats": m.Stats(),
	})
}
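
// Typically embedded next to the agent's own health endpoint, e.g. (paths
// are illustrative):
//
//	mux := http.NewServeMux()
//	mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
//		w.WriteHeader(http.StatusOK)
//	})
//	mux.Handle("/watchdog/status", monitor)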

// peerNames returns a snapshot of registered peer names. It takes the read
// lock itself, since Start calls it without holding m.mu.
func (m *Monitor) peerNames() []string {
	m.mu.RLock()
	defer m.mu.RUnlock()

	names := make([]string, 0, len(m.peers))
	for n := range m.peers {
		names = append(names, n)
	}
	return names
}