// Package watchdog implements the SEC-004 Watchdog Mesh Framework.
//
// Mutual monitoring between SOC agents (immune, sidecar, shield)
// with automatic restart escalation:
//
//  1. Heartbeat check every 30s
//  2. 3 missed heartbeats → attempt systemd restart
//  3. 3 failed restarts → eBPF isolation + CRITICAL alert
//  4. Architect notification via webhook
//
// Each agent registers as a peer and monitors all others.
package watchdog

import (
	"context"
	"encoding/json"
	"fmt"
	"log/slog"
	"net/http"
	"sync"
	"time"
)

// PeerStatus defines the health state of a peer.
type PeerStatus string

const (
	StatusHealthy  PeerStatus = "HEALTHY"
	StatusDegraded PeerStatus = "DEGRADED"
	StatusOffline  PeerStatus = "OFFLINE"
	StatusIsolated PeerStatus = "ISOLATED"
)

const (
	// DefaultHeartbeatInterval is the check interval.
	DefaultHeartbeatInterval = 30 * time.Second

	// MaxMissedBeforeRestart is the number of consecutive missed
	// heartbeats that triggers an auto-restart.
	MaxMissedBeforeRestart = 3

	// MaxRestartsBeforeIsolate is the number of failed restarts that
	// triggers eBPF isolation.
	MaxRestartsBeforeIsolate = 3
)

// PeerHealth tracks the health state of a single peer agent.
type PeerHealth struct {
	Name           string     `json:"name"`
	Endpoint       string     `json:"endpoint"` // HTTP health endpoint
	Status         PeerStatus `json:"status"`
	LastSeen       time.Time  `json:"last_seen"`
	MissedCount    int        `json:"missed_count"`
	RestartCount   int        `json:"restart_count"`
	LastRestart    time.Time  `json:"last_restart,omitempty"`
	ResponseTimeMs int64      `json:"response_time_ms"`
}

// EscalationHandler is called when a peer requires escalation action.
type EscalationHandler func(action EscalationAction)

// EscalationAction describes what the mesh decided to do.
type EscalationAction struct {
	Timestamp time.Time `json:"timestamp"`
	PeerName  string    `json:"peer_name"`
	Action    string    `json:"action"` // restart, isolate, alert_architect
	Reason    string    `json:"reason"`
	Severity  string    `json:"severity"`
}

// Monitor is the watchdog mesh peer monitor.
type Monitor struct {
	mu         sync.RWMutex
	selfName   string
	peers      map[string]*PeerHealth
	interval   time.Duration
	handlers   []EscalationHandler
	httpClient *http.Client
	logger     *slog.Logger
	stats      MonitorStats
}

// MonitorStats tracks mesh health metrics.
type MonitorStats struct {
	mu              sync.Mutex
	TotalChecks     int64     `json:"total_checks"`
	TotalMisses     int64     `json:"total_misses"`
	TotalRestarts   int64     `json:"total_restarts"`
	TotalIsolations int64     `json:"total_isolations"`
	StartedAt       time.Time `json:"started_at"`
	PeerCount       int       `json:"peer_count"`
}

// NewMonitor creates a new watchdog mesh monitor.
func NewMonitor(selfName string) *Monitor {
	return &Monitor{
		selfName: selfName,
		peers:    make(map[string]*PeerHealth),
		interval: DefaultHeartbeatInterval,
		httpClient: &http.Client{
			Timeout: 5 * time.Second,
		},
		logger: slog.Default().With("component", "sec-004-watchdog", "self", selfName),
		stats: MonitorStats{
			StartedAt: time.Now(),
		},
	}
}

// RegisterPeer adds a peer agent to the monitoring mesh.
func (m *Monitor) RegisterPeer(name, endpoint string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.peers[name] = &PeerHealth{
		Name:     name,
		Endpoint: endpoint,
		Status:   StatusHealthy,
		LastSeen: time.Now(),
	}
	// PeerCount is read under stats.mu in Stats, so guard the write too.
	m.stats.mu.Lock()
	m.stats.PeerCount = len(m.peers)
	m.stats.mu.Unlock()
	m.logger.Info("peer registered", "peer", name, "endpoint", endpoint)
}

// OnEscalation registers a handler for escalation events.
func (m *Monitor) OnEscalation(h EscalationHandler) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.handlers = append(m.handlers, h)
}
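// A minimal wiring sketch, illustrative only: the agent names, the health
// endpoints, and the systemctl-based restart are assumptions about the
// deployment, not part of this package.
//
//	mon := watchdog.NewMonitor("immune")
//	mon.RegisterPeer("sidecar", "http://127.0.0.1:8081/healthz")
//	mon.RegisterPeer("shield", "http://127.0.0.1:8082/healthz")
//	mon.OnEscalation(func(a watchdog.EscalationAction) {
//		if a.Action == "restart" {
//			// e.g. exec.Command("systemctl", "restart", a.PeerName).Run()
//		}
//	})
//	go mon.Start(ctx)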
// Start begins the heartbeat monitoring loop. It blocks until ctx is
// cancelled, so callers typically run it in its own goroutine.
func (m *Monitor) Start(ctx context.Context) {
	m.logger.Info("watchdog mesh started",
		"interval", m.interval,
		"peers", m.peerNames(),
	)

	ticker := time.NewTicker(m.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			m.logger.Info("watchdog mesh stopped")
			return
		case <-ticker.C:
			m.checkAllPeers(ctx)
		}
	}
}

// checkAllPeers performs a health check on every registered peer.
func (m *Monitor) checkAllPeers(ctx context.Context) {
	m.mu.RLock()
	peers := make([]*PeerHealth, 0, len(m.peers))
	for _, p := range m.peers {
		peers = append(peers, p)
	}
	m.mu.RUnlock()

	for _, peer := range peers {
		m.checkPeer(ctx, peer)
	}
}

// checkPeer performs a single health check on a peer and walks the
// escalation ladder on a miss.
func (m *Monitor) checkPeer(ctx context.Context, peer *PeerHealth) {
	m.stats.mu.Lock()
	m.stats.TotalChecks++
	m.stats.mu.Unlock()

	start := time.Now()
	healthy := m.pingPeer(ctx, peer.Endpoint)
	elapsed := time.Since(start)

	m.mu.Lock()
	defer m.mu.Unlock()

	if healthy {
		peer.Status = StatusHealthy
		peer.LastSeen = time.Now()
		peer.MissedCount = 0
		peer.ResponseTimeMs = elapsed.Milliseconds()
		return
	}

	// An isolated peer stays isolated; re-escalating on every tick would
	// repeat the CRITICAL alert and inflate TotalIsolations.
	if peer.Status == StatusIsolated {
		return
	}

	// Missed heartbeat.
	peer.MissedCount++
	m.stats.mu.Lock()
	m.stats.TotalMisses++
	m.stats.mu.Unlock()

	m.logger.Warn("peer missed heartbeat",
		"peer", peer.Name,
		"missed", peer.MissedCount,
		"last_seen", peer.LastSeen,
	)

	// Escalation ladder.
	switch {
	case peer.MissedCount >= MaxMissedBeforeRestart && peer.RestartCount >= MaxRestartsBeforeIsolate:
		// Level 3: Isolate via eBPF + alert architect.
		peer.Status = StatusIsolated
		m.stats.mu.Lock()
		m.stats.TotalIsolations++
		m.stats.mu.Unlock()
		m.escalate(EscalationAction{
			Timestamp: time.Now(),
			PeerName:  peer.Name,
			Action:    "isolate",
			Reason:    fmt.Sprintf("peer %s offline after %d restarts — eBPF isolation engaged", peer.Name, peer.RestartCount),
			Severity:  "CRITICAL",
		})

	case peer.MissedCount >= MaxMissedBeforeRestart:
		// Level 2: Attempt restart.
		peer.Status = StatusOffline
		peer.RestartCount++
		peer.LastRestart = time.Now()
		m.stats.mu.Lock()
		m.stats.TotalRestarts++
		m.stats.mu.Unlock()
		m.escalate(EscalationAction{
			Timestamp: time.Now(),
			PeerName:  peer.Name,
			Action:    "restart",
			Reason:    fmt.Sprintf("peer %s missed %d heartbeats — restart attempt %d", peer.Name, peer.MissedCount, peer.RestartCount),
			Severity:  "HIGH",
		})
		peer.MissedCount = 0 // Reset after restart attempt.

	default:
		// Level 1: Mark degraded.
		peer.Status = StatusDegraded
		m.escalate(EscalationAction{
			Timestamp: time.Now(),
			PeerName:  peer.Name,
			Action:    "alert",
			Reason:    fmt.Sprintf("peer %s missed %d heartbeat(s)", peer.Name, peer.MissedCount),
			Severity:  "MEDIUM",
		})
	}
}

// pingPeer sends an HTTP GET to the peer's health endpoint and reports
// whether it answered 200 OK within the client timeout.
func (m *Monitor) pingPeer(ctx context.Context, endpoint string) bool {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return false
	}
	resp, err := m.httpClient.Do(req)
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	return resp.StatusCode == http.StatusOK
}

// escalate logs the action and notifies all registered handlers. It is
// always called with m.mu held, so reading m.handlers is safe; handlers
// run synchronously under that lock and must not call back into Monitor
// methods that acquire it.
func (m *Monitor) escalate(action EscalationAction) {
	m.logger.Warn("WATCHDOG ESCALATION",
		"peer", action.PeerName,
		"action", action.Action,
		"severity", action.Severity,
		"reason", action.Reason,
	)
	for _, h := range m.handlers {
		h(action)
	}
}
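// Example architect notification via webhook, a sketch of escalation point 4
// in the package doc. architectWebhookURL is a hypothetical variable, the
// CRITICAL-only filter is a policy assumption, and the goroutine exists
// because handlers run under the monitor's lock and must not block it.
//
//	mon.OnEscalation(func(a watchdog.EscalationAction) {
//		if a.Severity != "CRITICAL" {
//			return
//		}
//		go func() {
//			body, err := json.Marshal(a)
//			if err != nil {
//				return
//			}
//			resp, err := http.Post(architectWebhookURL, "application/json", bytes.NewReader(body))
//			if err == nil {
//				resp.Body.Close()
//			}
//		}()
//	})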
// GetPeerStatus returns a copy of the current health state of a specific peer.
func (m *Monitor) GetPeerStatus(name string) (*PeerHealth, bool) {
	m.mu.RLock()
	defer m.mu.RUnlock()
	p, ok := m.peers[name]
	if !ok {
		return nil, false
	}
	cp := *p // Return a copy so callers cannot mutate mesh state.
	return &cp, true
}

// AllPeers returns a snapshot of all peer health states.
func (m *Monitor) AllPeers() []PeerHealth {
	m.mu.RLock()
	defer m.mu.RUnlock()
	result := make([]PeerHealth, 0, len(m.peers))
	for _, p := range m.peers {
		result = append(result, *p)
	}
	return result
}

// Stats returns a copy of the current watchdog metrics.
func (m *Monitor) Stats() MonitorStats {
	m.stats.mu.Lock()
	defer m.stats.mu.Unlock()
	return MonitorStats{
		TotalChecks:     m.stats.TotalChecks,
		TotalMisses:     m.stats.TotalMisses,
		TotalRestarts:   m.stats.TotalRestarts,
		TotalIsolations: m.stats.TotalIsolations,
		StartedAt:       m.stats.StartedAt,
		PeerCount:       m.stats.PeerCount,
	}
}

// ServeHTTP serves the mesh status as JSON, so a Monitor can be mounted
// directly in an existing HTTP server.
func (m *Monitor) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "application/json")
	_ = json.NewEncoder(w).Encode(map[string]any{
		"self":  m.selfName,
		"peers": m.AllPeers(),
		"stats": m.Stats(),
	})
}

// peerNames returns the names of all registered peers.
func (m *Monitor) peerNames() []string {
	m.mu.RLock()
	defer m.mu.RUnlock()
	names := make([]string, 0, len(m.peers))
	for n := range m.peers {
		names = append(names, n)
	}
	return names
}
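// Mounting the status endpoint in an existing server (a sketch; the mux
// path and listen address are arbitrary):
//
//	mux := http.NewServeMux()
//	mux.Handle("/watchdog/status", mon)
//	log.Fatal(http.ListenAndServe(":9090", mux))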