mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-05-10 03:52:36 +02:00
Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates
This commit is contained in:
parent
694e32be26
commit
41cbfd6e0a
178 changed files with 36008 additions and 399 deletions
331
internal/infrastructure/watchdog/watchdog.go
Normal file
331
internal/infrastructure/watchdog/watchdog.go
Normal file
|
|
@ -0,0 +1,331 @@
|
|||
// Package watchdog implements the SEC-004 Watchdog Mesh Framework.
|
||||
//
|
||||
// Mutual monitoring between SOC agents (immune, sidecar, shield)
|
||||
// with automatic restart escalation:
|
||||
//
|
||||
// 1. Heartbeat check every 30s
|
||||
// 2. 3 missed heartbeats → attempt systemd restart
|
||||
// 3. 3 failed restarts → eBPF isolation + CRITICAL alert
|
||||
// 4. Architect notification via webhook
|
||||
//
|
||||
// Each agent registers as a peer and monitors all others.
|
||||
package watchdog
|
||||
|
||||
import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"sync"
	"time"
)
|
||||
|
||||
// PeerStatus defines the health state of a peer.
type PeerStatus string

const (
	// StatusHealthy: the peer answered its most recent heartbeat probe.
	StatusHealthy PeerStatus = "HEALTHY"
	// StatusDegraded: heartbeats missed, but below the restart threshold.
	StatusDegraded PeerStatus = "DEGRADED"
	// StatusOffline: missed-heartbeat threshold reached; restart attempted.
	StatusOffline PeerStatus = "OFFLINE"
	// StatusIsolated: restart attempts exhausted; peer has been isolated.
	StatusIsolated PeerStatus = "ISOLATED"

	// DefaultHeartbeatInterval is the check interval.
	DefaultHeartbeatInterval = 30 * time.Second

	// MaxMissedBeforeRestart triggers auto-restart.
	MaxMissedBeforeRestart = 3

	// MaxRestartsBeforeIsolate triggers eBPF isolation.
	MaxRestartsBeforeIsolate = 3
)
|
||||
|
||||
// PeerHealth tracks the health state of a single peer agent.
//
// Instances live in Monitor.peers and are mutated by the monitor loop
// while holding Monitor.mu.
type PeerHealth struct {
	Name           string     `json:"name"`
	Endpoint       string     `json:"endpoint"` // HTTP health endpoint
	Status         PeerStatus `json:"status"`
	LastSeen       time.Time  `json:"last_seen"`    // time of the last successful probe
	MissedCount    int        `json:"missed_count"` // consecutive missed heartbeats; reset on success or restart
	RestartCount   int        `json:"restart_count"`
	LastRestart    time.Time  `json:"last_restart,omitempty"`
	ResponseTimeMs int64      `json:"response_time_ms"` // latency of the last successful probe
}
|
||||
|
||||
// EscalationHandler is called when a peer requires escalation action.
type EscalationHandler func(action EscalationAction)

// EscalationAction describes what the mesh decided to do.
type EscalationAction struct {
	Timestamp time.Time `json:"timestamp"`
	PeerName  string    `json:"peer_name"`
	Action    string    `json:"action"` // restart, isolate, alert_architect
	Reason    string    `json:"reason"`
	Severity  string    `json:"severity"` // MEDIUM, HIGH, or CRITICAL (see checkPeer)
}
|
||||
|
||||
// Monitor is the watchdog mesh peer monitor.
//
// One Monitor runs inside each agent: it periodically probes every
// registered peer's health endpoint and escalates failures.
type Monitor struct {
	mu         sync.RWMutex // guards peers (NOTE(review): handlers are appended under mu but read without it in escalate — confirm handlers are registered before Start)
	selfName   string
	peers      map[string]*PeerHealth // peer name → mutable health record
	interval   time.Duration          // probe period; DefaultHeartbeatInterval
	handlers   []EscalationHandler    // callbacks registered via OnEscalation
	httpClient *http.Client           // probe client with a 5s timeout
	logger     *slog.Logger
	stats      MonitorStats // mesh-wide counters, guarded by stats.mu
}
|
||||
|
||||
// MonitorStats tracks mesh health metrics.
//
// The unexported mutex guards the counter fields; being unexported it is
// ignored by encoding/json, so snapshots marshal cleanly.
type MonitorStats struct {
	mu              sync.Mutex
	TotalChecks     int64     `json:"total_checks"`     // probes performed
	TotalMisses     int64     `json:"total_misses"`     // probes that failed
	TotalRestarts   int64     `json:"total_restarts"`   // restart escalations issued
	TotalIsolations int64     `json:"total_isolations"` // isolation escalations issued
	StartedAt       time.Time `json:"started_at"`       // monitor creation time
	PeerCount       int       `json:"peer_count"`       // registered peers
}
|
||||
|
||||
// NewMonitor creates a new watchdog mesh monitor.
|
||||
func NewMonitor(selfName string) *Monitor {
|
||||
return &Monitor{
|
||||
selfName: selfName,
|
||||
peers: make(map[string]*PeerHealth),
|
||||
interval: DefaultHeartbeatInterval,
|
||||
httpClient: &http.Client{
|
||||
Timeout: 5 * time.Second,
|
||||
},
|
||||
logger: slog.Default().With("component", "sec-004-watchdog", "self", selfName),
|
||||
stats: MonitorStats{
|
||||
StartedAt: time.Now(),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterPeer adds a peer agent to the monitoring mesh.
|
||||
func (m *Monitor) RegisterPeer(name, endpoint string) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
m.peers[name] = &PeerHealth{
|
||||
Name: name,
|
||||
Endpoint: endpoint,
|
||||
Status: StatusHealthy,
|
||||
LastSeen: time.Now(),
|
||||
}
|
||||
m.stats.PeerCount = len(m.peers)
|
||||
m.logger.Info("peer registered", "peer", name, "endpoint", endpoint)
|
||||
}
|
||||
|
||||
// OnEscalation registers a handler for escalation events.
|
||||
func (m *Monitor) OnEscalation(h EscalationHandler) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.handlers = append(m.handlers, h)
|
||||
}
|
||||
|
||||
// Start begins the heartbeat monitoring loop.
|
||||
func (m *Monitor) Start(ctx context.Context) {
|
||||
m.logger.Info("watchdog mesh started",
|
||||
"interval", m.interval,
|
||||
"peers", m.peerNames(),
|
||||
)
|
||||
|
||||
ticker := time.NewTicker(m.interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
m.logger.Info("watchdog mesh stopped")
|
||||
return
|
||||
case <-ticker.C:
|
||||
m.checkAllPeers(ctx)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// checkAllPeers performs a health check on every registered peer.
|
||||
func (m *Monitor) checkAllPeers(ctx context.Context) {
|
||||
m.mu.RLock()
|
||||
peers := make([]*PeerHealth, 0, len(m.peers))
|
||||
for _, p := range m.peers {
|
||||
peers = append(peers, p)
|
||||
}
|
||||
m.mu.RUnlock()
|
||||
|
||||
for _, peer := range peers {
|
||||
m.checkPeer(ctx, peer)
|
||||
}
|
||||
}
|
||||
|
||||
// checkPeer performs a single health check on a peer.
|
||||
func (m *Monitor) checkPeer(ctx context.Context, peer *PeerHealth) {
|
||||
m.stats.mu.Lock()
|
||||
m.stats.TotalChecks++
|
||||
m.stats.mu.Unlock()
|
||||
|
||||
start := time.Now()
|
||||
healthy := m.pingPeer(ctx, peer.Endpoint)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
|
||||
if healthy {
|
||||
peer.Status = StatusHealthy
|
||||
peer.LastSeen = time.Now()
|
||||
peer.MissedCount = 0
|
||||
peer.ResponseTimeMs = elapsed.Milliseconds()
|
||||
return
|
||||
}
|
||||
|
||||
// Missed heartbeat.
|
||||
peer.MissedCount++
|
||||
m.stats.mu.Lock()
|
||||
m.stats.TotalMisses++
|
||||
m.stats.mu.Unlock()
|
||||
|
||||
m.logger.Warn("peer missed heartbeat",
|
||||
"peer", peer.Name,
|
||||
"missed", peer.MissedCount,
|
||||
"last_seen", peer.LastSeen,
|
||||
)
|
||||
|
||||
// Escalation ladder.
|
||||
switch {
|
||||
case peer.MissedCount >= MaxMissedBeforeRestart && peer.RestartCount >= MaxRestartsBeforeIsolate:
|
||||
// Level 3: Isolate via eBPF + alert architect.
|
||||
peer.Status = StatusIsolated
|
||||
m.stats.mu.Lock()
|
||||
m.stats.TotalIsolations++
|
||||
m.stats.mu.Unlock()
|
||||
|
||||
m.escalate(EscalationAction{
|
||||
Timestamp: time.Now(),
|
||||
PeerName: peer.Name,
|
||||
Action: "isolate",
|
||||
Reason: fmt.Sprintf("peer %s offline after %d restarts — eBPF isolation engaged", peer.Name, peer.RestartCount),
|
||||
Severity: "CRITICAL",
|
||||
})
|
||||
|
||||
case peer.MissedCount >= MaxMissedBeforeRestart:
|
||||
// Level 2: Attempt restart.
|
||||
peer.Status = StatusOffline
|
||||
peer.RestartCount++
|
||||
peer.LastRestart = time.Now()
|
||||
m.stats.mu.Lock()
|
||||
m.stats.TotalRestarts++
|
||||
m.stats.mu.Unlock()
|
||||
|
||||
m.escalate(EscalationAction{
|
||||
Timestamp: time.Now(),
|
||||
PeerName: peer.Name,
|
||||
Action: "restart",
|
||||
Reason: fmt.Sprintf("peer %s missed %d heartbeats — restart attempt %d", peer.Name, peer.MissedCount, peer.RestartCount),
|
||||
Severity: "HIGH",
|
||||
})
|
||||
peer.MissedCount = 0 // Reset after restart attempt.
|
||||
|
||||
default:
|
||||
// Level 1: Mark degraded.
|
||||
peer.Status = StatusDegraded
|
||||
m.escalate(EscalationAction{
|
||||
Timestamp: time.Now(),
|
||||
PeerName: peer.Name,
|
||||
Action: "alert",
|
||||
Reason: fmt.Sprintf("peer %s missed %d heartbeat(s)", peer.Name, peer.MissedCount),
|
||||
Severity: "MEDIUM",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// pingPeer sends an HTTP GET to the peer's health endpoint.
|
||||
func (m *Monitor) pingPeer(ctx context.Context, endpoint string) bool {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
resp, err := m.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
return resp.StatusCode == http.StatusOK
|
||||
}
|
||||
|
||||
// escalate notifies all registered handlers and logs the action.
|
||||
func (m *Monitor) escalate(action EscalationAction) {
|
||||
m.logger.Warn("WATCHDOG ESCALATION",
|
||||
"peer", action.PeerName,
|
||||
"action", action.Action,
|
||||
"severity", action.Severity,
|
||||
"reason", action.Reason,
|
||||
)
|
||||
|
||||
// Notify handlers (must hold read lock or no lock).
|
||||
handlers := m.handlers
|
||||
for _, h := range handlers {
|
||||
h(action)
|
||||
}
|
||||
}
|
||||
|
||||
// GetPeerStatus returns a copy of the current health state of the named
// peer, and false if no peer with that name is registered.
func (m *Monitor) GetPeerStatus(name string) (*PeerHealth, bool) {
	m.mu.RLock()
	defer m.mu.RUnlock()
	p, ok := m.peers[name]
	if !ok {
		return nil, false
	}
	cp := *p // Return a copy so callers cannot mutate shared mesh state.
	return &cp, true
}
|
||||
|
||||
// AllPeers returns a snapshot of all peer health states.
|
||||
func (m *Monitor) AllPeers() []PeerHealth {
|
||||
m.mu.RLock()
|
||||
defer m.mu.RUnlock()
|
||||
|
||||
result := make([]PeerHealth, 0, len(m.peers))
|
||||
for _, p := range m.peers {
|
||||
result = append(result, *p)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// Stats returns current watchdog metrics.
|
||||
func (m *Monitor) Stats() MonitorStats {
|
||||
m.stats.mu.Lock()
|
||||
defer m.stats.mu.Unlock()
|
||||
return MonitorStats{
|
||||
TotalChecks: m.stats.TotalChecks,
|
||||
TotalMisses: m.stats.TotalMisses,
|
||||
TotalRestarts: m.stats.TotalRestarts,
|
||||
TotalIsolations: m.stats.TotalIsolations,
|
||||
StartedAt: m.stats.StartedAt,
|
||||
PeerCount: m.stats.PeerCount,
|
||||
}
|
||||
}
|
||||
|
||||
// ServeHTTP provides the mesh status as JSON (for embedding in other servers).
|
||||
func (m *Monitor) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(map[string]any{
|
||||
"self": m.selfName,
|
||||
"peers": m.AllPeers(),
|
||||
"stats": m.Stats(),
|
||||
})
|
||||
}
|
||||
|
||||
// peerNames returns a list of registered peer names.
|
||||
func (m *Monitor) peerNames() []string {
|
||||
names := make([]string, 0, len(m.peers))
|
||||
for n := range m.peers {
|
||||
names = append(names, n)
|
||||
}
|
||||
return names
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue