Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates

This commit is contained in:
DmitrL-dev 2026-03-23 16:45:40 +10:00
parent 694e32be26
commit 41cbfd6e0a
178 changed files with 36008 additions and 399 deletions

View file

@ -0,0 +1,331 @@
// Package watchdog implements the SEC-004 Watchdog Mesh Framework.
//
// Mutual monitoring between SOC agents (immune, sidecar, shield)
// with automatic restart escalation:
//
// 1. Heartbeat check every 30s
// 2. 3 missed heartbeats → attempt systemd restart
// 3. 3 failed restarts → eBPF isolation + CRITICAL alert
// 4. Architect notification via webhook
//
// Each agent registers as a peer and monitors all others.
package watchdog
import (
"context"
"encoding/json"
"fmt"
"log/slog"
"net/http"
"sync"
"time"
)
// PeerStatus defines the health state of a peer.
type PeerStatus string
const (
StatusHealthy PeerStatus = "HEALTHY"
StatusDegraded PeerStatus = "DEGRADED"
StatusOffline PeerStatus = "OFFLINE"
StatusIsolated PeerStatus = "ISOLATED"
// DefaultHeartbeatInterval is the check interval.
DefaultHeartbeatInterval = 30 * time.Second
// MaxMissedBeforeRestart triggers auto-restart.
MaxMissedBeforeRestart = 3
// MaxRestartsBeforeIsolate triggers eBPF isolation.
MaxRestartsBeforeIsolate = 3
)
// PeerHealth tracks the health state of a single peer agent.
type PeerHealth struct {
Name string `json:"name"`
Endpoint string `json:"endpoint"` // HTTP health endpoint
Status PeerStatus `json:"status"`
LastSeen time.Time `json:"last_seen"`
MissedCount int `json:"missed_count"`
RestartCount int `json:"restart_count"`
LastRestart time.Time `json:"last_restart,omitempty"`
ResponseTimeMs int64 `json:"response_time_ms"`
}
// EscalationHandler is called when a peer requires escalation action.
type EscalationHandler func(action EscalationAction)
// EscalationAction describes what the mesh decided to do.
type EscalationAction struct {
Timestamp time.Time `json:"timestamp"`
PeerName string `json:"peer_name"`
Action string `json:"action"` // restart, isolate, alert_architect
Reason string `json:"reason"`
Severity string `json:"severity"`
}
// Monitor is the watchdog mesh peer monitor.
type Monitor struct {
mu sync.RWMutex
selfName string
peers map[string]*PeerHealth
interval time.Duration
handlers []EscalationHandler
httpClient *http.Client
logger *slog.Logger
stats MonitorStats
}
// MonitorStats tracks mesh health metrics.
type MonitorStats struct {
mu sync.Mutex
TotalChecks int64 `json:"total_checks"`
TotalMisses int64 `json:"total_misses"`
TotalRestarts int64 `json:"total_restarts"`
TotalIsolations int64 `json:"total_isolations"`
StartedAt time.Time `json:"started_at"`
PeerCount int `json:"peer_count"`
}
// NewMonitor creates a new watchdog mesh monitor.
func NewMonitor(selfName string) *Monitor {
return &Monitor{
selfName: selfName,
peers: make(map[string]*PeerHealth),
interval: DefaultHeartbeatInterval,
httpClient: &http.Client{
Timeout: 5 * time.Second,
},
logger: slog.Default().With("component", "sec-004-watchdog", "self", selfName),
stats: MonitorStats{
StartedAt: time.Now(),
},
}
}
// RegisterPeer adds a peer agent to the monitoring mesh.
func (m *Monitor) RegisterPeer(name, endpoint string) {
m.mu.Lock()
defer m.mu.Unlock()
m.peers[name] = &PeerHealth{
Name: name,
Endpoint: endpoint,
Status: StatusHealthy,
LastSeen: time.Now(),
}
m.stats.PeerCount = len(m.peers)
m.logger.Info("peer registered", "peer", name, "endpoint", endpoint)
}
// OnEscalation registers a handler for escalation events.
func (m *Monitor) OnEscalation(h EscalationHandler) {
m.mu.Lock()
defer m.mu.Unlock()
m.handlers = append(m.handlers, h)
}
// Start begins the heartbeat monitoring loop.
func (m *Monitor) Start(ctx context.Context) {
m.logger.Info("watchdog mesh started",
"interval", m.interval,
"peers", m.peerNames(),
)
ticker := time.NewTicker(m.interval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
m.logger.Info("watchdog mesh stopped")
return
case <-ticker.C:
m.checkAllPeers(ctx)
}
}
}
// checkAllPeers performs a health check on every registered peer.
func (m *Monitor) checkAllPeers(ctx context.Context) {
m.mu.RLock()
peers := make([]*PeerHealth, 0, len(m.peers))
for _, p := range m.peers {
peers = append(peers, p)
}
m.mu.RUnlock()
for _, peer := range peers {
m.checkPeer(ctx, peer)
}
}
// checkPeer performs a single health check on a peer.
func (m *Monitor) checkPeer(ctx context.Context, peer *PeerHealth) {
m.stats.mu.Lock()
m.stats.TotalChecks++
m.stats.mu.Unlock()
start := time.Now()
healthy := m.pingPeer(ctx, peer.Endpoint)
elapsed := time.Since(start)
m.mu.Lock()
defer m.mu.Unlock()
if healthy {
peer.Status = StatusHealthy
peer.LastSeen = time.Now()
peer.MissedCount = 0
peer.ResponseTimeMs = elapsed.Milliseconds()
return
}
// Missed heartbeat.
peer.MissedCount++
m.stats.mu.Lock()
m.stats.TotalMisses++
m.stats.mu.Unlock()
m.logger.Warn("peer missed heartbeat",
"peer", peer.Name,
"missed", peer.MissedCount,
"last_seen", peer.LastSeen,
)
// Escalation ladder.
switch {
case peer.MissedCount >= MaxMissedBeforeRestart && peer.RestartCount >= MaxRestartsBeforeIsolate:
// Level 3: Isolate via eBPF + alert architect.
peer.Status = StatusIsolated
m.stats.mu.Lock()
m.stats.TotalIsolations++
m.stats.mu.Unlock()
m.escalate(EscalationAction{
Timestamp: time.Now(),
PeerName: peer.Name,
Action: "isolate",
Reason: fmt.Sprintf("peer %s offline after %d restarts — eBPF isolation engaged", peer.Name, peer.RestartCount),
Severity: "CRITICAL",
})
case peer.MissedCount >= MaxMissedBeforeRestart:
// Level 2: Attempt restart.
peer.Status = StatusOffline
peer.RestartCount++
peer.LastRestart = time.Now()
m.stats.mu.Lock()
m.stats.TotalRestarts++
m.stats.mu.Unlock()
m.escalate(EscalationAction{
Timestamp: time.Now(),
PeerName: peer.Name,
Action: "restart",
Reason: fmt.Sprintf("peer %s missed %d heartbeats — restart attempt %d", peer.Name, peer.MissedCount, peer.RestartCount),
Severity: "HIGH",
})
peer.MissedCount = 0 // Reset after restart attempt.
default:
// Level 1: Mark degraded.
peer.Status = StatusDegraded
m.escalate(EscalationAction{
Timestamp: time.Now(),
PeerName: peer.Name,
Action: "alert",
Reason: fmt.Sprintf("peer %s missed %d heartbeat(s)", peer.Name, peer.MissedCount),
Severity: "MEDIUM",
})
}
}
// pingPeer sends an HTTP GET to the peer's health endpoint.
func (m *Monitor) pingPeer(ctx context.Context, endpoint string) bool {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
if err != nil {
return false
}
resp, err := m.httpClient.Do(req)
if err != nil {
return false
}
defer resp.Body.Close()
return resp.StatusCode == http.StatusOK
}
// escalate notifies all registered handlers and logs the action.
func (m *Monitor) escalate(action EscalationAction) {
m.logger.Warn("WATCHDOG ESCALATION",
"peer", action.PeerName,
"action", action.Action,
"severity", action.Severity,
"reason", action.Reason,
)
// Notify handlers (must hold read lock or no lock).
handlers := m.handlers
for _, h := range handlers {
h(action)
}
}
// PeerStatus returns the current status of a specific peer.
func (m *Monitor) GetPeerStatus(name string) (*PeerHealth, bool) {
m.mu.RLock()
defer m.mu.RUnlock()
p, ok := m.peers[name]
if !ok {
return nil, false
}
cp := *p // Return a copy.
return &cp, true
}
// AllPeers returns a snapshot of all peer health states.
func (m *Monitor) AllPeers() []PeerHealth {
m.mu.RLock()
defer m.mu.RUnlock()
result := make([]PeerHealth, 0, len(m.peers))
for _, p := range m.peers {
result = append(result, *p)
}
return result
}
// Stats returns current watchdog metrics.
func (m *Monitor) Stats() MonitorStats {
m.stats.mu.Lock()
defer m.stats.mu.Unlock()
return MonitorStats{
TotalChecks: m.stats.TotalChecks,
TotalMisses: m.stats.TotalMisses,
TotalRestarts: m.stats.TotalRestarts,
TotalIsolations: m.stats.TotalIsolations,
StartedAt: m.stats.StartedAt,
PeerCount: m.stats.PeerCount,
}
}
// ServeHTTP provides the mesh status as JSON (for embedding in other servers).
func (m *Monitor) ServeHTTP(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(map[string]any{
"self": m.selfName,
"peers": m.AllPeers(),
"stats": m.Stats(),
})
}
// peerNames returns a list of registered peer names.
func (m *Monitor) peerNames() []string {
names := make([]string, 0, len(m.peers))
for n := range m.peers {
names = append(names, n)
}
return names
}

View file

@ -0,0 +1,249 @@
package watchdog
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
)
func TestRegisterPeer(t *testing.T) {
m := NewMonitor("test-self")
m.RegisterPeer("immune", "http://localhost:9760/health")
m.RegisterPeer("sidecar", "http://localhost:9770/health")
peers := m.AllPeers()
if len(peers) != 2 {
t.Fatalf("peer count = %d, want 2", len(peers))
}
}
func TestHealthyPeer(t *testing.T) {
// Create a mock healthy peer.
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
}))
defer srv.Close()
m := NewMonitor("test-self")
m.RegisterPeer("healthy-peer", srv.URL+"/health")
// Run one check cycle.
ctx := context.Background()
m.checkAllPeers(ctx)
peer, ok := m.GetPeerStatus("healthy-peer")
if !ok {
t.Fatal("peer not found")
}
if peer.Status != StatusHealthy {
t.Errorf("status = %s, want HEALTHY", peer.Status)
}
if peer.MissedCount != 0 {
t.Errorf("missed = %d, want 0", peer.MissedCount)
}
}
func TestUnhealthyPeerDegraded(t *testing.T) {
// Peer that's down (no server listening).
m := NewMonitor("test-self")
m.RegisterPeer("dead-peer", "http://127.0.0.1:19999/health")
ctx := context.Background()
// One miss → DEGRADED.
m.checkAllPeers(ctx)
peer, _ := m.GetPeerStatus("dead-peer")
if peer.Status != StatusDegraded {
t.Errorf("status = %s, want DEGRADED", peer.Status)
}
if peer.MissedCount != 1 {
t.Errorf("missed = %d, want 1", peer.MissedCount)
}
}
func TestEscalationToRestart(t *testing.T) {
m := NewMonitor("test-self")
m.RegisterPeer("flaky-peer", "http://127.0.0.1:19999/health")
var escalations []EscalationAction
m.OnEscalation(func(a EscalationAction) {
escalations = append(escalations, a)
})
ctx := context.Background()
// Miss 3 heartbeats → should trigger restart.
for i := 0; i < MaxMissedBeforeRestart; i++ {
m.checkAllPeers(ctx)
}
peer, _ := m.GetPeerStatus("flaky-peer")
if peer.Status != StatusOffline {
t.Errorf("status = %s, want OFFLINE", peer.Status)
}
if peer.RestartCount != 1 {
t.Errorf("restart_count = %d, want 1", peer.RestartCount)
}
// Check that escalation was fired.
found := false
for _, e := range escalations {
if e.Action == "restart" {
found = true
break
}
}
if !found {
t.Error("expected 'restart' escalation, got none")
}
}
func TestEscalationToIsolate(t *testing.T) {
m := NewMonitor("test-self")
m.RegisterPeer("broken-peer", "http://127.0.0.1:19999/health")
var escalations []EscalationAction
m.OnEscalation(func(a EscalationAction) {
escalations = append(escalations, a)
})
ctx := context.Background()
// Trigger MaxRestartsBeforeIsolate restart cycles.
for r := 0; r < MaxRestartsBeforeIsolate; r++ {
for i := 0; i < MaxMissedBeforeRestart; i++ {
m.checkAllPeers(ctx)
}
}
// Now one more miss cycle should trigger isolation.
for i := 0; i < MaxMissedBeforeRestart; i++ {
m.checkAllPeers(ctx)
}
peer, _ := m.GetPeerStatus("broken-peer")
if peer.Status != StatusIsolated {
t.Errorf("status = %s, want ISOLATED", peer.Status)
}
// Check for isolate escalation.
found := false
for _, e := range escalations {
if e.Action == "isolate" {
found = true
if e.Severity != "CRITICAL" {
t.Errorf("isolate severity = %s, want CRITICAL", e.Severity)
}
break
}
}
if !found {
t.Error("expected 'isolate' escalation, got none")
}
}
func TestRecoveryAfterRestart(t *testing.T) {
// Peer goes down, gets restarted (simulated), then comes back.
healthy := true
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if healthy {
w.WriteHeader(http.StatusOK)
} else {
w.WriteHeader(http.StatusServiceUnavailable)
}
}))
defer srv.Close()
m := NewMonitor("test-self")
m.RegisterPeer("recovering-peer", srv.URL+"/health")
ctx := context.Background()
// Initially healthy.
m.checkAllPeers(ctx)
peer, _ := m.GetPeerStatus("recovering-peer")
if peer.Status != StatusHealthy {
t.Fatalf("initial status = %s, want HEALTHY", peer.Status)
}
// Goes down.
healthy = false
m.checkAllPeers(ctx)
peer, _ = m.GetPeerStatus("recovering-peer")
if peer.Status != StatusDegraded {
t.Fatalf("down status = %s, want DEGRADED", peer.Status)
}
// Comes back.
healthy = true
m.checkAllPeers(ctx)
peer, _ = m.GetPeerStatus("recovering-peer")
if peer.Status != StatusHealthy {
t.Errorf("recovered status = %s, want HEALTHY", peer.Status)
}
if peer.MissedCount != 0 {
t.Errorf("missed after recovery = %d, want 0", peer.MissedCount)
}
}
func TestStats(t *testing.T) {
m := NewMonitor("test-self")
m.RegisterPeer("p1", "http://127.0.0.1:19999/health")
ctx := context.Background()
m.checkAllPeers(ctx)
m.checkAllPeers(ctx)
stats := m.Stats()
if stats.TotalChecks != 2 {
t.Errorf("total_checks = %d, want 2", stats.TotalChecks)
}
if stats.TotalMisses != 2 {
t.Errorf("total_misses = %d, want 2", stats.TotalMisses)
}
if stats.PeerCount != 1 {
t.Errorf("peer_count = %d, want 1", stats.PeerCount)
}
}
func TestServeHTTP(t *testing.T) {
m := NewMonitor("test-self")
m.RegisterPeer("p1", "http://localhost:9760/health")
w := httptest.NewRecorder()
r := httptest.NewRequest(http.MethodGet, "/mesh", nil)
m.ServeHTTP(w, r)
if w.Code != http.StatusOK {
t.Errorf("status = %d, want 200", w.Code)
}
if ct := w.Header().Get("Content-Type"); ct != "application/json" {
t.Errorf("content-type = %s, want application/json", ct)
}
}
func TestMonitorStartStop(t *testing.T) {
m := NewMonitor("test-self")
m.interval = 50 * time.Millisecond // Fast for tests.
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
}))
defer srv.Close()
m.RegisterPeer("fast-peer", srv.URL+"/health")
ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
defer cancel()
m.Start(ctx) // Blocks until context expires.
stats := m.Stats()
if stats.TotalChecks < 2 {
t.Errorf("expected at least 2 checks in 200ms, got %d", stats.TotalChecks)
}
}