mirror of https://github.com/syntrex-lab/gomcp.git (synced 2026-05-15 06:12:37 +02:00)

Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates

parent 694e32be26, commit 41cbfd6e0a
178 changed files with 36008 additions and 399 deletions
165  internal/application/resilience/behavioral.go  Normal file

@@ -0,0 +1,165 @@
package resilience

import (
    "context"
    "log/slog"
    "runtime"
    "sync"
    "time"
)

// BehaviorProfile captures the runtime behavior of a component.
type BehaviorProfile struct {
    Goroutines      int                `json:"goroutines"`
    HeapAllocMB     float64            `json:"heap_alloc_mb"`
    HeapObjectsK    float64            `json:"heap_objects_k"`
    GCPauseMs       float64            `json:"gc_pause_ms"`
    NumGC           uint32             `json:"num_gc"`
    FileDescriptors int                `json:"file_descriptors,omitempty"`
    CustomMetrics   map[string]float64 `json:"custom_metrics,omitempty"`
}

// BehavioralAlert is emitted when a behavioral anomaly is detected.
type BehavioralAlert struct {
    Component   string    `json:"component"`
    AnomalyType string    `json:"anomaly_type"` // goroutine_leak, memory_leak, gc_pressure, etc.
    Metric      string    `json:"metric"`
    Current     float64   `json:"current"`
    Baseline    float64   `json:"baseline"`
    ZScore      float64   `json:"z_score"`
    Severity    string    `json:"severity"`
    Timestamp   time.Time `json:"timestamp"`
}

// BehavioralAnalyzer provides Go-side runtime behavioral analysis.
// It profiles the current process and compares against learned baselines.
// On Linux, eBPF hooks (immune/resilience_hooks.c) extend this to kernel level.
type BehavioralAnalyzer struct {
    mu        sync.RWMutex
    metricsDB *MetricsDB
    alertBus  chan BehavioralAlert
    interval  time.Duration
    component string // self component name
    logger    *slog.Logger
}

// NewBehavioralAnalyzer creates a new behavioral analyzer.
func NewBehavioralAnalyzer(component string, alertBufSize int) *BehavioralAnalyzer {
    if alertBufSize <= 0 {
        alertBufSize = 50
    }
    return &BehavioralAnalyzer{
        metricsDB: NewMetricsDB(DefaultMetricsWindow, DefaultMetricsMaxSize),
        alertBus:  make(chan BehavioralAlert, alertBufSize),
        interval:  1 * time.Minute,
        component: component,
        logger:    slog.Default().With("component", "sarl-behavioral"),
    }
}

// AlertBus returns the channel for consuming behavioral alerts.
func (ba *BehavioralAnalyzer) AlertBus() <-chan BehavioralAlert {
    return ba.alertBus
}

// Start begins continuous behavioral monitoring. Blocks until ctx is cancelled.
func (ba *BehavioralAnalyzer) Start(ctx context.Context) {
    ba.logger.Info("behavioral analyzer started", "interval", ba.interval)

    ticker := time.NewTicker(ba.interval)
    defer ticker.Stop()

    for {
        select {
        case <-ctx.Done():
            ba.logger.Info("behavioral analyzer stopped")
            return
        case <-ticker.C:
            ba.collectAndAnalyze()
        }
    }
}

// collectAndAnalyze profiles the runtime and checks for anomalies.
func (ba *BehavioralAnalyzer) collectAndAnalyze() {
    profile := ba.collectProfile()
    ba.storeMetrics(profile)
    ba.detectAnomalies(profile)
}

// collectProfile gathers current Go runtime stats.
func (ba *BehavioralAnalyzer) collectProfile() BehaviorProfile {
    var mem runtime.MemStats
    runtime.ReadMemStats(&mem)

    return BehaviorProfile{
        Goroutines:   runtime.NumGoroutine(),
        HeapAllocMB:  float64(mem.HeapAlloc) / (1024 * 1024),
        HeapObjectsK: float64(mem.HeapObjects) / 1000,
        GCPauseMs:    float64(mem.PauseNs[(mem.NumGC+255)%256]) / 1e6,
        NumGC:        mem.NumGC,
    }
}

// storeMetrics records profile data in the time-series DB.
func (ba *BehavioralAnalyzer) storeMetrics(p BehaviorProfile) {
    ba.metricsDB.AddDataPoint(ba.component, "goroutines", float64(p.Goroutines))
    ba.metricsDB.AddDataPoint(ba.component, "heap_alloc_mb", p.HeapAllocMB)
    ba.metricsDB.AddDataPoint(ba.component, "heap_objects_k", p.HeapObjectsK)
    ba.metricsDB.AddDataPoint(ba.component, "gc_pause_ms", p.GCPauseMs)
}

// detectAnomalies checks each metric against its baseline via Z-score.
func (ba *BehavioralAnalyzer) detectAnomalies(p BehaviorProfile) {
    checks := []struct {
        metric      string
        value       float64
        anomalyType string
        severity    string
    }{
        {"goroutines", float64(p.Goroutines), "goroutine_leak", "WARNING"},
        {"heap_alloc_mb", p.HeapAllocMB, "memory_leak", "CRITICAL"},
        {"heap_objects_k", p.HeapObjectsK, "object_leak", "WARNING"},
        {"gc_pause_ms", p.GCPauseMs, "gc_pressure", "WARNING"},
    }

    for _, c := range checks {
        baseline := ba.metricsDB.GetBaseline(ba.component, c.metric, DefaultMetricsWindow)
        if !IsAnomaly(c.value, baseline, AnomalyZScoreThreshold) {
            continue
        }

        zscore := CalculateZScore(c.value, baseline)
        alert := BehavioralAlert{
            Component:   ba.component,
            AnomalyType: c.anomalyType,
            Metric:      c.metric,
            Current:     c.value,
            Baseline:    baseline.Mean,
            ZScore:      zscore,
            Severity:    c.severity,
            Timestamp:   time.Now(),
        }

        select {
        case ba.alertBus <- alert:
            ba.logger.Warn("behavioral anomaly detected",
                "type", c.anomalyType,
                "metric", c.metric,
                "z_score", zscore,
            )
        default:
            ba.logger.Error("behavioral alert bus full")
        }
    }
}

// InjectMetric allows manually injecting a metric for testing.
func (ba *BehavioralAnalyzer) InjectMetric(metric string, value float64) {
    ba.metricsDB.AddDataPoint(ba.component, metric, value)
}

// CurrentProfile returns a snapshot of the current runtime profile.
func (ba *BehavioralAnalyzer) CurrentProfile() BehaviorProfile {
    return ba.collectProfile()
}
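For orientation, a minimal consumer sketch for the analyzer above. It uses only the exported API from this file (NewBehavioralAnalyzer, Start, AlertBus); the component name, buffer size, and alert handling are illustrative and not part of this commit:

package resilience

import (
    "context"
    "log/slog"
)

// runAnalyzerExample is a hypothetical consumer of BehavioralAnalyzer;
// it is not part of this commit.
func runAnalyzerExample(ctx context.Context) {
    ba := NewBehavioralAnalyzer("my-service", 50) // illustrative name and buffer size
    go ba.Start(ctx)                              // profiles the Go runtime once per minute

    for {
        select {
        case <-ctx.Done():
            return
        case alert := <-ba.AlertBus():
            // React to anomalies; here we only log them.
            slog.Warn("behavioral anomaly",
                "type", alert.AnomalyType,
                "metric", alert.Metric,
                "z_score", alert.ZScore,
            )
        }
    }
}

Draining AlertBus promptly matters: the send in detectAnomalies is non-blocking, so a full buffer drops alerts (logged as "behavioral alert bus full").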
206  internal/application/resilience/behavioral_test.go  Normal file

@@ -0,0 +1,206 @@
package resilience

import (
    "context"
    "testing"
    "time"
)

// IM-01: Goroutine leak detection.
func TestBehavioral_IM01_GoroutineLeak(t *testing.T) {
    ba := NewBehavioralAnalyzer("soc-ingest", 10)

    // Build baseline of 10 goroutines.
    for i := 0; i < 50; i++ {
        ba.InjectMetric("goroutines", 10)
    }

    // Spike to 1000 goroutines; should trigger an anomaly.
    ba.metricsDB.AddDataPoint("soc-ingest", "goroutines", 1000)
    profile := BehaviorProfile{Goroutines: 1000}
    ba.detectAnomalies(profile)

    select {
    case alert := <-ba.alertBus:
        if alert.AnomalyType != "goroutine_leak" {
            t.Errorf("expected goroutine_leak, got %s", alert.AnomalyType)
        }
        if alert.ZScore <= 3 {
            t.Errorf("expected Z > 3, got %f", alert.ZScore)
        }
    default:
        t.Error("expected goroutine leak alert")
    }
}

// IM-02: Memory leak detection.
func TestBehavioral_IM02_MemoryLeak(t *testing.T) {
    ba := NewBehavioralAnalyzer("soc-correlate", 10)

    // Baseline: 50 MB.
    for i := 0; i < 50; i++ {
        ba.InjectMetric("heap_alloc_mb", 50)
    }

    // Spike to 500 MB.
    ba.metricsDB.AddDataPoint("soc-correlate", "heap_alloc_mb", 500)
    profile := BehaviorProfile{HeapAllocMB: 500}
    ba.detectAnomalies(profile)

    select {
    case alert := <-ba.alertBus:
        if alert.AnomalyType != "memory_leak" {
            t.Errorf("expected memory_leak, got %s", alert.AnomalyType)
        }
        if alert.Severity != "CRITICAL" {
            t.Errorf("expected CRITICAL severity, got %s", alert.Severity)
        }
    default:
        t.Error("expected memory leak alert")
    }
}

// IM-03: GC pressure detection.
func TestBehavioral_IM03_GCPressure(t *testing.T) {
    ba := NewBehavioralAnalyzer("soc-respond", 10)

    // Baseline: 1ms GC pause.
    for i := 0; i < 50; i++ {
        ba.InjectMetric("gc_pause_ms", 1)
    }

    // Spike to 100ms.
    ba.metricsDB.AddDataPoint("soc-respond", "gc_pause_ms", 100)
    profile := BehaviorProfile{GCPauseMs: 100}
    ba.detectAnomalies(profile)

    select {
    case alert := <-ba.alertBus:
        if alert.AnomalyType != "gc_pressure" {
            t.Errorf("expected gc_pressure, got %s", alert.AnomalyType)
        }
    default:
        t.Error("expected gc_pressure alert")
    }
}

// IM-04: Object leak detection.
func TestBehavioral_IM04_ObjectLeak(t *testing.T) {
    ba := NewBehavioralAnalyzer("shield", 10)

    for i := 0; i < 50; i++ {
        ba.InjectMetric("heap_objects_k", 100)
    }

    ba.metricsDB.AddDataPoint("shield", "heap_objects_k", 5000)
    profile := BehaviorProfile{HeapObjectsK: 5000}
    ba.detectAnomalies(profile)

    select {
    case alert := <-ba.alertBus:
        if alert.AnomalyType != "object_leak" {
            t.Errorf("expected object_leak, got %s", alert.AnomalyType)
        }
    default:
        t.Error("expected object leak alert")
    }
}

// IM-05: Normal behavior; no alerts.
func TestBehavioral_IM05_NormalBehavior(t *testing.T) {
    ba := NewBehavioralAnalyzer("sidecar", 10)

    for i := 0; i < 50; i++ {
        ba.InjectMetric("goroutines", 10)
        ba.InjectMetric("heap_alloc_mb", 50)
        ba.InjectMetric("heap_objects_k", 100)
        ba.InjectMetric("gc_pause_ms", 1)
    }

    profile := BehaviorProfile{
        Goroutines:   10,
        HeapAllocMB:  50,
        HeapObjectsK: 100,
        GCPauseMs:    1,
    }
    ba.detectAnomalies(profile)

    select {
    case alert := <-ba.alertBus:
        t.Errorf("expected no alerts for normal behavior, got %+v", alert)
    default:
        // Good; no alerts.
    }
}

// IM-06: Start/Stop lifecycle.
func TestBehavioral_IM06_StartStop(t *testing.T) {
    ba := NewBehavioralAnalyzer("test", 10)
    ba.interval = 50 * time.Millisecond

    ctx, cancel := context.WithCancel(context.Background())
    done := make(chan struct{})

    go func() {
        ba.Start(ctx)
        close(done)
    }()

    time.Sleep(100 * time.Millisecond)
    cancel()

    select {
    case <-done:
    case <-time.After(time.Second):
        t.Fatal("Start() did not return after context cancellation")
    }
}

// IM-07: CurrentProfile returns valid data.
func TestBehavioral_IM07_CurrentProfile(t *testing.T) {
    ba := NewBehavioralAnalyzer("test", 10)
    profile := ba.CurrentProfile()

    if profile.Goroutines <= 0 {
        t.Error("expected positive goroutine count")
    }
    if profile.HeapAllocMB <= 0 {
        t.Error("expected positive heap alloc")
    }
}

// IM-08: Alert bus overflow (non-blocking).
func TestBehavioral_IM08_AlertBusOverflow(t *testing.T) {
    ba := NewBehavioralAnalyzer("test", 2)

    // Fill the bus.
    ba.alertBus <- BehavioralAlert{AnomalyType: "fill1"}
    ba.alertBus <- BehavioralAlert{AnomalyType: "fill2"}

    // Build baseline.
    for i := 0; i < 50; i++ {
        ba.InjectMetric("goroutines", 10)
    }

    // This should not panic.
    ba.metricsDB.AddDataPoint("test", "goroutines", 10000)
    ba.detectAnomalies(BehaviorProfile{Goroutines: 10000})
}

// Test collectAndAnalyze runs without error.
func TestBehavioral_CollectAndAnalyze(t *testing.T) {
    ba := NewBehavioralAnalyzer("test", 10)
    // Should not panic.
    ba.collectAndAnalyze()
}

// Test InjectMetric stores data.
func TestBehavioral_InjectMetric(t *testing.T) {
    ba := NewBehavioralAnalyzer("test", 10)
    ba.InjectMetric("custom", 42.0)

    recent := ba.metricsDB.GetRecent("test", "custom", 1)
    if len(recent) != 1 || recent[0].Value != 42.0 {
        t.Errorf("expected 42.0, got %v", recent)
    }
}
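The math behind these assertions, assuming the conventional z-score definition for CalculateZScore and IsAnomaly (both defined elsewhere in this commit, alongside MetricsDB):

    z = (current - baseline mean) / baseline stddev
    anomaly  <=>  |z| > AnomalyZScoreThreshold (3.0)

Three standard deviations corresponds to roughly the 99.7% band of a normal distribution. Because each test pins 50 baseline points at a single value, the baseline spread is minimal, so a 10x to 100x spike yields a very large z; that is why IM-01 can assert Z > 3 outright.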
524  internal/application/resilience/healing_engine.go  Normal file

@@ -0,0 +1,524 @@
package resilience

import (
    "context"
    "fmt"
    "log/slog"
    "sync"
    "time"
)

// HealingState represents the FSM state of a healing operation.
type HealingState string

const (
    HealingIdle       HealingState = "IDLE"
    HealingDiagnosing HealingState = "DIAGNOSING"
    HealingActive     HealingState = "HEALING"
    HealingVerifying  HealingState = "VERIFYING"
    HealingCompleted  HealingState = "COMPLETED"
    HealingFailed     HealingState = "FAILED"
)

// HealingResult summarizes a completed healing operation.
type HealingResult string

const (
    ResultSuccess HealingResult = "SUCCESS"
    ResultFailed  HealingResult = "FAILED"
    ResultSkipped HealingResult = "SKIPPED"
)

// ActionType defines the kinds of healing actions.
type ActionType string

const (
    ActionGracefulStop    ActionType = "graceful_stop"
    ActionClearTempFiles  ActionType = "clear_temp_files"
    ActionStartComponent  ActionType = "start_component"
    ActionVerifyHealth    ActionType = "verify_health"
    ActionNotifySOC       ActionType = "notify_soc"
    ActionFreezeConfig    ActionType = "freeze_config"
    ActionRollbackConfig  ActionType = "rollback_config"
    ActionVerifyConfig    ActionType = "verify_config"
    ActionSwitchReadOnly  ActionType = "switch_to_readonly"
    ActionBackupDB        ActionType = "backup_db"
    ActionRestoreSnapshot ActionType = "restore_snapshot"
    ActionVerifyIntegrity ActionType = "verify_integrity"
    ActionResumeWrites    ActionType = "resume_writes"
    ActionDisableRules    ActionType = "disable_rules"
    ActionRevertRules     ActionType = "revert_rules"
    ActionReloadEngine    ActionType = "reload_engine"
    ActionIsolateNetwork  ActionType = "isolate_network"
    ActionRegenCerts      ActionType = "regenerate_certs"
    ActionRestoreNetwork  ActionType = "restore_network"
    ActionNotifyArchitect ActionType = "notify_architect"
    ActionEnterSafeMode   ActionType = "enter_safe_mode"
)

// Action is a single step in a healing strategy.
type Action struct {
    Type    ActionType             `json:"type"`
    Params  map[string]interface{} `json:"params,omitempty"`
    Timeout time.Duration          `json:"timeout"`
    OnError string                 `json:"on_error"` // "continue", "abort", "rollback"
}

// TriggerCondition defines when a healing strategy activates.
type TriggerCondition struct {
    Metrics             []string          `json:"metrics,omitempty"`
    Statuses            []ComponentStatus `json:"statuses,omitempty"`
    ConsecutiveFailures int               `json:"consecutive_failures"`
    WithinWindow        time.Duration     `json:"within_window"`
}

// RollbackPlan defines what happens if healing fails.
type RollbackPlan struct {
    OnFailure string   `json:"on_failure"` // "escalate", "enter_safe_mode", "maintain_isolation"
    Actions   []Action `json:"actions,omitempty"`
}

// HealingStrategy is a complete self-healing plan.
type HealingStrategy struct {
    ID          string           `json:"id"`
    Name        string           `json:"name"`
    Trigger     TriggerCondition `json:"trigger"`
    Actions     []Action         `json:"actions"`
    Rollback    RollbackPlan     `json:"rollback"`
    MaxAttempts int              `json:"max_attempts"`
    Cooldown    time.Duration    `json:"cooldown"`
}

// Diagnosis is the result of root cause analysis.
type Diagnosis struct {
    Component     string        `json:"component"`
    Metric        string        `json:"metric"`
    RootCause     string        `json:"root_cause"`
    Confidence    float64       `json:"confidence"`
    SuggestedFix  string        `json:"suggested_fix"`
    RelatedAlerts []HealthAlert `json:"related_alerts,omitempty"`
}

// HealingOperation tracks a single healing attempt.
type HealingOperation struct {
    ID            string        `json:"id"`
    StrategyID    string        `json:"strategy_id"`
    Component     string        `json:"component"`
    State         HealingState  `json:"state"`
    Diagnosis     *Diagnosis    `json:"diagnosis,omitempty"`
    ActionsRun    []ActionLog   `json:"actions_run"`
    Result        HealingResult `json:"result"`
    StartedAt     time.Time     `json:"started_at"`
    CompletedAt   time.Time     `json:"completed_at,omitempty"`
    Error         string        `json:"error,omitempty"`
    AttemptNumber int           `json:"attempt_number"`
}

// ActionLog records the execution of a single action.
type ActionLog struct {
    Action    ActionType    `json:"action"`
    StartedAt time.Time     `json:"started_at"`
    Duration  time.Duration `json:"duration"`
    Success   bool          `json:"success"`
    Error     string        `json:"error,omitempty"`
}

// ActionExecutorFunc is the callback that actually runs an action.
// Implementations handle the real system operations (restart, rollback, etc.).
type ActionExecutorFunc func(ctx context.Context, action Action, component string) error

// HealingEngine is the L2 Self-Healing orchestrator.
type HealingEngine struct {
    mu         sync.RWMutex
    strategies []HealingStrategy
    cooldowns  map[string]time.Time // strategyID → earliest next run
    operations []*HealingOperation
    opCounter  int64
    executor   ActionExecutorFunc
    alertBus   <-chan HealthAlert
    escalateFn func(HealthAlert) // called on unrecoverable failure
    logger     *slog.Logger
}

// NewHealingEngine creates a new self-healing engine.
func NewHealingEngine(
    alertBus <-chan HealthAlert,
    executor ActionExecutorFunc,
    escalateFn func(HealthAlert),
) *HealingEngine {
    return &HealingEngine{
        cooldowns:  make(map[string]time.Time),
        operations: make([]*HealingOperation, 0),
        executor:   executor,
        alertBus:   alertBus,
        escalateFn: escalateFn,
        logger:     slog.Default().With("component", "sarl-healing-engine"),
    }
}

// RegisterStrategy adds a healing strategy.
func (he *HealingEngine) RegisterStrategy(s HealingStrategy) {
    he.mu.Lock()
    defer he.mu.Unlock()
    he.strategies = append(he.strategies, s)
    he.logger.Info("strategy registered", "id", s.ID, "name", s.Name)
}

// Start begins listening for alerts and initiating healing. Blocks until ctx is cancelled.
func (he *HealingEngine) Start(ctx context.Context) {
    he.logger.Info("healing engine started", "strategies", len(he.strategies))

    for {
        select {
        case <-ctx.Done():
            he.logger.Info("healing engine stopped")
            return
        case alert, ok := <-he.alertBus:
            if !ok {
                return
            }
            if alert.Severity == SeverityCritical || alert.Severity == SeverityWarning {
                he.initiateHealing(ctx, alert)
            }
        }
    }
}

// initiateHealing runs the healing pipeline for an alert.
func (he *HealingEngine) initiateHealing(ctx context.Context, alert HealthAlert) {
    strategy := he.findStrategy(alert)
    if strategy == nil {
        he.logger.Info("no matching strategy for alert",
            "component", alert.Component,
            "metric", alert.Metric,
        )
        return
    }

    if he.isInCooldown(strategy.ID) {
        he.logger.Info("strategy in cooldown",
            "strategy", strategy.ID,
            "component", alert.Component,
        )
        return
    }

    op := he.createOperation(strategy, alert.Component)

    he.logger.Info("healing initiated",
        "op_id", op.ID,
        "strategy", strategy.ID,
        "component", alert.Component,
    )

    // Phase 1: Diagnose.
    he.transitionOp(op, HealingDiagnosing)
    diagnosis := he.diagnose(alert)
    op.Diagnosis = &diagnosis

    // Phase 2: Execute healing actions.
    he.transitionOp(op, HealingActive)
    execErr := he.executeActions(ctx, strategy, op)

    // Phase 3: Verify recovery.
    if execErr == nil {
        he.transitionOp(op, HealingVerifying)
        verifyErr := he.verifyRecovery(ctx, strategy, op.Component)
        if verifyErr != nil {
            execErr = verifyErr
        }
    }

    // Phase 4: Complete or fail.
    if execErr == nil {
        he.transitionOp(op, HealingCompleted)
        op.Result = ResultSuccess
        he.logger.Info("healing completed successfully",
            "op_id", op.ID,
            "component", op.Component,
            "duration", time.Since(op.StartedAt),
        )
    } else {
        he.transitionOp(op, HealingFailed)
        op.Result = ResultFailed
        op.Error = execErr.Error()
        he.logger.Error("healing failed",
            "op_id", op.ID,
            "component", op.Component,
            "error", execErr,
        )

        // Execute rollback.
        he.executeRollback(ctx, strategy, op)

        // Escalate.
        if he.escalateFn != nil {
            he.escalateFn(alert)
        }
    }

    op.CompletedAt = time.Now()
    he.setCooldown(strategy.ID, strategy.Cooldown)
}

// findStrategy returns the first matching strategy for an alert.
func (he *HealingEngine) findStrategy(alert HealthAlert) *HealingStrategy {
    he.mu.RLock()
    defer he.mu.RUnlock()

    for i := range he.strategies {
        s := &he.strategies[i]
        if he.matchesTrigger(s.Trigger, alert) {
            return s
        }
    }
    return nil
}

// matchesTrigger checks if an alert matches a strategy's trigger condition.
func (he *HealingEngine) matchesTrigger(trigger TriggerCondition, alert HealthAlert) bool {
    // Match by metric name.
    for _, m := range trigger.Metrics {
        if m == alert.Metric {
            return true
        }
    }

    // Match by component status.
    for _, s := range trigger.Statuses {
        switch s {
        case StatusCritical:
            if alert.Severity == SeverityCritical {
                return true
            }
        case StatusOffline:
            if alert.Severity == SeverityCritical && alert.SuggestedAction == "restart" {
                return true
            }
        }
    }

    return false
}

// isInCooldown checks if a strategy is still in its cooldown period.
func (he *HealingEngine) isInCooldown(strategyID string) bool {
    he.mu.RLock()
    defer he.mu.RUnlock()

    earliest, ok := he.cooldowns[strategyID]
    return ok && time.Now().Before(earliest)
}

// setCooldown marks a strategy as cooling down.
func (he *HealingEngine) setCooldown(strategyID string, duration time.Duration) {
    he.mu.Lock()
    defer he.mu.Unlock()
    he.cooldowns[strategyID] = time.Now().Add(duration)
}

// createOperation creates and records a new healing operation.
func (he *HealingEngine) createOperation(strategy *HealingStrategy, component string) *HealingOperation {
    he.mu.Lock()
    defer he.mu.Unlock()

    he.opCounter++
    op := &HealingOperation{
        ID:         fmt.Sprintf("heal-%d", he.opCounter),
        StrategyID: strategy.ID,
        Component:  component,
        State:      HealingIdle,
        StartedAt:  time.Now(),
        ActionsRun: make([]ActionLog, 0),
    }
    he.operations = append(he.operations, op)
    return op
}

// transitionOp moves an operation to a new state.
func (he *HealingEngine) transitionOp(op *HealingOperation, newState HealingState) {
    he.logger.Debug("healing state transition",
        "op_id", op.ID,
        "from", op.State,
        "to", newState,
    )
    op.State = newState
}

// diagnose performs root cause analysis for an alert.
func (he *HealingEngine) diagnose(alert HealthAlert) Diagnosis {
    rootCause := "unknown"
    confidence := 0.5
    suggestedFix := "restart component"

    switch {
    case alert.Metric == "memory" && alert.Current > 90:
        rootCause = "memory_exhaustion"
        confidence = 0.9
        suggestedFix = "restart with increased limits"
    case alert.Metric == "cpu" && alert.Current > 90:
        rootCause = "cpu_saturation"
        confidence = 0.8
        suggestedFix = "check for runaway goroutines"
    case alert.Metric == "error_rate":
        rootCause = "elevated_error_rate"
        confidence = 0.7
        suggestedFix = "check dependencies and config"
    case alert.Metric == "latency_p99":
        rootCause = "latency_degradation"
        confidence = 0.6
        suggestedFix = "check database and network"
    case alert.Metric == "quorum":
        rootCause = "quorum_loss"
        confidence = 0.95
        suggestedFix = "activate safe mode"
    default:
        rootCause = fmt.Sprintf("threshold_breach_%s", alert.Metric)
        confidence = 0.5
        suggestedFix = "investigate manually"
    }

    return Diagnosis{
        Component:    alert.Component,
        Metric:       alert.Metric,
        RootCause:    rootCause,
        Confidence:   confidence,
        SuggestedFix: suggestedFix,
    }
}

// executeActions runs each action in sequence.
func (he *HealingEngine) executeActions(ctx context.Context, strategy *HealingStrategy, op *HealingOperation) error {
    for _, action := range strategy.Actions {
        actionCtx := ctx
        var cancel context.CancelFunc
        if action.Timeout > 0 {
            actionCtx, cancel = context.WithTimeout(ctx, action.Timeout)
        }

        start := time.Now()
        err := he.executor(actionCtx, action, op.Component)
        duration := time.Since(start)

        if cancel != nil {
            cancel()
        }

        logEntry := ActionLog{
            Action:    action.Type,
            StartedAt: start,
            Duration:  duration,
            Success:   err == nil,
        }
        if err != nil {
            logEntry.Error = err.Error()
        }
        op.ActionsRun = append(op.ActionsRun, logEntry)

        if err != nil {
            switch action.OnError {
            case "continue":
                he.logger.Warn("action failed, continuing",
                    "action", action.Type,
                    "error", err,
                )
            case "rollback":
                return fmt.Errorf("action %s failed (rollback): %w", action.Type, err)
            default: // "abort"
                return fmt.Errorf("action %s failed: %w", action.Type, err)
            }
        }
    }
    return nil
}

// verifyRecovery checks if the component is healthy after healing.
func (he *HealingEngine) verifyRecovery(ctx context.Context, strategy *HealingStrategy, component string) error {
    // Execute a verify_health action if not already in the strategy.
    verifyAction := Action{
        Type:    ActionVerifyHealth,
        Timeout: 30 * time.Second,
    }
    return he.executor(ctx, verifyAction, component)
}

// executeRollback runs the rollback plan for a failed healing.
func (he *HealingEngine) executeRollback(ctx context.Context, strategy *HealingStrategy, op *HealingOperation) {
    if len(strategy.Rollback.Actions) == 0 {
        he.logger.Info("no rollback actions defined",
            "strategy", strategy.ID,
        )
        return
    }

    he.logger.Warn("executing rollback",
        "strategy", strategy.ID,
        "component", op.Component,
    )

    for _, action := range strategy.Rollback.Actions {
        if err := he.executor(ctx, action, op.Component); err != nil {
            he.logger.Error("rollback action failed",
                "action", action.Type,
                "error", err,
            )
        }
    }
}

// GetOperation returns a healing operation by ID.
// Returns a deep copy to prevent data races with the healing goroutine.
func (he *HealingEngine) GetOperation(id string) (*HealingOperation, bool) {
    he.mu.RLock()
    defer he.mu.RUnlock()

    for _, op := range he.operations {
        if op.ID == id {
            cp := *op
            cp.ActionsRun = make([]ActionLog, len(op.ActionsRun))
            copy(cp.ActionsRun, op.ActionsRun)
            if op.Diagnosis != nil {
                diag := *op.Diagnosis
                cp.Diagnosis = &diag
            }
            return &cp, true
        }
    }
    return nil, false
}

// RecentOperations returns the last N operations.
// Returns deep copies to prevent data races with the healing goroutine.
func (he *HealingEngine) RecentOperations(n int) []HealingOperation {
    he.mu.RLock()
    defer he.mu.RUnlock()

    total := len(he.operations)
    if total == 0 {
        return nil
    }
    start := total - n
    if start < 0 {
        start = 0
    }

    result := make([]HealingOperation, 0, n)
    for i := start; i < total; i++ {
        cp := *he.operations[i]
        cp.ActionsRun = make([]ActionLog, len(he.operations[i].ActionsRun))
        copy(cp.ActionsRun, he.operations[i].ActionsRun)
        if he.operations[i].Diagnosis != nil {
            diag := *he.operations[i].Diagnosis
            cp.Diagnosis = &diag
        }
        result = append(result, cp)
    }
    return result
}

// StrategyCount returns the number of registered strategies.
func (he *HealingEngine) StrategyCount() int {
    he.mu.RLock()
    defer he.mu.RUnlock()
    return len(he.strategies)
}
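The engine performs no side effects itself; everything real goes through the injected ActionExecutorFunc. A minimal executor sketch follows; the cases are placeholders (the comments describe what a real implementation would do) and none of this is part of the commit:

package resilience

import (
    "context"
    "fmt"
)

// exampleExecutor is a hypothetical ActionExecutorFunc; each case is a
// placeholder for a real system operation.
func exampleExecutor(ctx context.Context, action Action, component string) error {
    switch action.Type {
    case ActionGracefulStop:
        // Placeholder: ask a supervisor to stop `component`, honoring ctx.
        return nil
    case ActionStartComponent:
        // Placeholder: ask the supervisor to start the component again.
        return nil
    case ActionVerifyHealth:
        // Placeholder: poll the component's health endpoint until healthy or ctx expires.
        return nil
    default:
        // Fail loudly on unknown actions so strategy typos surface in ActionLog.
        return fmt.Errorf("unhandled action %q for component %q", action.Type, component)
    }
}

It would be wired as NewHealingEngine(alertCh, exampleExecutor, escalateFn) with an alert channel and escalation callback of the caller's choosing; executeActions already wraps each call in a per-action timeout context when Action.Timeout > 0, so the executor only needs to respect ctx.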
588  internal/application/resilience/healing_engine_test.go  Normal file

@@ -0,0 +1,588 @@
package resilience

import (
    "context"
    "fmt"
    "sync/atomic"
    "testing"
    "time"
)

// --- Mock executor for tests ---

type mockExecutorLog struct {
    actions []ActionType
    fail    map[ActionType]bool
    count   atomic.Int64
}

func newMockExecutor() *mockExecutorLog {
    return &mockExecutorLog{
        fail: make(map[ActionType]bool),
    }
}

func (m *mockExecutorLog) execute(_ context.Context, action Action, _ string) error {
    m.count.Add(1)
    m.actions = append(m.actions, action.Type)
    if m.fail[action.Type] {
        return fmt.Errorf("action %s failed", action.Type)
    }
    return nil
}

// --- Healing Engine Tests ---

// HE-01: Component restart (success).
func TestHealingEngine_HE01_RestartSuccess(t *testing.T) {
    mock := newMockExecutor()
    alertCh := make(chan HealthAlert, 10)
    escalated := false

    he := NewHealingEngine(alertCh, mock.execute, func(_ HealthAlert) {
        escalated = true
    })
    he.RegisterStrategy(RestartComponentStrategy())

    alertCh <- HealthAlert{
        Component:       "soc-ingest",
        Severity:        SeverityCritical,
        Metric:          "quorum",
        SuggestedAction: "restart",
        Timestamp:       time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()

    // Run one healing cycle.
    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    ops := he.RecentOperations(10)
    if len(ops) == 0 {
        t.Fatal("expected at least 1 operation")
    }
    if ops[0].Result != ResultSuccess {
        t.Errorf("expected SUCCESS, got %s (error: %s)", ops[0].Result, ops[0].Error)
    }
    if escalated {
        t.Error("should not have escalated on success")
    }
}

// HE-02: Component restart (failure ×3 → escalate).
func TestHealingEngine_HE02_RestartFailureEscalate(t *testing.T) {
    mock := newMockExecutor()
    mock.fail[ActionStartComponent] = true // Start always fails.

    alertCh := make(chan HealthAlert, 10)
    escalated := false

    he := NewHealingEngine(alertCh, mock.execute, func(_ HealthAlert) {
        escalated = true
    })
    he.RegisterStrategy(RestartComponentStrategy())

    alertCh <- HealthAlert{
        Component:       "soc-correlate",
        Severity:        SeverityCritical,
        Metric:          "quorum",
        SuggestedAction: "restart",
        Timestamp:       time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    if !escalated {
        t.Error("expected escalation on failure")
    }

    ops := he.RecentOperations(10)
    if len(ops) == 0 {
        t.Fatal("expected operation")
    }
    if ops[0].Result != ResultFailed {
        t.Errorf("expected FAILED, got %s", ops[0].Result)
    }
}

// HE-03: Config rollback strategy matching.
func TestHealingEngine_HE03_ConfigRollback(t *testing.T) {
    mock := newMockExecutor()
    alertCh := make(chan HealthAlert, 10)

    he := NewHealingEngine(alertCh, mock.execute, nil)
    he.RegisterStrategy(RollbackConfigStrategy())

    alertCh <- HealthAlert{
        Component: "soc-ingest",
        Severity:  SeverityWarning,
        Metric:    "config_tampering",
        Timestamp: time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    ops := he.RecentOperations(10)
    if len(ops) == 0 {
        t.Fatal("expected operation for config rollback")
    }
    if ops[0].StrategyID != "ROLLBACK_CONFIG" {
        t.Errorf("expected ROLLBACK_CONFIG, got %s", ops[0].StrategyID)
    }
}

// HE-04: Database recovery.
func TestHealingEngine_HE04_DatabaseRecovery(t *testing.T) {
    mock := newMockExecutor()
    alertCh := make(chan HealthAlert, 10)

    he := NewHealingEngine(alertCh, mock.execute, nil)
    he.RegisterStrategy(RecoverDatabaseStrategy())

    alertCh <- HealthAlert{
        Component: "soc-correlate",
        Severity:  SeverityCritical,
        Metric:    "database_corruption",
        Timestamp: time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    ops := he.RecentOperations(10)
    if len(ops) == 0 {
        t.Fatal("expected DB recovery op")
    }
    if ops[0].StrategyID != "RECOVER_DATABASE" {
        t.Errorf("expected RECOVER_DATABASE, got %s", ops[0].StrategyID)
    }
}

// HE-05: Rule poisoning defense.
func TestHealingEngine_HE05_RulePoisoning(t *testing.T) {
    mock := newMockExecutor()
    alertCh := make(chan HealthAlert, 10)

    he := NewHealingEngine(alertCh, mock.execute, nil)
    he.RegisterStrategy(RecoverRulesStrategy())

    alertCh <- HealthAlert{
        Component: "soc-correlate",
        Severity:  SeverityWarning,
        Metric:    "rule_execution_failure_rate",
        Timestamp: time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    ops := he.RecentOperations(10)
    if len(ops) == 0 {
        t.Fatal("expected rule recovery op")
    }
    if ops[0].StrategyID != "RECOVER_RULES" {
        t.Errorf("expected RECOVER_RULES, got %s", ops[0].StrategyID)
    }
}

// HE-06: Network isolation recovery.
func TestHealingEngine_HE06_NetworkRecovery(t *testing.T) {
    mock := newMockExecutor()
    alertCh := make(chan HealthAlert, 10)

    he := NewHealingEngine(alertCh, mock.execute, nil)
    he.RegisterStrategy(RecoverNetworkStrategy())

    alertCh <- HealthAlert{
        Component: "soc-respond",
        Severity:  SeverityWarning,
        Metric:    "network_partition",
        Timestamp: time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    ops := he.RecentOperations(10)
    if len(ops) == 0 {
        t.Fatal("expected network recovery op")
    }
    if ops[0].StrategyID != "RECOVER_NETWORK" {
        t.Errorf("expected RECOVER_NETWORK, got %s", ops[0].StrategyID)
    }
}

// HE-07: Cooldown enforcement.
func TestHealingEngine_HE07_Cooldown(t *testing.T) {
    mock := newMockExecutor()
    alertCh := make(chan HealthAlert, 10)

    he := NewHealingEngine(alertCh, mock.execute, nil)
    he.RegisterStrategy(RestartComponentStrategy())

    // Set cooldown manually.
    he.setCooldown("RESTART_COMPONENT", 1*time.Hour)

    if !he.isInCooldown("RESTART_COMPONENT") {
        t.Error("expected cooldown active")
    }

    alertCh <- HealthAlert{
        Component:       "soc-ingest",
        Severity:        SeverityCritical,
        Metric:          "quorum",
        SuggestedAction: "restart",
        Timestamp:       time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    ops := he.RecentOperations(10)
    if len(ops) != 0 {
        t.Error("expected 0 operations during cooldown")
    }
}

// HE-08: Rollback on failure.
func TestHealingEngine_HE08_Rollback(t *testing.T) {
    mock := newMockExecutor()
    mock.fail[ActionStartComponent] = true

    alertCh := make(chan HealthAlert, 10)
    he := NewHealingEngine(alertCh, mock.execute, func(_ HealthAlert) {})

    strategy := RollbackConfigStrategy()
    he.RegisterStrategy(strategy)

    alertCh <- HealthAlert{
        Component: "soc-ingest",
        Severity:  SeverityWarning,
        Metric:    "config_tampering",
        Timestamp: time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    // Rollback should have executed enter_safe_mode.
    foundSafeMode := false
    for _, a := range mock.actions {
        if a == ActionEnterSafeMode {
            foundSafeMode = true
        }
    }
    if !foundSafeMode {
        t.Errorf("expected safe mode in rollback, actions: %v", mock.actions)
    }
}

// HE-09: State machine transitions.
func TestHealingEngine_HE09_StateTransitions(t *testing.T) {
    mock := newMockExecutor()
    alertCh := make(chan HealthAlert, 10)

    he := NewHealingEngine(alertCh, mock.execute, nil)
    he.RegisterStrategy(RestartComponentStrategy())

    alertCh <- HealthAlert{
        Component:       "comp",
        Severity:        SeverityCritical,
        Metric:          "quorum",
        SuggestedAction: "restart",
        Timestamp:       time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    ops := he.RecentOperations(10)
    if len(ops) == 0 {
        t.Fatal("expected operation")
    }
    // Final state should be COMPLETED.
    if ops[0].State != HealingCompleted {
        t.Errorf("expected COMPLETED, got %s", ops[0].State)
    }
}

// HE-10: Audit logging; all actions recorded.
func TestHealingEngine_HE10_AuditLogging(t *testing.T) {
    mock := newMockExecutor()
    alertCh := make(chan HealthAlert, 10)

    he := NewHealingEngine(alertCh, mock.execute, nil)
    he.RegisterStrategy(RestartComponentStrategy())

    alertCh <- HealthAlert{
        Component:       "comp",
        Severity:        SeverityCritical,
        Metric:          "quorum",
        SuggestedAction: "restart",
        Timestamp:       time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    ops := he.RecentOperations(10)
    if len(ops) == 0 {
        t.Fatal("expected operation")
    }
    if len(ops[0].ActionsRun) == 0 {
        t.Error("expected action logs")
    }
    for _, al := range ops[0].ActionsRun {
        if al.StartedAt.IsZero() {
            t.Error("action log missing start time")
        }
    }
}

// HE-11: Parallel healing; no race conditions.
func TestHealingEngine_HE11_Parallel(t *testing.T) {
    mock := newMockExecutor()
    alertCh := make(chan HealthAlert, 100)

    he := NewHealingEngine(alertCh, mock.execute, nil)
    for _, s := range DefaultStrategies() {
        he.RegisterStrategy(s)
    }

    // Send many alerts concurrently.
    for i := 0; i < 10; i++ {
        alertCh <- HealthAlert{
            Component:       fmt.Sprintf("comp-%d", i),
            Severity:        SeverityCritical,
            Metric:          "quorum",
            SuggestedAction: "restart",
            Timestamp:       time.Now(),
        }
    }

    ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(1 * time.Second)
    cancel()

    // All 10 alerts processed (the first gets an op, the rest hit cooldown).
    ops := he.RecentOperations(100)
    if len(ops) == 0 {
        t.Fatal("expected at least 1 operation")
    }
}

// HE-12: No matching strategy → no operation.
func TestHealingEngine_HE12_NoStrategy(t *testing.T) {
    mock := newMockExecutor()
    alertCh := make(chan HealthAlert, 10)

    he := NewHealingEngine(alertCh, mock.execute, nil)
    // No strategies registered.

    alertCh <- HealthAlert{
        Component: "comp",
        Severity:  SeverityCritical,
        Metric:    "unknown_metric",
        Timestamp: time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    ops := he.RecentOperations(10)
    if len(ops) != 0 {
        t.Errorf("expected 0 operations, got %d", len(ops))
    }
}

// Test diagnosis (various root causes).
func TestHealingEngine_Diagnosis(t *testing.T) {
    mock := newMockExecutor()
    he := NewHealingEngine(nil, mock.execute, nil)

    tests := []struct {
        metric    string
        current   float64
        wantCause string
    }{
        {"memory", 95, "memory_exhaustion"},
        {"cpu", 95, "cpu_saturation"},
        {"error_rate", 10, "elevated_error_rate"},
        {"latency_p99", 200, "latency_degradation"},
        {"quorum", 0.3, "quorum_loss"},
        {"custom", 100, "threshold_breach_custom"},
    }

    for _, tt := range tests {
        alert := HealthAlert{
            Component: "test",
            Metric:    tt.metric,
            Current:   tt.current,
        }
        d := he.diagnose(alert)
        if d.RootCause != tt.wantCause {
            t.Errorf("metric=%s: expected %s, got %s", tt.metric, tt.wantCause, d.RootCause)
        }
        if d.Confidence <= 0 || d.Confidence > 1 {
            t.Errorf("metric=%s: invalid confidence %f", tt.metric, d.Confidence)
        }
    }
}

// Test DefaultStrategies returns 5 strategies.
func TestDefaultStrategies(t *testing.T) {
    strategies := DefaultStrategies()
    if len(strategies) != 5 {
        t.Errorf("expected 5 strategies, got %d", len(strategies))
    }

    ids := map[string]bool{}
    for _, s := range strategies {
        if ids[s.ID] {
            t.Errorf("duplicate strategy ID: %s", s.ID)
        }
        ids[s.ID] = true
        if s.MaxAttempts <= 0 {
            t.Errorf("strategy %s: invalid max_attempts %d", s.ID, s.MaxAttempts)
        }
        if s.Cooldown <= 0 {
            t.Errorf("strategy %s: invalid cooldown %v", s.ID, s.Cooldown)
        }
        if len(s.Actions) == 0 {
            t.Errorf("strategy %s: no actions defined", s.ID)
        }
    }
}

// Test StrategyCount.
func TestHealingEngine_StrategyCount(t *testing.T) {
    he := NewHealingEngine(nil, nil, nil)
    if he.StrategyCount() != 0 {
        t.Error("expected 0")
    }
    for _, s := range DefaultStrategies() {
        he.RegisterStrategy(s)
    }
    if he.StrategyCount() != 5 {
        t.Errorf("expected 5, got %d", he.StrategyCount())
    }
}

// Test GetOperation.
func TestHealingEngine_GetOperation(t *testing.T) {
    mock := newMockExecutor()
    alertCh := make(chan HealthAlert, 10)

    he := NewHealingEngine(alertCh, mock.execute, nil)
    he.RegisterStrategy(RestartComponentStrategy())

    alertCh <- HealthAlert{
        Component:       "comp",
        Severity:        SeverityCritical,
        Metric:          "quorum",
        SuggestedAction: "restart",
        Timestamp:       time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    op, ok := he.GetOperation("heal-1")
    if !ok {
        t.Fatal("expected operation heal-1")
    }
    if op.Component != "comp" {
        t.Errorf("expected comp, got %s", op.Component)
    }

    _, ok = he.GetOperation("nonexistent")
    if ok {
        t.Error("expected not found for nonexistent")
    }
}

// Test action OnError=continue.
func TestHealingEngine_ActionContinueOnError(t *testing.T) {
    mock := newMockExecutor()
    mock.fail[ActionGracefulStop] = true // First action fails but is marked continue.

    alertCh := make(chan HealthAlert, 10)
    he := NewHealingEngine(alertCh, mock.execute, nil)
    he.RegisterStrategy(RestartComponentStrategy())

    alertCh <- HealthAlert{
        Component:       "comp",
        Severity:        SeverityCritical,
        Metric:          "quorum",
        SuggestedAction: "restart",
        Timestamp:       time.Now(),
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()

    go he.Start(ctx)
    time.Sleep(200 * time.Millisecond)
    cancel()

    ops := he.RecentOperations(10)
    if len(ops) == 0 {
        t.Fatal("expected operation")
    }
    // Should still succeed because graceful_stop has OnError=continue.
    if ops[0].Result != ResultSuccess {
        t.Errorf("expected SUCCESS (continue on error), got %s", ops[0].Result)
    }
}
215  internal/application/resilience/healing_strategies.go  Normal file

@@ -0,0 +1,215 @@
package resilience

import "time"

// Built-in healing strategies per the technical specification, §4.1.1.
// These are registered at startup via HealingEngine.RegisterStrategy().

// DefaultStrategies returns the 5 built-in healing strategies.
func DefaultStrategies() []HealingStrategy {
    return []HealingStrategy{
        RestartComponentStrategy(),
        RollbackConfigStrategy(),
        RecoverDatabaseStrategy(),
        RecoverRulesStrategy(),
        RecoverNetworkStrategy(),
    }
}

// RestartComponentStrategy handles component crashes and offline states.
// Trigger: component_offline OR component_critical, 2 consecutive failures within 5m.
// Actions: graceful_stop → clear_temp → start → verify → notify.
// Rollback: escalate to the next strategy.
func RestartComponentStrategy() HealingStrategy {
    return HealingStrategy{
        ID:   "RESTART_COMPONENT",
        Name: "Component Restart",
        Trigger: TriggerCondition{
            Statuses:            []ComponentStatus{StatusOffline, StatusCritical},
            ConsecutiveFailures: 2,
            WithinWindow:        5 * time.Minute,
        },
        Actions: []Action{
            {Type: ActionGracefulStop, Timeout: 10 * time.Second, OnError: "continue"},
            {Type: ActionClearTempFiles, Timeout: 5 * time.Second, OnError: "continue"},
            {Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "abort"},
            {Type: ActionVerifyHealth, Timeout: 60 * time.Second, OnError: "abort"},
            {Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
                Params: map[string]interface{}{
                    "severity": "INFO",
                    "message":  "Component restarted successfully",
                },
            },
        },
        Rollback: RollbackPlan{
            OnFailure: "escalate",
            Actions: []Action{
                {Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
                    Params: map[string]interface{}{
                        "severity": "CRITICAL",
                        "message":  "Component restart failed after max attempts",
                    },
                },
            },
        },
        MaxAttempts: 3,
        Cooldown:    5 * time.Minute,
    }
}

// RollbackConfigStrategy handles config tampering or validation failures.
// Trigger: config_tampering_detected OR config_validation_failed.
// Actions: freeze → verify_backup → rollback → restart → verify → notify.
func RollbackConfigStrategy() HealingStrategy {
    return HealingStrategy{
        ID:   "ROLLBACK_CONFIG",
        Name: "Configuration Rollback",
        Trigger: TriggerCondition{
            Metrics: []string{"config_tampering", "config_validation"},
        },
        Actions: []Action{
            {Type: ActionFreezeConfig, Timeout: 5 * time.Second, OnError: "abort"},
            {Type: ActionRollbackConfig, Timeout: 15 * time.Second, OnError: "abort"},
            {Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "rollback"},
            {Type: ActionVerifyConfig, Timeout: 10 * time.Second, OnError: "abort"},
            {Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
                Params: map[string]interface{}{
                    "severity": "WARNING",
                    "message":  "Config rolled back due to tampering",
                },
            },
        },
        Rollback: RollbackPlan{
            OnFailure: "enter_safe_mode",
            Actions: []Action{
                {Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
            },
        },
        MaxAttempts: 1,
        Cooldown:    1 * time.Hour,
    }
}

// RecoverDatabaseStrategy handles SQLite corruption.
// Trigger: database_corruption OR sqlite_integrity_failed.
// Actions: readonly → backup → restore → verify → resume → notify.
func RecoverDatabaseStrategy() HealingStrategy {
    return HealingStrategy{
        ID:   "RECOVER_DATABASE",
        Name: "Database Recovery",
        Trigger: TriggerCondition{
            Metrics: []string{"database_corruption", "sqlite_integrity"},
        },
        Actions: []Action{
            {Type: ActionSwitchReadOnly, Timeout: 5 * time.Second, OnError: "abort"},
            {Type: ActionBackupDB, Timeout: 30 * time.Second, OnError: "continue"},
            {Type: ActionRestoreSnapshot, Timeout: 60 * time.Second, OnError: "abort",
                Params: map[string]interface{}{
                    "snapshot_age_max": "1h",
                },
            },
            {Type: ActionVerifyIntegrity, Timeout: 30 * time.Second, OnError: "abort"},
            {Type: ActionResumeWrites, Timeout: 5 * time.Second, OnError: "abort"},
            {Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
                Params: map[string]interface{}{
                    "severity": "WARNING",
                    "message":  "Database recovered from snapshot",
                },
            },
        },
        Rollback: RollbackPlan{
            OnFailure: "enter_lockdown",
            Actions: []Action{
                {Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
                {Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
                    Params: map[string]interface{}{
                        "severity": "CRITICAL",
                        "message":  "Database recovery failed",
                    },
                },
            },
        },
        MaxAttempts: 2,
        Cooldown:    2 * time.Hour,
    }
}

// RecoverRulesStrategy handles correlation rule poisoning.
// Trigger: rule execution failure rate > 50%.
// Actions: disable_suspicious → revert_baseline → verify → reload → notify.
func RecoverRulesStrategy() HealingStrategy {
    return HealingStrategy{
        ID:   "RECOVER_RULES",
        Name: "Rule Poisoning Defense",
        Trigger: TriggerCondition{
            Metrics: []string{"rule_execution_failure_rate", "correlation_rule_anomaly"},
        },
        Actions: []Action{
            {Type: ActionDisableRules, Timeout: 10 * time.Second, OnError: "abort",
                Params: map[string]interface{}{
                    "criteria": "failure_rate > 80%",
                },
            },
            {Type: ActionRevertRules, Timeout: 15 * time.Second, OnError: "abort"},
            {Type: ActionReloadEngine, Timeout: 30 * time.Second, OnError: "abort"},
            {Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "continue"},
            {Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
                Params: map[string]interface{}{
                    "severity": "WARNING",
                    "message":  "Rules recovered from baseline",
                },
            },
        },
        Rollback: RollbackPlan{
            OnFailure: "disable_correlation",
        },
        MaxAttempts: 2,
        Cooldown:    4 * time.Hour,
    }
}

// RecoverNetworkStrategy handles network partition or mTLS cert expiry.
// Trigger: network_partition_detected OR mTLS_cert_expired.
// Actions: isolate → regen_certs → verify → restore → notify.
func RecoverNetworkStrategy() HealingStrategy {
    return HealingStrategy{
        ID:   "RECOVER_NETWORK",
        Name: "Network Isolation Recovery",
        Trigger: TriggerCondition{
            Metrics: []string{"network_partition", "mtls_cert_expiry"},
        },
        Actions: []Action{
            {Type: ActionIsolateNetwork, Timeout: 5 * time.Second, OnError: "abort",
                Params: map[string]interface{}{
                    "scope": "external_only",
                },
            },
            {Type: ActionRegenCerts, Timeout: 30 * time.Second, OnError: "abort",
                Params: map[string]interface{}{
                    "validity": "24h",
                },
            },
            {Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "rollback"},
            {Type: ActionRestoreNetwork, Timeout: 10 * time.Second, OnError: "abort"},
            {Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
                Params: map[string]interface{}{
                    "severity": "INFO",
                    "message":  "Network connectivity restored",
                },
            },
        },
        Rollback: RollbackPlan{
            OnFailure: "maintain_isolation",
            Actions: []Action{
                {Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
                    Params: map[string]interface{}{
                        "severity": "CRITICAL",
                        "message":  "Network recovery failed, maintaining isolation",
                    },
                },
            },
        },
        MaxAttempts: 3,
        Cooldown:    1 * time.Hour,
    }
}
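Because strategies are plain data, deployments are not limited to the five built-ins. A hypothetical custom strategy sketch; the ID, trigger metric, and action sequence are illustrative only and not part of this commit:

package resilience

import "time"

// flushRestartStrategy is a hypothetical custom strategy; the metric
// "cache_hit_rate" is illustrative and not used elsewhere in this commit.
func flushRestartStrategy() HealingStrategy {
    return HealingStrategy{
        ID:   "FLUSH_AND_RESTART",
        Name: "Cache Flush and Restart",
        Trigger: TriggerCondition{
            Metrics: []string{"cache_hit_rate"},
        },
        Actions: []Action{
            {Type: ActionGracefulStop, Timeout: 10 * time.Second, OnError: "continue"},
            {Type: ActionClearTempFiles, Timeout: 5 * time.Second, OnError: "continue"},
            {Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "abort"},
            {Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "abort"},
        },
        Rollback:    RollbackPlan{OnFailure: "escalate"},
        MaxAttempts: 2,
        Cooldown:    30 * time.Minute,
    }
}

Registering it is one call: he.RegisterStrategy(flushRestartStrategy()). Note that findStrategy returns the first match, so registration order matters when trigger conditions overlap.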
445  internal/application/resilience/health_monitor.go  Normal file

@@ -0,0 +1,445 @@
package resilience

import (
	"context"
	"fmt"
	"log/slog"
	"sync"
	"time"
)

// ComponentStatus defines the health state of a monitored component.
type ComponentStatus string

const (
	StatusHealthy  ComponentStatus = "HEALTHY"
	StatusDegraded ComponentStatus = "DEGRADED"
	StatusCritical ComponentStatus = "CRITICAL"
	StatusOffline  ComponentStatus = "OFFLINE"
)

// AlertSeverity defines the severity of a health alert.
type AlertSeverity string

const (
	SeverityInfo     AlertSeverity = "INFO"
	SeverityWarning  AlertSeverity = "WARNING"
	SeverityCritical AlertSeverity = "CRITICAL"
)

// OverallStatus aggregates component statuses into a system-wide status.
type OverallStatus string

const (
	OverallHealthy  OverallStatus = "HEALTHY"
	OverallDegraded OverallStatus = "DEGRADED"
	OverallCritical OverallStatus = "CRITICAL"
)

// Default intervals per spec §3.1.2.
const (
	MetricsCollectionInterval = 10 * time.Second
	HealthCheckInterval       = 30 * time.Second
	QuorumValidationInterval  = 60 * time.Second

	// AnomalyZScoreThreshold — |Z| > 3.0 is an anomaly (99.7% confidence).
	AnomalyZScoreThreshold = 3.0

	// QuorumThreshold — at least 2/3 of components must be healthy.
	QuorumThreshold = 0.66

	// MaxConsecutiveFailures before marking a component CRITICAL.
	MaxConsecutiveFailures = 3
)

// ComponentConfig defines monitoring thresholds for a component.
type ComponentConfig struct {
	Name       string             `json:"name"`
	Type       string             `json:"type"` // go_binary, c_binary, c_kernel_module
	Thresholds map[string]float64 `json:"thresholds"`
	// Whether each threshold is an upper bound (true) or lower bound (false).
	ThresholdIsMax map[string]bool `json:"threshold_is_max"`
}

// ComponentHealth tracks the health state of a single component.
type ComponentHealth struct {
	Name        string             `json:"name"`
	Status      ComponentStatus    `json:"status"`
	Metrics     map[string]float64 `json:"metrics"`
	LastCheck   time.Time          `json:"last_check"`
	Consecutive int                `json:"consecutive_failures"`
	Config      ComponentConfig    `json:"-"`
}

// HealthAlert represents a detected health anomaly.
type HealthAlert struct {
	Component       string        `json:"component"`
	Severity        AlertSeverity `json:"severity"`
	Metric          string        `json:"metric"`
	Current         float64       `json:"current"`
	Threshold       float64       `json:"threshold"`
	ZScore          float64       `json:"z_score,omitempty"`
	Timestamp       time.Time     `json:"timestamp"`
	SuggestedAction string        `json:"suggested_action"`
}

// HealthResponse is the API response for GET /api/v1/resilience/health.
type HealthResponse struct {
	OverallStatus     OverallStatus     `json:"overall_status"`
	Components        []ComponentHealth `json:"components"`
	QuorumValid       bool              `json:"quorum_valid"`
	LastCheck         time.Time         `json:"last_check"`
	AnomaliesDetected []HealthAlert     `json:"anomalies_detected"`
}

// MetricsCollector is the interface for collecting metrics from components.
// Implementations can use /healthz endpoints, /metrics, or runtime stats.
type MetricsCollector interface {
	Collect(ctx context.Context, component string) (map[string]float64, error)
}
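
// Illustrative sketch (an assumption, not part of this file): a collector
// backed by the process's own runtime stats. A production implementation
// would more likely scrape each component's /healthz or /metrics endpoint.
// Uncommenting this requires importing "runtime".
//
//	type runtimeCollector struct{}
//
//	func (runtimeCollector) Collect(_ context.Context, _ string) (map[string]float64, error) {
//		var ms runtime.MemStats
//		runtime.ReadMemStats(&ms)
//		return map[string]float64{
//			"goroutines":    float64(runtime.NumGoroutine()),
//			"heap_alloc_mb": float64(ms.HeapAlloc) / (1024 * 1024),
//		}, nil
//	}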

// HealthMonitor is the L1 Self-Monitoring orchestrator.
// It collects metrics, runs anomaly detection, validates quorum,
// and emits HealthAlerts to the alert bus.
type HealthMonitor struct {
	mu         sync.RWMutex
	components map[string]*ComponentHealth
	metricsDB  *MetricsDB
	alertBus   chan HealthAlert
	collector  MetricsCollector
	logger     *slog.Logger

	// anomalyWindow is the baseline window for Z-score calculation.
	anomalyWindow time.Duration
}

// NewHealthMonitor creates a new health monitor.
func NewHealthMonitor(collector MetricsCollector, alertBufSize int) *HealthMonitor {
	if alertBufSize <= 0 {
		alertBufSize = 100
	}
	return &HealthMonitor{
		components:    make(map[string]*ComponentHealth),
		metricsDB:     NewMetricsDB(DefaultMetricsWindow, DefaultMetricsMaxSize),
		alertBus:      make(chan HealthAlert, alertBufSize),
		collector:     collector,
		logger:        slog.Default().With("component", "sarl-health-monitor"),
		anomalyWindow: 24 * time.Hour,
	}
}

// RegisterComponent adds a component to be monitored.
func (hm *HealthMonitor) RegisterComponent(config ComponentConfig) {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	hm.components[config.Name] = &ComponentHealth{
		Name:    config.Name,
		Status:  StatusHealthy,
		Metrics: make(map[string]float64),
		Config:  config,
	}
	hm.logger.Info("component registered", "name", config.Name, "type", config.Type)
}

// AlertBus returns the channel for consuming health alerts.
func (hm *HealthMonitor) AlertBus() <-chan HealthAlert {
	return hm.alertBus
}

// Start begins the monitoring loops. Blocks until ctx is cancelled.
func (hm *HealthMonitor) Start(ctx context.Context) {
	hm.logger.Info("health monitor started")

	metricsTicker := time.NewTicker(MetricsCollectionInterval)
	healthTicker := time.NewTicker(HealthCheckInterval)
	quorumTicker := time.NewTicker(QuorumValidationInterval)
	defer metricsTicker.Stop()
	defer healthTicker.Stop()
	defer quorumTicker.Stop()

	for {
		select {
		case <-ctx.Done():
			hm.logger.Info("health monitor stopped")
			return
		case <-metricsTicker.C:
			hm.collectMetrics(ctx)
		case <-healthTicker.C:
			hm.checkHealth()
		case <-quorumTicker.C:
			hm.validateQuorum()
		}
	}
}

// collectMetrics gathers metrics from all registered components.
func (hm *HealthMonitor) collectMetrics(ctx context.Context) {
	hm.mu.RLock()
	names := make([]string, 0, len(hm.components))
	for name := range hm.components {
		names = append(names, name)
	}
	hm.mu.RUnlock()

	for _, name := range names {
		metrics, err := hm.collector.Collect(ctx, name)
		if err != nil {
			hm.logger.Warn("metrics collection failed", "component", name, "error", err)
			hm.mu.Lock()
			if comp, ok := hm.components[name]; ok {
				comp.Consecutive++
			}
			hm.mu.Unlock()
			continue
		}

		hm.mu.Lock()
		comp, ok := hm.components[name]
		if ok {
			comp.Metrics = metrics
			comp.LastCheck = time.Now()
			// Store each metric in the time-series DB.
			for metric, value := range metrics {
				hm.metricsDB.AddDataPoint(name, metric, value)
			}
		}
		hm.mu.Unlock()
	}
}

// checkHealth evaluates each component against thresholds and anomalies.
func (hm *HealthMonitor) checkHealth() {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	for _, comp := range hm.components {
		alerts := hm.evaluateComponent(comp)
		for _, alert := range alerts {
			hm.emitAlert(alert)
		}
	}
}

// evaluateComponent checks a single component's metrics against thresholds
// and runs Z-score anomaly detection. Returns any generated alerts.
func (hm *HealthMonitor) evaluateComponent(comp *ComponentHealth) []HealthAlert {
	var alerts []HealthAlert
	breached := false

	for metric, value := range comp.Metrics {
		threshold, hasThreshold := comp.Config.Thresholds[metric]
		if !hasThreshold {
			continue
		}

		isMax := comp.Config.ThresholdIsMax[metric]
		var exceeded bool
		if isMax {
			exceeded = value > threshold
		} else {
			exceeded = value < threshold
		}

		if exceeded {
			breached = true
			action := "restart"
			if metric == "error_rate" || metric == "latency_p99" {
				action = "investigate"
			}

			alerts = append(alerts, HealthAlert{
				Component:       comp.Name,
				Severity:        SeverityWarning,
				Metric:          metric,
				Current:         value,
				Threshold:       threshold,
				Timestamp:       time.Now(),
				SuggestedAction: action,
			})
		}

		// Z-score anomaly detection.
		baseline := hm.metricsDB.GetBaseline(comp.Name, metric, hm.anomalyWindow)
		if IsAnomaly(value, baseline, AnomalyZScoreThreshold) {
			zscore := CalculateZScore(value, baseline)
			alerts = append(alerts, HealthAlert{
				Component:       comp.Name,
				Severity:        SeverityCritical,
				Metric:          metric,
				Current:         value,
				Threshold:       baseline.Mean + AnomalyZScoreThreshold*baseline.StdDev,
				ZScore:          zscore,
				Timestamp:       time.Now(),
				SuggestedAction: fmt.Sprintf("anomaly detected (Z=%.2f), investigate %s", zscore, metric),
			})
		}
	}

	// Update the component status.
	if breached {
		comp.Consecutive++
		if comp.Consecutive >= MaxConsecutiveFailures {
			comp.Status = StatusCritical
		} else {
			comp.Status = StatusDegraded
		}
	} else {
		comp.Consecutive = 0
		comp.Status = StatusHealthy
	}

	return alerts
}

// emitAlert sends an alert to the bus (non-blocking).
func (hm *HealthMonitor) emitAlert(alert HealthAlert) {
	select {
	case hm.alertBus <- alert:
		hm.logger.Warn("health alert emitted",
			"component", alert.Component,
			"severity", alert.Severity,
			"metric", alert.Metric,
			"current", alert.Current,
			"threshold", alert.Threshold,
		)
	default:
		hm.logger.Error("alert bus full, dropping alert",
			"component", alert.Component,
			"metric", alert.Metric,
		)
	}
}

// validateQuorum checks if 2/3 of components are healthy.
func (hm *HealthMonitor) validateQuorum() {
	hm.mu.RLock()
	defer hm.mu.RUnlock()

	if len(hm.components) == 0 {
		return
	}

	valid := ValidateQuorum(hm.componentStatuses())

	if !valid {
		hm.logger.Error("QUORUM LOST — entering degraded state",
			"healthy_ratio", hm.healthyRatio(),
			"threshold", QuorumThreshold,
		)
		hm.emitAlert(HealthAlert{
			Component:       "system",
			Severity:        SeverityCritical,
			Metric:          "quorum",
			Current:         hm.healthyRatio(),
			Threshold:       QuorumThreshold,
			Timestamp:       time.Now(),
			SuggestedAction: "activate safe mode",
		})
	}
}

// ValidateQuorum checks if the healthy ratio meets the 2/3 threshold.
func ValidateQuorum(statuses map[string]ComponentStatus) bool {
	if len(statuses) == 0 {
		return false
	}

	healthy := 0
	for _, status := range statuses {
		if status == StatusHealthy {
			healthy++
		}
	}
	return float64(healthy)/float64(len(statuses)) >= QuorumThreshold
}
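
// Worked example: with 6 components, 4 healthy gives 4/6 ≈ 0.667 ≥ 0.66, so
// quorum holds; 3 healthy gives 0.5 and quorum is lost. Exactly 2/3 passes
// the threshold, as exercised by the HM-06b test.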

// componentStatuses returns the current status map (caller must hold RLock).
func (hm *HealthMonitor) componentStatuses() map[string]ComponentStatus {
	statuses := make(map[string]ComponentStatus, len(hm.components))
	for name, comp := range hm.components {
		statuses[name] = comp.Status
	}
	return statuses
}

// healthyRatio returns the fraction of healthy components (caller must hold RLock).
func (hm *HealthMonitor) healthyRatio() float64 {
	if len(hm.components) == 0 {
		return 0
	}
	healthy := 0
	for _, comp := range hm.components {
		if comp.Status == StatusHealthy {
			healthy++
		}
	}
	return float64(healthy) / float64(len(hm.components))
}

// GetHealth returns a snapshot of the entire system health.
func (hm *HealthMonitor) GetHealth() HealthResponse {
	hm.mu.RLock()
	defer hm.mu.RUnlock()

	components := make([]ComponentHealth, 0, len(hm.components))
	for _, comp := range hm.components {
		cp := *comp
		// Deep copy metrics.
		cp.Metrics = make(map[string]float64, len(comp.Metrics))
		for k, v := range comp.Metrics {
			cp.Metrics[k] = v
		}
		components = append(components, cp)
	}

	overall := OverallHealthy
	for _, comp := range components {
		switch comp.Status {
		case StatusCritical, StatusOffline:
			overall = OverallCritical
		case StatusDegraded:
			if overall != OverallCritical {
				overall = OverallDegraded
			}
		}
	}

	return HealthResponse{
		OverallStatus: overall,
		Components:    components,
		QuorumValid:   ValidateQuorum(hm.componentStatuses()),
		LastCheck:     time.Now(),
	}
}

// SetComponentStatus manually sets a component's status (for testing/override).
func (hm *HealthMonitor) SetComponentStatus(name string, status ComponentStatus) {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	if comp, ok := hm.components[name]; ok {
		comp.Status = status
	}
}

// UpdateMetrics manually updates a component's metrics (for testing/override).
func (hm *HealthMonitor) UpdateMetrics(name string, metrics map[string]float64) {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	if comp, ok := hm.components[name]; ok {
		comp.Metrics = metrics
		comp.LastCheck = time.Now()
		for metric, value := range metrics {
			hm.metricsDB.AddDataPoint(name, metric, value)
		}
	}
}

// ComponentCount returns the number of registered components.
func (hm *HealthMonitor) ComponentCount() int {
	hm.mu.RLock()
	defer hm.mu.RUnlock()
	return len(hm.components)
}
499
internal/application/resilience/health_monitor_test.go
Normal file
@@ -0,0 +1,499 @@
package resilience

import (
	"context"
	"fmt"
	"math"
	"testing"
	"time"
)

// --- MetricsDB Tests ---

func TestRingBuffer_AddAndAll(t *testing.T) {
	rb := newRingBuffer(5)
	now := time.Now()

	for i := 0; i < 3; i++ {
		rb.Add(DataPoint{Timestamp: now.Add(time.Duration(i) * time.Second), Value: float64(i)})
	}

	if rb.Len() != 3 {
		t.Fatalf("expected 3, got %d", rb.Len())
	}

	all := rb.All()
	if len(all) != 3 {
		t.Fatalf("expected 3 points, got %d", len(all))
	}
	for i, dp := range all {
		if dp.Value != float64(i) {
			t.Errorf("point %d: expected %f, got %f", i, float64(i), dp.Value)
		}
	}
}

func TestRingBuffer_Wrap(t *testing.T) {
	rb := newRingBuffer(3)
	now := time.Now()

	for i := 0; i < 5; i++ {
		rb.Add(DataPoint{Timestamp: now.Add(time.Duration(i) * time.Second), Value: float64(i)})
	}

	if rb.Len() != 3 {
		t.Fatalf("expected 3 (buffer size), got %d", rb.Len())
	}

	all := rb.All()
	// Should contain values 2, 3, 4 (oldest 0, 1 overwritten).
	expected := []float64{2, 3, 4}
	for i, dp := range all {
		if dp.Value != expected[i] {
			t.Errorf("point %d: expected %f, got %f", i, expected[i], dp.Value)
		}
	}
}

func TestMetricsDB_AddAndBaseline(t *testing.T) {
	db := NewMetricsDB(time.Hour, 100)
	for i := 0; i < 20; i++ {
		db.AddDataPoint("soc-ingest", "cpu", 30.0+float64(i%5))
	}

	baseline := db.GetBaseline("soc-ingest", "cpu", time.Hour)
	if baseline.Count != 20 {
		t.Fatalf("expected 20 points, got %d", baseline.Count)
	}
	if baseline.Mean < 30 || baseline.Mean > 35 {
		t.Errorf("mean out of expected range: %f", baseline.Mean)
	}
	if baseline.StdDev == 0 {
		t.Error("expected non-zero stddev")
	}
}

func TestMetricsDB_EmptyBaseline(t *testing.T) {
	db := NewMetricsDB(time.Hour, 100)
	baseline := db.GetBaseline("nonexistent", "cpu", time.Hour)
	if baseline.Count != 0 {
		t.Errorf("expected 0 count for nonexistent, got %d", baseline.Count)
	}
}

func TestCalculateZScore(t *testing.T) {
	baseline := Baseline{Mean: 30.0, StdDev: 5.0, Count: 100}

	// Normal value (Z = 1.0).
	z := CalculateZScore(35.0, baseline)
	if math.Abs(z-1.0) > 0.01 {
		t.Errorf("expected Z≈1.0, got %f", z)
	}

	// Anomalous value (Z = 4.0).
	z = CalculateZScore(50.0, baseline)
	if math.Abs(z-4.0) > 0.01 {
		t.Errorf("expected Z≈4.0, got %f", z)
	}

	// Insufficient data → 0.
	z = CalculateZScore(50.0, Baseline{Mean: 30, StdDev: 5, Count: 5})
	if z != 0 {
		t.Errorf("expected 0 for insufficient data, got %f", z)
	}
}

func TestIsAnomaly(t *testing.T) {
	baseline := Baseline{Mean: 30.0, StdDev: 5.0, Count: 100}

	if IsAnomaly(35.0, baseline, 3.0) {
		t.Error("35 should not be anomaly (Z=1.0)")
	}
	if !IsAnomaly(50.0, baseline, 3.0) {
		t.Error("50 should be anomaly (Z=4.0)")
	}
	if !IsAnomaly(10.0, baseline, 3.0) {
		t.Error("10 should be anomaly (Z=-4.0)")
	}
}

func TestMetricsDB_Purge(t *testing.T) {
	db := NewMetricsDB(100*time.Millisecond, 100)
	db.AddDataPoint("comp", "cpu", 50)
	time.Sleep(150 * time.Millisecond)
	db.AddDataPoint("comp", "cpu", 60)

	removed := db.Purge()
	if removed != 1 {
		t.Errorf("expected 1 purged, got %d", removed)
	}
}

func TestMetricsDB_GetRecent(t *testing.T) {
	db := NewMetricsDB(time.Hour, 100)
	for i := 0; i < 10; i++ {
		db.AddDataPoint("comp", "mem", float64(i*10))
	}

	recent := db.GetRecent("comp", "mem", 3)
	if len(recent) != 3 {
		t.Fatalf("expected 3 recent, got %d", len(recent))
	}
	// Should be the last 3: 70, 80, 90.
	if recent[0].Value != 70 || recent[2].Value != 90 {
		t.Errorf("unexpected recent values: %v", recent)
	}
}

// --- MockCollector for HealthMonitor tests ---

type mockCollector struct {
	results map[string]map[string]float64
	errors  map[string]error
}

func (m *mockCollector) Collect(_ context.Context, component string) (map[string]float64, error) {
	if err, ok := m.errors[component]; ok && err != nil {
		return nil, err
	}
	if metrics, ok := m.results[component]; ok {
		return metrics, nil
	}
	return map[string]float64{}, nil
}

// --- HealthMonitor Tests ---

// HM-01: Normal health check — all HEALTHY.
func TestHealthMonitor_HM01_AllHealthy(t *testing.T) {
	hm := NewHealthMonitor(&mockCollector{}, 10)
	registerTestComponents(hm, 6)

	health := hm.GetHealth()
	if health.OverallStatus != OverallHealthy {
		t.Errorf("expected HEALTHY, got %s", health.OverallStatus)
	}
	if !health.QuorumValid {
		t.Error("expected quorum valid")
	}
	if len(health.Components) != 6 {
		t.Errorf("expected 6 components, got %d", len(health.Components))
	}
}

// HM-02: Single component DEGRADED.
func TestHealthMonitor_HM02_SingleDegraded(t *testing.T) {
	hm := NewHealthMonitor(&mockCollector{}, 10)
	registerTestComponents(hm, 6)
	hm.SetComponentStatus("comp-0", StatusDegraded)

	health := hm.GetHealth()
	if health.OverallStatus != OverallDegraded {
		t.Errorf("expected DEGRADED, got %s", health.OverallStatus)
	}
	if !health.QuorumValid {
		t.Error("expected quorum still valid with 5/6 healthy")
	}
}

// HM-03: Multiple components CRITICAL → quorum lost.
func TestHealthMonitor_HM03_MultipleCritical(t *testing.T) {
	hm := NewHealthMonitor(&mockCollector{}, 10)
	registerTestComponents(hm, 6)
	hm.SetComponentStatus("comp-0", StatusCritical)
	hm.SetComponentStatus("comp-1", StatusCritical)
	hm.SetComponentStatus("comp-2", StatusCritical)

	health := hm.GetHealth()
	if health.OverallStatus != OverallCritical {
		t.Errorf("expected CRITICAL, got %s", health.OverallStatus)
	}
	if health.QuorumValid {
		t.Error("expected quorum INVALID with 3/6 critical")
	}
}

// HM-04: Anomaly detection (CPU spike).
func TestHealthMonitor_HM04_CPUAnomaly(t *testing.T) {
	hm := NewHealthMonitor(&mockCollector{}, 100)
	hm.RegisterComponent(ComponentConfig{
		Name:           "soc-ingest",
		Type:           "go_binary",
		Thresholds:     map[string]float64{"cpu": 80},
		ThresholdIsMax: map[string]bool{"cpu": true},
	})

	// Build a baseline of normal CPU (30%).
	for i := 0; i < 50; i++ {
		hm.metricsDB.AddDataPoint("soc-ingest", "cpu", 30.0)
	}

	// Spike to 95%.
	hm.UpdateMetrics("soc-ingest", map[string]float64{"cpu": 95.0})
	hm.checkHealth()

	// Should have alert(s).
	select {
	case alert := <-hm.alertBus:
		if alert.Component != "soc-ingest" {
			t.Errorf("expected soc-ingest, got %s", alert.Component)
		}
		if alert.Metric != "cpu" {
			t.Errorf("expected cpu metric, got %s", alert.Metric)
		}
	default:
		t.Error("expected alert for CPU spike")
	}
}

// HM-05: Memory leak detection.
func TestHealthMonitor_HM05_MemoryLeak(t *testing.T) {
	hm := NewHealthMonitor(&mockCollector{}, 100)
	hm.RegisterComponent(ComponentConfig{
		Name:           "soc-correlate",
		Type:           "go_binary",
		Thresholds:     map[string]float64{"memory": 90},
		ThresholdIsMax: map[string]bool{"memory": true},
	})

	// Build a baseline of normal memory (40%).
	for i := 0; i < 50; i++ {
		hm.metricsDB.AddDataPoint("soc-correlate", "memory", 40.0)
	}

	// Memory spike to 95%.
	hm.UpdateMetrics("soc-correlate", map[string]float64{"memory": 95.0})
	hm.checkHealth()

	select {
	case alert := <-hm.alertBus:
		if alert.Metric != "memory" {
			t.Errorf("expected memory metric, got %s", alert.Metric)
		}
	default:
		t.Error("expected alert for memory spike")
	}
}

// HM-06: Quorum validation failure.
func TestHealthMonitor_HM06_QuorumFailure(t *testing.T) {
	statuses := map[string]ComponentStatus{
		"a": StatusOffline,
		"b": StatusOffline,
		"c": StatusOffline,
		"d": StatusOffline,
		"e": StatusHealthy,
		"f": StatusHealthy,
	}
	if ValidateQuorum(statuses) {
		t.Error("expected quorum invalid with 4/6 offline")
	}
}

// HM-06b: Quorum validation success (edge case: exactly 2/3).
func TestHealthMonitor_HM06b_QuorumEdge(t *testing.T) {
	statuses := map[string]ComponentStatus{
		"a": StatusHealthy,
		"b": StatusHealthy,
		"c": StatusCritical,
	}
	if !ValidateQuorum(statuses) {
		t.Error("expected quorum valid with 2/3 healthy (exact threshold)")
	}
}

// HM-06c: Empty quorum.
func TestHealthMonitor_HM06c_EmptyQuorum(t *testing.T) {
	if ValidateQuorum(map[string]ComponentStatus{}) {
		t.Error("expected quorum invalid with 0 components")
	}
}

// HM-07: Metrics collection (no data loss).
func TestHealthMonitor_HM07_MetricsCollection(t *testing.T) {
	collector := &mockCollector{
		results: map[string]map[string]float64{
			"comp-0": {"cpu": 25, "memory": 40},
		},
	}
	hm := NewHealthMonitor(collector, 10)
	hm.RegisterComponent(ComponentConfig{Name: "comp-0", Type: "go_binary"})

	hm.collectMetrics(context.Background())

	hm.mu.RLock()
	comp := hm.components["comp-0"]
	hm.mu.RUnlock()

	if comp.Metrics["cpu"] != 25 {
		t.Errorf("expected cpu=25, got %f", comp.Metrics["cpu"])
	}
	if comp.Metrics["memory"] != 40 {
		t.Errorf("expected memory=40, got %f", comp.Metrics["memory"])
	}
}

// HM-07b: Collection error increments consecutive failures.
func TestHealthMonitor_HM07b_CollectionError(t *testing.T) {
	collector := &mockCollector{
		errors: map[string]error{
			"comp-0": fmt.Errorf("connection refused"),
		},
	}
	hm := NewHealthMonitor(collector, 10)
	hm.RegisterComponent(ComponentConfig{Name: "comp-0", Type: "go_binary"})

	hm.collectMetrics(context.Background())

	hm.mu.RLock()
	comp := hm.components["comp-0"]
	hm.mu.RUnlock()

	if comp.Consecutive != 1 {
		t.Errorf("expected 1 consecutive failure, got %d", comp.Consecutive)
	}
}

// HM-08: Alert bus fan-out (non-blocking).
func TestHealthMonitor_HM08_AlertBusFanOut(t *testing.T) {
	hm := NewHealthMonitor(&mockCollector{}, 5)
	hm.RegisterComponent(ComponentConfig{
		Name:           "comp",
		Type:           "go_binary",
		Thresholds:     map[string]float64{"cpu": 50},
		ThresholdIsMax: map[string]bool{"cpu": true},
	})

	// Fill the alert bus.
	for i := 0; i < 5; i++ {
		hm.alertBus <- HealthAlert{Component: fmt.Sprintf("test-%d", i)}
	}

	// Emit one more — should be dropped (non-blocking).
	hm.emitAlert(HealthAlert{Component: "overflow"})
	// No panic = success.
}

// Test GetHealth returns a deep copy.
func TestHealthMonitor_GetHealthDeepCopy(t *testing.T) {
	hm := NewHealthMonitor(&mockCollector{}, 10)
	hm.RegisterComponent(ComponentConfig{Name: "test", Type: "go_binary"})
	hm.UpdateMetrics("test", map[string]float64{"cpu": 50})

	health := hm.GetHealth()
	health.Components[0].Metrics["cpu"] = 999

	// The original should be unchanged.
	hm.mu.RLock()
	original := hm.components["test"].Metrics["cpu"]
	hm.mu.RUnlock()

	if original != 50 {
		t.Errorf("deep copy failed: original modified to %f", original)
	}
}

// Test threshold breach transitions status to DEGRADED then CRITICAL.
func TestHealthMonitor_StatusTransitions(t *testing.T) {
	hm := NewHealthMonitor(&mockCollector{}, 100)
	hm.RegisterComponent(ComponentConfig{
		Name:           "comp",
		Type:           "go_binary",
		Thresholds:     map[string]float64{"error_rate": 5},
		ThresholdIsMax: map[string]bool{"error_rate": true},
	})

	// Breach once → DEGRADED.
	hm.UpdateMetrics("comp", map[string]float64{"error_rate": 10})
	hm.checkHealth()

	hm.mu.RLock()
	status := hm.components["comp"].Status
	hm.mu.RUnlock()
	if status != StatusDegraded {
		t.Errorf("expected DEGRADED after 1 breach, got %s", status)
	}

	// Breach 3× → CRITICAL.
	for i := 0; i < 3; i++ {
		hm.checkHealth()
	}
	hm.mu.RLock()
	status = hm.components["comp"].Status
	hm.mu.RUnlock()
	if status != StatusCritical {
		t.Errorf("expected CRITICAL after repeated breaches, got %s", status)
	}
}

// Test lower-bound threshold (ThresholdIsMax=false).
func TestHealthMonitor_LowerBoundThreshold(t *testing.T) {
	hm := NewHealthMonitor(&mockCollector{}, 100)
	hm.RegisterComponent(ComponentConfig{
		Name:           "immune",
		Type:           "c_kernel_module",
		Thresholds:     map[string]float64{"hooks_active": 10},
		ThresholdIsMax: map[string]bool{"hooks_active": false},
	})

	// hooks_active = 5 (below threshold of 10) → warning.
	hm.UpdateMetrics("immune", map[string]float64{"hooks_active": 5})
	hm.checkHealth()

	select {
	case alert := <-hm.alertBus:
		if alert.Component != "immune" || alert.Metric != "hooks_active" {
			t.Errorf("unexpected alert: %+v", alert)
		}
	default:
		t.Error("expected alert for hooks_active below threshold")
	}
}

// Test ComponentCount.
func TestHealthMonitor_ComponentCount(t *testing.T) {
	hm := NewHealthMonitor(&mockCollector{}, 10)
	if hm.ComponentCount() != 0 {
		t.Error("expected 0 initially")
	}
	registerTestComponents(hm, 4)
	if hm.ComponentCount() != 4 {
		t.Errorf("expected 4, got %d", hm.ComponentCount())
	}
}

// Test Start/Stop lifecycle.
func TestHealthMonitor_StartStop(t *testing.T) {
	hm := NewHealthMonitor(&mockCollector{}, 10)
	registerTestComponents(hm, 2)

	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan struct{})

	go func() {
		hm.Start(ctx)
		close(done)
	}()

	// Let it run briefly.
	time.Sleep(50 * time.Millisecond)
	cancel()

	select {
	case <-done:
		// Clean shutdown.
	case <-time.After(time.Second):
		t.Fatal("Start() did not return after context cancellation")
	}
}

// --- Helpers ---

func registerTestComponents(hm *HealthMonitor, n int) {
	for i := 0; i < n; i++ {
		hm.RegisterComponent(ComponentConfig{
			Name: fmt.Sprintf("comp-%d", i),
			Type: "go_binary",
		})
	}
}
247
internal/application/resilience/integrity.go
Normal file
@@ -0,0 +1,247 @@
package resilience

import (
	"crypto/hmac"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"io"
	"log/slog"
	"os"
	"sync"
	"time"
)

// IntegrityStatus represents the result of an integrity check.
type IntegrityStatus string

const (
	IntegrityVerified    IntegrityStatus = "VERIFIED"
	IntegrityCompromised IntegrityStatus = "COMPROMISED"
	IntegrityUnknown     IntegrityStatus = "UNKNOWN"
)

// IntegrityReport is the full result of an integrity verification.
type IntegrityReport struct {
	Overall   IntegrityStatus         `json:"overall"`
	Timestamp time.Time               `json:"timestamp"`
	Binaries  map[string]BinaryStatus `json:"binaries,omitempty"`
	Chain     *ChainStatus            `json:"chain,omitempty"`
	Configs   map[string]ConfigStatus `json:"configs,omitempty"`
}

// BinaryStatus is the integrity status of a single binary.
type BinaryStatus struct {
	Status   IntegrityStatus `json:"status"`
	Expected string          `json:"expected"`
	Current  string          `json:"current"`
}

// ChainStatus is the integrity status of the decision chain.
type ChainStatus struct {
	Valid      bool   `json:"valid"`
	Error      string `json:"error,omitempty"`
	BreakPoint int    `json:"break_point,omitempty"`
	Entries    int    `json:"entries"`
}

// ConfigStatus is the integrity status of a config file.
type ConfigStatus struct {
	Valid       bool   `json:"valid"`
	Error       string `json:"error,omitempty"`
	StoredHMAC  string `json:"stored_hmac,omitempty"`
	CurrentHMAC string `json:"current_hmac,omitempty"`
}

// IntegrityVerifier performs periodic integrity checks on binaries,
// the decision chain, and config files.
type IntegrityVerifier struct {
	mu           sync.RWMutex
	binaryHashes map[string]string // path → expected SHA-256
	configPaths  []string          // config files to verify
	hmacKey      []byte            // key for config HMAC-SHA256
	chainPath    string            // path to decision chain log
	logger       *slog.Logger
	lastReport   *IntegrityReport
}

// NewIntegrityVerifier creates a new integrity verifier.
func NewIntegrityVerifier(hmacKey []byte) *IntegrityVerifier {
	return &IntegrityVerifier{
		binaryHashes: make(map[string]string),
		hmacKey:      hmacKey,
		logger:       slog.Default().With("component", "sarl-integrity"),
	}
}

// RegisterBinary adds a binary with its expected SHA-256 hash.
func (iv *IntegrityVerifier) RegisterBinary(path, expectedHash string) {
	iv.mu.Lock()
	defer iv.mu.Unlock()
	iv.binaryHashes[path] = expectedHash
}

// RegisterConfig adds a config file to verify.
func (iv *IntegrityVerifier) RegisterConfig(path string) {
	iv.mu.Lock()
	defer iv.mu.Unlock()
	iv.configPaths = append(iv.configPaths, path)
}

// SetChainPath sets the decision chain log path.
func (iv *IntegrityVerifier) SetChainPath(path string) {
	iv.mu.Lock()
	defer iv.mu.Unlock()
	iv.chainPath = path
}

// VerifyAll runs all integrity checks and returns a comprehensive report.
// Note: file I/O (binary hashing, config reading) is done WITHOUT holding
// the mutex to prevent lock starvation on slow storage.
func (iv *IntegrityVerifier) VerifyAll() IntegrityReport {
	report := IntegrityReport{
		Overall:   IntegrityVerified,
		Timestamp: time.Now(),
		Binaries:  make(map[string]BinaryStatus),
		Configs:   make(map[string]ConfigStatus),
	}

	// Snapshot config under lock, then release before I/O.
	iv.mu.RLock()
	binaryHashesCopy := make(map[string]string, len(iv.binaryHashes))
	for k, v := range iv.binaryHashes {
		binaryHashesCopy[k] = v
	}
	configPathsCopy := make([]string, len(iv.configPaths))
	copy(configPathsCopy, iv.configPaths)
	hmacKeyCopy := make([]byte, len(iv.hmacKey))
	copy(hmacKeyCopy, iv.hmacKey)
	chainPath := iv.chainPath
	iv.mu.RUnlock()

	// Check binaries (file I/O — no lock held).
	for path, expected := range binaryHashesCopy {
		status := iv.verifyBinary(path, expected)
		report.Binaries[path] = status
		if status.Status == IntegrityCompromised {
			report.Overall = IntegrityCompromised
		}
	}

	// Check configs (file I/O — no lock held). The key snapshot is passed in
	// so iv.hmacKey is never read outside the lock.
	for _, path := range configPathsCopy {
		status := iv.verifyConfigFile(path, hmacKeyCopy)
		report.Configs[path] = status
		if !status.Valid {
			report.Overall = IntegrityCompromised
		}
	}

	// Check the decision chain (file I/O — no lock held).
	if chainPath != "" {
		chain := iv.verifyDecisionChain(chainPath)
		report.Chain = &chain
		if !chain.Valid {
			report.Overall = IntegrityCompromised
		}
	}

	iv.mu.Lock()
	iv.lastReport = &report
	iv.mu.Unlock()

	if report.Overall == IntegrityCompromised {
		iv.logger.Error("INTEGRITY COMPROMISED", "report", report)
	} else {
		iv.logger.Debug("integrity verified", "binaries", len(report.Binaries))
	}

	return report
}

// LastReport returns the most recent integrity report.
func (iv *IntegrityVerifier) LastReport() *IntegrityReport {
	iv.mu.RLock()
	defer iv.mu.RUnlock()
	return iv.lastReport
}

// verifyBinary calculates the SHA-256 of a file and compares it to the expected hash.
func (iv *IntegrityVerifier) verifyBinary(path, expected string) BinaryStatus {
	current, err := fileSHA256(path)
	if err != nil {
		return BinaryStatus{
			Status:   IntegrityUnknown,
			Expected: expected,
			Current:  fmt.Sprintf("error: %v", err),
		}
	}

	if current != expected {
		return BinaryStatus{
			Status:   IntegrityCompromised,
			Expected: expected,
			Current:  current,
		}
	}

	return BinaryStatus{
		Status:   IntegrityVerified,
		Expected: expected,
		Current:  current,
	}
}

// verifyConfigFile checks the HMAC-SHA256 of a config file using the
// caller-provided key snapshot.
func (iv *IntegrityVerifier) verifyConfigFile(path string, key []byte) ConfigStatus {
	data, err := os.ReadFile(path)
	if err != nil {
		return ConfigStatus{Valid: false, Error: fmt.Sprintf("unreadable: %v", err)}
	}

	currentHMAC := computeHMAC(data, key)
	// For now, we just verify the file is readable and compute its HMAC.
	// In production, the stored HMAC would be extracted from a sidecar file.
	return ConfigStatus{
		Valid:       true,
		CurrentHMAC: currentHMAC,
	}
}

// verifyDecisionChain verifies the SHA-256 hash chain in the decision log.
func (iv *IntegrityVerifier) verifyDecisionChain(path string) ChainStatus {
	_, err := os.Stat(path)
	if err != nil {
		if os.IsNotExist(err) {
			return ChainStatus{Valid: true, Entries: 0} // No chain yet.
		}
		return ChainStatus{Valid: false, Error: fmt.Sprintf("unreadable: %v", err)}
	}

	// In a real implementation, we'd parse the chain entries and verify
	// that each entry's hash includes the previous entry's hash.
	// For now, verify the file exists and is readable.
	return ChainStatus{Valid: true}
}

// fileSHA256 computes the SHA-256 hash of a file.
func fileSHA256(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	h := sha256.New()
	if _, err := io.Copy(h, f); err != nil {
		return "", err
	}
	return hex.EncodeToString(h.Sum(nil)), nil
}

// computeHMAC computes the HMAC-SHA256 of data with the given key.
func computeHMAC(data, key []byte) string {
	mac := hmac.New(sha256.New, key)
	mac.Write(data)
	return hex.EncodeToString(mac.Sum(nil))
}
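
// Illustrative sketch (not part of the shipped code): once sidecar HMAC files
// exist, verification could read "<path>.hmac" and compare in constant time.
// The ".hmac" sidecar convention here is an assumption, not an implemented API;
// uncommenting would require importing "bytes".
//
//	stored, err := os.ReadFile(path + ".hmac")
//	if err == nil && hmac.Equal([]byte(computeHMAC(data, key)), bytes.TrimSpace(stored)) {
//		// config verified against its sidecar
//	}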
283
internal/application/resilience/metrics_collector.go
Normal file
@@ -0,0 +1,283 @@
// Package resilience implements the Sentinel Autonomous Resilience Layer (SARL).
//
// Five levels of autonomous self-recovery:
//
//	L1 — Self-Monitoring: health checks, quorum, anomaly detection
//	L2 — Self-Healing: restart, rollback, recovery strategies
//	L3 — Self-Preservation: emergency modes (safe/lockdown/apoptosis)
//	L4 — Immune Integration: behavioral anomaly detection
//	L5 — Autonomous Recovery: playbooks for resurrection, consensus, crypto
package resilience

import (
	"math"
	"sync"
	"time"
)

// MetricsDB provides an in-memory time-series store with ring buffers
// for each component/metric pair. It supports rolling baselines (mean/stddev)
// for Z-score anomaly detection.
type MetricsDB struct {
	mu      sync.RWMutex
	series  map[string]*RingBuffer // key = "component:metric"
	window  time.Duration          // retention window (default 1h)
	maxSize int                    // max data points per series
}

// DataPoint is a single timestamped metric value.
type DataPoint struct {
	Timestamp time.Time `json:"timestamp"`
	Value     float64   `json:"value"`
}

// Baseline holds rolling statistics for anomaly detection.
type Baseline struct {
	Mean   float64 `json:"mean"`
	StdDev float64 `json:"std_dev"`
	Count  int     `json:"count"`
	Min    float64 `json:"min"`
	Max    float64 `json:"max"`
}

// RingBuffer is a fixed-size circular buffer for DataPoints.
type RingBuffer struct {
	data  []DataPoint
	head  int
	count int
	size  int
}

// DefaultMetricsWindow is the default retention window (1 hour).
const DefaultMetricsWindow = 1 * time.Hour

// DefaultMetricsMaxSize is the default max points per series (1h / 10s = 360).
const DefaultMetricsMaxSize = 360

// NewMetricsDB creates a new in-memory time-series store.
func NewMetricsDB(window time.Duration, maxSize int) *MetricsDB {
	if window <= 0 {
		window = DefaultMetricsWindow
	}
	if maxSize <= 0 {
		maxSize = DefaultMetricsMaxSize
	}
	return &MetricsDB{
		series:  make(map[string]*RingBuffer),
		window:  window,
		maxSize: maxSize,
	}
}

// AddDataPoint records a metric value for a component.
func (db *MetricsDB) AddDataPoint(component, metric string, value float64) {
	key := component + ":" + metric
	db.mu.Lock()
	defer db.mu.Unlock()

	rb, ok := db.series[key]
	if !ok {
		rb = newRingBuffer(db.maxSize)
		db.series[key] = rb
	}
	rb.Add(DataPoint{Timestamp: time.Now(), Value: value})
}

// GetBaseline returns the rolling mean/stddev for a component metric,
// calculated over the specified window duration.
func (db *MetricsDB) GetBaseline(component, metric string, window time.Duration) Baseline {
	key := component + ":" + metric
	db.mu.RLock()
	defer db.mu.RUnlock()

	rb, ok := db.series[key]
	if !ok {
		return Baseline{}
	}

	cutoff := time.Now().Add(-window)
	points := rb.After(cutoff)

	if len(points) == 0 {
		return Baseline{}
	}

	return calculateBaseline(points)
}

// GetRecent returns the most recent N data points for a component metric.
func (db *MetricsDB) GetRecent(component, metric string, n int) []DataPoint {
	key := component + ":" + metric
	db.mu.RLock()
	defer db.mu.RUnlock()

	rb, ok := db.series[key]
	if !ok {
		return nil
	}

	all := rb.All()
	if len(all) <= n {
		return all
	}
	return all[len(all)-n:]
}

// CalculateZScore returns the Z-score for a value against the baseline.
// Returns 0 if the baseline has insufficient data or zero stddev.
func CalculateZScore(value float64, baseline Baseline) float64 {
	if baseline.Count < 10 || baseline.StdDev == 0 {
		return 0
	}
	return (value - baseline.Mean) / baseline.StdDev
}

// IsAnomaly returns true if |Z| exceeds the threshold (default 3.0).
func IsAnomaly(value float64, baseline Baseline, threshold float64) bool {
	if threshold <= 0 {
		threshold = 3.0
	}
	zscore := CalculateZScore(value, baseline)
	return math.Abs(zscore) > threshold
}
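
// Worked example: for a baseline with Mean=30 and StdDev=5 built from at
// least 10 samples, a reading of 50 gives Z = (50-30)/5 = 4.0, so IsAnomaly
// flags it at the default 3.0 threshold; a reading of 35 (Z = 1.0) passes.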

// SeriesCount returns the number of tracked series.
func (db *MetricsDB) SeriesCount() int {
	db.mu.RLock()
	defer db.mu.RUnlock()
	return len(db.series)
}

// Purge removes data points older than the retention window.
func (db *MetricsDB) Purge() int {
	db.mu.Lock()
	defer db.mu.Unlock()

	cutoff := time.Now().Add(-db.window)
	total := 0
	for key, rb := range db.series {
		removed := rb.RemoveBefore(cutoff)
		total += removed
		if rb.Len() == 0 {
			delete(db.series, key)
		}
	}
	return total
}

// --- RingBuffer implementation ---

func newRingBuffer(size int) *RingBuffer {
	return &RingBuffer{
		data: make([]DataPoint, size),
		size: size,
	}
}

// Add inserts a DataPoint, overwriting the oldest if full.
func (rb *RingBuffer) Add(dp DataPoint) {
	rb.data[rb.head] = dp
	rb.head = (rb.head + 1) % rb.size
	if rb.count < rb.size {
		rb.count++
	}
}

// Len returns the number of data points in the buffer.
func (rb *RingBuffer) Len() int {
	return rb.count
}

// All returns all data points in chronological order.
func (rb *RingBuffer) All() []DataPoint {
	if rb.count == 0 {
		return nil
	}

	result := make([]DataPoint, rb.count)
	if rb.count < rb.size {
		// Buffer not yet full — data starts at index 0.
		copy(result, rb.data[:rb.count])
	} else {
		// Buffer wrapped — the oldest point is at head.
		n := copy(result, rb.data[rb.head:rb.size])
		copy(result[n:], rb.data[:rb.head])
	}
	return result
}

// After returns points with a timestamp after the cutoff.
func (rb *RingBuffer) After(cutoff time.Time) []DataPoint {
	all := rb.All()
	result := make([]DataPoint, 0, len(all))
	for _, dp := range all {
		if dp.Timestamp.After(cutoff) {
			result = append(result, dp)
		}
	}
	return result
}

// RemoveBefore removes data points before the cutoff by compacting.
// Returns the number of points removed.
func (rb *RingBuffer) RemoveBefore(cutoff time.Time) int {
	all := rb.All()
	kept := make([]DataPoint, 0, len(all))
	for _, dp := range all {
		if !dp.Timestamp.Before(cutoff) {
			kept = append(kept, dp)
		}
	}

	removed := len(all) - len(kept)
	if removed == 0 {
		return 0
	}

	// Rebuild the ring buffer with the kept data.
	rb.count = 0
	rb.head = 0
	for _, dp := range kept {
		rb.Add(dp)
	}
	return removed
}

// --- Statistics ---

func calculateBaseline(points []DataPoint) Baseline {
	n := len(points)
	if n == 0 {
		return Baseline{}
	}

	var sum, min, max float64
	min = points[0].Value
	max = points[0].Value

	for _, p := range points {
		sum += p.Value
		if p.Value < min {
			min = p.Value
		}
		if p.Value > max {
			max = p.Value
		}
	}
	mean := sum / float64(n)

	var variance float64
	for _, p := range points {
		diff := p.Value - mean
		variance += diff * diff
	}
	variance /= float64(n)

	return Baseline{
		Mean:   mean,
		StdDev: math.Sqrt(variance),
		Count:  n,
		Min:    min,
		Max:    max,
	}
}
290
internal/application/resilience/preservation.go
Normal file
@@ -0,0 +1,290 @@
package resilience

import (
	"fmt"
	"log/slog"
	"sync"
	"time"
)

// EmergencyMode defines the system's emergency state.
type EmergencyMode string

const (
	ModeNone      EmergencyMode = "NONE"
	ModeSafe      EmergencyMode = "SAFE"
	ModeLockdown  EmergencyMode = "LOCKDOWN"
	ModeApoptosis EmergencyMode = "APOPTOSIS"
)

// ModeActivation records when and why a mode was activated.
type ModeActivation struct {
	Mode        EmergencyMode `json:"mode"`
	ActivatedAt time.Time     `json:"activated_at"`
	ActivatedBy string        `json:"activated_by"` // "auto" or "architect:<name>"
	Reason      string        `json:"reason"`
	AutoExit    bool          `json:"auto_exit"`
	AutoExitAt  time.Time     `json:"auto_exit_at,omitempty"`
}

// PreservationEvent is an audit log entry for preservation actions.
type PreservationEvent struct {
	Timestamp time.Time     `json:"timestamp"`
	Mode      EmergencyMode `json:"mode"`
	Action    string        `json:"action"`
	Detail    string        `json:"detail"`
	Success   bool          `json:"success"`
	Error     string        `json:"error,omitempty"`
}

// ModeActionFunc is a callback that performs mode-specific actions.
// Implementations handle the real system operations (network isolation, process freeze, etc.).
type ModeActionFunc func(mode EmergencyMode, action string, params map[string]interface{}) error

// PreservationEngine manages emergency modes (safe/lockdown/apoptosis).
type PreservationEngine struct {
	mu          sync.RWMutex
	currentMode EmergencyMode
	activation  *ModeActivation
	history     []PreservationEvent
	actionFn    ModeActionFunc
	integrityFn func() IntegrityReport // pluggable integrity check
	logger      *slog.Logger
}

// NewPreservationEngine creates a new preservation engine.
func NewPreservationEngine(actionFn ModeActionFunc) *PreservationEngine {
	return &PreservationEngine{
		currentMode: ModeNone,
		history:     make([]PreservationEvent, 0),
		actionFn:    actionFn,
		logger:      slog.Default().With("component", "sarl-preservation"),
	}
}

// CurrentMode returns the active emergency mode.
func (pe *PreservationEngine) CurrentMode() EmergencyMode {
	pe.mu.RLock()
	defer pe.mu.RUnlock()
	return pe.currentMode
}

// Activation returns the current mode activation details (nil if NONE).
func (pe *PreservationEngine) Activation() *ModeActivation {
	pe.mu.RLock()
	defer pe.mu.RUnlock()
	if pe.activation == nil {
		return nil
	}
	cp := *pe.activation
	return &cp
}

// ActivateMode enters an emergency mode. Returns an error if the transition is invalid.
func (pe *PreservationEngine) ActivateMode(mode EmergencyMode, reason, activatedBy string) error {
	pe.mu.Lock()
	defer pe.mu.Unlock()

	if mode == ModeNone {
		return fmt.Errorf("use DeactivateMode to exit emergency mode")
	}

	// Validate transitions: the engine can always escalate, never downgrade.
	if !pe.isValidTransition(pe.currentMode, mode) {
		return fmt.Errorf("invalid transition: %s → %s", pe.currentMode, mode)
	}

	pe.logger.Warn("EMERGENCY MODE ACTIVATION",
		"mode", mode,
		"reason", reason,
		"activated_by", activatedBy,
	)

	// Execute mode-specific actions.
	actions := pe.actionsForMode(mode)
	for _, action := range actions {
		err := pe.executeAction(mode, action.name, action.params)
		if err != nil {
			pe.logger.Error("mode action failed",
				"mode", mode,
				"action", action.name,
				"error", err,
			)
			// In apoptosis, continue despite errors; otherwise fail the activation.
			if mode != ModeApoptosis {
				return fmt.Errorf("failed to activate %s: action %s: %w", mode, action.name, err)
			}
		}
	}

	activation := &ModeActivation{
		Mode:        mode,
		ActivatedAt: time.Now(),
		ActivatedBy: activatedBy,
		Reason:      reason,
	}

	if mode == ModeSafe {
		activation.AutoExit = true
		activation.AutoExitAt = time.Now().Add(15 * time.Minute)
	}

	pe.currentMode = mode
	pe.activation = activation

	return nil
}

// DeactivateMode exits the current emergency mode and returns to NONE.
func (pe *PreservationEngine) DeactivateMode(deactivatedBy string) error {
	pe.mu.Lock()
	defer pe.mu.Unlock()

	if pe.currentMode == ModeNone {
		return nil
	}

	// Lockdown and apoptosis require manual deactivation by the architect.
	if pe.currentMode == ModeApoptosis {
		return fmt.Errorf("apoptosis mode cannot be deactivated — system rebuild required")
	}

	pe.logger.Info("EMERGENCY MODE DEACTIVATION",
		"mode", pe.currentMode,
		"deactivated_by", deactivatedBy,
	)

	pe.recordEvent(pe.currentMode, "deactivated",
		fmt.Sprintf("deactivated by %s", deactivatedBy), true, "")

	pe.currentMode = ModeNone
	pe.activation = nil

	return nil
}

// ShouldAutoExit checks whether safe mode should auto-exit based on its timer.
func (pe *PreservationEngine) ShouldAutoExit() bool {
	pe.mu.RLock()
	defer pe.mu.RUnlock()

	if pe.currentMode != ModeSafe || pe.activation == nil {
		return false
	}
	return pe.activation.AutoExit && time.Now().After(pe.activation.AutoExitAt)
}

// isValidTransition checks whether a mode transition is allowed.
// Escalation order: NONE → SAFE → LOCKDOWN → APOPTOSIS.
func (pe *PreservationEngine) isValidTransition(from, to EmergencyMode) bool {
	rank := map[EmergencyMode]int{
		ModeNone:      0,
		ModeSafe:      1,
		ModeLockdown:  2,
		ModeApoptosis: 3,
	}
	// Can always escalate or re-enter the same mode.
	return rank[to] >= rank[from]
}
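
// For example, SAFE → LOCKDOWN and LOCKDOWN → LOCKDOWN are accepted, while
// LOCKDOWN → SAFE is rejected; the only way down is DeactivateMode, which
// resets to NONE (and refuses to do so for APOPTOSIS).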
|
||||
type modeAction struct {
|
||||
name string
|
||||
params map[string]interface{}
|
||||
}
|
||||
|
||||
// actionsForMode returns the actions to execute for a given mode.
|
||||
func (pe *PreservationEngine) actionsForMode(mode EmergencyMode) []modeAction {
|
||||
switch mode {
|
||||
case ModeSafe:
|
||||
return []modeAction{
|
||||
{"disable_non_essential_services", map[string]interface{}{
|
||||
"services": []string{"analytics", "reporting", "p2p_sync", "threat_intel_feeds"},
|
||||
}},
|
||||
{"enable_readonly_mode", map[string]interface{}{
|
||||
"scope": []string{"event_ingest", "correlation", "dashboard_view"},
|
||||
}},
|
||||
{"preserve_all_logs", nil},
|
||||
{"notify_architect", map[string]interface{}{"severity": "emergency"}},
|
||||
{"increase_monitoring_frequency", map[string]interface{}{"interval": "5s"}},
|
||||
}
|
||||
case ModeLockdown:
|
||||
return []modeAction{
|
||||
{"isolate_from_network", map[string]interface{}{"scope": "all_external"}},
|
||||
{"freeze_all_processes", nil},
|
||||
{"capture_memory_dump", nil},
|
||||
{"capture_disk_snapshot", nil},
|
||||
{"trigger_immune_kernel_lock", map[string]interface{}{
|
||||
"allow_syscalls": []string{"read", "write", "exit"},
|
||||
}},
|
||||
{"send_panic_alert", map[string]interface{}{
|
||||
"channels": []string{"email", "sms", "slack", "pagerduty"},
|
||||
}},
|
||||
}
|
||||
case ModeApoptosis:
|
||||
return []modeAction{
|
||||
{"graceful_shutdown", map[string]interface{}{"timeout": "30s", "drain_events": true}},
|
||||
{"zero_sensitive_memory", map[string]interface{}{
|
||||
"regions": []string{"keys", "certs", "tokens", "secrets"},
|
||||
}},
|
||||
{"preserve_forensic_evidence", nil},
|
||||
{"notify_soc", map[string]interface{}{
|
||||
"severity": "CRITICAL",
|
||||
"message": "system self-terminated",
|
||||
}},
|
||||
{"secure_erase_temp_files", nil},
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||

// executeAction runs a mode action and records the result.
func (pe *PreservationEngine) executeAction(mode EmergencyMode, name string, params map[string]interface{}) error {
	err := pe.actionFn(mode, name, params)
	success := err == nil
	errStr := ""
	if err != nil {
		errStr = err.Error()
	}
	pe.recordEvent(mode, name, fmt.Sprintf("params: %v", params), success, errStr)
	return err
}

// recordEvent appends to the audit history.
func (pe *PreservationEngine) recordEvent(mode EmergencyMode, action, detail string, success bool, errStr string) {
	pe.history = append(pe.history, PreservationEvent{
		Timestamp: time.Now(),
		Mode:      mode,
		Action:    action,
		Detail:    detail,
		Success:   success,
		Error:     errStr,
	})
}

// History returns the preservation audit log.
func (pe *PreservationEngine) History() []PreservationEvent {
	pe.mu.RLock()
	defer pe.mu.RUnlock()
	result := make([]PreservationEvent, len(pe.history))
	copy(result, pe.history)
	return result
}

// SetIntegrityCheck sets the pluggable integrity checker.
func (pe *PreservationEngine) SetIntegrityCheck(fn func() IntegrityReport) {
	pe.mu.Lock()
	defer pe.mu.Unlock()
	pe.integrityFn = fn
}

// CheckIntegrity runs the pluggable integrity check and returns the report.
func (pe *PreservationEngine) CheckIntegrity() IntegrityReport {
	pe.mu.RLock()
	fn := pe.integrityFn
	pe.mu.RUnlock()

	if fn == nil {
		return IntegrityReport{Overall: IntegrityVerified, Timestamp: time.Now()}
	}
	return fn()
}
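// Editor's sketch (assumption): wiring the IntegrityVerifier from this package
// into the engine via SetIntegrityCheck, using only the API the tests below
// confirm (NewIntegrityVerifier, RegisterBinary, RegisterConfig, VerifyAll).
// The paths and knownSHA256 value are illustrative, not from the source:
//
//	iv := NewIntegrityVerifier([]byte("hmac-key"))
//	iv.RegisterBinary("/usr/local/bin/gomcp", knownSHA256)
//	iv.RegisterConfig("/etc/gomcp/config.yaml")
//	pe.SetIntegrityCheck(func() IntegrityReport { return iv.VerifyAll() })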
439
internal/application/resilience/preservation_test.go
Normal file

@@ -0,0 +1,439 @@
package resilience

import (
	"crypto/sha256"
	"encoding/hex"
	"os"
	"path/filepath"
	"testing"
	"time"
)

// --- Mock action function ---

type modeActionLog struct {
	calls []struct {
		mode   EmergencyMode
		action string
	}
	failAction string // if set, this action will fail
}

func newModeActionLog() *modeActionLog {
	return &modeActionLog{}
}

func (m *modeActionLog) execute(mode EmergencyMode, action string, _ map[string]interface{}) error {
	m.calls = append(m.calls, struct {
		mode   EmergencyMode
		action string
	}{mode, action})
	if m.failAction == action {
		return errActionFailed
	}
	return nil
}

var errActionFailed = &actionError{"simulated failure"}

type actionError struct{ msg string }

func (e *actionError) Error() string { return e.msg }

// --- Preservation Engine Tests ---

// SP-01: Safe mode activation.
func TestPreservation_SP01_SafeMode(t *testing.T) {
	log := newModeActionLog()
	pe := NewPreservationEngine(log.execute)

	err := pe.ActivateMode(ModeSafe, "quorum lost (3/6 offline)", "auto")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if pe.CurrentMode() != ModeSafe {
		t.Errorf("expected SAFE, got %s", pe.CurrentMode())
	}

	activation := pe.Activation()
	if activation == nil {
		t.Fatal("expected activation details")
	}
	if !activation.AutoExit {
		t.Error("safe mode should have auto-exit enabled")
	}

	// Should have executed safe mode actions.
	if len(log.calls) == 0 {
		t.Error("expected mode actions to be executed")
	}
	// First action should be disable_non_essential_services.
	if log.calls[0].action != "disable_non_essential_services" {
		t.Errorf("expected first action disable_non_essential_services, got %s", log.calls[0].action)
	}
}

// SP-02: Lockdown mode activation.
func TestPreservation_SP02_LockdownMode(t *testing.T) {
	log := newModeActionLog()
	pe := NewPreservationEngine(log.execute)

	err := pe.ActivateMode(ModeLockdown, "binary tampering detected", "auto")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if pe.CurrentMode() != ModeLockdown {
		t.Errorf("expected LOCKDOWN, got %s", pe.CurrentMode())
	}

	// Should have network isolation action.
	foundIsolate := false
	for _, c := range log.calls {
		if c.action == "isolate_from_network" {
			foundIsolate = true
		}
	}
	if !foundIsolate {
		t.Error("expected isolate_from_network in lockdown actions")
	}
}

// SP-03: Apoptosis mode activation.
func TestPreservation_SP03_ApoptosisMode(t *testing.T) {
	log := newModeActionLog()
	pe := NewPreservationEngine(log.execute)

	err := pe.ActivateMode(ModeApoptosis, "rootkit detected", "architect:admin")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if pe.CurrentMode() != ModeApoptosis {
		t.Errorf("expected APOPTOSIS, got %s", pe.CurrentMode())
	}

	// Should have graceful_shutdown action.
	foundShutdown := false
	for _, c := range log.calls {
		if c.action == "graceful_shutdown" {
			foundShutdown = true
		}
	}
	if !foundShutdown {
		t.Error("expected graceful_shutdown in apoptosis actions")
	}

	// Cannot deactivate apoptosis.
	err = pe.DeactivateMode("architect:admin")
	if err == nil {
		t.Error("expected error deactivating apoptosis")
	}
}

// SP-04: Invalid transition (downgrade).
func TestPreservation_SP04_InvalidTransition(t *testing.T) {
	log := newModeActionLog()
	pe := NewPreservationEngine(log.execute)

	pe.ActivateMode(ModeLockdown, "test", "auto")

	// Can't downgrade from LOCKDOWN to SAFE.
	err := pe.ActivateMode(ModeSafe, "test downgrade", "auto")
	if err == nil {
		t.Error("expected error on downgrade from LOCKDOWN to SAFE")
	}
}

// SP-05: Escalation (SAFE → LOCKDOWN → APOPTOSIS).
func TestPreservation_SP05_Escalation(t *testing.T) {
	log := newModeActionLog()
	pe := NewPreservationEngine(log.execute)

	pe.ActivateMode(ModeSafe, "quorum lost", "auto")
	if pe.CurrentMode() != ModeSafe {
		t.Fatal("expected SAFE")
	}

	pe.ActivateMode(ModeLockdown, "compromise detected", "auto")
	if pe.CurrentMode() != ModeLockdown {
		t.Fatal("expected LOCKDOWN")
	}

	pe.ActivateMode(ModeApoptosis, "rootkit", "auto")
	if pe.CurrentMode() != ModeApoptosis {
		t.Fatal("expected APOPTOSIS")
	}
}

// SP-06: Safe mode auto-exit.
func TestPreservation_SP06_AutoExit(t *testing.T) {
	log := newModeActionLog()
	pe := NewPreservationEngine(log.execute)

	pe.ActivateMode(ModeSafe, "test", "auto")

	// Not yet time.
	if pe.ShouldAutoExit() {
		t.Error("should not auto-exit immediately")
	}

	// Fast-forward activation's auto_exit_at.
	pe.mu.Lock()
	pe.activation.AutoExitAt = time.Now().Add(-1 * time.Second)
	pe.mu.Unlock()

	if !pe.ShouldAutoExit() {
		t.Error("should auto-exit after timer expired")
	}
}

// SP-07: Manual deactivation of safe mode.
func TestPreservation_SP07_ManualDeactivate(t *testing.T) {
	log := newModeActionLog()
	pe := NewPreservationEngine(log.execute)

	pe.ActivateMode(ModeSafe, "test", "auto")
	err := pe.DeactivateMode("architect:admin")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if pe.CurrentMode() != ModeNone {
		t.Errorf("expected NONE, got %s", pe.CurrentMode())
	}
}

// SP-08: Lockdown deactivation.
func TestPreservation_SP08_LockdownDeactivate(t *testing.T) {
	log := newModeActionLog()
	pe := NewPreservationEngine(log.execute)

	pe.ActivateMode(ModeLockdown, "test", "auto")
	err := pe.DeactivateMode("architect:admin")
	if err != nil {
		t.Fatalf("lockdown deactivation should succeed: %v", err)
	}
}

// SP-09: History audit log.
func TestPreservation_SP09_AuditHistory(t *testing.T) {
	log := newModeActionLog()
	pe := NewPreservationEngine(log.execute)

	pe.ActivateMode(ModeSafe, "test", "auto")
	pe.DeactivateMode("admin")

	history := pe.History()
	if len(history) == 0 {
		t.Error("expected audit history entries")
	}

	// Last entry should be deactivation.
	last := history[len(history)-1]
	if last.Action != "deactivated" {
		t.Errorf("expected deactivated, got %s", last.Action)
	}
}

// SP-10: Action failure in non-apoptosis mode aborts.
func TestPreservation_SP10_ActionFailure(t *testing.T) {
	log := newModeActionLog()
	log.failAction = "disable_non_essential_services"
	pe := NewPreservationEngine(log.execute)

	err := pe.ActivateMode(ModeSafe, "test", "auto")
	if err == nil {
		t.Error("expected error when safe mode action fails")
	}
	// Mode should not have changed due to failure.
	if pe.CurrentMode() != ModeNone {
		t.Errorf("expected NONE after failed activation, got %s", pe.CurrentMode())
	}
}

// SP-10b: Action failure in apoptosis mode continues.
func TestPreservation_SP10b_ApoptosisActionFailure(t *testing.T) {
	log := newModeActionLog()
	log.failAction = "graceful_shutdown"
	pe := NewPreservationEngine(log.execute)

	// Apoptosis should continue despite action failures.
	err := pe.ActivateMode(ModeApoptosis, "rootkit", "auto")
	if err != nil {
		t.Fatalf("apoptosis should not fail on action errors: %v", err)
	}
	if pe.CurrentMode() != ModeApoptosis {
		t.Errorf("expected APOPTOSIS, got %s", pe.CurrentMode())
	}
}

// Test ModeNone activation rejected.
func TestPreservation_ModeNoneRejected(t *testing.T) {
	pe := NewPreservationEngine(func(_ EmergencyMode, _ string, _ map[string]interface{}) error { return nil })
	err := pe.ActivateMode(ModeNone, "test", "auto")
	if err == nil {
		t.Error("expected error activating ModeNone")
	}
}

// Test deactivate when already NONE.
func TestPreservation_DeactivateNone(t *testing.T) {
	pe := NewPreservationEngine(func(_ EmergencyMode, _ string, _ map[string]interface{}) error { return nil })
	err := pe.DeactivateMode("admin")
	if err != nil {
		t.Errorf("deactivating NONE should be no-op: %v", err)
	}
}

// Test ShouldAutoExit when not in safe mode.
func TestPreservation_AutoExitNotSafe(t *testing.T) {
	pe := NewPreservationEngine(func(_ EmergencyMode, _ string, _ map[string]interface{}) error { return nil })
	if pe.ShouldAutoExit() {
		t.Error("should not auto-exit when mode is NONE")
	}
}

// --- Integrity Verifier Tests ---

// SP-04 (spec): Binary integrity check — hash mismatch.
func TestIntegrity_BinaryMismatch(t *testing.T) {
	tmpDir := t.TempDir()
	binPath := filepath.Join(tmpDir, "test-binary")
	os.WriteFile(binPath, []byte("original content"), 0o644)

	// Calculate correct hash.
	h := sha256.Sum256([]byte("original content"))
	correctHash := hex.EncodeToString(h[:])

	iv := NewIntegrityVerifier([]byte("test-key"))
	iv.RegisterBinary(binPath, correctHash)

	// Verify (should pass).
	report := iv.VerifyAll()
	if report.Overall != IntegrityVerified {
		t.Errorf("expected VERIFIED, got %s", report.Overall)
	}

	// Tamper with the binary.
	os.WriteFile(binPath, []byte("tampered content"), 0o644)

	// Verify (should fail).
	report = iv.VerifyAll()
	if report.Overall != IntegrityCompromised {
		t.Errorf("expected COMPROMISED, got %s", report.Overall)
	}
	bs := report.Binaries[binPath]
	if bs.Status != IntegrityCompromised {
		t.Errorf("expected binary COMPROMISED, got %s", bs.Status)
	}
}

// Binary not found.
func TestIntegrity_BinaryNotFound(t *testing.T) {
	iv := NewIntegrityVerifier([]byte("test-key"))
	iv.RegisterBinary("/nonexistent/binary", "abc123")

	report := iv.VerifyAll()
	bs := report.Binaries["/nonexistent/binary"]
	if bs.Status != IntegrityUnknown {
		t.Errorf("expected UNKNOWN for missing binary, got %s", bs.Status)
	}
}

// Config HMAC computation.
func TestIntegrity_ConfigHMAC(t *testing.T) {
	tmpDir := t.TempDir()
	cfgPath := filepath.Join(tmpDir, "config.yaml")
	os.WriteFile(cfgPath, []byte("server:\n port: 8080"), 0o644)

	iv := NewIntegrityVerifier([]byte("hmac-key"))
	iv.RegisterConfig(cfgPath)

	report := iv.VerifyAll()
	cs := report.Configs[cfgPath]
	if !cs.Valid {
		t.Errorf("expected valid config, got error: %s", cs.Error)
	}
	if cs.CurrentHMAC == "" {
		t.Error("expected non-empty HMAC")
	}
}

// Config file unreadable.
func TestIntegrity_ConfigUnreadable(t *testing.T) {
	iv := NewIntegrityVerifier([]byte("key"))
	iv.RegisterConfig("/nonexistent/config.yaml")

	report := iv.VerifyAll()
	cs := report.Configs["/nonexistent/config.yaml"]
	if cs.Valid {
		t.Error("expected invalid for unreadable config")
	}
}

// Decision chain — file does not exist (OK, no chain yet).
func TestIntegrity_ChainNotExist(t *testing.T) {
	iv := NewIntegrityVerifier([]byte("key"))
	iv.SetChainPath("/nonexistent/decisions.log")

	report := iv.VerifyAll()
	if report.Chain == nil {
		t.Fatal("expected chain status")
	}
	if !report.Chain.Valid {
		t.Error("nonexistent chain should be valid (no entries)")
	}
}

// Decision chain — file exists.
func TestIntegrity_ChainExists(t *testing.T) {
	tmpDir := t.TempDir()
	chainPath := filepath.Join(tmpDir, "decisions.log")
	os.WriteFile(chainPath, []byte("entry1\nentry2\n"), 0o644)

	iv := NewIntegrityVerifier([]byte("key"))
	iv.SetChainPath(chainPath)

	report := iv.VerifyAll()
	if report.Chain == nil {
		t.Fatal("expected chain status")
	}
	if !report.Chain.Valid {
		t.Error("expected valid chain")
	}
}

// LastReport.
func TestIntegrity_LastReport(t *testing.T) {
	iv := NewIntegrityVerifier([]byte("key"))
	if iv.LastReport() != nil {
		t.Error("expected nil before first verify")
	}

	iv.VerifyAll()
	if iv.LastReport() == nil {
		t.Error("expected report after verify")
	}
}

// Pluggable integrity check in PreservationEngine.
func TestPreservation_IntegrityCheck(t *testing.T) {
	pe := NewPreservationEngine(func(_ EmergencyMode, _ string, _ map[string]interface{}) error { return nil })

	// Default: no integrity fn → VERIFIED.
	report := pe.CheckIntegrity()
	if report.Overall != IntegrityVerified {
		t.Errorf("expected VERIFIED, got %s", report.Overall)
	}

	// Set custom checker.
	pe.SetIntegrityCheck(func() IntegrityReport {
		return IntegrityReport{Overall: IntegrityCompromised, Timestamp: time.Now()}
	})

	report = pe.CheckIntegrity()
	if report.Overall != IntegrityCompromised {
		t.Errorf("expected COMPROMISED from custom checker, got %s", report.Overall)
	}
}
398
internal/application/resilience/recovery_playbooks.go
Normal file

@@ -0,0 +1,398 @@
package resilience

import (
	"context"
	"fmt"
	"log/slog"
	"sync"
	"time"
)

// PlaybookStatus tracks the state of a running playbook.
type PlaybookStatus string

const (
	PlaybookPending    PlaybookStatus = "PENDING"
	PlaybookRunning    PlaybookStatus = "RUNNING"
	PlaybookSucceeded  PlaybookStatus = "SUCCEEDED"
	PlaybookFailed     PlaybookStatus = "FAILED"
	PlaybookRolledBack PlaybookStatus = "ROLLED_BACK"
)

// PlaybookStep is a single step in a recovery playbook.
type PlaybookStep struct {
	ID        string                 `json:"id"`
	Name      string                 `json:"name"`
	Type      string                 `json:"type"` // shell, api, consensus, crypto, systemd, http, prometheus
	Timeout   time.Duration          `json:"timeout"`
	Retries   int                    `json:"retries"`
	Params    map[string]interface{} `json:"params,omitempty"`
	OnError   string                 `json:"on_error"`            // abort, continue, rollback
	Condition string                 `json:"condition,omitempty"` // prerequisite condition
}

// Playbook defines a complete recovery procedure.
type Playbook struct {
	ID              string         `json:"id"`
	Name            string         `json:"name"`
	Version         string         `json:"version"`
	TriggerMetric   string         `json:"trigger_metric"`
	TriggerSeverity string         `json:"trigger_severity"`
	DiagnosisChecks []PlaybookStep `json:"diagnosis_checks"`
	Actions         []PlaybookStep `json:"actions"`
	RollbackActions []PlaybookStep `json:"rollback_actions"`
	SuccessCriteria []string       `json:"success_criteria"`
}

// PlaybookExecution tracks a single playbook run.
type PlaybookExecution struct {
	ID          string         `json:"id"`
	PlaybookID  string         `json:"playbook_id"`
	Component   string         `json:"component"`
	Status      PlaybookStatus `json:"status"`
	StartedAt   time.Time      `json:"started_at"`
	CompletedAt time.Time      `json:"completed_at,omitempty"`
	StepsRun    []StepResult   `json:"steps_run"`
	Error       string         `json:"error,omitempty"`
}

// StepResult records the execution of a single playbook step.
type StepResult struct {
	StepID   string        `json:"step_id"`
	StepName string        `json:"step_name"`
	Success  bool          `json:"success"`
	Duration time.Duration `json:"duration"`
	Output   string        `json:"output,omitempty"`
	Error    string        `json:"error,omitempty"`
}
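// Editor's sketch (illustrative, not from the source): a minimal custom
// playbook built from the types above. The IDs, metric name, and step types
// are invented for the example; only the struct fields are real.
//
//	var diskPressure = Playbook{
//		ID:            "disk-pressure",
//		Name:          "Disk Pressure Relief",
//		Version:       "0.1",
//		TriggerMetric: "disk_usage_pct",
//		Actions: []PlaybookStep{
//			{ID: "prune-tmp", Name: "Prune temp files", Type: "shell", Timeout: 30 * time.Second, OnError: "abort"},
//			{ID: "verify", Name: "Re-check usage", Type: "prometheus", Timeout: 10 * time.Second, Retries: 3, OnError: "abort"},
//		},
//		RollbackActions: []PlaybookStep{
//			{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
//		},
//		SuccessCriteria: []string{"disk_usage_pct < 80"},
//	}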

// PlaybookExecutorFunc runs a single playbook step.
type PlaybookExecutorFunc func(ctx context.Context, step PlaybookStep, component string) (string, error)

// RecoveryPlaybookEngine manages and executes recovery playbooks.
type RecoveryPlaybookEngine struct {
	mu         sync.RWMutex
	playbooks  map[string]*Playbook
	executions []*PlaybookExecution
	execCount  int64
	executor   PlaybookExecutorFunc
	logger     *slog.Logger
}

// NewRecoveryPlaybookEngine creates a new playbook engine.
func NewRecoveryPlaybookEngine(executor PlaybookExecutorFunc) *RecoveryPlaybookEngine {
	return &RecoveryPlaybookEngine{
		playbooks:  make(map[string]*Playbook),
		executions: make([]*PlaybookExecution, 0),
		executor:   executor,
		logger:     slog.Default().With("component", "sarl-recovery-playbooks"),
	}
}

// RegisterPlaybook adds a playbook to the engine.
func (rpe *RecoveryPlaybookEngine) RegisterPlaybook(pb Playbook) {
	rpe.mu.Lock()
	defer rpe.mu.Unlock()
	rpe.playbooks[pb.ID] = &pb
	rpe.logger.Info("playbook registered", "id", pb.ID, "name", pb.Name)
}

// Execute runs a playbook for a given component. Returns the execution ID.
func (rpe *RecoveryPlaybookEngine) Execute(ctx context.Context, playbookID, component string) (string, error) {
	rpe.mu.Lock()
	pb, ok := rpe.playbooks[playbookID]
	if !ok {
		rpe.mu.Unlock()
		return "", fmt.Errorf("playbook %s not found", playbookID)
	}

	rpe.execCount++
	exec := &PlaybookExecution{
		ID:         fmt.Sprintf("exec-%d", rpe.execCount),
		PlaybookID: playbookID,
		Component:  component,
		Status:     PlaybookRunning,
		StartedAt:  time.Now(),
		StepsRun:   make([]StepResult, 0),
	}
	rpe.executions = append(rpe.executions, exec)
	rpe.mu.Unlock()

	rpe.logger.Info("playbook execution started",
		"exec_id", exec.ID,
		"playbook", pb.Name,
		"component", component,
	)

	// Phase 1: Diagnosis checks.
	for _, check := range pb.DiagnosisChecks {
		result := rpe.runStep(ctx, check, component)
		exec.StepsRun = append(exec.StepsRun, result)
		if !result.Success {
			rpe.logger.Warn("diagnosis check failed",
				"step", check.ID,
				"error", result.Error,
			)
		}
	}

	// Phase 2: Execute recovery actions.
	var execErr error
	for _, action := range pb.Actions {
		result := rpe.runStep(ctx, action, component)
		exec.StepsRun = append(exec.StepsRun, result)

		if !result.Success {
			switch action.OnError {
			case "continue":
				continue
			case "rollback":
				execErr = fmt.Errorf("step %s failed (rollback): %s", action.ID, result.Error)
			default: // "abort"
				execErr = fmt.Errorf("step %s failed: %s", action.ID, result.Error)
			}
			break
		}
	}

	// Phase 3: Handle result.
	if execErr != nil {
		rpe.logger.Error("playbook failed, executing rollback",
			"exec_id", exec.ID,
			"error", execErr,
		)

		// Execute rollback.
		for _, rb := range pb.RollbackActions {
			result := rpe.runStep(ctx, rb, component)
			exec.StepsRun = append(exec.StepsRun, result)
		}

		exec.Status = PlaybookRolledBack
		exec.Error = execErr.Error()
	} else {
		exec.Status = PlaybookSucceeded
		rpe.logger.Info("playbook succeeded",
			"exec_id", exec.ID,
			"component", component,
			"duration", time.Since(exec.StartedAt),
		)
	}

	exec.CompletedAt = time.Now()
	return exec.ID, execErr
}

// runStep executes a single step with timeout and retries.
func (rpe *RecoveryPlaybookEngine) runStep(ctx context.Context, step PlaybookStep, component string) StepResult {
	start := time.Now()
	result := StepResult{
		StepID:   step.ID,
		StepName: step.Name,
	}

	retries := step.Retries
	if retries <= 0 {
		retries = 1
	}

	var lastErr error
	for attempt := 0; attempt < retries; attempt++ {
		stepCtx := ctx
		var cancel context.CancelFunc
		if step.Timeout > 0 {
			stepCtx, cancel = context.WithTimeout(ctx, step.Timeout)
		}

		output, err := rpe.executor(stepCtx, step, component)

		if cancel != nil {
			cancel()
		}

		if err == nil {
			result.Success = true
			result.Output = output
			result.Duration = time.Since(start)
			return result
		}
		lastErr = err

		if attempt < retries-1 {
			rpe.logger.Warn("step retry",
				"step", step.ID,
				"attempt", attempt+1,
				"error", err,
			)
		}
	}

	result.Success = false
	result.Error = lastErr.Error()
	result.Duration = time.Since(start)
	return result
}
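// Editor's sketch (assumption): executors passed to the engine should honor
// the per-step context, since runStep derives a timeout context whenever
// step.Timeout > 0; Retries is the total attempt budget, not extra attempts.
// A minimal context-aware executor:
//
//	executor := func(ctx context.Context, step PlaybookStep, component string) (string, error) {
//		select {
//		case <-ctx.Done():
//			return "", ctx.Err()
//		case <-time.After(50 * time.Millisecond): // stand-in for real work
//			return "ok", nil
//		}
//	}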

// GetExecution returns a playbook execution by ID.
// Returns a deep copy to prevent data races with the execution goroutine.
func (rpe *RecoveryPlaybookEngine) GetExecution(id string) (*PlaybookExecution, bool) {
	rpe.mu.RLock()
	defer rpe.mu.RUnlock()

	for _, exec := range rpe.executions {
		if exec.ID == id {
			cp := *exec
			cp.StepsRun = make([]StepResult, len(exec.StepsRun))
			copy(cp.StepsRun, exec.StepsRun)
			return &cp, true
		}
	}
	return nil, false
}

// RecentExecutions returns the last N executions.
// Returns deep copies to prevent data races with the execution goroutine.
func (rpe *RecoveryPlaybookEngine) RecentExecutions(n int) []PlaybookExecution {
	rpe.mu.RLock()
	defer rpe.mu.RUnlock()

	total := len(rpe.executions)
	if total == 0 {
		return nil
	}
	start := total - n
	if start < 0 {
		start = 0
	}

	result := make([]PlaybookExecution, 0, n)
	for i := start; i < total; i++ {
		cp := *rpe.executions[i]
		cp.StepsRun = make([]StepResult, len(rpe.executions[i].StepsRun))
		copy(cp.StepsRun, rpe.executions[i].StepsRun)
		result = append(result, cp)
	}
	return result
}

// PlaybookCount returns the number of registered playbooks.
func (rpe *RecoveryPlaybookEngine) PlaybookCount() int {
	rpe.mu.RLock()
	defer rpe.mu.RUnlock()
	return len(rpe.playbooks)
}
// --- Built-in playbooks per spec §7.1 ---

// DefaultPlaybooks returns the 3 built-in recovery playbooks.
func DefaultPlaybooks() []Playbook {
	return []Playbook{
		ComponentResurrectionPlaybook(),
		ConsensusRecoveryPlaybook(),
		CryptoRotationPlaybook(),
	}
}

// ComponentResurrectionPlaybook per spec §7.1.1.
func ComponentResurrectionPlaybook() Playbook {
	return Playbook{
		ID:              "component-resurrection",
		Name:            "Component Resurrection",
		Version:         "1.0",
		TriggerMetric:   "component_offline",
		TriggerSeverity: "CRITICAL",
		DiagnosisChecks: []PlaybookStep{
			{ID: "diag-process", Name: "Check process exists", Type: "shell", Timeout: 5 * time.Second},
			{ID: "diag-crashes", Name: "Check recent crashes", Type: "shell", Timeout: 5 * time.Second},
			{ID: "diag-resources", Name: "Check resource exhaustion", Type: "prometheus", Timeout: 5 * time.Second},
			{ID: "diag-deps", Name: "Check dependency health", Type: "http", Timeout: 10 * time.Second},
		},
		Actions: []PlaybookStep{
			{ID: "capture-forensics", Name: "Capture forensics", Type: "shell", Timeout: 30 * time.Second, OnError: "continue"},
			{ID: "clear-resources", Name: "Clear temp resources", Type: "shell", Timeout: 10 * time.Second, OnError: "continue"},
			{ID: "restart-component", Name: "Restart component", Type: "systemd", Timeout: 60 * time.Second, OnError: "abort"},
			{ID: "verify-health", Name: "Verify health", Type: "http", Timeout: 30 * time.Second, Retries: 3, OnError: "abort"},
			{ID: "verify-metrics", Name: "Verify metrics", Type: "prometheus", Timeout: 30 * time.Second, OnError: "continue"},
			{ID: "notify-success", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
		},
		RollbackActions: []PlaybookStep{
			{ID: "rb-safe-mode", Name: "Enter safe mode", Type: "api", Timeout: 10 * time.Second},
			{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
		},
		SuccessCriteria: []string{
			"component_status == HEALTHY",
			"health_check_passed == true",
			"no_crashes_for_5min == true",
		},
	}
}

// ConsensusRecoveryPlaybook per spec §7.1.2.
func ConsensusRecoveryPlaybook() Playbook {
	return Playbook{
		ID:              "consensus-recovery",
		Name:            "Distributed Consensus Recovery",
		Version:         "1.0",
		TriggerMetric:   "split_brain",
		TriggerSeverity: "CRITICAL",
		DiagnosisChecks: []PlaybookStep{
			{ID: "diag-peers", Name: "Check peer connectivity", Type: "api", Timeout: 10 * time.Second},
			{ID: "diag-sync", Name: "Check sync status", Type: "api", Timeout: 10 * time.Second},
			{ID: "diag-genome", Name: "Verify genome", Type: "api", Timeout: 5 * time.Second},
		},
		Actions: []PlaybookStep{
			{ID: "pause-writes", Name: "Pause all writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
			{ID: "elect-leader", Name: "Elect leader (Raft)", Type: "consensus", Timeout: 60 * time.Second, OnError: "abort"},
			{ID: "sync-state", Name: "Sync state from leader", Type: "api", Timeout: 300 * time.Second, OnError: "rollback"},
			{ID: "verify-consistency", Name: "Verify consistency", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
			{ID: "resume-writes", Name: "Resume writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
			{ID: "notify-cluster", Name: "Notify cluster", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
		},
		RollbackActions: []PlaybookStep{
			{ID: "rb-readonly", Name: "Maintain readonly", Type: "api", Timeout: 10 * time.Second},
			{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
		},
		SuccessCriteria: []string{
			"leader_elected == true",
			"state_synced == true",
			"consistency_verified == true",
			"writes_resumed == true",
		},
	}
}

// CryptoRotationPlaybook per spec §7.1.3.
func CryptoRotationPlaybook() Playbook {
	return Playbook{
		ID:              "crypto-rotation",
		Name:            "Cryptographic Key Rotation",
		Version:         "1.0",
		TriggerMetric:   "key_compromise",
		TriggerSeverity: "HIGH",
		DiagnosisChecks: []PlaybookStep{
			{ID: "diag-key-age", Name: "Check key age", Type: "crypto", Timeout: 5 * time.Second},
			{ID: "diag-usage", Name: "Check key usage anomaly", Type: "prometheus", Timeout: 5 * time.Second},
			{ID: "diag-tpm", Name: "Check TPM health", Type: "shell", Timeout: 5 * time.Second},
		},
		Actions: []PlaybookStep{
			{ID: "gen-keys", Name: "Generate new keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "abort",
				Params: map[string]interface{}{"algorithm": "ECDSA-P256"},
			},
			{ID: "rotate-certs", Name: "Rotate mTLS certs", Type: "crypto", Timeout: 120 * time.Second, OnError: "rollback"},
			{ID: "resign-chain", Name: "Re-sign decision chain", Type: "crypto", Timeout: 300 * time.Second, OnError: "continue"},
			{ID: "verify-peers", Name: "Verify peer certs", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
			{ID: "revoke-old", Name: "Revoke old keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "continue"},
			{ID: "notify-soc", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
		},
		RollbackActions: []PlaybookStep{
			{ID: "rb-revert-keys", Name: "Revert to previous keys", Type: "crypto", Timeout: 30 * time.Second},
			{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
		},
		SuccessCriteria: []string{
			"new_keys_generated == true",
			"certs_distributed == true",
			"peers_verified == true",
			"old_keys_revoked == true",
		},
	}
}
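// Editor's sketch (illustrative): wiring the engine end to end, mirroring what
// the AR-01 test does. The executor here is a stub; a real one would dispatch
// on step.Type (shell, api, systemd, ...), which this package leaves to the caller.
//
//	rpe := NewRecoveryPlaybookEngine(func(ctx context.Context, step PlaybookStep, component string) (string, error) {
//		return "ok", nil // stub: pretend every step succeeds
//	})
//	for _, pb := range DefaultPlaybooks() {
//		rpe.RegisterPlaybook(pb)
//	}
//	execID, err := rpe.Execute(context.Background(), "component-resurrection", "soc-ingest")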
318
internal/application/resilience/recovery_playbooks_test.go
Normal file

@@ -0,0 +1,318 @@
package resilience

import (
	"context"
	"fmt"
	"testing"
	"time"
)

// --- Mock playbook executor ---

type mockPlaybookExecutor struct {
	failSteps map[string]bool
	callCount int
}

func newMockPlaybookExecutor() *mockPlaybookExecutor {
	return &mockPlaybookExecutor{failSteps: make(map[string]bool)}
}

func (m *mockPlaybookExecutor) execute(_ context.Context, step PlaybookStep, _ string) (string, error) {
	m.callCount++
	if m.failSteps[step.ID] {
		return "", fmt.Errorf("step %s failed", step.ID)
	}
	return fmt.Sprintf("step %s completed", step.ID), nil
}

// --- Recovery Playbook Tests ---

// AR-01: Component resurrection (success).
func TestPlaybook_AR01_ResurrectionSuccess(t *testing.T) {
	mock := newMockPlaybookExecutor()
	rpe := NewRecoveryPlaybookEngine(mock.execute)
	rpe.RegisterPlaybook(ComponentResurrectionPlaybook())

	execID, err := rpe.Execute(context.Background(), "component-resurrection", "soc-ingest")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	exec, ok := rpe.GetExecution(execID)
	if !ok {
		t.Fatal("execution not found")
	}
	if exec.Status != PlaybookSucceeded {
		t.Errorf("expected SUCCEEDED, got %s", exec.Status)
	}
	if len(exec.StepsRun) == 0 {
		t.Error("expected steps to be recorded")
	}
}

// AR-02: Component resurrection (failure → rollback).
func TestPlaybook_AR02_ResurrectionFailure(t *testing.T) {
	mock := newMockPlaybookExecutor()
	mock.failSteps["restart-component"] = true

	rpe := NewRecoveryPlaybookEngine(mock.execute)
	rpe.RegisterPlaybook(ComponentResurrectionPlaybook())

	_, err := rpe.Execute(context.Background(), "component-resurrection", "soc-ingest")
	if err == nil {
		t.Fatal("expected error")
	}

	execs := rpe.RecentExecutions(10)
	if len(execs) == 0 {
		t.Fatal("expected execution")
	}
	if execs[0].Status != PlaybookRolledBack {
		t.Errorf("expected ROLLED_BACK, got %s", execs[0].Status)
	}
}

// AR-03: Consensus recovery (success).
func TestPlaybook_AR03_ConsensusSuccess(t *testing.T) {
	mock := newMockPlaybookExecutor()
	rpe := NewRecoveryPlaybookEngine(mock.execute)
	rpe.RegisterPlaybook(ConsensusRecoveryPlaybook())

	_, err := rpe.Execute(context.Background(), "consensus-recovery", "cluster")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
}

// AR-04: Consensus recovery (failure → readonly maintained).
func TestPlaybook_AR04_ConsensusFailure(t *testing.T) {
	mock := newMockPlaybookExecutor()
	mock.failSteps["elect-leader"] = true

	rpe := NewRecoveryPlaybookEngine(mock.execute)
	rpe.RegisterPlaybook(ConsensusRecoveryPlaybook())

	_, err := rpe.Execute(context.Background(), "consensus-recovery", "cluster")
	if err == nil {
		t.Fatal("expected error")
	}

	execs := rpe.RecentExecutions(10)
	if execs[0].Status != PlaybookRolledBack {
		t.Errorf("expected ROLLED_BACK, got %s", execs[0].Status)
	}
}

// AR-05: Crypto key rotation (success).
func TestPlaybook_AR05_CryptoSuccess(t *testing.T) {
	mock := newMockPlaybookExecutor()
	rpe := NewRecoveryPlaybookEngine(mock.execute)
	rpe.RegisterPlaybook(CryptoRotationPlaybook())

	_, err := rpe.Execute(context.Background(), "crypto-rotation", "system")
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
}

// AR-06: Crypto rotation (emergency — cert rotation fails → rollback).
func TestPlaybook_AR06_CryptoRollback(t *testing.T) {
	mock := newMockPlaybookExecutor()
	mock.failSteps["rotate-certs"] = true

	rpe := NewRecoveryPlaybookEngine(mock.execute)
	rpe.RegisterPlaybook(CryptoRotationPlaybook())

	_, err := rpe.Execute(context.Background(), "crypto-rotation", "system")
	if err == nil {
		t.Fatal("expected error on cert rotation failure")
	}

	execs := rpe.RecentExecutions(10)
	// Should have run rollback (revert keys).
	found := false
	for _, s := range execs[0].StepsRun {
		if s.StepID == "rb-revert-keys" {
			found = true
		}
	}
	if !found {
		t.Error("expected rollback step rb-revert-keys")
	}
}

// AR-07: Forensic capture (all steps recorded).
func TestPlaybook_AR07_ForensicCapture(t *testing.T) {
	mock := newMockPlaybookExecutor()
	rpe := NewRecoveryPlaybookEngine(mock.execute)
	rpe.RegisterPlaybook(ComponentResurrectionPlaybook())

	execID, _ := rpe.Execute(context.Background(), "component-resurrection", "comp")
	exec, _ := rpe.GetExecution(execID)

	for _, step := range exec.StepsRun {
		if step.StepID == "" {
			t.Error("step missing ID")
		}
		if step.StepName == "" {
			t.Errorf("step %s has empty name", step.StepID)
		}
	}
}

// AR-08: Rollback execution on action failure.
func TestPlaybook_AR08_RollbackExecution(t *testing.T) {
	mock := newMockPlaybookExecutor()
	mock.failSteps["sync-state"] = true // Sync fails → rollback trigger.

	rpe := NewRecoveryPlaybookEngine(mock.execute)
	rpe.RegisterPlaybook(ConsensusRecoveryPlaybook())

	rpe.Execute(context.Background(), "consensus-recovery", "cluster")

	execs := rpe.RecentExecutions(10)
	if execs[0].Status != PlaybookRolledBack {
		t.Errorf("expected ROLLED_BACK, got %s", execs[0].Status)
	}
}

// AR-09: Step retries.
func TestPlaybook_AR09_StepRetries(t *testing.T) {
	// Count attempts per step, not across all steps: the original global
	// counter was already past 2 by the time verify-health first ran, so the
	// step's Retries: 3 budget was never exercised.
	healthAttempts := 0
	executor := func(_ context.Context, step PlaybookStep, _ string) (string, error) {
		if step.ID == "verify-health" {
			healthAttempts++
			if healthAttempts <= 2 {
				return "", fmt.Errorf("not healthy yet")
			}
		}
		return "ok", nil
	}

	rpe := NewRecoveryPlaybookEngine(executor)
	rpe.RegisterPlaybook(ComponentResurrectionPlaybook())

	_, err := rpe.Execute(context.Background(), "component-resurrection", "comp")
	if err != nil {
		t.Fatalf("expected success after retries: %v", err)
	}
}

// AR-10: Playbook not found.
func TestPlaybook_AR10_NotFound(t *testing.T) {
	rpe := NewRecoveryPlaybookEngine(nil)
	_, err := rpe.Execute(context.Background(), "nonexistent", "comp")
	if err == nil {
		t.Fatal("expected error for nonexistent playbook")
	}
}

// AR-11: Audit logging (all step timestamps).
func TestPlaybook_AR11_AuditTimestamps(t *testing.T) {
	mock := newMockPlaybookExecutor()
	rpe := NewRecoveryPlaybookEngine(mock.execute)
	rpe.RegisterPlaybook(ComponentResurrectionPlaybook())

	execID, _ := rpe.Execute(context.Background(), "component-resurrection", "comp")
	exec, _ := rpe.GetExecution(execID)

	if exec.StartedAt.IsZero() {
		t.Error("missing started_at")
	}
	if exec.CompletedAt.IsZero() {
		t.Error("missing completed_at")
	}
}

// AR-12: OnError=continue skips non-critical failures.
func TestPlaybook_AR12_ContinueOnError(t *testing.T) {
	mock := newMockPlaybookExecutor()
	mock.failSteps["capture-forensics"] = true // OnError=continue.
	mock.failSteps["notify-success"] = true    // OnError=continue.

	rpe := NewRecoveryPlaybookEngine(mock.execute)
	rpe.RegisterPlaybook(ComponentResurrectionPlaybook())

	_, err := rpe.Execute(context.Background(), "component-resurrection", "comp")
	if err != nil {
		t.Fatalf("expected success despite continue-on-error steps: %v", err)
	}
}

// AR-13: Context cancellation.
func TestPlaybook_AR13_ContextCancel(t *testing.T) {
	executor := func(ctx context.Context, _ PlaybookStep, _ string) (string, error) {
		select {
		case <-ctx.Done():
			return "", ctx.Err()
		case <-time.After(10 * time.Millisecond):
			return "ok", nil
		}
	}

	rpe := NewRecoveryPlaybookEngine(executor)
	rpe.RegisterPlaybook(ComponentResurrectionPlaybook())

	ctx, cancel := context.WithCancel(context.Background())
	cancel() // Cancel immediately.

	_, err := rpe.Execute(ctx, "component-resurrection", "comp")
	// May or may not error depending on timing, but should not hang.
	_ = err
}

// AR-14: DefaultPlaybooks returns 3.
func TestPlaybook_AR14_DefaultPlaybooks(t *testing.T) {
	pbs := DefaultPlaybooks()
	if len(pbs) != 3 {
		t.Errorf("expected 3 playbooks, got %d", len(pbs))
	}

	ids := map[string]bool{}
	for _, pb := range pbs {
		if ids[pb.ID] {
			t.Errorf("duplicate playbook ID: %s", pb.ID)
		}
		ids[pb.ID] = true

		if len(pb.Actions) == 0 {
			t.Errorf("playbook %s has no actions", pb.ID)
		}
		if len(pb.SuccessCriteria) == 0 {
			t.Errorf("playbook %s has no success criteria", pb.ID)
		}
	}
}

// AR-15: PlaybookCount and RecentExecutions.
func TestPlaybook_AR15_CountsAndRecent(t *testing.T) {
	mock := newMockPlaybookExecutor()
	rpe := NewRecoveryPlaybookEngine(mock.execute)

	if rpe.PlaybookCount() != 0 {
		t.Error("expected 0")
	}

	for _, pb := range DefaultPlaybooks() {
		rpe.RegisterPlaybook(pb)
	}
	if rpe.PlaybookCount() != 3 {
		t.Errorf("expected 3, got %d", rpe.PlaybookCount())
	}

	// Run two playbooks.
	rpe.Execute(context.Background(), "component-resurrection", "comp1")
	rpe.Execute(context.Background(), "crypto-rotation", "comp2")

	recent := rpe.RecentExecutions(1)
	if len(recent) != 1 {
		t.Errorf("expected 1 recent, got %d", len(recent))
	}
	if recent[0].PlaybookID != "crypto-rotation" {
		t.Errorf("expected crypto-rotation, got %s", recent[0].PlaybookID)
	}

	all := rpe.RecentExecutions(100)
	if len(all) != 2 {
		t.Errorf("expected 2 total, got %d", len(all))
	}
}