Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates

This commit is contained in:
DmitrL-dev 2026-03-23 16:45:40 +10:00
parent 694e32be26
commit 41cbfd6e0a
178 changed files with 36008 additions and 399 deletions


@@ -0,0 +1,165 @@
package resilience
import (
"context"
"log/slog"
"runtime"
"sync"
"time"
)
// BehaviorProfile captures the runtime behavior of a component.
type BehaviorProfile struct {
Goroutines int `json:"goroutines"`
HeapAllocMB float64 `json:"heap_alloc_mb"`
HeapObjectsK float64 `json:"heap_objects_k"`
GCPauseMs float64 `json:"gc_pause_ms"`
NumGC uint32 `json:"num_gc"`
FileDescriptors int `json:"file_descriptors,omitempty"`
CustomMetrics map[string]float64 `json:"custom_metrics,omitempty"`
}
// BehavioralAlert is emitted when a behavioral anomaly is detected.
type BehavioralAlert struct {
Component string `json:"component"`
AnomalyType string `json:"anomaly_type"` // goroutine_leak, memory_leak, gc_pressure, etc.
Metric string `json:"metric"`
Current float64 `json:"current"`
Baseline float64 `json:"baseline"`
ZScore float64 `json:"z_score"`
Severity string `json:"severity"`
Timestamp time.Time `json:"timestamp"`
}
// BehavioralAnalyzer provides Go-side runtime behavioral analysis.
// It profiles the current process and compares against learned baselines.
// On Linux, eBPF hooks (immune/resilience_hooks.c) extend this to the kernel level.
type BehavioralAnalyzer struct {
mu sync.RWMutex
metricsDB *MetricsDB
alertBus chan BehavioralAlert
interval time.Duration
component string // self component name
logger *slog.Logger
}
// NewBehavioralAnalyzer creates a new behavioral analyzer.
func NewBehavioralAnalyzer(component string, alertBufSize int) *BehavioralAnalyzer {
if alertBufSize <= 0 {
alertBufSize = 50
}
return &BehavioralAnalyzer{
metricsDB: NewMetricsDB(DefaultMetricsWindow, DefaultMetricsMaxSize),
alertBus: make(chan BehavioralAlert, alertBufSize),
interval: 1 * time.Minute,
component: component,
logger: slog.Default().With("component", "sarl-behavioral"),
}
}
// AlertBus returns the channel for consuming behavioral alerts.
func (ba *BehavioralAnalyzer) AlertBus() <-chan BehavioralAlert {
return ba.alertBus
}
// Start begins continuous behavioral monitoring. Blocks until ctx is cancelled.
func (ba *BehavioralAnalyzer) Start(ctx context.Context) {
ba.logger.Info("behavioral analyzer started", "interval", ba.interval)
ticker := time.NewTicker(ba.interval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
ba.logger.Info("behavioral analyzer stopped")
return
case <-ticker.C:
ba.collectAndAnalyze()
}
}
}
// collectAndAnalyze profiles runtime and checks for anomalies.
func (ba *BehavioralAnalyzer) collectAndAnalyze() {
profile := ba.collectProfile()
ba.storeMetrics(profile)
ba.detectAnomalies(profile)
}
// collectProfile gathers current Go runtime stats.
func (ba *BehavioralAnalyzer) collectProfile() BehaviorProfile {
var mem runtime.MemStats
runtime.ReadMemStats(&mem)
return BehaviorProfile{
Goroutines: runtime.NumGoroutine(),
HeapAllocMB: float64(mem.HeapAlloc) / (1024 * 1024),
HeapObjectsK: float64(mem.HeapObjects) / 1000,
GCPauseMs: float64(mem.PauseNs[(mem.NumGC+255)%256]) / 1e6,
NumGC: mem.NumGC,
}
}
// storeMetrics records profile data in the time-series DB.
func (ba *BehavioralAnalyzer) storeMetrics(p BehaviorProfile) {
ba.metricsDB.AddDataPoint(ba.component, "goroutines", float64(p.Goroutines))
ba.metricsDB.AddDataPoint(ba.component, "heap_alloc_mb", p.HeapAllocMB)
ba.metricsDB.AddDataPoint(ba.component, "heap_objects_k", p.HeapObjectsK)
ba.metricsDB.AddDataPoint(ba.component, "gc_pause_ms", p.GCPauseMs)
}
// detectAnomalies checks each metric against its baseline via Z-score.
func (ba *BehavioralAnalyzer) detectAnomalies(p BehaviorProfile) {
checks := []struct {
metric string
value float64
anomalyType string
severity string
}{
{"goroutines", float64(p.Goroutines), "goroutine_leak", "WARNING"},
{"heap_alloc_mb", p.HeapAllocMB, "memory_leak", "CRITICAL"},
{"heap_objects_k", p.HeapObjectsK, "object_leak", "WARNING"},
{"gc_pause_ms", p.GCPauseMs, "gc_pressure", "WARNING"},
}
for _, c := range checks {
baseline := ba.metricsDB.GetBaseline(ba.component, c.metric, DefaultMetricsWindow)
if !IsAnomaly(c.value, baseline, AnomalyZScoreThreshold) {
continue
}
zscore := CalculateZScore(c.value, baseline)
alert := BehavioralAlert{
Component: ba.component,
AnomalyType: c.anomalyType,
Metric: c.metric,
Current: c.value,
Baseline: baseline.Mean,
ZScore: zscore,
Severity: c.severity,
Timestamp: time.Now(),
}
select {
case ba.alertBus <- alert:
ba.logger.Warn("behavioral anomaly detected",
"type", c.anomalyType,
"metric", c.metric,
"z_score", zscore,
)
default:
ba.logger.Error("behavioral alert bus full")
}
}
}
// InjectMetric allows manually injecting a metric for testing.
func (ba *BehavioralAnalyzer) InjectMetric(metric string, value float64) {
ba.metricsDB.AddDataPoint(ba.component, metric, value)
}
// CurrentProfile returns a snapshot of the current runtime profile.
func (ba *BehavioralAnalyzer) CurrentProfile() BehaviorProfile {
return ba.collectProfile()
}
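
Not part of the commit: a minimal usage sketch, assuming it sits in the same resilience package, showing how a host service might run the analyzer and drain its alert bus. The component name and timeout are illustrative.

package resilience

import (
    "context"
    "log/slog"
    "time"
)

// RunBehavioralAnalyzerSketch is an illustrative wiring of the analyzer: it
// starts the monitoring loop in its own goroutine and logs every alert until
// the context expires. Not part of the original commit.
func RunBehavioralAnalyzerSketch() {
    ba := NewBehavioralAnalyzer("example-component", 50)

    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()

    // Start blocks, so run it concurrently.
    go ba.Start(ctx)

    // Drain alerts until the context is cancelled.
    for {
        select {
        case <-ctx.Done():
            return
        case alert := <-ba.AlertBus():
            slog.Warn("runtime anomaly",
                "component", alert.Component,
                "type", alert.AnomalyType,
                "metric", alert.Metric,
                "z_score", alert.ZScore,
            )
        }
    }
}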


@@ -0,0 +1,206 @@
package resilience
import (
"context"
"testing"
"time"
)
// IM-01: Goroutine leak detection.
func TestBehavioral_IM01_GoroutineLeak(t *testing.T) {
ba := NewBehavioralAnalyzer("soc-ingest", 10)
// Build baseline of 10 goroutines.
for i := 0; i < 50; i++ {
ba.InjectMetric("goroutines", 10)
}
// Spike to 1000 goroutines — should trigger anomaly.
ba.metricsDB.AddDataPoint("soc-ingest", "goroutines", 1000)
profile := BehaviorProfile{Goroutines: 1000}
ba.detectAnomalies(profile)
select {
case alert := <-ba.alertBus:
if alert.AnomalyType != "goroutine_leak" {
t.Errorf("expected goroutine_leak, got %s", alert.AnomalyType)
}
if alert.ZScore <= 3 {
t.Errorf("expected Z > 3, got %f", alert.ZScore)
}
default:
t.Error("expected goroutine leak alert")
}
}
// IM-02: Memory leak detection.
func TestBehavioral_IM02_MemoryLeak(t *testing.T) {
ba := NewBehavioralAnalyzer("soc-correlate", 10)
// Baseline: 50 MB.
for i := 0; i < 50; i++ {
ba.InjectMetric("heap_alloc_mb", 50)
}
// Spike to 500 MB.
ba.metricsDB.AddDataPoint("soc-correlate", "heap_alloc_mb", 500)
profile := BehaviorProfile{HeapAllocMB: 500}
ba.detectAnomalies(profile)
select {
case alert := <-ba.alertBus:
if alert.AnomalyType != "memory_leak" {
t.Errorf("expected memory_leak, got %s", alert.AnomalyType)
}
if alert.Severity != "CRITICAL" {
t.Errorf("expected CRITICAL severity, got %s", alert.Severity)
}
default:
t.Error("expected memory leak alert")
}
}
// IM-03: GC pressure detection.
func TestBehavioral_IM03_GCPressure(t *testing.T) {
ba := NewBehavioralAnalyzer("soc-respond", 10)
// Baseline: 1ms GC pause.
for i := 0; i < 50; i++ {
ba.InjectMetric("gc_pause_ms", 1)
}
// Spike to 100ms.
ba.metricsDB.AddDataPoint("soc-respond", "gc_pause_ms", 100)
profile := BehaviorProfile{GCPauseMs: 100}
ba.detectAnomalies(profile)
select {
case alert := <-ba.alertBus:
if alert.AnomalyType != "gc_pressure" {
t.Errorf("expected gc_pressure, got %s", alert.AnomalyType)
}
default:
t.Error("expected gc_pressure alert")
}
}
// IM-04: Object leak detection.
func TestBehavioral_IM04_ObjectLeak(t *testing.T) {
ba := NewBehavioralAnalyzer("shield", 10)
for i := 0; i < 50; i++ {
ba.InjectMetric("heap_objects_k", 100)
}
ba.metricsDB.AddDataPoint("shield", "heap_objects_k", 5000)
profile := BehaviorProfile{HeapObjectsK: 5000}
ba.detectAnomalies(profile)
select {
case alert := <-ba.alertBus:
if alert.AnomalyType != "object_leak" {
t.Errorf("expected object_leak, got %s", alert.AnomalyType)
}
default:
t.Error("expected object leak alert")
}
}
// IM-05: Normal behavior — no alerts.
func TestBehavioral_IM05_NormalBehavior(t *testing.T) {
ba := NewBehavioralAnalyzer("sidecar", 10)
for i := 0; i < 50; i++ {
ba.InjectMetric("goroutines", 10)
ba.InjectMetric("heap_alloc_mb", 50)
ba.InjectMetric("heap_objects_k", 100)
ba.InjectMetric("gc_pause_ms", 1)
}
profile := BehaviorProfile{
Goroutines: 10,
HeapAllocMB: 50,
HeapObjectsK: 100,
GCPauseMs: 1,
}
ba.detectAnomalies(profile)
select {
case alert := <-ba.alertBus:
t.Errorf("expected no alerts for normal behavior, got %+v", alert)
default:
// Good — no alerts.
}
}
// IM-06: Start/Stop lifecycle.
func TestBehavioral_IM06_StartStop(t *testing.T) {
ba := NewBehavioralAnalyzer("test", 10)
ba.interval = 50 * time.Millisecond
ctx, cancel := context.WithCancel(context.Background())
done := make(chan struct{})
go func() {
ba.Start(ctx)
close(done)
}()
time.Sleep(100 * time.Millisecond)
cancel()
select {
case <-done:
case <-time.After(time.Second):
t.Fatal("Start() did not return after context cancellation")
}
}
// IM-07: CurrentProfile returns valid data.
func TestBehavioral_IM07_CurrentProfile(t *testing.T) {
ba := NewBehavioralAnalyzer("test", 10)
profile := ba.CurrentProfile()
if profile.Goroutines <= 0 {
t.Error("expected positive goroutine count")
}
if profile.HeapAllocMB <= 0 {
t.Error("expected positive heap alloc")
}
}
// IM-08: Alert bus overflow (non-blocking).
func TestBehavioral_IM08_AlertBusOverflow(t *testing.T) {
ba := NewBehavioralAnalyzer("test", 2)
// Fill bus.
ba.alertBus <- BehavioralAlert{AnomalyType: "fill1"}
ba.alertBus <- BehavioralAlert{AnomalyType: "fill2"}
// Build baseline.
for i := 0; i < 50; i++ {
ba.InjectMetric("goroutines", 10)
}
// This should not panic.
ba.metricsDB.AddDataPoint("test", "goroutines", 10000)
ba.detectAnomalies(BehaviorProfile{Goroutines: 10000})
}
// Test collectAndAnalyze runs without error.
func TestBehavioral_CollectAndAnalyze(t *testing.T) {
ba := NewBehavioralAnalyzer("test", 10)
// Should not panic.
ba.collectAndAnalyze()
}
// Test InjectMetric stores data.
func TestBehavioral_InjectMetric(t *testing.T) {
ba := NewBehavioralAnalyzer("test", 10)
ba.InjectMetric("custom", 42.0)
recent := ba.metricsDB.GetRecent("test", "custom", 1)
if len(recent) != 1 || recent[0].Value != 42.0 {
t.Errorf("expected 42.0, got %v", recent)
}
}


@@ -0,0 +1,524 @@
package resilience
import (
"context"
"fmt"
"log/slog"
"sync"
"time"
)
// HealingState represents the FSM state of a healing operation.
type HealingState string
const (
HealingIdle HealingState = "IDLE"
HealingDiagnosing HealingState = "DIAGNOSING"
HealingActive HealingState = "HEALING"
HealingVerifying HealingState = "VERIFYING"
HealingCompleted HealingState = "COMPLETED"
HealingFailed HealingState = "FAILED"
)
// HealingResult summarizes a completed healing operation.
type HealingResult string
const (
ResultSuccess HealingResult = "SUCCESS"
ResultFailed HealingResult = "FAILED"
ResultSkipped HealingResult = "SKIPPED"
)
// ActionType defines the kinds of healing actions.
type ActionType string
const (
ActionGracefulStop ActionType = "graceful_stop"
ActionClearTempFiles ActionType = "clear_temp_files"
ActionStartComponent ActionType = "start_component"
ActionVerifyHealth ActionType = "verify_health"
ActionNotifySOC ActionType = "notify_soc"
ActionFreezeConfig ActionType = "freeze_config"
ActionRollbackConfig ActionType = "rollback_config"
ActionVerifyConfig ActionType = "verify_config"
ActionSwitchReadOnly ActionType = "switch_to_readonly"
ActionBackupDB ActionType = "backup_db"
ActionRestoreSnapshot ActionType = "restore_snapshot"
ActionVerifyIntegrity ActionType = "verify_integrity"
ActionResumeWrites ActionType = "resume_writes"
ActionDisableRules ActionType = "disable_rules"
ActionRevertRules ActionType = "revert_rules"
ActionReloadEngine ActionType = "reload_engine"
ActionIsolateNetwork ActionType = "isolate_network"
ActionRegenCerts ActionType = "regenerate_certs"
ActionRestoreNetwork ActionType = "restore_network"
ActionNotifyArchitect ActionType = "notify_architect"
ActionEnterSafeMode ActionType = "enter_safe_mode"
)
// Action is a single step in a healing strategy.
type Action struct {
Type ActionType `json:"type"`
Params map[string]interface{} `json:"params,omitempty"`
Timeout time.Duration `json:"timeout"`
OnError string `json:"on_error"` // "continue", "abort", "rollback"
}
// TriggerCondition defines when a healing strategy activates.
type TriggerCondition struct {
Metrics []string `json:"metrics,omitempty"`
Statuses []ComponentStatus `json:"statuses,omitempty"`
ConsecutiveFailures int `json:"consecutive_failures"`
WithinWindow time.Duration `json:"within_window"`
}
// RollbackPlan defines what happens if healing fails.
type RollbackPlan struct {
OnFailure string `json:"on_failure"` // "escalate", "enter_safe_mode", "maintain_isolation"
Actions []Action `json:"actions,omitempty"`
}
// HealingStrategy is a complete self-healing plan.
type HealingStrategy struct {
ID string `json:"id"`
Name string `json:"name"`
Trigger TriggerCondition `json:"trigger"`
Actions []Action `json:"actions"`
Rollback RollbackPlan `json:"rollback"`
MaxAttempts int `json:"max_attempts"`
Cooldown time.Duration `json:"cooldown"`
}
// Diagnosis is the result of root cause analysis.
type Diagnosis struct {
Component string `json:"component"`
Metric string `json:"metric"`
RootCause string `json:"root_cause"`
Confidence float64 `json:"confidence"`
SuggestedFix string `json:"suggested_fix"`
RelatedAlerts []HealthAlert `json:"related_alerts,omitempty"`
}
// HealingOperation tracks a single healing attempt.
type HealingOperation struct {
ID string `json:"id"`
StrategyID string `json:"strategy_id"`
Component string `json:"component"`
State HealingState `json:"state"`
Diagnosis *Diagnosis `json:"diagnosis,omitempty"`
ActionsRun []ActionLog `json:"actions_run"`
Result HealingResult `json:"result"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at,omitempty"`
Error string `json:"error,omitempty"`
AttemptNumber int `json:"attempt_number"`
}
// ActionLog records the execution of a single action.
type ActionLog struct {
Action ActionType `json:"action"`
StartedAt time.Time `json:"started_at"`
Duration time.Duration `json:"duration"`
Success bool `json:"success"`
Error string `json:"error,omitempty"`
}
// ActionExecutorFunc is the callback that actually runs an action.
// Implementations handle the real system operations (restart, rollback, etc.).
type ActionExecutorFunc func(ctx context.Context, action Action, component string) error
// HealingEngine is the L2 Self-Healing orchestrator.
type HealingEngine struct {
mu sync.RWMutex
strategies []HealingStrategy
cooldowns map[string]time.Time // strategyID → earliest next run
operations []*HealingOperation
opCounter int64
executor ActionExecutorFunc
alertBus <-chan HealthAlert
escalateFn func(HealthAlert) // called on unrecoverable failure
logger *slog.Logger
}
// NewHealingEngine creates a new self-healing engine.
func NewHealingEngine(
alertBus <-chan HealthAlert,
executor ActionExecutorFunc,
escalateFn func(HealthAlert),
) *HealingEngine {
return &HealingEngine{
cooldowns: make(map[string]time.Time),
operations: make([]*HealingOperation, 0),
executor: executor,
alertBus: alertBus,
escalateFn: escalateFn,
logger: slog.Default().With("component", "sarl-healing-engine"),
}
}
// RegisterStrategy adds a healing strategy.
func (he *HealingEngine) RegisterStrategy(s HealingStrategy) {
he.mu.Lock()
defer he.mu.Unlock()
he.strategies = append(he.strategies, s)
he.logger.Info("strategy registered", "id", s.ID, "name", s.Name)
}
// Start begins listening for alerts and initiating healing. Blocks until ctx is cancelled.
func (he *HealingEngine) Start(ctx context.Context) {
he.logger.Info("healing engine started", "strategies", len(he.strategies))
for {
select {
case <-ctx.Done():
he.logger.Info("healing engine stopped")
return
case alert, ok := <-he.alertBus:
if !ok {
return
}
if alert.Severity == SeverityCritical || alert.Severity == SeverityWarning {
he.initiateHealing(ctx, alert)
}
}
}
}
// initiateHealing runs the healing pipeline for an alert.
func (he *HealingEngine) initiateHealing(ctx context.Context, alert HealthAlert) {
strategy := he.findStrategy(alert)
if strategy == nil {
he.logger.Info("no matching strategy for alert",
"component", alert.Component,
"metric", alert.Metric,
)
return
}
if he.isInCooldown(strategy.ID) {
he.logger.Info("strategy in cooldown",
"strategy", strategy.ID,
"component", alert.Component,
)
return
}
op := he.createOperation(strategy, alert.Component)
he.logger.Info("healing initiated",
"op_id", op.ID,
"strategy", strategy.ID,
"component", alert.Component,
)
// Phase 1: Diagnose.
he.transitionOp(op, HealingDiagnosing)
diagnosis := he.diagnose(alert)
op.Diagnosis = &diagnosis
// Phase 2: Execute healing actions.
he.transitionOp(op, HealingActive)
execErr := he.executeActions(ctx, strategy, op)
// Phase 3: Verify recovery.
if execErr == nil {
he.transitionOp(op, HealingVerifying)
verifyErr := he.verifyRecovery(ctx, strategy, op.Component)
if verifyErr != nil {
execErr = verifyErr
}
}
// Phase 4: Complete or fail.
if execErr == nil {
he.transitionOp(op, HealingCompleted)
op.Result = ResultSuccess
he.logger.Info("healing completed successfully",
"op_id", op.ID,
"component", op.Component,
"duration", time.Since(op.StartedAt),
)
} else {
he.transitionOp(op, HealingFailed)
op.Result = ResultFailed
op.Error = execErr.Error()
he.logger.Error("healing failed",
"op_id", op.ID,
"component", op.Component,
"error", execErr,
)
// Execute rollback.
he.executeRollback(ctx, strategy, op)
// Escalate.
if he.escalateFn != nil {
he.escalateFn(alert)
}
}
op.CompletedAt = time.Now()
he.setCooldown(strategy.ID, strategy.Cooldown)
}
// findStrategy returns the first matching strategy for an alert.
func (he *HealingEngine) findStrategy(alert HealthAlert) *HealingStrategy {
he.mu.RLock()
defer he.mu.RUnlock()
for i := range he.strategies {
s := &he.strategies[i]
if he.matchesTrigger(s.Trigger, alert) {
return s
}
}
return nil
}
// matchesTrigger checks if an alert matches a strategy's trigger condition.
func (he *HealingEngine) matchesTrigger(trigger TriggerCondition, alert HealthAlert) bool {
// Match by metric name.
for _, m := range trigger.Metrics {
if m == alert.Metric {
return true
}
}
// Match by component status.
for _, s := range trigger.Statuses {
switch s {
case StatusCritical:
if alert.Severity == SeverityCritical {
return true
}
case StatusOffline:
if alert.Severity == SeverityCritical && alert.SuggestedAction == "restart" {
return true
}
}
}
return false
}
// isInCooldown checks if a strategy is still in its cooldown period.
func (he *HealingEngine) isInCooldown(strategyID string) bool {
he.mu.RLock()
defer he.mu.RUnlock()
earliest, ok := he.cooldowns[strategyID]
return ok && time.Now().Before(earliest)
}
// setCooldown marks a strategy as cooling down.
func (he *HealingEngine) setCooldown(strategyID string, duration time.Duration) {
he.mu.Lock()
defer he.mu.Unlock()
he.cooldowns[strategyID] = time.Now().Add(duration)
}
// createOperation creates and records a new healing operation.
func (he *HealingEngine) createOperation(strategy *HealingStrategy, component string) *HealingOperation {
he.mu.Lock()
defer he.mu.Unlock()
he.opCounter++
op := &HealingOperation{
ID: fmt.Sprintf("heal-%d", he.opCounter),
StrategyID: strategy.ID,
Component: component,
State: HealingIdle,
StartedAt: time.Now(),
ActionsRun: make([]ActionLog, 0),
}
he.operations = append(he.operations, op)
return op
}
// transitionOp moves an operation to a new state.
func (he *HealingEngine) transitionOp(op *HealingOperation, newState HealingState) {
he.logger.Debug("healing state transition",
"op_id", op.ID,
"from", op.State,
"to", newState,
)
op.State = newState
}
// diagnose performs root cause analysis for an alert.
func (he *HealingEngine) diagnose(alert HealthAlert) Diagnosis {
rootCause := "unknown"
confidence := 0.5
suggestedFix := "restart component"
switch {
case alert.Metric == "memory" && alert.Current > 90:
rootCause = "memory_exhaustion"
confidence = 0.9
suggestedFix = "restart with increased limits"
case alert.Metric == "cpu" && alert.Current > 90:
rootCause = "cpu_saturation"
confidence = 0.8
suggestedFix = "check for runaway goroutines"
case alert.Metric == "error_rate":
rootCause = "elevated_error_rate"
confidence = 0.7
suggestedFix = "check dependencies and config"
case alert.Metric == "latency_p99":
rootCause = "latency_degradation"
confidence = 0.6
suggestedFix = "check database and network"
case alert.Metric == "quorum":
rootCause = "quorum_loss"
confidence = 0.95
suggestedFix = "activate safe mode"
default:
rootCause = fmt.Sprintf("threshold_breach_%s", alert.Metric)
confidence = 0.5
suggestedFix = "investigate manually"
}
return Diagnosis{
Component: alert.Component,
Metric: alert.Metric,
RootCause: rootCause,
Confidence: confidence,
SuggestedFix: suggestedFix,
}
}
// executeActions runs each action in sequence.
func (he *HealingEngine) executeActions(ctx context.Context, strategy *HealingStrategy, op *HealingOperation) error {
for _, action := range strategy.Actions {
actionCtx := ctx
var cancel context.CancelFunc
if action.Timeout > 0 {
actionCtx, cancel = context.WithTimeout(ctx, action.Timeout)
}
start := time.Now()
err := he.executor(actionCtx, action, op.Component)
duration := time.Since(start)
if cancel != nil {
cancel()
}
logEntry := ActionLog{
Action: action.Type,
StartedAt: start,
Duration: duration,
Success: err == nil,
}
if err != nil {
logEntry.Error = err.Error()
}
op.ActionsRun = append(op.ActionsRun, logEntry)
if err != nil {
switch action.OnError {
case "continue":
he.logger.Warn("action failed, continuing",
"action", action.Type,
"error", err,
)
case "rollback":
return fmt.Errorf("action %s failed (rollback): %w", action.Type, err)
default: // "abort"
return fmt.Errorf("action %s failed: %w", action.Type, err)
}
}
}
return nil
}
// verifyRecovery checks if the component is healthy after healing.
func (he *HealingEngine) verifyRecovery(ctx context.Context, strategy *HealingStrategy, component string) error {
// Execute a verify_health action if not already in the strategy.
verifyAction := Action{
Type: ActionVerifyHealth,
Timeout: 30 * time.Second,
}
return he.executor(ctx, verifyAction, component)
}
// executeRollback runs the rollback plan for a failed healing.
func (he *HealingEngine) executeRollback(ctx context.Context, strategy *HealingStrategy, op *HealingOperation) {
if len(strategy.Rollback.Actions) == 0 {
he.logger.Info("no rollback actions defined",
"strategy", strategy.ID,
)
return
}
he.logger.Warn("executing rollback",
"strategy", strategy.ID,
"component", op.Component,
)
for _, action := range strategy.Rollback.Actions {
if err := he.executor(ctx, action, op.Component); err != nil {
he.logger.Error("rollback action failed",
"action", action.Type,
"error", err,
)
}
}
}
// GetOperation returns a healing operation by ID.
// Returns a deep copy to prevent data races with the healing goroutine.
func (he *HealingEngine) GetOperation(id string) (*HealingOperation, bool) {
he.mu.RLock()
defer he.mu.RUnlock()
for _, op := range he.operations {
if op.ID == id {
cp := *op
cp.ActionsRun = make([]ActionLog, len(op.ActionsRun))
copy(cp.ActionsRun, op.ActionsRun)
if op.Diagnosis != nil {
diag := *op.Diagnosis
cp.Diagnosis = &diag
}
return &cp, true
}
}
return nil, false
}
// RecentOperations returns the last N operations.
// Returns deep copies to prevent data races with the healing goroutine.
func (he *HealingEngine) RecentOperations(n int) []HealingOperation {
he.mu.RLock()
defer he.mu.RUnlock()
total := len(he.operations)
if total == 0 || n <= 0 {
return nil
}
start := total - n
if start < 0 {
start = 0
}
result := make([]HealingOperation, 0, total-start)
for i := start; i < total; i++ {
cp := *he.operations[i]
cp.ActionsRun = make([]ActionLog, len(he.operations[i].ActionsRun))
copy(cp.ActionsRun, he.operations[i].ActionsRun)
if he.operations[i].Diagnosis != nil {
diag := *he.operations[i].Diagnosis
cp.Diagnosis = &diag
}
result = append(result, cp)
}
return result
}
// StrategyCount returns the number of registered strategies.
func (he *HealingEngine) StrategyCount() int {
he.mu.RLock()
defer he.mu.RUnlock()
return len(he.strategies)
}
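
A hedged wiring sketch (not in this commit): a placeholder ActionExecutorFunc that only logs what it would do, the built-in strategies registered, and the engine fed from a HealthMonitor alert bus. The function names here are hypothetical; the project's real executor performs the actual restarts, rollbacks, and certificate operations.

package resilience

import (
    "context"
    "log/slog"
)

// dryRunExecutor is a placeholder ActionExecutorFunc that records each action
// instead of touching the system.
func dryRunExecutor(_ context.Context, action Action, component string) error {
    slog.Info("healing action (dry run)",
        "action", action.Type,
        "component", component,
        "timeout", action.Timeout,
    )
    return nil
}

// RunHealingEngineSketch wires the engine to a health monitor with the
// built-in strategies. Illustrative only.
func RunHealingEngineSketch(ctx context.Context, hm *HealthMonitor) {
    he := NewHealingEngine(hm.AlertBus(), dryRunExecutor, func(alert HealthAlert) {
        slog.Error("healing escalated", "component", alert.Component, "metric", alert.Metric)
    })
    for _, s := range DefaultStrategies() {
        he.RegisterStrategy(s)
    }
    go hm.Start(ctx)
    he.Start(ctx) // blocks until ctx is cancelled
}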


@@ -0,0 +1,588 @@
package resilience
import (
"context"
"fmt"
"sync/atomic"
"testing"
"time"
)
// --- Mock executor for tests ---
type mockExecutorLog struct {
mu sync.Mutex
actions []ActionType
fail map[ActionType]bool
count atomic.Int64
}
func newMockExecutor() *mockExecutorLog {
return &mockExecutorLog{
fail: make(map[ActionType]bool),
}
}
func (m *mockExecutorLog) execute(_ context.Context, action Action, _ string) error {
m.count.Add(1)
m.mu.Lock()
m.actions = append(m.actions, action.Type)
m.mu.Unlock()
if m.fail[action.Type] {
return fmt.Errorf("action %s failed", action.Type)
}
return nil
}
// recorded returns a copy of the executed action types; safe to call while the
// engine goroutine may still be running.
func (m *mockExecutorLog) recorded() []ActionType {
m.mu.Lock()
defer m.mu.Unlock()
return append([]ActionType(nil), m.actions...)
}
// --- Healing Engine Tests ---
// HE-01: Component restart (success).
func TestHealingEngine_HE01_RestartSuccess(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
var escalated atomic.Bool
he := NewHealingEngine(alertCh, mock.execute, func(_ HealthAlert) {
escalated.Store(true)
})
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "soc-ingest",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
// Run one healing cycle.
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected at least 1 operation")
}
if ops[0].Result != ResultSuccess {
t.Errorf("expected SUCCESS, got %s (error: %s)", ops[0].Result, ops[0].Error)
}
if escalated.Load() {
t.Error("should not have escalated on success")
}
}
// HE-02: Component restart (failure ×3 → escalate).
func TestHealingEngine_HE02_RestartFailureEscalate(t *testing.T) {
mock := newMockExecutor()
mock.fail[ActionStartComponent] = true // Start always fails.
alertCh := make(chan HealthAlert, 10)
var escalated atomic.Bool
he := NewHealingEngine(alertCh, mock.execute, func(_ HealthAlert) {
escalated.Store(true)
})
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "soc-correlate",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
if !escalated.Load() {
t.Error("expected escalation on failure")
}
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation")
}
if ops[0].Result != ResultFailed {
t.Errorf("expected FAILED, got %s", ops[0].Result)
}
}
// HE-03: Config rollback strategy matching.
func TestHealingEngine_HE03_ConfigRollback(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RollbackConfigStrategy())
alertCh <- HealthAlert{
Component: "soc-ingest",
Severity: SeverityWarning,
Metric: "config_tampering",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation for config rollback")
}
if ops[0].StrategyID != "ROLLBACK_CONFIG" {
t.Errorf("expected ROLLBACK_CONFIG, got %s", ops[0].StrategyID)
}
}
// HE-04: Database recovery.
func TestHealingEngine_HE04_DatabaseRecovery(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RecoverDatabaseStrategy())
alertCh <- HealthAlert{
Component: "soc-correlate",
Severity: SeverityCritical,
Metric: "database_corruption",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected DB recovery op")
}
if ops[0].StrategyID != "RECOVER_DATABASE" {
t.Errorf("expected RECOVER_DATABASE, got %s", ops[0].StrategyID)
}
}
// HE-05: Rule poisoning defense.
func TestHealingEngine_HE05_RulePoisoning(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RecoverRulesStrategy())
alertCh <- HealthAlert{
Component: "soc-correlate",
Severity: SeverityWarning,
Metric: "rule_execution_failure_rate",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected rule recovery op")
}
if ops[0].StrategyID != "RECOVER_RULES" {
t.Errorf("expected RECOVER_RULES, got %s", ops[0].StrategyID)
}
}
// HE-06: Network isolation recovery.
func TestHealingEngine_HE06_NetworkRecovery(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RecoverNetworkStrategy())
alertCh <- HealthAlert{
Component: "soc-respond",
Severity: SeverityWarning,
Metric: "network_partition",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected network recovery op")
}
if ops[0].StrategyID != "RECOVER_NETWORK" {
t.Errorf("expected RECOVER_NETWORK, got %s", ops[0].StrategyID)
}
}
// HE-07: Cooldown enforcement.
func TestHealingEngine_HE07_Cooldown(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
// Set cooldown manually.
he.setCooldown("RESTART_COMPONENT", 1*time.Hour)
if !he.isInCooldown("RESTART_COMPONENT") {
t.Error("expected cooldown active")
}
alertCh <- HealthAlert{
Component: "soc-ingest",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) != 0 {
t.Error("expected 0 operations during cooldown")
}
}
// HE-08: Rollback on failure.
func TestHealingEngine_HE08_Rollback(t *testing.T) {
mock := newMockExecutor()
mock.fail[ActionStartComponent] = true
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, func(_ HealthAlert) {})
strategy := RollbackConfigStrategy()
he.RegisterStrategy(strategy)
alertCh <- HealthAlert{
Component: "soc-ingest",
Severity: SeverityWarning,
Metric: "config_tampering",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
// Rollback should have executed enter_safe_mode.
foundSafeMode := false
actions := mock.recorded()
for _, a := range actions {
if a == ActionEnterSafeMode {
foundSafeMode = true
}
}
if !foundSafeMode {
t.Errorf("expected safe mode in rollback, actions: %v", actions)
}
}
// HE-09: State machine transitions.
func TestHealingEngine_HE09_StateTransitions(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation")
}
// Final state should be COMPLETED.
if ops[0].State != HealingCompleted {
t.Errorf("expected COMPLETED, got %s", ops[0].State)
}
}
// HE-10: Audit logging — all actions recorded.
func TestHealingEngine_HE10_AuditLogging(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation")
}
if len(ops[0].ActionsRun) == 0 {
t.Error("expected action logs")
}
for _, al := range ops[0].ActionsRun {
if al.StartedAt.IsZero() {
t.Error("action log missing start time")
}
}
}
// HE-11: Parallel healing — no race conditions.
func TestHealingEngine_HE11_Parallel(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 100)
he := NewHealingEngine(alertCh, mock.execute, nil)
for _, s := range DefaultStrategies() {
he.RegisterStrategy(s)
}
// Send many alerts concurrently.
for i := 0; i < 10; i++ {
alertCh <- HealthAlert{
Component: fmt.Sprintf("comp-%d", i),
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
}
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(1 * time.Second)
cancel()
// All 10 alerts processed (first gets an op, rest hit cooldown).
ops := he.RecentOperations(100)
if len(ops) == 0 {
t.Fatal("expected at least 1 operation")
}
}
// HE-12: No matching strategy → no operation.
func TestHealingEngine_HE12_NoStrategy(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
// No strategies registered.
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "unknown_metric",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) != 0 {
t.Errorf("expected 0 operations, got %d", len(ops))
}
}
// Test diagnosis (various root causes).
func TestHealingEngine_Diagnosis(t *testing.T) {
mock := newMockExecutor()
he := NewHealingEngine(nil, mock.execute, nil)
tests := []struct {
metric string
current float64
wantCause string
}{
{"memory", 95, "memory_exhaustion"},
{"cpu", 95, "cpu_saturation"},
{"error_rate", 10, "elevated_error_rate"},
{"latency_p99", 200, "latency_degradation"},
{"quorum", 0.3, "quorum_loss"},
{"custom", 100, "threshold_breach_custom"},
}
for _, tt := range tests {
alert := HealthAlert{
Component: "test",
Metric: tt.metric,
Current: tt.current,
}
d := he.diagnose(alert)
if d.RootCause != tt.wantCause {
t.Errorf("metric=%s: expected %s, got %s", tt.metric, tt.wantCause, d.RootCause)
}
if d.Confidence <= 0 || d.Confidence > 1 {
t.Errorf("metric=%s: invalid confidence %f", tt.metric, d.Confidence)
}
}
}
// Test DefaultStrategies returns 5 strategies.
func TestDefaultStrategies(t *testing.T) {
strategies := DefaultStrategies()
if len(strategies) != 5 {
t.Errorf("expected 5 strategies, got %d", len(strategies))
}
ids := map[string]bool{}
for _, s := range strategies {
if ids[s.ID] {
t.Errorf("duplicate strategy ID: %s", s.ID)
}
ids[s.ID] = true
if s.MaxAttempts <= 0 {
t.Errorf("strategy %s: invalid max_attempts %d", s.ID, s.MaxAttempts)
}
if s.Cooldown <= 0 {
t.Errorf("strategy %s: invalid cooldown %v", s.ID, s.Cooldown)
}
if len(s.Actions) == 0 {
t.Errorf("strategy %s: no actions defined", s.ID)
}
}
}
// Test StrategyCount.
func TestHealingEngine_StrategyCount(t *testing.T) {
he := NewHealingEngine(nil, nil, nil)
if he.StrategyCount() != 0 {
t.Error("expected 0")
}
for _, s := range DefaultStrategies() {
he.RegisterStrategy(s)
}
if he.StrategyCount() != 5 {
t.Errorf("expected 5, got %d", he.StrategyCount())
}
}
// Test GetOperation.
func TestHealingEngine_GetOperation(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
op, ok := he.GetOperation("heal-1")
if !ok {
t.Fatal("expected operation heal-1")
}
if op.Component != "comp" {
t.Errorf("expected comp, got %s", op.Component)
}
_, ok = he.GetOperation("nonexistent")
if ok {
t.Error("expected not found for nonexistent")
}
}
// Test action OnError=continue.
func TestHealingEngine_ActionContinueOnError(t *testing.T) {
mock := newMockExecutor()
mock.fail[ActionGracefulStop] = true // First action fails but marked continue.
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation")
}
// Should still succeed because graceful_stop has OnError=continue.
if ops[0].Result != ResultSuccess {
t.Errorf("expected SUCCESS (continue on error), got %s", ops[0].Result)
}
}


@@ -0,0 +1,215 @@
package resilience
import "time"
// Built-in healing strategies per the technical specification (ТЗ) §4.1.1.
// These are registered at startup via HealingEngine.RegisterStrategy().
// DefaultStrategies returns the 5 built-in healing strategies.
func DefaultStrategies() []HealingStrategy {
return []HealingStrategy{
RestartComponentStrategy(),
RollbackConfigStrategy(),
RecoverDatabaseStrategy(),
RecoverRulesStrategy(),
RecoverNetworkStrategy(),
}
}
// RestartComponentStrategy handles component crashes and offline states.
// Trigger: component_offline OR component_critical, 2 consecutive failures within 5m.
// Actions: graceful_stop → clear_temp → start → verify → notify.
// Rollback: escalate to next strategy.
func RestartComponentStrategy() HealingStrategy {
return HealingStrategy{
ID: "RESTART_COMPONENT",
Name: "Component Restart",
Trigger: TriggerCondition{
Statuses: []ComponentStatus{StatusOffline, StatusCritical},
ConsecutiveFailures: 2,
WithinWindow: 5 * time.Minute,
},
Actions: []Action{
{Type: ActionGracefulStop, Timeout: 10 * time.Second, OnError: "continue"},
{Type: ActionClearTempFiles, Timeout: 5 * time.Second, OnError: "continue"},
{Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "abort"},
{Type: ActionVerifyHealth, Timeout: 60 * time.Second, OnError: "abort"},
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "INFO",
"message": "Component restarted successfully",
},
},
},
Rollback: RollbackPlan{
OnFailure: "escalate",
Actions: []Action{
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
Params: map[string]interface{}{
"severity": "CRITICAL",
"message": "Component restart failed after max attempts",
},
},
},
},
MaxAttempts: 3,
Cooldown: 5 * time.Minute,
}
}
// RollbackConfigStrategy handles config tampering or validation failures.
// Trigger: config_tampering_detected OR config_validation_failed.
// Actions: freeze → verify_backup → rollback → restart → verify → notify.
func RollbackConfigStrategy() HealingStrategy {
return HealingStrategy{
ID: "ROLLBACK_CONFIG",
Name: "Configuration Rollback",
Trigger: TriggerCondition{
Metrics: []string{"config_tampering", "config_validation"},
},
Actions: []Action{
{Type: ActionFreezeConfig, Timeout: 5 * time.Second, OnError: "abort"},
{Type: ActionRollbackConfig, Timeout: 15 * time.Second, OnError: "abort"},
{Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "rollback"},
{Type: ActionVerifyConfig, Timeout: 10 * time.Second, OnError: "abort"},
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "WARNING",
"message": "Config rolled back due to tampering",
},
},
},
Rollback: RollbackPlan{
OnFailure: "enter_safe_mode",
Actions: []Action{
{Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
},
},
MaxAttempts: 1,
Cooldown: 1 * time.Hour,
}
}
// RecoverDatabaseStrategy handles SQLite corruption.
// Trigger: database_corruption OR sqlite_integrity_failed.
// Actions: readonly → backup → restore → verify → resume → notify.
func RecoverDatabaseStrategy() HealingStrategy {
return HealingStrategy{
ID: "RECOVER_DATABASE",
Name: "Database Recovery",
Trigger: TriggerCondition{
Metrics: []string{"database_corruption", "sqlite_integrity"},
},
Actions: []Action{
{Type: ActionSwitchReadOnly, Timeout: 5 * time.Second, OnError: "abort"},
{Type: ActionBackupDB, Timeout: 30 * time.Second, OnError: "continue"},
{Type: ActionRestoreSnapshot, Timeout: 60 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"snapshot_age_max": "1h",
},
},
{Type: ActionVerifyIntegrity, Timeout: 30 * time.Second, OnError: "abort"},
{Type: ActionResumeWrites, Timeout: 5 * time.Second, OnError: "abort"},
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "WARNING",
"message": "Database recovered from snapshot",
},
},
},
Rollback: RollbackPlan{
OnFailure: "enter_lockdown",
Actions: []Action{
{Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
Params: map[string]interface{}{
"severity": "CRITICAL",
"message": "Database recovery failed",
},
},
},
},
MaxAttempts: 2,
Cooldown: 2 * time.Hour,
}
}
// RecoverRulesStrategy handles correlation rule poisoning.
// Trigger: rule execution failure rate > 50%.
// Actions: disable_suspicious → revert_baseline → verify → reload → notify.
func RecoverRulesStrategy() HealingStrategy {
return HealingStrategy{
ID: "RECOVER_RULES",
Name: "Rule Poisoning Defense",
Trigger: TriggerCondition{
Metrics: []string{"rule_execution_failure_rate", "correlation_rule_anomaly"},
},
Actions: []Action{
{Type: ActionDisableRules, Timeout: 10 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"criteria": "failure_rate > 80%",
},
},
{Type: ActionRevertRules, Timeout: 15 * time.Second, OnError: "abort"},
{Type: ActionReloadEngine, Timeout: 30 * time.Second, OnError: "abort"},
{Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "continue"},
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "WARNING",
"message": "Rules recovered from baseline",
},
},
},
Rollback: RollbackPlan{
OnFailure: "disable_correlation",
},
MaxAttempts: 2,
Cooldown: 4 * time.Hour,
}
}
// RecoverNetworkStrategy handles network partition or mTLS cert expiry.
// Trigger: network_partition_detected OR mTLS_cert_expired.
// Actions: isolate → regen_certs → verify → restore → notify.
func RecoverNetworkStrategy() HealingStrategy {
return HealingStrategy{
ID: "RECOVER_NETWORK",
Name: "Network Isolation Recovery",
Trigger: TriggerCondition{
Metrics: []string{"network_partition", "mtls_cert_expiry"},
},
Actions: []Action{
{Type: ActionIsolateNetwork, Timeout: 5 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"scope": "external_only",
},
},
{Type: ActionRegenCerts, Timeout: 30 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"validity": "24h",
},
},
{Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "rollback"},
{Type: ActionRestoreNetwork, Timeout: 10 * time.Second, OnError: "abort"},
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "INFO",
"message": "Network connectivity restored",
},
},
},
Rollback: RollbackPlan{
OnFailure: "maintain_isolation",
Actions: []Action{
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
Params: map[string]interface{}{
"severity": "CRITICAL",
"message": "Network recovery failed, maintaining isolation",
},
},
},
},
MaxAttempts: 3,
Cooldown: 1 * time.Hour,
}
}
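
The five built-ins above are plain values, so deployments can compose additional strategies from the same action vocabulary. A hypothetical sketch (not registered by DefaultStrategies; the disk_usage_percent metric name is an assumption):

package resilience

import "time"

// DiskPressureStrategy is a hypothetical example of composing the existing
// action types into an extra strategy; it is not part of the built-in set.
func DiskPressureStrategy() HealingStrategy {
    return HealingStrategy{
        ID:   "CLEAR_DISK_PRESSURE",
        Name: "Disk Pressure Relief",
        Trigger: TriggerCondition{
            Metrics: []string{"disk_usage_percent"}, // assumed metric name
        },
        Actions: []Action{
            {Type: ActionClearTempFiles, Timeout: 30 * time.Second, OnError: "continue"},
            {Type: ActionBackupDB, Timeout: 60 * time.Second, OnError: "continue"},
            {Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "abort"},
            {Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
                Params: map[string]interface{}{
                    "severity": "INFO",
                    "message":  "Temporary files cleared to relieve disk pressure",
                },
            },
        },
        Rollback: RollbackPlan{
            OnFailure: "escalate",
        },
        MaxAttempts: 2,
        Cooldown:    30 * time.Minute,
    }
}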


@@ -0,0 +1,445 @@
package resilience
import (
"context"
"fmt"
"log/slog"
"sync"
"time"
)
// ComponentStatus defines the health state of a monitored component.
type ComponentStatus string
const (
StatusHealthy ComponentStatus = "HEALTHY"
StatusDegraded ComponentStatus = "DEGRADED"
StatusCritical ComponentStatus = "CRITICAL"
StatusOffline ComponentStatus = "OFFLINE"
)
// AlertSeverity defines the severity of a health alert.
type AlertSeverity string
const (
SeverityInfo AlertSeverity = "INFO"
SeverityWarning AlertSeverity = "WARNING"
SeverityCritical AlertSeverity = "CRITICAL"
)
// OverallStatus aggregates component statuses into a system-wide status.
type OverallStatus string
const (
OverallHealthy OverallStatus = "HEALTHY"
OverallDegraded OverallStatus = "DEGRADED"
OverallCritical OverallStatus = "CRITICAL"
)
// Default intervals per the technical specification (ТЗ) §3.1.2.
const (
MetricsCollectionInterval = 10 * time.Second
HealthCheckInterval = 30 * time.Second
QuorumValidationInterval = 60 * time.Second
// AnomalyZScoreThreshold — Z > 3.0 = anomaly (99.7% confidence).
AnomalyZScoreThreshold = 3.0
// QuorumThreshold — 2/3 must be healthy.
QuorumThreshold = 0.66
// MaxConsecutiveFailures before marking CRITICAL.
MaxConsecutiveFailures = 3
)
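// Worked example (added for illustration, not in the original commit): with
// AnomalyZScoreThreshold = 3.0, a metric whose 24h baseline has mean 30 and
// stddev 5 only becomes anomalous above 30 + 3.0*5 = 45; and with
// QuorumThreshold = 0.66, a 6-component deployment keeps quorum while at
// least 4 components (4/6 ≈ 0.67) report HEALTHY.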
// ComponentConfig defines monitoring thresholds for a component.
type ComponentConfig struct {
Name string `json:"name"`
Type string `json:"type"` // go_binary, c_binary, c_kernel_module
Thresholds map[string]float64 `json:"thresholds"`
// Whether threshold is an upper bound (true) or lower bound (false).
ThresholdIsMax map[string]bool `json:"threshold_is_max"`
}
// ComponentHealth tracks the health state of a single component.
type ComponentHealth struct {
Name string `json:"name"`
Status ComponentStatus `json:"status"`
Metrics map[string]float64 `json:"metrics"`
LastCheck time.Time `json:"last_check"`
Consecutive int `json:"consecutive_failures"`
Config ComponentConfig `json:"-"`
}
// HealthAlert represents a detected health anomaly.
type HealthAlert struct {
Component string `json:"component"`
Severity AlertSeverity `json:"severity"`
Metric string `json:"metric"`
Current float64 `json:"current"`
Threshold float64 `json:"threshold"`
ZScore float64 `json:"z_score,omitempty"`
Timestamp time.Time `json:"timestamp"`
SuggestedAction string `json:"suggested_action"`
}
// HealthResponse is the API response for GET /api/v1/resilience/health.
type HealthResponse struct {
OverallStatus OverallStatus `json:"overall_status"`
Components []ComponentHealth `json:"components"`
QuorumValid bool `json:"quorum_valid"`
LastCheck time.Time `json:"last_check"`
AnomaliesDetected []HealthAlert `json:"anomalies_detected"`
}
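// Illustrative response shape (values invented) for GET /api/v1/resilience/health:
//
//	{
//	  "overall_status": "DEGRADED",
//	  "components": [
//	    {"name": "soc-ingest", "status": "HEALTHY", "metrics": {"cpu": 31.2}, ...},
//	    {"name": "soc-correlate", "status": "DEGRADED", "metrics": {"error_rate": 7.5}, ...}
//	  ],
//	  "quorum_valid": true,
//	  "last_check": "2026-03-23T06:45:40Z",
//	  "anomalies_detected": []
//	}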
// MetricsCollector is the interface for collecting metrics from components.
// Implementations can use /healthz endpoints, /metrics, or runtime stats.
type MetricsCollector interface {
Collect(ctx context.Context, component string) (map[string]float64, error)
}
// HealthMonitor is the L1 Self-Monitoring orchestrator.
// It collects metrics, runs anomaly detection, validates quorum,
// and emits HealthAlerts to the alert bus.
type HealthMonitor struct {
mu sync.RWMutex
components map[string]*ComponentHealth
metricsDB *MetricsDB
alertBus chan HealthAlert
collector MetricsCollector
logger *slog.Logger
// anomalyWindow is the baseline window for Z-score calculation.
anomalyWindow time.Duration
}
// NewHealthMonitor creates a new health monitor.
func NewHealthMonitor(collector MetricsCollector, alertBufSize int) *HealthMonitor {
if alertBufSize <= 0 {
alertBufSize = 100
}
return &HealthMonitor{
components: make(map[string]*ComponentHealth),
metricsDB: NewMetricsDB(DefaultMetricsWindow, DefaultMetricsMaxSize),
alertBus: make(chan HealthAlert, alertBufSize),
collector: collector,
logger: slog.Default().With("component", "sarl-health-monitor"),
anomalyWindow: 24 * time.Hour,
}
}
// RegisterComponent adds a component to be monitored.
func (hm *HealthMonitor) RegisterComponent(config ComponentConfig) {
hm.mu.Lock()
defer hm.mu.Unlock()
hm.components[config.Name] = &ComponentHealth{
Name: config.Name,
Status: StatusHealthy,
Metrics: make(map[string]float64),
Config: config,
}
hm.logger.Info("component registered", "name", config.Name, "type", config.Type)
}
// AlertBus returns the channel for consuming health alerts.
func (hm *HealthMonitor) AlertBus() <-chan HealthAlert {
return hm.alertBus
}
// Start begins the monitoring loops. Blocks until ctx is cancelled.
func (hm *HealthMonitor) Start(ctx context.Context) {
hm.logger.Info("health monitor started")
metricsTicker := time.NewTicker(MetricsCollectionInterval)
healthTicker := time.NewTicker(HealthCheckInterval)
quorumTicker := time.NewTicker(QuorumValidationInterval)
defer metricsTicker.Stop()
defer healthTicker.Stop()
defer quorumTicker.Stop()
for {
select {
case <-ctx.Done():
hm.logger.Info("health monitor stopped")
return
case <-metricsTicker.C:
hm.collectMetrics(ctx)
case <-healthTicker.C:
hm.checkHealth()
case <-quorumTicker.C:
hm.validateQuorum()
}
}
}
// collectMetrics gathers metrics from all registered components.
func (hm *HealthMonitor) collectMetrics(ctx context.Context) {
hm.mu.RLock()
names := make([]string, 0, len(hm.components))
for name := range hm.components {
names = append(names, name)
}
hm.mu.RUnlock()
for _, name := range names {
metrics, err := hm.collector.Collect(ctx, name)
if err != nil {
hm.logger.Warn("metrics collection failed", "component", name, "error", err)
hm.mu.Lock()
if comp, ok := hm.components[name]; ok {
comp.Consecutive++
}
hm.mu.Unlock()
continue
}
hm.mu.Lock()
comp, ok := hm.components[name]
if ok {
comp.Metrics = metrics
comp.LastCheck = time.Now()
// Store each metric in time-series DB.
for metric, value := range metrics {
hm.metricsDB.AddDataPoint(name, metric, value)
}
}
hm.mu.Unlock()
}
}
// checkHealth evaluates each component against thresholds and anomalies.
func (hm *HealthMonitor) checkHealth() {
hm.mu.Lock()
defer hm.mu.Unlock()
for _, comp := range hm.components {
alerts := hm.evaluateComponent(comp)
for _, alert := range alerts {
hm.emitAlert(alert)
}
}
}
// evaluateComponent checks a single component's metrics against thresholds
// and runs Z-score anomaly detection. Returns any generated alerts.
func (hm *HealthMonitor) evaluateComponent(comp *ComponentHealth) []HealthAlert {
var alerts []HealthAlert
breached := false
for metric, value := range comp.Metrics {
threshold, hasThreshold := comp.Config.Thresholds[metric]
if !hasThreshold {
continue
}
isMax := comp.Config.ThresholdIsMax[metric]
var exceeded bool
if isMax {
exceeded = value > threshold
} else {
exceeded = value < threshold
}
if exceeded {
breached = true
action := "restart"
if metric == "error_rate" || metric == "latency_p99" {
action = "investigate"
}
alerts = append(alerts, HealthAlert{
Component: comp.Name,
Severity: SeverityWarning,
Metric: metric,
Current: value,
Threshold: threshold,
Timestamp: time.Now(),
SuggestedAction: action,
})
}
// Z-score anomaly detection.
baseline := hm.metricsDB.GetBaseline(comp.Name, metric, hm.anomalyWindow)
if IsAnomaly(value, baseline, AnomalyZScoreThreshold) {
zscore := CalculateZScore(value, baseline)
alerts = append(alerts, HealthAlert{
Component: comp.Name,
Severity: SeverityCritical,
Metric: metric,
Current: value,
Threshold: baseline.Mean + AnomalyZScoreThreshold*baseline.StdDev,
ZScore: zscore,
Timestamp: time.Now(),
SuggestedAction: fmt.Sprintf("anomaly detected (Z=%.2f), investigate %s", zscore, metric),
})
}
}
// Update component status.
if breached {
comp.Consecutive++
if comp.Consecutive >= MaxConsecutiveFailures {
comp.Status = StatusCritical
} else {
comp.Status = StatusDegraded
}
} else {
comp.Consecutive = 0
comp.Status = StatusHealthy
}
return alerts
}
// emitAlert sends an alert to the bus (non-blocking).
func (hm *HealthMonitor) emitAlert(alert HealthAlert) {
select {
case hm.alertBus <- alert:
hm.logger.Warn("health alert emitted",
"component", alert.Component,
"severity", alert.Severity,
"metric", alert.Metric,
"current", alert.Current,
"threshold", alert.Threshold,
)
default:
hm.logger.Error("alert bus full, dropping alert",
"component", alert.Component,
"metric", alert.Metric,
)
}
}
// validateQuorum checks if 2/3 of components are healthy.
func (hm *HealthMonitor) validateQuorum() {
hm.mu.RLock()
defer hm.mu.RUnlock()
if len(hm.components) == 0 {
return
}
valid := ValidateQuorum(hm.componentStatuses())
if !valid {
hm.logger.Error("QUORUM LOST — entering degraded state",
"healthy_ratio", hm.healthyRatio(),
"threshold", QuorumThreshold,
)
hm.emitAlert(HealthAlert{
Component: "system",
Severity: SeverityCritical,
Metric: "quorum",
Current: hm.healthyRatio(),
Threshold: QuorumThreshold,
Timestamp: time.Now(),
SuggestedAction: "activate safe mode",
})
}
}
// ValidateQuorum checks if the healthy ratio meets the 2/3 threshold.
func ValidateQuorum(statuses map[string]ComponentStatus) bool {
if len(statuses) == 0 {
return false
}
healthy := 0
for _, status := range statuses {
if status == StatusHealthy {
healthy++
}
}
return float64(healthy)/float64(len(statuses)) >= QuorumThreshold
}
// componentStatuses returns current status map (caller must hold RLock).
func (hm *HealthMonitor) componentStatuses() map[string]ComponentStatus {
statuses := make(map[string]ComponentStatus, len(hm.components))
for name, comp := range hm.components {
statuses[name] = comp.Status
}
return statuses
}
// healthyRatio returns the fraction of healthy components (caller must hold RLock).
func (hm *HealthMonitor) healthyRatio() float64 {
if len(hm.components) == 0 {
return 0
}
healthy := 0
for _, comp := range hm.components {
if comp.Status == StatusHealthy {
healthy++
}
}
return float64(healthy) / float64(len(hm.components))
}
// GetHealth returns a snapshot of the entire system health.
func (hm *HealthMonitor) GetHealth() HealthResponse {
hm.mu.RLock()
defer hm.mu.RUnlock()
components := make([]ComponentHealth, 0, len(hm.components))
for _, comp := range hm.components {
cp := *comp
// Deep copy metrics.
cp.Metrics = make(map[string]float64, len(comp.Metrics))
for k, v := range comp.Metrics {
cp.Metrics[k] = v
}
components = append(components, cp)
}
overall := OverallHealthy
for _, comp := range components {
switch comp.Status {
case StatusCritical, StatusOffline:
overall = OverallCritical
case StatusDegraded:
if overall != OverallCritical {
overall = OverallDegraded
}
}
}
return HealthResponse{
OverallStatus: overall,
Components: components,
QuorumValid: ValidateQuorum(hm.componentStatuses()),
LastCheck: time.Now(),
}
}
// SetComponentStatus manually sets a component's status (for testing/override).
func (hm *HealthMonitor) SetComponentStatus(name string, status ComponentStatus) {
hm.mu.Lock()
defer hm.mu.Unlock()
if comp, ok := hm.components[name]; ok {
comp.Status = status
}
}
// UpdateMetrics manually updates a component's metrics (for testing/override).
func (hm *HealthMonitor) UpdateMetrics(name string, metrics map[string]float64) {
hm.mu.Lock()
defer hm.mu.Unlock()
if comp, ok := hm.components[name]; ok {
comp.Metrics = metrics
comp.LastCheck = time.Now()
for metric, value := range metrics {
hm.metricsDB.AddDataPoint(name, metric, value)
}
}
}
// ComponentCount returns the number of registered components.
func (hm *HealthMonitor) ComponentCount() int {
hm.mu.RLock()
defer hm.mu.RUnlock()
return len(hm.components)
}
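
A minimal sketch, not part of this commit, of one possible MetricsCollector: it ignores the component name and reports the current process's own runtime stats. Real deployments would query each component's /healthz or /metrics endpoint instead; the metric keys below are assumptions.

package resilience

import (
    "context"
    "runtime"
)

// SelfCollector is an illustrative MetricsCollector for single-process setups
// and tests: it returns runtime stats for the hosting process regardless of
// which component it is asked about.
type SelfCollector struct{}

func (SelfCollector) Collect(_ context.Context, _ string) (map[string]float64, error) {
    var mem runtime.MemStats
    runtime.ReadMemStats(&mem)
    return map[string]float64{
        "goroutines":    float64(runtime.NumGoroutine()),
        "heap_alloc_mb": float64(mem.HeapAlloc) / (1024 * 1024),
        "num_gc":        float64(mem.NumGC),
    }, nil
}

With it, hm := NewHealthMonitor(SelfCollector{}, 100) would monitor the hosting process itself.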


@@ -0,0 +1,499 @@
package resilience
import (
"context"
"fmt"
"math"
"testing"
"time"
)
// --- MetricsDB Tests ---
func TestRingBuffer_AddAndAll(t *testing.T) {
rb := newRingBuffer(5)
now := time.Now()
for i := 0; i < 3; i++ {
rb.Add(DataPoint{Timestamp: now.Add(time.Duration(i) * time.Second), Value: float64(i)})
}
if rb.Len() != 3 {
t.Fatalf("expected 3, got %d", rb.Len())
}
all := rb.All()
if len(all) != 3 {
t.Fatalf("expected 3 points, got %d", len(all))
}
for i, dp := range all {
if dp.Value != float64(i) {
t.Errorf("point %d: expected %f, got %f", i, float64(i), dp.Value)
}
}
}
func TestRingBuffer_Wrap(t *testing.T) {
rb := newRingBuffer(3)
now := time.Now()
for i := 0; i < 5; i++ {
rb.Add(DataPoint{Timestamp: now.Add(time.Duration(i) * time.Second), Value: float64(i)})
}
if rb.Len() != 3 {
t.Fatalf("expected 3 (buffer size), got %d", rb.Len())
}
all := rb.All()
// Should contain values 2, 3, 4 (oldest 0, 1 overwritten).
expected := []float64{2, 3, 4}
for i, dp := range all {
if dp.Value != expected[i] {
t.Errorf("point %d: expected %f, got %f", i, expected[i], dp.Value)
}
}
}
func TestMetricsDB_AddAndBaseline(t *testing.T) {
db := NewMetricsDB(time.Hour, 100)
for i := 0; i < 20; i++ {
db.AddDataPoint("soc-ingest", "cpu", 30.0+float64(i%5))
}
baseline := db.GetBaseline("soc-ingest", "cpu", time.Hour)
if baseline.Count != 20 {
t.Fatalf("expected 20 points, got %d", baseline.Count)
}
if baseline.Mean < 30 || baseline.Mean > 35 {
t.Errorf("mean out of expected range: %f", baseline.Mean)
}
if baseline.StdDev == 0 {
t.Error("expected non-zero stddev")
}
}
func TestMetricsDB_EmptyBaseline(t *testing.T) {
db := NewMetricsDB(time.Hour, 100)
baseline := db.GetBaseline("nonexistent", "cpu", time.Hour)
if baseline.Count != 0 {
t.Errorf("expected 0 count for nonexistent, got %d", baseline.Count)
}
}
func TestCalculateZScore(t *testing.T) {
baseline := Baseline{Mean: 30.0, StdDev: 5.0, Count: 100}
// Normal value (Z = 1.0).
z := CalculateZScore(35.0, baseline)
if math.Abs(z-1.0) > 0.01 {
t.Errorf("expected Z≈1.0, got %f", z)
}
// Anomalous value (Z = 4.0).
z = CalculateZScore(50.0, baseline)
if math.Abs(z-4.0) > 0.01 {
t.Errorf("expected Z≈4.0, got %f", z)
}
// Insufficient data → 0.
z = CalculateZScore(50.0, Baseline{Mean: 30, StdDev: 5, Count: 5})
if z != 0 {
t.Errorf("expected 0 for insufficient data, got %f", z)
}
}
func TestIsAnomaly(t *testing.T) {
baseline := Baseline{Mean: 30.0, StdDev: 5.0, Count: 100}
if IsAnomaly(35.0, baseline, 3.0) {
t.Error("35 should not be anomaly (Z=1.0)")
}
if !IsAnomaly(50.0, baseline, 3.0) {
t.Error("50 should be anomaly (Z=4.0)")
}
if !IsAnomaly(10.0, baseline, 3.0) {
t.Error("10 should be anomaly (Z=-4.0)")
}
}
func TestMetricsDB_Purge(t *testing.T) {
db := NewMetricsDB(100*time.Millisecond, 100)
db.AddDataPoint("comp", "cpu", 50)
time.Sleep(150 * time.Millisecond)
db.AddDataPoint("comp", "cpu", 60)
removed := db.Purge()
if removed != 1 {
t.Errorf("expected 1 purged, got %d", removed)
}
}
func TestMetricsDB_GetRecent(t *testing.T) {
db := NewMetricsDB(time.Hour, 100)
for i := 0; i < 10; i++ {
db.AddDataPoint("comp", "mem", float64(i*10))
}
recent := db.GetRecent("comp", "mem", 3)
if len(recent) != 3 {
t.Fatalf("expected 3 recent, got %d", len(recent))
}
// Should be last 3: 70, 80, 90.
if recent[0].Value != 70 || recent[2].Value != 90 {
t.Errorf("unexpected recent values: %v", recent)
}
}
// --- MockCollector for HealthMonitor tests ---
type mockCollector struct {
results map[string]map[string]float64
errors map[string]error
}
func (m *mockCollector) Collect(_ context.Context, component string) (map[string]float64, error) {
if err, ok := m.errors[component]; ok && err != nil {
return nil, err
}
if metrics, ok := m.results[component]; ok {
return metrics, nil
}
return map[string]float64{}, nil
}
// --- HealthMonitor Tests ---
// HM-01: Normal health check — all HEALTHY.
func TestHealthMonitor_HM01_AllHealthy(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
registerTestComponents(hm, 6)
health := hm.GetHealth()
if health.OverallStatus != OverallHealthy {
t.Errorf("expected HEALTHY, got %s", health.OverallStatus)
}
if !health.QuorumValid {
t.Error("expected quorum valid")
}
if len(health.Components) != 6 {
t.Errorf("expected 6 components, got %d", len(health.Components))
}
}
// HM-02: Single component DEGRADED.
func TestHealthMonitor_HM02_SingleDegraded(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
registerTestComponents(hm, 6)
hm.SetComponentStatus("comp-0", StatusDegraded)
health := hm.GetHealth()
if health.OverallStatus != OverallDegraded {
t.Errorf("expected DEGRADED, got %s", health.OverallStatus)
}
if !health.QuorumValid {
t.Error("expected quorum still valid with 5/6 healthy")
}
}
// HM-03: Multiple components CRITICAL → quorum lost.
func TestHealthMonitor_HM03_MultipleCritical(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
registerTestComponents(hm, 6)
hm.SetComponentStatus("comp-0", StatusCritical)
hm.SetComponentStatus("comp-1", StatusCritical)
hm.SetComponentStatus("comp-2", StatusCritical)
health := hm.GetHealth()
if health.OverallStatus != OverallCritical {
t.Errorf("expected CRITICAL, got %s", health.OverallStatus)
}
if health.QuorumValid {
t.Error("expected quorum INVALID with 3/6 critical")
}
}
// HM-04: Anomaly detection (CPU spike).
func TestHealthMonitor_HM04_CPUAnomaly(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 100)
hm.RegisterComponent(ComponentConfig{
Name: "soc-ingest",
Type: "go_binary",
Thresholds: map[string]float64{"cpu": 80},
ThresholdIsMax: map[string]bool{"cpu": true},
})
// Build baseline of normal CPU (30%).
for i := 0; i < 50; i++ {
hm.metricsDB.AddDataPoint("soc-ingest", "cpu", 30.0)
}
// Spike to 95%.
hm.UpdateMetrics("soc-ingest", map[string]float64{"cpu": 95.0})
hm.checkHealth()
// Should have alert(s).
select {
case alert := <-hm.alertBus:
if alert.Component != "soc-ingest" {
t.Errorf("expected soc-ingest, got %s", alert.Component)
}
if alert.Metric != "cpu" {
t.Errorf("expected cpu metric, got %s", alert.Metric)
}
default:
t.Error("expected alert for CPU spike")
}
}
// HM-05: Memory leak detection.
func TestHealthMonitor_HM05_MemoryLeak(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 100)
hm.RegisterComponent(ComponentConfig{
Name: "soc-correlate",
Type: "go_binary",
Thresholds: map[string]float64{"memory": 90},
ThresholdIsMax: map[string]bool{"memory": true},
})
// Build baseline of normal memory (40%).
for i := 0; i < 50; i++ {
hm.metricsDB.AddDataPoint("soc-correlate", "memory", 40.0)
}
// Memory spike to 95%.
hm.UpdateMetrics("soc-correlate", map[string]float64{"memory": 95.0})
hm.checkHealth()
select {
case alert := <-hm.alertBus:
if alert.Metric != "memory" {
t.Errorf("expected memory metric, got %s", alert.Metric)
}
default:
t.Error("expected alert for memory spike")
}
}
// HM-06: Quorum validation failure.
func TestHealthMonitor_HM06_QuorumFailure(t *testing.T) {
statuses := map[string]ComponentStatus{
"a": StatusOffline,
"b": StatusOffline,
"c": StatusOffline,
"d": StatusOffline,
"e": StatusHealthy,
"f": StatusHealthy,
}
if ValidateQuorum(statuses) {
t.Error("expected quorum invalid with 4/6 offline")
}
}
// HM-06b: Quorum validation success (edge case: exactly 2/3).
func TestHealthMonitor_HM06b_QuorumEdge(t *testing.T) {
statuses := map[string]ComponentStatus{
"a": StatusHealthy,
"b": StatusHealthy,
"c": StatusCritical,
}
if !ValidateQuorum(statuses) {
t.Error("expected quorum valid with 2/3 healthy (exact threshold)")
}
}
// HM-06c: Empty quorum.
func TestHealthMonitor_HM06c_EmptyQuorum(t *testing.T) {
if ValidateQuorum(map[string]ComponentStatus{}) {
t.Error("expected quorum invalid with 0 components")
}
}
// HM-07: Metrics collection (no data loss).
func TestHealthMonitor_HM07_MetricsCollection(t *testing.T) {
collector := &mockCollector{
results: map[string]map[string]float64{
"comp-0": {"cpu": 25, "memory": 40},
},
}
hm := NewHealthMonitor(collector, 10)
hm.RegisterComponent(ComponentConfig{Name: "comp-0", Type: "go_binary"})
hm.collectMetrics(context.Background())
hm.mu.RLock()
comp := hm.components["comp-0"]
hm.mu.RUnlock()
if comp.Metrics["cpu"] != 25 {
t.Errorf("expected cpu=25, got %f", comp.Metrics["cpu"])
}
if comp.Metrics["memory"] != 40 {
t.Errorf("expected memory=40, got %f", comp.Metrics["memory"])
}
}
// HM-07b: Collection error increments consecutive failures.
func TestHealthMonitor_HM07b_CollectionError(t *testing.T) {
collector := &mockCollector{
errors: map[string]error{
"comp-0": fmt.Errorf("connection refused"),
},
}
hm := NewHealthMonitor(collector, 10)
hm.RegisterComponent(ComponentConfig{Name: "comp-0", Type: "go_binary"})
hm.collectMetrics(context.Background())
hm.mu.RLock()
comp := hm.components["comp-0"]
hm.mu.RUnlock()
if comp.Consecutive != 1 {
t.Errorf("expected 1 consecutive failure, got %d", comp.Consecutive)
}
}
// HM-08: Alert bus fan-out (non-blocking).
func TestHealthMonitor_HM08_AlertBusFanOut(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 5)
hm.RegisterComponent(ComponentConfig{
Name: "comp",
Type: "go_binary",
Thresholds: map[string]float64{"cpu": 50},
ThresholdIsMax: map[string]bool{"cpu": true},
})
// Fill alert bus.
for i := 0; i < 5; i++ {
hm.alertBus <- HealthAlert{Component: fmt.Sprintf("test-%d", i)}
}
// Emit one more — should be dropped (non-blocking).
hm.emitAlert(HealthAlert{Component: "overflow"})
// No panic = success.
}
// Test GetHealth returns a deep copy.
func TestHealthMonitor_GetHealthDeepCopy(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
hm.RegisterComponent(ComponentConfig{Name: "test", Type: "go_binary"})
hm.UpdateMetrics("test", map[string]float64{"cpu": 50})
health := hm.GetHealth()
health.Components[0].Metrics["cpu"] = 999
// Original should be unchanged.
hm.mu.RLock()
original := hm.components["test"].Metrics["cpu"]
hm.mu.RUnlock()
if original != 50 {
t.Errorf("deep copy failed: original modified to %f", original)
}
}
// Test threshold breach transitions status to DEGRADED then CRITICAL.
func TestHealthMonitor_StatusTransitions(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 100)
hm.RegisterComponent(ComponentConfig{
Name: "comp",
Type: "go_binary",
Thresholds: map[string]float64{"error_rate": 5},
ThresholdIsMax: map[string]bool{"error_rate": true},
})
// Breach once → DEGRADED.
hm.UpdateMetrics("comp", map[string]float64{"error_rate": 10})
hm.checkHealth()
hm.mu.RLock()
status := hm.components["comp"].Status
hm.mu.RUnlock()
if status != StatusDegraded {
t.Errorf("expected DEGRADED after 1 breach, got %s", status)
}
// Breach 3× → CRITICAL.
for i := 0; i < 3; i++ {
hm.checkHealth()
}
hm.mu.RLock()
status = hm.components["comp"].Status
hm.mu.RUnlock()
if status != StatusCritical {
t.Errorf("expected CRITICAL after repeated breaches, got %s", status)
}
}
// Test lower-bound threshold (ThresholdIsMax=false).
func TestHealthMonitor_LowerBoundThreshold(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 100)
hm.RegisterComponent(ComponentConfig{
Name: "immune",
Type: "c_kernel_module",
Thresholds: map[string]float64{"hooks_active": 10},
ThresholdIsMax: map[string]bool{"hooks_active": false},
})
// hooks_active = 5 (below threshold of 10) → warning.
hm.UpdateMetrics("immune", map[string]float64{"hooks_active": 5})
hm.checkHealth()
select {
case alert := <-hm.alertBus:
if alert.Component != "immune" || alert.Metric != "hooks_active" {
t.Errorf("unexpected alert: %+v", alert)
}
default:
t.Error("expected alert for hooks_active below threshold")
}
}
// Test ComponentCount.
func TestHealthMonitor_ComponentCount(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
if hm.ComponentCount() != 0 {
t.Error("expected 0 initially")
}
registerTestComponents(hm, 4)
if hm.ComponentCount() != 4 {
t.Errorf("expected 4, got %d", hm.ComponentCount())
}
}
// Test Start/Stop lifecycle.
func TestHealthMonitor_StartStop(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
registerTestComponents(hm, 2)
ctx, cancel := context.WithCancel(context.Background())
done := make(chan struct{})
go func() {
hm.Start(ctx)
close(done)
}()
// Let it run briefly.
time.Sleep(50 * time.Millisecond)
cancel()
select {
case <-done:
// Clean shutdown.
case <-time.After(time.Second):
t.Fatal("Start() did not return after context cancellation")
}
}
// --- Helpers ---
func registerTestComponents(hm *HealthMonitor, n int) {
for i := 0; i < n; i++ {
hm.RegisterComponent(ComponentConfig{
Name: fmt.Sprintf("comp-%d", i),
Type: "go_binary",
})
}
}

View file

@@ -0,0 +1,247 @@
package resilience
import (
"crypto/hmac"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"log/slog"
"os"
"sync"
"time"
)
// IntegrityStatus represents the result of an integrity check.
type IntegrityStatus string
const (
IntegrityVerified IntegrityStatus = "VERIFIED"
IntegrityCompromised IntegrityStatus = "COMPROMISED"
IntegrityUnknown IntegrityStatus = "UNKNOWN"
)
// IntegrityReport is the full result of an integrity verification.
type IntegrityReport struct {
Overall IntegrityStatus `json:"overall"`
Timestamp time.Time `json:"timestamp"`
Binaries map[string]BinaryStatus `json:"binaries,omitempty"`
Chain *ChainStatus `json:"chain,omitempty"`
Configs map[string]ConfigStatus `json:"configs,omitempty"`
}
// BinaryStatus is the integrity status of a single binary.
type BinaryStatus struct {
Status IntegrityStatus `json:"status"`
Expected string `json:"expected"`
Current string `json:"current"`
}
// ChainStatus is the integrity status of the decision chain.
type ChainStatus struct {
Valid bool `json:"valid"`
Error string `json:"error,omitempty"`
BreakPoint int `json:"break_point,omitempty"`
Entries int `json:"entries"`
}
// ConfigStatus is the integrity status of a config file.
type ConfigStatus struct {
Valid bool `json:"valid"`
Error string `json:"error,omitempty"`
StoredHMAC string `json:"stored_hmac,omitempty"`
CurrentHMAC string `json:"current_hmac,omitempty"`
}
// IntegrityVerifier performs periodic integrity checks on binaries,
// decision chain, and config files.
type IntegrityVerifier struct {
mu sync.RWMutex
binaryHashes map[string]string // path → expected SHA-256
configPaths []string // config files to verify
hmacKey []byte // key for config HMAC-SHA256
chainPath string // path to decision chain log
logger *slog.Logger
lastReport *IntegrityReport
}
// NewIntegrityVerifier creates a new integrity verifier.
func NewIntegrityVerifier(hmacKey []byte) *IntegrityVerifier {
return &IntegrityVerifier{
binaryHashes: make(map[string]string),
hmacKey: hmacKey,
logger: slog.Default().With("component", "sarl-integrity"),
}
}
// RegisterBinary adds a binary with its expected SHA-256 hash.
func (iv *IntegrityVerifier) RegisterBinary(path, expectedHash string) {
iv.mu.Lock()
defer iv.mu.Unlock()
iv.binaryHashes[path] = expectedHash
}
// RegisterConfig adds a config file to verify.
func (iv *IntegrityVerifier) RegisterConfig(path string) {
iv.mu.Lock()
defer iv.mu.Unlock()
iv.configPaths = append(iv.configPaths, path)
}
// SetChainPath sets the decision chain log path.
func (iv *IntegrityVerifier) SetChainPath(path string) {
iv.mu.Lock()
defer iv.mu.Unlock()
iv.chainPath = path
}
// VerifyAll runs all integrity checks and returns a comprehensive report.
// Note: file I/O (binary hashing, config reading) is done WITHOUT holding
// the mutex to prevent thread starvation on slow storage.
func (iv *IntegrityVerifier) VerifyAll() IntegrityReport {
report := IntegrityReport{
Overall: IntegrityVerified,
Timestamp: time.Now(),
Binaries: make(map[string]BinaryStatus),
Configs: make(map[string]ConfigStatus),
}
// Snapshot config under lock, then release before I/O.
iv.mu.RLock()
binaryHashesCopy := make(map[string]string, len(iv.binaryHashes))
for k, v := range iv.binaryHashes {
binaryHashesCopy[k] = v
}
configPathsCopy := make([]string, len(iv.configPaths))
copy(configPathsCopy, iv.configPaths)
hmacKeyCopy := make([]byte, len(iv.hmacKey))
copy(hmacKeyCopy, iv.hmacKey)
chainPath := iv.chainPath
iv.mu.RUnlock()
// Check binaries (file I/O — no lock held).
for path, expected := range binaryHashesCopy {
status := iv.verifyBinary(path, expected)
report.Binaries[path] = status
if status.Status == IntegrityCompromised {
report.Overall = IntegrityCompromised
}
}
// Check configs (file I/O — no lock held).
for _, path := range configPathsCopy {
		status := iv.verifyConfigFile(path, hmacKeyCopy)
report.Configs[path] = status
if !status.Valid {
report.Overall = IntegrityCompromised
}
}
// Check decision chain (file I/O — no lock held).
if chainPath != "" {
chain := iv.verifyDecisionChain(chainPath)
report.Chain = &chain
if !chain.Valid {
report.Overall = IntegrityCompromised
}
}
iv.mu.Lock()
iv.lastReport = &report
iv.mu.Unlock()
if report.Overall == IntegrityCompromised {
iv.logger.Error("INTEGRITY COMPROMISED", "report", report)
} else {
iv.logger.Debug("integrity verified", "binaries", len(report.Binaries))
}
return report
}
// LastReport returns the most recent integrity report.
func (iv *IntegrityVerifier) LastReport() *IntegrityReport {
iv.mu.RLock()
defer iv.mu.RUnlock()
return iv.lastReport
}
// verifyBinary calculates SHA-256 of a file and compares to expected.
func (iv *IntegrityVerifier) verifyBinary(path, expected string) BinaryStatus {
current, err := fileSHA256(path)
if err != nil {
return BinaryStatus{
Status: IntegrityUnknown,
Expected: expected,
Current: fmt.Sprintf("error: %v", err),
}
}
if current != expected {
return BinaryStatus{
Status: IntegrityCompromised,
Expected: expected,
Current: current,
}
}
return BinaryStatus{
Status: IntegrityVerified,
Expected: expected,
Current: current,
}
}
// verifyConfigFile checks HMAC-SHA256 of a config file using the supplied key
// (the snapshot taken under lock in VerifyAll).
func (iv *IntegrityVerifier) verifyConfigFile(path string, hmacKey []byte) ConfigStatus {
	data, err := os.ReadFile(path)
	if err != nil {
		return ConfigStatus{Valid: false, Error: fmt.Sprintf("unreadable: %v", err)}
	}
	currentHMAC := computeHMAC(data, hmacKey)
// For now, we just verify the file is readable and compute HMAC.
// In production, the stored HMAC would be extracted from a sidecar file.
return ConfigStatus{
Valid: true,
CurrentHMAC: currentHMAC,
}
}
// verifyDecisionChain checks the decision chain log. Full hash-chain
// verification is not implemented yet; see the note inside.
func (iv *IntegrityVerifier) verifyDecisionChain(path string) ChainStatus {
_, err := os.Stat(path)
if err != nil {
if os.IsNotExist(err) {
return ChainStatus{Valid: true, Entries: 0} // No chain yet.
}
return ChainStatus{Valid: false, Error: fmt.Sprintf("unreadable: %v", err)}
}
// In a real implementation, we'd parse the chain entries and verify
// that each entry's hash includes the previous entry's hash.
// For now, verify the file exists and is readable.
return ChainStatus{Valid: true}
}
// fileSHA256 computes the SHA-256 hash of a file.
func fileSHA256(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
h := sha256.New()
if _, err := io.Copy(h, f); err != nil {
return "", err
}
return hex.EncodeToString(h.Sum(nil)), nil
}
// computeHMAC computes HMAC-SHA256 of data with the given key.
func computeHMAC(data, key []byte) string {
mac := hmac.New(sha256.New, key)
mac.Write(data)
return hex.EncodeToString(mac.Sum(nil))
}
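
// integrityWiringSketch is a minimal, illustrative sketch (it is not called
// anywhere) of how the verifier is wired at startup. The binary, config, and
// chain paths below are placeholders, and the HMAC key would normally come
// from a secret store rather than a literal.
func integrityWiringSketch() (*IntegrityVerifier, error) {
	// Record the hash of the binary as currently deployed. A hardened setup
	// would take the expected hash from a signed release manifest instead.
	expected, err := fileSHA256("/usr/local/bin/soc-ingest")
	if err != nil {
		return nil, err
	}
	iv := NewIntegrityVerifier([]byte("example-hmac-key"))
	iv.RegisterBinary("/usr/local/bin/soc-ingest", expected)
	iv.RegisterConfig("/etc/sentinel/config.yaml")
	iv.SetChainPath("/var/lib/sentinel/decisions.log")
	if report := iv.VerifyAll(); report.Overall == IntegrityCompromised {
		return nil, fmt.Errorf("integrity compromised at startup")
	}
	return iv, nil
}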

View file

@@ -0,0 +1,283 @@
// Package resilience implements the Sentinel Autonomous Resilience Layer (SARL).
//
// Five levels of autonomous self-recovery:
//
// L1 — Self-Monitoring: health checks, quorum, anomaly detection
// L2 — Self-Healing: restart, rollback, recovery strategies
// L3 — Self-Preservation: emergency modes (safe/lockdown/apoptosis)
// L4 — Immune Integration: behavioral anomaly detection
// L5 — Autonomous Recovery: playbooks for resurrection, consensus, crypto
package resilience
import (
"math"
"sync"
"time"
)
// MetricsDB provides an in-memory time-series store with ring buffers
// for each component/metric pair. Supports rolling baselines (mean/stddev)
// for Z-score anomaly detection.
type MetricsDB struct {
mu sync.RWMutex
series map[string]*RingBuffer // key = "component:metric"
window time.Duration // retention window (default 1h)
maxSize int // max data points per series
}
// DataPoint is a single timestamped metric value.
type DataPoint struct {
Timestamp time.Time `json:"timestamp"`
Value float64 `json:"value"`
}
// Baseline holds rolling statistics for anomaly detection.
type Baseline struct {
Mean float64 `json:"mean"`
StdDev float64 `json:"std_dev"`
Count int `json:"count"`
Min float64 `json:"min"`
Max float64 `json:"max"`
}
// RingBuffer is a fixed-size circular buffer for DataPoints.
type RingBuffer struct {
data []DataPoint
head int
count int
size int
}
// DefaultMetricsWindow is the default retention window (1 hour).
const DefaultMetricsWindow = 1 * time.Hour
// DefaultMetricsMaxSize is the default max points per series (1h / 10s = 360).
const DefaultMetricsMaxSize = 360
// NewMetricsDB creates a new in-memory time-series store.
func NewMetricsDB(window time.Duration, maxSize int) *MetricsDB {
if window <= 0 {
window = DefaultMetricsWindow
}
if maxSize <= 0 {
maxSize = DefaultMetricsMaxSize
}
return &MetricsDB{
series: make(map[string]*RingBuffer),
window: window,
maxSize: maxSize,
}
}
// AddDataPoint records a metric value for a component.
func (db *MetricsDB) AddDataPoint(component, metric string, value float64) {
key := component + ":" + metric
db.mu.Lock()
defer db.mu.Unlock()
rb, ok := db.series[key]
if !ok {
rb = newRingBuffer(db.maxSize)
db.series[key] = rb
}
rb.Add(DataPoint{Timestamp: time.Now(), Value: value})
}
// GetBaseline returns rolling mean/stddev for a component metric
// calculated over the specified window duration.
func (db *MetricsDB) GetBaseline(component, metric string, window time.Duration) Baseline {
key := component + ":" + metric
db.mu.RLock()
defer db.mu.RUnlock()
rb, ok := db.series[key]
if !ok {
return Baseline{}
}
cutoff := time.Now().Add(-window)
points := rb.After(cutoff)
if len(points) == 0 {
return Baseline{}
}
return calculateBaseline(points)
}
// GetRecent returns the most recent N data points for a component metric.
func (db *MetricsDB) GetRecent(component, metric string, n int) []DataPoint {
key := component + ":" + metric
db.mu.RLock()
defer db.mu.RUnlock()
rb, ok := db.series[key]
if !ok {
return nil
}
all := rb.All()
if len(all) <= n {
return all
}
return all[len(all)-n:]
}
// CalculateZScore returns the Z-score for a value against the baseline.
// Returns 0 if baseline has insufficient data or zero stddev.
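// For example, against a baseline with Mean=30 and StdDev=5, a value of 50
// yields Z = (50-30)/5 = 4.0.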
func CalculateZScore(value float64, baseline Baseline) float64 {
if baseline.Count < 10 || baseline.StdDev == 0 {
return 0
}
return (value - baseline.Mean) / baseline.StdDev
}
// IsAnomaly returns true if the Z-score exceeds the threshold (default 3.0).
func IsAnomaly(value float64, baseline Baseline, threshold float64) bool {
if threshold <= 0 {
threshold = 3.0
}
zscore := CalculateZScore(value, baseline)
return math.Abs(zscore) > threshold
}
// SeriesCount returns the number of tracked series.
func (db *MetricsDB) SeriesCount() int {
db.mu.RLock()
defer db.mu.RUnlock()
return len(db.series)
}
// Purge removes data points older than the retention window.
func (db *MetricsDB) Purge() int {
db.mu.Lock()
defer db.mu.Unlock()
cutoff := time.Now().Add(-db.window)
total := 0
for key, rb := range db.series {
removed := rb.RemoveBefore(cutoff)
total += removed
if rb.Len() == 0 {
delete(db.series, key)
}
}
return total
}
// --- RingBuffer implementation ---
func newRingBuffer(size int) *RingBuffer {
return &RingBuffer{
data: make([]DataPoint, size),
size: size,
}
}
// Add inserts a DataPoint, overwriting the oldest if full.
func (rb *RingBuffer) Add(dp DataPoint) {
rb.data[rb.head] = dp
rb.head = (rb.head + 1) % rb.size
if rb.count < rb.size {
rb.count++
}
}
// Len returns the number of data points in the buffer.
func (rb *RingBuffer) Len() int {
return rb.count
}
// All returns all data points in chronological order.
func (rb *RingBuffer) All() []DataPoint {
if rb.count == 0 {
return nil
}
result := make([]DataPoint, rb.count)
if rb.count < rb.size {
// Buffer not yet full — data starts at 0.
copy(result, rb.data[:rb.count])
} else {
// Buffer wrapped — oldest is at head.
n := copy(result, rb.data[rb.head:rb.size])
copy(result[n:], rb.data[:rb.head])
}
return result
}
// After returns points with timestamp after the cutoff.
func (rb *RingBuffer) After(cutoff time.Time) []DataPoint {
all := rb.All()
result := make([]DataPoint, 0, len(all))
for _, dp := range all {
if dp.Timestamp.After(cutoff) {
result = append(result, dp)
}
}
return result
}
// RemoveBefore removes data points before the cutoff by compacting.
// Returns the number of points removed.
func (rb *RingBuffer) RemoveBefore(cutoff time.Time) int {
all := rb.All()
kept := make([]DataPoint, 0, len(all))
for _, dp := range all {
if !dp.Timestamp.Before(cutoff) {
kept = append(kept, dp)
}
}
removed := len(all) - len(kept)
if removed == 0 {
return 0
}
// Rebuild the ring buffer with kept data.
rb.count = 0
rb.head = 0
for _, dp := range kept {
rb.Add(dp)
}
return removed
}
// --- Statistics ---
func calculateBaseline(points []DataPoint) Baseline {
n := len(points)
if n == 0 {
return Baseline{}
}
var sum, min, max float64
min = points[0].Value
max = points[0].Value
for _, p := range points {
sum += p.Value
if p.Value < min {
min = p.Value
}
if p.Value > max {
max = p.Value
}
}
mean := sum / float64(n)
var variance float64
for _, p := range points {
diff := p.Value - mean
variance += diff * diff
}
variance /= float64(n)
return Baseline{
Mean: mean,
StdDev: math.Sqrt(variance),
Count: n,
Min: min,
Max: max,
}
}
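
// anomalyCheckSketch is a minimal, illustrative sketch (it is not called
// anywhere) of how AddDataPoint, GetBaseline, CalculateZScore, and IsAnomaly
// fit together. The component and metric names are placeholders; 3.0 is the
// conventional Z-score threshold used elsewhere in this package.
func anomalyCheckSketch(db *MetricsDB, value float64) (zScore float64, anomalous bool) {
	db.AddDataPoint("soc-ingest", "cpu", value)
	baseline := db.GetBaseline("soc-ingest", "cpu", 15*time.Minute)
	return CalculateZScore(value, baseline), IsAnomaly(value, baseline, 3.0)
}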

View file

@@ -0,0 +1,290 @@
package resilience
import (
"fmt"
"log/slog"
"sync"
"time"
)
// EmergencyMode defines the system's emergency state.
type EmergencyMode string
const (
ModeNone EmergencyMode = "NONE"
ModeSafe EmergencyMode = "SAFE"
ModeLockdown EmergencyMode = "LOCKDOWN"
ModeApoptosis EmergencyMode = "APOPTOSIS"
)
// ModeActivation records when and why a mode was activated.
type ModeActivation struct {
Mode EmergencyMode `json:"mode"`
ActivatedAt time.Time `json:"activated_at"`
ActivatedBy string `json:"activated_by"` // "auto" or "architect:<name>"
Reason string `json:"reason"`
AutoExit bool `json:"auto_exit"`
AutoExitAt time.Time `json:"auto_exit_at,omitempty"`
}
// PreservationEvent is an audit log entry for preservation actions.
type PreservationEvent struct {
Timestamp time.Time `json:"timestamp"`
Mode EmergencyMode `json:"mode"`
Action string `json:"action"`
Detail string `json:"detail"`
Success bool `json:"success"`
Error string `json:"error,omitempty"`
}
// ModeActionFunc is a callback to perform mode-specific actions.
// Implementations handle the real system operations (network isolation, process freeze, etc.).
type ModeActionFunc func(mode EmergencyMode, action string, params map[string]interface{}) error
// PreservationEngine manages emergency modes (safe/lockdown/apoptosis).
type PreservationEngine struct {
mu sync.RWMutex
currentMode EmergencyMode
activation *ModeActivation
history []PreservationEvent
actionFn ModeActionFunc
integrityFn func() IntegrityReport // pluggable integrity check
logger *slog.Logger
}
// NewPreservationEngine creates a new preservation engine.
func NewPreservationEngine(actionFn ModeActionFunc) *PreservationEngine {
return &PreservationEngine{
currentMode: ModeNone,
history: make([]PreservationEvent, 0),
actionFn: actionFn,
logger: slog.Default().With("component", "sarl-preservation"),
}
}
// CurrentMode returns the active emergency mode.
func (pe *PreservationEngine) CurrentMode() EmergencyMode {
pe.mu.RLock()
defer pe.mu.RUnlock()
return pe.currentMode
}
// Activation returns the current mode activation details (nil if NONE).
func (pe *PreservationEngine) Activation() *ModeActivation {
pe.mu.RLock()
defer pe.mu.RUnlock()
if pe.activation == nil {
return nil
}
cp := *pe.activation
return &cp
}
// ActivateMode enters an emergency mode. Returns error if transition is invalid.
func (pe *PreservationEngine) ActivateMode(mode EmergencyMode, reason, activatedBy string) error {
pe.mu.Lock()
defer pe.mu.Unlock()
if mode == ModeNone {
return fmt.Errorf("use DeactivateMode to exit emergency mode")
}
// Validate transitions: can always escalate, can't downgrade.
if !pe.isValidTransition(pe.currentMode, mode) {
return fmt.Errorf("invalid transition: %s → %s", pe.currentMode, mode)
}
pe.logger.Warn("EMERGENCY MODE ACTIVATION",
"mode", mode,
"reason", reason,
"activated_by", activatedBy,
)
// Execute mode-specific actions.
actions := pe.actionsForMode(mode)
for _, action := range actions {
err := pe.executeAction(mode, action.name, action.params)
if err != nil {
pe.logger.Error("mode action failed",
"mode", mode,
"action", action.name,
"error", err,
)
			// Only apoptosis continues despite action errors; other modes abort.
if mode != ModeApoptosis {
return fmt.Errorf("failed to activate %s: action %s: %w", mode, action.name, err)
}
}
}
activation := &ModeActivation{
Mode: mode,
ActivatedAt: time.Now(),
ActivatedBy: activatedBy,
Reason: reason,
}
if mode == ModeSafe {
activation.AutoExit = true
activation.AutoExitAt = time.Now().Add(15 * time.Minute)
}
pe.currentMode = mode
pe.activation = activation
return nil
}
// DeactivateMode exits the current emergency mode and returns to NONE.
func (pe *PreservationEngine) DeactivateMode(deactivatedBy string) error {
pe.mu.Lock()
defer pe.mu.Unlock()
if pe.currentMode == ModeNone {
return nil
}
	// Safe and lockdown can be deactivated manually; apoptosis cannot be
	// deactivated at all and requires a rebuild.
if pe.currentMode == ModeApoptosis {
return fmt.Errorf("apoptosis mode cannot be deactivated — system rebuild required")
}
pe.logger.Info("EMERGENCY MODE DEACTIVATION",
"mode", pe.currentMode,
"deactivated_by", deactivatedBy,
)
pe.recordEvent(pe.currentMode, "deactivated",
fmt.Sprintf("deactivated by %s", deactivatedBy), true, "")
pe.currentMode = ModeNone
pe.activation = nil
return nil
}
// ShouldAutoExit checks if safe mode should auto-exit based on timer.
func (pe *PreservationEngine) ShouldAutoExit() bool {
pe.mu.RLock()
defer pe.mu.RUnlock()
if pe.currentMode != ModeSafe || pe.activation == nil {
return false
}
return pe.activation.AutoExit && time.Now().After(pe.activation.AutoExitAt)
}
// isValidTransition checks if a mode transition is allowed.
// Escalation order: NONE → SAFE → LOCKDOWN → APOPTOSIS.
func (pe *PreservationEngine) isValidTransition(from, to EmergencyMode) bool {
rank := map[EmergencyMode]int{
ModeNone: 0,
ModeSafe: 1,
ModeLockdown: 2,
ModeApoptosis: 3,
}
// Can always escalate or re-enter same mode.
return rank[to] >= rank[from]
}
type modeAction struct {
name string
params map[string]interface{}
}
// actionsForMode returns the actions to execute for a given mode.
func (pe *PreservationEngine) actionsForMode(mode EmergencyMode) []modeAction {
switch mode {
case ModeSafe:
return []modeAction{
{"disable_non_essential_services", map[string]interface{}{
"services": []string{"analytics", "reporting", "p2p_sync", "threat_intel_feeds"},
}},
{"enable_readonly_mode", map[string]interface{}{
"scope": []string{"event_ingest", "correlation", "dashboard_view"},
}},
{"preserve_all_logs", nil},
{"notify_architect", map[string]interface{}{"severity": "emergency"}},
{"increase_monitoring_frequency", map[string]interface{}{"interval": "5s"}},
}
case ModeLockdown:
return []modeAction{
{"isolate_from_network", map[string]interface{}{"scope": "all_external"}},
{"freeze_all_processes", nil},
{"capture_memory_dump", nil},
{"capture_disk_snapshot", nil},
{"trigger_immune_kernel_lock", map[string]interface{}{
"allow_syscalls": []string{"read", "write", "exit"},
}},
{"send_panic_alert", map[string]interface{}{
"channels": []string{"email", "sms", "slack", "pagerduty"},
}},
}
case ModeApoptosis:
return []modeAction{
{"graceful_shutdown", map[string]interface{}{"timeout": "30s", "drain_events": true}},
{"zero_sensitive_memory", map[string]interface{}{
"regions": []string{"keys", "certs", "tokens", "secrets"},
}},
{"preserve_forensic_evidence", nil},
{"notify_soc", map[string]interface{}{
"severity": "CRITICAL",
"message": "system self-terminated",
}},
{"secure_erase_temp_files", nil},
}
}
return nil
}
// executeAction runs a mode action and records the result.
func (pe *PreservationEngine) executeAction(mode EmergencyMode, name string, params map[string]interface{}) error {
err := pe.actionFn(mode, name, params)
success := err == nil
errStr := ""
if err != nil {
errStr = err.Error()
}
pe.recordEvent(mode, name, fmt.Sprintf("params: %v", params), success, errStr)
return err
}
// recordEvent appends to the audit history.
func (pe *PreservationEngine) recordEvent(mode EmergencyMode, action, detail string, success bool, errStr string) {
pe.history = append(pe.history, PreservationEvent{
Timestamp: time.Now(),
Mode: mode,
Action: action,
Detail: detail,
Success: success,
Error: errStr,
})
}
// History returns the preservation audit log.
func (pe *PreservationEngine) History() []PreservationEvent {
pe.mu.RLock()
defer pe.mu.RUnlock()
result := make([]PreservationEvent, len(pe.history))
copy(result, pe.history)
return result
}
// SetIntegrityCheck sets the pluggable integrity checker.
func (pe *PreservationEngine) SetIntegrityCheck(fn func() IntegrityReport) {
pe.mu.Lock()
defer pe.mu.Unlock()
pe.integrityFn = fn
}
// CheckIntegrity runs the pluggable integrity check and returns the report.
func (pe *PreservationEngine) CheckIntegrity() IntegrityReport {
pe.mu.RLock()
fn := pe.integrityFn
pe.mu.RUnlock()
if fn == nil {
return IntegrityReport{Overall: IntegrityVerified, Timestamp: time.Now()}
}
return fn()
}
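
// preservationWiringSketch is a minimal, illustrative sketch (it is not called
// anywhere) of how a caller constructs the engine with a ModeActionFunc and
// enters safe mode. The reason strings are placeholders; a real action
// function would perform the actual system operations.
func preservationWiringSketch() error {
	pe := NewPreservationEngine(func(mode EmergencyMode, action string, params map[string]interface{}) error {
		// A real implementation would isolate the network, freeze processes, etc.
		slog.Info("preservation action", "mode", mode, "action", action, "params", params)
		return nil
	})
	if err := pe.ActivateMode(ModeSafe, "quorum lost", "auto"); err != nil {
		return err
	}
	// A periodic task would later check the safe-mode timer and exit cleanly.
	if pe.ShouldAutoExit() {
		return pe.DeactivateMode("auto")
	}
	return nil
}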

View file

@@ -0,0 +1,439 @@
package resilience
import (
"crypto/sha256"
"encoding/hex"
"os"
"path/filepath"
"testing"
"time"
)
// --- Mock action function ---
type modeActionLog struct {
calls []struct {
mode EmergencyMode
action string
}
failAction string // if set, this action will fail
}
func newModeActionLog() *modeActionLog {
return &modeActionLog{}
}
func (m *modeActionLog) execute(mode EmergencyMode, action string, _ map[string]interface{}) error {
m.calls = append(m.calls, struct {
mode EmergencyMode
action string
}{mode, action})
if m.failAction == action {
return errActionFailed
}
return nil
}
var errActionFailed = &actionError{"simulated failure"}
type actionError struct{ msg string }
func (e *actionError) Error() string { return e.msg }
// --- Preservation Engine Tests ---
// SP-01: Safe mode activation.
func TestPreservation_SP01_SafeMode(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
err := pe.ActivateMode(ModeSafe, "quorum lost (3/6 offline)", "auto")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if pe.CurrentMode() != ModeSafe {
t.Errorf("expected SAFE, got %s", pe.CurrentMode())
}
activation := pe.Activation()
if activation == nil {
t.Fatal("expected activation details")
}
if !activation.AutoExit {
t.Error("safe mode should have auto-exit enabled")
}
// Should have executed safe mode actions.
if len(log.calls) == 0 {
t.Error("expected mode actions to be executed")
}
// First action should be disable_non_essential_services.
if log.calls[0].action != "disable_non_essential_services" {
t.Errorf("expected first action disable_non_essential_services, got %s", log.calls[0].action)
}
}
// SP-02: Lockdown mode activation.
func TestPreservation_SP02_LockdownMode(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
err := pe.ActivateMode(ModeLockdown, "binary tampering detected", "auto")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if pe.CurrentMode() != ModeLockdown {
t.Errorf("expected LOCKDOWN, got %s", pe.CurrentMode())
}
// Should have network isolation action.
foundIsolate := false
for _, c := range log.calls {
if c.action == "isolate_from_network" {
foundIsolate = true
}
}
if !foundIsolate {
t.Error("expected isolate_from_network in lockdown actions")
}
}
// SP-03: Apoptosis mode activation.
func TestPreservation_SP03_ApoptosisMode(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
err := pe.ActivateMode(ModeApoptosis, "rootkit detected", "architect:admin")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if pe.CurrentMode() != ModeApoptosis {
t.Errorf("expected APOPTOSIS, got %s", pe.CurrentMode())
}
// Should have graceful_shutdown action.
foundShutdown := false
for _, c := range log.calls {
if c.action == "graceful_shutdown" {
foundShutdown = true
}
}
if !foundShutdown {
t.Error("expected graceful_shutdown in apoptosis actions")
}
// Cannot deactivate apoptosis.
err = pe.DeactivateMode("architect:admin")
if err == nil {
t.Error("expected error deactivating apoptosis")
}
}
// SP-04: Invalid transition (downgrade).
func TestPreservation_SP04_InvalidTransition(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
pe.ActivateMode(ModeLockdown, "test", "auto")
// Can't downgrade from LOCKDOWN to SAFE.
err := pe.ActivateMode(ModeSafe, "test downgrade", "auto")
if err == nil {
t.Error("expected error on downgrade from LOCKDOWN to SAFE")
}
}
// SP-05: Escalation (SAFE → LOCKDOWN → APOPTOSIS).
func TestPreservation_SP05_Escalation(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
pe.ActivateMode(ModeSafe, "quorum lost", "auto")
if pe.CurrentMode() != ModeSafe {
t.Fatal("expected SAFE")
}
pe.ActivateMode(ModeLockdown, "compromise detected", "auto")
if pe.CurrentMode() != ModeLockdown {
t.Fatal("expected LOCKDOWN")
}
pe.ActivateMode(ModeApoptosis, "rootkit", "auto")
if pe.CurrentMode() != ModeApoptosis {
t.Fatal("expected APOPTOSIS")
}
}
// SP-06: Safe mode auto-exit.
func TestPreservation_SP06_AutoExit(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
pe.ActivateMode(ModeSafe, "test", "auto")
// Not yet time.
if pe.ShouldAutoExit() {
t.Error("should not auto-exit immediately")
}
// Fast-forward activation's auto_exit_at.
pe.mu.Lock()
pe.activation.AutoExitAt = time.Now().Add(-1 * time.Second)
pe.mu.Unlock()
if !pe.ShouldAutoExit() {
t.Error("should auto-exit after timer expired")
}
}
// SP-07: Manual deactivation of safe mode.
func TestPreservation_SP07_ManualDeactivate(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
pe.ActivateMode(ModeSafe, "test", "auto")
err := pe.DeactivateMode("architect:admin")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if pe.CurrentMode() != ModeNone {
t.Errorf("expected NONE, got %s", pe.CurrentMode())
}
}
// SP-08: Lockdown deactivation.
func TestPreservation_SP08_LockdownDeactivate(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
pe.ActivateMode(ModeLockdown, "test", "auto")
err := pe.DeactivateMode("architect:admin")
if err != nil {
t.Fatalf("lockdown deactivation should succeed: %v", err)
}
}
// SP-09: History audit log.
func TestPreservation_SP09_AuditHistory(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
pe.ActivateMode(ModeSafe, "test", "auto")
pe.DeactivateMode("admin")
history := pe.History()
if len(history) == 0 {
t.Error("expected audit history entries")
}
// Last entry should be deactivation.
last := history[len(history)-1]
if last.Action != "deactivated" {
t.Errorf("expected deactivated, got %s", last.Action)
}
}
// SP-10: Action failure in non-apoptosis mode aborts.
func TestPreservation_SP10_ActionFailure(t *testing.T) {
log := newModeActionLog()
log.failAction = "disable_non_essential_services"
pe := NewPreservationEngine(log.execute)
err := pe.ActivateMode(ModeSafe, "test", "auto")
if err == nil {
t.Error("expected error when safe mode action fails")
}
// Mode should not have changed due to failure.
if pe.CurrentMode() != ModeNone {
t.Errorf("expected NONE after failed activation, got %s", pe.CurrentMode())
}
}
// SP-10b: Action failure in apoptosis mode continues.
func TestPreservation_SP10b_ApoptosisActionFailure(t *testing.T) {
log := newModeActionLog()
log.failAction = "graceful_shutdown"
pe := NewPreservationEngine(log.execute)
// Apoptosis should continue despite action failures.
err := pe.ActivateMode(ModeApoptosis, "rootkit", "auto")
if err != nil {
t.Fatalf("apoptosis should not fail on action errors: %v", err)
}
if pe.CurrentMode() != ModeApoptosis {
t.Errorf("expected APOPTOSIS, got %s", pe.CurrentMode())
}
}
// Test ModeNone activation rejected.
func TestPreservation_ModeNoneRejected(t *testing.T) {
pe := NewPreservationEngine(func(_ EmergencyMode, _ string, _ map[string]interface{}) error { return nil })
err := pe.ActivateMode(ModeNone, "test", "auto")
if err == nil {
t.Error("expected error activating ModeNone")
}
}
// Test deactivate when already NONE.
func TestPreservation_DeactivateNone(t *testing.T) {
pe := NewPreservationEngine(func(_ EmergencyMode, _ string, _ map[string]interface{}) error { return nil })
err := pe.DeactivateMode("admin")
if err != nil {
t.Errorf("deactivating NONE should be no-op: %v", err)
}
}
// Test ShouldAutoExit when not in safe mode.
func TestPreservation_AutoExitNotSafe(t *testing.T) {
pe := NewPreservationEngine(func(_ EmergencyMode, _ string, _ map[string]interface{}) error { return nil })
if pe.ShouldAutoExit() {
t.Error("should not auto-exit when mode is NONE")
}
}
// --- Integrity Verifier Tests ---
// SP-04 (per spec): binary integrity check detects a hash mismatch.
func TestIntegrity_BinaryMismatch(t *testing.T) {
tmpDir := t.TempDir()
binPath := filepath.Join(tmpDir, "test-binary")
os.WriteFile(binPath, []byte("original content"), 0o644)
// Calculate correct hash.
h := sha256.Sum256([]byte("original content"))
correctHash := hex.EncodeToString(h[:])
iv := NewIntegrityVerifier([]byte("test-key"))
iv.RegisterBinary(binPath, correctHash)
// Verify (should pass).
report := iv.VerifyAll()
if report.Overall != IntegrityVerified {
t.Errorf("expected VERIFIED, got %s", report.Overall)
}
// Tamper with the binary.
os.WriteFile(binPath, []byte("tampered content"), 0o644)
// Verify (should fail).
report = iv.VerifyAll()
if report.Overall != IntegrityCompromised {
t.Errorf("expected COMPROMISED, got %s", report.Overall)
}
bs := report.Binaries[binPath]
if bs.Status != IntegrityCompromised {
t.Errorf("expected binary COMPROMISED, got %s", bs.Status)
}
}
// Binary not found.
func TestIntegrity_BinaryNotFound(t *testing.T) {
iv := NewIntegrityVerifier([]byte("test-key"))
iv.RegisterBinary("/nonexistent/binary", "abc123")
report := iv.VerifyAll()
bs := report.Binaries["/nonexistent/binary"]
if bs.Status != IntegrityUnknown {
t.Errorf("expected UNKNOWN for missing binary, got %s", bs.Status)
}
}
// Config HMAC computation.
func TestIntegrity_ConfigHMAC(t *testing.T) {
tmpDir := t.TempDir()
cfgPath := filepath.Join(tmpDir, "config.yaml")
os.WriteFile(cfgPath, []byte("server:\n port: 8080"), 0o644)
iv := NewIntegrityVerifier([]byte("hmac-key"))
iv.RegisterConfig(cfgPath)
report := iv.VerifyAll()
cs := report.Configs[cfgPath]
if !cs.Valid {
t.Errorf("expected valid config, got error: %s", cs.Error)
}
if cs.CurrentHMAC == "" {
t.Error("expected non-empty HMAC")
}
}
// Config file unreadable.
func TestIntegrity_ConfigUnreadable(t *testing.T) {
iv := NewIntegrityVerifier([]byte("key"))
iv.RegisterConfig("/nonexistent/config.yaml")
report := iv.VerifyAll()
cs := report.Configs["/nonexistent/config.yaml"]
if cs.Valid {
t.Error("expected invalid for unreadable config")
}
}
// Decision chain — file does not exist (OK, no chain yet).
func TestIntegrity_ChainNotExist(t *testing.T) {
iv := NewIntegrityVerifier([]byte("key"))
iv.SetChainPath("/nonexistent/decisions.log")
report := iv.VerifyAll()
if report.Chain == nil {
t.Fatal("expected chain status")
}
if !report.Chain.Valid {
t.Error("nonexistent chain should be valid (no entries)")
}
}
// Decision chain — file exists.
func TestIntegrity_ChainExists(t *testing.T) {
tmpDir := t.TempDir()
chainPath := filepath.Join(tmpDir, "decisions.log")
os.WriteFile(chainPath, []byte("entry1\nentry2\n"), 0o644)
iv := NewIntegrityVerifier([]byte("key"))
iv.SetChainPath(chainPath)
report := iv.VerifyAll()
if report.Chain == nil {
t.Fatal("expected chain status")
}
if !report.Chain.Valid {
t.Error("expected valid chain")
}
}
// LastReport.
func TestIntegrity_LastReport(t *testing.T) {
iv := NewIntegrityVerifier([]byte("key"))
if iv.LastReport() != nil {
t.Error("expected nil before first verify")
}
iv.VerifyAll()
if iv.LastReport() == nil {
t.Error("expected report after verify")
}
}
// Pluggable integrity check in PreservationEngine.
func TestPreservation_IntegrityCheck(t *testing.T) {
pe := NewPreservationEngine(func(_ EmergencyMode, _ string, _ map[string]interface{}) error { return nil })
// Default: no integrity fn → VERIFIED.
report := pe.CheckIntegrity()
if report.Overall != IntegrityVerified {
t.Errorf("expected VERIFIED, got %s", report.Overall)
}
// Set custom checker.
pe.SetIntegrityCheck(func() IntegrityReport {
return IntegrityReport{Overall: IntegrityCompromised, Timestamp: time.Now()}
})
report = pe.CheckIntegrity()
if report.Overall != IntegrityCompromised {
t.Errorf("expected COMPROMISED from custom checker, got %s", report.Overall)
}
}

View file

@@ -0,0 +1,398 @@
package resilience
import (
"context"
"fmt"
"log/slog"
"sync"
"time"
)
// PlaybookStatus tracks the state of a running playbook.
type PlaybookStatus string
const (
PlaybookPending PlaybookStatus = "PENDING"
PlaybookRunning PlaybookStatus = "RUNNING"
PlaybookSucceeded PlaybookStatus = "SUCCEEDED"
PlaybookFailed PlaybookStatus = "FAILED"
PlaybookRolledBack PlaybookStatus = "ROLLED_BACK"
)
// PlaybookStep is a single step in a recovery playbook.
type PlaybookStep struct {
ID string `json:"id"`
Name string `json:"name"`
Type string `json:"type"` // shell, api, consensus, crypto, systemd, http, prometheus
Timeout time.Duration `json:"timeout"`
Retries int `json:"retries"`
Params map[string]interface{} `json:"params,omitempty"`
OnError string `json:"on_error"` // abort, continue, rollback
Condition string `json:"condition,omitempty"` // prerequisite condition
}
// Playbook defines a complete recovery procedure.
type Playbook struct {
ID string `json:"id"`
Name string `json:"name"`
Version string `json:"version"`
TriggerMetric string `json:"trigger_metric"`
TriggerSeverity string `json:"trigger_severity"`
DiagnosisChecks []PlaybookStep `json:"diagnosis_checks"`
Actions []PlaybookStep `json:"actions"`
RollbackActions []PlaybookStep `json:"rollback_actions"`
SuccessCriteria []string `json:"success_criteria"`
}
// PlaybookExecution tracks a single playbook run.
type PlaybookExecution struct {
ID string `json:"id"`
PlaybookID string `json:"playbook_id"`
Component string `json:"component"`
Status PlaybookStatus `json:"status"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at,omitempty"`
StepsRun []StepResult `json:"steps_run"`
Error string `json:"error,omitempty"`
}
// StepResult records the execution of a single playbook step.
type StepResult struct {
StepID string `json:"step_id"`
StepName string `json:"step_name"`
Success bool `json:"success"`
Duration time.Duration `json:"duration"`
Output string `json:"output,omitempty"`
Error string `json:"error,omitempty"`
}
// PlaybookExecutorFunc runs a single playbook step.
type PlaybookExecutorFunc func(ctx context.Context, step PlaybookStep, component string) (string, error)
// RecoveryPlaybookEngine manages and executes recovery playbooks.
type RecoveryPlaybookEngine struct {
mu sync.RWMutex
playbooks map[string]*Playbook
executions []*PlaybookExecution
execCount int64
executor PlaybookExecutorFunc
logger *slog.Logger
}
// NewRecoveryPlaybookEngine creates a new playbook engine.
func NewRecoveryPlaybookEngine(executor PlaybookExecutorFunc) *RecoveryPlaybookEngine {
return &RecoveryPlaybookEngine{
playbooks: make(map[string]*Playbook),
executions: make([]*PlaybookExecution, 0),
executor: executor,
logger: slog.Default().With("component", "sarl-recovery-playbooks"),
}
}
// RegisterPlaybook adds a playbook to the engine.
func (rpe *RecoveryPlaybookEngine) RegisterPlaybook(pb Playbook) {
rpe.mu.Lock()
defer rpe.mu.Unlock()
rpe.playbooks[pb.ID] = &pb
rpe.logger.Info("playbook registered", "id", pb.ID, "name", pb.Name)
}
// Execute runs a playbook for a given component. Returns the execution ID.
func (rpe *RecoveryPlaybookEngine) Execute(ctx context.Context, playbookID, component string) (string, error) {
rpe.mu.Lock()
pb, ok := rpe.playbooks[playbookID]
if !ok {
rpe.mu.Unlock()
return "", fmt.Errorf("playbook %s not found", playbookID)
}
rpe.execCount++
exec := &PlaybookExecution{
ID: fmt.Sprintf("exec-%d", rpe.execCount),
PlaybookID: playbookID,
Component: component,
Status: PlaybookRunning,
StartedAt: time.Now(),
StepsRun: make([]StepResult, 0),
}
rpe.executions = append(rpe.executions, exec)
rpe.mu.Unlock()
rpe.logger.Info("playbook execution started",
"exec_id", exec.ID,
"playbook", pb.Name,
"component", component,
)
// Phase 1: Diagnosis checks.
for _, check := range pb.DiagnosisChecks {
result := rpe.runStep(ctx, check, component)
exec.StepsRun = append(exec.StepsRun, result)
if !result.Success {
rpe.logger.Warn("diagnosis check failed",
"step", check.ID,
"error", result.Error,
)
}
}
// Phase 2: Execute recovery actions.
var execErr error
for _, action := range pb.Actions {
result := rpe.runStep(ctx, action, component)
exec.StepsRun = append(exec.StepsRun, result)
if !result.Success {
switch action.OnError {
case "continue":
continue
case "rollback":
execErr = fmt.Errorf("step %s failed (rollback): %s", action.ID, result.Error)
default: // "abort"
execErr = fmt.Errorf("step %s failed: %s", action.ID, result.Error)
}
break
}
}
// Phase 3: Handle result.
if execErr != nil {
rpe.logger.Error("playbook failed, executing rollback",
"exec_id", exec.ID,
"error", execErr,
)
// Execute rollback.
for _, rb := range pb.RollbackActions {
result := rpe.runStep(ctx, rb, component)
exec.StepsRun = append(exec.StepsRun, result)
}
exec.Status = PlaybookRolledBack
exec.Error = execErr.Error()
} else {
exec.Status = PlaybookSucceeded
rpe.logger.Info("playbook succeeded",
"exec_id", exec.ID,
"component", component,
"duration", time.Since(exec.StartedAt),
)
}
exec.CompletedAt = time.Now()
return exec.ID, execErr
}
// runStep executes a single step with timeout and retries. step.Retries is
// treated as the total number of attempts (minimum 1).
func (rpe *RecoveryPlaybookEngine) runStep(ctx context.Context, step PlaybookStep, component string) StepResult {
start := time.Now()
result := StepResult{
StepID: step.ID,
StepName: step.Name,
}
retries := step.Retries
if retries <= 0 {
retries = 1
}
var lastErr error
for attempt := 0; attempt < retries; attempt++ {
stepCtx := ctx
var cancel context.CancelFunc
if step.Timeout > 0 {
stepCtx, cancel = context.WithTimeout(ctx, step.Timeout)
}
output, err := rpe.executor(stepCtx, step, component)
if cancel != nil {
cancel()
}
if err == nil {
result.Success = true
result.Output = output
result.Duration = time.Since(start)
return result
}
lastErr = err
if attempt < retries-1 {
rpe.logger.Warn("step retry",
"step", step.ID,
"attempt", attempt+1,
"error", err,
)
}
}
result.Success = false
result.Error = lastErr.Error()
result.Duration = time.Since(start)
return result
}
// GetExecution returns a playbook execution by ID.
// Returns a deep copy to prevent data races with the execution goroutine.
func (rpe *RecoveryPlaybookEngine) GetExecution(id string) (*PlaybookExecution, bool) {
rpe.mu.RLock()
defer rpe.mu.RUnlock()
for _, exec := range rpe.executions {
if exec.ID == id {
cp := *exec
cp.StepsRun = make([]StepResult, len(exec.StepsRun))
copy(cp.StepsRun, exec.StepsRun)
return &cp, true
}
}
return nil, false
}
// RecentExecutions returns the last N executions.
// Returns deep copies to prevent data races with the execution goroutine.
func (rpe *RecoveryPlaybookEngine) RecentExecutions(n int) []PlaybookExecution {
rpe.mu.RLock()
defer rpe.mu.RUnlock()
total := len(rpe.executions)
if total == 0 {
return nil
}
start := total - n
if start < 0 {
start = 0
}
result := make([]PlaybookExecution, 0, n)
for i := start; i < total; i++ {
cp := *rpe.executions[i]
cp.StepsRun = make([]StepResult, len(rpe.executions[i].StepsRun))
copy(cp.StepsRun, rpe.executions[i].StepsRun)
result = append(result, cp)
}
return result
}
// PlaybookCount returns the number of registered playbooks.
func (rpe *RecoveryPlaybookEngine) PlaybookCount() int {
rpe.mu.RLock()
defer rpe.mu.RUnlock()
return len(rpe.playbooks)
}
// --- Built-in playbooks per spec §7.1 ---
// DefaultPlaybooks returns the 3 built-in recovery playbooks.
func DefaultPlaybooks() []Playbook {
return []Playbook{
ComponentResurrectionPlaybook(),
ConsensusRecoveryPlaybook(),
CryptoRotationPlaybook(),
}
}
// ComponentResurrectionPlaybook per spec §7.1.1.
func ComponentResurrectionPlaybook() Playbook {
return Playbook{
ID: "component-resurrection",
Name: "Component Resurrection",
Version: "1.0",
TriggerMetric: "component_offline",
TriggerSeverity: "CRITICAL",
DiagnosisChecks: []PlaybookStep{
{ID: "diag-process", Name: "Check process exists", Type: "shell", Timeout: 5 * time.Second},
{ID: "diag-crashes", Name: "Check recent crashes", Type: "shell", Timeout: 5 * time.Second},
{ID: "diag-resources", Name: "Check resource exhaustion", Type: "prometheus", Timeout: 5 * time.Second},
{ID: "diag-deps", Name: "Check dependency health", Type: "http", Timeout: 10 * time.Second},
},
Actions: []PlaybookStep{
{ID: "capture-forensics", Name: "Capture forensics", Type: "shell", Timeout: 30 * time.Second, OnError: "continue"},
{ID: "clear-resources", Name: "Clear temp resources", Type: "shell", Timeout: 10 * time.Second, OnError: "continue"},
{ID: "restart-component", Name: "Restart component", Type: "systemd", Timeout: 60 * time.Second, OnError: "abort"},
{ID: "verify-health", Name: "Verify health", Type: "http", Timeout: 30 * time.Second, Retries: 3, OnError: "abort"},
{ID: "verify-metrics", Name: "Verify metrics", Type: "prometheus", Timeout: 30 * time.Second, OnError: "continue"},
{ID: "notify-success", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
},
RollbackActions: []PlaybookStep{
{ID: "rb-safe-mode", Name: "Enter safe mode", Type: "api", Timeout: 10 * time.Second},
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
},
SuccessCriteria: []string{
"component_status == HEALTHY",
"health_check_passed == true",
"no_crashes_for_5min == true",
},
}
}
// ConsensusRecoveryPlaybook per spec §7.1.2.
func ConsensusRecoveryPlaybook() Playbook {
return Playbook{
ID: "consensus-recovery",
Name: "Distributed Consensus Recovery",
Version: "1.0",
TriggerMetric: "split_brain",
TriggerSeverity: "CRITICAL",
DiagnosisChecks: []PlaybookStep{
{ID: "diag-peers", Name: "Check peer connectivity", Type: "api", Timeout: 10 * time.Second},
{ID: "diag-sync", Name: "Check sync status", Type: "api", Timeout: 10 * time.Second},
{ID: "diag-genome", Name: "Verify genome", Type: "api", Timeout: 5 * time.Second},
},
Actions: []PlaybookStep{
{ID: "pause-writes", Name: "Pause all writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
{ID: "elect-leader", Name: "Elect leader (Raft)", Type: "consensus", Timeout: 60 * time.Second, OnError: "abort"},
{ID: "sync-state", Name: "Sync state from leader", Type: "api", Timeout: 300 * time.Second, OnError: "rollback"},
{ID: "verify-consistency", Name: "Verify consistency", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
{ID: "resume-writes", Name: "Resume writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
{ID: "notify-cluster", Name: "Notify cluster", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
},
RollbackActions: []PlaybookStep{
{ID: "rb-readonly", Name: "Maintain readonly", Type: "api", Timeout: 10 * time.Second},
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
},
SuccessCriteria: []string{
"leader_elected == true",
"state_synced == true",
"consistency_verified == true",
"writes_resumed == true",
},
}
}
// CryptoRotationPlaybook per spec §7.1.3.
func CryptoRotationPlaybook() Playbook {
return Playbook{
ID: "crypto-rotation",
Name: "Cryptographic Key Rotation",
Version: "1.0",
TriggerMetric: "key_compromise",
TriggerSeverity: "HIGH",
DiagnosisChecks: []PlaybookStep{
{ID: "diag-key-age", Name: "Check key age", Type: "crypto", Timeout: 5 * time.Second},
{ID: "diag-usage", Name: "Check key usage anomaly", Type: "prometheus", Timeout: 5 * time.Second},
{ID: "diag-tpm", Name: "Check TPM health", Type: "shell", Timeout: 5 * time.Second},
},
Actions: []PlaybookStep{
{ID: "gen-keys", Name: "Generate new keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "abort",
Params: map[string]interface{}{"algorithm": "ECDSA-P256"},
},
{ID: "rotate-certs", Name: "Rotate mTLS certs", Type: "crypto", Timeout: 120 * time.Second, OnError: "rollback"},
{ID: "resign-chain", Name: "Re-sign decision chain", Type: "crypto", Timeout: 300 * time.Second, OnError: "continue"},
{ID: "verify-peers", Name: "Verify peer certs", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
{ID: "revoke-old", Name: "Revoke old keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "continue"},
{ID: "notify-soc", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
},
RollbackActions: []PlaybookStep{
{ID: "rb-revert-keys", Name: "Revert to previous keys", Type: "crypto", Timeout: 30 * time.Second},
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
},
SuccessCriteria: []string{
"new_keys_generated == true",
"certs_distributed == true",
"peers_verified == true",
"old_keys_revoked == true",
},
}
}
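
// playbookWiringSketch is a minimal, illustrative sketch (it is not called
// anywhere) of how the engine is constructed with an executor, loaded with the
// built-in playbooks, and run against a failed component. The executor body is
// a stub; a real one would dispatch on step.Type (shell, systemd, http, ...).
func playbookWiringSketch(ctx context.Context) error {
	rpe := NewRecoveryPlaybookEngine(func(ctx context.Context, step PlaybookStep, component string) (string, error) {
		return "ok", nil // stub: pretend every step succeeds
	})
	for _, pb := range DefaultPlaybooks() {
		rpe.RegisterPlaybook(pb)
	}
	execID, err := rpe.Execute(ctx, "component-resurrection", "soc-ingest")
	if err != nil {
		return err
	}
	if exec, ok := rpe.GetExecution(execID); !ok || exec.Status != PlaybookSucceeded {
		return fmt.Errorf("recovery %s did not succeed", execID)
	}
	return nil
}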

View file

@@ -0,0 +1,318 @@
package resilience
import (
"context"
"fmt"
"testing"
"time"
)
// --- Mock playbook executor ---
type mockPlaybookExecutor struct {
failSteps map[string]bool
callCount int
}
func newMockPlaybookExecutor() *mockPlaybookExecutor {
return &mockPlaybookExecutor{failSteps: make(map[string]bool)}
}
func (m *mockPlaybookExecutor) execute(_ context.Context, step PlaybookStep, _ string) (string, error) {
m.callCount++
if m.failSteps[step.ID] {
return "", fmt.Errorf("step %s failed", step.ID)
}
return fmt.Sprintf("step %s completed", step.ID), nil
}
// --- Recovery Playbook Tests ---
// AR-01: Component resurrection (success).
func TestPlaybook_AR01_ResurrectionSuccess(t *testing.T) {
mock := newMockPlaybookExecutor()
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
execID, err := rpe.Execute(context.Background(), "component-resurrection", "soc-ingest")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
exec, ok := rpe.GetExecution(execID)
if !ok {
t.Fatal("execution not found")
}
if exec.Status != PlaybookSucceeded {
t.Errorf("expected SUCCEEDED, got %s", exec.Status)
}
if len(exec.StepsRun) == 0 {
t.Error("expected steps to be recorded")
}
}

// AR-02: Component resurrection (failure → rollback).
func TestPlaybook_AR02_ResurrectionFailure(t *testing.T) {
mock := newMockPlaybookExecutor()
mock.failSteps["restart-component"] = true
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
_, err := rpe.Execute(context.Background(), "component-resurrection", "soc-ingest")
if err == nil {
t.Fatal("expected error")
}
execs := rpe.RecentExecutions(10)
if len(execs) == 0 {
t.Fatal("expected execution")
}
if execs[0].Status != PlaybookRolledBack {
t.Errorf("expected ROLLED_BACK, got %s", execs[0].Status)
}
}

// AR-03: Consensus recovery (success).
func TestPlaybook_AR03_ConsensusSuccess(t *testing.T) {
mock := newMockPlaybookExecutor()
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ConsensusRecoveryPlaybook())
_, err := rpe.Execute(context.Background(), "consensus-recovery", "cluster")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
}

// AR-04: Consensus recovery (failure → readonly maintained).
func TestPlaybook_AR04_ConsensusFailure(t *testing.T) {
mock := newMockPlaybookExecutor()
mock.failSteps["elect-leader"] = true
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ConsensusRecoveryPlaybook())
_, err := rpe.Execute(context.Background(), "consensus-recovery", "cluster")
if err == nil {
t.Fatal("expected error")
}
execs := rpe.RecentExecutions(10)
if execs[0].Status != PlaybookRolledBack {
t.Errorf("expected ROLLED_BACK, got %s", execs[0].Status)
}
}

// AR-05: Crypto key rotation (success).
func TestPlaybook_AR05_CryptoSuccess(t *testing.T) {
mock := newMockPlaybookExecutor()
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(CryptoRotationPlaybook())
_, err := rpe.Execute(context.Background(), "crypto-rotation", "system")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
}

// AR-06: Crypto rotation (emergency — cert rotation fails → rollback).
func TestPlaybook_AR06_CryptoRollback(t *testing.T) {
mock := newMockPlaybookExecutor()
mock.failSteps["rotate-certs"] = true
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(CryptoRotationPlaybook())
_, err := rpe.Execute(context.Background(), "crypto-rotation", "system")
if err == nil {
t.Fatal("expected error on cert rotation failure")
}
execs := rpe.RecentExecutions(10)
// Should have run rollback (revert keys).
found := false
for _, s := range execs[0].StepsRun {
if s.StepID == "rb-revert-keys" {
found = true
}
}
if !found {
t.Error("expected rollback step rb-revert-keys")
}
}

// AR-07: Forensic capture (all steps recorded).
func TestPlaybook_AR07_ForensicCapture(t *testing.T) {
mock := newMockPlaybookExecutor()
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
execID, _ := rpe.Execute(context.Background(), "component-resurrection", "comp")
exec, _ := rpe.GetExecution(execID)
for _, step := range exec.StepsRun {
if step.StepID == "" {
t.Error("step missing ID")
}
if step.StepName == "" {
t.Errorf("step %s has empty name", step.StepID)
}
}
}

// AR-08: Rollback execution on action failure.
func TestPlaybook_AR08_RollbackExecution(t *testing.T) {
mock := newMockPlaybookExecutor()
mock.failSteps["sync-state"] = true // Sync fails → rollback trigger.
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ConsensusRecoveryPlaybook())
rpe.Execute(context.Background(), "consensus-recovery", "cluster")
execs := rpe.RecentExecutions(10)
if execs[0].Status != PlaybookRolledBack {
t.Errorf("expected ROLLED_BACK, got %s", execs[0].Status)
}
}

// AR-09: Step retries.
func TestPlaybook_AR09_StepRetries(t *testing.T) {
verifyAttempts := 0
executor := func(_ context.Context, step PlaybookStep, _ string) (string, error) {
// Fail the first two verify-health attempts so the playbook only
// succeeds if the engine retries that step.
if step.ID == "verify-health" {
verifyAttempts++
if verifyAttempts <= 2 {
return "", fmt.Errorf("not healthy yet")
}
}
return "ok", nil
}
rpe := NewRecoveryPlaybookEngine(executor)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
_, err := rpe.Execute(context.Background(), "component-resurrection", "comp")
if err != nil {
t.Fatalf("expected success after retries: %v", err)
}
}

// AR-10: Playbook not found.
func TestPlaybook_AR10_NotFound(t *testing.T) {
rpe := NewRecoveryPlaybookEngine(nil)
_, err := rpe.Execute(context.Background(), "nonexistent", "comp")
if err == nil {
t.Fatal("expected error for nonexistent playbook")
}
}

// AR-11: Audit logging (all step timestamps).
func TestPlaybook_AR11_AuditTimestamps(t *testing.T) {
mock := newMockPlaybookExecutor()
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
execID, _ := rpe.Execute(context.Background(), "component-resurrection", "comp")
exec, _ := rpe.GetExecution(execID)
if exec.StartedAt.IsZero() {
t.Error("missing started_at")
}
if exec.CompletedAt.IsZero() {
t.Error("missing completed_at")
}
}

// AR-12: OnError=continue skips non-critical failures.
func TestPlaybook_AR12_ContinueOnError(t *testing.T) {
mock := newMockPlaybookExecutor()
mock.failSteps["capture-forensics"] = true // OnError=continue.
mock.failSteps["notify-success"] = true // OnError=continue.
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
_, err := rpe.Execute(context.Background(), "component-resurrection", "comp")
if err != nil {
t.Fatalf("expected success despite continue-on-error steps: %v", err)
}
}

// AR-13: Context cancellation.
func TestPlaybook_AR13_ContextCancel(t *testing.T) {
executor := func(ctx context.Context, _ PlaybookStep, _ string) (string, error) {
select {
case <-ctx.Done():
return "", ctx.Err()
case <-time.After(10 * time.Millisecond):
return "ok", nil
}
}
rpe := NewRecoveryPlaybookEngine(executor)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
ctx, cancel := context.WithCancel(context.Background())
cancel() // Cancel immediately.
_, err := rpe.Execute(ctx, "component-resurrection", "comp")
// May or may not error depending on timing, but should not hang.
_ = err
}

// AR-14: DefaultPlaybooks returns 3.
func TestPlaybook_AR14_DefaultPlaybooks(t *testing.T) {
pbs := DefaultPlaybooks()
if len(pbs) != 3 {
t.Errorf("expected 3 playbooks, got %d", len(pbs))
}
ids := map[string]bool{}
for _, pb := range pbs {
if ids[pb.ID] {
t.Errorf("duplicate playbook ID: %s", pb.ID)
}
ids[pb.ID] = true
if len(pb.Actions) == 0 {
t.Errorf("playbook %s has no actions", pb.ID)
}
if len(pb.SuccessCriteria) == 0 {
t.Errorf("playbook %s has no success criteria", pb.ID)
}
}
}

// AR-15: PlaybookCount and RecentExecutions.
func TestPlaybook_AR15_CountsAndRecent(t *testing.T) {
mock := newMockPlaybookExecutor()
rpe := NewRecoveryPlaybookEngine(mock.execute)
if rpe.PlaybookCount() != 0 {
t.Error("expected 0")
}
for _, pb := range DefaultPlaybooks() {
rpe.RegisterPlaybook(pb)
}
if rpe.PlaybookCount() != 3 {
t.Errorf("expected 3, got %d", rpe.PlaybookCount())
}
// Run two playbooks.
rpe.Execute(context.Background(), "component-resurrection", "comp1")
rpe.Execute(context.Background(), "crypto-rotation", "comp2")
recent := rpe.RecentExecutions(1)
if len(recent) != 1 {
t.Errorf("expected 1 recent, got %d", len(recent))
}
if recent[0].PlaybookID != "crypto-rotation" {
t.Errorf("expected crypto-rotation, got %s", recent[0].PlaybookID)
}
all := rpe.RecentExecutions(100)
if len(all) != 2 {
t.Errorf("expected 2 total, got %d", len(all))
}
}
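
As a follow-up to the audit-focused tests (AR-07, AR-11), a small helper could
flatten a finished execution into a single log line. This is a sketch that uses
only fields the tests above read (PlaybookID, Status, StartedAt, CompletedAt,
StepsRun with StepID); the receiver type name RecoveryPlaybookEngine is
inferred from its constructor and may differ in the actual code, so treat both
the name and the helper as hypothetical.

package resilience

import (
	"log/slog"
	"strings"
)

// logRecentExecutions emits one summary line per recent execution. Hypothetical
// helper; the engine type name is inferred from NewRecoveryPlaybookEngine.
func logRecentExecutions(rpe *RecoveryPlaybookEngine, n int) {
	for _, exec := range rpe.RecentExecutions(n) {
		ids := make([]string, 0, len(exec.StepsRun))
		for _, s := range exec.StepsRun {
			ids = append(ids, s.StepID)
		}
		slog.Info("playbook execution",
			"playbook", exec.PlaybookID,
			"status", exec.Status,
			"duration", exec.CompletedAt.Sub(exec.StartedAt).String(),
			"steps", strings.Join(ids, ","))
	}
}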