Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates

This commit is contained in:
DmitrL-dev 2026-03-23 16:45:40 +10:00
parent 694e32be26
commit 41cbfd6e0a
178 changed files with 36008 additions and 399 deletions


@@ -0,0 +1,165 @@
package resilience
import (
"context"
"log/slog"
"runtime"
"sync"
"time"
)
// BehaviorProfile captures the runtime behavior of a component.
type BehaviorProfile struct {
Goroutines int `json:"goroutines"`
HeapAllocMB float64 `json:"heap_alloc_mb"`
HeapObjectsK float64 `json:"heap_objects_k"`
GCPauseMs float64 `json:"gc_pause_ms"`
NumGC uint32 `json:"num_gc"`
FileDescriptors int `json:"file_descriptors,omitempty"`
CustomMetrics map[string]float64 `json:"custom_metrics,omitempty"`
}
// BehavioralAlert is emitted when a behavioral anomaly is detected.
type BehavioralAlert struct {
Component string `json:"component"`
AnomalyType string `json:"anomaly_type"` // goroutine_leak, memory_leak, gc_pressure, etc.
Metric string `json:"metric"`
Current float64 `json:"current"`
Baseline float64 `json:"baseline"`
ZScore float64 `json:"z_score"`
Severity string `json:"severity"`
Timestamp time.Time `json:"timestamp"`
}
// BehavioralAnalyzer provides Go-side runtime behavioral analysis.
// It profiles the current process and compares against learned baselines.
// On Linux, eBPF hooks (immune/resilience_hooks.c) extend this to the kernel level.
type BehavioralAnalyzer struct {
mu sync.RWMutex
metricsDB *MetricsDB
alertBus chan BehavioralAlert
interval time.Duration
component string // self component name
logger *slog.Logger
}
// NewBehavioralAnalyzer creates a new behavioral analyzer.
func NewBehavioralAnalyzer(component string, alertBufSize int) *BehavioralAnalyzer {
if alertBufSize <= 0 {
alertBufSize = 50
}
return &BehavioralAnalyzer{
metricsDB: NewMetricsDB(DefaultMetricsWindow, DefaultMetricsMaxSize),
alertBus: make(chan BehavioralAlert, alertBufSize),
interval: 1 * time.Minute,
component: component,
logger: slog.Default().With("component", "sarl-behavioral"),
}
}
// AlertBus returns the channel for consuming behavioral alerts.
func (ba *BehavioralAnalyzer) AlertBus() <-chan BehavioralAlert {
return ba.alertBus
}
// Start begins continuous behavioral monitoring. Blocks until ctx is cancelled.
func (ba *BehavioralAnalyzer) Start(ctx context.Context) {
ba.logger.Info("behavioral analyzer started", "interval", ba.interval)
ticker := time.NewTicker(ba.interval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
ba.logger.Info("behavioral analyzer stopped")
return
case <-ticker.C:
ba.collectAndAnalyze()
}
}
}
// collectAndAnalyze profiles runtime and checks for anomalies.
func (ba *BehavioralAnalyzer) collectAndAnalyze() {
profile := ba.collectProfile()
ba.storeMetrics(profile)
ba.detectAnomalies(profile)
}
// collectProfile gathers current Go runtime stats.
func (ba *BehavioralAnalyzer) collectProfile() BehaviorProfile {
var mem runtime.MemStats
runtime.ReadMemStats(&mem)
return BehaviorProfile{
Goroutines: runtime.NumGoroutine(),
HeapAllocMB: float64(mem.HeapAlloc) / (1024 * 1024),
HeapObjectsK: float64(mem.HeapObjects) / 1000,
GCPauseMs: float64(mem.PauseNs[(mem.NumGC+255)%256]) / 1e6,
NumGC: mem.NumGC,
}
}
// storeMetrics records profile data in the time-series DB.
func (ba *BehavioralAnalyzer) storeMetrics(p BehaviorProfile) {
ba.metricsDB.AddDataPoint(ba.component, "goroutines", float64(p.Goroutines))
ba.metricsDB.AddDataPoint(ba.component, "heap_alloc_mb", p.HeapAllocMB)
ba.metricsDB.AddDataPoint(ba.component, "heap_objects_k", p.HeapObjectsK)
ba.metricsDB.AddDataPoint(ba.component, "gc_pause_ms", p.GCPauseMs)
}
// detectAnomalies checks each metric against its baseline via Z-score.
func (ba *BehavioralAnalyzer) detectAnomalies(p BehaviorProfile) {
checks := []struct {
metric string
value float64
anomalyType string
severity string
}{
{"goroutines", float64(p.Goroutines), "goroutine_leak", "WARNING"},
{"heap_alloc_mb", p.HeapAllocMB, "memory_leak", "CRITICAL"},
{"heap_objects_k", p.HeapObjectsK, "object_leak", "WARNING"},
{"gc_pause_ms", p.GCPauseMs, "gc_pressure", "WARNING"},
}
for _, c := range checks {
baseline := ba.metricsDB.GetBaseline(ba.component, c.metric, DefaultMetricsWindow)
if !IsAnomaly(c.value, baseline, AnomalyZScoreThreshold) {
continue
}
zscore := CalculateZScore(c.value, baseline)
alert := BehavioralAlert{
Component: ba.component,
AnomalyType: c.anomalyType,
Metric: c.metric,
Current: c.value,
Baseline: baseline.Mean,
ZScore: zscore,
Severity: c.severity,
Timestamp: time.Now(),
}
select {
case ba.alertBus <- alert:
ba.logger.Warn("behavioral anomaly detected",
"type", c.anomalyType,
"metric", c.metric,
"z_score", zscore,
)
default:
ba.logger.Error("behavioral alert bus full")
}
}
}
// InjectMetric allows manually injecting a metric for testing.
func (ba *BehavioralAnalyzer) InjectMetric(metric string, value float64) {
ba.metricsDB.AddDataPoint(ba.component, metric, value)
}
// CurrentProfile returns a snapshot of the current runtime profile.
func (ba *BehavioralAnalyzer) CurrentProfile() BehaviorProfile {
return ba.collectProfile()
}
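
Not part of the commit: a minimal usage sketch, assuming it sits in the same resilience package, showing how a host service might run the analyzer and drain its alert bus. The component name and timeout are illustrative.

package resilience

import (
    "context"
    "log/slog"
    "time"
)

// RunBehavioralAnalyzerSketch is an illustrative wiring of the analyzer: it
// starts the monitoring loop in its own goroutine and logs every alert until
// the context expires. Not part of the original commit.
func RunBehavioralAnalyzerSketch() {
    ba := NewBehavioralAnalyzer("example-component", 50)

    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
    defer cancel()

    // Start blocks, so run it concurrently.
    go ba.Start(ctx)

    // Drain alerts until the context is cancelled.
    for {
        select {
        case <-ctx.Done():
            return
        case alert := <-ba.AlertBus():
            slog.Warn("runtime anomaly",
                "component", alert.Component,
                "type", alert.AnomalyType,
                "metric", alert.Metric,
                "z_score", alert.ZScore,
            )
        }
    }
}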


@@ -0,0 +1,206 @@
package resilience
import (
"context"
"testing"
"time"
)
// IM-01: Goroutine leak detection.
func TestBehavioral_IM01_GoroutineLeak(t *testing.T) {
ba := NewBehavioralAnalyzer("soc-ingest", 10)
// Build baseline of 10 goroutines.
for i := 0; i < 50; i++ {
ba.InjectMetric("goroutines", 10)
}
// Spike to 1000 goroutines — should trigger anomaly.
ba.metricsDB.AddDataPoint("soc-ingest", "goroutines", 1000)
profile := BehaviorProfile{Goroutines: 1000}
ba.detectAnomalies(profile)
select {
case alert := <-ba.alertBus:
if alert.AnomalyType != "goroutine_leak" {
t.Errorf("expected goroutine_leak, got %s", alert.AnomalyType)
}
if alert.ZScore <= 3 {
t.Errorf("expected Z > 3, got %f", alert.ZScore)
}
default:
t.Error("expected goroutine leak alert")
}
}
// IM-02: Memory leak detection.
func TestBehavioral_IM02_MemoryLeak(t *testing.T) {
ba := NewBehavioralAnalyzer("soc-correlate", 10)
// Baseline: 50 MB.
for i := 0; i < 50; i++ {
ba.InjectMetric("heap_alloc_mb", 50)
}
// Spike to 500 MB.
ba.metricsDB.AddDataPoint("soc-correlate", "heap_alloc_mb", 500)
profile := BehaviorProfile{HeapAllocMB: 500}
ba.detectAnomalies(profile)
select {
case alert := <-ba.alertBus:
if alert.AnomalyType != "memory_leak" {
t.Errorf("expected memory_leak, got %s", alert.AnomalyType)
}
if alert.Severity != "CRITICAL" {
t.Errorf("expected CRITICAL severity, got %s", alert.Severity)
}
default:
t.Error("expected memory leak alert")
}
}
// IM-03: GC pressure detection.
func TestBehavioral_IM03_GCPressure(t *testing.T) {
ba := NewBehavioralAnalyzer("soc-respond", 10)
// Baseline: 1ms GC pause.
for i := 0; i < 50; i++ {
ba.InjectMetric("gc_pause_ms", 1)
}
// Spike to 100ms.
ba.metricsDB.AddDataPoint("soc-respond", "gc_pause_ms", 100)
profile := BehaviorProfile{GCPauseMs: 100}
ba.detectAnomalies(profile)
select {
case alert := <-ba.alertBus:
if alert.AnomalyType != "gc_pressure" {
t.Errorf("expected gc_pressure, got %s", alert.AnomalyType)
}
default:
t.Error("expected gc_pressure alert")
}
}
// IM-04: Object leak detection.
func TestBehavioral_IM04_ObjectLeak(t *testing.T) {
ba := NewBehavioralAnalyzer("shield", 10)
for i := 0; i < 50; i++ {
ba.InjectMetric("heap_objects_k", 100)
}
ba.metricsDB.AddDataPoint("shield", "heap_objects_k", 5000)
profile := BehaviorProfile{HeapObjectsK: 5000}
ba.detectAnomalies(profile)
select {
case alert := <-ba.alertBus:
if alert.AnomalyType != "object_leak" {
t.Errorf("expected object_leak, got %s", alert.AnomalyType)
}
default:
t.Error("expected object leak alert")
}
}
// IM-05: Normal behavior — no alerts.
func TestBehavioral_IM05_NormalBehavior(t *testing.T) {
ba := NewBehavioralAnalyzer("sidecar", 10)
for i := 0; i < 50; i++ {
ba.InjectMetric("goroutines", 10)
ba.InjectMetric("heap_alloc_mb", 50)
ba.InjectMetric("heap_objects_k", 100)
ba.InjectMetric("gc_pause_ms", 1)
}
profile := BehaviorProfile{
Goroutines: 10,
HeapAllocMB: 50,
HeapObjectsK: 100,
GCPauseMs: 1,
}
ba.detectAnomalies(profile)
select {
case alert := <-ba.alertBus:
t.Errorf("expected no alerts for normal behavior, got %+v", alert)
default:
// Good — no alerts.
}
}
// IM-06: Start/Stop lifecycle.
func TestBehavioral_IM06_StartStop(t *testing.T) {
ba := NewBehavioralAnalyzer("test", 10)
ba.interval = 50 * time.Millisecond
ctx, cancel := context.WithCancel(context.Background())
done := make(chan struct{})
go func() {
ba.Start(ctx)
close(done)
}()
time.Sleep(100 * time.Millisecond)
cancel()
select {
case <-done:
case <-time.After(time.Second):
t.Fatal("Start() did not return after context cancellation")
}
}
// IM-07: CurrentProfile returns valid data.
func TestBehavioral_IM07_CurrentProfile(t *testing.T) {
ba := NewBehavioralAnalyzer("test", 10)
profile := ba.CurrentProfile()
if profile.Goroutines <= 0 {
t.Error("expected positive goroutine count")
}
if profile.HeapAllocMB <= 0 {
t.Error("expected positive heap alloc")
}
}
// IM-08: Alert bus overflow (non-blocking).
func TestBehavioral_IM08_AlertBusOverflow(t *testing.T) {
ba := NewBehavioralAnalyzer("test", 2)
// Fill bus.
ba.alertBus <- BehavioralAlert{AnomalyType: "fill1"}
ba.alertBus <- BehavioralAlert{AnomalyType: "fill2"}
// Build baseline.
for i := 0; i < 50; i++ {
ba.InjectMetric("goroutines", 10)
}
// This should not panic.
ba.metricsDB.AddDataPoint("test", "goroutines", 10000)
ba.detectAnomalies(BehaviorProfile{Goroutines: 10000})
}
// Test collectAndAnalyze runs without error.
func TestBehavioral_CollectAndAnalyze(t *testing.T) {
ba := NewBehavioralAnalyzer("test", 10)
// Should not panic.
ba.collectAndAnalyze()
}
// Test InjectMetric stores data.
func TestBehavioral_InjectMetric(t *testing.T) {
ba := NewBehavioralAnalyzer("test", 10)
ba.InjectMetric("custom", 42.0)
recent := ba.metricsDB.GetRecent("test", "custom", 1)
if len(recent) != 1 || recent[0].Value != 42.0 {
t.Errorf("expected 42.0, got %v", recent)
}
}


@@ -0,0 +1,524 @@
package resilience
import (
"context"
"fmt"
"log/slog"
"sync"
"time"
)
// HealingState represents the FSM state of a healing operation.
type HealingState string
const (
HealingIdle HealingState = "IDLE"
HealingDiagnosing HealingState = "DIAGNOSING"
HealingActive HealingState = "HEALING"
HealingVerifying HealingState = "VERIFYING"
HealingCompleted HealingState = "COMPLETED"
HealingFailed HealingState = "FAILED"
)
// HealingResult summarizes a completed healing operation.
type HealingResult string
const (
ResultSuccess HealingResult = "SUCCESS"
ResultFailed HealingResult = "FAILED"
ResultSkipped HealingResult = "SKIPPED"
)
// ActionType defines the kinds of healing actions.
type ActionType string
const (
ActionGracefulStop ActionType = "graceful_stop"
ActionClearTempFiles ActionType = "clear_temp_files"
ActionStartComponent ActionType = "start_component"
ActionVerifyHealth ActionType = "verify_health"
ActionNotifySOC ActionType = "notify_soc"
ActionFreezeConfig ActionType = "freeze_config"
ActionRollbackConfig ActionType = "rollback_config"
ActionVerifyConfig ActionType = "verify_config"
ActionSwitchReadOnly ActionType = "switch_to_readonly"
ActionBackupDB ActionType = "backup_db"
ActionRestoreSnapshot ActionType = "restore_snapshot"
ActionVerifyIntegrity ActionType = "verify_integrity"
ActionResumeWrites ActionType = "resume_writes"
ActionDisableRules ActionType = "disable_rules"
ActionRevertRules ActionType = "revert_rules"
ActionReloadEngine ActionType = "reload_engine"
ActionIsolateNetwork ActionType = "isolate_network"
ActionRegenCerts ActionType = "regenerate_certs"
ActionRestoreNetwork ActionType = "restore_network"
ActionNotifyArchitect ActionType = "notify_architect"
ActionEnterSafeMode ActionType = "enter_safe_mode"
)
// Action is a single step in a healing strategy.
type Action struct {
Type ActionType `json:"type"`
Params map[string]interface{} `json:"params,omitempty"`
Timeout time.Duration `json:"timeout"`
OnError string `json:"on_error"` // "continue", "abort", "rollback"
}
// TriggerCondition defines when a healing strategy activates.
type TriggerCondition struct {
Metrics []string `json:"metrics,omitempty"`
Statuses []ComponentStatus `json:"statuses,omitempty"`
ConsecutiveFailures int `json:"consecutive_failures"`
WithinWindow time.Duration `json:"within_window"`
}
// RollbackPlan defines what happens if healing fails.
type RollbackPlan struct {
OnFailure string `json:"on_failure"` // "escalate", "enter_safe_mode", "maintain_isolation"
Actions []Action `json:"actions,omitempty"`
}
// HealingStrategy is a complete self-healing plan.
type HealingStrategy struct {
ID string `json:"id"`
Name string `json:"name"`
Trigger TriggerCondition `json:"trigger"`
Actions []Action `json:"actions"`
Rollback RollbackPlan `json:"rollback"`
MaxAttempts int `json:"max_attempts"`
Cooldown time.Duration `json:"cooldown"`
}
// Diagnosis is the result of root cause analysis.
type Diagnosis struct {
Component string `json:"component"`
Metric string `json:"metric"`
RootCause string `json:"root_cause"`
Confidence float64 `json:"confidence"`
SuggestedFix string `json:"suggested_fix"`
RelatedAlerts []HealthAlert `json:"related_alerts,omitempty"`
}
// HealingOperation tracks a single healing attempt.
type HealingOperation struct {
ID string `json:"id"`
StrategyID string `json:"strategy_id"`
Component string `json:"component"`
State HealingState `json:"state"`
Diagnosis *Diagnosis `json:"diagnosis,omitempty"`
ActionsRun []ActionLog `json:"actions_run"`
Result HealingResult `json:"result"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at,omitempty"`
Error string `json:"error,omitempty"`
AttemptNumber int `json:"attempt_number"`
}
// ActionLog records the execution of a single action.
type ActionLog struct {
Action ActionType `json:"action"`
StartedAt time.Time `json:"started_at"`
Duration time.Duration `json:"duration"`
Success bool `json:"success"`
Error string `json:"error,omitempty"`
}
// ActionExecutorFunc is the callback that actually runs an action.
// Implementations handle the real system operations (restart, rollback, etc.).
type ActionExecutorFunc func(ctx context.Context, action Action, component string) error
// HealingEngine is the L2 Self-Healing orchestrator.
type HealingEngine struct {
mu sync.RWMutex
strategies []HealingStrategy
cooldowns map[string]time.Time // strategyID → earliest next run
operations []*HealingOperation
opCounter int64
executor ActionExecutorFunc
alertBus <-chan HealthAlert
escalateFn func(HealthAlert) // called on unrecoverable failure
logger *slog.Logger
}
// NewHealingEngine creates a new self-healing engine.
func NewHealingEngine(
alertBus <-chan HealthAlert,
executor ActionExecutorFunc,
escalateFn func(HealthAlert),
) *HealingEngine {
return &HealingEngine{
cooldowns: make(map[string]time.Time),
operations: make([]*HealingOperation, 0),
executor: executor,
alertBus: alertBus,
escalateFn: escalateFn,
logger: slog.Default().With("component", "sarl-healing-engine"),
}
}
// RegisterStrategy adds a healing strategy.
func (he *HealingEngine) RegisterStrategy(s HealingStrategy) {
he.mu.Lock()
defer he.mu.Unlock()
he.strategies = append(he.strategies, s)
he.logger.Info("strategy registered", "id", s.ID, "name", s.Name)
}
// Start begins listening for alerts and initiating healing. Blocks until ctx is cancelled.
func (he *HealingEngine) Start(ctx context.Context) {
he.logger.Info("healing engine started", "strategies", len(he.strategies))
for {
select {
case <-ctx.Done():
he.logger.Info("healing engine stopped")
return
case alert, ok := <-he.alertBus:
if !ok {
return
}
if alert.Severity == SeverityCritical || alert.Severity == SeverityWarning {
he.initiateHealing(ctx, alert)
}
}
}
}
// initiateHealing runs the healing pipeline for an alert.
func (he *HealingEngine) initiateHealing(ctx context.Context, alert HealthAlert) {
strategy := he.findStrategy(alert)
if strategy == nil {
he.logger.Info("no matching strategy for alert",
"component", alert.Component,
"metric", alert.Metric,
)
return
}
if he.isInCooldown(strategy.ID) {
he.logger.Info("strategy in cooldown",
"strategy", strategy.ID,
"component", alert.Component,
)
return
}
op := he.createOperation(strategy, alert.Component)
he.logger.Info("healing initiated",
"op_id", op.ID,
"strategy", strategy.ID,
"component", alert.Component,
)
// Phase 1: Diagnose.
he.transitionOp(op, HealingDiagnosing)
diagnosis := he.diagnose(alert)
op.Diagnosis = &diagnosis
// Phase 2: Execute healing actions.
he.transitionOp(op, HealingActive)
execErr := he.executeActions(ctx, strategy, op)
// Phase 3: Verify recovery.
if execErr == nil {
he.transitionOp(op, HealingVerifying)
verifyErr := he.verifyRecovery(ctx, strategy, op.Component)
if verifyErr != nil {
execErr = verifyErr
}
}
// Phase 4: Complete or fail.
if execErr == nil {
he.transitionOp(op, HealingCompleted)
op.Result = ResultSuccess
he.logger.Info("healing completed successfully",
"op_id", op.ID,
"component", op.Component,
"duration", time.Since(op.StartedAt),
)
} else {
he.transitionOp(op, HealingFailed)
op.Result = ResultFailed
op.Error = execErr.Error()
he.logger.Error("healing failed",
"op_id", op.ID,
"component", op.Component,
"error", execErr,
)
// Execute rollback.
he.executeRollback(ctx, strategy, op)
// Escalate.
if he.escalateFn != nil {
he.escalateFn(alert)
}
}
op.CompletedAt = time.Now()
he.setCooldown(strategy.ID, strategy.Cooldown)
}
// findStrategy returns the first matching strategy for an alert.
func (he *HealingEngine) findStrategy(alert HealthAlert) *HealingStrategy {
he.mu.RLock()
defer he.mu.RUnlock()
for i := range he.strategies {
s := &he.strategies[i]
if he.matchesTrigger(s.Trigger, alert) {
return s
}
}
return nil
}
// matchesTrigger checks if an alert matches a strategy's trigger condition.
func (he *HealingEngine) matchesTrigger(trigger TriggerCondition, alert HealthAlert) bool {
// Match by metric name.
for _, m := range trigger.Metrics {
if m == alert.Metric {
return true
}
}
// Match by component status.
for _, s := range trigger.Statuses {
switch s {
case StatusCritical:
if alert.Severity == SeverityCritical {
return true
}
case StatusOffline:
if alert.Severity == SeverityCritical && alert.SuggestedAction == "restart" {
return true
}
}
}
return false
}
// isInCooldown checks if a strategy is still in its cooldown period.
func (he *HealingEngine) isInCooldown(strategyID string) bool {
he.mu.RLock()
defer he.mu.RUnlock()
earliest, ok := he.cooldowns[strategyID]
return ok && time.Now().Before(earliest)
}
// setCooldown marks a strategy as cooling down.
func (he *HealingEngine) setCooldown(strategyID string, duration time.Duration) {
he.mu.Lock()
defer he.mu.Unlock()
he.cooldowns[strategyID] = time.Now().Add(duration)
}
// createOperation creates and records a new healing operation.
func (he *HealingEngine) createOperation(strategy *HealingStrategy, component string) *HealingOperation {
he.mu.Lock()
defer he.mu.Unlock()
he.opCounter++
op := &HealingOperation{
ID: fmt.Sprintf("heal-%d", he.opCounter),
StrategyID: strategy.ID,
Component: component,
State: HealingIdle,
StartedAt: time.Now(),
ActionsRun: make([]ActionLog, 0),
}
he.operations = append(he.operations, op)
return op
}
// transitionOp moves an operation to a new state.
func (he *HealingEngine) transitionOp(op *HealingOperation, newState HealingState) {
he.logger.Debug("healing state transition",
"op_id", op.ID,
"from", op.State,
"to", newState,
)
op.State = newState
}
// diagnose performs root cause analysis for an alert.
func (he *HealingEngine) diagnose(alert HealthAlert) Diagnosis {
rootCause := "unknown"
confidence := 0.5
suggestedFix := "restart component"
switch {
case alert.Metric == "memory" && alert.Current > 90:
rootCause = "memory_exhaustion"
confidence = 0.9
suggestedFix = "restart with increased limits"
case alert.Metric == "cpu" && alert.Current > 90:
rootCause = "cpu_saturation"
confidence = 0.8
suggestedFix = "check for runaway goroutines"
case alert.Metric == "error_rate":
rootCause = "elevated_error_rate"
confidence = 0.7
suggestedFix = "check dependencies and config"
case alert.Metric == "latency_p99":
rootCause = "latency_degradation"
confidence = 0.6
suggestedFix = "check database and network"
case alert.Metric == "quorum":
rootCause = "quorum_loss"
confidence = 0.95
suggestedFix = "activate safe mode"
default:
rootCause = fmt.Sprintf("threshold_breach_%s", alert.Metric)
confidence = 0.5
suggestedFix = "investigate manually"
}
return Diagnosis{
Component: alert.Component,
Metric: alert.Metric,
RootCause: rootCause,
Confidence: confidence,
SuggestedFix: suggestedFix,
}
}
// executeActions runs each action in sequence.
func (he *HealingEngine) executeActions(ctx context.Context, strategy *HealingStrategy, op *HealingOperation) error {
for _, action := range strategy.Actions {
actionCtx := ctx
var cancel context.CancelFunc
if action.Timeout > 0 {
actionCtx, cancel = context.WithTimeout(ctx, action.Timeout)
}
start := time.Now()
err := he.executor(actionCtx, action, op.Component)
duration := time.Since(start)
if cancel != nil {
cancel()
}
logEntry := ActionLog{
Action: action.Type,
StartedAt: start,
Duration: duration,
Success: err == nil,
}
if err != nil {
logEntry.Error = err.Error()
}
op.ActionsRun = append(op.ActionsRun, logEntry)
if err != nil {
switch action.OnError {
case "continue":
he.logger.Warn("action failed, continuing",
"action", action.Type,
"error", err,
)
case "rollback":
return fmt.Errorf("action %s failed (rollback): %w", action.Type, err)
default: // "abort"
return fmt.Errorf("action %s failed: %w", action.Type, err)
}
}
}
return nil
}
// verifyRecovery checks if the component is healthy after healing.
func (he *HealingEngine) verifyRecovery(ctx context.Context, strategy *HealingStrategy, component string) error {
// Execute a verify_health action if not already in the strategy.
verifyAction := Action{
Type: ActionVerifyHealth,
Timeout: 30 * time.Second,
}
return he.executor(ctx, verifyAction, component)
}
// executeRollback runs the rollback plan for a failed healing.
func (he *HealingEngine) executeRollback(ctx context.Context, strategy *HealingStrategy, op *HealingOperation) {
if len(strategy.Rollback.Actions) == 0 {
he.logger.Info("no rollback actions defined",
"strategy", strategy.ID,
)
return
}
he.logger.Warn("executing rollback",
"strategy", strategy.ID,
"component", op.Component,
)
for _, action := range strategy.Rollback.Actions {
if err := he.executor(ctx, action, op.Component); err != nil {
he.logger.Error("rollback action failed",
"action", action.Type,
"error", err,
)
}
}
}
// GetOperation returns a healing operation by ID.
// Returns a deep copy to prevent data races with the healing goroutine.
func (he *HealingEngine) GetOperation(id string) (*HealingOperation, bool) {
he.mu.RLock()
defer he.mu.RUnlock()
for _, op := range he.operations {
if op.ID == id {
cp := *op
cp.ActionsRun = make([]ActionLog, len(op.ActionsRun))
copy(cp.ActionsRun, op.ActionsRun)
if op.Diagnosis != nil {
diag := *op.Diagnosis
cp.Diagnosis = &diag
}
return &cp, true
}
}
return nil, false
}
// RecentOperations returns the last N operations.
// Returns deep copies to prevent data races with the healing goroutine.
func (he *HealingEngine) RecentOperations(n int) []HealingOperation {
he.mu.RLock()
defer he.mu.RUnlock()
total := len(he.operations)
if total == 0 || n <= 0 {
return nil
}
start := total - n
if start < 0 {
start = 0
}
result := make([]HealingOperation, 0, total-start)
for i := start; i < total; i++ {
cp := *he.operations[i]
cp.ActionsRun = make([]ActionLog, len(he.operations[i].ActionsRun))
copy(cp.ActionsRun, he.operations[i].ActionsRun)
if he.operations[i].Diagnosis != nil {
diag := *he.operations[i].Diagnosis
cp.Diagnosis = &diag
}
result = append(result, cp)
}
return result
}
// StrategyCount returns the number of registered strategies.
func (he *HealingEngine) StrategyCount() int {
he.mu.RLock()
defer he.mu.RUnlock()
return len(he.strategies)
}
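
A hedged wiring sketch (not in this commit): a placeholder ActionExecutorFunc that only logs what it would do, the built-in strategies registered, and the engine fed from a HealthMonitor alert bus. The function names here are hypothetical; the project's real executor performs the actual restarts, rollbacks, and certificate operations.

package resilience

import (
    "context"
    "log/slog"
)

// dryRunExecutor is a placeholder ActionExecutorFunc that records each action
// instead of touching the system.
func dryRunExecutor(_ context.Context, action Action, component string) error {
    slog.Info("healing action (dry run)",
        "action", action.Type,
        "component", component,
        "timeout", action.Timeout,
    )
    return nil
}

// RunHealingEngineSketch wires the engine to a health monitor with the
// built-in strategies. Illustrative only.
func RunHealingEngineSketch(ctx context.Context, hm *HealthMonitor) {
    he := NewHealingEngine(hm.AlertBus(), dryRunExecutor, func(alert HealthAlert) {
        slog.Error("healing escalated", "component", alert.Component, "metric", alert.Metric)
    })
    for _, s := range DefaultStrategies() {
        he.RegisterStrategy(s)
    }
    go hm.Start(ctx)
    he.Start(ctx) // blocks until ctx is cancelled
}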


@@ -0,0 +1,588 @@
package resilience
import (
"context"
"fmt"
"sync/atomic"
"testing"
"time"
)
// --- Mock executor for tests ---
type mockExecutorLog struct {
mu sync.Mutex
actions []ActionType
fail map[ActionType]bool
count atomic.Int64
}
func newMockExecutor() *mockExecutorLog {
return &mockExecutorLog{
fail: make(map[ActionType]bool),
}
}
func (m *mockExecutorLog) execute(_ context.Context, action Action, _ string) error {
m.count.Add(1)
m.mu.Lock()
m.actions = append(m.actions, action.Type)
m.mu.Unlock()
if m.fail[action.Type] {
return fmt.Errorf("action %s failed", action.Type)
}
return nil
}
// recorded returns a copy of the executed action types; safe to call while the
// engine goroutine may still be running.
func (m *mockExecutorLog) recorded() []ActionType {
m.mu.Lock()
defer m.mu.Unlock()
return append([]ActionType(nil), m.actions...)
}
// --- Healing Engine Tests ---
// HE-01: Component restart (success).
func TestHealingEngine_HE01_RestartSuccess(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
var escalated atomic.Bool
he := NewHealingEngine(alertCh, mock.execute, func(_ HealthAlert) {
escalated.Store(true)
})
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "soc-ingest",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
// Run one healing cycle.
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected at least 1 operation")
}
if ops[0].Result != ResultSuccess {
t.Errorf("expected SUCCESS, got %s (error: %s)", ops[0].Result, ops[0].Error)
}
if escalated.Load() {
t.Error("should not have escalated on success")
}
}
// HE-02: Component restart (failure ×3 → escalate).
func TestHealingEngine_HE02_RestartFailureEscalate(t *testing.T) {
mock := newMockExecutor()
mock.fail[ActionStartComponent] = true // Start always fails.
alertCh := make(chan HealthAlert, 10)
var escalated atomic.Bool
he := NewHealingEngine(alertCh, mock.execute, func(_ HealthAlert) {
escalated.Store(true)
})
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "soc-correlate",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
if !escalated.Load() {
t.Error("expected escalation on failure")
}
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation")
}
if ops[0].Result != ResultFailed {
t.Errorf("expected FAILED, got %s", ops[0].Result)
}
}
// HE-03: Config rollback strategy matching.
func TestHealingEngine_HE03_ConfigRollback(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RollbackConfigStrategy())
alertCh <- HealthAlert{
Component: "soc-ingest",
Severity: SeverityWarning,
Metric: "config_tampering",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation for config rollback")
}
if ops[0].StrategyID != "ROLLBACK_CONFIG" {
t.Errorf("expected ROLLBACK_CONFIG, got %s", ops[0].StrategyID)
}
}
// HE-04: Database recovery.
func TestHealingEngine_HE04_DatabaseRecovery(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RecoverDatabaseStrategy())
alertCh <- HealthAlert{
Component: "soc-correlate",
Severity: SeverityCritical,
Metric: "database_corruption",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected DB recovery op")
}
if ops[0].StrategyID != "RECOVER_DATABASE" {
t.Errorf("expected RECOVER_DATABASE, got %s", ops[0].StrategyID)
}
}
// HE-05: Rule poisoning defense.
func TestHealingEngine_HE05_RulePoisoning(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RecoverRulesStrategy())
alertCh <- HealthAlert{
Component: "soc-correlate",
Severity: SeverityWarning,
Metric: "rule_execution_failure_rate",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected rule recovery op")
}
if ops[0].StrategyID != "RECOVER_RULES" {
t.Errorf("expected RECOVER_RULES, got %s", ops[0].StrategyID)
}
}
// HE-06: Network isolation recovery.
func TestHealingEngine_HE06_NetworkRecovery(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RecoverNetworkStrategy())
alertCh <- HealthAlert{
Component: "soc-respond",
Severity: SeverityWarning,
Metric: "network_partition",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected network recovery op")
}
if ops[0].StrategyID != "RECOVER_NETWORK" {
t.Errorf("expected RECOVER_NETWORK, got %s", ops[0].StrategyID)
}
}
// HE-07: Cooldown enforcement.
func TestHealingEngine_HE07_Cooldown(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
// Set cooldown manually.
he.setCooldown("RESTART_COMPONENT", 1*time.Hour)
if !he.isInCooldown("RESTART_COMPONENT") {
t.Error("expected cooldown active")
}
alertCh <- HealthAlert{
Component: "soc-ingest",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) != 0 {
t.Error("expected 0 operations during cooldown")
}
}
// HE-08: Rollback on failure.
func TestHealingEngine_HE08_Rollback(t *testing.T) {
mock := newMockExecutor()
mock.fail[ActionStartComponent] = true
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, func(_ HealthAlert) {})
strategy := RollbackConfigStrategy()
he.RegisterStrategy(strategy)
alertCh <- HealthAlert{
Component: "soc-ingest",
Severity: SeverityWarning,
Metric: "config_tampering",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
// Rollback should have executed enter_safe_mode.
foundSafeMode := false
actions := mock.recorded()
for _, a := range actions {
if a == ActionEnterSafeMode {
foundSafeMode = true
}
}
if !foundSafeMode {
t.Errorf("expected safe mode in rollback, actions: %v", actions)
}
}
// HE-09: State machine transitions.
func TestHealingEngine_HE09_StateTransitions(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation")
}
// Final state should be COMPLETED.
if ops[0].State != HealingCompleted {
t.Errorf("expected COMPLETED, got %s", ops[0].State)
}
}
// HE-10: Audit logging — all actions recorded.
func TestHealingEngine_HE10_AuditLogging(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation")
}
if len(ops[0].ActionsRun) == 0 {
t.Error("expected action logs")
}
for _, al := range ops[0].ActionsRun {
if al.StartedAt.IsZero() {
t.Error("action log missing start time")
}
}
}
// HE-11: Parallel healing — no race conditions.
func TestHealingEngine_HE11_Parallel(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 100)
he := NewHealingEngine(alertCh, mock.execute, nil)
for _, s := range DefaultStrategies() {
he.RegisterStrategy(s)
}
// Send many alerts concurrently.
for i := 0; i < 10; i++ {
alertCh <- HealthAlert{
Component: fmt.Sprintf("comp-%d", i),
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
}
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(1 * time.Second)
cancel()
// All 10 alerts processed (first gets an op, rest hit cooldown).
ops := he.RecentOperations(100)
if len(ops) == 0 {
t.Fatal("expected at least 1 operation")
}
}
// HE-12: No matching strategy → no operation.
func TestHealingEngine_HE12_NoStrategy(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
// No strategies registered.
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "unknown_metric",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) != 0 {
t.Errorf("expected 0 operations, got %d", len(ops))
}
}
// Test diagnosis (various root causes).
func TestHealingEngine_Diagnosis(t *testing.T) {
mock := newMockExecutor()
he := NewHealingEngine(nil, mock.execute, nil)
tests := []struct {
metric string
current float64
wantCause string
}{
{"memory", 95, "memory_exhaustion"},
{"cpu", 95, "cpu_saturation"},
{"error_rate", 10, "elevated_error_rate"},
{"latency_p99", 200, "latency_degradation"},
{"quorum", 0.3, "quorum_loss"},
{"custom", 100, "threshold_breach_custom"},
}
for _, tt := range tests {
alert := HealthAlert{
Component: "test",
Metric: tt.metric,
Current: tt.current,
}
d := he.diagnose(alert)
if d.RootCause != tt.wantCause {
t.Errorf("metric=%s: expected %s, got %s", tt.metric, tt.wantCause, d.RootCause)
}
if d.Confidence <= 0 || d.Confidence > 1 {
t.Errorf("metric=%s: invalid confidence %f", tt.metric, d.Confidence)
}
}
}
// Test DefaultStrategies returns 5 strategies.
func TestDefaultStrategies(t *testing.T) {
strategies := DefaultStrategies()
if len(strategies) != 5 {
t.Errorf("expected 5 strategies, got %d", len(strategies))
}
ids := map[string]bool{}
for _, s := range strategies {
if ids[s.ID] {
t.Errorf("duplicate strategy ID: %s", s.ID)
}
ids[s.ID] = true
if s.MaxAttempts <= 0 {
t.Errorf("strategy %s: invalid max_attempts %d", s.ID, s.MaxAttempts)
}
if s.Cooldown <= 0 {
t.Errorf("strategy %s: invalid cooldown %v", s.ID, s.Cooldown)
}
if len(s.Actions) == 0 {
t.Errorf("strategy %s: no actions defined", s.ID)
}
}
}
// Test StrategyCount.
func TestHealingEngine_StrategyCount(t *testing.T) {
he := NewHealingEngine(nil, nil, nil)
if he.StrategyCount() != 0 {
t.Error("expected 0")
}
for _, s := range DefaultStrategies() {
he.RegisterStrategy(s)
}
if he.StrategyCount() != 5 {
t.Errorf("expected 5, got %d", he.StrategyCount())
}
}
// Test GetOperation.
func TestHealingEngine_GetOperation(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
op, ok := he.GetOperation("heal-1")
if !ok {
t.Fatal("expected operation heal-1")
}
if op.Component != "comp" {
t.Errorf("expected comp, got %s", op.Component)
}
_, ok = he.GetOperation("nonexistent")
if ok {
t.Error("expected not found for nonexistent")
}
}
// Test action OnError=continue.
func TestHealingEngine_ActionContinueOnError(t *testing.T) {
mock := newMockExecutor()
mock.fail[ActionGracefulStop] = true // First action fails but marked continue.
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation")
}
// Should still succeed because graceful_stop has OnError=continue.
if ops[0].Result != ResultSuccess {
t.Errorf("expected SUCCESS (continue on error), got %s", ops[0].Result)
}
}


@@ -0,0 +1,215 @@
package resilience
import "time"
// Built-in healing strategies per the technical specification (ТЗ) §4.1.1.
// These are registered at startup via HealingEngine.RegisterStrategy().
// DefaultStrategies returns the 5 built-in healing strategies.
func DefaultStrategies() []HealingStrategy {
return []HealingStrategy{
RestartComponentStrategy(),
RollbackConfigStrategy(),
RecoverDatabaseStrategy(),
RecoverRulesStrategy(),
RecoverNetworkStrategy(),
}
}
// RestartComponentStrategy handles component crashes and offline states.
// Trigger: component_offline OR component_critical, 2 consecutive failures within 5m.
// Actions: graceful_stop → clear_temp → start → verify → notify.
// Rollback: escalate to next strategy.
func RestartComponentStrategy() HealingStrategy {
return HealingStrategy{
ID: "RESTART_COMPONENT",
Name: "Component Restart",
Trigger: TriggerCondition{
Statuses: []ComponentStatus{StatusOffline, StatusCritical},
ConsecutiveFailures: 2,
WithinWindow: 5 * time.Minute,
},
Actions: []Action{
{Type: ActionGracefulStop, Timeout: 10 * time.Second, OnError: "continue"},
{Type: ActionClearTempFiles, Timeout: 5 * time.Second, OnError: "continue"},
{Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "abort"},
{Type: ActionVerifyHealth, Timeout: 60 * time.Second, OnError: "abort"},
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "INFO",
"message": "Component restarted successfully",
},
},
},
Rollback: RollbackPlan{
OnFailure: "escalate",
Actions: []Action{
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
Params: map[string]interface{}{
"severity": "CRITICAL",
"message": "Component restart failed after max attempts",
},
},
},
},
MaxAttempts: 3,
Cooldown: 5 * time.Minute,
}
}
// RollbackConfigStrategy handles config tampering or validation failures.
// Trigger: config_tampering_detected OR config_validation_failed.
// Actions: freeze → verify_backup → rollback → restart → verify → notify.
func RollbackConfigStrategy() HealingStrategy {
return HealingStrategy{
ID: "ROLLBACK_CONFIG",
Name: "Configuration Rollback",
Trigger: TriggerCondition{
Metrics: []string{"config_tampering", "config_validation"},
},
Actions: []Action{
{Type: ActionFreezeConfig, Timeout: 5 * time.Second, OnError: "abort"},
{Type: ActionRollbackConfig, Timeout: 15 * time.Second, OnError: "abort"},
{Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "rollback"},
{Type: ActionVerifyConfig, Timeout: 10 * time.Second, OnError: "abort"},
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "WARNING",
"message": "Config rolled back due to tampering",
},
},
},
Rollback: RollbackPlan{
OnFailure: "enter_safe_mode",
Actions: []Action{
{Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
},
},
MaxAttempts: 1,
Cooldown: 1 * time.Hour,
}
}
// RecoverDatabaseStrategy handles SQLite corruption.
// Trigger: database_corruption OR sqlite_integrity_failed.
// Actions: readonly → backup → restore → verify → resume → notify.
func RecoverDatabaseStrategy() HealingStrategy {
return HealingStrategy{
ID: "RECOVER_DATABASE",
Name: "Database Recovery",
Trigger: TriggerCondition{
Metrics: []string{"database_corruption", "sqlite_integrity"},
},
Actions: []Action{
{Type: ActionSwitchReadOnly, Timeout: 5 * time.Second, OnError: "abort"},
{Type: ActionBackupDB, Timeout: 30 * time.Second, OnError: "continue"},
{Type: ActionRestoreSnapshot, Timeout: 60 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"snapshot_age_max": "1h",
},
},
{Type: ActionVerifyIntegrity, Timeout: 30 * time.Second, OnError: "abort"},
{Type: ActionResumeWrites, Timeout: 5 * time.Second, OnError: "abort"},
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "WARNING",
"message": "Database recovered from snapshot",
},
},
},
Rollback: RollbackPlan{
OnFailure: "enter_lockdown",
Actions: []Action{
{Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
Params: map[string]interface{}{
"severity": "CRITICAL",
"message": "Database recovery failed",
},
},
},
},
MaxAttempts: 2,
Cooldown: 2 * time.Hour,
}
}
// RecoverRulesStrategy handles correlation rule poisoning.
// Trigger: rule execution failure rate > 50%.
// Actions: disable_suspicious → revert_baseline → verify → reload → notify.
func RecoverRulesStrategy() HealingStrategy {
return HealingStrategy{
ID: "RECOVER_RULES",
Name: "Rule Poisoning Defense",
Trigger: TriggerCondition{
Metrics: []string{"rule_execution_failure_rate", "correlation_rule_anomaly"},
},
Actions: []Action{
{Type: ActionDisableRules, Timeout: 10 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"criteria": "failure_rate > 80%",
},
},
{Type: ActionRevertRules, Timeout: 15 * time.Second, OnError: "abort"},
{Type: ActionReloadEngine, Timeout: 30 * time.Second, OnError: "abort"},
{Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "continue"},
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "WARNING",
"message": "Rules recovered from baseline",
},
},
},
Rollback: RollbackPlan{
OnFailure: "disable_correlation",
},
MaxAttempts: 2,
Cooldown: 4 * time.Hour,
}
}
// RecoverNetworkStrategy handles network partition or mTLS cert expiry.
// Trigger: network_partition_detected OR mTLS_cert_expired.
// Actions: isolate → regen_certs → verify → restore → notify.
func RecoverNetworkStrategy() HealingStrategy {
return HealingStrategy{
ID: "RECOVER_NETWORK",
Name: "Network Isolation Recovery",
Trigger: TriggerCondition{
Metrics: []string{"network_partition", "mtls_cert_expiry"},
},
Actions: []Action{
{Type: ActionIsolateNetwork, Timeout: 5 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"scope": "external_only",
},
},
{Type: ActionRegenCerts, Timeout: 30 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"validity": "24h",
},
},
{Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "rollback"},
{Type: ActionRestoreNetwork, Timeout: 10 * time.Second, OnError: "abort"},
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "INFO",
"message": "Network connectivity restored",
},
},
},
Rollback: RollbackPlan{
OnFailure: "maintain_isolation",
Actions: []Action{
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
Params: map[string]interface{}{
"severity": "CRITICAL",
"message": "Network recovery failed, maintaining isolation",
},
},
},
},
MaxAttempts: 3,
Cooldown: 1 * time.Hour,
}
}
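
The five built-ins above are plain values, so deployments can compose additional strategies from the same action vocabulary. A hypothetical sketch (not registered by DefaultStrategies; the disk_usage_percent metric name is an assumption):

package resilience

import "time"

// DiskPressureStrategy is a hypothetical example of composing the existing
// action types into an extra strategy; it is not part of the built-in set.
func DiskPressureStrategy() HealingStrategy {
    return HealingStrategy{
        ID:   "CLEAR_DISK_PRESSURE",
        Name: "Disk Pressure Relief",
        Trigger: TriggerCondition{
            Metrics: []string{"disk_usage_percent"}, // assumed metric name
        },
        Actions: []Action{
            {Type: ActionClearTempFiles, Timeout: 30 * time.Second, OnError: "continue"},
            {Type: ActionBackupDB, Timeout: 60 * time.Second, OnError: "continue"},
            {Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "abort"},
            {Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
                Params: map[string]interface{}{
                    "severity": "INFO",
                    "message":  "Temporary files cleared to relieve disk pressure",
                },
            },
        },
        Rollback: RollbackPlan{
            OnFailure: "escalate",
        },
        MaxAttempts: 2,
        Cooldown:    30 * time.Minute,
    }
}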


@@ -0,0 +1,445 @@
package resilience
import (
"context"
"fmt"
"log/slog"
"sync"
"time"
)
// ComponentStatus defines the health state of a monitored component.
type ComponentStatus string
const (
StatusHealthy ComponentStatus = "HEALTHY"
StatusDegraded ComponentStatus = "DEGRADED"
StatusCritical ComponentStatus = "CRITICAL"
StatusOffline ComponentStatus = "OFFLINE"
)
// AlertSeverity defines the severity of a health alert.
type AlertSeverity string
const (
SeverityInfo AlertSeverity = "INFO"
SeverityWarning AlertSeverity = "WARNING"
SeverityCritical AlertSeverity = "CRITICAL"
)
// OverallStatus aggregates component statuses into a system-wide status.
type OverallStatus string
const (
OverallHealthy OverallStatus = "HEALTHY"
OverallDegraded OverallStatus = "DEGRADED"
OverallCritical OverallStatus = "CRITICAL"
)
// Default intervals per the technical specification (ТЗ) §3.1.2.
const (
MetricsCollectionInterval = 10 * time.Second
HealthCheckInterval = 30 * time.Second
QuorumValidationInterval = 60 * time.Second
// AnomalyZScoreThreshold — Z > 3.0 = anomaly (99.7% confidence).
AnomalyZScoreThreshold = 3.0
// QuorumThreshold — 2/3 must be healthy.
QuorumThreshold = 0.66
// MaxConsecutiveFailures before marking CRITICAL.
MaxConsecutiveFailures = 3
)
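// Worked example (added for illustration, not in the original commit): with
// AnomalyZScoreThreshold = 3.0, a metric whose 24h baseline has mean 30 and
// stddev 5 only becomes anomalous above 30 + 3.0*5 = 45; and with
// QuorumThreshold = 0.66, a 6-component deployment keeps quorum while at
// least 4 components (4/6 ≈ 0.67) report HEALTHY.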
// ComponentConfig defines monitoring thresholds for a component.
type ComponentConfig struct {
Name string `json:"name"`
Type string `json:"type"` // go_binary, c_binary, c_kernel_module
Thresholds map[string]float64 `json:"thresholds"`
// Whether threshold is an upper bound (true) or lower bound (false).
ThresholdIsMax map[string]bool `json:"threshold_is_max"`
}
// ComponentHealth tracks the health state of a single component.
type ComponentHealth struct {
Name string `json:"name"`
Status ComponentStatus `json:"status"`
Metrics map[string]float64 `json:"metrics"`
LastCheck time.Time `json:"last_check"`
Consecutive int `json:"consecutive_failures"`
Config ComponentConfig `json:"-"`
}
// HealthAlert represents a detected health anomaly.
type HealthAlert struct {
Component string `json:"component"`
Severity AlertSeverity `json:"severity"`
Metric string `json:"metric"`
Current float64 `json:"current"`
Threshold float64 `json:"threshold"`
ZScore float64 `json:"z_score,omitempty"`
Timestamp time.Time `json:"timestamp"`
SuggestedAction string `json:"suggested_action"`
}
// HealthResponse is the API response for GET /api/v1/resilience/health.
type HealthResponse struct {
OverallStatus OverallStatus `json:"overall_status"`
Components []ComponentHealth `json:"components"`
QuorumValid bool `json:"quorum_valid"`
LastCheck time.Time `json:"last_check"`
AnomaliesDetected []HealthAlert `json:"anomalies_detected"`
}
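// Illustrative response shape (values invented) for GET /api/v1/resilience/health:
//
//	{
//	  "overall_status": "DEGRADED",
//	  "components": [
//	    {"name": "soc-ingest", "status": "HEALTHY", "metrics": {"cpu": 31.2}, ...},
//	    {"name": "soc-correlate", "status": "DEGRADED", "metrics": {"error_rate": 7.5}, ...}
//	  ],
//	  "quorum_valid": true,
//	  "last_check": "2026-03-23T06:45:40Z",
//	  "anomalies_detected": []
//	}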
// MetricsCollector is the interface for collecting metrics from components.
// Implementations can use /healthz endpoints, /metrics, or runtime stats.
type MetricsCollector interface {
Collect(ctx context.Context, component string) (map[string]float64, error)
}
// HealthMonitor is the L1 Self-Monitoring orchestrator.
// It collects metrics, runs anomaly detection, validates quorum,
// and emits HealthAlerts to the alert bus.
type HealthMonitor struct {
mu sync.RWMutex
components map[string]*ComponentHealth
metricsDB *MetricsDB
alertBus chan HealthAlert
collector MetricsCollector
logger *slog.Logger
// anomalyWindow is the baseline window for Z-score calculation.
anomalyWindow time.Duration
}
// NewHealthMonitor creates a new health monitor.
func NewHealthMonitor(collector MetricsCollector, alertBufSize int) *HealthMonitor {
if alertBufSize <= 0 {
alertBufSize = 100
}
return &HealthMonitor{
components: make(map[string]*ComponentHealth),
metricsDB: NewMetricsDB(DefaultMetricsWindow, DefaultMetricsMaxSize),
alertBus: make(chan HealthAlert, alertBufSize),
collector: collector,
logger: slog.Default().With("component", "sarl-health-monitor"),
anomalyWindow: 24 * time.Hour,
}
}
// RegisterComponent adds a component to be monitored.
func (hm *HealthMonitor) RegisterComponent(config ComponentConfig) {
hm.mu.Lock()
defer hm.mu.Unlock()
hm.components[config.Name] = &ComponentHealth{
Name: config.Name,
Status: StatusHealthy,
Metrics: make(map[string]float64),
Config: config,
}
hm.logger.Info("component registered", "name", config.Name, "type", config.Type)
}
// AlertBus returns the channel for consuming health alerts.
func (hm *HealthMonitor) AlertBus() <-chan HealthAlert {
return hm.alertBus
}
// Start begins the monitoring loops. Blocks until ctx is cancelled.
func (hm *HealthMonitor) Start(ctx context.Context) {
hm.logger.Info("health monitor started")
metricsTicker := time.NewTicker(MetricsCollectionInterval)
healthTicker := time.NewTicker(HealthCheckInterval)
quorumTicker := time.NewTicker(QuorumValidationInterval)
defer metricsTicker.Stop()
defer healthTicker.Stop()
defer quorumTicker.Stop()
for {
select {
case <-ctx.Done():
hm.logger.Info("health monitor stopped")
return
case <-metricsTicker.C:
hm.collectMetrics(ctx)
case <-healthTicker.C:
hm.checkHealth()
case <-quorumTicker.C:
hm.validateQuorum()
}
}
}
// collectMetrics gathers metrics from all registered components.
func (hm *HealthMonitor) collectMetrics(ctx context.Context) {
hm.mu.RLock()
names := make([]string, 0, len(hm.components))
for name := range hm.components {
names = append(names, name)
}
hm.mu.RUnlock()
for _, name := range names {
metrics, err := hm.collector.Collect(ctx, name)
if err != nil {
hm.logger.Warn("metrics collection failed", "component", name, "error", err)
hm.mu.Lock()
if comp, ok := hm.components[name]; ok {
comp.Consecutive++
}
hm.mu.Unlock()
continue
}
hm.mu.Lock()
comp, ok := hm.components[name]
if ok {
comp.Metrics = metrics
comp.LastCheck = time.Now()
// Store each metric in time-series DB.
for metric, value := range metrics {
hm.metricsDB.AddDataPoint(name, metric, value)
}
}
hm.mu.Unlock()
}
}
// checkHealth evaluates each component against thresholds and anomalies.
func (hm *HealthMonitor) checkHealth() {
hm.mu.Lock()
defer hm.mu.Unlock()
for _, comp := range hm.components {
alerts := hm.evaluateComponent(comp)
for _, alert := range alerts {
hm.emitAlert(alert)
}
}
}
// evaluateComponent checks a single component's metrics against thresholds
// and runs Z-score anomaly detection. Returns any generated alerts.
func (hm *HealthMonitor) evaluateComponent(comp *ComponentHealth) []HealthAlert {
var alerts []HealthAlert
breached := false
for metric, value := range comp.Metrics {
threshold, hasThreshold := comp.Config.Thresholds[metric]
if !hasThreshold {
continue
}
isMax := comp.Config.ThresholdIsMax[metric]
var exceeded bool
if isMax {
exceeded = value > threshold
} else {
exceeded = value < threshold
}
if exceeded {
breached = true
action := "restart"
if metric == "error_rate" || metric == "latency_p99" {
action = "investigate"
}
alerts = append(alerts, HealthAlert{
Component: comp.Name,
Severity: SeverityWarning,
Metric: metric,
Current: value,
Threshold: threshold,
Timestamp: time.Now(),
SuggestedAction: action,
})
}
// Z-score anomaly detection.
baseline := hm.metricsDB.GetBaseline(comp.Name, metric, hm.anomalyWindow)
if IsAnomaly(value, baseline, AnomalyZScoreThreshold) {
zscore := CalculateZScore(value, baseline)
alerts = append(alerts, HealthAlert{
Component: comp.Name,
Severity: SeverityCritical,
Metric: metric,
Current: value,
Threshold: baseline.Mean + AnomalyZScoreThreshold*baseline.StdDev,
ZScore: zscore,
Timestamp: time.Now(),
SuggestedAction: fmt.Sprintf("anomaly detected (Z=%.2f), investigate %s", zscore, metric),
})
}
}
// Update component status.
if breached {
comp.Consecutive++
if comp.Consecutive >= MaxConsecutiveFailures {
comp.Status = StatusCritical
} else {
comp.Status = StatusDegraded
}
} else {
comp.Consecutive = 0
comp.Status = StatusHealthy
}
return alerts
}
// emitAlert sends an alert to the bus (non-blocking).
func (hm *HealthMonitor) emitAlert(alert HealthAlert) {
select {
case hm.alertBus <- alert:
hm.logger.Warn("health alert emitted",
"component", alert.Component,
"severity", alert.Severity,
"metric", alert.Metric,
"current", alert.Current,
"threshold", alert.Threshold,
)
default:
hm.logger.Error("alert bus full, dropping alert",
"component", alert.Component,
"metric", alert.Metric,
)
}
}
// validateQuorum checks if 2/3 of components are healthy.
func (hm *HealthMonitor) validateQuorum() {
hm.mu.RLock()
defer hm.mu.RUnlock()
if len(hm.components) == 0 {
return
}
valid := ValidateQuorum(hm.componentStatuses())
if !valid {
hm.logger.Error("QUORUM LOST — entering degraded state",
"healthy_ratio", hm.healthyRatio(),
"threshold", QuorumThreshold,
)
hm.emitAlert(HealthAlert{
Component: "system",
Severity: SeverityCritical,
Metric: "quorum",
Current: hm.healthyRatio(),
Threshold: QuorumThreshold,
Timestamp: time.Now(),
SuggestedAction: "activate safe mode",
})
}
}
// ValidateQuorum checks if the healthy ratio meets the 2/3 threshold.
func ValidateQuorum(statuses map[string]ComponentStatus) bool {
if len(statuses) == 0 {
return false
}
healthy := 0
for _, status := range statuses {
if status == StatusHealthy {
healthy++
}
}
return float64(healthy)/float64(len(statuses)) >= QuorumThreshold
}
// componentStatuses returns current status map (caller must hold RLock).
func (hm *HealthMonitor) componentStatuses() map[string]ComponentStatus {
statuses := make(map[string]ComponentStatus, len(hm.components))
for name, comp := range hm.components {
statuses[name] = comp.Status
}
return statuses
}
// healthyRatio returns the fraction of healthy components (caller must hold RLock).
func (hm *HealthMonitor) healthyRatio() float64 {
if len(hm.components) == 0 {
return 0
}
healthy := 0
for _, comp := range hm.components {
if comp.Status == StatusHealthy {
healthy++
}
}
return float64(healthy) / float64(len(hm.components))
}
// GetHealth returns a snapshot of the entire system health.
func (hm *HealthMonitor) GetHealth() HealthResponse {
hm.mu.RLock()
defer hm.mu.RUnlock()
components := make([]ComponentHealth, 0, len(hm.components))
for _, comp := range hm.components {
cp := *comp
// Deep copy metrics.
cp.Metrics = make(map[string]float64, len(comp.Metrics))
for k, v := range comp.Metrics {
cp.Metrics[k] = v
}
components = append(components, cp)
}
overall := OverallHealthy
for _, comp := range components {
switch comp.Status {
case StatusCritical, StatusOffline:
overall = OverallCritical
case StatusDegraded:
if overall != OverallCritical {
overall = OverallDegraded
}
}
}
return HealthResponse{
OverallStatus: overall,
Components: components,
QuorumValid: ValidateQuorum(hm.componentStatuses()),
LastCheck: time.Now(),
}
}
// SetComponentStatus manually sets a component's status (for testing/override).
func (hm *HealthMonitor) SetComponentStatus(name string, status ComponentStatus) {
hm.mu.Lock()
defer hm.mu.Unlock()
if comp, ok := hm.components[name]; ok {
comp.Status = status
}
}
// UpdateMetrics manually updates a component's metrics (for testing/override).
func (hm *HealthMonitor) UpdateMetrics(name string, metrics map[string]float64) {
hm.mu.Lock()
defer hm.mu.Unlock()
if comp, ok := hm.components[name]; ok {
comp.Metrics = metrics
comp.LastCheck = time.Now()
for metric, value := range metrics {
hm.metricsDB.AddDataPoint(name, metric, value)
}
}
}
// ComponentCount returns the number of registered components.
func (hm *HealthMonitor) ComponentCount() int {
hm.mu.RLock()
defer hm.mu.RUnlock()
return len(hm.components)
}
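
A minimal sketch, not part of this commit, of one possible MetricsCollector: it ignores the component name and reports the current process's own runtime stats. Real deployments would query each component's /healthz or /metrics endpoint instead; the metric keys below are assumptions.

package resilience

import (
    "context"
    "runtime"
)

// SelfCollector is an illustrative MetricsCollector for single-process setups
// and tests: it returns runtime stats for the hosting process regardless of
// which component it is asked about.
type SelfCollector struct{}

func (SelfCollector) Collect(_ context.Context, _ string) (map[string]float64, error) {
    var mem runtime.MemStats
    runtime.ReadMemStats(&mem)
    return map[string]float64{
        "goroutines":    float64(runtime.NumGoroutine()),
        "heap_alloc_mb": float64(mem.HeapAlloc) / (1024 * 1024),
        "num_gc":        float64(mem.NumGC),
    }, nil
}

With it, hm := NewHealthMonitor(SelfCollector{}, 100) would monitor the hosting process itself.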


@@ -0,0 +1,499 @@
package resilience
import (
"context"
"fmt"
"math"
"testing"
"time"
)
// --- MetricsDB Tests ---
func TestRingBuffer_AddAndAll(t *testing.T) {
rb := newRingBuffer(5)
now := time.Now()
for i := 0; i < 3; i++ {
rb.Add(DataPoint{Timestamp: now.Add(time.Duration(i) * time.Second), Value: float64(i)})
}
if rb.Len() != 3 {
t.Fatalf("expected 3, got %d", rb.Len())
}
all := rb.All()
if len(all) != 3 {
t.Fatalf("expected 3 points, got %d", len(all))
}
for i, dp := range all {
if dp.Value != float64(i) {
t.Errorf("point %d: expected %f, got %f", i, float64(i), dp.Value)
}
}
}
func TestRingBuffer_Wrap(t *testing.T) {
rb := newRingBuffer(3)
now := time.Now()
for i := 0; i < 5; i++ {
rb.Add(DataPoint{Timestamp: now.Add(time.Duration(i) * time.Second), Value: float64(i)})
}
if rb.Len() != 3 {
t.Fatalf("expected 3 (buffer size), got %d", rb.Len())
}
all := rb.All()
// Should contain values 2, 3, 4 (oldest 0, 1 overwritten).
expected := []float64{2, 3, 4}
for i, dp := range all {
if dp.Value != expected[i] {
t.Errorf("point %d: expected %f, got %f", i, expected[i], dp.Value)
}
}
}
func TestMetricsDB_AddAndBaseline(t *testing.T) {
db := NewMetricsDB(time.Hour, 100)
for i := 0; i < 20; i++ {
db.AddDataPoint("soc-ingest", "cpu", 30.0+float64(i%5))
}
baseline := db.GetBaseline("soc-ingest", "cpu", time.Hour)
if baseline.Count != 20 {
t.Fatalf("expected 20 points, got %d", baseline.Count)
}
if baseline.Mean < 30 || baseline.Mean > 35 {
t.Errorf("mean out of expected range: %f", baseline.Mean)
}
if baseline.StdDev == 0 {
t.Error("expected non-zero stddev")
}
}
func TestMetricsDB_EmptyBaseline(t *testing.T) {
db := NewMetricsDB(time.Hour, 100)
baseline := db.GetBaseline("nonexistent", "cpu", time.Hour)
if baseline.Count != 0 {
t.Errorf("expected 0 count for nonexistent, got %d", baseline.Count)
}
}
func TestCalculateZScore(t *testing.T) {
baseline := Baseline{Mean: 30.0, StdDev: 5.0, Count: 100}
// Normal value (Z = 1.0).
z := CalculateZScore(35.0, baseline)
if math.Abs(z-1.0) > 0.01 {
t.Errorf("expected Z≈1.0, got %f", z)
}
// Anomalous value (Z = 4.0).
z = CalculateZScore(50.0, baseline)
if math.Abs(z-4.0) > 0.01 {
t.Errorf("expected Z≈4.0, got %f", z)
}
// Insufficient data → 0.
z = CalculateZScore(50.0, Baseline{Mean: 30, StdDev: 5, Count: 5})
if z != 0 {
t.Errorf("expected 0 for insufficient data, got %f", z)
}
}
func TestIsAnomaly(t *testing.T) {
baseline := Baseline{Mean: 30.0, StdDev: 5.0, Count: 100}
if IsAnomaly(35.0, baseline, 3.0) {
t.Error("35 should not be anomaly (Z=1.0)")
}
if !IsAnomaly(50.0, baseline, 3.0) {
t.Error("50 should be anomaly (Z=4.0)")
}
if !IsAnomaly(10.0, baseline, 3.0) {
t.Error("10 should be anomaly (Z=-4.0)")
}
}
func TestMetricsDB_Purge(t *testing.T) {
db := NewMetricsDB(100*time.Millisecond, 100)
db.AddDataPoint("comp", "cpu", 50)
time.Sleep(150 * time.Millisecond)
db.AddDataPoint("comp", "cpu", 60)
removed := db.Purge()
if removed != 1 {
t.Errorf("expected 1 purged, got %d", removed)
}
}
func TestMetricsDB_GetRecent(t *testing.T) {
db := NewMetricsDB(time.Hour, 100)
for i := 0; i < 10; i++ {
db.AddDataPoint("comp", "mem", float64(i*10))
}
recent := db.GetRecent("comp", "mem", 3)
if len(recent) != 3 {
t.Fatalf("expected 3 recent, got %d", len(recent))
}
// Should be last 3: 70, 80, 90.
if recent[0].Value != 70 || recent[2].Value != 90 {
t.Errorf("unexpected recent values: %v", recent)
}
}
// --- MockCollector for HealthMonitor tests ---
type mockCollector struct {
results map[string]map[string]float64
errors map[string]error
}
func (m *mockCollector) Collect(_ context.Context, component string) (map[string]float64, error) {
if err, ok := m.errors[component]; ok && err != nil {
return nil, err
}
if metrics, ok := m.results[component]; ok {
return metrics, nil
}
return map[string]float64{}, nil
}
// --- HealthMonitor Tests ---
// HM-01: Normal health check — all HEALTHY.
func TestHealthMonitor_HM01_AllHealthy(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
registerTestComponents(hm, 6)
health := hm.GetHealth()
if health.OverallStatus != OverallHealthy {
t.Errorf("expected HEALTHY, got %s", health.OverallStatus)
}
if !health.QuorumValid {
t.Error("expected quorum valid")
}
if len(health.Components) != 6 {
t.Errorf("expected 6 components, got %d", len(health.Components))
}
}
// HM-02: Single component DEGRADED.
func TestHealthMonitor_HM02_SingleDegraded(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
registerTestComponents(hm, 6)
hm.SetComponentStatus("comp-0", StatusDegraded)
health := hm.GetHealth()
if health.OverallStatus != OverallDegraded {
t.Errorf("expected DEGRADED, got %s", health.OverallStatus)
}
if !health.QuorumValid {
t.Error("expected quorum still valid with 5/6 healthy")
}
}
// HM-03: Multiple components CRITICAL → quorum lost.
func TestHealthMonitor_HM03_MultipleCritical(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
registerTestComponents(hm, 6)
hm.SetComponentStatus("comp-0", StatusCritical)
hm.SetComponentStatus("comp-1", StatusCritical)
hm.SetComponentStatus("comp-2", StatusCritical)
health := hm.GetHealth()
if health.OverallStatus != OverallCritical {
t.Errorf("expected CRITICAL, got %s", health.OverallStatus)
}
if health.QuorumValid {
t.Error("expected quorum INVALID with 3/6 critical")
}
}
// HM-04: Anomaly detection (CPU spike).
func TestHealthMonitor_HM04_CPUAnomaly(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 100)
hm.RegisterComponent(ComponentConfig{
Name: "soc-ingest",
Type: "go_binary",
Thresholds: map[string]float64{"cpu": 80},
ThresholdIsMax: map[string]bool{"cpu": true},
})
// Build baseline of normal CPU (30%).
for i := 0; i < 50; i++ {
hm.metricsDB.AddDataPoint("soc-ingest", "cpu", 30.0)
}
// Spike to 95%.
hm.UpdateMetrics("soc-ingest", map[string]float64{"cpu": 95.0})
hm.checkHealth()
// Should have alert(s).
select {
case alert := <-hm.alertBus:
if alert.Component != "soc-ingest" {
t.Errorf("expected soc-ingest, got %s", alert.Component)
}
if alert.Metric != "cpu" {
t.Errorf("expected cpu metric, got %s", alert.Metric)
}
default:
t.Error("expected alert for CPU spike")
}
}
// HM-05: Memory leak detection.
func TestHealthMonitor_HM05_MemoryLeak(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 100)
hm.RegisterComponent(ComponentConfig{
Name: "soc-correlate",
Type: "go_binary",
Thresholds: map[string]float64{"memory": 90},
ThresholdIsMax: map[string]bool{"memory": true},
})
// Build baseline of normal memory (40%).
for i := 0; i < 50; i++ {
hm.metricsDB.AddDataPoint("soc-correlate", "memory", 40.0)
}
// Memory spike to 95%.
hm.UpdateMetrics("soc-correlate", map[string]float64{"memory": 95.0})
hm.checkHealth()
select {
case alert := <-hm.alertBus:
if alert.Metric != "memory" {
t.Errorf("expected memory metric, got %s", alert.Metric)
}
default:
t.Error("expected alert for memory spike")
}
}
// HM-06: Quorum validation failure.
func TestHealthMonitor_HM06_QuorumFailure(t *testing.T) {
statuses := map[string]ComponentStatus{
"a": StatusOffline,
"b": StatusOffline,
"c": StatusOffline,
"d": StatusOffline,
"e": StatusHealthy,
"f": StatusHealthy,
}
if ValidateQuorum(statuses) {
t.Error("expected quorum invalid with 4/6 offline")
}
}
// HM-06b: Quorum validation success (edge case: exactly 2/3).
func TestHealthMonitor_HM06b_QuorumEdge(t *testing.T) {
statuses := map[string]ComponentStatus{
"a": StatusHealthy,
"b": StatusHealthy,
"c": StatusCritical,
}
if !ValidateQuorum(statuses) {
t.Error("expected quorum valid with 2/3 healthy (exact threshold)")
}
}
// HM-06c: Empty quorum.
func TestHealthMonitor_HM06c_EmptyQuorum(t *testing.T) {
if ValidateQuorum(map[string]ComponentStatus{}) {
t.Error("expected quorum invalid with 0 components")
}
}
// HM-07: Metrics collection (no data loss).
func TestHealthMonitor_HM07_MetricsCollection(t *testing.T) {
collector := &mockCollector{
results: map[string]map[string]float64{
"comp-0": {"cpu": 25, "memory": 40},
},
}
hm := NewHealthMonitor(collector, 10)
hm.RegisterComponent(ComponentConfig{Name: "comp-0", Type: "go_binary"})
hm.collectMetrics(context.Background())
hm.mu.RLock()
comp := hm.components["comp-0"]
hm.mu.RUnlock()
if comp.Metrics["cpu"] != 25 {
t.Errorf("expected cpu=25, got %f", comp.Metrics["cpu"])
}
if comp.Metrics["memory"] != 40 {
t.Errorf("expected memory=40, got %f", comp.Metrics["memory"])
}
}
// HM-07b: Collection error increments consecutive failures.
func TestHealthMonitor_HM07b_CollectionError(t *testing.T) {
collector := &mockCollector{
errors: map[string]error{
"comp-0": fmt.Errorf("connection refused"),
},
}
hm := NewHealthMonitor(collector, 10)
hm.RegisterComponent(ComponentConfig{Name: "comp-0", Type: "go_binary"})
hm.collectMetrics(context.Background())
hm.mu.RLock()
comp := hm.components["comp-0"]
hm.mu.RUnlock()
if comp.Consecutive != 1 {
t.Errorf("expected 1 consecutive failure, got %d", comp.Consecutive)
}
}
// HM-08: Alert bus fan-out (non-blocking).
func TestHealthMonitor_HM08_AlertBusFanOut(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 5)
hm.RegisterComponent(ComponentConfig{
Name: "comp",
Type: "go_binary",
Thresholds: map[string]float64{"cpu": 50},
ThresholdIsMax: map[string]bool{"cpu": true},
})
// Fill alert bus.
for i := 0; i < 5; i++ {
hm.alertBus <- HealthAlert{Component: fmt.Sprintf("test-%d", i)}
}
// Emit one more — should be dropped (non-blocking).
hm.emitAlert(HealthAlert{Component: "overflow"})
// No panic = success.
}
// Test GetHealth returns a deep copy.
func TestHealthMonitor_GetHealthDeepCopy(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
hm.RegisterComponent(ComponentConfig{Name: "test", Type: "go_binary"})
hm.UpdateMetrics("test", map[string]float64{"cpu": 50})
health := hm.GetHealth()
health.Components[0].Metrics["cpu"] = 999
// Original should be unchanged.
hm.mu.RLock()
original := hm.components["test"].Metrics["cpu"]
hm.mu.RUnlock()
if original != 50 {
t.Errorf("deep copy failed: original modified to %f", original)
}
}
// Test threshold breach transitions status to DEGRADED then CRITICAL.
func TestHealthMonitor_StatusTransitions(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 100)
hm.RegisterComponent(ComponentConfig{
Name: "comp",
Type: "go_binary",
Thresholds: map[string]float64{"error_rate": 5},
ThresholdIsMax: map[string]bool{"error_rate": true},
})
// Breach once → DEGRADED.
hm.UpdateMetrics("comp", map[string]float64{"error_rate": 10})
hm.checkHealth()
hm.mu.RLock()
status := hm.components["comp"].Status
hm.mu.RUnlock()
if status != StatusDegraded {
t.Errorf("expected DEGRADED after 1 breach, got %s", status)
}
// Breach 3× → CRITICAL.
for i := 0; i < 3; i++ {
hm.checkHealth()
}
hm.mu.RLock()
status = hm.components["comp"].Status
hm.mu.RUnlock()
if status != StatusCritical {
t.Errorf("expected CRITICAL after repeated breaches, got %s", status)
}
}
// Test lower-bound threshold (ThresholdIsMax=false).
func TestHealthMonitor_LowerBoundThreshold(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 100)
hm.RegisterComponent(ComponentConfig{
Name: "immune",
Type: "c_kernel_module",
Thresholds: map[string]float64{"hooks_active": 10},
ThresholdIsMax: map[string]bool{"hooks_active": false},
})
// hooks_active = 5 (below threshold of 10) → warning.
hm.UpdateMetrics("immune", map[string]float64{"hooks_active": 5})
hm.checkHealth()
select {
case alert := <-hm.alertBus:
if alert.Component != "immune" || alert.Metric != "hooks_active" {
t.Errorf("unexpected alert: %+v", alert)
}
default:
t.Error("expected alert for hooks_active below threshold")
}
}
// Test ComponentCount.
func TestHealthMonitor_ComponentCount(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
if hm.ComponentCount() != 0 {
t.Error("expected 0 initially")
}
registerTestComponents(hm, 4)
if hm.ComponentCount() != 4 {
t.Errorf("expected 4, got %d", hm.ComponentCount())
}
}
// Test Start/Stop lifecycle.
func TestHealthMonitor_StartStop(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
registerTestComponents(hm, 2)
ctx, cancel := context.WithCancel(context.Background())
done := make(chan struct{})
go func() {
hm.Start(ctx)
close(done)
}()
// Let it run briefly.
time.Sleep(50 * time.Millisecond)
cancel()
select {
case <-done:
// Clean shutdown.
case <-time.After(time.Second):
t.Fatal("Start() did not return after context cancellation")
}
}
// --- Helpers ---
func registerTestComponents(hm *HealthMonitor, n int) {
for i := 0; i < n; i++ {
hm.RegisterComponent(ComponentConfig{
Name: fmt.Sprintf("comp-%d", i),
Type: "go_binary",
})
}
}

View file

@@ -0,0 +1,247 @@
package resilience
import (
"crypto/hmac"
"crypto/sha256"
"encoding/hex"
"fmt"
"io"
"log/slog"
"os"
"sync"
"time"
)
// IntegrityStatus represents the result of an integrity check.
type IntegrityStatus string
const (
IntegrityVerified IntegrityStatus = "VERIFIED"
IntegrityCompromised IntegrityStatus = "COMPROMISED"
IntegrityUnknown IntegrityStatus = "UNKNOWN"
)
// IntegrityReport is the full result of an integrity verification.
type IntegrityReport struct {
Overall IntegrityStatus `json:"overall"`
Timestamp time.Time `json:"timestamp"`
Binaries map[string]BinaryStatus `json:"binaries,omitempty"`
Chain *ChainStatus `json:"chain,omitempty"`
Configs map[string]ConfigStatus `json:"configs,omitempty"`
}
// BinaryStatus is the integrity status of a single binary.
type BinaryStatus struct {
Status IntegrityStatus `json:"status"`
Expected string `json:"expected"`
Current string `json:"current"`
}
// ChainStatus is the integrity status of the decision chain.
type ChainStatus struct {
Valid bool `json:"valid"`
Error string `json:"error,omitempty"`
BreakPoint int `json:"break_point,omitempty"`
Entries int `json:"entries"`
}
// ConfigStatus is the integrity status of a config file.
type ConfigStatus struct {
Valid bool `json:"valid"`
Error string `json:"error,omitempty"`
StoredHMAC string `json:"stored_hmac,omitempty"`
CurrentHMAC string `json:"current_hmac,omitempty"`
}
// IntegrityVerifier performs periodic integrity checks on binaries,
// decision chain, and config files.
type IntegrityVerifier struct {
mu sync.RWMutex
binaryHashes map[string]string // path → expected SHA-256
configPaths []string // config files to verify
hmacKey []byte // key for config HMAC-SHA256
chainPath string // path to decision chain log
logger *slog.Logger
lastReport *IntegrityReport
}
// NewIntegrityVerifier creates a new integrity verifier.
func NewIntegrityVerifier(hmacKey []byte) *IntegrityVerifier {
return &IntegrityVerifier{
binaryHashes: make(map[string]string),
hmacKey: hmacKey,
logger: slog.Default().With("component", "sarl-integrity"),
}
}
// RegisterBinary adds a binary with its expected SHA-256 hash.
func (iv *IntegrityVerifier) RegisterBinary(path, expectedHash string) {
iv.mu.Lock()
defer iv.mu.Unlock()
iv.binaryHashes[path] = expectedHash
}
// RegisterConfig adds a config file to verify.
func (iv *IntegrityVerifier) RegisterConfig(path string) {
iv.mu.Lock()
defer iv.mu.Unlock()
iv.configPaths = append(iv.configPaths, path)
}
// SetChainPath sets the decision chain log path.
func (iv *IntegrityVerifier) SetChainPath(path string) {
iv.mu.Lock()
defer iv.mu.Unlock()
iv.chainPath = path
}
// VerifyAll runs all integrity checks and returns a comprehensive report.
// Note: file I/O (binary hashing, config reading) is done WITHOUT holding
// the mutex to prevent thread starvation on slow storage.
func (iv *IntegrityVerifier) VerifyAll() IntegrityReport {
report := IntegrityReport{
Overall: IntegrityVerified,
Timestamp: time.Now(),
Binaries: make(map[string]BinaryStatus),
Configs: make(map[string]ConfigStatus),
}
// Snapshot config under lock, then release before I/O.
iv.mu.RLock()
binaryHashesCopy := make(map[string]string, len(iv.binaryHashes))
for k, v := range iv.binaryHashes {
binaryHashesCopy[k] = v
}
configPathsCopy := make([]string, len(iv.configPaths))
copy(configPathsCopy, iv.configPaths)
hmacKeyCopy := make([]byte, len(iv.hmacKey))
copy(hmacKeyCopy, iv.hmacKey)
chainPath := iv.chainPath
iv.mu.RUnlock()
// Check binaries (file I/O — no lock held).
for path, expected := range binaryHashesCopy {
status := iv.verifyBinary(path, expected)
report.Binaries[path] = status
if status.Status == IntegrityCompromised {
report.Overall = IntegrityCompromised
}
}
// Check configs (file I/O — no lock held).
for _, path := range configPathsCopy {
		status := iv.verifyConfigFile(path, hmacKeyCopy)
report.Configs[path] = status
if !status.Valid {
report.Overall = IntegrityCompromised
}
}
// Check decision chain (file I/O — no lock held).
if chainPath != "" {
chain := iv.verifyDecisionChain(chainPath)
report.Chain = &chain
if !chain.Valid {
report.Overall = IntegrityCompromised
}
}
iv.mu.Lock()
iv.lastReport = &report
iv.mu.Unlock()
if report.Overall == IntegrityCompromised {
iv.logger.Error("INTEGRITY COMPROMISED", "report", report)
} else {
iv.logger.Debug("integrity verified", "binaries", len(report.Binaries))
}
return report
}
// LastReport returns the most recent integrity report.
func (iv *IntegrityVerifier) LastReport() *IntegrityReport {
iv.mu.RLock()
defer iv.mu.RUnlock()
return iv.lastReport
}
// verifyBinary calculates SHA-256 of a file and compares to expected.
func (iv *IntegrityVerifier) verifyBinary(path, expected string) BinaryStatus {
current, err := fileSHA256(path)
if err != nil {
return BinaryStatus{
Status: IntegrityUnknown,
Expected: expected,
Current: fmt.Sprintf("error: %v", err),
}
}
if current != expected {
return BinaryStatus{
Status: IntegrityCompromised,
Expected: expected,
Current: current,
}
}
return BinaryStatus{
Status: IntegrityVerified,
Expected: expected,
Current: current,
}
}
// verifyConfigFile checks HMAC-SHA256 of a config file using the supplied key
// (the snapshot taken under lock in VerifyAll).
func (iv *IntegrityVerifier) verifyConfigFile(path string, hmacKey []byte) ConfigStatus {
	data, err := os.ReadFile(path)
	if err != nil {
		return ConfigStatus{Valid: false, Error: fmt.Sprintf("unreadable: %v", err)}
	}
	currentHMAC := computeHMAC(data, hmacKey)
// For now, we just verify the file is readable and compute HMAC.
// In production, the stored HMAC would be extracted from a sidecar file.
return ConfigStatus{
Valid: true,
CurrentHMAC: currentHMAC,
}
}
// verifyDecisionChain checks the decision chain log. Full hash-chain
// verification is not implemented yet; see the note inside.
func (iv *IntegrityVerifier) verifyDecisionChain(path string) ChainStatus {
_, err := os.Stat(path)
if err != nil {
if os.IsNotExist(err) {
return ChainStatus{Valid: true, Entries: 0} // No chain yet.
}
return ChainStatus{Valid: false, Error: fmt.Sprintf("unreadable: %v", err)}
}
// In a real implementation, we'd parse the chain entries and verify
// that each entry's hash includes the previous entry's hash.
// For now, verify the file exists and is readable.
return ChainStatus{Valid: true}
}
// fileSHA256 computes the SHA-256 hash of a file.
func fileSHA256(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
h := sha256.New()
if _, err := io.Copy(h, f); err != nil {
return "", err
}
return hex.EncodeToString(h.Sum(nil)), nil
}
// computeHMAC computes HMAC-SHA256 of data with the given key.
func computeHMAC(data, key []byte) string {
mac := hmac.New(sha256.New, key)
mac.Write(data)
return hex.EncodeToString(mac.Sum(nil))
}
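
// integrityWiringSketch is a minimal, illustrative sketch (it is not called
// anywhere) of how the verifier is wired at startup. The binary, config, and
// chain paths below are placeholders, and the HMAC key would normally come
// from a secret store rather than a literal.
func integrityWiringSketch() (*IntegrityVerifier, error) {
	// Record the hash of the binary as currently deployed. A hardened setup
	// would take the expected hash from a signed release manifest instead.
	expected, err := fileSHA256("/usr/local/bin/soc-ingest")
	if err != nil {
		return nil, err
	}
	iv := NewIntegrityVerifier([]byte("example-hmac-key"))
	iv.RegisterBinary("/usr/local/bin/soc-ingest", expected)
	iv.RegisterConfig("/etc/sentinel/config.yaml")
	iv.SetChainPath("/var/lib/sentinel/decisions.log")
	if report := iv.VerifyAll(); report.Overall == IntegrityCompromised {
		return nil, fmt.Errorf("integrity compromised at startup")
	}
	return iv, nil
}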

View file

@@ -0,0 +1,283 @@
// Package resilience implements the Sentinel Autonomous Resilience Layer (SARL).
//
// Five levels of autonomous self-recovery:
//
// L1 — Self-Monitoring: health checks, quorum, anomaly detection
// L2 — Self-Healing: restart, rollback, recovery strategies
// L3 — Self-Preservation: emergency modes (safe/lockdown/apoptosis)
// L4 — Immune Integration: behavioral anomaly detection
// L5 — Autonomous Recovery: playbooks for resurrection, consensus, crypto
package resilience
import (
"math"
"sync"
"time"
)
// MetricsDB provides an in-memory time-series store with ring buffers
// for each component/metric pair. Supports rolling baselines (mean/stddev)
// for Z-score anomaly detection.
type MetricsDB struct {
mu sync.RWMutex
series map[string]*RingBuffer // key = "component:metric"
window time.Duration // retention window (default 1h)
maxSize int // max data points per series
}
// DataPoint is a single timestamped metric value.
type DataPoint struct {
Timestamp time.Time `json:"timestamp"`
Value float64 `json:"value"`
}
// Baseline holds rolling statistics for anomaly detection.
type Baseline struct {
Mean float64 `json:"mean"`
StdDev float64 `json:"std_dev"`
Count int `json:"count"`
Min float64 `json:"min"`
Max float64 `json:"max"`
}
// RingBuffer is a fixed-size circular buffer for DataPoints.
type RingBuffer struct {
data []DataPoint
head int
count int
size int
}
// DefaultMetricsWindow is the default retention window (1 hour).
const DefaultMetricsWindow = 1 * time.Hour
// DefaultMetricsMaxSize is the default max points per series (1h / 10s = 360).
const DefaultMetricsMaxSize = 360
// NewMetricsDB creates a new in-memory time-series store.
func NewMetricsDB(window time.Duration, maxSize int) *MetricsDB {
if window <= 0 {
window = DefaultMetricsWindow
}
if maxSize <= 0 {
maxSize = DefaultMetricsMaxSize
}
return &MetricsDB{
series: make(map[string]*RingBuffer),
window: window,
maxSize: maxSize,
}
}
// AddDataPoint records a metric value for a component.
func (db *MetricsDB) AddDataPoint(component, metric string, value float64) {
key := component + ":" + metric
db.mu.Lock()
defer db.mu.Unlock()
rb, ok := db.series[key]
if !ok {
rb = newRingBuffer(db.maxSize)
db.series[key] = rb
}
rb.Add(DataPoint{Timestamp: time.Now(), Value: value})
}
// GetBaseline returns rolling mean/stddev for a component metric
// calculated over the specified window duration.
func (db *MetricsDB) GetBaseline(component, metric string, window time.Duration) Baseline {
key := component + ":" + metric
db.mu.RLock()
defer db.mu.RUnlock()
rb, ok := db.series[key]
if !ok {
return Baseline{}
}
cutoff := time.Now().Add(-window)
points := rb.After(cutoff)
if len(points) == 0 {
return Baseline{}
}
return calculateBaseline(points)
}
// GetRecent returns the most recent N data points for a component metric.
func (db *MetricsDB) GetRecent(component, metric string, n int) []DataPoint {
key := component + ":" + metric
db.mu.RLock()
defer db.mu.RUnlock()
rb, ok := db.series[key]
if !ok {
return nil
}
all := rb.All()
if len(all) <= n {
return all
}
return all[len(all)-n:]
}
// CalculateZScore returns the Z-score for a value against the baseline.
// Returns 0 if baseline has insufficient data or zero stddev.
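// For example, against a baseline with Mean=30 and StdDev=5, a value of 50
// yields Z = (50-30)/5 = 4.0.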
func CalculateZScore(value float64, baseline Baseline) float64 {
if baseline.Count < 10 || baseline.StdDev == 0 {
return 0
}
return (value - baseline.Mean) / baseline.StdDev
}
// IsAnomaly returns true if the Z-score exceeds the threshold (default 3.0).
func IsAnomaly(value float64, baseline Baseline, threshold float64) bool {
if threshold <= 0 {
threshold = 3.0
}
zscore := CalculateZScore(value, baseline)
return math.Abs(zscore) > threshold
}
// SeriesCount returns the number of tracked series.
func (db *MetricsDB) SeriesCount() int {
db.mu.RLock()
defer db.mu.RUnlock()
return len(db.series)
}
// Purge removes data points older than the retention window.
func (db *MetricsDB) Purge() int {
db.mu.Lock()
defer db.mu.Unlock()
cutoff := time.Now().Add(-db.window)
total := 0
for key, rb := range db.series {
removed := rb.RemoveBefore(cutoff)
total += removed
if rb.Len() == 0 {
delete(db.series, key)
}
}
return total
}
// --- RingBuffer implementation ---
func newRingBuffer(size int) *RingBuffer {
return &RingBuffer{
data: make([]DataPoint, size),
size: size,
}
}
// Add inserts a DataPoint, overwriting the oldest if full.
func (rb *RingBuffer) Add(dp DataPoint) {
rb.data[rb.head] = dp
rb.head = (rb.head + 1) % rb.size
if rb.count < rb.size {
rb.count++
}
}
// Len returns the number of data points in the buffer.
func (rb *RingBuffer) Len() int {
return rb.count
}
// All returns all data points in chronological order.
func (rb *RingBuffer) All() []DataPoint {
if rb.count == 0 {
return nil
}
result := make([]DataPoint, rb.count)
if rb.count < rb.size {
// Buffer not yet full — data starts at 0.
copy(result, rb.data[:rb.count])
} else {
// Buffer wrapped — oldest is at head.
n := copy(result, rb.data[rb.head:rb.size])
copy(result[n:], rb.data[:rb.head])
}
return result
}
// After returns points with timestamp after the cutoff.
func (rb *RingBuffer) After(cutoff time.Time) []DataPoint {
all := rb.All()
result := make([]DataPoint, 0, len(all))
for _, dp := range all {
if dp.Timestamp.After(cutoff) {
result = append(result, dp)
}
}
return result
}
// RemoveBefore removes data points before the cutoff by compacting.
// Returns the number of points removed.
func (rb *RingBuffer) RemoveBefore(cutoff time.Time) int {
all := rb.All()
kept := make([]DataPoint, 0, len(all))
for _, dp := range all {
if !dp.Timestamp.Before(cutoff) {
kept = append(kept, dp)
}
}
removed := len(all) - len(kept)
if removed == 0 {
return 0
}
// Rebuild the ring buffer with kept data.
rb.count = 0
rb.head = 0
for _, dp := range kept {
rb.Add(dp)
}
return removed
}
// --- Statistics ---
func calculateBaseline(points []DataPoint) Baseline {
n := len(points)
if n == 0 {
return Baseline{}
}
var sum, min, max float64
min = points[0].Value
max = points[0].Value
for _, p := range points {
sum += p.Value
if p.Value < min {
min = p.Value
}
if p.Value > max {
max = p.Value
}
}
mean := sum / float64(n)
var variance float64
for _, p := range points {
diff := p.Value - mean
variance += diff * diff
}
variance /= float64(n)
return Baseline{
Mean: mean,
StdDev: math.Sqrt(variance),
Count: n,
Min: min,
Max: max,
}
}
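
// anomalyCheckSketch is a minimal, illustrative sketch (it is not called
// anywhere) of how AddDataPoint, GetBaseline, CalculateZScore, and IsAnomaly
// fit together. The component and metric names are placeholders; 3.0 is the
// conventional Z-score threshold used elsewhere in this package.
func anomalyCheckSketch(db *MetricsDB, value float64) (zScore float64, anomalous bool) {
	db.AddDataPoint("soc-ingest", "cpu", value)
	baseline := db.GetBaseline("soc-ingest", "cpu", 15*time.Minute)
	return CalculateZScore(value, baseline), IsAnomaly(value, baseline, 3.0)
}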

View file

@@ -0,0 +1,290 @@
package resilience
import (
"fmt"
"log/slog"
"sync"
"time"
)
// EmergencyMode defines the system's emergency state.
type EmergencyMode string
const (
ModeNone EmergencyMode = "NONE"
ModeSafe EmergencyMode = "SAFE"
ModeLockdown EmergencyMode = "LOCKDOWN"
ModeApoptosis EmergencyMode = "APOPTOSIS"
)
// ModeActivation records when and why a mode was activated.
type ModeActivation struct {
Mode EmergencyMode `json:"mode"`
ActivatedAt time.Time `json:"activated_at"`
ActivatedBy string `json:"activated_by"` // "auto" or "architect:<name>"
Reason string `json:"reason"`
AutoExit bool `json:"auto_exit"`
AutoExitAt time.Time `json:"auto_exit_at,omitempty"`
}
// PreservationEvent is an audit log entry for preservation actions.
type PreservationEvent struct {
Timestamp time.Time `json:"timestamp"`
Mode EmergencyMode `json:"mode"`
Action string `json:"action"`
Detail string `json:"detail"`
Success bool `json:"success"`
Error string `json:"error,omitempty"`
}
// ModeActionFunc is a callback to perform mode-specific actions.
// Implementations handle the real system operations (network isolation, process freeze, etc.).
type ModeActionFunc func(mode EmergencyMode, action string, params map[string]interface{}) error
// PreservationEngine manages emergency modes (safe/lockdown/apoptosis).
type PreservationEngine struct {
mu sync.RWMutex
currentMode EmergencyMode
activation *ModeActivation
history []PreservationEvent
actionFn ModeActionFunc
integrityFn func() IntegrityReport // pluggable integrity check
logger *slog.Logger
}
// NewPreservationEngine creates a new preservation engine.
func NewPreservationEngine(actionFn ModeActionFunc) *PreservationEngine {
return &PreservationEngine{
currentMode: ModeNone,
history: make([]PreservationEvent, 0),
actionFn: actionFn,
logger: slog.Default().With("component", "sarl-preservation"),
}
}
// CurrentMode returns the active emergency mode.
func (pe *PreservationEngine) CurrentMode() EmergencyMode {
pe.mu.RLock()
defer pe.mu.RUnlock()
return pe.currentMode
}
// Activation returns the current mode activation details (nil if NONE).
func (pe *PreservationEngine) Activation() *ModeActivation {
pe.mu.RLock()
defer pe.mu.RUnlock()
if pe.activation == nil {
return nil
}
cp := *pe.activation
return &cp
}
// ActivateMode enters an emergency mode. Returns error if transition is invalid.
func (pe *PreservationEngine) ActivateMode(mode EmergencyMode, reason, activatedBy string) error {
pe.mu.Lock()
defer pe.mu.Unlock()
if mode == ModeNone {
return fmt.Errorf("use DeactivateMode to exit emergency mode")
}
// Validate transitions: can always escalate, can't downgrade.
if !pe.isValidTransition(pe.currentMode, mode) {
return fmt.Errorf("invalid transition: %s → %s", pe.currentMode, mode)
}
pe.logger.Warn("EMERGENCY MODE ACTIVATION",
"mode", mode,
"reason", reason,
"activated_by", activatedBy,
)
// Execute mode-specific actions.
actions := pe.actionsForMode(mode)
for _, action := range actions {
err := pe.executeAction(mode, action.name, action.params)
if err != nil {
pe.logger.Error("mode action failed",
"mode", mode,
"action", action.name,
"error", err,
)
			// Only apoptosis continues despite action errors; other modes abort.
if mode != ModeApoptosis {
return fmt.Errorf("failed to activate %s: action %s: %w", mode, action.name, err)
}
}
}
activation := &ModeActivation{
Mode: mode,
ActivatedAt: time.Now(),
ActivatedBy: activatedBy,
Reason: reason,
}
if mode == ModeSafe {
activation.AutoExit = true
activation.AutoExitAt = time.Now().Add(15 * time.Minute)
}
pe.currentMode = mode
pe.activation = activation
return nil
}
// DeactivateMode exits the current emergency mode and returns to NONE.
func (pe *PreservationEngine) DeactivateMode(deactivatedBy string) error {
pe.mu.Lock()
defer pe.mu.Unlock()
if pe.currentMode == ModeNone {
return nil
}
	// Safe and lockdown can be deactivated manually; apoptosis cannot be
	// deactivated at all and requires a rebuild.
if pe.currentMode == ModeApoptosis {
return fmt.Errorf("apoptosis mode cannot be deactivated — system rebuild required")
}
pe.logger.Info("EMERGENCY MODE DEACTIVATION",
"mode", pe.currentMode,
"deactivated_by", deactivatedBy,
)
pe.recordEvent(pe.currentMode, "deactivated",
fmt.Sprintf("deactivated by %s", deactivatedBy), true, "")
pe.currentMode = ModeNone
pe.activation = nil
return nil
}
// ShouldAutoExit checks if safe mode should auto-exit based on timer.
func (pe *PreservationEngine) ShouldAutoExit() bool {
pe.mu.RLock()
defer pe.mu.RUnlock()
if pe.currentMode != ModeSafe || pe.activation == nil {
return false
}
return pe.activation.AutoExit && time.Now().After(pe.activation.AutoExitAt)
}
// isValidTransition checks if a mode transition is allowed.
// Escalation order: NONE → SAFE → LOCKDOWN → APOPTOSIS.
func (pe *PreservationEngine) isValidTransition(from, to EmergencyMode) bool {
rank := map[EmergencyMode]int{
ModeNone: 0,
ModeSafe: 1,
ModeLockdown: 2,
ModeApoptosis: 3,
}
// Can always escalate or re-enter same mode.
return rank[to] >= rank[from]
}
type modeAction struct {
name string
params map[string]interface{}
}
// actionsForMode returns the actions to execute for a given mode.
func (pe *PreservationEngine) actionsForMode(mode EmergencyMode) []modeAction {
switch mode {
case ModeSafe:
return []modeAction{
{"disable_non_essential_services", map[string]interface{}{
"services": []string{"analytics", "reporting", "p2p_sync", "threat_intel_feeds"},
}},
{"enable_readonly_mode", map[string]interface{}{
"scope": []string{"event_ingest", "correlation", "dashboard_view"},
}},
{"preserve_all_logs", nil},
{"notify_architect", map[string]interface{}{"severity": "emergency"}},
{"increase_monitoring_frequency", map[string]interface{}{"interval": "5s"}},
}
case ModeLockdown:
return []modeAction{
{"isolate_from_network", map[string]interface{}{"scope": "all_external"}},
{"freeze_all_processes", nil},
{"capture_memory_dump", nil},
{"capture_disk_snapshot", nil},
{"trigger_immune_kernel_lock", map[string]interface{}{
"allow_syscalls": []string{"read", "write", "exit"},
}},
{"send_panic_alert", map[string]interface{}{
"channels": []string{"email", "sms", "slack", "pagerduty"},
}},
}
case ModeApoptosis:
return []modeAction{
{"graceful_shutdown", map[string]interface{}{"timeout": "30s", "drain_events": true}},
{"zero_sensitive_memory", map[string]interface{}{
"regions": []string{"keys", "certs", "tokens", "secrets"},
}},
{"preserve_forensic_evidence", nil},
{"notify_soc", map[string]interface{}{
"severity": "CRITICAL",
"message": "system self-terminated",
}},
{"secure_erase_temp_files", nil},
}
}
return nil
}
// executeAction runs a mode action and records the result.
func (pe *PreservationEngine) executeAction(mode EmergencyMode, name string, params map[string]interface{}) error {
err := pe.actionFn(mode, name, params)
success := err == nil
errStr := ""
if err != nil {
errStr = err.Error()
}
pe.recordEvent(mode, name, fmt.Sprintf("params: %v", params), success, errStr)
return err
}
// recordEvent appends to the audit history.
func (pe *PreservationEngine) recordEvent(mode EmergencyMode, action, detail string, success bool, errStr string) {
pe.history = append(pe.history, PreservationEvent{
Timestamp: time.Now(),
Mode: mode,
Action: action,
Detail: detail,
Success: success,
Error: errStr,
})
}
// History returns the preservation audit log.
func (pe *PreservationEngine) History() []PreservationEvent {
pe.mu.RLock()
defer pe.mu.RUnlock()
result := make([]PreservationEvent, len(pe.history))
copy(result, pe.history)
return result
}
// SetIntegrityCheck sets the pluggable integrity checker.
func (pe *PreservationEngine) SetIntegrityCheck(fn func() IntegrityReport) {
pe.mu.Lock()
defer pe.mu.Unlock()
pe.integrityFn = fn
}
// CheckIntegrity runs the pluggable integrity check and returns the report.
func (pe *PreservationEngine) CheckIntegrity() IntegrityReport {
pe.mu.RLock()
fn := pe.integrityFn
pe.mu.RUnlock()
if fn == nil {
return IntegrityReport{Overall: IntegrityVerified, Timestamp: time.Now()}
}
return fn()
}
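
// preservationWiringSketch is a minimal, illustrative sketch (it is not called
// anywhere) of how a caller constructs the engine with a ModeActionFunc and
// enters safe mode. The reason strings are placeholders; a real action
// function would perform the actual system operations.
func preservationWiringSketch() error {
	pe := NewPreservationEngine(func(mode EmergencyMode, action string, params map[string]interface{}) error {
		// A real implementation would isolate the network, freeze processes, etc.
		slog.Info("preservation action", "mode", mode, "action", action, "params", params)
		return nil
	})
	if err := pe.ActivateMode(ModeSafe, "quorum lost", "auto"); err != nil {
		return err
	}
	// A periodic task would later check the safe-mode timer and exit cleanly.
	if pe.ShouldAutoExit() {
		return pe.DeactivateMode("auto")
	}
	return nil
}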

View file

@@ -0,0 +1,439 @@
package resilience
import (
"crypto/sha256"
"encoding/hex"
"os"
"path/filepath"
"testing"
"time"
)
// --- Mock action function ---
type modeActionLog struct {
calls []struct {
mode EmergencyMode
action string
}
failAction string // if set, this action will fail
}
func newModeActionLog() *modeActionLog {
return &modeActionLog{}
}
func (m *modeActionLog) execute(mode EmergencyMode, action string, _ map[string]interface{}) error {
m.calls = append(m.calls, struct {
mode EmergencyMode
action string
}{mode, action})
if m.failAction == action {
return errActionFailed
}
return nil
}
var errActionFailed = &actionError{"simulated failure"}
type actionError struct{ msg string }
func (e *actionError) Error() string { return e.msg }
// --- Preservation Engine Tests ---
// SP-01: Safe mode activation.
func TestPreservation_SP01_SafeMode(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
err := pe.ActivateMode(ModeSafe, "quorum lost (3/6 offline)", "auto")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if pe.CurrentMode() != ModeSafe {
t.Errorf("expected SAFE, got %s", pe.CurrentMode())
}
activation := pe.Activation()
if activation == nil {
t.Fatal("expected activation details")
}
if !activation.AutoExit {
t.Error("safe mode should have auto-exit enabled")
}
// Should have executed safe mode actions.
if len(log.calls) == 0 {
t.Error("expected mode actions to be executed")
}
// First action should be disable_non_essential_services.
if log.calls[0].action != "disable_non_essential_services" {
t.Errorf("expected first action disable_non_essential_services, got %s", log.calls[0].action)
}
}
// SP-02: Lockdown mode activation.
func TestPreservation_SP02_LockdownMode(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
err := pe.ActivateMode(ModeLockdown, "binary tampering detected", "auto")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if pe.CurrentMode() != ModeLockdown {
t.Errorf("expected LOCKDOWN, got %s", pe.CurrentMode())
}
// Should have network isolation action.
foundIsolate := false
for _, c := range log.calls {
if c.action == "isolate_from_network" {
foundIsolate = true
}
}
if !foundIsolate {
t.Error("expected isolate_from_network in lockdown actions")
}
}
// SP-03: Apoptosis mode activation.
func TestPreservation_SP03_ApoptosisMode(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
err := pe.ActivateMode(ModeApoptosis, "rootkit detected", "architect:admin")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if pe.CurrentMode() != ModeApoptosis {
t.Errorf("expected APOPTOSIS, got %s", pe.CurrentMode())
}
// Should have graceful_shutdown action.
foundShutdown := false
for _, c := range log.calls {
if c.action == "graceful_shutdown" {
foundShutdown = true
}
}
if !foundShutdown {
t.Error("expected graceful_shutdown in apoptosis actions")
}
// Cannot deactivate apoptosis.
err = pe.DeactivateMode("architect:admin")
if err == nil {
t.Error("expected error deactivating apoptosis")
}
}
// SP-04: Invalid transition (downgrade).
func TestPreservation_SP04_InvalidTransition(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
pe.ActivateMode(ModeLockdown, "test", "auto")
// Can't downgrade from LOCKDOWN to SAFE.
err := pe.ActivateMode(ModeSafe, "test downgrade", "auto")
if err == nil {
t.Error("expected error on downgrade from LOCKDOWN to SAFE")
}
}
// SP-05: Escalation (SAFE → LOCKDOWN → APOPTOSIS).
func TestPreservation_SP05_Escalation(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
pe.ActivateMode(ModeSafe, "quorum lost", "auto")
if pe.CurrentMode() != ModeSafe {
t.Fatal("expected SAFE")
}
pe.ActivateMode(ModeLockdown, "compromise detected", "auto")
if pe.CurrentMode() != ModeLockdown {
t.Fatal("expected LOCKDOWN")
}
pe.ActivateMode(ModeApoptosis, "rootkit", "auto")
if pe.CurrentMode() != ModeApoptosis {
t.Fatal("expected APOPTOSIS")
}
}
// SP-06: Safe mode auto-exit.
func TestPreservation_SP06_AutoExit(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
pe.ActivateMode(ModeSafe, "test", "auto")
// Not yet time.
if pe.ShouldAutoExit() {
t.Error("should not auto-exit immediately")
}
// Fast-forward activation's auto_exit_at.
pe.mu.Lock()
pe.activation.AutoExitAt = time.Now().Add(-1 * time.Second)
pe.mu.Unlock()
if !pe.ShouldAutoExit() {
t.Error("should auto-exit after timer expired")
}
}
// SP-07: Manual deactivation of safe mode.
func TestPreservation_SP07_ManualDeactivate(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
pe.ActivateMode(ModeSafe, "test", "auto")
err := pe.DeactivateMode("architect:admin")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if pe.CurrentMode() != ModeNone {
t.Errorf("expected NONE, got %s", pe.CurrentMode())
}
}
// SP-08: Lockdown deactivation.
func TestPreservation_SP08_LockdownDeactivate(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
pe.ActivateMode(ModeLockdown, "test", "auto")
err := pe.DeactivateMode("architect:admin")
if err != nil {
t.Fatalf("lockdown deactivation should succeed: %v", err)
}
}
// SP-09: History audit log.
func TestPreservation_SP09_AuditHistory(t *testing.T) {
log := newModeActionLog()
pe := NewPreservationEngine(log.execute)
pe.ActivateMode(ModeSafe, "test", "auto")
pe.DeactivateMode("admin")
history := pe.History()
if len(history) == 0 {
t.Error("expected audit history entries")
}
// Last entry should be deactivation.
last := history[len(history)-1]
if last.Action != "deactivated" {
t.Errorf("expected deactivated, got %s", last.Action)
}
}
// SP-10: Action failure in non-apoptosis mode aborts.
func TestPreservation_SP10_ActionFailure(t *testing.T) {
log := newModeActionLog()
log.failAction = "disable_non_essential_services"
pe := NewPreservationEngine(log.execute)
err := pe.ActivateMode(ModeSafe, "test", "auto")
if err == nil {
t.Error("expected error when safe mode action fails")
}
// Mode should not have changed due to failure.
if pe.CurrentMode() != ModeNone {
t.Errorf("expected NONE after failed activation, got %s", pe.CurrentMode())
}
}
// SP-10b: Action failure in apoptosis mode continues.
func TestPreservation_SP10b_ApoptosisActionFailure(t *testing.T) {
log := newModeActionLog()
log.failAction = "graceful_shutdown"
pe := NewPreservationEngine(log.execute)
// Apoptosis should continue despite action failures.
err := pe.ActivateMode(ModeApoptosis, "rootkit", "auto")
if err != nil {
t.Fatalf("apoptosis should not fail on action errors: %v", err)
}
if pe.CurrentMode() != ModeApoptosis {
t.Errorf("expected APOPTOSIS, got %s", pe.CurrentMode())
}
}
// Test ModeNone activation rejected.
func TestPreservation_ModeNoneRejected(t *testing.T) {
pe := NewPreservationEngine(func(_ EmergencyMode, _ string, _ map[string]interface{}) error { return nil })
err := pe.ActivateMode(ModeNone, "test", "auto")
if err == nil {
t.Error("expected error activating ModeNone")
}
}
// Test deactivate when already NONE.
func TestPreservation_DeactivateNone(t *testing.T) {
pe := NewPreservationEngine(func(_ EmergencyMode, _ string, _ map[string]interface{}) error { return nil })
err := pe.DeactivateMode("admin")
if err != nil {
t.Errorf("deactivating NONE should be no-op: %v", err)
}
}
// Test ShouldAutoExit when not in safe mode.
func TestPreservation_AutoExitNotSafe(t *testing.T) {
pe := NewPreservationEngine(func(_ EmergencyMode, _ string, _ map[string]interface{}) error { return nil })
if pe.ShouldAutoExit() {
t.Error("should not auto-exit when mode is NONE")
}
}
// --- Integrity Verifier Tests ---
// SP-04 (per spec): binary integrity check detects a hash mismatch.
func TestIntegrity_BinaryMismatch(t *testing.T) {
tmpDir := t.TempDir()
binPath := filepath.Join(tmpDir, "test-binary")
os.WriteFile(binPath, []byte("original content"), 0o644)
// Calculate correct hash.
h := sha256.Sum256([]byte("original content"))
correctHash := hex.EncodeToString(h[:])
iv := NewIntegrityVerifier([]byte("test-key"))
iv.RegisterBinary(binPath, correctHash)
// Verify (should pass).
report := iv.VerifyAll()
if report.Overall != IntegrityVerified {
t.Errorf("expected VERIFIED, got %s", report.Overall)
}
// Tamper with the binary.
os.WriteFile(binPath, []byte("tampered content"), 0o644)
// Verify (should fail).
report = iv.VerifyAll()
if report.Overall != IntegrityCompromised {
t.Errorf("expected COMPROMISED, got %s", report.Overall)
}
bs := report.Binaries[binPath]
if bs.Status != IntegrityCompromised {
t.Errorf("expected binary COMPROMISED, got %s", bs.Status)
}
}
// Binary not found.
func TestIntegrity_BinaryNotFound(t *testing.T) {
iv := NewIntegrityVerifier([]byte("test-key"))
iv.RegisterBinary("/nonexistent/binary", "abc123")
report := iv.VerifyAll()
bs := report.Binaries["/nonexistent/binary"]
if bs.Status != IntegrityUnknown {
t.Errorf("expected UNKNOWN for missing binary, got %s", bs.Status)
}
}
// Config HMAC computation.
func TestIntegrity_ConfigHMAC(t *testing.T) {
tmpDir := t.TempDir()
cfgPath := filepath.Join(tmpDir, "config.yaml")
os.WriteFile(cfgPath, []byte("server:\n port: 8080"), 0o644)
iv := NewIntegrityVerifier([]byte("hmac-key"))
iv.RegisterConfig(cfgPath)
report := iv.VerifyAll()
cs := report.Configs[cfgPath]
if !cs.Valid {
t.Errorf("expected valid config, got error: %s", cs.Error)
}
if cs.CurrentHMAC == "" {
t.Error("expected non-empty HMAC")
}
}
// Config file unreadable.
func TestIntegrity_ConfigUnreadable(t *testing.T) {
iv := NewIntegrityVerifier([]byte("key"))
iv.RegisterConfig("/nonexistent/config.yaml")
report := iv.VerifyAll()
cs := report.Configs["/nonexistent/config.yaml"]
if cs.Valid {
t.Error("expected invalid for unreadable config")
}
}
// Decision chain — file does not exist (OK, no chain yet).
func TestIntegrity_ChainNotExist(t *testing.T) {
iv := NewIntegrityVerifier([]byte("key"))
iv.SetChainPath("/nonexistent/decisions.log")
report := iv.VerifyAll()
if report.Chain == nil {
t.Fatal("expected chain status")
}
if !report.Chain.Valid {
t.Error("nonexistent chain should be valid (no entries)")
}
}
// Decision chain — file exists.
func TestIntegrity_ChainExists(t *testing.T) {
tmpDir := t.TempDir()
chainPath := filepath.Join(tmpDir, "decisions.log")
os.WriteFile(chainPath, []byte("entry1\nentry2\n"), 0o644)
iv := NewIntegrityVerifier([]byte("key"))
iv.SetChainPath(chainPath)
report := iv.VerifyAll()
if report.Chain == nil {
t.Fatal("expected chain status")
}
if !report.Chain.Valid {
t.Error("expected valid chain")
}
}
// LastReport.
func TestIntegrity_LastReport(t *testing.T) {
iv := NewIntegrityVerifier([]byte("key"))
if iv.LastReport() != nil {
t.Error("expected nil before first verify")
}
iv.VerifyAll()
if iv.LastReport() == nil {
t.Error("expected report after verify")
}
}
// Pluggable integrity check in PreservationEngine.
func TestPreservation_IntegrityCheck(t *testing.T) {
pe := NewPreservationEngine(func(_ EmergencyMode, _ string, _ map[string]interface{}) error { return nil })
// Default: no integrity fn → VERIFIED.
report := pe.CheckIntegrity()
if report.Overall != IntegrityVerified {
t.Errorf("expected VERIFIED, got %s", report.Overall)
}
// Set custom checker.
pe.SetIntegrityCheck(func() IntegrityReport {
return IntegrityReport{Overall: IntegrityCompromised, Timestamp: time.Now()}
})
report = pe.CheckIntegrity()
if report.Overall != IntegrityCompromised {
t.Errorf("expected COMPROMISED from custom checker, got %s", report.Overall)
}
}

View file

@@ -0,0 +1,398 @@
package resilience
import (
"context"
"fmt"
"log/slog"
"sync"
"time"
)
// PlaybookStatus tracks the state of a running playbook.
type PlaybookStatus string
const (
PlaybookPending PlaybookStatus = "PENDING"
PlaybookRunning PlaybookStatus = "RUNNING"
PlaybookSucceeded PlaybookStatus = "SUCCEEDED"
PlaybookFailed PlaybookStatus = "FAILED"
PlaybookRolledBack PlaybookStatus = "ROLLED_BACK"
)
// PlaybookStep is a single step in a recovery playbook.
type PlaybookStep struct {
ID string `json:"id"`
Name string `json:"name"`
Type string `json:"type"` // shell, api, consensus, crypto, systemd, http, prometheus
Timeout time.Duration `json:"timeout"`
Retries int `json:"retries"`
Params map[string]interface{} `json:"params,omitempty"`
OnError string `json:"on_error"` // abort, continue, rollback
Condition string `json:"condition,omitempty"` // prerequisite condition
}
// Playbook defines a complete recovery procedure.
type Playbook struct {
ID string `json:"id"`
Name string `json:"name"`
Version string `json:"version"`
TriggerMetric string `json:"trigger_metric"`
TriggerSeverity string `json:"trigger_severity"`
DiagnosisChecks []PlaybookStep `json:"diagnosis_checks"`
Actions []PlaybookStep `json:"actions"`
RollbackActions []PlaybookStep `json:"rollback_actions"`
SuccessCriteria []string `json:"success_criteria"`
}
// PlaybookExecution tracks a single playbook run.
type PlaybookExecution struct {
ID string `json:"id"`
PlaybookID string `json:"playbook_id"`
Component string `json:"component"`
Status PlaybookStatus `json:"status"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at,omitempty"`
StepsRun []StepResult `json:"steps_run"`
Error string `json:"error,omitempty"`
}
// StepResult records the execution of a single playbook step.
type StepResult struct {
StepID string `json:"step_id"`
StepName string `json:"step_name"`
Success bool `json:"success"`
Duration time.Duration `json:"duration"`
Output string `json:"output,omitempty"`
Error string `json:"error,omitempty"`
}
// PlaybookExecutorFunc runs a single playbook step.
type PlaybookExecutorFunc func(ctx context.Context, step PlaybookStep, component string) (string, error)
// RecoveryPlaybookEngine manages and executes recovery playbooks.
type RecoveryPlaybookEngine struct {
mu sync.RWMutex
playbooks map[string]*Playbook
executions []*PlaybookExecution
execCount int64
executor PlaybookExecutorFunc
logger *slog.Logger
}
// NewRecoveryPlaybookEngine creates a new playbook engine.
func NewRecoveryPlaybookEngine(executor PlaybookExecutorFunc) *RecoveryPlaybookEngine {
return &RecoveryPlaybookEngine{
playbooks: make(map[string]*Playbook),
executions: make([]*PlaybookExecution, 0),
executor: executor,
logger: slog.Default().With("component", "sarl-recovery-playbooks"),
}
}
// RegisterPlaybook adds a playbook to the engine.
func (rpe *RecoveryPlaybookEngine) RegisterPlaybook(pb Playbook) {
rpe.mu.Lock()
defer rpe.mu.Unlock()
rpe.playbooks[pb.ID] = &pb
rpe.logger.Info("playbook registered", "id", pb.ID, "name", pb.Name)
}
// Execute runs a playbook for a given component. Returns the execution ID.
func (rpe *RecoveryPlaybookEngine) Execute(ctx context.Context, playbookID, component string) (string, error) {
rpe.mu.Lock()
pb, ok := rpe.playbooks[playbookID]
if !ok {
rpe.mu.Unlock()
return "", fmt.Errorf("playbook %s not found", playbookID)
}
rpe.execCount++
exec := &PlaybookExecution{
ID: fmt.Sprintf("exec-%d", rpe.execCount),
PlaybookID: playbookID,
Component: component,
Status: PlaybookRunning,
StartedAt: time.Now(),
StepsRun: make([]StepResult, 0),
}
rpe.executions = append(rpe.executions, exec)
rpe.mu.Unlock()
rpe.logger.Info("playbook execution started",
"exec_id", exec.ID,
"playbook", pb.Name,
"component", component,
)
// Phase 1: Diagnosis checks.
for _, check := range pb.DiagnosisChecks {
result := rpe.runStep(ctx, check, component)
exec.StepsRun = append(exec.StepsRun, result)
if !result.Success {
rpe.logger.Warn("diagnosis check failed",
"step", check.ID,
"error", result.Error,
)
}
}
// Phase 2: Execute recovery actions.
var execErr error
for _, action := range pb.Actions {
result := rpe.runStep(ctx, action, component)
exec.StepsRun = append(exec.StepsRun, result)
if !result.Success {
switch action.OnError {
case "continue":
continue
case "rollback":
execErr = fmt.Errorf("step %s failed (rollback): %s", action.ID, result.Error)
default: // "abort"
execErr = fmt.Errorf("step %s failed: %s", action.ID, result.Error)
}
break
}
}
// Phase 3: Handle result.
if execErr != nil {
rpe.logger.Error("playbook failed, executing rollback",
"exec_id", exec.ID,
"error", execErr,
)
// Execute rollback.
for _, rb := range pb.RollbackActions {
result := rpe.runStep(ctx, rb, component)
exec.StepsRun = append(exec.StepsRun, result)
}
exec.Status = PlaybookRolledBack
exec.Error = execErr.Error()
} else {
exec.Status = PlaybookSucceeded
rpe.logger.Info("playbook succeeded",
"exec_id", exec.ID,
"component", component,
"duration", time.Since(exec.StartedAt),
)
}
exec.CompletedAt = time.Now()
return exec.ID, execErr
}
// runStep executes a single step with timeout and retries. step.Retries is
// treated as the total number of attempts (minimum 1).
func (rpe *RecoveryPlaybookEngine) runStep(ctx context.Context, step PlaybookStep, component string) StepResult {
start := time.Now()
result := StepResult{
StepID: step.ID,
StepName: step.Name,
}
retries := step.Retries
if retries <= 0 {
retries = 1
}
var lastErr error
for attempt := 0; attempt < retries; attempt++ {
stepCtx := ctx
var cancel context.CancelFunc
if step.Timeout > 0 {
stepCtx, cancel = context.WithTimeout(ctx, step.Timeout)
}
output, err := rpe.executor(stepCtx, step, component)
if cancel != nil {
cancel()
}
if err == nil {
result.Success = true
result.Output = output
result.Duration = time.Since(start)
return result
}
lastErr = err
if attempt < retries-1 {
rpe.logger.Warn("step retry",
"step", step.ID,
"attempt", attempt+1,
"error", err,
)
}
}
result.Success = false
result.Error = lastErr.Error()
result.Duration = time.Since(start)
return result
}
// GetExecution returns a playbook execution by ID.
// Returns a deep copy to prevent data races with the execution goroutine.
func (rpe *RecoveryPlaybookEngine) GetExecution(id string) (*PlaybookExecution, bool) {
rpe.mu.RLock()
defer rpe.mu.RUnlock()
for _, exec := range rpe.executions {
if exec.ID == id {
cp := *exec
cp.StepsRun = make([]StepResult, len(exec.StepsRun))
copy(cp.StepsRun, exec.StepsRun)
return &cp, true
}
}
return nil, false
}
// RecentExecutions returns the last N executions.
// Returns deep copies to prevent data races with the execution goroutine.
func (rpe *RecoveryPlaybookEngine) RecentExecutions(n int) []PlaybookExecution {
rpe.mu.RLock()
defer rpe.mu.RUnlock()
total := len(rpe.executions)
if total == 0 {
return nil
}
start := total - n
if start < 0 {
start = 0
}
result := make([]PlaybookExecution, 0, n)
for i := start; i < total; i++ {
cp := *rpe.executions[i]
cp.StepsRun = make([]StepResult, len(rpe.executions[i].StepsRun))
copy(cp.StepsRun, rpe.executions[i].StepsRun)
result = append(result, cp)
}
return result
}
// PlaybookCount returns the number of registered playbooks.
func (rpe *RecoveryPlaybookEngine) PlaybookCount() int {
rpe.mu.RLock()
defer rpe.mu.RUnlock()
return len(rpe.playbooks)
}
// --- Built-in playbooks per spec §7.1 ---
// DefaultPlaybooks returns the 3 built-in recovery playbooks.
func DefaultPlaybooks() []Playbook {
return []Playbook{
ComponentResurrectionPlaybook(),
ConsensusRecoveryPlaybook(),
CryptoRotationPlaybook(),
}
}
// ComponentResurrectionPlaybook per spec §7.1.1.
func ComponentResurrectionPlaybook() Playbook {
return Playbook{
ID: "component-resurrection",
Name: "Component Resurrection",
Version: "1.0",
TriggerMetric: "component_offline",
TriggerSeverity: "CRITICAL",
DiagnosisChecks: []PlaybookStep{
{ID: "diag-process", Name: "Check process exists", Type: "shell", Timeout: 5 * time.Second},
{ID: "diag-crashes", Name: "Check recent crashes", Type: "shell", Timeout: 5 * time.Second},
{ID: "diag-resources", Name: "Check resource exhaustion", Type: "prometheus", Timeout: 5 * time.Second},
{ID: "diag-deps", Name: "Check dependency health", Type: "http", Timeout: 10 * time.Second},
},
Actions: []PlaybookStep{
{ID: "capture-forensics", Name: "Capture forensics", Type: "shell", Timeout: 30 * time.Second, OnError: "continue"},
{ID: "clear-resources", Name: "Clear temp resources", Type: "shell", Timeout: 10 * time.Second, OnError: "continue"},
{ID: "restart-component", Name: "Restart component", Type: "systemd", Timeout: 60 * time.Second, OnError: "abort"},
{ID: "verify-health", Name: "Verify health", Type: "http", Timeout: 30 * time.Second, Retries: 3, OnError: "abort"},
{ID: "verify-metrics", Name: "Verify metrics", Type: "prometheus", Timeout: 30 * time.Second, OnError: "continue"},
{ID: "notify-success", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
},
RollbackActions: []PlaybookStep{
{ID: "rb-safe-mode", Name: "Enter safe mode", Type: "api", Timeout: 10 * time.Second},
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
},
SuccessCriteria: []string{
"component_status == HEALTHY",
"health_check_passed == true",
"no_crashes_for_5min == true",
},
}
}
// ConsensusRecoveryPlaybook per spec §7.1.2.
func ConsensusRecoveryPlaybook() Playbook {
return Playbook{
ID: "consensus-recovery",
Name: "Distributed Consensus Recovery",
Version: "1.0",
TriggerMetric: "split_brain",
TriggerSeverity: "CRITICAL",
DiagnosisChecks: []PlaybookStep{
{ID: "diag-peers", Name: "Check peer connectivity", Type: "api", Timeout: 10 * time.Second},
{ID: "diag-sync", Name: "Check sync status", Type: "api", Timeout: 10 * time.Second},
{ID: "diag-genome", Name: "Verify genome", Type: "api", Timeout: 5 * time.Second},
},
Actions: []PlaybookStep{
{ID: "pause-writes", Name: "Pause all writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
{ID: "elect-leader", Name: "Elect leader (Raft)", Type: "consensus", Timeout: 60 * time.Second, OnError: "abort"},
{ID: "sync-state", Name: "Sync state from leader", Type: "api", Timeout: 300 * time.Second, OnError: "rollback"},
{ID: "verify-consistency", Name: "Verify consistency", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
{ID: "resume-writes", Name: "Resume writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
{ID: "notify-cluster", Name: "Notify cluster", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
},
RollbackActions: []PlaybookStep{
{ID: "rb-readonly", Name: "Maintain readonly", Type: "api", Timeout: 10 * time.Second},
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
},
SuccessCriteria: []string{
"leader_elected == true",
"state_synced == true",
"consistency_verified == true",
"writes_resumed == true",
},
}
}
// CryptoRotationPlaybook per spec §7.1.3.
func CryptoRotationPlaybook() Playbook {
return Playbook{
ID: "crypto-rotation",
Name: "Cryptographic Key Rotation",
Version: "1.0",
TriggerMetric: "key_compromise",
TriggerSeverity: "HIGH",
DiagnosisChecks: []PlaybookStep{
{ID: "diag-key-age", Name: "Check key age", Type: "crypto", Timeout: 5 * time.Second},
{ID: "diag-usage", Name: "Check key usage anomaly", Type: "prometheus", Timeout: 5 * time.Second},
{ID: "diag-tpm", Name: "Check TPM health", Type: "shell", Timeout: 5 * time.Second},
},
Actions: []PlaybookStep{
{ID: "gen-keys", Name: "Generate new keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "abort",
Params: map[string]interface{}{"algorithm": "ECDSA-P256"},
},
{ID: "rotate-certs", Name: "Rotate mTLS certs", Type: "crypto", Timeout: 120 * time.Second, OnError: "rollback"},
{ID: "resign-chain", Name: "Re-sign decision chain", Type: "crypto", Timeout: 300 * time.Second, OnError: "continue"},
{ID: "verify-peers", Name: "Verify peer certs", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
{ID: "revoke-old", Name: "Revoke old keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "continue"},
{ID: "notify-soc", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
},
RollbackActions: []PlaybookStep{
{ID: "rb-revert-keys", Name: "Revert to previous keys", Type: "crypto", Timeout: 30 * time.Second},
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
},
SuccessCriteria: []string{
"new_keys_generated == true",
"certs_distributed == true",
"peers_verified == true",
"old_keys_revoked == true",
},
}
}
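
// playbookWiringSketch is a minimal, illustrative sketch (it is not called
// anywhere) of how the engine is constructed with an executor, loaded with the
// built-in playbooks, and run against a failed component. The executor body is
// a stub; a real one would dispatch on step.Type (shell, systemd, http, ...).
func playbookWiringSketch(ctx context.Context) error {
	rpe := NewRecoveryPlaybookEngine(func(ctx context.Context, step PlaybookStep, component string) (string, error) {
		return "ok", nil // stub: pretend every step succeeds
	})
	for _, pb := range DefaultPlaybooks() {
		rpe.RegisterPlaybook(pb)
	}
	execID, err := rpe.Execute(ctx, "component-resurrection", "soc-ingest")
	if err != nil {
		return err
	}
	if exec, ok := rpe.GetExecution(execID); !ok || exec.Status != PlaybookSucceeded {
		return fmt.Errorf("recovery %s did not succeed", execID)
	}
	return nil
}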

View file

@@ -0,0 +1,318 @@
package resilience
import (
"context"
"fmt"
"testing"
"time"
)
// --- Mock playbook executor ---
type mockPlaybookExecutor struct {
failSteps map[string]bool
callCount int
}
func newMockPlaybookExecutor() *mockPlaybookExecutor {
return &mockPlaybookExecutor{failSteps: make(map[string]bool)}
}
func (m *mockPlaybookExecutor) execute(_ context.Context, step PlaybookStep, _ string) (string, error) {
m.callCount++
if m.failSteps[step.ID] {
return "", fmt.Errorf("step %s failed", step.ID)
}
return fmt.Sprintf("step %s completed", step.ID), nil
}
// --- Recovery Playbook Tests ---
// AR-01: Component resurrection (success).
func TestPlaybook_AR01_ResurrectionSuccess(t *testing.T) {
mock := newMockPlaybookExecutor()
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
execID, err := rpe.Execute(context.Background(), "component-resurrection", "soc-ingest")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
exec, ok := rpe.GetExecution(execID)
if !ok {
t.Fatal("execution not found")
}
if exec.Status != PlaybookSucceeded {
t.Errorf("expected SUCCEEDED, got %s", exec.Status)
}
if len(exec.StepsRun) == 0 {
t.Error("expected steps to be recorded")
}
}

// AR-02: Component resurrection (failure → rollback).
func TestPlaybook_AR02_ResurrectionFailure(t *testing.T) {
mock := newMockPlaybookExecutor()
mock.failSteps["restart-component"] = true
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
_, err := rpe.Execute(context.Background(), "component-resurrection", "soc-ingest")
if err == nil {
t.Fatal("expected error")
}
execs := rpe.RecentExecutions(10)
if len(execs) == 0 {
t.Fatal("expected execution")
}
if execs[0].Status != PlaybookRolledBack {
t.Errorf("expected ROLLED_BACK, got %s", execs[0].Status)
}
}

// AR-03: Consensus recovery (success).
func TestPlaybook_AR03_ConsensusSuccess(t *testing.T) {
mock := newMockPlaybookExecutor()
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ConsensusRecoveryPlaybook())
_, err := rpe.Execute(context.Background(), "consensus-recovery", "cluster")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
}

// AR-04: Consensus recovery (failure → readonly maintained).
func TestPlaybook_AR04_ConsensusFailure(t *testing.T) {
mock := newMockPlaybookExecutor()
mock.failSteps["elect-leader"] = true
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ConsensusRecoveryPlaybook())
_, err := rpe.Execute(context.Background(), "consensus-recovery", "cluster")
if err == nil {
t.Fatal("expected error")
}
execs := rpe.RecentExecutions(10)
if execs[0].Status != PlaybookRolledBack {
t.Errorf("expected ROLLED_BACK, got %s", execs[0].Status)
}
}

// AR-05: Crypto key rotation (success).
func TestPlaybook_AR05_CryptoSuccess(t *testing.T) {
mock := newMockPlaybookExecutor()
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(CryptoRotationPlaybook())
_, err := rpe.Execute(context.Background(), "crypto-rotation", "system")
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
}

// AR-06: Crypto rotation (emergency — cert rotation fails → rollback).
func TestPlaybook_AR06_CryptoRollback(t *testing.T) {
mock := newMockPlaybookExecutor()
mock.failSteps["rotate-certs"] = true
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(CryptoRotationPlaybook())
_, err := rpe.Execute(context.Background(), "crypto-rotation", "system")
if err == nil {
t.Fatal("expected error on cert rotation failure")
}
execs := rpe.RecentExecutions(10)
// Should have run rollback (revert keys).
found := false
for _, s := range execs[0].StepsRun {
if s.StepID == "rb-revert-keys" {
found = true
}
}
if !found {
t.Error("expected rollback step rb-revert-keys")
}
}

// AR-07: Forensic capture (all steps recorded).
func TestPlaybook_AR07_ForensicCapture(t *testing.T) {
mock := newMockPlaybookExecutor()
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
execID, _ := rpe.Execute(context.Background(), "component-resurrection", "comp")
exec, _ := rpe.GetExecution(execID)
for _, step := range exec.StepsRun {
if step.StepID == "" {
t.Error("step missing ID")
}
if step.StepName == "" {
t.Errorf("step %s has empty name", step.StepID)
}
}
}

// AR-08: Rollback execution on action failure.
func TestPlaybook_AR08_RollbackExecution(t *testing.T) {
mock := newMockPlaybookExecutor()
mock.failSteps["sync-state"] = true // Sync fails → rollback trigger.
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ConsensusRecoveryPlaybook())
rpe.Execute(context.Background(), "consensus-recovery", "cluster")
execs := rpe.RecentExecutions(10)
if execs[0].Status != PlaybookRolledBack {
t.Errorf("expected ROLLED_BACK, got %s", execs[0].Status)
}
}

// AR-09: Step retries.
func TestPlaybook_AR09_StepRetries(t *testing.T) {
verifyAttempts := 0
executor := func(_ context.Context, step PlaybookStep, _ string) (string, error) {
// Fail the first two verify-health attempts so the playbook only
// succeeds if the engine retries that step.
if step.ID == "verify-health" {
verifyAttempts++
if verifyAttempts <= 2 {
return "", fmt.Errorf("not healthy yet")
}
}
return "ok", nil
}
rpe := NewRecoveryPlaybookEngine(executor)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
_, err := rpe.Execute(context.Background(), "component-resurrection", "comp")
if err != nil {
t.Fatalf("expected success after retries: %v", err)
}
}

// AR-10: Playbook not found.
func TestPlaybook_AR10_NotFound(t *testing.T) {
rpe := NewRecoveryPlaybookEngine(nil)
_, err := rpe.Execute(context.Background(), "nonexistent", "comp")
if err == nil {
t.Fatal("expected error for nonexistent playbook")
}
}

// AR-11: Audit logging (all step timestamps).
func TestPlaybook_AR11_AuditTimestamps(t *testing.T) {
mock := newMockPlaybookExecutor()
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
execID, _ := rpe.Execute(context.Background(), "component-resurrection", "comp")
exec, _ := rpe.GetExecution(execID)
if exec.StartedAt.IsZero() {
t.Error("missing started_at")
}
if exec.CompletedAt.IsZero() {
t.Error("missing completed_at")
}
}

// AR-12: OnError=continue skips non-critical failures.
func TestPlaybook_AR12_ContinueOnError(t *testing.T) {
mock := newMockPlaybookExecutor()
mock.failSteps["capture-forensics"] = true // OnError=continue.
mock.failSteps["notify-success"] = true // OnError=continue.
rpe := NewRecoveryPlaybookEngine(mock.execute)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
_, err := rpe.Execute(context.Background(), "component-resurrection", "comp")
if err != nil {
t.Fatalf("expected success despite continue-on-error steps: %v", err)
}
}

// AR-13: Context cancellation.
func TestPlaybook_AR13_ContextCancel(t *testing.T) {
executor := func(ctx context.Context, _ PlaybookStep, _ string) (string, error) {
select {
case <-ctx.Done():
return "", ctx.Err()
case <-time.After(10 * time.Millisecond):
return "ok", nil
}
}
rpe := NewRecoveryPlaybookEngine(executor)
rpe.RegisterPlaybook(ComponentResurrectionPlaybook())
ctx, cancel := context.WithCancel(context.Background())
cancel() // Cancel immediately.
_, err := rpe.Execute(ctx, "component-resurrection", "comp")
// May or may not error depending on timing, but should not hang.
_ = err
}

// AR-14: DefaultPlaybooks returns 3.
func TestPlaybook_AR14_DefaultPlaybooks(t *testing.T) {
pbs := DefaultPlaybooks()
if len(pbs) != 3 {
t.Errorf("expected 3 playbooks, got %d", len(pbs))
}
ids := map[string]bool{}
for _, pb := range pbs {
if ids[pb.ID] {
t.Errorf("duplicate playbook ID: %s", pb.ID)
}
ids[pb.ID] = true
if len(pb.Actions) == 0 {
t.Errorf("playbook %s has no actions", pb.ID)
}
if len(pb.SuccessCriteria) == 0 {
t.Errorf("playbook %s has no success criteria", pb.ID)
}
}
}

// AR-15: PlaybookCount and RecentExecutions.
func TestPlaybook_AR15_CountsAndRecent(t *testing.T) {
mock := newMockPlaybookExecutor()
rpe := NewRecoveryPlaybookEngine(mock.execute)
if rpe.PlaybookCount() != 0 {
t.Error("expected 0")
}
for _, pb := range DefaultPlaybooks() {
rpe.RegisterPlaybook(pb)
}
if rpe.PlaybookCount() != 3 {
t.Errorf("expected 3, got %d", rpe.PlaybookCount())
}
// Run two playbooks.
rpe.Execute(context.Background(), "component-resurrection", "comp1")
rpe.Execute(context.Background(), "crypto-rotation", "comp2")
recent := rpe.RecentExecutions(1)
if len(recent) != 1 {
t.Errorf("expected 1 recent, got %d", len(recent))
}
if recent[0].PlaybookID != "crypto-rotation" {
t.Errorf("expected crypto-rotation, got %s", recent[0].PlaybookID)
}
all := rpe.RecentExecutions(100)
if len(all) != 2 {
t.Errorf("expected 2 total, got %d", len(all))
}
}
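
As a follow-up to the audit-focused tests (AR-07, AR-11), a small helper could
flatten a finished execution into a single log line. This is a sketch that uses
only fields the tests above read (PlaybookID, Status, StartedAt, CompletedAt,
StepsRun with StepID); the receiver type name RecoveryPlaybookEngine is
inferred from its constructor and may differ in the actual code, so treat both
the name and the helper as hypothetical.

package resilience

import (
	"log/slog"
	"strings"
)

// logRecentExecutions emits one summary line per recent execution. Hypothetical
// helper; the engine type name is inferred from NewRecoveryPlaybookEngine.
func logRecentExecutions(rpe *RecoveryPlaybookEngine, n int) {
	for _, exec := range rpe.RecentExecutions(n) {
		ids := make([]string, 0, len(exec.StepsRun))
		for _, s := range exec.StepsRun {
			ids = append(ids, s.StepID)
		}
		slog.Info("playbook execution",
			"playbook", exec.PlaybookID,
			"status", exec.Status,
			"duration", exec.CompletedAt.Sub(exec.StartedAt).String(),
			"steps", strings.Join(ids, ","))
	}
}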