// Source path: gomcp/internal/application/resilience/healing_engine.go
// Repository listing metadata: 528 lines, 15 KiB, Go.

// Copyright 2026 Syntrex Lab. All rights reserved.
// Use of this source code is governed by an Apache-2.0 license
// that can be found in the LICENSE file.
package resilience
import (
"context"
"fmt"
"log/slog"
"sync"
"time"
)
// HealingState names a stage in the lifecycle of a healing operation.
// Operations are created in HealingIdle and are then moved through the
// remaining states by initiateHealing via transitionOp.
type HealingState string

// Lifecycle states, in the order a successful operation visits them.
// HealingFailed takes the place of HealingCompleted when an action or the
// recovery verification returns an error.
const (
	HealingIdle       HealingState = "IDLE"       // just created, nothing has run yet
	HealingDiagnosing HealingState = "DIAGNOSING" // root cause analysis phase
	HealingActive     HealingState = "HEALING"    // strategy actions are being executed
	HealingVerifying  HealingState = "VERIFYING"  // post-action recovery check
	HealingCompleted  HealingState = "COMPLETED"  // actions and verification succeeded
	HealingFailed     HealingState = "FAILED"     // an action or the verification failed
)
// HealingResult is the final outcome recorded on a healing operation.
type HealingResult string

// Possible outcomes. initiateHealing assigns ResultSuccess or ResultFailed;
// ResultSkipped is not assigned anywhere in the code visible in this file.
const (
	ResultSuccess HealingResult = "SUCCESS" // all actions and the verification passed
	ResultFailed  HealingResult = "FAILED"  // an action or the verification returned an error
	ResultSkipped HealingResult = "SKIPPED" // declared for callers; unused here
)
// ActionType identifies one kind of healing step. The engine does not
// interpret these values itself: each Action is handed to the configured
// ActionExecutorFunc, which decides what the type means. The only value the
// engine constructs on its own is ActionVerifyHealth (see verifyRecovery).
type ActionType string

// Known action types. The string values are the identifiers used when an
// Action is serialized to JSON.
const (
	ActionGracefulStop    ActionType = "graceful_stop"
	ActionClearTempFiles  ActionType = "clear_temp_files"
	ActionStartComponent  ActionType = "start_component"
	ActionVerifyHealth    ActionType = "verify_health"
	ActionNotifySOC       ActionType = "notify_soc"
	ActionFreezeConfig    ActionType = "freeze_config"
	ActionRollbackConfig  ActionType = "rollback_config"
	ActionVerifyConfig    ActionType = "verify_config"
	ActionSwitchReadOnly  ActionType = "switch_to_readonly"
	ActionBackupDB        ActionType = "backup_db"
	ActionRestoreSnapshot ActionType = "restore_snapshot"
	ActionVerifyIntegrity ActionType = "verify_integrity"
	ActionResumeWrites    ActionType = "resume_writes"
	ActionDisableRules    ActionType = "disable_rules"
	ActionRevertRules     ActionType = "revert_rules"
	ActionReloadEngine    ActionType = "reload_engine"
	ActionIsolateNetwork  ActionType = "isolate_network"
	ActionRegenCerts      ActionType = "regenerate_certs"
	ActionRestoreNetwork  ActionType = "restore_network"
	ActionNotifyArchitect ActionType = "notify_architect"
	ActionEnterSafeMode   ActionType = "enter_safe_mode"
)
// Action is a single step in a healing strategy. The engine passes the whole
// value to the configured ActionExecutorFunc.
type Action struct {
// Type selects which operation the executor should perform.
Type ActionType `json:"type"`
// Params is not read by the engine in this file; it travels to the executor
// as part of the Action.
Params map[string]interface{} `json:"params,omitempty"`
// Timeout, when greater than zero, is applied by executeActions as a
// context deadline around the executor call.
Timeout time.Duration `json:"timeout"`
// OnError controls executeActions after a failed step: "continue" logs a
// warning and moves on; every other value stops the sequence and returns an
// error ("rollback" only changes the error text).
OnError string `json:"on_error"` // "continue", "abort", "rollback"
}
// TriggerCondition defines when a healing strategy activates.
// See matchesTrigger for how Metrics and Statuses are evaluated.
type TriggerCondition struct {
// Metrics lists metric names; an alert whose Metric equals any entry matches.
Metrics []string `json:"metrics,omitempty"`
// Statuses lists component statuses to match. matchesTrigger only acts on
// StatusCritical and StatusOffline; other values never produce a match.
Statuses []ComponentStatus `json:"statuses,omitempty"`
// ConsecutiveFailures is not consulted by matchesTrigger in this file.
ConsecutiveFailures int `json:"consecutive_failures"`
// WithinWindow is not consulted by matchesTrigger in this file.
WithinWindow time.Duration `json:"within_window"`
}
// RollbackPlan defines what happens if healing fails.
type RollbackPlan struct {
// OnFailure is descriptive metadata here: no code visible in this file reads it.
OnFailure string `json:"on_failure"` // "escalate", "enter_safe_mode", "maintain_isolation"
// Actions are run in order by executeRollback after a failed healing attempt.
Actions []Action `json:"actions,omitempty"`
}
// HealingStrategy is a complete self-healing plan: a trigger, the ordered
// actions to run, and the rollback to apply if they fail.
type HealingStrategy struct {
// ID keys the per-strategy cooldown map and is copied onto each operation.
ID string `json:"id"`
// Name is only used for logging when the strategy is registered.
Name string `json:"name"`
// Trigger decides which alerts select this strategy (first match wins).
Trigger TriggerCondition `json:"trigger"`
// Actions are executed sequentially by executeActions.
Actions []Action `json:"actions"`
// Rollback is applied by executeRollback when healing fails.
Rollback RollbackPlan `json:"rollback"`
// MaxAttempts is not enforced by any code visible in this file.
MaxAttempts int `json:"max_attempts"`
// Cooldown is the delay imposed after every attempt, successful or not,
// before the strategy may run again.
Cooldown time.Duration `json:"cooldown"`
}
// Diagnosis is the result of root cause analysis, as produced by diagnose.
type Diagnosis struct {
// Component and Metric are copied from the alert being diagnosed.
Component string `json:"component"`
Metric string `json:"metric"`
// RootCause is a short machine-style label such as "memory_exhaustion".
RootCause string `json:"root_cause"`
// Confidence is a heuristic score; diagnose assigns values from 0.5 to 0.95.
Confidence float64 `json:"confidence"`
// SuggestedFix is a human-readable hint.
SuggestedFix string `json:"suggested_fix"`
// RelatedAlerts is not populated by diagnose in this file.
RelatedAlerts []HealthAlert `json:"related_alerts,omitempty"`
}
// HealingOperation tracks a single healing attempt. Records are created by
// createOperation, stored on the engine, and exposed to callers only as
// copies (see GetOperation and RecentOperations).
type HealingOperation struct {
// ID has the form "heal-<counter>".
ID string `json:"id"`
// StrategyID is the ID of the strategy that was selected for the alert.
StrategyID string `json:"strategy_id"`
// Component is the alert's component and is passed to every executor call.
Component string `json:"component"`
// State is the current lifecycle state, updated through transitionOp.
State HealingState `json:"state"`
// Diagnosis is set during the diagnosing phase.
Diagnosis *Diagnosis `json:"diagnosis,omitempty"`
// ActionsRun holds one entry per strategy action executed, in order.
ActionsRun []ActionLog `json:"actions_run"`
// Result is set once the operation completes or fails.
Result HealingResult `json:"result"`
StartedAt time.Time `json:"started_at"`
// NOTE(review): encoding/json's omitempty does not treat a zero time.Time
// as empty, so completed_at is still emitted for unfinished operations —
// confirm whether that is intended.
CompletedAt time.Time `json:"completed_at,omitempty"`
// Error carries the failure message when Result is ResultFailed.
Error string `json:"error,omitempty"`
// AttemptNumber is never assigned by the code visible in this file.
AttemptNumber int `json:"attempt_number"`
}
// ActionLog records the execution of a single action. executeActions appends
// one entry per strategy action, whether it succeeded or failed.
type ActionLog struct {
// Action is the type of the step that was run.
Action ActionType `json:"action"`
// StartedAt is taken immediately before the executor call.
StartedAt time.Time `json:"started_at"`
// Duration is the wall-clock time the executor call took.
Duration time.Duration `json:"duration"`
// Success is true when the executor returned a nil error.
Success bool `json:"success"`
// Error is the executor's error text; empty on success.
Error string `json:"error,omitempty"`
}
// ActionExecutorFunc is the callback that actually runs an action.
// Implementations handle the real system operations (restart, rollback, etc.).
//
// The engine calls it with the action to perform and the name of the affected
// component. A non-nil error marks the step as failed. For strategy actions
// with a positive Timeout, ctx carries that deadline.
type ActionExecutorFunc func(ctx context.Context, action Action, component string) error
// HealingEngine is the L2 Self-Healing orchestrator. It consumes alerts from
// alertBus, picks the first registered strategy whose trigger matches, runs
// that strategy's actions through executor, and escalates when healing fails.
type HealingEngine struct {
// mu guards strategies, cooldowns, operations and opCounter.
mu sync.RWMutex
// strategies is kept in registration order; lookup returns the first match.
strategies []HealingStrategy
cooldowns map[string]time.Time // strategyID → earliest next run
// operations is the append-only history of healing attempts.
operations []*HealingOperation
// opCounter is the source of the numeric suffix in operation IDs.
opCounter int64
// executor performs every action; it is called without a nil check.
executor ActionExecutorFunc
// alertBus is the receive-only stream of alerts read by Start.
alertBus <-chan HealthAlert
escalateFn func(HealthAlert) // called on unrecoverable failure
logger *slog.Logger
}
// NewHealingEngine builds an engine with no strategies registered.
//
// alertBus is the stream Start reads from, executor performs every healing
// action, and escalateFn (which may be nil) is called with the original alert
// when a healing attempt fails. The logger is derived from slog.Default and
// tagged with component "sarl-healing-engine".
func NewHealingEngine(
	alertBus <-chan HealthAlert,
	executor ActionExecutorFunc,
	escalateFn func(HealthAlert),
) *HealingEngine {
	engine := new(HealingEngine)
	engine.alertBus = alertBus
	engine.executor = executor
	engine.escalateFn = escalateFn
	engine.cooldowns = map[string]time.Time{}
	// Non-nil empty slice, so the history is never nil.
	engine.operations = []*HealingOperation{}
	engine.logger = slog.Default().With("component", "sarl-healing-engine")
	return engine
}
// RegisterStrategy appends s to the engine's strategy list and logs the
// registration. Strategies are matched in registration order, so earlier
// registrations take precedence when several triggers match.
func (he *HealingEngine) RegisterStrategy(s HealingStrategy) {
	he.mu.Lock()
	defer he.mu.Unlock()

	he.strategies = append(he.strategies, s)
	he.logger.Info(
		"strategy registered",
		slog.String("id", s.ID),
		slog.String("name", s.Name),
	)
}
// Start listens for alerts and initiates healing for each one whose severity
// is critical or warning. It blocks until ctx is cancelled or the alert
// channel is closed, and handles alerts one at a time on the calling
// goroutine.
func (he *HealingEngine) Start(ctx context.Context) {
	// StrategyCount takes the read lock; reading he.strategies directly here
	// would race with a concurrent RegisterStrategy.
	he.logger.Info("healing engine started", "strategies", he.StrategyCount())
	for {
		select {
		case <-ctx.Done():
			he.logger.Info("healing engine stopped")
			return
		case alert, ok := <-he.alertBus:
			if !ok {
				// The producer closed the bus; say so instead of exiting silently.
				he.logger.Info("healing engine stopped", "reason", "alert bus closed")
				return
			}
			switch alert.Severity {
			case SeverityCritical, SeverityWarning:
				he.initiateHealing(ctx, alert)
			}
		}
	}
}
// initiateHealing runs the full healing pipeline for a single alert:
// strategy lookup, cooldown check, diagnosis, action execution, recovery
// verification and, on failure, rollback followed by escalation. The
// strategy's cooldown is armed after every attempt, successful or not.
//
// Direct writes to the operation record are made while holding he.mu,
// because GetOperation and RecentOperations copy the same record under
// he.mu.RLock from other goroutines. The lock is held only around those
// field writes, never across executor calls or calls to other engine methods.
func (he *HealingEngine) initiateHealing(ctx context.Context, alert HealthAlert) {
	strategy := he.findStrategy(alert)
	if strategy == nil {
		he.logger.Info("no matching strategy for alert",
			"component", alert.Component,
			"metric", alert.Metric,
		)
		return
	}
	if he.isInCooldown(strategy.ID) {
		he.logger.Info("strategy in cooldown",
			"strategy", strategy.ID,
			"component", alert.Component,
		)
		return
	}

	op := he.createOperation(strategy, alert.Component)
	he.logger.Info("healing initiated",
		"op_id", op.ID,
		"strategy", strategy.ID,
		"component", alert.Component,
	)

	// Phase 1: Diagnose.
	he.transitionOp(op, HealingDiagnosing)
	diagnosis := he.diagnose(alert)
	he.mu.Lock()
	op.Diagnosis = &diagnosis
	he.mu.Unlock()

	// Phase 2: Execute healing actions.
	he.transitionOp(op, HealingActive)
	execErr := he.executeActions(ctx, strategy, op)

	// Phase 3: Verify recovery, only if every action succeeded.
	if execErr == nil {
		he.transitionOp(op, HealingVerifying)
		execErr = he.verifyRecovery(ctx, strategy, op.Component)
	}

	// Phase 4: Complete or fail.
	if execErr == nil {
		he.transitionOp(op, HealingCompleted)
		he.mu.Lock()
		op.Result = ResultSuccess
		he.mu.Unlock()
		he.logger.Info("healing completed successfully",
			"op_id", op.ID,
			"component", op.Component,
			"duration", time.Since(op.StartedAt),
		)
	} else {
		he.transitionOp(op, HealingFailed)
		he.mu.Lock()
		op.Result = ResultFailed
		op.Error = execErr.Error()
		he.mu.Unlock()
		he.logger.Error("healing failed",
			"op_id", op.ID,
			"component", op.Component,
			"error", execErr,
		)
		// Undo what can be undone, then hand the alert to the escalation hook.
		he.executeRollback(ctx, strategy, op)
		if he.escalateFn != nil {
			he.escalateFn(alert)
		}
	}

	he.mu.Lock()
	op.CompletedAt = time.Now()
	he.mu.Unlock()
	he.setCooldown(strategy.ID, strategy.Cooldown)
}
// findStrategy scans the registered strategies in registration order and
// returns a pointer to the first one whose trigger matches alert, or nil
// when none does. The scan runs under the read lock.
func (he *HealingEngine) findStrategy(alert HealthAlert) *HealingStrategy {
	he.mu.RLock()
	defer he.mu.RUnlock()

	for idx := 0; idx < len(he.strategies); idx++ {
		if candidate := &he.strategies[idx]; he.matchesTrigger(candidate.Trigger, alert) {
			return candidate
		}
	}
	return nil
}
// matchesTrigger reports whether alert satisfies trigger. A match is either
//   - the alert's metric name appearing in trigger.Metrics, or
//   - a critical-severity alert combined with StatusCritical in
//     trigger.Statuses, or with StatusOffline when the alert's suggested
//     action is "restart".
//
// Other statuses in trigger.Statuses never match.
func (he *HealingEngine) matchesTrigger(trigger TriggerCondition, alert HealthAlert) bool {
	for _, name := range trigger.Metrics {
		if name == alert.Metric {
			return true
		}
	}

	// Every status-based rule requires a critical alert, so bail out early.
	if alert.Severity != SeverityCritical {
		return false
	}
	wantsRestart := alert.SuggestedAction == "restart"
	for _, status := range trigger.Statuses {
		if status == StatusCritical {
			return true
		}
		if status == StatusOffline && wantsRestart {
			return true
		}
	}
	return false
}
// isInCooldown reports whether strategyID has a recorded cooldown deadline
// that still lies in the future. Strategies with no recorded deadline are
// never in cooldown.
func (he *HealingEngine) isInCooldown(strategyID string) bool {
	he.mu.RLock()
	deadline, tracked := he.cooldowns[strategyID]
	he.mu.RUnlock()

	if !tracked {
		return false
	}
	return time.Now().Before(deadline)
}
// setCooldown records that strategyID may not run again until duration has
// elapsed from now, replacing any earlier deadline for the same strategy.
func (he *HealingEngine) setCooldown(strategyID string, duration time.Duration) {
	deadline := time.Now().Add(duration)

	he.mu.Lock()
	he.cooldowns[strategyID] = deadline
	he.mu.Unlock()
}
// createOperation allocates the record for a new healing attempt, gives it
// the next sequential ID ("heal-<n>"), appends it to the engine's history,
// and returns it. The record starts in HealingIdle with an empty, non-nil
// action log.
func (he *HealingEngine) createOperation(strategy *HealingStrategy, component string) *HealingOperation {
	he.mu.Lock()
	defer he.mu.Unlock()

	he.opCounter++

	created := new(HealingOperation)
	created.ID = fmt.Sprintf("heal-%d", he.opCounter)
	created.StrategyID = strategy.ID
	created.Component = component
	created.State = HealingIdle
	created.StartedAt = time.Now()
	created.ActionsRun = []ActionLog{}

	he.operations = append(he.operations, created)
	return created
}
// transitionOp moves op to newState and logs the transition at debug level.
//
// The state is written while holding he.mu, because GetOperation and
// RecentOperations copy operation records under he.mu.RLock from other
// goroutines; an unguarded write here would be a data race. Callers must not
// hold he.mu when calling this method. No transition validation is performed.
func (he *HealingEngine) transitionOp(op *HealingOperation, newState HealingState) {
	he.mu.Lock()
	prevState := op.State
	op.State = newState
	he.mu.Unlock()

	// Log after releasing the lock to keep the critical section minimal.
	he.logger.Debug("healing state transition",
		"op_id", op.ID,
		"from", prevState,
		"to", newState,
	)
}
// diagnose maps an alert to a heuristic root cause. It starts from the
// generic "threshold_breach_<metric>" diagnosis and refines it when the
// alert's metric is one of the known cases:
//
//	memory (> 90)  -> memory_exhaustion
//	cpu    (> 90)  -> cpu_saturation
//	error_rate     -> elevated_error_rate
//	latency_p99    -> latency_degradation
//	quorum         -> quorum_loss
//
// A memory or cpu alert at or below 90 keeps the generic diagnosis.
func (he *HealingEngine) diagnose(alert HealthAlert) Diagnosis {
	result := Diagnosis{
		Component:    alert.Component,
		Metric:       alert.Metric,
		RootCause:    fmt.Sprintf("threshold_breach_%s", alert.Metric),
		Confidence:   0.5,
		SuggestedFix: "investigate manually",
	}

	// refine overwrites the generic findings with a specific diagnosis.
	refine := func(cause string, confidence float64, fix string) {
		result.RootCause = cause
		result.Confidence = confidence
		result.SuggestedFix = fix
	}

	switch alert.Metric {
	case "memory":
		if alert.Current > 90 {
			refine("memory_exhaustion", 0.9, "restart with increased limits")
		}
	case "cpu":
		if alert.Current > 90 {
			refine("cpu_saturation", 0.8, "check for runaway goroutines")
		}
	case "error_rate":
		refine("elevated_error_rate", 0.7, "check dependencies and config")
	case "latency_p99":
		refine("latency_degradation", 0.6, "check database and network")
	case "quorum":
		refine("quorum_loss", 0.95, "activate safe mode")
	}
	return result
}
// executeActions runs the strategy's actions in order through the executor
// and appends one ActionLog entry per action to op.ActionsRun.
//
// An action with a positive Timeout runs under a context carrying that
// deadline. When an action fails, its OnError policy decides what happens:
// "continue" logs a warning and proceeds to the next action; any other value
// stops the sequence and returns a wrapped error ("rollback" only changes the
// error text). Returns nil when every action succeeded or was allowed to fail.
//
// The append to op.ActionsRun is done under he.mu, because GetOperation and
// RecentOperations copy that slice under he.mu.RLock from other goroutines.
// The lock is not held while the executor runs.
func (he *HealingEngine) executeActions(ctx context.Context, strategy *HealingStrategy, op *HealingOperation) error {
	for _, action := range strategy.Actions {
		actionCtx := ctx
		var cancel context.CancelFunc
		if action.Timeout > 0 {
			actionCtx, cancel = context.WithTimeout(ctx, action.Timeout)
		}

		start := time.Now()
		err := he.executor(actionCtx, action, op.Component)
		elapsed := time.Since(start)
		if cancel != nil {
			// Release the timer now rather than deferring to function exit.
			cancel()
		}

		entry := ActionLog{
			Action:    action.Type,
			StartedAt: start,
			Duration:  elapsed,
			Success:   err == nil,
		}
		if err != nil {
			entry.Error = err.Error()
		}
		he.mu.Lock()
		op.ActionsRun = append(op.ActionsRun, entry)
		he.mu.Unlock()

		if err == nil {
			continue
		}
		switch action.OnError {
		case "continue":
			he.logger.Warn("action failed, continuing",
				"action", action.Type,
				"error", err,
			)
		case "rollback":
			return fmt.Errorf("action %s failed (rollback): %w", action.Type, err)
		default: // "abort"
			return fmt.Errorf("action %s failed: %w", action.Type, err)
		}
	}
	return nil
}
// verifyRecovery asks the executor to run a verify_health action against
// component and returns its error, if any. The check is bounded by the
// action's 30-second timeout, applied as a context deadline in the same way
// executeActions applies per-action timeouts.
//
// The check always runs, whether or not the strategy already contains a
// verify_health action; strategy is currently unused.
func (he *HealingEngine) verifyRecovery(ctx context.Context, strategy *HealingStrategy, component string) error {
	verifyAction := Action{
		Type:    ActionVerifyHealth,
		Timeout: 30 * time.Second,
	}

	// Without this the Timeout field above would be purely informational.
	verifyCtx, cancel := context.WithTimeout(ctx, verifyAction.Timeout)
	defer cancel()

	return he.executor(verifyCtx, verifyAction, component)
}
// executeRollback runs the strategy's rollback actions, in order, after a
// failed healing attempt. Rollback is best effort: a failing action is logged
// and the remaining actions still run. Rollback actions are not recorded in
// op.ActionsRun.
//
// An action with a positive Timeout runs under a context carrying that
// deadline, matching how executeActions treats strategy actions.
func (he *HealingEngine) executeRollback(ctx context.Context, strategy *HealingStrategy, op *HealingOperation) {
	if len(strategy.Rollback.Actions) == 0 {
		he.logger.Info("no rollback actions defined",
			"strategy", strategy.ID,
		)
		return
	}
	he.logger.Warn("executing rollback",
		"strategy", strategy.ID,
		"component", op.Component,
	)
	for _, action := range strategy.Rollback.Actions {
		// The closure scopes the deferred cancel to a single action.
		err := func() error {
			actionCtx := ctx
			if action.Timeout > 0 {
				var cancel context.CancelFunc
				actionCtx, cancel = context.WithTimeout(ctx, action.Timeout)
				defer cancel()
			}
			return he.executor(actionCtx, action, op.Component)
		}()
		if err != nil {
			he.logger.Error("rollback action failed",
				"action", action.Type,
				"error", err,
			)
		}
	}
}
// GetOperation looks up a healing operation by ID. On a hit it returns a
// snapshot of the record and true; otherwise nil and false.
//
// The snapshot has its own ActionsRun slice and its own Diagnosis value, so
// the caller never shares those with the engine's live record.
func (he *HealingEngine) GetOperation(id string) (*HealingOperation, bool) {
	he.mu.RLock()
	defer he.mu.RUnlock()

	for _, stored := range he.operations {
		if stored.ID != id {
			continue
		}
		snapshot := *stored
		snapshot.ActionsRun = append(make([]ActionLog, 0, len(stored.ActionsRun)), stored.ActionsRun...)
		if stored.Diagnosis != nil {
			diagCopy := *stored.Diagnosis
			snapshot.Diagnosis = &diagCopy
		}
		return &snapshot, true
	}
	return nil, false
}
// RecentOperations returns snapshots of the last n operations, oldest first.
// Fewer than n are returned when the history is shorter. It returns nil when
// no operations have been recorded, and an empty slice when n is zero or
// negative.
//
// Each snapshot has its own ActionsRun slice and its own Diagnosis value, so
// callers never share those with the engine's live records.
func (he *HealingEngine) RecentOperations(n int) []HealingOperation {
	he.mu.RLock()
	defer he.mu.RUnlock()

	total := len(he.operations)
	if total == 0 {
		return nil
	}

	// Clamp n to [0, total] before sizing the result: make panics on a
	// negative capacity and would over-allocate for an n far larger than the
	// history.
	count := min(max(n, 0), total)
	result := make([]HealingOperation, 0, count)
	for _, stored := range he.operations[total-count:] {
		snapshot := *stored
		snapshot.ActionsRun = make([]ActionLog, len(stored.ActionsRun))
		copy(snapshot.ActionsRun, stored.ActionsRun)
		if stored.Diagnosis != nil {
			diagCopy := *stored.Diagnosis
			snapshot.Diagnosis = &diagCopy
		}
		result = append(result, snapshot)
	}
	return result
}
// StrategyCount reports how many strategies are currently registered.
// The count is read under the read lock.
func (he *HealingEngine) StrategyCount() int {
	he.mu.RLock()
	registered := len(he.strategies)
	he.mu.RUnlock()
	return registered
}