// Mirror of https://github.com/syntrex-lab/gomcp.git (synced 2026-04-24 20:06:21 +02:00; 528 lines, 15 KiB, Go).

// Copyright 2026 Syntrex Lab. All rights reserved.
// Use of this source code is governed by an Apache-2.0 license
// that can be found in the LICENSE file.

package resilience

import (
	"context"
	"fmt"
	"log/slog"
	"sync"
	"time"
)

// HealingState is the finite-state-machine state of a healing operation.
//
// The engine creates an operation as HealingIdle, moves it through
// HealingDiagnosing and HealingActive, enters HealingVerifying only when every
// action succeeded, and finishes in HealingCompleted or HealingFailed.
type HealingState string

// States of a HealingOperation. The string values are what appears in the
// operation's JSON "state" field.
const (
	HealingIdle       HealingState = "IDLE"       // created, pipeline not started
	HealingDiagnosing HealingState = "DIAGNOSING" // root cause analysis in progress
	HealingActive     HealingState = "HEALING"    // strategy actions are running
	HealingVerifying  HealingState = "VERIFYING"  // post-healing health check
	HealingCompleted  HealingState = "COMPLETED"  // terminal: healing succeeded
	HealingFailed     HealingState = "FAILED"     // terminal: an action or the verification failed
)

// HealingResult summarizes the outcome of a finished healing operation.
type HealingResult string

// Outcomes recorded in HealingOperation.Result.
const (
	ResultSuccess HealingResult = "SUCCESS" // all actions and the verification passed
	ResultFailed  HealingResult = "FAILED"  // an action or the verification failed
	// ResultSkipped is declared for callers; the engine code in this file
	// never assigns it.
	ResultSkipped HealingResult = "SKIPPED"
)

// ActionType names a kind of healing action.
//
// The engine treats these values as opaque labels and passes them to the
// ActionExecutorFunc. The only one the engine issues on its own is
// ActionVerifyHealth, used for the post-healing verification step.
type ActionType string

// Known action types. The string values are the wire names used in the JSON
// "type" / "action" fields.
const (
	ActionGracefulStop    ActionType = "graceful_stop"
	ActionClearTempFiles  ActionType = "clear_temp_files"
	ActionStartComponent  ActionType = "start_component"
	ActionVerifyHealth    ActionType = "verify_health"
	ActionNotifySOC       ActionType = "notify_soc"
	ActionFreezeConfig    ActionType = "freeze_config"
	ActionRollbackConfig  ActionType = "rollback_config"
	ActionVerifyConfig    ActionType = "verify_config"
	ActionSwitchReadOnly  ActionType = "switch_to_readonly"
	ActionBackupDB        ActionType = "backup_db"
	ActionRestoreSnapshot ActionType = "restore_snapshot"
	ActionVerifyIntegrity ActionType = "verify_integrity"
	ActionResumeWrites    ActionType = "resume_writes"
	ActionDisableRules    ActionType = "disable_rules"
	ActionRevertRules     ActionType = "revert_rules"
	ActionReloadEngine    ActionType = "reload_engine"
	ActionIsolateNetwork  ActionType = "isolate_network"
	ActionRegenCerts      ActionType = "regenerate_certs"
	ActionRestoreNetwork  ActionType = "restore_network"
	ActionNotifyArchitect ActionType = "notify_architect"
	ActionEnterSafeMode   ActionType = "enter_safe_mode"
)

// Action is a single step in a healing strategy.
|
|
type Action struct {
|
|
Type ActionType `json:"type"`
|
|
Params map[string]interface{} `json:"params,omitempty"`
|
|
Timeout time.Duration `json:"timeout"`
|
|
OnError string `json:"on_error"` // "continue", "abort", "rollback"
|
|
}
|
|
|
|
// TriggerCondition defines when a healing strategy activates.
|
|
type TriggerCondition struct {
|
|
Metrics []string `json:"metrics,omitempty"`
|
|
Statuses []ComponentStatus `json:"statuses,omitempty"`
|
|
ConsecutiveFailures int `json:"consecutive_failures"`
|
|
WithinWindow time.Duration `json:"within_window"`
|
|
}
|
|
|
|
// RollbackPlan defines what happens if healing fails.
|
|
type RollbackPlan struct {
|
|
OnFailure string `json:"on_failure"` // "escalate", "enter_safe_mode", "maintain_isolation"
|
|
Actions []Action `json:"actions,omitempty"`
|
|
}
|
|
|
|
// HealingStrategy is a complete self-healing plan.
|
|
type HealingStrategy struct {
|
|
ID string `json:"id"`
|
|
Name string `json:"name"`
|
|
Trigger TriggerCondition `json:"trigger"`
|
|
Actions []Action `json:"actions"`
|
|
Rollback RollbackPlan `json:"rollback"`
|
|
MaxAttempts int `json:"max_attempts"`
|
|
Cooldown time.Duration `json:"cooldown"`
|
|
}
|
|
|
|
// Diagnosis is the result of root cause analysis.
|
|
type Diagnosis struct {
|
|
Component string `json:"component"`
|
|
Metric string `json:"metric"`
|
|
RootCause string `json:"root_cause"`
|
|
Confidence float64 `json:"confidence"`
|
|
SuggestedFix string `json:"suggested_fix"`
|
|
RelatedAlerts []HealthAlert `json:"related_alerts,omitempty"`
|
|
}
|
|
|
|
// HealingOperation tracks a single healing attempt.
|
|
type HealingOperation struct {
|
|
ID string `json:"id"`
|
|
StrategyID string `json:"strategy_id"`
|
|
Component string `json:"component"`
|
|
State HealingState `json:"state"`
|
|
Diagnosis *Diagnosis `json:"diagnosis,omitempty"`
|
|
ActionsRun []ActionLog `json:"actions_run"`
|
|
Result HealingResult `json:"result"`
|
|
StartedAt time.Time `json:"started_at"`
|
|
CompletedAt time.Time `json:"completed_at,omitempty"`
|
|
Error string `json:"error,omitempty"`
|
|
AttemptNumber int `json:"attempt_number"`
|
|
}
|
|
|
|
// ActionLog records the execution of a single action.
|
|
type ActionLog struct {
|
|
Action ActionType `json:"action"`
|
|
StartedAt time.Time `json:"started_at"`
|
|
Duration time.Duration `json:"duration"`
|
|
Success bool `json:"success"`
|
|
Error string `json:"error,omitempty"`
|
|
}
|
|
|
|
// ActionExecutorFunc is the callback that actually runs an action.
|
|
// Implementations handle the real system operations (restart, rollback, etc.).
|
|
type ActionExecutorFunc func(ctx context.Context, action Action, component string) error
|
|
|
|
// HealingEngine is the L2 Self-Healing orchestrator.
|
|
type HealingEngine struct {
|
|
mu sync.RWMutex
|
|
strategies []HealingStrategy
|
|
cooldowns map[string]time.Time // strategyID → earliest next run
|
|
operations []*HealingOperation
|
|
opCounter int64
|
|
executor ActionExecutorFunc
|
|
alertBus <-chan HealthAlert
|
|
escalateFn func(HealthAlert) // called on unrecoverable failure
|
|
logger *slog.Logger
|
|
}
|
|
|
|
// NewHealingEngine creates a new self-healing engine.
|
|
func NewHealingEngine(
|
|
alertBus <-chan HealthAlert,
|
|
executor ActionExecutorFunc,
|
|
escalateFn func(HealthAlert),
|
|
) *HealingEngine {
|
|
return &HealingEngine{
|
|
cooldowns: make(map[string]time.Time),
|
|
operations: make([]*HealingOperation, 0),
|
|
executor: executor,
|
|
alertBus: alertBus,
|
|
escalateFn: escalateFn,
|
|
logger: slog.Default().With("component", "sarl-healing-engine"),
|
|
}
|
|
}
|
|
|
|
// RegisterStrategy adds a healing strategy.
|
|
func (he *HealingEngine) RegisterStrategy(s HealingStrategy) {
|
|
he.mu.Lock()
|
|
defer he.mu.Unlock()
|
|
he.strategies = append(he.strategies, s)
|
|
he.logger.Info("strategy registered", "id", s.ID, "name", s.Name)
|
|
}
|
|
|
|
// Start begins listening for alerts and initiating healing. Blocks until ctx is cancelled.
|
|
func (he *HealingEngine) Start(ctx context.Context) {
|
|
he.logger.Info("healing engine started", "strategies", len(he.strategies))
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
he.logger.Info("healing engine stopped")
|
|
return
|
|
case alert, ok := <-he.alertBus:
|
|
if !ok {
|
|
return
|
|
}
|
|
if alert.Severity == SeverityCritical || alert.Severity == SeverityWarning {
|
|
he.initiateHealing(ctx, alert)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// initiateHealing runs the healing pipeline for an alert.
|
|
func (he *HealingEngine) initiateHealing(ctx context.Context, alert HealthAlert) {
|
|
strategy := he.findStrategy(alert)
|
|
if strategy == nil {
|
|
he.logger.Info("no matching strategy for alert",
|
|
"component", alert.Component,
|
|
"metric", alert.Metric,
|
|
)
|
|
return
|
|
}
|
|
|
|
if he.isInCooldown(strategy.ID) {
|
|
he.logger.Info("strategy in cooldown",
|
|
"strategy", strategy.ID,
|
|
"component", alert.Component,
|
|
)
|
|
return
|
|
}
|
|
|
|
op := he.createOperation(strategy, alert.Component)
|
|
|
|
he.logger.Info("healing initiated",
|
|
"op_id", op.ID,
|
|
"strategy", strategy.ID,
|
|
"component", alert.Component,
|
|
)
|
|
|
|
// Phase 1: Diagnose.
|
|
he.transitionOp(op, HealingDiagnosing)
|
|
diagnosis := he.diagnose(alert)
|
|
op.Diagnosis = &diagnosis
|
|
|
|
// Phase 2: Execute healing actions.
|
|
he.transitionOp(op, HealingActive)
|
|
execErr := he.executeActions(ctx, strategy, op)
|
|
|
|
// Phase 3: Verify recovery.
|
|
if execErr == nil {
|
|
he.transitionOp(op, HealingVerifying)
|
|
verifyErr := he.verifyRecovery(ctx, strategy, op.Component)
|
|
if verifyErr != nil {
|
|
execErr = verifyErr
|
|
}
|
|
}
|
|
|
|
// Phase 4: Complete or fail.
|
|
if execErr == nil {
|
|
he.transitionOp(op, HealingCompleted)
|
|
op.Result = ResultSuccess
|
|
he.logger.Info("healing completed successfully",
|
|
"op_id", op.ID,
|
|
"component", op.Component,
|
|
"duration", time.Since(op.StartedAt),
|
|
)
|
|
} else {
|
|
he.transitionOp(op, HealingFailed)
|
|
op.Result = ResultFailed
|
|
op.Error = execErr.Error()
|
|
he.logger.Error("healing failed",
|
|
"op_id", op.ID,
|
|
"component", op.Component,
|
|
"error", execErr,
|
|
)
|
|
|
|
// Execute rollback.
|
|
he.executeRollback(ctx, strategy, op)
|
|
|
|
// Escalate.
|
|
if he.escalateFn != nil {
|
|
he.escalateFn(alert)
|
|
}
|
|
}
|
|
|
|
op.CompletedAt = time.Now()
|
|
he.setCooldown(strategy.ID, strategy.Cooldown)
|
|
}
|
|
|
|
// findStrategy returns the first matching strategy for an alert.
|
|
func (he *HealingEngine) findStrategy(alert HealthAlert) *HealingStrategy {
|
|
he.mu.RLock()
|
|
defer he.mu.RUnlock()
|
|
|
|
for i := range he.strategies {
|
|
s := &he.strategies[i]
|
|
if he.matchesTrigger(s.Trigger, alert) {
|
|
return s
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// matchesTrigger checks if an alert matches a strategy's trigger condition.
|
|
func (he *HealingEngine) matchesTrigger(trigger TriggerCondition, alert HealthAlert) bool {
|
|
// Match by metric name.
|
|
for _, m := range trigger.Metrics {
|
|
if m == alert.Metric {
|
|
return true
|
|
}
|
|
}
|
|
|
|
// Match by component status.
|
|
for _, s := range trigger.Statuses {
|
|
switch s {
|
|
case StatusCritical:
|
|
if alert.Severity == SeverityCritical {
|
|
return true
|
|
}
|
|
case StatusOffline:
|
|
if alert.Severity == SeverityCritical && alert.SuggestedAction == "restart" {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// isInCooldown checks if a strategy is still in its cooldown period.
|
|
func (he *HealingEngine) isInCooldown(strategyID string) bool {
|
|
he.mu.RLock()
|
|
defer he.mu.RUnlock()
|
|
|
|
earliest, ok := he.cooldowns[strategyID]
|
|
return ok && time.Now().Before(earliest)
|
|
}
|
|
|
|
// setCooldown marks a strategy as cooling down.
|
|
func (he *HealingEngine) setCooldown(strategyID string, duration time.Duration) {
|
|
he.mu.Lock()
|
|
defer he.mu.Unlock()
|
|
he.cooldowns[strategyID] = time.Now().Add(duration)
|
|
}
|
|
|
|
// createOperation creates and records a new healing operation.
|
|
func (he *HealingEngine) createOperation(strategy *HealingStrategy, component string) *HealingOperation {
|
|
he.mu.Lock()
|
|
defer he.mu.Unlock()
|
|
|
|
he.opCounter++
|
|
op := &HealingOperation{
|
|
ID: fmt.Sprintf("heal-%d", he.opCounter),
|
|
StrategyID: strategy.ID,
|
|
Component: component,
|
|
State: HealingIdle,
|
|
StartedAt: time.Now(),
|
|
ActionsRun: make([]ActionLog, 0),
|
|
}
|
|
he.operations = append(he.operations, op)
|
|
return op
|
|
}
|
|
|
|
// transitionOp moves an operation to a new state.
|
|
func (he *HealingEngine) transitionOp(op *HealingOperation, newState HealingState) {
|
|
he.logger.Debug("healing state transition",
|
|
"op_id", op.ID,
|
|
"from", op.State,
|
|
"to", newState,
|
|
)
|
|
op.State = newState
|
|
}
|
|
|
|
// diagnose performs root cause analysis for an alert.
|
|
func (he *HealingEngine) diagnose(alert HealthAlert) Diagnosis {
|
|
rootCause := "unknown"
|
|
confidence := 0.5
|
|
suggestedFix := "restart component"
|
|
|
|
switch {
|
|
case alert.Metric == "memory" && alert.Current > 90:
|
|
rootCause = "memory_exhaustion"
|
|
confidence = 0.9
|
|
suggestedFix = "restart with increased limits"
|
|
case alert.Metric == "cpu" && alert.Current > 90:
|
|
rootCause = "cpu_saturation"
|
|
confidence = 0.8
|
|
suggestedFix = "check for runaway goroutines"
|
|
case alert.Metric == "error_rate":
|
|
rootCause = "elevated_error_rate"
|
|
confidence = 0.7
|
|
suggestedFix = "check dependencies and config"
|
|
case alert.Metric == "latency_p99":
|
|
rootCause = "latency_degradation"
|
|
confidence = 0.6
|
|
suggestedFix = "check database and network"
|
|
case alert.Metric == "quorum":
|
|
rootCause = "quorum_loss"
|
|
confidence = 0.95
|
|
suggestedFix = "activate safe mode"
|
|
default:
|
|
rootCause = fmt.Sprintf("threshold_breach_%s", alert.Metric)
|
|
confidence = 0.5
|
|
suggestedFix = "investigate manually"
|
|
}
|
|
|
|
return Diagnosis{
|
|
Component: alert.Component,
|
|
Metric: alert.Metric,
|
|
RootCause: rootCause,
|
|
Confidence: confidence,
|
|
SuggestedFix: suggestedFix,
|
|
}
|
|
}
|
|
|
|
// executeActions runs each action in sequence.
|
|
func (he *HealingEngine) executeActions(ctx context.Context, strategy *HealingStrategy, op *HealingOperation) error {
|
|
for _, action := range strategy.Actions {
|
|
actionCtx := ctx
|
|
var cancel context.CancelFunc
|
|
if action.Timeout > 0 {
|
|
actionCtx, cancel = context.WithTimeout(ctx, action.Timeout)
|
|
}
|
|
|
|
start := time.Now()
|
|
err := he.executor(actionCtx, action, op.Component)
|
|
duration := time.Since(start)
|
|
|
|
if cancel != nil {
|
|
cancel()
|
|
}
|
|
|
|
logEntry := ActionLog{
|
|
Action: action.Type,
|
|
StartedAt: start,
|
|
Duration: duration,
|
|
Success: err == nil,
|
|
}
|
|
if err != nil {
|
|
logEntry.Error = err.Error()
|
|
}
|
|
op.ActionsRun = append(op.ActionsRun, logEntry)
|
|
|
|
if err != nil {
|
|
switch action.OnError {
|
|
case "continue":
|
|
he.logger.Warn("action failed, continuing",
|
|
"action", action.Type,
|
|
"error", err,
|
|
)
|
|
case "rollback":
|
|
return fmt.Errorf("action %s failed (rollback): %w", action.Type, err)
|
|
default: // "abort"
|
|
return fmt.Errorf("action %s failed: %w", action.Type, err)
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// verifyRecovery checks if the component is healthy after healing.
|
|
func (he *HealingEngine) verifyRecovery(ctx context.Context, strategy *HealingStrategy, component string) error {
|
|
// Execute a verify_health action if not already in the strategy.
|
|
verifyAction := Action{
|
|
Type: ActionVerifyHealth,
|
|
Timeout: 30 * time.Second,
|
|
}
|
|
return he.executor(ctx, verifyAction, component)
|
|
}
|
|
|
|
// executeRollback runs the rollback plan for a failed healing.
|
|
func (he *HealingEngine) executeRollback(ctx context.Context, strategy *HealingStrategy, op *HealingOperation) {
|
|
if len(strategy.Rollback.Actions) == 0 {
|
|
he.logger.Info("no rollback actions defined",
|
|
"strategy", strategy.ID,
|
|
)
|
|
return
|
|
}
|
|
|
|
he.logger.Warn("executing rollback",
|
|
"strategy", strategy.ID,
|
|
"component", op.Component,
|
|
)
|
|
|
|
for _, action := range strategy.Rollback.Actions {
|
|
if err := he.executor(ctx, action, op.Component); err != nil {
|
|
he.logger.Error("rollback action failed",
|
|
"action", action.Type,
|
|
"error", err,
|
|
)
|
|
}
|
|
}
|
|
}
|
|
|
|
// GetOperation returns a healing operation by ID.
|
|
// Returns a deep copy to prevent data races with the healing goroutine.
|
|
func (he *HealingEngine) GetOperation(id string) (*HealingOperation, bool) {
|
|
he.mu.RLock()
|
|
defer he.mu.RUnlock()
|
|
|
|
for _, op := range he.operations {
|
|
if op.ID == id {
|
|
cp := *op
|
|
cp.ActionsRun = make([]ActionLog, len(op.ActionsRun))
|
|
copy(cp.ActionsRun, op.ActionsRun)
|
|
if op.Diagnosis != nil {
|
|
diag := *op.Diagnosis
|
|
cp.Diagnosis = &diag
|
|
}
|
|
return &cp, true
|
|
}
|
|
}
|
|
return nil, false
|
|
}
|
|
|
|
// RecentOperations returns the last N operations.
|
|
// Returns deep copies to prevent data races with the healing goroutine.
|
|
func (he *HealingEngine) RecentOperations(n int) []HealingOperation {
|
|
he.mu.RLock()
|
|
defer he.mu.RUnlock()
|
|
|
|
total := len(he.operations)
|
|
if total == 0 {
|
|
return nil
|
|
}
|
|
start := total - n
|
|
if start < 0 {
|
|
start = 0
|
|
}
|
|
|
|
result := make([]HealingOperation, 0, n)
|
|
for i := start; i < total; i++ {
|
|
cp := *he.operations[i]
|
|
cp.ActionsRun = make([]ActionLog, len(he.operations[i].ActionsRun))
|
|
copy(cp.ActionsRun, he.operations[i].ActionsRun)
|
|
if he.operations[i].Diagnosis != nil {
|
|
diag := *he.operations[i].Diagnosis
|
|
cp.Diagnosis = &diag
|
|
}
|
|
result = append(result, cp)
|
|
}
|
|
return result
|
|
}
|
|
|
|
// StrategyCount returns the number of registered strategies.
|
|
func (he *HealingEngine) StrategyCount() int {
|
|
he.mu.RLock()
|
|
defer he.mu.RUnlock()
|
|
return len(he.strategies)
|
|
}
|