gomcp/internal/application/resilience/recovery_playbooks.go

399 lines
13 KiB
Go
Raw Normal View History

package resilience
import (
"context"
"fmt"
"log/slog"
"sync"
"time"
)
// PlaybookStatus is the lifecycle state reported by a playbook execution.
// Values are upper-case strings so they read naturally in JSON and logs.
type PlaybookStatus string

// Lifecycle states of a PlaybookExecution.
const (
	// PlaybookPending marks an execution that has not started yet.
	PlaybookPending PlaybookStatus = "PENDING"
	// PlaybookRunning marks an execution whose steps are still being run.
	PlaybookRunning PlaybookStatus = "RUNNING"
	// PlaybookSucceeded marks an execution whose actions finished without a
	// fatal step failure.
	PlaybookSucceeded PlaybookStatus = "SUCCEEDED"
	// PlaybookFailed marks an execution that ended in failure.
	PlaybookFailed PlaybookStatus = "FAILED"
	// PlaybookRolledBack marks an execution whose rollback actions were run
	// after a fatal step failure.
	PlaybookRolledBack PlaybookStatus = "ROLLED_BACK"
)
// PlaybookStep describes one unit of work in a recovery playbook: a
// diagnosis check, a recovery action, or a rollback action.
type PlaybookStep struct {
	// ID identifies the step in logs and in the recorded StepResult.
	ID string `json:"id"`
	// Name is the human-readable label of the step.
	Name string `json:"name"`
	// Type selects the kind of work: shell, api, consensus, crypto, systemd,
	// http, prometheus.
	Type string `json:"type"`
	// Timeout bounds each attempt of the step; a value <= 0 means the attempt
	// only inherits the caller's context.
	Timeout time.Duration `json:"timeout"`
	// Retries is the total number of attempts made for the step; values <= 0
	// are treated as a single attempt.
	Retries int `json:"retries"`
	// Params carries free-form, step-specific arguments for the executor.
	Params map[string]any `json:"params,omitempty"`
	// OnError chooses what happens when an action fails: abort, continue,
	// rollback.
	OnError string `json:"on_error"`
	// Condition is a prerequisite condition for the step.
	Condition string `json:"condition,omitempty"`
}
// Playbook is a complete recovery procedure: what triggers it, how to
// diagnose the problem, how to fix it, and how to back out.
type Playbook struct {
	// ID is the key under which the playbook is registered and looked up.
	ID string `json:"id"`
	// Name is the human-readable title used in logs.
	Name string `json:"name"`
	// Version is the revision label of the playbook definition.
	Version string `json:"version"`
	// TriggerMetric names the metric associated with the playbook.
	TriggerMetric string `json:"trigger_metric"`
	// TriggerSeverity is the severity level associated with the trigger.
	TriggerSeverity string `json:"trigger_severity"`
	// DiagnosisChecks run first; their failures are logged but do not stop
	// the run.
	DiagnosisChecks []PlaybookStep `json:"diagnosis_checks"`
	// Actions are the recovery steps, run in order after the diagnosis checks.
	Actions []PlaybookStep `json:"actions"`
	// RollbackActions run, in order, after an action fails fatally.
	RollbackActions []PlaybookStep `json:"rollback_actions"`
	// SuccessCriteria lists the conditions that define a successful recovery.
	// NOTE(review): nothing visible in this file evaluates them — confirm
	// where they are checked.
	SuccessCriteria []string `json:"success_criteria"`
}
// PlaybookExecution is the record of one run of a playbook against one
// component.
type PlaybookExecution struct {
	// ID is the engine-assigned identifier of this run.
	ID string `json:"id"`
	// PlaybookID is the ID of the playbook that was run.
	PlaybookID string `json:"playbook_id"`
	// Component is the target the playbook was run against.
	Component string `json:"component"`
	// Status is the current lifecycle state of the run.
	Status PlaybookStatus `json:"status"`
	// StartedAt is the time the run was created.
	StartedAt time.Time `json:"started_at"`
	// CompletedAt is the time the run finished; it is the zero time until
	// then. Note that encoding/json's omitempty never treats a struct value
	// such as time.Time as empty, so the field is emitted even when zero.
	CompletedAt time.Time `json:"completed_at,omitempty"`
	// StepsRun holds the results of every step executed so far, in order.
	StepsRun []StepResult `json:"steps_run"`
	// Error is the message of the failure that ended the run, if any.
	Error string `json:"error,omitempty"`
}
// StepResult is the outcome of running one playbook step, including all of
// its attempts.
type StepResult struct {
	// StepID is the ID of the step that was run.
	StepID string `json:"step_id"`
	// StepName is the human-readable name of the step.
	StepName string `json:"step_name"`
	// Success reports whether any attempt completed without error.
	Success bool `json:"success"`
	// Duration is the wall-clock time spent on the step across all attempts.
	Duration time.Duration `json:"duration"`
	// Output is the executor's output from the successful attempt.
	Output string `json:"output,omitempty"`
	// Error is the message of the last failed attempt when Success is false.
	Error string `json:"error,omitempty"`
}
// PlaybookExecutorFunc performs the actual work of one playbook step against
// the named component. It receives a context that may carry the step's
// timeout, and returns the step's output or the error that made it fail.
type PlaybookExecutorFunc func(ctx context.Context, step PlaybookStep, component string) (string, error)
// RecoveryPlaybookEngine stores registered playbooks, runs them through a
// pluggable executor, and keeps a history of executions.
type RecoveryPlaybookEngine struct {
	// mu guards playbooks, the executions slice and execCount.
	mu sync.RWMutex
	// playbooks maps playbook ID to its definition.
	playbooks map[string]*Playbook
	// executions is the run history, oldest first.
	executions []*PlaybookExecution
	// execCount is the number of runs started; it seeds execution IDs.
	execCount int64
	// executor is called once per step attempt to do the real work.
	executor PlaybookExecutorFunc
	// logger is the structured logger used for all engine messages.
	logger *slog.Logger
}
// NewRecoveryPlaybookEngine builds an engine with no playbooks and no
// history. Every step of every playbook is delegated to executor. Log
// messages go to the default slog logger, tagged with the component name
// "sarl-recovery-playbooks".
func NewRecoveryPlaybookEngine(executor PlaybookExecutorFunc) *RecoveryPlaybookEngine {
	engine := &RecoveryPlaybookEngine{
		executor: executor,
		logger:   slog.Default().With("component", "sarl-recovery-playbooks"),
	}
	engine.playbooks = make(map[string]*Playbook)
	engine.executions = make([]*PlaybookExecution, 0)
	return engine
}
// RegisterPlaybook stores pb under pb.ID, replacing any playbook already
// registered with the same ID. The engine keeps its own copy of the struct.
func (rpe *RecoveryPlaybookEngine) RegisterPlaybook(pb Playbook) {
	// pb is already a by-value copy, so taking its address does not alias
	// the caller's variable.
	stored := &pb

	rpe.mu.Lock()
	defer rpe.mu.Unlock()

	rpe.playbooks[stored.ID] = stored
	rpe.logger.Info("playbook registered", "id", stored.ID, "name", stored.Name)
}
// Execute runs the playbook registered under playbookID against component
// and returns the ID of the execution record it created.
//
// The run has three phases:
//  1. Diagnosis checks: every check runs; failures are logged and ignored.
//  2. Actions: run in order. A failed action with OnError "continue" is
//     skipped; any other failure stops the remaining actions.
//  3. If an action failed fatally, every rollback action runs and the
//     execution is marked ROLLED_BACK; otherwise it is marked SUCCEEDED.
//
// It returns an empty ID and an error when the playbook is unknown. When an
// action fails fatally it returns the execution ID together with an error
// naming the failed step.
//
// The execution record becomes visible to GetExecution and RecentExecutions
// as soon as it is created, so every later change to it is made while
// holding the engine lock.
func (rpe *RecoveryPlaybookEngine) Execute(ctx context.Context, playbookID, component string) (string, error) {
	rpe.mu.Lock()
	pb, ok := rpe.playbooks[playbookID]
	if !ok {
		rpe.mu.Unlock()
		return "", fmt.Errorf("playbook %s not found", playbookID)
	}
	rpe.execCount++
	exec := &PlaybookExecution{
		ID:         fmt.Sprintf("exec-%d", rpe.execCount),
		PlaybookID: playbookID,
		Component:  component,
		Status:     PlaybookRunning,
		StartedAt:  time.Now(),
		StepsRun:   make([]StepResult, 0),
	}
	rpe.executions = append(rpe.executions, exec)
	rpe.mu.Unlock()

	// ID and StartedAt are never modified after the record is published, so
	// local copies can be used freely without taking the lock again.
	execID, startedAt := exec.ID, exec.StartedAt

	// record appends a step result under the engine lock, because readers
	// may be copying the record concurrently.
	record := func(res StepResult) {
		rpe.mu.Lock()
		exec.StepsRun = append(exec.StepsRun, res)
		rpe.mu.Unlock()
	}

	rpe.logger.Info("playbook execution started",
		"exec_id", execID,
		"playbook", pb.Name,
		"component", component,
	)

	// Phase 1: Diagnosis checks. Failures are informational only.
	for _, check := range pb.DiagnosisChecks {
		result := rpe.runStep(ctx, check, component)
		record(result)
		if !result.Success {
			rpe.logger.Warn("diagnosis check failed",
				"step", check.ID,
				"error", result.Error,
			)
		}
	}

	// Phase 2: Execute recovery actions.
	var execErr error
	for _, action := range pb.Actions {
		result := rpe.runStep(ctx, action, component)
		record(result)
		if result.Success {
			continue
		}
		switch action.OnError {
		case "continue":
			continue
		case "rollback":
			execErr = fmt.Errorf("step %s failed (rollback): %s", action.ID, result.Error)
		default: // "abort"
			execErr = fmt.Errorf("step %s failed: %s", action.ID, result.Error)
		}
		// A fatal failure ends the action phase.
		break
	}

	// Phase 3: Handle result.
	if execErr != nil {
		rpe.logger.Error("playbook failed, executing rollback",
			"exec_id", execID,
			"error", execErr,
		)
		for _, rb := range pb.RollbackActions {
			result := rpe.runStep(ctx, rb, component)
			record(result)
			if !result.Success {
				// Rollback continues regardless, but a failed rollback step
				// must not go unnoticed.
				rpe.logger.Warn("rollback step failed",
					"exec_id", execID,
					"step", rb.ID,
					"error", result.Error,
				)
			}
		}
	}

	// Publish the terminal state in one critical section so a reader never
	// sees a finished status without its completion time.
	finishedAt := time.Now()
	rpe.mu.Lock()
	if execErr != nil {
		exec.Status = PlaybookRolledBack
		exec.Error = execErr.Error()
	} else {
		exec.Status = PlaybookSucceeded
	}
	exec.CompletedAt = finishedAt
	rpe.mu.Unlock()

	if execErr == nil {
		rpe.logger.Info("playbook succeeded",
			"exec_id", execID,
			"component", component,
			"duration", finishedAt.Sub(startedAt),
		)
	}
	return execID, execErr
}
// runStep runs one playbook step through the engine's executor and reports
// the outcome.
//
// step.Retries is the total number of attempts; values <= 0 mean a single
// attempt. When step.Timeout is positive, each attempt gets its own context
// with that timeout, derived from ctx. The first attempt always runs; further
// attempts are skipped once ctx itself is done. If no executor is configured
// the step fails immediately instead of panicking.
//
// The returned StepResult carries the output of the successful attempt, or
// the error message of the last failed one, plus the total elapsed time.
func (rpe *RecoveryPlaybookEngine) runStep(ctx context.Context, step PlaybookStep, component string) StepResult {
	start := time.Now()
	result := StepResult{
		StepID:   step.ID,
		StepName: step.Name,
	}

	if rpe.executor == nil {
		result.Error = "no executor configured"
		result.Duration = time.Since(start)
		return result
	}

	attempts := step.Retries
	if attempts <= 0 {
		attempts = 1
	}

	// runOnce performs a single attempt; defer guarantees the timeout
	// context is released even if the executor panics.
	runOnce := func() (string, error) {
		if step.Timeout <= 0 {
			return rpe.executor(ctx, step, component)
		}
		stepCtx, cancel := context.WithTimeout(ctx, step.Timeout)
		defer cancel()
		return rpe.executor(stepCtx, step, component)
	}

	var lastErr error
	for attempt := 1; attempt <= attempts; attempt++ {
		output, err := runOnce()
		if err == nil {
			result.Success = true
			result.Output = output
			result.Duration = time.Since(start)
			return result
		}
		lastErr = err

		// Once the caller's context is cancelled or expired, another
		// attempt cannot succeed through a context-aware executor and would
		// run unwanted work through one that ignores the context.
		if ctx.Err() != nil {
			break
		}
		if attempt < attempts {
			rpe.logger.Warn("step retry",
				"step", step.ID,
				"attempt", attempt,
				"error", err,
			)
		}
	}

	result.Success = false
	result.Error = lastErr.Error()
	result.Duration = time.Since(start)
	return result
}
// GetExecution looks up an execution by ID. On a hit it returns a copy of
// the record and true; otherwise nil and false. The copy's StepsRun slice
// has its own backing array, so the caller cannot alter the engine's record
// through it.
func (rpe *RecoveryPlaybookEngine) GetExecution(id string) (*PlaybookExecution, bool) {
	rpe.mu.RLock()
	defer rpe.mu.RUnlock()

	for i := range rpe.executions {
		src := rpe.executions[i]
		if src.ID != id {
			continue
		}
		snapshot := *src
		steps := make([]StepResult, len(src.StepsRun))
		copy(steps, src.StepsRun)
		snapshot.StepsRun = steps
		return &snapshot, true
	}
	return nil, false
}
// RecentExecutions returns copies of the most recent executions, oldest
// first. At most n records are returned; if fewer exist, all are returned.
//
// It returns nil when the engine has no history. With history present, a
// non-positive n yields an empty slice. Each copy's StepsRun slice has its
// own backing array, so the caller cannot alter the engine's records.
func (rpe *RecoveryPlaybookEngine) RecentExecutions(n int) []PlaybookExecution {
	rpe.mu.RLock()
	defer rpe.mu.RUnlock()

	total := len(rpe.executions)
	if total == 0 {
		return nil
	}

	// Clamp the request to [0, total]: make panics on a negative capacity,
	// and an oversized n would allocate far more than can ever be filled.
	count := n
	if count < 0 {
		count = 0
	}
	if count > total {
		count = total
	}

	result := make([]PlaybookExecution, 0, count)
	for _, src := range rpe.executions[total-count:] {
		cp := *src
		cp.StepsRun = make([]StepResult, len(src.StepsRun))
		copy(cp.StepsRun, src.StepsRun)
		result = append(result, cp)
	}
	return result
}
// PlaybookCount reports how many playbooks are currently registered.
func (rpe *RecoveryPlaybookEngine) PlaybookCount() int {
	rpe.mu.RLock()
	count := len(rpe.playbooks)
	rpe.mu.RUnlock()
	return count
}
// --- Built-in playbooks per the technical specification (TZ), §7.1 ---
// DefaultPlaybooks returns the three built-in recovery playbooks, in this
// order: component resurrection, consensus recovery, crypto rotation. Each
// call builds fresh values.
func DefaultPlaybooks() []Playbook {
	builtin := make([]Playbook, 0, 3)
	builtin = append(builtin,
		ComponentResurrectionPlaybook(),
		ConsensusRecoveryPlaybook(),
		CryptoRotationPlaybook(),
	)
	return builtin
}
// ComponentResurrectionPlaybook returns the built-in playbook for bringing
// an offline component back (technical specification, §7.1.1). It captures
// forensics, clears temporary resources, restarts the component and verifies
// its health; on a fatal failure it enters safe mode and notifies the
// architect.
func ComponentResurrectionPlaybook() Playbook {
	diagnosis := []PlaybookStep{
		{ID: "diag-process", Name: "Check process exists", Type: "shell", Timeout: 5 * time.Second},
		{ID: "diag-crashes", Name: "Check recent crashes", Type: "shell", Timeout: 5 * time.Second},
		{ID: "diag-resources", Name: "Check resource exhaustion", Type: "prometheus", Timeout: 5 * time.Second},
		{ID: "diag-deps", Name: "Check dependency health", Type: "http", Timeout: 10 * time.Second},
	}

	// Only the restart and the health verification are fatal; the
	// surrounding housekeeping and notification steps are best-effort.
	actions := []PlaybookStep{
		{ID: "capture-forensics", Name: "Capture forensics", Type: "shell", Timeout: 30 * time.Second, OnError: "continue"},
		{ID: "clear-resources", Name: "Clear temp resources", Type: "shell", Timeout: 10 * time.Second, OnError: "continue"},
		{ID: "restart-component", Name: "Restart component", Type: "systemd", Timeout: 60 * time.Second, OnError: "abort"},
		{ID: "verify-health", Name: "Verify health", Type: "http", Timeout: 30 * time.Second, Retries: 3, OnError: "abort"},
		{ID: "verify-metrics", Name: "Verify metrics", Type: "prometheus", Timeout: 30 * time.Second, OnError: "continue"},
		{ID: "notify-success", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
	}

	rollback := []PlaybookStep{
		{ID: "rb-safe-mode", Name: "Enter safe mode", Type: "api", Timeout: 10 * time.Second},
		{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
	}

	criteria := []string{
		"component_status == HEALTHY",
		"health_check_passed == true",
		"no_crashes_for_5min == true",
	}

	return Playbook{
		ID:              "component-resurrection",
		Name:            "Component Resurrection",
		Version:         "1.0",
		TriggerMetric:   "component_offline",
		TriggerSeverity: "CRITICAL",
		DiagnosisChecks: diagnosis,
		Actions:         actions,
		RollbackActions: rollback,
		SuccessCriteria: criteria,
	}
}
// ConsensusRecoveryPlaybook returns the built-in playbook for recovering
// distributed consensus after a split brain (technical specification,
// §7.1.2). It pauses writes, elects a leader, syncs state from it, verifies
// consistency and resumes writes; on a fatal failure it stays read-only and
// notifies the architect.
func ConsensusRecoveryPlaybook() Playbook {
	diagnosis := []PlaybookStep{
		{ID: "diag-peers", Name: "Check peer connectivity", Type: "api", Timeout: 10 * time.Second},
		{ID: "diag-sync", Name: "Check sync status", Type: "api", Timeout: 10 * time.Second},
		{ID: "diag-genome", Name: "Verify genome", Type: "api", Timeout: 5 * time.Second},
	}

	// Every step except the final cluster notification is fatal on failure.
	actions := []PlaybookStep{
		{ID: "pause-writes", Name: "Pause all writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
		{ID: "elect-leader", Name: "Elect leader (Raft)", Type: "consensus", Timeout: 60 * time.Second, OnError: "abort"},
		{ID: "sync-state", Name: "Sync state from leader", Type: "api", Timeout: 300 * time.Second, OnError: "rollback"},
		{ID: "verify-consistency", Name: "Verify consistency", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
		{ID: "resume-writes", Name: "Resume writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
		{ID: "notify-cluster", Name: "Notify cluster", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
	}

	rollback := []PlaybookStep{
		{ID: "rb-readonly", Name: "Maintain readonly", Type: "api", Timeout: 10 * time.Second},
		{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
	}

	criteria := []string{
		"leader_elected == true",
		"state_synced == true",
		"consistency_verified == true",
		"writes_resumed == true",
	}

	return Playbook{
		ID:              "consensus-recovery",
		Name:            "Distributed Consensus Recovery",
		Version:         "1.0",
		TriggerMetric:   "split_brain",
		TriggerSeverity: "CRITICAL",
		DiagnosisChecks: diagnosis,
		Actions:         actions,
		RollbackActions: rollback,
		SuccessCriteria: criteria,
	}
}
// CryptoRotationPlaybook returns the built-in playbook for rotating
// cryptographic keys after a suspected compromise (technical specification,
// §7.1.3). It generates new keys, rotates the mTLS certificates, re-signs
// the decision chain, verifies peers and revokes the old keys; on a fatal
// failure it reverts to the previous keys and notifies the architect.
func CryptoRotationPlaybook() Playbook {
	diagnosis := []PlaybookStep{
		{ID: "diag-key-age", Name: "Check key age", Type: "crypto", Timeout: 5 * time.Second},
		{ID: "diag-usage", Name: "Check key usage anomaly", Type: "prometheus", Timeout: 5 * time.Second},
		{ID: "diag-tpm", Name: "Check TPM health", Type: "shell", Timeout: 5 * time.Second},
	}

	// Key generation is the only step that carries executor parameters.
	generateKeys := PlaybookStep{
		ID:      "gen-keys",
		Name:    "Generate new keys",
		Type:    "crypto",
		Timeout: 30 * time.Second,
		OnError: "abort",
		Params:  map[string]any{"algorithm": "ECDSA-P256"},
	}

	actions := []PlaybookStep{
		generateKeys,
		{ID: "rotate-certs", Name: "Rotate mTLS certs", Type: "crypto", Timeout: 120 * time.Second, OnError: "rollback"},
		{ID: "resign-chain", Name: "Re-sign decision chain", Type: "crypto", Timeout: 300 * time.Second, OnError: "continue"},
		{ID: "verify-peers", Name: "Verify peer certs", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
		{ID: "revoke-old", Name: "Revoke old keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "continue"},
		{ID: "notify-soc", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
	}

	rollback := []PlaybookStep{
		{ID: "rb-revert-keys", Name: "Revert to previous keys", Type: "crypto", Timeout: 30 * time.Second},
		{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
	}

	criteria := []string{
		"new_keys_generated == true",
		"certs_distributed == true",
		"peers_verified == true",
		"old_keys_revoked == true",
	}

	return Playbook{
		ID:              "crypto-rotation",
		Name:            "Cryptographic Key Rotation",
		Version:         "1.0",
		TriggerMetric:   "key_compromise",
		TriggerSeverity: "HIGH",
		DiagnosisChecks: diagnosis,
		Actions:         actions,
		RollbackActions: rollback,
		SuccessCriteria: criteria,
	}
}