mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-05-08 19:12:37 +02:00
399 lines
13 KiB
Go
399 lines
13 KiB
Go
|
|
package resilience
|
|||
|
|
|
|||
|
|
import (
|
|||
|
|
"context"
|
|||
|
|
"fmt"
|
|||
|
|
"log/slog"
|
|||
|
|
"sync"
|
|||
|
|
"time"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// PlaybookStatus is the lifecycle state recorded on a playbook execution.
// Its underlying string is the value that appears in serialized output.
type PlaybookStatus string

// Lifecycle states a playbook execution can be in.
const (
	PlaybookPending    = PlaybookStatus("PENDING")
	PlaybookRunning    = PlaybookStatus("RUNNING")
	PlaybookSucceeded  = PlaybookStatus("SUCCEEDED")
	PlaybookFailed     = PlaybookStatus("FAILED")
	PlaybookRolledBack = PlaybookStatus("ROLLED_BACK")
)
|
|||
|
|
|
|||
|
|
// PlaybookStep describes one unit of work inside a recovery playbook:
// a diagnosis check, a recovery action, or a rollback action.
type PlaybookStep struct {
	// ID and Name identify the step in results and log records.
	ID   string `json:"id"`
	Name string `json:"name"`

	// Type selects the kind of executor work:
	// shell, api, consensus, crypto, systemd, http, prometheus.
	Type string `json:"type"`

	// Timeout bounds a single attempt; a value <= 0 means no per-attempt
	// deadline is added to the caller's context.
	Timeout time.Duration `json:"timeout"`

	// Retries is the total number of attempts; values <= 0 are treated as 1.
	Retries int `json:"retries"`

	// Params carries step-specific arguments for the executor.
	Params map[string]any `json:"params,omitempty"`

	// OnError is the failure policy for actions: abort, continue, rollback.
	OnError string `json:"on_error"`

	// Condition is an optional prerequisite condition.
	Condition string `json:"condition,omitempty"`
}
|
|||
|
|
|
|||
|
|
// Playbook defines a complete recovery procedure.
|
|||
|
|
type Playbook struct {
|
|||
|
|
ID string `json:"id"`
|
|||
|
|
Name string `json:"name"`
|
|||
|
|
Version string `json:"version"`
|
|||
|
|
TriggerMetric string `json:"trigger_metric"`
|
|||
|
|
TriggerSeverity string `json:"trigger_severity"`
|
|||
|
|
DiagnosisChecks []PlaybookStep `json:"diagnosis_checks"`
|
|||
|
|
Actions []PlaybookStep `json:"actions"`
|
|||
|
|
RollbackActions []PlaybookStep `json:"rollback_actions"`
|
|||
|
|
SuccessCriteria []string `json:"success_criteria"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// PlaybookExecution tracks a single playbook run.
|
|||
|
|
type PlaybookExecution struct {
|
|||
|
|
ID string `json:"id"`
|
|||
|
|
PlaybookID string `json:"playbook_id"`
|
|||
|
|
Component string `json:"component"`
|
|||
|
|
Status PlaybookStatus `json:"status"`
|
|||
|
|
StartedAt time.Time `json:"started_at"`
|
|||
|
|
CompletedAt time.Time `json:"completed_at,omitempty"`
|
|||
|
|
StepsRun []StepResult `json:"steps_run"`
|
|||
|
|
Error string `json:"error,omitempty"`
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// StepResult is the outcome of running one playbook step, including all of
// its attempts.
type StepResult struct {
	// StepID and StepName are copied from the step that was run.
	StepID   string `json:"step_id"`
	StepName string `json:"step_name"`

	// Success reports whether any attempt completed without error.
	Success bool `json:"success"`

	// Duration is the wall-clock time spent on the step across all attempts.
	Duration time.Duration `json:"duration"`

	// Output is the executor's output on success; Error is the last
	// attempt's error message on failure.
	Output string `json:"output,omitempty"`
	Error  string `json:"error,omitempty"`
}
|
|||
|
|
|
|||
|
|
// PlaybookExecutorFunc runs a single playbook step.
|
|||
|
|
type PlaybookExecutorFunc func(ctx context.Context, step PlaybookStep, component string) (string, error)
|
|||
|
|
|
|||
|
|
// RecoveryPlaybookEngine manages and executes recovery playbooks.
|
|||
|
|
type RecoveryPlaybookEngine struct {
|
|||
|
|
mu sync.RWMutex
|
|||
|
|
playbooks map[string]*Playbook
|
|||
|
|
executions []*PlaybookExecution
|
|||
|
|
execCount int64
|
|||
|
|
executor PlaybookExecutorFunc
|
|||
|
|
logger *slog.Logger
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// NewRecoveryPlaybookEngine creates a new playbook engine.
|
|||
|
|
func NewRecoveryPlaybookEngine(executor PlaybookExecutorFunc) *RecoveryPlaybookEngine {
|
|||
|
|
return &RecoveryPlaybookEngine{
|
|||
|
|
playbooks: make(map[string]*Playbook),
|
|||
|
|
executions: make([]*PlaybookExecution, 0),
|
|||
|
|
executor: executor,
|
|||
|
|
logger: slog.Default().With("component", "sarl-recovery-playbooks"),
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// RegisterPlaybook adds a playbook to the engine.
|
|||
|
|
func (rpe *RecoveryPlaybookEngine) RegisterPlaybook(pb Playbook) {
|
|||
|
|
rpe.mu.Lock()
|
|||
|
|
defer rpe.mu.Unlock()
|
|||
|
|
rpe.playbooks[pb.ID] = &pb
|
|||
|
|
rpe.logger.Info("playbook registered", "id", pb.ID, "name", pb.Name)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Execute runs a playbook for a given component. Returns the execution ID.
|
|||
|
|
func (rpe *RecoveryPlaybookEngine) Execute(ctx context.Context, playbookID, component string) (string, error) {
|
|||
|
|
rpe.mu.Lock()
|
|||
|
|
pb, ok := rpe.playbooks[playbookID]
|
|||
|
|
if !ok {
|
|||
|
|
rpe.mu.Unlock()
|
|||
|
|
return "", fmt.Errorf("playbook %s not found", playbookID)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
rpe.execCount++
|
|||
|
|
exec := &PlaybookExecution{
|
|||
|
|
ID: fmt.Sprintf("exec-%d", rpe.execCount),
|
|||
|
|
PlaybookID: playbookID,
|
|||
|
|
Component: component,
|
|||
|
|
Status: PlaybookRunning,
|
|||
|
|
StartedAt: time.Now(),
|
|||
|
|
StepsRun: make([]StepResult, 0),
|
|||
|
|
}
|
|||
|
|
rpe.executions = append(rpe.executions, exec)
|
|||
|
|
rpe.mu.Unlock()
|
|||
|
|
|
|||
|
|
rpe.logger.Info("playbook execution started",
|
|||
|
|
"exec_id", exec.ID,
|
|||
|
|
"playbook", pb.Name,
|
|||
|
|
"component", component,
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// Phase 1: Diagnosis checks.
|
|||
|
|
for _, check := range pb.DiagnosisChecks {
|
|||
|
|
result := rpe.runStep(ctx, check, component)
|
|||
|
|
exec.StepsRun = append(exec.StepsRun, result)
|
|||
|
|
if !result.Success {
|
|||
|
|
rpe.logger.Warn("diagnosis check failed",
|
|||
|
|
"step", check.ID,
|
|||
|
|
"error", result.Error,
|
|||
|
|
)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Phase 2: Execute recovery actions.
|
|||
|
|
var execErr error
|
|||
|
|
for _, action := range pb.Actions {
|
|||
|
|
result := rpe.runStep(ctx, action, component)
|
|||
|
|
exec.StepsRun = append(exec.StepsRun, result)
|
|||
|
|
|
|||
|
|
if !result.Success {
|
|||
|
|
switch action.OnError {
|
|||
|
|
case "continue":
|
|||
|
|
continue
|
|||
|
|
case "rollback":
|
|||
|
|
execErr = fmt.Errorf("step %s failed (rollback): %s", action.ID, result.Error)
|
|||
|
|
default: // "abort"
|
|||
|
|
execErr = fmt.Errorf("step %s failed: %s", action.ID, result.Error)
|
|||
|
|
}
|
|||
|
|
break
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Phase 3: Handle result.
|
|||
|
|
if execErr != nil {
|
|||
|
|
rpe.logger.Error("playbook failed, executing rollback",
|
|||
|
|
"exec_id", exec.ID,
|
|||
|
|
"error", execErr,
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// Execute rollback.
|
|||
|
|
for _, rb := range pb.RollbackActions {
|
|||
|
|
result := rpe.runStep(ctx, rb, component)
|
|||
|
|
exec.StepsRun = append(exec.StepsRun, result)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
exec.Status = PlaybookRolledBack
|
|||
|
|
exec.Error = execErr.Error()
|
|||
|
|
} else {
|
|||
|
|
exec.Status = PlaybookSucceeded
|
|||
|
|
rpe.logger.Info("playbook succeeded",
|
|||
|
|
"exec_id", exec.ID,
|
|||
|
|
"component", component,
|
|||
|
|
"duration", time.Since(exec.StartedAt),
|
|||
|
|
)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
exec.CompletedAt = time.Now()
|
|||
|
|
return exec.ID, execErr
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// runStep executes a single step with timeout and retries.
|
|||
|
|
func (rpe *RecoveryPlaybookEngine) runStep(ctx context.Context, step PlaybookStep, component string) StepResult {
|
|||
|
|
start := time.Now()
|
|||
|
|
result := StepResult{
|
|||
|
|
StepID: step.ID,
|
|||
|
|
StepName: step.Name,
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
retries := step.Retries
|
|||
|
|
if retries <= 0 {
|
|||
|
|
retries = 1
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
var lastErr error
|
|||
|
|
for attempt := 0; attempt < retries; attempt++ {
|
|||
|
|
stepCtx := ctx
|
|||
|
|
var cancel context.CancelFunc
|
|||
|
|
if step.Timeout > 0 {
|
|||
|
|
stepCtx, cancel = context.WithTimeout(ctx, step.Timeout)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
output, err := rpe.executor(stepCtx, step, component)
|
|||
|
|
|
|||
|
|
if cancel != nil {
|
|||
|
|
cancel()
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if err == nil {
|
|||
|
|
result.Success = true
|
|||
|
|
result.Output = output
|
|||
|
|
result.Duration = time.Since(start)
|
|||
|
|
return result
|
|||
|
|
}
|
|||
|
|
lastErr = err
|
|||
|
|
|
|||
|
|
if attempt < retries-1 {
|
|||
|
|
rpe.logger.Warn("step retry",
|
|||
|
|
"step", step.ID,
|
|||
|
|
"attempt", attempt+1,
|
|||
|
|
"error", err,
|
|||
|
|
)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
result.Success = false
|
|||
|
|
result.Error = lastErr.Error()
|
|||
|
|
result.Duration = time.Since(start)
|
|||
|
|
return result
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// GetExecution returns a playbook execution by ID.
|
|||
|
|
// Returns a deep copy to prevent data races with the execution goroutine.
|
|||
|
|
func (rpe *RecoveryPlaybookEngine) GetExecution(id string) (*PlaybookExecution, bool) {
|
|||
|
|
rpe.mu.RLock()
|
|||
|
|
defer rpe.mu.RUnlock()
|
|||
|
|
|
|||
|
|
for _, exec := range rpe.executions {
|
|||
|
|
if exec.ID == id {
|
|||
|
|
cp := *exec
|
|||
|
|
cp.StepsRun = make([]StepResult, len(exec.StepsRun))
|
|||
|
|
copy(cp.StepsRun, exec.StepsRun)
|
|||
|
|
return &cp, true
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
return nil, false
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// RecentExecutions returns the last N executions.
|
|||
|
|
// Returns deep copies to prevent data races with the execution goroutine.
|
|||
|
|
func (rpe *RecoveryPlaybookEngine) RecentExecutions(n int) []PlaybookExecution {
|
|||
|
|
rpe.mu.RLock()
|
|||
|
|
defer rpe.mu.RUnlock()
|
|||
|
|
|
|||
|
|
total := len(rpe.executions)
|
|||
|
|
if total == 0 {
|
|||
|
|
return nil
|
|||
|
|
}
|
|||
|
|
start := total - n
|
|||
|
|
if start < 0 {
|
|||
|
|
start = 0
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
result := make([]PlaybookExecution, 0, n)
|
|||
|
|
for i := start; i < total; i++ {
|
|||
|
|
cp := *rpe.executions[i]
|
|||
|
|
cp.StepsRun = make([]StepResult, len(rpe.executions[i].StepsRun))
|
|||
|
|
copy(cp.StepsRun, rpe.executions[i].StepsRun)
|
|||
|
|
result = append(result, cp)
|
|||
|
|
}
|
|||
|
|
return result
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// PlaybookCount returns the number of registered playbooks.
|
|||
|
|
func (rpe *RecoveryPlaybookEngine) PlaybookCount() int {
|
|||
|
|
rpe.mu.RLock()
|
|||
|
|
defer rpe.mu.RUnlock()
|
|||
|
|
return len(rpe.playbooks)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// --- Built-in playbooks per the technical specification (ТЗ) §7.1 ---
|
|||
|
|
|
|||
|
|
// DefaultPlaybooks returns the 3 built-in recovery playbooks.
|
|||
|
|
func DefaultPlaybooks() []Playbook {
|
|||
|
|
return []Playbook{
|
|||
|
|
ComponentResurrectionPlaybook(),
|
|||
|
|
ConsensusRecoveryPlaybook(),
|
|||
|
|
CryptoRotationPlaybook(),
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ComponentResurrectionPlaybook per ТЗ §7.1.1.
|
|||
|
|
func ComponentResurrectionPlaybook() Playbook {
|
|||
|
|
return Playbook{
|
|||
|
|
ID: "component-resurrection",
|
|||
|
|
Name: "Component Resurrection",
|
|||
|
|
Version: "1.0",
|
|||
|
|
TriggerMetric: "component_offline",
|
|||
|
|
TriggerSeverity: "CRITICAL",
|
|||
|
|
DiagnosisChecks: []PlaybookStep{
|
|||
|
|
{ID: "diag-process", Name: "Check process exists", Type: "shell", Timeout: 5 * time.Second},
|
|||
|
|
{ID: "diag-crashes", Name: "Check recent crashes", Type: "shell", Timeout: 5 * time.Second},
|
|||
|
|
{ID: "diag-resources", Name: "Check resource exhaustion", Type: "prometheus", Timeout: 5 * time.Second},
|
|||
|
|
{ID: "diag-deps", Name: "Check dependency health", Type: "http", Timeout: 10 * time.Second},
|
|||
|
|
},
|
|||
|
|
Actions: []PlaybookStep{
|
|||
|
|
{ID: "capture-forensics", Name: "Capture forensics", Type: "shell", Timeout: 30 * time.Second, OnError: "continue"},
|
|||
|
|
{ID: "clear-resources", Name: "Clear temp resources", Type: "shell", Timeout: 10 * time.Second, OnError: "continue"},
|
|||
|
|
{ID: "restart-component", Name: "Restart component", Type: "systemd", Timeout: 60 * time.Second, OnError: "abort"},
|
|||
|
|
{ID: "verify-health", Name: "Verify health", Type: "http", Timeout: 30 * time.Second, Retries: 3, OnError: "abort"},
|
|||
|
|
{ID: "verify-metrics", Name: "Verify metrics", Type: "prometheus", Timeout: 30 * time.Second, OnError: "continue"},
|
|||
|
|
{ID: "notify-success", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
|
|||
|
|
},
|
|||
|
|
RollbackActions: []PlaybookStep{
|
|||
|
|
{ID: "rb-safe-mode", Name: "Enter safe mode", Type: "api", Timeout: 10 * time.Second},
|
|||
|
|
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
|
|||
|
|
},
|
|||
|
|
SuccessCriteria: []string{
|
|||
|
|
"component_status == HEALTHY",
|
|||
|
|
"health_check_passed == true",
|
|||
|
|
"no_crashes_for_5min == true",
|
|||
|
|
},
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// ConsensusRecoveryPlaybook per ТЗ §7.1.2.
|
|||
|
|
func ConsensusRecoveryPlaybook() Playbook {
|
|||
|
|
return Playbook{
|
|||
|
|
ID: "consensus-recovery",
|
|||
|
|
Name: "Distributed Consensus Recovery",
|
|||
|
|
Version: "1.0",
|
|||
|
|
TriggerMetric: "split_brain",
|
|||
|
|
TriggerSeverity: "CRITICAL",
|
|||
|
|
DiagnosisChecks: []PlaybookStep{
|
|||
|
|
{ID: "diag-peers", Name: "Check peer connectivity", Type: "api", Timeout: 10 * time.Second},
|
|||
|
|
{ID: "diag-sync", Name: "Check sync status", Type: "api", Timeout: 10 * time.Second},
|
|||
|
|
{ID: "diag-genome", Name: "Verify genome", Type: "api", Timeout: 5 * time.Second},
|
|||
|
|
},
|
|||
|
|
Actions: []PlaybookStep{
|
|||
|
|
{ID: "pause-writes", Name: "Pause all writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
|
|||
|
|
{ID: "elect-leader", Name: "Elect leader (Raft)", Type: "consensus", Timeout: 60 * time.Second, OnError: "abort"},
|
|||
|
|
{ID: "sync-state", Name: "Sync state from leader", Type: "api", Timeout: 300 * time.Second, OnError: "rollback"},
|
|||
|
|
{ID: "verify-consistency", Name: "Verify consistency", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
|
|||
|
|
{ID: "resume-writes", Name: "Resume writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
|
|||
|
|
{ID: "notify-cluster", Name: "Notify cluster", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
|
|||
|
|
},
|
|||
|
|
RollbackActions: []PlaybookStep{
|
|||
|
|
{ID: "rb-readonly", Name: "Maintain readonly", Type: "api", Timeout: 10 * time.Second},
|
|||
|
|
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
|
|||
|
|
},
|
|||
|
|
SuccessCriteria: []string{
|
|||
|
|
"leader_elected == true",
|
|||
|
|
"state_synced == true",
|
|||
|
|
"consistency_verified == true",
|
|||
|
|
"writes_resumed == true",
|
|||
|
|
},
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// CryptoRotationPlaybook per ТЗ §7.1.3.
|
|||
|
|
func CryptoRotationPlaybook() Playbook {
|
|||
|
|
return Playbook{
|
|||
|
|
ID: "crypto-rotation",
|
|||
|
|
Name: "Cryptographic Key Rotation",
|
|||
|
|
Version: "1.0",
|
|||
|
|
TriggerMetric: "key_compromise",
|
|||
|
|
TriggerSeverity: "HIGH",
|
|||
|
|
DiagnosisChecks: []PlaybookStep{
|
|||
|
|
{ID: "diag-key-age", Name: "Check key age", Type: "crypto", Timeout: 5 * time.Second},
|
|||
|
|
{ID: "diag-usage", Name: "Check key usage anomaly", Type: "prometheus", Timeout: 5 * time.Second},
|
|||
|
|
{ID: "diag-tpm", Name: "Check TPM health", Type: "shell", Timeout: 5 * time.Second},
|
|||
|
|
},
|
|||
|
|
Actions: []PlaybookStep{
|
|||
|
|
{ID: "gen-keys", Name: "Generate new keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "abort",
|
|||
|
|
Params: map[string]interface{}{"algorithm": "ECDSA-P256"},
|
|||
|
|
},
|
|||
|
|
{ID: "rotate-certs", Name: "Rotate mTLS certs", Type: "crypto", Timeout: 120 * time.Second, OnError: "rollback"},
|
|||
|
|
{ID: "resign-chain", Name: "Re-sign decision chain", Type: "crypto", Timeout: 300 * time.Second, OnError: "continue"},
|
|||
|
|
{ID: "verify-peers", Name: "Verify peer certs", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
|
|||
|
|
{ID: "revoke-old", Name: "Revoke old keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "continue"},
|
|||
|
|
{ID: "notify-soc", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
|
|||
|
|
},
|
|||
|
|
RollbackActions: []PlaybookStep{
|
|||
|
|
{ID: "rb-revert-keys", Name: "Revert to previous keys", Type: "crypto", Timeout: 30 * time.Second},
|
|||
|
|
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
|
|||
|
|
},
|
|||
|
|
SuccessCriteria: []string{
|
|||
|
|
"new_keys_generated == true",
|
|||
|
|
"certs_distributed == true",
|
|||
|
|
"peers_verified == true",
|
|||
|
|
"old_keys_revoked == true",
|
|||
|
|
},
|
|||
|
|
}
|
|||
|
|
}
|