// Mirror of https://github.com/syntrex-lab/gomcp.git
// (synced 2026-04-24 20:06:21 +02:00; 402 lines, 13 KiB, Go).

// Copyright 2026 Syntrex Lab. All rights reserved.
// Use of this source code is governed by an Apache-2.0 license
// that can be found in the LICENSE file.

package resilience

import (
	"context"
	"fmt"
	"log/slog"
	"sync"
	"time"
)

// PlaybookStatus is the lifecycle state recorded on a PlaybookExecution.
type PlaybookStatus string

// Known playbook states. The string values are what ends up in the JSON
// encoding of PlaybookExecution.Status.
const (
	PlaybookPending    PlaybookStatus = "PENDING"
	PlaybookRunning    PlaybookStatus = "RUNNING"
	PlaybookSucceeded  PlaybookStatus = "SUCCEEDED"
	PlaybookFailed     PlaybookStatus = "FAILED"
	PlaybookRolledBack PlaybookStatus = "ROLLED_BACK"
)

// PlaybookStep describes one unit of work inside a recovery playbook.
//
// Timeout, when positive, bounds each individual attempt of the step.
// Retries is the total number of attempts; values <= 0 are treated as a
// single attempt by the engine. OnError is only consulted for steps in the
// action phase of a playbook.
type PlaybookStep struct {
	ID        string         `json:"id"`
	Name      string         `json:"name"`
	Type      string         `json:"type"` // one of: shell, api, consensus, crypto, systemd, http, prometheus
	Timeout   time.Duration  `json:"timeout"`
	Retries   int            `json:"retries"`
	Params    map[string]any `json:"params,omitempty"`
	OnError   string         `json:"on_error"`            // one of: abort, continue, rollback
	Condition string         `json:"condition,omitempty"` // prerequisite condition
}

// Playbook defines a complete recovery procedure.
|
||
type Playbook struct {
|
||
ID string `json:"id"`
|
||
Name string `json:"name"`
|
||
Version string `json:"version"`
|
||
TriggerMetric string `json:"trigger_metric"`
|
||
TriggerSeverity string `json:"trigger_severity"`
|
||
DiagnosisChecks []PlaybookStep `json:"diagnosis_checks"`
|
||
Actions []PlaybookStep `json:"actions"`
|
||
RollbackActions []PlaybookStep `json:"rollback_actions"`
|
||
SuccessCriteria []string `json:"success_criteria"`
|
||
}
|
||
|
||
// PlaybookExecution tracks a single playbook run.
|
||
type PlaybookExecution struct {
|
||
ID string `json:"id"`
|
||
PlaybookID string `json:"playbook_id"`
|
||
Component string `json:"component"`
|
||
Status PlaybookStatus `json:"status"`
|
||
StartedAt time.Time `json:"started_at"`
|
||
CompletedAt time.Time `json:"completed_at,omitempty"`
|
||
StepsRun []StepResult `json:"steps_run"`
|
||
Error string `json:"error,omitempty"`
|
||
}
|
||
|
||
// StepResult is the outcome of running one playbook step, including all of
// its attempts. Duration covers the whole step, not a single attempt.
// Output is filled in on success and Error on failure.
type StepResult struct {
	StepID   string        `json:"step_id"`
	StepName string        `json:"step_name"`
	Success  bool          `json:"success"`
	Duration time.Duration `json:"duration"`
	Output   string        `json:"output,omitempty"`
	Error    string        `json:"error,omitempty"`
}

// PlaybookExecutorFunc runs a single playbook step.
|
||
type PlaybookExecutorFunc func(ctx context.Context, step PlaybookStep, component string) (string, error)
|
||
|
||
// RecoveryPlaybookEngine manages and executes recovery playbooks.
|
||
type RecoveryPlaybookEngine struct {
|
||
mu sync.RWMutex
|
||
playbooks map[string]*Playbook
|
||
executions []*PlaybookExecution
|
||
execCount int64
|
||
executor PlaybookExecutorFunc
|
||
logger *slog.Logger
|
||
}
|
||
|
||
// NewRecoveryPlaybookEngine creates a new playbook engine.
|
||
func NewRecoveryPlaybookEngine(executor PlaybookExecutorFunc) *RecoveryPlaybookEngine {
|
||
return &RecoveryPlaybookEngine{
|
||
playbooks: make(map[string]*Playbook),
|
||
executions: make([]*PlaybookExecution, 0),
|
||
executor: executor,
|
||
logger: slog.Default().With("component", "sarl-recovery-playbooks"),
|
||
}
|
||
}
|
||
|
||
// RegisterPlaybook adds a playbook to the engine.
|
||
func (rpe *RecoveryPlaybookEngine) RegisterPlaybook(pb Playbook) {
|
||
rpe.mu.Lock()
|
||
defer rpe.mu.Unlock()
|
||
rpe.playbooks[pb.ID] = &pb
|
||
rpe.logger.Info("playbook registered", "id", pb.ID, "name", pb.Name)
|
||
}
|
||
|
||
// Execute runs a playbook for a given component. Returns the execution ID.
|
||
func (rpe *RecoveryPlaybookEngine) Execute(ctx context.Context, playbookID, component string) (string, error) {
|
||
rpe.mu.Lock()
|
||
pb, ok := rpe.playbooks[playbookID]
|
||
if !ok {
|
||
rpe.mu.Unlock()
|
||
return "", fmt.Errorf("playbook %s not found", playbookID)
|
||
}
|
||
|
||
rpe.execCount++
|
||
exec := &PlaybookExecution{
|
||
ID: fmt.Sprintf("exec-%d", rpe.execCount),
|
||
PlaybookID: playbookID,
|
||
Component: component,
|
||
Status: PlaybookRunning,
|
||
StartedAt: time.Now(),
|
||
StepsRun: make([]StepResult, 0),
|
||
}
|
||
rpe.executions = append(rpe.executions, exec)
|
||
rpe.mu.Unlock()
|
||
|
||
rpe.logger.Info("playbook execution started",
|
||
"exec_id", exec.ID,
|
||
"playbook", pb.Name,
|
||
"component", component,
|
||
)
|
||
|
||
// Phase 1: Diagnosis checks.
|
||
for _, check := range pb.DiagnosisChecks {
|
||
result := rpe.runStep(ctx, check, component)
|
||
exec.StepsRun = append(exec.StepsRun, result)
|
||
if !result.Success {
|
||
rpe.logger.Warn("diagnosis check failed",
|
||
"step", check.ID,
|
||
"error", result.Error,
|
||
)
|
||
}
|
||
}
|
||
|
||
// Phase 2: Execute recovery actions.
|
||
var execErr error
|
||
for _, action := range pb.Actions {
|
||
result := rpe.runStep(ctx, action, component)
|
||
exec.StepsRun = append(exec.StepsRun, result)
|
||
|
||
if !result.Success {
|
||
switch action.OnError {
|
||
case "continue":
|
||
continue
|
||
case "rollback":
|
||
execErr = fmt.Errorf("step %s failed (rollback): %s", action.ID, result.Error)
|
||
default: // "abort"
|
||
execErr = fmt.Errorf("step %s failed: %s", action.ID, result.Error)
|
||
}
|
||
break
|
||
}
|
||
}
|
||
|
||
// Phase 3: Handle result.
|
||
if execErr != nil {
|
||
rpe.logger.Error("playbook failed, executing rollback",
|
||
"exec_id", exec.ID,
|
||
"error", execErr,
|
||
)
|
||
|
||
// Execute rollback.
|
||
for _, rb := range pb.RollbackActions {
|
||
result := rpe.runStep(ctx, rb, component)
|
||
exec.StepsRun = append(exec.StepsRun, result)
|
||
}
|
||
|
||
exec.Status = PlaybookRolledBack
|
||
exec.Error = execErr.Error()
|
||
} else {
|
||
exec.Status = PlaybookSucceeded
|
||
rpe.logger.Info("playbook succeeded",
|
||
"exec_id", exec.ID,
|
||
"component", component,
|
||
"duration", time.Since(exec.StartedAt),
|
||
)
|
||
}
|
||
|
||
exec.CompletedAt = time.Now()
|
||
return exec.ID, execErr
|
||
}
|
||
|
||
// runStep executes a single step with timeout and retries.
|
||
func (rpe *RecoveryPlaybookEngine) runStep(ctx context.Context, step PlaybookStep, component string) StepResult {
|
||
start := time.Now()
|
||
result := StepResult{
|
||
StepID: step.ID,
|
||
StepName: step.Name,
|
||
}
|
||
|
||
retries := step.Retries
|
||
if retries <= 0 {
|
||
retries = 1
|
||
}
|
||
|
||
var lastErr error
|
||
for attempt := 0; attempt < retries; attempt++ {
|
||
stepCtx := ctx
|
||
var cancel context.CancelFunc
|
||
if step.Timeout > 0 {
|
||
stepCtx, cancel = context.WithTimeout(ctx, step.Timeout)
|
||
}
|
||
|
||
output, err := rpe.executor(stepCtx, step, component)
|
||
|
||
if cancel != nil {
|
||
cancel()
|
||
}
|
||
|
||
if err == nil {
|
||
result.Success = true
|
||
result.Output = output
|
||
result.Duration = time.Since(start)
|
||
return result
|
||
}
|
||
lastErr = err
|
||
|
||
if attempt < retries-1 {
|
||
rpe.logger.Warn("step retry",
|
||
"step", step.ID,
|
||
"attempt", attempt+1,
|
||
"error", err,
|
||
)
|
||
}
|
||
}
|
||
|
||
result.Success = false
|
||
result.Error = lastErr.Error()
|
||
result.Duration = time.Since(start)
|
||
return result
|
||
}
|
||
|
||
// GetExecution returns a playbook execution by ID.
|
||
// Returns a deep copy to prevent data races with the execution goroutine.
|
||
func (rpe *RecoveryPlaybookEngine) GetExecution(id string) (*PlaybookExecution, bool) {
|
||
rpe.mu.RLock()
|
||
defer rpe.mu.RUnlock()
|
||
|
||
for _, exec := range rpe.executions {
|
||
if exec.ID == id {
|
||
cp := *exec
|
||
cp.StepsRun = make([]StepResult, len(exec.StepsRun))
|
||
copy(cp.StepsRun, exec.StepsRun)
|
||
return &cp, true
|
||
}
|
||
}
|
||
return nil, false
|
||
}
|
||
|
||
// RecentExecutions returns the last N executions.
|
||
// Returns deep copies to prevent data races with the execution goroutine.
|
||
func (rpe *RecoveryPlaybookEngine) RecentExecutions(n int) []PlaybookExecution {
|
||
rpe.mu.RLock()
|
||
defer rpe.mu.RUnlock()
|
||
|
||
total := len(rpe.executions)
|
||
if total == 0 {
|
||
return nil
|
||
}
|
||
start := total - n
|
||
if start < 0 {
|
||
start = 0
|
||
}
|
||
|
||
result := make([]PlaybookExecution, 0, n)
|
||
for i := start; i < total; i++ {
|
||
cp := *rpe.executions[i]
|
||
cp.StepsRun = make([]StepResult, len(rpe.executions[i].StepsRun))
|
||
copy(cp.StepsRun, rpe.executions[i].StepsRun)
|
||
result = append(result, cp)
|
||
}
|
||
return result
|
||
}
|
||
|
||
// PlaybookCount returns the number of registered playbooks.
|
||
func (rpe *RecoveryPlaybookEngine) PlaybookCount() int {
|
||
rpe.mu.RLock()
|
||
defer rpe.mu.RUnlock()
|
||
return len(rpe.playbooks)
|
||
}
|
||
|
||
// --- Built-in playbooks per the technical specification (TZ) §7.1 ---

// DefaultPlaybooks returns the 3 built-in recovery playbooks.
|
||
func DefaultPlaybooks() []Playbook {
|
||
return []Playbook{
|
||
ComponentResurrectionPlaybook(),
|
||
ConsensusRecoveryPlaybook(),
|
||
CryptoRotationPlaybook(),
|
||
}
|
||
}
|
||
|
||
// ComponentResurrectionPlaybook per ТЗ §7.1.1.
|
||
func ComponentResurrectionPlaybook() Playbook {
|
||
return Playbook{
|
||
ID: "component-resurrection",
|
||
Name: "Component Resurrection",
|
||
Version: "1.0",
|
||
TriggerMetric: "component_offline",
|
||
TriggerSeverity: "CRITICAL",
|
||
DiagnosisChecks: []PlaybookStep{
|
||
{ID: "diag-process", Name: "Check process exists", Type: "shell", Timeout: 5 * time.Second},
|
||
{ID: "diag-crashes", Name: "Check recent crashes", Type: "shell", Timeout: 5 * time.Second},
|
||
{ID: "diag-resources", Name: "Check resource exhaustion", Type: "prometheus", Timeout: 5 * time.Second},
|
||
{ID: "diag-deps", Name: "Check dependency health", Type: "http", Timeout: 10 * time.Second},
|
||
},
|
||
Actions: []PlaybookStep{
|
||
{ID: "capture-forensics", Name: "Capture forensics", Type: "shell", Timeout: 30 * time.Second, OnError: "continue"},
|
||
{ID: "clear-resources", Name: "Clear temp resources", Type: "shell", Timeout: 10 * time.Second, OnError: "continue"},
|
||
{ID: "restart-component", Name: "Restart component", Type: "systemd", Timeout: 60 * time.Second, OnError: "abort"},
|
||
{ID: "verify-health", Name: "Verify health", Type: "http", Timeout: 30 * time.Second, Retries: 3, OnError: "abort"},
|
||
{ID: "verify-metrics", Name: "Verify metrics", Type: "prometheus", Timeout: 30 * time.Second, OnError: "continue"},
|
||
{ID: "notify-success", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
|
||
},
|
||
RollbackActions: []PlaybookStep{
|
||
{ID: "rb-safe-mode", Name: "Enter safe mode", Type: "api", Timeout: 10 * time.Second},
|
||
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
|
||
},
|
||
SuccessCriteria: []string{
|
||
"component_status == HEALTHY",
|
||
"health_check_passed == true",
|
||
"no_crashes_for_5min == true",
|
||
},
|
||
}
|
||
}
|
||
|
||
// ConsensusRecoveryPlaybook per ТЗ §7.1.2.
|
||
func ConsensusRecoveryPlaybook() Playbook {
|
||
return Playbook{
|
||
ID: "consensus-recovery",
|
||
Name: "Distributed Consensus Recovery",
|
||
Version: "1.0",
|
||
TriggerMetric: "split_brain",
|
||
TriggerSeverity: "CRITICAL",
|
||
DiagnosisChecks: []PlaybookStep{
|
||
{ID: "diag-peers", Name: "Check peer connectivity", Type: "api", Timeout: 10 * time.Second},
|
||
{ID: "diag-sync", Name: "Check sync status", Type: "api", Timeout: 10 * time.Second},
|
||
{ID: "diag-genome", Name: "Verify genome", Type: "api", Timeout: 5 * time.Second},
|
||
},
|
||
Actions: []PlaybookStep{
|
||
{ID: "pause-writes", Name: "Pause all writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
|
||
{ID: "elect-leader", Name: "Elect leader (Raft)", Type: "consensus", Timeout: 60 * time.Second, OnError: "abort"},
|
||
{ID: "sync-state", Name: "Sync state from leader", Type: "api", Timeout: 300 * time.Second, OnError: "rollback"},
|
||
{ID: "verify-consistency", Name: "Verify consistency", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
|
||
{ID: "resume-writes", Name: "Resume writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
|
||
{ID: "notify-cluster", Name: "Notify cluster", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
|
||
},
|
||
RollbackActions: []PlaybookStep{
|
||
{ID: "rb-readonly", Name: "Maintain readonly", Type: "api", Timeout: 10 * time.Second},
|
||
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
|
||
},
|
||
SuccessCriteria: []string{
|
||
"leader_elected == true",
|
||
"state_synced == true",
|
||
"consistency_verified == true",
|
||
"writes_resumed == true",
|
||
},
|
||
}
|
||
}
|
||
|
||
// CryptoRotationPlaybook per ТЗ §7.1.3.
|
||
func CryptoRotationPlaybook() Playbook {
|
||
return Playbook{
|
||
ID: "crypto-rotation",
|
||
Name: "Cryptographic Key Rotation",
|
||
Version: "1.0",
|
||
TriggerMetric: "key_compromise",
|
||
TriggerSeverity: "HIGH",
|
||
DiagnosisChecks: []PlaybookStep{
|
||
{ID: "diag-key-age", Name: "Check key age", Type: "crypto", Timeout: 5 * time.Second},
|
||
{ID: "diag-usage", Name: "Check key usage anomaly", Type: "prometheus", Timeout: 5 * time.Second},
|
||
{ID: "diag-tpm", Name: "Check TPM health", Type: "shell", Timeout: 5 * time.Second},
|
||
},
|
||
Actions: []PlaybookStep{
|
||
{ID: "gen-keys", Name: "Generate new keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "abort",
|
||
Params: map[string]interface{}{"algorithm": "ECDSA-P256"},
|
||
},
|
||
{ID: "rotate-certs", Name: "Rotate mTLS certs", Type: "crypto", Timeout: 120 * time.Second, OnError: "rollback"},
|
||
{ID: "resign-chain", Name: "Re-sign decision chain", Type: "crypto", Timeout: 300 * time.Second, OnError: "continue"},
|
||
{ID: "verify-peers", Name: "Verify peer certs", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
|
||
{ID: "revoke-old", Name: "Revoke old keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "continue"},
|
||
{ID: "notify-soc", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
|
||
},
|
||
RollbackActions: []PlaybookStep{
|
||
{ID: "rb-revert-keys", Name: "Revert to previous keys", Type: "crypto", Timeout: 30 * time.Second},
|
||
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
|
||
},
|
||
SuccessCriteria: []string{
|
||
"new_keys_generated == true",
|
||
"certs_distributed == true",
|
||
"peers_verified == true",
|
||
"old_keys_revoked == true",
|
||
},
|
||
}
|
||
}
|