mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-05-08 19:12:37 +02:00
Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates
This commit is contained in:
parent
694e32be26
commit
41cbfd6e0a
178 changed files with 36008 additions and 399 deletions
398
internal/application/resilience/recovery_playbooks.go
Normal file
398
internal/application/resilience/recovery_playbooks.go
Normal file
|
|
@ -0,0 +1,398 @@
|
|||
package resilience
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PlaybookStatus tracks the state of a running playbook.
|
||||
type PlaybookStatus string
|
||||
|
||||
const (
|
||||
PlaybookPending PlaybookStatus = "PENDING"
|
||||
PlaybookRunning PlaybookStatus = "RUNNING"
|
||||
PlaybookSucceeded PlaybookStatus = "SUCCEEDED"
|
||||
PlaybookFailed PlaybookStatus = "FAILED"
|
||||
PlaybookRolledBack PlaybookStatus = "ROLLED_BACK"
|
||||
)
|
||||
|
||||
// PlaybookStep is a single step in a recovery playbook.
|
||||
type PlaybookStep struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"` // shell, api, consensus, crypto, systemd, http, prometheus
|
||||
Timeout time.Duration `json:"timeout"`
|
||||
Retries int `json:"retries"`
|
||||
Params map[string]interface{} `json:"params,omitempty"`
|
||||
OnError string `json:"on_error"` // abort, continue, rollback
|
||||
Condition string `json:"condition,omitempty"` // prerequisite condition
|
||||
}
|
||||
|
||||
// Playbook defines a complete recovery procedure.
|
||||
type Playbook struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Version string `json:"version"`
|
||||
TriggerMetric string `json:"trigger_metric"`
|
||||
TriggerSeverity string `json:"trigger_severity"`
|
||||
DiagnosisChecks []PlaybookStep `json:"diagnosis_checks"`
|
||||
Actions []PlaybookStep `json:"actions"`
|
||||
RollbackActions []PlaybookStep `json:"rollback_actions"`
|
||||
SuccessCriteria []string `json:"success_criteria"`
|
||||
}
|
||||
|
||||
// PlaybookExecution tracks a single playbook run.
|
||||
type PlaybookExecution struct {
|
||||
ID string `json:"id"`
|
||||
PlaybookID string `json:"playbook_id"`
|
||||
Component string `json:"component"`
|
||||
Status PlaybookStatus `json:"status"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
CompletedAt time.Time `json:"completed_at,omitempty"`
|
||||
StepsRun []StepResult `json:"steps_run"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// StepResult records the execution of a single playbook step.
|
||||
type StepResult struct {
|
||||
StepID string `json:"step_id"`
|
||||
StepName string `json:"step_name"`
|
||||
Success bool `json:"success"`
|
||||
Duration time.Duration `json:"duration"`
|
||||
Output string `json:"output,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
// PlaybookExecutorFunc runs a single playbook step.
|
||||
type PlaybookExecutorFunc func(ctx context.Context, step PlaybookStep, component string) (string, error)
|
||||
|
||||
// RecoveryPlaybookEngine manages and executes recovery playbooks.
|
||||
type RecoveryPlaybookEngine struct {
|
||||
mu sync.RWMutex
|
||||
playbooks map[string]*Playbook
|
||||
executions []*PlaybookExecution
|
||||
execCount int64
|
||||
executor PlaybookExecutorFunc
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
// NewRecoveryPlaybookEngine creates a new playbook engine.
|
||||
func NewRecoveryPlaybookEngine(executor PlaybookExecutorFunc) *RecoveryPlaybookEngine {
|
||||
return &RecoveryPlaybookEngine{
|
||||
playbooks: make(map[string]*Playbook),
|
||||
executions: make([]*PlaybookExecution, 0),
|
||||
executor: executor,
|
||||
logger: slog.Default().With("component", "sarl-recovery-playbooks"),
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterPlaybook adds a playbook to the engine.
|
||||
func (rpe *RecoveryPlaybookEngine) RegisterPlaybook(pb Playbook) {
|
||||
rpe.mu.Lock()
|
||||
defer rpe.mu.Unlock()
|
||||
rpe.playbooks[pb.ID] = &pb
|
||||
rpe.logger.Info("playbook registered", "id", pb.ID, "name", pb.Name)
|
||||
}
|
||||
|
||||
// Execute runs a playbook for a given component. Returns the execution ID.
|
||||
func (rpe *RecoveryPlaybookEngine) Execute(ctx context.Context, playbookID, component string) (string, error) {
|
||||
rpe.mu.Lock()
|
||||
pb, ok := rpe.playbooks[playbookID]
|
||||
if !ok {
|
||||
rpe.mu.Unlock()
|
||||
return "", fmt.Errorf("playbook %s not found", playbookID)
|
||||
}
|
||||
|
||||
rpe.execCount++
|
||||
exec := &PlaybookExecution{
|
||||
ID: fmt.Sprintf("exec-%d", rpe.execCount),
|
||||
PlaybookID: playbookID,
|
||||
Component: component,
|
||||
Status: PlaybookRunning,
|
||||
StartedAt: time.Now(),
|
||||
StepsRun: make([]StepResult, 0),
|
||||
}
|
||||
rpe.executions = append(rpe.executions, exec)
|
||||
rpe.mu.Unlock()
|
||||
|
||||
rpe.logger.Info("playbook execution started",
|
||||
"exec_id", exec.ID,
|
||||
"playbook", pb.Name,
|
||||
"component", component,
|
||||
)
|
||||
|
||||
// Phase 1: Diagnosis checks.
|
||||
for _, check := range pb.DiagnosisChecks {
|
||||
result := rpe.runStep(ctx, check, component)
|
||||
exec.StepsRun = append(exec.StepsRun, result)
|
||||
if !result.Success {
|
||||
rpe.logger.Warn("diagnosis check failed",
|
||||
"step", check.ID,
|
||||
"error", result.Error,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: Execute recovery actions.
|
||||
var execErr error
|
||||
for _, action := range pb.Actions {
|
||||
result := rpe.runStep(ctx, action, component)
|
||||
exec.StepsRun = append(exec.StepsRun, result)
|
||||
|
||||
if !result.Success {
|
||||
switch action.OnError {
|
||||
case "continue":
|
||||
continue
|
||||
case "rollback":
|
||||
execErr = fmt.Errorf("step %s failed (rollback): %s", action.ID, result.Error)
|
||||
default: // "abort"
|
||||
execErr = fmt.Errorf("step %s failed: %s", action.ID, result.Error)
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 3: Handle result.
|
||||
if execErr != nil {
|
||||
rpe.logger.Error("playbook failed, executing rollback",
|
||||
"exec_id", exec.ID,
|
||||
"error", execErr,
|
||||
)
|
||||
|
||||
// Execute rollback.
|
||||
for _, rb := range pb.RollbackActions {
|
||||
result := rpe.runStep(ctx, rb, component)
|
||||
exec.StepsRun = append(exec.StepsRun, result)
|
||||
}
|
||||
|
||||
exec.Status = PlaybookRolledBack
|
||||
exec.Error = execErr.Error()
|
||||
} else {
|
||||
exec.Status = PlaybookSucceeded
|
||||
rpe.logger.Info("playbook succeeded",
|
||||
"exec_id", exec.ID,
|
||||
"component", component,
|
||||
"duration", time.Since(exec.StartedAt),
|
||||
)
|
||||
}
|
||||
|
||||
exec.CompletedAt = time.Now()
|
||||
return exec.ID, execErr
|
||||
}
|
||||
|
||||
// runStep executes a single step with timeout and retries.
|
||||
func (rpe *RecoveryPlaybookEngine) runStep(ctx context.Context, step PlaybookStep, component string) StepResult {
|
||||
start := time.Now()
|
||||
result := StepResult{
|
||||
StepID: step.ID,
|
||||
StepName: step.Name,
|
||||
}
|
||||
|
||||
retries := step.Retries
|
||||
if retries <= 0 {
|
||||
retries = 1
|
||||
}
|
||||
|
||||
var lastErr error
|
||||
for attempt := 0; attempt < retries; attempt++ {
|
||||
stepCtx := ctx
|
||||
var cancel context.CancelFunc
|
||||
if step.Timeout > 0 {
|
||||
stepCtx, cancel = context.WithTimeout(ctx, step.Timeout)
|
||||
}
|
||||
|
||||
output, err := rpe.executor(stepCtx, step, component)
|
||||
|
||||
if cancel != nil {
|
||||
cancel()
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
result.Success = true
|
||||
result.Output = output
|
||||
result.Duration = time.Since(start)
|
||||
return result
|
||||
}
|
||||
lastErr = err
|
||||
|
||||
if attempt < retries-1 {
|
||||
rpe.logger.Warn("step retry",
|
||||
"step", step.ID,
|
||||
"attempt", attempt+1,
|
||||
"error", err,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
result.Success = false
|
||||
result.Error = lastErr.Error()
|
||||
result.Duration = time.Since(start)
|
||||
return result
|
||||
}
|
||||
|
||||
// GetExecution returns a playbook execution by ID.
|
||||
// Returns a deep copy to prevent data races with the execution goroutine.
|
||||
func (rpe *RecoveryPlaybookEngine) GetExecution(id string) (*PlaybookExecution, bool) {
|
||||
rpe.mu.RLock()
|
||||
defer rpe.mu.RUnlock()
|
||||
|
||||
for _, exec := range rpe.executions {
|
||||
if exec.ID == id {
|
||||
cp := *exec
|
||||
cp.StepsRun = make([]StepResult, len(exec.StepsRun))
|
||||
copy(cp.StepsRun, exec.StepsRun)
|
||||
return &cp, true
|
||||
}
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// RecentExecutions returns the last N executions.
|
||||
// Returns deep copies to prevent data races with the execution goroutine.
|
||||
func (rpe *RecoveryPlaybookEngine) RecentExecutions(n int) []PlaybookExecution {
|
||||
rpe.mu.RLock()
|
||||
defer rpe.mu.RUnlock()
|
||||
|
||||
total := len(rpe.executions)
|
||||
if total == 0 {
|
||||
return nil
|
||||
}
|
||||
start := total - n
|
||||
if start < 0 {
|
||||
start = 0
|
||||
}
|
||||
|
||||
result := make([]PlaybookExecution, 0, n)
|
||||
for i := start; i < total; i++ {
|
||||
cp := *rpe.executions[i]
|
||||
cp.StepsRun = make([]StepResult, len(rpe.executions[i].StepsRun))
|
||||
copy(cp.StepsRun, rpe.executions[i].StepsRun)
|
||||
result = append(result, cp)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// PlaybookCount returns the number of registered playbooks.
|
||||
func (rpe *RecoveryPlaybookEngine) PlaybookCount() int {
|
||||
rpe.mu.RLock()
|
||||
defer rpe.mu.RUnlock()
|
||||
return len(rpe.playbooks)
|
||||
}
|
||||
|
||||
// --- Built-in playbooks per ТЗ §7.1 ---
|
||||
|
||||
// DefaultPlaybooks returns the 3 built-in recovery playbooks.
|
||||
func DefaultPlaybooks() []Playbook {
|
||||
return []Playbook{
|
||||
ComponentResurrectionPlaybook(),
|
||||
ConsensusRecoveryPlaybook(),
|
||||
CryptoRotationPlaybook(),
|
||||
}
|
||||
}
|
||||
|
||||
// ComponentResurrectionPlaybook per ТЗ §7.1.1.
|
||||
func ComponentResurrectionPlaybook() Playbook {
|
||||
return Playbook{
|
||||
ID: "component-resurrection",
|
||||
Name: "Component Resurrection",
|
||||
Version: "1.0",
|
||||
TriggerMetric: "component_offline",
|
||||
TriggerSeverity: "CRITICAL",
|
||||
DiagnosisChecks: []PlaybookStep{
|
||||
{ID: "diag-process", Name: "Check process exists", Type: "shell", Timeout: 5 * time.Second},
|
||||
{ID: "diag-crashes", Name: "Check recent crashes", Type: "shell", Timeout: 5 * time.Second},
|
||||
{ID: "diag-resources", Name: "Check resource exhaustion", Type: "prometheus", Timeout: 5 * time.Second},
|
||||
{ID: "diag-deps", Name: "Check dependency health", Type: "http", Timeout: 10 * time.Second},
|
||||
},
|
||||
Actions: []PlaybookStep{
|
||||
{ID: "capture-forensics", Name: "Capture forensics", Type: "shell", Timeout: 30 * time.Second, OnError: "continue"},
|
||||
{ID: "clear-resources", Name: "Clear temp resources", Type: "shell", Timeout: 10 * time.Second, OnError: "continue"},
|
||||
{ID: "restart-component", Name: "Restart component", Type: "systemd", Timeout: 60 * time.Second, OnError: "abort"},
|
||||
{ID: "verify-health", Name: "Verify health", Type: "http", Timeout: 30 * time.Second, Retries: 3, OnError: "abort"},
|
||||
{ID: "verify-metrics", Name: "Verify metrics", Type: "prometheus", Timeout: 30 * time.Second, OnError: "continue"},
|
||||
{ID: "notify-success", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
|
||||
},
|
||||
RollbackActions: []PlaybookStep{
|
||||
{ID: "rb-safe-mode", Name: "Enter safe mode", Type: "api", Timeout: 10 * time.Second},
|
||||
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
|
||||
},
|
||||
SuccessCriteria: []string{
|
||||
"component_status == HEALTHY",
|
||||
"health_check_passed == true",
|
||||
"no_crashes_for_5min == true",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// ConsensusRecoveryPlaybook per ТЗ §7.1.2.
|
||||
func ConsensusRecoveryPlaybook() Playbook {
|
||||
return Playbook{
|
||||
ID: "consensus-recovery",
|
||||
Name: "Distributed Consensus Recovery",
|
||||
Version: "1.0",
|
||||
TriggerMetric: "split_brain",
|
||||
TriggerSeverity: "CRITICAL",
|
||||
DiagnosisChecks: []PlaybookStep{
|
||||
{ID: "diag-peers", Name: "Check peer connectivity", Type: "api", Timeout: 10 * time.Second},
|
||||
{ID: "diag-sync", Name: "Check sync status", Type: "api", Timeout: 10 * time.Second},
|
||||
{ID: "diag-genome", Name: "Verify genome", Type: "api", Timeout: 5 * time.Second},
|
||||
},
|
||||
Actions: []PlaybookStep{
|
||||
{ID: "pause-writes", Name: "Pause all writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
|
||||
{ID: "elect-leader", Name: "Elect leader (Raft)", Type: "consensus", Timeout: 60 * time.Second, OnError: "abort"},
|
||||
{ID: "sync-state", Name: "Sync state from leader", Type: "api", Timeout: 300 * time.Second, OnError: "rollback"},
|
||||
{ID: "verify-consistency", Name: "Verify consistency", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
|
||||
{ID: "resume-writes", Name: "Resume writes", Type: "api", Timeout: 10 * time.Second, OnError: "abort"},
|
||||
{ID: "notify-cluster", Name: "Notify cluster", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
|
||||
},
|
||||
RollbackActions: []PlaybookStep{
|
||||
{ID: "rb-readonly", Name: "Maintain readonly", Type: "api", Timeout: 10 * time.Second},
|
||||
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
|
||||
},
|
||||
SuccessCriteria: []string{
|
||||
"leader_elected == true",
|
||||
"state_synced == true",
|
||||
"consistency_verified == true",
|
||||
"writes_resumed == true",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// CryptoRotationPlaybook per ТЗ §7.1.3.
|
||||
func CryptoRotationPlaybook() Playbook {
|
||||
return Playbook{
|
||||
ID: "crypto-rotation",
|
||||
Name: "Cryptographic Key Rotation",
|
||||
Version: "1.0",
|
||||
TriggerMetric: "key_compromise",
|
||||
TriggerSeverity: "HIGH",
|
||||
DiagnosisChecks: []PlaybookStep{
|
||||
{ID: "diag-key-age", Name: "Check key age", Type: "crypto", Timeout: 5 * time.Second},
|
||||
{ID: "diag-usage", Name: "Check key usage anomaly", Type: "prometheus", Timeout: 5 * time.Second},
|
||||
{ID: "diag-tpm", Name: "Check TPM health", Type: "shell", Timeout: 5 * time.Second},
|
||||
},
|
||||
Actions: []PlaybookStep{
|
||||
{ID: "gen-keys", Name: "Generate new keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "abort",
|
||||
Params: map[string]interface{}{"algorithm": "ECDSA-P256"},
|
||||
},
|
||||
{ID: "rotate-certs", Name: "Rotate mTLS certs", Type: "crypto", Timeout: 120 * time.Second, OnError: "rollback"},
|
||||
{ID: "resign-chain", Name: "Re-sign decision chain", Type: "crypto", Timeout: 300 * time.Second, OnError: "continue"},
|
||||
{ID: "verify-peers", Name: "Verify peer certs", Type: "api", Timeout: 60 * time.Second, OnError: "abort"},
|
||||
{ID: "revoke-old", Name: "Revoke old keys", Type: "crypto", Timeout: 30 * time.Second, OnError: "continue"},
|
||||
{ID: "notify-soc", Name: "Notify SOC", Type: "api", Timeout: 5 * time.Second, OnError: "continue"},
|
||||
},
|
||||
RollbackActions: []PlaybookStep{
|
||||
{ID: "rb-revert-keys", Name: "Revert to previous keys", Type: "crypto", Timeout: 30 * time.Second},
|
||||
{ID: "rb-notify", Name: "Notify architect", Type: "api", Timeout: 5 * time.Second},
|
||||
},
|
||||
SuccessCriteria: []string{
|
||||
"new_keys_generated == true",
|
||||
"certs_distributed == true",
|
||||
"peers_verified == true",
|
||||
"old_keys_revoked == true",
|
||||
},
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue