gomcp/internal/application/resilience/healing_strategies.go

219 lines
7.3 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2026 Syntrex Lab. All rights reserved.
// Use of this source code is governed by an Apache-2.0 license
// that can be found in the LICENSE file.
package resilience
import "time"
// Built-in healing strategies per ТЗ §4.1.1.
// These are registered at startup via HealingEngine.RegisterStrategy().
// DefaultStrategies returns the 5 built-in healing strategies.
func DefaultStrategies() []HealingStrategy {
return []HealingStrategy{
RestartComponentStrategy(),
RollbackConfigStrategy(),
RecoverDatabaseStrategy(),
RecoverRulesStrategy(),
RecoverNetworkStrategy(),
}
}
// RestartComponentStrategy handles component crashes and offline states.
// Trigger: component_offline OR component_critical, 2 consecutive failures within 5m.
// Actions: graceful_stop → clear_temp → start → verify → notify.
// Rollback: escalate to next strategy.
func RestartComponentStrategy() HealingStrategy {
return HealingStrategy{
ID: "RESTART_COMPONENT",
Name: "Component Restart",
Trigger: TriggerCondition{
Statuses: []ComponentStatus{StatusOffline, StatusCritical},
ConsecutiveFailures: 2,
WithinWindow: 5 * time.Minute,
},
Actions: []Action{
{Type: ActionGracefulStop, Timeout: 10 * time.Second, OnError: "continue"},
{Type: ActionClearTempFiles, Timeout: 5 * time.Second, OnError: "continue"},
{Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "abort"},
{Type: ActionVerifyHealth, Timeout: 60 * time.Second, OnError: "abort"},
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "INFO",
"message": "Component restarted successfully",
},
},
},
Rollback: RollbackPlan{
OnFailure: "escalate",
Actions: []Action{
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
Params: map[string]interface{}{
"severity": "CRITICAL",
"message": "Component restart failed after max attempts",
},
},
},
},
MaxAttempts: 3,
Cooldown: 5 * time.Minute,
}
}
// RollbackConfigStrategy handles config tampering or validation failures.
// Trigger: config_tampering_detected OR config_validation_failed.
// Actions: freeze → verify_backup → rollback → restart → verify → notify.
func RollbackConfigStrategy() HealingStrategy {
return HealingStrategy{
ID: "ROLLBACK_CONFIG",
Name: "Configuration Rollback",
Trigger: TriggerCondition{
Metrics: []string{"config_tampering", "config_validation"},
},
Actions: []Action{
{Type: ActionFreezeConfig, Timeout: 5 * time.Second, OnError: "abort"},
{Type: ActionRollbackConfig, Timeout: 15 * time.Second, OnError: "abort"},
{Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "rollback"},
{Type: ActionVerifyConfig, Timeout: 10 * time.Second, OnError: "abort"},
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "WARNING",
"message": "Config rolled back due to tampering",
},
},
},
Rollback: RollbackPlan{
OnFailure: "enter_safe_mode",
Actions: []Action{
{Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
},
},
MaxAttempts: 1,
Cooldown: 1 * time.Hour,
}
}
// RecoverDatabaseStrategy handles SQLite corruption.
// Trigger: database_corruption OR sqlite_integrity_failed.
// Actions: readonly → backup → restore → verify → resume → notify.
func RecoverDatabaseStrategy() HealingStrategy {
return HealingStrategy{
ID: "RECOVER_DATABASE",
Name: "Database Recovery",
Trigger: TriggerCondition{
Metrics: []string{"database_corruption", "sqlite_integrity"},
},
Actions: []Action{
{Type: ActionSwitchReadOnly, Timeout: 5 * time.Second, OnError: "abort"},
{Type: ActionBackupDB, Timeout: 30 * time.Second, OnError: "continue"},
{Type: ActionRestoreSnapshot, Timeout: 60 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"snapshot_age_max": "1h",
},
},
{Type: ActionVerifyIntegrity, Timeout: 30 * time.Second, OnError: "abort"},
{Type: ActionResumeWrites, Timeout: 5 * time.Second, OnError: "abort"},
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "WARNING",
"message": "Database recovered from snapshot",
},
},
},
Rollback: RollbackPlan{
OnFailure: "enter_lockdown",
Actions: []Action{
{Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
Params: map[string]interface{}{
"severity": "CRITICAL",
"message": "Database recovery failed",
},
},
},
},
MaxAttempts: 2,
Cooldown: 2 * time.Hour,
}
}
// RecoverRulesStrategy handles correlation rule poisoning.
// Trigger: rule execution failure rate > 50%.
// Actions: disable_suspicious → revert_baseline → verify → reload → notify.
func RecoverRulesStrategy() HealingStrategy {
return HealingStrategy{
ID: "RECOVER_RULES",
Name: "Rule Poisoning Defense",
Trigger: TriggerCondition{
Metrics: []string{"rule_execution_failure_rate", "correlation_rule_anomaly"},
},
Actions: []Action{
{Type: ActionDisableRules, Timeout: 10 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"criteria": "failure_rate > 80%",
},
},
{Type: ActionRevertRules, Timeout: 15 * time.Second, OnError: "abort"},
{Type: ActionReloadEngine, Timeout: 30 * time.Second, OnError: "abort"},
{Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "continue"},
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "WARNING",
"message": "Rules recovered from baseline",
},
},
},
Rollback: RollbackPlan{
OnFailure: "disable_correlation",
},
MaxAttempts: 2,
Cooldown: 4 * time.Hour,
}
}
// RecoverNetworkStrategy handles network partition or mTLS cert expiry.
// Trigger: network_partition_detected OR mTLS_cert_expired.
// Actions: isolate → regen_certs → verify → restore → notify.
func RecoverNetworkStrategy() HealingStrategy {
return HealingStrategy{
ID: "RECOVER_NETWORK",
Name: "Network Isolation Recovery",
Trigger: TriggerCondition{
Metrics: []string{"network_partition", "mtls_cert_expiry"},
},
Actions: []Action{
{Type: ActionIsolateNetwork, Timeout: 5 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"scope": "external_only",
},
},
{Type: ActionRegenCerts, Timeout: 30 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"validity": "24h",
},
},
{Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "rollback"},
{Type: ActionRestoreNetwork, Timeout: 10 * time.Second, OnError: "abort"},
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "INFO",
"message": "Network connectivity restored",
},
},
},
Rollback: RollbackPlan{
OnFailure: "maintain_isolation",
Actions: []Action{
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
Params: map[string]interface{}{
"severity": "CRITICAL",
"message": "Network recovery failed, maintaining isolation",
},
},
},
},
MaxAttempts: 3,
Cooldown: 1 * time.Hour,
}
}