mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-04-24 20:06:21 +02:00
219 lines
7.3 KiB
Go
219 lines
7.3 KiB
Go
// Copyright 2026 Syntrex Lab. All rights reserved.
|
||
// Use of this source code is governed by an Apache-2.0 license
|
||
// that can be found in the LICENSE file.
|
||
|
||
package resilience
|
||
|
||
import "time"
|
||
|
||
// Built-in healing strategies per ТЗ §4.1.1.
|
||
// These are registered at startup via HealingEngine.RegisterStrategy().
|
||
|
||
// DefaultStrategies returns the 5 built-in healing strategies.
|
||
func DefaultStrategies() []HealingStrategy {
|
||
return []HealingStrategy{
|
||
RestartComponentStrategy(),
|
||
RollbackConfigStrategy(),
|
||
RecoverDatabaseStrategy(),
|
||
RecoverRulesStrategy(),
|
||
RecoverNetworkStrategy(),
|
||
}
|
||
}
|
||
|
||
// RestartComponentStrategy handles component crashes and offline states.
|
||
// Trigger: component_offline OR component_critical, 2 consecutive failures within 5m.
|
||
// Actions: graceful_stop → clear_temp → start → verify → notify.
|
||
// Rollback: escalate to next strategy.
|
||
func RestartComponentStrategy() HealingStrategy {
|
||
return HealingStrategy{
|
||
ID: "RESTART_COMPONENT",
|
||
Name: "Component Restart",
|
||
Trigger: TriggerCondition{
|
||
Statuses: []ComponentStatus{StatusOffline, StatusCritical},
|
||
ConsecutiveFailures: 2,
|
||
WithinWindow: 5 * time.Minute,
|
||
},
|
||
Actions: []Action{
|
||
{Type: ActionGracefulStop, Timeout: 10 * time.Second, OnError: "continue"},
|
||
{Type: ActionClearTempFiles, Timeout: 5 * time.Second, OnError: "continue"},
|
||
{Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "abort"},
|
||
{Type: ActionVerifyHealth, Timeout: 60 * time.Second, OnError: "abort"},
|
||
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
|
||
Params: map[string]interface{}{
|
||
"severity": "INFO",
|
||
"message": "Component restarted successfully",
|
||
},
|
||
},
|
||
},
|
||
Rollback: RollbackPlan{
|
||
OnFailure: "escalate",
|
||
Actions: []Action{
|
||
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
|
||
Params: map[string]interface{}{
|
||
"severity": "CRITICAL",
|
||
"message": "Component restart failed after max attempts",
|
||
},
|
||
},
|
||
},
|
||
},
|
||
MaxAttempts: 3,
|
||
Cooldown: 5 * time.Minute,
|
||
}
|
||
}
|
||
|
||
// RollbackConfigStrategy handles config tampering or validation failures.
|
||
// Trigger: config_tampering_detected OR config_validation_failed.
|
||
// Actions: freeze → verify_backup → rollback → restart → verify → notify.
|
||
func RollbackConfigStrategy() HealingStrategy {
|
||
return HealingStrategy{
|
||
ID: "ROLLBACK_CONFIG",
|
||
Name: "Configuration Rollback",
|
||
Trigger: TriggerCondition{
|
||
Metrics: []string{"config_tampering", "config_validation"},
|
||
},
|
||
Actions: []Action{
|
||
{Type: ActionFreezeConfig, Timeout: 5 * time.Second, OnError: "abort"},
|
||
{Type: ActionRollbackConfig, Timeout: 15 * time.Second, OnError: "abort"},
|
||
{Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "rollback"},
|
||
{Type: ActionVerifyConfig, Timeout: 10 * time.Second, OnError: "abort"},
|
||
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
|
||
Params: map[string]interface{}{
|
||
"severity": "WARNING",
|
||
"message": "Config rolled back due to tampering",
|
||
},
|
||
},
|
||
},
|
||
Rollback: RollbackPlan{
|
||
OnFailure: "enter_safe_mode",
|
||
Actions: []Action{
|
||
{Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
|
||
},
|
||
},
|
||
MaxAttempts: 1,
|
||
Cooldown: 1 * time.Hour,
|
||
}
|
||
}
|
||
|
||
// RecoverDatabaseStrategy handles SQLite corruption.
|
||
// Trigger: database_corruption OR sqlite_integrity_failed.
|
||
// Actions: readonly → backup → restore → verify → resume → notify.
|
||
func RecoverDatabaseStrategy() HealingStrategy {
|
||
return HealingStrategy{
|
||
ID: "RECOVER_DATABASE",
|
||
Name: "Database Recovery",
|
||
Trigger: TriggerCondition{
|
||
Metrics: []string{"database_corruption", "sqlite_integrity"},
|
||
},
|
||
Actions: []Action{
|
||
{Type: ActionSwitchReadOnly, Timeout: 5 * time.Second, OnError: "abort"},
|
||
{Type: ActionBackupDB, Timeout: 30 * time.Second, OnError: "continue"},
|
||
{Type: ActionRestoreSnapshot, Timeout: 60 * time.Second, OnError: "abort",
|
||
Params: map[string]interface{}{
|
||
"snapshot_age_max": "1h",
|
||
},
|
||
},
|
||
{Type: ActionVerifyIntegrity, Timeout: 30 * time.Second, OnError: "abort"},
|
||
{Type: ActionResumeWrites, Timeout: 5 * time.Second, OnError: "abort"},
|
||
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
|
||
Params: map[string]interface{}{
|
||
"severity": "WARNING",
|
||
"message": "Database recovered from snapshot",
|
||
},
|
||
},
|
||
},
|
||
Rollback: RollbackPlan{
|
||
OnFailure: "enter_lockdown",
|
||
Actions: []Action{
|
||
{Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
|
||
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
|
||
Params: map[string]interface{}{
|
||
"severity": "CRITICAL",
|
||
"message": "Database recovery failed",
|
||
},
|
||
},
|
||
},
|
||
},
|
||
MaxAttempts: 2,
|
||
Cooldown: 2 * time.Hour,
|
||
}
|
||
}
|
||
|
||
// RecoverRulesStrategy handles correlation rule poisoning.
|
||
// Trigger: rule execution failure rate > 50%.
|
||
// Actions: disable_suspicious → revert_baseline → verify → reload → notify.
|
||
func RecoverRulesStrategy() HealingStrategy {
|
||
return HealingStrategy{
|
||
ID: "RECOVER_RULES",
|
||
Name: "Rule Poisoning Defense",
|
||
Trigger: TriggerCondition{
|
||
Metrics: []string{"rule_execution_failure_rate", "correlation_rule_anomaly"},
|
||
},
|
||
Actions: []Action{
|
||
{Type: ActionDisableRules, Timeout: 10 * time.Second, OnError: "abort",
|
||
Params: map[string]interface{}{
|
||
"criteria": "failure_rate > 80%",
|
||
},
|
||
},
|
||
{Type: ActionRevertRules, Timeout: 15 * time.Second, OnError: "abort"},
|
||
{Type: ActionReloadEngine, Timeout: 30 * time.Second, OnError: "abort"},
|
||
{Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "continue"},
|
||
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
|
||
Params: map[string]interface{}{
|
||
"severity": "WARNING",
|
||
"message": "Rules recovered from baseline",
|
||
},
|
||
},
|
||
},
|
||
Rollback: RollbackPlan{
|
||
OnFailure: "disable_correlation",
|
||
},
|
||
MaxAttempts: 2,
|
||
Cooldown: 4 * time.Hour,
|
||
}
|
||
}
|
||
|
||
// RecoverNetworkStrategy handles network partition or mTLS cert expiry.
|
||
// Trigger: network_partition_detected OR mTLS_cert_expired.
|
||
// Actions: isolate → regen_certs → verify → restore → notify.
|
||
func RecoverNetworkStrategy() HealingStrategy {
|
||
return HealingStrategy{
|
||
ID: "RECOVER_NETWORK",
|
||
Name: "Network Isolation Recovery",
|
||
Trigger: TriggerCondition{
|
||
Metrics: []string{"network_partition", "mtls_cert_expiry"},
|
||
},
|
||
Actions: []Action{
|
||
{Type: ActionIsolateNetwork, Timeout: 5 * time.Second, OnError: "abort",
|
||
Params: map[string]interface{}{
|
||
"scope": "external_only",
|
||
},
|
||
},
|
||
{Type: ActionRegenCerts, Timeout: 30 * time.Second, OnError: "abort",
|
||
Params: map[string]interface{}{
|
||
"validity": "24h",
|
||
},
|
||
},
|
||
{Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "rollback"},
|
||
{Type: ActionRestoreNetwork, Timeout: 10 * time.Second, OnError: "abort"},
|
||
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
|
||
Params: map[string]interface{}{
|
||
"severity": "INFO",
|
||
"message": "Network connectivity restored",
|
||
},
|
||
},
|
||
},
|
||
Rollback: RollbackPlan{
|
||
OnFailure: "maintain_isolation",
|
||
Actions: []Action{
|
||
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
|
||
Params: map[string]interface{}{
|
||
"severity": "CRITICAL",
|
||
"message": "Network recovery failed, maintaining isolation",
|
||
},
|
||
},
|
||
},
|
||
},
|
||
MaxAttempts: 3,
|
||
Cooldown: 1 * time.Hour,
|
||
}
|
||
}
|