mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-05-08 19:12:37 +02:00
Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates
This commit is contained in:
parent
694e32be26
commit
41cbfd6e0a
178 changed files with 36008 additions and 399 deletions
215
internal/application/resilience/healing_strategies.go
Normal file
215
internal/application/resilience/healing_strategies.go
Normal file
|
|
@ -0,0 +1,215 @@
|
|||
package resilience
|
||||
|
||||
import "time"
|
||||
|
||||
// Built-in healing strategies per the technical specification (ТЗ) §4.1.1.
|
||||
// These are registered at startup via HealingEngine.RegisterStrategy().
|
||||
|
||||
// DefaultStrategies returns the 5 built-in healing strategies.
|
||||
func DefaultStrategies() []HealingStrategy {
|
||||
return []HealingStrategy{
|
||||
RestartComponentStrategy(),
|
||||
RollbackConfigStrategy(),
|
||||
RecoverDatabaseStrategy(),
|
||||
RecoverRulesStrategy(),
|
||||
RecoverNetworkStrategy(),
|
||||
}
|
||||
}
|
||||
|
||||
// RestartComponentStrategy handles component crashes and offline states.
|
||||
// Trigger: component_offline OR component_critical, 2 consecutive failures within 5m.
|
||||
// Actions: graceful_stop → clear_temp → start → verify → notify.
|
||||
// Rollback: escalate to next strategy.
|
||||
func RestartComponentStrategy() HealingStrategy {
|
||||
return HealingStrategy{
|
||||
ID: "RESTART_COMPONENT",
|
||||
Name: "Component Restart",
|
||||
Trigger: TriggerCondition{
|
||||
Statuses: []ComponentStatus{StatusOffline, StatusCritical},
|
||||
ConsecutiveFailures: 2,
|
||||
WithinWindow: 5 * time.Minute,
|
||||
},
|
||||
Actions: []Action{
|
||||
{Type: ActionGracefulStop, Timeout: 10 * time.Second, OnError: "continue"},
|
||||
{Type: ActionClearTempFiles, Timeout: 5 * time.Second, OnError: "continue"},
|
||||
{Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "abort"},
|
||||
{Type: ActionVerifyHealth, Timeout: 60 * time.Second, OnError: "abort"},
|
||||
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
|
||||
Params: map[string]interface{}{
|
||||
"severity": "INFO",
|
||||
"message": "Component restarted successfully",
|
||||
},
|
||||
},
|
||||
},
|
||||
Rollback: RollbackPlan{
|
||||
OnFailure: "escalate",
|
||||
Actions: []Action{
|
||||
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
|
||||
Params: map[string]interface{}{
|
||||
"severity": "CRITICAL",
|
||||
"message": "Component restart failed after max attempts",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
MaxAttempts: 3,
|
||||
Cooldown: 5 * time.Minute,
|
||||
}
|
||||
}
|
||||
|
||||
// RollbackConfigStrategy handles config tampering or validation failures.
|
||||
// Trigger: config_tampering_detected OR config_validation_failed.
|
||||
// Actions: freeze → verify_backup → rollback → restart → verify → notify.
|
||||
func RollbackConfigStrategy() HealingStrategy {
|
||||
return HealingStrategy{
|
||||
ID: "ROLLBACK_CONFIG",
|
||||
Name: "Configuration Rollback",
|
||||
Trigger: TriggerCondition{
|
||||
Metrics: []string{"config_tampering", "config_validation"},
|
||||
},
|
||||
Actions: []Action{
|
||||
{Type: ActionFreezeConfig, Timeout: 5 * time.Second, OnError: "abort"},
|
||||
{Type: ActionRollbackConfig, Timeout: 15 * time.Second, OnError: "abort"},
|
||||
{Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "rollback"},
|
||||
{Type: ActionVerifyConfig, Timeout: 10 * time.Second, OnError: "abort"},
|
||||
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
|
||||
Params: map[string]interface{}{
|
||||
"severity": "WARNING",
|
||||
"message": "Config rolled back due to tampering",
|
||||
},
|
||||
},
|
||||
},
|
||||
Rollback: RollbackPlan{
|
||||
OnFailure: "enter_safe_mode",
|
||||
Actions: []Action{
|
||||
{Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
|
||||
},
|
||||
},
|
||||
MaxAttempts: 1,
|
||||
Cooldown: 1 * time.Hour,
|
||||
}
|
||||
}
|
||||
|
||||
// RecoverDatabaseStrategy handles SQLite corruption.
|
||||
// Trigger: database_corruption OR sqlite_integrity_failed.
|
||||
// Actions: readonly → backup → restore → verify → resume → notify.
|
||||
func RecoverDatabaseStrategy() HealingStrategy {
|
||||
return HealingStrategy{
|
||||
ID: "RECOVER_DATABASE",
|
||||
Name: "Database Recovery",
|
||||
Trigger: TriggerCondition{
|
||||
Metrics: []string{"database_corruption", "sqlite_integrity"},
|
||||
},
|
||||
Actions: []Action{
|
||||
{Type: ActionSwitchReadOnly, Timeout: 5 * time.Second, OnError: "abort"},
|
||||
{Type: ActionBackupDB, Timeout: 30 * time.Second, OnError: "continue"},
|
||||
{Type: ActionRestoreSnapshot, Timeout: 60 * time.Second, OnError: "abort",
|
||||
Params: map[string]interface{}{
|
||||
"snapshot_age_max": "1h",
|
||||
},
|
||||
},
|
||||
{Type: ActionVerifyIntegrity, Timeout: 30 * time.Second, OnError: "abort"},
|
||||
{Type: ActionResumeWrites, Timeout: 5 * time.Second, OnError: "abort"},
|
||||
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
|
||||
Params: map[string]interface{}{
|
||||
"severity": "WARNING",
|
||||
"message": "Database recovered from snapshot",
|
||||
},
|
||||
},
|
||||
},
|
||||
Rollback: RollbackPlan{
|
||||
OnFailure: "enter_lockdown",
|
||||
Actions: []Action{
|
||||
{Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
|
||||
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
|
||||
Params: map[string]interface{}{
|
||||
"severity": "CRITICAL",
|
||||
"message": "Database recovery failed",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
MaxAttempts: 2,
|
||||
Cooldown: 2 * time.Hour,
|
||||
}
|
||||
}
|
||||
|
||||
// RecoverRulesStrategy handles correlation rule poisoning.
|
||||
// Trigger: rule execution failure rate > 50%.
|
||||
// Actions: disable_suspicious → revert_baseline → verify → reload → notify.
|
||||
func RecoverRulesStrategy() HealingStrategy {
|
||||
return HealingStrategy{
|
||||
ID: "RECOVER_RULES",
|
||||
Name: "Rule Poisoning Defense",
|
||||
Trigger: TriggerCondition{
|
||||
Metrics: []string{"rule_execution_failure_rate", "correlation_rule_anomaly"},
|
||||
},
|
||||
Actions: []Action{
|
||||
{Type: ActionDisableRules, Timeout: 10 * time.Second, OnError: "abort",
|
||||
Params: map[string]interface{}{
|
||||
"criteria": "failure_rate > 80%",
|
||||
},
|
||||
},
|
||||
{Type: ActionRevertRules, Timeout: 15 * time.Second, OnError: "abort"},
|
||||
{Type: ActionReloadEngine, Timeout: 30 * time.Second, OnError: "abort"},
|
||||
{Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "continue"},
|
||||
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
|
||||
Params: map[string]interface{}{
|
||||
"severity": "WARNING",
|
||||
"message": "Rules recovered from baseline",
|
||||
},
|
||||
},
|
||||
},
|
||||
Rollback: RollbackPlan{
|
||||
OnFailure: "disable_correlation",
|
||||
},
|
||||
MaxAttempts: 2,
|
||||
Cooldown: 4 * time.Hour,
|
||||
}
|
||||
}
|
||||
|
||||
// RecoverNetworkStrategy handles network partition or mTLS cert expiry.
|
||||
// Trigger: network_partition_detected OR mTLS_cert_expired.
|
||||
// Actions: isolate → regen_certs → verify → restore → notify.
|
||||
func RecoverNetworkStrategy() HealingStrategy {
|
||||
return HealingStrategy{
|
||||
ID: "RECOVER_NETWORK",
|
||||
Name: "Network Isolation Recovery",
|
||||
Trigger: TriggerCondition{
|
||||
Metrics: []string{"network_partition", "mtls_cert_expiry"},
|
||||
},
|
||||
Actions: []Action{
|
||||
{Type: ActionIsolateNetwork, Timeout: 5 * time.Second, OnError: "abort",
|
||||
Params: map[string]interface{}{
|
||||
"scope": "external_only",
|
||||
},
|
||||
},
|
||||
{Type: ActionRegenCerts, Timeout: 30 * time.Second, OnError: "abort",
|
||||
Params: map[string]interface{}{
|
||||
"validity": "24h",
|
||||
},
|
||||
},
|
||||
{Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "rollback"},
|
||||
{Type: ActionRestoreNetwork, Timeout: 10 * time.Second, OnError: "abort"},
|
||||
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
|
||||
Params: map[string]interface{}{
|
||||
"severity": "INFO",
|
||||
"message": "Network connectivity restored",
|
||||
},
|
||||
},
|
||||
},
|
||||
Rollback: RollbackPlan{
|
||||
OnFailure: "maintain_isolation",
|
||||
Actions: []Action{
|
||||
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
|
||||
Params: map[string]interface{}{
|
||||
"severity": "CRITICAL",
|
||||
"message": "Network recovery failed, maintaining isolation",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
MaxAttempts: 3,
|
||||
Cooldown: 1 * time.Hour,
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue