Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates

This commit is contained in:
DmitrL-dev 2026-03-23 16:45:40 +10:00
parent 694e32be26
commit 41cbfd6e0a
178 changed files with 36008 additions and 399 deletions

View file

@ -0,0 +1,215 @@
package resilience
import "time"
// Built-in healing strategies per ТЗ §4.1.1.
// These are registered at startup via HealingEngine.RegisterStrategy().
// DefaultStrategies returns the 5 built-in healing strategies.
func DefaultStrategies() []HealingStrategy {
return []HealingStrategy{
RestartComponentStrategy(),
RollbackConfigStrategy(),
RecoverDatabaseStrategy(),
RecoverRulesStrategy(),
RecoverNetworkStrategy(),
}
}
// RestartComponentStrategy handles component crashes and offline states.
// Trigger: component_offline OR component_critical, 2 consecutive failures within 5m.
// Actions: graceful_stop → clear_temp → start → verify → notify.
// Rollback: escalate to next strategy.
func RestartComponentStrategy() HealingStrategy {
return HealingStrategy{
ID: "RESTART_COMPONENT",
Name: "Component Restart",
Trigger: TriggerCondition{
Statuses: []ComponentStatus{StatusOffline, StatusCritical},
ConsecutiveFailures: 2,
WithinWindow: 5 * time.Minute,
},
Actions: []Action{
{Type: ActionGracefulStop, Timeout: 10 * time.Second, OnError: "continue"},
{Type: ActionClearTempFiles, Timeout: 5 * time.Second, OnError: "continue"},
{Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "abort"},
{Type: ActionVerifyHealth, Timeout: 60 * time.Second, OnError: "abort"},
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "INFO",
"message": "Component restarted successfully",
},
},
},
Rollback: RollbackPlan{
OnFailure: "escalate",
Actions: []Action{
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
Params: map[string]interface{}{
"severity": "CRITICAL",
"message": "Component restart failed after max attempts",
},
},
},
},
MaxAttempts: 3,
Cooldown: 5 * time.Minute,
}
}
// RollbackConfigStrategy handles config tampering or validation failures.
// Trigger: config_tampering_detected OR config_validation_failed.
// Actions: freeze → verify_backup → rollback → restart → verify → notify.
func RollbackConfigStrategy() HealingStrategy {
return HealingStrategy{
ID: "ROLLBACK_CONFIG",
Name: "Configuration Rollback",
Trigger: TriggerCondition{
Metrics: []string{"config_tampering", "config_validation"},
},
Actions: []Action{
{Type: ActionFreezeConfig, Timeout: 5 * time.Second, OnError: "abort"},
{Type: ActionRollbackConfig, Timeout: 15 * time.Second, OnError: "abort"},
{Type: ActionStartComponent, Timeout: 30 * time.Second, OnError: "rollback"},
{Type: ActionVerifyConfig, Timeout: 10 * time.Second, OnError: "abort"},
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "WARNING",
"message": "Config rolled back due to tampering",
},
},
},
Rollback: RollbackPlan{
OnFailure: "enter_safe_mode",
Actions: []Action{
{Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
},
},
MaxAttempts: 1,
Cooldown: 1 * time.Hour,
}
}
// RecoverDatabaseStrategy handles SQLite corruption.
// Trigger: database_corruption OR sqlite_integrity_failed.
// Actions: readonly → backup → restore → verify → resume → notify.
func RecoverDatabaseStrategy() HealingStrategy {
return HealingStrategy{
ID: "RECOVER_DATABASE",
Name: "Database Recovery",
Trigger: TriggerCondition{
Metrics: []string{"database_corruption", "sqlite_integrity"},
},
Actions: []Action{
{Type: ActionSwitchReadOnly, Timeout: 5 * time.Second, OnError: "abort"},
{Type: ActionBackupDB, Timeout: 30 * time.Second, OnError: "continue"},
{Type: ActionRestoreSnapshot, Timeout: 60 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"snapshot_age_max": "1h",
},
},
{Type: ActionVerifyIntegrity, Timeout: 30 * time.Second, OnError: "abort"},
{Type: ActionResumeWrites, Timeout: 5 * time.Second, OnError: "abort"},
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "WARNING",
"message": "Database recovered from snapshot",
},
},
},
Rollback: RollbackPlan{
OnFailure: "enter_lockdown",
Actions: []Action{
{Type: ActionEnterSafeMode, Timeout: 10 * time.Second},
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
Params: map[string]interface{}{
"severity": "CRITICAL",
"message": "Database recovery failed",
},
},
},
},
MaxAttempts: 2,
Cooldown: 2 * time.Hour,
}
}
// RecoverRulesStrategy handles correlation rule poisoning.
// Trigger: rule execution failure rate > 50%.
// Actions: disable_suspicious → revert_baseline → verify → reload → notify.
func RecoverRulesStrategy() HealingStrategy {
return HealingStrategy{
ID: "RECOVER_RULES",
Name: "Rule Poisoning Defense",
Trigger: TriggerCondition{
Metrics: []string{"rule_execution_failure_rate", "correlation_rule_anomaly"},
},
Actions: []Action{
{Type: ActionDisableRules, Timeout: 10 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"criteria": "failure_rate > 80%",
},
},
{Type: ActionRevertRules, Timeout: 15 * time.Second, OnError: "abort"},
{Type: ActionReloadEngine, Timeout: 30 * time.Second, OnError: "abort"},
{Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "continue"},
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "WARNING",
"message": "Rules recovered from baseline",
},
},
},
Rollback: RollbackPlan{
OnFailure: "disable_correlation",
},
MaxAttempts: 2,
Cooldown: 4 * time.Hour,
}
}
// RecoverNetworkStrategy handles network partition or mTLS cert expiry.
// Trigger: network_partition_detected OR mTLS_cert_expired.
// Actions: isolate → regen_certs → verify → restore → notify.
func RecoverNetworkStrategy() HealingStrategy {
return HealingStrategy{
ID: "RECOVER_NETWORK",
Name: "Network Isolation Recovery",
Trigger: TriggerCondition{
Metrics: []string{"network_partition", "mtls_cert_expiry"},
},
Actions: []Action{
{Type: ActionIsolateNetwork, Timeout: 5 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"scope": "external_only",
},
},
{Type: ActionRegenCerts, Timeout: 30 * time.Second, OnError: "abort",
Params: map[string]interface{}{
"validity": "24h",
},
},
{Type: ActionVerifyHealth, Timeout: 30 * time.Second, OnError: "rollback"},
{Type: ActionRestoreNetwork, Timeout: 10 * time.Second, OnError: "abort"},
{Type: ActionNotifySOC, Timeout: 5 * time.Second, OnError: "continue",
Params: map[string]interface{}{
"severity": "INFO",
"message": "Network connectivity restored",
},
},
},
Rollback: RollbackPlan{
OnFailure: "maintain_isolation",
Actions: []Action{
{Type: ActionNotifyArchitect, Timeout: 5 * time.Second,
Params: map[string]interface{}{
"severity": "CRITICAL",
"message": "Network recovery failed, maintaining isolation",
},
},
},
},
MaxAttempts: 3,
Cooldown: 1 * time.Hour,
}
}