Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates

This commit is contained in:
DmitrL-dev 2026-03-23 16:45:40 +10:00
parent 694e32be26
commit 41cbfd6e0a
178 changed files with 36008 additions and 399 deletions

View file

@ -0,0 +1,588 @@
package resilience
import (
"context"
"fmt"
"sync/atomic"
"testing"
"time"
)
// --- Mock executor for tests ---
type mockExecutorLog struct {
actions []ActionType
fail map[ActionType]bool
count atomic.Int64
}
func newMockExecutor() *mockExecutorLog {
return &mockExecutorLog{
fail: make(map[ActionType]bool),
}
}
func (m *mockExecutorLog) execute(_ context.Context, action Action, _ string) error {
m.count.Add(1)
m.actions = append(m.actions, action.Type)
if m.fail[action.Type] {
return fmt.Errorf("action %s failed", action.Type)
}
return nil
}
// --- Healing Engine Tests ---
// HE-01: Component restart (success).
func TestHealingEngine_HE01_RestartSuccess(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
escalated := false
he := NewHealingEngine(alertCh, mock.execute, func(_ HealthAlert) {
escalated = true
})
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "soc-ingest",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
// Run one healing cycle.
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected at least 1 operation")
}
if ops[0].Result != ResultSuccess {
t.Errorf("expected SUCCESS, got %s (error: %s)", ops[0].Result, ops[0].Error)
}
if escalated {
t.Error("should not have escalated on success")
}
}
// HE-02: Component restart (failure ×3 → escalate).
func TestHealingEngine_HE02_RestartFailureEscalate(t *testing.T) {
mock := newMockExecutor()
mock.fail[ActionStartComponent] = true // Start always fails.
alertCh := make(chan HealthAlert, 10)
escalated := false
he := NewHealingEngine(alertCh, mock.execute, func(_ HealthAlert) {
escalated = true
})
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "soc-correlate",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
if !escalated {
t.Error("expected escalation on failure")
}
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation")
}
if ops[0].Result != ResultFailed {
t.Errorf("expected FAILED, got %s", ops[0].Result)
}
}
// HE-03: Config rollback strategy matching.
func TestHealingEngine_HE03_ConfigRollback(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RollbackConfigStrategy())
alertCh <- HealthAlert{
Component: "soc-ingest",
Severity: SeverityWarning,
Metric: "config_tampering",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation for config rollback")
}
if ops[0].StrategyID != "ROLLBACK_CONFIG" {
t.Errorf("expected ROLLBACK_CONFIG, got %s", ops[0].StrategyID)
}
}
// HE-04: Database recovery.
func TestHealingEngine_HE04_DatabaseRecovery(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RecoverDatabaseStrategy())
alertCh <- HealthAlert{
Component: "soc-correlate",
Severity: SeverityCritical,
Metric: "database_corruption",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected DB recovery op")
}
if ops[0].StrategyID != "RECOVER_DATABASE" {
t.Errorf("expected RECOVER_DATABASE, got %s", ops[0].StrategyID)
}
}
// HE-05: Rule poisoning defense.
func TestHealingEngine_HE05_RulePoisoning(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RecoverRulesStrategy())
alertCh <- HealthAlert{
Component: "soc-correlate",
Severity: SeverityWarning,
Metric: "rule_execution_failure_rate",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected rule recovery op")
}
if ops[0].StrategyID != "RECOVER_RULES" {
t.Errorf("expected RECOVER_RULES, got %s", ops[0].StrategyID)
}
}
// HE-06: Network isolation recovery.
func TestHealingEngine_HE06_NetworkRecovery(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RecoverNetworkStrategy())
alertCh <- HealthAlert{
Component: "soc-respond",
Severity: SeverityWarning,
Metric: "network_partition",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected network recovery op")
}
if ops[0].StrategyID != "RECOVER_NETWORK" {
t.Errorf("expected RECOVER_NETWORK, got %s", ops[0].StrategyID)
}
}
// HE-07: Cooldown enforcement.
func TestHealingEngine_HE07_Cooldown(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
// Set cooldown manually.
he.setCooldown("RESTART_COMPONENT", 1*time.Hour)
if !he.isInCooldown("RESTART_COMPONENT") {
t.Error("expected cooldown active")
}
alertCh <- HealthAlert{
Component: "soc-ingest",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) != 0 {
t.Error("expected 0 operations during cooldown")
}
}
// HE-08: Rollback on failure.
func TestHealingEngine_HE08_Rollback(t *testing.T) {
mock := newMockExecutor()
mock.fail[ActionStartComponent] = true
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, func(_ HealthAlert) {})
strategy := RollbackConfigStrategy()
he.RegisterStrategy(strategy)
alertCh <- HealthAlert{
Component: "soc-ingest",
Severity: SeverityWarning,
Metric: "config_tampering",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
// Rollback should have executed enter_safe_mode.
foundSafeMode := false
for _, a := range mock.actions {
if a == ActionEnterSafeMode {
foundSafeMode = true
}
}
if !foundSafeMode {
t.Errorf("expected safe mode in rollback, actions: %v", mock.actions)
}
}
// HE-09: State machine transitions.
func TestHealingEngine_HE09_StateTransitions(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation")
}
// Final state should be COMPLETED.
if ops[0].State != HealingCompleted {
t.Errorf("expected COMPLETED, got %s", ops[0].State)
}
}
// HE-10: Audit logging — all actions recorded.
func TestHealingEngine_HE10_AuditLogging(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation")
}
if len(ops[0].ActionsRun) == 0 {
t.Error("expected action logs")
}
for _, al := range ops[0].ActionsRun {
if al.StartedAt.IsZero() {
t.Error("action log missing start time")
}
}
}
// HE-11: Parallel healing — no race conditions.
func TestHealingEngine_HE11_Parallel(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 100)
he := NewHealingEngine(alertCh, mock.execute, nil)
for _, s := range DefaultStrategies() {
he.RegisterStrategy(s)
}
// Send many alerts concurrently.
for i := 0; i < 10; i++ {
alertCh <- HealthAlert{
Component: fmt.Sprintf("comp-%d", i),
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
}
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(1 * time.Second)
cancel()
// All 10 alerts processed (first gets an op, rest hit cooldown).
ops := he.RecentOperations(100)
if len(ops) == 0 {
t.Fatal("expected at least 1 operation")
}
}
// HE-12: No matching strategy → no operation.
func TestHealingEngine_HE12_NoStrategy(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
// No strategies registered.
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "unknown_metric",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) != 0 {
t.Errorf("expected 0 operations, got %d", len(ops))
}
}
// Test diagnosis (various root causes).
func TestHealingEngine_Diagnosis(t *testing.T) {
mock := newMockExecutor()
he := NewHealingEngine(nil, mock.execute, nil)
tests := []struct {
metric string
current float64
wantCause string
}{
{"memory", 95, "memory_exhaustion"},
{"cpu", 95, "cpu_saturation"},
{"error_rate", 10, "elevated_error_rate"},
{"latency_p99", 200, "latency_degradation"},
{"quorum", 0.3, "quorum_loss"},
{"custom", 100, "threshold_breach_custom"},
}
for _, tt := range tests {
alert := HealthAlert{
Component: "test",
Metric: tt.metric,
Current: tt.current,
}
d := he.diagnose(alert)
if d.RootCause != tt.wantCause {
t.Errorf("metric=%s: expected %s, got %s", tt.metric, tt.wantCause, d.RootCause)
}
if d.Confidence <= 0 || d.Confidence > 1 {
t.Errorf("metric=%s: invalid confidence %f", tt.metric, d.Confidence)
}
}
}
// Test DefaultStrategies returns 5 strategies.
func TestDefaultStrategies(t *testing.T) {
strategies := DefaultStrategies()
if len(strategies) != 5 {
t.Errorf("expected 5 strategies, got %d", len(strategies))
}
ids := map[string]bool{}
for _, s := range strategies {
if ids[s.ID] {
t.Errorf("duplicate strategy ID: %s", s.ID)
}
ids[s.ID] = true
if s.MaxAttempts <= 0 {
t.Errorf("strategy %s: invalid max_attempts %d", s.ID, s.MaxAttempts)
}
if s.Cooldown <= 0 {
t.Errorf("strategy %s: invalid cooldown %v", s.ID, s.Cooldown)
}
if len(s.Actions) == 0 {
t.Errorf("strategy %s: no actions defined", s.ID)
}
}
}
// Test StrategyCount.
func TestHealingEngine_StrategyCount(t *testing.T) {
he := NewHealingEngine(nil, nil, nil)
if he.StrategyCount() != 0 {
t.Error("expected 0")
}
for _, s := range DefaultStrategies() {
he.RegisterStrategy(s)
}
if he.StrategyCount() != 5 {
t.Errorf("expected 5, got %d", he.StrategyCount())
}
}
// Test GetOperation.
func TestHealingEngine_GetOperation(t *testing.T) {
mock := newMockExecutor()
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
op, ok := he.GetOperation("heal-1")
if !ok {
t.Fatal("expected operation heal-1")
}
if op.Component != "comp" {
t.Errorf("expected comp, got %s", op.Component)
}
_, ok = he.GetOperation("nonexistent")
if ok {
t.Error("expected not found for nonexistent")
}
}
// Test action OnError=continue.
func TestHealingEngine_ActionContinueOnError(t *testing.T) {
mock := newMockExecutor()
mock.fail[ActionGracefulStop] = true // First action fails but marked continue.
alertCh := make(chan HealthAlert, 10)
he := NewHealingEngine(alertCh, mock.execute, nil)
he.RegisterStrategy(RestartComponentStrategy())
alertCh <- HealthAlert{
Component: "comp",
Severity: SeverityCritical,
Metric: "quorum",
SuggestedAction: "restart",
Timestamp: time.Now(),
}
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
go he.Start(ctx)
time.Sleep(200 * time.Millisecond)
cancel()
ops := he.RecentOperations(10)
if len(ops) == 0 {
t.Fatal("expected operation")
}
// Should still succeed because graceful_stop has OnError=continue.
if ops[0].Result != ResultSuccess {
t.Errorf("expected SUCCESS (continue on error), got %s", ops[0].Result)
}
}