gomcp/internal/application/resilience/health_monitor_test.go

503 lines
13 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2026 Syntrex Lab. All rights reserved.
// Use of this source code is governed by an Apache-2.0 license
// that can be found in the LICENSE file.
package resilience
import (
"context"
"fmt"
"math"
"testing"
"time"
)
// --- MetricsDB Tests ---
func TestRingBuffer_AddAndAll(t *testing.T) {
rb := newRingBuffer(5)
now := time.Now()
for i := 0; i < 3; i++ {
rb.Add(DataPoint{Timestamp: now.Add(time.Duration(i) * time.Second), Value: float64(i)})
}
if rb.Len() != 3 {
t.Fatalf("expected 3, got %d", rb.Len())
}
all := rb.All()
if len(all) != 3 {
t.Fatalf("expected 3 points, got %d", len(all))
}
for i, dp := range all {
if dp.Value != float64(i) {
t.Errorf("point %d: expected %f, got %f", i, float64(i), dp.Value)
}
}
}
func TestRingBuffer_Wrap(t *testing.T) {
rb := newRingBuffer(3)
now := time.Now()
for i := 0; i < 5; i++ {
rb.Add(DataPoint{Timestamp: now.Add(time.Duration(i) * time.Second), Value: float64(i)})
}
if rb.Len() != 3 {
t.Fatalf("expected 3 (buffer size), got %d", rb.Len())
}
all := rb.All()
// Should contain values 2, 3, 4 (oldest 0, 1 overwritten).
expected := []float64{2, 3, 4}
for i, dp := range all {
if dp.Value != expected[i] {
t.Errorf("point %d: expected %f, got %f", i, expected[i], dp.Value)
}
}
}
func TestMetricsDB_AddAndBaseline(t *testing.T) {
db := NewMetricsDB(time.Hour, 100)
for i := 0; i < 20; i++ {
db.AddDataPoint("soc-ingest", "cpu", 30.0+float64(i%5))
}
baseline := db.GetBaseline("soc-ingest", "cpu", time.Hour)
if baseline.Count != 20 {
t.Fatalf("expected 20 points, got %d", baseline.Count)
}
if baseline.Mean < 30 || baseline.Mean > 35 {
t.Errorf("mean out of expected range: %f", baseline.Mean)
}
if baseline.StdDev == 0 {
t.Error("expected non-zero stddev")
}
}
func TestMetricsDB_EmptyBaseline(t *testing.T) {
db := NewMetricsDB(time.Hour, 100)
baseline := db.GetBaseline("nonexistent", "cpu", time.Hour)
if baseline.Count != 0 {
t.Errorf("expected 0 count for nonexistent, got %d", baseline.Count)
}
}
func TestCalculateZScore(t *testing.T) {
baseline := Baseline{Mean: 30.0, StdDev: 5.0, Count: 100}
// Normal value (Z = 1.0).
z := CalculateZScore(35.0, baseline)
if math.Abs(z-1.0) > 0.01 {
t.Errorf("expected Z≈1.0, got %f", z)
}
// Anomalous value (Z = 4.0).
z = CalculateZScore(50.0, baseline)
if math.Abs(z-4.0) > 0.01 {
t.Errorf("expected Z≈4.0, got %f", z)
}
// Insufficient data → 0.
z = CalculateZScore(50.0, Baseline{Mean: 30, StdDev: 5, Count: 5})
if z != 0 {
t.Errorf("expected 0 for insufficient data, got %f", z)
}
}
func TestIsAnomaly(t *testing.T) {
baseline := Baseline{Mean: 30.0, StdDev: 5.0, Count: 100}
if IsAnomaly(35.0, baseline, 3.0) {
t.Error("35 should not be anomaly (Z=1.0)")
}
if !IsAnomaly(50.0, baseline, 3.0) {
t.Error("50 should be anomaly (Z=4.0)")
}
if !IsAnomaly(10.0, baseline, 3.0) {
t.Error("10 should be anomaly (Z=-4.0)")
}
}
func TestMetricsDB_Purge(t *testing.T) {
db := NewMetricsDB(100*time.Millisecond, 100)
db.AddDataPoint("comp", "cpu", 50)
time.Sleep(150 * time.Millisecond)
db.AddDataPoint("comp", "cpu", 60)
removed := db.Purge()
if removed != 1 {
t.Errorf("expected 1 purged, got %d", removed)
}
}
func TestMetricsDB_GetRecent(t *testing.T) {
db := NewMetricsDB(time.Hour, 100)
for i := 0; i < 10; i++ {
db.AddDataPoint("comp", "mem", float64(i*10))
}
recent := db.GetRecent("comp", "mem", 3)
if len(recent) != 3 {
t.Fatalf("expected 3 recent, got %d", len(recent))
}
// Should be last 3: 70, 80, 90.
if recent[0].Value != 70 || recent[2].Value != 90 {
t.Errorf("unexpected recent values: %v", recent)
}
}
// --- MockCollector for HealthMonitor tests ---
type mockCollector struct {
results map[string]map[string]float64
errors map[string]error
}
func (m *mockCollector) Collect(_ context.Context, component string) (map[string]float64, error) {
if err, ok := m.errors[component]; ok && err != nil {
return nil, err
}
if metrics, ok := m.results[component]; ok {
return metrics, nil
}
return map[string]float64{}, nil
}
// --- HealthMonitor Tests ---
// HM-01: Normal health check — all HEALTHY.
func TestHealthMonitor_HM01_AllHealthy(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
registerTestComponents(hm, 6)
health := hm.GetHealth()
if health.OverallStatus != OverallHealthy {
t.Errorf("expected HEALTHY, got %s", health.OverallStatus)
}
if !health.QuorumValid {
t.Error("expected quorum valid")
}
if len(health.Components) != 6 {
t.Errorf("expected 6 components, got %d", len(health.Components))
}
}
// HM-02: Single component DEGRADED.
func TestHealthMonitor_HM02_SingleDegraded(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
registerTestComponents(hm, 6)
hm.SetComponentStatus("comp-0", StatusDegraded)
health := hm.GetHealth()
if health.OverallStatus != OverallDegraded {
t.Errorf("expected DEGRADED, got %s", health.OverallStatus)
}
if !health.QuorumValid {
t.Error("expected quorum still valid with 5/6 healthy")
}
}
// HM-03: Multiple components CRITICAL → quorum lost.
func TestHealthMonitor_HM03_MultipleCritical(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
registerTestComponents(hm, 6)
hm.SetComponentStatus("comp-0", StatusCritical)
hm.SetComponentStatus("comp-1", StatusCritical)
hm.SetComponentStatus("comp-2", StatusCritical)
health := hm.GetHealth()
if health.OverallStatus != OverallCritical {
t.Errorf("expected CRITICAL, got %s", health.OverallStatus)
}
if health.QuorumValid {
t.Error("expected quorum INVALID with 3/6 critical")
}
}
// HM-04: Anomaly detection (CPU spike).
func TestHealthMonitor_HM04_CPUAnomaly(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 100)
hm.RegisterComponent(ComponentConfig{
Name: "soc-ingest",
Type: "go_binary",
Thresholds: map[string]float64{"cpu": 80},
ThresholdIsMax: map[string]bool{"cpu": true},
})
// Build baseline of normal CPU (30%).
for i := 0; i < 50; i++ {
hm.metricsDB.AddDataPoint("soc-ingest", "cpu", 30.0)
}
// Spike to 95%.
hm.UpdateMetrics("soc-ingest", map[string]float64{"cpu": 95.0})
hm.checkHealth()
// Should have alert(s).
select {
case alert := <-hm.alertBus:
if alert.Component != "soc-ingest" {
t.Errorf("expected soc-ingest, got %s", alert.Component)
}
if alert.Metric != "cpu" {
t.Errorf("expected cpu metric, got %s", alert.Metric)
}
default:
t.Error("expected alert for CPU spike")
}
}
// HM-05: Memory leak detection.
func TestHealthMonitor_HM05_MemoryLeak(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 100)
hm.RegisterComponent(ComponentConfig{
Name: "soc-correlate",
Type: "go_binary",
Thresholds: map[string]float64{"memory": 90},
ThresholdIsMax: map[string]bool{"memory": true},
})
// Build baseline of normal memory (40%).
for i := 0; i < 50; i++ {
hm.metricsDB.AddDataPoint("soc-correlate", "memory", 40.0)
}
// Memory spike to 95%.
hm.UpdateMetrics("soc-correlate", map[string]float64{"memory": 95.0})
hm.checkHealth()
select {
case alert := <-hm.alertBus:
if alert.Metric != "memory" {
t.Errorf("expected memory metric, got %s", alert.Metric)
}
default:
t.Error("expected alert for memory spike")
}
}
// HM-06: Quorum validation failure.
func TestHealthMonitor_HM06_QuorumFailure(t *testing.T) {
statuses := map[string]ComponentStatus{
"a": StatusOffline,
"b": StatusOffline,
"c": StatusOffline,
"d": StatusOffline,
"e": StatusHealthy,
"f": StatusHealthy,
}
if ValidateQuorum(statuses) {
t.Error("expected quorum invalid with 4/6 offline")
}
}
// HM-06b: Quorum validation success (edge case: exactly 2/3).
func TestHealthMonitor_HM06b_QuorumEdge(t *testing.T) {
statuses := map[string]ComponentStatus{
"a": StatusHealthy,
"b": StatusHealthy,
"c": StatusCritical,
}
if !ValidateQuorum(statuses) {
t.Error("expected quorum valid with 2/3 healthy (exact threshold)")
}
}
// HM-06c: Empty quorum.
func TestHealthMonitor_HM06c_EmptyQuorum(t *testing.T) {
if ValidateQuorum(map[string]ComponentStatus{}) {
t.Error("expected quorum invalid with 0 components")
}
}
// HM-07: Metrics collection (no data loss).
func TestHealthMonitor_HM07_MetricsCollection(t *testing.T) {
collector := &mockCollector{
results: map[string]map[string]float64{
"comp-0": {"cpu": 25, "memory": 40},
},
}
hm := NewHealthMonitor(collector, 10)
hm.RegisterComponent(ComponentConfig{Name: "comp-0", Type: "go_binary"})
hm.collectMetrics(context.Background())
hm.mu.RLock()
comp := hm.components["comp-0"]
hm.mu.RUnlock()
if comp.Metrics["cpu"] != 25 {
t.Errorf("expected cpu=25, got %f", comp.Metrics["cpu"])
}
if comp.Metrics["memory"] != 40 {
t.Errorf("expected memory=40, got %f", comp.Metrics["memory"])
}
}
// HM-07b: Collection error increments consecutive failures.
func TestHealthMonitor_HM07b_CollectionError(t *testing.T) {
collector := &mockCollector{
errors: map[string]error{
"comp-0": fmt.Errorf("connection refused"),
},
}
hm := NewHealthMonitor(collector, 10)
hm.RegisterComponent(ComponentConfig{Name: "comp-0", Type: "go_binary"})
hm.collectMetrics(context.Background())
hm.mu.RLock()
comp := hm.components["comp-0"]
hm.mu.RUnlock()
if comp.Consecutive != 1 {
t.Errorf("expected 1 consecutive failure, got %d", comp.Consecutive)
}
}
// HM-08: Alert bus fan-out (non-blocking).
func TestHealthMonitor_HM08_AlertBusFanOut(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 5)
hm.RegisterComponent(ComponentConfig{
Name: "comp",
Type: "go_binary",
Thresholds: map[string]float64{"cpu": 50},
ThresholdIsMax: map[string]bool{"cpu": true},
})
// Fill alert bus.
for i := 0; i < 5; i++ {
hm.alertBus <- HealthAlert{Component: fmt.Sprintf("test-%d", i)}
}
// Emit one more — should be dropped (non-blocking).
hm.emitAlert(HealthAlert{Component: "overflow"})
// No panic = success.
}
// Test GetHealth returns a deep copy.
func TestHealthMonitor_GetHealthDeepCopy(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
hm.RegisterComponent(ComponentConfig{Name: "test", Type: "go_binary"})
hm.UpdateMetrics("test", map[string]float64{"cpu": 50})
health := hm.GetHealth()
health.Components[0].Metrics["cpu"] = 999
// Original should be unchanged.
hm.mu.RLock()
original := hm.components["test"].Metrics["cpu"]
hm.mu.RUnlock()
if original != 50 {
t.Errorf("deep copy failed: original modified to %f", original)
}
}
// Test threshold breach transitions status to DEGRADED then CRITICAL.
func TestHealthMonitor_StatusTransitions(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 100)
hm.RegisterComponent(ComponentConfig{
Name: "comp",
Type: "go_binary",
Thresholds: map[string]float64{"error_rate": 5},
ThresholdIsMax: map[string]bool{"error_rate": true},
})
// Breach once → DEGRADED.
hm.UpdateMetrics("comp", map[string]float64{"error_rate": 10})
hm.checkHealth()
hm.mu.RLock()
status := hm.components["comp"].Status
hm.mu.RUnlock()
if status != StatusDegraded {
t.Errorf("expected DEGRADED after 1 breach, got %s", status)
}
// Breach 3× → CRITICAL.
for i := 0; i < 3; i++ {
hm.checkHealth()
}
hm.mu.RLock()
status = hm.components["comp"].Status
hm.mu.RUnlock()
if status != StatusCritical {
t.Errorf("expected CRITICAL after repeated breaches, got %s", status)
}
}
// Test lower-bound threshold (ThresholdIsMax=false).
func TestHealthMonitor_LowerBoundThreshold(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 100)
hm.RegisterComponent(ComponentConfig{
Name: "immune",
Type: "c_kernel_module",
Thresholds: map[string]float64{"hooks_active": 10},
ThresholdIsMax: map[string]bool{"hooks_active": false},
})
// hooks_active = 5 (below threshold of 10) → warning.
hm.UpdateMetrics("immune", map[string]float64{"hooks_active": 5})
hm.checkHealth()
select {
case alert := <-hm.alertBus:
if alert.Component != "immune" || alert.Metric != "hooks_active" {
t.Errorf("unexpected alert: %+v", alert)
}
default:
t.Error("expected alert for hooks_active below threshold")
}
}
// Test ComponentCount.
func TestHealthMonitor_ComponentCount(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
if hm.ComponentCount() != 0 {
t.Error("expected 0 initially")
}
registerTestComponents(hm, 4)
if hm.ComponentCount() != 4 {
t.Errorf("expected 4, got %d", hm.ComponentCount())
}
}
// Test Start/Stop lifecycle.
func TestHealthMonitor_StartStop(t *testing.T) {
hm := NewHealthMonitor(&mockCollector{}, 10)
registerTestComponents(hm, 2)
ctx, cancel := context.WithCancel(context.Background())
done := make(chan struct{})
go func() {
hm.Start(ctx)
close(done)
}()
// Let it run briefly.
time.Sleep(50 * time.Millisecond)
cancel()
select {
case <-done:
// Clean shutdown.
case <-time.After(time.Second):
t.Fatal("Start() did not return after context cancellation")
}
}
// --- Helpers ---
func registerTestComponents(hm *HealthMonitor, n int) {
for i := 0; i < n; i++ {
hm.RegisterComponent(ComponentConfig{
Name: fmt.Sprintf("comp-%d", i),
Type: "go_binary",
})
}
}