gomcp/internal/infrastructure/watchdog/watchdog_test.go

249 lines
6 KiB
Go

package watchdog
import (
	"context"
	"net/http"
	"net/http/httptest"
	"sync/atomic"
	"testing"
	"time"
)
// TestRegisterPeer verifies that each registered peer shows up in AllPeers.
func TestRegisterPeer(t *testing.T) {
	mon := NewMonitor("test-self")

	// Two distinct peer names are registered, so two entries are expected.
	mon.RegisterPeer("immune", "http://localhost:9760/health")
	mon.RegisterPeer("sidecar", "http://localhost:9770/health")

	if got := len(mon.AllPeers()); got != 2 {
		t.Fatalf("peer count = %d, want 2", got)
	}
}
// TestHealthyPeer checks that a peer answering 200 OK is reported as
// StatusHealthy with a zero missed count after one check cycle.
func TestHealthyPeer(t *testing.T) {
	// Stub peer whose endpoint always answers 200 OK.
	alwaysOK := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
	}
	stub := httptest.NewServer(http.HandlerFunc(alwaysOK))
	defer stub.Close()

	mon := NewMonitor("test-self")
	mon.RegisterPeer("healthy-peer", stub.URL+"/health")

	// A single pass over the registered peers is all this test needs.
	mon.checkAllPeers(context.Background())

	status, found := mon.GetPeerStatus("healthy-peer")
	if !found {
		t.Fatal("peer not found")
	}
	if status.Status != StatusHealthy {
		t.Errorf("status = %s, want HEALTHY", status.Status)
	}
	if status.MissedCount != 0 {
		t.Errorf("missed = %d, want 0", status.MissedCount)
	}
}
// TestUnhealthyPeerDegraded checks that one failed health check moves a peer
// to StatusDegraded and records exactly one miss.
func TestUnhealthyPeerDegraded(t *testing.T) {
	// Start a server and close it right away: its URL then points at a port
	// that was free a moment ago, instead of betting on a hard-coded port
	// being unused on whatever machine runs the tests.
	srv := httptest.NewServer(http.NotFoundHandler())
	deadURL := srv.URL + "/health"
	srv.Close()

	m := NewMonitor("test-self")
	m.RegisterPeer("dead-peer", deadURL)

	// One miss → DEGRADED.
	m.checkAllPeers(context.Background())

	// Fail fast if the peer is missing so the assertions below never run
	// against an absent entry.
	peer, ok := m.GetPeerStatus("dead-peer")
	if !ok {
		t.Fatal("peer not found")
	}
	if peer.Status != StatusDegraded {
		t.Errorf("status = %s, want DEGRADED", peer.Status)
	}
	if peer.MissedCount != 1 {
		t.Errorf("missed = %d, want 1", peer.MissedCount)
	}
}
// TestEscalationToRestart checks that MaxMissedBeforeRestart consecutive
// failed checks mark the peer StatusOffline, bump its restart count to 1, and
// deliver a "restart" action to the escalation callback.
func TestEscalationToRestart(t *testing.T) {
	// Start a server and close it right away: its URL then points at a port
	// that was free a moment ago, instead of betting on a hard-coded port
	// being unused on whatever machine runs the tests.
	srv := httptest.NewServer(http.NotFoundHandler())
	deadURL := srv.URL + "/health"
	srv.Close()

	m := NewMonitor("test-self")
	m.RegisterPeer("flaky-peer", deadURL)

	// NOTE(review): assumes the callback runs synchronously inside
	// checkAllPeers; if it can run on another goroutine this slice needs a
	// mutex — confirm against the Monitor implementation.
	var escalations []EscalationAction
	m.OnEscalation(func(a EscalationAction) {
		escalations = append(escalations, a)
	})

	ctx := context.Background()

	// Miss MaxMissedBeforeRestart heartbeats → should trigger restart.
	for i := 0; i < MaxMissedBeforeRestart; i++ {
		m.checkAllPeers(ctx)
	}

	// Fail fast if the peer is missing so the assertions below never run
	// against an absent entry.
	peer, ok := m.GetPeerStatus("flaky-peer")
	if !ok {
		t.Fatal("peer not found")
	}
	if peer.Status != StatusOffline {
		t.Errorf("status = %s, want OFFLINE", peer.Status)
	}
	if peer.RestartCount != 1 {
		t.Errorf("restart_count = %d, want 1", peer.RestartCount)
	}

	// Check that escalation was fired.
	found := false
	for _, e := range escalations {
		if e.Action == "restart" {
			found = true
			break
		}
	}
	if !found {
		t.Error("expected 'restart' escalation, got none")
	}
}
// TestEscalationToIsolate checks that a peer which keeps failing through
// MaxRestartsBeforeIsolate restart cycles plus one more miss cycle ends up
// StatusIsolated, and that an "isolate" escalation with severity "CRITICAL"
// is delivered.
func TestEscalationToIsolate(t *testing.T) {
	// Start a server and close it right away: its URL then points at a port
	// that was free a moment ago, instead of betting on a hard-coded port
	// being unused on whatever machine runs the tests.
	srv := httptest.NewServer(http.NotFoundHandler())
	deadURL := srv.URL + "/health"
	srv.Close()

	m := NewMonitor("test-self")
	m.RegisterPeer("broken-peer", deadURL)

	// NOTE(review): assumes the callback runs synchronously inside
	// checkAllPeers; if it can run on another goroutine this slice needs a
	// mutex — confirm against the Monitor implementation.
	var escalations []EscalationAction
	m.OnEscalation(func(a EscalationAction) {
		escalations = append(escalations, a)
	})

	ctx := context.Background()

	// Trigger MaxRestartsBeforeIsolate restart cycles.
	for r := 0; r < MaxRestartsBeforeIsolate; r++ {
		for i := 0; i < MaxMissedBeforeRestart; i++ {
			m.checkAllPeers(ctx)
		}
	}

	// Now one more miss cycle should trigger isolation.
	for i := 0; i < MaxMissedBeforeRestart; i++ {
		m.checkAllPeers(ctx)
	}

	// Fail fast if the peer is missing so the assertions below never run
	// against an absent entry.
	peer, ok := m.GetPeerStatus("broken-peer")
	if !ok {
		t.Fatal("peer not found")
	}
	if peer.Status != StatusIsolated {
		t.Errorf("status = %s, want ISOLATED", peer.Status)
	}

	// Check for isolate escalation; only the first one is inspected.
	found := false
	for _, e := range escalations {
		if e.Action == "isolate" {
			found = true
			if e.Severity != "CRITICAL" {
				t.Errorf("isolate severity = %s, want CRITICAL", e.Severity)
			}
			break
		}
	}
	if !found {
		t.Error("expected 'isolate' escalation, got none")
	}
}
// TestRecoveryAfterRestart drives one peer through healthy → failing →
// healthy and checks that it is reported HEALTHY, then DEGRADED, then HEALTHY
// again with its missed count reset to zero.
func TestRecoveryAfterRestart(t *testing.T) {
	// The flag is written by the test goroutine and read by the server's
	// handler goroutine, so it is an atomic rather than a plain bool.
	var healthy atomic.Bool
	healthy.Store(true)

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if healthy.Load() {
			w.WriteHeader(http.StatusOK)
			return
		}
		w.WriteHeader(http.StatusServiceUnavailable)
	}))
	defer srv.Close()

	m := NewMonitor("test-self")
	m.RegisterPeer("recovering-peer", srv.URL+"/health")
	ctx := context.Background()

	// Initially healthy.
	m.checkAllPeers(ctx)
	peer, ok := m.GetPeerStatus("recovering-peer")
	if !ok {
		t.Fatal("peer not found")
	}
	if peer.Status != StatusHealthy {
		t.Fatalf("initial status = %s, want HEALTHY", peer.Status)
	}

	// Goes down.
	healthy.Store(false)
	m.checkAllPeers(ctx)
	peer, ok = m.GetPeerStatus("recovering-peer")
	if !ok {
		t.Fatal("peer not found")
	}
	if peer.Status != StatusDegraded {
		t.Fatalf("down status = %s, want DEGRADED", peer.Status)
	}

	// Comes back.
	healthy.Store(true)
	m.checkAllPeers(ctx)
	peer, ok = m.GetPeerStatus("recovering-peer")
	if !ok {
		t.Fatal("peer not found")
	}
	if peer.Status != StatusHealthy {
		t.Errorf("recovered status = %s, want HEALTHY", peer.Status)
	}
	if peer.MissedCount != 0 {
		t.Errorf("missed after recovery = %d, want 0", peer.MissedCount)
	}
}
// TestStats checks the counters reported by Stats after two check cycles
// against a single unreachable peer: two checks, two misses, one peer.
func TestStats(t *testing.T) {
	// Start a server and close it right away: its URL then points at a port
	// that was free a moment ago, instead of betting on a hard-coded port
	// being unused on whatever machine runs the tests.
	srv := httptest.NewServer(http.NotFoundHandler())
	deadURL := srv.URL + "/health"
	srv.Close()

	m := NewMonitor("test-self")
	m.RegisterPeer("p1", deadURL)

	ctx := context.Background()
	m.checkAllPeers(ctx)
	m.checkAllPeers(ctx)

	stats := m.Stats()
	if stats.TotalChecks != 2 {
		t.Errorf("total_checks = %d, want 2", stats.TotalChecks)
	}
	if stats.TotalMisses != 2 {
		t.Errorf("total_misses = %d, want 2", stats.TotalMisses)
	}
	if stats.PeerCount != 1 {
		t.Errorf("peer_count = %d, want 1", stats.PeerCount)
	}
}
// TestServeHTTP calls the monitor's HTTP handler directly and checks that it
// answers 200 with a Content-Type of application/json.
func TestServeHTTP(t *testing.T) {
	mon := NewMonitor("test-self")
	mon.RegisterPeer("p1", "http://localhost:9760/health")

	// Call the handler in-process; no listener is started.
	req := httptest.NewRequest(http.MethodGet, "/mesh", nil)
	rec := httptest.NewRecorder()
	mon.ServeHTTP(rec, req)

	if rec.Code != http.StatusOK {
		t.Errorf("status = %d, want 200", rec.Code)
	}

	contentType := rec.Header().Get("Content-Type")
	if contentType != "application/json" {
		t.Errorf("content-type = %s, want application/json", contentType)
	}
}
// TestMonitorStartStop runs Start with a 50ms interval under a 200ms context
// timeout against a healthy stub peer and expects at least two checks to
// have been counted.
func TestMonitorStartStop(t *testing.T) {
	// Stub peer that always answers 200 OK.
	stub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
	}))
	defer stub.Close()

	mon := NewMonitor("test-self")
	mon.interval = 50 * time.Millisecond // Short interval keeps the test quick.
	mon.RegisterPeer("fast-peer", stub.URL+"/health")

	runCtx, stop := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer stop()

	// Start is expected to block here and return once runCtx expires.
	mon.Start(runCtx)

	if checks := mon.Stats().TotalChecks; checks < 2 {
		t.Errorf("expected at least 2 checks in 200ms, got %d", checks)
	}
}