gomcp/internal/application/resilience/health_monitor.go

449 lines
12 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2026 Syntrex Lab. All rights reserved.
// Use of this source code is governed by an Apache-2.0 license
// that can be found in the LICENSE file.
package resilience
import (
"context"
"fmt"
"log/slog"
"sync"
"time"
)
// ComponentStatus is the health state recorded for a single monitored
// component.
type ComponentStatus string

// Health states a component can be in.
const (
	// StatusHealthy is the state a component is registered with and the state
	// it returns to after an evaluation in which no threshold is breached.
	StatusHealthy ComponentStatus = "HEALTHY"
	// StatusDegraded marks a component that breached a threshold but has not
	// yet accumulated MaxConsecutiveFailures consecutive failures.
	StatusDegraded ComponentStatus = "DEGRADED"
	// StatusCritical marks a component whose consecutive failure count has
	// reached MaxConsecutiveFailures.
	StatusCritical ComponentStatus = "CRITICAL"
	// StatusOffline is treated like StatusCritical when the overall system
	// status is aggregated.
	StatusOffline ComponentStatus = "OFFLINE"
)
// AlertSeverity classifies how serious a HealthAlert is.
type AlertSeverity string

// Severity levels attached to health alerts.
const (
	// SeverityInfo is the lowest severity level.
	// NOTE(review): nothing in this file emits alerts at this level.
	SeverityInfo AlertSeverity = "INFO"
	// SeverityWarning is used for alerts raised by a threshold breach.
	SeverityWarning AlertSeverity = "WARNING"
	// SeverityCritical is used for Z-score anomaly alerts and for quorum loss.
	SeverityCritical AlertSeverity = "CRITICAL"
)
// OverallStatus is the system-wide status derived from the statuses of all
// registered components.
type OverallStatus string

// System-wide status values.
const (
	// OverallHealthy means no component is degraded, critical or offline.
	OverallHealthy OverallStatus = "HEALTHY"
	// OverallDegraded means at least one component is degraded and none is
	// critical or offline.
	OverallDegraded OverallStatus = "DEGRADED"
	// OverallCritical means at least one component is critical or offline.
	OverallCritical OverallStatus = "CRITICAL"
)
// Default loop intervals, per the technical specification (TZ) §3.1.2.
const (
	// MetricsCollectionInterval is how often metrics are pulled from every
	// registered component.
	MetricsCollectionInterval = 10 * time.Second
	// HealthCheckInterval is how often components are evaluated against
	// their thresholds and baselines.
	HealthCheckInterval = 30 * time.Second
	// QuorumValidationInterval is how often the healthy-component quorum is
	// re-validated.
	QuorumValidationInterval = 60 * time.Second
)

// Detection thresholds.
const (
	// AnomalyZScoreThreshold is the Z-score limit for anomaly detection:
	// Z > 3.0 is treated as an anomaly (99.7% confidence).
	AnomalyZScoreThreshold = 3.0
	// QuorumThreshold is the minimum fraction of healthy components required
	// for quorum (roughly 2/3).
	QuorumThreshold = 0.66
	// MaxConsecutiveFailures is the number of consecutive failures after
	// which a component is marked CRITICAL.
	MaxConsecutiveFailures = 3
)
// ComponentConfig describes one monitored component and the per-metric
// thresholds its metrics are evaluated against.
type ComponentConfig struct {
// Name identifies the component; it is the key the monitor registers it under.
Name string `json:"name"`
Type string `json:"type"` // go_binary, c_binary, c_kernel_module
// Thresholds maps a metric name to its limit. Metrics without an entry are
// skipped during evaluation.
Thresholds map[string]float64 `json:"thresholds"`
// ThresholdIsMax tells, per metric, whether the threshold is an upper bound
// (true: breach when value > threshold) or a lower bound (false: breach when
// value < threshold). A metric missing from this map is treated as a lower
// bound.
ThresholdIsMax map[string]bool `json:"threshold_is_max"`
}
// ComponentHealth tracks the current health state of a single component.
type ComponentHealth struct {
// Name is the component name taken from its ComponentConfig.
Name string `json:"name"`
// Status is the most recently computed (or manually overridden) state.
Status ComponentStatus `json:"status"`
// Metrics holds the latest metric values, keyed by metric name.
Metrics map[string]float64 `json:"metrics"`
// LastCheck is the time the metrics were last updated successfully.
LastCheck time.Time `json:"last_check"`
// Consecutive counts consecutive failures: it is incremented on a failed
// metrics collection and on an evaluation with a threshold breach, and reset
// to zero by an evaluation without a breach.
Consecutive int `json:"consecutive_failures"`
// Config is the registration-time configuration; excluded from JSON output.
Config ComponentConfig `json:"-"`
}
// HealthAlert represents a detected health anomaly published on the alert bus.
type HealthAlert struct {
// Component is the name of the affected component ("system" for quorum alerts).
Component string `json:"component"`
// Severity is the alert's severity level.
Severity AlertSeverity `json:"severity"`
// Metric is the metric name that triggered the alert ("quorum" for quorum alerts).
Metric string `json:"metric"`
// Current is the observed value at the time of the alert.
Current float64 `json:"current"`
// Threshold is the limit the value was compared against: the configured
// threshold, the Z-score derived bound, or QuorumThreshold.
Threshold float64 `json:"threshold"`
// ZScore is set for anomaly alerts only; a zero value is omitted from JSON.
ZScore float64 `json:"z_score,omitempty"`
// Timestamp is when the alert was created.
Timestamp time.Time `json:"timestamp"`
// SuggestedAction is a human-readable hint for the alert consumer.
SuggestedAction string `json:"suggested_action"`
}
// HealthResponse is the API response for GET /api/v1/resilience/health.
type HealthResponse struct {
// OverallStatus is the system-wide status aggregated from all components.
OverallStatus OverallStatus `json:"overall_status"`
// Components holds a snapshot of every registered component.
Components []ComponentHealth `json:"components"`
// QuorumValid reports whether enough components are healthy to meet
// QuorumThreshold.
QuorumValid bool `json:"quorum_valid"`
// LastCheck is the time the snapshot was taken.
LastCheck time.Time `json:"last_check"`
// AnomaliesDetected lists detected anomalies.
// NOTE(review): GetHealth in this file does not fill in any alerts here.
AnomaliesDetected []HealthAlert `json:"anomalies_detected"`
}
// MetricsCollector is the interface for collecting metrics from components.
// Implementations can use /healthz endpoints, /metrics, or runtime stats.
type MetricsCollector interface {
// Collect returns the current metric values for the named component, keyed
// by metric name. The monitor calls it once per registered component in each
// collection cycle; a non-nil error is logged and counted as a failure for
// that component.
Collect(ctx context.Context, component string) (map[string]float64, error)
}
// HealthMonitor is the L1 Self-Monitoring orchestrator.
// It collects metrics, runs anomaly detection, validates quorum,
// and emits HealthAlerts to the alert bus.
type HealthMonitor struct {
// mu guards components and the ComponentHealth values it points to.
mu sync.RWMutex
// components holds the registered components, keyed by name.
components map[string]*ComponentHealth
// metricsDB stores every collected data point and supplies the baselines
// used for Z-score anomaly detection.
metricsDB *MetricsDB
// alertBus is the buffered channel alerts are published on; when it is full,
// new alerts are dropped rather than blocking.
alertBus chan HealthAlert
// collector is the source of component metrics.
collector MetricsCollector
// logger is the structured logger used by the monitor.
logger *slog.Logger
// anomalyWindow is the baseline window for Z-score calculation.
anomalyWindow time.Duration
}
// NewHealthMonitor builds a HealthMonitor that pulls metrics through
// collector. alertBufSize is the capacity of the alert channel; a value
// that is zero or negative selects a capacity of 100. The monitor starts
// with no registered components and a 24-hour anomaly baseline window.
func NewHealthMonitor(collector MetricsCollector, alertBufSize int) *HealthMonitor {
	bufSize := alertBufSize
	if bufSize <= 0 {
		bufSize = 100
	}

	hm := new(HealthMonitor)
	hm.components = map[string]*ComponentHealth{}
	hm.metricsDB = NewMetricsDB(DefaultMetricsWindow, DefaultMetricsMaxSize)
	hm.alertBus = make(chan HealthAlert, bufSize)
	hm.collector = collector
	hm.logger = slog.Default().With("component", "sarl-health-monitor")
	hm.anomalyWindow = 24 * time.Hour
	return hm
}
// RegisterComponent starts monitoring the component described by config.
// The component begins in StatusHealthy with an empty metric set. Registering
// a name that already exists replaces the previous entry.
func (hm *HealthMonitor) RegisterComponent(config ComponentConfig) {
	entry := &ComponentHealth{
		Name:    config.Name,
		Status:  StatusHealthy,
		Metrics: map[string]float64{},
		Config:  config,
	}

	hm.mu.Lock()
	defer hm.mu.Unlock()

	hm.components[config.Name] = entry
	hm.logger.Info("component registered", "name", config.Name, "type", config.Type)
}
// AlertBus exposes the alert channel as receive-only so that consumers can
// read emitted health alerts.
func (hm *HealthMonitor) AlertBus() <-chan HealthAlert {
	var bus <-chan HealthAlert = hm.alertBus
	return bus
}
// Start runs the three periodic loops (metrics collection, health
// evaluation and quorum validation) on their default intervals. It blocks
// until ctx is cancelled, then stops all tickers and returns.
func (hm *HealthMonitor) Start(ctx context.Context) {
	hm.logger.Info("health monitor started")

	collectTick := time.NewTicker(MetricsCollectionInterval)
	evalTick := time.NewTicker(HealthCheckInterval)
	quorumTick := time.NewTicker(QuorumValidationInterval)
	defer func() {
		quorumTick.Stop()
		evalTick.Stop()
		collectTick.Stop()
	}()

	done := ctx.Done()
	for {
		select {
		case <-collectTick.C:
			hm.collectMetrics(ctx)
		case <-evalTick.C:
			hm.checkHealth()
		case <-quorumTick.C:
			hm.validateQuorum()
		case <-done:
			hm.logger.Info("health monitor stopped")
			return
		}
	}
}
// collectMetrics gathers metrics from all registered components.
//
// The component names are snapshotted under the read lock and the collector
// is then called without holding any lock, so a slow collector does not block
// readers. On success the component's metrics and LastCheck are replaced and
// every value is appended to the time-series DB. On failure the component's
// consecutive failure counter is incremented.
//
// Collection stops as soon as ctx is cancelled; errors returned by the
// collector after cancellation are not counted as component failures.
//
// NOTE(review): a failed collection only bumps Consecutive; it does not
// change Status, and the next evaluation without a threshold breach resets
// the counter. Confirm whether collection failures should affect Status.
func (hm *HealthMonitor) collectMetrics(ctx context.Context) {
	hm.mu.RLock()
	names := make([]string, 0, len(hm.components))
	for name := range hm.components {
		names = append(names, name)
	}
	hm.mu.RUnlock()

	for _, name := range names {
		// Do not start further collections once shutdown has been requested.
		if ctx.Err() != nil {
			return
		}

		metrics, err := hm.collector.Collect(ctx, name)
		if err != nil {
			// A cancelled context makes collectors fail for reasons unrelated
			// to the component's health, so do not record it as a failure.
			if ctx.Err() != nil {
				return
			}
			hm.logger.Warn("metrics collection failed", "component", name, "error", err)
			hm.mu.Lock()
			if comp, ok := hm.components[name]; ok {
				comp.Consecutive++
			}
			hm.mu.Unlock()
			continue
		}

		hm.mu.Lock()
		// The component may have disappeared since the name snapshot was taken.
		if comp, ok := hm.components[name]; ok {
			comp.Metrics = metrics
			comp.LastCheck = time.Now()
			// Store each metric in time-series DB.
			for metric, value := range metrics {
				hm.metricsDB.AddDataPoint(name, metric, value)
			}
		}
		hm.mu.Unlock()
	}
}
// checkHealth runs an evaluation pass over every registered component and
// publishes the alerts each evaluation produces. The write lock is held for
// the whole pass because evaluation updates component status.
func (hm *HealthMonitor) checkHealth() {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	for name := range hm.components {
		for _, a := range hm.evaluateComponent(hm.components[name]) {
			hm.emitAlert(a)
		}
	}
}
// evaluateComponent checks a single component's metrics against its
// configured thresholds and runs Z-score anomaly detection on them. It
// returns the alerts generated during this pass and updates the component's
// Consecutive counter and Status in place, so the caller must hold the
// write lock.
//
// Only metrics that have a configured threshold are examined; other metrics
// are skipped entirely, including for anomaly detection. Status is driven by
// threshold breaches alone: an anomaly alert does not by itself change it.
func (hm *HealthMonitor) evaluateComponent(comp *ComponentHealth) []HealthAlert {
	var alerts []HealthAlert
	breached := false

	for metric, value := range comp.Metrics {
		threshold, hasThreshold := comp.Config.Thresholds[metric]
		if !hasThreshold {
			continue
		}

		// A metric missing from ThresholdIsMax reads as false and is therefore
		// treated as a lower bound.
		var exceeded bool
		if comp.Config.ThresholdIsMax[metric] {
			exceeded = value > threshold
		} else {
			exceeded = value < threshold
		}

		if exceeded {
			breached = true
			action := "restart"
			if metric == "error_rate" || metric == "latency_p99" {
				action = "investigate"
			}
			alerts = append(alerts, HealthAlert{
				Component:       comp.Name,
				Severity:        SeverityWarning,
				Metric:          metric,
				Current:         value,
				Threshold:       threshold,
				Timestamp:       time.Now(),
				SuggestedAction: action,
			})
		}

		// Z-score anomaly detection.
		baseline := hm.metricsDB.GetBaseline(comp.Name, metric, hm.anomalyWindow)
		if IsAnomaly(value, baseline, AnomalyZScoreThreshold) {
			zscore := CalculateZScore(value, baseline)
			// Report the bound on the side that was actually crossed: a
			// negative Z-score means the value fell below the baseline, so the
			// relevant limit is the lower one.
			bound := baseline.Mean + AnomalyZScoreThreshold*baseline.StdDev
			if zscore < 0 {
				bound = baseline.Mean - AnomalyZScoreThreshold*baseline.StdDev
			}
			alerts = append(alerts, HealthAlert{
				Component:       comp.Name,
				Severity:        SeverityCritical,
				Metric:          metric,
				Current:         value,
				Threshold:       bound,
				ZScore:          zscore,
				Timestamp:       time.Now(),
				SuggestedAction: fmt.Sprintf("anomaly detected (Z=%.2f), investigate %s", zscore, metric),
			})
		}
	}

	// Update component status.
	if breached {
		comp.Consecutive++
		if comp.Consecutive >= MaxConsecutiveFailures {
			comp.Status = StatusCritical
		} else {
			comp.Status = StatusDegraded
		}
	} else {
		comp.Consecutive = 0
		comp.Status = StatusHealthy
	}
	return alerts
}
// emitAlert publishes alert on the alert bus without blocking. If the bus
// buffer is full the alert is dropped and an error is logged; otherwise the
// emitted alert is logged at warning level.
func (hm *HealthMonitor) emitAlert(alert HealthAlert) {
	select {
	case hm.alertBus <- alert:
		// Delivered; the success log follows below.
	default:
		hm.logger.Error("alert bus full, dropping alert",
			"component", alert.Component,
			"metric", alert.Metric,
		)
		return
	}

	hm.logger.Warn("health alert emitted",
		"component", alert.Component,
		"severity", alert.Severity,
		"metric", alert.Metric,
		"current", alert.Current,
		"threshold", alert.Threshold,
	)
}
// validateQuorum verifies that the share of healthy components still meets
// QuorumThreshold. When it does not, the loss is logged and a critical
// "system" alert suggesting safe mode is emitted. Nothing happens while no
// components are registered.
func (hm *HealthMonitor) validateQuorum() {
	hm.mu.RLock()
	defer hm.mu.RUnlock()

	if len(hm.components) == 0 {
		return
	}
	if ValidateQuorum(hm.componentStatuses()) {
		return
	}

	// The read lock is held, so the ratio is the same for the log and the alert.
	ratio := hm.healthyRatio()
	hm.logger.Error("QUORUM LOST — entering degraded state",
		"healthy_ratio", ratio,
		"threshold", QuorumThreshold,
	)

	quorumAlert := HealthAlert{
		Component:       "system",
		Severity:        SeverityCritical,
		Metric:          "quorum",
		Current:         ratio,
		Threshold:       QuorumThreshold,
		Timestamp:       time.Now(),
		SuggestedAction: "activate safe mode",
	}
	hm.emitAlert(quorumAlert)
}
// ValidateQuorum reports whether the fraction of entries in statuses that
// are StatusHealthy is at least QuorumThreshold. An empty or nil map never
// has quorum.
func ValidateQuorum(statuses map[string]ComponentStatus) bool {
	total := len(statuses)
	if total == 0 {
		return false
	}

	var healthy int
	for name := range statuses {
		if statuses[name] != StatusHealthy {
			continue
		}
		healthy++
	}

	ratio := float64(healthy) / float64(total)
	return ratio >= QuorumThreshold
}
// componentStatuses builds a name-to-status map of all registered
// components. The caller must hold hm.mu (a read lock is sufficient).
func (hm *HealthMonitor) componentStatuses() map[string]ComponentStatus {
	out := make(map[string]ComponentStatus, len(hm.components))
	for name := range hm.components {
		out[name] = hm.components[name].Status
	}
	return out
}
// healthyRatio computes the share of registered components whose status is
// StatusHealthy, or 0 when nothing is registered. The caller must hold hm.mu
// (a read lock is sufficient).
func (hm *HealthMonitor) healthyRatio() float64 {
	total := len(hm.components)
	if total == 0 {
		return 0
	}

	var healthy int
	for _, comp := range hm.components {
		if comp.Status != StatusHealthy {
			continue
		}
		healthy++
	}
	return float64(healthy) / float64(total)
}
// GetHealth returns a snapshot of the entire system health.
//
// Each component is copied, including a deep copy of its metrics map, so the
// caller can read the result without holding the monitor's lock. The order of
// Components follows map iteration and is therefore unspecified.
//
// OverallStatus is CRITICAL if any component is critical or offline,
// DEGRADED if any is degraded and none is critical or offline, and HEALTHY
// otherwise. AnomaliesDetected is always a non-nil, empty slice so that it
// encodes as a JSON array like Components does, rather than as null.
func (hm *HealthMonitor) GetHealth() HealthResponse {
	hm.mu.RLock()
	defer hm.mu.RUnlock()

	components := make([]ComponentHealth, 0, len(hm.components))
	for _, comp := range hm.components {
		cp := *comp
		// Deep copy metrics.
		cp.Metrics = make(map[string]float64, len(comp.Metrics))
		for k, v := range comp.Metrics {
			cp.Metrics[k] = v
		}
		components = append(components, cp)
	}

	overall := OverallHealthy
	for _, comp := range components {
		switch comp.Status {
		case StatusCritical, StatusOffline:
			overall = OverallCritical
		case StatusDegraded:
			// Never downgrade an already-critical overall status.
			if overall != OverallCritical {
				overall = OverallDegraded
			}
		}
	}

	return HealthResponse{
		OverallStatus:     overall,
		Components:        components,
		QuorumValid:       ValidateQuorum(hm.componentStatuses()),
		LastCheck:         time.Now(),
		AnomaliesDetected: []HealthAlert{},
	}
}
// SetComponentStatus overrides the status of the named component (intended
// for testing and manual override). Unknown names are ignored.
func (hm *HealthMonitor) SetComponentStatus(name string, status ComponentStatus) {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	comp, found := hm.components[name]
	if !found {
		return
	}
	comp.Status = status
}
// UpdateMetrics manually updates a component's metrics (for testing/override).
//
// The values are copied into a map owned by the monitor, so the caller may
// keep using or modifying metrics afterwards without racing against readers
// that hold the monitor's lock. Every value is also appended to the
// time-series DB, and LastCheck is set to the current time. Unknown component
// names are ignored.
func (hm *HealthMonitor) UpdateMetrics(name string, metrics map[string]float64) {
	hm.mu.Lock()
	defer hm.mu.Unlock()

	comp, ok := hm.components[name]
	if !ok {
		return
	}

	snapshot := make(map[string]float64, len(metrics))
	for metric, value := range metrics {
		snapshot[metric] = value
		hm.metricsDB.AddDataPoint(name, metric, value)
	}
	comp.Metrics = snapshot
	comp.LastCheck = time.Now()
}
// ComponentCount reports how many components are currently registered.
func (hm *HealthMonitor) ComponentCount() int {
	hm.mu.RLock()
	n := len(hm.components)
	hm.mu.RUnlock()
	return n
}