mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-04-24 20:06:21 +02:00
449 lines
12 KiB
Go
449 lines
12 KiB
Go
// Copyright 2026 Syntrex Lab. All rights reserved.
|
||
// Use of this source code is governed by an Apache-2.0 license
|
||
// that can be found in the LICENSE file.
|
||
|
||
package resilience
|
||
|
||
import (
	"context"
	"fmt"
	"log/slog"
	"sort"
	"sync"
	"time"
)
|
||
|
||
// ComponentStatus is the health state recorded for one monitored component.
type ComponentStatus string

// Health states a component can be in.
const (
	// StatusHealthy is the state a component is registered with and returns
	// to after a health check without threshold breaches.
	StatusHealthy ComponentStatus = "HEALTHY"
	// StatusDegraded is set when a threshold is breached but the consecutive
	// failure count is still below MaxConsecutiveFailures.
	StatusDegraded ComponentStatus = "DEGRADED"
	// StatusCritical is set once the consecutive failure count reaches
	// MaxConsecutiveFailures.
	StatusCritical ComponentStatus = "CRITICAL"
	// StatusOffline is treated like StatusCritical when the overall status
	// is aggregated.
	StatusOffline ComponentStatus = "OFFLINE"
)
|
||
|
||
// AlertSeverity classifies how serious a HealthAlert is.
type AlertSeverity string

// Severity levels attached to health alerts.
const (
	SeverityInfo     AlertSeverity = "INFO"
	SeverityWarning  AlertSeverity = "WARNING"
	SeverityCritical AlertSeverity = "CRITICAL"
)
|
||
|
||
// OverallStatus is the system-wide status derived from the statuses of all
// registered components.
type OverallStatus string

// System-wide status values.
const (
	OverallHealthy  OverallStatus = "HEALTHY"
	OverallDegraded OverallStatus = "DEGRADED"
	OverallCritical OverallStatus = "CRITICAL"
)
|
||
|
||
// Default monitoring intervals and limits, per the technical specification
// §3.1.2.
const (
	// MetricsCollectionInterval is how often metrics are collected.
	MetricsCollectionInterval = 10 * time.Second
	// HealthCheckInterval is how often components are evaluated.
	HealthCheckInterval = 30 * time.Second
	// QuorumValidationInterval is how often the healthy quorum is validated.
	QuorumValidationInterval = time.Minute

	// AnomalyZScoreThreshold is the Z-score limit passed to the anomaly
	// check: Z > 3.0 counts as an anomaly (99.7% confidence).
	AnomalyZScoreThreshold = 3.0

	// QuorumThreshold is the minimum fraction of components that must be
	// healthy; 0.66 stands in for the 2/3 requirement.
	QuorumThreshold = 0.66

	// MaxConsecutiveFailures is the number of consecutive failures after
	// which a component is marked CRITICAL.
	MaxConsecutiveFailures = 3
)
|
||
|
||
// ComponentConfig describes a monitored component and the per-metric
// thresholds its metrics are checked against.
type ComponentConfig struct {
	// Name identifies the component; it is the key under which the component
	// is registered with the monitor.
	Name string `json:"name"`
	// Type is the component kind: go_binary, c_binary, c_kernel_module.
	Type string `json:"type"`
	// Thresholds maps a metric name to its limit. Metrics without an entry
	// are not evaluated.
	Thresholds map[string]float64 `json:"thresholds"`
	// ThresholdIsMax tells, per metric, whether the threshold is an upper
	// bound (true) or a lower bound (false). A metric with no entry reads as
	// false, i.e. a lower bound.
	ThresholdIsMax map[string]bool `json:"threshold_is_max"`
}
|
||
|
||
// ComponentHealth tracks the health state of a single component.
|
||
type ComponentHealth struct {
|
||
Name string `json:"name"`
|
||
Status ComponentStatus `json:"status"`
|
||
Metrics map[string]float64 `json:"metrics"`
|
||
LastCheck time.Time `json:"last_check"`
|
||
Consecutive int `json:"consecutive_failures"`
|
||
Config ComponentConfig `json:"-"`
|
||
}
|
||
|
||
// HealthAlert represents a detected health anomaly.
|
||
type HealthAlert struct {
|
||
Component string `json:"component"`
|
||
Severity AlertSeverity `json:"severity"`
|
||
Metric string `json:"metric"`
|
||
Current float64 `json:"current"`
|
||
Threshold float64 `json:"threshold"`
|
||
ZScore float64 `json:"z_score,omitempty"`
|
||
Timestamp time.Time `json:"timestamp"`
|
||
SuggestedAction string `json:"suggested_action"`
|
||
}
|
||
|
||
// HealthResponse is the API response for GET /api/v1/resilience/health.
|
||
type HealthResponse struct {
|
||
OverallStatus OverallStatus `json:"overall_status"`
|
||
Components []ComponentHealth `json:"components"`
|
||
QuorumValid bool `json:"quorum_valid"`
|
||
LastCheck time.Time `json:"last_check"`
|
||
AnomaliesDetected []HealthAlert `json:"anomalies_detected"`
|
||
}
|
||
|
||
// MetricsCollector abstracts how metrics are obtained from a component.
// Implementations can use /healthz endpoints, /metrics, or runtime stats.
type MetricsCollector interface {
	// Collect returns the current metric values for the named component,
	// keyed by metric name, or an error when collection fails.
	Collect(ctx context.Context, component string) (map[string]float64, error)
}
|
||
|
||
// HealthMonitor is the L1 Self-Monitoring orchestrator.
|
||
// It collects metrics, runs anomaly detection, validates quorum,
|
||
// and emits HealthAlerts to the alert bus.
|
||
type HealthMonitor struct {
|
||
mu sync.RWMutex
|
||
components map[string]*ComponentHealth
|
||
metricsDB *MetricsDB
|
||
alertBus chan HealthAlert
|
||
collector MetricsCollector
|
||
logger *slog.Logger
|
||
|
||
// anomalyWindow is the baseline window for Z-score calculation.
|
||
anomalyWindow time.Duration
|
||
}
|
||
|
||
// NewHealthMonitor creates a new health monitor.
|
||
func NewHealthMonitor(collector MetricsCollector, alertBufSize int) *HealthMonitor {
|
||
if alertBufSize <= 0 {
|
||
alertBufSize = 100
|
||
}
|
||
return &HealthMonitor{
|
||
components: make(map[string]*ComponentHealth),
|
||
metricsDB: NewMetricsDB(DefaultMetricsWindow, DefaultMetricsMaxSize),
|
||
alertBus: make(chan HealthAlert, alertBufSize),
|
||
collector: collector,
|
||
logger: slog.Default().With("component", "sarl-health-monitor"),
|
||
anomalyWindow: 24 * time.Hour,
|
||
}
|
||
}
|
||
|
||
// RegisterComponent adds a component to be monitored.
|
||
func (hm *HealthMonitor) RegisterComponent(config ComponentConfig) {
|
||
hm.mu.Lock()
|
||
defer hm.mu.Unlock()
|
||
|
||
hm.components[config.Name] = &ComponentHealth{
|
||
Name: config.Name,
|
||
Status: StatusHealthy,
|
||
Metrics: make(map[string]float64),
|
||
Config: config,
|
||
}
|
||
hm.logger.Info("component registered", "name", config.Name, "type", config.Type)
|
||
}
|
||
|
||
// AlertBus returns the channel for consuming health alerts.
|
||
func (hm *HealthMonitor) AlertBus() <-chan HealthAlert {
|
||
return hm.alertBus
|
||
}
|
||
|
||
// Start begins the monitoring loops. Blocks until ctx is cancelled.
|
||
func (hm *HealthMonitor) Start(ctx context.Context) {
|
||
hm.logger.Info("health monitor started")
|
||
|
||
metricsTicker := time.NewTicker(MetricsCollectionInterval)
|
||
healthTicker := time.NewTicker(HealthCheckInterval)
|
||
quorumTicker := time.NewTicker(QuorumValidationInterval)
|
||
defer metricsTicker.Stop()
|
||
defer healthTicker.Stop()
|
||
defer quorumTicker.Stop()
|
||
|
||
for {
|
||
select {
|
||
case <-ctx.Done():
|
||
hm.logger.Info("health monitor stopped")
|
||
return
|
||
case <-metricsTicker.C:
|
||
hm.collectMetrics(ctx)
|
||
case <-healthTicker.C:
|
||
hm.checkHealth()
|
||
case <-quorumTicker.C:
|
||
hm.validateQuorum()
|
||
}
|
||
}
|
||
}
|
||
|
||
// collectMetrics gathers metrics from all registered components.
|
||
func (hm *HealthMonitor) collectMetrics(ctx context.Context) {
|
||
hm.mu.RLock()
|
||
names := make([]string, 0, len(hm.components))
|
||
for name := range hm.components {
|
||
names = append(names, name)
|
||
}
|
||
hm.mu.RUnlock()
|
||
|
||
for _, name := range names {
|
||
metrics, err := hm.collector.Collect(ctx, name)
|
||
if err != nil {
|
||
hm.logger.Warn("metrics collection failed", "component", name, "error", err)
|
||
hm.mu.Lock()
|
||
if comp, ok := hm.components[name]; ok {
|
||
comp.Consecutive++
|
||
}
|
||
hm.mu.Unlock()
|
||
continue
|
||
}
|
||
|
||
hm.mu.Lock()
|
||
comp, ok := hm.components[name]
|
||
if ok {
|
||
comp.Metrics = metrics
|
||
comp.LastCheck = time.Now()
|
||
// Store each metric in time-series DB.
|
||
for metric, value := range metrics {
|
||
hm.metricsDB.AddDataPoint(name, metric, value)
|
||
}
|
||
}
|
||
hm.mu.Unlock()
|
||
}
|
||
}
|
||
|
||
// checkHealth evaluates each component against thresholds and anomalies.
|
||
func (hm *HealthMonitor) checkHealth() {
|
||
hm.mu.Lock()
|
||
defer hm.mu.Unlock()
|
||
|
||
for _, comp := range hm.components {
|
||
alerts := hm.evaluateComponent(comp)
|
||
for _, alert := range alerts {
|
||
hm.emitAlert(alert)
|
||
}
|
||
}
|
||
}
|
||
|
||
// evaluateComponent checks a single component's metrics against thresholds
|
||
// and runs Z-score anomaly detection. Returns any generated alerts.
|
||
func (hm *HealthMonitor) evaluateComponent(comp *ComponentHealth) []HealthAlert {
|
||
var alerts []HealthAlert
|
||
breached := false
|
||
|
||
for metric, value := range comp.Metrics {
|
||
threshold, hasThreshold := comp.Config.Thresholds[metric]
|
||
if !hasThreshold {
|
||
continue
|
||
}
|
||
|
||
isMax := comp.Config.ThresholdIsMax[metric]
|
||
var exceeded bool
|
||
if isMax {
|
||
exceeded = value > threshold
|
||
} else {
|
||
exceeded = value < threshold
|
||
}
|
||
|
||
if exceeded {
|
||
breached = true
|
||
action := "restart"
|
||
if metric == "error_rate" || metric == "latency_p99" {
|
||
action = "investigate"
|
||
}
|
||
|
||
alerts = append(alerts, HealthAlert{
|
||
Component: comp.Name,
|
||
Severity: SeverityWarning,
|
||
Metric: metric,
|
||
Current: value,
|
||
Threshold: threshold,
|
||
Timestamp: time.Now(),
|
||
SuggestedAction: action,
|
||
})
|
||
}
|
||
|
||
// Z-score anomaly detection.
|
||
baseline := hm.metricsDB.GetBaseline(comp.Name, metric, hm.anomalyWindow)
|
||
if IsAnomaly(value, baseline, AnomalyZScoreThreshold) {
|
||
zscore := CalculateZScore(value, baseline)
|
||
alerts = append(alerts, HealthAlert{
|
||
Component: comp.Name,
|
||
Severity: SeverityCritical,
|
||
Metric: metric,
|
||
Current: value,
|
||
Threshold: baseline.Mean + AnomalyZScoreThreshold*baseline.StdDev,
|
||
ZScore: zscore,
|
||
Timestamp: time.Now(),
|
||
SuggestedAction: fmt.Sprintf("anomaly detected (Z=%.2f), investigate %s", zscore, metric),
|
||
})
|
||
}
|
||
}
|
||
|
||
// Update component status.
|
||
if breached {
|
||
comp.Consecutive++
|
||
if comp.Consecutive >= MaxConsecutiveFailures {
|
||
comp.Status = StatusCritical
|
||
} else {
|
||
comp.Status = StatusDegraded
|
||
}
|
||
} else {
|
||
comp.Consecutive = 0
|
||
comp.Status = StatusHealthy
|
||
}
|
||
|
||
return alerts
|
||
}
|
||
|
||
// emitAlert sends an alert to the bus (non-blocking).
|
||
func (hm *HealthMonitor) emitAlert(alert HealthAlert) {
|
||
select {
|
||
case hm.alertBus <- alert:
|
||
hm.logger.Warn("health alert emitted",
|
||
"component", alert.Component,
|
||
"severity", alert.Severity,
|
||
"metric", alert.Metric,
|
||
"current", alert.Current,
|
||
"threshold", alert.Threshold,
|
||
)
|
||
default:
|
||
hm.logger.Error("alert bus full, dropping alert",
|
||
"component", alert.Component,
|
||
"metric", alert.Metric,
|
||
)
|
||
}
|
||
}
|
||
|
||
// validateQuorum checks if 2/3 of components are healthy.
|
||
func (hm *HealthMonitor) validateQuorum() {
|
||
hm.mu.RLock()
|
||
defer hm.mu.RUnlock()
|
||
|
||
if len(hm.components) == 0 {
|
||
return
|
||
}
|
||
|
||
valid := ValidateQuorum(hm.componentStatuses())
|
||
|
||
if !valid {
|
||
hm.logger.Error("QUORUM LOST — entering degraded state",
|
||
"healthy_ratio", hm.healthyRatio(),
|
||
"threshold", QuorumThreshold,
|
||
)
|
||
hm.emitAlert(HealthAlert{
|
||
Component: "system",
|
||
Severity: SeverityCritical,
|
||
Metric: "quorum",
|
||
Current: hm.healthyRatio(),
|
||
Threshold: QuorumThreshold,
|
||
Timestamp: time.Now(),
|
||
SuggestedAction: "activate safe mode",
|
||
})
|
||
}
|
||
}
|
||
|
||
// ValidateQuorum checks if the healthy ratio meets the 2/3 threshold.
|
||
func ValidateQuorum(statuses map[string]ComponentStatus) bool {
|
||
if len(statuses) == 0 {
|
||
return false
|
||
}
|
||
|
||
healthy := 0
|
||
for _, status := range statuses {
|
||
if status == StatusHealthy {
|
||
healthy++
|
||
}
|
||
}
|
||
return float64(healthy)/float64(len(statuses)) >= QuorumThreshold
|
||
}
|
||
|
||
// componentStatuses returns current status map (caller must hold RLock).
|
||
func (hm *HealthMonitor) componentStatuses() map[string]ComponentStatus {
|
||
statuses := make(map[string]ComponentStatus, len(hm.components))
|
||
for name, comp := range hm.components {
|
||
statuses[name] = comp.Status
|
||
}
|
||
return statuses
|
||
}
|
||
|
||
// healthyRatio returns the fraction of healthy components (caller must hold RLock).
|
||
func (hm *HealthMonitor) healthyRatio() float64 {
|
||
if len(hm.components) == 0 {
|
||
return 0
|
||
}
|
||
healthy := 0
|
||
for _, comp := range hm.components {
|
||
if comp.Status == StatusHealthy {
|
||
healthy++
|
||
}
|
||
}
|
||
return float64(healthy) / float64(len(hm.components))
|
||
}
|
||
|
||
// GetHealth returns a snapshot of the entire system health.
|
||
func (hm *HealthMonitor) GetHealth() HealthResponse {
|
||
hm.mu.RLock()
|
||
defer hm.mu.RUnlock()
|
||
|
||
components := make([]ComponentHealth, 0, len(hm.components))
|
||
for _, comp := range hm.components {
|
||
cp := *comp
|
||
// Deep copy metrics.
|
||
cp.Metrics = make(map[string]float64, len(comp.Metrics))
|
||
for k, v := range comp.Metrics {
|
||
cp.Metrics[k] = v
|
||
}
|
||
components = append(components, cp)
|
||
}
|
||
|
||
overall := OverallHealthy
|
||
for _, comp := range components {
|
||
switch comp.Status {
|
||
case StatusCritical, StatusOffline:
|
||
overall = OverallCritical
|
||
case StatusDegraded:
|
||
if overall != OverallCritical {
|
||
overall = OverallDegraded
|
||
}
|
||
}
|
||
}
|
||
|
||
return HealthResponse{
|
||
OverallStatus: overall,
|
||
Components: components,
|
||
QuorumValid: ValidateQuorum(hm.componentStatuses()),
|
||
LastCheck: time.Now(),
|
||
}
|
||
}
|
||
|
||
// SetComponentStatus manually sets a component's status (for testing/override).
|
||
func (hm *HealthMonitor) SetComponentStatus(name string, status ComponentStatus) {
|
||
hm.mu.Lock()
|
||
defer hm.mu.Unlock()
|
||
|
||
if comp, ok := hm.components[name]; ok {
|
||
comp.Status = status
|
||
}
|
||
}
|
||
|
||
// UpdateMetrics manually updates a component's metrics (for testing/override).
|
||
func (hm *HealthMonitor) UpdateMetrics(name string, metrics map[string]float64) {
|
||
hm.mu.Lock()
|
||
defer hm.mu.Unlock()
|
||
|
||
if comp, ok := hm.components[name]; ok {
|
||
comp.Metrics = metrics
|
||
comp.LastCheck = time.Now()
|
||
for metric, value := range metrics {
|
||
hm.metricsDB.AddDataPoint(name, metric, value)
|
||
}
|
||
}
|
||
}
|
||
|
||
// ComponentCount returns the number of registered components.
|
||
func (hm *HealthMonitor) ComponentCount() int {
|
||
hm.mu.RLock()
|
||
defer hm.mu.RUnlock()
|
||
return len(hm.components)
|
||
}
|