mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-05-08 19:12:37 +02:00
Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates
This commit is contained in:
parent
694e32be26
commit
41cbfd6e0a
178 changed files with 36008 additions and 399 deletions
445
internal/application/resilience/health_monitor.go
Normal file
445
internal/application/resilience/health_monitor.go
Normal file
|
|
@ -0,0 +1,445 @@
|
|||
package resilience
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ComponentStatus is the health state recorded for one monitored component.
type ComponentStatus string

// Health states a component can be in.
const (
	// StatusHealthy is the state assigned at registration and whenever an
	// evaluation finds no threshold breach.
	StatusHealthy ComponentStatus = "HEALTHY"

	// StatusDegraded is assigned when a threshold is breached but the
	// consecutive-failure counter is still below MaxConsecutiveFailures.
	StatusDegraded ComponentStatus = "DEGRADED"

	// StatusCritical is assigned once the consecutive-failure counter reaches
	// MaxConsecutiveFailures.
	StatusCritical ComponentStatus = "CRITICAL"

	// StatusOffline is treated like StatusCritical when the overall status is
	// aggregated.
	StatusOffline ComponentStatus = "OFFLINE"
)
|
||||
|
||||
// AlertSeverity ranks how serious a HealthAlert is.
type AlertSeverity string

// Severity levels attached to health alerts.
const (
	// SeverityInfo marks informational alerts.
	SeverityInfo AlertSeverity = "INFO"

	// SeverityWarning is used for threshold-breach alerts.
	SeverityWarning AlertSeverity = "WARNING"

	// SeverityCritical is used for Z-score anomaly alerts and quorum loss.
	SeverityCritical AlertSeverity = "CRITICAL"
)
|
||||
|
||||
// OverallStatus is the system-wide status derived from the statuses of all
// registered components.
type OverallStatus string

// System-wide status values.
const (
	// OverallHealthy means no component is degraded, critical or offline.
	OverallHealthy OverallStatus = "HEALTHY"

	// OverallDegraded means at least one component is degraded and none is
	// critical or offline.
	OverallDegraded OverallStatus = "DEGRADED"

	// OverallCritical means at least one component is critical or offline.
	OverallCritical OverallStatus = "CRITICAL"
)
|
||||
|
||||
// Default monitoring cadences and detection limits, per the technical
// specification (TZ) §3.1.2.
const (
	// MetricsCollectionInterval is the period between metric collections.
	MetricsCollectionInterval = 10 * time.Second

	// HealthCheckInterval is the period between threshold/anomaly evaluations.
	HealthCheckInterval = 30 * time.Second

	// QuorumValidationInterval is the period between quorum validations.
	QuorumValidationInterval = time.Minute

	// AnomalyZScoreThreshold is the Z-score cutoff: Z > 3.0 is treated as an
	// anomaly (99.7% confidence).
	AnomalyZScoreThreshold = 3.0

	// QuorumThreshold is the minimum fraction of healthy components required
	// for quorum; 0.66 approximates the intended 2/3.
	QuorumThreshold = 0.66

	// MaxConsecutiveFailures is the number of consecutive failures after
	// which a component is marked CRITICAL.
	MaxConsecutiveFailures = 3
)
|
||||
|
||||
// ComponentConfig describes one monitored component and the per-metric
// thresholds it is evaluated against.
type ComponentConfig struct {
	// Name identifies the component; it is the key used for registration.
	Name string `json:"name"`

	// Type is the component kind: go_binary, c_binary, c_kernel_module.
	Type string `json:"type"`

	// Thresholds maps a metric name to its limit. Metrics without an entry
	// are not evaluated.
	Thresholds map[string]float64 `json:"thresholds"`

	// ThresholdIsMax tells, per metric, whether the threshold is an upper
	// bound (true) or a lower bound (false). A metric with no entry reads as
	// false, i.e. a lower bound.
	ThresholdIsMax map[string]bool `json:"threshold_is_max"`
}
|
||||
|
||||
// ComponentHealth tracks the health state of a single component.
|
||||
type ComponentHealth struct {
|
||||
Name string `json:"name"`
|
||||
Status ComponentStatus `json:"status"`
|
||||
Metrics map[string]float64 `json:"metrics"`
|
||||
LastCheck time.Time `json:"last_check"`
|
||||
Consecutive int `json:"consecutive_failures"`
|
||||
Config ComponentConfig `json:"-"`
|
||||
}
|
||||
|
||||
// HealthAlert represents a detected health anomaly.
|
||||
type HealthAlert struct {
|
||||
Component string `json:"component"`
|
||||
Severity AlertSeverity `json:"severity"`
|
||||
Metric string `json:"metric"`
|
||||
Current float64 `json:"current"`
|
||||
Threshold float64 `json:"threshold"`
|
||||
ZScore float64 `json:"z_score,omitempty"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
SuggestedAction string `json:"suggested_action"`
|
||||
}
|
||||
|
||||
// HealthResponse is the API response for GET /api/v1/resilience/health.
|
||||
type HealthResponse struct {
|
||||
OverallStatus OverallStatus `json:"overall_status"`
|
||||
Components []ComponentHealth `json:"components"`
|
||||
QuorumValid bool `json:"quorum_valid"`
|
||||
LastCheck time.Time `json:"last_check"`
|
||||
AnomaliesDetected []HealthAlert `json:"anomalies_detected"`
|
||||
}
|
||||
|
||||
// MetricsCollector abstracts how metrics are obtained from a component.
// Implementations may read /healthz endpoints, /metrics, or runtime stats.
type MetricsCollector interface {
	// Collect returns the current metric values of the named component,
	// keyed by metric name, or an error when they cannot be obtained.
	Collect(ctx context.Context, component string) (map[string]float64, error)
}
|
||||
|
||||
// HealthMonitor is the L1 Self-Monitoring orchestrator.
|
||||
// It collects metrics, runs anomaly detection, validates quorum,
|
||||
// and emits HealthAlerts to the alert bus.
|
||||
type HealthMonitor struct {
|
||||
mu sync.RWMutex
|
||||
components map[string]*ComponentHealth
|
||||
metricsDB *MetricsDB
|
||||
alertBus chan HealthAlert
|
||||
collector MetricsCollector
|
||||
logger *slog.Logger
|
||||
|
||||
// anomalyWindow is the baseline window for Z-score calculation.
|
||||
anomalyWindow time.Duration
|
||||
}
|
||||
|
||||
// NewHealthMonitor creates a new health monitor.
|
||||
func NewHealthMonitor(collector MetricsCollector, alertBufSize int) *HealthMonitor {
|
||||
if alertBufSize <= 0 {
|
||||
alertBufSize = 100
|
||||
}
|
||||
return &HealthMonitor{
|
||||
components: make(map[string]*ComponentHealth),
|
||||
metricsDB: NewMetricsDB(DefaultMetricsWindow, DefaultMetricsMaxSize),
|
||||
alertBus: make(chan HealthAlert, alertBufSize),
|
||||
collector: collector,
|
||||
logger: slog.Default().With("component", "sarl-health-monitor"),
|
||||
anomalyWindow: 24 * time.Hour,
|
||||
}
|
||||
}
|
||||
|
||||
// RegisterComponent adds a component to be monitored.
|
||||
func (hm *HealthMonitor) RegisterComponent(config ComponentConfig) {
|
||||
hm.mu.Lock()
|
||||
defer hm.mu.Unlock()
|
||||
|
||||
hm.components[config.Name] = &ComponentHealth{
|
||||
Name: config.Name,
|
||||
Status: StatusHealthy,
|
||||
Metrics: make(map[string]float64),
|
||||
Config: config,
|
||||
}
|
||||
hm.logger.Info("component registered", "name", config.Name, "type", config.Type)
|
||||
}
|
||||
|
||||
// AlertBus returns the channel for consuming health alerts.
|
||||
func (hm *HealthMonitor) AlertBus() <-chan HealthAlert {
|
||||
return hm.alertBus
|
||||
}
|
||||
|
||||
// Start begins the monitoring loops. Blocks until ctx is cancelled.
|
||||
func (hm *HealthMonitor) Start(ctx context.Context) {
|
||||
hm.logger.Info("health monitor started")
|
||||
|
||||
metricsTicker := time.NewTicker(MetricsCollectionInterval)
|
||||
healthTicker := time.NewTicker(HealthCheckInterval)
|
||||
quorumTicker := time.NewTicker(QuorumValidationInterval)
|
||||
defer metricsTicker.Stop()
|
||||
defer healthTicker.Stop()
|
||||
defer quorumTicker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
hm.logger.Info("health monitor stopped")
|
||||
return
|
||||
case <-metricsTicker.C:
|
||||
hm.collectMetrics(ctx)
|
||||
case <-healthTicker.C:
|
||||
hm.checkHealth()
|
||||
case <-quorumTicker.C:
|
||||
hm.validateQuorum()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// collectMetrics gathers metrics from all registered components.
|
||||
func (hm *HealthMonitor) collectMetrics(ctx context.Context) {
|
||||
hm.mu.RLock()
|
||||
names := make([]string, 0, len(hm.components))
|
||||
for name := range hm.components {
|
||||
names = append(names, name)
|
||||
}
|
||||
hm.mu.RUnlock()
|
||||
|
||||
for _, name := range names {
|
||||
metrics, err := hm.collector.Collect(ctx, name)
|
||||
if err != nil {
|
||||
hm.logger.Warn("metrics collection failed", "component", name, "error", err)
|
||||
hm.mu.Lock()
|
||||
if comp, ok := hm.components[name]; ok {
|
||||
comp.Consecutive++
|
||||
}
|
||||
hm.mu.Unlock()
|
||||
continue
|
||||
}
|
||||
|
||||
hm.mu.Lock()
|
||||
comp, ok := hm.components[name]
|
||||
if ok {
|
||||
comp.Metrics = metrics
|
||||
comp.LastCheck = time.Now()
|
||||
// Store each metric in time-series DB.
|
||||
for metric, value := range metrics {
|
||||
hm.metricsDB.AddDataPoint(name, metric, value)
|
||||
}
|
||||
}
|
||||
hm.mu.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
// checkHealth evaluates each component against thresholds and anomalies.
|
||||
func (hm *HealthMonitor) checkHealth() {
|
||||
hm.mu.Lock()
|
||||
defer hm.mu.Unlock()
|
||||
|
||||
for _, comp := range hm.components {
|
||||
alerts := hm.evaluateComponent(comp)
|
||||
for _, alert := range alerts {
|
||||
hm.emitAlert(alert)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// evaluateComponent checks a single component's metrics against thresholds
|
||||
// and runs Z-score anomaly detection. Returns any generated alerts.
|
||||
func (hm *HealthMonitor) evaluateComponent(comp *ComponentHealth) []HealthAlert {
|
||||
var alerts []HealthAlert
|
||||
breached := false
|
||||
|
||||
for metric, value := range comp.Metrics {
|
||||
threshold, hasThreshold := comp.Config.Thresholds[metric]
|
||||
if !hasThreshold {
|
||||
continue
|
||||
}
|
||||
|
||||
isMax := comp.Config.ThresholdIsMax[metric]
|
||||
var exceeded bool
|
||||
if isMax {
|
||||
exceeded = value > threshold
|
||||
} else {
|
||||
exceeded = value < threshold
|
||||
}
|
||||
|
||||
if exceeded {
|
||||
breached = true
|
||||
action := "restart"
|
||||
if metric == "error_rate" || metric == "latency_p99" {
|
||||
action = "investigate"
|
||||
}
|
||||
|
||||
alerts = append(alerts, HealthAlert{
|
||||
Component: comp.Name,
|
||||
Severity: SeverityWarning,
|
||||
Metric: metric,
|
||||
Current: value,
|
||||
Threshold: threshold,
|
||||
Timestamp: time.Now(),
|
||||
SuggestedAction: action,
|
||||
})
|
||||
}
|
||||
|
||||
// Z-score anomaly detection.
|
||||
baseline := hm.metricsDB.GetBaseline(comp.Name, metric, hm.anomalyWindow)
|
||||
if IsAnomaly(value, baseline, AnomalyZScoreThreshold) {
|
||||
zscore := CalculateZScore(value, baseline)
|
||||
alerts = append(alerts, HealthAlert{
|
||||
Component: comp.Name,
|
||||
Severity: SeverityCritical,
|
||||
Metric: metric,
|
||||
Current: value,
|
||||
Threshold: baseline.Mean + AnomalyZScoreThreshold*baseline.StdDev,
|
||||
ZScore: zscore,
|
||||
Timestamp: time.Now(),
|
||||
SuggestedAction: fmt.Sprintf("anomaly detected (Z=%.2f), investigate %s", zscore, metric),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Update component status.
|
||||
if breached {
|
||||
comp.Consecutive++
|
||||
if comp.Consecutive >= MaxConsecutiveFailures {
|
||||
comp.Status = StatusCritical
|
||||
} else {
|
||||
comp.Status = StatusDegraded
|
||||
}
|
||||
} else {
|
||||
comp.Consecutive = 0
|
||||
comp.Status = StatusHealthy
|
||||
}
|
||||
|
||||
return alerts
|
||||
}
|
||||
|
||||
// emitAlert sends an alert to the bus (non-blocking).
|
||||
func (hm *HealthMonitor) emitAlert(alert HealthAlert) {
|
||||
select {
|
||||
case hm.alertBus <- alert:
|
||||
hm.logger.Warn("health alert emitted",
|
||||
"component", alert.Component,
|
||||
"severity", alert.Severity,
|
||||
"metric", alert.Metric,
|
||||
"current", alert.Current,
|
||||
"threshold", alert.Threshold,
|
||||
)
|
||||
default:
|
||||
hm.logger.Error("alert bus full, dropping alert",
|
||||
"component", alert.Component,
|
||||
"metric", alert.Metric,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// validateQuorum checks if 2/3 of components are healthy.
|
||||
func (hm *HealthMonitor) validateQuorum() {
|
||||
hm.mu.RLock()
|
||||
defer hm.mu.RUnlock()
|
||||
|
||||
if len(hm.components) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
valid := ValidateQuorum(hm.componentStatuses())
|
||||
|
||||
if !valid {
|
||||
hm.logger.Error("QUORUM LOST — entering degraded state",
|
||||
"healthy_ratio", hm.healthyRatio(),
|
||||
"threshold", QuorumThreshold,
|
||||
)
|
||||
hm.emitAlert(HealthAlert{
|
||||
Component: "system",
|
||||
Severity: SeverityCritical,
|
||||
Metric: "quorum",
|
||||
Current: hm.healthyRatio(),
|
||||
Threshold: QuorumThreshold,
|
||||
Timestamp: time.Now(),
|
||||
SuggestedAction: "activate safe mode",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// ValidateQuorum checks if the healthy ratio meets the 2/3 threshold.
|
||||
func ValidateQuorum(statuses map[string]ComponentStatus) bool {
|
||||
if len(statuses) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
healthy := 0
|
||||
for _, status := range statuses {
|
||||
if status == StatusHealthy {
|
||||
healthy++
|
||||
}
|
||||
}
|
||||
return float64(healthy)/float64(len(statuses)) >= QuorumThreshold
|
||||
}
|
||||
|
||||
// componentStatuses returns current status map (caller must hold RLock).
|
||||
func (hm *HealthMonitor) componentStatuses() map[string]ComponentStatus {
|
||||
statuses := make(map[string]ComponentStatus, len(hm.components))
|
||||
for name, comp := range hm.components {
|
||||
statuses[name] = comp.Status
|
||||
}
|
||||
return statuses
|
||||
}
|
||||
|
||||
// healthyRatio returns the fraction of healthy components (caller must hold RLock).
|
||||
func (hm *HealthMonitor) healthyRatio() float64 {
|
||||
if len(hm.components) == 0 {
|
||||
return 0
|
||||
}
|
||||
healthy := 0
|
||||
for _, comp := range hm.components {
|
||||
if comp.Status == StatusHealthy {
|
||||
healthy++
|
||||
}
|
||||
}
|
||||
return float64(healthy) / float64(len(hm.components))
|
||||
}
|
||||
|
||||
// GetHealth returns a snapshot of the entire system health.
|
||||
func (hm *HealthMonitor) GetHealth() HealthResponse {
|
||||
hm.mu.RLock()
|
||||
defer hm.mu.RUnlock()
|
||||
|
||||
components := make([]ComponentHealth, 0, len(hm.components))
|
||||
for _, comp := range hm.components {
|
||||
cp := *comp
|
||||
// Deep copy metrics.
|
||||
cp.Metrics = make(map[string]float64, len(comp.Metrics))
|
||||
for k, v := range comp.Metrics {
|
||||
cp.Metrics[k] = v
|
||||
}
|
||||
components = append(components, cp)
|
||||
}
|
||||
|
||||
overall := OverallHealthy
|
||||
for _, comp := range components {
|
||||
switch comp.Status {
|
||||
case StatusCritical, StatusOffline:
|
||||
overall = OverallCritical
|
||||
case StatusDegraded:
|
||||
if overall != OverallCritical {
|
||||
overall = OverallDegraded
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return HealthResponse{
|
||||
OverallStatus: overall,
|
||||
Components: components,
|
||||
QuorumValid: ValidateQuorum(hm.componentStatuses()),
|
||||
LastCheck: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
// SetComponentStatus manually sets a component's status (for testing/override).
|
||||
func (hm *HealthMonitor) SetComponentStatus(name string, status ComponentStatus) {
|
||||
hm.mu.Lock()
|
||||
defer hm.mu.Unlock()
|
||||
|
||||
if comp, ok := hm.components[name]; ok {
|
||||
comp.Status = status
|
||||
}
|
||||
}
|
||||
|
||||
// UpdateMetrics manually updates a component's metrics (for testing/override).
|
||||
func (hm *HealthMonitor) UpdateMetrics(name string, metrics map[string]float64) {
|
||||
hm.mu.Lock()
|
||||
defer hm.mu.Unlock()
|
||||
|
||||
if comp, ok := hm.components[name]; ok {
|
||||
comp.Metrics = metrics
|
||||
comp.LastCheck = time.Now()
|
||||
for metric, value := range metrics {
|
||||
hm.metricsDB.AddDataPoint(name, metric, value)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ComponentCount returns the number of registered components.
|
||||
func (hm *HealthMonitor) ComponentCount() int {
|
||||
hm.mu.RLock()
|
||||
defer hm.mu.RUnlock()
|
||||
return len(hm.components)
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue