mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-04-24 20:06:21 +02:00
167 lines
4.5 KiB
Go
167 lines
4.5 KiB
Go
// Copyright 2026 Syntrex Lab. All rights reserved.
|
|
// Use of this source code is governed by an Apache-2.0 license
|
|
// that can be found in the LICENSE file.
|
|
|
|
package shadow_ai
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// PluginStatus represents a plugin's operational state.
|
|
type PluginStatus string
|
|
|
|
const (
|
|
PluginStatusHealthy PluginStatus = "healthy"
|
|
PluginStatusDegraded PluginStatus = "degraded"
|
|
PluginStatusOffline PluginStatus = "offline"
|
|
)
|
|
|
|
// PluginHealth tracks the health state of a single plugin.
|
|
type PluginHealth struct {
|
|
Vendor string `json:"vendor"`
|
|
Type PluginType `json:"type"`
|
|
Status PluginStatus `json:"status"`
|
|
LastCheck time.Time `json:"last_check"`
|
|
Consecutive int `json:"consecutive_failures"`
|
|
Latency time.Duration `json:"latency"`
|
|
LastError string `json:"last_error,omitempty"`
|
|
}
|
|
|
|
// MaxConsecutivePluginFailures before marking offline.
|
|
const MaxConsecutivePluginFailures = 3
|
|
|
|
// HealthChecker performs continuous health monitoring of all registered plugins.
|
|
type HealthChecker struct {
|
|
mu sync.RWMutex
|
|
registry *PluginRegistry
|
|
interval time.Duration
|
|
alertFn func(vendor string, status PluginStatus, msg string)
|
|
logger *slog.Logger
|
|
}
|
|
|
|
// NewHealthChecker creates a health checker that monitors plugin health.
|
|
func NewHealthChecker(registry *PluginRegistry, interval time.Duration, alertFn func(string, PluginStatus, string)) *HealthChecker {
|
|
if interval <= 0 {
|
|
interval = 30 * time.Second
|
|
}
|
|
return &HealthChecker{
|
|
registry: registry,
|
|
interval: interval,
|
|
alertFn: alertFn,
|
|
logger: slog.Default().With("component", "shadow-ai-health"),
|
|
}
|
|
}
|
|
|
|
// Start begins continuous health monitoring. Blocks until ctx is cancelled.
|
|
func (hc *HealthChecker) Start(ctx context.Context) {
|
|
hc.logger.Info("health checker started", "interval", hc.interval)
|
|
ticker := time.NewTicker(hc.interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
hc.logger.Info("health checker stopped")
|
|
return
|
|
case <-ticker.C:
|
|
hc.checkAllPlugins(ctx)
|
|
}
|
|
}
|
|
}
|
|
|
|
// checkAllPlugins runs health checks on all registered plugins.
|
|
func (hc *HealthChecker) checkAllPlugins(ctx context.Context) {
|
|
vendors := hc.registry.Vendors()
|
|
|
|
for _, vendor := range vendors {
|
|
plugin, ok := hc.registry.Get(vendor)
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
existing, _ := hc.registry.GetHealth(vendor)
|
|
if existing == nil {
|
|
continue
|
|
}
|
|
|
|
start := time.Now()
|
|
err := hc.checkPlugin(ctx, plugin)
|
|
latency := time.Since(start)
|
|
|
|
health := &PluginHealth{
|
|
Vendor: vendor,
|
|
Type: existing.Type,
|
|
LastCheck: time.Now(),
|
|
Latency: latency,
|
|
}
|
|
|
|
if err != nil {
|
|
health.Consecutive = existing.Consecutive + 1
|
|
health.LastError = err.Error()
|
|
|
|
if health.Consecutive >= MaxConsecutivePluginFailures {
|
|
health.Status = PluginStatusOffline
|
|
if existing.Status != PluginStatusOffline {
|
|
hc.logger.Error("plugin went OFFLINE",
|
|
"vendor", vendor,
|
|
"consecutive", health.Consecutive,
|
|
"error", err,
|
|
)
|
|
if hc.alertFn != nil {
|
|
hc.alertFn(vendor, PluginStatusOffline,
|
|
fmt.Sprintf("Plugin %s offline after %d consecutive failures: %v",
|
|
vendor, health.Consecutive, err))
|
|
}
|
|
}
|
|
} else {
|
|
health.Status = PluginStatusDegraded
|
|
hc.logger.Warn("plugin health check failed",
|
|
"vendor", vendor,
|
|
"consecutive", health.Consecutive,
|
|
"error", err,
|
|
)
|
|
}
|
|
} else {
|
|
health.Status = PluginStatusHealthy
|
|
health.Consecutive = 0
|
|
|
|
// Log recovery if previously degraded/offline.
|
|
if existing.Status != PluginStatusHealthy {
|
|
hc.logger.Info("plugin recovered", "vendor", vendor, "latency", latency)
|
|
if hc.alertFn != nil {
|
|
hc.alertFn(vendor, PluginStatusHealthy,
|
|
fmt.Sprintf("Plugin %s recovered, latency %s", vendor, latency))
|
|
}
|
|
}
|
|
}
|
|
|
|
hc.registry.SetHealth(vendor, health)
|
|
}
|
|
}
|
|
|
|
// checkPlugin runs the health check for a single plugin.
|
|
func (hc *HealthChecker) checkPlugin(ctx context.Context, plugin interface{}) error {
|
|
checkCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
|
defer cancel()
|
|
|
|
switch p := plugin.(type) {
|
|
case NetworkEnforcer:
|
|
return p.HealthCheck(checkCtx)
|
|
case EndpointController:
|
|
return p.HealthCheck(checkCtx)
|
|
case WebGateway:
|
|
return p.HealthCheck(checkCtx)
|
|
default:
|
|
return fmt.Errorf("plugin does not implement HealthCheck")
|
|
}
|
|
}
|
|
|
|
// CheckNow runs an immediate health check on all plugins (non-blocking).
|
|
func (hc *HealthChecker) CheckNow(ctx context.Context) {
|
|
hc.checkAllPlugins(ctx)
|
|
}
|