mirror of https://github.com/syntrex-lab/gomcp.git, synced 2026-05-08 19:12:37 +02:00
Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates
parent 694e32be26
commit 41cbfd6e0a
178 changed files with 36008 additions and 399 deletions
163  internal/application/shadow_ai/health.go  Normal file
@@ -0,0 +1,163 @@
package shadow_ai

import (
	"context"
	"fmt"
	"log/slog"
	"sync"
	"time"
)

// PluginStatus represents a plugin's operational state.
type PluginStatus string

const (
	PluginStatusHealthy  PluginStatus = "healthy"
	PluginStatusDegraded PluginStatus = "degraded"
	PluginStatusOffline  PluginStatus = "offline"
)

// PluginHealth tracks the health state of a single plugin.
type PluginHealth struct {
	Vendor      string        `json:"vendor"`
	Type        PluginType    `json:"type"`
	Status      PluginStatus  `json:"status"`
	LastCheck   time.Time     `json:"last_check"`
	Consecutive int           `json:"consecutive_failures"`
	Latency     time.Duration `json:"latency"`
	LastError   string        `json:"last_error,omitempty"`
}

// MaxConsecutivePluginFailures before marking offline.
const MaxConsecutivePluginFailures = 3

// HealthChecker performs continuous health monitoring of all registered plugins.
type HealthChecker struct {
	mu       sync.RWMutex
	registry *PluginRegistry
	interval time.Duration
	alertFn  func(vendor string, status PluginStatus, msg string)
	logger   *slog.Logger
}

// NewHealthChecker creates a health checker that monitors plugin health.
func NewHealthChecker(registry *PluginRegistry, interval time.Duration, alertFn func(string, PluginStatus, string)) *HealthChecker {
	if interval <= 0 {
		interval = 30 * time.Second
	}
	return &HealthChecker{
		registry: registry,
		interval: interval,
		alertFn:  alertFn,
		logger:   slog.Default().With("component", "shadow-ai-health"),
	}
}

// Start begins continuous health monitoring. Blocks until ctx is cancelled.
func (hc *HealthChecker) Start(ctx context.Context) {
	hc.logger.Info("health checker started", "interval", hc.interval)
	ticker := time.NewTicker(hc.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			hc.logger.Info("health checker stopped")
			return
		case <-ticker.C:
			hc.checkAllPlugins(ctx)
		}
	}
}

// checkAllPlugins runs health checks on all registered plugins.
func (hc *HealthChecker) checkAllPlugins(ctx context.Context) {
	vendors := hc.registry.Vendors()

	for _, vendor := range vendors {
		plugin, ok := hc.registry.Get(vendor)
		if !ok {
			continue
		}

		existing, _ := hc.registry.GetHealth(vendor)
		if existing == nil {
			continue
		}

		start := time.Now()
		err := hc.checkPlugin(ctx, plugin)
		latency := time.Since(start)

		health := &PluginHealth{
			Vendor:    vendor,
			Type:      existing.Type,
			LastCheck: time.Now(),
			Latency:   latency,
		}

		if err != nil {
			health.Consecutive = existing.Consecutive + 1
			health.LastError = err.Error()

			if health.Consecutive >= MaxConsecutivePluginFailures {
				health.Status = PluginStatusOffline
				if existing.Status != PluginStatusOffline {
					hc.logger.Error("plugin went OFFLINE",
						"vendor", vendor,
						"consecutive", health.Consecutive,
						"error", err,
					)
					if hc.alertFn != nil {
						hc.alertFn(vendor, PluginStatusOffline,
							fmt.Sprintf("Plugin %s offline after %d consecutive failures: %v",
								vendor, health.Consecutive, err))
					}
				}
			} else {
				health.Status = PluginStatusDegraded
				hc.logger.Warn("plugin health check failed",
					"vendor", vendor,
					"consecutive", health.Consecutive,
					"error", err,
				)
			}
		} else {
			health.Status = PluginStatusHealthy
			health.Consecutive = 0

			// Log recovery if previously degraded/offline.
			if existing.Status != PluginStatusHealthy {
				hc.logger.Info("plugin recovered", "vendor", vendor, "latency", latency)
				if hc.alertFn != nil {
					hc.alertFn(vendor, PluginStatusHealthy,
						fmt.Sprintf("Plugin %s recovered, latency %s", vendor, latency))
				}
			}
		}

		hc.registry.SetHealth(vendor, health)
	}
}

// checkPlugin runs the health check for a single plugin.
func (hc *HealthChecker) checkPlugin(ctx context.Context, plugin interface{}) error {
	checkCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	switch p := plugin.(type) {
	case NetworkEnforcer:
		return p.HealthCheck(checkCtx)
	case EndpointController:
		return p.HealthCheck(checkCtx)
	case WebGateway:
		return p.HealthCheck(checkCtx)
	default:
		return fmt.Errorf("plugin does not implement HealthCheck")
	}
}

// CheckNow runs an immediate health check on all plugins, outside the regular
// interval cycle. It is synchronous: it returns once every plugin has been checked.
func (hc *HealthChecker) CheckNow(ctx context.Context) {
	hc.checkAllPlugins(ctx)
}
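Reviewer note: the type switch in checkPlugin dispatches on NetworkEnforcer, EndpointController, and WebGateway, which this file references but does not define, and PluginType/PluginRegistry likewise come from elsewhere in the package. The sketch below shows only what the switch implies each interface must expose: a HealthCheck(ctx) error method. Any further methods on these interfaces are assumptions and are not visible in this diff.

// Sketch only: these interfaces are defined elsewhere in the shadow_ai
// package; the type switch in checkPlugin implies at least this method.
package shadow_ai

import "context"

// NetworkEnforcer is assumed to cover network-layer enforcement plugins.
type NetworkEnforcer interface {
	HealthCheck(ctx context.Context) error
	// ... vendor-specific enforcement methods not shown in this diff
}

// EndpointController is assumed to cover endpoint-agent plugins.
type EndpointController interface {
	HealthCheck(ctx context.Context) error
	// ... endpoint control methods not shown in this diff
}

// WebGateway is assumed to cover web-gateway plugins.
type WebGateway interface {
	HealthCheck(ctx context.Context) error
	// ... gateway methods not shown in this diff
}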
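For context, a minimal wiring sketch of the new checker. NewPluginRegistry and Register are hypothetical stand-ins for registry APIs that are not part of this diff; only the HealthChecker calls (NewHealthChecker, CheckNow, Start) match the file above. It also assumes the caller lives inside the gomcp module, since internal packages cannot be imported from outside it.

// Hypothetical wiring; NewPluginRegistry/Register are illustrative
// stand-ins for registry APIs that are not part of this diff.
package main

import (
	"context"
	"log"
	"os/signal"
	"syscall"
	"time"

	"github.com/syntrex-lab/gomcp/internal/application/shadow_ai"
)

func main() {
	registry := shadow_ai.NewPluginRegistry() // assumed constructor, not in this diff
	// registry.Register("example-vendor", plugin) // assumed registration call

	alert := func(vendor string, status shadow_ai.PluginStatus, msg string) {
		log.Printf("ALERT vendor=%s status=%s: %s", vendor, status, msg)
	}

	hc := shadow_ai.NewHealthChecker(registry, 30*time.Second, alert)

	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	hc.CheckNow(ctx) // prime health state before the first tick
	hc.Start(ctx)    // blocks until a signal cancels ctx
}

Passing a nil alertFn is valid per the guards in checkAllPlugins; alerts are then dropped and only logged.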