// gomcp/internal/domain/soc/clustering.go
//
// 272 lines
// 7.6 KiB
// Go

package soc
import (
	"math"
	"strconv"
	"sync"
	"time"
)
// AlertCluster is a group of related SOC events, identified by event ID.
//
// Clusters are produced by ClusterEngine, which works in two phases:
// a cold-start phase that groups events by category and source within a
// time window, and an embedding phase that is tried first once enough
// events have accumulated.
//
// Cold start strategy (§7.6):
//
//	fallback: temporal_clustering
//	timeout: 5m — force embedding mode after 5 minutes even if <50 events
//	min_events_for_embedding: 50
type AlertCluster struct {
	// ID is the cluster identifier.
	ID string `json:"id"`
	// Events holds the IDs of the events assigned to this cluster.
	Events []string `json:"events"`
	// Category is the dominant event category of the cluster.
	Category string `json:"category"`
	// Severity is the highest severity seen among the cluster's events.
	Severity string `json:"severity"`
	// Source is the dominant event source of the cluster.
	Source string `json:"source"`
	// CreatedAt is when the cluster was created.
	CreatedAt time.Time `json:"created_at"`
	// UpdatedAt is when an event was last added to the cluster.
	UpdatedAt time.Time `json:"updated_at"`
}
// ClusterEngine assigns incoming alerts to AlertClusters according to its
// ClusterConfig and current ClusterMode.
type ClusterEngine struct {
	// mu serializes access to the fields below.
	mu sync.RWMutex
	// clusters holds the known clusters keyed by cluster ID.
	clusters map[string]*AlertCluster
	// config is the parameter set the engine was created with.
	config ClusterConfig

	// Cold-start bookkeeping.

	// startTime is when the engine was created; the cold-start timeout is
	// measured from it.
	startTime time.Time
	// eventCount is the number of events passed to AddEvent so far.
	eventCount int
	// mode is the strategy currently in effect.
	mode ClusterMode
}
// ClusterConfig carries the tunable parameters of alert clustering.
type ClusterConfig struct {
	// Cold start (§7.6).

	// MinEventsForEmbedding is the processed-event count at which the engine
	// leaves cold-start mode.
	MinEventsForEmbedding int `yaml:"min_events_for_embedding" json:"min_events_for_embedding"`
	// ColdStartTimeout is the engine age at which cold-start mode ends even
	// if MinEventsForEmbedding has not been reached.
	ColdStartTimeout time.Duration `yaml:"cold_start_timeout" json:"cold_start_timeout"`

	// Temporal clustering.

	// TemporalWindow bounds how long after its last update a cluster still
	// accepts events in temporal clustering.
	TemporalWindow time.Duration `yaml:"temporal_window" json:"temporal_window"`
	// MaxClusterSize is the event count at which a cluster stops accepting
	// further events.
	MaxClusterSize int `yaml:"max_cluster_size" json:"max_cluster_size"`

	// Embedding clustering (Phase 2).

	// SimilarityThreshold (0.0-1.0) is the similarity a cluster must exceed
	// to be considered a match.
	SimilarityThreshold float64 `yaml:"similarity_threshold" json:"similarity_threshold"`
	// EmbeddingModel names the embedding model, e.g. "all-MiniLM-L6-v2".
	EmbeddingModel string `yaml:"embedding_model" json:"embedding_model"`
}
// DefaultClusterConfig builds the stock clustering configuration (§7.6):
// cold start ends after 50 events or 5 minutes, temporal clusters use a
// 2-minute window and hold at most 50 events, and embedding clustering uses
// a 0.75 similarity threshold with the "all-MiniLM-L6-v2" model name.
func DefaultClusterConfig() ClusterConfig {
	var cfg ClusterConfig

	// Cold start.
	cfg.MinEventsForEmbedding = 50
	cfg.ColdStartTimeout = 5 * time.Minute

	// Temporal clustering.
	cfg.TemporalWindow = 2 * time.Minute
	cfg.MaxClusterSize = 50

	// Embedding clustering.
	cfg.SimilarityThreshold = 0.75
	cfg.EmbeddingModel = "all-MiniLM-L6-v2"

	return cfg
}
// ClusterMode identifies the clustering strategy an engine is currently using.
type ClusterMode int

// Operating modes of the cluster engine.
const (
	// ClusterModeColdStart is the initial mode, in which only the temporal
	// fallback clustering is used.
	ClusterModeColdStart ClusterMode = iota
	// ClusterModeEmbedding is the mode in which full embedding-based
	// clustering is active.
	ClusterModeEmbedding
)
// String returns the textual name of the mode: "embedding" for
// ClusterModeEmbedding and "cold_start" for every other value.
func (m ClusterMode) String() string {
	if m == ClusterModeEmbedding {
		return "embedding"
	}
	return "cold_start"
}
// NewClusterEngine builds an engine that uses config as given (it is not
// validated here). The engine starts with no clusters, in cold-start mode,
// and with its cold-start clock set to the current time.
func NewClusterEngine(config ClusterConfig) *ClusterEngine {
	engine := new(ClusterEngine)
	engine.clusters = map[string]*AlertCluster{}
	engine.config = config
	engine.startTime = time.Now()
	engine.mode = ClusterModeColdStart
	return engine
}
// AddEvent places event into a cluster and reports that cluster's ID.
//
// The engine's write lock is held for the whole call. Every call counts
// towards the cold-start threshold; once the event count or the engine's age
// reaches its configured limit, the engine switches to embedding mode for
// good. In embedding mode a semantic match is tried first, and temporal
// clustering is used whenever that yields nothing.
func (ce *ClusterEngine) AddEvent(event SOCEvent) string {
	ce.mu.Lock()
	defer ce.mu.Unlock()

	ce.eventCount++

	// Leave cold start as soon as either limit is reached.
	if ce.mode == ClusterModeColdStart {
		enoughEvents := ce.eventCount >= ce.config.MinEventsForEmbedding
		if enoughEvents || time.Since(ce.startTime) >= ce.config.ColdStartTimeout {
			ce.mode = ClusterModeEmbedding
		}
	}

	// Phase 2: embedding/semantic clustering (DBSCAN-inspired).
	if ce.mode == ClusterModeEmbedding {
		if id := ce.findSemanticCluster(event); id != "" {
			return id
		}
	}

	// Phase 1 fallback: temporal + category clustering.
	return ce.findOrCreateTemporalCluster(event)
}
// findSemanticCluster looks for the existing cluster that best matches the
// event's description and, if one is found, adds the event to it.
//
// With no external ML model available, similarity is the cosine similarity
// between the character-frequency vector of event.Description and that of
// the cluster's "Category Source" text. Clusters that have reached
// MaxClusterSize are skipped, and a cluster qualifies only when its
// similarity is strictly above SimilarityThreshold (and above zero).
//
// Clusters sharing a category and source score identically, so ties are
// broken deterministically instead of by map iteration order: the most
// recently updated cluster wins, then the lexicographically smaller ID.
//
// It returns the ID of the chosen cluster, or "" when the description is
// empty or no cluster qualifies. The caller must hold ce.mu for writing.
func (ce *ClusterEngine) findSemanticCluster(event SOCEvent) string {
	if event.Description == "" {
		return ""
	}
	eventVec := textToVector(event.Description)

	var best *AlertCluster
	bestID := ""
	bestScore := 0.0
	for id, cluster := range ce.clusters {
		if len(cluster.Events) >= ce.config.MaxClusterSize {
			continue
		}
		// Use cluster category + source as proxy embedding when no ML model.
		clusterVec := textToVector(cluster.Category + " " + cluster.Source)
		sim := cosineSimilarity(eventVec, clusterVec)
		if sim <= ce.config.SimilarityThreshold {
			continue
		}

		// Equal scores would otherwise be resolved by whichever cluster the
		// map happens to yield first.
		tie := best != nil && sim == bestScore
		preferred := tie && (cluster.UpdatedAt.After(best.UpdatedAt) ||
			(cluster.UpdatedAt.Equal(best.UpdatedAt) && id < bestID))
		if sim > bestScore || preferred {
			best, bestID, bestScore = cluster, id, sim
		}
	}
	if best == nil {
		return ""
	}

	best.Events = append(best.Events, event.ID)
	best.UpdatedAt = time.Now()
	// Keep the cluster's severity at the maximum seen so far.
	if event.Severity.Rank() > EventSeverity(best.Severity).Rank() {
		best.Severity = string(event.Severity)
	}
	return bestID
}
// textToVector turns text into a sparse character-frequency vector for use
// with cosineSimilarity when no external embedding model is available.
//
// Only ASCII letters and '_' are counted; upper- and lower-case letters are
// separate dimensions, and every other rune is ignored. The returned map is
// never nil.
func textToVector(text string) map[rune]float64 {
	counts := map[rune]float64{}
	for _, ch := range text {
		switch {
		case 'a' <= ch && ch <= 'z',
			'A' <= ch && ch <= 'Z',
			ch == '_':
			counts[ch]++
		}
	}
	return counts
}
// cosineSimilarity returns the cosine of the angle between the sparse
// vectors a and b, where a key missing from a map counts as a zero
// component. The result is 0 when either vector has zero magnitude, which
// includes empty and nil maps.
func cosineSimilarity(a, b map[rune]float64) float64 {
	squaredNorm := func(vec map[rune]float64) float64 {
		total := 0.0
		for _, weight := range vec {
			total += weight * weight
		}
		return total
	}

	normA, normB := squaredNorm(a), squaredNorm(b)
	if normA == 0 || normB == 0 {
		return 0
	}

	// Only dimensions present in both vectors contribute to the dot product.
	var dot float64
	for key, weightA := range a {
		if weightB, shared := b[key]; shared {
			dot += weightA * weightB
		}
	}
	return dot / (math.Sqrt(normA) * math.Sqrt(normB))
}
// findOrCreateTemporalCluster adds the event to an existing cluster with the
// same category and source, or creates a new cluster for it, and returns
// the cluster's ID.
//
// An existing cluster is eligible when its last update lies within
// TemporalWindow of the current time and it holds fewer than MaxClusterSize
// events. The first eligible cluster encountered receives the event, has
// its UpdatedAt refreshed, and has its severity raised if the event
// outranks it.
//
// A new cluster is named "clst-<source>:<category>-<HHMMSS>". Because that
// timestamp has one-second resolution and no date, the name can coincide
// with an existing cluster's (for example when a full cluster is replaced
// within the same second); in that case a "-2", "-3", … suffix is appended
// so the existing cluster is never overwritten.
//
// The caller must hold ce.mu for writing.
func (ce *ClusterEngine) findOrCreateTemporalCluster(event SOCEvent) string {
	now := time.Now()
	source := string(event.Source)

	// Search existing clusters within the temporal window.
	for id, cluster := range ce.clusters {
		if cluster.Category == event.Category &&
			cluster.Source == source &&
			now.Sub(cluster.UpdatedAt) <= ce.config.TemporalWindow &&
			len(cluster.Events) < ce.config.MaxClusterSize {
			cluster.Events = append(cluster.Events, event.ID)
			cluster.UpdatedAt = now
			// Keep the cluster's severity at the maximum seen so far.
			if event.Severity.Rank() > EventSeverity(cluster.Severity).Rank() {
				cluster.Severity = string(event.Severity)
			}
			return id
		}
	}

	// No eligible cluster: create one under an ID that is not yet in use.
	baseID := "clst-" + source + ":" + event.Category + "-" + now.Format("150405")
	clusterID := baseID
	for seq := 2; ; seq++ {
		if _, taken := ce.clusters[clusterID]; !taken {
			break
		}
		clusterID = baseID + "-" + strconv.Itoa(seq)
	}

	ce.clusters[clusterID] = &AlertCluster{
		ID:        clusterID,
		Events:    []string{event.ID},
		Category:  event.Category,
		Severity:  string(event.Severity),
		Source:    source,
		CreatedAt: now,
		UpdatedAt: now,
	}
	return clusterID
}
// Stats reports a snapshot of clustering metrics, taken under the read lock.
//
// The returned map contains: "mode", "ui_hint", "total_clusters",
// "total_events" (sum of cluster sizes), "avg_cluster_size" (rounded to two
// decimals, 0 when there are no clusters), "max_cluster_size" (size of the
// largest cluster), "events_processed", "embedding_model" and
// "cold_start_threshold".
func (ce *ClusterEngine) Stats() map[string]any {
	ce.mu.RLock()
	defer ce.mu.RUnlock()

	clusterCount := len(ce.clusters)

	var memberTotal, largest int
	for _, cluster := range ce.clusters {
		size := len(cluster.Events)
		memberTotal += size
		if size > largest {
			largest = size
		}
	}

	var mean float64
	if clusterCount > 0 {
		mean = math.Round(float64(memberTotal)/float64(clusterCount)*100) / 100
	}

	hint := "Clustering warming up..."
	if ce.mode != ClusterModeColdStart {
		hint = "Smart clustering active"
	}

	stats := make(map[string]any, 9)
	stats["mode"] = ce.mode.String()
	stats["ui_hint"] = hint
	stats["total_clusters"] = clusterCount
	stats["total_events"] = memberTotal
	stats["avg_cluster_size"] = mean
	stats["max_cluster_size"] = largest
	stats["events_processed"] = ce.eventCount
	stats["embedding_model"] = ce.config.EmbeddingModel
	stats["cold_start_threshold"] = ce.config.MinEventsForEmbedding
	return stats
}
// Clusters returns a snapshot of all current clusters.
//
// Each element is an independent copy, including its Events slice, made
// while the read lock is held. Callers can therefore read the result without
// racing against concurrent AddEvent calls, and changes made to the returned
// values do not affect the engine. The order of the result is unspecified
// because it follows map iteration order.
func (ce *ClusterEngine) Clusters() []*AlertCluster {
	ce.mu.RLock()
	defer ce.mu.RUnlock()

	result := make([]*AlertCluster, 0, len(ce.clusters))
	for _, c := range ce.clusters {
		snapshot := *c
		// Detach the slice so later appends inside the engine cannot be
		// observed through the copy; a nil slice stays nil.
		if c.Events != nil {
			snapshot.Events = make([]string, len(c.Events))
			copy(snapshot.Events, c.Events)
		}
		result = append(result, &snapshot)
	}
	return result
}