mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-04-28 05:46:22 +02:00
272 lines
7.6 KiB
Go
272 lines
7.6 KiB
Go
package soc
|
|
|
|
import (
	"math"
	"strconv"
	"sync"
	"time"
)
|
|
|
|
// AlertCluster groups related SOC events using temporal + categorical similarity.
//
// Clustering runs in two phases:
//   - Phase 1: temporal+session_id fallback (cold start).
//   - Phase 2: embedding-based DBSCAN when enough events accumulated.
//
// Cold start strategy (§7.6):
//
//	fallback: temporal_clustering
//	timeout: 5m — force embedding mode after 5 minutes even if <50 events
//	min_events_for_embedding: 50
type AlertCluster struct {
	// ID is the key under which the engine stores the cluster.
	ID string `json:"id"`
	// Events holds the IDs of the member events, in insertion order.
	Events []string `json:"events"`
	// Category is the category of the event that created the cluster.
	Category string `json:"category"`
	// Severity is the highest-ranked severity among the member events.
	Severity string `json:"severity"`
	// Source is the source of the event that created the cluster.
	Source string `json:"source"`
	// CreatedAt is the time the cluster was created.
	CreatedAt time.Time `json:"created_at"`
	// UpdatedAt is the time the most recent event was added.
	UpdatedAt time.Time `json:"updated_at"`
}
|
|
|
|
// ClusterEngine groups related alerts using configurable strategies.
|
|
type ClusterEngine struct {
|
|
mu sync.RWMutex
|
|
clusters map[string]*AlertCluster
|
|
config ClusterConfig
|
|
|
|
// Cold start tracking
|
|
startTime time.Time
|
|
eventCount int
|
|
mode ClusterMode
|
|
}
|
|
|
|
// ClusterConfig holds Alert Clustering parameters.
type ClusterConfig struct {
	// Cold start (§7.6)

	// MinEventsForEmbedding is the event count at which the engine leaves
	// cold-start mode.
	MinEventsForEmbedding int `yaml:"min_events_for_embedding" json:"min_events_for_embedding"`
	// ColdStartTimeout is the engine age at which it leaves cold-start mode
	// regardless of the event count.
	ColdStartTimeout time.Duration `yaml:"cold_start_timeout" json:"cold_start_timeout"`

	// Temporal clustering parameters

	// TemporalWindow is the maximum gap since a cluster's last update for a
	// new event to still join it.
	TemporalWindow time.Duration `yaml:"temporal_window" json:"temporal_window"`
	// MaxClusterSize caps the number of events in one cluster.
	MaxClusterSize int `yaml:"max_cluster_size" json:"max_cluster_size"`

	// Embedding clustering parameters (Phase 2)

	// SimilarityThreshold is the cosine similarity (0.0-1.0) an event must
	// exceed to join a cluster semantically.
	SimilarityThreshold float64 `yaml:"similarity_threshold" json:"similarity_threshold"`
	// EmbeddingModel names the embedding model, e.g., "all-MiniLM-L6-v2".
	EmbeddingModel string `yaml:"embedding_model" json:"embedding_model"`
}

// DefaultClusterConfig returns the default clustering configuration (§7.6).
func DefaultClusterConfig() ClusterConfig {
	var cfg ClusterConfig

	cfg.MinEventsForEmbedding = 50
	cfg.ColdStartTimeout = 5 * time.Minute

	cfg.TemporalWindow = 2 * time.Minute
	cfg.MaxClusterSize = 50

	cfg.SimilarityThreshold = 0.75
	cfg.EmbeddingModel = "all-MiniLM-L6-v2"

	return cfg
}
|
|
|
|
// ClusterMode tracks the engine operating mode.
type ClusterMode int

const (
	ClusterModeColdStart ClusterMode = iota // Temporal+session_id fallback
	ClusterModeEmbedding                    // Full embedding-based clustering
)

// String returns "embedding" for ClusterModeEmbedding and "cold_start" for
// every other value, including values outside the declared constants.
func (m ClusterMode) String() string {
	if m == ClusterModeEmbedding {
		return "embedding"
	}
	return "cold_start"
}
|
|
|
|
// NewClusterEngine creates a cluster engine with the given config.
|
|
func NewClusterEngine(config ClusterConfig) *ClusterEngine {
|
|
return &ClusterEngine{
|
|
clusters: make(map[string]*AlertCluster),
|
|
config: config,
|
|
startTime: time.Now(),
|
|
mode: ClusterModeColdStart,
|
|
}
|
|
}
|
|
|
|
// AddEvent assigns an event to a cluster. Returns the cluster ID.
|
|
func (ce *ClusterEngine) AddEvent(event SOCEvent) string {
|
|
ce.mu.Lock()
|
|
defer ce.mu.Unlock()
|
|
|
|
ce.eventCount++
|
|
|
|
// Check if we should transition to embedding mode
|
|
if ce.mode == ClusterModeColdStart {
|
|
if ce.eventCount >= ce.config.MinEventsForEmbedding ||
|
|
time.Since(ce.startTime) >= ce.config.ColdStartTimeout {
|
|
ce.mode = ClusterModeEmbedding
|
|
}
|
|
}
|
|
|
|
// Phase 2: Embedding/semantic clustering (DBSCAN-inspired)
|
|
if ce.mode == ClusterModeEmbedding {
|
|
clusterID := ce.findSemanticCluster(event)
|
|
if clusterID != "" {
|
|
return clusterID
|
|
}
|
|
}
|
|
|
|
// Fallback: Temporal + category clustering (Phase 1)
|
|
clusterID := ce.findOrCreateTemporalCluster(event)
|
|
return clusterID
|
|
}
|
|
|
|
// findSemanticCluster uses cosine similarity of event descriptions to find matching clusters.
|
|
// This is a simplified DBSCAN-inspired approach that works without an external ML model.
|
|
func (ce *ClusterEngine) findSemanticCluster(event SOCEvent) string {
|
|
if event.Description == "" {
|
|
return ""
|
|
}
|
|
|
|
eventVec := textToVector(event.Description)
|
|
bestScore := 0.0
|
|
bestCluster := ""
|
|
|
|
for id, cluster := range ce.clusters {
|
|
if len(cluster.Events) >= ce.config.MaxClusterSize {
|
|
continue
|
|
}
|
|
// Use cluster category + source as proxy embedding when no ML model
|
|
clusterVec := textToVector(cluster.Category + " " + cluster.Source)
|
|
sim := cosineSimilarity(eventVec, clusterVec)
|
|
if sim > ce.config.SimilarityThreshold && sim > bestScore {
|
|
bestScore = sim
|
|
bestCluster = id
|
|
}
|
|
}
|
|
|
|
if bestCluster != "" {
|
|
c := ce.clusters[bestCluster]
|
|
c.Events = append(c.Events, event.ID)
|
|
c.UpdatedAt = time.Now()
|
|
if event.Severity.Rank() > EventSeverity(c.Severity).Rank() {
|
|
c.Severity = string(event.Severity)
|
|
}
|
|
return bestCluster
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// textToVector creates a simple character-frequency vector for cosine similarity.
// Serves as fallback when no external embedding model is available.
//
// Only ASCII letters and '_' are counted; every other rune is ignored.
// Counting is case-sensitive, so 'a' and 'A' are separate dimensions.
// The returned map is never nil.
func textToVector(text string) map[rune]float64 {
	vec := make(map[rune]float64)
	for _, r := range text {
		switch {
		case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z', r == '_':
			vec[r]++
		}
	}
	return vec
}
|
|
|
|
// cosineSimilarity computes cosine similarity between two sparse vectors.
//
// Keys missing from a map count as zero. Returns 0 when either vector has
// zero magnitude (including nil or empty maps), which avoids a division by
// zero.
func cosineSimilarity(a, b map[rune]float64) float64 {
	var dot, sumSqA, sumSqB float64

	for key, av := range a {
		sumSqA += av * av
		// Only keys present in both vectors contribute to the dot product.
		if bv, shared := b[key]; shared {
			dot += av * bv
		}
	}
	for _, bv := range b {
		sumSqB += bv * bv
	}

	if sumSqA == 0 || sumSqB == 0 {
		return 0
	}
	return dot / (math.Sqrt(sumSqA) * math.Sqrt(sumSqB))
}
|
|
|
|
// findOrCreateTemporalCluster groups by (category + source) within temporal window.
|
|
func (ce *ClusterEngine) findOrCreateTemporalCluster(event SOCEvent) string {
|
|
now := time.Now()
|
|
key := string(event.Source) + ":" + event.Category
|
|
|
|
// Search existing clusters within temporal window
|
|
for id, cluster := range ce.clusters {
|
|
if cluster.Category == event.Category &&
|
|
cluster.Source == string(event.Source) &&
|
|
now.Sub(cluster.UpdatedAt) <= ce.config.TemporalWindow &&
|
|
len(cluster.Events) < ce.config.MaxClusterSize {
|
|
// Add to existing cluster
|
|
cluster.Events = append(cluster.Events, event.ID)
|
|
cluster.UpdatedAt = now
|
|
if event.Severity.Rank() > EventSeverity(cluster.Severity).Rank() {
|
|
cluster.Severity = string(event.Severity)
|
|
}
|
|
return id
|
|
}
|
|
}
|
|
|
|
// Create new cluster
|
|
clusterID := "clst-" + key + "-" + now.Format("150405")
|
|
ce.clusters[clusterID] = &AlertCluster{
|
|
ID: clusterID,
|
|
Events: []string{event.ID},
|
|
Category: event.Category,
|
|
Severity: string(event.Severity),
|
|
Source: string(event.Source),
|
|
CreatedAt: now,
|
|
UpdatedAt: now,
|
|
}
|
|
return clusterID
|
|
}
|
|
|
|
// Stats returns clustering statistics.
|
|
func (ce *ClusterEngine) Stats() map[string]any {
|
|
ce.mu.RLock()
|
|
defer ce.mu.RUnlock()
|
|
|
|
totalEvents := 0
|
|
maxSize := 0
|
|
for _, c := range ce.clusters {
|
|
totalEvents += len(c.Events)
|
|
if len(c.Events) > maxSize {
|
|
maxSize = len(c.Events)
|
|
}
|
|
}
|
|
|
|
avgSize := 0.0
|
|
if len(ce.clusters) > 0 {
|
|
avgSize = math.Round(float64(totalEvents)/float64(len(ce.clusters))*100) / 100
|
|
}
|
|
|
|
uiHint := "Smart clustering active"
|
|
if ce.mode == ClusterModeColdStart {
|
|
uiHint = "Clustering warming up..."
|
|
}
|
|
|
|
return map[string]any{
|
|
"mode": ce.mode.String(),
|
|
"ui_hint": uiHint,
|
|
"total_clusters": len(ce.clusters),
|
|
"total_events": totalEvents,
|
|
"avg_cluster_size": avgSize,
|
|
"max_cluster_size": maxSize,
|
|
"events_processed": ce.eventCount,
|
|
"embedding_model": ce.config.EmbeddingModel,
|
|
"cold_start_threshold": ce.config.MinEventsForEmbedding,
|
|
}
|
|
}
|
|
|
|
// Clusters returns all current clusters.
|
|
func (ce *ClusterEngine) Clusters() []*AlertCluster {
|
|
ce.mu.RLock()
|
|
defer ce.mu.RUnlock()
|
|
|
|
result := make([]*AlertCluster, 0, len(ce.clusters))
|
|
for _, c := range ce.clusters {
|
|
result = append(result, c)
|
|
}
|
|
return result
|
|
}
|