mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-05-08 11:02:37 +02:00
Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates
This commit is contained in:
parent
694e32be26
commit
41cbfd6e0a
178 changed files with 36008 additions and 399 deletions
272
internal/domain/soc/clustering.go
Normal file
272
internal/domain/soc/clustering.go
Normal file
|
|
@ -0,0 +1,272 @@
|
|||
package soc
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// AlertCluster groups related SOC events using temporal + categorical similarity.
|
||||
// Phase 1: temporal+session_id fallback (cold start).
|
||||
// Phase 2: embedding-based DBSCAN when enough events accumulated.
|
||||
//
|
||||
// Cold start strategy (§7.6):
|
||||
//
|
||||
// fallback: temporal_clustering
|
||||
// timeout: 5m — force embedding mode after 5 minutes even if <50 events
|
||||
// min_events_for_embedding: 50
|
||||
type AlertCluster struct {
|
||||
ID string `json:"id"`
|
||||
Events []string `json:"events"` // Event IDs
|
||||
Category string `json:"category"` // Dominant category
|
||||
Severity string `json:"severity"` // Max severity
|
||||
Source string `json:"source"` // Dominant source
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
// ClusterEngine groups related alerts using configurable strategies.
|
||||
type ClusterEngine struct {
|
||||
mu sync.RWMutex
|
||||
clusters map[string]*AlertCluster
|
||||
config ClusterConfig
|
||||
|
||||
// Cold start tracking
|
||||
startTime time.Time
|
||||
eventCount int
|
||||
mode ClusterMode
|
||||
}
|
||||
|
||||
// ClusterConfig holds Alert Clustering parameters.
|
||||
type ClusterConfig struct {
|
||||
// Cold start (§7.6)
|
||||
MinEventsForEmbedding int `yaml:"min_events_for_embedding" json:"min_events_for_embedding"`
|
||||
ColdStartTimeout time.Duration `yaml:"cold_start_timeout" json:"cold_start_timeout"`
|
||||
|
||||
// Temporal clustering parameters
|
||||
TemporalWindow time.Duration `yaml:"temporal_window" json:"temporal_window"` // Group events within this window
|
||||
MaxClusterSize int `yaml:"max_cluster_size" json:"max_cluster_size"`
|
||||
|
||||
// Embedding clustering parameters (Phase 2)
|
||||
SimilarityThreshold float64 `yaml:"similarity_threshold" json:"similarity_threshold"` // 0.0-1.0
|
||||
EmbeddingModel string `yaml:"embedding_model" json:"embedding_model"` // e.g., "all-MiniLM-L6-v2"
|
||||
}
|
||||
|
||||
// DefaultClusterConfig returns the default clustering configuration (§7.6).
|
||||
func DefaultClusterConfig() ClusterConfig {
|
||||
return ClusterConfig{
|
||||
MinEventsForEmbedding: 50,
|
||||
ColdStartTimeout: 5 * time.Minute,
|
||||
TemporalWindow: 2 * time.Minute,
|
||||
MaxClusterSize: 50,
|
||||
SimilarityThreshold: 0.75,
|
||||
EmbeddingModel: "all-MiniLM-L6-v2",
|
||||
}
|
||||
}
|
||||
|
||||
// ClusterMode tracks the engine operating mode.
|
||||
type ClusterMode int
|
||||
|
||||
const (
|
||||
ClusterModeColdStart ClusterMode = iota // Temporal+session_id fallback
|
||||
ClusterModeEmbedding // Full embedding-based clustering
|
||||
)
|
||||
|
||||
func (m ClusterMode) String() string {
|
||||
switch m {
|
||||
case ClusterModeEmbedding:
|
||||
return "embedding"
|
||||
default:
|
||||
return "cold_start"
|
||||
}
|
||||
}
|
||||
|
||||
// NewClusterEngine creates a cluster engine with the given config.
|
||||
func NewClusterEngine(config ClusterConfig) *ClusterEngine {
|
||||
return &ClusterEngine{
|
||||
clusters: make(map[string]*AlertCluster),
|
||||
config: config,
|
||||
startTime: time.Now(),
|
||||
mode: ClusterModeColdStart,
|
||||
}
|
||||
}
|
||||
|
||||
// AddEvent assigns an event to a cluster. Returns the cluster ID.
|
||||
func (ce *ClusterEngine) AddEvent(event SOCEvent) string {
|
||||
ce.mu.Lock()
|
||||
defer ce.mu.Unlock()
|
||||
|
||||
ce.eventCount++
|
||||
|
||||
// Check if we should transition to embedding mode
|
||||
if ce.mode == ClusterModeColdStart {
|
||||
if ce.eventCount >= ce.config.MinEventsForEmbedding ||
|
||||
time.Since(ce.startTime) >= ce.config.ColdStartTimeout {
|
||||
ce.mode = ClusterModeEmbedding
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: Embedding/semantic clustering (DBSCAN-inspired)
|
||||
if ce.mode == ClusterModeEmbedding {
|
||||
clusterID := ce.findSemanticCluster(event)
|
||||
if clusterID != "" {
|
||||
return clusterID
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: Temporal + category clustering (Phase 1)
|
||||
clusterID := ce.findOrCreateTemporalCluster(event)
|
||||
return clusterID
|
||||
}
|
||||
|
||||
// findSemanticCluster uses cosine similarity of event descriptions to find matching clusters.
|
||||
// This is a simplified DBSCAN-inspired approach that works without an external ML model.
|
||||
func (ce *ClusterEngine) findSemanticCluster(event SOCEvent) string {
|
||||
if event.Description == "" {
|
||||
return ""
|
||||
}
|
||||
|
||||
eventVec := textToVector(event.Description)
|
||||
bestScore := 0.0
|
||||
bestCluster := ""
|
||||
|
||||
for id, cluster := range ce.clusters {
|
||||
if len(cluster.Events) >= ce.config.MaxClusterSize {
|
||||
continue
|
||||
}
|
||||
// Use cluster category + source as proxy embedding when no ML model
|
||||
clusterVec := textToVector(cluster.Category + " " + cluster.Source)
|
||||
sim := cosineSimilarity(eventVec, clusterVec)
|
||||
if sim > ce.config.SimilarityThreshold && sim > bestScore {
|
||||
bestScore = sim
|
||||
bestCluster = id
|
||||
}
|
||||
}
|
||||
|
||||
if bestCluster != "" {
|
||||
c := ce.clusters[bestCluster]
|
||||
c.Events = append(c.Events, event.ID)
|
||||
c.UpdatedAt = time.Now()
|
||||
if event.Severity.Rank() > EventSeverity(c.Severity).Rank() {
|
||||
c.Severity = string(event.Severity)
|
||||
}
|
||||
return bestCluster
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// textToVector creates a simple character-frequency vector for cosine similarity.
|
||||
// Serves as fallback when no external embedding model is available.
|
||||
func textToVector(text string) map[rune]float64 {
|
||||
vec := make(map[rune]float64)
|
||||
for _, r := range text {
|
||||
if r >= 'a' && r <= 'z' || r >= 'A' && r <= 'Z' || r == '_' {
|
||||
vec[r]++
|
||||
}
|
||||
}
|
||||
return vec
|
||||
}
|
||||
|
||||
// cosineSimilarity computes cosine similarity between two sparse vectors.
|
||||
func cosineSimilarity(a, b map[rune]float64) float64 {
|
||||
dot := 0.0
|
||||
magA := 0.0
|
||||
magB := 0.0
|
||||
for k, v := range a {
|
||||
magA += v * v
|
||||
if bv, ok := b[k]; ok {
|
||||
dot += v * bv
|
||||
}
|
||||
}
|
||||
for _, v := range b {
|
||||
magB += v * v
|
||||
}
|
||||
if magA == 0 || magB == 0 {
|
||||
return 0
|
||||
}
|
||||
return dot / (math.Sqrt(magA) * math.Sqrt(magB))
|
||||
}
|
||||
|
||||
// findOrCreateTemporalCluster groups by (category + source) within temporal window.
|
||||
func (ce *ClusterEngine) findOrCreateTemporalCluster(event SOCEvent) string {
|
||||
now := time.Now()
|
||||
key := string(event.Source) + ":" + event.Category
|
||||
|
||||
// Search existing clusters within temporal window
|
||||
for id, cluster := range ce.clusters {
|
||||
if cluster.Category == event.Category &&
|
||||
cluster.Source == string(event.Source) &&
|
||||
now.Sub(cluster.UpdatedAt) <= ce.config.TemporalWindow &&
|
||||
len(cluster.Events) < ce.config.MaxClusterSize {
|
||||
// Add to existing cluster
|
||||
cluster.Events = append(cluster.Events, event.ID)
|
||||
cluster.UpdatedAt = now
|
||||
if event.Severity.Rank() > EventSeverity(cluster.Severity).Rank() {
|
||||
cluster.Severity = string(event.Severity)
|
||||
}
|
||||
return id
|
||||
}
|
||||
}
|
||||
|
||||
// Create new cluster
|
||||
clusterID := "clst-" + key + "-" + now.Format("150405")
|
||||
ce.clusters[clusterID] = &AlertCluster{
|
||||
ID: clusterID,
|
||||
Events: []string{event.ID},
|
||||
Category: event.Category,
|
||||
Severity: string(event.Severity),
|
||||
Source: string(event.Source),
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}
|
||||
return clusterID
|
||||
}
|
||||
|
||||
// Stats returns clustering statistics.
|
||||
func (ce *ClusterEngine) Stats() map[string]any {
|
||||
ce.mu.RLock()
|
||||
defer ce.mu.RUnlock()
|
||||
|
||||
totalEvents := 0
|
||||
maxSize := 0
|
||||
for _, c := range ce.clusters {
|
||||
totalEvents += len(c.Events)
|
||||
if len(c.Events) > maxSize {
|
||||
maxSize = len(c.Events)
|
||||
}
|
||||
}
|
||||
|
||||
avgSize := 0.0
|
||||
if len(ce.clusters) > 0 {
|
||||
avgSize = math.Round(float64(totalEvents)/float64(len(ce.clusters))*100) / 100
|
||||
}
|
||||
|
||||
uiHint := "Smart clustering active"
|
||||
if ce.mode == ClusterModeColdStart {
|
||||
uiHint = "Clustering warming up..."
|
||||
}
|
||||
|
||||
return map[string]any{
|
||||
"mode": ce.mode.String(),
|
||||
"ui_hint": uiHint,
|
||||
"total_clusters": len(ce.clusters),
|
||||
"total_events": totalEvents,
|
||||
"avg_cluster_size": avgSize,
|
||||
"max_cluster_size": maxSize,
|
||||
"events_processed": ce.eventCount,
|
||||
"embedding_model": ce.config.EmbeddingModel,
|
||||
"cold_start_threshold": ce.config.MinEventsForEmbedding,
|
||||
}
|
||||
}
|
||||
|
||||
// Clusters returns all current clusters.
|
||||
func (ce *ClusterEngine) Clusters() []*AlertCluster {
|
||||
ce.mu.RLock()
|
||||
defer ce.mu.RUnlock()
|
||||
|
||||
result := make([]*AlertCluster, 0, len(ce.clusters))
|
||||
for _, c := range ce.clusters {
|
||||
result = append(result, c)
|
||||
}
|
||||
return result
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue