// Mirror of https://github.com/syntrex-lab/gomcp.git
// (synced 2026-04-26 12:56:21 +02:00; 185 lines, 5.2 KiB, Go).

// Package eval implements the CLASP Evaluation Framework (SDD-005).
//
// It provides structured capability scoring for SOC agents across 6 dimensions
// with 5 maturity levels each. It supports automated scoring via LLM-as-judge
// and trend analysis via stored results.
package eval
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"time"
|
|
)
|
|
|
|
// Dimension names one capability axis along which an agent is evaluated.
// It is a string type, so values appear in JSON as plain strings.
type Dimension string

// The six CLASP capability dimensions, in the order AllDimensions lists them.
const (
	DimPlanning   Dimension = "planning"
	DimToolUse    Dimension = "tool_use"
	DimMemory     Dimension = "memory"
	DimReasoning  Dimension = "reasoning"
	DimReflection Dimension = "reflection"
	DimPerception Dimension = "perception"
)
|
|
|
|
// AllDimensions returns the 6 CLASP dimensions.
|
|
func AllDimensions() []Dimension {
|
|
return []Dimension{
|
|
DimPlanning, DimToolUse, DimMemory,
|
|
DimReasoning, DimReflection, DimPerception,
|
|
}
|
|
}
|
|
|
|
// Stage identifies the security lifecycle stage that an eval scenario targets.
type Stage string

// Lifecycle stages an EvalScenario can be tagged with.
const (
	StageFind      Stage = "find"
	StageConfirm   Stage = "confirm"
	StageRootCause Stage = "root_cause"
	StageValidate  Stage = "validate"
)
|
|
|
|
// Score is the capability rating of an agent on a single dimension.
//
// The ranges noted on the fields are the documented conventions; nothing in
// this type enforces them.
type Score struct {
	Level      int     `json:"level"`      // maturity level, 1-5
	Confidence float64 `json:"confidence"` // 0.0-1.0
	Evidence   string  `json:"evidence"`   // justification for the level
}
|
|
|
|
// EvalScenario defines a test scenario for agent evaluation.
|
|
type EvalScenario struct {
|
|
ID string `json:"id"`
|
|
Name string `json:"name"`
|
|
Stage Stage `json:"stage"`
|
|
Description string `json:"description"`
|
|
Inputs []string `json:"inputs"`
|
|
Expected string `json:"expected"`
|
|
Dimensions []Dimension `json:"dimensions"` // Which dimensions this tests
|
|
}
|
|
|
|
// EvalResult represents the outcome of evaluating an agent on a scenario.
|
|
type EvalResult struct {
|
|
AgentID string `json:"agent_id"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
ScenarioID string `json:"scenario_id"`
|
|
Scores map[Dimension]Score `json:"scores"`
|
|
OverallL int `json:"overall_l"` // 1-5 aggregate
|
|
JudgeModel string `json:"judge_model,omitempty"`
|
|
}
|
|
|
|
// ComputeOverall calculates the aggregate maturity level (average, rounded down).
|
|
func (r *EvalResult) ComputeOverall() int {
|
|
if len(r.Scores) == 0 {
|
|
return 0
|
|
}
|
|
total := 0
|
|
for _, s := range r.Scores {
|
|
total += s.Level
|
|
}
|
|
r.OverallL = total / len(r.Scores)
|
|
return r.OverallL
|
|
}
|
|
|
|
// AgentProfile aggregates multiple EvalResults into a capability profile.
|
|
type AgentProfile struct {
|
|
AgentID string `json:"agent_id"`
|
|
Results []EvalResult `json:"results"`
|
|
Averages map[Dimension]float64 `json:"averages"`
|
|
OverallL int `json:"overall_l"`
|
|
EvalCount int `json:"eval_count"`
|
|
LastEvalAt time.Time `json:"last_eval_at"`
|
|
}
|
|
|
|
// ComputeAverages calculates per-dimension average scores across all results.
|
|
func (p *AgentProfile) ComputeAverages() {
|
|
if len(p.Results) == 0 {
|
|
return
|
|
}
|
|
|
|
dimSums := make(map[Dimension]float64)
|
|
dimCounts := make(map[Dimension]int)
|
|
|
|
for _, r := range p.Results {
|
|
for dim, score := range r.Scores {
|
|
dimSums[dim] += float64(score.Level)
|
|
dimCounts[dim]++
|
|
}
|
|
}
|
|
|
|
p.Averages = make(map[Dimension]float64)
|
|
totalAvg := 0.0
|
|
for _, dim := range AllDimensions() {
|
|
if count, ok := dimCounts[dim]; ok && count > 0 {
|
|
avg := dimSums[dim] / float64(count)
|
|
p.Averages[dim] = avg
|
|
totalAvg += avg
|
|
}
|
|
}
|
|
|
|
if len(p.Averages) > 0 {
|
|
p.OverallL = int(totalAvg / float64(len(p.Averages)))
|
|
}
|
|
p.EvalCount = len(p.Results)
|
|
if len(p.Results) > 0 {
|
|
p.LastEvalAt = p.Results[len(p.Results)-1].Timestamp
|
|
}
|
|
}
|
|
|
|
// DetectRegression compares current profile to a previous one.
|
|
// Returns dimensions where the score dropped.
|
|
type Regression struct {
|
|
Dimension Dimension `json:"dimension"`
|
|
Previous float64 `json:"previous"`
|
|
Current float64 `json:"current"`
|
|
Delta float64 `json:"delta"`
|
|
}
|
|
|
|
func DetectRegressions(previous, current *AgentProfile) []Regression {
|
|
var regressions []Regression
|
|
for _, dim := range AllDimensions() {
|
|
prev, hasPrev := previous.Averages[dim]
|
|
curr, hasCurr := current.Averages[dim]
|
|
if hasPrev && hasCurr && curr < prev {
|
|
regressions = append(regressions, Regression{
|
|
Dimension: dim,
|
|
Previous: prev,
|
|
Current: curr,
|
|
Delta: curr - prev,
|
|
})
|
|
}
|
|
}
|
|
return regressions
|
|
}
|
|
|
|
// LoadScenarios loads eval scenarios from a JSON file.
|
|
func LoadScenarios(path string) ([]EvalScenario, error) {
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("load scenarios: %w", err)
|
|
}
|
|
var scenarios []EvalScenario
|
|
if err := json.Unmarshal(data, &scenarios); err != nil {
|
|
return nil, fmt.Errorf("parse scenarios: %w", err)
|
|
}
|
|
return scenarios, nil
|
|
}
|
|
|
|
// SaveResult saves an eval result to the results directory.
|
|
func SaveResult(dir string, result *EvalResult) error {
|
|
if err := os.MkdirAll(dir, 0755); err != nil {
|
|
return err
|
|
}
|
|
filename := fmt.Sprintf("%s_%s_%d.json",
|
|
result.AgentID, result.ScenarioID, result.Timestamp.Unix())
|
|
path := filepath.Join(dir, filename)
|
|
|
|
data, err := json.MarshalIndent(result, "", " ")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return os.WriteFile(path, data, 0644)
|
|
}
|