// gomcp/internal/domain/eval/eval.go

// Package eval implements the CLASP Evaluation Framework (SDD-005).
//
// Provides structured capability scoring for SOC agents across 6 dimensions
// with 5 maturity levels each. Supports automated scoring via LLM-as-judge
// and trend analysis via stored results.
package eval

import (
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"time"
)

// Dimension represents a capability axis for agent evaluation.
// Dimension values are used as map keys in EvalResult.Scores and
// AgentProfile.Averages and are serialized as their string value.
type Dimension string

// The six CLASP capability dimensions. AllDimensions returns them in the
// order they are declared here.
const (
	DimPlanning   Dimension = "planning"
	DimToolUse    Dimension = "tool_use"
	DimMemory     Dimension = "memory"
	DimReasoning  Dimension = "reasoning"
	DimReflection Dimension = "reflection"
	DimPerception Dimension = "perception"
)

// AllDimensions lists every CLASP dimension in canonical order:
// planning, tool use, memory, reasoning, reflection, perception.
// Each call yields a freshly allocated slice, so callers may modify it.
func AllDimensions() []Dimension {
	ordered := [...]Dimension{
		DimPlanning,
		DimToolUse,
		DimMemory,
		DimReasoning,
		DimReflection,
		DimPerception,
	}
	return ordered[:]
}

// Stage represents the security lifecycle stage of an eval scenario.
// It is carried in EvalScenario.Stage and serialized as its string value.
type Stage string

// Lifecycle stages an EvalScenario can be tagged with.
const (
	StageFind      Stage = "find"
	StageConfirm   Stage = "confirm"
	StageRootCause Stage = "root_cause"
	StageValidate  Stage = "validate"
)

// Score represents a capability score for one dimension.
//
// The ranges noted on the fields are documented expectations only; nothing
// in this type validates them.
type Score struct {
	Level      int     `json:"level"`      // 1-5 maturity
	Confidence float64 `json:"confidence"` // 0.0-1.0
	Evidence   string  `json:"evidence"`   // Justification
}

// EvalScenario defines a test scenario for agent evaluation.
//
// Scenarios are plain data: LoadScenarios decodes a JSON array of them from
// a file, using the field names given in the struct tags. Stage places the
// scenario in the security lifecycle and Dimensions names the capability
// axes the scenario is meant to exercise.
type EvalScenario struct {
	ID          string      `json:"id"`
	Name        string      `json:"name"`
	Stage       Stage       `json:"stage"`
	Description string      `json:"description"`
	Inputs      []string    `json:"inputs"`
	Expected    string      `json:"expected"`
	Dimensions  []Dimension `json:"dimensions"` // Which dimensions this tests
}

// EvalResult represents the outcome of evaluating an agent on a scenario.
//
// Scores holds one Score per evaluated dimension. OverallL is the aggregate
// level derived from Scores by ComputeOverall. JudgeModel is optional and is
// omitted from the JSON encoding when empty.
type EvalResult struct {
	AgentID    string              `json:"agent_id"`
	Timestamp  time.Time           `json:"timestamp"`
	ScenarioID string              `json:"scenario_id"`
	Scores     map[Dimension]Score `json:"scores"`
	OverallL   int                 `json:"overall_l"` // 1-5 aggregate
	JudgeModel string              `json:"judge_model,omitempty"`
}

// ComputeOverall calculates the aggregate maturity level as the mean of the
// Level of every entry in r.Scores, using integer division (i.e. rounded
// down for non-negative levels).
//
// The result is stored in r.OverallL and also returned, so the two always
// agree. When r.Scores is empty both are 0; any previously stored OverallL
// is cleared rather than left stale.
func (r *EvalResult) ComputeOverall() int {
	// Reset first so the empty case cannot leave an outdated aggregate behind.
	r.OverallL = 0
	if len(r.Scores) == 0 {
		return 0
	}

	total := 0
	for _, s := range r.Scores {
		total += s.Level
	}
	r.OverallL = total / len(r.Scores)
	return r.OverallL
}

// AgentProfile aggregates multiple EvalResults into a capability profile.
//
// Results is the raw input. Averages, OverallL, EvalCount and LastEvalAt are
// derived from Results by ComputeAverages and are only as fresh as the last
// call to it.
type AgentProfile struct {
	AgentID    string                `json:"agent_id"`
	Results    []EvalResult          `json:"results"`
	Averages   map[Dimension]float64 `json:"averages"`
	OverallL   int                   `json:"overall_l"`
	EvalCount  int                   `json:"eval_count"`
	LastEvalAt time.Time             `json:"last_eval_at"`
}

// ComputeAverages recomputes the derived fields of the profile from
// p.Results:
//
//   - Averages: per-dimension mean of Score.Level across all results that
//     scored that dimension. Only dimensions listed by AllDimensions are
//     included; a dimension nobody scored is absent from the map.
//   - OverallL: mean of the per-dimension averages, truncated to an int;
//     0 when no known dimension was scored.
//   - EvalCount: number of results.
//   - LastEvalAt: the most recent Timestamp among the results, regardless of
//     their order in the slice.
//
// When p.Results is empty the profile is left untouched.
func (p *AgentProfile) ComputeAverages() {
	if len(p.Results) == 0 {
		return
	}

	dimSums := make(map[Dimension]float64)
	dimCounts := make(map[Dimension]int)
	latest := p.Results[0].Timestamp

	for i := range p.Results {
		r := &p.Results[i]
		// Results are not guaranteed to be sorted chronologically, so track
		// the maximum timestamp instead of trusting the last element.
		if r.Timestamp.After(latest) {
			latest = r.Timestamp
		}
		for dim, score := range r.Scores {
			dimSums[dim] += float64(score.Level)
			dimCounts[dim]++
		}
	}

	p.Averages = make(map[Dimension]float64, len(dimCounts))
	totalAvg := 0.0
	// Iterate the canonical dimension list (not the map) so the floating
	// point summation order is deterministic.
	for _, dim := range AllDimensions() {
		if count := dimCounts[dim]; count > 0 {
			avg := dimSums[dim] / float64(count)
			p.Averages[dim] = avg
			totalAvg += avg
		}
	}

	// Reset before recomputing so an earlier aggregate cannot survive when
	// the current results contain no scored dimension.
	p.OverallL = 0
	if len(p.Averages) > 0 {
		p.OverallL = int(totalAvg / float64(len(p.Averages)))
	}
	p.EvalCount = len(p.Results)
	p.LastEvalAt = latest
}

// Regression describes a drop in one dimension's average score between two
// agent profiles, as reported by DetectRegressions.
type Regression struct {
	Dimension Dimension `json:"dimension"` // Dimension whose average dropped
	Previous  float64   `json:"previous"`  // Average in the previous profile
	Current   float64   `json:"current"`   // Average in the current profile
	Delta     float64   `json:"delta"`     // Current - Previous (negative for a drop)
}

// DetectRegressions compares the per-dimension averages of current against
// previous and returns one Regression for every dimension whose average
// dropped.
//
// Only dimensions present in both profiles' Averages are compared, and the
// result follows the order of AllDimensions. Delta is current minus previous
// and is therefore negative. The result is nil when nothing regressed or
// when either profile is nil (for example, when no baseline exists yet).
func DetectRegressions(previous, current *AgentProfile) []Regression {
	// Without both profiles there is nothing to compare; dereferencing a nil
	// profile would panic.
	if previous == nil || current == nil {
		return nil
	}

	var regressions []Regression
	for _, dim := range AllDimensions() {
		prev, hasPrev := previous.Averages[dim]
		curr, hasCurr := current.Averages[dim]
		if hasPrev && hasCurr && curr < prev {
			regressions = append(regressions, Regression{
				Dimension: dim,
				Previous:  prev,
				Current:   curr,
				Delta:     curr - prev,
			})
		}
	}
	return regressions
}

// LoadScenarios reads the file at path and decodes its contents as a JSON
// array of EvalScenario values.
//
// A failure to read the file is reported with the prefix "load scenarios",
// a failure to decode it with the prefix "parse scenarios"; both wrap the
// underlying error. On any error the returned slice is nil.
func LoadScenarios(path string) ([]EvalScenario, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return nil, fmt.Errorf("load scenarios: %w", readErr)
	}

	var parsed []EvalScenario
	parseErr := json.Unmarshal(raw, &parsed)
	if parseErr != nil {
		return nil, fmt.Errorf("parse scenarios: %w", parseErr)
	}

	return parsed, nil
}

// SaveResult writes result as indented JSON into dir, creating dir (and any
// missing parents) first.
//
// The file is named "<AgentID>_<ScenarioID>_<unix-seconds>.json", so two
// results for the same agent and scenario within the same second map to the
// same file and the later write replaces the earlier one.
//
// It returns an error if result is nil, if the directory cannot be created,
// if the result cannot be encoded, or if the file cannot be written. The
// underlying error is wrapped and remains reachable through errors.Is/As.
func SaveResult(dir string, result *EvalResult) error {
	if result == nil {
		return fmt.Errorf("save result: nil result")
	}
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("create results dir: %w", err)
	}

	// NOTE(review): AgentID and ScenarioID are interpolated verbatim; if they
	// can ever come from untrusted input, path separators in them would let
	// the file land outside dir. Confirm against callers.
	filename := fmt.Sprintf("%s_%s_%d.json",
		result.AgentID, result.ScenarioID, result.Timestamp.Unix())
	path := filepath.Join(dir, filename)

	data, err := json.MarshalIndent(result, "", "  ")
	if err != nil {
		return fmt.Errorf("encode result: %w", err)
	}
	if err := os.WriteFile(path, data, 0644); err != nil {
		return fmt.Errorf("write result: %w", err)
	}
	return nil
}