mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-04-25 04:16:22 +02:00
130 lines
2.9 KiB
Go
130 lines
2.9 KiB
Go
package eval
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
func TestAllDimensionsCount(t *testing.T) {
|
|
dims := AllDimensions()
|
|
if len(dims) != 6 {
|
|
t.Errorf("expected 6 dimensions, got %d", len(dims))
|
|
}
|
|
}
|
|
|
|
func TestComputeOverall(t *testing.T) {
|
|
result := &EvalResult{
|
|
Scores: map[Dimension]Score{
|
|
DimPlanning: {Level: 3},
|
|
DimToolUse: {Level: 4},
|
|
DimMemory: {Level: 2},
|
|
DimReasoning: {Level: 5},
|
|
DimReflection: {Level: 3},
|
|
DimPerception: {Level: 1},
|
|
},
|
|
}
|
|
overall := result.ComputeOverall()
|
|
// (3+4+2+5+3+1)/6 = 18/6 = 3
|
|
if overall != 3 {
|
|
t.Errorf("expected overall 3, got %d", overall)
|
|
}
|
|
}
|
|
|
|
func TestAgentProfileAverages(t *testing.T) {
|
|
profile := &AgentProfile{
|
|
AgentID: "test-agent",
|
|
Results: []EvalResult{
|
|
{
|
|
Scores: map[Dimension]Score{
|
|
DimPlanning: {Level: 2},
|
|
DimToolUse: {Level: 4},
|
|
},
|
|
Timestamp: time.Now(),
|
|
},
|
|
{
|
|
Scores: map[Dimension]Score{
|
|
DimPlanning: {Level: 4},
|
|
DimToolUse: {Level: 4},
|
|
},
|
|
Timestamp: time.Now(),
|
|
},
|
|
},
|
|
}
|
|
profile.ComputeAverages()
|
|
|
|
if profile.Averages[DimPlanning] != 3.0 {
|
|
t.Errorf("planning avg should be 3.0, got %.1f", profile.Averages[DimPlanning])
|
|
}
|
|
if profile.Averages[DimToolUse] != 4.0 {
|
|
t.Errorf("tool_use avg should be 4.0, got %.1f", profile.Averages[DimToolUse])
|
|
}
|
|
if profile.EvalCount != 2 {
|
|
t.Errorf("expected 2 evals, got %d", profile.EvalCount)
|
|
}
|
|
}
|
|
|
|
func TestDetectRegressions(t *testing.T) {
|
|
prev := &AgentProfile{
|
|
Averages: map[Dimension]float64{
|
|
DimPlanning: 4.0,
|
|
DimToolUse: 3.0,
|
|
DimMemory: 2.0,
|
|
},
|
|
}
|
|
curr := &AgentProfile{
|
|
Averages: map[Dimension]float64{
|
|
DimPlanning: 3.0, // regression
|
|
DimToolUse: 4.0, // improvement
|
|
DimMemory: 2.0, // same
|
|
},
|
|
}
|
|
|
|
regressions := DetectRegressions(prev, curr)
|
|
if len(regressions) != 1 {
|
|
t.Fatalf("expected 1 regression, got %d", len(regressions))
|
|
}
|
|
if regressions[0].Dimension != DimPlanning {
|
|
t.Errorf("expected regression in planning, got %s", regressions[0].Dimension)
|
|
}
|
|
if regressions[0].Delta != -1.0 {
|
|
t.Errorf("expected delta -1.0, got %.1f", regressions[0].Delta)
|
|
}
|
|
}
|
|
|
|
func TestSaveAndLoadResult(t *testing.T) {
|
|
dir := filepath.Join(t.TempDir(), "results")
|
|
|
|
result := &EvalResult{
|
|
AgentID: "test-agent",
|
|
Timestamp: time.Now(),
|
|
ScenarioID: "scenario-001",
|
|
Scores: map[Dimension]Score{
|
|
DimPlanning: {Level: 3, Confidence: 0.9, Evidence: "good planning"},
|
|
},
|
|
OverallL: 3,
|
|
}
|
|
|
|
if err := SaveResult(dir, result); err != nil {
|
|
t.Fatalf("SaveResult error: %v", err)
|
|
}
|
|
|
|
// Verify file was created
|
|
entries, err := os.ReadDir(dir)
|
|
if err != nil {
|
|
t.Fatalf("ReadDir error: %v", err)
|
|
}
|
|
if len(entries) != 1 {
|
|
t.Errorf("expected 1 result file, got %d", len(entries))
|
|
}
|
|
}
|
|
|
|
func TestScoreValidLevels(t *testing.T) {
|
|
for level := 1; level <= 5; level++ {
|
|
s := Score{Level: level, Confidence: 0.8}
|
|
if s.Level < 1 || s.Level > 5 {
|
|
t.Errorf("level %d out of range", s.Level)
|
|
}
|
|
}
|
|
}
|