This commit is contained in:
Cyber MacGeddon 2025-09-03 18:58:04 +01:00
parent da8868b2e5
commit 015e48a6a2
2 changed files with 366 additions and 0 deletions

View file

@ -0,0 +1 @@
# Confidence agent unit tests

View file

@ -0,0 +1,365 @@
"""
Unit tests for the confidence evaluator module.
Tests confidence scoring logic for different tool types following TrustGraph testing patterns.
"""
import pytest
import json
from unittest.mock import Mock, AsyncMock
from trustgraph.agent.confidence.confidence import ConfidenceEvaluator
from trustgraph.agent.confidence.types import ConfidenceMetrics
class TestConfidenceEvaluator:
"""Test cases for the ConfidenceEvaluator class."""
@pytest.fixture
def evaluator(self):
"""Create a ConfidenceEvaluator instance for testing."""
return ConfidenceEvaluator()
# Graph Query Tests
def test_evaluate_graph_query_success_json_results(self, evaluator):
"""Test confidence evaluation for successful GraphQuery with JSON results."""
# Arrange
function_name = "GraphQuery"
arguments = {"query": "MATCH (n) RETURN n LIMIT 10"}
output = json.dumps([{"id": 1, "name": "entity1"}, {"id": 2, "name": "entity2"}])
execution_time_ms = 500
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert 0.8 <= result.score <= 1.0 # Good result size should have high confidence
assert "2 items" in result.reasoning
assert result.retry_count == 0
def test_evaluate_graph_query_empty_results(self, evaluator):
"""Test confidence evaluation for GraphQuery with empty results."""
# Arrange
function_name = "GraphQuery"
arguments = {"query": "MATCH (n) WHERE false RETURN n"}
output = "[]"
execution_time_ms = 200
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score <= 0.4 # Empty results should have low confidence
assert "Empty result set" in result.reasoning
assert result.retry_count == 0
def test_evaluate_graph_query_large_results(self, evaluator):
"""Test confidence evaluation for GraphQuery with very large results."""
# Arrange
function_name = "GraphQuery"
arguments = {"query": "MATCH (n) RETURN n"}
large_result = [{"id": i} for i in range(1500)]
output = json.dumps(large_result)
execution_time_ms = 2000
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert 0.6 <= result.score <= 0.8 # Large results should have moderate confidence
assert "1500 items" in result.reasoning
def test_evaluate_graph_query_non_json_output(self, evaluator):
"""Test confidence evaluation for GraphQuery with non-JSON output."""
# Arrange
function_name = "GraphQuery"
arguments = {"query": "RETURN 'hello world'"}
output = "hello world"
execution_time_ms = 100
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert 0.5 <= result.score <= 0.7
assert "Non-JSON response" in result.reasoning
def test_evaluate_graph_query_very_fast_execution(self, evaluator):
"""Test confidence penalty for very fast execution."""
# Arrange
function_name = "GraphQuery"
arguments = {"query": "RETURN 1"}
output = json.dumps([{"result": 1}])
execution_time_ms = 50 # Very fast execution
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert "Very fast execution" in result.reasoning
# Score should be penalized for suspiciously fast execution
# Text Completion Tests
def test_evaluate_text_completion_success(self, evaluator):
"""Test confidence evaluation for successful TextCompletion."""
# Arrange
function_name = "TextCompletion"
arguments = {"prompt": "Explain quantum physics"}
output = "Quantum physics is the branch of physics that studies matter and energy at the smallest scales. It describes phenomena that occur at atomic and subatomic levels, where classical physics breaks down."
execution_time_ms = 1500
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score >= 0.7
assert "Reasonable response length" in result.reasoning
assert "Well-structured response" in result.reasoning
assert result.retry_count == 0
def test_evaluate_text_completion_empty_response(self, evaluator):
"""Test confidence evaluation for TextCompletion with empty response."""
# Arrange
function_name = "TextCompletion"
arguments = {"prompt": "Tell me about AI"}
output = ""
execution_time_ms = 500
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score <= 0.2
assert "Empty response" in result.reasoning
def test_evaluate_text_completion_short_response(self, evaluator):
"""Test confidence evaluation for TextCompletion with very short response."""
# Arrange
function_name = "TextCompletion"
arguments = {"prompt": "What is AI?"}
output = "AI is AI."
execution_time_ms = 300
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score <= 0.5
assert "Very short response" in result.reasoning
def test_evaluate_text_completion_error_indicators(self, evaluator):
"""Test confidence evaluation for TextCompletion with error indicators."""
# Arrange
function_name = "TextCompletion"
arguments = {"prompt": "Solve this complex equation"}
output = "I don't know how to solve this equation. Sorry, I cannot help with this."
execution_time_ms = 800
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score <= 0.6
assert "Response indicates uncertainty" in result.reasoning
def test_evaluate_text_completion_very_long_response(self, evaluator):
"""Test confidence evaluation for TextCompletion with suspiciously long response."""
# Arrange
function_name = "TextCompletion"
arguments = {"prompt": "What is 2+2?"}
output = "Two plus two equals four. " * 1000 # Very long response for simple question
execution_time_ms = 3000
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score <= 0.7
assert "Very long response" in result.reasoning
# MCP Tool Tests
def test_evaluate_mcp_tool_success(self, evaluator):
"""Test confidence evaluation for successful McpTool execution."""
# Arrange
function_name = "McpTool"
arguments = {"name": "weather_tool", "parameters": {"location": "San Francisco"}}
output = json.dumps({"temperature": 72, "conditions": "sunny", "success": True})
execution_time_ms = 1200
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score >= 0.7
assert "Tool execution successful" in result.reasoning
def test_evaluate_mcp_tool_error_response(self, evaluator):
"""Test confidence evaluation for McpTool with error response."""
# Arrange
function_name = "McpTool"
arguments = {"name": "broken_tool", "parameters": {}}
output = json.dumps({"error": "Tool execution failed", "success": False})
execution_time_ms = 500
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score <= 0.3
assert "Tool returned error" in result.reasoning
def test_evaluate_mcp_tool_text_error(self, evaluator):
"""Test confidence evaluation for McpTool with text error output."""
# Arrange
function_name = "McpTool"
arguments = {"name": "api_tool", "parameters": {"endpoint": "/invalid"}}
output = "Error: API endpoint not found. Request failed."
execution_time_ms = 800
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score <= 0.4
assert "Error indicated in output" in result.reasoning
def test_evaluate_mcp_tool_timeout_risk(self, evaluator):
"""Test confidence evaluation for McpTool with long execution time."""
# Arrange
function_name = "McpTool"
arguments = {"name": "slow_tool", "parameters": {}}
output = json.dumps({"result": "completed", "success": True})
execution_time_ms = 65000 # Over 1 minute
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score <= 0.7 # Should be penalized for long execution
assert "Tool execution timeout risk" in result.reasoning
# Retry Testing
def test_evaluate_with_retries(self, evaluator):
"""Test confidence evaluation with retry penalties."""
# Arrange
function_name = "GraphQuery"
arguments = {"query": "MATCH (n) RETURN n"}
output = json.dumps([{"id": 1}])
execution_time_ms = 500
retry_count = 2
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms, retry_count)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.retry_count == retry_count
assert "Retry penalty applied (2 retries)" in result.reasoning
# Score should be lower than without retries due to penalty
# Generic Function Tests
def test_evaluate_unknown_function(self, evaluator):
"""Test confidence evaluation for unknown function type."""
# Arrange
function_name = "UnknownFunction"
arguments = {"param": "value"}
output = "Some output"
execution_time_ms = 1000
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert 0.5 <= result.score <= 0.7
assert "Generic evaluation for unknown function" in result.reasoning
assert "Function returned output" in result.reasoning
def test_evaluate_unknown_function_no_output(self, evaluator):
"""Test confidence evaluation for unknown function with no output."""
# Arrange
function_name = "UnknownFunction"
arguments = {"param": "value"}
output = ""
execution_time_ms = 500
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score <= 0.4
assert "No output from function" in result.reasoning
# Prompt Function Tests
def test_evaluate_prompt_function(self, evaluator):
"""Test confidence evaluation for Prompt function."""
# Arrange
function_name = "Prompt"
arguments = {"prompt": "analyze this data"}
output = "The data shows a clear upward trend with seasonal variations."
execution_time_ms = 1200
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score >= 0.7 # Should use text completion evaluation
assert "Reasonable response length" in result.reasoning
# Edge Cases
def test_evaluate_with_exception_in_evaluation(self, evaluator):
"""Test confidence evaluation handles exceptions gracefully."""
# Arrange
function_name = "GraphQuery"
arguments = {"query": "test"}
# Create problematic output that might cause JSON parsing issues
output = "{'invalid': json, syntax}"
execution_time_ms = 500
# Act
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
# Assert
assert isinstance(result, ConfidenceMetrics)
assert result.score <= 0.4
# Should handle the exception and provide reasonable confidence score
def test_confidence_score_bounds(self, evaluator):
"""Test that confidence scores are always within valid bounds."""
# Arrange
test_cases = [
("GraphQuery", {}, "[]", 100),
("TextCompletion", {}, "Great response!", 1000),
("McpTool", {}, '{"success": true}', 500),
("UnknownFunction", {}, "", 0),
]
# Act & Assert
for function_name, arguments, output, execution_time_ms in test_cases:
result = evaluator.evaluate(function_name, arguments, output, execution_time_ms)
assert 0.0 <= result.score <= 1.0, f"Score out of bounds for {function_name}: {result.score}"
assert isinstance(result.reasoning, str)
assert len(result.reasoning) > 0
assert result.retry_count >= 0