mirror of
https://github.com/flakestorm/flakestorm.git
synced 2026-04-25 00:36:54 +02:00
148 lines
4.9 KiB
Python
148 lines
4.9 KiB
Python
"""Integration tests for replay: loader, resolve_contract, runner."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
from flakestorm.core.config import (
|
|
FlakeStormConfig,
|
|
AgentConfig,
|
|
AgentType,
|
|
ModelConfig,
|
|
MutationConfig,
|
|
InvariantConfig,
|
|
InvariantType,
|
|
OutputConfig,
|
|
AdvancedConfig,
|
|
ContractConfig,
|
|
ContractInvariantConfig,
|
|
ReplaySessionConfig,
|
|
ReplayToolResponseConfig,
|
|
)
|
|
from flakestorm.replay.loader import ReplayLoader, resolve_contract
|
|
from flakestorm.replay.runner import ReplayRunner, ReplayResult
|
|
from flakestorm.core.protocol import AgentResponse, BaseAgentAdapter
|
|
|
|
|
|
class _MockAgent(BaseAgentAdapter):
    """Test double for an agent adapter that always gives the same answer.

    ``invoke`` is a coroutine. It does not look at its input; it reports the
    ``output`` and ``error`` values supplied at construction time together
    with a constant latency of 10 ms.
    """

    def __init__(self, output: str = "ok", error: str | None = None):
        """Store the canned output text and the optional error message."""
        self._output, self._error = output, error

    async def invoke(self, input: str) -> AgentResponse:
        """Return the canned response; ``input`` is not inspected."""
        canned = AgentResponse(
            output=self._output,
            latency_ms=10.0,
            error=self._error,
        )
        return canned
|
|
|
|
|
|
class TestReplayLoader:
    """Exercise ``ReplayLoader.load_file`` and ``resolve_contract``."""

    @staticmethod
    def _make_config(**extra):
        """Build the minimal ``FlakeStormConfig`` shared by the contract tests.

        Keyword arguments in ``extra`` are forwarded unchanged, so a test can
        attach a ``contract`` without repeating the boilerplate fields.
        """
        return FlakeStormConfig(
            agent=AgentConfig(endpoint="http://x", type=AgentType.HTTP),
            model=ModelConfig(),
            mutations=MutationConfig(),
            golden_prompts=["p"],
            invariants=[InvariantConfig(type=InvariantType.LATENCY, max_ms=1000)],
            output=OutputConfig(),
            advanced=AdvancedConfig(),
            **extra,
        )

    def test_load_file_yaml(self):
        """A session dumped to a YAML file is read back by ``load_file``."""
        payload = {
            "id": "r1",
            "input": "What is 2+2?",
            "tool_responses": [],
            "contract": "default",
        }
        # delete=False keeps the file on disk after the handle is closed so it
        # can be loaded by path; it is removed explicitly in ``finally``.
        with tempfile.NamedTemporaryFile(
            suffix=".yaml", delete=False, mode="w", encoding="utf-8"
        ) as handle:
            yaml.dump(payload, handle)
            handle.flush()
            yaml_path = handle.name
        try:
            session = ReplayLoader().load_file(yaml_path)
            assert session.id == "r1"
            assert session.input == "What is 2+2?"
            assert session.contract == "default"
        finally:
            Path(yaml_path).unlink(missing_ok=True)

    def test_resolve_contract_by_name(self):
        """Resolving ``"my_contract"`` yields that contract with one invariant."""
        only_invariant = ContractInvariantConfig(id="i1", type="contains", value="x")
        named_contract = ContractConfig(
            name="my_contract",
            invariants=[only_invariant],
        )
        config = self._make_config(contract=named_contract)

        resolved = resolve_contract("my_contract", config, None)

        assert resolved.name == "my_contract"
        assert len(resolved.invariants) == 1

    def test_resolve_contract_not_found(self):
        """An unknown contract name makes ``resolve_contract`` raise."""
        config = self._make_config()
        with pytest.raises(FileNotFoundError):
            resolve_contract("nonexistent", config, None)
|
|
|
|
|
|
class TestReplayRunner:
    """Exercise ``ReplayRunner.run`` with and without a contract."""

    @staticmethod
    def _contains_contract(required: str):
        """Return contract ``"c1"`` with one ``contains`` invariant on ``required``."""
        return ContractConfig(
            name="c1",
            invariants=[
                ContractInvariantConfig(id="i1", type="contains", value=required),
            ],
        )

    @staticmethod
    def _make_session(prompt: str, contract_name: str):
        """Return session ``"s1"`` with no tool responses for the given prompt."""
        return ReplaySessionConfig(
            id="s1",
            input=prompt,
            tool_responses=[],
            contract=contract_name,
        )

    @pytest.mark.asyncio
    async def test_run_without_contract(self):
        """With no contract supplied, the run passes and echoes the agent output."""
        runner = ReplayRunner(_MockAgent(output="hello"))
        session = self._make_session("hi", "default")

        result = await runner.run(session)

        assert isinstance(result, ReplayResult)
        assert result.response.output == "hello"
        assert result.passed is True

    @pytest.mark.asyncio
    async def test_run_with_contract_passes(self):
        """Output containing the required word satisfies the contract."""
        contract = self._contains_contract("answer")
        runner = ReplayRunner(_MockAgent(output="the answer is 42"), contract=contract)
        session = self._make_session("?", "c1")

        result = await runner.run(session, contract=contract)

        assert result.passed is True
        # Deliberately loose: accepts any truthy verification details, or
        # details whose text mentions "contains".
        details = result.verification_details
        assert "contains" in str(details).lower() or details

    @pytest.mark.asyncio
    async def test_run_with_contract_fails(self):
        """Output lacking the required word fails the contract."""
        contract = self._contains_contract("required_word")
        runner = ReplayRunner(_MockAgent(output="no match"), contract=contract)
        session = self._make_session("?", "c1")

        result = await runner.run(session, contract=contract)

        assert result.passed is False
|