# Mirror of https://github.com/flakestorm/flakestorm.git
# Synced 2026-04-25 00:36:54 +02:00
# 203 lines, 7.1 KiB, Python
"""Integration tests for replay: loader, resolve_contract, runner."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
from flakestorm.core.config import (
|
|
FlakeStormConfig,
|
|
AgentConfig,
|
|
AgentType,
|
|
ModelConfig,
|
|
MutationConfig,
|
|
InvariantConfig,
|
|
InvariantType,
|
|
OutputConfig,
|
|
AdvancedConfig,
|
|
ContractConfig,
|
|
ContractInvariantConfig,
|
|
ReplayConfig,
|
|
ReplaySessionConfig,
|
|
ReplayToolResponseConfig,
|
|
)
|
|
from flakestorm.replay.loader import ReplayLoader, resolve_contract, resolve_sessions_from_config
|
|
from flakestorm.replay.runner import ReplayRunner, ReplayResult
|
|
from flakestorm.core.protocol import AgentResponse, BaseAgentAdapter
|
|
|
|
|
|
class _MockAgent(BaseAgentAdapter):
    """Mock adapter whose ``invoke`` coroutine always returns a canned response."""

    def __init__(self, output: str = "ok", error: str | None = None):
        """Remember the output text and optional error to report on every call."""
        self._error = error
        self._output = output

    async def invoke(self, input: str) -> AgentResponse:
        """Ignore ``input`` and return the preconfigured response.

        The reported latency is always 10.0 ms.
        """
        canned = AgentResponse(
            output=self._output,
            latency_ms=10.0,
            error=self._error,
        )
        return canned
|
|
|
|
|
|
class TestReplayLoader:
    """Test replay file and contract resolution."""

    @staticmethod
    def _write_session_yaml(directory: Path, session_id: str, session_input: str) -> Path:
        """Write a minimal replay-session YAML file and return its path.

        Args:
            directory: Existing directory the file is created in.
            session_id: Value stored under the ``id`` key; also used as file stem.
            session_input: Value stored under the ``input`` key.

        Returns:
            Path of the written ``<session_id>.yaml`` file.
        """
        target = directory / f"{session_id}.yaml"
        payload = {
            "id": session_id,
            "input": session_input,
            "tool_responses": [],
            "contract": "default",
        }
        target.write_text(yaml.dump(payload), encoding="utf-8")
        return target

    @staticmethod
    def _build_config(**extra) -> FlakeStormConfig:
        """Build the small FlakeStormConfig shared by the contract tests.

        Args:
            **extra: Additional keyword arguments forwarded unchanged to
                ``FlakeStormConfig`` (e.g. ``contract=...``). Nothing is passed
                for keys the caller omits, so model defaults stay in effect.
        """
        return FlakeStormConfig(
            agent=AgentConfig(endpoint="http://x", type=AgentType.HTTP),
            model=ModelConfig(),
            mutations=MutationConfig(),
            golden_prompts=["p"],
            invariants=[InvariantConfig(type=InvariantType.LATENCY, max_ms=1000)],
            output=OutputConfig(),
            advanced=AdvancedConfig(),
            **extra,
        )

    def test_load_file_yaml(self, tmp_path: Path):
        """ReplayLoader.load_file reads id, input and contract from a YAML file."""
        # tmp_path is a per-test directory managed by pytest, so there is no
        # manual unlink and nothing is left behind if writing the file fails.
        session_file = self._write_session_yaml(tmp_path, "r1", "What is 2+2?")

        session = ReplayLoader().load_file(str(session_file))

        assert session.id == "r1"
        assert session.input == "What is 2+2?"
        assert session.contract == "default"

    def test_resolve_contract_by_name(self):
        """resolve_contract returns the config's contract when the name matches."""
        contract = ContractConfig(
            name="my_contract",
            invariants=[ContractInvariantConfig(id="i1", type="contains", value="x")],
        )
        config = self._build_config(contract=contract)

        resolved = resolve_contract("my_contract", config, None)

        assert resolved.name == "my_contract"
        assert len(resolved.invariants) == 1

    def test_resolve_contract_not_found(self):
        """resolve_contract is expected to raise FileNotFoundError for an unknown name."""
        config = self._build_config()

        with pytest.raises(FileNotFoundError):
            resolve_contract("nonexistent", config, None)

    def test_resolve_sessions_from_config_inline_only(self):
        """resolve_sessions_from_config returns inline sessions when no sources."""
        replays = ReplayConfig(
            sessions=[
                ReplaySessionConfig(id="a", input="q1", contract="default"),
                ReplaySessionConfig(id="b", input="q2", contract="default"),
            ],
            sources=[],
        )

        out = resolve_sessions_from_config(replays, None, include_sources=True)

        assert len(out) == 2
        assert out[0].id == "a"
        assert out[1].id == "b"

    def test_resolve_sessions_from_config_file_backed(self, tmp_path: Path):
        """resolve_sessions_from_config loads file-backed sessions from config_dir."""
        session_file = self._write_session_yaml(tmp_path, "file-session", "from file")
        replays = ReplayConfig(
            # Only the bare file name is given; it is resolved against config_dir.
            sessions=[ReplaySessionConfig(id="", input="", file=session_file.name)],
            sources=[],
        )

        out = resolve_sessions_from_config(replays, tmp_path, include_sources=True)

        assert len(out) == 1
        assert out[0].id == "file-session"
        assert out[0].input == "from file"

    def test_replay_config_sources_parsed_from_dict(self):
        """ReplayConfig.sources parses langsmith and langsmith_run from dict (YAML)."""
        cfg = ReplayConfig.model_validate({
            "sessions": [],
            "sources": [
                {"type": "langsmith", "project": "my-agent", "auto_import": True},
                {"type": "langsmith_run", "run_id": "abc-123"},
            ],
        })

        assert len(cfg.sources) == 2
        assert cfg.sources[0].project == "my-agent"
        assert cfg.sources[0].auto_import is True
        assert cfg.sources[1].run_id == "abc-123"
|
|
|
|
|
|
class TestReplayRunner:
    """Exercise ReplayRunner.run with and without contract verification."""

    @pytest.mark.asyncio
    async def test_run_without_contract(self):
        """A runner built without a contract returns the agent output and passes."""
        mock_agent = _MockAgent(output="hello")
        replay_runner = ReplayRunner(mock_agent)
        replay_session = ReplaySessionConfig(
            id="s1", input="hi", tool_responses=[], contract="default"
        )

        outcome = await replay_runner.run(replay_session)

        assert isinstance(outcome, ReplayResult)
        assert outcome.response.output == "hello"
        assert outcome.passed is True

    @pytest.mark.asyncio
    async def test_run_with_contract_passes(self):
        """A "contains" invariant whose value appears in the output passes."""
        mock_agent = _MockAgent(output="the answer is 42")
        must_contain = ContractInvariantConfig(id="i1", type="contains", value="answer")
        c1 = ContractConfig(name="c1", invariants=[must_contain])
        replay_runner = ReplayRunner(mock_agent, contract=c1)
        replay_session = ReplaySessionConfig(
            id="s1", input="?", tool_responses=[], contract="c1"
        )

        outcome = await replay_runner.run(replay_session, contract=c1)

        assert outcome.passed is True
        details = outcome.verification_details
        # Accept either a mention of the invariant type or any non-empty details.
        assert "contains" in str(details).lower() or details

    @pytest.mark.asyncio
    async def test_run_with_contract_fails(self):
        """A "contains" invariant whose value is absent from the output fails."""
        mock_agent = _MockAgent(output="no match")
        must_contain = ContractInvariantConfig(
            id="i1", type="contains", value="required_word"
        )
        c1 = ContractConfig(name="c1", invariants=[must_contain])
        replay_runner = ReplayRunner(mock_agent, contract=c1)
        replay_session = ReplaySessionConfig(
            id="s1", input="?", tool_responses=[], contract="c1"
        )

        outcome = await replay_runner.run(replay_session, contract=c1)

        assert outcome.passed is False
|