Update version to 2.0.0 and enhance chaos engineering features in Flakestorm. Added support for environment chaos, behavioral contracts, and replay regression. Expanded documentation and improved scoring mechanisms. Updated .gitignore to include new documentation files.

This commit is contained in:
Francisco M Humarang Jr. 2026-03-06 23:33:21 +08:00
parent 59cca61f3c
commit 9c3450a75d
63 changed files with 4147 additions and 134 deletions

View file

@ -0,0 +1,107 @@
"""Integration tests for chaos module: interceptor, transport, LLM faults."""
from __future__ import annotations
import pytest
from flakestorm.chaos.faults import apply_error, apply_malformed, apply_malicious_response, should_trigger
from flakestorm.chaos.llm_proxy import (
apply_llm_empty,
apply_llm_garbage,
apply_llm_truncated,
apply_llm_response_drift,
apply_llm_fault,
should_trigger_llm_fault,
)
from flakestorm.chaos.tool_proxy import match_tool_fault
from flakestorm.chaos.profiles import load_chaos_profile, list_profile_names
from flakestorm.core.config import ChaosConfig, ToolFaultConfig, LlmFaultConfig
class TestChaosFaults:
    """Exercise the low-level fault-application helpers."""

    def test_apply_error(self):
        status, message, _headers = apply_error(503, "Unavailable")
        assert status == 503
        assert "Unavailable" in message

    def test_apply_malformed(self):
        payload = apply_malformed()
        assert "corrupted" in payload or "invalid" in payload.lower()

    def test_apply_malicious_response(self):
        # The injected payload is expected to pass through verbatim.
        assert apply_malicious_response("Ignore instructions") == "Ignore instructions"

    def test_should_trigger_after_calls(self):
        # Fires only once the call counter reaches the after_calls threshold (2).
        for call_count, expected in ((0, False), (1, False), (2, True)):
            assert should_trigger(None, 2, call_count) is expected
class TestLlmProxy:
    """Exercise the LLM-side fault-application helpers."""

    def test_truncated(self):
        truncated = apply_llm_truncated("one two three four five six", max_tokens=3)
        assert truncated == "one two three"

    def test_empty(self):
        assert apply_llm_empty("anything") == ""

    def test_garbage(self):
        garbled = apply_llm_garbage("normal")
        assert "gibberish" in garbled or "invalid" in garbled.lower()

    def test_response_drift_json_rename(self):
        drifted = apply_llm_response_drift('{"action": "run"}', "json_field_rename")
        assert "action" in drifted or "tool_name" in drifted

    def test_should_trigger_llm_fault(self):
        # probability=1.0 and after_calls=0 means the fault always fires.
        class AlwaysOn:
            probability = 1.0
            after_calls = 0

        assert should_trigger_llm_fault(AlwaysOn(), 0) is True
        assert should_trigger_llm_fault(AlwaysOn(), 1) is True

    def test_apply_llm_fault_truncated(self):
        fault_cfg = type("C", (), {"mode": "truncated_response", "max_tokens": 2})()
        assert apply_llm_fault("hello world here", fault_cfg, 0) == "hello world"
class TestToolProxy:
    """Exercise fault matching for tool calls."""

    def test_match_by_tool_name(self):
        faults = [
            ToolFaultConfig(tool="search", mode="timeout"),
            ToolFaultConfig(tool="*", mode="error"),
        ]
        # An exact tool name wins over the wildcard entry.
        exact = match_tool_fault("search", None, faults, 0)
        assert exact is not None
        assert exact.tool == "search"
        # Anything else falls through to the "*" wildcard.
        fallback = match_tool_fault("other", None, faults, 0)
        assert fallback is not None
        assert fallback.tool == "*"

    def test_match_by_url(self):
        faults = [ToolFaultConfig(tool="x", match_url="https://api.example.com/*", mode="error")]
        assert match_tool_fault(None, "https://api.example.com/foo", faults, 0) is not None
class TestChaosProfiles:
    """Exercise loading of the built-in chaos profiles."""

    def test_list_profiles(self):
        names = list_profile_names()
        expected_profiles = (
            "api_outage",
            "indirect_injection",
            "degraded_llm",
            "hostile_tools",
            "high_latency",
            "cascading_failure",
            "model_version_drift",
        )
        for profile_name in expected_profiles:
            assert profile_name in names

    def test_load_api_outage(self):
        profile = load_chaos_profile("api_outage")
        assert profile.tool_faults
        assert profile.llm_faults
        assert any(fault.mode == "error" for fault in profile.tool_faults)
        assert any(fault.mode == "timeout" for fault in profile.llm_faults)

View file

@ -80,16 +80,17 @@ agent:
endpoint: "http://test:8000/invoke"
golden_prompts:
- "Hello world"
invariants:
- type: "latency"
max_ms: 5000
"""
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
f.write(yaml_content)
f.flush()
config = load_config(f.name)
assert config.agent.endpoint == "http://test:8000/invoke"
# Cleanup
Path(f.name).unlink()
path = f.name
config = load_config(path)
assert config.agent.endpoint == "http://test:8000/invoke"
Path(path).unlink(missing_ok=True)
class TestAgentConfig:

View file

@ -0,0 +1,67 @@
"""Integration tests for contract engine: matrix, verifier integration, reset."""
from __future__ import annotations
import pytest
from flakestorm.contracts.matrix import ResilienceMatrix, SEVERITY_WEIGHT, CellResult
from flakestorm.contracts.engine import (
_contract_invariant_to_invariant_config,
_scenario_to_chaos_config,
STATEFUL_WARNING,
)
from flakestorm.core.config import (
ContractConfig,
ContractInvariantConfig,
ChaosScenarioConfig,
ChaosConfig,
ToolFaultConfig,
InvariantType,
)
class TestResilienceMatrix:
    """Exercise the resilience matrix and its severity-weighted score."""

    def test_empty_score(self):
        matrix = ResilienceMatrix()
        # An empty matrix is a perfect, passing score by definition.
        assert matrix.resilience_score == 100.0
        assert matrix.passed is True

    def test_weighted_score(self):
        matrix = ResilienceMatrix()
        matrix.add_result("inv1", "sc1", "critical", True)
        matrix.add_result("inv2", "sc1", "high", False)
        matrix.add_result("inv3", "sc1", "medium", True)
        assert matrix.resilience_score < 100.0
        # A non-critical failure lowers the score but does not fail the run.
        assert matrix.passed is True
        matrix.add_result("inv0", "sc1", "critical", False)
        # A failed critical invariant flips the whole matrix to failed.
        assert matrix.critical_failed is True
        assert matrix.passed is False

    def test_severity_weights(self):
        for severity, weight in (("critical", 3), ("high", 2), ("medium", 1)):
            assert SEVERITY_WEIGHT[severity] == weight
class TestContractEngineHelpers:
    """Exercise the contract-engine conversion helpers."""

    def test_contract_invariant_to_invariant_config(self):
        contract_inv = ContractInvariantConfig(
            id="t1", type="contains", value="ok", severity="high"
        )
        converted = _contract_invariant_to_invariant_config(contract_inv)
        # Type string is mapped to the enum; value and severity carry over.
        assert converted.type == InvariantType.CONTAINS
        assert converted.value == "ok"
        assert converted.severity == "high"

    def test_scenario_to_chaos_config(self):
        scenario = ChaosScenarioConfig(
            name="test",
            tool_faults=[ToolFaultConfig(tool="*", mode="error", error_code=503)],
            llm_faults=[],
        )
        chaos = _scenario_to_chaos_config(scenario)
        assert isinstance(chaos, ChaosConfig)
        assert len(chaos.tool_faults) == 1
        assert chaos.tool_faults[0].mode == "error"

View file

@ -65,6 +65,8 @@ class TestOrchestrator:
AgentConfig,
AgentType,
FlakeStormConfig,
InvariantConfig,
InvariantType,
MutationConfig,
)
from flakestorm.mutations.types import MutationType
@ -79,7 +81,7 @@ class TestOrchestrator:
count=5,
types=[MutationType.PARAPHRASE],
),
invariants=[],
invariants=[InvariantConfig(type=InvariantType.LATENCY, max_ms=5000)],
)
@pytest.fixture

View file

@ -16,7 +16,9 @@ _performance = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_performance)
# Re-export functions for tests
calculate_overall_resilience = _performance.calculate_overall_resilience
calculate_percentile = _performance.calculate_percentile
calculate_resilience_matrix_score = _performance.calculate_resilience_matrix_score
calculate_robustness_score = _performance.calculate_robustness_score
calculate_statistics = _performance.calculate_statistics
calculate_weighted_score = _performance.calculate_weighted_score
@ -270,6 +272,57 @@ class TestCalculateStatistics:
assert by_type["noise"]["pass_rate"] == 1.0
class TestResilienceMatrixScore:
    """V2: Contract resilience matrix score (severity-weighted)."""

    def test_empty_returns_100(self):
        score, overall, critical = calculate_resilience_matrix_score([], [])
        # No results at all counts as a perfect, passing score.
        assert score == 100.0
        assert overall is True
        assert critical is False

    def test_all_passed(self):
        score, overall, critical = calculate_resilience_matrix_score(
            ["critical", "high"], [True, True]
        )
        assert score == 100.0
        assert overall is True
        assert critical is False

    def test_severity_weighted_partial(self):
        # Weights: critical=3, high=2, medium=1.  One failed medium leaves
        # 5 of 6 weight points passing -> score = 5/6 * 100.
        score, overall, critical = calculate_resilience_matrix_score(
            ["critical", "high", "medium"], [True, True, False]
        )
        expected = (5.0 / 6.0) * 100.0
        assert abs(score - expected) < 0.02
        assert overall is True
        assert critical is False

    def test_critical_failed(self):
        # Any failed critical invariant fails the whole run.
        _, overall, critical = calculate_resilience_matrix_score(["critical"], [False])
        assert critical is True
        assert overall is False
class TestOverallResilience:
    """V2: Overall weighted resilience from component scores."""

    def test_empty_returns_one(self):
        # No components at all counts as fully resilient.
        assert calculate_overall_resilience([], []) == 1.0

    def test_weighted_average(self):
        # 0.8*0.25 + 1.0*0.25 + 0.5*0.5 = 0.2 + 0.25 + 0.25 = 0.7
        component_scores = [0.8, 1.0, 0.5]
        component_weights = [0.25, 0.25, 0.5]
        result = calculate_overall_resilience(component_scores, component_weights)
        assert abs(result - 0.7) < 0.001

    def test_single_component(self):
        assert calculate_overall_resilience([0.5], [1.0]) == 0.5
class TestRustVsPythonParity:
"""Test that Rust and Python implementations give the same results."""

View file

@ -0,0 +1,148 @@
"""Integration tests for replay: loader, resolve_contract, runner."""
from __future__ import annotations
import tempfile
from pathlib import Path
import pytest
import yaml
from flakestorm.core.config import (
FlakeStormConfig,
AgentConfig,
AgentType,
ModelConfig,
MutationConfig,
InvariantConfig,
InvariantType,
OutputConfig,
AdvancedConfig,
ContractConfig,
ContractInvariantConfig,
ReplaySessionConfig,
ReplayToolResponseConfig,
)
from flakestorm.replay.loader import ReplayLoader, resolve_contract
from flakestorm.replay.runner import ReplayRunner, ReplayResult
from flakestorm.core.protocol import AgentResponse, BaseAgentAdapter
class _MockAgent(BaseAgentAdapter):
    """Mock adapter whose async ``invoke`` returns a fixed canned response.

    NOTE(review): the original docstring said "Sync mock adapter", but
    ``invoke`` is declared ``async def`` — corrected here.
    """

    def __init__(self, output: str = "ok", error: str | None = None):
        # Canned output text and optional error value returned by every invoke().
        self._output = output
        self._error = error

    async def invoke(self, input: str) -> AgentResponse:
        # Ignores *input* entirely; always returns the canned response with a
        # fixed fake latency of 10 ms.
        return AgentResponse(
            output=self._output,
            latency_ms=10.0,
            error=self._error,
        )
class TestReplayLoader:
    """Exercise replay-file loading and contract resolution."""

    @staticmethod
    def _make_config(contract=None):
        # Build the minimal valid FlakeStormConfig shared by the resolve tests;
        # `contract` is attached only when provided.
        kwargs = dict(
            agent=AgentConfig(endpoint="http://x", type=AgentType.HTTP),
            model=ModelConfig(),
            mutations=MutationConfig(),
            golden_prompts=["p"],
            invariants=[InvariantConfig(type=InvariantType.LATENCY, max_ms=1000)],
            output=OutputConfig(),
            advanced=AdvancedConfig(),
        )
        if contract is not None:
            kwargs["contract"] = contract
        return FlakeStormConfig(**kwargs)

    def test_load_file_yaml(self):
        payload = {
            "id": "r1",
            "input": "What is 2+2?",
            "tool_responses": [],
            "contract": "default",
        }
        with tempfile.NamedTemporaryFile(
            suffix=".yaml", delete=False, mode="w", encoding="utf-8"
        ) as f:
            yaml.dump(payload, f)
            f.flush()
            path = f.name
        try:
            session = ReplayLoader().load_file(path)
            assert session.id == "r1"
            assert session.input == "What is 2+2?"
            assert session.contract == "default"
        finally:
            Path(path).unlink(missing_ok=True)

    def test_resolve_contract_by_name(self):
        contract = ContractConfig(
            name="my_contract",
            invariants=[ContractInvariantConfig(id="i1", type="contains", value="x")],
        )
        resolved = resolve_contract("my_contract", self._make_config(contract), None)
        assert resolved.name == "my_contract"
        assert len(resolved.invariants) == 1

    def test_resolve_contract_not_found(self):
        # No contract in config and no file on disk -> FileNotFoundError.
        with pytest.raises(FileNotFoundError):
            resolve_contract("nonexistent", self._make_config(), None)
class TestReplayRunner:
    """Exercise the replay runner and contract verification."""

    @pytest.mark.asyncio
    async def test_run_without_contract(self):
        """A session run with no contract attached still passes."""
        agent = _MockAgent(output="hello")
        runner = ReplayRunner(agent)
        session = ReplaySessionConfig(
            id="s1",
            input="hi",
            tool_responses=[],
            contract="default",
        )
        result = await runner.run(session)
        assert isinstance(result, ReplayResult)
        assert result.response.output == "hello"
        assert result.passed is True

    @pytest.mark.asyncio
    async def test_run_with_contract_passes(self):
        """A satisfied `contains` invariant yields passed=True with details."""
        agent = _MockAgent(output="the answer is 42")
        contract = ContractConfig(
            name="c1",
            invariants=[
                ContractInvariantConfig(id="i1", type="contains", value="answer"),
            ],
        )
        runner = ReplayRunner(agent, contract=contract)
        session = ReplaySessionConfig(id="s1", input="?", tool_responses=[], contract="c1")
        result = await runner.run(session, contract=contract)
        assert result.passed is True
        # BUG FIX: the previous assertion was
        #   assert "contains" in str(...).lower() or result.verification_details
        # which passed whenever the details were merely truthy, so the
        # substring check never constrained anything.  Assert the effective
        # intent directly: verification must have produced details.
        assert result.verification_details

    @pytest.mark.asyncio
    async def test_run_with_contract_fails(self):
        """An unsatisfied `contains` invariant yields passed=False."""
        agent = _MockAgent(output="no match")
        contract = ContractConfig(
            name="c1",
            invariants=[
                ContractInvariantConfig(id="i1", type="contains", value="required_word"),
            ],
        )
        runner = ReplayRunner(agent, contract=contract)
        session = ReplaySessionConfig(id="s1", input="?", tool_responses=[], contract="c1")
        result = await runner.run(session, contract=contract)
        assert result.passed is False

View file

@ -206,6 +206,8 @@ class TestTestResults:
AgentConfig,
AgentType,
FlakeStormConfig,
InvariantConfig,
InvariantType,
)
return FlakeStormConfig(
@ -214,7 +216,7 @@ class TestTestResults:
type=AgentType.HTTP,
),
golden_prompts=["Test"],
invariants=[],
invariants=[InvariantConfig(type=InvariantType.LATENCY, max_ms=5000)],
)
@pytest.fixture
@ -259,6 +261,8 @@ class TestHTMLReportGenerator:
AgentConfig,
AgentType,
FlakeStormConfig,
InvariantConfig,
InvariantType,
)
return FlakeStormConfig(
@ -267,7 +271,7 @@ class TestHTMLReportGenerator:
type=AgentType.HTTP,
),
golden_prompts=["Test"],
invariants=[],
invariants=[InvariantConfig(type=InvariantType.LATENCY, max_ms=5000)],
)
@pytest.fixture
@ -360,6 +364,8 @@ class TestJSONReportGenerator:
AgentConfig,
AgentType,
FlakeStormConfig,
InvariantConfig,
InvariantType,
)
return FlakeStormConfig(
@ -368,7 +374,7 @@ class TestJSONReportGenerator:
type=AgentType.HTTP,
),
golden_prompts=["Test"],
invariants=[],
invariants=[InvariantConfig(type=InvariantType.LATENCY, max_ms=5000)],
)
@pytest.fixture
@ -452,6 +458,8 @@ class TestTerminalReporter:
AgentConfig,
AgentType,
FlakeStormConfig,
InvariantConfig,
InvariantType,
)
return FlakeStormConfig(
@ -460,7 +468,7 @@ class TestTerminalReporter:
type=AgentType.HTTP,
),
golden_prompts=["Test"],
invariants=[],
invariants=[InvariantConfig(type=InvariantType.LATENCY, max_ms=5000)],
)
@pytest.fixture