# flakestorm/examples/v2_research_agent/flakestorm.yaml

# Flakestorm v2.0 — Research Assistant Example
# Demonstrates: mutation testing, chaos, behavioral contract, replay, CI

# Quoted so YAML loads the string "2.0" rather than the float 2.0.
version: "2.0"

# -----------------------------------------------------------------------------
# Agent (HTTP). Start with: python agent.py  (or uvicorn agent:app --port 8790)
# -----------------------------------------------------------------------------
# HTTP agent under test. Both URLs below point at localhost:8790.
agent:
  endpoint: "http://localhost:8790/invoke"
  type: "http"
  method: "POST"
  # JSON request body; `{prompt}` is presumably replaced with each test prompt.
  # NOTE(review): if substitution is purely textual, a prompt containing `"`
  # or `\` could produce invalid JSON — confirm how the value is escaped.
  request_template: '{"input": "{prompt}"}'
  # Presumably the key in the JSON response that holds the agent's answer —
  # TODO confirm.
  response_path: "result"
  # Presumably milliseconds — TODO confirm.
  # NOTE(review): both latency limits in this file (max_ms 30000 and 60000)
  # are larger than this value; check whether they can ever be reached.
  timeout: 15000
  # Second URL on the same host/port; presumably called to clear agent state
  # between runs — TODO confirm.
  reset_endpoint: "http://localhost:8790/reset"

# -----------------------------------------------------------------------------
# Model (for mutation generation only)
# -----------------------------------------------------------------------------
# Model settings; the section banner states it is used for mutation
# generation only.
model:
  provider: "ollama"
  name: "gemma3:1b"
  # NOTE(review): presumably a locally running Ollama server on port 11434 —
  # confirm it is started and the model above is available before a run.
  base_url: "http://localhost:11434"

# -----------------------------------------------------------------------------
# Mutations
# -----------------------------------------------------------------------------
mutations:
  # NOTE(review): not clear from this file whether this is per golden prompt,
  # per mutation type, or a total — TODO confirm.
  count: 5
  # Mutation kinds to generate; the names are presumably fixed identifiers
  # defined by the tool — verify spelling against its documentation.
  types:
    - paraphrase
    - noise
    - tone_shift
    - prompt_injection

# -----------------------------------------------------------------------------
# Golden prompts
# -----------------------------------------------------------------------------
# Baseline prompts; presumably the inputs from which mutations are derived —
# TODO confirm.
golden_prompts:
  - "What is the capital of France?"
  - "Summarize the benefits of renewable energy."

# -----------------------------------------------------------------------------
# Invariants (run-level checks; contract invariants are defined under `contract`)
# -----------------------------------------------------------------------------
invariants:
  # Latency ceiling; the key name indicates milliseconds (30000 ms = 30 s).
  - type: latency
    max_ms: 30000
  # Presumably requires the text "source" to appear in the response.
  # NOTE(review): may be case-sensitive, unlike the contract's "(?i)" regex
  # that also looks for "source" — confirm.
  - type: contains
    value: "source"
  - type: output_not_empty

# -----------------------------------------------------------------------------
# V2: Environment Chaos (tool/LLM faults)
# For HTTP agent, tool_faults with tool "*" apply to the single request to endpoint.
# -----------------------------------------------------------------------------
chaos:
  tool_faults:
    # "*" must stay quoted: an unquoted * would be parsed as a YAML alias.
    - tool: "*"
      mode: error
      error_code: 503
      message: "Service Unavailable"
      # Presumably the chance (0.0-1.0) that the fault fires on a given
      # call — TODO confirm.
      probability: 0.3
  llm_faults:
    - mode: truncated_response
      # Presumably the length the response is cut to — TODO confirm.
      max_tokens: 5
      probability: 0.2

# -----------------------------------------------------------------------------
# V2: Behavioral Contract + Chaos Matrix
# -----------------------------------------------------------------------------
# Named contract: three invariants plus a matrix of two chaos scenarios.
contract:
  name: "Research Agent Contract"
  description: "Must cite source and complete under chaos"
  invariants:
    - id: always-cite-source
      type: regex
      # "(?i)" makes the match case-insensitive; the pattern accepts either
      # "source" or "according to".
      pattern: "(?i)(source|according to)"
      severity: critical
      when: always
      description: "Must cite a source"
    - id: completes
      type: completes
      severity: high
      when: always
      description: "Must return a response"
    # This entry has no `description`, unlike the two above.
    - id: max-latency
      type: latency
      # 60000 ms = 60 s, twice the 30000 ms top-level latency invariant.
      max_ms: 60000
      severity: medium
      when: always
  chaos_matrix:
    # Baseline scenario: both fault lists are explicitly empty.
    - name: "no-chaos"
      tool_faults: []
      llm_faults: []
    # NOTE(review): this scenario sets no `llm_faults` and no `probability`
    # (the top-level `chaos` fault uses 0.3) — presumably defaults apply;
    # confirm what they are.
    - name: "api-outage"
      tool_faults:
        - tool: "*"
          mode: error
          error_code: 503
          message: "Service Unavailable"

# -----------------------------------------------------------------------------
# V2: Replay regression (sessions can reference file or be inline)
# -----------------------------------------------------------------------------
replays:
  sessions:
    # Relative path. NOTE(review): presumably resolved against this config
    # file's directory or the working directory — confirm which, and that
    # the file exists.
    - file: "replays/incident_001.yaml"

# -----------------------------------------------------------------------------
# V2: Scoring weights (overall = mutation*0.2 + chaos*0.35 + contract*0.35 + replay*0.1)
# -----------------------------------------------------------------------------
# The four weights sum to 1.00 (0.20 + 0.35 + 0.35 + 0.10).
# Keep the total at 1.00 when changing any of them.
scoring:
  mutation: 0.20
  chaos: 0.35
  contract: 0.35
  replay: 0.10

# -----------------------------------------------------------------------------
# Output
# -----------------------------------------------------------------------------
output:
  format: "html"
  # Relative directory; presumably where reports are written, resolved from
  # the working directory — TODO confirm.
  path: "./reports"

# Execution tuning.
advanced:
  # Presumably the maximum number of requests in flight at once — TODO confirm.
  concurrency: 5
  # NOTE(review): presumably retries per failed request; check how this
  # interacts with injected chaos faults, which retries could mask.
  retries: 2