# File: flakestorm/examples/v2_research_agent/flakestorm.yaml
# (repository viewer metadata: 129 lines, 4 KiB, YAML)

# Flakestorm v2.0 — Research Assistant Example
# Demonstrates: mutation testing, chaos, behavioral contract, replay, ci
# Quoted so the value stays the string "2.0" instead of being read as a float.
version: "2.0"
# -----------------------------------------------------------------------------
# Agent (HTTP). Start with: python agent.py (or uvicorn agent:app --port 8790)
# -----------------------------------------------------------------------------
agent:
  # All settings below belong to the agent mapping; they must be indented
  # under it, otherwise they parse as unrelated top-level keys.
  endpoint: "http://localhost:8790/invoke"
  type: "http"
  method: "POST"
  # Single-quoted so the embedded JSON double quotes need no escaping.
  # {prompt} is presumably replaced with each test prompt — confirm.
  request_template: '{"input": "{prompt}"}'
  # Presumably the field of the response body that holds the agent's
  # answer — confirm against the agent implementation.
  response_path: "result"
  # NOTE(review): unit is not stated here; looks like milliseconds — confirm.
  timeout: 15000
  reset_endpoint: "http://localhost:8790/reset"
# -----------------------------------------------------------------------------
# Model (for mutation generation only)
# -----------------------------------------------------------------------------
model:
  # Provider, model name and server URL are nested under model so that
  # name does not collide with name keys used in other sections.
  provider: "ollama"
  name: "gemma3:1b"
  base_url: "http://localhost:11434"
# -----------------------------------------------------------------------------
# Mutations
# -----------------------------------------------------------------------------
mutations:
  # NOTE(review): presumably the number of mutations generated per golden
  # prompt — confirm against the tool's documentation.
  count: 5
  # Mutation strategies to apply.
  types:
    - paraphrase
    - noise
    - tone_shift
    - prompt_injection
# -----------------------------------------------------------------------------
# Golden prompts
# -----------------------------------------------------------------------------
# Baseline prompts; each entry is a plain string.
golden_prompts:
  - "What is the capital of France?"
  - "Summarize the benefits of renewable energy."
# -----------------------------------------------------------------------------
# Invariants (run invariants)
# -----------------------------------------------------------------------------
invariants:
  # Each list item is one check; its parameters are indented under the
  # item so they stay attached to the matching type.
  # NOTE(review): agent.timeout is 15000; if that is also milliseconds,
  # this 30000 ms limit can never be the binding constraint — confirm.
  - type: latency
    max_ms: 30000
  - type: contains
    value: "source"
  - type: output_not_empty
# -----------------------------------------------------------------------------
# V2: Environment Chaos (tool/LLM faults)
# For HTTP agent, tool_faults with tool "*" apply to the single request to endpoint.
# -----------------------------------------------------------------------------
chaos:
  # Faults applied to tool calls. The tool value is quoted because a bare
  # * would be parsed as a YAML alias indicator.
  tool_faults:
    - tool: "*"
      mode: error
      error_code: 503
      message: "Service Unavailable"
      probability: 0.3
  # Faults applied to LLM responses.
  llm_faults:
    - mode: truncated_response
      max_tokens: 5
      probability: 0.2
# -----------------------------------------------------------------------------
# V2: Behavioral Contract + Chaos Matrix
# -----------------------------------------------------------------------------
contract:
  name: "Research Agent Contract"
  description: "Must cite source and complete under chaos"
  # Contract invariants: each item carries its own id, type, severity and
  # condition, so the fields are indented under the list item.
  invariants:
    - id: always-cite-source
      type: regex
      # (?i) makes the match case-insensitive.
      pattern: "(?i)(source|according to)"
      severity: critical
      when: always
      description: "Must cite a source"
    - id: completes
      type: completes
      severity: high
      when: always
      description: "Must return a response"
    - id: max-latency
      type: latency
      max_ms: 60000
      severity: medium
      when: always
  # Named chaos scenarios.
  # NOTE(review): nested under contract per the section heading
  # ("Behavioral Contract + Chaos Matrix") — confirm against the schema.
  chaos_matrix:
    # Baseline scenario: explicit empty lists, not null.
    - name: "no-chaos"
      tool_faults: []
      llm_faults: []
    - name: "api-outage"
      tool_faults:
        - tool: "*"
          mode: error
          error_code: 503
          message: "Service Unavailable"
# -----------------------------------------------------------------------------
# V2: Replay regression (sessions can reference file or be inline)
# -----------------------------------------------------------------------------
replays:
  # Each session entry here points at a recorded session file.
  # NOTE(review): path is presumably relative to this config — confirm.
  sessions:
    - file: "replays/incident_001.yaml"
# -----------------------------------------------------------------------------
# V2: Scoring weights (overall = mutation*0.2 + chaos*0.35 + contract*0.35 + replay*0.1)
# -----------------------------------------------------------------------------
scoring:
  # Per-component weights; the four values sum to 1.0.
  mutation: 0.20
  chaos: 0.35
  contract: 0.35
  replay: 0.10
# -----------------------------------------------------------------------------
# Output
# -----------------------------------------------------------------------------
output:
  # Report format and the directory it is written to.
  format: "html"
  path: "./reports"
# NOTE(review): kept as a top-level section rather than a child of output;
# the flattened original does not show which — confirm against the schema.
advanced:
  concurrency: 5
  retries: 2