# Mirror of https://github.com/flakestorm/flakestorm.git
# Synced 2026-04-25 00:36:54 +02:00
# 129 lines, 4 KiB, YAML
# Flakestorm v2.0 — Research Assistant Example
# Demonstrates: mutation testing, chaos, behavioral contract, replay, ci

# Quoted so the value loads as the string "2.0" rather than a float.
version: "2.0"
# -----------------------------------------------------------------------------
# Agent (HTTP). Start with: python agent.py (or uvicorn agent:app --port 8790)
# -----------------------------------------------------------------------------
agent:
  endpoint: "http://localhost:8790/invoke"
  type: "http"
  method: "POST"
  # Single-quoted so the embedded JSON double quotes need no escaping.
  # NOTE(review): {prompt} is presumably replaced with the prompt text — confirm.
  request_template: '{"input": "{prompt}"}'
  response_path: "result"
  # NOTE(review): unit is not stated here; presumably milliseconds — confirm.
  timeout: 15000
  reset_endpoint: "http://localhost:8790/reset"
# -----------------------------------------------------------------------------
# Model (for mutation generation only)
# -----------------------------------------------------------------------------
model:
  provider: "ollama"
  # Quoted: the name contains a colon.
  name: "gemma3:1b"
  base_url: "http://localhost:11434"
# -----------------------------------------------------------------------------
# Mutations
# -----------------------------------------------------------------------------
mutations:
  # NOTE(review): presumably the number of mutations generated per golden
  # prompt — confirm against the Flakestorm schema.
  count: 5
  # Mutation strategies to apply.
  types:
    - paraphrase
    - noise
    - tone_shift
    - prompt_injection
# -----------------------------------------------------------------------------
# Golden prompts
# -----------------------------------------------------------------------------
golden_prompts:
  - "What is the capital of France?"
  - "Summarize the benefits of renewable energy."
# -----------------------------------------------------------------------------
# Invariants (run invariants)
# -----------------------------------------------------------------------------
# Each list item is one check; its parameters are sibling keys of `type`.
invariants:
  - type: latency
    max_ms: 30000
  - type: contains
    value: "source"
  - type: output_not_empty
# -----------------------------------------------------------------------------
# V2: Environment Chaos (tool/LLM faults)
# For HTTP agent, tool_faults with tool "*" apply to the single request to endpoint.
# -----------------------------------------------------------------------------
chaos:
  tool_faults:
    # "*" must stay quoted: a bare * would be parsed as a YAML alias.
    - tool: "*"
      mode: error
      error_code: 503
      message: "Service Unavailable"
      probability: 0.3
  llm_faults:
    - mode: truncated_response
      max_tokens: 5
      probability: 0.2
# -----------------------------------------------------------------------------
# V2: Behavioral Contract + Chaos Matrix
# -----------------------------------------------------------------------------
contract:
  name: "Research Agent Contract"
  description: "Must cite source and complete under chaos"
  # Contract-level invariants; each carries an id, a severity and a `when`.
  invariants:
    - id: always-cite-source
      type: regex
      pattern: "(?i)(source|according to)"
      severity: critical
      when: always
      description: "Must cite a source"
    - id: completes
      type: completes
      severity: high
      when: always
      description: "Must return a response"
    - id: max-latency
      type: latency
      max_ms: 60000
      severity: medium
      when: always
  # NOTE(review): chaos_matrix is nested under `contract` based on the section
  # heading above; the original indentation was lost — confirm placement.
  chaos_matrix:
    # Baseline scenario: explicit empty lists, i.e. no faults.
    - name: "no-chaos"
      tool_faults: []
      llm_faults: []
    - name: "api-outage"
      tool_faults:
        # "*" must stay quoted: a bare * would be parsed as a YAML alias.
        - tool: "*"
          mode: error
          error_code: 503
          message: "Service Unavailable"
# -----------------------------------------------------------------------------
# V2: Replay regression (sessions can reference file or be inline)
# -----------------------------------------------------------------------------
replays:
  sessions:
    # This session is loaded by file reference rather than defined inline.
    - file: "replays/incident_001.yaml"
# -----------------------------------------------------------------------------
# V2: Scoring weights (overall = mutation*0.2 + chaos*0.35 + contract*0.35 + replay*0.1)
# -----------------------------------------------------------------------------
# The four weights below sum to 1.0.
scoring:
  mutation: 0.20
  chaos: 0.35
  contract: 0.35
  replay: 0.10
# -----------------------------------------------------------------------------
# Output
# -----------------------------------------------------------------------------
output:
  format: "html"
  path: "./reports"
advanced:
  concurrency: 5
  retries: 2