From f4d45d40531c90ed8a302623956d49bc2e9aa699 Mon Sep 17 00:00:00 2001 From: "Francisco M Humarang Jr." Date: Thu, 12 Mar 2026 20:05:51 +0800 Subject: [PATCH] Update documentation and configuration for Flakestorm V2, enhancing clarity on CI processes, report generation, and reproducibility features. Added details on the new `--output` option for saving reports, clarified the use of `--min-score`, and improved descriptions of the `seed` configuration for deterministic runs. Updated README and usage guides to reflect these changes and ensure comprehensive understanding of the CI pipeline and report outputs. --- .gitignore | 1 + README.md | 5 +- docs/API_SPECIFICATION.md | 13 +- docs/CONFIGURATION_GUIDE.md | 2 +- docs/DEVELOPER_FAQ.md | 2 +- docs/USAGE_GUIDE.md | 18 ++- examples/v2_research_agent/README.md | 4 + src/flakestorm/cli/main.py | 163 +++++++++++++++++++--- src/flakestorm/core/config.py | 3 +- src/flakestorm/core/orchestrator.py | 38 ++--- src/flakestorm/core/performance.py | 4 +- src/flakestorm/core/runner.py | 16 ++- src/flakestorm/reports/ci_report.py | 133 ++++++++++++++++++ src/flakestorm/reports/contract_report.py | 3 +- 14 files changed, 356 insertions(+), 49 deletions(-) create mode 100644 src/flakestorm/reports/ci_report.py diff --git a/.gitignore b/.gitignore index 426c8cb..98648ed 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ venv/ ENV/ env/ .env +examples/v2_research_agent/venv_sample # PyInstaller *.manifest diff --git a/README.md b/README.md index b2a6462..ab089d4 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ On top of that, Flakestorm still runs **adversarial prompt mutations** (22+ muta | **Chaos only** | `flakestorm run --chaos --chaos-only` | No mutations; golden prompts only, with chaos. Single chaos resilience score. | | **Contract only** | `flakestorm contract run` | Contract × chaos matrix; resilience score. 
| | **Replay only** | `flakestorm replay run path/to/replay.yaml -c flakestorm.yaml` | One or more replay sessions. | -| **ALL (full CI)** | `flakestorm ci` | Mutation run + contract (if configured) + chaos-only run (if chaos configured) + all replay sessions (if configured); then **overall** weighted score. | +| **ALL (full CI)** | `flakestorm ci` | Mutation run + contract (if configured) + chaos-only run (if chaos configured) + all replay sessions (if configured); then **overall** weighted score. Writes a **summary report** (e.g. `flakestorm-ci-report.html`) with per-phase scores and links to detailed reports; use `--output DIR` or `--output report.html` and `--min-score N`. | **Context attacks** are part of environment chaos: adversarial content is applied to **tool responses or to the input before invoke**, not to the user prompt itself. The chaos interceptor applies **memory_poisoning** to the user input before each invoke; LLM faults (timeout, truncated, empty, garbage, rate_limit, response_drift) are applied in the interceptor (timeout before the call, others after the response). Types: **indirect_injection** (tool returns valid-looking content with hidden instructions), **memory_poisoning** (payload into input before invoke; strategy `prepend` | `append` | `replace`), **system_prompt_leak_probe** (contract assertion using probe prompts). Config: list of attack configs or dict (e.g. `memory_poisoning: { payload: "...", strategy: "append" }`). Scenarios in the contract chaos matrix can each define `context_attacks`. See [Context Attacks](docs/CONTEXT_ATTACKS.md). @@ -158,7 +158,8 @@ For the full **V1 vs V2 flow** (mutation-only vs four pillars, contract matrix i - **Unified resilience score** — For full CI: weighted combination of **mutation robustness**, chaos resilience, contract compliance, and replay regression; weights (mutation, chaos, contract, replay) configurable in YAML and must sum to 1.0. 
- **Context attacks** — indirect_injection (into tool/context), memory_poisoning (into input before invoke; strategy: prepend/append/replace), system_prompt_leak_probe (contract assertion with probe prompts). Config: list or dict. [→ Context Attacks](docs/CONTEXT_ATTACKS.md) - **LLM providers** — Ollama, OpenAI, Anthropic, Google (Gemini); API keys via env only. [→ LLM Providers](docs/LLM_PROVIDERS.md) -- **Reports** — Interactive HTML and JSON; contract matrix and replay reports. +- **Reports** — Interactive HTML and JSON; contract matrix and replay reports. **`flakestorm ci`** writes a **summary report** (`flakestorm-ci-report.html`) with per-phase scores and **links to detailed reports** (mutation, contract, chaos, replay). Contract PASS/FAIL in the summary matches the contract detailed report (FAIL if any critical invariant fails). +- **Reproducible runs** — Set `advanced.seed` in config (e.g. `seed: 42`) for deterministic results: Python random is seeded (chaos behavior fixed) and the mutation-generation LLM uses temperature=0 so the same config yields the same scores run-to-run. **Try it:** [Working example](examples/v2_research_agent/README.md) with chaos, contracts, and replay from the CLI. diff --git a/docs/API_SPECIFICATION.md b/docs/API_SPECIFICATION.md index 83f0350..890b162 100644 --- a/docs/API_SPECIFICATION.md +++ b/docs/API_SPECIFICATION.md @@ -538,13 +538,24 @@ flakestorm replay export --from-report FILE # Export from an existing report ### V2: `flakestorm ci` -Run full CI pipeline: mutation run, contract run (if configured), chaos-only (if chaos configured), replay (if configured); then compute overall weighted score from `scoring.weights`. +Run full CI pipeline: mutation run, contract run (if configured), chaos-only (if chaos configured), replay (if configured); then compute overall weighted score from `scoring.weights`. Writes a **CI summary report** (e.g. 
`flakestorm-ci-report.html`) with per-phase scores and **"View detailed report"** links to phase-specific reports (mutation, contract, chaos, replay). Contract phase PASS/FAIL in the summary matches the contract detailed report (FAIL if any critical invariant fails). ```bash flakestorm ci flakestorm ci --config custom.yaml +flakestorm ci --min-score 0.5 # Fail if overall score below 0.5 +flakestorm ci --output ./reports # Save summary + detailed reports to directory +flakestorm ci --output report.html # Save summary report to file +flakestorm ci --quiet # Minimal output, no progress bars ``` +| Option | Description | +|--------|-------------| +| `--config`, `-c` | Config file path (default: `flakestorm.yaml`) | +| `--min-score` | Minimum overall (weighted) score to pass (default: 0.0) | +| `--output`, `-o` | Path to save reports: directory (creates `flakestorm-ci-report.html` + phase reports) or HTML file path | +| `--quiet`, `-q` | Minimal output, no progress bars | + --- ## Environment Variables diff --git a/docs/CONFIGURATION_GUIDE.md b/docs/CONFIGURATION_GUIDE.md index 209015b..d20ca2b 100644 --- a/docs/CONFIGURATION_GUIDE.md +++ b/docs/CONFIGURATION_GUIDE.md @@ -960,7 +960,7 @@ advanced: |--------|------|---------|-------------| | `concurrency` | integer | `10` | Max concurrent agent requests (1-100) | | `retries` | integer | `2` | Retry failed requests (0-5) | -| `seed` | integer | null | Random seed for reproducibility | +| `seed` | integer | null | **Reproducible runs:** when set, Python's random is seeded (chaos behavior fixed) and the mutation-generation LLM uses temperature=0 so the same config yields the same results run-to-run. Omit for exploratory, varying runs. | --- diff --git a/docs/DEVELOPER_FAQ.md b/docs/DEVELOPER_FAQ.md index 89ec0f1..3871fd6 100644 --- a/docs/DEVELOPER_FAQ.md +++ b/docs/DEVELOPER_FAQ.md @@ -107,7 +107,7 @@ This separation allows: ### Q: What does `flakestorm ci` run? 
-**A:** It runs, in order: (1) mutation run (with chaos if configured), (2) contract run if `contract` + `chaos_matrix` are configured, (3) chaos-only run if chaos is configured, (4) replay run if `replays` is configured. Then it computes an **overall weighted score** from `scoring.weights` (mutation, chaos, contract, replay); weights must sum to 1.0. Default weights: mutation 0.20, chaos 0.35, contract 0.35, replay 0.10. +**A:** It runs, in order: (1) mutation run (with chaos if configured), (2) contract run if `contract` + `chaos_matrix` are configured, (3) chaos-only run if chaos is configured, (4) replay run if `replays` is configured. Then it computes an **overall weighted score** from `scoring.weights` (mutation, chaos, contract, replay); weights must sum to 1.0. Default weights: mutation 0.20, chaos 0.35, contract 0.35, replay 0.10. It also writes a **CI summary report** (e.g. `flakestorm-ci-report.html`) with per-phase scores and links to **detailed reports** (mutation, contract, chaos, replay). Contract phase PASS/FAIL in the summary matches the contract detailed report (FAIL if any critical invariant fails). Use `--output` to control where reports are saved and `--min-score` for the overall pass threshold. --- diff --git a/docs/USAGE_GUIDE.md b/docs/USAGE_GUIDE.md index 19207e4..6e93cae 100644 --- a/docs/USAGE_GUIDE.md +++ b/docs/USAGE_GUIDE.md @@ -76,7 +76,7 @@ With **`version: "2.0"`** in your config, Flakestorm adds environment chaos, beh | **Behavioral contracts** | Contracts (invariants × severity) × chaos matrix scenarios; each cell is an independent run (optional reset per cell). | **Resilience score** (0–100%). Use `flakestorm contract run`. Per-contract formula: weighted by severity (critical×3, high×2, medium×1); **auto-FAIL** if any critical fails. | | **Replay regression** | Replay saved sessions (e.g. production incidents) and verify against a contract. | Per-session pass/fail; **replay regression** score when run via CI. 
Use `flakestorm replay run [path]`. | -**Unified CI:** `flakestorm ci` runs mutation run, contract run (if configured), chaos-only run (if chaos configured), and all replay sessions; then computes an **overall resilience score** from `scoring.weights` (default: mutation 0.20, chaos 0.35, contract 0.35, replay 0.10). Weights must sum to 1.0. +**Unified CI:** `flakestorm ci` runs mutation run, contract run (if configured), chaos-only run (if chaos configured), and all replay sessions; then computes an **overall resilience score** from `scoring.weights` (default: mutation 0.20, chaos 0.35, contract 0.35, replay 0.10). Weights must sum to 1.0. It writes a **CI summary report** (e.g. `flakestorm-ci-report.html`) with per-phase scores and links to **detailed reports** (mutation, contract, chaos, replay). Contract PASS/FAIL in the summary matches the contract detailed report (FAIL if any critical invariant fails). Use `--output DIR` or `--output report.html` and `--min-score N`. **Reports:** Use `flakestorm contract run --output report.html` and `flakestorm replay run --output report.html` to save HTML reports; both include **suggested actions** for failed cells or sessions (e.g. add reset_endpoint, tighten invariants). Replay accepts a single session file or a directory: `flakestorm replay run path/to/session.yaml` or `flakestorm replay run path/to/replays/`. @@ -1858,6 +1858,22 @@ advanced: retries: 3 # Retry failed requests 3 times ``` +### Reproducible Runs + +By default, mutation generation (LLM) and chaos (e.g. fault triggers, payload choice) can vary between runs, so scores may differ. For **deterministic, reproducible runs** (e.g. CI or regression checks), set a **random seed** in config: + +```yaml +advanced: + seed: 42 # Same config → same mutations and chaos → same scores +``` + +When `advanced.seed` is set: + +- **Python random** is seeded at run start, so chaos behavior (which faults trigger, which payloads) is fixed. 
+- The **mutation-generation LLM** uses temperature=0, so the same golden prompts produce the same mutations each run. + +Use a fixed seed when you need comparable run-to-run results; omit it for exploratory testing where variation is acceptable. + ### Golden Prompt Guide A comprehensive guide to creating effective golden prompts for your agent. diff --git a/examples/v2_research_agent/README.md b/examples/v2_research_agent/README.md index 0af7bf8..3643620 100644 --- a/examples/v2_research_agent/README.md +++ b/examples/v2_research_agent/README.md @@ -52,6 +52,9 @@ flakestorm replay export --from-report reports/report.json -o examples/v2_resear # Full CI run (mutation + contract + chaos + replay, overall weighted score) flakestorm ci -c examples/v2_research_agent/flakestorm.yaml --min-score 0.5 + +# CI with reports: summary + detailed phase reports (mutation, contract, chaos, replay) +flakestorm ci -c examples/v2_research_agent/flakestorm.yaml -o ./reports --min-score 0.5 ``` ## 3. What this example demonstrates @@ -63,6 +66,7 @@ flakestorm ci -c examples/v2_research_agent/flakestorm.yaml --min-score 0.5 | **Replay** | `replays.sessions` with `file: replays/incident_001.yaml`; contract resolved by name "Research Agent Contract" | | **Scoring** | `scoring` weights (mutation 20%, chaos 35%, contract 35%, replay 10%); used in `flakestorm ci` | | **Reset** | `agent.reset_endpoint: http://localhost:8790/reset` for contract matrix isolation | +| **Reproducibility** | Set `advanced.seed` (e.g. `42`) for deterministic chaos and mutation generation; same config → same scores. | ## 4. 
Config layout (v2.0) diff --git a/src/flakestorm/cli/main.py b/src/flakestorm/cli/main.py index 0351c98..f712eab 100644 --- a/src/flakestorm/cli/main.py +++ b/src/flakestorm/cli/main.py @@ -807,12 +807,24 @@ def ci( help="Path to configuration file", ), min_score: float = typer.Option(0.0, "--min-score", help="Minimum overall score"), + output: Path | None = typer.Option( + None, + "--output", + "-o", + help="Save reports to this path (file or directory). Saves CI summary and mutation report.", + ), + quiet: bool = typer.Option(False, "--quiet", "-q", help="Minimal output, no progress bars"), ) -> None: - """Run all configured modes and output unified exit code (v2).""" - asyncio.run(_ci_async(config, min_score)) + """Run all configured modes with interactive progress and optional report (v2).""" + asyncio.run(_ci_async(config, min_score, output, quiet)) -async def _ci_async(config: Path, min_score: float) -> None: +async def _ci_async( + config: Path, + min_score: float, + output: Path | None = None, + quiet: bool = False, +) -> None: from flakestorm.core.config import load_config cfg = load_config(config) exit_code = 0 @@ -825,11 +837,15 @@ async def _ci_async(config: Path, min_score: float) -> None: if cfg.replays and (cfg.replays.sessions or cfg.replays.sources): phases.append("replay") n_phases = len(phases) + show_progress = not quiet + matrix = None # contract phase result (for detailed report) + chaos_results = None # chaos phase result (for detailed report) + replay_report_results: list[dict] = [] # replay phase results (for detailed report) - # Run mutation tests + # Run mutation tests (with interactive progress like flakestorm run) idx = phases.index("mutation") + 1 if "mutation" in phases else 0 console.print(f"[bold blue][{idx}/{n_phases}] Mutation[/bold blue]") - runner = FlakeStormRunner(config=config, console=console, show_progress=False) + runner = FlakeStormRunner(config=config, console=console, show_progress=show_progress) results = await 
runner.run() mutation_score = results.statistics.robustness_score scores["mutation_robustness"] = mutation_score @@ -844,24 +860,34 @@ async def _ci_async(config: Path, min_score: float) -> None: console.print(f"[bold blue][{idx}/{n_phases}] Contract[/bold blue]") from flakestorm.core.protocol import create_agent_adapter, create_instrumented_adapter from flakestorm.contracts.engine import ContractEngine + from rich.progress import Progress, SpinnerColumn, TextColumn agent = create_agent_adapter(cfg.agent) if cfg.chaos: agent = create_instrumented_adapter(agent, cfg.chaos) engine = ContractEngine(cfg, cfg.contract, agent) - matrix = await engine.run() + if show_progress: + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + console=console, + ) as progress: + progress.add_task("Running contract matrix...", total=None) + matrix = await engine.run() + else: + matrix = await engine.run() contract_score = matrix.resilience_score / 100.0 scores["contract_compliance"] = contract_score console.print(f"[bold]Contract score:[/bold] {matrix.resilience_score:.1f}%") if not matrix.passed or matrix.resilience_score < min_score * 100: exit_code = 1 - # Chaos-only run when chaos configured + # Chaos-only run when chaos configured (with interactive progress) chaos_score = 1.0 if cfg.chaos: idx = phases.index("chaos") + 1 console.print(f"[bold blue][{idx}/{n_phases}] Chaos[/bold blue]") chaos_runner = FlakeStormRunner( - config=config, console=console, show_progress=False, + config=config, console=console, show_progress=show_progress, chaos_only=True, chaos=True, ) chaos_results = await chaos_runner.run() @@ -879,6 +905,7 @@ async def _ci_async(config: Path, min_score: float) -> None: from flakestorm.core.protocol import create_agent_adapter, create_instrumented_adapter from flakestorm.replay.loader import resolve_contract, resolve_sessions_from_config from flakestorm.replay.runner import ReplayRunner + from rich.progress import Progress, 
SpinnerColumn, TextColumn agent = create_agent_adapter(cfg.agent) if cfg.chaos: agent = create_instrumented_adapter(agent, cfg.chaos) @@ -887,22 +914,53 @@ async def _ci_async(config: Path, min_score: float) -> None: cfg.replays, config_path.parent, include_sources=True ) if sessions: - passed = 0 - total = 0 - for session in sessions: - contract = None - try: - contract = resolve_contract(session.contract, cfg, config_path.parent) - except FileNotFoundError: - pass - runner = ReplayRunner(agent, contract=contract) - result = await runner.run(session, contract=contract) - total += 1 - if result.passed: - passed += 1 - replay_score = passed / total if total else 1.0 + passed_count = 0 + total = len(sessions) + replay_report_results = [] + if show_progress: + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + console=console, + ) as progress: + task = progress.add_task("Replaying sessions...", total=total) + for session in sessions: + contract = None + try: + contract = resolve_contract(session.contract, cfg, config_path.parent) + except FileNotFoundError: + pass + runner = ReplayRunner(agent, contract=contract) + result = await runner.run(session, contract=contract) + if result.passed: + passed_count += 1 + replay_report_results.append({ + "id": getattr(session, "id", "") or "", + "name": getattr(session, "name", None) or getattr(session, "id", "") or "", + "passed": result.passed, + "verification_details": getattr(result, "verification_details", []) or [], + }) + progress.advance(task) + else: + for session in sessions: + contract = None + try: + contract = resolve_contract(session.contract, cfg, config_path.parent) + except FileNotFoundError: + pass + runner = ReplayRunner(agent, contract=contract) + result = await runner.run(session, contract=contract) + if result.passed: + passed_count += 1 + replay_report_results.append({ + "id": getattr(session, "id", "") or "", + "name": getattr(session, "name", None) or 
getattr(session, "id", "") or "", + "passed": result.passed, + "verification_details": getattr(result, "verification_details", []) or [], + }) + replay_score = passed_count / total if total else 1.0 scores["replay_regression"] = replay_score - console.print(f"[bold]Replay score:[/bold] {replay_score:.1%} ({passed}/{total})") + console.print(f"[bold]Replay score:[/bold] {replay_score:.1%} ({passed_count}/{total})") if replay_score < min_score: exit_code = 1 @@ -914,9 +972,68 @@ async def _ci_async(config: Path, min_score: float) -> None: used_w = [w[k] for k in scores if k in w] used_s = [scores[k] for k in scores if k in w] overall = calculate_overall_resilience(used_s, used_w) + passed = overall >= min_score console.print(f"[bold]Overall (weighted):[/bold] {overall:.1%}") if overall < min_score: exit_code = 1 + + # Generate reports: use --output if set, else config output.path (so CI always produces reports) + report_dir_or_file = output if output is not None else Path(cfg.output.path) + from datetime import datetime + from flakestorm.reports.html import HTMLReportGenerator + from flakestorm.reports.ci_report import save_ci_report + from flakestorm.reports.contract_report import save_contract_report + from flakestorm.reports.replay_report import save_replay_report + output_path = Path(report_dir_or_file) + if output_path.suffix.lower() in (".html", ".htm"): + report_dir = output_path.parent + ci_report_path = output_path + else: + report_dir = output_path + report_dir.mkdir(parents=True, exist_ok=True) + ci_report_path = report_dir / "flakestorm-ci-report.html" + ts = datetime.now().strftime("%Y%m%d-%H%M%S") + report_links: dict[str, str] = {} + + # Mutation detailed report (always) + mutation_report_path = report_dir / f"flakestorm-mutation-{ts}.html" + HTMLReportGenerator(results).save(mutation_report_path) + report_links["mutation_robustness"] = mutation_report_path.name + + # Contract detailed report (with suggested actions for failed cells) + if matrix is not 
None: + contract_report_path = report_dir / f"flakestorm-contract-{ts}.html" + save_contract_report(matrix, contract_report_path, title="Contract Resilience Report (CI)") + report_links["contract_compliance"] = contract_report_path.name + + # Chaos detailed report (same format as mutation) + if chaos_results is not None: + chaos_report_path = report_dir / f"flakestorm-chaos-{ts}.html" + HTMLReportGenerator(chaos_results).save(chaos_report_path) + report_links["chaos_resilience"] = chaos_report_path.name + + # Replay detailed report (with suggested actions for failed sessions) + if replay_report_results: + replay_report_path = report_dir / f"flakestorm-replay-{ts}.html" + save_replay_report(replay_report_results, replay_report_path, title="Replay Regression Report (CI)") + report_links["replay_regression"] = replay_report_path.name + + # Contract phase: summary status must match detailed report (FAIL if any critical invariant failed) + phase_overall_passed: dict[str, bool] = {} + if matrix is not None: + phase_overall_passed["contract_compliance"] = matrix.passed + save_ci_report(scores, overall, passed, ci_report_path, min_score=min_score, report_links=report_links, phase_overall_passed=phase_overall_passed) + if not quiet: + console.print() + console.print(f"[green]CI summary:[/green] {ci_report_path}") + console.print(f"[green]Mutation (detailed):[/green] {mutation_report_path}") + if matrix is not None: + console.print(f"[green]Contract (detailed, with recommendations):[/green] {report_dir / report_links.get('contract_compliance', '')}") + if chaos_results is not None: + console.print(f"[green]Chaos (detailed):[/green] {report_dir / report_links.get('chaos_resilience', '')}") + if replay_report_results: + console.print(f"[green]Replay (detailed, with recommendations):[/green] {report_dir / report_links.get('replay_regression', '')}") + raise typer.Exit(exit_code) diff --git a/src/flakestorm/core/config.py b/src/flakestorm/core/config.py index 54426f4..bfb2c53 
100644 --- a/src/flakestorm/core/config.py +++ b/src/flakestorm/core/config.py @@ -368,7 +368,8 @@ class AdvancedConfig(BaseModel): default=2, ge=0, le=5, description="Number of retries for failed requests" ) seed: int | None = Field( - default=None, description="Random seed for reproducibility" + default=None, + description="Random seed for reproducible runs. When set: Python random is seeded (chaos behavior fixed) and mutation-generation LLM uses temperature=0 so the same config yields the same results.", ) diff --git a/src/flakestorm/core/orchestrator.py b/src/flakestorm/core/orchestrator.py index 2061733..e7b4eef 100644 --- a/src/flakestorm/core/orchestrator.py +++ b/src/flakestorm/core/orchestrator.py @@ -84,21 +84,25 @@ class Orchestrator: console: Console | None = None, show_progress: bool = True, chaos_only: bool = False, + preflight_agent: BaseAgentAdapter | None = None, ): """ Initialize the orchestrator. Args: config: flakestorm configuration - agent: Agent adapter to test + agent: Agent adapter to test (used for the actual run) mutation_engine: Engine for generating mutations verifier: Invariant verification engine console: Rich console for output show_progress: Whether to show progress bars chaos_only: If True, run only golden prompts (no mutation generation) + preflight_agent: If set, use this adapter for pre-flight check only (e.g. raw + agent when agent is chaos-wrapped, so validation does not fail on injected 503). """ self.config = config self.agent = agent + self.preflight_agent = preflight_agent self.mutation_engine = mutation_engine self.verifier = verifier self.console = console or Console() @@ -254,31 +258,33 @@ class Orchestrator: ) self.console.print() - # Test the first golden prompt + # Test the first golden prompt (use preflight_agent when set, e.g. 
raw agent for + # chaos_only so we don't fail on chaos-injected 503) if self.show_progress: self.console.print(" Testing with first golden prompt...", style="dim") - response = await self.agent.invoke_with_timing(test_prompt) + agent_for_preflight = self.preflight_agent if self.preflight_agent is not None else self.agent + response = await agent_for_preflight.invoke_with_timing(test_prompt) if not response.success or response.error: error_msg = response.error or "Unknown error" prompt_preview = ( test_prompt[:50] + "..." if len(test_prompt) > 50 else test_prompt ) - - if self.show_progress: - self.console.print() - self.console.print( - Panel( - f"[red]Agent validation failed![/red]\n\n" - f"[yellow]Test prompt:[/yellow] {prompt_preview}\n" - f"[yellow]Error:[/yellow] {error_msg}\n\n" - f"[dim]Please fix the agent errors (e.g., missing API keys, configuration issues) " - f"before running mutations. This prevents wasting time on a broken agent.[/dim]", - title="[red]Pre-flight Check Failed[/red]", - border_style="red", - ) + # Always print failure details so user sees the real error (e.g. connection refused) + # even when show_progress=False (e.g. flakestorm ci) + self.console.print() + self.console.print( + Panel( + f"[red]Agent validation failed![/red]\n\n" + f"[yellow]Test prompt:[/yellow] {prompt_preview}\n" + f"[yellow]Error:[/yellow] {error_msg}\n\n" + f"[dim]Please fix the agent errors (e.g., missing API keys, configuration issues) " + f"before running mutations. 
This prevents wasting time on a broken agent.[/dim]", + title="[red]Pre-flight Check Failed[/red]", + border_style="red", ) + ) return False else: if self.show_progress: diff --git a/src/flakestorm/core/performance.py b/src/flakestorm/core/performance.py index 2944cee..042e171 100644 --- a/src/flakestorm/core/performance.py +++ b/src/flakestorm/core/performance.py @@ -210,7 +210,9 @@ def calculate_overall_resilience(scores: list[float], weights: list[float]) -> f Weighted average for mutation_robustness, chaos_resilience, contract_compliance, replay_regression. """ if _RUST_AVAILABLE: - return flakestorm_rust.calculate_overall_resilience(scores, weights) + rust_fn = getattr(flakestorm_rust, "calculate_overall_resilience", None) + if rust_fn is not None: + return rust_fn(scores, weights) n = min(len(scores), len(weights)) if n == 0: diff --git a/src/flakestorm/core/runner.py b/src/flakestorm/core/runner.py index a8b4513..bdf0fda 100644 --- a/src/flakestorm/core/runner.py +++ b/src/flakestorm/core/runner.py @@ -7,6 +7,7 @@ and provides a simple API for executing reliability tests. 
from __future__ import annotations +import random from pathlib import Path from typing import TYPE_CHECKING @@ -65,6 +66,10 @@ class FlakeStormRunner: else: self.config = config + # Reproducibility: fix Python random seed so chaos and any sampling are deterministic + if self.config.advanced.seed is not None: + random.seed(self.config.advanced.seed) + self.chaos_only = chaos_only # Load chaos profile if requested @@ -108,9 +113,17 @@ class FlakeStormRunner: self.agent = create_instrumented_adapter(base_agent, self.config.chaos) else: self.agent = base_agent - self.mutation_engine = MutationEngine(self.config.model) + # When seed is set, use temperature=0 for mutation generation so same prompts → same mutations + model_cfg = self.config.model + if self.config.advanced.seed is not None: + model_cfg = model_cfg.model_copy(update={"temperature": 0.0}) + self.mutation_engine = MutationEngine(model_cfg) self.verifier = InvariantVerifier(self.config.invariants) + # When agent is chaos-wrapped, pre-flight must use the raw agent so we don't fail on + # chaos-injected 503 (e.g. in CI mutation phase or chaos_only phase). 
+ preflight_agent = base_agent if self.config.chaos else None + # Create orchestrator self.orchestrator = Orchestrator( config=self.config, @@ -118,6 +131,7 @@ class FlakeStormRunner: mutation_engine=self.mutation_engine, verifier=self.verifier, console=self.console, + preflight_agent=preflight_agent, show_progress=self.show_progress, chaos_only=chaos_only, ) diff --git a/src/flakestorm/reports/ci_report.py b/src/flakestorm/reports/ci_report.py new file mode 100644 index 0000000..9ed7ab3 --- /dev/null +++ b/src/flakestorm/reports/ci_report.py @@ -0,0 +1,133 @@ +"""HTML report for flakestorm ci (all phases + overall score).""" + +from __future__ import annotations + +from datetime import datetime +from pathlib import Path +from typing import Any + + +def _escape(s: Any) -> str: + if s is None: + return "" + t = str(s) + return ( + t.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + ) + + +def generate_ci_report_html( + phase_scores: dict[str, float], + overall: float, + passed: bool, + min_score: float = 0.0, + timestamp: str | None = None, + report_links: dict[str, str] | None = None, + phase_overall_passed: dict[str, bool] | None = None, +) -> str: + """Generate HTML for the CI run: phase scores, overall, and links to detailed reports. + phase_overall_passed: when a phase has its own pass/fail (e.g. contract: critical fail = FAIL), + pass False for that key so the summary matches the detailed report.""" + timestamp = timestamp or datetime.now().strftime("%Y-%m-%d %H:%M:%S") + report_links = report_links or {} + phase_overall_passed = phase_overall_passed or {} + phase_names = { + "mutation_robustness": "Mutation", + "chaos_resilience": "Chaos", + "contract_compliance": "Contract", + "replay_regression": "Replay", + } + rows = [] + for key, score in phase_scores.items(): + name = phase_names.get(key, key.replace("_", " ").title()) + pct = round(score * 100, 1) + # Fail if score below threshold OR phase has its own fail (e.g. 
contract critical failure) + phase_passed = phase_overall_passed.get(key, True) + row_failed = score < min_score or phase_passed is False + status = "FAIL" if row_failed else "PASS" + row_class = "fail" if row_failed else "" + link = report_links.get(key) + link_cell = f'View detailed report' if link else "" + rows.append( + f'{_escape(name)}{pct}%{status}{link_cell}' + ) + body = "\n".join(rows) + overall_pct = round(overall * 100, 1) + overall_status = "PASS" if passed else "FAIL" + overall_class = "fail" if not passed else "" + + return f""" + + + + +flakestorm CI Report - {_escape(timestamp)} + + + +
+<body>
+<h1>flakestorm CI Report</h1>
+<p class="meta">Run at {_escape(timestamp)} · min score: {min_score:.0%}</p>
+<p class="note">Each phase has a detailed report with failure reasons and recommended next steps. Use the links below to inspect failures.</p>
+<table>
+<tr><th>Phase</th><th>Score</th><th>Status</th><th>Detailed report</th></tr>
+{body}
+</table>
+<p class="overall {overall_class}">Overall (weighted): {overall_pct}% — {overall_status}</p>
+</body>
+</html>
+ + +""" + + +def save_ci_report( + phase_scores: dict[str, float], + overall: float, + passed: bool, + path: Path, + min_score: float = 0.0, + report_links: dict[str, str] | None = None, + phase_overall_passed: dict[str, bool] | None = None, +) -> Path: + """Write CI report HTML to path. report_links: phase key -> filename. phase_overall_passed: phase key -> False when phase failed (e.g. contract critical fail).""" + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + html = generate_ci_report_html( + phase_scores=phase_scores, + overall=overall, + passed=passed, + min_score=min_score, + report_links=report_links, + phase_overall_passed=phase_overall_passed, + ) + path.write_text(html, encoding="utf-8") + return path diff --git a/src/flakestorm/reports/contract_report.py b/src/flakestorm/reports/contract_report.py index 5d7b87e..00f3bd7 100644 --- a/src/flakestorm/reports/contract_report.py +++ b/src/flakestorm/reports/contract_report.py @@ -57,7 +57,7 @@ def generate_contract_html(matrix: "ResilienceMatrix", title: str = "Contract Re suggestions_html = "" if failed_cells: suggestions_html = """ -

<h3>Suggested actions (failed cells)</h3>
+<h3>Recommended next steps</h3>

The following actions may help fix the failed contract cells: