From f4d45d40531c90ed8a302623956d49bc2e9aa699 Mon Sep 17 00:00:00 2001 From: "Francisco M Humarang Jr." Date: Thu, 12 Mar 2026 20:05:51 +0800 Subject: [PATCH] Update documentation and configuration for Flakestorm V2, enhancing clarity on CI processes, report generation, and reproducibility features. Added details on the new `--output` option for saving reports, clarified the use of `--min-score`, and improved descriptions of the `seed` configuration for deterministic runs. Updated README and usage guides to reflect these changes and ensure comprehensive understanding of the CI pipeline and report outputs. --- .gitignore | 1 + README.md | 5 +- docs/API_SPECIFICATION.md | 13 +- docs/CONFIGURATION_GUIDE.md | 2 +- docs/DEVELOPER_FAQ.md | 2 +- docs/USAGE_GUIDE.md | 18 ++- examples/v2_research_agent/README.md | 4 + src/flakestorm/cli/main.py | 163 +++++++++++++++++++--- src/flakestorm/core/config.py | 3 +- src/flakestorm/core/orchestrator.py | 38 ++--- src/flakestorm/core/performance.py | 4 +- src/flakestorm/core/runner.py | 16 ++- src/flakestorm/reports/ci_report.py | 133 ++++++++++++++++++ src/flakestorm/reports/contract_report.py | 3 +- 14 files changed, 356 insertions(+), 49 deletions(-) create mode 100644 src/flakestorm/reports/ci_report.py diff --git a/.gitignore b/.gitignore index 426c8cb..98648ed 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ venv/ ENV/ env/ .env +examples/v2_research_agent/venv_sample # PyInstaller *.manifest diff --git a/README.md b/README.md index b2a6462..ab089d4 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ On top of that, Flakestorm still runs **adversarial prompt mutations** (22+ muta | **Chaos only** | `flakestorm run --chaos --chaos-only` | No mutations; golden prompts only, with chaos. Single chaos resilience score. | | **Contract only** | `flakestorm contract run` | Contract × chaos matrix; resilience score. 
| | **Replay only** | `flakestorm replay run path/to/replay.yaml -c flakestorm.yaml` | One or more replay sessions. | -| **ALL (full CI)** | `flakestorm ci` | Mutation run + contract (if configured) + chaos-only run (if chaos configured) + all replay sessions (if configured); then **overall** weighted score. | +| **ALL (full CI)** | `flakestorm ci` | Mutation run + contract (if configured) + chaos-only run (if chaos configured) + all replay sessions (if configured); then **overall** weighted score. Writes a **summary report** (e.g. `flakestorm-ci-report.html`) with per-phase scores and links to detailed reports; use `--output DIR` or `--output report.html` and `--min-score N`. | **Context attacks** are part of environment chaos: adversarial content is applied to **tool responses or to the input before invoke**, not to the user prompt itself. The chaos interceptor applies **memory_poisoning** to the user input before each invoke; LLM faults (timeout, truncated, empty, garbage, rate_limit, response_drift) are applied in the interceptor (timeout before the call, others after the response). Types: **indirect_injection** (tool returns valid-looking content with hidden instructions), **memory_poisoning** (payload into input before invoke; strategy `prepend` | `append` | `replace`), **system_prompt_leak_probe** (contract assertion using probe prompts). Config: list of attack configs or dict (e.g. `memory_poisoning: { payload: "...", strategy: "append" }`). Scenarios in the contract chaos matrix can each define `context_attacks`. See [Context Attacks](docs/CONTEXT_ATTACKS.md). @@ -158,7 +158,8 @@ For the full **V1 vs V2 flow** (mutation-only vs four pillars, contract matrix i - **Unified resilience score** — For full CI: weighted combination of **mutation robustness**, chaos resilience, contract compliance, and replay regression; weights (mutation, chaos, contract, replay) configurable in YAML and must sum to 1.0. 
- **Context attacks** — indirect_injection (into tool/context), memory_poisoning (into input before invoke; strategy: prepend/append/replace), system_prompt_leak_probe (contract assertion with probe prompts). Config: list or dict. [→ Context Attacks](docs/CONTEXT_ATTACKS.md) - **LLM providers** — Ollama, OpenAI, Anthropic, Google (Gemini); API keys via env only. [→ LLM Providers](docs/LLM_PROVIDERS.md) -- **Reports** — Interactive HTML and JSON; contract matrix and replay reports. +- **Reports** — Interactive HTML and JSON; contract matrix and replay reports. **`flakestorm ci`** writes a **summary report** (`flakestorm-ci-report.html`) with per-phase scores and **links to detailed reports** (mutation, contract, chaos, replay). Contract PASS/FAIL in the summary matches the contract detailed report (FAIL if any critical invariant fails). +- **Reproducible runs** — Set `advanced.seed` in config (e.g. `seed: 42`) for deterministic results: Python random is seeded (chaos behavior fixed) and the mutation-generation LLM uses temperature=0 so the same config yields the same scores run-to-run. **Try it:** [Working example](examples/v2_research_agent/README.md) with chaos, contracts, and replay from the CLI. diff --git a/docs/API_SPECIFICATION.md b/docs/API_SPECIFICATION.md index 83f0350..890b162 100644 --- a/docs/API_SPECIFICATION.md +++ b/docs/API_SPECIFICATION.md @@ -538,13 +538,24 @@ flakestorm replay export --from-report FILE # Export from an existing report ### V2: `flakestorm ci` -Run full CI pipeline: mutation run, contract run (if configured), chaos-only (if chaos configured), replay (if configured); then compute overall weighted score from `scoring.weights`. +Run full CI pipeline: mutation run, contract run (if configured), chaos-only (if chaos configured), replay (if configured); then compute overall weighted score from `scoring.weights`. Writes a **CI summary report** (e.g. 
`flakestorm-ci-report.html`) with per-phase scores and **"View detailed report"** links to phase-specific reports (mutation, contract, chaos, replay). Contract phase PASS/FAIL in the summary matches the contract detailed report (FAIL if any critical invariant fails). ```bash flakestorm ci flakestorm ci --config custom.yaml +flakestorm ci --min-score 0.5 # Fail if overall score below 0.5 +flakestorm ci --output ./reports # Save summary + detailed reports to directory +flakestorm ci --output report.html # Save summary report to file +flakestorm ci --quiet # Minimal output, no progress bars ``` +| Option | Description | +|--------|-------------| +| `--config`, `-c` | Config file path (default: `flakestorm.yaml`) | +| `--min-score` | Minimum overall (weighted) score to pass (default: 0.0) | +| `--output`, `-o` | Path to save reports: directory (creates `flakestorm-ci-report.html` + phase reports) or HTML file path | +| `--quiet`, `-q` | Minimal output, no progress bars | + --- ## Environment Variables diff --git a/docs/CONFIGURATION_GUIDE.md b/docs/CONFIGURATION_GUIDE.md index 209015b..d20ca2b 100644 --- a/docs/CONFIGURATION_GUIDE.md +++ b/docs/CONFIGURATION_GUIDE.md @@ -960,7 +960,7 @@ advanced: |--------|------|---------|-------------| | `concurrency` | integer | `10` | Max concurrent agent requests (1-100) | | `retries` | integer | `2` | Retry failed requests (0-5) | -| `seed` | integer | null | Random seed for reproducibility | +| `seed` | integer | null | **Reproducible runs:** when set, Python's random is seeded (chaos behavior fixed) and the mutation-generation LLM uses temperature=0 so the same config yields the same results run-to-run. Omit for exploratory, varying runs. | --- diff --git a/docs/DEVELOPER_FAQ.md b/docs/DEVELOPER_FAQ.md index 89ec0f1..3871fd6 100644 --- a/docs/DEVELOPER_FAQ.md +++ b/docs/DEVELOPER_FAQ.md @@ -107,7 +107,7 @@ This separation allows: ### Q: What does `flakestorm ci` run? 
-**A:** It runs, in order: (1) mutation run (with chaos if configured), (2) contract run if `contract` + `chaos_matrix` are configured, (3) chaos-only run if chaos is configured, (4) replay run if `replays` is configured. Then it computes an **overall weighted score** from `scoring.weights` (mutation, chaos, contract, replay); weights must sum to 1.0. Default weights: mutation 0.20, chaos 0.35, contract 0.35, replay 0.10. +**A:** It runs, in order: (1) mutation run (with chaos if configured), (2) contract run if `contract` + `chaos_matrix` are configured, (3) chaos-only run if chaos is configured, (4) replay run if `replays` is configured. Then it computes an **overall weighted score** from `scoring.weights` (mutation, chaos, contract, replay); weights must sum to 1.0. Default weights: mutation 0.20, chaos 0.35, contract 0.35, replay 0.10. It also writes a **CI summary report** (e.g. `flakestorm-ci-report.html`) with per-phase scores and links to **detailed reports** (mutation, contract, chaos, replay). Contract phase PASS/FAIL in the summary matches the contract detailed report (FAIL if any critical invariant fails). Use `--output` to control where reports are saved and `--min-score` for the overall pass threshold. --- diff --git a/docs/USAGE_GUIDE.md b/docs/USAGE_GUIDE.md index 19207e4..6e93cae 100644 --- a/docs/USAGE_GUIDE.md +++ b/docs/USAGE_GUIDE.md @@ -76,7 +76,7 @@ With **`version: "2.0"`** in your config, Flakestorm adds environment chaos, beh | **Behavioral contracts** | Contracts (invariants × severity) × chaos matrix scenarios; each cell is an independent run (optional reset per cell). | **Resilience score** (0–100%). Use `flakestorm contract run`. Per-contract formula: weighted by severity (critical×3, high×2, medium×1); **auto-FAIL** if any critical fails. | | **Replay regression** | Replay saved sessions (e.g. production incidents) and verify against a contract. | Per-session pass/fail; **replay regression** score when run via CI. 
Use `flakestorm replay run [path]`. | -**Unified CI:** `flakestorm ci` runs mutation run, contract run (if configured), chaos-only run (if chaos configured), and all replay sessions; then computes an **overall resilience score** from `scoring.weights` (default: mutation 0.20, chaos 0.35, contract 0.35, replay 0.10). Weights must sum to 1.0. +**Unified CI:** `flakestorm ci` runs mutation run, contract run (if configured), chaos-only run (if chaos configured), and all replay sessions; then computes an **overall resilience score** from `scoring.weights` (default: mutation 0.20, chaos 0.35, contract 0.35, replay 0.10). Weights must sum to 1.0. It writes a **CI summary report** (e.g. `flakestorm-ci-report.html`) with per-phase scores and links to **detailed reports** (mutation, contract, chaos, replay). Contract PASS/FAIL in the summary matches the contract detailed report (FAIL if any critical invariant fails). Use `--output DIR` or `--output report.html` and `--min-score N`. **Reports:** Use `flakestorm contract run --output report.html` and `flakestorm replay run --output report.html` to save HTML reports; both include **suggested actions** for failed cells or sessions (e.g. add reset_endpoint, tighten invariants). Replay accepts a single session file or a directory: `flakestorm replay run path/to/session.yaml` or `flakestorm replay run path/to/replays/`. @@ -1858,6 +1858,22 @@ advanced: retries: 3 # Retry failed requests 3 times ``` +### Reproducible Runs + +By default, mutation generation (LLM) and chaos (e.g. fault triggers, payload choice) can vary between runs, so scores may differ. For **deterministic, reproducible runs** (e.g. CI or regression checks), set a **random seed** in config: + +```yaml +advanced: + seed: 42 # Same config → same mutations and chaos → same scores +``` + +When `advanced.seed` is set: + +- **Python random** is seeded at run start, so chaos behavior (which faults trigger, which payloads) is fixed. 
+- The **mutation-generation LLM** uses temperature=0, so the same golden prompts produce the same mutations each run. + +Use a fixed seed when you need comparable run-to-run results; omit it for exploratory testing where variation is acceptable. + ### Golden Prompt Guide A comprehensive guide to creating effective golden prompts for your agent. diff --git a/examples/v2_research_agent/README.md b/examples/v2_research_agent/README.md index 0af7bf8..3643620 100644 --- a/examples/v2_research_agent/README.md +++ b/examples/v2_research_agent/README.md @@ -52,6 +52,9 @@ flakestorm replay export --from-report reports/report.json -o examples/v2_resear # Full CI run (mutation + contract + chaos + replay, overall weighted score) flakestorm ci -c examples/v2_research_agent/flakestorm.yaml --min-score 0.5 + +# CI with reports: summary + detailed phase reports (mutation, contract, chaos, replay) +flakestorm ci -c examples/v2_research_agent/flakestorm.yaml -o ./reports --min-score 0.5 ``` ## 3. What this example demonstrates @@ -63,6 +66,7 @@ flakestorm ci -c examples/v2_research_agent/flakestorm.yaml --min-score 0.5 | **Replay** | `replays.sessions` with `file: replays/incident_001.yaml`; contract resolved by name "Research Agent Contract" | | **Scoring** | `scoring` weights (mutation 20%, chaos 35%, contract 35%, replay 10%); used in `flakestorm ci` | | **Reset** | `agent.reset_endpoint: http://localhost:8790/reset` for contract matrix isolation | +| **Reproducibility** | Set `advanced.seed` (e.g. `42`) for deterministic chaos and mutation generation; same config → same scores. | ## 4. 
Config layout (v2.0) diff --git a/src/flakestorm/cli/main.py b/src/flakestorm/cli/main.py index 0351c98..f712eab 100644 --- a/src/flakestorm/cli/main.py +++ b/src/flakestorm/cli/main.py @@ -807,12 +807,24 @@ def ci( help="Path to configuration file", ), min_score: float = typer.Option(0.0, "--min-score", help="Minimum overall score"), + output: Path | None = typer.Option( + None, + "--output", + "-o", + help="Save reports to this path (file or directory). Saves CI summary and mutation report.", + ), + quiet: bool = typer.Option(False, "--quiet", "-q", help="Minimal output, no progress bars"), ) -> None: - """Run all configured modes and output unified exit code (v2).""" - asyncio.run(_ci_async(config, min_score)) + """Run all configured modes with interactive progress and optional report (v2).""" + asyncio.run(_ci_async(config, min_score, output, quiet)) -async def _ci_async(config: Path, min_score: float) -> None: +async def _ci_async( + config: Path, + min_score: float, + output: Path | None = None, + quiet: bool = False, +) -> None: from flakestorm.core.config import load_config cfg = load_config(config) exit_code = 0 @@ -825,11 +837,15 @@ async def _ci_async(config: Path, min_score: float) -> None: if cfg.replays and (cfg.replays.sessions or cfg.replays.sources): phases.append("replay") n_phases = len(phases) + show_progress = not quiet + matrix = None # contract phase result (for detailed report) + chaos_results = None # chaos phase result (for detailed report) + replay_report_results: list[dict] = [] # replay phase results (for detailed report) - # Run mutation tests + # Run mutation tests (with interactive progress like flakestorm run) idx = phases.index("mutation") + 1 if "mutation" in phases else 0 console.print(f"[bold blue][{idx}/{n_phases}] Mutation[/bold blue]") - runner = FlakeStormRunner(config=config, console=console, show_progress=False) + runner = FlakeStormRunner(config=config, console=console, show_progress=show_progress) results = await 
runner.run() mutation_score = results.statistics.robustness_score scores["mutation_robustness"] = mutation_score @@ -844,24 +860,34 @@ async def _ci_async(config: Path, min_score: float) -> None: console.print(f"[bold blue][{idx}/{n_phases}] Contract[/bold blue]") from flakestorm.core.protocol import create_agent_adapter, create_instrumented_adapter from flakestorm.contracts.engine import ContractEngine + from rich.progress import Progress, SpinnerColumn, TextColumn agent = create_agent_adapter(cfg.agent) if cfg.chaos: agent = create_instrumented_adapter(agent, cfg.chaos) engine = ContractEngine(cfg, cfg.contract, agent) - matrix = await engine.run() + if show_progress: + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + console=console, + ) as progress: + progress.add_task("Running contract matrix...", total=None) + matrix = await engine.run() + else: + matrix = await engine.run() contract_score = matrix.resilience_score / 100.0 scores["contract_compliance"] = contract_score console.print(f"[bold]Contract score:[/bold] {matrix.resilience_score:.1f}%") if not matrix.passed or matrix.resilience_score < min_score * 100: exit_code = 1 - # Chaos-only run when chaos configured + # Chaos-only run when chaos configured (with interactive progress) chaos_score = 1.0 if cfg.chaos: idx = phases.index("chaos") + 1 console.print(f"[bold blue][{idx}/{n_phases}] Chaos[/bold blue]") chaos_runner = FlakeStormRunner( - config=config, console=console, show_progress=False, + config=config, console=console, show_progress=show_progress, chaos_only=True, chaos=True, ) chaos_results = await chaos_runner.run() @@ -879,6 +905,7 @@ async def _ci_async(config: Path, min_score: float) -> None: from flakestorm.core.protocol import create_agent_adapter, create_instrumented_adapter from flakestorm.replay.loader import resolve_contract, resolve_sessions_from_config from flakestorm.replay.runner import ReplayRunner + from rich.progress import Progress, 
SpinnerColumn, TextColumn agent = create_agent_adapter(cfg.agent) if cfg.chaos: agent = create_instrumented_adapter(agent, cfg.chaos) @@ -887,22 +914,53 @@ async def _ci_async(config: Path, min_score: float) -> None: cfg.replays, config_path.parent, include_sources=True ) if sessions: - passed = 0 - total = 0 - for session in sessions: - contract = None - try: - contract = resolve_contract(session.contract, cfg, config_path.parent) - except FileNotFoundError: - pass - runner = ReplayRunner(agent, contract=contract) - result = await runner.run(session, contract=contract) - total += 1 - if result.passed: - passed += 1 - replay_score = passed / total if total else 1.0 + passed_count = 0 + total = len(sessions) + replay_report_results = [] + if show_progress: + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + console=console, + ) as progress: + task = progress.add_task("Replaying sessions...", total=total) + for session in sessions: + contract = None + try: + contract = resolve_contract(session.contract, cfg, config_path.parent) + except FileNotFoundError: + pass + runner = ReplayRunner(agent, contract=contract) + result = await runner.run(session, contract=contract) + if result.passed: + passed_count += 1 + replay_report_results.append({ + "id": getattr(session, "id", "") or "", + "name": getattr(session, "name", None) or getattr(session, "id", "") or "", + "passed": result.passed, + "verification_details": getattr(result, "verification_details", []) or [], + }) + progress.advance(task) + else: + for session in sessions: + contract = None + try: + contract = resolve_contract(session.contract, cfg, config_path.parent) + except FileNotFoundError: + pass + runner = ReplayRunner(agent, contract=contract) + result = await runner.run(session, contract=contract) + if result.passed: + passed_count += 1 + replay_report_results.append({ + "id": getattr(session, "id", "") or "", + "name": getattr(session, "name", None) or 
getattr(session, "id", "") or "", + "passed": result.passed, + "verification_details": getattr(result, "verification_details", []) or [], + }) + replay_score = passed_count / total if total else 1.0 scores["replay_regression"] = replay_score - console.print(f"[bold]Replay score:[/bold] {replay_score:.1%} ({passed}/{total})") + console.print(f"[bold]Replay score:[/bold] {replay_score:.1%} ({passed_count}/{total})") if replay_score < min_score: exit_code = 1 @@ -914,9 +972,68 @@ async def _ci_async(config: Path, min_score: float) -> None: used_w = [w[k] for k in scores if k in w] used_s = [scores[k] for k in scores if k in w] overall = calculate_overall_resilience(used_s, used_w) + passed = overall >= min_score console.print(f"[bold]Overall (weighted):[/bold] {overall:.1%}") if overall < min_score: exit_code = 1 + + # Generate reports: use --output if set, else config output.path (so CI always produces reports) + report_dir_or_file = output if output is not None else Path(cfg.output.path) + from datetime import datetime + from flakestorm.reports.html import HTMLReportGenerator + from flakestorm.reports.ci_report import save_ci_report + from flakestorm.reports.contract_report import save_contract_report + from flakestorm.reports.replay_report import save_replay_report + output_path = Path(report_dir_or_file) + if output_path.suffix.lower() in (".html", ".htm"): + report_dir = output_path.parent + ci_report_path = output_path + else: + report_dir = output_path + report_dir.mkdir(parents=True, exist_ok=True) + ci_report_path = report_dir / "flakestorm-ci-report.html" + ts = datetime.now().strftime("%Y%m%d-%H%M%S") + report_links: dict[str, str] = {} + + # Mutation detailed report (always) + mutation_report_path = report_dir / f"flakestorm-mutation-{ts}.html" + HTMLReportGenerator(results).save(mutation_report_path) + report_links["mutation_robustness"] = mutation_report_path.name + + # Contract detailed report (with suggested actions for failed cells) + if matrix is not 
None: + contract_report_path = report_dir / f"flakestorm-contract-{ts}.html" + save_contract_report(matrix, contract_report_path, title="Contract Resilience Report (CI)") + report_links["contract_compliance"] = contract_report_path.name + + # Chaos detailed report (same format as mutation) + if chaos_results is not None: + chaos_report_path = report_dir / f"flakestorm-chaos-{ts}.html" + HTMLReportGenerator(chaos_results).save(chaos_report_path) + report_links["chaos_resilience"] = chaos_report_path.name + + # Replay detailed report (with suggested actions for failed sessions) + if replay_report_results: + replay_report_path = report_dir / f"flakestorm-replay-{ts}.html" + save_replay_report(replay_report_results, replay_report_path, title="Replay Regression Report (CI)") + report_links["replay_regression"] = replay_report_path.name + + # Contract phase: summary status must match detailed report (FAIL if any critical invariant failed) + phase_overall_passed: dict[str, bool] = {} + if matrix is not None: + phase_overall_passed["contract_compliance"] = matrix.passed + save_ci_report(scores, overall, passed, ci_report_path, min_score=min_score, report_links=report_links, phase_overall_passed=phase_overall_passed) + if not quiet: + console.print() + console.print(f"[green]CI summary:[/green] {ci_report_path}") + console.print(f"[green]Mutation (detailed):[/green] {mutation_report_path}") + if matrix is not None: + console.print(f"[green]Contract (detailed, with recommendations):[/green] {report_dir / report_links.get('contract_compliance', '')}") + if chaos_results is not None: + console.print(f"[green]Chaos (detailed):[/green] {report_dir / report_links.get('chaos_resilience', '')}") + if replay_report_results: + console.print(f"[green]Replay (detailed, with recommendations):[/green] {report_dir / report_links.get('replay_regression', '')}") + raise typer.Exit(exit_code) diff --git a/src/flakestorm/core/config.py b/src/flakestorm/core/config.py index 54426f4..bfb2c53 
100644 --- a/src/flakestorm/core/config.py +++ b/src/flakestorm/core/config.py @@ -368,7 +368,8 @@ class AdvancedConfig(BaseModel): default=2, ge=0, le=5, description="Number of retries for failed requests" ) seed: int | None = Field( - default=None, description="Random seed for reproducibility" + default=None, + description="Random seed for reproducible runs. When set: Python random is seeded (chaos behavior fixed) and mutation-generation LLM uses temperature=0 so the same config yields the same results.", ) diff --git a/src/flakestorm/core/orchestrator.py b/src/flakestorm/core/orchestrator.py index 2061733..e7b4eef 100644 --- a/src/flakestorm/core/orchestrator.py +++ b/src/flakestorm/core/orchestrator.py @@ -84,21 +84,25 @@ class Orchestrator: console: Console | None = None, show_progress: bool = True, chaos_only: bool = False, + preflight_agent: BaseAgentAdapter | None = None, ): """ Initialize the orchestrator. Args: config: flakestorm configuration - agent: Agent adapter to test + agent: Agent adapter to test (used for the actual run) mutation_engine: Engine for generating mutations verifier: Invariant verification engine console: Rich console for output show_progress: Whether to show progress bars chaos_only: If True, run only golden prompts (no mutation generation) + preflight_agent: If set, use this adapter for pre-flight check only (e.g. raw + agent when agent is chaos-wrapped, so validation does not fail on injected 503). """ self.config = config self.agent = agent + self.preflight_agent = preflight_agent self.mutation_engine = mutation_engine self.verifier = verifier self.console = console or Console() @@ -254,31 +258,33 @@ class Orchestrator: ) self.console.print() - # Test the first golden prompt + # Test the first golden prompt (use preflight_agent when set, e.g. 
raw agent for + # chaos_only so we don't fail on chaos-injected 503) if self.show_progress: self.console.print(" Testing with first golden prompt...", style="dim") - response = await self.agent.invoke_with_timing(test_prompt) + agent_for_preflight = self.preflight_agent if self.preflight_agent is not None else self.agent + response = await agent_for_preflight.invoke_with_timing(test_prompt) if not response.success or response.error: error_msg = response.error or "Unknown error" prompt_preview = ( test_prompt[:50] + "..." if len(test_prompt) > 50 else test_prompt ) - - if self.show_progress: - self.console.print() - self.console.print( - Panel( - f"[red]Agent validation failed![/red]\n\n" - f"[yellow]Test prompt:[/yellow] {prompt_preview}\n" - f"[yellow]Error:[/yellow] {error_msg}\n\n" - f"[dim]Please fix the agent errors (e.g., missing API keys, configuration issues) " - f"before running mutations. This prevents wasting time on a broken agent.[/dim]", - title="[red]Pre-flight Check Failed[/red]", - border_style="red", - ) + # Always print failure details so user sees the real error (e.g. connection refused) + # even when show_progress=False (e.g. flakestorm ci) + self.console.print() + self.console.print( + Panel( + f"[red]Agent validation failed![/red]\n\n" + f"[yellow]Test prompt:[/yellow] {prompt_preview}\n" + f"[yellow]Error:[/yellow] {error_msg}\n\n" + f"[dim]Please fix the agent errors (e.g., missing API keys, configuration issues) " + f"before running mutations. 
This prevents wasting time on a broken agent.[/dim]", + title="[red]Pre-flight Check Failed[/red]", + border_style="red", ) + ) return False else: if self.show_progress: diff --git a/src/flakestorm/core/performance.py b/src/flakestorm/core/performance.py index 2944cee..042e171 100644 --- a/src/flakestorm/core/performance.py +++ b/src/flakestorm/core/performance.py @@ -210,7 +210,9 @@ def calculate_overall_resilience(scores: list[float], weights: list[float]) -> f Weighted average for mutation_robustness, chaos_resilience, contract_compliance, replay_regression. """ if _RUST_AVAILABLE: - return flakestorm_rust.calculate_overall_resilience(scores, weights) + rust_fn = getattr(flakestorm_rust, "calculate_overall_resilience", None) + if rust_fn is not None: + return rust_fn(scores, weights) n = min(len(scores), len(weights)) if n == 0: diff --git a/src/flakestorm/core/runner.py b/src/flakestorm/core/runner.py index a8b4513..bdf0fda 100644 --- a/src/flakestorm/core/runner.py +++ b/src/flakestorm/core/runner.py @@ -7,6 +7,7 @@ and provides a simple API for executing reliability tests. 
from __future__ import annotations +import random from pathlib import Path from typing import TYPE_CHECKING @@ -65,6 +66,10 @@ class FlakeStormRunner: else: self.config = config + # Reproducibility: fix Python random seed so chaos and any sampling are deterministic + if self.config.advanced.seed is not None: + random.seed(self.config.advanced.seed) + self.chaos_only = chaos_only # Load chaos profile if requested @@ -108,9 +113,17 @@ class FlakeStormRunner: self.agent = create_instrumented_adapter(base_agent, self.config.chaos) else: self.agent = base_agent - self.mutation_engine = MutationEngine(self.config.model) + # When seed is set, use temperature=0 for mutation generation so same prompts → same mutations + model_cfg = self.config.model + if self.config.advanced.seed is not None: + model_cfg = model_cfg.model_copy(update={"temperature": 0.0}) + self.mutation_engine = MutationEngine(model_cfg) self.verifier = InvariantVerifier(self.config.invariants) + # When agent is chaos-wrapped, pre-flight must use the raw agent so we don't fail on + # chaos-injected 503 (e.g. in CI mutation phase or chaos_only phase). 
+ preflight_agent = base_agent if self.config.chaos else None + # Create orchestrator self.orchestrator = Orchestrator( config=self.config, @@ -118,6 +131,7 @@ class FlakeStormRunner: mutation_engine=self.mutation_engine, verifier=self.verifier, console=self.console, + preflight_agent=preflight_agent, show_progress=self.show_progress, chaos_only=chaos_only, ) diff --git a/src/flakestorm/reports/ci_report.py b/src/flakestorm/reports/ci_report.py new file mode 100644 index 0000000..9ed7ab3 --- /dev/null +++ b/src/flakestorm/reports/ci_report.py @@ -0,0 +1,133 @@ +"""HTML report for flakestorm ci (all phases + overall score).""" + +from __future__ import annotations + +from datetime import datetime +from pathlib import Path +from typing import Any + + +def _escape(s: Any) -> str: + if s is None: + return "" + t = str(s) + return ( + t.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + ) + + +def generate_ci_report_html( + phase_scores: dict[str, float], + overall: float, + passed: bool, + min_score: float = 0.0, + timestamp: str | None = None, + report_links: dict[str, str] | None = None, + phase_overall_passed: dict[str, bool] | None = None, +) -> str: + """Generate HTML for the CI run: phase scores, overall, and links to detailed reports. + phase_overall_passed: when a phase has its own pass/fail (e.g. contract: critical fail = FAIL), + pass False for that key so the summary matches the detailed report.""" + timestamp = timestamp or datetime.now().strftime("%Y-%m-%d %H:%M:%S") + report_links = report_links or {} + phase_overall_passed = phase_overall_passed or {} + phase_names = { + "mutation_robustness": "Mutation", + "chaos_resilience": "Chaos", + "contract_compliance": "Contract", + "replay_regression": "Replay", + } + rows = [] + for key, score in phase_scores.items(): + name = phase_names.get(key, key.replace("_", " ").title()) + pct = round(score * 100, 1) + # Fail if score below threshold OR phase has its own fail (e.g. 
contract critical failure) + phase_passed = phase_overall_passed.get(key, True) + row_failed = score < min_score or phase_passed is False + status = "FAIL" if row_failed else "PASS" + row_class = "fail" if row_failed else "" + link = report_links.get(key) + link_cell = f'View detailed report' if link else "" + rows.append( + f'{_escape(name)}{pct}%{status}{link_cell}' + ) + body = "\n".join(rows) + overall_pct = round(overall * 100, 1) + overall_status = "PASS" if passed else "FAIL" + overall_class = "fail" if not passed else "" + + return f""" + + + + +flakestorm CI Report - {_escape(timestamp)} + + + +
+<body>
+<h1>flakestorm CI Report</h1>
+<p class="meta">Run at {_escape(timestamp)} · min score: {min_score:.0%}</p>
+<p class="note">Each phase has a detailed report with failure reasons and recommended next steps. Use the links below to inspect failures.</p>
+<table>
+<tr><th>Phase</th><th>Score</th><th>Status</th><th>Detailed report</th></tr>
+{body}
+</table>
+<p class="overall {overall_class}">Overall (weighted): {overall_pct}% — {overall_status}</p>
+</body>
+</html>
+ + +""" + + +def save_ci_report( + phase_scores: dict[str, float], + overall: float, + passed: bool, + path: Path, + min_score: float = 0.0, + report_links: dict[str, str] | None = None, + phase_overall_passed: dict[str, bool] | None = None, +) -> Path: + """Write CI report HTML to path. report_links: phase key -> filename. phase_overall_passed: phase key -> False when phase failed (e.g. contract critical fail).""" + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + html = generate_ci_report_html( + phase_scores=phase_scores, + overall=overall, + passed=passed, + min_score=min_score, + report_links=report_links, + phase_overall_passed=phase_overall_passed, + ) + path.write_text(html, encoding="utf-8") + return path diff --git a/src/flakestorm/reports/contract_report.py b/src/flakestorm/reports/contract_report.py index 5d7b87e..00f3bd7 100644 --- a/src/flakestorm/reports/contract_report.py +++ b/src/flakestorm/reports/contract_report.py @@ -57,7 +57,7 @@ def generate_contract_html(matrix: "ResilienceMatrix", title: str = "Contract Re suggestions_html = "" if failed_cells: suggestions_html = """ -

<h3>Suggested actions (failed cells)</h3>
+<h3>Recommended next steps</h3>

The following actions may help fix the failed contract cells: