diff --git a/.gitignore b/.gitignore
index a51a674..69b98ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -80,10 +80,12 @@ Cargo.lock
# =============================================================================
# Project-specific
# =============================================================================
-# Generated reports
-reports/
+# Generated reports (root only, not src/flakestorm/reports/)
+/reports/
*.html
!docs/*.html
+# Explicitly include source code reports module
+!src/flakestorm/reports/
# Local configuration (may contain secrets)
flakestorm.yaml
diff --git a/src/flakestorm/reports/__init__.py b/src/flakestorm/reports/__init__.py
new file mode 100644
index 0000000..cc3c590
--- /dev/null
+++ b/src/flakestorm/reports/__init__.py
@@ -0,0 +1,30 @@
+"""
+flakestorm Reports Module
+
+Provides report generation in multiple formats:
+- Interactive HTML reports
+- JSON exports
+- Terminal output
+"""
+
+from flakestorm.reports.html import HTMLReportGenerator
+from flakestorm.reports.json_export import JSONReportGenerator
+from flakestorm.reports.models import (
+ CheckResult,
+ MutationResult,
+ TestResults,
+ TestStatistics,
+ TypeStatistics,
+)
+from flakestorm.reports.terminal import TerminalReporter
+
+# Public, stable API of the reports package — keep in sync with the imports above.
+__all__ = [
+    "TestResults",
+    "TestStatistics",
+    "MutationResult",
+    "CheckResult",
+    "TypeStatistics",
+    "HTMLReportGenerator",
+    "JSONReportGenerator",
+    "TerminalReporter",
+]
diff --git a/src/flakestorm/reports/html.py b/src/flakestorm/reports/html.py
new file mode 100644
index 0000000..8abe178
--- /dev/null
+++ b/src/flakestorm/reports/html.py
@@ -0,0 +1,655 @@
+"""
+HTML Report Generator
+
+Generates interactive HTML reports with:
+- Robustness score visualization
+- Pass/fail matrix grid
+- Drill-down into failed mutations
+- Latency charts
+"""
+
+from __future__ import annotations
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from jinja2 import Template
+
+if TYPE_CHECKING:
+ from flakestorm.reports.models import TestResults
+
+
+HTML_TEMPLATE = """
+
+
+
+
+
+ flakestorm Report - {{ report_date }}
+
+
+
+
+
+
+
+
+
+
+
{{ score_percent }}%
+
+
Robustness Score
+
+
+
+
+
Total Mutations
+
{{ total_mutations }}
+
+
+
Passed
+
{{ passed_mutations }}
+
+
+
Failed
+
{{ failed_mutations }}
+
+
+
Avg Latency
+
{{ avg_latency }}ms
+
+
+
+
+
+
📊 By Mutation Type
+
+ {% for type_stat in type_stats %}
+
+
+
+
+ {{ type_stat.passed }}/{{ type_stat.total }} passed
+
+
+ {% endfor %}
+
+
+
+
+
🔬 Mutation Results
+
+ {% for result in mutations %}
+
+
{{ result.mutation.type }}
+
{{ result.mutation.mutated[:100] }}...
+
+ {{ result.latency_ms|round(0)|int }}ms
+ {{ '✓' if result.passed else '✗' }}
+
+
+ {% endfor %}
+
+
+
+
+
+
+
+
+
+"""
+
+
+class HTMLReportGenerator:
+    """
+    Generates interactive HTML reports from test results.
+
+    Creates a single-file HTML report with embedded CSS and JavaScript
+    for easy sharing and viewing.
+    """
+
+    def __init__(self, results: TestResults):
+        """
+        Initialize the generator.
+
+        Args:
+            results: Test results to generate report from
+        """
+        self.results = results
+        # Compile the template once so repeated generate() calls reuse it.
+        self.template = Template(HTML_TEMPLATE)
+
+    def generate(self) -> str:
+        """
+        Generate the HTML report.
+
+        Returns:
+            Complete HTML document as a string
+        """
+        stats = self.results.statistics
+
+        # Score-ring geometry: 2*pi*r. 78 is presumably the SVG circle radius
+        # hard-coded in HTML_TEMPLATE — TODO confirm the two stay in sync.
+        # pi is approximated; precision is irrelevant at pixel scale.
+        circumference = 2 * 3.14159 * 78
+        # stroke-dashoffset: the fraction of the ring left unfilled by the score.
+        score_offset = circumference * (1 - stats.robustness_score)
+
+        # Presentation-ready per-type rows: underscores become spaces and the
+        # pass rate is expressed as a percentage for display.
+        type_stats = [
+            {
+                "mutation_type": t.mutation_type.replace("_", " "),
+                "total": t.total,
+                "passed": t.passed,
+                "pass_rate_percent": round(t.pass_rate * 100, 1),
+            }
+            for t in stats.by_type
+        ]
+
+        # Serialized copy of every mutation result, embedded as JSON for the
+        # report's client-side JavaScript. The raw result objects are also
+        # passed separately for server-side (Jinja) iteration.
+        mutations_data = [m.to_dict() for m in self.results.mutations]
+
+        return self.template.render(
+            report_date=self.results.started_at.strftime("%Y-%m-%d %H:%M:%S"),
+            duration=round(self.results.duration, 1),
+            circumference=circumference,
+            score_offset=score_offset,
+            score_percent=round(stats.robustness_score * 100, 1),
+            total_mutations=stats.total_mutations,
+            passed_mutations=stats.passed_mutations,
+            failed_mutations=stats.failed_mutations,
+            avg_latency=round(stats.avg_latency_ms),
+            type_stats=type_stats,
+            mutations=self.results.mutations,
+            mutations_json=json.dumps(mutations_data),
+        )
+
+    def save(self, path: str | Path | None = None) -> Path:
+        """
+        Save the HTML report to a file.
+
+        Args:
+            path: Output path (default: auto-generated in reports dir)
+
+        Returns:
+            Path to the saved file
+        """
+        if path is None:
+            # Auto-name: flakestorm-YYYYMMDD-HHMMSS.html in the configured
+            # output directory.
+            # NOTE(review): second-granularity timestamps can collide on rapid
+            # successive runs, overwriting the previous report.
+            output_dir = Path(self.results.config.output.path)
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+            filename = f"flakestorm-{timestamp}.html"
+            path = output_dir / filename
+        else:
+            path = Path(path)
+            path.parent.mkdir(parents=True, exist_ok=True)
+
+        html = self.generate()
+        path.write_text(html, encoding="utf-8")
+
+        return path
diff --git a/src/flakestorm/reports/json_export.py b/src/flakestorm/reports/json_export.py
new file mode 100644
index 0000000..8187296
--- /dev/null
+++ b/src/flakestorm/reports/json_export.py
@@ -0,0 +1,115 @@
+"""
+JSON Report Generator
+
+Exports test results to JSON format for programmatic consumption
+and integration with other tools.
+"""
+
+from __future__ import annotations
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+ from flakestorm.reports.models import TestResults
+
+
+class JSONReportGenerator:
+    """
+    Generates JSON reports from test results.
+
+    Creates structured JSON output suitable for:
+    - CI/CD pipeline consumption
+    - Data analysis tools
+    - Dashboard integrations
+    """
+
+    def __init__(self, results: TestResults):
+        """
+        Initialize the generator.
+
+        Args:
+            results: Test results to generate report from
+        """
+        self.results = results
+
+    def generate(self, pretty: bool = True) -> str:
+        """
+        Generate the JSON report.
+
+        Args:
+            pretty: Whether to format with indentation
+
+        Returns:
+            JSON string
+        """
+        data = self.results.to_dict()
+
+        # default=str makes otherwise non-serializable values (datetimes, enums,
+        # Paths) JSON-safe by stringifying them.
+        # NOTE(review): it also silently stringifies *unexpected* types instead
+        # of failing loudly — confirm this is the intended trade-off.
+        if pretty:
+            return json.dumps(data, indent=2, default=str)
+        return json.dumps(data, default=str)
+
+    def generate_summary(self) -> dict[str, Any]:
+        """
+        Generate a summary-only report (no mutation details).
+
+        Useful for quick status checks in CI/CD.
+        """
+        stats = self.results.statistics
+
+        return {
+            "version": "1.0",
+            "started_at": self.results.started_at.isoformat(),
+            "completed_at": self.results.completed_at.isoformat(),
+            "duration_seconds": self.results.duration,
+            "robustness_score": stats.robustness_score,
+            "pass_rate": stats.pass_rate,
+            "total_mutations": stats.total_mutations,
+            "passed_mutations": stats.passed_mutations,
+            "failed_mutations": stats.failed_mutations,
+            "avg_latency_ms": stats.avg_latency_ms,
+            "p95_latency_ms": stats.p95_latency_ms,
+            # Per-type breakdown keyed by the mutation type name.
+            "by_type": {
+                t.mutation_type: {
+                    "total": t.total,
+                    "passed": t.passed,
+                    "pass_rate": t.pass_rate,
+                }
+                for t in stats.by_type
+            },
+        }
+
+    def save(self, path: str | Path | None = None, summary_only: bool = False) -> Path:
+        """
+        Save the JSON report to a file.
+
+        Args:
+            path: Output path (default: auto-generated in reports dir)
+            summary_only: Only include summary, no mutation details
+
+        Returns:
+            Path to the saved file
+        """
+        if path is None:
+            # Auto-name: flakestorm-YYYYMMDD-HHMMSS[-summary].json in the
+            # configured output directory.
+            # NOTE(review): second-granularity timestamps can collide on rapid
+            # successive runs.
+            output_dir = Path(self.results.config.output.path)
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+            suffix = "-summary" if summary_only else ""
+            filename = f"flakestorm-{timestamp}{suffix}.json"
+            path = output_dir / filename
+        else:
+            path = Path(path)
+            path.parent.mkdir(parents=True, exist_ok=True)
+
+        if summary_only:
+            data = self.generate_summary()
+            content = json.dumps(data, indent=2, default=str)
+        else:
+            content = self.generate()
+
+        path.write_text(content, encoding="utf-8")
+
+        return path
diff --git a/src/flakestorm/reports/models.py b/src/flakestorm/reports/models.py
new file mode 100644
index 0000000..b97539b
--- /dev/null
+++ b/src/flakestorm/reports/models.py
@@ -0,0 +1,220 @@
+"""
+Report Data Models
+
+Data structures for representing test results and statistics.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+ from flakestorm.core.config import FlakeStormConfig
+ from flakestorm.mutations.types import Mutation
+
+
+@dataclass
+class CheckResult:
+    """Result of a single invariant check."""
+
+    check_type: str
+    """Type of the check (e.g., 'latency', 'contains')."""
+
+    passed: bool
+    """Whether the check passed."""
+
+    details: str
+    """Human-readable details about the check result."""
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for serialization."""
+        # Keys mirror the field names; consumed by the JSON and HTML reporters.
+        return {
+            "check_type": self.check_type,
+            "passed": self.passed,
+            "details": self.details,
+        }
+
+
+@dataclass
+class TypeStatistics:
+    """Statistics for a specific mutation type."""
+
+    mutation_type: str
+    """Name of the mutation type."""
+
+    total: int
+    """Total number of tests of this type."""
+
+    passed: int
+    """Number of tests that passed."""
+
+    pass_rate: float
+    """Pass rate as a decimal (0.0 to 1.0)."""
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for serialization."""
+        # NOTE(review): pass_rate is supplied by the producer rather than
+        # derived from passed/total here — confirm producers keep them
+        # consistent. "failed" is derived on the fly and not stored.
+        return {
+            "mutation_type": self.mutation_type,
+            "total": self.total,
+            "passed": self.passed,
+            "failed": self.total - self.passed,
+            "pass_rate": self.pass_rate,
+        }
+
+
+@dataclass
+class TestStatistics:
+    """Aggregate statistics for a test run."""
+
+    total_mutations: int
+    """Total number of mutations tested."""
+
+    passed_mutations: int
+    """Number of mutations that passed all checks."""
+
+    failed_mutations: int
+    """Number of mutations that failed one or more checks."""
+
+    robustness_score: float
+    """Weighted robustness score (0.0 to 1.0)."""
+
+    avg_latency_ms: float
+    """Average response latency in milliseconds."""
+
+    p50_latency_ms: float
+    """50th percentile (median) latency."""
+
+    p95_latency_ms: float
+    """95th percentile latency."""
+
+    p99_latency_ms: float
+    """99th percentile latency."""
+
+    by_type: list[TypeStatistics] = field(default_factory=list)
+    """Statistics broken down by mutation type."""
+
+    duration_seconds: float = 0.0
+    """Total test duration in seconds."""
+
+    @property
+    def pass_rate(self) -> float:
+        """Simple pass rate (passed / total)."""
+        # Unweighted ratio, distinct from robustness_score which is supplied
+        # pre-computed (weighted) by the producer of this dataclass.
+        if self.total_mutations == 0:
+            # Empty run: report 0.0 rather than raising ZeroDivisionError.
+            return 0.0
+        return self.passed_mutations / self.total_mutations
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for serialization."""
+        return {
+            "total_mutations": self.total_mutations,
+            "passed_mutations": self.passed_mutations,
+            "failed_mutations": self.failed_mutations,
+            "robustness_score": self.robustness_score,
+            "pass_rate": self.pass_rate,
+            "avg_latency_ms": self.avg_latency_ms,
+            "p50_latency_ms": self.p50_latency_ms,
+            "p95_latency_ms": self.p95_latency_ms,
+            "p99_latency_ms": self.p99_latency_ms,
+            "duration_seconds": self.duration_seconds,
+            "by_type": [t.to_dict() for t in self.by_type],
+        }
+
+
+@dataclass
+class MutationResult:
+    """Result of testing a single mutation."""
+
+    original_prompt: str
+    """The original golden prompt."""
+
+    mutation: Mutation
+    """The mutation that was tested."""
+
+    response: str
+    """The agent's response."""
+
+    latency_ms: float
+    """Response latency in milliseconds."""
+
+    passed: bool
+    """Whether all invariant checks passed."""
+
+    checks: list[CheckResult] = field(default_factory=list)
+    """Individual check results."""
+
+    error: str | None = None
+    """Error message if the agent call failed."""
+
+    @property
+    def failed_checks(self) -> list[CheckResult]:
+        """Get list of failed checks."""
+        # NOTE(review): `passed` is stored, not derived from `checks` — confirm
+        # producers keep the flag consistent with the individual check results.
+        return [c for c in self.checks if not c.passed]
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for serialization."""
+        return {
+            "original_prompt": self.original_prompt,
+            "mutation": self.mutation.to_dict(),
+            "response": self.response,
+            "latency_ms": self.latency_ms,
+            "passed": self.passed,
+            "checks": [c.to_dict() for c in self.checks],
+            "error": self.error,
+        }
+
+
+@dataclass
+class TestResults:
+    """Complete results from a test run."""
+
+    config: FlakeStormConfig
+    """Configuration used for the test."""
+
+    started_at: datetime
+    """When the test started."""
+
+    completed_at: datetime
+    """When the test completed."""
+
+    mutations: list[MutationResult]
+    """Results for each mutation."""
+
+    statistics: TestStatistics
+    """Aggregate statistics."""
+
+    @property
+    def duration(self) -> float:
+        """Test duration in seconds."""
+        return (self.completed_at - self.started_at).total_seconds()
+
+    @property
+    def passed_mutations(self) -> list[MutationResult]:
+        """Get mutations that passed."""
+        # NOTE(review): name clashes with TestStatistics.passed_mutations,
+        # which is an int count — here it is a list of results. Consider
+        # renaming one of them to avoid confusion at call sites.
+        return [m for m in self.mutations if m.passed]
+
+    @property
+    def failed_mutations(self) -> list[MutationResult]:
+        """Get mutations that failed."""
+        return [m for m in self.mutations if not m.passed]
+
+    def get_by_type(self, mutation_type: str) -> list[MutationResult]:
+        """Get mutations of a specific type."""
+        # mutation.type is an enum; compare its .value against the given name.
+        return [m for m in self.mutations if m.mutation.type.value == mutation_type]
+
+    def get_by_prompt(self, prompt: str) -> list[MutationResult]:
+        """Get mutations for a specific golden prompt."""
+        # Exact string match on the original (unmutated) prompt.
+        return [m for m in self.mutations if m.original_prompt == prompt]
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for serialization."""
+        return {
+            "version": "1.0",
+            "started_at": self.started_at.isoformat(),
+            "completed_at": self.completed_at.isoformat(),
+            "duration_seconds": self.duration,
+            "statistics": self.statistics.to_dict(),
+            "mutations": [m.to_dict() for m in self.mutations],
+            "golden_prompts": self.config.golden_prompts,
+        }
diff --git a/src/flakestorm/reports/terminal.py b/src/flakestorm/reports/terminal.py
new file mode 100644
index 0000000..68597e7
--- /dev/null
+++ b/src/flakestorm/reports/terminal.py
@@ -0,0 +1,156 @@
+"""
+Terminal Report Generator
+
+Displays test results directly in the terminal using rich formatting.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+from rich.text import Text
+
+if TYPE_CHECKING:
+ from flakestorm.reports.models import TestResults
+
+
+class TerminalReporter:
+ """
+ Displays test results in the terminal using rich formatting.
+
+ Provides colorful, informative output for interactive use.
+ """
+
+ def __init__(self, results: TestResults, console: Console | None = None):
+ """
+ Initialize the reporter.
+
+ Args:
+ results: Test results to display
+ console: Rich console (default: new console)
+ """
+ self.results = results
+ self.console = console or Console()
+
+ def print_summary(self) -> None:
+ """Print a summary of the test results."""
+ stats = self.results.statistics
+
+ # Robustness score with color
+ score = stats.robustness_score
+ if score >= 0.9:
+ score_style = "bold green"
+ score_emoji = "🎉"
+ elif score >= 0.7:
+ score_style = "bold yellow"
+ score_emoji = "⚠️"
+ else:
+ score_style = "bold red"
+ score_emoji = "❌"
+
+ score_text = Text()
+ score_text.append(f"{score_emoji} Robustness Score: ", style="bold")
+ score_text.append(f"{score:.1%}", style=score_style)
+
+ # Create summary panel
+ summary_lines = [
+ score_text,
+ "",
+ f"Total Mutations: {stats.total_mutations}",
+ Text.assemble(
+ ("Passed: ", ""),
+ (str(stats.passed_mutations), "green"),
+ (" | Failed: ", ""),
+ (str(stats.failed_mutations), "red"),
+ ),
+ "",
+ f"Avg Latency: {stats.avg_latency_ms:.0f}ms",
+ f"P95 Latency: {stats.p95_latency_ms:.0f}ms",
+ f"Duration: {self.results.duration:.1f}s",
+ ]
+
+ panel_content = "\n".join(str(line) for line in summary_lines)
+
+ self.console.print(
+ Panel(
+ panel_content,
+ title="flakestorm Results",
+ border_style="blue",
+ )
+ )
+
+ def print_type_breakdown(self) -> None:
+ """Print breakdown by mutation type."""
+ stats = self.results.statistics
+
+ table = Table(title="By Mutation Type", show_header=True)
+ table.add_column("Type", style="cyan")
+ table.add_column("Passed", justify="right", style="green")
+ table.add_column("Failed", justify="right", style="red")
+ table.add_column("Pass Rate", justify="right")
+ table.add_column("Progress", width=20)
+
+ for type_stat in stats.by_type:
+ # Create a simple text-based progress bar
+ bar_width = 15
+ filled = int(type_stat.pass_rate * bar_width)
+ bar = "█" * filled + "░" * (bar_width - filled)
+
+ table.add_row(
+ type_stat.mutation_type.replace("_", " ").title(),
+ str(type_stat.passed),
+ str(type_stat.total - type_stat.passed),
+ f"{type_stat.pass_rate:.1%}",
+ bar,
+ )
+
+ self.console.print(table)
+
+ def print_failures(self, limit: int = 10) -> None:
+ """
+ Print details of failed mutations.
+
+ Args:
+ limit: Maximum number of failures to show
+ """
+ failed = self.results.failed_mutations
+
+ if not failed:
+ self.console.print("[green]✓ No failures![/green]")
+ return
+
+ self.console.print(
+ f"\n[bold red]Failed Mutations ({len(failed)} total):[/bold red]"
+ )
+
+ for i, result in enumerate(failed[:limit]):
+ self.console.print(f"\n[bold]#{i+1} - {result.mutation.type.value}[/bold]")
+ self.console.print(
+ f" [dim]Original:[/dim] {result.original_prompt[:50]}..."
+ )
+ self.console.print(
+ f" [dim]Mutated:[/dim] {result.mutation.mutated[:50]}..."
+ )
+
+ for check in result.failed_checks:
+ self.console.print(
+ f" [red]✗ {check.check_type}:[/red] {check.details}"
+ )
+
+ if len(failed) > limit:
+ self.console.print(
+ f"\n[dim]...and {len(failed) - limit} more failures. "
+ "See HTML report for details.[/dim]"
+ )
+
+ def print_full_report(self) -> None:
+ """Print the complete terminal report."""
+ self.console.print()
+ self.print_summary()
+ self.console.print()
+ self.print_type_breakdown()
+ self.print_failures()
+ self.console.print()