diff --git a/.gitignore b/.gitignore index a51a674..69b98ee 100644 --- a/.gitignore +++ b/.gitignore @@ -80,10 +80,12 @@ Cargo.lock # ============================================================================= # Project-specific # ============================================================================= -# Generated reports -reports/ +# Generated reports (root only, not src/flakestorm/reports/) +/reports/ *.html !docs/*.html +# Explicitly include source code reports module +!src/flakestorm/reports/ # Local configuration (may contain secrets) flakestorm.yaml diff --git a/src/flakestorm/reports/__init__.py b/src/flakestorm/reports/__init__.py new file mode 100644 index 0000000..cc3c590 --- /dev/null +++ b/src/flakestorm/reports/__init__.py @@ -0,0 +1,30 @@ +""" +flakestorm Reports Module + +Provides report generation in multiple formats: +- Interactive HTML reports +- JSON exports +- Terminal output +""" + +from flakestorm.reports.html import HTMLReportGenerator +from flakestorm.reports.json_export import JSONReportGenerator +from flakestorm.reports.models import ( + CheckResult, + MutationResult, + TestResults, + TestStatistics, + TypeStatistics, +) +from flakestorm.reports.terminal import TerminalReporter + +__all__ = [ + "TestResults", + "TestStatistics", + "MutationResult", + "CheckResult", + "TypeStatistics", + "HTMLReportGenerator", + "JSONReportGenerator", + "TerminalReporter", +] diff --git a/src/flakestorm/reports/html.py b/src/flakestorm/reports/html.py new file mode 100644 index 0000000..8abe178 --- /dev/null +++ b/src/flakestorm/reports/html.py @@ -0,0 +1,655 @@ +""" +HTML Report Generator + +Generates interactive HTML reports with: +- Robustness score visualization +- Pass/fail matrix grid +- Drill-down into failed mutations +- Latency charts +""" + +from __future__ import annotations + +import json +from datetime import datetime +from pathlib import Path +from typing import TYPE_CHECKING + +from jinja2 import Template + +if TYPE_CHECKING: + from 
flakestorm.reports.models import TestResults + + +HTML_TEMPLATE = """ + + + + + + flakestorm Report - {{ report_date }} + + + +
+
+ +
+
{{ report_date }}
+
Duration: {{ duration }}s
+
+
+ +
+
+
+ + + + + +
{{ score_percent }}%
+
+
Robustness Score
+
+ +
+
+
Total Mutations
+
{{ total_mutations }}
+
+
+
Passed
+
{{ passed_mutations }}
+
+
+
Failed
+
{{ failed_mutations }}
+
+
+
Avg Latency
+
{{ avg_latency }}ms
+
+
+
+ +
+

📊 By Mutation Type

+
+ {% for type_stat in type_stats %} +
+
+ {{ type_stat.mutation_type }} + {{ type_stat.pass_rate_percent }}% +
+
+
+
+
+ {{ type_stat.passed }}/{{ type_stat.total }} passed +
+
+ {% endfor %} +
+
+ +
+

🔬 Mutation Results

+
+ {% for result in mutations %} +
+
{{ result.mutation.type }}
+
{{ result.mutation.mutated[:100] }}...
+
+ {{ result.latency_ms|round(0)|int }}ms + {{ '✓' if result.passed else '✗' }} +
+
+ {% endfor %} +
+
+
class HTMLReportGenerator:
    """
    Generates interactive HTML reports from test results.

    Renders ``HTML_TEMPLATE`` into a single HTML document string that can be
    returned to the caller or written to disk.
    """

    def __init__(self, results: TestResults):
        """
        Initialize the generator.

        Args:
            results: Test results to generate report from
        """
        self.results = results
        # Mutated prompts are arbitrary text and end up inside the page, so
        # every template expression must be HTML-escaped by default.
        self.template = Template(HTML_TEMPLATE, autoescape=True)

    @staticmethod
    def _ring_geometry(score: float, radius: float = 78) -> tuple[float, float]:
        """
        Compute the score-ring circumference and dash offset.

        Args:
            score: Robustness score; values outside 0.0-1.0 are clamped so
                the offset never goes negative or exceeds the circumference
            radius: Ring radius (78 is the value this generator has always
                used; presumably it matches the circle in HTML_TEMPLATE --
                verify if the template changes)

        Returns:
            ``(circumference, offset)`` where offset is the unfilled portion
        """
        import math

        circumference = 2 * math.pi * radius
        clamped = min(max(score, 0.0), 1.0)
        return circumference, circumference * (1 - clamped)

    def generate(self) -> str:
        """
        Generate the HTML report.

        Returns:
            Complete HTML document as a string
        """
        # Imported locally so the module's import block stays untouched.
        from jinja2.utils import htmlsafe_json_dumps

        stats = self.results.statistics
        circumference, score_offset = self._ring_geometry(stats.robustness_score)

        # Per-type rows: human-readable name plus a percentage for display.
        type_stats = [
            {
                "mutation_type": t.mutation_type.replace("_", " "),
                "total": t.total,
                "passed": t.passed,
                "pass_rate_percent": round(t.pass_rate * 100, 1),
            }
            for t in stats.by_type
        ]

        mutations_data = [m.to_dict() for m in self.results.mutations]

        return self.template.render(
            report_date=self.results.started_at.strftime("%Y-%m-%d %H:%M:%S"),
            duration=round(self.results.duration, 1),
            circumference=circumference,
            score_offset=score_offset,
            score_percent=round(stats.robustness_score * 100, 1),
            total_mutations=stats.total_mutations,
            passed_mutations=stats.passed_mutations,
            failed_mutations=stats.failed_mutations,
            avg_latency=round(stats.avg_latency_ms),
            type_stats=type_stats,
            mutations=self.results.mutations,
            # HTML-safe JSON: "<", ">", "&" and "'" become unicode escapes and
            # the result is marked safe, so it cannot break out of the page
            # markup and is not double-escaped by autoescaping.
            mutations_json=htmlsafe_json_dumps(mutations_data),
        )

    def save(self, path: str | Path | None = None) -> Path:
        """
        Save the HTML report to a file.

        Args:
            path: Output path. When omitted, a timestamped file name is
                created inside ``results.config.output.path``.

        Returns:
            Path to the saved file
        """
        if path is None:
            output_dir = Path(self.results.config.output.path)
            output_dir.mkdir(parents=True, exist_ok=True)

            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
            path = output_dir / f"flakestorm-{timestamp}.html"
        else:
            path = Path(path)
            path.parent.mkdir(parents=True, exist_ok=True)

        path.write_text(self.generate(), encoding="utf-8")
        return path


class JSONReportGenerator:
    """
    Generates JSON reports from test results.

    Creates structured JSON output suitable for:
    - CI/CD pipeline consumption
    - Data analysis tools
    - Dashboard integrations
    """

    def __init__(self, results: TestResults):
        """
        Initialize the generator.

        Args:
            results: Test results to generate report from
        """
        self.results = results

    @staticmethod
    def _dump(data: Any, pretty: bool = True) -> str:
        """Serialize *data* to JSON; values json cannot encode go through str()."""
        return json.dumps(data, indent=2 if pretty else None, default=str)

    def generate(self, pretty: bool = True) -> str:
        """
        Generate the JSON report.

        Args:
            pretty: Whether to format with indentation

        Returns:
            JSON string built from ``results.to_dict()``
        """
        return self._dump(self.results.to_dict(), pretty)

    def generate_summary(self) -> dict[str, Any]:
        """
        Generate a summary-only report (no mutation details).

        Useful for quick status checks in CI/CD.

        Returns:
            Dictionary with run timing, aggregate statistics, and a
            per-mutation-type breakdown keyed by mutation type name
        """
        stats = self.results.statistics

        by_type = {
            t.mutation_type: {
                "total": t.total,
                "passed": t.passed,
                "pass_rate": t.pass_rate,
            }
            for t in stats.by_type
        }

        return {
            "version": "1.0",
            "started_at": self.results.started_at.isoformat(),
            "completed_at": self.results.completed_at.isoformat(),
            "duration_seconds": self.results.duration,
            "robustness_score": stats.robustness_score,
            "pass_rate": stats.pass_rate,
            "total_mutations": stats.total_mutations,
            "passed_mutations": stats.passed_mutations,
            "failed_mutations": stats.failed_mutations,
            "avg_latency_ms": stats.avg_latency_ms,
            "p95_latency_ms": stats.p95_latency_ms,
            "by_type": by_type,
        }

    def save(self, path: str | Path | None = None, summary_only: bool = False) -> Path:
        """
        Save the JSON report to a file.

        Args:
            path: Output path. When omitted, a timestamped file name is
                created inside ``results.config.output.path``.
            summary_only: Only include summary, no mutation details

        Returns:
            Path to the saved file
        """
        if path is None:
            output_dir = Path(self.results.config.output.path)
            output_dir.mkdir(parents=True, exist_ok=True)

            timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
            suffix = "-summary" if summary_only else ""
            path = output_dir / f"flakestorm-{timestamp}{suffix}.json"
        else:
            path = Path(path)
            path.parent.mkdir(parents=True, exist_ok=True)

        if summary_only:
            content = self._dump(self.generate_summary())
        else:
            content = self.generate()

        path.write_text(content, encoding="utf-8")
        return path
@dataclass
class CheckResult:
    """Result of a single invariant check."""

    check_type: str
    """Type of the check (e.g., 'latency', 'contains')."""

    passed: bool
    """Whether the check passed."""

    details: str
    """Human-readable details about the check result."""

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "check_type": self.check_type,
            "passed": self.passed,
            "details": self.details,
        }


@dataclass
class TypeStatistics:
    """Statistics for a specific mutation type."""

    mutation_type: str
    """Name of the mutation type."""

    total: int
    """Total number of tests of this type."""

    passed: int
    """Number of tests that passed."""

    pass_rate: float
    """Pass rate as a decimal (0.0 to 1.0)."""

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization.

        Adds a derived ``failed`` count (``total - passed``).
        """
        return {
            "mutation_type": self.mutation_type,
            "total": self.total,
            "passed": self.passed,
            "failed": self.total - self.passed,
            "pass_rate": self.pass_rate,
        }


@dataclass
class TestStatistics:
    """Aggregate statistics for a test run."""

    # The name matches pytest's default ``Test*`` class pattern; opt out so
    # importing this model into a test module does not trigger a collection
    # warning. Unannotated on purpose: it must not become a dataclass field.
    __test__ = False

    total_mutations: int
    """Total number of mutations tested."""

    passed_mutations: int
    """Number of mutations that passed all checks."""

    failed_mutations: int
    """Number of mutations that failed one or more checks."""

    robustness_score: float
    """Weighted robustness score (0.0 to 1.0)."""

    avg_latency_ms: float
    """Average response latency in milliseconds."""

    p50_latency_ms: float
    """50th percentile (median) latency."""

    p95_latency_ms: float
    """95th percentile latency."""

    p99_latency_ms: float
    """99th percentile latency."""

    by_type: list[TypeStatistics] = field(default_factory=list)
    """Statistics broken down by mutation type."""

    duration_seconds: float = 0.0
    """Total test duration in seconds."""

    @property
    def pass_rate(self) -> float:
        """Simple pass rate (passed / total); 0.0 when nothing was tested."""
        if self.total_mutations == 0:
            return 0.0
        return self.passed_mutations / self.total_mutations

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "total_mutations": self.total_mutations,
            "passed_mutations": self.passed_mutations,
            "failed_mutations": self.failed_mutations,
            "robustness_score": self.robustness_score,
            "pass_rate": self.pass_rate,
            "avg_latency_ms": self.avg_latency_ms,
            "p50_latency_ms": self.p50_latency_ms,
            "p95_latency_ms": self.p95_latency_ms,
            "p99_latency_ms": self.p99_latency_ms,
            "duration_seconds": self.duration_seconds,
            "by_type": [t.to_dict() for t in self.by_type],
        }


@dataclass
class MutationResult:
    """Result of testing a single mutation."""

    original_prompt: str
    """The original golden prompt."""

    mutation: Mutation
    """The mutation that was tested."""

    response: str
    """The agent's response."""

    latency_ms: float
    """Response latency in milliseconds."""

    passed: bool
    """Whether all invariant checks passed."""

    checks: list[CheckResult] = field(default_factory=list)
    """Individual check results."""

    error: str | None = None
    """Error message if the agent call failed."""

    @property
    def failed_checks(self) -> list[CheckResult]:
        """Get list of failed checks, in their original order."""
        return [c for c in self.checks if not c.passed]

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization.

        The mutation and each check are serialized via their own ``to_dict``.
        """
        return {
            "original_prompt": self.original_prompt,
            "mutation": self.mutation.to_dict(),
            "response": self.response,
            "latency_ms": self.latency_ms,
            "passed": self.passed,
            "checks": [c.to_dict() for c in self.checks],
            "error": self.error,
        }
@dataclass
class TestResults:
    """Complete results from a test run."""

    # The name matches pytest's default ``Test*`` class pattern; opt out so
    # importing this model into a test module does not trigger a collection
    # warning. Unannotated on purpose: it must not become a dataclass field.
    __test__ = False

    config: FlakeStormConfig
    """Configuration used for the test."""

    started_at: datetime
    """When the test started."""

    completed_at: datetime
    """When the test completed."""

    mutations: list[MutationResult]
    """Results for each mutation."""

    statistics: TestStatistics
    """Aggregate statistics."""

    @property
    def duration(self) -> float:
        """Test duration in seconds (``completed_at - started_at``)."""
        return (self.completed_at - self.started_at).total_seconds()

    @property
    def passed_mutations(self) -> list[MutationResult]:
        """Get mutations that passed."""
        return [m for m in self.mutations if m.passed]

    @property
    def failed_mutations(self) -> list[MutationResult]:
        """Get mutations that failed."""
        return [m for m in self.mutations if not m.passed]

    def get_by_type(self, mutation_type: str) -> list[MutationResult]:
        """Get mutations of a specific type.

        Args:
            mutation_type: The type's string value. An enum-like object with
                a ``value`` attribute is also accepted and compared by that
                value, so passing the enum member itself no longer silently
                yields an empty list.
        """
        wanted = getattr(mutation_type, "value", mutation_type)
        return [m for m in self.mutations if m.mutation.type.value == wanted]

    def get_by_prompt(self, prompt: str) -> list[MutationResult]:
        """Get mutations for a specific golden prompt (exact string match)."""
        return [m for m in self.mutations if m.original_prompt == prompt]

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization.

        Timestamps are emitted in ISO 8601 form; statistics and mutations are
        serialized via their own ``to_dict``.
        """
        return {
            "version": "1.0",
            "started_at": self.started_at.isoformat(),
            "completed_at": self.completed_at.isoformat(),
            "duration_seconds": self.duration,
            "statistics": self.statistics.to_dict(),
            "mutations": [m.to_dict() for m in self.mutations],
            "golden_prompts": self.config.golden_prompts,
        }
class TerminalReporter:
    """
    Displays test results in the terminal using rich formatting.

    Provides colorful, informative output for interactive use.
    """

    def __init__(self, results: TestResults, console: Console | None = None):
        """
        Initialize the reporter.

        Args:
            results: Test results to display
            console: Rich console (default: new console)
        """
        self.results = results
        self.console = console if console is not None else Console()

    @staticmethod
    def _truncate(text: str, limit: int = 50) -> str:
        """Return *text* cut to *limit* characters, with "..." only if cut."""
        if len(text) <= limit:
            return text
        return f"{text[:limit]}..."

    def print_summary(self) -> None:
        """Print a summary panel: score, pass/fail counts, latency, duration."""
        stats = self.results.statistics

        # Robustness score with color
        score = stats.robustness_score
        if score >= 0.9:
            score_style = "bold green"
            score_emoji = "🎉"
        elif score >= 0.7:
            score_style = "bold yellow"
            score_emoji = "⚠️"
        else:
            score_style = "bold red"
            score_emoji = "❌"

        score_text = Text()
        score_text.append(f"{score_emoji} Robustness Score: ", style="bold")
        score_text.append(f"{score:.1%}", style=score_style)

        summary_lines = [
            score_text,
            Text(""),
            Text(f"Total Mutations: {stats.total_mutations}"),
            Text.assemble(
                ("Passed: ", ""),
                (str(stats.passed_mutations), "green"),
                (" | Failed: ", ""),
                (str(stats.failed_mutations), "red"),
            ),
            Text(""),
            Text(f"Avg Latency: {stats.avg_latency_ms:.0f}ms"),
            Text(f"P95 Latency: {stats.p95_latency_ms:.0f}ms"),
            Text(f"Duration: {self.results.duration:.1f}s"),
        ]

        # Join as Text, not str: str(Text) yields plain text and would throw
        # away the score and pass/fail colours assembled above.
        panel_content = Text("\n").join(summary_lines)

        self.console.print(
            Panel(
                panel_content,
                title="flakestorm Results",
                border_style="blue",
            )
        )

    def print_type_breakdown(self) -> None:
        """Print a table with one row per mutation type and a text progress bar."""
        stats = self.results.statistics

        table = Table(title="By Mutation Type", show_header=True)
        table.add_column("Type", style="cyan")
        table.add_column("Passed", justify="right", style="green")
        table.add_column("Failed", justify="right", style="red")
        table.add_column("Pass Rate", justify="right")
        table.add_column("Progress", width=20)

        bar_width = 15
        for type_stat in stats.by_type:
            # Clamp so an out-of-range pass rate cannot produce a bar that is
            # longer than bar_width or has a negative segment.
            filled = min(max(int(type_stat.pass_rate * bar_width), 0), bar_width)
            bar = "█" * filled + "░" * (bar_width - filled)

            table.add_row(
                type_stat.mutation_type.replace("_", " ").title(),
                str(type_stat.passed),
                str(type_stat.total - type_stat.passed),
                f"{type_stat.pass_rate:.1%}",
                bar,
            )

        self.console.print(table)

    def print_failures(self, limit: int = 10) -> None:
        """
        Print details of failed mutations.

        Args:
            limit: Maximum number of failures to show (negative values are
                treated as 0)
        """
        # Imported locally so the module's import block stays untouched.
        from rich.markup import escape

        failed = self.results.failed_mutations

        if not failed:
            self.console.print("[green]✓ No failures![/green]")
            return

        limit = max(limit, 0)

        self.console.print(
            f"\n[bold red]Failed Mutations ({len(failed)} total):[/bold red]"
        )

        for i, result in enumerate(failed[:limit], start=1):
            # Prompts and check details are arbitrary text; escape them so
            # square brackets are shown literally instead of being parsed as
            # console markup (which can raise on unbalanced tags).
            type_name = escape(str(result.mutation.type.value))
            original = escape(self._truncate(result.original_prompt))
            mutated = escape(self._truncate(result.mutation.mutated))

            self.console.print(f"\n[bold]#{i} - {type_name}[/bold]")
            self.console.print(f" [dim]Original:[/dim] {original}")
            self.console.print(f" [dim]Mutated:[/dim] {mutated}")

            for check in result.failed_checks:
                self.console.print(
                    f" [red]✗ {escape(str(check.check_type))}:[/red] "
                    f"{escape(str(check.details))}"
                )

        if len(failed) > limit:
            self.console.print(
                f"\n[dim]...and {len(failed) - limit} more failures. "
                "See HTML report for details.[/dim]"
            )

    def print_full_report(self) -> None:
        """Print the complete terminal report: summary, type table, failures."""
        self.console.print()
        self.print_summary()
        self.console.print()
        self.print_type_breakdown()
        self.print_failures()
        self.console.print()