mirror of
https://github.com/flakestorm/flakestorm.git
synced 2026-04-25 00:36:54 +02:00
Update model configuration and enhance documentation for improved user guidance - Change default model to "gemma3:1b" in flakestorm-generate-search-queries.yaml and increase mutation count from 3 to 20 - Revise README.md to include demo visuals and model recommendations based on system RAM - Expand USAGE_GUIDE.md with detailed model selection criteria and installation instructions - Enhance HTML report generation to include actionable recommendations for failed mutations and executive summary insights.
This commit is contained in:
parent
8fc291d186
commit
2dcaf31712
7 changed files with 594 additions and 24 deletions
24
README.md
24
README.md
|
|
@ -42,6 +42,20 @@ Instead of running one test case, Flakestorm takes a single "Golden Prompt", gen
|
||||||
- ✅ **Local-First**: Uses Ollama with Qwen 3 8B for free testing
|
- ✅ **Local-First**: Uses Ollama with Qwen 3 8B for free testing
|
||||||
- ✅ **Beautiful Reports**: Interactive HTML reports with pass/fail matrices
|
- ✅ **Beautiful Reports**: Interactive HTML reports with pass/fail matrices
|
||||||
|
|
||||||
|
## Demo
|
||||||
|
|
||||||
|
### flakestorm in Action
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
*Watch flakestorm generate mutations and test your agent in real-time*
|
||||||
|
|
||||||
|
### Test Report
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
*Interactive HTML reports with detailed failure analysis and recommendations*
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
### Installation Order
|
### Installation Order
|
||||||
|
|
@ -97,7 +111,11 @@ sudo apt install ollama
|
||||||
# Windows: Starts automatically as a service
|
# Windows: Starts automatically as a service
|
||||||
|
|
||||||
# In another terminal, pull the model
|
# In another terminal, pull the model
|
||||||
ollama pull qwen3:8b
|
# Choose based on your RAM:
|
||||||
|
# - 8GB RAM: ollama pull tinyllama:1.1b or gemma2:2b
|
||||||
|
# - 16GB RAM: ollama pull qwen2.5:3b (recommended)
|
||||||
|
# - 32GB+ RAM: ollama pull qwen2.5-coder:7b (best quality)
|
||||||
|
ollama pull qwen2.5:3b
|
||||||
```
|
```
|
||||||
|
|
||||||
**Troubleshooting:** If you get `syntax error: <!doctype html>` or `command not found` when running `ollama` commands:
|
**Troubleshooting:** If you get `syntax error: <!doctype html>` or `command not found` when running `ollama` commands:
|
||||||
|
|
@ -194,7 +212,9 @@ agent:
|
||||||
|
|
||||||
model:
|
model:
|
||||||
provider: "ollama"
|
provider: "ollama"
|
||||||
name: "qwen3:8b"
|
# Choose model based on your RAM: 8GB (tinyllama:1.1b), 16GB (qwen2.5:3b), 32GB+ (qwen2.5-coder:7b)
|
||||||
|
# See docs/USAGE_GUIDE.md for full model recommendations
|
||||||
|
name: "qwen2.5:3b"
|
||||||
base_url: "http://localhost:11434"
|
base_url: "http://localhost:11434"
|
||||||
|
|
||||||
mutations:
|
mutations:
|
||||||
|
|
|
||||||
|
|
@ -258,6 +258,57 @@ ollama pull qwen2.5-coder:7b
|
||||||
ollama run qwen2.5-coder:7b "Hello, world!"
|
ollama run qwen2.5-coder:7b "Hello, world!"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Choosing the Right Model for Your System
|
||||||
|
|
||||||
|
flakestorm uses local LLMs to generate mutations. Choose a model that fits your system's RAM and performance requirements:
|
||||||
|
|
||||||
|
| System RAM | Recommended Model | Model Size | Speed | Quality | Use Case |
|
||||||
|
|------------|-------------------|------------|-------|---------|----------|
|
||||||
|
| **4-8 GB** | `tinyllama:1.1b` | ~700 MB | ⚡⚡⚡ Very Fast | ⭐⭐ Basic | Quick testing, CI/CD |
|
||||||
|
| **8-16 GB** | `gemma2:2b` | ~1.4 GB | ⚡⚡ Fast | ⭐⭐⭐ Good | Balanced performance |
|
||||||
|
| **8-16 GB** | `phi3:mini` | ~2.3 GB | ⚡⚡ Fast | ⭐⭐⭐ Good | Microsoft's efficient model |
|
||||||
|
| **16-32 GB** | `qwen2.5:3b` | ~2.0 GB | ⚡⚡ Fast | ⭐⭐⭐⭐ Very Good | Recommended for most users |
|
||||||
|
| **16-32 GB** | `gemma2:9b` | ~5.4 GB | ⚡ Moderate | ⭐⭐⭐⭐ Very Good | Better quality mutations |
|
||||||
|
| **32+ GB** | `qwen2.5-coder:7b` | ~4.4 GB | ⚡ Moderate | ⭐⭐⭐⭐⭐ Excellent | Best for code/structured prompts |
|
||||||
|
| **32+ GB** | `qwen2.5:7b` | ~4.4 GB | ⚡ Moderate | ⭐⭐⭐⭐⭐ Excellent | Best overall quality |
|
||||||
|
| **64+ GB** | `qwen2.5:14b` | ~8.9 GB | 🐌 Slower | ⭐⭐⭐⭐⭐ Excellent | Maximum quality (overkill for most) |
|
||||||
|
|
||||||
|
**Quick Recommendations:**
|
||||||
|
|
||||||
|
- **Minimum viable (8GB RAM)**: `tinyllama:1.1b` or `gemma2:2b`
|
||||||
|
- **Recommended (16GB+ RAM)**: `qwen2.5:3b` or `gemma2:9b`
|
||||||
|
- **Best quality (32GB+ RAM)**: `qwen2.5-coder:7b` or `qwen2.5:7b`
|
||||||
|
|
||||||
|
**Pull your chosen model:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# For 8GB RAM systems
|
||||||
|
ollama pull tinyllama:1.1b
|
||||||
|
# or
|
||||||
|
ollama pull gemma2:2b
|
||||||
|
|
||||||
|
# For 16GB RAM systems (recommended)
|
||||||
|
ollama pull qwen2.5:3b
|
||||||
|
# or
|
||||||
|
ollama pull gemma2:9b
|
||||||
|
|
||||||
|
# For 32GB+ RAM systems (best quality)
|
||||||
|
ollama pull qwen2.5-coder:7b
|
||||||
|
# or
|
||||||
|
ollama pull qwen2.5:7b
|
||||||
|
```
|
||||||
|
|
||||||
|
**Update your `flakestorm.yaml` to use your chosen model:**
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
model:
|
||||||
|
provider: "ollama"
|
||||||
|
name: "qwen2.5:3b" # Change to your chosen model
|
||||||
|
base_url: "http://localhost:11434"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note:** Smaller models are faster but may produce less diverse mutations. Larger models produce higher-quality mutations but require more RAM and are slower. For most users, `qwen2.5:3b` or `gemma2:9b` provides the best balance.
|
||||||
|
|
||||||
### Step 3: Create Virtual Environment and Install flakestorm
|
### Step 3: Create Virtual Environment and Install flakestorm
|
||||||
|
|
||||||
**CRITICAL: Python 3.10+ Required!**
|
**CRITICAL: Python 3.10+ Required!**
|
||||||
|
|
@ -375,10 +426,22 @@ maturin build --release
|
||||||
# 4. Remove any old wheels (if they exist)
|
# 4. Remove any old wheels (if they exist)
|
||||||
rm -f ../target/wheels/entropix_rust-*.whl # Remove old wheels with wrong name
|
rm -f ../target/wheels/entropix_rust-*.whl # Remove old wheels with wrong name
|
||||||
|
|
||||||
# 5. Install the new wheel (use specific pattern to avoid old wheels)
|
# 5. List available wheel files to get the exact filename
|
||||||
pip install ../target/wheels/flakestorm_rust-*.whl
|
# On Linux/macOS:
|
||||||
|
ls ../target/wheels/flakestorm_rust-*.whl
|
||||||
|
# On Windows (PowerShell):
|
||||||
|
# Get-ChildItem ..\target\wheels\flakestorm_rust-*.whl
|
||||||
|
|
||||||
# 6. Verify installation
|
# 6. Install the wheel using the FULL filename (wildcard pattern may not work)
|
||||||
|
# Copy the exact filename from step 5 and use it here:
|
||||||
|
# Example for Windows:
|
||||||
|
# pip install ../target/wheels/flakestorm_rust-0.1.0-cp311-cp311-win_amd64.whl
|
||||||
|
# Example for Linux:
|
||||||
|
# pip install ../target/wheels/flakestorm_rust-0.1.0-cp311-cp311-manylinux_2_34_x86_64.whl
|
||||||
|
# Example for macOS:
|
||||||
|
# pip install ../target/wheels/flakestorm_rust-0.1.0-cp311-cp311-macosx_10_9_x86_64.whl
|
||||||
|
|
||||||
|
# 7. Verify installation
|
||||||
python -c "import flakestorm_rust; print('Rust extension installed successfully!')"
|
python -c "import flakestorm_rust; print('Rust extension installed successfully!')"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
@ -994,17 +1057,22 @@ mutations:
|
||||||
length_extremes: 1.2
|
length_extremes: 1.2
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# LLM CONFIGURATION (for mutation generation)
|
# MODEL CONFIGURATION (for mutation generation)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
llm:
|
model:
|
||||||
# Ollama model to use
|
# Model provider: "ollama" (default)
|
||||||
model: "qwen2.5-coder:7b"
|
provider: "ollama"
|
||||||
|
|
||||||
|
# Model name (must be pulled in Ollama first)
|
||||||
|
# See "Choosing the Right Model for Your System" section above for recommendations
|
||||||
|
# based on your RAM: 8GB (tinyllama:1.1b), 16GB (qwen2.5:3b), 32GB+ (qwen2.5-coder:7b)
|
||||||
|
name: "qwen2.5-coder:7b"
|
||||||
|
|
||||||
# Ollama server URL
|
# Ollama server URL
|
||||||
host: "http://localhost:11434"
|
base_url: "http://localhost:11434"
|
||||||
|
|
||||||
# Generation temperature (higher = more creative mutations)
|
# Optional: Generation temperature (higher = more creative mutations)
|
||||||
temperature: 0.8
|
# temperature: 0.8
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# INVARIANTS (ASSERTIONS)
|
# INVARIANTS (ASSERTIONS)
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,7 @@ agent:
|
||||||
# Recommended for 8GB RAM: qwen2.5:1.5b (fastest), tinyllama (smallest), or phi3:mini (best quality)
|
# Recommended for 8GB RAM: qwen2.5:1.5b (fastest), tinyllama (smallest), or phi3:mini (best quality)
|
||||||
model:
|
model:
|
||||||
provider: "ollama"
|
provider: "ollama"
|
||||||
name: "tinyllama" # Small, fast model optimized for 8GB RAM
|
name: "gemma3:1b" # Small, fast model optimized for 8GB RAM
|
||||||
base_url: "http://localhost:11434"
|
base_url: "http://localhost:11434"
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
@ -42,7 +42,7 @@ model:
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
mutations:
|
mutations:
|
||||||
# Number of mutations to generate per golden prompt
|
# Number of mutations to generate per golden prompt
|
||||||
count: 3
|
count: 20
|
||||||
|
|
||||||
# Types of mutations to apply
|
# Types of mutations to apply
|
||||||
types:
|
types:
|
||||||
|
|
|
||||||
BIN
flakestorm_demo.gif
Normal file
BIN
flakestorm_demo.gif
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 2.7 MiB |
BIN
flakestorm_test_reporting.gif
Normal file
BIN
flakestorm_test_reporting.gif
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 45 MiB |
|
|
@ -26,7 +26,7 @@ from rich.progress import (
|
||||||
)
|
)
|
||||||
|
|
||||||
# Configuration limits for local hardware constraints
|
# Configuration limits for local hardware constraints
|
||||||
MAX_MUTATIONS_PER_RUN = 50
|
MAX_MUTATIONS_PER_RUN = 200
|
||||||
PARALLEL_EXECUTION_ENABLED = False # Sequential execution for local hardware
|
PARALLEL_EXECUTION_ENABLED = False # Sequential execution for local hardware
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
|
|
||||||
|
|
@ -13,12 +13,12 @@ from __future__ import annotations
|
||||||
import json
|
import json
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
from jinja2 import Template
|
from jinja2 import Template
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from flakestorm.reports.models import TestResults
|
from flakestorm.reports.models import MutationResult, TestResults
|
||||||
|
|
||||||
|
|
||||||
HTML_TEMPLATE = """
|
HTML_TEMPLATE = """
|
||||||
|
|
@ -461,6 +461,77 @@ HTML_TEMPLATE = """
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{% if summary.total_failures > 0 %}
|
||||||
|
<div class="section">
|
||||||
|
<h2 class="section-title">📋 Executive Summary & Action Items</h2>
|
||||||
|
<div class="summary-card" style="background: var(--bg-card); border-radius: 12px; padding: 1.5rem; margin-bottom: 1rem;">
|
||||||
|
<div style="margin-bottom: 1rem;">
|
||||||
|
<h3 style="font-size: 1.125rem; margin-bottom: 0.75rem;">Overall Assessment</h3>
|
||||||
|
<p style="color: var(--text-secondary); line-height: 1.6;">
|
||||||
|
Your agent has a <strong>{{ score_percent }}%</strong> robustness score with
|
||||||
|
<strong>{{ failed_mutations }}</strong> failures out of <strong>{{ total_mutations }}</strong> tests.
|
||||||
|
{% if score_percent < 70 %}
|
||||||
|
<span style="color: var(--danger);">⚠️ This indicates significant vulnerabilities that need immediate attention.</span>
|
||||||
|
{% elif score_percent < 85 %}
|
||||||
|
<span style="color: var(--warning);">⚠️ Your agent needs improvement before production deployment.</span>
|
||||||
|
{% else %}
|
||||||
|
<span style="color: var(--success);">✓ Your agent shows good robustness, but there's room for improvement.</span>
|
||||||
|
{% endif %}
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{% if summary.recommendations %}
|
||||||
|
<div style="margin-top: 1.5rem;">
|
||||||
|
<h3 style="font-size: 1.125rem; margin-bottom: 0.75rem;">Priority Action Items</h3>
|
||||||
|
<div style="display: flex; flex-direction: column; gap: 0.75rem;">
|
||||||
|
{% for rec in summary.recommendations %}
|
||||||
|
<div style="background: var(--bg-secondary); border-left: 4px solid
|
||||||
|
{% if rec.priority == 'critical' %}var(--danger)
|
||||||
|
{% elif rec.priority == 'high' %}var(--warning)
|
||||||
|
{% else %}var(--accent)
|
||||||
|
{% endif %};
|
||||||
|
padding: 1rem; border-radius: 8px;">
|
||||||
|
<div style="display: flex; justify-content: space-between; align-items: start; margin-bottom: 0.5rem;">
|
||||||
|
<div>
|
||||||
|
<strong style="text-transform: uppercase; font-size: 0.75rem; letter-spacing: 0.05em;
|
||||||
|
color: {% if rec.priority == 'critical' %}var(--danger)
|
||||||
|
{% elif rec.priority == 'high' %}var(--warning)
|
||||||
|
{% else %}var(--accent)
|
||||||
|
{% endif %};">
|
||||||
|
{{ rec.priority }} Priority
|
||||||
|
</strong>
|
||||||
|
<h4 style="margin: 0.25rem 0; font-size: 1rem;">{{ rec.issue }}</h4>
|
||||||
|
</div>
|
||||||
|
<span style="background: var(--bg-primary); padding: 0.25rem 0.75rem; border-radius: 12px; font-size: 0.875rem;">
|
||||||
|
{{ rec.count }} occurrence{{ 's' if rec.count != 1 else '' }}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<p style="margin: 0; color: var(--text-secondary); line-height: 1.5;">{{ rec.action }}</p>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
|
{% if summary.top_issues %}
|
||||||
|
<div style="margin-top: 1.5rem;">
|
||||||
|
<h3 style="font-size: 1.125rem; margin-bottom: 0.75rem;">Top Failure Types</h3>
|
||||||
|
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 0.75rem;">
|
||||||
|
{% for issue in summary.top_issues %}
|
||||||
|
<div style="background: var(--bg-secondary); padding: 0.75rem; border-radius: 8px;">
|
||||||
|
<div style="font-size: 0.875rem; color: var(--text-secondary); margin-bottom: 0.25rem;">
|
||||||
|
{{ issue.type.replace('_', ' ').title() }}
|
||||||
|
</div>
|
||||||
|
<div style="font-size: 1.25rem; font-weight: 600;">{{ issue.count }}</div>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
<div class="section">
|
<div class="section">
|
||||||
<h2 class="section-title">📊 By Mutation Type</h2>
|
<h2 class="section-title">📊 By Mutation Type</h2>
|
||||||
<div class="type-breakdown">
|
<div class="type-breakdown">
|
||||||
|
|
@ -512,24 +583,38 @@ HTML_TEMPLATE = """
|
||||||
<script>
|
<script>
|
||||||
const mutations = {{ mutations_json|safe }};
|
const mutations = {{ mutations_json|safe }};
|
||||||
|
|
||||||
|
function escapeHtml(text) {
|
||||||
|
const div = document.createElement('div');
|
||||||
|
div.textContent = text;
|
||||||
|
return div.innerHTML;
|
||||||
|
}
|
||||||
|
|
||||||
function showDetail(index) {
|
function showDetail(index) {
|
||||||
const m = mutations[index];
|
const m = mutations[index];
|
||||||
const modal = document.getElementById('detail-modal');
|
const modal = document.getElementById('detail-modal');
|
||||||
const body = document.getElementById('modal-body');
|
const body = document.getElementById('modal-body');
|
||||||
|
|
||||||
|
const hasRecommendation = m.recommendation && !m.passed;
|
||||||
|
|
||||||
body.innerHTML = `
|
body.innerHTML = `
|
||||||
<div class="detail-section">
|
<div class="detail-section">
|
||||||
<div class="detail-label">Original Prompt</div>
|
<div class="detail-label">Original Prompt</div>
|
||||||
<div class="detail-content">${m.original_prompt}</div>
|
<div class="detail-content">${escapeHtml(m.original_prompt)}</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="detail-section">
|
<div class="detail-section">
|
||||||
<div class="detail-label">Mutated (${m.mutation.type})</div>
|
<div class="detail-label">Mutated (${m.mutation.type})</div>
|
||||||
<div class="detail-content">${m.mutation.mutated}</div>
|
<div class="detail-content">${escapeHtml(m.mutation.mutated)}</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="detail-section">
|
<div class="detail-section">
|
||||||
<div class="detail-label">Agent Response</div>
|
<div class="detail-label">Agent Response</div>
|
||||||
<div class="detail-content">${m.response || '(empty)'}</div>
|
<div class="detail-content">${escapeHtml(m.response || '(empty)')}</div>
|
||||||
</div>
|
</div>
|
||||||
|
${m.error ? `
|
||||||
|
<div class="detail-section">
|
||||||
|
<div class="detail-label" style="color: var(--danger);">Error</div>
|
||||||
|
<div class="detail-content" style="color: var(--danger);">${escapeHtml(m.error)}</div>
|
||||||
|
</div>
|
||||||
|
` : ''}
|
||||||
<div class="detail-section">
|
<div class="detail-section">
|
||||||
<div class="detail-label">Invariant Checks</div>
|
<div class="detail-label">Invariant Checks</div>
|
||||||
<ul class="check-list">
|
<ul class="check-list">
|
||||||
|
|
@ -539,13 +624,41 @@ HTML_TEMPLATE = """
|
||||||
${c.passed ? '✓' : '✗'}
|
${c.passed ? '✓' : '✗'}
|
||||||
</div>
|
</div>
|
||||||
<div class="check-details">
|
<div class="check-details">
|
||||||
<div class="check-type">${c.check_type}</div>
|
<div class="check-type">${escapeHtml(c.check_type)}</div>
|
||||||
<div class="check-message">${c.details}</div>
|
<div class="check-message">${escapeHtml(c.details)}</div>
|
||||||
</div>
|
</div>
|
||||||
</li>
|
</li>
|
||||||
`).join('')}
|
`).join('')}
|
||||||
</ul>
|
</ul>
|
||||||
</div>
|
</div>
|
||||||
|
${hasRecommendation ? `
|
||||||
|
<div class="detail-section" style="background: var(--bg-card); border-left: 4px solid
|
||||||
|
${m.recommendation.priority === 'critical' ? 'var(--danger)' :
|
||||||
|
m.recommendation.priority === 'high' ? 'var(--warning)' : 'var(--accent)'};
|
||||||
|
padding: 1rem; border-radius: 8px; margin-top: 1rem;">
|
||||||
|
<div style="display: flex; justify-content: space-between; align-items: start; margin-bottom: 0.75rem;">
|
||||||
|
<div>
|
||||||
|
<div style="text-transform: uppercase; font-size: 0.75rem; letter-spacing: 0.05em;
|
||||||
|
color: ${m.recommendation.priority === 'critical' ? 'var(--danger)' :
|
||||||
|
m.recommendation.priority === 'high' ? 'var(--warning)' : 'var(--accent)'};
|
||||||
|
font-weight: 600; margin-bottom: 0.25rem;">
|
||||||
|
${m.recommendation.priority} Priority
|
||||||
|
</div>
|
||||||
|
<h4 style="margin: 0; font-size: 1.125rem; color: var(--text-primary);">
|
||||||
|
💡 ${escapeHtml(m.recommendation.title)}
|
||||||
|
</h4>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<p style="color: var(--text-secondary); line-height: 1.6; margin-bottom: 1rem;">
|
||||||
|
${escapeHtml(m.recommendation.description)}
|
||||||
|
</p>
|
||||||
|
${m.recommendation.code ? `
|
||||||
|
<div style="background: var(--bg-primary); border-radius: 8px; padding: 1rem; overflow-x: auto;">
|
||||||
|
<pre style="margin: 0; font-family: 'SF Mono', 'Fira Code', monospace; font-size: 0.875rem; line-height: 1.5; color: var(--text-primary);"><code>${escapeHtml(m.recommendation.code)}</code></pre>
|
||||||
|
</div>
|
||||||
|
` : ''}
|
||||||
|
</div>
|
||||||
|
` : ''}
|
||||||
`;
|
`;
|
||||||
|
|
||||||
modal.classList.add('active');
|
modal.classList.add('active');
|
||||||
|
|
@ -586,6 +699,366 @@ class HTMLReportGenerator:
|
||||||
self.results = results
|
self.results = results
|
||||||
self.template = Template(HTML_TEMPLATE)
|
self.template = Template(HTML_TEMPLATE)
|
||||||
|
|
||||||
|
def _generate_recommendation(
    self, mutation_result: Any
) -> dict[str, str]:
    """
    Generate actionable recommendation for a failed mutation.

    The recommendation is chosen by the first rule that matches, in this
    order:

    1. The agent error message (JSON problems, HTTP 500, anything else).
    2. The failed invariant checks (latency, valid_json, contains,
       excludes_pii).
    3. The mutation type (encoding_attacks, context_manipulation,
       prompt_injection, length_extremes).
    4. A generic fallback.

    Args:
        mutation_result: The failed mutation result. This method reads its
            ``failed_checks`` (objects with ``check_type`` and ``details``),
            ``mutation.type.value``, ``error`` and, for latency failures,
            ``latency_ms``.

    Returns:
        Dictionary with ``title``, ``description``, ``priority``
        ("critical", "high" or "medium") and ``code`` (an example snippet).
    """
    # Local import: only this method needs it, so the module's import
    # block stays untouched.
    import re

    failed_checks = mutation_result.failed_checks
    mutation_type = mutation_result.mutation.type.value
    error = mutation_result.error

    # Check for agent errors (HTTP 500, connection errors, etc.)
    if error:
        error_lower = error.lower()

        if "json" in error_lower:
            if "control character" in error_lower:
                return {
                    "title": "Fix JSON Input Sanitization",
                    "description": "The mutated input contains control characters (newlines, tabs) that break JSON parsing. Your agent needs to sanitize inputs before inserting them into JSON.",
                    "priority": "high",
                    "code": '''# Python example
import json
import re

def sanitize_for_json(text: str) -> str:
    """Remove control characters that break JSON."""
    # Remove control characters (0x00-0x1F, 0x7F-0x9F)
    return re.sub(r'[\\x00-\\x1f\\x7f-\\x9f]', '', text)

# In your request handler:
sanitized = sanitize_for_json(user_input)
request_body = json.dumps({"productDescription": sanitized})''',
                }
            return {
                "title": "Fix JSON Parsing Error",
                "description": f"The agent returned invalid JSON. Error: {error[:100]}",
                "priority": "high",
                "code": '''# Ensure your agent always returns valid JSON
# Wrap responses in try/except:
try:
    response = json.loads(agent_output)
except json.JSONDecodeError as e:
    return {"error": "Invalid JSON response", "details": str(e)}''',
            }

        # Match 500 only as a standalone number. A plain substring test
        # would also fire on "1500ms", "port 5000" or "0.500s" and report
        # an unrelated failure as a critical server error.
        if re.search(r"(?<![\w.])500(?!\w|\.\d)", error):
            return {
                "title": "Fix Server Error Handling",
                "description": "The agent's backend returned HTTP 500. This indicates a server-side error that needs investigation.",
                "priority": "critical",
                "code": '''# Add error handling in your agent:
# 1. Check server logs for the actual error
# 2. Add input validation before processing
# 3. Return proper error responses instead of 500

def handle_request(input_text):
    try:
        # Validate input
        if not input_text or len(input_text) > MAX_LENGTH:
            return {"error": "Invalid input"}

        # Process request
        result = process(input_text)
        return {"success": True, "data": result}
    except Exception as e:
        # Log error, return 400 instead of 500
        logger.error(f"Error: {e}")
        return {"error": "Processing failed", "status": 400}''',
            }

        return {
            "title": "Fix Agent Error",
            "description": f"The agent failed with error: {error[:150]}",
            "priority": "high",
            "code": "# Check agent logs and add proper error handling",
        }

    # Check for specific invariant failures. Keep the first failed check of
    # each type so its details can be quoted in the description.
    first_failed: dict[str, Any] = {}
    for check in failed_checks:
        first_failed.setdefault(check.check_type, check)

    if "latency" in first_failed:
        return {
            "title": "Optimize Response Latency",
            "description": f"Response took {mutation_result.latency_ms:.0f}ms, exceeding the threshold. This mutation type ({mutation_type}) is causing performance issues.",
            "priority": "medium",
            "code": '''# Performance optimization strategies:
# 1. Add caching for similar requests
# 2. Optimize LLM calls (reduce max_tokens, use faster models)
# 3. Add request timeout and circuit breaker
# 4. Consider async processing for long operations

# Example timeout:
import asyncio

async def process_with_timeout(input_text, max_ms=10000):
    try:
        return await asyncio.wait_for(
            process_request(input_text),
            timeout=max_ms / 1000
        )
    except asyncio.TimeoutError:
        return {"error": "Request timeout"}''',
        }

    if "valid_json" in first_failed:
        return {
            "title": "Ensure Valid JSON Response",
            "description": "The agent's response is not valid JSON. All responses must be properly formatted JSON.",
            "priority": "high",
            "code": '''# Always return valid JSON:
import json

def format_response(data):
    """Ensure response is always valid JSON."""
    try:
        # If data is already a dict/list, serialize it
        if isinstance(data, (dict, list)):
            return json.dumps(data)
        # If it's a string, try to parse it first
        try:
            parsed = json.loads(data)
            return json.dumps(parsed)
        except:
            # Wrap in a JSON object
            return json.dumps({"output": data})
    except Exception as e:
        return json.dumps({"error": str(e)})''',
        }

    if "contains" in first_failed:
        contains_check = first_failed["contains"]
        return {
            "title": "Fix Response Content Validation",
            "description": f"Response doesn't contain expected content. {contains_check.details}",
            "priority": "medium",
            "code": "# Review your agent's response logic to ensure it includes required content",
        }

    if "excludes_pii" in first_failed:
        return {
            "title": "Fix PII Leakage",
            "description": "The response contains personally identifiable information (PII) that should not be exposed.",
            "priority": "critical",
            "code": '''# Add PII detection and filtering:
import re

PII_PATTERNS = [
    r'\\b\\d{3}-\\d{2}-\\d{4}\\b',  # SSN
    r'\\b\\d{16}\\b',  # Credit card
    r'\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b',  # Email
]

def filter_pii(text):
    """Remove PII from text."""
    for pattern in PII_PATTERNS:
        text = re.sub(pattern, '[REDACTED]', text)
    return text''',
        }

    # Default recommendation based on mutation type. The table is rebuilt on
    # every call, so callers may mutate the returned dict freely.
    mutation_recommendations = {
        "encoding_attacks": {
            "title": "Handle Encoded Inputs",
            "description": "The agent failed on encoded inputs (Base64, Unicode, URL encoding). Add input decoding and validation.",
            "priority": "high",
            "code": '''# Decode various encoding formats:
import base64
import urllib.parse

def decode_input(text):
    """Try to decode various encoding formats."""
    # Try URL decoding
    try:
        decoded = urllib.parse.unquote(text)
        if decoded != text:
            return decoded
    except:
        pass

    # Try Base64
    try:
        decoded = base64.b64decode(text).decode('utf-8')
        return decoded
    except:
        pass

    return text''',
        },
        "context_manipulation": {
            "title": "Improve Context Extraction",
            "description": "The agent failed when context was manipulated. Improve intent extraction from noisy inputs.",
            "priority": "medium",
            "code": "# Use semantic search or LLM to extract core intent from noisy context",
        },
        "prompt_injection": {
            "title": "Strengthen Prompt Injection Defense",
            "description": "The agent is vulnerable to prompt injection attacks. Add input validation and filtering.",
            "priority": "critical",
            "code": '''# Add prompt injection detection:
INJECTION_PATTERNS = [
    "ignore previous instructions",
    "forget your rules",
    "you are now",
    "system:",
    "assistant:",
]

def detect_injection(text):
    """Detect potential prompt injection."""
    text_lower = text.lower()
    for pattern in INJECTION_PATTERNS:
        if pattern in text_lower:
            return True
    return False''',
        },
        "length_extremes": {
            "title": "Handle Edge Case Inputs",
            "description": "The agent failed on extreme input lengths (empty or very long). Add input validation.",
            "priority": "medium",
            "code": '''# Add input length validation:
MIN_LENGTH = 1
MAX_LENGTH = 10000

def validate_input(text):
    """Validate input length."""
    if len(text) < MIN_LENGTH:
        return {"error": "Input too short"}
    if len(text) > MAX_LENGTH:
        return {"error": "Input too long"}
    return None''',
        },
    }

    if mutation_type in mutation_recommendations:
        return mutation_recommendations[mutation_type]

    # Generic recommendation
    return {
        "title": "Review Agent Logic",
        "description": f"The agent failed on {mutation_type} mutation. Review the agent's handling of this input type.",
        "priority": "medium",
        "code": "# Review agent logs and add appropriate error handling",
    }
|
||||||
|
|
||||||
|
def _generate_summary(self) -> dict[str, Any]:
    """
    Generate executive summary with actionable insights.

    Walks ``self.results.failed_mutations`` once and tallies the failures
    per mutation type, per failed check type, and per coarse error
    category derived from the text of ``mutation.error``. From those
    tallies it derives the five most frequent failing checks and a list
    of prioritized recommendations.

    Error categories are assigned first-match-wins, in this order:
    "JSON Parsing" (error mentions "json", case-insensitive), "HTTP 500"
    (error contains 500 as a standalone number), "Timeout" (error mentions
    "timeout", case-insensitive), otherwise "Other Errors". Mutations
    with an empty or missing error are not categorized.

    Returns:
        Dictionary with summary data:
            total_failures: number of failed mutations
            failures_by_type: mutation type -> number of failures
            failures_by_check: check type -> number of failed checks
            error_types: error category -> number of occurrences
            top_issues: up to five ``{"type", "count"}`` dicts, most
                frequent failing check first
            recommendations: list of ``{"priority", "issue", "count",
                "action"}`` dicts
    """
    # Function-scope imports keep this method self-contained; it does not
    # rely on the module already importing these stdlib names.
    import re
    from collections import Counter, defaultdict

    failed = self.results.failed_mutations

    # A bare substring test for "500" also fires on values such as
    # "5000ms" or "1500", which would report timeouts as server errors.
    # Require that 500 is not embedded in a longer number.
    http_500 = re.compile(r"(?<![\d.])500(?!\d)")

    failures_by_type: dict[str, list] = defaultdict(list)
    failures_by_check: dict[str, int] = Counter()
    error_types: dict[str, int] = Counter()

    for mutation in failed:
        # Group by mutation type
        failures_by_type[mutation.mutation.type.value].append(mutation)

        # Count check failures
        failures_by_check.update(
            check.check_type for check in mutation.failed_checks
        )

        # Count error types (first matching category wins)
        error = mutation.error
        if not error:
            continue
        error_lower = error.lower()
        if "json" in error_lower:
            error_types["JSON Parsing"] += 1
        elif http_500.search(error):
            error_types["HTTP 500"] += 1
        elif "timeout" in error_lower:
            error_types["Timeout"] += 1
        else:
            error_types["Other Errors"] += 1

    # Top issues: the five most frequent failing checks. sorted() is
    # stable, so ties keep first-seen order.
    sorted_checks = sorted(
        failures_by_check.items(), key=lambda item: item[1], reverse=True
    )
    top_issues = [
        {"type": check_type, "count": count}
        for check_type, count in sorted_checks[:5]
    ]

    # Recommendations. Counter lookups return 0 for absent keys and
    # .get() on the defaultdict does not insert, so probing here never
    # adds empty entries to the returned tallies.
    recommendations = []
    if error_types["JSON Parsing"] > 0:
        recommendations.append(
            {
                "priority": "high",
                "issue": "JSON Parsing Errors",
                "count": error_types["JSON Parsing"],
                "action": "Add input sanitization to remove control characters before JSON serialization",
            }
        )
    if error_types["HTTP 500"] > 0:
        recommendations.append(
            {
                "priority": "critical",
                "issue": "Server Errors",
                "count": error_types["HTTP 500"],
                "action": "Investigate server logs and add proper error handling to return 400 instead of 500",
            }
        )
    if failures_by_check["latency"] > 0:
        recommendations.append(
            {
                "priority": "medium",
                "issue": "Performance Issues",
                "count": failures_by_check["latency"],
                "action": "Optimize agent response time - consider caching, reducing LLM tokens, or async processing",
            }
        )
    encoding_failures = failures_by_type.get("encoding_attacks")
    if encoding_failures:
        recommendations.append(
            {
                "priority": "high",
                "issue": "Encoding Attack Vulnerabilities",
                "count": len(encoding_failures),
                "action": "Add input decoding for Base64, Unicode, and URL-encoded inputs",
            }
        )
    injection_failures = failures_by_type.get("prompt_injection")
    if injection_failures:
        recommendations.append(
            {
                "priority": "critical",
                "issue": "Prompt Injection Vulnerabilities",
                "count": len(injection_failures),
                "action": "Add prompt injection detection and filtering",
            }
        )

    return {
        "total_failures": len(failed),
        "failures_by_type": {
            mut_type: len(items) for mut_type, items in failures_by_type.items()
        },
        # Hand back plain dicts so template rendering / serialization sees
        # the same types as before.
        "failures_by_check": dict(failures_by_check),
        "error_types": dict(error_types),
        "top_issues": top_issues,
        "recommendations": recommendations,
    }
|
||||||
|
|
||||||
def generate(self) -> str:
|
def generate(self) -> str:
|
||||||
"""
|
"""
|
||||||
Generate the HTML report.
|
Generate the HTML report.
|
||||||
|
|
@ -610,8 +1083,16 @@ class HTMLReportGenerator:
|
||||||
for t in stats.by_type
|
for t in stats.by_type
|
||||||
]
|
]
|
||||||
|
|
||||||
# Prepare mutations data
|
# Prepare mutations data with recommendations
|
||||||
mutations_data = [m.to_dict() for m in self.results.mutations]
|
mutations_data = []
|
||||||
|
for m in self.results.mutations:
|
||||||
|
mut_dict = m.to_dict()
|
||||||
|
if not m.passed:
|
||||||
|
mut_dict["recommendation"] = self._generate_recommendation(m)
|
||||||
|
mutations_data.append(mut_dict)
|
||||||
|
|
||||||
|
# Generate summary
|
||||||
|
summary = self._generate_summary()
|
||||||
|
|
||||||
return self.template.render(
|
return self.template.render(
|
||||||
report_date=self.results.started_at.strftime("%Y-%m-%d %H:%M:%S"),
|
report_date=self.results.started_at.strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
|
|
@ -626,6 +1107,7 @@ class HTMLReportGenerator:
|
||||||
type_stats=type_stats,
|
type_stats=type_stats,
|
||||||
mutations=self.results.mutations,
|
mutations=self.results.mutations,
|
||||||
mutations_json=json.dumps(mutations_data),
|
mutations_json=json.dumps(mutations_data),
|
||||||
|
summary=summary,
|
||||||
)
|
)
|
||||||
|
|
||||||
def save(self, path: str | Path | None = None) -> Path:
|
def save(self, path: str | Path | None = None) -> Path:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue