diff --git a/README.md b/README.md index 4b962b9..e66040d 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,20 @@ Instead of running one test case, Flakestorm takes a single "Golden Prompt", gen - ✅ **Local-First**: Uses Ollama with Qwen 3 8B for free testing - ✅ **Beautiful Reports**: Interactive HTML reports with pass/fail matrices +## Demo + +### flakestorm in Action + +![flakestorm Demo](flakestorm_demo.gif) + +*Watch flakestorm generate mutations and test your agent in real-time* + +### Test Report + +![flakestorm Test Report](flakestorm_test_reporting.gif) + +*Interactive HTML reports with detailed failure analysis and recommendations* + ## Quick Start ### Installation Order @@ -97,7 +111,11 @@ sudo apt install ollama # Windows: Starts automatically as a service # In another terminal, pull the model -ollama pull qwen3:8b +# Choose based on your RAM: +# - 8GB RAM: ollama pull tinyllama:1.1b or gemma2:2b +# - 16GB RAM: ollama pull qwen2.5:3b (recommended) +# - 32GB+ RAM: ollama pull qwen2.5-coder:7b (best quality) +ollama pull qwen2.5:3b ``` **Troubleshooting:** If you get `syntax error: ` or `command not found` when running `ollama` commands: @@ -194,7 +212,9 @@ agent: model: provider: "ollama" - name: "qwen3:8b" + # Choose model based on your RAM: 8GB (tinyllama:1.1b), 16GB (qwen2.5:3b), 32GB+ (qwen2.5-coder:7b) + # See docs/USAGE_GUIDE.md for full model recommendations + name: "qwen2.5:3b" base_url: "http://localhost:11434" mutations: diff --git a/docs/USAGE_GUIDE.md b/docs/USAGE_GUIDE.md index 876dfc0..8c7aadf 100644 --- a/docs/USAGE_GUIDE.md +++ b/docs/USAGE_GUIDE.md @@ -258,6 +258,57 @@ ollama pull qwen2.5-coder:7b ollama run qwen2.5-coder:7b "Hello, world!" ``` +### Choosing the Right Model for Your System + +FlakeStorm uses local LLMs to generate mutations. 
Choose a model that fits your system's RAM and performance requirements: + +| System RAM | Recommended Model | Model Size | Speed | Quality | Use Case | +|------------|-------------------|------------|-------|---------|----------| +| **4-8 GB** | `tinyllama:1.1b` | ~700 MB | ⚡⚡⚡ Very Fast | ⭐⭐ Basic | Quick testing, CI/CD | +| **8-16 GB** | `gemma2:2b` | ~1.4 GB | ⚡⚡ Fast | ⭐⭐⭐ Good | Balanced performance | +| **8-16 GB** | `phi3:mini` | ~2.3 GB | ⚡⚡ Fast | ⭐⭐⭐ Good | Microsoft's efficient model | +| **16-32 GB** | `qwen2.5:3b` | ~2.0 GB | ⚡⚡ Fast | ⭐⭐⭐⭐ Very Good | Recommended for most users | +| **16-32 GB** | `gemma2:9b` | ~5.4 GB | ⚡ Moderate | ⭐⭐⭐⭐ Very Good | Better quality mutations | +| **32+ GB** | `qwen2.5-coder:7b` | ~4.4 GB | ⚡ Moderate | ⭐⭐⭐⭐⭐ Excellent | Best for code/structured prompts | +| **32+ GB** | `qwen2.5:7b` | ~4.4 GB | ⚡ Moderate | ⭐⭐⭐⭐⭐ Excellent | Best overall quality | +| **64+ GB** | `qwen2.5:14b` | ~8.9 GB | 🐌 Slower | ⭐⭐⭐⭐⭐ Excellent | Maximum quality (overkill for most) | + +**Quick Recommendations:** + +- **Minimum viable (8GB RAM)**: `tinyllama:1.1b` or `gemma2:2b` +- **Recommended (16GB+ RAM)**: `qwen2.5:3b` or `gemma2:9b` +- **Best quality (32GB+ RAM)**: `qwen2.5-coder:7b` or `qwen2.5:7b` + +**Pull your chosen model:** + +```bash +# For 8GB RAM systems +ollama pull tinyllama:1.1b +# or +ollama pull gemma2:2b + +# For 16GB RAM systems (recommended) +ollama pull qwen2.5:3b +# or +ollama pull gemma2:9b + +# For 32GB+ RAM systems (best quality) +ollama pull qwen2.5-coder:7b +# or +ollama pull qwen2.5:7b +``` + +**Update your `flakestorm.yaml` to use your chosen model:** + +```yaml +model: + provider: "ollama" + name: "qwen2.5:3b" # Change to your chosen model + base_url: "http://localhost:11434" +``` + +**Note:** Smaller models are faster but may produce less diverse mutations. Larger models produce higher quality mutations but require more RAM and are slower. For most users, `qwen2.5:3b` or `gemma2:9b` provides the best balance. 
+ ### Step 3: Create Virtual Environment and Install flakestorm **CRITICAL: Python 3.10+ Required!** @@ -375,10 +426,22 @@ maturin build --release # 4. Remove any old wheels (if they exist) rm -f ../target/wheels/entropix_rust-*.whl # Remove old wheels with wrong name -# 5. Install the new wheel (use specific pattern to avoid old wheels) -pip install ../target/wheels/flakestorm_rust-*.whl +# 5. List available wheel files to get the exact filename +# On Linux/macOS: +ls ../target/wheels/flakestorm_rust-*.whl +# On Windows (PowerShell): +# Get-ChildItem ..\target\wheels\flakestorm_rust-*.whl -# 6. Verify installation +# 6. Install the wheel using the FULL filename (wildcard pattern may not work) +# Copy the exact filename from step 5 and use it here: +# Example for Windows: +# pip install ../target/wheels/flakestorm_rust-0.1.0-cp311-cp311-win_amd64.whl +# Example for Linux: +# pip install ../target/wheels/flakestorm_rust-0.1.0-cp311-cp311-manylinux_2_34_x86_64.whl +# Example for macOS: +# pip install ../target/wheels/flakestorm_rust-0.1.0-cp311-cp311-macosx_10_9_x86_64.whl + +# 7. 
Verify installation python -c "import flakestorm_rust; print('Rust extension installed successfully!')" ``` @@ -994,17 +1057,22 @@ mutations: length_extremes: 1.2 # ============================================================================= -# LLM CONFIGURATION (for mutation generation) +# MODEL CONFIGURATION (for mutation generation) # ============================================================================= -llm: - # Ollama model to use - model: "qwen2.5-coder:7b" +model: + # Model provider: "ollama" (default) + provider: "ollama" + + # Model name (must be pulled in Ollama first) + # See "Choosing the Right Model for Your System" section above for recommendations + # based on your RAM: 8GB (tinyllama:1.1b), 16GB (qwen2.5:3b), 32GB+ (qwen2.5-coder:7b) + name: "qwen2.5-coder:7b" # Ollama server URL - host: "http://localhost:11434" + base_url: "http://localhost:11434" - # Generation temperature (higher = more creative mutations) - temperature: 0.8 + # Optional: Generation temperature (higher = more creative mutations) + # temperature: 0.8 # ============================================================================= # INVARIANTS (ASSERTIONS) diff --git a/flakestorm-generate-search-queries.yaml b/flakestorm-generate-search-queries.yaml index 4c9b406..14c0a37 100644 --- a/flakestorm-generate-search-queries.yaml +++ b/flakestorm-generate-search-queries.yaml @@ -34,7 +34,7 @@ agent: # Recommended for 8GB RAM: qwen2.5:1.5b (fastest), tinyllama (smallest), or phi3:mini (best quality) model: provider: "ollama" - name: "tinyllama" # Small, fast model optimized for 8GB RAM + name: "gemma3:1b" # Small, fast model optimized for 8GB RAM base_url: "http://localhost:11434" # ============================================================================= @@ -42,7 +42,7 @@ model: # ============================================================================= mutations: # Number of mutations to generate per golden prompt - count: 3 + count: 20 # Types of mutations to apply 
types: diff --git a/flakestorm_demo.gif b/flakestorm_demo.gif new file mode 100644 index 0000000..b4e61f1 Binary files /dev/null and b/flakestorm_demo.gif differ diff --git a/flakestorm_test_reporting.gif b/flakestorm_test_reporting.gif new file mode 100644 index 0000000..7ff86c7 Binary files /dev/null and b/flakestorm_test_reporting.gif differ diff --git a/src/flakestorm/core/orchestrator.py b/src/flakestorm/core/orchestrator.py index 5ac12d1..3025dc4 100644 --- a/src/flakestorm/core/orchestrator.py +++ b/src/flakestorm/core/orchestrator.py @@ -26,7 +26,7 @@ from rich.progress import ( ) # Configuration limits for local hardware constraints -MAX_MUTATIONS_PER_RUN = 50 +MAX_MUTATIONS_PER_RUN = 200 PARALLEL_EXECUTION_ENABLED = False # Sequential execution for local hardware if TYPE_CHECKING: diff --git a/src/flakestorm/reports/html.py b/src/flakestorm/reports/html.py index 8abe178..1dc8be2 100644 --- a/src/flakestorm/reports/html.py +++ b/src/flakestorm/reports/html.py @@ -13,12 +13,12 @@ from __future__ import annotations import json from datetime import datetime from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from jinja2 import Template if TYPE_CHECKING: - from flakestorm.reports.models import TestResults + from flakestorm.reports.models import MutationResult, TestResults HTML_TEMPLATE = """ @@ -461,6 +461,77 @@ HTML_TEMPLATE = """ + {% if summary.total_failures > 0 %} +
+

📋 Executive Summary &amp; Action Items

+
+
+

Overall Assessment

+

+ Your agent has a {{ score_percent }}% robustness score with + {{ failed_mutations }} failures out of {{ total_mutations }} tests. + {% if score_percent < 70 %} + ⚠️ This indicates significant vulnerabilities that need immediate attention. + {% elif score_percent < 85 %} + ⚠️ Your agent needs improvement before production deployment. + {% else %} + ✓ Your agent shows good robustness, but there's room for improvement. + {% endif %} +

+
+ + {% if summary.recommendations %} +
+

Priority Action Items

+
+ {% for rec in summary.recommendations %} +
+
+
+ + {{ rec.priority }} Priority + +

{{ rec.issue }}

+
+ + {{ rec.count }} occurrence{{ 's' if rec.count != 1 else '' }} + +
+

{{ rec.action }}

+
+ {% endfor %} +
+
+ {% endif %} + + {% if summary.top_issues %} +
+

Top Failure Types

+
+ {% for issue in summary.top_issues %} +
+
+ {{ issue.type.replace('_', ' ').title() }} +
+
{{ issue.count }}
+
+ {% endfor %} +
+
+ {% endif %} +
+
+ {% endif %} +

📊 By Mutation Type

@@ -512,24 +583,38 @@ HTML_TEMPLATE = """