Add Flakestorm V2 PRD Plan and enhance v2 research agent to utilize Ollama for LLM responses. Updated README and requirements for dependencies. Implemented LLM API key support and chaos features in the agent.

This commit is contained in:
Francisco M Humarang Jr. 2026-03-07 00:31:26 +08:00
parent 9c3450a75d
commit 61a81a7f4b
4 changed files with 316 additions and 17 deletions

View file

@ -5,8 +5,8 @@ A **working** HTTP agent and v2.0 config that demonstrates all three V2 pillars:
## Prerequisites
- Python 3.10+
- Ollama running (for mutation generation): `ollama run gemma3:1b` or any model
- Optional: `pip install fastapi uvicorn` (agent server)
- **Ollama** running with a model (e.g. `ollama pull gemma3:1b` then `ollama run gemma3:1b`). The agent calls Ollama to generate real LLM responses; Flakestorm uses the same Ollama for mutation generation.
- Dependencies: `pip install -r requirements.txt` (fastapi, uvicorn, pydantic, httpx)
## 1. Start the agent
@ -73,4 +73,4 @@ flakestorm ci -c examples/v2_research_agent/flakestorm.yaml --min-score 0.5
- `replays.sessions` (file reference)
- `scoring` (weights)
The agent is stateless except for a call counter; `/reset` clears it so contract cells stay isolated.
The agent calls **Ollama** (same model as in `flakestorm.yaml`: `gemma3:1b` by default). Set `OLLAMA_BASE_URL` or `OLLAMA_MODEL` if your Ollama runs elsewhere or uses a different model. The agent is stateless except for a call counter; `/reset` clears it so contract cells stay isolated.

View file

@ -1,17 +1,20 @@
"""
V2 Research Assistant Agent — Working example for Flakestorm v2.
A minimal HTTP agent that simulates a research assistant: it responds to queries
and always cites a source (so behavioral contracts can be verified). Supports
/reset for contract matrix isolation. Used to demonstrate:
An HTTP agent that calls a real LLM (Ollama) to answer queries. It uses a
system prompt so responses tend to cite a source (behavioral contract).
Supports /reset for contract matrix isolation. Demonstrates:
- flakestorm run (mutation testing)
- flakestorm run --chaos / --chaos-profile (environment chaos)
- flakestorm contract run (behavioral contract × chaos matrix)
- flakestorm replay run (replay regression)
- flakestorm ci (unified run with overall score)
Requires: Ollama running with the same model as in flakestorm.yaml (e.g. gemma3:1b).
"""
import os
import time
from fastapi import FastAPI
from pydantic import BaseModel
@ -20,6 +23,13 @@ app = FastAPI(title="V2 Research Assistant Agent")
# In-memory state (cleared by /reset for contract isolation)
_state = {"calls": 0}  # "calls": number of /invoke requests since the last /reset
# Ollama config (match flakestorm.yaml or set OLLAMA_BASE_URL, OLLAMA_MODEL)
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434").rstrip("/")  # trailing "/" stripped so f-string URL joins stay clean
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "gemma3:1b")  # should match the model configured in flakestorm.yaml
OLLAMA_TIMEOUT = float(os.environ.get("OLLAMA_TIMEOUT", "60"))  # seconds, passed to the httpx client
# System prompt steering the model to always cite a source (the behavioral contract under test).
SYSTEM_PROMPT = """You are a research assistant. For every answer, you must cite a source using phrases like "According to ...", "Source: ...", or "Per ...". Keep answers concise (2-4 sentences). If you don't know, say so and still cite that you couldn't find a source."""
class InvokeRequest(BaseModel):
    """Request body: prompt or input.

    NOTE(review): the field declarations are elided in this diff view; the
    /invoke handler reads `input`, `prompt`, and `query` — confirm the full
    field list against the complete file.
    """
@ -31,10 +41,29 @@ class InvokeRequest(BaseModel):
class InvokeResponse(BaseModel):
    """Response with result and optional metadata.

    `source` defaults to "ollama" (the live-LLM path); the handler overrides
    it with "none" for empty input and "fallback" when Ollama is unreachable.
    `latency_ms` is the measured Ollama round-trip, or None on the non-LLM paths.
    """
    result: str
    # Post-change default; the stale "demo_knowledge_base" line left behind by
    # the diff (which the later duplicate assignment silently shadowed) is dropped.
    source: str = "ollama"
    latency_ms: float | None = None
def _call_ollama(prompt: str) -> tuple[str, float]:
    """Send *prompt* to Ollama's /api/generate endpoint.

    SYSTEM_PROMPT is prepended so the model answers in research-assistant
    style. Returns a ``(response_text, latency_ms)`` pair; any transport or
    HTTP-status error propagates to the caller.
    """
    import httpx  # local import: module stays importable without httpx installed

    t0 = time.perf_counter()
    endpoint = f"{OLLAMA_BASE_URL}/api/generate"
    payload = {
        "model": OLLAMA_MODEL,
        "prompt": f"{SYSTEM_PROMPT}\n\nUser: {prompt}\n\nAssistant:",
        "stream": False,  # one-shot response, no chunked streaming
    }
    with httpx.Client(timeout=OLLAMA_TIMEOUT) as http:
        resp = http.post(endpoint, json=payload)
        resp.raise_for_status()
    parsed = resp.json()
    elapsed_ms = (time.perf_counter() - t0) * 1000
    answer = (parsed.get("response") or "").strip()
    if not answer:
        answer = "(No response from model)"
    return answer, elapsed_ms
@app.post("/reset")
def reset():
"""Reset agent state. Called by Flakestorm before each contract matrix cell."""
@ -44,21 +73,28 @@ def reset():
@app.post("/invoke", response_model=InvokeResponse)
def invoke(req: InvokeRequest):
    """Handle a single user query. Calls Ollama and returns the model response.

    Accepts the question under any of `input`/`prompt`/`query`. Empty input is
    answered deterministically without touching the LLM; an Ollama failure
    produces a canned, source-citing fallback instead of a 500.

    NOTE(review): the diff had interleaved the removed pre-change lines
    (including a reference to a nonexistent `_state['source']` and a dangling
    `if` with no body) with the added ones; only the post-change
    implementation is kept here.
    """
    _state["calls"] += 1  # call counter, cleared by /reset for contract isolation
    text = (req.input or req.prompt or req.query or "").strip()
    if not text:
        # No question at all: deterministic reply, no LLM round-trip.
        return InvokeResponse(
            result="I didn't receive a question. Please ask something.",
            source="none",
        )
    try:
        response, latency_ms = _call_ollama(text)
        return InvokeResponse(
            result=response,
            source="ollama",
            latency_ms=round(latency_ms, 2),
        )
    except Exception as e:
        # Graceful fallback so "completes" invariant can still pass under chaos
        return InvokeResponse(
            result=f"According to [source: system], I couldn't reach the knowledge base right now ({type(e).__name__}). Please try again.",
            source="fallback",
            latency_ms=None,
        )
@app.get("/health")

View file

@ -2,3 +2,4 @@
fastapi>=0.100.0
uvicorn>=0.22.0
pydantic>=2.0
httpx>=0.25.0