simulation_runner: added transcript and details to result

This commit is contained in:
arkml 2025-02-20 15:19:21 +05:30
parent c4f1406444
commit 5da21d3c39
2 changed files with 9 additions and 6 deletions

View file

@@ -31,6 +31,7 @@ class SimulationResult(BaseModel):
scenarioId: str scenarioId: str
result: Literal["pass", "fail"] result: Literal["pass", "fail"]
details: str details: str
transcript: str
class SimulationAggregateResult(BaseModel): class SimulationAggregateResult(BaseModel):
total: int total: int

View file

@@ -4,7 +4,7 @@ import json
import os import os
from openai import OpenAI from openai import OpenAI
from scenario_types import Scenario, SimulationResult, SimulationAggregateResult from scenario_types import Scenario, SimulationResult, SimulationAggregateResult
from db import write_simulation_result, set_simulation_run_to_completed from db import write_simulation_result
openai_client = OpenAI() openai_client = OpenAI()
@@ -64,14 +64,14 @@ def simulate_scenario(scenario: Scenario, rowboat_client: Client, workflow_id: s
"role": "system", "role": "system",
"content": ( "content": (
f"You are a neutral evaluator. Evaluate based on these criteria:\n{scenario.criteria}\n\nReturn ONLY a JSON object with format: " f"You are a neutral evaluator. Evaluate based on these criteria:\n{scenario.criteria}\n\nReturn ONLY a JSON object with format: "
'{"verdict": "pass"} if the support bot answered correctly, or {"verdict": "fail"} if not.' '{"verdict": "pass", "details": <the reason for pass in 2 sentences>} if the support bot answered correctly, or {"verdict": "fail", "details": <the reason for fail in 2 sentences>} if not.'
) )
}, },
{ {
"role": "user", "role": "user",
"content": ( "content": (
f"Here is the conversation transcript:\n\n{transcript_str}\n\n" f"Here is the conversation transcript:\n\n{transcript_str}\n\n"
"Did the support bot answer correctly or not? Return only 'pass' or 'fail'." "Did the support bot answer correctly or not? Return only 'pass' or 'fail' for verdict, and a brief 2 sentence explanation for details."
) )
} }
] ]
@@ -88,10 +88,11 @@ def simulate_scenario(scenario: Scenario, rowboat_client: Client, workflow_id: s
else: else:
response_json = json.loads(eval_response.choices[0].message.content) response_json = json.loads(eval_response.choices[0].message.content)
evaluation_result = response_json.get("verdict") evaluation_result = response_json.get("verdict")
details = response_json.get("details")
if evaluation_result is None: if evaluation_result is None:
raise Exception("No verdict field found in evaluation response") raise Exception("No verdict field found in evaluation response")
return(evaluation_result, transcript_str) return(evaluation_result, details, transcript_str)
async def simulate_scenarios(scenarios: List[Scenario], runId: str, workflow_id: str, api_key: str, max_iterations: int = 5): async def simulate_scenarios(scenarios: List[Scenario], runId: str, workflow_id: str, api_key: str, max_iterations: int = 5):
@@ -103,14 +104,15 @@ async def simulate_scenarios(scenarios: List[Scenario], runId: str, workflow_id:
) )
results = [] results = []
for scenario in scenarios: for scenario in scenarios:
result, transcript = simulate_scenario(scenario, client, workflow_id, max_iterations) result, details, transcript = simulate_scenario(scenario, client, workflow_id, max_iterations)
simulation_result = SimulationResult( simulation_result = SimulationResult(
projectId=project_id, projectId=project_id,
runId=runId, runId=runId,
scenarioId=scenario.id, scenarioId=scenario.id,
result=result, result=result,
details=transcript details=details,
transcript=transcript
) )
results.append(simulation_result) results.append(simulation_result)
write_simulation_result(simulation_result) write_simulation_result(simulation_result)