mirror of
https://github.com/rowboatlabs/rowboat.git
synced 2026-07-03 20:41:07 +02:00
simulation_runner: added transcript and details to result
This commit is contained in:
parent
c4f1406444
commit
5da21d3c39
2 changed files with 9 additions and 6 deletions
|
|
@ -31,6 +31,7 @@ class SimulationResult(BaseModel):
|
||||||
scenarioId: str
|
scenarioId: str
|
||||||
result: Literal["pass", "fail"]
|
result: Literal["pass", "fail"]
|
||||||
details: str
|
details: str
|
||||||
|
transcript: str
|
||||||
|
|
||||||
class SimulationAggregateResult(BaseModel):
|
class SimulationAggregateResult(BaseModel):
|
||||||
total: int
|
total: int
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ import json
|
||||||
import os
|
import os
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
from scenario_types import Scenario, SimulationResult, SimulationAggregateResult
|
from scenario_types import Scenario, SimulationResult, SimulationAggregateResult
|
||||||
from db import write_simulation_result, set_simulation_run_to_completed
|
from db import write_simulation_result
|
||||||
|
|
||||||
|
|
||||||
openai_client = OpenAI()
|
openai_client = OpenAI()
|
||||||
|
|
@ -64,14 +64,14 @@ def simulate_scenario(scenario: Scenario, rowboat_client: Client, workflow_id: s
|
||||||
"role": "system",
|
"role": "system",
|
||||||
"content": (
|
"content": (
|
||||||
f"You are a neutral evaluator. Evaluate based on these criteria:\n{scenario.criteria}\n\nReturn ONLY a JSON object with format: "
|
f"You are a neutral evaluator. Evaluate based on these criteria:\n{scenario.criteria}\n\nReturn ONLY a JSON object with format: "
|
||||||
'{"verdict": "pass"} if the support bot answered correctly, or {"verdict": "fail"} if not.'
|
'{"verdict": "pass", "details": <the reason for pass in 2 sentences>} if the support bot answered correctly, or {"verdict": "fail", "details": <the reason for fail in 2 sentences>} if not.'
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": (
|
"content": (
|
||||||
f"Here is the conversation transcript:\n\n{transcript_str}\n\n"
|
f"Here is the conversation transcript:\n\n{transcript_str}\n\n"
|
||||||
"Did the support bot answer correctly or not? Return only 'pass' or 'fail'."
|
"Did the support bot answer correctly or not? Return only 'pass' or 'fail' for verdict, and a brief 2 sentence explanation for details."
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
@ -88,10 +88,11 @@ def simulate_scenario(scenario: Scenario, rowboat_client: Client, workflow_id: s
|
||||||
else:
|
else:
|
||||||
response_json = json.loads(eval_response.choices[0].message.content)
|
response_json = json.loads(eval_response.choices[0].message.content)
|
||||||
evaluation_result = response_json.get("verdict")
|
evaluation_result = response_json.get("verdict")
|
||||||
|
details = response_json.get("details")
|
||||||
if evaluation_result is None:
|
if evaluation_result is None:
|
||||||
raise Exception("No verdict field found in evaluation response")
|
raise Exception("No verdict field found in evaluation response")
|
||||||
|
|
||||||
return(evaluation_result, transcript_str)
|
return(evaluation_result, details, transcript_str)
|
||||||
|
|
||||||
|
|
||||||
async def simulate_scenarios(scenarios: List[Scenario], runId: str, workflow_id: str, api_key: str, max_iterations: int = 5):
|
async def simulate_scenarios(scenarios: List[Scenario], runId: str, workflow_id: str, api_key: str, max_iterations: int = 5):
|
||||||
|
|
@ -103,14 +104,15 @@ async def simulate_scenarios(scenarios: List[Scenario], runId: str, workflow_id:
|
||||||
)
|
)
|
||||||
results = []
|
results = []
|
||||||
for scenario in scenarios:
|
for scenario in scenarios:
|
||||||
result, transcript = simulate_scenario(scenario, client, workflow_id, max_iterations)
|
result, details, transcript = simulate_scenario(scenario, client, workflow_id, max_iterations)
|
||||||
|
|
||||||
simulation_result = SimulationResult(
|
simulation_result = SimulationResult(
|
||||||
projectId=project_id,
|
projectId=project_id,
|
||||||
runId=runId,
|
runId=runId,
|
||||||
scenarioId=scenario.id,
|
scenarioId=scenario.id,
|
||||||
result=result,
|
result=result,
|
||||||
details=transcript
|
details=details,
|
||||||
|
transcript=transcript
|
||||||
)
|
)
|
||||||
results.append(simulation_result)
|
results.append(simulation_result)
|
||||||
write_simulation_result(simulation_result)
|
write_simulation_result(simulation_result)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue