2025-02-17 23:00:15 +05:30
from rowboat import Client , StatefulChat
from typing import List
import json
import os
2025-02-20 18:51:49 +05:30
import asyncio
2025-02-17 23:00:15 +05:30
from openai import OpenAI
from scenario_types import Scenario , SimulationResult , SimulationAggregateResult
2025-02-20 15:19:21 +05:30
from db import write_simulation_result
2025-02-17 23:00:15 +05:30
# Shared OpenAI client; picks up OPENAI_API_KEY from the environment.
openai_client = OpenAI()

# Model used for both the user simulator and the evaluator.
MODEL_NAME = "gpt-4o"

# Base URL of the Rowboat API; .strip() guards against stray whitespace
# accidentally included in the environment variable value.
ROWBOAT_API_HOST = os.environ.get("ROWBOAT_API_HOST", "http://127.0.0.1:3000").strip()
2025-02-20 18:51:49 +05:30
async def simulate_scenario(
    scenario: Scenario,
    rowboat_client: Client,
    workflow_id: str,
    max_iterations: int = 5,
) -> tuple[str, str, str]:
    """
    Run a mock conversation for one scenario asynchronously, then evaluate it.

    An OpenAI model plays the user (driven by ``scenario.description``) while
    the Rowboat workflow plays the support bot. After ``max_iterations``
    user/bot exchanges, a second OpenAI call judges the conversation against
    ``scenario.criteria``. Blocking SDK calls are pushed onto the default
    executor so the event loop stays responsive.

    Args:
        scenario: Scenario holding description, optional context, and criteria.
        rowboat_client: Rowboat API client used by the support chat.
        workflow_id: Workflow to exercise on the Rowboat side.
        max_iterations: Number of user/bot exchange rounds to simulate.

    Returns:
        Tuple of (evaluation_result, details, transcript_str) where
        evaluation_result is "pass" or "fail", details is a short
        justification, and transcript_str is the readable conversation log.

    Raises:
        Exception: if the evaluator returns no choices or no "verdict" field.
    """
    loop = asyncio.get_running_loop()

    support_chat = StatefulChat(
        rowboat_client,
        system_prompt=f"{f'Context: {scenario.context}' if scenario.context else ''}",
        workflow_id=workflow_id,
    )

    # History as seen by the user-simulator model: its own turns are
    # "assistant", and the support bot's replies come back to it as "user".
    messages = [
        {
            "role": "system",
            "content": f"Simulate the user based on the scenario:\n{scenario.description}",
        }
    ]
    # Parallel history with semantic roles for the evaluator/transcript:
    # "user" = simulated user, "assistant" = support bot.
    transcript_turns: list[dict] = []

    # -------------------------
    # 1) MAIN SIMULATION LOOP
    # -------------------------
    for _ in range(max_iterations):
        # Generate the next simulated-user message off the event loop.
        # Bind the current history via a default arg so the lambda does not
        # capture a later-mutated list.
        simulated_user_response = await loop.run_in_executor(
            None,  # default thread pool
            lambda msgs=list(messages): openai_client.chat.completions.create(
                model=MODEL_NAME,
                messages=msgs,
                temperature=0.0,
            ),
        )
        simulated_content = simulated_user_response.choices[0].message.content

        # BUGFIX: record the simulator's own turn. Previously this message was
        # dropped entirely, so the simulator never saw its earlier turns and
        # the transcript contained only the bot's side of the conversation;
        # bot replies were also stored under the inverted "assistant" role.
        messages.append({"role": "assistant", "content": simulated_content})
        transcript_turns.append({"role": "user", "content": simulated_content})

        # support_chat.run is synchronous; run it in a worker thread too.
        rowboat_response = await loop.run_in_executor(
            None,
            lambda content=simulated_content: support_chat.run(content),
        )
        messages.append({"role": "user", "content": rowboat_response})
        transcript_turns.append({"role": "assistant", "content": rowboat_response})

    # -------------------------
    # 2) EVALUATION STEP
    # -------------------------
    # Readable transcript: scenario system prompt followed by the exchange
    # with semantic roles, one "ROLE: content" line per turn.
    transcript_str = ""
    for turn in [messages[0], *transcript_turns]:
        role = turn.get("role", "unknown")
        content = turn.get("content", "")
        transcript_str += f"{role.upper()}: {content}\n"

    evaluation_prompt = [
        {
            "role": "system",
            "content": (
                f"You are a neutral evaluator. Evaluate based on these criteria:\n{scenario.criteria}\n\nReturn ONLY a JSON object with format: "
                '{"verdict": "pass", "details": <the reason for pass in 2 sentences>} if the support bot answered correctly, or {"verdict": "fail", "details": <the reason for fail in 2 sentences>} if not.'
            ),
        },
        {
            "role": "user",
            "content": (
                f"Here is the conversation transcript:\n\n{transcript_str}\n\n"
                "Did the support bot answer correctly or not? Return only 'pass' or 'fail' for verdict, and a brief 2 sentence explanation for details."
            ),
        },
    ]

    # Run the evaluation call in a worker thread as well; json_object mode
    # forces the model to emit parseable JSON.
    eval_response = await loop.run_in_executor(
        None,
        lambda: openai_client.chat.completions.create(
            model=MODEL_NAME,
            messages=evaluation_prompt,
            temperature=0.0,
            response_format={"type": "json_object"},
        ),
    )

    if not eval_response.choices:
        raise Exception("No evaluation response received from model")

    response_json = json.loads(eval_response.choices[0].message.content)
    evaluation_result = response_json.get("verdict")
    details = response_json.get("details")
    if evaluation_result is None:
        raise Exception("No verdict field found in evaluation response")

    return (evaluation_result, details, transcript_str)
2025-02-17 23:00:15 +05:30
async def simulate_scenarios(
    scenarios: List[Scenario],
    runId: str,
    workflow_id: str,
    api_key: str,
    max_iterations: int = 5,
) -> SimulationAggregateResult:
    """
    Simulate a list of scenarios sequentially and aggregate the results.

    Each per-scenario result is persisted via write_simulation_result as soon
    as it completes, so partial progress survives a mid-run failure.

    Args:
        scenarios: Scenarios to run; all are assumed to share one projectId.
        runId: Identifier stored on every persisted SimulationResult.
        workflow_id: Rowboat workflow to exercise.
        api_key: Rowboat API key for the project.
        max_iterations: Exchange rounds per scenario (forwarded downstream).

    Returns:
        SimulationAggregateResult with total / pass / fail counts.
    """
    # Robustness fix: an empty batch previously raised IndexError on
    # scenarios[0]; report an empty aggregate instead.
    if not scenarios:
        return SimulationAggregateResult(**{"total": 0, "pass": 0, "fail": 0})

    # All scenarios in a batch belong to the same project, so the first one
    # determines the client configuration.
    project_id = scenarios[0].projectId
    client = Client(
        host=ROWBOAT_API_HOST,
        project_id=project_id,
        api_key=api_key,
    )

    results = []
    for scenario in scenarios:
        verdict, details, transcript = await simulate_scenario(
            scenario, client, workflow_id, max_iterations
        )
        simulation_result = SimulationResult(
            projectId=project_id,
            runId=runId,
            scenarioId=scenario.id,
            result=verdict,
            details=details,
            transcript=transcript,
        )
        results.append(simulation_result)
        # Persist incrementally rather than only at the end of the run.
        write_simulation_result(simulation_result)

    # "pass" is a Python keyword, so the model must be built via dict splat
    # rather than keyword arguments.
    return SimulationAggregateResult(**{
        "total": len(scenarios),
        "pass": sum(1 for r in results if r.result == "pass"),
        "fail": sum(1 for r in results if r.result == "fail"),
    })