import asyncio
import logging
import json
import os
from typing import List

from openai import OpenAI

# Updated imports from your new schema/types
from scenario_types import TestSimulation, TestResult, AggregateResults

# If your DB functions changed names, adapt here:
from db import write_test_result  # replaced write_simulation_result

from rowboat import Client, StatefulChat

openai_client = OpenAI()
MODEL_NAME = "gpt-4o"
ROWBOAT_API_HOST = os.environ.get("ROWBOAT_API_HOST", "http://127.0.0.1:3000").strip()


async def simulate_simulation(
    simulation: TestSimulation,
    rowboat_client: Client,
    workflow_id: str,
    max_iterations: int = 5
) -> tuple[str, str, str]:
    """
    Runs a mock simulation for a given TestSimulation asynchronously.
    After simulating several turns of conversation, it evaluates the conversation.

    Returns a tuple of (evaluation_result, details, transcript_str).
    """
    loop = asyncio.get_running_loop()

    # Optionally embed passCriteria in the system prompt, if it's relevant to context:
    pass_criteria = simulation.passCriteria or ""  # Or place it separately below if you prefer.

    # Prepare a Rowboat chat
    support_chat = StatefulChat(
        rowboat_client,
        system_prompt=f"Context: {pass_criteria}" if pass_criteria else "",
        workflow_id=workflow_id
    )

    # You might want to describe the simulation or scenario more thoroughly.
    # Here, we just embed simulation.name in the system message:
    messages = [
        {
            "role": "system",
            "content": (
                f"Simulate the user based on this simulation:\n{simulation.name}"
            )
        }
    ]

    # -------------------------
    # (1) MAIN SIMULATION LOOP
    # -------------------------
    for _ in range(max_iterations):
        openai_input = messages

        # Run OpenAI API call in a separate thread (non-blocking)
        simulated_user_response = await loop.run_in_executor(
            None,  # default ThreadPool
            lambda: openai_client.chat.completions.create(
                model=MODEL_NAME,
                messages=openai_input,
                temperature=0.0,
            )
        )
        simulated_content = simulated_user_response.choices[0].message.content.strip()

        # Run Rowboat chat in a thread if it's synchronous.
        # Bind simulated_content via a default argument to avoid the
        # late-binding-closure pitfall inside a loop.
        rowboat_response = await loop.run_in_executor(
            None,
            lambda content=simulated_content: support_chat.run(content)
        )

        # Record both sides of the exchange. The OpenAI model plays the user,
        # so its output is the "assistant" turn in this message list; the
        # Rowboat reply comes back as the next "user" turn. Without both
        # appends, the transcript below would contain only one side.
        messages.append({"role": "assistant", "content": simulated_content})
        messages.append({"role": "user", "content": rowboat_response})

    # -------------------------
    # (2) EVALUATION STEP
    # -------------------------
    # Label turns for the evaluator: "assistant" turns belong to the simulated
    # user, "user" turns are the support bot's replies.
    role_labels = {"system": "SYSTEM", "assistant": "SIMULATED USER", "user": "SUPPORT BOT"}
    transcript_str = ""
    for m in messages:
        role = m.get("role", "unknown")
        content = m.get("content", "")
        transcript_str += f"{role_labels.get(role, role.upper())}: {content}\n"

    # We use passCriteria as the evaluation "criteria."
    evaluation_prompt = [
        {
            "role": "system",
            "content": (
                f"You are a neutral evaluator. Evaluate based on these criteria:\n"
                f"{simulation.passCriteria}\n\n"
                "Return ONLY a JSON object in this format:\n"
                '{"verdict": "pass", "details": "<explanation>"} or '
                '{"verdict": "fail", "details": "<explanation>"}.'
            )
        },
        {
            "role": "user",
            "content": (
                f"Here is the conversation transcript:\n\n{transcript_str}\n\n"
                "Did the support bot answer correctly or not? "
                "Return only 'pass' or 'fail' for verdict, and a brief explanation for details."
            )
        }
    ]

    # Run evaluation in a separate thread
    eval_response = await loop.run_in_executor(
        None,
        lambda: openai_client.chat.completions.create(
            model=MODEL_NAME,
            messages=evaluation_prompt,
            temperature=0.0,
            # If your LLM supports a structured response format, you can specify it.
            # Otherwise, remove or adapt 'response_format':
            response_format={"type": "json_object"}
        )
    )

    if not eval_response.choices:
        raise Exception("No evaluation response received from model")

    response_json_str = eval_response.choices[0].message.content

    # Attempt to parse the JSON (raises json.JSONDecodeError if malformed)
    response_json = json.loads(response_json_str)
    evaluation_result = response_json.get("verdict")
    details = response_json.get("details")
    if evaluation_result is None:
        raise Exception("No 'verdict' field found in evaluation response")

    return (evaluation_result, details, transcript_str)


async def simulate_simulations(
    simulations: List[TestSimulation],
    run_id: str,
    workflow_id: str,
    api_key: str,
    max_iterations: int = 5
) -> AggregateResults:
    """
    Simulates a list of TestSimulations asynchronously and aggregates the results.
    """
    if not simulations:
        # Return an empty result if there's nothing to simulate
        # (field names match the populated case below)
        return AggregateResults(total=0, passCount=0, failCount=0)

    # We assume all simulations belong to the same project
    project_id = simulations[0].projectId

    # Create a Rowboat client instance
    client = Client(
        host=ROWBOAT_API_HOST,
        project_id=project_id,
        api_key=api_key
    )

    # Store results here
    results: List[TestResult] = []

    for simulation in simulations:
        # Run each simulation
        verdict, details, transcript = await simulate_simulation(
            simulation=simulation,
            rowboat_client=client,
            workflow_id=workflow_id,
            max_iterations=max_iterations
        )

        # Create a new TestResult
        test_result = TestResult(
            projectId=project_id,
            runId=run_id,
            simulationId=simulation.id,
            result=verdict,
            details=details
        )
        results.append(test_result)

        # Persist the test result
        write_test_result(test_result)

    # Aggregate pass/fail
    total_count = len(results)
    pass_count = sum(1 for r in results if r.result == "pass")
    fail_count = sum(1 for r in results if r.result == "fail")

    return AggregateResults(
        total=total_count,
        passCount=pass_count,
        failCount=fail_count
    )
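

# Example entry point (a minimal sketch): the simulation fields, run_id,
# workflow_id, and environment variables below are illustrative assumptions,
# not part of the module above -- your TestSimulation constructor and
# credential handling may differ. Adapt before running.
if __name__ == "__main__":
    # Hypothetical inline simulation for a quick local smoke test.
    demo_simulation = TestSimulation(
        id="sim-1",                                  # assumed field names
        projectId="proj-1",
        name="User asks how to reset their password",
        passCriteria="The bot explains the password-reset flow accurately."
    )

    aggregate = asyncio.run(
        simulate_simulations(
            simulations=[demo_simulation],
            run_id="run-local-1",
            workflow_id=os.environ.get("ROWBOAT_WORKFLOW_ID", ""),  # assumed env var
            api_key=os.environ.get("ROWBOAT_API_KEY", ""),          # assumed env var
            max_iterations=3
        )
    )
    print(f"total={aggregate.total} pass={aggregate.passCount} fail={aggregate.failCount}")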