2025-02-17 23:00:15 +05:30
from rowboat import Client , StatefulChat
from typing import List
import json
import os
2025-02-20 18:51:49 +05:30
import asyncio
2025-02-17 23:00:15 +05:30
from openai import OpenAI
from scenario_types import Scenario , SimulationResult , SimulationAggregateResult
2025-02-20 15:19:21 +05:30
from db import write_simulation_result
2025-02-17 23:00:15 +05:30
# Shared OpenAI client; picks up OPENAI_API_KEY from the environment.
openai_client = OpenAI()

# Model used for both the user simulator and the evaluator.
MODEL_NAME = "gpt-4o"

# Base URL of the Rowboat API; .strip() guards against stray whitespace
# accidentally included in the environment variable value.
ROWBOAT_API_HOST = os.environ.get("ROWBOAT_API_HOST", "http://127.0.0.1:3000").strip()
2025-02-20 18:51:49 +05:30
async def simulate_scenario(
    scenario: Scenario,
    rowboat_client: Client,
    workflow_id: str,
    max_iterations: int = 5,
) -> tuple[str, str, str]:
    """
    Run a mock conversation for one scenario asynchronously, then evaluate it.

    An OpenAI model plays the user (driven by ``scenario.description``) while
    the Rowboat workflow plays the support bot. After ``max_iterations``
    user/bot exchanges, a second OpenAI call judges the conversation against
    ``scenario.criteria``. Blocking SDK calls are pushed onto the default
    executor so the event loop stays responsive.

    Args:
        scenario: Scenario holding description, optional context, and criteria.
        rowboat_client: Rowboat API client used by the support chat.
        workflow_id: Workflow to exercise on the Rowboat side.
        max_iterations: Number of user/bot exchange rounds to simulate.

    Returns:
        Tuple of (evaluation_result, details, transcript_str) where
        evaluation_result is "pass" or "fail", details is a short
        justification, and transcript_str is the readable conversation log.

    Raises:
        Exception: if the evaluator returns no choices or no "verdict" field.
    """
    loop = asyncio.get_running_loop()

    support_chat = StatefulChat(
        rowboat_client,
        system_prompt=f"{f'Context: {scenario.context}' if scenario.context else ''}",
        workflow_id=workflow_id,
    )

    # History as seen by the user-simulator model: its own turns are
    # "assistant", and the support bot's replies come back to it as "user".
    messages = [
        {
            "role": "system",
            "content": f"Simulate the user based on the scenario:\n{scenario.description}",
        }
    ]
    # Parallel history with semantic roles for the evaluator/transcript:
    # "user" = simulated user, "assistant" = support bot.
    transcript_turns: list[dict] = []

    # -------------------------
    # 1) MAIN SIMULATION LOOP
    # -------------------------
    for _ in range(max_iterations):
        # Generate the next simulated-user message off the event loop.
        # Bind the current history via a default arg so the lambda does not
        # capture a later-mutated list.
        simulated_user_response = await loop.run_in_executor(
            None,  # default thread pool
            lambda msgs=list(messages): openai_client.chat.completions.create(
                model=MODEL_NAME,
                messages=msgs,
                temperature=0.0,
            ),
        )
        simulated_content = simulated_user_response.choices[0].message.content

        # BUGFIX: record the simulator's own turn. Previously this message was
        # dropped entirely, so the simulator never saw its earlier turns and
        # the transcript contained only the bot's side of the conversation;
        # bot replies were also stored under the inverted "assistant" role.
        messages.append({"role": "assistant", "content": simulated_content})
        transcript_turns.append({"role": "user", "content": simulated_content})

        # support_chat.run is synchronous; run it in a worker thread too.
        rowboat_response = await loop.run_in_executor(
            None,
            lambda content=simulated_content: support_chat.run(content),
        )
        messages.append({"role": "user", "content": rowboat_response})
        transcript_turns.append({"role": "assistant", "content": rowboat_response})

    # -------------------------
    # 2) EVALUATION STEP
    # -------------------------
    # Readable transcript: scenario system prompt followed by the exchange
    # with semantic roles, one "ROLE: content" line per turn.
    transcript_str = ""
    for turn in [messages[0], *transcript_turns]:
        role = turn.get("role", "unknown")
        content = turn.get("content", "")
        transcript_str += f"{role.upper()}: {content}\n"

    evaluation_prompt = [
        {
            "role": "system",
            "content": (
                f"You are a neutral evaluator. Evaluate based on these criteria:\n{scenario.criteria}\n\nReturn ONLY a JSON object with format: "
                '{"verdict": "pass", "details": <the reason for pass in 2 sentences>} if the support bot answered correctly, or {"verdict": "fail", "details": <the reason for fail in 2 sentences>} if not.'
            ),
        },
        {
            "role": "user",
            "content": (
                f"Here is the conversation transcript:\n\n{transcript_str}\n\n"
                "Did the support bot answer correctly or not? Return only 'pass' or 'fail' for verdict, and a brief 2 sentence explanation for details."
            ),
        },
    ]

    # Run the evaluation call in a worker thread as well; json_object mode
    # forces the model to emit parseable JSON.
    eval_response = await loop.run_in_executor(
        None,
        lambda: openai_client.chat.completions.create(
            model=MODEL_NAME,
            messages=evaluation_prompt,
            temperature=0.0,
            response_format={"type": "json_object"},
        ),
    )

    if not eval_response.choices:
        raise Exception("No evaluation response received from model")

    response_json = json.loads(eval_response.choices[0].message.content)
    evaluation_result = response_json.get("verdict")
    details = response_json.get("details")
    if evaluation_result is None:
        raise Exception("No verdict field found in evaluation response")

    return (evaluation_result, details, transcript_str)
2025-02-17 23:00:15 +05:30
async def simulate_scenarios(
    scenarios: List[Scenario],
    runId: str,
    workflow_id: str,
    api_key: str,
    max_iterations: int = 5,
) -> SimulationAggregateResult:
    """
    Simulate a list of scenarios sequentially and aggregate the results.

    Each per-scenario result is persisted via write_simulation_result as soon
    as it completes, so partial progress survives a mid-run failure.

    Args:
        scenarios: Scenarios to run; all are assumed to share one projectId.
        runId: Identifier stored on every persisted SimulationResult.
        workflow_id: Rowboat workflow to exercise.
        api_key: Rowboat API key for the project.
        max_iterations: Exchange rounds per scenario (forwarded downstream).

    Returns:
        SimulationAggregateResult with total / pass / fail counts.
    """
    # Robustness fix: an empty batch previously raised IndexError on
    # scenarios[0]; report an empty aggregate instead.
    if not scenarios:
        return SimulationAggregateResult(**{"total": 0, "pass": 0, "fail": 0})

    # All scenarios in a batch belong to the same project, so the first one
    # determines the client configuration.
    project_id = scenarios[0].projectId
    client = Client(
        host=ROWBOAT_API_HOST,
        project_id=project_id,
        api_key=api_key,
    )

    results = []
    for scenario in scenarios:
        verdict, details, transcript = await simulate_scenario(
            scenario, client, workflow_id, max_iterations
        )
        simulation_result = SimulationResult(
            projectId=project_id,
            runId=runId,
            scenarioId=scenario.id,
            result=verdict,
            details=details,
            transcript=transcript,
        )
        results.append(simulation_result)
        # Persist incrementally rather than only at the end of the run.
        write_simulation_result(simulation_result)

    # "pass" is a Python keyword, so the model must be built via dict splat
    # rather than keyword arguments.
    return SimulationAggregateResult(**{
        "total": len(scenarios),
        "pass": sum(1 for r in results if r.result == "pass"),
        "fail": sum(1 for r in results if r.result == "fail"),
    })