rowboat/apps/experimental/simulation_runner/simulation.py

import asyncio
import logging
from typing import List
import json
import os
from openai import OpenAI

from scenario_types import TestSimulation, TestResult, AggregateResults, TestScenario

from db import write_test_result, get_scenario_by_id
from rowboat import Client, StatefulChat

# Shared OpenAI client, created once at import time with no explicit
# arguments, so it is configured from the environment (the OpenAI SDK reads
# OPENAI_API_KEY by default).
openai_client = OpenAI()
# Model used for both the simulated customer and the evaluator in this module.
MODEL_NAME = "gpt-4.1"
# Host passed to the Rowboat Client; falls back to a local instance when
# ROWBOAT_API_HOST is unset. Surrounding whitespace is stripped.
ROWBOAT_API_HOST = os.environ.get("ROWBOAT_API_HOST", "http://127.0.0.1:3000").strip()

async def simulate_simulation(
    scenario: TestScenario,
    profile_id: str,
    pass_criteria: str,
    rowboat_client: Client,
    workflow_id: str,
    max_iterations: int = 5
) -> tuple[str, str, str]:
    """
    Run one simulated customer/chatbot conversation and have it evaluated.

    An OpenAI model role-plays the customer described by ``scenario`` while a
    Rowboat ``StatefulChat`` plays the chatbot. After ``max_iterations``
    customer/chatbot exchanges, a second model call judges the conversation
    against ``pass_criteria``.

    Args:
        scenario: Scenario whose ``description`` seeds the customer role-play.
        profile_id: Test profile id forwarded to the Rowboat chat.
        pass_criteria: Free-text criteria the evaluator must judge against.
        rowboat_client: Rowboat client used to create the stateful chat.
        workflow_id: Workflow the Rowboat chat runs against.
        max_iterations: Number of customer/chatbot exchanges to simulate.

    Returns:
        A tuple ``(verdict, details, transcript)`` where ``verdict`` is the
        evaluator's verdict (lower-cased and stripped when it is a string),
        ``details`` is the evaluator's explanation (``None`` if the evaluator
        omitted it), and ``transcript`` is the conversation serialised as a
        JSON string of ``{"role", "content"}`` messages, including the initial
        system prompt, with the customer as ``user`` and the chatbot as
        ``assistant``.

    Raises:
        Exception: If a model call returns no choices or no content, or the
            evaluation response has no ``verdict`` field.
        json.JSONDecodeError: If the evaluation response is not valid JSON.
    """
    loop = asyncio.get_running_loop()

    support_chat = StatefulChat(
        rowboat_client,
        workflow_id=workflow_id,
        test_profile_id=profile_id
    )

    # From the simulator model's point of view *it* is the assistant (the
    # customer) and the chatbot under test is the user.
    messages = [
        {
            "role": "system",
            "content": (
                f"You are role playing a customer talking to a chatbot (the user is role playing the chatbot). Have the following chat with the chatbot. Scenario:\n{scenario.description}. You are provided no other information. If the chatbot asks you for information that is not in context, go ahead and provide one unless stated otherwise in the scenario. Directly have the chat with the chatbot. Start now with your first message."
            )
        }
    ]

    # -------------------------
    # (1) MAIN SIMULATION LOOP
    # -------------------------
    for _ in range(max_iterations):
        # The OpenAI client is synchronous; run it in the default thread pool
        # so the event loop is not blocked.
        simulated_user_response = await loop.run_in_executor(
            None,
            lambda: openai_client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                temperature=0.0,
            )
        )

        if not simulated_user_response.choices:
            raise Exception("No simulated user response received from model")

        raw_content = simulated_user_response.choices[0].message.content
        if raw_content is None:
            # Fail with a clear message instead of an AttributeError on .strip().
            raise Exception("Simulated user response contained no content")

        simulated_content = raw_content.strip()
        messages.append({"role": "assistant", "content": simulated_content})

        # support_chat.run is called synchronously here, so it is also pushed
        # to the thread pool.
        rowboat_response = await loop.run_in_executor(
            None,
            support_chat.run,
            simulated_content
        )

        messages.append({"role": "user", "content": rowboat_response})

    # -------------------------
    # (2) EVALUATION STEP
    # -------------------------
    # Swap assistant/user so the stored transcript reads naturally: the
    # customer becomes "user" and the chatbot becomes "assistant". A swapped
    # copy is built rather than mutating the simulation history in place.
    role_swap = {"assistant": "user", "user": "assistant"}
    transcript_messages = [
        {**m, "role": role_swap.get(m["role"], m["role"])}
        for m in messages
    ]
    transcript_str = "".join(
        f"{m['role'].upper()}: {m['content']}\n" for m in transcript_messages
    )

    # Store the transcript as a JSON string
    transcript = json.dumps(transcript_messages)

    # We use passCriteria as the evaluation "criteria."
    evaluation_prompt = [
        {
            "role": "system",
            "content": (
                "You are a neutral evaluator. Evaluate based on these criteria:\n"
                f"{pass_criteria}\n\n"
                "Return ONLY a JSON object in this format:\n"
                '{"verdict": "pass", "details": <reason>} or '
                '{"verdict": "fail", "details": <reason>}.'
            )
        },
        {
            "role": "user",
            "content": (
                f"Here is the conversation transcript:\n\n{transcript_str}\n\n"
                "Did the support bot answer correctly or not? "
                "Return only 'pass' or 'fail' for verdict, and a brief explanation for details."
            )
        }
    ]

    # Run evaluation in a separate thread
    eval_response = await loop.run_in_executor(
        None,
        lambda: openai_client.chat.completions.create(
            model=MODEL_NAME,
            messages=evaluation_prompt,
            temperature=0.0,
            response_format={"type": "json_object"}
        )
    )

    if not eval_response.choices:
        raise Exception("No evaluation response received from model")

    response_json_str = eval_response.choices[0].message.content
    if not response_json_str:
        # json.loads(None) would raise an opaque TypeError.
        raise Exception("Empty evaluation response received from model")

    response_json = json.loads(response_json_str)
    evaluation_result = response_json.get("verdict")
    details = response_json.get("details")

    if evaluation_result is None:
        raise Exception("No 'verdict' field found in evaluation response")

    # Normalise so that e.g. "Pass" or " pass" still matches the exact
    # "pass"/"fail" comparison done when results are aggregated.
    if isinstance(evaluation_result, str):
        evaluation_result = evaluation_result.strip().lower()

    return (evaluation_result, details, transcript)

async def simulate_simulations(
    simulations: List[TestSimulation],
    run_id: str,
    workflow_id: str,
    api_key: str,
    max_iterations: int = 5
) -> AggregateResults:
    """
    Run every simulation in ``simulations`` and aggregate the verdicts.

    Simulations are executed one after another (each is awaited before the
    next starts). Each result is persisted with ``write_test_result`` as soon
    as it is produced.

    Args:
        simulations: Simulations to run. The project id is taken from the
            first entry and used for the Rowboat client and every result.
        run_id: Identifier of the test run the results belong to.
        workflow_id: Workflow the Rowboat chat runs against.
        api_key: API key for the Rowboat client.
        max_iterations: Number of conversation exchanges per simulation.

    Returns:
        An ``AggregateResults`` with the total number of results and how many
        had verdict ``"pass"`` and ``"fail"``. Verdicts other than those two
        exact strings count towards ``total`` only.
    """
    if not simulations:
        # Nothing to simulate. Use the same field names as the normal return
        # path below; the previous ``pass_``/``fail`` keywords did not match.
        # NOTE(review): confirm against the AggregateResults definition.
        return AggregateResults(total=0, passCount=0, failCount=0)

    # NOTE(review): assumes all simulations belong to the same project.
    project_id = simulations[0].projectId

    client = Client(
        host=ROWBOAT_API_HOST,
        project_id=project_id,
        api_key=api_key
    )

    # Store results here
    results: List[TestResult] = []

    for simulation in simulations:
        # NOTE(review): get_scenario_by_id and write_test_result are called
        # synchronously; if they perform blocking I/O they will stall the
        # event loop for their duration.
        verdict, details, transcript = await simulate_simulation(
            scenario=get_scenario_by_id(simulation.scenarioId),
            profile_id=simulation.profileId,
            pass_criteria=simulation.passCriteria,
            rowboat_client=client,
            workflow_id=workflow_id,
            max_iterations=max_iterations
        )

        test_result = TestResult(
            projectId=project_id,
            runId=run_id,
            simulationId=simulation.id,
            result=verdict,
            details=details,
            transcript=transcript
        )
        results.append(test_result)

        # Persist immediately so completed results survive a later failure.
        write_test_result(test_result)

    # Aggregate pass/fail
    total_count = len(results)
    pass_count = sum(1 for r in results if r.result == "pass")
    fail_count = sum(1 for r in results if r.result == "fail")

    return AggregateResults(
        total=total_count,
        passCount=pass_count,
        failCount=fail_count
    )
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`import asyncio`
			`import logging`
Add simulation runner 2025-02-17 23:00:15 +05:30			`from typing import List`
			`import json`
			`import os`
			`from openai import OpenAI`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30
added transcript and fixed simulation bugs 2025-03-11 13:31:26 +05:30			`from scenario_types import TestSimulation, TestResult, AggregateResults, TestScenario`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30
added transcript and fixed simulation bugs 2025-03-11 13:31:26 +05:30			`from db import write_test_result, get_scenario_by_id`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`from rowboat import Client, StatefulChat`
Add simulation runner 2025-02-17 23:00:15 +05:30
			`openai_client = OpenAI()`
Fix/prebuilt cards updates (#263) * updates to the twitter prebuilt cards * update the default fallback model from gpt-4o to gpt-4.1 2025-09-16 15:29:48 +05:30			`MODEL_NAME = "gpt-4.1"`
Add simulation runner 2025-02-17 23:00:15 +05:30			`ROWBOAT_API_HOST = os.environ.get("ROWBOAT_API_HOST", "http://127.0.0.1:3000").strip()`

updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`async def simulate_simulation(`
added transcript and fixed simulation bugs 2025-03-11 13:31:26 +05:30			`scenario: TestScenario,`
			`profile_id: str,`
			`pass_criteria: str,`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`rowboat_client: Client,`
			`workflow_id: str,`
			`max_iterations: int = 5`
			`) -> tuple[str, str, str]:`
Add simulation runner 2025-02-17 23:00:15 +05:30			`"""`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`Runs a mock simulation for a given TestSimulation asynchronously.`
Add simulation runner 2025-02-17 23:00:15 +05:30			`After simulating several turns of conversation, it evaluates the conversation.`
simulation_runner: added failed job cleanup 2025-02-20 18:51:49 +05:30			`Returns a tuple of (evaluation_result, details, transcript_str).`
Add simulation runner 2025-02-17 23:00:15 +05:30			`"""`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30
simulation_runner: added failed job cleanup 2025-02-20 18:51:49 +05:30			`loop = asyncio.get_running_loop()`
added transcript and fixed simulation bugs 2025-03-11 13:31:26 +05:30			`pass_criteria = pass_criteria`
Add simulation runner 2025-02-17 23:00:15 +05:30
added transcript and fixed simulation bugs 2025-03-11 13:31:26 +05:30			`# Todo: add profile_id`
Add simulation runner 2025-02-17 23:00:15 +05:30			`support_chat = StatefulChat(`
			`rowboat_client,`
add test profile to simulations 2025-03-11 18:36:17 +05:30			`workflow_id=workflow_id,`
			`test_profile_id=profile_id`
Add simulation runner 2025-02-17 23:00:15 +05:30			`)`

			`messages = [`
			`{`
			`"role": "system",`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`"content": (`
Made agent run function async (#49) * fixed sync run function * fixed simulation 2025-03-28 00:51:53 +05:30			`f"You are role playing a customer talking to a chatbot (the user is role playing the chatbot). Have the following chat with the chatbot. Scenario:\n{scenario.description}. You are provided no other information. If the chatbot asks you for information that is not in context, go ahead and provide one unless stated otherwise in the scenario. Directly have the chat with the chatbot. Start now with your first message."`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`)`
Add simulation runner 2025-02-17 23:00:15 +05:30			`}`
			`]`

			`# -------------------------`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`# (1) MAIN SIMULATION LOOP`
Add simulation runner 2025-02-17 23:00:15 +05:30			`# -------------------------`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`for _ in range(max_iterations):`
Add simulation runner 2025-02-17 23:00:15 +05:30			`openai_input = messages`

updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`# Run OpenAI API call in a separate thread (non-blocking)`
simulation_runner: added failed job cleanup 2025-02-20 18:51:49 +05:30			`simulated_user_response = await loop.run_in_executor(`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`None, # default ThreadPool`
simulation_runner: added failed job cleanup 2025-02-20 18:51:49 +05:30			`lambda: openai_client.chat.completions.create(`
			`model=MODEL_NAME,`
			`messages=openai_input,`
			`temperature=0.0,`
			`)`
Add simulation runner 2025-02-17 23:00:15 +05:30			`)`

updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`simulated_content = simulated_user_response.choices[0].message.content.strip()`
Made agent run function async (#49) * fixed sync run function * fixed simulation 2025-03-28 00:51:53 +05:30			`messages.append({"role": "assistant", "content": simulated_content})`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`# Run Rowboat chat in a thread if it's synchronous`
simulation_runner: added failed job cleanup 2025-02-20 18:51:49 +05:30			`rowboat_response = await loop.run_in_executor(`
			`None,`
			`lambda: support_chat.run(simulated_content)`
			`)`
Add simulation runner 2025-02-17 23:00:15 +05:30
Made agent run function async (#49) * fixed sync run function * fixed simulation 2025-03-28 00:51:53 +05:30			`messages.append({"role": "user", "content": rowboat_response})`
Add simulation runner 2025-02-17 23:00:15 +05:30
			`# -------------------------`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`# (2) EVALUATION STEP`
Add simulation runner 2025-02-17 23:00:15 +05:30			`# -------------------------`
Made agent run function async (#49) * fixed sync run function * fixed simulation 2025-03-28 00:51:53 +05:30			`# swap the roles of the assistant and the user`
Add simulation runner 2025-02-17 23:00:15 +05:30			`transcript_str = ""`
			`for m in messages:`
Made agent run function async (#49) * fixed sync run function * fixed simulation 2025-03-28 00:51:53 +05:30			`if m.get("role") == "assistant":`
			`m["role"] = "user"`
			`elif m.get("role") == "user":`
			`m["role"] = "assistant"`
Add simulation runner 2025-02-17 23:00:15 +05:30			`role = m.get("role", "unknown")`
			`content = m.get("content", "")`
			`transcript_str += f"{role.upper()}: {content}\n"`

added transcript and fixed simulation bugs 2025-03-11 13:31:26 +05:30			`# Store the transcript as a JSON string`
			`transcript = json.dumps(messages)`

			`# We use passCriteria as the evaluation "criteria."`
Add simulation runner 2025-02-17 23:00:15 +05:30			`evaluation_prompt = [`
			`{`
			`"role": "system",`
			`"content": (`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`f"You are a neutral evaluator. Evaluate based on these criteria:\n"`
added transcript and fixed simulation bugs 2025-03-11 13:31:26 +05:30			`f"{pass_criteria}\n\n"`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`"Return ONLY a JSON object in this format:\n"`
			`'{"verdict": "pass", "details": <reason>} or '`
			`'{"verdict": "fail", "details": <reason>}.'`
Add simulation runner 2025-02-17 23:00:15 +05:30			`)`
			`},`
			`{`
			`"role": "user",`
			`"content": (`
			`f"Here is the conversation transcript:\n\n{transcript_str}\n\n"`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`"Did the support bot answer correctly or not? "`
			`"Return only 'pass' or 'fail' for verdict, and a brief explanation for details."`
Add simulation runner 2025-02-17 23:00:15 +05:30			`)`
			`}`
			`]`

simulation_runner: added failed job cleanup 2025-02-20 18:51:49 +05:30			`# Run evaluation in a separate thread`
			`eval_response = await loop.run_in_executor(`
			`None,`
			`lambda: openai_client.chat.completions.create(`
			`model=MODEL_NAME,`
			`messages=evaluation_prompt,`
			`temperature=0.0,`
			`response_format={"type": "json_object"}`
			`)`
Add simulation runner 2025-02-17 23:00:15 +05:30			`)`

			`if not eval_response.choices:`
			`raise Exception("No evaluation response received from model")`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30
			`response_json_str = eval_response.choices[0].message.content`
			`# Attempt to parse the JSON`
			`response_json = json.loads(response_json_str)`
			`evaluation_result = response_json.get("verdict")`
			`details = response_json.get("details")`

			`if evaluation_result is None:`
			`raise Exception("No 'verdict' field found in evaluation response")`
Add simulation runner 2025-02-17 23:00:15 +05:30
added transcript and fixed simulation bugs 2025-03-11 13:31:26 +05:30			`return (evaluation_result, details, transcript)`
Add simulation runner 2025-02-17 23:00:15 +05:30
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`async def simulate_simulations(`
			`simulations: List[TestSimulation],`
			`run_id: str,`
			`workflow_id: str,`
			`api_key: str,`
			`max_iterations: int = 5`
			`) -> AggregateResults:`
simulation_runner: added failed job cleanup 2025-02-20 18:51:49 +05:30			`"""`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`Simulates a list of TestSimulations asynchronously and aggregates the results.`
simulation_runner: added failed job cleanup 2025-02-20 18:51:49 +05:30			`"""`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`if not simulations:`
			`# Return an empty result if there's nothing to simulate`
			`return AggregateResults(total=0, pass_=0, fail=0)`

			`project_id = simulations[0].projectId`

Add simulation runner 2025-02-17 23:00:15 +05:30			`client = Client(`
			`host=ROWBOAT_API_HOST,`
			`project_id=project_id,`
			`api_key=api_key`
			`)`
simulation_runner: added failed job cleanup 2025-02-20 18:51:49 +05:30
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`# Store results here`
			`results: List[TestResult] = []`

			`for simulation in simulations:`
			`verdict, details, transcript = await simulate_simulation(`
added transcript and fixed simulation bugs 2025-03-11 13:31:26 +05:30			`scenario=get_scenario_by_id(simulation.scenarioId),`
			`profile_id=simulation.profileId,`
			`pass_criteria=simulation.passCriteria,`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`rowboat_client=client,`
			`workflow_id=workflow_id,`
			`max_iterations=max_iterations`
			`)`
Add simulation runner 2025-02-17 23:00:15 +05:30
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`# Create a new TestResult`
			`test_result = TestResult(`
Add simulation runner 2025-02-17 23:00:15 +05:30			`projectId=project_id,`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`runId=run_id,`
			`simulationId=simulation.id,`
			`result=verdict,`
added transcript and fixed simulation bugs 2025-03-11 13:31:26 +05:30			`details=details,`
			`transcript=transcript`
Add simulation runner 2025-02-17 23:00:15 +05:30			`)`
updated simulation runner to the new collections 2025-03-01 12:56:08 +05:30			`results.append(test_result)`

			`# Persist the test result`
			`write_test_result(test_result)`

			`# Aggregate pass/fail`
			`total_count = len(results)`
			`pass_count = sum(1 for r in results if r.result == "pass")`
			`fail_count = sum(1 for r in results if r.result == "fail")`

			`return AggregateResults(`
			`total=total_count,`
			`passCount=pass_count,`
			`failCount=fail_count`
			`)`