2025-03-01 12:56:08 +05:30
|
|
|
|
import asyncio
|
|
|
|
|
|
import logging
|
2025-02-17 23:00:15 +05:30
|
|
|
|
from typing import List
|
|
|
|
|
|
import json
|
|
|
|
|
|
import os
|
|
|
|
|
|
from openai import OpenAI
|
2025-03-01 12:56:08 +05:30
|
|
|
|
|
|
|
|
|
|
# Updated imports from your new schema/types
|
|
|
|
|
|
from scenario_types import TestSimulation, TestResult, AggregateResults
|
|
|
|
|
|
|
|
|
|
|
|
# If your DB functions changed names, adapt here:
|
|
|
|
|
|
from db import write_test_result # replaced write_simulation_result
|
|
|
|
|
|
|
|
|
|
|
|
from rowboat import Client, StatefulChat
|
2025-02-17 23:00:15 +05:30
|
|
|
|
|
|
|
|
|
|
# Module-wide OpenAI client; constructed once at import time and reused
# for every chat-completion request issued from this file.
openai_client = OpenAI()

# Name of the chat model passed to each completion request.
MODEL_NAME = "gpt-4o"

# Base URL of the Rowboat API. Taken from the environment when set,
# otherwise a local default; surrounding whitespace is trimmed.
ROWBOAT_API_HOST = os.getenv("ROWBOAT_API_HOST", "http://127.0.0.1:3000").strip()
|
|
|
|
|
|
|
2025-03-01 12:56:08 +05:30
|
|
|
|
def _format_transcript(messages: list[dict]) -> str:
    """Render chat messages as one ``ROLE: content`` line per message."""
    return "".join(
        f"{m.get('role', 'unknown').upper()}: {m.get('content', '')}\n"
        for m in messages
    )


def _parse_evaluation(raw: str | None) -> tuple[str, str]:
    """Extract ``(verdict, details)`` from the evaluator's JSON reply.

    Raises:
        Exception: if the reply is empty or carries no ``verdict`` field.
        json.JSONDecodeError: if the reply is not valid JSON.
    """
    if not raw:
        # json.loads(None) would otherwise fail with an opaque TypeError.
        raise Exception("No evaluation response received from model")

    parsed = json.loads(raw)
    verdict = parsed.get("verdict") if isinstance(parsed, dict) else None
    if verdict is None:
        raise Exception("No 'verdict' field found in evaluation response")

    details = parsed.get("details")
    # Normalise case/whitespace so callers comparing against "pass"/"fail"
    # are not tripped up by replies such as "Pass".
    normalized_verdict = str(verdict).strip().lower()
    return normalized_verdict, "" if details is None else str(details)


async def simulate_simulation(
    simulation: TestSimulation,
    rowboat_client: Client,
    workflow_id: str,
    max_iterations: int = 5
) -> tuple[str, str, str]:
    """
    Run a mock conversation for one TestSimulation and evaluate it.

    For up to ``max_iterations`` turns an OpenAI model produces the simulated
    user's message, which is sent to a Rowboat ``StatefulChat``; the reply is
    recorded. The full transcript is then judged by the model against
    ``simulation.passCriteria``.

    Args:
        simulation: The simulation to run; ``name`` seeds the simulated user
            and ``passCriteria`` is used as the evaluation criteria.
        rowboat_client: Rowboat client handed to ``StatefulChat``.
        workflow_id: Workflow identifier forwarded to ``StatefulChat``.
        max_iterations: Number of user/bot exchanges to simulate.

    Returns:
        A tuple of (evaluation_result, details, transcript_str).

    Raises:
        Exception: if the evaluator returns no choices, empty content, or a
            JSON object without a ``verdict`` field.
        json.JSONDecodeError: if the evaluator reply is not valid JSON.
    """
    loop = asyncio.get_running_loop()

    # passCriteria may be unset; treat that as "no criteria".
    pass_criteria = simulation.passCriteria or ""

    # Prepare a Rowboat chat
    support_chat = StatefulChat(
        rowboat_client,
        system_prompt=f"Context: {pass_criteria}" if pass_criteria else "",
        workflow_id=workflow_id
    )

    # The scenario name is embedded in the system message for the simulator.
    messages = [
        {
            "role": "system",
            "content": (
                f"Simulate the user based on this simulation:\n{simulation.name}"
            )
        }
    ]

    # -------------------------
    # (1) MAIN SIMULATION LOOP
    # -------------------------
    for _ in range(max_iterations):
        # Snapshot the history so the worker thread reads a stable list.
        openai_input = list(messages)

        # The OpenAI client call is blocking; run it in the default
        # thread pool so the event loop stays responsive.
        simulated_user_response = await loop.run_in_executor(
            None,
            lambda: openai_client.chat.completions.create(
                model=MODEL_NAME,
                messages=openai_input,
                temperature=0.0,
            )
        )

        # message.content can be None; fall back to an empty string
        # instead of crashing on .strip().
        simulated_content = (
            simulated_user_response.choices[0].message.content or ""
        ).strip()

        # Record the simulated user's turn. Without it the simulator never
        # sees its own earlier messages and the evaluator's transcript
        # would contain only the bot's side of the conversation.
        messages.append({"role": "user", "content": simulated_content})

        # NOTE(review): StatefulChat.run is presumably synchronous, hence
        # the executor - confirm against the rowboat client.
        rowboat_response = await loop.run_in_executor(
            None, support_chat.run, simulated_content
        )

        messages.append({"role": "assistant", "content": rowboat_response})

    # -------------------------
    # (2) EVALUATION STEP
    # -------------------------
    transcript_str = _format_transcript(messages)

    # passCriteria serves as the evaluation criteria.
    evaluation_prompt = [
        {
            "role": "system",
            "content": (
                "You are a neutral evaluator. Evaluate based on these criteria:\n"
                f"{pass_criteria}\n\n"
                "Return ONLY a JSON object in this format:\n"
                '{"verdict": "pass", "details": <reason>} or '
                '{"verdict": "fail", "details": <reason>}.'
            )
        },
        {
            "role": "user",
            "content": (
                f"Here is the conversation transcript:\n\n{transcript_str}\n\n"
                "Did the support bot answer correctly or not? "
                "Return only 'pass' or 'fail' for verdict, and a brief explanation for details."
            )
        }
    ]

    # Run evaluation in a separate thread
    eval_response = await loop.run_in_executor(
        None,
        lambda: openai_client.chat.completions.create(
            model=MODEL_NAME,
            messages=evaluation_prompt,
            temperature=0.0,
            # Ask for a JSON object so the reply can be parsed directly.
            response_format={"type": "json_object"}
        )
    )

    if not eval_response.choices:
        raise Exception("No evaluation response received from model")

    evaluation_result, details = _parse_evaluation(
        eval_response.choices[0].message.content
    )

    return (evaluation_result, details, transcript_str)
|
2025-02-17 23:00:15 +05:30
|
|
|
|
|
2025-03-01 12:56:08 +05:30
|
|
|
|
async def simulate_simulations(
    simulations: List[TestSimulation],
    run_id: str,
    workflow_id: str,
    api_key: str,
    max_iterations: int = 5
) -> AggregateResults:
    """
    Run a list of TestSimulations one after another and aggregate the results.

    Each simulation is executed with ``simulate_simulation``; its verdict is
    wrapped in a ``TestResult``, persisted via ``write_test_result``, and
    counted towards the returned totals.

    Args:
        simulations: Simulations to run. All are assumed to belong to the
            same project; the project id is taken from the first one.
        run_id: Identifier stored on every ``TestResult``.
        workflow_id: Workflow identifier forwarded to each simulation.
        api_key: API key used to build the Rowboat client.
        max_iterations: Conversation turns per simulation.

    Returns:
        AggregateResults with the total number of results and the number
        whose result is exactly "pass" / "fail". Any other verdict counts
        towards ``total`` only.
    """
    if not simulations:
        # Nothing to simulate. Use the same field names as the final return
        # below; the previous pass_/fail keywords disagreed with it.
        # NOTE(review): confirm against scenario_types.AggregateResults.
        return AggregateResults(total=0, passCount=0, failCount=0)

    # We assume all simulations belong to the same project
    project_id = simulations[0].projectId

    # Create a Rowboat client instance
    client = Client(
        host=ROWBOAT_API_HOST,
        project_id=project_id,
        api_key=api_key
    )

    # Store results here
    results: List[TestResult] = []

    for simulation in simulations:
        # Run each simulation; the transcript is not stored on the result.
        verdict, details, _transcript = await simulate_simulation(
            simulation=simulation,
            rowboat_client=client,
            workflow_id=workflow_id,
            max_iterations=max_iterations
        )

        # Create a new TestResult
        test_result = TestResult(
            projectId=project_id,
            runId=run_id,
            simulationId=simulation.id,
            result=verdict,
            details=details
        )
        results.append(test_result)

        # Persist the test result
        write_test_result(test_result)

    # Aggregate pass/fail
    total_count = len(results)
    pass_count = sum(1 for r in results if r.result == "pass")
    fail_count = sum(1 for r in results if r.result == "fail")

    return AggregateResults(
        total=total_count,
        passCount=pass_count,
        failCount=fail_count
    )
|