feat: add chat based testing for voice agent (#308)

* feat: add backend foundations

* feat: add text chat UI

* chore: simplify the reload behaviour

* fix: fix upgrade banner to be triggered after package upload

* feat: simplify TesterPanel design

* chore: fix formatting and generate client

* chore: fix tracing for text chat mode

* fix: fix revert and edit CTA

* refactor: refactor TesterPanel into smaller components

* feat: enable runtime transition of nodes

* fix: fix review comments
This commit is contained in:
Abhishek 2026-05-21 15:20:02 +05:30 committed by GitHub
parent 67479e98fd
commit d97d1d72cd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
96 changed files with 7630 additions and 1684 deletions

View file

@ -7,7 +7,7 @@ from api.enums import PostHogEvent, WorkflowRunState
from api.services.campaign.circuit_breaker import circuit_breaker
from api.services.integrations import IntegrationRuntimeSession
from api.services.pipecat.audio_config import AudioConfig
from api.services.pipecat.audio_playback import play_audio, play_audio_loop
from api.services.pipecat.audio_playback import play_audio_loop
from api.services.pipecat.in_memory_buffers import (
InMemoryAudioBuffer,
InMemoryLogsBuffer,
@ -20,8 +20,6 @@ from api.tasks.arq import enqueue_job
from api.tasks.function_names import FunctionNames
from pipecat.frames.frames import (
Frame,
LLMContextFrame,
TTSSpeakFrame,
)
from pipecat.pipeline.task import PipelineTask
from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
@ -69,7 +67,6 @@ def register_event_handlers(
pipeline_metrics_aggregator: PipelineMetricsAggregator,
audio_config=AudioConfig,
pre_call_fetch_task: asyncio.Task | None = None,
fetch_recording_audio=None,
user_provider_id: str | None = None,
integration_runtime_sessions: list[IntegrationRuntimeSession] | None = None,
):
@ -99,20 +96,11 @@ def register_event_handlers(
"initial_response_triggered": False,
}
async def queue_initial_llm_context():
# Queue LLMContextFrame after the VoicemailDetector since the detector
# gates LLMContextFrames until voicemail detection completes. We also
# don't want to trigger the Voicemail LLM with this initial frame.
await engine.llm.queue_frame(LLMContextFrame(engine.context))
async def maybe_trigger_initial_response():
"""Start the conversation after both pipeline_started and client_connected events.
If a pre-call fetch is in progress, plays a ringer while waiting for the
response, then merges the result into the call context before proceeding.
If the start node has a greeting configured, play it directly via TTS.
Otherwise, trigger an LLM generation for the opening message.
"""
if (
ready_state["pipeline_started"]
@ -167,46 +155,11 @@ def register_event_handlers(
# Set the start node now (after pre-call fetch data is merged)
# so that render_template() has the complete _call_context_vars.
await engine.set_node(engine.workflow.start_node_id)
greeting_info = engine.get_start_greeting()
if greeting_info:
greeting_type, greeting_value = greeting_info
if (
greeting_type == "audio"
and greeting_value
and fetch_recording_audio
):
logger.debug(f"Playing audio greeting recording: {greeting_value}")
result = await fetch_recording_audio(
recording_pk=int(greeting_value)
)
if result:
await play_audio(
result.audio,
sample_rate=audio_config.pipeline_sample_rate or 16000,
queue_frame=transport.output().queue_frame,
transcript=result.transcript,
append_to_context=True,
)
else:
logger.warning(
f"Failed to fetch audio greeting {greeting_value}, "
"falling back to LLM generation"
)
await queue_initial_llm_context()
else:
logger.debug("Playing text greeting via TTS")
# append_to_context=True so the assistant aggregator commits
# the greeting to the LLM context once TTS finishes; without
# it the LLM would re-greet on its first generation.
await task.queue_frame(
TTSSpeakFrame(greeting_value, append_to_context=True)
)
else:
logger.debug(
"Both pipeline_started and client_connected received - triggering initial LLM generation"
)
await queue_initial_llm_context()
await engine.queue_node_opening(
node_id=engine.workflow.start_node_id,
previous_node_id=None,
generate_if_no_greeting=True,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(_transport, _participant):

View file

@ -6,6 +6,10 @@ from typing import List, Optional
from loguru import logger
from api.services.pipecat.realtime_feedback_events import (
realtime_feedback_event_sort_key,
stamp_realtime_feedback_event,
)
from api.utils.transcript import generate_transcript_text as _generate_transcript_text
from pipecat.utils.enums import RealtimeFeedbackType
@ -98,16 +102,13 @@ class InMemoryLogsBuffer:
async def append(self, event: dict):
"""Append a feedback event to the buffer with timestamp and current node."""
# Add timestamp, turn tracking, and current node
timestamped_event = {
**event,
"timestamp": datetime.now(UTC).isoformat(),
"turn": self._turn_counter,
}
if self._current_node_id:
timestamped_event["node_id"] = self._current_node_id
if self._current_node_name:
timestamped_event["node_name"] = self._current_node_name
timestamped_event = stamp_realtime_feedback_event(
event,
timestamp=datetime.now(UTC).isoformat(),
turn=self._turn_counter,
node_id=self._current_node_id,
node_name=self._current_node_name,
)
self._events.append(timestamped_event)
logger.trace(
f"Appended event {event.get('type')} to logs buffer for workflow {self._workflow_run_id}"
@ -120,17 +121,12 @@ class InMemoryLogsBuffer:
f"Incremented turn counter to {self._turn_counter} for workflow {self._workflow_run_id}"
)
@staticmethod
def _event_sort_key(event: dict) -> str:
payload_ts = event.get("payload", {}).get("timestamp")
return payload_ts or event.get("timestamp", "")
def _sorted_events(self) -> List[dict]:
# Stable sort by the realtime (payload) timestamp when available, falling
# back to the buffer-append timestamp. Python's sort is stable, so events
# sharing a key retain their original insertion order — this keeps
# consecutive bot-text chunks of a single turn contiguous.
return sorted(self._events, key=self._event_sort_key)
return sorted(self._events, key=realtime_feedback_event_sort_key)
def get_events(self) -> List[dict]:
"""Get all events for final storage, ordered by realtime timestamp."""

View file

@ -152,8 +152,30 @@ def build_realtime_pipeline(
return Pipeline(processors)
def create_pipeline_task(pipeline, workflow_run_id, audio_config: AudioConfig = None):
"""Create a pipeline task with appropriate parameters"""
def create_pipeline_task(
pipeline,
workflow_run_id,
audio_config: AudioConfig = None,
*,
conversation_parent_context=None,
conversation_type: str = "voice",
additional_span_attributes: dict | None = None,
):
"""Create a pipeline task with appropriate parameters.
Args:
pipeline: The pipeline to run.
workflow_run_id: Run id, used as the conversation id.
audio_config: Optional audio configuration.
conversation_parent_context: Optional OTEL context carrying a fixed
trace id. When provided, the conversation span attaches to that
trace instead of starting a new root trace (used by text chat to
stitch every per-turn pipeline into one trace).
conversation_type: ``conversation.type`` span attribute value.
additional_span_attributes: Extra attributes set on the conversation
span (e.g. ``langfuse.trace.name`` to name a stitched trace that
has no real root span).
"""
# Set up pipeline params with audio configuration if provided
pipeline_params = PipelineParams(
enable_metrics=True,
@ -178,6 +200,9 @@ def create_pipeline_task(pipeline, workflow_run_id, audio_config: AudioConfig =
enable_tracing=True,
enable_rtvi=False,
conversation_id=f"{workflow_run_id}",
conversation_parent_context=conversation_parent_context,
conversation_type=conversation_type,
additional_span_attributes=additional_span_attributes,
)
# Check if turn logging is enabled

View file

@ -0,0 +1,163 @@
"""Shared helpers for building and ordering realtime feedback events."""
from typing import Any
from pipecat.utils.enums import RealtimeFeedbackType
def build_node_transition_event(
*,
node_id: str | None,
node_name: str | None,
previous_node_id: str | None,
previous_node_name: str | None,
allow_interrupt: bool = False,
) -> dict[str, Any]:
return {
"type": RealtimeFeedbackType.NODE_TRANSITION.value,
"payload": {
"node_id": node_id,
"node_name": node_name,
"previous_node_id": previous_node_id,
"previous_node_name": previous_node_name,
"allow_interrupt": allow_interrupt,
},
}
def build_user_transcription_event(
*,
text: str,
final: bool,
timestamp: str | None = None,
user_id: str | None = None,
) -> dict[str, Any]:
payload: dict[str, Any] = {
"text": text,
"final": final,
}
if timestamp is not None:
payload["timestamp"] = timestamp
if user_id is not None:
payload["user_id"] = user_id
return {
"type": RealtimeFeedbackType.USER_TRANSCRIPTION.value,
"payload": payload,
}
def build_bot_text_event(
*,
text: str,
timestamp: str | None = None,
) -> dict[str, Any]:
payload: dict[str, Any] = {"text": text}
if timestamp is not None:
payload["timestamp"] = timestamp
return {
"type": RealtimeFeedbackType.BOT_TEXT.value,
"payload": payload,
}
def build_function_call_start_event(
*,
function_name: str | None,
tool_call_id: str | None,
arguments: dict[str, Any] | None = None,
) -> dict[str, Any]:
payload: dict[str, Any] = {
"function_name": function_name,
"tool_call_id": tool_call_id,
}
if arguments is not None:
payload["arguments"] = arguments
return {
"type": RealtimeFeedbackType.FUNCTION_CALL_START.value,
"payload": payload,
}
def serialize_realtime_feedback_tool_result(result: Any) -> str | None:
"""Normalize function-call results to the string shape stored in logs."""
if result is None:
return None
return str(result)
def build_function_call_end_event(
*,
function_name: str | None,
tool_call_id: str | None,
result: Any,
) -> dict[str, Any]:
return {
"type": RealtimeFeedbackType.FUNCTION_CALL_END.value,
"payload": {
"function_name": function_name,
"tool_call_id": tool_call_id,
"result": serialize_realtime_feedback_tool_result(result),
},
}
def build_ttfb_metric_event(
*,
ttfb_seconds: float,
processor: str | None,
model: str | None,
) -> dict[str, Any]:
return {
"type": RealtimeFeedbackType.TTFB_METRIC.value,
"payload": {
"ttfb_seconds": ttfb_seconds,
"processor": processor,
"model": model,
},
}
def build_pipeline_error_event(
*,
error: str,
fatal: bool,
processor: str | None = None,
extra_payload: dict[str, Any] | None = None,
) -> dict[str, Any]:
payload: dict[str, Any] = {
"error": error,
"fatal": fatal,
}
if processor is not None:
payload["processor"] = processor
if extra_payload:
payload.update(extra_payload)
return {
"type": RealtimeFeedbackType.PIPELINE_ERROR.value,
"payload": payload,
}
def stamp_realtime_feedback_event(
event: dict[str, Any],
*,
timestamp: str | None = None,
turn: int | None = None,
node_id: str | None = None,
node_name: str | None = None,
) -> dict[str, Any]:
stamped = dict(event)
if timestamp is not None:
stamped["timestamp"] = timestamp
if turn is not None:
stamped["turn"] = turn
if node_id is not None:
stamped["node_id"] = node_id
if node_name is not None:
stamped["node_name"] = node_name
return stamped
def realtime_feedback_event_sort_key(event: dict[str, Any]) -> str:
payload_timestamp = (event.get("payload") or {}).get("timestamp")
return payload_timestamp or event.get("timestamp") or ""

View file

@ -27,6 +27,15 @@ from typing import TYPE_CHECKING, Awaitable, Callable, Optional, Set
from loguru import logger
from api.services.pipecat.realtime_feedback_events import (
build_bot_text_event,
build_function_call_end_event,
build_function_call_start_event,
build_pipeline_error_event,
build_ttfb_metric_event,
build_user_transcription_event,
)
if TYPE_CHECKING:
from api.services.pipecat.in_memory_buffers import InMemoryLogsBuffer
@ -211,29 +220,23 @@ class RealtimeFeedbackObserver(BaseObserver):
# Handle user transcriptions (interim) - WebSocket only
elif isinstance(frame, InterimTranscriptionFrame):
await self._send_ws(
{
"type": RealtimeFeedbackType.USER_TRANSCRIPTION.value,
"payload": {
"text": frame.text,
"final": False,
"user_id": frame.user_id,
"timestamp": frame.timestamp,
},
}
build_user_transcription_event(
text=frame.text,
final=False,
user_id=frame.user_id,
timestamp=frame.timestamp,
)
)
# Handle user transcriptions (final) - WebSocket only
# Complete turn text is persisted via register_turn_handlers
elif isinstance(frame, TranscriptionFrame):
await self._send_ws(
{
"type": RealtimeFeedbackType.USER_TRANSCRIPTION.value,
"payload": {
"text": frame.text,
"final": True,
"user_id": frame.user_id,
"timestamp": frame.timestamp,
},
}
build_user_transcription_event(
text=frame.text,
final=True,
user_id=frame.user_id,
timestamp=frame.timestamp,
)
)
# Handle engine-queued speech (transition/tool messages) marked for
# log persistence. The downstream TTSTextFrame(s) from the TTS service
@ -241,23 +244,13 @@ class RealtimeFeedbackObserver(BaseObserver):
# to avoid word-level log entries from word-timestamp providers.
elif isinstance(frame, TTSSpeakFrame):
if getattr(frame, "persist_to_logs", False):
await self._append_to_buffer(
{
"type": RealtimeFeedbackType.BOT_TEXT.value,
"payload": {"text": frame.text},
}
)
await self._append_to_buffer(build_bot_text_event(text=frame.text))
# Handle bot TTS text - respect pts timing, WebSocket only
# Complete turn text is persisted via register_turn_handlers,
# except for frames explicitly flagged persist_to_logs (e.g. recording
# transcripts from play_audio) which bypass the aggregator path.
elif isinstance(frame, TTSTextFrame):
message = {
"type": RealtimeFeedbackType.BOT_TEXT.value,
"payload": {
"text": frame.text,
},
}
message = build_bot_text_event(text=frame.text)
# If frame has pts, queue it for timed delivery
if frame.pts:
@ -280,13 +273,11 @@ class RealtimeFeedbackObserver(BaseObserver):
and frame_direction == FrameDirection.DOWNSTREAM
):
await self._send_message(
{
"type": RealtimeFeedbackType.FUNCTION_CALL_START.value,
"payload": {
"function_name": frame.function_name,
"tool_call_id": frame.tool_call_id,
},
}
build_function_call_start_event(
function_name=frame.function_name,
tool_call_id=frame.tool_call_id,
arguments=dict(frame.arguments or {}),
)
)
# Handle function call result
elif (
@ -294,14 +285,11 @@ class RealtimeFeedbackObserver(BaseObserver):
and frame_direction == FrameDirection.DOWNSTREAM
):
await self._send_message(
{
"type": RealtimeFeedbackType.FUNCTION_CALL_END.value,
"payload": {
"function_name": frame.function_name,
"tool_call_id": frame.tool_call_id,
"result": str(frame.result) if frame.result else None,
},
}
build_function_call_end_event(
function_name=frame.function_name,
tool_call_id=frame.tool_call_id,
result=frame.result,
)
)
# Handle TTFB metrics - capture LLM generation time only
elif isinstance(frame, MetricsFrame):
@ -311,47 +299,42 @@ class RealtimeFeedbackObserver(BaseObserver):
# Only send TTFB if it's from an LLM processor
if metric_data.processor and "LLM" in metric_data.processor:
await self._send_message(
{
"type": RealtimeFeedbackType.TTFB_METRIC.value,
"payload": {
"ttfb_seconds": metric_data.value,
"processor": metric_data.processor,
"model": metric_data.model,
},
}
build_ttfb_metric_event(
ttfb_seconds=metric_data.value,
processor=metric_data.processor,
model=metric_data.model,
)
)
# Handle pipeline errors
elif isinstance(frame, ErrorFrame):
processor_name = str(frame.processor) if frame.processor else None
payload = {
"error": frame.error,
"fatal": frame.fatal,
"processor": processor_name,
}
extra_payload: dict[str, object] = {}
# Surface structured fields when the underlying exception carries
# them (e.g. google.genai APIError: code=1008, status=None,
# message="Your project has been denied access...").
exc = frame.exception
if exc is not None:
exc_type = type(exc).__name__
payload["exception_type"] = exc_type
payload["exception_message"] = str(exc)
extra_payload["exception_type"] = exc_type
extra_payload["exception_message"] = str(exc)
for attr in ("code", "status", "message", "details"):
value = getattr(exc, attr, None)
if value is None or attr in payload:
if value is None or attr in extra_payload:
continue
try:
# Ensure the value is JSON-serializable; fall back
# to str() for opaque objects (e.g. raw response).
json.dumps(value)
payload[attr] = value
extra_payload[attr] = value
except (TypeError, ValueError):
payload[attr] = str(value)
extra_payload[attr] = str(value)
await self._send_message(
{
"type": RealtimeFeedbackType.PIPELINE_ERROR.value,
"payload": payload,
}
build_pipeline_error_event(
error=frame.error,
fatal=frame.fatal,
processor=processor_name,
extra_payload=extra_payload or None,
)
)
async def _send_ws(self, message: dict):
@ -401,14 +384,11 @@ def register_turn_log_handlers(
logs_buffer.increment_turn()
try:
await logs_buffer.append(
{
"type": RealtimeFeedbackType.USER_TRANSCRIPTION.value,
"payload": {
"text": message.content,
"final": True,
"timestamp": message.timestamp,
},
}
build_user_transcription_event(
text=message.content,
final=True,
timestamp=message.timestamp,
)
)
except Exception as e:
logger.error(f"Failed to append user turn to logs buffer: {e}")
@ -418,13 +398,10 @@ def register_turn_log_handlers(
if message.content:
try:
await logs_buffer.append(
{
"type": RealtimeFeedbackType.BOT_TEXT.value,
"payload": {
"text": message.content,
"timestamp": message.timestamp,
},
}
build_bot_text_event(
text=message.content,
timestamp=message.timestamp,
)
)
except Exception as e:
logger.error(f"Failed to append assistant turn to logs buffer: {e}")

View file

@ -28,6 +28,9 @@ from api.services.pipecat.pipeline_engine_callbacks_processor import (
)
from api.services.pipecat.pipeline_metrics_aggregator import PipelineMetricsAggregator
from api.services.pipecat.pre_call_fetch import execute_pre_call_fetch
from api.services.pipecat.realtime_feedback_events import (
build_node_transition_event,
)
from api.services.pipecat.realtime_feedback_observer import (
RealtimeFeedbackObserver,
register_turn_log_handlers,
@ -465,16 +468,13 @@ async def _run_pipeline(
# Update current node on the buffer so subsequent events are tagged
in_memory_logs_buffer.set_current_node(node_id, node_name)
message = {
"type": RealtimeFeedbackType.NODE_TRANSITION.value,
"payload": {
"node_id": node_id,
"node_name": node_name,
"previous_node_id": previous_node_id,
"previous_node_name": previous_node_name,
"allow_interrupt": allow_interrupt,
},
}
message = build_node_transition_event(
node_id=node_id,
node_name=node_name,
previous_node_id=previous_node_id,
previous_node_name=previous_node_name,
allow_interrupt=allow_interrupt,
)
# Send via WebSocket if available
if ws_sender:
try:
@ -803,7 +803,6 @@ async def _run_pipeline(
pipeline_metrics_aggregator=pipeline_metrics_aggregator,
audio_config=audio_config,
pre_call_fetch_task=pre_call_fetch_task,
fetch_recording_audio=fetch_audio,
user_provider_id=user_provider_id,
integration_runtime_sessions=integration_runtime_sessions,
)

View file

@ -254,6 +254,44 @@ async def handle_langfuse_sync(event):
unregister_org_langfuse_credentials(org_id)
def build_remote_parent_context(trace_id: str | None):
"""Build an OTEL context whose active span carries ``trace_id``.
Spans started under the returned context join the Langfuse trace identified
by ``trace_id`` (Langfuse groups observations by trace id). The parent span
id is a non-existent placeholder, so spans created under it attach at the
trace root rather than nesting under a real parent span.
This is the shared primitive behind both post-call QA tracing and text-chat
trace stitching. Returns the context, or ``None`` when tracing is
unavailable or ``trace_id`` is missing/invalid.
"""
if not trace_id:
return None
if not ensure_tracing():
return None
try:
from opentelemetry.trace import (
NonRecordingSpan,
SpanContext,
TraceFlags,
set_span_in_context,
)
parent_span_context = SpanContext(
trace_id=int(trace_id, 16),
span_id=0x1,
is_remote=True,
trace_flags=TraceFlags(0x01),
)
return set_span_in_context(NonRecordingSpan(parent_span_context))
except Exception as e:
logger.warning(
f"Failed to build remote parent context for trace {trace_id}: {e}"
)
return None
def get_trace_url(trace_id: str, org_id=None) -> str | None:
"""Build a Langfuse trace URL, using org-specific host when available."""
if org_id is None: