dograh/api/services/pipecat/realtime/ultravox_realtime.py
Abhishek 3892b58486
feat: add ultravox realtime and fix signature issue in telephony (#345)
* feat: add ultravox realtime and fix signature issue in telephony

- Add UltraVox realtime
- Fix signature issue on telephony

* fix: fix regression for wss_backend_endpoint
2026-05-23 12:51:55 +05:30

653 lines
26 KiB
Python

"""Dograh subclass of pipecat's Ultravox realtime LLM service.

Ultravox is audio-native and realtime, but prompt and tool configuration is
bound to call creation. Dograh therefore cannot lean on in-session updates or
Gemini-style session resumption handles. This wrapper adapts Ultravox to the
Dograh engine contract by:

- deferring the first call creation until the engine queues the initial node
  opening via ``TTSSpeakFrame`` or ``LLMContextFrame``
- marking the call for recreation when ``system_instruction`` changes across
  node transitions, then rebuilding it on the follow-up ``LLMContextFrame``
  so the transition tool result is present in ``initialMessages``
- reconstructing Ultravox ``initialMessages`` from Dograh context when the
  call must be recreated after a node transition
- appending a transient resumptive user nudge to recreated ``initialMessages``
  after tool-result transitions, without mutating Dograh's stored context
- handling Dograh-only frames such as user mute and idle append prompts
- tagging user transcripts with ``finalized=True`` for downstream parity
"""
import hashlib
import json
from typing import Any
from loguru import logger
from pydantic import Field
from websockets.exceptions import ConnectionClosed
from pipecat.frames.frames import (
Frame,
LLMMessagesAppendFrame,
TranscriptionFrame,
TTSSpeakFrame,
UserMuteStartedFrame,
UserMuteStoppedFrame,
)
from pipecat.processors.aggregators import async_tool_messages
from pipecat.processors.aggregators.llm_context import (
LLMContext,
LLMSpecificMessage,
is_given,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.llm_service import LLMService
from pipecat.services.settings import _NotGiven, assert_given
from pipecat.services.ultravox.llm import (
OneShotInputParams,
UltravoxRealtimeLLMService,
websocket_client,
)
from pipecat.utils.time import time_now_iso8601
class DograhUltravoxOneShotInputParams(OneShotInputParams):
    """One-shot call parameters whose ``voice`` may be a plain string.

    Only ``voice`` is redeclared here, as an optional string defaulting to
    ``None``; every other field comes from ``OneShotInputParams``.
    """

    voice: str | None = Field(None)
# Upper bound, in seconds, applied to a per-tool timeout before it is written
# into the Ultravox tool definition (see ``_ultravox_timeout_for_tool``).
# NOTE(review): presumably mirrors an Ultravox-side limit — confirm.
_ULTRAVOX_MAX_TOOL_TIMEOUT_SECS = 40.0
# Text of the extra user message that ``_initial_messages_for_call`` appends
# when the rebuilt history ends with a tool result.
_RESUMPTION_USER_MESSAGE = (
    "IMPORTANT: We are resuming an existing conversation. You are given previous turns ONLY for your reference. "
    "Do not use that to frame your response. Follow your ORIGINAL INSTRUCTIONS ONLY."
)
class DograhUltravoxRealtimeLLMService(UltravoxRealtimeLLMService):
    """Ultravox realtime with Dograh engine integration quirks.

    ``start`` does not create an Ultravox call. The first call is created
    when a ``TTSSpeakFrame``, an ``LLMMessagesAppendFrame`` or a context
    arrives while no socket exists. A ``system_instruction`` change after the
    first connection only flags the call for recreation; the replacement call
    is created from the next context, with the conversation history passed as
    ``initialMessages``.
    """
def __init__(self, **kwargs):
    """Forward every keyword argument upstream, then reset Dograh-side state."""
    super().__init__(**kwargs)

    # Latest context seen by ``_handle_context`` and the tools chosen from it
    # when the current call was created.
    self._context: LLMContext | None = None
    self._selected_tools = None

    # Lifecycle of the current Ultravox call.
    self._has_connected_once: bool = False
    self._call_started: bool = False
    self._call_system_instruction: str | None = None

    # Recreation bookkeeping: the flag is raised by ``_update_settings``; the
    # two "pending" slots hold a reconnect postponed while the bot is speaking.
    self._reconnect_required: bool = False
    self._pending_reconnect_system_instruction: str | None = None
    self._pending_initial_messages: list[dict[str, Any]] | None = None

    # User input gating and text queued until ``call_started`` is received.
    self._user_is_muted: bool = False
    self._pending_user_text_messages: list[str] = []
async def start(self, frame):
    """Start the service without opening an Ultravox call.

    ``LLMService.start`` is invoked directly, so the start logic of
    ``UltravoxRealtimeLLMService`` itself is bypassed.
    """
    # Dograh defers call creation until the engine queues the node opening.
    await LLMService.start(self, frame)
async def process_frame(self, frame: Frame, direction: FrameDirection):
    """Intercept Dograh-specific frames; delegate everything else upstream.

    Args:
        frame: Incoming pipeline frame.
        direction: Direction the frame is travelling in.
    """
    if isinstance(frame, (UserMuteStartedFrame, UserMuteStoppedFrame)):
        # Track the mute state, then let the frame continue on its way.
        self._user_is_muted = isinstance(frame, UserMuteStartedFrame)
        await self.push_frame(frame, direction)
    elif isinstance(frame, TTSSpeakFrame):
        # The frame is consumed in both cases and never forwarded.
        if self._socket:
            logger.warning(
                f"{self}: TTSSpeakFrame received after the Ultravox call was "
                "already created; ignoring because Ultravox owns speech output"
            )
        else:
            # No call yet: open one with this text as the agent greeting.
            await self._connect_call(
                system_instruction=self._current_system_instruction(),
                greeting_text=frame.text,
                initial_messages=None,
                agent_speaks_first=True,
            )
    elif isinstance(frame, LLMMessagesAppendFrame):
        await self._handle_messages_append(frame)
    else:
        await super().process_frame(frame, direction)
async def _update_settings(self, delta: UltravoxRealtimeLLMService.Settings):
    """Apply a settings delta and react to the fields this class handles.

    The update itself is delegated to the implementation that follows
    ``UltravoxRealtimeLLMService`` in the MRO, skipping that class's own
    override.

    Args:
        delta: Settings changes to apply.

    Returns:
        The mapping of changed settings returned by the delegated call.
    """
    changed_settings = await super(
        UltravoxRealtimeLLMService, self
    )._update_settings(delta)

    if "output_medium" in changed_settings:
        new_medium = assert_given(self._settings.output_medium)
        await self._update_output_medium(new_medium)

    if self._has_connected_once and "system_instruction" in changed_settings:
        # Mirror Gemini's "settings change means reconnect" intent, but
        # defer the actual new-call creation until the subsequent
        # LLMContextFrame arrives with the transition tool result. Ultravox
        # cannot accept that historical tool result over a formal
        # post-connect tool-response channel the way Gemini can.
        self._reconnect_required = True

    self._warn_unhandled_updated_settings(
        changed_settings.keys() - {"output_medium", "system_instruction"}
    )
    return changed_settings
async def _disconnect(self, preserve_completed_tool_calls: bool = True):
self._disconnecting = True
await self.stop_all_metrics()
if self._socket:
await self._socket.close()
self._socket = None
if self._receive_task:
await self.cancel_task(self._receive_task, timeout=1.0)
self._receive_task = None
if not preserve_completed_tool_calls:
self._completed_tool_calls = set()
self._call_started = False
self._started_placeholder_sent = set()
self._disconnecting = False
async def _send_user_audio(self, frame):
if self._user_is_muted:
return
await super()._send_user_audio(frame)
async def _handle_context(self, context: LLMContext):
    """Handle a new context on the live call, or (re)create the call from it.

    With a socket open and no reconnect flagged, the upstream handler runs.
    Otherwise the context is converted to Ultravox ``initialMessages`` and a
    call is created, either immediately or, while the bot is still
    responding, later from ``_handle_response_end``.

    Args:
        context: Conversation context supplied by the engine.
    """
    self._context = context
    instruction = self._current_system_instruction()

    needs_new_call = self._reconnect_required or not self._socket
    if not needs_new_call:
        await super()._handle_context(context)
        return

    history, finished_tool_call_ids = self._build_initial_messages(context)
    if finished_tool_call_ids:
        # Tool calls already answered in the history count as completed.
        self._completed_tool_calls.update(finished_tool_call_ids)

    if self._bot_responding:
        # Stash the reconnect; it is picked up when the response ends.
        self._pending_reconnect_system_instruction = instruction
        self._pending_initial_messages = history
    else:
        await self._reconnect_with_context(
            system_instruction=instruction,
            initial_messages=history,
        )
async def _handle_response_end(self):
    """Finish the upstream response handling, then run any deferred reconnect.

    ``_handle_context`` stashes a system instruction and initial messages
    when it has to postpone a reconnect. A reconnect counts as pending when
    either stashed value is set: the stashed instruction alone is not a
    reliable marker, because ``_current_system_instruction`` may return
    ``None``.
    """
    await super()._handle_response_end()

    pending_instruction = self._pending_reconnect_system_instruction
    pending_messages = self._pending_initial_messages
    if pending_instruction is None and pending_messages is None:
        return

    # Clear the stash before reconnecting so it cannot be replayed.
    self._pending_reconnect_system_instruction = None
    self._pending_initial_messages = None
    await self._reconnect_with_context(
        system_instruction=pending_instruction,
        initial_messages=pending_messages,
    )
async def _handle_messages_append(self, frame: LLMMessagesAppendFrame):
texts = [
text
for text in (
self._extract_text_content(message.get("content"))
for message in frame.messages
if isinstance(message, dict)
)
if text
]
if not texts:
return
if not self._socket:
self._pending_user_text_messages.extend(texts)
await self._connect_call(
system_instruction=self._current_system_instruction(),
greeting_text=None,
initial_messages=None,
agent_speaks_first=False,
)
return
if not self._call_started:
self._pending_user_text_messages.extend(texts)
logger.debug(
f"{self}: queueing {len(texts)} user text message(s) until call_started"
)
return
for text in texts:
await self._send_user_text(text)
async def _handle_user_transcript(self, text: str):
transcript = text.strip() if text else ""
if not transcript:
return
await self.broadcast_frame(
TranscriptionFrame,
user_id=self._last_user_id or "",
timestamp=time_now_iso8601(),
result=text,
text=transcript,
finalized=True,
)
async def _connect_call(
    self,
    *,
    system_instruction: str | None,
    greeting_text: str | None,
    initial_messages: list[dict[str, Any]] | None,
    agent_speaks_first: bool,
):
    """Create an Ultravox call, join its websocket and start receiving.

    Any failure is logged and reported through ``push_error`` as fatal
    rather than raised.

    Args:
        system_instruction: Instruction recorded as belonging to this call.
        greeting_text: Optional text the agent speaks first.
        initial_messages: Conversation history for the new call, if any.
        agent_speaks_first: Whether the agent opens when there is no greeting.
    """
    params = self._build_one_shot_params(
        greeting_text=greeting_text,
        initial_messages=initial_messages,
        agent_speaks_first=agent_speaks_first,
    )
    self._params = params
    self._selected_tools = self._current_tools_schema(self._context)

    tool_names = []
    if self._selected_tools:
        tool_names = [tool.name for tool in self._selected_tools.standard_tools]

    # Log a short digest of the prompt instead of the prompt itself.
    prompt = params.system_prompt or ""
    prompt_hash = hashlib.sha256(prompt.encode("utf-8")).hexdigest()[:12]

    try:
        logger.info(
            f"{self}: creating Ultravox call "
            f"(agent_speaks_first={agent_speaks_first}, "
            f"voice={params.voice!r}, "
            f"tools={tool_names}, "
            f"system_prompt_len={len(prompt)}, "
            f"system_prompt_sha256={prompt_hash})"
        )
        join_url = await self._start_one_shot_call(params)
        logger.info(f"Joining Ultravox Realtime call via URL: {join_url}")
        self._socket = await websocket_client.connect(join_url)
        self._receive_task = self.create_task(self._receive_messages())
        # The call exists but is not "started" until Ultravox says so.
        self._call_system_instruction = system_instruction
        self._call_started = False
        self._has_connected_once = True
    except Exception as e:
        logger.error(
            f"{self}: Ultravox call creation/join failed "
            f"for tools={tool_names}: {e}"
        )
        await self.push_error(f"Failed to connect to Ultravox: {e}", e, fatal=True)
async def _receive_messages(self):
"""Receive messages from the Ultravox Realtime WebSocket.
Upstream handles exceptions raised while processing individual messages,
but websocket close exceptions are raised by the async iterator itself.
During user hangup / pipeline teardown that close is expected, so treat
normal websocket shutdown as a debug condition rather than a pipeline
error.
"""
if not self._socket:
return
try:
async for message in self._socket:
try:
if isinstance(message, bytes):
await self._handle_audio(message)
continue
data = json.loads(message)
match data.get("type"):
case "call_started":
self._call_started = True
logger.debug(
f"{self}: Ultravox call_started received for callId="
f"{data.get('callId')}"
)
await self._flush_pending_user_text_messages()
case "state":
if self._bot_responding and data.get("state") != "speaking":
await self._handle_response_end()
case "client_tool_invocation":
await self._handle_tool_invocation(
data.get("toolName"),
data.get("invocationId"),
data.get("parameters"),
)
case "transcript":
match data.get("role"):
case "user":
if not data.get("final"):
logger.warning(
"Unexpected non-final user transcript from Ultravox Realtime; ignoring."
)
else:
await self._handle_user_transcript(
data.get("text")
)
case "agent":
await self._handle_agent_transcript(
data.get("medium"),
data.get("text"),
data.get("delta"),
data.get("final", False),
)
case _:
logger.debug(
f"Received transcript with unknown role from Ultravox Realtime: {data}"
)
case _:
logger.debug(f"Received unhandled Ultravox message: {data}")
except Exception as e:
if self._disconnecting or not self._socket:
return
await self.push_error(
"Ultravox websocket receive error", e, fatal=True
)
except ConnectionClosed as e:
if (
self._disconnecting
or not self._socket
or self._is_benign_websocket_close(e)
):
logger.debug(f"{self}: Ultravox websocket closed: {e}")
return
await self.push_error("Ultravox websocket receive error", e, fatal=True)
async def _flush_pending_user_text_messages(self):
if (
not self._socket
or not self._call_started
or not self._pending_user_text_messages
):
return
pending_texts = self._pending_user_text_messages
self._pending_user_text_messages = []
for pending_text in pending_texts:
await self._send_user_text(pending_text)
async def _reconnect_with_context(
    self,
    *,
    system_instruction: str | None,
    initial_messages: list[dict[str, Any]] | None,
):
    """Replace the current call with a new one seeded from history.

    Any open socket is closed first, keeping the completed tool calls. The
    new call has no greeting, and the reconnect flag is cleared afterwards.

    Args:
        system_instruction: Instruction recorded for the new call.
        initial_messages: History for the new call, without the resumption
            nudge; ``_build_one_shot_params`` adds that when required.
    """
    # Computed for logging only; the raw history is what is passed on.
    call_initial_messages = self._initial_messages_for_call(initial_messages)
    logger.debug(
        f"{self}: reconnecting Ultravox call with initialMessages="
        f"{json.dumps(call_initial_messages, ensure_ascii=True, default=str)}"
    )

    if self._socket:
        await self._disconnect(preserve_completed_tool_calls=True)

    speak_first = self._should_agent_speak_first(initial_messages)
    await self._connect_call(
        system_instruction=system_instruction,
        greeting_text=None,
        initial_messages=initial_messages,
        agent_speaks_first=speak_first,
    )
    self._reconnect_required = False
def _build_one_shot_params(
    self,
    *,
    greeting_text: str | None,
    initial_messages: list[dict[str, Any]] | None,
    agent_speaks_first: bool,
) -> DograhUltravoxOneShotInputParams:
    """Derive the parameters for a new call from the current ones.

    ``firstSpeakerSettings`` and ``initialMessages`` in ``extra`` are always
    recomputed; all other ``extra`` entries are carried over.

    Args:
        greeting_text: Text the agent speaks first, when given.
        initial_messages: History to seed the call with, if any.
        agent_speaks_first: Whether the agent opens when there is no greeting.

    Returns:
        A fresh params object for the call about to be created.
    """
    base = self._params

    # Copy, then drop the two keys that are rebuilt below.
    extra = dict(base.extra)
    extra.pop("firstSpeakerSettings", None)
    extra.pop("initialMessages", None)

    if greeting_text is not None:
        first_speaker = {"agent": {"text": greeting_text}}
    elif agent_speaks_first:
        first_speaker = {"agent": {}}
    else:
        first_speaker = {"user": {}}
    extra["firstSpeakerSettings"] = first_speaker

    seeded_history = self._initial_messages_for_call(initial_messages)
    if seeded_history:
        extra["initialMessages"] = seeded_history

    # Prefer the live setting; fall back to the value on the current params.
    medium = self._settings.output_medium
    if isinstance(medium, _NotGiven):
        medium = base.output_medium

    return DograhUltravoxOneShotInputParams(
        api_key=base.api_key,
        system_prompt=self._current_system_instruction(),
        temperature=base.temperature,
        model=assert_given(self._settings.model),
        voice=base.voice,
        metadata=base.metadata,
        output_medium=medium,
        max_duration=base.max_duration,
        extra=extra,
    )
def _current_tools_schema(self, context: LLMContext | None):
if context is None or not is_given(context.tools):
return None
return context.tools
def _to_selected_tools(self, tool: Any) -> list[dict[str, Any]]:
    """Convert tools via the upstream converter, then attach timeouts.

    Each entry that has a ``temporaryTool`` dict with a string
    ``modelToolName`` gets a ``timeout`` when one is configured for that
    function.

    Args:
        tool: Tool definition(s), passed straight to the upstream converter.

    Returns:
        The upstream result, with timeouts set in place.
    """
    converted = super()._to_selected_tools(tool)
    for entry in converted:
        definition = entry.get("temporaryTool")
        if isinstance(definition, dict):
            model_tool_name = definition.get("modelToolName")
            if isinstance(model_tool_name, str):
                timeout = self._ultravox_timeout_for_tool(model_tool_name)
                if timeout is not None:
                    definition["timeout"] = timeout
    return converted
def _current_system_instruction(self) -> str | None:
    """Return the configured system instruction, or ``None`` when not given."""
    instruction = self._settings.system_instruction
    return None if isinstance(instruction, _NotGiven) else instruction
def _ultravox_timeout_for_tool(self, function_name: str) -> str | None:
item = self._functions.get(function_name) or self._functions.get(None)
if item is None or item.timeout_secs is None or item.timeout_secs <= 0:
return None
timeout_secs = min(float(item.timeout_secs), _ULTRAVOX_MAX_TOOL_TIMEOUT_SECS)
return f"{timeout_secs:g}s"
def _initial_messages_for_call(
self, initial_messages: list[dict[str, Any]] | None
) -> list[dict[str, Any]] | None:
if not initial_messages:
return None
if not self._should_add_resumption_user_message(initial_messages):
return initial_messages
return [
*initial_messages,
{
"role": "MESSAGE_ROLE_USER",
"text": _RESUMPTION_USER_MESSAGE,
},
]
def _build_initial_messages(
    self, context: LLMContext
) -> tuple[list[dict[str, Any]] | None, set[str]]:
    """Convert the Dograh context into Ultravox ``initialMessages``.

    ``LLMSpecificMessage`` entries are skipped. Async tool payloads are
    handled first: intermediate results are dropped with an error log, and
    final results become tool-result messages. Remaining messages are mapped
    by role: user and assistant text, assistant tool calls, and tool results
    other than the ``IN_PROGRESS`` / ``CANCELLED`` placeholders.

    Args:
        context: Conversation context to convert.

    Returns:
        A tuple of the converted messages (``None`` when there are none) and
        the ids of the tool calls that already have a result in the history.
    """
    history: list[dict[str, Any]] = []
    names_by_call_id: dict[str, str] = {}
    finished_call_ids: set[str] = set()

    def keep(entry: dict[str, Any] | None) -> None:
        # ``_build_ultravox_message`` returns None for messages without text.
        if entry is not None:
            history.append(entry)

    for message in context.get_messages():
        if isinstance(message, LLMSpecificMessage):
            continue

        async_payload = async_tool_messages.parse_message(message)
        if async_payload is not None:
            payload_kind = async_payload.kind
            if payload_kind == "intermediate":
                logger.error(
                    f"{self}: Ultravox does not support streamed async tool results; "
                    f"dropping intermediate result from initialMessages for "
                    f"tool_call_id={async_payload.tool_call_id}."
                )
                continue
            if payload_kind == "final":
                keep(
                    self._build_ultravox_message(
                        role="MESSAGE_ROLE_TOOL_RESULT",
                        text=async_payload.result or "",
                        invocation_id=async_payload.tool_call_id,
                        tool_name=names_by_call_id.get(async_payload.tool_call_id),
                    )
                )
                finished_call_ids.add(async_payload.tool_call_id)
                continue

        role = message.get("role")
        if role == "user":
            keep(
                self._build_ultravox_message(
                    role="MESSAGE_ROLE_USER",
                    text=self._extract_text_content(message.get("content")),
                )
            )
        elif role == "assistant":
            keep(
                self._build_ultravox_message(
                    role="MESSAGE_ROLE_AGENT",
                    text=self._extract_text_content(message.get("content")),
                )
            )
            tool_calls = message.get("tool_calls")
            if not isinstance(tool_calls, list):
                continue
            for tool_call in tool_calls:
                if not isinstance(tool_call, dict):
                    continue
                tool_id = tool_call.get("id")
                function = tool_call.get("function")
                tool_name = function.get("name") if isinstance(function, dict) else None
                if isinstance(tool_id, str) and isinstance(tool_name, str):
                    # Remember the name so later results can be labelled.
                    names_by_call_id[tool_id] = tool_name
                    keep(
                        self._build_ultravox_message(
                            role="MESSAGE_ROLE_TOOL_CALL",
                            text="",
                            invocation_id=tool_id,
                            tool_name=tool_name,
                        )
                    )
        elif role == "tool" and message.get("content") not in (
            "IN_PROGRESS",
            "CANCELLED",
        ):
            tool_call_id = message.get("tool_call_id")
            has_string_id = isinstance(tool_call_id, str)
            keep(
                self._build_ultravox_message(
                    role="MESSAGE_ROLE_TOOL_RESULT",
                    text=self._stringify_tool_result(message.get("content")),
                    invocation_id=tool_call_id if has_string_id else None,
                    tool_name=(
                        names_by_call_id.get(tool_call_id) if has_string_id else None
                    ),
                )
            )
            if has_string_id:
                finished_call_ids.add(tool_call_id)

    return (history or None), finished_call_ids
@staticmethod
def _build_ultravox_message(
*,
role: str,
text: str | None,
invocation_id: str | None = None,
tool_name: str | None = None,
) -> dict[str, Any] | None:
if text is None:
return None
message: dict[str, Any] = {
"role": role,
"text": text,
}
if invocation_id is not None:
message["invocationId"] = invocation_id
if tool_name is not None:
message["toolName"] = tool_name
return message
@staticmethod
def _should_agent_speak_first(
initial_messages: list[dict[str, Any]] | None,
) -> bool:
if not initial_messages:
return True
return initial_messages[-1].get("role") in {
"MESSAGE_ROLE_USER",
"MESSAGE_ROLE_TOOL_RESULT",
}
@staticmethod
def _should_add_resumption_user_message(
initial_messages: list[dict[str, Any]] | None,
) -> bool:
if not initial_messages:
return False
return initial_messages[-1].get("role") == "MESSAGE_ROLE_TOOL_RESULT"
@staticmethod
def _is_benign_websocket_close(exc: ConnectionClosed) -> bool:
return any(
close is not None and close.code in {1000, 1001}
for close in (exc.sent, exc.rcvd)
)
@staticmethod
def _extract_text_content(content: Any) -> str | None:
if isinstance(content, str):
return content
if isinstance(content, list):
parts: list[str] = []
for part in content:
if not isinstance(part, dict):
return None
if part.get("type") != "text":
return None
text = part.get("text")
if not isinstance(text, str):
return None
parts.append(text)
return "\n".join(parts) if parts else None
return None
@staticmethod
def _stringify_tool_result(content: Any) -> str:
if isinstance(content, str):
return content
if isinstance(content, list):
parts: list[str] = []
for part in content:
if isinstance(part, dict):
text = part.get("text")
if isinstance(text, str):
parts.append(text)
if parts:
return "".join(parts)
return json.dumps(content, ensure_ascii=True, default=str)