feat: add ultravox realtime and fix signature issue in telephony

- Add UltraVox realtime - Fix signature issue on telephony
2026-06-22 08:38:13 +02:00 · 2026-05-23 12:34:54 +05:30 · 2026-05-23 12:34:54 +05:30 · ea0cac63cd
commit ea0cac63cd
parent 9135c2da13
24 changed files with 2082 additions and 133 deletions
--- a/api/services/pipecat/realtime/ultravox_realtime.py
+++ b/api/services/pipecat/realtime/ultravox_realtime.py
@ -0,0 +1,653 @@
+"""Dograh subclass of pipecat's Ultravox realtime LLM service.
+
+Ultravox is audio-native and realtime, but prompt and tool configuration is
+bound to call creation. Dograh therefore cannot lean on in-session updates or
+Gemini-style session resumption handles. This wrapper adapts Ultravox to the
+Dograh engine contract by:
+
+- deferring the first call creation until the engine queues the initial node
+  opening via ``TTSSpeakFrame`` or ``LLMContextFrame``
+- marking the call for recreation when ``system_instruction`` changes across
+  node transitions, then rebuilding it on the follow-up ``LLMContextFrame``
+  so the transition tool result is present in ``initialMessages``
+- reconstructing Ultravox ``initialMessages`` from Dograh context when the
+  call must be recreated after a node transition
+- appending a transient resumptive user nudge to recreated ``initialMessages``
+  after tool-result transitions, without mutating Dograh's stored context
+- handling Dograh-only frames such as user mute and idle append prompts
+- tagging user transcripts with ``finalized=True`` for downstream parity
+"""
+
+import hashlib
+import json
+from typing import Any
+
+from loguru import logger
+from pydantic import Field
+from websockets.exceptions import ConnectionClosed
+
+from pipecat.frames.frames import (
+    Frame,
+    LLMMessagesAppendFrame,
+    TranscriptionFrame,
+    TTSSpeakFrame,
+    UserMuteStartedFrame,
+    UserMuteStoppedFrame,
+)
+from pipecat.processors.aggregators import async_tool_messages
+from pipecat.processors.aggregators.llm_context import (
+    LLMContext,
+    LLMSpecificMessage,
+    is_given,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.llm_service import LLMService
+from pipecat.services.settings import _NotGiven, assert_given
+from pipecat.services.ultravox.llm import (
+    OneShotInputParams,
+    UltravoxRealtimeLLMService,
+    websocket_client,
+)
+from pipecat.utils.time import time_now_iso8601
+
+
+class DograhUltravoxOneShotInputParams(OneShotInputParams):
+    """Dograh-friendly OneShot params with string voice support.
+
+    Redeclares ``voice`` as an optional free-form string that defaults to
+    ``None``; every other field comes from ``OneShotInputParams``.
+    """
+
+    # NOTE(review): presumably the upstream ``voice`` field is typed
+    # differently (hence the override) — confirm against OneShotInputParams.
+    voice: str | None = Field(default=None)
+
+
+# Upper bound, in seconds, applied to a tool's configured timeout before it is
+# formatted for Ultravox in ``_ultravox_timeout_for_tool``.
+# NOTE(review): presumably mirrors an Ultravox-side limit — confirm.
+_ULTRAVOX_MAX_TOOL_TIMEOUT_SECS = 40.0
+# Text of the transient user-role message that ``_initial_messages_for_call``
+# appends when the replayed history ends with a tool result.
+_RESUMPTION_USER_MESSAGE = (
+    "IMPORTANT: We are resuming an existing conversation. You are given previous turns ONLY for your reference. "
+    "Do not use that to frame your response. Follow your ORIGINAL INSTRUCTIONS ONLY."
+)
+
+
+class DograhUltravoxRealtimeLLMService(UltravoxRealtimeLLMService):
+    """Ultravox realtime with Dograh engine integration quirks."""
+
+    def __init__(self, **kwargs):
+        """Initialise the upstream service and the Dograh-specific call state.
+
+        Args:
+            **kwargs: Forwarded unchanged to the parent constructor.
+        """
+        super().__init__(**kwargs)
+        # Most recent context passed to ``_handle_context``.
+        self._context: LLMContext | None = None
+        # Tools schema captured by ``_connect_call`` when a call is created.
+        self._selected_tools = None
+        # Toggled by UserMuteStarted/StoppedFrame; user audio is dropped while set.
+        self._user_is_muted: bool = False
+        # System instruction passed to ``_connect_call`` for the current call.
+        self._call_system_instruction: str | None = None
+        # Set when ``system_instruction`` changes after the first connect;
+        # cleared at the end of ``_reconnect_with_context``.
+        self._reconnect_required: bool = False
+        # True once "call_started" has been received on the current socket.
+        self._call_started: bool = False
+        # True after the first call has been created and joined.
+        self._has_connected_once: bool = False
+        # Reconnect arguments parked by ``_handle_context`` while the bot is
+        # still responding; consumed by ``_handle_response_end``.
+        self._pending_reconnect_system_instruction: str | None = None
+        self._pending_initial_messages: list[dict[str, Any]] | None = None
+        # User texts queued until "call_started" arrives.
+        self._pending_user_text_messages: list[str] = []
+
+    async def start(self, frame):
+        """Start the service without creating an Ultravox call.
+
+        Invokes ``LLMService.start`` explicitly rather than ``super().start``,
+        so the immediate parent's ``start`` is not run here.
+
+        Args:
+            frame: The start frame, passed through to ``LLMService.start``.
+        """
+        # Dograh defers call creation until the engine queues the node opening.
+        await LLMService.start(self, frame)
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Intercept Dograh-specific frames and delegate the rest upstream.
+
+        - Mute start/stop frames update the local mute flag and are pushed on.
+        - ``TTSSpeakFrame`` creates the call with the frame text as greeting
+          when no socket exists yet; otherwise it is dropped with a warning.
+        - ``LLMMessagesAppendFrame`` goes to ``_handle_messages_append``.
+
+        Args:
+            frame: The frame to process.
+            direction: Direction the frame is travelling in.
+        """
+        if isinstance(frame, (UserMuteStartedFrame, UserMuteStoppedFrame)):
+            # A "started" frame mutes, a "stopped" frame unmutes.
+            self._user_is_muted = isinstance(frame, UserMuteStartedFrame)
+            await self.push_frame(frame, direction)
+        elif isinstance(frame, TTSSpeakFrame):
+            if self._socket:
+                logger.warning(
+                    f"{self}: TTSSpeakFrame received after the Ultravox call was "
+                    "already created; ignoring because Ultravox owns speech output"
+                )
+            else:
+                await self._connect_call(
+                    system_instruction=self._current_system_instruction(),
+                    greeting_text=frame.text,
+                    initial_messages=None,
+                    agent_speaks_first=True,
+                )
+        elif isinstance(frame, LLMMessagesAppendFrame):
+            await self._handle_messages_append(frame)
+        else:
+            await super().process_frame(frame, direction)
+
+    async def _update_settings(self, delta: UltravoxRealtimeLLMService.Settings):
+        """Apply a settings delta and flag the call for recreation if needed.
+
+        Args:
+            delta: The settings changes to apply.
+
+        Returns:
+            The mapping of changed settings returned by the base implementation.
+        """
+        # ``super(UltravoxRealtimeLLMService, self)`` starts the lookup after
+        # UltravoxRealtimeLLMService, so that class's own ``_update_settings``
+        # is skipped and the handling below replaces it.
+        changed = await super(UltravoxRealtimeLLMService, self)._update_settings(delta)
+        if "output_medium" in changed:
+            await self._update_output_medium(assert_given(self._settings.output_medium))
+        if "system_instruction" in changed and self._has_connected_once:
+            # Mirror Gemini's "settings change means reconnect" intent, but
+            # defer the actual new-call creation until the subsequent
+            # LLMContextFrame arrives with the transition tool result. Ultravox
+            # cannot accept that historical tool result over a formal
+            # post-connect tool-response channel the way Gemini can.
+            self._reconnect_required = True
+        # Anything changed beyond these two keys is reported as unhandled.
+        handled = {"output_medium", "system_instruction"}
+        self._warn_unhandled_updated_settings(changed.keys() - handled)
+        return changed
+
+    async def _disconnect(self, preserve_completed_tool_calls: bool = True):
+        """Close the current Ultravox socket and reset per-call state.
+
+        Args:
+            preserve_completed_tool_calls: When False, the set of completed
+                tool call ids is cleared as well.
+        """
+        # Lets ``_receive_messages`` treat errors raised during teardown as
+        # expected rather than fatal.
+        self._disconnecting = True
+        await self.stop_all_metrics()
+        if self._socket:
+            await self._socket.close()
+            self._socket = None
+        # NOTE(review): ``_handle_response_end`` can reach this method from
+        # inside the receive task itself (via ``_reconnect_with_context``), in
+        # which case the task being cancelled is the current one — confirm
+        # ``cancel_task`` tolerates that.
+        if self._receive_task:
+            await self.cancel_task(self._receive_task, timeout=1.0)
+            self._receive_task = None
+        if not preserve_completed_tool_calls:
+            self._completed_tool_calls = set()
+        self._call_started = False
+        self._started_placeholder_sent = set()
+        self._disconnecting = False
+
+    async def _send_user_audio(self, frame):
+        """Forward user audio upstream unless the user is currently muted.
+
+        Args:
+            frame: The user audio frame to send.
+        """
+        if not self._user_is_muted:
+            await super()._send_user_audio(frame)
+
+    async def _handle_context(self, context: LLMContext):
+        """Handle a new context, creating or recreating the call when required.
+
+        With a live socket and no pending reconnect the upstream handler runs.
+        Otherwise ``initialMessages`` are rebuilt from ``context`` and the call
+        is (re)created — immediately, or after the current bot response ends.
+
+        Args:
+            context: The context delivered by the engine.
+        """
+        self._context = context
+        system_instruction = self._current_system_instruction()
+
+        if self._socket and not self._reconnect_required:
+            await super()._handle_context(context)
+            return
+
+        initial_messages, history_tool_call_ids = self._build_initial_messages(context)
+        # Record tool calls that already have a result in the replayed history.
+        if history_tool_call_ids:
+            self._completed_tool_calls.update(history_tool_call_ids)
+
+        # Park the reconnect while the bot is responding;
+        # ``_handle_response_end`` picks it up afterwards.
+        if self._bot_responding:
+            self._pending_reconnect_system_instruction = system_instruction
+            self._pending_initial_messages = initial_messages
+            return
+
+        await self._reconnect_with_context(
+            system_instruction=system_instruction,
+            initial_messages=initial_messages,
+        )
+
+    async def _handle_response_end(self):
+        """Finish the bot response, then run any reconnect parked during it.
+
+        ``_handle_context`` parks the system instruction and the rebuilt
+        ``initialMessages`` instead of reconnecting while the bot is still
+        responding. This method consumes that parked state once the upstream
+        response-end handling has completed.
+        """
+        await super()._handle_response_end()
+
+        system_instruction = self._pending_reconnect_system_instruction
+        initial_messages = self._pending_initial_messages
+        # Either field marks a parked reconnect. The instruction alone is not a
+        # reliable sentinel: it is None whenever ``system_instruction`` is not
+        # given, and in that case the parked history must still be replayed
+        # rather than silently dropped.
+        if system_instruction is None and initial_messages is None:
+            return
+
+        # Clear the parked state before awaiting so it cannot be replayed twice.
+        self._pending_reconnect_system_instruction = None
+        self._pending_initial_messages = None
+        await self._reconnect_with_context(
+            system_instruction=system_instruction,
+            initial_messages=initial_messages,
+        )
+
+    async def _handle_messages_append(self, frame: LLMMessagesAppendFrame):
+        """Deliver appended message text to Ultravox as user text.
+
+        Text is extracted from every dict message in the frame. It is sent
+        right away when the call has started; otherwise it is queued, and a
+        user-speaks-first call is created if no socket exists yet.
+
+        Args:
+            frame: The frame carrying the messages to append.
+        """
+        texts: list[str] = []
+        for message in frame.messages:
+            if not isinstance(message, dict):
+                continue
+            extracted = self._extract_text_content(message.get("content"))
+            if extracted:
+                texts.append(extracted)
+        if not texts:
+            return
+
+        if self._socket and self._call_started:
+            for text in texts:
+                await self._send_user_text(text)
+            return
+
+        # Not deliverable yet: keep the texts until "call_started" arrives.
+        self._pending_user_text_messages.extend(texts)
+        if self._socket:
+            logger.debug(
+                f"{self}: queueing {len(texts)} user text message(s) until call_started"
+            )
+            return
+
+        await self._connect_call(
+            system_instruction=self._current_system_instruction(),
+            greeting_text=None,
+            initial_messages=None,
+            agent_speaks_first=False,
+        )
+    async def _handle_user_transcript(self, text: str):
+        """Broadcast a finalized ``TranscriptionFrame`` for a user transcript.
+
+        Transcripts that are empty after stripping whitespace are ignored.
+
+        Args:
+            text: Raw transcript text received from Ultravox.
+        """
+        transcript = (text or "").strip()
+        if transcript:
+            await self.broadcast_frame(
+                TranscriptionFrame,
+                user_id=self._last_user_id or "",
+                timestamp=time_now_iso8601(),
+                result=text,
+                text=transcript,
+                finalized=True,
+            )
+
+    async def _connect_call(
+        self,
+        *,
+        system_instruction: str | None,
+        greeting_text: str | None,
+        initial_messages: list[dict[str, Any]] | None,
+        agent_speaks_first: bool,
+    ):
+        """Create a new Ultravox call and join it over a websocket.
+
+        Failures are logged and reported through ``push_error`` as fatal
+        instead of being raised to the caller.
+
+        Args:
+            system_instruction: Instruction recorded as belonging to this call.
+            greeting_text: Exact text for the agent's first utterance, if any.
+            initial_messages: Prior conversation to seed the call with.
+            agent_speaks_first: Whether the agent opens the conversation.
+        """
+        params = self._build_one_shot_params(
+            greeting_text=greeting_text,
+            initial_messages=initial_messages,
+            agent_speaks_first=agent_speaks_first,
+        )
+        self._params = params
+        self._selected_tools = self._current_tools_schema(self._context)
+        tool_names = (
+            [tool.name for tool in self._selected_tools.standard_tools]
+            if self._selected_tools
+            else []
+        )
+        prompt = params.system_prompt or ""
+        # Only a short digest and the length of the prompt are logged, not its text.
+        prompt_hash = hashlib.sha256(prompt.encode("utf-8")).hexdigest()[:12]
+
+        try:
+            logger.info(
+                f"{self}: creating Ultravox call "
+                f"(agent_speaks_first={agent_speaks_first}, "
+                f"voice={params.voice!r}, "
+                f"tools={tool_names}, "
+                f"system_prompt_len={len(prompt)}, "
+                f"system_prompt_sha256={prompt_hash})"
+            )
+            join_url = await self._start_one_shot_call(params)
+            logger.info(f"Joining Ultravox Realtime call via URL: {join_url}")
+            self._socket = await websocket_client.connect(join_url)
+            self._receive_task = self.create_task(self._receive_messages())
+            self._call_system_instruction = system_instruction
+            # Stays False until the receive loop sees "call_started".
+            self._call_started = False
+            self._has_connected_once = True
+        except Exception as e:
+            logger.error(
+                f"{self}: Ultravox call creation/join failed "
+                f"for tools={tool_names}: {e}"
+            )
+            await self.push_error(f"Failed to connect to Ultravox: {e}", e, fatal=True)
+
+    async def _receive_messages(self):
+        """Receive messages from the Ultravox Realtime WebSocket.
+
+        Upstream handles exceptions raised while processing individual messages,
+        but websocket close exceptions are raised by the async iterator itself.
+        During user hangup / pipeline teardown that close is expected, so treat
+        normal websocket shutdown as a debug condition rather than a pipeline
+        error.
+        """
+        if not self._socket:
+            return
+
+        try:
+            async for message in self._socket:
+                try:
+                    # Binary websocket messages are handed to the audio handler.
+                    if isinstance(message, bytes):
+                        await self._handle_audio(message)
+                        continue
+
+                    data = json.loads(message)
+                    match data.get("type"):
+                        case "call_started":
+                            self._call_started = True
+                            logger.debug(
+                                f"{self}: Ultravox call_started received for callId="
+                                f"{data.get('callId')}"
+                            )
+                            # Texts queued before the call was ready can go out now.
+                            await self._flush_pending_user_text_messages()
+                        case "state":
+                            # Any state other than "speaking" while the bot is
+                            # marked as responding ends the response.
+                            if self._bot_responding and data.get("state") != "speaking":
+                                await self._handle_response_end()
+                        case "client_tool_invocation":
+                            await self._handle_tool_invocation(
+                                data.get("toolName"),
+                                data.get("invocationId"),
+                                data.get("parameters"),
+                            )
+                        case "transcript":
+                            match data.get("role"):
+                                case "user":
+                                    if not data.get("final"):
+                                        logger.warning(
+                                            "Unexpected non-final user transcript from Ultravox Realtime; ignoring."
+                                        )
+                                    else:
+                                        await self._handle_user_transcript(
+                                            data.get("text")
+                                        )
+                                case "agent":
+                                    await self._handle_agent_transcript(
+                                        data.get("medium"),
+                                        data.get("text"),
+                                        data.get("delta"),
+                                        data.get("final", False),
+                                    )
+                                case _:
+                                    logger.debug(
+                                        f"Received transcript with unknown role from Ultravox Realtime: {data}"
+                                    )
+                        case _:
+                            logger.debug(f"Received unhandled Ultravox message: {data}")
+                except Exception as e:
+                    # Errors raised while tearing down are expected; stop quietly.
+                    if self._disconnecting or not self._socket:
+                        return
+                    await self.push_error(
+                        "Ultravox websocket receive error", e, fatal=True
+                    )
+        except ConnectionClosed as e:
+            # A close during teardown, or with close code 1000/1001, is not an error.
+            if (
+                self._disconnecting
+                or not self._socket
+                or self._is_benign_websocket_close(e)
+            ):
+                logger.debug(f"{self}: Ultravox websocket closed: {e}")
+                return
+            await self.push_error("Ultravox websocket receive error", e, fatal=True)
+
+    async def _flush_pending_user_text_messages(self):
+        """Send queued user texts, in order, once the call is ready.
+
+        Does nothing unless a socket exists, the call has started and at least
+        one text is queued.
+        """
+        ready = self._socket and self._call_started
+        if not (ready and self._pending_user_text_messages):
+            return
+
+        # Swap the queue out before sending so it is empty while we await.
+        queued, self._pending_user_text_messages = (
+            self._pending_user_text_messages,
+            [],
+        )
+        for queued_text in queued:
+            await self._send_user_text(queued_text)
+
+    async def _reconnect_with_context(
+        self,
+        *,
+        system_instruction: str | None,
+        initial_messages: list[dict[str, Any]] | None,
+    ):
+        """Replace the current call with a new one seeded from history.
+
+        Args:
+            system_instruction: Instruction to record for the new call.
+            initial_messages: Ultravox-format history to replay, or None.
+        """
+        # Computed for the debug log only; ``_build_one_shot_params`` derives
+        # the same list again when the call is actually created.
+        call_initial_messages = self._initial_messages_for_call(initial_messages)
+        logger.debug(
+            f"{self}: reconnecting Ultravox call with initialMessages="
+            f"{json.dumps(call_initial_messages, ensure_ascii=True, default=str)}"
+        )
+        if self._socket:
+            await self._disconnect(preserve_completed_tool_calls=True)
+
+        await self._connect_call(
+            system_instruction=system_instruction,
+            greeting_text=None,
+            initial_messages=initial_messages,
+            agent_speaks_first=self._should_agent_speak_first(initial_messages),
+        )
+        # Cleared unconditionally: ``_connect_call`` reports failures through
+        # ``push_error`` rather than raising.
+        self._reconnect_required = False
+
+    def _build_one_shot_params(
+        self,
+        *,
+        greeting_text: str | None,
+        initial_messages: list[dict[str, Any]] | None,
+        agent_speaks_first: bool,
+    ) -> DograhUltravoxOneShotInputParams:
+        """Derive the parameters for the next call from the current ones.
+
+        Args:
+            greeting_text: Exact opening text for the agent, if any.
+            initial_messages: Ultravox-format history to replay, or None.
+            agent_speaks_first: Whether the agent opens when no greeting is set.
+
+        Returns:
+            Fresh params whose ``extra`` carries ``firstSpeakerSettings`` and,
+            when there is history, ``initialMessages``.
+        """
+        base = self._params
+        managed_keys = {"firstSpeakerSettings", "initialMessages"}
+
+        # Carry over caller-supplied extras except the keys managed here.
+        extra = {}
+        for key, value in base.extra.items():
+            if key not in managed_keys:
+                extra[key] = value
+
+        if greeting_text is not None:
+            first_speaker = {"agent": {"text": greeting_text}}
+        elif agent_speaks_first:
+            first_speaker = {"agent": {}}
+        else:
+            first_speaker = {"user": {}}
+        extra["firstSpeakerSettings"] = first_speaker
+
+        replayed_messages = self._initial_messages_for_call(initial_messages)
+        if replayed_messages:
+            extra["initialMessages"] = replayed_messages
+
+        # Fall back to the params' medium when the setting was never given.
+        medium = self._settings.output_medium
+        if isinstance(medium, _NotGiven):
+            medium = base.output_medium
+
+        return DograhUltravoxOneShotInputParams(
+            api_key=base.api_key,
+            system_prompt=self._current_system_instruction(),
+            temperature=base.temperature,
+            model=assert_given(self._settings.model),
+            voice=base.voice,
+            metadata=base.metadata,
+            output_medium=medium,
+            max_duration=base.max_duration,
+            extra=extra,
+        )
+
+    def _current_tools_schema(self, context: LLMContext | None):
+        """Return ``context.tools`` when a context with given tools exists.
+
+        Args:
+            context: The current context, or None.
+
+        Returns:
+            The context's tools, or None when there is no context or its tools
+            are not given.
+        """
+        if context is not None and is_given(context.tools):
+            return context.tools
+        return None
+
+    def _to_selected_tools(self, tool: Any) -> list[dict[str, Any]]:
+        """Convert tools via the parent and stamp per-tool timeouts on them.
+
+        Args:
+            tool: The tool description accepted by the parent implementation.
+
+        Returns:
+            The parent's selected-tools list; each ``temporaryTool`` entry with
+            a string ``modelToolName`` gains a ``timeout`` when one is
+            configured for that tool.
+        """
+        converted = super()._to_selected_tools(tool)
+        for entry in converted:
+            temporary = entry.get("temporaryTool")
+            name = (
+                temporary.get("modelToolName") if isinstance(temporary, dict) else None
+            )
+            if not isinstance(name, str):
+                continue
+
+            timeout = self._ultravox_timeout_for_tool(name)
+            if timeout is not None:
+                temporary["timeout"] = timeout
+        return converted
+
+    def _current_system_instruction(self) -> str | None:
+        """Return the configured system instruction, or None when not given."""
+        instruction = self._settings.system_instruction
+        return None if isinstance(instruction, _NotGiven) else instruction
+
+    def _ultravox_timeout_for_tool(self, function_name: str) -> str | None:
+        """Format the timeout registered for a tool as an Ultravox duration.
+
+        The entry registered under ``function_name`` is used, falling back to
+        the entry registered under ``None``.
+
+        Args:
+            function_name: Name of the tool to look up.
+
+        Returns:
+            A string such as ``"12.5s"``, capped at
+            ``_ULTRAVOX_MAX_TOOL_TIMEOUT_SECS``, or None when no positive
+            timeout is configured.
+        """
+        registered = self._functions.get(function_name) or self._functions.get(None)
+        if registered is None:
+            return None
+        if registered.timeout_secs is None or registered.timeout_secs <= 0:
+            return None
+
+        capped = min(float(registered.timeout_secs), _ULTRAVOX_MAX_TOOL_TIMEOUT_SECS)
+        return f"{capped:g}s"
+
+    def _initial_messages_for_call(
+        self, initial_messages: list[dict[str, Any]] | None
+    ) -> list[dict[str, Any]] | None:
+        """Return the history to send, adding the resumption nudge if needed.
+
+        Args:
+            initial_messages: Ultravox-format history, or None.
+
+        Returns:
+            None for empty history; the input list itself when no nudge is
+            needed; otherwise a new list ending with the resumption user
+            message. The input list is never mutated.
+        """
+        if not initial_messages:
+            return None
+        if self._should_add_resumption_user_message(initial_messages):
+            with_nudge = list(initial_messages)
+            with_nudge.append(
+                {
+                    "role": "MESSAGE_ROLE_USER",
+                    "text": _RESUMPTION_USER_MESSAGE,
+                }
+            )
+            return with_nudge
+        return initial_messages
+
+    def _build_initial_messages(
+        self, context: LLMContext
+    ) -> tuple[list[dict[str, Any]] | None, set[str]]:
+        """Translate the Dograh context into Ultravox ``initialMessages``.
+
+        Args:
+            context: The context whose messages are replayed.
+
+        Returns:
+            A pair of (messages or None when nothing was produced, ids of tool
+            calls that already have a result in the history).
+        """
+        initial_messages: list[dict[str, Any]] = []
+        # Filled from assistant tool calls so later results can carry the tool name.
+        tool_call_id_to_name: dict[str, str] = {}
+        completed_tool_call_ids: set[str] = set()
+
+        for message in context.get_messages():
+            # Provider-specific messages are not replayed.
+            if isinstance(message, LLMSpecificMessage):
+                continue
+
+            async_payload = async_tool_messages.parse_message(message)
+            if async_payload is not None:
+                if async_payload.kind == "intermediate":
+                    logger.error(
+                        f"{self}: Ultravox does not support streamed async tool results; "
+                        f"dropping intermediate result from initialMessages for "
+                        f"tool_call_id={async_payload.tool_call_id}."
+                    )
+                    continue
+                if async_payload.kind == "final":
+                    initial_message = self._build_ultravox_message(
+                        role="MESSAGE_ROLE_TOOL_RESULT",
+                        text=async_payload.result or "",
+                        invocation_id=async_payload.tool_call_id,
+                        tool_name=tool_call_id_to_name.get(async_payload.tool_call_id),
+                    )
+                    if initial_message is not None:
+                        initial_messages.append(initial_message)
+                    completed_tool_call_ids.add(async_payload.tool_call_id)
+                # Async payloads of any other kind are skipped.
+                continue
+
+            role = message.get("role")
+            if role == "user":
+                initial_message = self._build_ultravox_message(
+                    role="MESSAGE_ROLE_USER",
+                    text=self._extract_text_content(message.get("content")),
+                )
+                if initial_message is not None:
+                    initial_messages.append(initial_message)
+            elif role == "assistant":
+                text = self._extract_text_content(message.get("content"))
+                initial_message = self._build_ultravox_message(
+                    role="MESSAGE_ROLE_AGENT",
+                    text=text,
+                )
+                if initial_message is not None:
+                    initial_messages.append(initial_message)
+
+                # Each well-formed tool call becomes its own message with empty
+                # text; the call's arguments are not replayed.
+                tool_calls = message.get("tool_calls")
+                if isinstance(tool_calls, list):
+                    for tool_call in tool_calls:
+                        if not isinstance(tool_call, dict):
+                            continue
+                        tool_id = tool_call.get("id")
+                        function = tool_call.get("function")
+                        tool_name = (
+                            function.get("name") if isinstance(function, dict) else None
+                        )
+                        if isinstance(tool_id, str) and isinstance(tool_name, str):
+                            tool_call_id_to_name[tool_id] = tool_name
+                            initial_message = self._build_ultravox_message(
+                                role="MESSAGE_ROLE_TOOL_CALL",
+                                text="",
+                                invocation_id=tool_id,
+                                tool_name=tool_name,
+                            )
+                            if initial_message is not None:
+                                initial_messages.append(initial_message)
+            # Tool messages whose content is one of the two placeholder
+            # strings below carry no result and are left out.
+            elif (
+                role == "tool"
+                and message.get("content") != "IN_PROGRESS"
+                and message.get("content") != "CANCELLED"
+            ):
+                tool_call_id = message.get("tool_call_id")
+                initial_message = self._build_ultravox_message(
+                    role="MESSAGE_ROLE_TOOL_RESULT",
+                    text=self._stringify_tool_result(message.get("content")),
+                    invocation_id=tool_call_id
+                    if isinstance(tool_call_id, str)
+                    else None,
+                    tool_name=(
+                        tool_call_id_to_name.get(tool_call_id)
+                        if isinstance(tool_call_id, str)
+                        else None
+                    ),
+                )
+                if initial_message is not None:
+                    initial_messages.append(initial_message)
+                if isinstance(tool_call_id, str):
+                    completed_tool_call_ids.add(tool_call_id)
+
+        return (initial_messages or None), completed_tool_call_ids
+
+    @staticmethod
+    def _build_ultravox_message(
+        *,
+        role: str,
+        text: str | None,
+        invocation_id: str | None = None,
+        tool_name: str | None = None,
+    ) -> dict[str, Any] | None:
+        """Assemble one Ultravox message dict.
+
+        Args:
+            role: Ultravox message role.
+            text: Message text; ``None`` means no message is produced.
+            invocation_id: Stored as ``invocationId`` when not None.
+            tool_name: Stored as ``toolName`` when not None.
+
+        Returns:
+            The message dict, or None when ``text`` is None.
+        """
+        if text is None:
+            return None
+
+        optional_fields = (
+            ("invocationId", invocation_id),
+            ("toolName", tool_name),
+        )
+        message: dict[str, Any] = {"role": role, "text": text}
+        message.update(
+            {key: value for key, value in optional_fields if value is not None}
+        )
+        return message
+
+    @staticmethod
+    def _should_agent_speak_first(
+        initial_messages: list[dict[str, Any]] | None,
+    ) -> bool:
+        """Decide whether the agent should open a (re)created call.
+
+        Args:
+            initial_messages: Ultravox-format history, or None.
+
+        Returns:
+            True when there is no history, or when its last message is a user
+            message or a tool result; False otherwise.
+        """
+        if not initial_messages:
+            return True
+        roles_awaiting_agent = {
+            "MESSAGE_ROLE_USER",
+            "MESSAGE_ROLE_TOOL_RESULT",
+        }
+        last_role = initial_messages[-1].get("role")
+        return last_role in roles_awaiting_agent
+
+    @staticmethod
+    def _should_add_resumption_user_message(
+        initial_messages: list[dict[str, Any]] | None,
+    ) -> bool:
+        """Tell whether the history ends with a tool result.
+
+        Args:
+            initial_messages: Ultravox-format history, or None.
+
+        Returns:
+            True only when the history is non-empty and its last message has
+            role ``MESSAGE_ROLE_TOOL_RESULT``.
+        """
+        return (
+            bool(initial_messages)
+            and initial_messages[-1].get("role") == "MESSAGE_ROLE_TOOL_RESULT"
+        )
+
+    @staticmethod
+    def _is_benign_websocket_close(exc: ConnectionClosed) -> bool:
+        """Tell whether either side closed the websocket with code 1000 or 1001.
+
+        Args:
+            exc: The close exception; its ``sent`` and ``rcvd`` close frames
+                are inspected and either may be None.
+
+        Returns:
+            True when at least one present close frame carries code 1000 or 1001.
+        """
+        for close_frame in (exc.sent, exc.rcvd):
+            if close_frame is not None and close_frame.code in {1000, 1001}:
+                return True
+        return False
+
+    @staticmethod
+    def _extract_text_content(content: Any) -> str | None:
+        """Reduce message content to plain text when it is text-only.
+
+        Args:
+            content: A string, or a list of content parts.
+
+        Returns:
+            The string itself; or the parts' texts joined with newlines when
+            every part is a ``{"type": "text", "text": <str>}`` dict; otherwise
+            None (including for an empty list or any other type).
+        """
+        if isinstance(content, str):
+            return content
+        if not isinstance(content, list) or not content:
+            return None
+
+        texts: list[str] = []
+        for part in content:
+            # A single non-text or malformed part disqualifies the whole content.
+            if not isinstance(part, dict) or part.get("type") != "text":
+                return None
+            part_text = part.get("text")
+            if not isinstance(part_text, str):
+                return None
+            texts.append(part_text)
+        return "\n".join(texts)
+
+    @staticmethod
+    def _stringify_tool_result(content: Any) -> str:
+        """Render a tool result as a string.
+
+        Args:
+            content: A string, a list of content parts, or any other value.
+
+        Returns:
+            The string itself; or the concatenated ``text`` values of the dict
+            parts in a list when at least one has a string ``text``; otherwise
+            the ASCII-only JSON dump of ``content`` (non-serialisable values
+            are stringified).
+        """
+        if isinstance(content, str):
+            return content
+        if isinstance(content, list):
+            texts = [
+                part["text"]
+                for part in content
+                if isinstance(part, dict) and isinstance(part.get("text"), str)
+            ]
+            if texts:
+                return "".join(texts)
+        return json.dumps(content, ensure_ascii=True, default=str)
--- a/api/services/pipecat/service_factory.py
+++ b/api/services/pipecat/service_factory.py
@ -640,6 +640,24 @@ def create_realtime_llm_service(user_config, audio_config: "AudioConfig"):
                ),
            ),
        )
+    elif provider == ServiceProviders.ULTRAVOX_REALTIME.value:
+        from api.services.pipecat.realtime.ultravox_realtime import (
+            DograhUltravoxOneShotInputParams,
+            DograhUltravoxRealtimeLLMService,
+        )
+
+        return DograhUltravoxRealtimeLLMService(
+            params=DograhUltravoxOneShotInputParams(
+                api_key=api_key,
+                model=model,
+                voice=voice,
+                output_medium="voice",
+            ),
+            settings=DograhUltravoxRealtimeLLMService.Settings(
+                model=model,
+                output_medium="voice",
+            ),
+        )
    elif provider == ServiceProviders.GOOGLE_REALTIME.value:
        from api.services.pipecat.realtime.gemini_live import (
            DograhGeminiLiveLLMService,