mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-22 08:38:13 +02:00
feat: add ultravox realtime and fix signature issue in telephony
- Add UltraVox realtime - Fix signature issue on telephony
This commit is contained in:
parent
9135c2da13
commit
ea0cac63cd
24 changed files with 2082 additions and 133 deletions
653
api/services/pipecat/realtime/ultravox_realtime.py
Normal file
653
api/services/pipecat/realtime/ultravox_realtime.py
Normal file
|
|
@ -0,0 +1,653 @@
|
|||
"""Dograh subclass of pipecat's Ultravox realtime LLM service.
|
||||
|
||||
Ultravox is audio-native and realtime, but prompt and tool configuration is
|
||||
bound to call creation. Dograh therefore cannot lean on in-session updates or
|
||||
Gemini-style session resumption handles. This wrapper adapts Ultravox to the
|
||||
Dograh engine contract by:
|
||||
|
||||
- deferring the first call creation until the engine queues the initial node
|
||||
opening via ``TTSSpeakFrame`` or ``LLMContextFrame``
|
||||
- marking the call for recreation when ``system_instruction`` changes across
|
||||
node transitions, then rebuilding it on the follow-up ``LLMContextFrame``
|
||||
so the transition tool result is present in ``initialMessages``
|
||||
- reconstructing Ultravox ``initialMessages`` from Dograh context when the
|
||||
call must be recreated after a node transition
|
||||
- appending a transient resumptive user nudge to recreated ``initialMessages``
|
||||
after tool-result transitions, without mutating Dograh's stored context
|
||||
- handling Dograh-only frames such as user mute and idle append prompts
|
||||
- tagging user transcripts with ``finalized=True`` for downstream parity
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from loguru import logger
|
||||
from pydantic import Field
|
||||
from websockets.exceptions import ConnectionClosed
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
LLMMessagesAppendFrame,
|
||||
TranscriptionFrame,
|
||||
TTSSpeakFrame,
|
||||
UserMuteStartedFrame,
|
||||
UserMuteStoppedFrame,
|
||||
)
|
||||
from pipecat.processors.aggregators import async_tool_messages
|
||||
from pipecat.processors.aggregators.llm_context import (
|
||||
LLMContext,
|
||||
LLMSpecificMessage,
|
||||
is_given,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.llm_service import LLMService
|
||||
from pipecat.services.settings import _NotGiven, assert_given
|
||||
from pipecat.services.ultravox.llm import (
|
||||
OneShotInputParams,
|
||||
UltravoxRealtimeLLMService,
|
||||
websocket_client,
|
||||
)
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
|
||||
|
||||
class DograhUltravoxOneShotInputParams(OneShotInputParams):
    """One-shot call parameters whose ``voice`` field accepts a plain string.

    Everything else is inherited from ``OneShotInputParams``.
    """

    # Optional voice identifier given as a string; defaults to None.
    voice: str | None = Field(default=None)
|
||||
|
||||
|
||||
# Upper bound, in seconds, applied to per-tool timeouts sent to Ultravox.
_ULTRAVOX_MAX_TOOL_TIMEOUT_SECS = 40.0

# Transient user message appended to recreated ``initialMessages`` when the
# replayed history ends with a tool result.
_RESUMPTION_USER_MESSAGE = (
    "IMPORTANT: We are resuming an existing conversation. "
    "You are given previous turns ONLY for your reference. "
    "Do not use that to frame your response. "
    "Follow your ORIGINAL INSTRUCTIONS ONLY."
)
|
||||
|
||||
|
||||
class DograhUltravoxRealtimeLLMService(UltravoxRealtimeLLMService):
|
||||
"""Ultravox realtime with Dograh engine integration quirks."""
|
||||
|
||||
def __init__(self, **kwargs):
    """Initialise the upstream service, then Dograh's bookkeeping state.

    Args:
        **kwargs: Forwarded unchanged to the parent constructor.
    """
    super().__init__(**kwargs)

    # Latest context handed to ``_handle_context`` and the tools schema
    # selected for the current call.
    self._context: LLMContext | None = None
    self._selected_tools = None

    # Call lifecycle: whether any call was ever created, whether the
    # current call has reported ``call_started``, and whether the next
    # context frame must recreate the call.
    self._has_connected_once: bool = False
    self._call_started: bool = False
    self._reconnect_required: bool = False
    self._call_system_instruction: str | None = None

    # While True, user audio is not forwarded to Ultravox.
    self._user_is_muted: bool = False

    # Work deferred until the bot finishes responding / the call starts.
    self._pending_reconnect_system_instruction: str | None = None
    self._pending_initial_messages: list[dict[str, Any]] | None = None
    self._pending_user_text_messages: list[str] = []
|
||||
|
||||
async def start(self, frame):
    """Start the service without creating an Ultravox call yet.

    ``LLMService.start`` is invoked directly rather than through
    ``super()``: Dograh postpones call creation until the engine queues
    the node opening.

    Args:
        frame: The start frame, passed through to ``LLMService.start``.
    """
    await LLMService.start(self, frame)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
    """Intercept Dograh-specific frames before the upstream processing.

    - Mute start/stop frames toggle ``_user_is_muted`` and are pushed on.
    - A ``TTSSpeakFrame`` creates the first call with its text as the
      greeting; once a socket exists it is logged and dropped.
    - ``LLMMessagesAppendFrame`` is routed to ``_handle_messages_append``.
    - Everything else goes to the parent implementation.

    Args:
        frame: The frame to process.
        direction: Direction the frame is travelling in.
    """
    if isinstance(frame, (UserMuteStartedFrame, UserMuteStoppedFrame)):
        # Started -> muted, Stopped -> unmuted.
        self._user_is_muted = isinstance(frame, UserMuteStartedFrame)
        await self.push_frame(frame, direction)
        return

    if isinstance(frame, TTSSpeakFrame):
        if self._socket:
            logger.warning(
                f"{self}: TTSSpeakFrame received after the Ultravox call was "
                "already created; ignoring because Ultravox owns speech output"
            )
            return
        # No call yet: open one and let the agent speak the queued text.
        await self._connect_call(
            system_instruction=self._current_system_instruction(),
            greeting_text=frame.text,
            initial_messages=None,
            agent_speaks_first=True,
        )
        return

    if isinstance(frame, LLMMessagesAppendFrame):
        await self._handle_messages_append(frame)
        return

    await super().process_frame(frame, direction)
|
||||
|
||||
async def _update_settings(self, delta: UltravoxRealtimeLLMService.Settings):
    """Apply a settings delta and flag the call for recreation if needed.

    The update is delegated to the class that follows
    ``UltravoxRealtimeLLMService`` in the MRO, so the Ultravox parent's own
    ``_update_settings`` is bypassed.

    Args:
        delta: The settings changes to apply.

    Returns:
        The collection of changed settings reported by the delegated call.
    """
    updated = await super(UltravoxRealtimeLLMService, self)._update_settings(delta)

    if "output_medium" in updated:
        await self._update_output_medium(assert_given(self._settings.output_medium))

    # A new system instruction means a new call, as with Gemini's
    # "settings change means reconnect". The call is not recreated here:
    # that waits for the next LLMContextFrame, which carries the transition
    # tool result. Ultravox has no post-connect tool-response channel for
    # that historical result the way Gemini does.
    if self._has_connected_once and "system_instruction" in updated:
        self._reconnect_required = True

    handled = {"output_medium", "system_instruction"}
    self._warn_unhandled_updated_settings(updated.keys() - handled)
    return updated
|
||||
|
||||
async def _disconnect(self, preserve_completed_tool_calls: bool = True):
    """Close the current Ultravox call and reset per-call state.

    The ``_disconnecting`` flag is raised for the duration of the teardown
    and is always lowered again, even when closing the socket or cancelling
    the receive task raises. The socket reference is likewise always
    dropped, and per-call state is always reset, so a failed teardown does
    not leave the service looking permanently "mid-disconnect" or attached
    to a dead socket. Exceptions still propagate to the caller.

    Args:
        preserve_completed_tool_calls: When False, the set of completed
            tool call ids is cleared as well.
    """
    self._disconnecting = True
    try:
        await self.stop_all_metrics()
        try:
            if self._socket:
                await self._socket.close()
        finally:
            # Drop the socket even if close() failed, then still try to
            # stop the receive task that was reading from it.
            self._socket = None
            if self._receive_task:
                await self.cancel_task(self._receive_task, timeout=1.0)
                self._receive_task = None
    finally:
        if not preserve_completed_tool_calls:
            self._completed_tool_calls = set()
        self._call_started = False
        self._started_placeholder_sent = set()
        self._disconnecting = False
|
||||
|
||||
async def _send_user_audio(self, frame):
    """Forward user audio to the parent unless the user is muted.

    Args:
        frame: The user audio frame; dropped while ``_user_is_muted`` is set.
    """
    if not self._user_is_muted:
        await super()._send_user_audio(frame)
|
||||
|
||||
async def _handle_context(self, context: LLMContext):
    """Handle a new LLM context, creating or recreating the call if needed.

    With a live socket and no pending reconnect the context is passed to
    the parent. Otherwise ``initialMessages`` are rebuilt from the context
    and a new call is created, either immediately or, when the bot is
    still responding, after the current response ends.

    Args:
        context: The Dograh LLM context to adopt.
    """
    self._context = context

    must_create_call = self._reconnect_required or not self._socket
    if not must_create_call:
        await super()._handle_context(context)
        return

    instruction = self._current_system_instruction()
    history, finished_tool_call_ids = self._build_initial_messages(context)
    if finished_tool_call_ids:
        # Tool calls already answered in history count as completed.
        self._completed_tool_calls.update(finished_tool_call_ids)

    if self._bot_responding:
        # Park the reconnect; _handle_response_end picks it up.
        self._pending_reconnect_system_instruction = instruction
        self._pending_initial_messages = history
        return

    await self._reconnect_with_context(
        system_instruction=instruction,
        initial_messages=history,
    )
|
||||
|
||||
async def _handle_response_end(self):
    """Finish the current response, then run any deferred reconnect.

    ``_handle_context`` parks a reconnect while the bot is responding by
    storing the system instruction and the rebuilt initial messages. The
    system instruction may legitimately be ``None`` (no instruction
    configured), so a reconnect counts as pending when *either* stored
    value is set. Checking the instruction alone would silently drop a
    deferred reconnect that has history but no instruction.
    """
    await super()._handle_response_end()

    system_instruction = self._pending_reconnect_system_instruction
    initial_messages = self._pending_initial_messages
    if system_instruction is None and initial_messages is None:
        return

    # Clear the parked state before reconnecting so it cannot run twice.
    self._pending_reconnect_system_instruction = None
    self._pending_initial_messages = None
    await self._reconnect_with_context(
        system_instruction=system_instruction,
        initial_messages=initial_messages,
    )
|
||||
|
||||
async def _handle_messages_append(self, frame: LLMMessagesAppendFrame):
    """Send appended messages to Ultravox as user text, queueing if needed.

    Only dict messages whose content yields non-empty text are used. With
    no socket the texts are queued and a call is created with the user
    speaking first. With a socket that has not reported ``call_started``
    the texts are queued. Otherwise they are sent immediately.

    Args:
        frame: The append frame whose ``messages`` are inspected.
    """
    texts = []
    for message in frame.messages:
        if not isinstance(message, dict):
            continue
        extracted = self._extract_text_content(message.get("content"))
        if extracted:
            texts.append(extracted)

    if not texts:
        return

    if self._socket and self._call_started:
        for text in texts:
            await self._send_user_text(text)
        return

    # Not ready to send: queue the texts for the call_started flush.
    self._pending_user_text_messages.extend(texts)
    if self._socket:
        logger.debug(
            f"{self}: queueing {len(texts)} user text message(s) until call_started"
        )
        return

    await self._connect_call(
        system_instruction=self._current_system_instruction(),
        greeting_text=None,
        initial_messages=None,
        agent_speaks_first=False,
    )
|
||||
|
||||
async def _handle_user_transcript(self, text: str):
    """Broadcast a finalized ``TranscriptionFrame`` for a user transcript.

    Empty or whitespace-only transcripts are ignored. The frame's ``text``
    is the stripped transcript and ``result`` is the text as received.

    Args:
        text: Transcript text reported by Ultravox; may be empty or None.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return

    await self.broadcast_frame(
        TranscriptionFrame,
        user_id=self._last_user_id or "",
        timestamp=time_now_iso8601(),
        result=text,
        text=cleaned,
        finalized=True,
    )
|
||||
|
||||
async def _connect_call(
    self,
    *,
    system_instruction: str | None,
    greeting_text: str | None,
    initial_messages: list[dict[str, Any]] | None,
    agent_speaks_first: bool,
):
    """Create an Ultravox call, join its websocket and start receiving.

    Fresh one-shot params are built and stored on ``self._params`` and the
    tools schema is taken from the current context. Only the prompt's
    length and a short SHA-256 prefix are logged, not the prompt text.
    A failure during creation or join is logged and reported through
    ``push_error`` as fatal rather than raised.

    Args:
        system_instruction: Instruction recorded as belonging to this call.
        greeting_text: Text the agent should open with, if any.
        initial_messages: History to seed the call with, if any.
        agent_speaks_first: Whether the agent takes the first turn.
    """
    params = self._build_one_shot_params(
        greeting_text=greeting_text,
        initial_messages=initial_messages,
        agent_speaks_first=agent_speaks_first,
    )
    self._params = params

    tools = self._current_tools_schema(self._context)
    self._selected_tools = tools
    tool_names = [tool.name for tool in tools.standard_tools] if tools else []

    prompt = params.system_prompt or ""
    prompt_hash = hashlib.sha256(prompt.encode("utf-8")).hexdigest()[:12]

    try:
        logger.info(
            f"{self}: creating Ultravox call "
            f"(agent_speaks_first={agent_speaks_first}, "
            f"voice={params.voice!r}, "
            f"tools={tool_names}, "
            f"system_prompt_len={len(prompt)}, "
            f"system_prompt_sha256={prompt_hash})"
        )
        join_url = await self._start_one_shot_call(params)
        logger.info(f"Joining Ultravox Realtime call via URL: {join_url}")
        self._socket = await websocket_client.connect(join_url)
        self._receive_task = self.create_task(self._receive_messages())

        # The call exists but has not reported call_started yet.
        self._call_system_instruction = system_instruction
        self._call_started = False
        self._has_connected_once = True
    except Exception as e:
        logger.error(
            f"{self}: Ultravox call creation/join failed "
            f"for tools={tool_names}: {e}"
        )
        await self.push_error(f"Failed to connect to Ultravox: {e}", e, fatal=True)
|
||||
|
||||
async def _receive_messages(self):
|
||||
"""Receive messages from the Ultravox Realtime WebSocket.
|
||||
|
||||
Upstream handles exceptions raised while processing individual messages,
|
||||
but websocket close exceptions are raised by the async iterator itself.
|
||||
During user hangup / pipeline teardown that close is expected, so treat
|
||||
normal websocket shutdown as a debug condition rather than a pipeline
|
||||
error.
|
||||
"""
|
||||
if not self._socket:
|
||||
return
|
||||
|
||||
try:
|
||||
async for message in self._socket:
|
||||
try:
|
||||
if isinstance(message, bytes):
|
||||
await self._handle_audio(message)
|
||||
continue
|
||||
|
||||
data = json.loads(message)
|
||||
match data.get("type"):
|
||||
case "call_started":
|
||||
self._call_started = True
|
||||
logger.debug(
|
||||
f"{self}: Ultravox call_started received for callId="
|
||||
f"{data.get('callId')}"
|
||||
)
|
||||
await self._flush_pending_user_text_messages()
|
||||
case "state":
|
||||
if self._bot_responding and data.get("state") != "speaking":
|
||||
await self._handle_response_end()
|
||||
case "client_tool_invocation":
|
||||
await self._handle_tool_invocation(
|
||||
data.get("toolName"),
|
||||
data.get("invocationId"),
|
||||
data.get("parameters"),
|
||||
)
|
||||
case "transcript":
|
||||
match data.get("role"):
|
||||
case "user":
|
||||
if not data.get("final"):
|
||||
logger.warning(
|
||||
"Unexpected non-final user transcript from Ultravox Realtime; ignoring."
|
||||
)
|
||||
else:
|
||||
await self._handle_user_transcript(
|
||||
data.get("text")
|
||||
)
|
||||
case "agent":
|
||||
await self._handle_agent_transcript(
|
||||
data.get("medium"),
|
||||
data.get("text"),
|
||||
data.get("delta"),
|
||||
data.get("final", False),
|
||||
)
|
||||
case _:
|
||||
logger.debug(
|
||||
f"Received transcript with unknown role from Ultravox Realtime: {data}"
|
||||
)
|
||||
case _:
|
||||
logger.debug(f"Received unhandled Ultravox message: {data}")
|
||||
except Exception as e:
|
||||
if self._disconnecting or not self._socket:
|
||||
return
|
||||
await self.push_error(
|
||||
"Ultravox websocket receive error", e, fatal=True
|
||||
)
|
||||
except ConnectionClosed as e:
|
||||
if (
|
||||
self._disconnecting
|
||||
or not self._socket
|
||||
or self._is_benign_websocket_close(e)
|
||||
):
|
||||
logger.debug(f"{self}: Ultravox websocket closed: {e}")
|
||||
return
|
||||
await self.push_error("Ultravox websocket receive error", e, fatal=True)
|
||||
|
||||
async def _flush_pending_user_text_messages(self):
    """Send queued user texts once the call is connected and started.

    Does nothing without a socket, before ``call_started``, or when the
    queue is empty. The queue is swapped for a fresh list before sending.
    """
    can_flush = (
        self._socket and self._call_started and self._pending_user_text_messages
    )
    if not can_flush:
        return

    queued, self._pending_user_text_messages = self._pending_user_text_messages, []
    for queued_text in queued:
        await self._send_user_text(queued_text)
|
||||
|
||||
async def _reconnect_with_context(
    self,
    *,
    system_instruction: str | None,
    initial_messages: list[dict[str, Any]] | None,
):
    """Tear down any live call and create a new one seeded with history.

    The ``initialMessages`` that will be sent are logged at debug level.
    Completed tool calls are preserved across the disconnect, and the
    ``_reconnect_required`` flag is cleared afterwards.

    Args:
        system_instruction: Instruction to record for the new call.
        initial_messages: Rebuilt history, without the resumption nudge.
    """
    logged_messages = self._initial_messages_for_call(initial_messages)
    logger.debug(
        f"{self}: reconnecting Ultravox call with initialMessages="
        f"{json.dumps(logged_messages, ensure_ascii=True, default=str)}"
    )

    if self._socket:
        await self._disconnect(preserve_completed_tool_calls=True)

    speaks_first = self._should_agent_speak_first(initial_messages)
    await self._connect_call(
        system_instruction=system_instruction,
        greeting_text=None,
        initial_messages=initial_messages,
        agent_speaks_first=speaks_first,
    )
    self._reconnect_required = False
|
||||
|
||||
def _build_one_shot_params(
    self,
    *,
    greeting_text: str | None,
    initial_messages: list[dict[str, Any]] | None,
    agent_speaks_first: bool,
) -> DograhUltravoxOneShotInputParams:
    """Build the one-shot params for a new call from the current ones.

    ``firstSpeakerSettings`` and ``initialMessages`` in ``extra`` are
    always rebuilt; every other ``extra`` entry is carried over.

    Args:
        greeting_text: If not None, the agent opens with this text.
        initial_messages: History for the call; the resumption nudge is
            added by ``_initial_messages_for_call`` where applicable.
        agent_speaks_first: Used when no greeting text is given to pick
            between an agent-first and a user-first call.

    Returns:
        New params using the current system instruction and model setting.
    """
    base = self._params

    rebuilt_keys = ("firstSpeakerSettings", "initialMessages")
    extra = {
        name: setting
        for name, setting in base.extra.items()
        if name not in rebuilt_keys
    }

    if greeting_text is not None:
        first_speaker = {"agent": {"text": greeting_text}}
    elif agent_speaks_first:
        first_speaker = {"agent": {}}
    else:
        first_speaker = {"user": {}}
    extra["firstSpeakerSettings"] = first_speaker

    history = self._initial_messages_for_call(initial_messages)
    if history:
        extra["initialMessages"] = history

    # Prefer the live setting; fall back to the params' value if unset.
    medium = self._settings.output_medium
    if isinstance(medium, _NotGiven):
        medium = base.output_medium

    return DograhUltravoxOneShotInputParams(
        api_key=base.api_key,
        system_prompt=self._current_system_instruction(),
        temperature=base.temperature,
        model=assert_given(self._settings.model),
        voice=base.voice,
        metadata=base.metadata,
        output_medium=medium,
        max_duration=base.max_duration,
        extra=extra,
    )
|
||||
|
||||
def _current_tools_schema(self, context: LLMContext | None):
    """Return the context's tools, or None when there are none to use.

    Args:
        context: The current LLM context, if any.

    Returns:
        ``context.tools`` when a context exists and its tools are given,
        otherwise None.
    """
    if context is not None and is_given(context.tools):
        return context.tools
    return None
|
||||
|
||||
def _to_selected_tools(self, tool: Any) -> list[dict[str, Any]]:
    """Convert tools via the parent, then attach per-tool timeouts.

    Each converted entry whose ``temporaryTool`` is a dict with a string
    ``modelToolName`` gets a ``timeout`` field when
    ``_ultravox_timeout_for_tool`` yields one. Entries are updated in place.

    Args:
        tool: The tool definition(s), passed through to the parent.

    Returns:
        The parent's converted tool list.
    """
    converted = super()._to_selected_tools(tool)
    for entry in converted:
        temporary = entry.get("temporaryTool")
        name = temporary.get("modelToolName") if isinstance(temporary, dict) else None
        if not isinstance(name, str):
            continue

        timeout = self._ultravox_timeout_for_tool(name)
        if timeout is not None:
            temporary["timeout"] = timeout
    return converted
|
||||
|
||||
def _current_system_instruction(self) -> str | None:
    """Return the configured system instruction, or None when not given."""
    configured = self._settings.system_instruction
    return None if isinstance(configured, _NotGiven) else configured
|
||||
|
||||
def _ultravox_timeout_for_tool(self, function_name: str) -> str | None:
|
||||
item = self._functions.get(function_name) or self._functions.get(None)
|
||||
if item is None or item.timeout_secs is None or item.timeout_secs <= 0:
|
||||
return None
|
||||
|
||||
timeout_secs = min(float(item.timeout_secs), _ULTRAVOX_MAX_TOOL_TIMEOUT_SECS)
|
||||
return f"{timeout_secs:g}s"
|
||||
|
||||
def _initial_messages_for_call(
|
||||
self, initial_messages: list[dict[str, Any]] | None
|
||||
) -> list[dict[str, Any]] | None:
|
||||
if not initial_messages:
|
||||
return None
|
||||
if not self._should_add_resumption_user_message(initial_messages):
|
||||
return initial_messages
|
||||
|
||||
return [
|
||||
*initial_messages,
|
||||
{
|
||||
"role": "MESSAGE_ROLE_USER",
|
||||
"text": _RESUMPTION_USER_MESSAGE,
|
||||
},
|
||||
]
|
||||
|
||||
def _build_initial_messages(
    self, context: LLMContext
) -> tuple[list[dict[str, Any]] | None, set[str]]:
    """Translate the Dograh context into Ultravox ``initialMessages``.

    Walks ``context.get_messages()`` in order and emits Ultravox message
    dicts for user, assistant (text and tool calls) and tool-result
    messages. ``LLMSpecificMessage`` entries are skipped.

    Args:
        context: The Dograh LLM context to convert.

    Returns:
        A ``(messages, completed_ids)`` tuple: the converted messages (None
        when nothing was produced) and the ids of tool calls for which a
        result was found in the history.
    """
    # NOTE(review): indentation was reconstructed from a flattened listing;
    # confirm the nesting of the `continue` / `add` statements flagged below.
    initial_messages: list[dict[str, Any]] = []
    # Maps a tool call id to its function name so that results, which only
    # carry the id, can be labelled with the tool name.
    tool_call_id_to_name: dict[str, str] = {}
    completed_tool_call_ids: set[str] = set()

    for message in context.get_messages():
        if isinstance(message, LLMSpecificMessage):
            continue

        async_payload = async_tool_messages.parse_message(message)
        if async_payload is not None:
            if async_payload.kind == "intermediate":
                logger.error(
                    f"{self}: Ultravox does not support streamed async tool results; "
                    f"dropping intermediate result from initialMessages for "
                    f"tool_call_id={async_payload.tool_call_id}."
                )
                continue
            if async_payload.kind == "final":
                initial_message = self._build_ultravox_message(
                    role="MESSAGE_ROLE_TOOL_RESULT",
                    text=async_payload.result or "",
                    invocation_id=async_payload.tool_call_id,
                    tool_name=tool_call_id_to_name.get(async_payload.tool_call_id),
                )
                if initial_message is not None:
                    initial_messages.append(initial_message)
                completed_tool_call_ids.add(async_payload.tool_call_id)
                # NOTE(review): placed inside the "final" branch; payloads of
                # any other kind therefore fall through to the role handling
                # below. Verify against the original indentation.
                continue

        role = message.get("role")
        if role == "user":
            initial_message = self._build_ultravox_message(
                role="MESSAGE_ROLE_USER",
                text=self._extract_text_content(message.get("content")),
            )
            if initial_message is not None:
                initial_messages.append(initial_message)
        elif role == "assistant":
            text = self._extract_text_content(message.get("content"))
            initial_message = self._build_ultravox_message(
                role="MESSAGE_ROLE_AGENT",
                text=text,
            )
            if initial_message is not None:
                initial_messages.append(initial_message)

            tool_calls = message.get("tool_calls")
            if isinstance(tool_calls, list):
                for tool_call in tool_calls:
                    if not isinstance(tool_call, dict):
                        continue
                    tool_id = tool_call.get("id")
                    function = tool_call.get("function")
                    tool_name = (
                        function.get("name") if isinstance(function, dict) else None
                    )
                    # NOTE(review): the tool-call message is assumed to be
                    # emitted only when both id and name are strings; confirm.
                    if isinstance(tool_id, str) and isinstance(tool_name, str):
                        tool_call_id_to_name[tool_id] = tool_name
                        initial_message = self._build_ultravox_message(
                            role="MESSAGE_ROLE_TOOL_CALL",
                            text="",
                            invocation_id=tool_id,
                            tool_name=tool_name,
                        )
                        if initial_message is not None:
                            initial_messages.append(initial_message)
        elif (
            role == "tool"
            and message.get("content") != "IN_PROGRESS"
            and message.get("content") != "CANCELLED"
        ):
            # Tool messages whose content is exactly "IN_PROGRESS" or
            # "CANCELLED" are excluded by the condition above.
            tool_call_id = message.get("tool_call_id")
            initial_message = self._build_ultravox_message(
                role="MESSAGE_ROLE_TOOL_RESULT",
                text=self._stringify_tool_result(message.get("content")),
                invocation_id=tool_call_id
                if isinstance(tool_call_id, str)
                else None,
                tool_name=(
                    tool_call_id_to_name.get(tool_call_id)
                    if isinstance(tool_call_id, str)
                    else None
                ),
            )
            if initial_message is not None:
                initial_messages.append(initial_message)
            if isinstance(tool_call_id, str):
                completed_tool_call_ids.add(tool_call_id)

    return (initial_messages or None), completed_tool_call_ids
|
||||
|
||||
@staticmethod
|
||||
def _build_ultravox_message(
|
||||
*,
|
||||
role: str,
|
||||
text: str | None,
|
||||
invocation_id: str | None = None,
|
||||
tool_name: str | None = None,
|
||||
) -> dict[str, Any] | None:
|
||||
if text is None:
|
||||
return None
|
||||
|
||||
message: dict[str, Any] = {
|
||||
"role": role,
|
||||
"text": text,
|
||||
}
|
||||
if invocation_id is not None:
|
||||
message["invocationId"] = invocation_id
|
||||
if tool_name is not None:
|
||||
message["toolName"] = tool_name
|
||||
return message
|
||||
|
||||
@staticmethod
|
||||
def _should_agent_speak_first(
|
||||
initial_messages: list[dict[str, Any]] | None,
|
||||
) -> bool:
|
||||
if not initial_messages:
|
||||
return True
|
||||
return initial_messages[-1].get("role") in {
|
||||
"MESSAGE_ROLE_USER",
|
||||
"MESSAGE_ROLE_TOOL_RESULT",
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _should_add_resumption_user_message(
|
||||
initial_messages: list[dict[str, Any]] | None,
|
||||
) -> bool:
|
||||
if not initial_messages:
|
||||
return False
|
||||
return initial_messages[-1].get("role") == "MESSAGE_ROLE_TOOL_RESULT"
|
||||
|
||||
@staticmethod
def _is_benign_websocket_close(exc: ConnectionClosed) -> bool:
    """Report whether either side closed with code 1000 or 1001.

    Args:
        exc: The close exception; its ``sent`` and ``rcvd`` close frames
            are inspected, and either may be None.

    Returns:
        True if a present close frame carries code 1000 or 1001.
    """
    for close_frame in (exc.sent, exc.rcvd):
        if close_frame is not None and close_frame.code in {1000, 1001}:
            return True
    return False
|
||||
|
||||
@staticmethod
|
||||
def _extract_text_content(content: Any) -> str | None:
|
||||
if isinstance(content, str):
|
||||
return content
|
||||
if isinstance(content, list):
|
||||
parts: list[str] = []
|
||||
for part in content:
|
||||
if not isinstance(part, dict):
|
||||
return None
|
||||
if part.get("type") != "text":
|
||||
return None
|
||||
text = part.get("text")
|
||||
if not isinstance(text, str):
|
||||
return None
|
||||
parts.append(text)
|
||||
return "\n".join(parts) if parts else None
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _stringify_tool_result(content: Any) -> str:
|
||||
if isinstance(content, str):
|
||||
return content
|
||||
if isinstance(content, list):
|
||||
parts: list[str] = []
|
||||
for part in content:
|
||||
if isinstance(part, dict):
|
||||
text = part.get("text")
|
||||
if isinstance(text, str):
|
||||
parts.append(text)
|
||||
if parts:
|
||||
return "".join(parts)
|
||||
return json.dumps(content, ensure_ascii=True, default=str)
|
||||
|
|
@ -640,6 +640,24 @@ def create_realtime_llm_service(user_config, audio_config: "AudioConfig"):
|
|||
),
|
||||
),
|
||||
)
|
||||
elif provider == ServiceProviders.ULTRAVOX_REALTIME.value:
|
||||
from api.services.pipecat.realtime.ultravox_realtime import (
|
||||
DograhUltravoxOneShotInputParams,
|
||||
DograhUltravoxRealtimeLLMService,
|
||||
)
|
||||
|
||||
return DograhUltravoxRealtimeLLMService(
|
||||
params=DograhUltravoxOneShotInputParams(
|
||||
api_key=api_key,
|
||||
model=model,
|
||||
voice=voice,
|
||||
output_medium="voice",
|
||||
),
|
||||
settings=DograhUltravoxRealtimeLLMService.Settings(
|
||||
model=model,
|
||||
output_medium="voice",
|
||||
),
|
||||
)
|
||||
elif provider == ServiceProviders.GOOGLE_REALTIME.value:
|
||||
from api.services.pipecat.realtime.gemini_live import (
|
||||
DograhGeminiLiveLLMService,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue