mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-07-01 08:59:46 +02:00
Fix realtime initial greeting handling (#481)
This commit is contained in:
parent
d9800fddd6
commit
090d042a78
20 changed files with 714 additions and 70 deletions
|
|
@ -11,6 +11,7 @@ from typing import Any
|
|||
|
||||
from loguru import logger
|
||||
|
||||
from api.services.pipecat.realtime.static_greeting import format_static_greeting_prompt
|
||||
from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
|
|
@ -49,6 +50,7 @@ class DograhAzureRealtimeLLMService(AzureRealtimeLLMService):
|
|||
self._handled_initial_context: bool = False
|
||||
self._bot_is_speaking: bool = False
|
||||
self._deferred_function_calls: list[FunctionCallFromLLM] = []
|
||||
self._pending_initial_greeting_text: str | None = None
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, UserMuteStartedFrame):
|
||||
|
|
@ -61,7 +63,11 @@ class DograhAzureRealtimeLLMService(AzureRealtimeLLMService):
|
|||
return
|
||||
if isinstance(frame, TTSSpeakFrame):
|
||||
if not self._handled_initial_context:
|
||||
await self._handle_context(self._context)
|
||||
greeting_text = frame.text.strip() if frame.text else ""
|
||||
if greeting_text:
|
||||
await self._handle_initial_greeting(self._context, greeting_text)
|
||||
else:
|
||||
await self._handle_context(self._context)
|
||||
else:
|
||||
logger.warning(
|
||||
f"{self}: TTSSpeakFrame after initial context already handled — "
|
||||
|
|
@ -118,6 +124,57 @@ class DograhAzureRealtimeLLMService(AzureRealtimeLLMService):
|
|||
self._context = context
|
||||
await self._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
async def _handle_initial_greeting(self, context: LLMContext, greeting_text: str):
    """Start the first bot turn from a static text greeting.

    When ``context`` is ``None`` a warning is logged and nothing else
    happens. Otherwise the initial context is marked as handled, the
    context is stored on the service, and the greeting is handed to
    ``_create_initial_greeting_response``.
    """
    if context is not None:
        self._handled_initial_context = True
        self._context = context
        await self._create_initial_greeting_response(greeting_text)
        return

    logger.warning(
        f"{self}: received initial greeting trigger before context was set"
    )
|
||||
|
||||
async def _create_initial_greeting_response(self, greeting_text: str):
    """Request a response that speaks ``greeting_text``, or queue it.

    Does nothing while the service is disconnecting. If the API session is
    not ready yet, the greeting is stashed in
    ``_pending_initial_greeting_text`` and ``_run_llm_when_api_session_ready``
    is raised so ``_handle_evt_session_updated`` can replay it.
    """
    if self._disconnecting:
        return

    if self._api_session_ready:
        # Session is live: clear any queued greeting and send it now.
        self._pending_initial_greeting_text = None
        await self._ensure_conversation_setup()
        greeting_prompt = format_static_greeting_prompt(greeting_text)
        await self._send_manual_response_create(
            instructions=greeting_prompt,
            tool_choice="none",
        )
    else:
        # Session not ready: remember the greeting for later.
        self._pending_initial_greeting_text = greeting_text
        self._run_llm_when_api_session_ready = True
|
||||
|
||||
async def _ensure_conversation_setup(self):
    """Send the stored context messages and a session update, once.

    A no-op unless ``_llm_needs_conversation_setup`` is set. Each message
    from the adapter's invocation params is sent as a conversation item and
    its id recorded in ``_messages_added_manually``; afterwards a session
    update is sent and the flag is cleared.
    """
    if self._llm_needs_conversation_setup:
        invocation_params = self.get_llm_adapter().get_llm_invocation_params(
            self._context
        )
        for message in invocation_params["messages"]:
            create_event = events.ConversationItemCreateEvent(item=message)
            self._messages_added_manually[create_event.item.id] = True
            await self.send_client_event(create_event)

        await self._send_session_update()
        self._llm_needs_conversation_setup = False
|
||||
|
||||
async def _handle_evt_session_updated(self, evt):
    """Mark the API session ready and run whatever was queued for it.

    A queued static greeting takes priority over a queued plain response;
    in either case ``_run_llm_when_api_session_ready`` is reset before the
    deferred work is started.
    """
    self._api_session_ready = True

    queued_greeting = self._pending_initial_greeting_text
    if queued_greeting is not None:
        self._run_llm_when_api_session_ready = False
        await self._create_initial_greeting_response(queued_greeting)
        return

    if self._run_llm_when_api_session_ready:
        self._run_llm_when_api_session_ready = False
        await self._create_response()
|
||||
|
||||
async def _send_user_audio(self, frame):
|
||||
if self._user_is_muted:
|
||||
return
|
||||
|
|
@ -171,14 +228,21 @@ class DograhAzureRealtimeLLMService(AzureRealtimeLLMService):
|
|||
return "\n".join(parts) if parts else None
|
||||
return None
|
||||
|
||||
async def _send_manual_response_create(self):
|
||||
async def _send_manual_response_create(
|
||||
self,
|
||||
*,
|
||||
instructions: str | None = None,
|
||||
tool_choice: str | None = None,
|
||||
):
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
await self.start_processing_metrics()
|
||||
await self.start_ttfb_metrics()
|
||||
await self.send_client_event(
|
||||
events.ResponseCreateEvent(
|
||||
response=events.ResponseProperties(
|
||||
output_modalities=self._get_enabled_modalities()
|
||||
output_modalities=self._get_enabled_modalities(),
|
||||
instructions=instructions,
|
||||
tool_choice=tool_choice,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -20,11 +20,13 @@ Layers Dograh engine integration quirks onto upstream-pristine
|
|||
|
||||
from typing import Any
|
||||
|
||||
from google.genai.types import Content, Part
|
||||
from loguru import logger
|
||||
|
||||
from api.services.pipecat.gemini_json_schema_adapter import (
|
||||
DograhGeminiJSONSchemaAdapter,
|
||||
)
|
||||
from api.services.pipecat.realtime.static_greeting import format_static_greeting_prompt
|
||||
from pipecat.frames.frames import (
|
||||
BotStoppedSpeakingFrame,
|
||||
Frame,
|
||||
|
|
@ -63,6 +65,9 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
|
|||
# Function calls emitted by Gemini mid-bot-turn are deferred here and
|
||||
# invoked when the turn ends, so they don't race the turn's audio.
|
||||
self._pending_function_calls: list[FunctionCallFromLLM] = []
|
||||
# Text greeting captured from the first TTSSpeakFrame while the Gemini
|
||||
# session is still connecting.
|
||||
self._pending_initial_greeting_text: str | None = None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Hooks from upstream GeminiLiveLLMService
|
||||
|
|
@ -142,10 +147,15 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
|
|||
if isinstance(frame, TTSSpeakFrame):
|
||||
# Greeting trigger: the engine queues a TTSSpeakFrame to start the
|
||||
# bot's first turn after node setup. Gemini Live renders its own
|
||||
# audio, so we don't pass the frame through — we re-enter
|
||||
# _handle_context to kick off the initial response.
|
||||
# audio, so we don't pass the frame through. For configured static
|
||||
# text greetings, ask Gemini to say the exact greeting; otherwise
|
||||
# re-enter _handle_context to kick off the normal initial response.
|
||||
if not self._handled_initial_context:
|
||||
await self._handle_context(self._context)
|
||||
greeting_text = frame.text.strip() if frame.text else ""
|
||||
if greeting_text:
|
||||
await self._handle_initial_greeting(self._context, greeting_text)
|
||||
else:
|
||||
await self._handle_context(self._context)
|
||||
else:
|
||||
logger.warning(
|
||||
f"{self}: TTSSpeakFrame after initial context already "
|
||||
|
|
@ -183,6 +193,49 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
|
|||
self._context = context
|
||||
await self._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
async def _handle_initial_greeting(self, context: LLMContext, greeting_text: str):
    """Trigger the first Gemini turn with an exact static text greeting.

    Logs a warning and returns when ``context`` is ``None``. Otherwise it
    marks the initial context as handled, stores the context, and delegates
    to ``_create_initial_greeting_response``.
    """
    if context is not None:
        self._handled_initial_context = True
        self._context = context
        await self._create_initial_greeting_response(greeting_text)
        return

    logger.warning(
        f"{self}: received initial greeting trigger before context was set"
    )
|
||||
|
||||
async def _create_initial_greeting_response(self, greeting_text: str):
    """Ask Gemini Live to speak the configured greeting exactly once.

    Returns immediately while disconnecting. Without a session the greeting
    is parked in ``_pending_initial_greeting_text`` and
    ``_run_llm_when_session_ready`` is set. With a session, the formatted
    greeting prompt is sent as a single completed user turn.
    """
    if self._disconnecting:
        return

    if not self._session:
        # No session yet: keep the greeting until one is available.
        self._pending_initial_greeting_text = greeting_text
        self._run_llm_when_session_ready = True
        return

    self._pending_initial_greeting_text = None
    greeting_prompt = format_static_greeting_prompt(greeting_text)
    user_turn = Content(role="user", parts=[Part(text=greeting_prompt)])

    logger.debug("Creating Gemini Live initial response from static greeting")

    await self.start_ttfb_metrics()

    try:
        await self._session.send_client_content(turns=[user_turn], turn_complete=True)
        # Gemini 3.x also needs a realtime-input nudge to begin inference.
        if self._is_gemini_3:
            await self._session.send_realtime_input(text=" ")
    except Exception as send_error:
        await self._handle_send_error(send_error)

    # Set whether or not the send above raised.
    self._ready_for_realtime_input = True
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Session lifecycle: drop upstream's automatic reconnect-seed and
|
||||
# initial-context-seed paths. The TTSSpeakFrame trigger and the
|
||||
|
|
@ -201,7 +254,12 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
|
|||
# Context arrived before session was ready — fulfil the queued
|
||||
# initial response now.
|
||||
self._run_llm_when_session_ready = False
|
||||
await self._create_initial_response()
|
||||
if self._pending_initial_greeting_text is not None:
|
||||
await self._create_initial_greeting_response(
|
||||
self._pending_initial_greeting_text
|
||||
)
|
||||
else:
|
||||
await self._create_initial_response()
|
||||
await self._drain_pending_tool_results()
|
||||
# Otherwise: no automatic seed. Reconnect after a session-resumption
|
||||
# update relies on the server-side restored state; reconnects without
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ from typing import Any
|
|||
|
||||
from loguru import logger
|
||||
|
||||
from api.services.pipecat.realtime.static_greeting import format_static_greeting_prompt
|
||||
from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
|
|
@ -50,6 +51,7 @@ class DograhGrokRealtimeLLMService(GrokRealtimeLLMService):
|
|||
self._handled_initial_context: bool = False
|
||||
self._bot_is_speaking: bool = False
|
||||
self._deferred_function_calls: list[FunctionCallFromLLM] = []
|
||||
self._pending_initial_greeting_text: str | None = None
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, UserMuteStartedFrame):
|
||||
|
|
@ -62,7 +64,11 @@ class DograhGrokRealtimeLLMService(GrokRealtimeLLMService):
|
|||
return
|
||||
if isinstance(frame, TTSSpeakFrame):
|
||||
if not self._handled_initial_context:
|
||||
await self._handle_context(self._context)
|
||||
greeting_text = frame.text.strip() if frame.text else ""
|
||||
if greeting_text:
|
||||
await self._handle_initial_greeting(self._context, greeting_text)
|
||||
else:
|
||||
await self._handle_context(self._context)
|
||||
else:
|
||||
logger.warning(
|
||||
f"{self}: TTSSpeakFrame after initial context already "
|
||||
|
|
@ -120,6 +126,67 @@ class DograhGrokRealtimeLLMService(GrokRealtimeLLMService):
|
|||
self._context = context
|
||||
await self._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
async def _handle_initial_greeting(self, context: LLMContext, greeting_text: str):
    """Start the first Grok turn from a static text greeting.

    With no context a warning is logged and the call returns. Otherwise the
    initial context is flagged as handled, the context is stored, and the
    greeting is passed to ``_create_initial_greeting_response``.
    """
    if context is not None:
        self._handled_initial_context = True
        self._context = context
        await self._create_initial_greeting_response(greeting_text)
        return

    logger.warning(
        f"{self}: received initial greeting trigger before context was set"
    )
|
||||
|
||||
async def _create_initial_greeting_response(self, greeting_text: str):
    """Send the static greeting prompt as a user item, or queue it.

    Does nothing while disconnecting. If the API session is not ready the
    greeting is stored in ``_pending_initial_greeting_text`` and
    ``_run_llm_when_api_session_ready`` is set. Otherwise the formatted
    prompt is appended as a user ``input_text`` message, recorded in
    ``_messages_added_manually``, and a manual response is requested.
    """
    if self._disconnecting:
        return

    if not self._api_session_ready:
        self._pending_initial_greeting_text = greeting_text
        self._run_llm_when_api_session_ready = True
        return

    self._pending_initial_greeting_text = None
    await self._ensure_conversation_setup()

    prompt_content = events.ItemContent(
        type="input_text",
        text=format_static_greeting_prompt(greeting_text),
    )
    greeting_item = events.ConversationItem(
        type="message",
        role="user",
        content=[prompt_content],
    )
    create_event = events.ConversationItemCreateEvent(item=greeting_item)
    self._messages_added_manually[create_event.item.id] = True
    await self.send_client_event(create_event)
    await self._send_manual_response_create()
|
||||
|
||||
async def _ensure_conversation_setup(self):
    """Push the stored context to the provider once, then update the session.

    Skipped entirely when ``_llm_needs_conversation_setup`` is false. Every
    message from the adapter's invocation params becomes a conversation
    item whose id is noted in ``_messages_added_manually``.
    """
    if self._llm_needs_conversation_setup:
        invocation_params = self.get_llm_adapter().get_llm_invocation_params(
            self._context
        )
        for message in invocation_params["messages"]:
            create_event = events.ConversationItemCreateEvent(item=message)
            self._messages_added_manually[create_event.item.id] = True
            await self.send_client_event(create_event)

        await self._send_session_update()
        self._llm_needs_conversation_setup = False
|
||||
|
||||
async def _handle_evt_session_updated(self, evt):
    """Flag the API session as ready and run any deferred first turn.

    A pending static greeting is replayed in preference to a plain deferred
    response. ``_run_llm_when_api_session_ready`` is cleared before either
    is started.
    """
    self._api_session_ready = True

    queued_greeting = self._pending_initial_greeting_text
    if queued_greeting is not None:
        self._run_llm_when_api_session_ready = False
        await self._create_initial_greeting_response(queued_greeting)
        return

    if self._run_llm_when_api_session_ready:
        self._run_llm_when_api_session_ready = False
        await self._create_response()
|
||||
|
||||
async def _send_user_audio(self, frame):
|
||||
if self._user_is_muted:
|
||||
return
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ from typing import Any
|
|||
|
||||
from loguru import logger
|
||||
|
||||
from api.services.pipecat.realtime.static_greeting import format_static_greeting_prompt
|
||||
from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
|
|
@ -56,6 +57,7 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
|
|||
# has finished speaking, matching Dograh's Gemini Live behavior.
|
||||
self._bot_is_speaking: bool = False
|
||||
self._deferred_function_calls: list[FunctionCallFromLLM] = []
|
||||
self._pending_initial_greeting_text: str | None = None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Frame handling: mute, TTSSpeakFrame as greeting trigger
|
||||
|
|
@ -73,11 +75,16 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
|
|||
if isinstance(frame, TTSSpeakFrame):
|
||||
# Greeting trigger: the engine queues a TTSSpeakFrame after node
|
||||
# setup. OpenAI Realtime renders its own audio, so we don't pass
|
||||
# the frame to TTS. Route through _handle_context so the initial
|
||||
# response and later tool-result turns share the same context
|
||||
# lifecycle even when Dograh has already pre-populated self._context.
|
||||
# the frame to TTS. For configured static text greetings, ask the
|
||||
# model to say the exact greeting; otherwise route through
|
||||
# _handle_context so the initial response and later tool-result
|
||||
# turns share the same context lifecycle.
|
||||
if not self._handled_initial_context:
|
||||
await self._handle_context(self._context)
|
||||
greeting_text = frame.text.strip() if frame.text else ""
|
||||
if greeting_text:
|
||||
await self._handle_initial_greeting(self._context, greeting_text)
|
||||
else:
|
||||
await self._handle_context(self._context)
|
||||
else:
|
||||
logger.warning(
|
||||
f"{self}: TTSSpeakFrame after initial context already "
|
||||
|
|
@ -137,6 +144,57 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
|
|||
self._context = context
|
||||
await self._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
async def _handle_initial_greeting(self, context: LLMContext, greeting_text: str):
    """Start the first OpenAI Realtime turn from a static text greeting.

    A missing context only produces a warning. Otherwise the initial
    context is marked as handled, the context is stored, and
    ``_create_initial_greeting_response`` is awaited with the greeting.
    """
    if context is not None:
        self._handled_initial_context = True
        self._context = context
        await self._create_initial_greeting_response(greeting_text)
        return

    logger.warning(
        f"{self}: received initial greeting trigger before context was set"
    )
|
||||
|
||||
async def _create_initial_greeting_response(self, greeting_text: str):
    """Request a response that speaks ``greeting_text``, or queue it.

    Returns at once while disconnecting. If the API session is not ready,
    the greeting is saved in ``_pending_initial_greeting_text`` and
    ``_run_llm_when_api_session_ready`` is set. Otherwise conversation setup
    is ensured and a manual response is requested with the formatted
    greeting prompt as instructions and ``tool_choice="none"``.
    """
    if self._disconnecting:
        return

    if self._api_session_ready:
        self._pending_initial_greeting_text = None
        await self._ensure_conversation_setup()
        greeting_prompt = format_static_greeting_prompt(greeting_text)
        await self._send_manual_response_create(
            instructions=greeting_prompt,
            tool_choice="none",
        )
    else:
        # Session not ready: park the greeting until session.updated arrives.
        self._pending_initial_greeting_text = greeting_text
        self._run_llm_when_api_session_ready = True
|
||||
|
||||
async def _ensure_conversation_setup(self):
    """Send the stored context messages and a session update, once.

    Does nothing unless ``_llm_needs_conversation_setup`` is set. Each
    message from the adapter's invocation params is sent as a conversation
    item and tracked in ``_messages_added_manually``; the flag is cleared
    after the session update has been sent.
    """
    if self._llm_needs_conversation_setup:
        invocation_params = self.get_llm_adapter().get_llm_invocation_params(
            self._context
        )
        for message in invocation_params["messages"]:
            create_event = events.ConversationItemCreateEvent(item=message)
            self._messages_added_manually[create_event.item.id] = True
            await self.send_client_event(create_event)

        await self._send_session_update()
        self._llm_needs_conversation_setup = False
|
||||
|
||||
async def _handle_evt_session_updated(self, evt):
    """Mark the API session ready and start any deferred first turn.

    If a static greeting was queued it is replayed; otherwise a deferred
    plain response is created when one was requested. The
    ``_run_llm_when_api_session_ready`` flag is reset in both cases.
    """
    self._api_session_ready = True

    queued_greeting = self._pending_initial_greeting_text
    if queued_greeting is not None:
        self._run_llm_when_api_session_ready = False
        await self._create_initial_greeting_response(queued_greeting)
        return

    if self._run_llm_when_api_session_ready:
        self._run_llm_when_api_session_ready = False
        await self._create_response()
|
||||
|
||||
async def _send_user_audio(self, frame):
|
||||
if self._user_is_muted:
|
||||
return
|
||||
|
|
@ -190,7 +248,12 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
|
|||
return "\n".join(parts) if parts else None
|
||||
return None
|
||||
|
||||
async def _send_manual_response_create(self):
|
||||
async def _send_manual_response_create(
|
||||
self,
|
||||
*,
|
||||
instructions: str | None = None,
|
||||
tool_choice: str | None = None,
|
||||
):
|
||||
"""Trigger inference after manually appending conversation items."""
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
await self.start_processing_metrics()
|
||||
|
|
@ -198,7 +261,9 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
|
|||
await self.send_client_event(
|
||||
events.ResponseCreateEvent(
|
||||
response=events.ResponseProperties(
|
||||
output_modalities=self._get_enabled_modalities()
|
||||
output_modalities=self._get_enabled_modalities(),
|
||||
instructions=instructions,
|
||||
tool_choice=tool_choice,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
|
|
|||
8
api/services/pipecat/realtime/static_greeting.py
Normal file
8
api/services/pipecat/realtime/static_greeting.py
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
def format_static_greeting_prompt(greeting_text: str) -> str:
    """Return the prompt telling a realtime model to speak a fixed greeting.

    The prompt is a fixed instruction paragraph, a blank line, and then
    ``greeting_text`` wrapped in double quotes. The text is inserted as
    given, with no stripping or escaping.
    """
    instruction_parts = (
        "The phone call has just connected. Greet the caller now: ",
        "say the following opening line out loud, exactly as written, ",
        "in a natural spoken voice, and then stop and wait for the ",
        "caller to respond. Do not add anything before or after it.\n\n",
    )
    quoted_greeting = f'"{greeting_text}"'
    return "".join(instruction_parts) + quoted_greeting
|
||||
|
|
@ -72,7 +72,7 @@ class RealtimeFeedbackObserver(BaseObserver):
|
|||
- TTFB metrics (LLM generation time only)
|
||||
|
||||
Logs buffer persistence (only final data for post-call analysis):
|
||||
- Complete user transcripts per turn (via on_user_turn_stopped)
|
||||
- Complete user transcripts per turn (via on_user_turn_message_added)
|
||||
- Complete assistant transcripts per turn (via on_assistant_turn_stopped)
|
||||
- Function calls and TTFB metrics
|
||||
|
||||
|
|
@ -300,13 +300,13 @@ def register_turn_log_handlers(
|
|||
):
|
||||
"""Register event handlers on aggregators to persist final turn transcripts.
|
||||
|
||||
Hooks into on_user_turn_stopped and on_assistant_turn_stopped to store
|
||||
Hooks into on_user_turn_message_added and on_assistant_turn_stopped to store
|
||||
complete turn text in the logs buffer. Works for both WebRTC and telephony
|
||||
calls — independent of WebSocket availability.
|
||||
"""
|
||||
|
||||
@user_aggregator.event_handler("on_user_turn_stopped")
|
||||
async def on_user_turn_stopped(aggregator, strategy, message):
|
||||
@user_aggregator.event_handler("on_user_turn_message_added")
|
||||
async def on_user_turn_message_added(aggregator, message):
|
||||
logs_buffer.increment_turn()
|
||||
try:
|
||||
await logs_buffer.append(
|
||||
|
|
|
|||
|
|
@ -113,49 +113,53 @@ def _resolve_user_turn_stop_timeout(
|
|||
|
||||
def _create_realtime_user_turn_config(provider: str):
|
||||
"""Return user turn strategies and optional local VAD for realtime providers."""
|
||||
|
||||
def external_provider_turn_config():
|
||||
return (
|
||||
UserTurnStrategies(
|
||||
start=[ExternalUserTurnStartStrategy()],
|
||||
stop=[ExternalUserTurnStopStrategy(wait_for_transcript=False)],
|
||||
),
|
||||
None,
|
||||
)
|
||||
|
||||
def local_vad_turn_config(*, enable_interruptions: bool):
|
||||
return (
|
||||
UserTurnStrategies(
|
||||
start=[
|
||||
VADUserTurnStartStrategy(enable_interruptions=enable_interruptions)
|
||||
],
|
||||
stop=[SpeechTimeoutUserTurnStopStrategy(wait_for_transcript=False)],
|
||||
),
|
||||
SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
)
|
||||
|
||||
if provider in {
|
||||
ServiceProviders.GOOGLE_REALTIME.value,
|
||||
ServiceProviders.GOOGLE_VERTEX_REALTIME.value,
|
||||
}:
|
||||
# Let Gemini Live own barge-in via its server-side VAD, but keep local
|
||||
# Silero VAD for early user-turn start and speaking-state tracking.
|
||||
return (
|
||||
UserTurnStrategies(
|
||||
start=[VADUserTurnStartStrategy(enable_interruptions=False)],
|
||||
stop=[SpeechTimeoutUserTurnStopStrategy()],
|
||||
),
|
||||
SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
)
|
||||
return local_vad_turn_config(enable_interruptions=False)
|
||||
|
||||
if provider == ServiceProviders.OPENAI_REALTIME.value:
|
||||
# OpenAI Realtime already emits speaking-state frames and interruption
|
||||
# events from the provider, so the aggregator should follow those
|
||||
# external signals rather than run its own local VAD.
|
||||
return (
|
||||
UserTurnStrategies(
|
||||
start=[ExternalUserTurnStartStrategy()],
|
||||
stop=[ExternalUserTurnStopStrategy()],
|
||||
),
|
||||
None,
|
||||
)
|
||||
if provider in {
|
||||
ServiceProviders.OPENAI_REALTIME.value,
|
||||
ServiceProviders.AZURE_REALTIME.value,
|
||||
}:
|
||||
# OpenAI-compatible Realtime services already emit speaking-state frames
|
||||
# and interruption events from the provider, so the aggregator should
|
||||
# follow those external signals rather than run its own local VAD.
|
||||
return external_provider_turn_config()
|
||||
if provider == ServiceProviders.GROK_REALTIME.value:
|
||||
# Grok Voice Agent emits server-side speech-start/stop and
|
||||
# interruption signals, so local VAD should stay out of the way.
|
||||
return (
|
||||
UserTurnStrategies(
|
||||
start=[ExternalUserTurnStartStrategy()],
|
||||
stop=[ExternalUserTurnStopStrategy()],
|
||||
),
|
||||
None,
|
||||
)
|
||||
return external_provider_turn_config()
|
||||
if provider == ServiceProviders.ULTRAVOX_REALTIME.value:
|
||||
# Ultravox does not emit user-turn frames, so local VAD supplies
|
||||
# lifecycle signals for Dograh observers/controllers.
|
||||
return local_vad_turn_config(enable_interruptions=True)
|
||||
|
||||
return (
|
||||
UserTurnStrategies(
|
||||
start=[VADUserTurnStartStrategy()],
|
||||
stop=[SpeechTimeoutUserTurnStopStrategy()],
|
||||
),
|
||||
SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
|
||||
)
|
||||
return local_vad_turn_config(enable_interruptions=True)
|
||||
|
||||
|
||||
async def run_pipeline_telephony(
|
||||
|
|
@ -775,7 +779,10 @@ async def _run_pipeline_impl(
|
|||
vad_analyzer=user_vad_analyzer,
|
||||
)
|
||||
context_aggregator = LLMContextAggregatorPair(
|
||||
context, assistant_params=assistant_params, user_params=user_params
|
||||
context,
|
||||
assistant_params=assistant_params,
|
||||
user_params=user_params,
|
||||
realtime_service_mode=is_realtime,
|
||||
)
|
||||
|
||||
# Create usage metrics aggregator with engine's callback
|
||||
|
|
|
|||
|
|
@ -250,7 +250,6 @@ class _ToolDocumentRefsMixin(BaseModel):
|
|||
"description": (
|
||||
"Text spoken via TTS at the start of the call. Supports "
|
||||
"{{template_variables}}. Leave empty to skip the greeting. "
|
||||
"Not supported with realtime (speech-to-speech) models."
|
||||
),
|
||||
"display_options": DisplayOptions(show={"greeting_type": ["text"]}),
|
||||
"placeholder": "Hi {{first_name}}, this is Sarah from Acme.",
|
||||
|
|
|
|||
88
api/tests/test_azure_realtime_wrapper.py
Normal file
88
api/tests/test_azure_realtime_wrapper.py
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
from types import SimpleNamespace
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
from pipecat.frames.frames import TTSSpeakFrame
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.openai.realtime import events
|
||||
|
||||
from api.services.pipecat.realtime.azure_realtime import (
|
||||
DograhAzureRealtimeLLMService,
|
||||
)
|
||||
|
||||
|
||||
def _make_service() -> DograhAzureRealtimeLLMService:
    """Build an Azure realtime service with two methods stubbed out.

    ``_create_response`` and ``_process_completed_function_calls`` are
    replaced with ``AsyncMock`` instances.
    """
    azure_service = DograhAzureRealtimeLLMService(
        api_key="test-key",
        base_url="wss://example.test/openai/realtime",
    )
    for stubbed_name in ("_create_response", "_process_completed_function_calls"):
        setattr(azure_service, stubbed_name, AsyncMock())
    return azure_service
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tts_greeting_sends_exact_static_greeting_prompt():
    """A greeting TTSSpeakFrame on a ready session sends the exact-greeting prompt."""
    service = _make_service()
    service._context = LLMContext([{"role": "user", "content": "Existing context"}])
    service._api_session_ready = True
    # Stub outbound calls so the client events can be inspected afterwards.
    service.send_client_event = AsyncMock()
    service.push_frame = AsyncMock()
    service.start_processing_metrics = AsyncMock()
    service.start_ttfb_metrics = AsyncMock()

    greeting_frame = TTSSpeakFrame(
        "Hi Sam, this is Sarah from Acme.", append_to_context=True
    )
    await service.process_frame(greeting_frame, FrameDirection.DOWNSTREAM)

    sent_events = [
        awaited.args[0] for awaited in service.send_client_event.await_args_list
    ]

    # The existing context message is sent first, then the session update.
    context_event = sent_events[0]
    assert isinstance(context_event, events.ConversationItemCreateEvent)
    assert context_event.item.role == "user"
    assert context_event.item.content[0].text == "Existing context"
    assert isinstance(sent_events[1], events.SessionUpdateEvent)

    # The final event asks for the greeting with tools disabled.
    response_event = sent_events[-1]
    assert isinstance(response_event, events.ResponseCreateEvent)
    assert response_event.response.tool_choice == "none"
    prompt = response_event.response.instructions
    assert "The phone call has just connected. Greet the caller now:" in prompt
    assert prompt.endswith('"Hi Sam, this is Sarah from Acme."')

    assert service._llm_needs_conversation_setup is False
    service._create_response.assert_not_awaited()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tts_greeting_waits_for_session_updated_before_sending_prompt():
    """A greeting that arrives before session.updated is queued, then replayed."""
    service = _make_service()
    service._context = LLMContext([{"role": "user", "content": "Existing context"}])

    greeting_frame = TTSSpeakFrame("Hello from Dograh.", append_to_context=True)
    await service.process_frame(greeting_frame, FrameDirection.DOWNSTREAM)

    # Nothing is sent yet; the greeting is parked on the service.
    assert service._handled_initial_context is True
    assert service._run_llm_when_api_session_ready is True
    assert service._pending_initial_greeting_text == "Hello from Dograh."

    service.send_client_event = AsyncMock()
    service.push_frame = AsyncMock()
    service.start_processing_metrics = AsyncMock()
    service.start_ttfb_metrics = AsyncMock()

    await service._handle_evt_session_updated(SimpleNamespace())

    sent_events = [
        awaited.args[0] for awaited in service.send_client_event.await_args_list
    ]
    context_event = sent_events[0]
    assert isinstance(context_event, events.ConversationItemCreateEvent)
    assert context_event.item.content[0].text == "Existing context"
    assert isinstance(sent_events[1], events.SessionUpdateEvent)

    response_event = sent_events[-1]
    assert isinstance(response_event, events.ResponseCreateEvent)
    assert response_event.response.tool_choice == "none"
    prompt = response_event.response.instructions
    assert prompt.endswith('"Hello from Dograh."')

    # The queued state has been consumed.
    assert service._run_llm_when_api_session_ready is False
    assert service._pending_initial_greeting_text is None
    assert service._llm_needs_conversation_setup is False
    service._create_response.assert_not_awaited()
|
||||
|
|
@ -3,7 +3,7 @@ from types import SimpleNamespace
|
|||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
from pipecat.frames.frames import TranscriptionFrame
|
||||
from pipecat.frames.frames import TranscriptionFrame, TTSSpeakFrame
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
|
||||
|
|
@ -21,6 +21,7 @@ class _TestDograhGeminiLiveLLMService(DograhGeminiLiveLLMService):
|
|||
|
||||
class _FakeSession:
|
||||
def __init__(self):
|
||||
self.send_client_content = AsyncMock()
|
||||
self.send_tool_response = AsyncMock()
|
||||
self.send_realtime_input = AsyncMock()
|
||||
self.close = AsyncMock()
|
||||
|
|
@ -108,3 +109,57 @@ async def test_user_transcription_matches_upstream_upstream_push_behavior():
|
|||
assert frame.text == "Hi there"
|
||||
assert frame.finalized is False
|
||||
assert direction == FrameDirection.UPSTREAM
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tts_greeting_sends_exact_static_greeting_prompt_to_gemini():
    """With a live session, a greeting TTSSpeakFrame is sent as one user turn."""
    service = _make_service()
    service._context = LLMContext()
    service._session = _FakeSession()

    greeting_frame = TTSSpeakFrame(
        "Hi Sam, this is Sarah from Acme.", append_to_context=True
    )
    await service.process_frame(greeting_frame, FrameDirection.DOWNSTREAM)

    service._session.send_client_content.assert_awaited_once()
    sent_kwargs = service._session.send_client_content.await_args.kwargs
    assert sent_kwargs["turn_complete"] is True

    turns = sent_kwargs["turns"]
    assert len(turns) == 1
    only_turn = turns[0]
    assert only_turn.role == "user"
    prompt = only_turn.parts[0].text
    assert "The phone call has just connected. Greet the caller now:" in prompt
    expected_tail = (
        'Do not add anything before or after it.\n\n"Hi Sam, this is Sarah from Acme."'
    )
    assert expected_tail in prompt

    assert service._handled_initial_context is True
    assert service._pending_initial_greeting_text is None
    assert service._ready_for_realtime_input is True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tts_greeting_waits_for_gemini_session_before_sending_prompt():
    """Without a session the greeting is queued and sent once the session is ready."""
    service = _make_service()
    service._context = LLMContext()

    greeting_frame = TTSSpeakFrame("Hello from Dograh.", append_to_context=True)
    await service.process_frame(greeting_frame, FrameDirection.DOWNSTREAM)

    # No session yet, so the greeting is parked on the service.
    assert service._handled_initial_context is True
    assert service._run_llm_when_session_ready is True
    assert service._pending_initial_greeting_text == "Hello from Dograh."

    ready_session = _FakeSession()
    await service._handle_session_ready(ready_session)

    ready_session.send_client_content.assert_awaited_once()
    sent_turns = ready_session.send_client_content.await_args.kwargs["turns"]
    prompt = sent_turns[0].parts[0].text
    assert prompt.endswith('"Hello from Dograh."')
    assert service._run_llm_when_session_ready is False
    assert service._pending_initial_greeting_text is None
|
||||
|
|
|
|||
|
|
@ -37,17 +37,71 @@ async def test_initial_context_triggers_response_when_context_was_prepopulated()
|
|||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tts_greeting_sends_exact_static_greeting_prompt():
    """Check the event sequence for a greeting when the API session is ready.

    With one pre-existing user message in the context, the expected client
    events are: the existing message as a conversation item, a session
    update, the greeting prompt as a user conversation item, and finally a
    response-create event.
    """
    service = _make_service()
    service._context = LLMContext([{"role": "user", "content": "Existing context"}])
    service._api_session_ready = True
    service.send_client_event = AsyncMock()
    service.push_frame = AsyncMock()
    service.start_processing_metrics = AsyncMock()
    service.start_ttfb_metrics = AsyncMock()

    await service.process_frame(
        TTSSpeakFrame("Hi Sam, this is Sarah from Acme.", append_to_context=True),
        FrameDirection.DOWNSTREAM,
    )

    sent_events = [call.args[0] for call in service.send_client_event.await_args_list]

    # Existing context is replayed first, followed by the session update.
    assert isinstance(sent_events[0], events.ConversationItemCreateEvent)
    assert sent_events[0].item.role == "user"
    assert sent_events[0].item.content[0].text == "Existing context"
    assert isinstance(sent_events[1], events.SessionUpdateEvent)

    # The greeting prompt is a user message ending with the quoted greeting.
    greeting_event = sent_events[2]
    assert isinstance(greeting_event, events.ConversationItemCreateEvent)
    assert greeting_event.item.role == "user"
    assert greeting_event.item.type == "message"
    prompt = greeting_event.item.content[0].text
    assert "The phone call has just connected. Greet the caller now:" in prompt
    assert prompt.endswith('"Hi Sam, this is Sarah from Acme."')

    assert isinstance(sent_events[-1], events.ResponseCreateEvent)
    assert service._llm_needs_conversation_setup is False
    service._create_response.assert_not_awaited()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tts_greeting_waits_for_session_updated_before_sending_prompt():
    """Check that the greeting is deferred until the session-updated event.

    The frame is processed before ``_api_session_ready`` is set, so the
    greeting should be parked; calling ``_handle_evt_session_updated`` is
    then expected to emit the existing context, a session update, the
    greeting item and a response-create event.
    """
    service = _make_service()
    service._context = LLMContext([{"role": "user", "content": "Existing context"}])

    await service.process_frame(
        TTSSpeakFrame("Hello from Dograh.", append_to_context=True),
        FrameDirection.DOWNSTREAM,
    )

    # Session not ready yet: greeting is pending, nothing has been sent.
    assert service._handled_initial_context is True
    assert service._run_llm_when_api_session_ready is True
    assert service._pending_initial_greeting_text == "Hello from Dograh."

    # Install the mocks only now so they capture just the deferred send.
    service.send_client_event = AsyncMock()
    service.push_frame = AsyncMock()
    service.start_processing_metrics = AsyncMock()
    service.start_ttfb_metrics = AsyncMock()

    await service._handle_evt_session_updated(SimpleNamespace())

    sent_events = [call.args[0] for call in service.send_client_event.await_args_list]
    assert isinstance(sent_events[0], events.ConversationItemCreateEvent)
    assert sent_events[0].item.content[0].text == "Existing context"
    assert isinstance(sent_events[1], events.SessionUpdateEvent)

    greeting_event = sent_events[2]
    assert isinstance(greeting_event, events.ConversationItemCreateEvent)
    prompt = greeting_event.item.content[0].text
    assert prompt.endswith('"Hello from Dograh."')

    assert isinstance(sent_events[-1], events.ResponseCreateEvent)
    assert service._run_llm_when_api_session_ready is False
    assert service._pending_initial_greeting_text is None
    assert service._llm_needs_conversation_setup is False
    service._create_response.assert_not_awaited()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import pytest
|
|||
from pipecat.frames.frames import TTSSpeakFrame
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.openai.realtime import events
|
||||
|
||||
from api.services.pipecat.realtime.openai_realtime import (
|
||||
DograhOpenAIRealtimeLLMService,
|
||||
|
|
@ -48,17 +49,69 @@ async def test_updated_context_uses_tool_result_path_after_initial_context():
|
|||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tts_greeting_sends_exact_static_greeting_prompt():
    """Check that the greeting is delivered via response instructions.

    With an empty context and a ready API session, no conversation item is
    expected; the greeting prompt should instead appear in the
    ``instructions`` of the final response-create event, with tool calls
    disabled for that response.
    """
    service = _make_service()
    service._context = LLMContext()
    service._api_session_ready = True
    service.send_client_event = AsyncMock()
    service.push_frame = AsyncMock()
    service.start_processing_metrics = AsyncMock()
    service.start_ttfb_metrics = AsyncMock()

    await service.process_frame(
        TTSSpeakFrame("Hi Sam, this is Sarah from Acme.", append_to_context=True),
        FrameDirection.DOWNSTREAM,
    )

    sent_events = [call.args[0] for call in service.send_client_event.await_args_list]

    # The greeting must not be injected as a conversation item.
    assert not any(
        isinstance(event, events.ConversationItemCreateEvent) for event in sent_events
    )
    assert isinstance(sent_events[0], events.SessionUpdateEvent)

    response_event = sent_events[-1]
    assert isinstance(response_event, events.ResponseCreateEvent)
    assert response_event.response.tool_choice == "none"
    prompt = response_event.response.instructions
    assert "The phone call has just connected. Greet the caller now:" in prompt
    assert prompt.endswith('"Hi Sam, this is Sarah from Acme."')

    assert service._llm_needs_conversation_setup is False
    service._create_response.assert_not_awaited()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tts_greeting_waits_for_session_updated_before_sending_prompt():
    """Check that the greeting is deferred until the session-updated event.

    The frame is processed before the API session is ready, so the greeting
    should be parked; ``_handle_evt_session_updated`` is then expected to
    send a session update followed by a response-create event whose
    ``instructions`` carry the greeting.
    """
    service = _make_service()
    service._context = LLMContext()

    await service.process_frame(
        TTSSpeakFrame("Hello from Dograh.", append_to_context=True),
        FrameDirection.DOWNSTREAM,
    )

    # Session not ready yet: greeting is pending, nothing has been sent.
    assert service._handled_initial_context is True
    assert service._run_llm_when_api_session_ready is True
    assert service._pending_initial_greeting_text == "Hello from Dograh."

    # Install the mocks only now so they capture just the deferred send.
    service.send_client_event = AsyncMock()
    service.push_frame = AsyncMock()
    service.start_processing_metrics = AsyncMock()
    service.start_ttfb_metrics = AsyncMock()

    await service._handle_evt_session_updated(SimpleNamespace())

    sent_events = [call.args[0] for call in service.send_client_event.await_args_list]
    assert not any(
        isinstance(event, events.ConversationItemCreateEvent) for event in sent_events
    )
    assert isinstance(sent_events[0], events.SessionUpdateEvent)

    response_event = sent_events[-1]
    assert isinstance(response_event, events.ResponseCreateEvent)
    assert response_event.response.tool_choice == "none"
    prompt = response_event.response.instructions
    assert prompt.endswith('"Hello from Dograh."')

    assert service._run_llm_when_api_session_ready is False
    assert service._pending_initial_greeting_text is None
    assert service._llm_needs_conversation_setup is False
    service._create_response.assert_not_awaited()
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,23 @@ from pipecat.processors.frame_processor import FrameDirection
|
|||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
from pipecat.transports.base_transport import TransportParams
|
||||
|
||||
from api.services.pipecat.realtime_feedback_observer import RealtimeFeedbackObserver
|
||||
from api.services.pipecat.in_memory_buffers import InMemoryLogsBuffer
|
||||
from api.services.pipecat.realtime_feedback_observer import (
|
||||
RealtimeFeedbackObserver,
|
||||
register_turn_log_handlers,
|
||||
)
|
||||
|
||||
|
||||
class _FakeAggregator:
|
||||
def __init__(self):
|
||||
self.handlers = {}
|
||||
|
||||
def event_handler(self, event_name):
|
||||
def decorator(handler):
|
||||
self.handlers[event_name] = handler
|
||||
return handler
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
def _frame_pushed(frame, direction, *, source=None):
|
||||
|
|
@ -98,3 +114,33 @@ async def test_observer_waits_for_tts_text_from_output_transport():
|
|||
"payload": {"text": "Hello"},
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_turn_log_handlers_persist_user_message_added_events():
    """Check that a user-message-added event is written to the logs buffer.

    ``register_turn_log_handlers`` is expected to hook
    ``on_user_turn_message_added`` (and not ``on_user_turn_stopped``) on the
    user aggregator; invoking that handler should append one
    ``rtf-user-transcription`` event for turn 1.
    """
    logs_buffer = InMemoryLogsBuffer(workflow_run_id=123)
    user_aggregator = _FakeAggregator()
    assistant_aggregator = _FakeAggregator()

    register_turn_log_handlers(logs_buffer, user_aggregator, assistant_aggregator)

    assert "on_user_turn_message_added" in user_aggregator.handlers
    assert "on_user_turn_stopped" not in user_aggregator.handlers

    # Invoke the registered handler directly with a stand-in message object.
    await user_aggregator.handlers["on_user_turn_message_added"](
        user_aggregator,
        SimpleNamespace(
            content="Hi there",
            timestamp="2026-01-01T00:00:00+00:00",
        ),
    )

    events = logs_buffer.get_events()
    assert len(events) == 1
    assert events[0]["type"] == "rtf-user-transcription"
    assert events[0]["payload"] == {
        "text": "Hi there",
        "final": True,
        "timestamp": "2026-01-01T00:00:00+00:00",
    }
    assert events[0]["turn"] == 1
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ def test_gemini_realtime_uses_local_vad_without_local_interruptions():
|
|||
assert strategies.start[0]._enable_interruptions is False
|
||||
assert len(strategies.stop) == 1
|
||||
assert isinstance(strategies.stop[0], SpeechTimeoutUserTurnStopStrategy)
|
||||
assert strategies.stop[0].wait_for_transcript is False
|
||||
|
||||
|
||||
def test_gemini_vertex_realtime_uses_same_turn_config_as_gemini_live():
|
||||
|
|
@ -41,6 +42,9 @@ def test_gemini_vertex_realtime_uses_same_turn_config_as_gemini_live():
|
|||
assert len(strategies.start) == 1
|
||||
assert isinstance(strategies.start[0], VADUserTurnStartStrategy)
|
||||
assert strategies.start[0]._enable_interruptions is False
|
||||
assert len(strategies.stop) == 1
|
||||
assert isinstance(strategies.stop[0], SpeechTimeoutUserTurnStopStrategy)
|
||||
assert strategies.stop[0].wait_for_transcript is False
|
||||
|
||||
|
||||
def test_openai_realtime_uses_provider_turn_frames_without_local_vad():
|
||||
|
|
@ -54,6 +58,21 @@ def test_openai_realtime_uses_provider_turn_frames_without_local_vad():
|
|||
assert strategies.start[0]._enable_interruptions is False
|
||||
assert len(strategies.stop) == 1
|
||||
assert isinstance(strategies.stop[0], ExternalUserTurnStopStrategy)
|
||||
assert strategies.stop[0].wait_for_transcript is False
|
||||
|
||||
|
||||
def test_azure_realtime_uses_provider_turn_frames_without_local_vad():
    """Azure realtime: no local VAD, external start/stop turn strategies."""
    strategies, vad_analyzer = _create_realtime_user_turn_config(
        ServiceProviders.AZURE_REALTIME.value
    )

    assert vad_analyzer is None

    assert len(strategies.start) == 1
    assert isinstance(strategies.start[0], ExternalUserTurnStartStrategy)
    assert strategies.start[0]._enable_interruptions is False

    assert len(strategies.stop) == 1
    assert isinstance(strategies.stop[0], ExternalUserTurnStopStrategy)
    assert strategies.stop[0].wait_for_transcript is False
|
||||
|
||||
|
||||
def test_grok_realtime_uses_provider_turn_frames_without_local_vad():
|
||||
|
|
@ -67,6 +86,21 @@ def test_grok_realtime_uses_provider_turn_frames_without_local_vad():
|
|||
assert strategies.start[0]._enable_interruptions is False
|
||||
assert len(strategies.stop) == 1
|
||||
assert isinstance(strategies.stop[0], ExternalUserTurnStopStrategy)
|
||||
assert strategies.stop[0].wait_for_transcript is False
|
||||
|
||||
|
||||
def test_ultravox_realtime_uses_local_vad_with_local_interruptions():
    """Ultravox realtime: Silero VAD with interruptions enabled locally."""
    strategies, vad_analyzer = _create_realtime_user_turn_config(
        ServiceProviders.ULTRAVOX_REALTIME.value
    )

    assert isinstance(vad_analyzer, SileroVADAnalyzer)

    assert len(strategies.start) == 1
    assert isinstance(strategies.start[0], VADUserTurnStartStrategy)
    assert strategies.start[0]._enable_interruptions is True

    assert len(strategies.stop) == 1
    assert isinstance(strategies.stop[0], SpeechTimeoutUserTurnStopStrategy)
    assert strategies.stop[0].wait_for_transcript is False
|
||||
|
||||
|
||||
def test_unknown_realtime_providers_keep_local_vad():
|
||||
|
|
@ -75,8 +109,10 @@ def test_unknown_realtime_providers_keep_local_vad():
|
|||
assert isinstance(vad_analyzer, SileroVADAnalyzer)
|
||||
assert len(strategies.start) == 1
|
||||
assert isinstance(strategies.start[0], VADUserTurnStartStrategy)
|
||||
assert strategies.start[0]._enable_interruptions is True
|
||||
assert len(strategies.stop) == 1
|
||||
assert isinstance(strategies.stop[0], SpeechTimeoutUserTurnStopStrategy)
|
||||
assert strategies.stop[0].wait_for_transcript is False
|
||||
|
||||
|
||||
def test_external_turn_stt_uses_longer_stop_timeout():
|
||||
|
|
|
|||
38
api/tests/test_template_renderer.py
Normal file
38
api/tests/test_template_renderer.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
from api.utils.template_renderer import render_template
|
||||
|
||||
|
||||
def test_initial_context_prefix_resolves_against_flat_context():
    """``initial_context.<key>`` resolves when the context itself is flat.

    The render context has no ``initial_context`` key, so both a top-level
    and a nested path are expected to be looked up with the prefix removed.
    """
    context = {
        "first_name": "Abhishek",
        "runtime_configuration": {
            "realtime_model": "gpt-realtime-2",
        },
    }

    assert (
        render_template("Hi {{initial_context.first_name | there}}", context)
        == "Hi Abhishek"
    )
    assert (
        render_template(
            "Model {{initial_context.runtime_configuration.realtime_model}}", context
        )
        == "Model gpt-realtime-2"
    )
|
||||
|
||||
|
||||
def test_initial_context_prefix_prefers_explicit_initial_context():
    """An explicit ``initial_context`` entry wins over a same-named flat key."""
    context = {
        "first_name": "Flat",
        "initial_context": {
            "first_name": "Nested",
        },
    }

    assert render_template("Hi {{initial_context.first_name}}", context) == "Hi Nested"
|
||||
|
||||
|
||||
def test_initial_context_prefix_uses_fallback_when_missing_from_both_contexts():
    """The ``| there`` fallback is used when the key is absent everywhere."""
    assert (
        render_template("Hi {{initial_context.first_name | there}}", {}) == "Hi there"
    )
|
||||
|
|
@ -12,6 +12,7 @@ from api.services.workflow.workflow_graph import TEMPLATE_VAR_PATTERN
|
|||
|
||||
_CURRENT_TIME_PREFIX = "current_time"
|
||||
_CURRENT_WEEKDAY_PREFIX = "current_weekday"
|
||||
_INITIAL_CONTEXT_PREFIX = "initial_context."
|
||||
|
||||
|
||||
def get_nested_value(obj: Any, path: str) -> Any:
|
||||
|
|
@ -184,8 +185,14 @@ def _render_string(template_str: str, context: Dict[str, Any]) -> str:
|
|||
if builtin_value is not None:
|
||||
return builtin_value
|
||||
|
||||
# Get value using nested path lookup
|
||||
# Get value using nested path lookup. Prompts commonly reference
|
||||
# initial_context.<key>, while some runtime callers pass the initial
|
||||
# context itself as the render context.
|
||||
value = get_nested_value(context, variable_path)
|
||||
if value is None and variable_path.startswith(_INITIAL_CONTEXT_PREFIX):
|
||||
value = get_nested_value(
|
||||
context, variable_path[len(_INITIAL_CONTEXT_PREFIX) :]
|
||||
)
|
||||
|
||||
# Apply fallback: new syntax {{var | default}} or legacy {{var | fallback:default}}
|
||||
if filter_name is not None:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue