Fix realtime initial greeting handling (#481)

This commit is contained in:
Abhishek 2026-06-29 17:25:42 +05:30 committed by GitHub
parent d9800fddd6
commit 090d042a78
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 714 additions and 70 deletions

View file

@ -11,6 +11,7 @@ from typing import Any
from loguru import logger
from api.services.pipecat.realtime.static_greeting import format_static_greeting_prompt
from pipecat.frames.frames import (
BotStartedSpeakingFrame,
BotStoppedSpeakingFrame,
@ -49,6 +50,7 @@ class DograhAzureRealtimeLLMService(AzureRealtimeLLMService):
self._handled_initial_context: bool = False
self._bot_is_speaking: bool = False
self._deferred_function_calls: list[FunctionCallFromLLM] = []
self._pending_initial_greeting_text: str | None = None
async def process_frame(self, frame: Frame, direction: FrameDirection):
if isinstance(frame, UserMuteStartedFrame):
@ -61,7 +63,11 @@ class DograhAzureRealtimeLLMService(AzureRealtimeLLMService):
return
if isinstance(frame, TTSSpeakFrame):
if not self._handled_initial_context:
await self._handle_context(self._context)
greeting_text = frame.text.strip() if frame.text else ""
if greeting_text:
await self._handle_initial_greeting(self._context, greeting_text)
else:
await self._handle_context(self._context)
else:
logger.warning(
f"{self}: TTSSpeakFrame after initial context already handled — "
@ -118,6 +124,57 @@ class DograhAzureRealtimeLLMService(AzureRealtimeLLMService):
self._context = context
await self._process_completed_function_calls(send_new_results=True)
async def _handle_initial_greeting(self, context: LLMContext, greeting_text: str):
    """Start the first turn from a static text greeting.

    Args:
        context: Context to adopt as ``self._context``. When it is ``None``
            a warning is logged and nothing else happens.
        greeting_text: Greeting forwarded to
            ``_create_initial_greeting_response``.
    """
    if context is not None:
        # Flag the initial context as handled before triggering the response.
        self._handled_initial_context = True
        self._context = context
        await self._create_initial_greeting_response(greeting_text)
        return

    logger.warning(
        f"{self}: received initial greeting trigger before context was set"
    )
async def _create_initial_greeting_response(self, greeting_text: str):
    """Request a response that speaks the static greeting, or defer it.

    Does nothing while disconnecting. If the API session is not ready yet,
    the greeting is stored in ``_pending_initial_greeting_text`` and
    ``_run_llm_when_api_session_ready`` is set. Otherwise the pending
    greeting is cleared, conversation setup is ensured, and a manual
    response is requested with the formatted greeting prompt as
    instructions and ``tool_choice="none"``.
    """
    if self._disconnecting:
        return

    if self._api_session_ready:
        self._pending_initial_greeting_text = None
        await self._ensure_conversation_setup()
        greeting_prompt = format_static_greeting_prompt(greeting_text)
        await self._send_manual_response_create(
            instructions=greeting_prompt,
            tool_choice="none",
        )
        return

    # Session not ready: remember the greeting for the session-updated handler.
    self._pending_initial_greeting_text = greeting_text
    self._run_llm_when_api_session_ready = True
async def _ensure_conversation_setup(self):
# Seed the provider-side conversation from the current context.
# No-op unless _llm_needs_conversation_setup is still set.
if not self._llm_needs_conversation_setup:
return
# Ask the LLM adapter for the invocation params built from self._context.
adapter = self.get_llm_adapter()
llm_invocation_params = adapter.get_llm_invocation_params(self._context)
# Each message becomes a conversation-item-create event; its item id is
# recorded in _messages_added_manually before the event is sent.
for item in llm_invocation_params["messages"]:
evt = events.ConversationItemCreateEvent(item=item)
self._messages_added_manually[evt.item.id] = True
await self.send_client_event(evt)
# NOTE(review): indentation is not preserved in this view; the session update
# and the flag reset presumably run once after the loop — confirm in the file.
await self._send_session_update()
self._llm_needs_conversation_setup = False
async def _handle_evt_session_updated(self, evt):
    """Mark the API session ready and run whichever response was deferred.

    A pending static greeting takes precedence over a deferred regular
    response; in both cases ``_run_llm_when_api_session_ready`` is cleared
    before the response is created. ``evt`` is not inspected.
    """
    self._api_session_ready = True

    pending_greeting = self._pending_initial_greeting_text
    if pending_greeting is not None:
        self._run_llm_when_api_session_ready = False
        await self._create_initial_greeting_response(pending_greeting)
        return

    if self._run_llm_when_api_session_ready:
        self._run_llm_when_api_session_ready = False
        await self._create_response()
async def _send_user_audio(self, frame):
if self._user_is_muted:
return
@ -171,14 +228,21 @@ class DograhAzureRealtimeLLMService(AzureRealtimeLLMService):
return "\n".join(parts) if parts else None
return None
async def _send_manual_response_create(self):
async def _send_manual_response_create(
self,
*,
instructions: str | None = None,
tool_choice: str | None = None,
):
await self.push_frame(LLMFullResponseStartFrame())
await self.start_processing_metrics()
await self.start_ttfb_metrics()
await self.send_client_event(
events.ResponseCreateEvent(
response=events.ResponseProperties(
output_modalities=self._get_enabled_modalities()
output_modalities=self._get_enabled_modalities(),
instructions=instructions,
tool_choice=tool_choice,
)
)
)

View file

@ -20,11 +20,13 @@ Layers Dograh engine integration quirks onto upstream-pristine
from typing import Any
from google.genai.types import Content, Part
from loguru import logger
from api.services.pipecat.gemini_json_schema_adapter import (
DograhGeminiJSONSchemaAdapter,
)
from api.services.pipecat.realtime.static_greeting import format_static_greeting_prompt
from pipecat.frames.frames import (
BotStoppedSpeakingFrame,
Frame,
@ -63,6 +65,9 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
# Function calls emitted by Gemini mid-bot-turn are deferred here and
# invoked when the turn ends, so they don't race the turn's audio.
self._pending_function_calls: list[FunctionCallFromLLM] = []
# Text greeting captured from the first TTSSpeakFrame while the Gemini
# session is still connecting.
self._pending_initial_greeting_text: str | None = None
# ------------------------------------------------------------------
# Hooks from upstream GeminiLiveLLMService
@ -142,10 +147,15 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
if isinstance(frame, TTSSpeakFrame):
# Greeting trigger: the engine queues a TTSSpeakFrame to start the
# bot's first turn after node setup. Gemini Live renders its own
# audio, so we don't pass the frame through — we re-enter
# _handle_context to kick off the initial response.
# audio, so we don't pass the frame through. For configured static
# text greetings, ask Gemini to say the exact greeting; otherwise
# re-enter _handle_context to kick off the normal initial response.
if not self._handled_initial_context:
await self._handle_context(self._context)
greeting_text = frame.text.strip() if frame.text else ""
if greeting_text:
await self._handle_initial_greeting(self._context, greeting_text)
else:
await self._handle_context(self._context)
else:
logger.warning(
f"{self}: TTSSpeakFrame after initial context already "
@ -183,6 +193,49 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
self._context = context
await self._process_completed_function_calls(send_new_results=True)
async def _handle_initial_greeting(self, context: LLMContext, greeting_text: str):
    """Kick off the first Gemini turn using an exact static text greeting.

    Args:
        context: Context to adopt as ``self._context``. When it is ``None``
            a warning is logged and nothing else happens.
        greeting_text: Greeting forwarded to
            ``_create_initial_greeting_response``.
    """
    if context is not None:
        # Flag the initial context as handled before triggering the response.
        self._handled_initial_context = True
        self._context = context
        await self._create_initial_greeting_response(greeting_text)
        return

    logger.warning(
        f"{self}: received initial greeting trigger before context was set"
    )
async def _create_initial_greeting_response(self, greeting_text: str):
"""Ask Gemini Live to speak the configured greeting exactly once."""
# Nothing is sent while the service is disconnecting.
if self._disconnecting:
return
# No session yet: stash the greeting and set the session-ready flag so the
# session-ready path can issue it later.
if not self._session:
self._pending_initial_greeting_text = greeting_text
self._run_llm_when_session_ready = True
return
# Session available: clear any stashed greeting and send the formatted
# prompt as a single completed user turn.
self._pending_initial_greeting_text = None
prompt = format_static_greeting_prompt(greeting_text)
turn = Content(role="user", parts=[Part(text=prompt)])
logger.debug("Creating Gemini Live initial response from static greeting")
await self.start_ttfb_metrics()
try:
await self._session.send_client_content(
turns=[turn],
turn_complete=True,
)
# Gemini 3.x also needs a realtime-input nudge to begin inference.
if self._is_gemini_3:
await self._session.send_realtime_input(text=" ")
except Exception as e:
# Send failures are routed to _handle_send_error.
await self._handle_send_error(e)
# NOTE(review): indentation is not preserved in this view; this flag is
# presumably set after the try/except regardless of outcome — confirm.
self._ready_for_realtime_input = True
# ------------------------------------------------------------------
# Session lifecycle: drop upstream's automatic reconnect-seed and
# initial-context-seed paths. The TTSSpeakFrame trigger and the
@ -201,7 +254,12 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
# Context arrived before session was ready — fulfil the queued
# initial response now.
self._run_llm_when_session_ready = False
await self._create_initial_response()
if self._pending_initial_greeting_text is not None:
await self._create_initial_greeting_response(
self._pending_initial_greeting_text
)
else:
await self._create_initial_response()
await self._drain_pending_tool_results()
# Otherwise: no automatic seed. Reconnect after a session-resumption
# update relies on the server-side restored state; reconnects without

View file

@ -22,6 +22,7 @@ from typing import Any
from loguru import logger
from api.services.pipecat.realtime.static_greeting import format_static_greeting_prompt
from pipecat.frames.frames import (
BotStartedSpeakingFrame,
BotStoppedSpeakingFrame,
@ -50,6 +51,7 @@ class DograhGrokRealtimeLLMService(GrokRealtimeLLMService):
self._handled_initial_context: bool = False
self._bot_is_speaking: bool = False
self._deferred_function_calls: list[FunctionCallFromLLM] = []
self._pending_initial_greeting_text: str | None = None
async def process_frame(self, frame: Frame, direction: FrameDirection):
if isinstance(frame, UserMuteStartedFrame):
@ -62,7 +64,11 @@ class DograhGrokRealtimeLLMService(GrokRealtimeLLMService):
return
if isinstance(frame, TTSSpeakFrame):
if not self._handled_initial_context:
await self._handle_context(self._context)
greeting_text = frame.text.strip() if frame.text else ""
if greeting_text:
await self._handle_initial_greeting(self._context, greeting_text)
else:
await self._handle_context(self._context)
else:
logger.warning(
f"{self}: TTSSpeakFrame after initial context already "
@ -120,6 +126,67 @@ class DograhGrokRealtimeLLMService(GrokRealtimeLLMService):
self._context = context
await self._process_completed_function_calls(send_new_results=True)
async def _handle_initial_greeting(self, context: LLMContext, greeting_text: str):
    """Start the first Grok turn from a static text greeting.

    Args:
        context: Context to adopt as ``self._context``. When it is ``None``
            a warning is logged and nothing else happens.
        greeting_text: Greeting forwarded to
            ``_create_initial_greeting_response``.
    """
    if context is not None:
        # Flag the initial context as handled before triggering the response.
        self._handled_initial_context = True
        self._context = context
        await self._create_initial_greeting_response(greeting_text)
        return

    logger.warning(
        f"{self}: received initial greeting trigger before context was set"
    )
async def _create_initial_greeting_response(self, greeting_text: str):
    """Send the static greeting prompt as a user message, or defer it.

    Does nothing while disconnecting. If the API session is not ready yet,
    the greeting is stored in ``_pending_initial_greeting_text`` and
    ``_run_llm_when_api_session_ready`` is set. Otherwise the pending
    greeting is cleared, conversation setup is ensured, the formatted
    greeting prompt is appended as a user ``input_text`` message, and a
    manual response is requested.
    """
    if self._disconnecting:
        return

    if not self._api_session_ready:
        # Session not ready: remember the greeting for the session-updated handler.
        self._pending_initial_greeting_text = greeting_text
        self._run_llm_when_api_session_ready = True
        return

    self._pending_initial_greeting_text = None
    await self._ensure_conversation_setup()

    prompt_content = events.ItemContent(
        type="input_text",
        text=format_static_greeting_prompt(greeting_text),
    )
    greeting_item = events.ConversationItem(
        type="message",
        role="user",
        content=[prompt_content],
    )
    create_evt = events.ConversationItemCreateEvent(item=greeting_item)
    # Record the item id as manually added before the event goes out.
    self._messages_added_manually[create_evt.item.id] = True
    await self.send_client_event(create_evt)
    await self._send_manual_response_create()
async def _ensure_conversation_setup(self):
# Seed the provider-side conversation from the current context.
# No-op unless _llm_needs_conversation_setup is still set.
if not self._llm_needs_conversation_setup:
return
# Ask the LLM adapter for the invocation params built from self._context.
adapter = self.get_llm_adapter()
llm_invocation_params = adapter.get_llm_invocation_params(self._context)
# Each message becomes a conversation-item-create event; its item id is
# recorded in _messages_added_manually before the event is sent.
for item in llm_invocation_params["messages"]:
evt = events.ConversationItemCreateEvent(item=item)
self._messages_added_manually[evt.item.id] = True
await self.send_client_event(evt)
# NOTE(review): indentation is not preserved in this view; the session update
# and the flag reset presumably run once after the loop — confirm in the file.
await self._send_session_update()
self._llm_needs_conversation_setup = False
async def _handle_evt_session_updated(self, evt):
    """Mark the API session ready and run whichever response was deferred.

    A pending static greeting takes precedence over a deferred regular
    response; in both cases ``_run_llm_when_api_session_ready`` is cleared
    before the response is created. ``evt`` is not inspected.
    """
    self._api_session_ready = True

    pending_greeting = self._pending_initial_greeting_text
    if pending_greeting is not None:
        self._run_llm_when_api_session_ready = False
        await self._create_initial_greeting_response(pending_greeting)
        return

    if self._run_llm_when_api_session_ready:
        self._run_llm_when_api_session_ready = False
        await self._create_response()
async def _send_user_audio(self, frame):
if self._user_is_muted:
return

View file

@ -22,6 +22,7 @@ from typing import Any
from loguru import logger
from api.services.pipecat.realtime.static_greeting import format_static_greeting_prompt
from pipecat.frames.frames import (
BotStartedSpeakingFrame,
BotStoppedSpeakingFrame,
@ -56,6 +57,7 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
# has finished speaking, matching Dograh's Gemini Live behavior.
self._bot_is_speaking: bool = False
self._deferred_function_calls: list[FunctionCallFromLLM] = []
self._pending_initial_greeting_text: str | None = None
# ------------------------------------------------------------------
# Frame handling: mute, TTSSpeakFrame as greeting trigger
@ -73,11 +75,16 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
if isinstance(frame, TTSSpeakFrame):
# Greeting trigger: the engine queues a TTSSpeakFrame after node
# setup. OpenAI Realtime renders its own audio, so we don't pass
# the frame to TTS. Route through _handle_context so the initial
# response and later tool-result turns share the same context
# lifecycle even when Dograh has already pre-populated self._context.
# the frame to TTS. For configured static text greetings, ask the
# model to say the exact greeting; otherwise route through
# _handle_context so the initial response and later tool-result
# turns share the same context lifecycle.
if not self._handled_initial_context:
await self._handle_context(self._context)
greeting_text = frame.text.strip() if frame.text else ""
if greeting_text:
await self._handle_initial_greeting(self._context, greeting_text)
else:
await self._handle_context(self._context)
else:
logger.warning(
f"{self}: TTSSpeakFrame after initial context already "
@ -137,6 +144,57 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
self._context = context
await self._process_completed_function_calls(send_new_results=True)
async def _handle_initial_greeting(self, context: LLMContext, greeting_text: str):
    """Start the first OpenAI Realtime turn from a static text greeting.

    Args:
        context: Context to adopt as ``self._context``. When it is ``None``
            a warning is logged and nothing else happens.
        greeting_text: Greeting forwarded to
            ``_create_initial_greeting_response``.
    """
    if context is not None:
        # Flag the initial context as handled before triggering the response.
        self._handled_initial_context = True
        self._context = context
        await self._create_initial_greeting_response(greeting_text)
        return

    logger.warning(
        f"{self}: received initial greeting trigger before context was set"
    )
async def _create_initial_greeting_response(self, greeting_text: str):
    """Request a response that speaks the static greeting, or defer it.

    Does nothing while disconnecting. If the API session is not ready yet,
    the greeting is stored in ``_pending_initial_greeting_text`` and
    ``_run_llm_when_api_session_ready`` is set. Otherwise the pending
    greeting is cleared, conversation setup is ensured, and a manual
    response is requested with the formatted greeting prompt as
    instructions and ``tool_choice="none"``.
    """
    if self._disconnecting:
        return

    if self._api_session_ready:
        self._pending_initial_greeting_text = None
        await self._ensure_conversation_setup()
        greeting_prompt = format_static_greeting_prompt(greeting_text)
        await self._send_manual_response_create(
            instructions=greeting_prompt,
            tool_choice="none",
        )
        return

    # Session not ready: remember the greeting for the session-updated handler.
    self._pending_initial_greeting_text = greeting_text
    self._run_llm_when_api_session_ready = True
async def _ensure_conversation_setup(self):
# Seed the provider-side conversation from the current context.
# No-op unless _llm_needs_conversation_setup is still set.
if not self._llm_needs_conversation_setup:
return
# Ask the LLM adapter for the invocation params built from self._context.
adapter = self.get_llm_adapter()
llm_invocation_params = adapter.get_llm_invocation_params(self._context)
# Each message becomes a conversation-item-create event; its item id is
# recorded in _messages_added_manually before the event is sent.
for item in llm_invocation_params["messages"]:
evt = events.ConversationItemCreateEvent(item=item)
self._messages_added_manually[evt.item.id] = True
await self.send_client_event(evt)
# NOTE(review): indentation is not preserved in this view; the session update
# and the flag reset presumably run once after the loop — confirm in the file.
await self._send_session_update()
self._llm_needs_conversation_setup = False
async def _handle_evt_session_updated(self, evt):
    """Mark the API session ready and run whichever response was deferred.

    A pending static greeting takes precedence over a deferred regular
    response; in both cases ``_run_llm_when_api_session_ready`` is cleared
    before the response is created. ``evt`` is not inspected.
    """
    self._api_session_ready = True

    pending_greeting = self._pending_initial_greeting_text
    if pending_greeting is not None:
        self._run_llm_when_api_session_ready = False
        await self._create_initial_greeting_response(pending_greeting)
        return

    if self._run_llm_when_api_session_ready:
        self._run_llm_when_api_session_ready = False
        await self._create_response()
async def _send_user_audio(self, frame):
if self._user_is_muted:
return
@ -190,7 +248,12 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
return "\n".join(parts) if parts else None
return None
async def _send_manual_response_create(self):
async def _send_manual_response_create(
self,
*,
instructions: str | None = None,
tool_choice: str | None = None,
):
"""Trigger inference after manually appending conversation items."""
await self.push_frame(LLMFullResponseStartFrame())
await self.start_processing_metrics()
@ -198,7 +261,9 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
await self.send_client_event(
events.ResponseCreateEvent(
response=events.ResponseProperties(
output_modalities=self._get_enabled_modalities()
output_modalities=self._get_enabled_modalities(),
instructions=instructions,
tool_choice=tool_choice,
)
)
)

View file

@ -0,0 +1,8 @@
def format_static_greeting_prompt(greeting_text: str) -> str:
    """Build the instruction that asks the model to speak a greeting verbatim.

    Args:
        greeting_text: The opening line to be spoken; it is inserted as-is
            between double quotes after the instruction text.

    Returns:
        The fixed instruction paragraph, a blank line, then the quoted greeting.
    """
    preamble = (
        "The phone call has just connected. Greet the caller now: "
        "say the following opening line out loud, exactly as written, "
        "in a natural spoken voice, and then stop and wait for the "
        "caller to respond. Do not add anything before or after it.\n\n"
    )
    quoted_line = f'"{greeting_text}"'
    return preamble + quoted_line

View file

@ -72,7 +72,7 @@ class RealtimeFeedbackObserver(BaseObserver):
- TTFB metrics (LLM generation time only)
Logs buffer persistence (only final data for post-call analysis):
- Complete user transcripts per turn (via on_user_turn_stopped)
- Complete user transcripts per turn (via on_user_turn_message_added)
- Complete assistant transcripts per turn (via on_assistant_turn_stopped)
- Function calls and TTFB metrics
@ -300,13 +300,13 @@ def register_turn_log_handlers(
):
"""Register event handlers on aggregators to persist final turn transcripts.
Hooks into on_user_turn_stopped and on_assistant_turn_stopped to store
Hooks into on_user_turn_message_added and on_assistant_turn_stopped to store
complete turn text in the logs buffer. Works for both WebRTC and telephony
calls independent of WebSocket availability.
"""
@user_aggregator.event_handler("on_user_turn_stopped")
async def on_user_turn_stopped(aggregator, strategy, message):
@user_aggregator.event_handler("on_user_turn_message_added")
async def on_user_turn_message_added(aggregator, message):
logs_buffer.increment_turn()
try:
await logs_buffer.append(

View file

@ -113,49 +113,53 @@ def _resolve_user_turn_stop_timeout(
def _create_realtime_user_turn_config(provider: str):
"""Return user turn strategies and optional local VAD for realtime providers."""
def external_provider_turn_config():
    """Return external start/stop turn strategies and no local VAD analyzer.

    The stop strategy is created with ``wait_for_transcript=False``.
    """
    strategies = UserTurnStrategies(
        start=[ExternalUserTurnStartStrategy()],
        stop=[ExternalUserTurnStopStrategy(wait_for_transcript=False)],
    )
    return strategies, None
def local_vad_turn_config(*, enable_interruptions: bool):
    """Return VAD-driven turn strategies together with a Silero VAD analyzer.

    Args:
        enable_interruptions: Passed through to ``VADUserTurnStartStrategy``.

    Returns:
        A ``(UserTurnStrategies, SileroVADAnalyzer)`` pair. The stop strategy
        uses ``wait_for_transcript=False`` and the analyzer uses
        ``stop_secs=0.2``.
    """
    start_strategy = VADUserTurnStartStrategy(
        enable_interruptions=enable_interruptions
    )
    stop_strategy = SpeechTimeoutUserTurnStopStrategy(wait_for_transcript=False)
    strategies = UserTurnStrategies(start=[start_strategy], stop=[stop_strategy])
    vad_analyzer = SileroVADAnalyzer(params=VADParams(stop_secs=0.2))
    return strategies, vad_analyzer
if provider in {
ServiceProviders.GOOGLE_REALTIME.value,
ServiceProviders.GOOGLE_VERTEX_REALTIME.value,
}:
# Let Gemini Live own barge-in via its server-side VAD, but keep local
# Silero VAD for early user-turn start and speaking-state tracking.
return (
UserTurnStrategies(
start=[VADUserTurnStartStrategy(enable_interruptions=False)],
stop=[SpeechTimeoutUserTurnStopStrategy()],
),
SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
)
return local_vad_turn_config(enable_interruptions=False)
if provider == ServiceProviders.OPENAI_REALTIME.value:
# OpenAI Realtime already emits speaking-state frames and interruption
# events from the provider, so the aggregator should follow those
# external signals rather than run its own local VAD.
return (
UserTurnStrategies(
start=[ExternalUserTurnStartStrategy()],
stop=[ExternalUserTurnStopStrategy()],
),
None,
)
if provider in {
ServiceProviders.OPENAI_REALTIME.value,
ServiceProviders.AZURE_REALTIME.value,
}:
# OpenAI-compatible Realtime services already emit speaking-state frames
# and interruption events from the provider, so the aggregator should
# follow those external signals rather than run its own local VAD.
return external_provider_turn_config()
if provider == ServiceProviders.GROK_REALTIME.value:
# Grok Voice Agent emits server-side speech-start/stop and
# interruption signals, so local VAD should stay out of the way.
return (
UserTurnStrategies(
start=[ExternalUserTurnStartStrategy()],
stop=[ExternalUserTurnStopStrategy()],
),
None,
)
return external_provider_turn_config()
if provider == ServiceProviders.ULTRAVOX_REALTIME.value:
# Ultravox does not emit user-turn frames, so local VAD supplies
# lifecycle signals for Dograh observers/controllers.
return local_vad_turn_config(enable_interruptions=True)
return (
UserTurnStrategies(
start=[VADUserTurnStartStrategy()],
stop=[SpeechTimeoutUserTurnStopStrategy()],
),
SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
)
return local_vad_turn_config(enable_interruptions=True)
async def run_pipeline_telephony(
@ -775,7 +779,10 @@ async def _run_pipeline_impl(
vad_analyzer=user_vad_analyzer,
)
context_aggregator = LLMContextAggregatorPair(
context, assistant_params=assistant_params, user_params=user_params
context,
assistant_params=assistant_params,
user_params=user_params,
realtime_service_mode=is_realtime,
)
# Create usage metrics aggregator with engine's callback

View file

@ -250,7 +250,6 @@ class _ToolDocumentRefsMixin(BaseModel):
"description": (
"Text spoken via TTS at the start of the call. Supports "
"{{template_variables}}. Leave empty to skip the greeting. "
"Not supported with realtime (speech-to-speech) models."
),
"display_options": DisplayOptions(show={"greeting_type": ["text"]}),
"placeholder": "Hi {{first_name}}, this is Sarah from Acme.",