mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-07-01 08:59:46 +02:00
Fix realtime initial greeting handling (#481)
This commit is contained in:
parent
d9800fddd6
commit
090d042a78
20 changed files with 714 additions and 70 deletions
|
|
@ -11,6 +11,7 @@ from typing import Any
|
|||
|
||||
from loguru import logger
|
||||
|
||||
from api.services.pipecat.realtime.static_greeting import format_static_greeting_prompt
|
||||
from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
|
|
@ -49,6 +50,7 @@ class DograhAzureRealtimeLLMService(AzureRealtimeLLMService):
|
|||
self._handled_initial_context: bool = False
|
||||
self._bot_is_speaking: bool = False
|
||||
self._deferred_function_calls: list[FunctionCallFromLLM] = []
|
||||
self._pending_initial_greeting_text: str | None = None
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, UserMuteStartedFrame):
|
||||
|
|
@ -61,7 +63,11 @@ class DograhAzureRealtimeLLMService(AzureRealtimeLLMService):
|
|||
return
|
||||
if isinstance(frame, TTSSpeakFrame):
|
||||
if not self._handled_initial_context:
|
||||
await self._handle_context(self._context)
|
||||
greeting_text = frame.text.strip() if frame.text else ""
|
||||
if greeting_text:
|
||||
await self._handle_initial_greeting(self._context, greeting_text)
|
||||
else:
|
||||
await self._handle_context(self._context)
|
||||
else:
|
||||
logger.warning(
|
||||
f"{self}: TTSSpeakFrame after initial context already handled — "
|
||||
|
|
@ -118,6 +124,57 @@ class DograhAzureRealtimeLLMService(AzureRealtimeLLMService):
|
|||
self._context = context
|
||||
await self._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
async def _handle_initial_greeting(self, context: LLMContext, greeting_text: str):
|
||||
if context is None:
|
||||
logger.warning(
|
||||
f"{self}: received initial greeting trigger before context was set"
|
||||
)
|
||||
return
|
||||
|
||||
self._handled_initial_context = True
|
||||
self._context = context
|
||||
await self._create_initial_greeting_response(greeting_text)
|
||||
|
||||
async def _create_initial_greeting_response(self, greeting_text: str):
|
||||
if self._disconnecting:
|
||||
return
|
||||
|
||||
if not self._api_session_ready:
|
||||
self._pending_initial_greeting_text = greeting_text
|
||||
self._run_llm_when_api_session_ready = True
|
||||
return
|
||||
|
||||
self._pending_initial_greeting_text = None
|
||||
await self._ensure_conversation_setup()
|
||||
await self._send_manual_response_create(
|
||||
instructions=format_static_greeting_prompt(greeting_text),
|
||||
tool_choice="none",
|
||||
)
|
||||
|
||||
async def _ensure_conversation_setup(self):
|
||||
if not self._llm_needs_conversation_setup:
|
||||
return
|
||||
|
||||
adapter = self.get_llm_adapter()
|
||||
llm_invocation_params = adapter.get_llm_invocation_params(self._context)
|
||||
for item in llm_invocation_params["messages"]:
|
||||
evt = events.ConversationItemCreateEvent(item=item)
|
||||
self._messages_added_manually[evt.item.id] = True
|
||||
await self.send_client_event(evt)
|
||||
|
||||
await self._send_session_update()
|
||||
self._llm_needs_conversation_setup = False
|
||||
|
||||
async def _handle_evt_session_updated(self, evt):
|
||||
self._api_session_ready = True
|
||||
if self._pending_initial_greeting_text is not None:
|
||||
greeting_text = self._pending_initial_greeting_text
|
||||
self._run_llm_when_api_session_ready = False
|
||||
await self._create_initial_greeting_response(greeting_text)
|
||||
elif self._run_llm_when_api_session_ready:
|
||||
self._run_llm_when_api_session_ready = False
|
||||
await self._create_response()
|
||||
|
||||
async def _send_user_audio(self, frame):
|
||||
if self._user_is_muted:
|
||||
return
|
||||
|
|
@ -171,14 +228,21 @@ class DograhAzureRealtimeLLMService(AzureRealtimeLLMService):
|
|||
return "\n".join(parts) if parts else None
|
||||
return None
|
||||
|
||||
async def _send_manual_response_create(self):
|
||||
async def _send_manual_response_create(
|
||||
self,
|
||||
*,
|
||||
instructions: str | None = None,
|
||||
tool_choice: str | None = None,
|
||||
):
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
await self.start_processing_metrics()
|
||||
await self.start_ttfb_metrics()
|
||||
await self.send_client_event(
|
||||
events.ResponseCreateEvent(
|
||||
response=events.ResponseProperties(
|
||||
output_modalities=self._get_enabled_modalities()
|
||||
output_modalities=self._get_enabled_modalities(),
|
||||
instructions=instructions,
|
||||
tool_choice=tool_choice,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -20,11 +20,13 @@ Layers Dograh engine integration quirks onto upstream-pristine
|
|||
|
||||
from typing import Any
|
||||
|
||||
from google.genai.types import Content, Part
|
||||
from loguru import logger
|
||||
|
||||
from api.services.pipecat.gemini_json_schema_adapter import (
|
||||
DograhGeminiJSONSchemaAdapter,
|
||||
)
|
||||
from api.services.pipecat.realtime.static_greeting import format_static_greeting_prompt
|
||||
from pipecat.frames.frames import (
|
||||
BotStoppedSpeakingFrame,
|
||||
Frame,
|
||||
|
|
@ -63,6 +65,9 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
|
|||
# Function calls emitted by Gemini mid-bot-turn are deferred here and
|
||||
# invoked when the turn ends, so they don't race the turn's audio.
|
||||
self._pending_function_calls: list[FunctionCallFromLLM] = []
|
||||
# Text greeting captured from the first TTSSpeakFrame while the Gemini
|
||||
# session is still connecting.
|
||||
self._pending_initial_greeting_text: str | None = None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Hooks from upstream GeminiLiveLLMService
|
||||
|
|
@ -142,10 +147,15 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
|
|||
if isinstance(frame, TTSSpeakFrame):
|
||||
# Greeting trigger: the engine queues a TTSSpeakFrame to start the
|
||||
# bot's first turn after node setup. Gemini Live renders its own
|
||||
# audio, so we don't pass the frame through — we re-enter
|
||||
# _handle_context to kick off the initial response.
|
||||
# audio, so we don't pass the frame through. For configured static
|
||||
# text greetings, ask Gemini to say the exact greeting; otherwise
|
||||
# re-enter _handle_context to kick off the normal initial response.
|
||||
if not self._handled_initial_context:
|
||||
await self._handle_context(self._context)
|
||||
greeting_text = frame.text.strip() if frame.text else ""
|
||||
if greeting_text:
|
||||
await self._handle_initial_greeting(self._context, greeting_text)
|
||||
else:
|
||||
await self._handle_context(self._context)
|
||||
else:
|
||||
logger.warning(
|
||||
f"{self}: TTSSpeakFrame after initial context already "
|
||||
|
|
@ -183,6 +193,49 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
|
|||
self._context = context
|
||||
await self._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
async def _handle_initial_greeting(self, context: LLMContext, greeting_text: str):
|
||||
"""Trigger the first Gemini turn with an exact static text greeting."""
|
||||
if context is None:
|
||||
logger.warning(
|
||||
f"{self}: received initial greeting trigger before context was set"
|
||||
)
|
||||
return
|
||||
|
||||
self._handled_initial_context = True
|
||||
self._context = context
|
||||
await self._create_initial_greeting_response(greeting_text)
|
||||
|
||||
async def _create_initial_greeting_response(self, greeting_text: str):
|
||||
"""Ask Gemini Live to speak the configured greeting exactly once."""
|
||||
if self._disconnecting:
|
||||
return
|
||||
|
||||
if not self._session:
|
||||
self._pending_initial_greeting_text = greeting_text
|
||||
self._run_llm_when_session_ready = True
|
||||
return
|
||||
|
||||
self._pending_initial_greeting_text = None
|
||||
prompt = format_static_greeting_prompt(greeting_text)
|
||||
turn = Content(role="user", parts=[Part(text=prompt)])
|
||||
|
||||
logger.debug("Creating Gemini Live initial response from static greeting")
|
||||
|
||||
await self.start_ttfb_metrics()
|
||||
|
||||
try:
|
||||
await self._session.send_client_content(
|
||||
turns=[turn],
|
||||
turn_complete=True,
|
||||
)
|
||||
# Gemini 3.x also needs a realtime-input nudge to begin inference.
|
||||
if self._is_gemini_3:
|
||||
await self._session.send_realtime_input(text=" ")
|
||||
except Exception as e:
|
||||
await self._handle_send_error(e)
|
||||
|
||||
self._ready_for_realtime_input = True
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Session lifecycle: drop upstream's automatic reconnect-seed and
|
||||
# initial-context-seed paths. The TTSSpeakFrame trigger and the
|
||||
|
|
@ -201,7 +254,12 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
|
|||
# Context arrived before session was ready — fulfil the queued
|
||||
# initial response now.
|
||||
self._run_llm_when_session_ready = False
|
||||
await self._create_initial_response()
|
||||
if self._pending_initial_greeting_text is not None:
|
||||
await self._create_initial_greeting_response(
|
||||
self._pending_initial_greeting_text
|
||||
)
|
||||
else:
|
||||
await self._create_initial_response()
|
||||
await self._drain_pending_tool_results()
|
||||
# Otherwise: no automatic seed. Reconnect after a session-resumption
|
||||
# update relies on the server-side restored state; reconnects without
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ from typing import Any
|
|||
|
||||
from loguru import logger
|
||||
|
||||
from api.services.pipecat.realtime.static_greeting import format_static_greeting_prompt
|
||||
from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
|
|
@ -50,6 +51,7 @@ class DograhGrokRealtimeLLMService(GrokRealtimeLLMService):
|
|||
self._handled_initial_context: bool = False
|
||||
self._bot_is_speaking: bool = False
|
||||
self._deferred_function_calls: list[FunctionCallFromLLM] = []
|
||||
self._pending_initial_greeting_text: str | None = None
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
if isinstance(frame, UserMuteStartedFrame):
|
||||
|
|
@ -62,7 +64,11 @@ class DograhGrokRealtimeLLMService(GrokRealtimeLLMService):
|
|||
return
|
||||
if isinstance(frame, TTSSpeakFrame):
|
||||
if not self._handled_initial_context:
|
||||
await self._handle_context(self._context)
|
||||
greeting_text = frame.text.strip() if frame.text else ""
|
||||
if greeting_text:
|
||||
await self._handle_initial_greeting(self._context, greeting_text)
|
||||
else:
|
||||
await self._handle_context(self._context)
|
||||
else:
|
||||
logger.warning(
|
||||
f"{self}: TTSSpeakFrame after initial context already "
|
||||
|
|
@ -120,6 +126,67 @@ class DograhGrokRealtimeLLMService(GrokRealtimeLLMService):
|
|||
self._context = context
|
||||
await self._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
async def _handle_initial_greeting(self, context: LLMContext, greeting_text: str):
|
||||
if context is None:
|
||||
logger.warning(
|
||||
f"{self}: received initial greeting trigger before context was set"
|
||||
)
|
||||
return
|
||||
|
||||
self._handled_initial_context = True
|
||||
self._context = context
|
||||
await self._create_initial_greeting_response(greeting_text)
|
||||
|
||||
async def _create_initial_greeting_response(self, greeting_text: str):
|
||||
if self._disconnecting:
|
||||
return
|
||||
|
||||
if not self._api_session_ready:
|
||||
self._pending_initial_greeting_text = greeting_text
|
||||
self._run_llm_when_api_session_ready = True
|
||||
return
|
||||
|
||||
self._pending_initial_greeting_text = None
|
||||
await self._ensure_conversation_setup()
|
||||
item = events.ConversationItem(
|
||||
type="message",
|
||||
role="user",
|
||||
content=[
|
||||
events.ItemContent(
|
||||
type="input_text",
|
||||
text=format_static_greeting_prompt(greeting_text),
|
||||
)
|
||||
],
|
||||
)
|
||||
evt = events.ConversationItemCreateEvent(item=item)
|
||||
self._messages_added_manually[evt.item.id] = True
|
||||
await self.send_client_event(evt)
|
||||
await self._send_manual_response_create()
|
||||
|
||||
async def _ensure_conversation_setup(self):
|
||||
if not self._llm_needs_conversation_setup:
|
||||
return
|
||||
|
||||
adapter = self.get_llm_adapter()
|
||||
llm_invocation_params = adapter.get_llm_invocation_params(self._context)
|
||||
for item in llm_invocation_params["messages"]:
|
||||
evt = events.ConversationItemCreateEvent(item=item)
|
||||
self._messages_added_manually[evt.item.id] = True
|
||||
await self.send_client_event(evt)
|
||||
|
||||
await self._send_session_update()
|
||||
self._llm_needs_conversation_setup = False
|
||||
|
||||
async def _handle_evt_session_updated(self, evt):
|
||||
self._api_session_ready = True
|
||||
if self._pending_initial_greeting_text is not None:
|
||||
greeting_text = self._pending_initial_greeting_text
|
||||
self._run_llm_when_api_session_ready = False
|
||||
await self._create_initial_greeting_response(greeting_text)
|
||||
elif self._run_llm_when_api_session_ready:
|
||||
self._run_llm_when_api_session_ready = False
|
||||
await self._create_response()
|
||||
|
||||
async def _send_user_audio(self, frame):
|
||||
if self._user_is_muted:
|
||||
return
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ from typing import Any
|
|||
|
||||
from loguru import logger
|
||||
|
||||
from api.services.pipecat.realtime.static_greeting import format_static_greeting_prompt
|
||||
from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
|
|
@ -56,6 +57,7 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
|
|||
# has finished speaking, matching Dograh's Gemini Live behavior.
|
||||
self._bot_is_speaking: bool = False
|
||||
self._deferred_function_calls: list[FunctionCallFromLLM] = []
|
||||
self._pending_initial_greeting_text: str | None = None
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Frame handling: mute, TTSSpeakFrame as greeting trigger
|
||||
|
|
@ -73,11 +75,16 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
|
|||
if isinstance(frame, TTSSpeakFrame):
|
||||
# Greeting trigger: the engine queues a TTSSpeakFrame after node
|
||||
# setup. OpenAI Realtime renders its own audio, so we don't pass
|
||||
# the frame to TTS. Route through _handle_context so the initial
|
||||
# response and later tool-result turns share the same context
|
||||
# lifecycle even when Dograh has already pre-populated self._context.
|
||||
# the frame to TTS. For configured static text greetings, ask the
|
||||
# model to say the exact greeting; otherwise route through
|
||||
# _handle_context so the initial response and later tool-result
|
||||
# turns share the same context lifecycle.
|
||||
if not self._handled_initial_context:
|
||||
await self._handle_context(self._context)
|
||||
greeting_text = frame.text.strip() if frame.text else ""
|
||||
if greeting_text:
|
||||
await self._handle_initial_greeting(self._context, greeting_text)
|
||||
else:
|
||||
await self._handle_context(self._context)
|
||||
else:
|
||||
logger.warning(
|
||||
f"{self}: TTSSpeakFrame after initial context already "
|
||||
|
|
@ -137,6 +144,57 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
|
|||
self._context = context
|
||||
await self._process_completed_function_calls(send_new_results=True)
|
||||
|
||||
async def _handle_initial_greeting(self, context: LLMContext, greeting_text: str):
|
||||
if context is None:
|
||||
logger.warning(
|
||||
f"{self}: received initial greeting trigger before context was set"
|
||||
)
|
||||
return
|
||||
|
||||
self._handled_initial_context = True
|
||||
self._context = context
|
||||
await self._create_initial_greeting_response(greeting_text)
|
||||
|
||||
async def _create_initial_greeting_response(self, greeting_text: str):
|
||||
if self._disconnecting:
|
||||
return
|
||||
|
||||
if not self._api_session_ready:
|
||||
self._pending_initial_greeting_text = greeting_text
|
||||
self._run_llm_when_api_session_ready = True
|
||||
return
|
||||
|
||||
self._pending_initial_greeting_text = None
|
||||
await self._ensure_conversation_setup()
|
||||
await self._send_manual_response_create(
|
||||
instructions=format_static_greeting_prompt(greeting_text),
|
||||
tool_choice="none",
|
||||
)
|
||||
|
||||
async def _ensure_conversation_setup(self):
|
||||
if not self._llm_needs_conversation_setup:
|
||||
return
|
||||
|
||||
adapter = self.get_llm_adapter()
|
||||
llm_invocation_params = adapter.get_llm_invocation_params(self._context)
|
||||
for item in llm_invocation_params["messages"]:
|
||||
evt = events.ConversationItemCreateEvent(item=item)
|
||||
self._messages_added_manually[evt.item.id] = True
|
||||
await self.send_client_event(evt)
|
||||
|
||||
await self._send_session_update()
|
||||
self._llm_needs_conversation_setup = False
|
||||
|
||||
async def _handle_evt_session_updated(self, evt):
|
||||
self._api_session_ready = True
|
||||
if self._pending_initial_greeting_text is not None:
|
||||
greeting_text = self._pending_initial_greeting_text
|
||||
self._run_llm_when_api_session_ready = False
|
||||
await self._create_initial_greeting_response(greeting_text)
|
||||
elif self._run_llm_when_api_session_ready:
|
||||
self._run_llm_when_api_session_ready = False
|
||||
await self._create_response()
|
||||
|
||||
async def _send_user_audio(self, frame):
|
||||
if self._user_is_muted:
|
||||
return
|
||||
|
|
@ -190,7 +248,12 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
|
|||
return "\n".join(parts) if parts else None
|
||||
return None
|
||||
|
||||
async def _send_manual_response_create(self):
|
||||
async def _send_manual_response_create(
|
||||
self,
|
||||
*,
|
||||
instructions: str | None = None,
|
||||
tool_choice: str | None = None,
|
||||
):
|
||||
"""Trigger inference after manually appending conversation items."""
|
||||
await self.push_frame(LLMFullResponseStartFrame())
|
||||
await self.start_processing_metrics()
|
||||
|
|
@ -198,7 +261,9 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
|
|||
await self.send_client_event(
|
||||
events.ResponseCreateEvent(
|
||||
response=events.ResponseProperties(
|
||||
output_modalities=self._get_enabled_modalities()
|
||||
output_modalities=self._get_enabled_modalities(),
|
||||
instructions=instructions,
|
||||
tool_choice=tool_choice,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
|
|
|||
8
api/services/pipecat/realtime/static_greeting.py
Normal file
8
api/services/pipecat/realtime/static_greeting.py
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
def format_static_greeting_prompt(greeting_text: str) -> str:
|
||||
return (
|
||||
"The phone call has just connected. Greet the caller now: "
|
||||
"say the following opening line out loud, exactly as written, "
|
||||
"in a natural spoken voice, and then stop and wait for the "
|
||||
"caller to respond. Do not add anything before or after it.\n\n"
|
||||
f'"{greeting_text}"'
|
||||
)
|
||||
Loading…
Add table
Add a link
Reference in a new issue