mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-07 07:55:16 +02:00
fix: fix rtf logs and gemini live turn taking
This commit is contained in:
parent
25751efe3c
commit
0c0b8383bf
6 changed files with 159 additions and 148 deletions
|
|
@ -16,9 +16,6 @@ Layers Dograh engine integration quirks onto upstream-pristine
|
|||
- **TTSSpeakFrame as greeting trigger.** The engine queues a TTSSpeakFrame
|
||||
to kick off the first response after node setup; the service intercepts
|
||||
it and runs the initial-context path.
|
||||
- **Finalize-pending on transcriptions.** Marks the transcription emitted
|
||||
immediately after VAD-stop as finalized, distinguishing it from
|
||||
mid-turn partials.
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
|
@ -28,7 +25,6 @@ from loguru import logger
|
|||
from pipecat.frames.frames import (
|
||||
BotStoppedSpeakingFrame,
|
||||
Frame,
|
||||
TranscriptionFrame,
|
||||
TTSSpeakFrame,
|
||||
UserMuteStartedFrame,
|
||||
UserMuteStoppedFrame,
|
||||
|
|
@ -37,7 +33,6 @@ from pipecat.processors.aggregators.llm_context import LLMContext
|
|||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
|
||||
from pipecat.services.llm_service import FunctionCallFromLLM
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from pipecat.utils.tracing.service_decorators import traced_gemini_live
|
||||
|
||||
|
||||
|
|
@ -58,9 +53,6 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
|
|||
# Function calls emitted by Gemini mid-bot-turn are deferred here and
|
||||
# invoked when the turn ends, so they don't race the turn's audio.
|
||||
self._pending_function_calls: list[FunctionCallFromLLM] = []
|
||||
# Tracks whether the next transcription to arrive should be marked as
|
||||
# the finalized transcription for the current user turn.
|
||||
self._finalize_pending: bool = False
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Hooks from upstream GeminiLiveLLMService
|
||||
|
|
@ -206,32 +198,3 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
|
|||
# a handle (e.g. node transitions before any handle was issued) are
|
||||
# followed by a function-call-result LLMContextFrame which feeds the
|
||||
# updated-context branch in _handle_context.
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Transcription: broadcast (so downstream voicemail detector and
|
||||
# logs buffer both see it) and set finalized= for turn-boundary
|
||||
# semantics.
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _handle_user_started_speaking(self, frame):
|
||||
await super()._handle_user_started_speaking(frame)
|
||||
# A new VAD start invalidates any pending finalize from a prior stop
|
||||
# that hasn't been paired with a transcription yet.
|
||||
self._finalize_pending = False
|
||||
|
||||
async def _handle_user_stopped_speaking(self, frame):
|
||||
await super()._handle_user_stopped_speaking(frame)
|
||||
self._finalize_pending = True
|
||||
|
||||
async def _push_user_transcription(self, text: str, result=None):
|
||||
await self._handle_user_transcription(text, True, self._settings.language)
|
||||
finalized = self._finalize_pending
|
||||
self._finalize_pending = False
|
||||
await self.broadcast_frame(
|
||||
TranscriptionFrame,
|
||||
text=text,
|
||||
user_id="",
|
||||
timestamp=time_now_iso8601(),
|
||||
result=result,
|
||||
finalized=finalized,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -13,9 +13,8 @@ Adds:
|
|||
flow kicks off the bot's first response.
|
||||
- **One-off LLMMessagesAppendFrame handling** for ephemeral realtime prompts
|
||||
like user-idle checks, without mutating Dograh's local ``LLMContext``.
|
||||
- **finalized=True on TranscriptionFrame** for parity with the Gemini
|
||||
service (every OpenAI transcription via the ``completed`` event is
|
||||
final by construction).
|
||||
- **finalized=True on TranscriptionFrame** because every OpenAI
|
||||
transcription via the ``completed`` event is final by construction.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
|
@ -254,9 +253,8 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
|
|||
logger.error(f"Failed to process function call arguments: {e}")
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Transcription: broadcast with finalized=True for parity with the
|
||||
# Gemini service (consumers that check `finalized` should see True
|
||||
# for every completed-transcription event from OpenAI).
|
||||
# Transcription: broadcast with finalized=True for every
|
||||
# completed-transcription event from OpenAI.
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def handle_evt_input_audio_transcription_completed(self, evt):
|
||||
|
|
|
|||
|
|
@ -4,9 +4,9 @@ This observer watches pipeline frames and sends relevant events (transcriptions,
|
|||
bot text, function calls, TTFB metrics) over WebSocket to provide real-time
|
||||
feedback in the UI.
|
||||
|
||||
For frames with presentation timestamps (pts), like TTSTextFrame, we respect
|
||||
the timing by queuing them and sending at the appropriate time, similar to
|
||||
how base_output.py handles timed frames.
|
||||
For TTS text, we wait until the frame has passed through BaseOutputTransport.
|
||||
That transport already applies presentation timestamp timing against audio
|
||||
playback, so the UI text is emitted from the same clock as the spoken audio.
|
||||
|
||||
Streaming vs. persisted data:
|
||||
- WebSocket receives all events in real-time (interim transcriptions, TTS text
|
||||
|
|
@ -20,9 +20,7 @@ rather than being observed here, to ensure precise timing at the moment of
|
|||
node changes.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Awaitable, Callable, Optional, Set
|
||||
|
||||
from loguru import logger
|
||||
|
|
@ -60,8 +58,8 @@ from pipecat.frames.frames import (
|
|||
from pipecat.metrics.metrics import TTFBMetricsData
|
||||
from pipecat.observers.base_observer import BaseObserver, FramePushed
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.transports.base_output import BaseOutputTransport
|
||||
from pipecat.utils.enums import RealtimeFeedbackType
|
||||
from pipecat.utils.time import nanoseconds_to_seconds
|
||||
|
||||
|
||||
class RealtimeFeedbackObserver(BaseObserver):
|
||||
|
|
@ -69,7 +67,7 @@ class RealtimeFeedbackObserver(BaseObserver):
|
|||
|
||||
WebSocket streaming (all events for live UI):
|
||||
- User transcriptions (interim and final)
|
||||
- Bot TTS text (with pts-based timing)
|
||||
- Bot TTS text after output transport timing
|
||||
- Function calls (start/end)
|
||||
- TTFB metrics (LLM generation time only)
|
||||
|
||||
|
|
@ -78,9 +76,6 @@ class RealtimeFeedbackObserver(BaseObserver):
|
|||
- Complete assistant transcripts per turn (via on_assistant_turn_stopped)
|
||||
- Function calls and TTFB metrics
|
||||
|
||||
For frames with pts (presentation timestamp), we queue them and send at the
|
||||
appropriate time to sync with audio playback.
|
||||
|
||||
Note: Node transitions are handled by PipecatEngine.set_node() callback.
|
||||
"""
|
||||
|
||||
|
|
@ -100,105 +95,47 @@ class RealtimeFeedbackObserver(BaseObserver):
|
|||
self._logs_buffer = logs_buffer
|
||||
self._frames_seen: Set[str] = set()
|
||||
|
||||
# Clock/timing for pts-based frames (similar to base_output.py)
|
||||
self._clock_queue: Optional[asyncio.PriorityQueue] = None
|
||||
self._clock_task: Optional[asyncio.Task] = None
|
||||
self._clock_start_time: Optional[float] = (
|
||||
None # Wall clock time when we started
|
||||
)
|
||||
self._pts_start_time: Optional[int] = None # First pts value we saw
|
||||
|
||||
async def _ensure_clock_task(self):
|
||||
"""Create the clock task if it doesn't exist."""
|
||||
if self._clock_queue is None:
|
||||
self._clock_queue = asyncio.PriorityQueue()
|
||||
self._clock_task = asyncio.create_task(self._clock_task_handler())
|
||||
|
||||
async def _cancel_clock_task(self):
|
||||
"""Cancel the clock task and clear the queue.
|
||||
|
||||
Called on interruption to discard any pending bot text that
|
||||
hasn't been sent yet.
|
||||
"""
|
||||
if self._clock_task:
|
||||
self._clock_task.cancel()
|
||||
try:
|
||||
await self._clock_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
self._clock_task = None
|
||||
self._clock_queue = None
|
||||
# Reset timing references so next bot response starts fresh
|
||||
self._clock_start_time = None
|
||||
self._pts_start_time = None
|
||||
|
||||
async def cleanup(self):
|
||||
"""Clean up resources. Must be called when the observer is no longer needed."""
|
||||
await self._cancel_clock_task()
|
||||
|
||||
async def _handle_interruption(self):
|
||||
"""Handle interruption by clearing queued bot text.
|
||||
|
||||
Similar to base_output.py's handle_interruptions, we cancel the
|
||||
clock task and recreate it to discard pending frames.
|
||||
"""
|
||||
await self._cancel_clock_task()
|
||||
|
||||
async def _clock_task_handler(self):
|
||||
"""Process timed frames from the queue, respecting their presentation timestamps.
|
||||
|
||||
Similar to base_output.py's _clock_task_handler, we wait until the
|
||||
frame's pts time has arrived before sending.
|
||||
"""
|
||||
while True:
|
||||
try:
|
||||
pts, _frame_id, message = await self._clock_queue.get()
|
||||
|
||||
# Calculate when to send based on pts relative to our start time
|
||||
if (
|
||||
self._clock_start_time is not None
|
||||
and self._pts_start_time is not None
|
||||
):
|
||||
# Target time = start wall time + (frame pts - start pts) in seconds
|
||||
target_time = self._clock_start_time + nanoseconds_to_seconds(
|
||||
pts - self._pts_start_time
|
||||
)
|
||||
current_time = time.time()
|
||||
if target_time > current_time:
|
||||
await asyncio.sleep(target_time - current_time)
|
||||
|
||||
# Send the message (clock queue only has TTS text, WS-only)
|
||||
await self._send_ws(message)
|
||||
self._clock_queue.task_done()
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
except Exception as e:
|
||||
logger.debug(f"Clock task error: {e}")
|
||||
pass
|
||||
|
||||
async def on_push_frame(self, data: FramePushed):
|
||||
"""Process frames and send relevant ones to the client."""
|
||||
frame = data.frame
|
||||
frame_direction = data.direction
|
||||
source = data.source
|
||||
|
||||
# Skip already processed frames (frames can be observed multiple times).
|
||||
# ErrorFrames are accepted in either direction — push_error() emits them
|
||||
# UPSTREAM, and we still want to surface them to the UI.
|
||||
# UPSTREAM, and we still want to surface them to the UI. Upstream-only
|
||||
# transcription frames are accepted too: upstream Gemini Live emits user
|
||||
# transcripts toward the user aggregator, not downstream. Broadcast
|
||||
# transcription siblings are still handled only on the downstream copy to
|
||||
# avoid duplicate live UI messages.
|
||||
if frame.id in self._frames_seen:
|
||||
return
|
||||
if frame_direction != FrameDirection.DOWNSTREAM and not isinstance(
|
||||
frame, ErrorFrame
|
||||
if frame_direction != FrameDirection.DOWNSTREAM:
|
||||
is_upstream_transcription = (
|
||||
isinstance(frame, (InterimTranscriptionFrame, TranscriptionFrame))
|
||||
and frame.broadcast_sibling_id is None
|
||||
)
|
||||
if not isinstance(frame, ErrorFrame) and not is_upstream_transcription:
|
||||
return
|
||||
|
||||
# TTSTextFrame may be observed before the output transport has applied
|
||||
# its audio clock. Match RTVIObserver: leave the frame unmarked so the
|
||||
# transport-pushed copy can be handled with playback timing already done.
|
||||
if isinstance(frame, TTSTextFrame) and not isinstance(
|
||||
source, BaseOutputTransport
|
||||
):
|
||||
return
|
||||
|
||||
self._frames_seen.add(frame.id)
|
||||
|
||||
logger.trace(f"{self} Received Frame: {frame} Direction: {frame_direction}")
|
||||
|
||||
# Handle pipeline termination - stop clock task
|
||||
if isinstance(frame, (EndFrame, CancelFrame, StopFrame)):
|
||||
await self._cancel_clock_task()
|
||||
# Handle interruptions - clear any queued bot text
|
||||
elif isinstance(frame, InterruptionFrame):
|
||||
await self._handle_interruption()
|
||||
if isinstance(frame, (EndFrame, CancelFrame, StopFrame, InterruptionFrame)):
|
||||
return
|
||||
# Bot speaking state - WS only (ephemeral state signals, not persisted)
|
||||
elif isinstance(frame, BotStartedSpeakingFrame):
|
||||
await self._send_ws(
|
||||
|
|
@ -245,27 +182,16 @@ class RealtimeFeedbackObserver(BaseObserver):
|
|||
elif isinstance(frame, TTSSpeakFrame):
|
||||
if getattr(frame, "persist_to_logs", False):
|
||||
await self._append_to_buffer(build_bot_text_event(text=frame.text))
|
||||
# Handle bot TTS text - respect pts timing, WebSocket only
|
||||
# Handle bot TTS text after output transport timing, WebSocket only
|
||||
# Complete turn text is persisted via register_turn_handlers,
|
||||
# except for frames explicitly flagged persist_to_logs (e.g. recording
|
||||
# transcripts from play_audio) which bypass the aggregator path.
|
||||
elif isinstance(frame, TTSTextFrame):
|
||||
message = build_bot_text_event(text=frame.text)
|
||||
|
||||
# If frame has pts, queue it for timed delivery
|
||||
if frame.pts:
|
||||
# Initialize timing reference on first pts frame
|
||||
if self._pts_start_time is None:
|
||||
self._pts_start_time = frame.pts
|
||||
self._clock_start_time = time.time()
|
||||
|
||||
await self._ensure_clock_task()
|
||||
await self._clock_queue.put((frame.pts, frame.id, message))
|
||||
elif getattr(frame, "persist_to_logs", False):
|
||||
# No pts + explicit persistence request (recording transcript).
|
||||
if getattr(frame, "persist_to_logs", False):
|
||||
await self._send_message(message)
|
||||
else:
|
||||
# No pts, send immediately
|
||||
await self._send_ws(message)
|
||||
# Handle function call in progress
|
||||
elif (
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue