fix: fix rtf logs and gemini live turn taking

This commit is contained in:
Abhishek Kumar 2026-05-31 16:05:03 +05:30
parent 25751efe3c
commit 0c0b8383bf
6 changed files with 159 additions and 148 deletions

View file

@ -16,9 +16,6 @@ Layers Dograh engine integration quirks onto upstream-pristine
- **TTSSpeakFrame as greeting trigger.** The engine queues a TTSSpeakFrame
to kick off the first response after node setup; the service intercepts
it and runs the initial-context path.
- **Finalize-pending on transcriptions.** Marks the transcription emitted
immediately after VAD-stop as finalized, distinguishing it from
mid-turn partials.
"""
from typing import Any
@ -28,7 +25,6 @@ from loguru import logger
from pipecat.frames.frames import (
BotStoppedSpeakingFrame,
Frame,
TranscriptionFrame,
TTSSpeakFrame,
UserMuteStartedFrame,
UserMuteStoppedFrame,
@ -37,7 +33,6 @@ from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
from pipecat.services.llm_service import FunctionCallFromLLM
from pipecat.utils.time import time_now_iso8601
from pipecat.utils.tracing.service_decorators import traced_gemini_live
@ -58,9 +53,6 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
# Function calls emitted by Gemini mid-bot-turn are deferred here and
# invoked when the turn ends, so they don't race the turn's audio.
self._pending_function_calls: list[FunctionCallFromLLM] = []
# Tracks whether the next transcription to arrive should be marked as
# the finalized transcription for the current user turn.
self._finalize_pending: bool = False
# ------------------------------------------------------------------
# Hooks from upstream GeminiLiveLLMService
@ -206,32 +198,3 @@ class DograhGeminiLiveLLMService(GeminiLiveLLMService):
# a handle (e.g. node transitions before any handle was issued) are
# followed by a function-call-result LLMContextFrame which feeds the
# updated-context branch in _handle_context.
# ------------------------------------------------------------------
# Transcription: broadcast (so downstream voicemail detector and
# logs buffer both see it) and set finalized= for turn-boundary
# semantics.
# ------------------------------------------------------------------
async def _handle_user_started_speaking(self, frame):
    """Run the upstream VAD-start handling, then clear the finalize marker.

    Args:
        frame: The user-started-speaking frame, passed through unchanged to
            the parent implementation.
    """
    await super()._handle_user_started_speaking(frame)

    # The user is talking again: a finalize request left over from an
    # earlier VAD stop that never met its transcription no longer applies.
    self._finalize_pending = False
async def _handle_user_stopped_speaking(self, frame):
    """Run the upstream VAD-stop handling, then arm the finalize marker.

    Args:
        frame: The user-stopped-speaking frame, passed through unchanged to
            the parent implementation.
    """
    await super()._handle_user_stopped_speaking(frame)

    # The next transcription pushed by _push_user_transcription will be
    # emitted with finalized=True.
    self._finalize_pending = True
async def _push_user_transcription(self, text: str, result=None):
    """Record a user transcription and broadcast it as a TranscriptionFrame.

    The text is first handed to ``_handle_user_transcription`` and then
    broadcast, with ``finalized`` reflecting whether a VAD stop was pending
    when the transcription arrived.

    Args:
        text: The transcribed user speech.
        result: Optional raw result object forwarded on the frame.
    """
    language = self._settings.language
    await self._handle_user_transcription(text, True, language)

    # Consume the marker only after the handler above has run, so a single
    # VAD stop finalizes at most one transcription.
    finalized, self._finalize_pending = self._finalize_pending, False

    await self.broadcast_frame(
        TranscriptionFrame,
        text=text,
        user_id="",
        timestamp=time_now_iso8601(),
        result=result,
        finalized=finalized,
    )

View file

@ -13,9 +13,8 @@ Adds:
flow kicks off the bot's first response.
- **One-off LLMMessagesAppendFrame handling** for ephemeral realtime prompts
like user-idle checks, without mutating Dograh's local ``LLMContext``.
- **finalized=True on TranscriptionFrame** for parity with the Gemini
service (every OpenAI transcription via the ``completed`` event is
final by construction).
- **finalized=True on TranscriptionFrame** because every OpenAI
transcription via the ``completed`` event is final by construction.
"""
import json
@ -254,9 +253,8 @@ class DograhOpenAIRealtimeLLMService(OpenAIRealtimeLLMService):
logger.error(f"Failed to process function call arguments: {e}")
# ------------------------------------------------------------------
# Transcription: broadcast with finalized=True for parity with the
# Gemini service (consumers that check `finalized` should see True
# for every completed-transcription event from OpenAI).
# Transcription: broadcast with finalized=True for every
# completed-transcription event from OpenAI.
# ------------------------------------------------------------------
async def handle_evt_input_audio_transcription_completed(self, evt):

View file

@ -4,9 +4,9 @@ This observer watches pipeline frames and sends relevant events (transcriptions,
bot text, function calls, TTFB metrics) over WebSocket to provide real-time
feedback in the UI.
For frames with presentation timestamps (pts), like TTSTextFrame, we respect
the timing by queuing them and sending at the appropriate time, similar to
how base_output.py handles timed frames.
For TTS text, we wait until the frame has passed through BaseOutputTransport.
That transport already applies presentation timestamp timing against audio
playback, so the UI text is emitted from the same clock as the spoken audio.
Streaming vs. persisted data:
- WebSocket receives all events in real-time (interim transcriptions, TTS text
@ -20,9 +20,7 @@ rather than being observed here, to ensure precise timing at the moment of
node changes.
"""
import asyncio
import json
import time
from typing import TYPE_CHECKING, Awaitable, Callable, Optional, Set
from loguru import logger
@ -60,8 +58,8 @@ from pipecat.frames.frames import (
from pipecat.metrics.metrics import TTFBMetricsData
from pipecat.observers.base_observer import BaseObserver, FramePushed
from pipecat.processors.frame_processor import FrameDirection
from pipecat.transports.base_output import BaseOutputTransport
from pipecat.utils.enums import RealtimeFeedbackType
from pipecat.utils.time import nanoseconds_to_seconds
class RealtimeFeedbackObserver(BaseObserver):
@ -69,7 +67,7 @@ class RealtimeFeedbackObserver(BaseObserver):
WebSocket streaming (all events for live UI):
- User transcriptions (interim and final)
- Bot TTS text (with pts-based timing)
- Bot TTS text after output transport timing
- Function calls (start/end)
- TTFB metrics (LLM generation time only)
@ -78,9 +76,6 @@ class RealtimeFeedbackObserver(BaseObserver):
- Complete assistant transcripts per turn (via on_assistant_turn_stopped)
- Function calls and TTFB metrics
For frames with pts (presentation timestamp), we queue them and send at the
appropriate time to sync with audio playback.
Note: Node transitions are handled by PipecatEngine.set_node() callback.
"""
@ -100,105 +95,47 @@ class RealtimeFeedbackObserver(BaseObserver):
self._logs_buffer = logs_buffer
self._frames_seen: Set[str] = set()
# Clock/timing for pts-based frames (similar to base_output.py)
self._clock_queue: Optional[asyncio.PriorityQueue] = None
self._clock_task: Optional[asyncio.Task] = None
self._clock_start_time: Optional[float] = (
None # Wall clock time when we started
)
self._pts_start_time: Optional[int] = None # First pts value we saw
async def _ensure_clock_task(self):
"""Create the clock task if it doesn't exist."""
if self._clock_queue is None:
self._clock_queue = asyncio.PriorityQueue()
self._clock_task = asyncio.create_task(self._clock_task_handler())
async def _cancel_clock_task(self):
"""Cancel the clock task and clear the queue and timing references.

Called on interruption to discard any pending bot text that
hasn't been sent yet.

NOTE(review): indentation is not preserved in this view, so it is not
clear whether the resets after the ``try`` sit inside the
``if self._clock_task`` branch - confirm against the real file.
"""
if self._clock_task:
self._clock_task.cancel()
try:
# Wait for the cancelled task to finish before dropping the reference.
await self._clock_task
except asyncio.CancelledError:
pass
self._clock_task = None
self._clock_queue = None
# Reset timing references so next bot response starts fresh
self._clock_start_time = None
self._pts_start_time = None
async def cleanup(self):
    """Release the observer's background resources.

    Must be called once the observer is no longer needed; it cancels the
    clock task through ``_cancel_clock_task``.
    """
    await self._cancel_clock_task()
async def _handle_interruption(self):
"""Handle interruption by clearing queued bot text.
Similar to base_output.py's handle_interruptions, we cancel the
clock task and recreate it to discard pending frames.
"""
await self._cancel_clock_task()
async def _clock_task_handler(self):
"""Process timed frames from the queue, respecting their presentation timestamps.
Similar to base_output.py's _clock_task_handler, we wait until the
frame's pts time has arrived before sending.
"""
while True:
try:
pts, _frame_id, message = await self._clock_queue.get()
# Calculate when to send based on pts relative to our start time
if (
self._clock_start_time is not None
and self._pts_start_time is not None
):
# Target time = start wall time + (frame pts - start pts) in seconds
target_time = self._clock_start_time + nanoseconds_to_seconds(
pts - self._pts_start_time
)
current_time = time.time()
if target_time > current_time:
await asyncio.sleep(target_time - current_time)
# Send the message (clock queue only has TTS text, WS-only)
await self._send_ws(message)
self._clock_queue.task_done()
except asyncio.CancelledError:
break
except Exception as e:
logger.debug(f"Clock task error: {e}")
pass
async def on_push_frame(self, data: FramePushed):
"""Process frames and send relevant ones to the client."""
frame = data.frame
frame_direction = data.direction
source = data.source
# Skip already processed frames (frames can be observed multiple times).
# ErrorFrames are accepted in either direction — push_error() emits them
# UPSTREAM, and we still want to surface them to the UI.
# UPSTREAM, and we still want to surface them to the UI. Upstream-only
# transcription frames are accepted too: upstream Gemini Live emits user
# transcripts toward the user aggregator, not downstream. Broadcast
# transcription siblings are still handled only on the downstream copy to
# avoid duplicate live UI messages.
if frame.id in self._frames_seen:
return
if frame_direction != FrameDirection.DOWNSTREAM and not isinstance(
frame, ErrorFrame
if frame_direction != FrameDirection.DOWNSTREAM:
is_upstream_transcription = (
isinstance(frame, (InterimTranscriptionFrame, TranscriptionFrame))
and frame.broadcast_sibling_id is None
)
if not isinstance(frame, ErrorFrame) and not is_upstream_transcription:
return
# TTSTextFrame may be observed before the output transport has applied
# its audio clock. Match RTVIObserver: leave the frame unmarked so the
# transport-pushed copy can be handled with playback timing already done.
if isinstance(frame, TTSTextFrame) and not isinstance(
source, BaseOutputTransport
):
return
self._frames_seen.add(frame.id)
logger.trace(f"{self} Received Frame: {frame} Direction: {frame_direction}")
# Handle pipeline termination - stop clock task
if isinstance(frame, (EndFrame, CancelFrame, StopFrame)):
await self._cancel_clock_task()
# Handle interruptions - clear any queued bot text
elif isinstance(frame, InterruptionFrame):
await self._handle_interruption()
if isinstance(frame, (EndFrame, CancelFrame, StopFrame, InterruptionFrame)):
return
# Bot speaking state - WS only (ephemeral state signals, not persisted)
elif isinstance(frame, BotStartedSpeakingFrame):
await self._send_ws(
@ -245,27 +182,16 @@ class RealtimeFeedbackObserver(BaseObserver):
elif isinstance(frame, TTSSpeakFrame):
if getattr(frame, "persist_to_logs", False):
await self._append_to_buffer(build_bot_text_event(text=frame.text))
# Handle bot TTS text - respect pts timing, WebSocket only
# Handle bot TTS text after output transport timing, WebSocket only
# Complete turn text is persisted via register_turn_handlers,
# except for frames explicitly flagged persist_to_logs (e.g. recording
# transcripts from play_audio) which bypass the aggregator path.
elif isinstance(frame, TTSTextFrame):
message = build_bot_text_event(text=frame.text)
# If frame has pts, queue it for timed delivery
if frame.pts:
# Initialize timing reference on first pts frame
if self._pts_start_time is None:
self._pts_start_time = frame.pts
self._clock_start_time = time.time()
await self._ensure_clock_task()
await self._clock_queue.put((frame.pts, frame.id, message))
elif getattr(frame, "persist_to_logs", False):
# No pts + explicit persistence request (recording transcript).
if getattr(frame, "persist_to_logs", False):
await self._send_message(message)
else:
# No pts, send immediately
await self._send_ws(message)
# Handle function call in progress
elif (